Hi All,
Chages since v1:
- left fixes only, improvements will be posted separately;
- Fixes: and -stable tags added to patch descriptions;
This series is an attempt to fix the violation of lazy MMU mode context
requirement as described for arch_enter_lazy_mmu_mode():
This mode can only be entered and left under the protection of
the page table locks for all page tables which may be modified.
On s390 if I make arch_enter_lazy_mmu_mode() -> preempt_enable() and
arch_leave_lazy_mmu_mode() -> preempt_disable() I am getting this:
[ 553.332108] preempt_count: 1, expected: 0
[ 553.332117] no locks held by multipathd/2116.
[ 553.332128] CPU: 24 PID: 2116 Comm: multipathd Kdump: loaded Tainted:
[ 553.332139] Hardware name: IBM 3931 A01 701 (LPAR)
[ 553.332146] Call Trace:
[ 553.332152] [<00000000158de23a>] dump_stack_lvl+0xfa/0x150
[ 553.332167] [<0000000013e10d12>] __might_resched+0x57a/0x5e8
[ 553.332178] [<00000000144eb6c2>] __alloc_pages+0x2ba/0x7c0
[ 553.332189] [<00000000144d5cdc>] __get_free_pages+0x2c/0x88
[ 553.332198] [<00000000145663f6>] kasan_populate_vmalloc_pte+0x4e/0x110
[ 553.332207] [<000000001447625c>] apply_to_pte_range+0x164/0x3c8
[ 553.332218] [<000000001448125a>] apply_to_pmd_range+0xda/0x318
[ 553.332226] [<000000001448181c>] __apply_to_page_range+0x384/0x768
[ 553.332233] [<0000000014481c28>] apply_to_page_range+0x28/0x38
[ 553.332241] [<00000000145665da>] kasan_populate_vmalloc+0x82/0x98
[ 553.332249] [<00000000144c88d0>] alloc_vmap_area+0x590/0x1c90
[ 553.332257] [<00000000144ca108>] __get_vm_area_node.constprop.0+0x138/0x260
[ 553.332265] [<00000000144d17fc>] __vmalloc_node_range+0x134/0x360
[ 553.332274] [<0000000013d5dbf2>] alloc_thread_stack_node+0x112/0x378
[ 553.332284] [<0000000013d62726>] dup_task_struct+0x66/0x430
[ 553.332293] [<0000000013d63962>] copy_process+0x432/0x4b80
[ 553.332302] [<0000000013d68300>] kernel_clone+0xf0/0x7d0
[ 553.332311] [<0000000013d68bd6>] __do_sys_clone+0xae/0xc8
[ 553.332400] [<0000000013d68dee>] __s390x_sys_clone+0xd6/0x118
[ 553.332410] [<0000000013c9d34c>] do_syscall+0x22c/0x328
[ 553.332419] [<00000000158e7366>] __do_syscall+0xce/0xf0
[ 553.332428] [<0000000015913260>] system_call+0x70/0x98
This exposes a KASAN issue fixed with patch 1 and apply_to_pte_range()
issue fixed with patch 3, while patch 2 is a prerequisite.
Commit b9ef323ea168 ("powerpc/64s: Disable preemption in hash lazy mmu
mode") looks like powerpc-only fix, yet not entirely conforming to the
above provided requirement (page tables itself are still not protected).
If I am not mistaken, xen and sparc are alike.
Thanks!
Alexander Gordeev (3):
kasan: Avoid sleepable page allocation from atomic context
mm: Cleanup apply_to_pte_range() routine
mm: Protect kernel pgtables in apply_to_pte_range()
mm/kasan/shadow.c | 9 +++------
mm/memory.c | 33 +++++++++++++++++++++------------
2 files changed, 24 insertions(+), 18 deletions(-)
--
2.45.2
The quilt patch titled
Subject: mm: protect kernel pgtables in apply_to_pte_range()
has been removed from the -mm tree. Its filename was
mm-protect-kernel-pgtables-in-apply_to_pte_range.patch
This patch was dropped because an updated version will be issued
------------------------------------------------------
From: Alexander Gordeev <agordeev(a)linux.ibm.com>
Subject: mm: protect kernel pgtables in apply_to_pte_range()
Date: Tue, 8 Apr 2025 18:07:32 +0200
The lazy MMU mode can only be entered and left under the protection of the
page table locks for all page tables which may be modified. Yet, when it
comes to kernel mappings apply_to_pte_range() does not take any locks.
That does not conform arch_enter|leave_lazy_mmu_mode() semantics and could
potentially lead to re-schedulling a process while in lazy MMU mode or
racing on a kernel page table updates.
Link: https://lkml.kernel.org/r/ef8f6538b83b7fc3372602f90375348f9b4f3596.17441281…
Fixes: 38e0edb15bd0 ("mm/apply_to_range: call pte function with lazy updates")
Signed-off-by: Alexander Gordeev <agordeev(a)linux.ibm.com>
Cc: Andrey Ryabinin <ryabinin.a.a(a)gmail.com>
Cc: Guenetr Roeck <linux(a)roeck-us.net>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Jeremy Fitzhardinge <jeremy(a)goop.org>
Cc: Juegren Gross <jgross(a)suse.com>
Cc: Nicholas Piggin <npiggin(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/kasan/shadow.c | 7 ++-----
mm/memory.c | 5 ++++-
2 files changed, 6 insertions(+), 6 deletions(-)
--- a/mm/kasan/shadow.c~mm-protect-kernel-pgtables-in-apply_to_pte_range
+++ a/mm/kasan/shadow.c
@@ -308,14 +308,14 @@ static int kasan_populate_vmalloc_pte(pt
__memset((void *)page, KASAN_VMALLOC_INVALID, PAGE_SIZE);
pte = pfn_pte(PFN_DOWN(__pa(page)), PAGE_KERNEL);
- spin_lock(&init_mm.page_table_lock);
if (likely(pte_none(ptep_get(ptep)))) {
set_pte_at(&init_mm, addr, ptep, pte);
page = 0;
}
- spin_unlock(&init_mm.page_table_lock);
+
if (page)
free_page(page);
+
return 0;
}
@@ -401,13 +401,10 @@ static int kasan_depopulate_vmalloc_pte(
page = (unsigned long)__va(pte_pfn(ptep_get(ptep)) << PAGE_SHIFT);
- spin_lock(&init_mm.page_table_lock);
-
if (likely(!pte_none(ptep_get(ptep)))) {
pte_clear(&init_mm, addr, ptep);
free_page(page);
}
- spin_unlock(&init_mm.page_table_lock);
return 0;
}
--- a/mm/memory.c~mm-protect-kernel-pgtables-in-apply_to_pte_range
+++ a/mm/memory.c
@@ -2926,6 +2926,7 @@ static int apply_to_pte_range(struct mm_
pte = pte_offset_kernel(pmd, addr);
if (!pte)
return err;
+ spin_lock(&init_mm.page_table_lock);
} else {
if (create)
pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
@@ -2951,7 +2952,9 @@ static int apply_to_pte_range(struct mm_
arch_leave_lazy_mmu_mode();
- if (mm != &init_mm)
+ if (mm == &init_mm)
+ spin_unlock(&init_mm.page_table_lock);
+ else
pte_unmap_unlock(mapped_pte, ptl);
*mask |= PGTBL_PTE_MODIFIED;
_
Patches currently in -mm which might be from agordeev(a)linux.ibm.com are
The quilt patch titled
Subject: kasan: avoid sleepable page allocation from atomic context
has been removed from the -mm tree. Its filename was
kasan-avoid-sleepable-page-allocation-from-atomic-context.patch
This patch was dropped because an updated version will be issued
------------------------------------------------------
From: Alexander Gordeev <agordeev(a)linux.ibm.com>
Subject: kasan: avoid sleepable page allocation from atomic context
Date: Tue, 8 Apr 2025 18:07:30 +0200
Patch series "mm: Fix apply_to_pte_range() vs lazy MMU mode", v2.
This series is an attempt to fix the violation of lazy MMU mode context
requirement as described for arch_enter_lazy_mmu_mode():
This mode can only be entered and left under the protection of
the page table locks for all page tables which may be modified.
On s390 if I make arch_enter_lazy_mmu_mode() -> preempt_enable() and
arch_leave_lazy_mmu_mode() -> preempt_disable() I am getting this:
[ 553.332108] preempt_count: 1, expected: 0
[ 553.332117] no locks held by multipathd/2116.
[ 553.332128] CPU: 24 PID: 2116 Comm: multipathd Kdump: loaded Tainted:
[ 553.332139] Hardware name: IBM 3931 A01 701 (LPAR)
[ 553.332146] Call Trace:
[ 553.332152] [<00000000158de23a>] dump_stack_lvl+0xfa/0x150
[ 553.332167] [<0000000013e10d12>] __might_resched+0x57a/0x5e8
[ 553.332178] [<00000000144eb6c2>] __alloc_pages+0x2ba/0x7c0
[ 553.332189] [<00000000144d5cdc>] __get_free_pages+0x2c/0x88
[ 553.332198] [<00000000145663f6>] kasan_populate_vmalloc_pte+0x4e/0x110
[ 553.332207] [<000000001447625c>] apply_to_pte_range+0x164/0x3c8
[ 553.332218] [<000000001448125a>] apply_to_pmd_range+0xda/0x318
[ 553.332226] [<000000001448181c>] __apply_to_page_range+0x384/0x768
[ 553.332233] [<0000000014481c28>] apply_to_page_range+0x28/0x38
[ 553.332241] [<00000000145665da>] kasan_populate_vmalloc+0x82/0x98
[ 553.332249] [<00000000144c88d0>] alloc_vmap_area+0x590/0x1c90
[ 553.332257] [<00000000144ca108>] __get_vm_area_node.constprop.0+0x138/0x260
[ 553.332265] [<00000000144d17fc>] __vmalloc_node_range+0x134/0x360
[ 553.332274] [<0000000013d5dbf2>] alloc_thread_stack_node+0x112/0x378
[ 553.332284] [<0000000013d62726>] dup_task_struct+0x66/0x430
[ 553.332293] [<0000000013d63962>] copy_process+0x432/0x4b80
[ 553.332302] [<0000000013d68300>] kernel_clone+0xf0/0x7d0
[ 553.332311] [<0000000013d68bd6>] __do_sys_clone+0xae/0xc8
[ 553.332400] [<0000000013d68dee>] __s390x_sys_clone+0xd6/0x118
[ 553.332410] [<0000000013c9d34c>] do_syscall+0x22c/0x328
[ 553.332419] [<00000000158e7366>] __do_syscall+0xce/0xf0
[ 553.332428] [<0000000015913260>] system_call+0x70/0x98
This exposes a KASAN issue fixed with patch 1 and apply_to_pte_range()
issue fixed with patch 3, while patch 2 is a prerequisite.
Commit b9ef323ea168 ("powerpc/64s: Disable preemption in hash lazy mmu
mode") looks like powerpc-only fix, yet not entirely conforming to the
above provided requirement (page tables itself are still not protected).
If I am not mistaken, xen and sparc are alike.
This patch (of 3):
apply_to_page_range() enters lazy MMU mode and then invokes
kasan_populate_vmalloc_pte() callback on each page table walk iteration.
The lazy MMU mode may only be entered only under protection of the page
table lock. However, the callback can go into sleep when trying to
allocate a single page.
Change __get_free_page() allocation mode from GFP_KERNEL to GFP_ATOMIC to
avoid scheduling out while in atomic context.
Link: https://lkml.kernel.org/r/cover.1744128123.git.agordeev@linux.ibm.com
Link: https://lkml.kernel.org/r/2d9f4ac4528701b59d511a379a60107fa608ad30.17441281…
Fixes: 3c5c3cfb9ef4 ("kasan: support backing vmalloc space with real shadow memory")
Signed-off-by: Alexander Gordeev <agordeev(a)linux.ibm.com>
Cc: Andrey Ryabinin <ryabinin.a.a(a)gmail.com>
Cc: Guenetr Roeck <linux(a)roeck-us.net>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Jeremy Fitzhardinge <jeremy(a)goop.org>
Cc: Juegren Gross <jgross(a)suse.com>
Cc: Nicholas Piggin <npiggin(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/kasan/shadow.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/mm/kasan/shadow.c~kasan-avoid-sleepable-page-allocation-from-atomic-context
+++ a/mm/kasan/shadow.c
@@ -301,7 +301,7 @@ static int kasan_populate_vmalloc_pte(pt
if (likely(!pte_none(ptep_get(ptep))))
return 0;
- page = __get_free_page(GFP_KERNEL);
+ page = __get_free_page(GFP_ATOMIC);
if (!page)
return -ENOMEM;
_
Patches currently in -mm which might be from agordeev(a)linux.ibm.com are
mm-cleanup-apply_to_pte_range-routine.patch
mm-protect-kernel-pgtables-in-apply_to_pte_range.patch
Hello,
I'm investigating if v5.15 and early versions are vulnerable to the following CVEs. Could you please help confirm the following cases?
For CVE-2024-36912, the suggested fix is 211f514ebf1e ("Drivers: hv: vmbus: Track decrypted status in vmbus_gpadl") according to https://www.cve.org/CVERecord?id=CVE-2024-36912
It seems 211f514ebf1e is based on d4dccf353db8 ("Drivers: hv: vmbus: Mark vmbus ring buffer visible to host in Isolation VM") which was introduced since v5.16. For v5.15 and early versions, vmbus ring buffer hadn't been made visible to host, so there's no need to backport 211f514ebf1e to those versions, right?
For CVE-2024-36913, the suggested fix is 03f5a999adba ("Drivers: hv: vmbus: Leak pages if set_memory_encrypted() fails") according to https://www.cve.org/CVERecord?id=CVE-2024-36913
It seems 03f5a999adba is based on f2f136c05fb6 ("Drivers: hv: vmbus: Add SNP support for VMbus channel initiate message") which was introduced since v5.16. For v5.15 and early verions, monitor pages hadn't been made visible to host, so there's no need to backport 03f5a999adba to those versions, right?
Thanks,
Zhe
This series backports some recent fixes for SVE/KVM interactions from
Mark Rutland to v5.15.
Signed-off-by: Mark Brown <broonie(a)kernel.org>
---
Changes in v3:
- Explicitly include "KVM: arm64: Always start with clearing SVE flag on
load", it was included previously as part of a conflcit resolution.
- Link to v2: https://lore.kernel.org/r/20250403-stable-sve-5-15-v2-0-30a36a78a20a@kernel…
Changes in v2:
- Resend with Greg and the stable list added.
- Link to v1: https://lore.kernel.org/r/20250402-stable-sve-5-15-v1-0-84d0e5ff1102@kernel…
---
Fuad Tabba (1):
KVM: arm64: Calculate cptr_el2 traps on activating traps
Marc Zyngier (2):
KVM: arm64: Get rid of host SVE tracking/saving
KVM: arm64: Always start with clearing SVE flag on load
Mark Brown (4):
KVM: arm64: Discard any SVE state when entering KVM guests
arm64/fpsimd: Track the saved FPSIMD state type separately to TIF_SVE
arm64/fpsimd: Have KVM explicitly say which FP registers to save
arm64/fpsimd: Stop using TIF_SVE to manage register saving in KVM
Mark Rutland (4):
KVM: arm64: Unconditionally save+flush host FPSIMD/SVE/SME state
KVM: arm64: Remove host FPSIMD saving for non-protected KVM
KVM: arm64: Remove VHE host restore of CPACR_EL1.ZEN
KVM: arm64: Eagerly switch ZCR_EL{1,2}
arch/arm64/include/asm/fpsimd.h | 4 +-
arch/arm64/include/asm/kvm_host.h | 17 +++--
arch/arm64/include/asm/kvm_hyp.h | 7 ++
arch/arm64/include/asm/processor.h | 7 ++
arch/arm64/kernel/fpsimd.c | 117 +++++++++++++++++++++++---------
arch/arm64/kernel/process.c | 3 +
arch/arm64/kernel/ptrace.c | 3 +
arch/arm64/kernel/signal.c | 3 +
arch/arm64/kvm/arm.c | 1 -
arch/arm64/kvm/fpsimd.c | 72 +++++++++-----------
arch/arm64/kvm/hyp/entry.S | 5 ++
arch/arm64/kvm/hyp/include/hyp/switch.h | 86 +++++++++++++++--------
arch/arm64/kvm/hyp/nvhe/hyp-main.c | 9 ++-
arch/arm64/kvm/hyp/nvhe/switch.c | 52 +++++++++-----
arch/arm64/kvm/hyp/vhe/switch.c | 4 ++
arch/arm64/kvm/reset.c | 3 +
16 files changed, 266 insertions(+), 127 deletions(-)
---
base-commit: 0c935c049b5c196b83b968c72d348ae6fff83ea2
change-id: 20250326-stable-sve-5-15-bfd75482dcfa
Best regards,
--
Mark Brown <broonie(a)kernel.org>
From: Chris Wilson <chris.p.wilson(a)intel.com>
commit 78a033433a5ae4fee85511ee075bc9a48312c79e upstream.
If we abort driver initialisation in the middle of gt/engine discovery,
some engines will be fully setup and some not. Those incompletely setup
engines only have 'engine->release == NULL' and so will leak any of the
common objects allocated.
v2:
- Drop the destroy_pinned_context() helper for now. It's not really
worth it with just a single callsite at the moment. (Janusz)
Signed-off-by: Chris Wilson <chris.p.wilson(a)intel.com>
Cc: Janusz Krzysztofik <janusz.krzysztofik(a)linux.intel.com>
Signed-off-by: Matt Roper <matthew.d.roper(a)intel.com>
Reviewed-by: Janusz Krzysztofik <janusz.krzysztofik(a)linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20220915232654.3283095-2-matt…
Signed-off-by: Zhi Yang <Zhi.Yang(a)windriver.com>
Signed-off-by: He Zhe <zhe.he(a)windriver.com>
---
Build test passed.
---
drivers/gpu/drm/i915/gt/intel_engine_cs.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index a19537706ed1..eb6f4d7f1e34 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -904,8 +904,13 @@ int intel_engines_init(struct intel_gt *gt)
return err;
err = setup(engine);
- if (err)
+ if (err) {
+ intel_engine_cleanup_common(engine);
return err;
+ }
+
+ /* The backend should now be responsible for cleanup */
+ GEM_BUG_ON(engine->release == NULL);
err = engine_init_common(engine);
if (err)
--
2.34.1
This series adds fine grained trap control in EL2 required for FEAT_PMUv3p9
registers like PMICNTR_EL0, PMICFILTR_EL0, and PMUACR_EL1 which are already
being used in the kernel. This is required to prevent their EL1 access trap
into EL2.
The following commits that enabled access into FEAT_PMUv3p9 registers have
already been merged upstream from 6.13 onwards.
d8226d8cfbaf ("perf: arm_pmuv3: Add support for Armv9.4 PMU instruction counter")
0bbff9ed8165 ("perf/arm_pmuv3: Add PMUv3.9 per counter EL0 access control")
The sysreg patches in this series are required for the final patch which
fixes the actual problem.
Anshuman Khandual (7):
arm64/sysreg: Update register fields for ID_AA64MMFR0_EL1
arm64/sysreg: Add register fields for HDFGRTR2_EL2
arm64/sysreg: Add register fields for HDFGWTR2_EL2
arm64/sysreg: Add register fields for HFGITR2_EL2
arm64/sysreg: Add register fields for HFGRTR2_EL2
arm64/sysreg: Add register fields for HFGWTR2_EL2
arm64/boot: Enable EL2 requirements for FEAT_PMUv3p9
Documentation/arch/arm64/booting.rst | 22 ++++++
arch/arm64/include/asm/el2_setup.h | 25 +++++++
arch/arm64/tools/sysreg | 103 +++++++++++++++++++++++++++
3 files changed, 150 insertions(+)
--
2.30.2
From: Chris Wilson <chris.p.wilson(a)intel.com>
commit 78a033433a5ae4fee85511ee075bc9a48312c79e upstream.
If we abort driver initialisation in the middle of gt/engine discovery,
some engines will be fully setup and some not. Those incompletely setup
engines only have 'engine->release == NULL' and so will leak any of the
common objects allocated.
v2:
- Drop the destroy_pinned_context() helper for now. It's not really
worth it with just a single callsite at the moment. (Janusz)
Signed-off-by: Chris Wilson <chris.p.wilson(a)intel.com>
Cc: Janusz Krzysztofik <janusz.krzysztofik(a)linux.intel.com>
Signed-off-by: Matt Roper <matthew.d.roper(a)intel.com>
Reviewed-by: Janusz Krzysztofik <janusz.krzysztofik(a)linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20220915232654.3283095-2-matt…
Signed-off-by: Zhi Yang <Zhi.Yang(a)windriver.com>
Signed-off-by: He Zhe <zhe.he(a)windriver.com>
---
Build test passed.
---
drivers/gpu/drm/i915/gt/intel_engine_cs.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index eb99441e0ada..42cb3ad04d89 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -983,8 +983,13 @@ int intel_engines_init(struct intel_gt *gt)
return err;
err = setup(engine);
- if (err)
+ if (err) {
+ intel_engine_cleanup_common(engine);
return err;
+ }
+
+ /* The backend should now be responsible for cleanup */
+ GEM_BUG_ON(engine->release == NULL);
err = engine_init_common(engine);
if (err)
--
2.34.1
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 9f98a4f4e7216dbe366010b4cdcab6b220f229c4
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040844-busload-dumpling-45ff@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 9f98a4f4e7216dbe366010b4cdcab6b220f229c4 Mon Sep 17 00:00:00 2001
From: Vishal Annapurve <vannapurve(a)google.com>
Date: Fri, 28 Feb 2025 01:44:15 +0000
Subject: [PATCH] x86/tdx: Fix arch_safe_halt() execution for TDX VMs
Direct HLT instruction execution causes #VEs for TDX VMs which is routed
to hypervisor via TDCALL. If HLT is executed in STI-shadow, resulting #VE
handler will enable interrupts before TDCALL is routed to hypervisor
leading to missed wakeup events, as current TDX spec doesn't expose
interruptibility state information to allow #VE handler to selectively
enable interrupts.
Commit bfe6ed0c6727 ("x86/tdx: Add HLT support for TDX guests")
prevented the idle routines from executing HLT instruction in STI-shadow.
But it missed the paravirt routine which can be reached via this path
as an example:
kvm_wait() =>
safe_halt() =>
raw_safe_halt() =>
arch_safe_halt() =>
irq.safe_halt() =>
pv_native_safe_halt()
To reliably handle arch_safe_halt() for TDX VMs, introduce explicit
dependency on CONFIG_PARAVIRT and override paravirt halt()/safe_halt()
routines with TDX-safe versions that execute direct TDCALL and needed
interrupt flag updates. Executing direct TDCALL brings in additional
benefit of avoiding HLT related #VEs altogether.
As tested by Ryan Afranji:
"Tested with the specjbb2015 benchmark. It has heavy lock contention which leads
to many halt calls. TDX VMs suffered a poor score before this patchset.
Verified the major performance improvement with this patchset applied."
Fixes: bfe6ed0c6727 ("x86/tdx: Add HLT support for TDX guests")
Signed-off-by: Vishal Annapurve <vannapurve(a)google.com>
Signed-off-by: Ingo Molnar <mingo(a)kernel.org>
Reviewed-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Tested-by: Ryan Afranji <afranji(a)google.com>
Cc: Andy Lutomirski <luto(a)kernel.org>
Cc: Brian Gerst <brgerst(a)gmail.com>
Cc: Juergen Gross <jgross(a)suse.com>
Cc: H. Peter Anvin <hpa(a)zytor.com>
Cc: Linus Torvalds <torvalds(a)linux-foundation.org>
Cc: Josh Poimboeuf <jpoimboe(a)redhat.com>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20250228014416.3925664-3-vannapurve@google.com
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 05b4eca156cf..f614c0522a0b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -878,6 +878,7 @@ config INTEL_TDX_GUEST
depends on X86_64 && CPU_SUP_INTEL
depends on X86_X2APIC
depends on EFI_STUB
+ depends on PARAVIRT
select ARCH_HAS_CC_PLATFORM
select X86_MEM_ENCRYPT
select X86_MCE
diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index 7772b01ab738..aa0eb4057226 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -14,6 +14,7 @@
#include <asm/ia32.h>
#include <asm/insn.h>
#include <asm/insn-eval.h>
+#include <asm/paravirt_types.h>
#include <asm/pgtable.h>
#include <asm/set_memory.h>
#include <asm/traps.h>
@@ -398,7 +399,7 @@ static int handle_halt(struct ve_info *ve)
return ve_instr_len(ve);
}
-void __cpuidle tdx_safe_halt(void)
+void __cpuidle tdx_halt(void)
{
const bool irq_disabled = false;
@@ -409,6 +410,16 @@ void __cpuidle tdx_safe_halt(void)
WARN_ONCE(1, "HLT instruction emulation failed\n");
}
+static void __cpuidle tdx_safe_halt(void)
+{
+ tdx_halt();
+ /*
+ * "__cpuidle" section doesn't support instrumentation, so stick
+ * with raw_* variant that avoids tracing hooks.
+ */
+ raw_local_irq_enable();
+}
+
static int read_msr(struct pt_regs *regs, struct ve_info *ve)
{
struct tdx_module_args args = {
@@ -1109,6 +1120,19 @@ void __init tdx_early_init(void)
x86_platform.guest.enc_kexec_begin = tdx_kexec_begin;
x86_platform.guest.enc_kexec_finish = tdx_kexec_finish;
+ /*
+ * Avoid "sti;hlt" execution in TDX guests as HLT induces a #VE that
+ * will enable interrupts before HLT TDCALL invocation if executed
+ * in STI-shadow, possibly resulting in missed wakeup events.
+ *
+ * Modify all possible HLT execution paths to use TDX specific routines
+ * that directly execute TDCALL and toggle the interrupt state as
+ * needed after TDCALL completion. This also reduces HLT related #VEs
+ * in addition to having a reliable halt logic execution.
+ */
+ pv_ops.irq.safe_halt = tdx_safe_halt;
+ pv_ops.irq.halt = tdx_halt;
+
/*
* TDX intercepts the RDMSR to read the X2APIC ID in the parallel
* bringup low level code. That raises #VE which cannot be handled
diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index 65394aa9b49f..4a1922ec80cf 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -58,7 +58,7 @@ void tdx_get_ve_info(struct ve_info *ve);
bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve);
-void tdx_safe_halt(void);
+void tdx_halt(void);
bool tdx_early_handle_ve(struct pt_regs *regs);
@@ -72,7 +72,7 @@ void __init tdx_dump_td_ctls(u64 td_ctls);
#else
static inline void tdx_early_init(void) { };
-static inline void tdx_safe_halt(void) { };
+static inline void tdx_halt(void) { };
static inline bool tdx_early_handle_ve(struct pt_regs *regs) { return false; }
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 91f6ff618852..962c3ce39323 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -939,7 +939,7 @@ void __init select_idle_routine(void)
static_call_update(x86_idle, mwait_idle);
} else if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) {
pr_info("using TDX aware idle routine\n");
- static_call_update(x86_idle, tdx_safe_halt);
+ static_call_update(x86_idle, tdx_halt);
} else {
static_call_update(x86_idle, default_idle);
}
From: Srinivasan Shanmugam <srinivasan.shanmugam(a)amd.com>
commit 15c2990e0f0108b9c3752d7072a97d45d4283aea upstream.
This commit adds null checks for the 'stream' and 'plane' variables in
the dcn30_apply_idle_power_optimizations function. These variables were
previously assumed to be null at line 922, but they were used later in
the code without checking if they were null. This could potentially lead
to a null pointer dereference, which would cause a crash.
The null checks ensure that 'stream' and 'plane' are not null before
they are used, preventing potential crashes.
Fixes the below static smatch checker:
drivers/gpu/drm/amd/amdgpu/../display/dc/hwss/dcn30/dcn30_hwseq.c:938 dcn30_apply_idle_power_optimizations() error: we previously assumed 'stream' could be null (see line 922)
drivers/gpu/drm/amd/amdgpu/../display/dc/hwss/dcn30/dcn30_hwseq.c:940 dcn30_apply_idle_power_optimizations() error: we previously assumed 'plane' could be null (see line 922)
Cc: Tom Chung <chiahsuan.chung(a)amd.com>
Cc: Nicholas Kazlauskas <nicholas.kazlauskas(a)amd.com>
Cc: Bhawanpreet Lakha <Bhawanpreet.Lakha(a)amd.com>
Cc: Rodrigo Siqueira <Rodrigo.Siqueira(a)amd.com>
Cc: Roman Li <roman.li(a)amd.com>
Cc: Hersen Wu <hersenxs.wu(a)amd.com>
Cc: Alex Hung <alex.hung(a)amd.com>
Cc: Aurabindo Pillai <aurabindo.pillai(a)amd.com>
Cc: Harry Wentland <harry.wentland(a)amd.com>
Signed-off-by: Srinivasan Shanmugam <srinivasan.shanmugam(a)amd.com>
Reviewed-by: Aurabindo Pillai <aurabindo.pillai(a)amd.com>
Signed-off-by: Alex Deucher <alexander.deucher(a)amd.com>
Signed-off-by: Zhi Yang <Zhi.Yang(a)windriver.com>
Signed-off-by: He Zhe <zhe.he(a)windriver.com>
---
Build test passed.
---
drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hwseq.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hwseq.c b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hwseq.c
index 81547178a934..30716b88136c 100644
--- a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hwseq.c
+++ b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hwseq.c
@@ -784,6 +784,9 @@ bool dcn30_apply_idle_power_optimizations(struct dc *dc, bool enable)
stream = dc->current_state->streams[0];
plane = (stream ? dc->current_state->stream_status[0].plane_states[0] : NULL);
+ if (!stream || !plane)
+ return false;
+
if (stream && plane) {
cursor_cache_enable = stream->cursor_position.enable &&
plane->address.grph.cursor_cache_addr.quad_part;
--
2.34.1
The patch below does not apply to the 6.13-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.13.y
git checkout FETCH_HEAD
git cherry-pick -x dd4f730b557ce701a2cd4f604bf1e57667bd8b6e
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040803-womb-decorated-10be@gregkh' --subject-prefix 'PATCH 6.13.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From dd4f730b557ce701a2cd4f604bf1e57667bd8b6e Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <nathan(a)kernel.org>
Date: Mon, 10 Feb 2025 21:28:25 -0500
Subject: [PATCH] ACPI: platform-profile: Fix CFI violation when accessing
sysfs files
When an attribute group is created with sysfs_create_group(), the
->sysfs_ops() callback is set to kobj_sysfs_ops, which sets the ->show()
and ->store() callbacks to kobj_attr_show() and kobj_attr_store()
respectively. These functions use container_of() to get the respective
callback from the passed attribute, meaning that these callbacks need to
be of the same type as the callbacks in 'struct kobj_attribute'.
However, ->show() and ->store() in the platform_profile driver are
defined for struct device_attribute with the help of DEVICE_ATTR_RO()
and DEVICE_ATTR_RW(), which results in a CFI violation when accessing
platform_profile or platform_profile_choices under /sys/firmware/acpi
because the types do not match:
CFI failure at kobj_attr_show+0x19/0x30 (target: platform_profile_choices_show+0x0/0x140; expected type: 0x7a69590c)
There is no functional issue from the type mismatch because the layout
of 'struct kobj_attribute' and 'struct device_attribute' are the same,
so the container_of() cast does not break anything aside from CFI.
Change the type of platform_profile_choices_show() and
platform_profile_{show,store}() to match the callbacks in
'struct kobj_attribute' and update the attribute variables to
match, which resolves the CFI violation.
Cc: All applicable <stable(a)vger.kernel.org>
Fixes: a2ff95e018f1 ("ACPI: platform: Add platform profile support")
Reported-by: John Rowley <lkml(a)johnrowley.me>
Closes: https://github.com/ClangBuiltLinux/linux/issues/2047
Tested-by: John Rowley <lkml(a)johnrowley.me>
Reviewed-by: Sami Tolvanen <samitolvanen(a)google.com>
Signed-off-by: Nathan Chancellor <nathan(a)kernel.org>
Acked-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Reviewed-by: Mark Pearson <mpearson-lenovo(a)squebb.ca>
Tested-by: Mark Pearson <mpearson-lenovo(a)squebb.ca>
Link: https://patch.msgid.link/20250210-acpi-platform_profile-fix-cfi-violation-v…
[ rjw: Changelog edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki(a)intel.com>
diff --git a/drivers/acpi/platform_profile.c b/drivers/acpi/platform_profile.c
index fc92e43d0fe9..1b6317f759f9 100644
--- a/drivers/acpi/platform_profile.c
+++ b/drivers/acpi/platform_profile.c
@@ -260,14 +260,14 @@ static int _aggregate_choices(struct device *dev, void *data)
/**
* platform_profile_choices_show - Show the available profile choices for legacy sysfs interface
- * @dev: The device
+ * @kobj: The kobject
* @attr: The attribute
* @buf: The buffer to write to
*
* Return: The number of bytes written
*/
-static ssize_t platform_profile_choices_show(struct device *dev,
- struct device_attribute *attr,
+static ssize_t platform_profile_choices_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
char *buf)
{
unsigned long aggregate[BITS_TO_LONGS(PLATFORM_PROFILE_LAST)];
@@ -333,14 +333,14 @@ static int _store_and_notify(struct device *dev, void *data)
/**
* platform_profile_show - Show the current profile for legacy sysfs interface
- * @dev: The device
+ * @kobj: The kobject
* @attr: The attribute
* @buf: The buffer to write to
*
* Return: The number of bytes written
*/
-static ssize_t platform_profile_show(struct device *dev,
- struct device_attribute *attr,
+static ssize_t platform_profile_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
char *buf)
{
enum platform_profile_option profile = PLATFORM_PROFILE_LAST;
@@ -362,15 +362,15 @@ static ssize_t platform_profile_show(struct device *dev,
/**
* platform_profile_store - Set the profile for legacy sysfs interface
- * @dev: The device
+ * @kobj: The kobject
* @attr: The attribute
* @buf: The buffer to read from
* @count: The number of bytes to read
*
* Return: The number of bytes read
*/
-static ssize_t platform_profile_store(struct device *dev,
- struct device_attribute *attr,
+static ssize_t platform_profile_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
const char *buf, size_t count)
{
unsigned long choices[BITS_TO_LONGS(PLATFORM_PROFILE_LAST)];
@@ -401,12 +401,12 @@ static ssize_t platform_profile_store(struct device *dev,
return count;
}
-static DEVICE_ATTR_RO(platform_profile_choices);
-static DEVICE_ATTR_RW(platform_profile);
+static struct kobj_attribute attr_platform_profile_choices = __ATTR_RO(platform_profile_choices);
+static struct kobj_attribute attr_platform_profile = __ATTR_RW(platform_profile);
static struct attribute *platform_profile_attrs[] = {
- &dev_attr_platform_profile_choices.attr,
- &dev_attr_platform_profile.attr,
+ &attr_platform_profile_choices.attr,
+ &attr_platform_profile.attr,
NULL
};
From: Luiz Augusto von Dentz <luiz.von.dentz(a)intel.com>
commit b25e11f978b63cb7857890edb3a698599cddb10e upstream.
This aligned BR/EDR JUST_WORKS method with LE which since 92516cd97fd4
("Bluetooth: Always request for user confirmation for Just Works")
always request user confirmation with confirm_hint set since the
likes of bluetoothd have dedicated policy around JUST_WORKS method
(e.g. main.conf:JustWorksRepairing).
CVE: CVE-2024-8805
Cc: stable(a)vger.kernel.org
Fixes: ba15a58b179e ("Bluetooth: Fix SSP acceptor just-works confirmation without MITM")
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz(a)intel.com>
Tested-by: Kiran K <kiran.k(a)intel.com>
Signed-off-by: Xiangyu Chen <xiangyu.chen(a)windriver.com>
Signed-off-by: He Zhe <zhe.he(a)windriver.com>
---
Verified the build test.
---
net/bluetooth/hci_event.c | 13 +++++--------
1 file changed, 5 insertions(+), 8 deletions(-)
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 50e21f67a73d..83af50c3838a 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -4859,19 +4859,16 @@ static void hci_user_confirm_request_evt(struct hci_dev *hdev,
goto unlock;
}
- /* If no side requires MITM protection; auto-accept */
+ /* If no side requires MITM protection; use JUST_CFM method */
if ((!loc_mitm || conn->remote_cap == HCI_IO_NO_INPUT_OUTPUT) &&
(!rem_mitm || conn->io_capability == HCI_IO_NO_INPUT_OUTPUT)) {
- /* If we're not the initiators request authorization to
- * proceed from user space (mgmt_user_confirm with
- * confirm_hint set to 1). The exception is if neither
- * side had MITM or if the local IO capability is
- * NoInputNoOutput, in which case we do auto-accept
+ /* If we're not the initiator of request authorization and the
+ * local IO capability is not NoInputNoOutput, use JUST_WORKS
+ * method (mgmt_user_confirm with confirm_hint set to 1).
*/
if (!test_bit(HCI_CONN_AUTH_PEND, &conn->flags) &&
- conn->io_capability != HCI_IO_NO_INPUT_OUTPUT &&
- (loc_mitm || rem_mitm)) {
+ conn->io_capability != HCI_IO_NO_INPUT_OUTPUT) {
BT_DBG("Confirming auto-accept as acceptor");
confirm_hint = 1;
goto confirm;
--
2.34.1
The patch below does not apply to the 6.13-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.13.y
git checkout FETCH_HEAD
git cherry-pick -x e7607f7d6d81af71dcc5171278aadccc94d277cd
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040805-boaster-hazing-36c3@gregkh' --subject-prefix 'PATCH 6.13.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From e7607f7d6d81af71dcc5171278aadccc94d277cd Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <nathan(a)kernel.org>
Date: Thu, 20 Mar 2025 22:33:49 +0100
Subject: [PATCH] ARM: 9443/1: Require linker to support KEEP within OVERLAY
for DCE
ld.lld prior to 21.0.0 does not support using the KEEP keyword within an
overlay description, which may be needed to avoid discarding necessary
sections within an overlay with '--gc-sections', which can be enabled
for the kernel via CONFIG_LD_DEAD_CODE_DATA_ELIMINATION.
Disallow CONFIG_LD_DEAD_CODE_DATA_ELIMINATION without support for KEEP
within OVERLAY and introduce a macro, OVERLAY_KEEP, that can be used to
conditionally add KEEP when it is properly supported to avoid breaking
old versions of ld.lld.
Cc: stable(a)vger.kernel.org
Link: https://github.com/llvm/llvm-project/commit/381599f1fe973afad3094e55ec99b16…
Reviewed-by: Linus Walleij <linus.walleij(a)linaro.org>
Signed-off-by: Nathan Chancellor <nathan(a)kernel.org>
Signed-off-by: Russell King (Oracle) <rmk+kernel(a)armlinux.org.uk>
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 202dbd17ad2f..25ed6f1a7c7a 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -121,7 +121,7 @@ config ARM
select HAVE_KERNEL_XZ
select HAVE_KPROBES if !XIP_KERNEL && !CPU_ENDIAN_BE32 && !CPU_V7M
select HAVE_KRETPROBES if HAVE_KPROBES
- select HAVE_LD_DEAD_CODE_DATA_ELIMINATION if (LD_VERSION >= 23600 || LD_IS_LLD)
+ select HAVE_LD_DEAD_CODE_DATA_ELIMINATION if (LD_VERSION >= 23600 || LD_CAN_USE_KEEP_IN_OVERLAY)
select HAVE_MOD_ARCH_SPECIFIC
select HAVE_NMI
select HAVE_OPTPROBES if !THUMB2_KERNEL
diff --git a/arch/arm/include/asm/vmlinux.lds.h b/arch/arm/include/asm/vmlinux.lds.h
index 89697f204715..a54db342653a 100644
--- a/arch/arm/include/asm/vmlinux.lds.h
+++ b/arch/arm/include/asm/vmlinux.lds.h
@@ -34,6 +34,12 @@
#define NOCROSSREFS
#endif
+#ifdef CONFIG_LD_CAN_USE_KEEP_IN_OVERLAY
+#define OVERLAY_KEEP(x) KEEP(x)
+#else
+#define OVERLAY_KEEP(x) x
+#endif
+
/* Set start/end symbol names to the LMA for the section */
#define ARM_LMA(sym, section) \
sym##_start = LOADADDR(section); \
diff --git a/init/Kconfig b/init/Kconfig
index d0d021b3fa3b..fc994f5cd5db 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -129,6 +129,11 @@ config CC_HAS_COUNTED_BY
# https://github.com/llvm/llvm-project/pull/112636
depends on !(CC_IS_CLANG && CLANG_VERSION < 190103)
+config LD_CAN_USE_KEEP_IN_OVERLAY
+ # ld.lld prior to 21.0.0 did not support KEEP within an overlay description
+ # https://github.com/llvm/llvm-project/pull/130661
+ def_bool LD_IS_BFD || LLD_VERSION >= 210000
+
config RUSTC_HAS_COERCE_POINTEE
def_bool RUSTC_VERSION >= 108400
From: Chunguang Xu <chunguang.xu(a)shopee.com>
[ Upstream commit e5d574ab37f5f2e7937405613d9b1a724811e5ad ]
If a discard request needs to be retried, and that retry may fail before
a new special payload is added, a double free will result. Clear the
RQF_SPECIAL_LOAD when the request is cleaned.
Signed-off-by: Chunguang Xu <chunguang.xu(a)shopee.com>
Reviewed-by: Sagi Grimberg <sagi(a)grimberg.me>
Reviewed-by: Max Gurtovoy <mgurtovoy(a)nvidia.com>
Signed-off-by: Keith Busch <kbusch(a)kernel.org>
[Minor context change fixed]
Signed-off-by: Cliff Liu <donghua.liu(a)windriver.com>
Signed-off-by: He Zhe <Zhe.He(a)windriver.com>
---
Verified the build test.
---
drivers/nvme/host/core.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 019a6dbdcbc2..7d6aab68446e 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -852,6 +852,7 @@ void nvme_cleanup_cmd(struct request *req)
clear_bit_unlock(0, &ns->ctrl->discard_page_busy);
else
kfree(page_address(page) + req->special_vec.bv_offset);
+ req->rq_flags &= ~RQF_SPECIAL_PAYLOAD;
}
}
EXPORT_SYMBOL_GPL(nvme_cleanup_cmd);
--
2.34.1
Current xhci bus resume implementation prevents xHC host from generating
interrupts during high-speed USB 2 and super-speed USB 3 bus resume.
Only reason to disable interrupts during bus resume would be to prevent
the interrupt handler from interfering with the resume process of USB 2
ports.
Host initiated resume of USB 2 ports is done in two stages.
The xhci driver first transitions the port from 'U3' to 'Resume' state,
then wait in Resume for 20ms, and finally moves port to U0 state.
xhci driver can't prevent interrupts by keeping the xhci spinlock
due to this 20ms sleep.
Limit interrupt disabling to the USB 2 port resume case only.
resuming USB 2 ports in bus resume is only done in special cases where
USB 2 ports had to be forced to suspend during bus suspend.
The current way of preventing interrupts by clearing the 'Interrupt
Enable' (INTE) bit in USBCMD register won't prevent the Interrupter
registers 'Interrupt Pending' (IP), 'Event Handler Busy' (EHB) and
USBSTS register Event Interrupt (EINT) bits from being set.
New interrupts can't be issued before those bits are properly clered.
Disable interrupts by clearing the interrupter register 'Interrupt
Enable' (IE) bit instead. This way IP, EHB and INTE won't be set
before IE is enabled again and a new interrupt is triggered.
Reported-by: Devyn Liu <liudingyuan(a)huawei.com>
Closes: https://lore.kernel.org/linux-usb/b1a9e2d51b4d4ff7a304f77c5be8164e@huawei.c…
Cc: stable(a)vger.kernel.org
Tested-by: Devyn Liu <liudingyuan(a)huawei.com>
Signed-off-by: Mathias Nyman <mathias.nyman(a)linux.intel.com>
---
drivers/usb/host/xhci-hub.c | 30 ++++++++++++++++--------------
drivers/usb/host/xhci.c | 4 ++--
drivers/usb/host/xhci.h | 2 ++
3 files changed, 20 insertions(+), 16 deletions(-)
diff --git a/drivers/usb/host/xhci-hub.c b/drivers/usb/host/xhci-hub.c
index c0f226584a40..486347776cb2 100644
--- a/drivers/usb/host/xhci-hub.c
+++ b/drivers/usb/host/xhci-hub.c
@@ -1878,9 +1878,10 @@ int xhci_bus_resume(struct usb_hcd *hcd)
int max_ports, port_index;
int sret;
u32 next_state;
- u32 temp, portsc;
+ u32 portsc;
struct xhci_hub *rhub;
struct xhci_port **ports;
+ bool disabled_irq = false;
rhub = xhci_get_rhub(hcd);
ports = rhub->ports;
@@ -1896,17 +1897,20 @@ int xhci_bus_resume(struct usb_hcd *hcd)
return -ESHUTDOWN;
}
- /* delay the irqs */
- temp = readl(&xhci->op_regs->command);
- temp &= ~CMD_EIE;
- writel(temp, &xhci->op_regs->command);
-
/* bus specific resume for ports we suspended at bus_suspend */
- if (hcd->speed >= HCD_USB3)
+ if (hcd->speed >= HCD_USB3) {
next_state = XDEV_U0;
- else
+ } else {
next_state = XDEV_RESUME;
-
+ if (bus_state->bus_suspended) {
+ /*
+ * prevent port event interrupts from interfering
+ * with usb2 port resume process
+ */
+ xhci_disable_interrupter(xhci->interrupters[0]);
+ disabled_irq = true;
+ }
+ }
port_index = max_ports;
while (port_index--) {
portsc = readl(ports[port_index]->addr);
@@ -1974,11 +1978,9 @@ int xhci_bus_resume(struct usb_hcd *hcd)
(void) readl(&xhci->op_regs->command);
bus_state->next_statechange = jiffies + msecs_to_jiffies(5);
- /* re-enable irqs */
- temp = readl(&xhci->op_regs->command);
- temp |= CMD_EIE;
- writel(temp, &xhci->op_regs->command);
- temp = readl(&xhci->op_regs->command);
+ /* re-enable interrupter */
+ if (disabled_irq)
+ xhci_enable_interrupter(xhci->interrupters[0]);
spin_unlock_irqrestore(&xhci->lock, flags);
return 0;
diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c
index ca390beda85b..90eb491267b5 100644
--- a/drivers/usb/host/xhci.c
+++ b/drivers/usb/host/xhci.c
@@ -322,7 +322,7 @@ static void xhci_zero_64b_regs(struct xhci_hcd *xhci)
xhci_info(xhci, "Fault detected\n");
}
-static int xhci_enable_interrupter(struct xhci_interrupter *ir)
+int xhci_enable_interrupter(struct xhci_interrupter *ir)
{
u32 iman;
@@ -335,7 +335,7 @@ static int xhci_enable_interrupter(struct xhci_interrupter *ir)
return 0;
}
-static int xhci_disable_interrupter(struct xhci_interrupter *ir)
+int xhci_disable_interrupter(struct xhci_interrupter *ir)
{
u32 iman;
diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h
index 28b6264f8b87..242ab9fbc8ae 100644
--- a/drivers/usb/host/xhci.h
+++ b/drivers/usb/host/xhci.h
@@ -1890,6 +1890,8 @@ int xhci_alloc_tt_info(struct xhci_hcd *xhci,
struct usb_tt *tt, gfp_t mem_flags);
int xhci_set_interrupter_moderation(struct xhci_interrupter *ir,
u32 imod_interval);
+int xhci_enable_interrupter(struct xhci_interrupter *ir);
+int xhci_disable_interrupter(struct xhci_interrupter *ir);
/* xHCI ring, segment, TRB, and TD functions */
dma_addr_t xhci_trb_virt_to_dma(struct xhci_segment *seg, union xhci_trb *trb);
--
2.43.0
From: Michal Pecio <michal.pecio(a)gmail.com>
This check is performed before prepare_transfer() and prepare_ring(), so
enqueue can already point at the final link TRB of a segment. And indeed
it will, some 0.4% of times this code is called.
Then enqueue + 1 is an invalid pointer. It will crash the kernel right
away or load some junk which may look like a link TRB and cause the real
link TRB to be replaced with a NOOP. This wouldn't end well.
Use a functionally equivalent test which doesn't dereference the pointer
and always gives correct result.
Something has crashed my machine twice in recent days while playing with
an Etron HC, and a control transfer stress test ran for confirmation has
just crashed it again. The same test passes with this patch applied.
Fixes: 5e1c67abc930 ("xhci: Fix control transfer error on Etron xHCI host")
Cc: stable(a)vger.kernel.org
Signed-off-by: Michal Pecio <michal.pecio(a)gmail.com>
Signed-off-by: Mathias Nyman <mathias.nyman(a)linux.intel.com>
---
drivers/usb/host/xhci-ring.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c
index 4e975caca235..b906bc2eea5f 100644
--- a/drivers/usb/host/xhci-ring.c
+++ b/drivers/usb/host/xhci-ring.c
@@ -3777,7 +3777,7 @@ int xhci_queue_ctrl_tx(struct xhci_hcd *xhci, gfp_t mem_flags,
* enqueue a No Op TRB, this can prevent the Setup and Data Stage
* TRB to be breaked by the Link TRB.
*/
- if (trb_is_link(ep_ring->enqueue + 1)) {
+ if (last_trb_on_seg(ep_ring->enq_seg, ep_ring->enqueue + 1)) {
field = TRB_TYPE(TRB_TR_NOOP) | ep_ring->cycle_state;
queue_trb(xhci, ep_ring, false, 0, 0,
TRB_INTR_TARGET(0), field);
--
2.43.0
commit c929d08df8bee855528b9d15b853c892c54e1eee upstream.
If the warning mode with disabled mitigation mode is used, then on each
CPU where the split lock occurred detection will be disabled in order to
make progress and delayed work will be scheduled, which then will enable
detection back.
Now it turns out that all CPUs use one global delayed work structure.
This leads to the fact that if a split lock occurs on several CPUs
at the same time (within 2 jiffies), only one CPU will schedule delayed
work, but the rest will not.
The return value of schedule_delayed_work_on() would have shown this,
but it is not checked in the code.
A diagram that can help to understand the bug reproduction:
- sld_update_msr() enables/disables SLD on both CPUs on the same core
- schedule_delayed_work_on() internally checks WORK_STRUCT_PENDING_BIT.
If a work has the 'pending' status, then schedule_delayed_work_on()
will return an error code and, most importantly, the work will not
be placed in the workqueue.
Let's say we have a multicore system on which split_lock_mitigate=0 and
a multithreaded application is running that calls splitlock in multiple
threads. Due to the fact that sld_update_msr() affects the entire core
(both CPUs), we will consider 2 CPUs from different cores. Let the 2
threads of this application schedule to CPU0 (core 0) and to CPU 2
(core 1), then:
| || |
| CPU 0 (core 0) || CPU 2 (core 1) |
|_________________________________||___________________________________|
| || |
| 1) SPLIT LOCK occured || |
| || |
| 2) split_lock_warn() || |
| || |
| 3) sysctl_sld_mitigate == 0 || |
| (work = &sl_reenable) || |
| || |
| 4) schedule_delayed_work_on() || |
| (reenable will be called || |
| after 2 jiffies on CPU 0) || |
| || |
| 5) disable SLD for core 0 || |
| || |
| ------------------------- || |
| || |
| || 6) SPLIT LOCK occured |
| || |
| || 7) split_lock_warn() |
| || |
| || 8) sysctl_sld_mitigate == 0 |
| || (work = &sl_reenable, |
| || the same address as in 3) ) |
| || |
| 2 jiffies || 9) schedule_delayed_work_on() |
| || fials because the work is in |
| || the pending state since 4). |
| || The work wasn't placed to the |
| || workqueue. reenable won't be |
| || called on CPU 2 |
| || |
| || 10) disable SLD for core 0 |
| || |
| || From now on SLD will |
| || never be reenabled on core 1 |
| || |
| ------------------------- || |
| || |
| 11) enable SLD for core 0 by || |
| __split_lock_reenable || |
| || |
If the application threads can be scheduled to all processor cores,
then over time there will be only one core left, on which SLD will be
enabled and split lock will be able to be detected; and on all other
cores SLD will be disabled all the time.
Most likely, this bug has not been noticed for so long because
sysctl_sld_mitigate default value is 1, and in this case a semaphore
is used that does not allow 2 different cores to have SLD disabled at
the same time, that is, strictly only one work is placed in the
workqueue.
In order to fix the warning mode with disabled mitigation mode,
delayed work has to be per-CPU. Implement it.
Fixes: 727209376f49 ("x86/split_lock: Add sysctl to control the misery mode")
Signed-off-by: Maksim Davydov <davydov-max(a)yandex-team.ru>
Signed-off-by: Ingo Molnar <mingo(a)kernel.org>
Tested-by: Guilherme G. Piccoli <gpiccoli(a)igalia.com>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Ravi Bangoria <ravi.bangoria(a)amd.com>
Cc: Tom Lendacky <thomas.lendacky(a)amd.com>
Link: https://lore.kernel.org/r/20250115131704.132609-1-davydov-max@yandex-team.ru
---
arch/x86/kernel/cpu/intel.c | 20 ++++++++++++++++----
1 file changed, 16 insertions(+), 4 deletions(-)
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index b91f3d72bcdd..2c43a1423b09 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -1204,7 +1204,13 @@ static void __split_lock_reenable(struct work_struct *work)
{
sld_update_msr(true);
}
-static DECLARE_DELAYED_WORK(sl_reenable, __split_lock_reenable);
+/*
+ * In order for each CPU to schedule its delayed work independently of the
+ * others, delayed work struct must be per-CPU. This is not required when
+ * sysctl_sld_mitigate is enabled because of the semaphore that limits
+ * the number of simultaneously scheduled delayed works to 1.
+ */
+static DEFINE_PER_CPU(struct delayed_work, sl_reenable);
/*
* If a CPU goes offline with pending delayed work to re-enable split lock
@@ -1225,7 +1231,7 @@ static int splitlock_cpu_offline(unsigned int cpu)
static void split_lock_warn(unsigned long ip)
{
- struct delayed_work *work;
+ struct delayed_work *work = NULL;
int cpu;
if (!current->reported_split_lock)
@@ -1247,11 +1253,17 @@ static void split_lock_warn(unsigned long ip)
if (down_interruptible(&buslock_sem) == -EINTR)
return;
work = &sl_reenable_unlock;
- } else {
- work = &sl_reenable;
}
cpu = get_cpu();
+
+ if (!work) {
+ work = this_cpu_ptr(&sl_reenable);
+ /* Deferred initialization of per-CPU struct */
+ if (!work->work.func)
+ INIT_DELAYED_WORK(work, __split_lock_reenable);
+ }
+
schedule_delayed_work_on(cpu, work, 2);
/* Disable split lock detection on this CPU to make progress */
--
2.34.1
commit c929d08df8bee855528b9d15b853c892c54e1eee upstream.
If the warning mode with disabled mitigation mode is used, then on each
CPU where the split lock occurred detection will be disabled in order to
make progress and delayed work will be scheduled, which then will enable
detection back.
Now it turns out that all CPUs use one global delayed work structure.
This leads to the fact that if a split lock occurs on several CPUs
at the same time (within 2 jiffies), only one CPU will schedule delayed
work, but the rest will not.
The return value of schedule_delayed_work_on() would have shown this,
but it is not checked in the code.
A diagram that can help to understand the bug reproduction:
- sld_update_msr() enables/disables SLD on both CPUs on the same core
- schedule_delayed_work_on() internally checks WORK_STRUCT_PENDING_BIT.
If a work has the 'pending' status, then schedule_delayed_work_on()
will return an error code and, most importantly, the work will not
be placed in the workqueue.
Let's say we have a multicore system on which split_lock_mitigate=0 and
a multithreaded application is running that calls splitlock in multiple
threads. Due to the fact that sld_update_msr() affects the entire core
(both CPUs), we will consider 2 CPUs from different cores. Let the 2
threads of this application schedule to CPU0 (core 0) and to CPU 2
(core 1), then:
| || |
| CPU 0 (core 0) || CPU 2 (core 1) |
|_________________________________||___________________________________|
| || |
| 1) SPLIT LOCK occured || |
| || |
| 2) split_lock_warn() || |
| || |
| 3) sysctl_sld_mitigate == 0 || |
| (work = &sl_reenable) || |
| || |
| 4) schedule_delayed_work_on() || |
| (reenable will be called || |
| after 2 jiffies on CPU 0) || |
| || |
| 5) disable SLD for core 0 || |
| || |
| ------------------------- || |
| || |
| || 6) SPLIT LOCK occured |
| || |
| || 7) split_lock_warn() |
| || |
| || 8) sysctl_sld_mitigate == 0 |
| || (work = &sl_reenable, |
| || the same address as in 3) ) |
| || |
| 2 jiffies || 9) schedule_delayed_work_on() |
| || fials because the work is in |
| || the pending state since 4). |
| || The work wasn't placed to the |
| || workqueue. reenable won't be |
| || called on CPU 2 |
| || |
| || 10) disable SLD for core 0 |
| || |
| || From now on SLD will |
| || never be reenabled on core 1 |
| || |
| ------------------------- || |
| || |
| 11) enable SLD for core 0 by || |
| __split_lock_reenable || |
| || |
If the application threads can be scheduled to all processor cores,
then over time there will be only one core left, on which SLD will be
enabled and split lock will be able to be detected; and on all other
cores SLD will be disabled all the time.
Most likely, this bug has not been noticed for so long because
sysctl_sld_mitigate default value is 1, and in this case a semaphore
is used that does not allow 2 different cores to have SLD disabled at
the same time, that is, strictly only one work is placed in the
workqueue.
In order to fix the warning mode with disabled mitigation mode,
delayed work has to be per-CPU. Implement it.
Fixes: 727209376f49 ("x86/split_lock: Add sysctl to control the misery mode")
Signed-off-by: Maksim Davydov <davydov-max(a)yandex-team.ru>
Signed-off-by: Ingo Molnar <mingo(a)kernel.org>
Tested-by: Guilherme G. Piccoli <gpiccoli(a)igalia.com>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Ravi Bangoria <ravi.bangoria(a)amd.com>
Cc: Tom Lendacky <thomas.lendacky(a)amd.com>
Link: https://lore.kernel.org/r/20250115131704.132609-1-davydov-max@yandex-team.ru
---
arch/x86/kernel/cpu/intel.c | 20 ++++++++++++++++----
1 file changed, 16 insertions(+), 4 deletions(-)
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 38eeff91109f..6c0e0619e6d3 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -1168,7 +1168,13 @@ static void __split_lock_reenable(struct work_struct *work)
{
sld_update_msr(true);
}
-static DECLARE_DELAYED_WORK(sl_reenable, __split_lock_reenable);
+/*
+ * In order for each CPU to schedule its delayed work independently of the
+ * others, delayed work struct must be per-CPU. This is not required when
+ * sysctl_sld_mitigate is enabled because of the semaphore that limits
+ * the number of simultaneously scheduled delayed works to 1.
+ */
+static DEFINE_PER_CPU(struct delayed_work, sl_reenable);
/*
* If a CPU goes offline with pending delayed work to re-enable split lock
@@ -1189,7 +1195,7 @@ static int splitlock_cpu_offline(unsigned int cpu)
static void split_lock_warn(unsigned long ip)
{
- struct delayed_work *work;
+ struct delayed_work *work = NULL;
int cpu;
if (!current->reported_split_lock)
@@ -1211,11 +1217,17 @@ static void split_lock_warn(unsigned long ip)
if (down_interruptible(&buslock_sem) == -EINTR)
return;
work = &sl_reenable_unlock;
- } else {
- work = &sl_reenable;
}
cpu = get_cpu();
+
+ if (!work) {
+ work = this_cpu_ptr(&sl_reenable);
+ /* Deferred initialization of per-CPU struct */
+ if (!work->work.func)
+ INIT_DELAYED_WORK(work, __split_lock_reenable);
+ }
+
schedule_delayed_work_on(cpu, work, 2);
/* Disable split lock detection on this CPU to make progress */
--
2.34.1
startup()/shutdown() callbacks access SIFIVE_SERIAL_IE_OFFS.
The register is also accessed from write() callback.
If console were printing and startup()/shutdown() callback
gets called, its access to the register could be overwritten.
Add port->lock to startup()/shutdown() callbacks to make sure
their access to SIFIVE_SERIAL_IE_OFFS is synchronized against
write() callback.
Fixes: 45c054d0815b ("tty: serial: add driver for the SiFive UART")
Signed-off-by: Ryo Takakura <ryotkkr98(a)gmail.com>
Cc: stable(a)vger.kernel.org
---
This patch used be part of a series for converting sifive driver to
nbcon[0]. It's now sent seperatly as the rest of the series does not
need be applied to the stable branch.
Sincerely,
Ryo Takakura
[0] https://lore.kernel.org/all/20250405043833.397020-1-ryotkkr98@gmail.com/
---
drivers/tty/serial/sifive.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/drivers/tty/serial/sifive.c b/drivers/tty/serial/sifive.c
index 5904a2d4c..054a8e630 100644
--- a/drivers/tty/serial/sifive.c
+++ b/drivers/tty/serial/sifive.c
@@ -563,8 +563,11 @@ static void sifive_serial_break_ctl(struct uart_port *port, int break_state)
static int sifive_serial_startup(struct uart_port *port)
{
struct sifive_serial_port *ssp = port_to_sifive_serial_port(port);
+ unsigned long flags;
+ uart_port_lock_irqsave(&ssp->port, &flags);
__ssp_enable_rxwm(ssp);
+ uart_port_unlock_irqrestore(&ssp->port, flags);
return 0;
}
@@ -572,9 +575,12 @@ static int sifive_serial_startup(struct uart_port *port)
static void sifive_serial_shutdown(struct uart_port *port)
{
struct sifive_serial_port *ssp = port_to_sifive_serial_port(port);
+ unsigned long flags;
+ uart_port_lock_irqsave(&ssp->port, &flags);
__ssp_disable_rxwm(ssp);
__ssp_disable_txwm(ssp);
+ uart_port_unlock_irqrestore(&ssp->port, flags);
}
/**
--
2.34.1
In commit 7e119cff9d0a, "ocfs2: convert w_pages to w_folios" the
chunk page allocations became order 0 folio allocations. If an
allocation failed, the folio array entry should be NULL so the
error path can skip the entry. In the port it is -ENOMEM and
the error path panics trying to free this bad value.
Signed-off-by: Mark Tinguely <mark.tinguely(a)oracle.com>
Cc: stable(a)vger.kernel.org
Cc: Changwei Ge <gechangwei(a)live.cn>
Cc: Joel Becker <jlbec(a)evilplan.org>
Cc: Jun Piao <piaojun(a)huawei.com>
Cc: Junxiao Bi <junxiao.bi(a)oracle.com>
Cc: Mark Fasheh <mark(a)fasheh.com>
---
fs/ocfs2/aops.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 40b6bce12951..89aadc6cdd87 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1071,6 +1071,7 @@ static int ocfs2_grab_folios_for_write(struct
address_space *mapping,
if (IS_ERR(wc->w_folios[i])) {
ret = PTR_ERR(wc->w_folios[i]);
mlog_errno(ret);
+ wc->w_folios[i] = NULL;
goto out;
}
}
--
2.39.5 (Apple Git-154)
Hi,
I would like to request a backport of
8b36447c9ae1 ("ASoC: Intel: adl: add 2xrt1316 audio configuration")
to 6.12 LTS and 6.14 stable kernel as we have at least one affected user:
https://github.com/thesofproject/linux/issues/5274
The topology file has been already released.
Thanks,
Péter
The test robot identified c2f6ea38fc1b ("mm: page_alloc: don't steal
single pages from biggest buddy") as the root cause of a 56.4%
regression in vm-scalability::lru-file-mmap-read.
Carlos reports an earlier patch, c0cd6f557b90 ("mm: page_alloc: fix
freelist movement during block conversion"), as the root cause for a
regression in worst-case zone->lock+irqoff hold times.
Both of these patches modify the page allocator's fallback path to be
less greedy in an effort to stave off fragmentation. The flip side of
this is that fallbacks are also less productive each time around,
which means the fallback search can run much more frequently.
Carlos' traces point to rmqueue_bulk() specifically, which tries to
refill the percpu cache by allocating a large batch of pages in a
loop. It highlights how once the native freelists are exhausted, the
fallback code first scans orders top-down for whole blocks to claim,
then falls back to a bottom-up search for the smallest buddy to steal.
For the next batch page, it goes through the same thing again.
This can be made more efficient. Since rmqueue_bulk() holds the
zone->lock over the entire batch, the freelists are not subject to
outside changes; when the search for a block to claim has already
failed, there is no point in trying again for the next page.
Modify __rmqueue() to remember the last successful fallback mode, and
restart directly from there on the next rmqueue_bulk() iteration.
Oliver confirms that this improves beyond the regression that the test
robot reported against c2f6ea38fc1b:
commit:
f3b92176f4 ("tools/selftests: add guard region test for /proc/$pid/pagemap")
c2f6ea38fc ("mm: page_alloc: don't steal single pages from biggest buddy")
acc4d5ff0b ("Merge tag 'net-6.15-rc0' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net")
2c847f27c3 ("mm: page_alloc: speed up fallbacks in rmqueue_bulk()") <--- your patch
f3b92176f4f7100f c2f6ea38fc1b640aa7a2e155cc1 acc4d5ff0b61eb1715c498b6536 2c847f27c37da65a93d23c237c5
---------------- --------------------------- --------------------------- ---------------------------
%stddev %change %stddev %change %stddev %change %stddev
\ | \ | \ | \
25525364 ± 3% -56.4% 11135467 -57.8% 10779336 +31.6% 33581409 vm-scalability.throughput
Carlos confirms that worst-case times are almost fully recovered
compared to before the earlier culprit patch:
2dd482ba627d (before freelist hygiene): 1ms
c0cd6f557b90 (after freelist hygiene): 90ms
next-20250319 (steal smallest buddy): 280ms
this patch : 8ms
Reported-by: kernel test robot <oliver.sang(a)intel.com>
Reported-by: Carlos Song <carlos.song(a)nxp.com>
Tested-by: kernel test robot <oliver.sang(a)intel.com>
Fixes: c0cd6f557b90 ("mm: page_alloc: fix freelist movement during block conversion")
Fixes: c2f6ea38fc1b ("mm: page_alloc: don't steal single pages from biggest buddy")
Closes: https://lore.kernel.org/oe-lkp/202503271547.fc08b188-lkp@intel.com
Cc: stable(a)vger.kernel.org # 6.10+
Signed-off-by: Johannes Weiner <hannes(a)cmpxchg.org>
---
mm/page_alloc.c | 100 +++++++++++++++++++++++++++++++++++-------------
1 file changed, 74 insertions(+), 26 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f51aa6051a99..03b0d45ed45a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2194,11 +2194,11 @@ try_to_claim_block(struct zone *zone, struct page *page,
* The use of signed ints for order and current_order is a deliberate
* deviation from the rest of this file, to make the for loop
* condition simpler.
- *
- * Return the stolen page, or NULL if none can be found.
*/
+
+/* Try to claim a whole foreign block, take a page, expand the remainder */
static __always_inline struct page *
-__rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
+__rmqueue_claim(struct zone *zone, int order, int start_migratetype,
unsigned int alloc_flags)
{
struct free_area *area;
@@ -2236,14 +2236,26 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
page = try_to_claim_block(zone, page, current_order, order,
start_migratetype, fallback_mt,
alloc_flags);
- if (page)
- goto got_one;
+ if (page) {
+ trace_mm_page_alloc_extfrag(page, order, current_order,
+ start_migratetype, fallback_mt);
+ return page;
+ }
}
- if (alloc_flags & ALLOC_NOFRAGMENT)
- return NULL;
+ return NULL;
+}
+
+/* Try to steal a single page from a foreign block */
+static __always_inline struct page *
+__rmqueue_steal(struct zone *zone, int order, int start_migratetype)
+{
+ struct free_area *area;
+ int current_order;
+ struct page *page;
+ int fallback_mt;
+ bool claim_block;
- /* No luck claiming pageblock. Find the smallest fallback page */
for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
@@ -2253,25 +2265,28 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
page = get_page_from_free_area(area, fallback_mt);
page_del_and_expand(zone, page, order, current_order, fallback_mt);
- goto got_one;
+ trace_mm_page_alloc_extfrag(page, order, current_order,
+ start_migratetype, fallback_mt);
+ return page;
}
return NULL;
-
-got_one:
- trace_mm_page_alloc_extfrag(page, order, current_order,
- start_migratetype, fallback_mt);
-
- return page;
}
+enum rmqueue_mode {
+ RMQUEUE_NORMAL,
+ RMQUEUE_CMA,
+ RMQUEUE_CLAIM,
+ RMQUEUE_STEAL,
+};
+
/*
* Do the hard work of removing an element from the buddy allocator.
* Call me with the zone->lock already held.
*/
static __always_inline struct page *
__rmqueue(struct zone *zone, unsigned int order, int migratetype,
- unsigned int alloc_flags)
+ unsigned int alloc_flags, enum rmqueue_mode *mode)
{
struct page *page;
@@ -2290,16 +2305,47 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype,
}
}
- page = __rmqueue_smallest(zone, order, migratetype);
- if (unlikely(!page)) {
- if (alloc_flags & ALLOC_CMA)
+ /*
+ * Try the different freelists, native then foreign.
+ *
+ * The fallback logic is expensive and rmqueue_bulk() calls in
+ * a loop with the zone->lock held, meaning the freelists are
+ * not subject to any outside changes. Remember in *mode where
+ * we found pay dirt, to save us the search on the next call.
+ */
+ switch (*mode) {
+ case RMQUEUE_NORMAL:
+ page = __rmqueue_smallest(zone, order, migratetype);
+ if (page)
+ return page;
+ fallthrough;
+ case RMQUEUE_CMA:
+ if (alloc_flags & ALLOC_CMA) {
page = __rmqueue_cma_fallback(zone, order);
-
- if (!page)
- page = __rmqueue_fallback(zone, order, migratetype,
- alloc_flags);
+ if (page) {
+ *mode = RMQUEUE_CMA;
+ return page;
+ }
+ }
+ fallthrough;
+ case RMQUEUE_CLAIM:
+ page = __rmqueue_claim(zone, order, migratetype, alloc_flags);
+ if (page) {
+ /* Replenished native freelist, back to normal mode */
+ *mode = RMQUEUE_NORMAL;
+ return page;
+ }
+ fallthrough;
+ case RMQUEUE_STEAL:
+ if (!(alloc_flags & ALLOC_NOFRAGMENT)) {
+ page = __rmqueue_steal(zone, order, migratetype);
+ if (page) {
+ *mode = RMQUEUE_STEAL;
+ return page;
+ }
+ }
}
- return page;
+ return NULL;
}
/*
@@ -2311,6 +2357,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list,
int migratetype, unsigned int alloc_flags)
{
+ enum rmqueue_mode rmqm = RMQUEUE_NORMAL;
unsigned long flags;
int i;
@@ -2321,7 +2368,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
}
for (i = 0; i < count; ++i) {
struct page *page = __rmqueue(zone, order, migratetype,
- alloc_flags);
+ alloc_flags, &rmqm);
if (unlikely(page == NULL))
break;
@@ -2934,6 +2981,7 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
{
struct page *page;
unsigned long flags;
+ enum rmqueue_mode rmqm = RMQUEUE_NORMAL;
do {
page = NULL;
@@ -2945,7 +2993,7 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
if (alloc_flags & ALLOC_HIGHATOMIC)
page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
if (!page) {
- page = __rmqueue(zone, order, migratetype, alloc_flags);
+ page = __rmqueue(zone, order, migratetype, alloc_flags, &rmqm);
/*
* If the allocation fails, allow OOM handling and
--
2.49.0
From: yangge <yangge1116(a)126.com>
If a large number of CMA memory are configured in system (for example, the
CMA memory accounts for 50% of the system memory), starting a virtual
virtual machine with device passthrough, it will
call pin_user_pages_remote(..., FOLL_LONGTERM, ...) to pin memory.
Normally if a page is present and in CMA area, pin_user_pages_remote()
will migrate the page from CMA area to non-CMA area because of
FOLL_LONGTERM flag. But the current code will cause the migration failure
due to unexpected page refcounts, and eventually cause the virtual machine
fail to start.
If a page is added in LRU batch, its refcount increases one, remove the
page from LRU batch decreases one. Page migration requires the page is not
referenced by others except page mapping. Before migrating a page, we
should try to drain the page from LRU batch in case the page is in it,
however, folio_test_lru() is not sufficient to tell whether the page is
in LRU batch or not, if the page is in LRU batch, the migration will fail.
To solve the problem above, we modify the logic of adding to LRU batch.
Before adding a page to LRU batch, we clear the LRU flag of the page so
that we can check whether the page is in LRU batch by folio_test_lru(page).
It's quite valuable, because likely we don't want to blindly drain the LRU
batch simply because there is some unexpected reference on a page, as
described above.
This change makes the LRU flag of a page invisible for longer, which
may impact some programs. For example, as long as a page is on a LRU
batch, we cannot isolate it, and we cannot check if it's an LRU page.
Further, a page can now only be on exactly one LRU batch. This doesn't
seem to matter much, because a new page is allocated from buddy and
added to the lru batch, or be isolated, it's LRU flag may also be
invisible for a long time.
Fixes: 9a4e9f3b2d73 ("mm: update get_user_pages_longterm to migrate pages allocated from CMA region")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: yangge <yangge1116(a)126.com>
---
mm/swap.c | 43 +++++++++++++++++++++++++++++++------------
1 file changed, 31 insertions(+), 12 deletions(-)
V4:
Adjust commit message according to David's comments
V3:
Add fixes tag
V2:
Adjust code and commit message according to David's comments
diff --git a/mm/swap.c b/mm/swap.c
index dc205bd..9caf6b0 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -211,10 +211,6 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
for (i = 0; i < folio_batch_count(fbatch); i++) {
struct folio *folio = fbatch->folios[i];
- /* block memcg migration while the folio moves between lru */
- if (move_fn != lru_add_fn && !folio_test_clear_lru(folio))
- continue;
-
folio_lruvec_relock_irqsave(folio, &lruvec, &flags);
move_fn(lruvec, folio);
@@ -255,11 +251,16 @@ static void lru_move_tail_fn(struct lruvec *lruvec, struct folio *folio)
void folio_rotate_reclaimable(struct folio *folio)
{
if (!folio_test_locked(folio) && !folio_test_dirty(folio) &&
- !folio_test_unevictable(folio) && folio_test_lru(folio)) {
+ !folio_test_unevictable(folio)) {
struct folio_batch *fbatch;
unsigned long flags;
folio_get(folio);
+ if (!folio_test_clear_lru(folio)) {
+ folio_put(folio);
+ return;
+ }
+
local_lock_irqsave(&lru_rotate.lock, flags);
fbatch = this_cpu_ptr(&lru_rotate.fbatch);
folio_batch_add_and_move(fbatch, folio, lru_move_tail_fn);
@@ -352,11 +353,15 @@ static void folio_activate_drain(int cpu)
void folio_activate(struct folio *folio)
{
- if (folio_test_lru(folio) && !folio_test_active(folio) &&
- !folio_test_unevictable(folio)) {
+ if (!folio_test_active(folio) && !folio_test_unevictable(folio)) {
struct folio_batch *fbatch;
folio_get(folio);
+ if (!folio_test_clear_lru(folio)) {
+ folio_put(folio);
+ return;
+ }
+
local_lock(&cpu_fbatches.lock);
fbatch = this_cpu_ptr(&cpu_fbatches.activate);
folio_batch_add_and_move(fbatch, folio, folio_activate_fn);
@@ -700,6 +705,11 @@ void deactivate_file_folio(struct folio *folio)
return;
folio_get(folio);
+ if (!folio_test_clear_lru(folio)) {
+ folio_put(folio);
+ return;
+ }
+
local_lock(&cpu_fbatches.lock);
fbatch = this_cpu_ptr(&cpu_fbatches.lru_deactivate_file);
folio_batch_add_and_move(fbatch, folio, lru_deactivate_file_fn);
@@ -716,11 +726,16 @@ void deactivate_file_folio(struct folio *folio)
*/
void folio_deactivate(struct folio *folio)
{
- if (folio_test_lru(folio) && !folio_test_unevictable(folio) &&
- (folio_test_active(folio) || lru_gen_enabled())) {
+ if (!folio_test_unevictable(folio) && (folio_test_active(folio) ||
+ lru_gen_enabled())) {
struct folio_batch *fbatch;
folio_get(folio);
+ if (!folio_test_clear_lru(folio)) {
+ folio_put(folio);
+ return;
+ }
+
local_lock(&cpu_fbatches.lock);
fbatch = this_cpu_ptr(&cpu_fbatches.lru_deactivate);
folio_batch_add_and_move(fbatch, folio, lru_deactivate_fn);
@@ -737,12 +752,16 @@ void folio_deactivate(struct folio *folio)
*/
void folio_mark_lazyfree(struct folio *folio)
{
- if (folio_test_lru(folio) && folio_test_anon(folio) &&
- folio_test_swapbacked(folio) && !folio_test_swapcache(folio) &&
- !folio_test_unevictable(folio)) {
+ if (folio_test_anon(folio) && folio_test_swapbacked(folio) &&
+ !folio_test_swapcache(folio) && !folio_test_unevictable(folio)) {
struct folio_batch *fbatch;
folio_get(folio);
+ if (!folio_test_clear_lru(folio)) {
+ folio_put(folio);
+ return;
+ }
+
local_lock(&cpu_fbatches.lock);
fbatch = this_cpu_ptr(&cpu_fbatches.lru_lazyfree);
folio_batch_add_and_move(fbatch, folio, lru_lazyfree_fn);
--
2.7.4
Hi,
The first four patches in this series are miscellaneous fixes and
improvements in the Cadence and TI CSI-RX drivers around probing, fwnode
and link creation.
The last two patches add support for transmitting multiple pixels per
clock on the internal bus between Cadence CSI-RX bridge and TI CSI-RX
wrapper. As this internal bus is 32-bit wide, the maximum number of
pixels that can be transmitted per cycle depend upon the format's bit
width. Secondly, the downstream element must support unpacking of
multiple pixels.
Thus we export a module function that can be used by the downstream
driver to negotiate the pixels per cycle on the output pixel stream of
the Cadence bridge.
Signed-off-by: Jai Luthra <jai.luthra(a)ideasonboard.com>
---
Changes in v2:
- Rebase on v6.15-rc1
- Fix lkp warnings in PATCH 5/6 missing header for FIELD_PREP
- Add R-By tags from Devarsh and Changhuang
- Link to v1: https://lore.kernel.org/r/20250324-probe_fixes-v1-0-5cd5b9e1cfac@ideasonboa…
---
Jai Luthra (6):
media: ti: j721e-csi2rx: Use devm_of_platform_populate
media: ti: j721e-csi2rx: Use fwnode_get_named_child_node
media: ti: j721e-csi2rx: Fix source subdev link creation
media: cadence: csi2rx: Implement get_fwnode_pad op
media: cadence: cdns-csi2rx: Support multiple pixels per clock cycle
media: ti: j721e-csi2rx: Support multiple pixels per clock
drivers/media/platform/cadence/cdns-csi2rx.c | 76 +++++++++++++++++-----
drivers/media/platform/cadence/cdns-csi2rx.h | 19 ++++++
drivers/media/platform/ti/Kconfig | 3 +-
.../media/platform/ti/j721e-csi2rx/j721e-csi2rx.c | 66 ++++++++++++++-----
4 files changed, 129 insertions(+), 35 deletions(-)
---
base-commit: 0af2f6be1b4281385b618cb86ad946eded089ac8
change-id: 20250314-probe_fixes-7e0ec33c7fee
Best regards,
--
Jai Luthra <jai.luthra(a)ideasonboard.com>
Hello ,
These are available for sale. If you’re interested in purchasing
these, please email me
brand new and original
Brand New ST8000NM017B $70 EA
Brand New ST20000NM007D $100 EACH
Brand New ST4000NM000A $30 EA
Brand New WD80EFPX $60 EA
Brand New WD101PURZ $70 EA
Brand New CISCO C9300-48UXM-E
Available 5
$21800 EACH
Intel Xeon Gold 5418Y Processors
QTY28 $780 each
Brand New C9200L-48T-4X-E $1000 EAC
Brand New N9K-C93108TC-FX-24 Nexus
9300-FX w/ 24p 100M/1/10GT & 6p 40/100G
Available 4
$3000 each
Brand New NVIDIA GeForce RTX 4090 Founders
Edition 24GB - QTY: 56 - $700 each
Brand new Palit NVIDIA GeForce RTX 5080
GamingPro OC card with full manufacturer
QTY 48 $750 EAC
BRAND NEW - ASUS TUF Gaming GeForce RTX ™ 5080 16GB
GDDR7 OC Edition Gaming Graphics Card SEALED
QTY50 $700 EACH
Condition: Grade A
Used HP EliteBook 840 G7 i7-10610U 16GB RAM 512GB
SSD Windows 11 Pro TOUCH Screen
QTY 30 USD 100 each
Condition: Grade A
Used HP EliteBook 850 G8 15.6" FHD,
INTEL I7, 256GB SSD, 8GB RAM Win11
QTY50 $240 EACH
SK Hynix 48GB DDR5 4800 1Rx4 PC5-4800B-
PF0-1010-XT 288pin Server EC4 RDIMM RAM
QTY 239 $50 EACH
-----------------------------------------------------------------
---------------
Best Regards,
Winston Taylor
300 Laird St, Wilkes-Barre, PA 18702, USA
Mobile: +1 (570) 890-5512
Email: winstontaylor(a)theleadingone.net
www.theleadingone.net
Hello ,
These are available for sale. If you’re interested in purchasing
these, please email me
brand new and original
Brand New ST8000NM017B $70 EA
Brand New ST20000NM007D $100 EACH
Brand New ST4000NM000A $30 EA
Brand New WD80EFPX $60 EA
Brand New WD101PURZ $70 EA
Brand New CISCO C9300-48UXM-E
Available 5
$21800 EACH
Intel Xeon Gold 5418Y Processors
QTY28 $780 each
Brand New C9200L-48T-4X-E $1000 EAC
Brand New N9K-C93108TC-FX-24 Nexus
9300-FX w/ 24p 100M/1/10GT & 6p 40/100G
Available 4
$3000 each
Brand New NVIDIA GeForce RTX 4090 Founders
Edition 24GB - QTY: 56 - $700 each
Brand new Palit NVIDIA GeForce RTX 5080
GamingPro OC card with full manufacturer
QTY 48 $750 EAC
BRAND NEW - ASUS TUF Gaming GeForce RTX ™ 5080 16GB
GDDR7 OC Edition Gaming Graphics Card SEALED
QTY50 $700 EACH
Condition: Grade A
Used HP EliteBook 840 G7 i7-10610U 16GB RAM 512GB
SSD Windows 11 Pro TOUCH Screen
QTY 30 USD 100 each
Condition: Grade A
Used HP EliteBook 850 G8 15.6" FHD,
INTEL I7, 256GB SSD, 8GB RAM Win11
QTY50 $240 EACH
SK Hynix 48GB DDR5 4800 1Rx4 PC5-4800B-
PF0-1010-XT 288pin Server EC4 RDIMM RAM
QTY 239 $50 EACH
-----------------------------------------------------------------
---------------
Best Regards,
Winston Taylor
300 Laird St, Wilkes-Barre, PA 18702, USA
Mobile: +1 (570) 890-5512
Email: winstontaylor(a)theleadingone.net
www.theleadingone.net
Hello ,
These are available for sale. If you’re interested in purchasing
these, please email me
brand new and original
Brand New ST8000NM017B $70 EA
Brand New ST20000NM007D $100 EACH
Brand New ST4000NM000A $30 EA
Brand New WD80EFPX $60 EA
Brand New WD101PURZ $70 EA
Brand New CISCO C9300-48UXM-E
Available 5
$21800 EACH
Intel Xeon Gold 5418Y Processors
QTY28 $780 each
Brand New C9200L-48T-4X-E $1000 EAC
Brand New N9K-C93108TC-FX-24 Nexus
9300-FX w/ 24p 100M/1/10GT & 6p 40/100G
Available 4
$3000 each
Brand New NVIDIA GeForce RTX 4090 Founders
Edition 24GB - QTY: 56 - $700 each
Brand new Palit NVIDIA GeForce RTX 5080
GamingPro OC card with full manufacturer
QTY 48 $750 EAC
BRAND NEW - ASUS TUF Gaming GeForce RTX ™ 5080 16GB
GDDR7 OC Edition Gaming Graphics Card SEALED
QTY50 $700 EACH
Condition: Grade A
Used HP EliteBook 840 G7 i7-10610U 16GB RAM 512GB
SSD Windows 11 Pro TOUCH Screen
QTY 30 USD 100 each
Condition: Grade A
Used HP EliteBook 850 G8 15.6" FHD,
INTEL I7, 256GB SSD, 8GB RAM Win11
QTY50 $240 EACH
SK Hynix 48GB DDR5 4800 1Rx4 PC5-4800B-
PF0-1010-XT 288pin Server EC4 RDIMM RAM
QTY 239 $50 EACH
-----------------------------------------------------------------
---------------
Best Regards,
Winston Taylor
300 Laird St, Wilkes-Barre, PA 18702, USA
Mobile: +1 (570) 890-5512
Email: winstontaylor(a)theleadingone.net
www.theleadingone.net
The patch titled
Subject: mm: fix apply_to_existing_page_range()
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-fix-apply_to_existing_page_range.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: "Kirill A. Shutemov" <kirill.shutemov(a)linux.intel.com>
Subject: mm: fix apply_to_existing_page_range()
Date: Wed, 9 Apr 2025 12:40:43 +0300
In the case of apply_to_existing_page_range(), apply_to_pte_range() is
reached with 'create' set to false. When !create, the loop over the PTE
page table is broken.
apply_to_pte_range() will only move to the next PTE entry if 'create' is
true or if the current entry is not pte_none().
This means that the user of apply_to_existing_page_range() will not have
'fn' called for any entries after the first pte_none() in the PTE page
table.
Fix the loop logic in apply_to_pte_range().
There are no known runtime issues from this, but the fix is trivial enough
for stable@ even without a known buggy user.
Link: https://lkml.kernel.org/r/20250409094043.1629234-1-kirill.shutemov@linux.in…
Signed-off-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Fixes: be1db4753ee6 ("mm/memory.c: add apply_to_existing_page_range() helper")
Cc: Daniel Axtens <dja(a)axtens.net>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/memory.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
--- a/mm/memory.c~mm-fix-apply_to_existing_page_range
+++ a/mm/memory.c
@@ -2943,11 +2943,11 @@ static int apply_to_pte_range(struct mm_
if (fn) {
do {
if (create || !pte_none(ptep_get(pte))) {
- err = fn(pte++, addr, data);
+ err = fn(pte, addr, data);
if (err)
break;
}
- } while (addr += PAGE_SIZE, addr != end);
+ } while (pte++, addr += PAGE_SIZE, addr != end);
}
arch_leave_lazy_mmu_mode();
_
Patches currently in -mm which might be from kirill.shutemov(a)linux.intel.com are
mm-page_alloc-fix-deadlock-on-cpu_hotplug_lock-in-__accept_page.patch
mm-fix-apply_to_existing_page_range.patch
The patch titled
Subject: alloc_tag: handle incomplete bulk allocations in vm_module_tags_populate
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
alloc_tag-handle-incomplete-bulk-allocations-in-vm_module_tags_populate.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: "T.J. Mercier" <tjmercier(a)google.com>
Subject: alloc_tag: handle incomplete bulk allocations in vm_module_tags_populate
Date: Wed, 9 Apr 2025 22:51:11 +0000
alloc_pages_bulk_node() may partially succeed and allocate fewer than the
requested nr_pages. There are several conditions under which this can
occur, but we have encountered the case where CONFIG_PAGE_OWNER is enabled
causing all bulk allocations to always fallback to single page allocations
due to commit 187ad460b841 ("mm/page_alloc: avoid page allocator recursion
with pagesets.lock held").
Currently vm_module_tags_populate() immediately fails when
alloc_pages_bulk_node() returns fewer than the requested number of pages.
When this happens memory allocation profiling gets disabled, for example
[ 14.297583] [9: modprobe: 465] Failed to allocate memory for allocation tags in the module scsc_wlan. Memory allocation profiling is disabled!
[ 14.299339] [9: modprobe: 465] modprobe: Failed to insmod '/vendor/lib/modules/scsc_wlan.ko' with args '': Out of memory
This patch causes vm_module_tags_populate() to retry bulk allocations for
the remaining memory instead of failing immediately which will avoid the
disablement of memory allocation profiling.
Link: https://lkml.kernel.org/r/20250409225111.3770347-1-tjmercier@google.com
Fixes: 0f9b685626da ("alloc_tag: populate memory for module tags as needed")
Signed-off-by: T.J. Mercier <tjmercier(a)google.com>
Reported-by: Janghyuck Kim <janghyuck.kim(a)samsung.com>
Acked-by: Suren Baghdasaryan <surenb(a)google.com>
Cc: Kent Overstreet <kent.overstreet(a)linux.dev>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
lib/alloc_tag.c | 15 ++++++++++++---
1 file changed, 12 insertions(+), 3 deletions(-)
--- a/lib/alloc_tag.c~alloc_tag-handle-incomplete-bulk-allocations-in-vm_module_tags_populate
+++ a/lib/alloc_tag.c
@@ -422,11 +422,20 @@ static int vm_module_tags_populate(void)
unsigned long old_shadow_end = ALIGN(phys_end, MODULE_ALIGN);
unsigned long new_shadow_end = ALIGN(new_end, MODULE_ALIGN);
unsigned long more_pages;
- unsigned long nr;
+ unsigned long nr = 0;
more_pages = ALIGN(new_end - phys_end, PAGE_SIZE) >> PAGE_SHIFT;
- nr = alloc_pages_bulk_node(GFP_KERNEL | __GFP_NOWARN,
- NUMA_NO_NODE, more_pages, next_page);
+ while (nr < more_pages) {
+ unsigned long allocated;
+
+ allocated = alloc_pages_bulk_node(GFP_KERNEL | __GFP_NOWARN,
+ NUMA_NO_NODE, more_pages - nr, next_page + nr);
+
+ if (!allocated)
+ break;
+ nr += allocated;
+ }
+
if (nr < more_pages ||
vmap_pages_range(phys_end, phys_end + (nr << PAGE_SHIFT), PAGE_KERNEL,
next_page, PAGE_SHIFT) < 0) {
_
Patches currently in -mm which might be from tjmercier(a)google.com are
alloc_tag-handle-incomplete-bulk-allocations-in-vm_module_tags_populate.patch
The patch titled
Subject: alloc_tag: handle incomplete bulk allocations in vm_module_tags_populate
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
alloc_tag-handle-incomplete-bulk-allocations-in-vm_module_tags_populate.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: "T.J. Mercier" <tjmercier(a)google.com>
Subject: alloc_tag: handle incomplete bulk allocations in vm_module_tags_populate
Date: Wed, 9 Apr 2025 19:54:47 +0000
alloc_pages_bulk_node may partially succeed and allocate fewer than the
requested nr_pages. There are several conditions under which this can
occur, but we have encountered the case where CONFIG_PAGE_OWNER is enabled
causing all bulk allocations to always fallback to single page allocations
due to commit 187ad460b841 ("mm/page_alloc: avoid page allocator recursion
with pagesets.lock held").
Currently vm_module_tags_populate immediately fails when
alloc_pages_bulk_node returns fewer than the requested number of pages.
This patch causes vm_module_tags_populate to retry bulk allocations for
the remaining memory instead.
Link: https://lkml.kernel.org/r/20250409195448.3697351-1-tjmercier@google.com
Fixes: 187ad460b841 ("mm/page_alloc: avoid page allocator recursion with pagesets.lock held")
Signed-off-by: T.J. Mercier <tjmercier(a)google.com>
Reported-by: Janghyuck Kim <janghyuck.kim(a)samsung.com>
Cc: Kent Overstreet <kent.overstreet(a)linux.dev>
Cc: Suren Baghdasaryan <surenb(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
lib/alloc_tag.c | 15 ++++++++++++---
1 file changed, 12 insertions(+), 3 deletions(-)
--- a/lib/alloc_tag.c~alloc_tag-handle-incomplete-bulk-allocations-in-vm_module_tags_populate
+++ a/lib/alloc_tag.c
@@ -422,11 +422,20 @@ static int vm_module_tags_populate(void)
unsigned long old_shadow_end = ALIGN(phys_end, MODULE_ALIGN);
unsigned long new_shadow_end = ALIGN(new_end, MODULE_ALIGN);
unsigned long more_pages;
- unsigned long nr;
+ unsigned long nr = 0;
more_pages = ALIGN(new_end - phys_end, PAGE_SIZE) >> PAGE_SHIFT;
- nr = alloc_pages_bulk_node(GFP_KERNEL | __GFP_NOWARN,
- NUMA_NO_NODE, more_pages, next_page);
+ while (nr < more_pages) {
+ unsigned long allocated;
+
+ allocated = alloc_pages_bulk_node(GFP_KERNEL | __GFP_NOWARN,
+ NUMA_NO_NODE, more_pages - nr, next_page + nr);
+
+ if (!allocated)
+ break;
+ nr += allocated;
+ }
+
if (nr < more_pages ||
vmap_pages_range(phys_end, phys_end + (nr << PAGE_SHIFT), PAGE_KERNEL,
next_page, PAGE_SHIFT) < 0) {
_
Patches currently in -mm which might be from tjmercier(a)google.com are
alloc_tag-handle-incomplete-bulk-allocations-in-vm_module_tags_populate.patch
struct rdma_cm_id has member "struct work_struct net_work"
that is reused for enqueuing cma_netevent_work_handler()s
onto cma_wq.
Below crash[1] can occur if more than one call to
cma_netevent_callback() occurs in quick succession,
which further enqueues cma_netevent_work_handler()s for the
same rdma_cm_id, overwriting any previously queued work-item(s)
that was just scheduled to run i.e. there is no guarantee
the queued work item may run between two successive calls
to cma_netevent_callback() and the 2nd INIT_WORK would overwrite
the 1st work item (for the same rdma_cm_id), despite grabbing
id_table_lock during enqueue.
Also drgn analysis [2] indicates the work item was likely overwritten.
Fix this by moving the INIT_WORK() to __rdma_create_id(),
so that it doesn't race with any existing queue_work() or
its worker thread.
[1] Trimmed crash stack:
=============================================
BUG: kernel NULL pointer dereference, address: 0000000000000008
kworker/u256:6 ... 6.12.0-0...
Workqueue: cma_netevent_work_handler [rdma_cm] (rdma_cm)
RIP: 0010:process_one_work+0xba/0x31a
Call Trace:
worker_thread+0x266/0x3a0
kthread+0xcf/0x100
ret_from_fork+0x31/0x50
ret_from_fork_asm+0x1a/0x30
=============================================
[2] drgn crash analysis:
>>> trace = prog.crashed_thread().stack_trace()
>>> trace
(0) crash_setup_regs (./arch/x86/include/asm/kexec.h:111:15)
(1) __crash_kexec (kernel/crash_core.c:122:4)
(2) panic (kernel/panic.c:399:3)
(3) oops_end (arch/x86/kernel/dumpstack.c:382:3)
...
(8) process_one_work (kernel/workqueue.c:3168:2)
(9) process_scheduled_works (kernel/workqueue.c:3310:3)
(10) worker_thread (kernel/workqueue.c:3391:4)
(11) kthread (kernel/kthread.c:389:9)
Line workqueue.c:3168 for this kernel version is in process_one_work():
3168 strscpy(worker->desc, pwq->wq->name, WORKER_DESC_LEN);
>>> trace[8]["work"]
*(struct work_struct *)0xffff92577d0a21d8 = {
.data = (atomic_long_t){
.counter = (s64)536870912, <=== Note
},
.entry = (struct list_head){
.next = (struct list_head *)0xffff924d075924c0,
.prev = (struct list_head *)0xffff924d075924c0,
},
.func = (work_func_t)cma_netevent_work_handler+0x0 = 0xffffffffc2cec280,
}
Suspicion is that pwq is NULL:
>>> trace[8]["pwq"]
(struct pool_workqueue *)<absent>
In process_one_work(), pwq is assigned from:
struct pool_workqueue *pwq = get_work_pwq(work);
and get_work_pwq() is:
static struct pool_workqueue *get_work_pwq(struct work_struct *work)
{
unsigned long data = atomic_long_read(&work->data);
if (data & WORK_STRUCT_PWQ)
return work_struct_pwq(data);
else
return NULL;
}
WORK_STRUCT_PWQ is 0x4:
>>> print(repr(prog['WORK_STRUCT_PWQ']))
Object(prog, 'enum work_flags', value=4)
But work->data is 536870912 which is 0x20000000.
So, get_work_pwq() returns NULL and we crash in process_one_work():
3168 strscpy(worker->desc, pwq->wq->name, WORKER_DESC_LEN);
=============================================
Fixes: 925d046e7e52 ("RDMA/core: Add a netevent notifier to cma")
Cc: stable(a)vger.kernel.org
Co-developed-by: Håkon Bugge <haakon.bugge(a)oracle.com>
Signed-off-by: Håkon Bugge <haakon.bugge(a)oracle.com>
Signed-off-by: Sharath Srinivasan <sharath.srinivasan(a)oracle.com>
---
v1->v2 cc:stable@vger.kernel.org
---
drivers/infiniband/core/cma.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 91db10515d74..176d0b3e4488 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -72,6 +72,8 @@ static const char * const cma_events[] = {
static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid,
enum ib_gid_type gid_type);
+static void cma_netevent_work_handler(struct work_struct *_work);
+
const char *__attribute_const__ rdma_event_msg(enum rdma_cm_event_type event)
{
size_t index = event;
@@ -1033,6 +1035,7 @@ __rdma_create_id(struct net *net, rdma_cm_event_handler event_handler,
get_random_bytes(&id_priv->seq_num, sizeof id_priv->seq_num);
id_priv->id.route.addr.dev_addr.net = get_net(net);
id_priv->seq_num &= 0x00ffffff;
+ INIT_WORK(&id_priv->id.net_work, cma_netevent_work_handler);
rdma_restrack_new(&id_priv->res, RDMA_RESTRACK_CM_ID);
if (parent)
@@ -5227,7 +5230,6 @@ static int cma_netevent_callback(struct notifier_block *self,
if (!memcmp(current_id->id.route.addr.dev_addr.dst_dev_addr,
neigh->ha, ETH_ALEN))
continue;
- INIT_WORK(¤t_id->id.net_work, cma_netevent_work_handler);
cma_id_get(current_id);
queue_work(cma_wq, ¤t_id->id.net_work);
}
--
2.39.5 (Apple Git-154)
The function brcmf_usb_dlneeded() calls the function brcmf_usb_dl_cmd()
but dose not check its return value. The 'id.chiprev' is uninitialized if
the function brcmf_usb_dl_cmd() fails, and may propagate to
'devinfo->bus_pub.chiprev'.
Add error handling for brcmf_usb_dl_cmd() to return the function if the
'id.chiprev' is uninitialized.
Fixes: 71bb244ba2fd ("brcm80211: fmac: add USB support for bcm43235/6/8 chipsets")
Cc: stable(a)vger.kernel.org # v3.4+
Signed-off-by: Wentao Liang <vulab(a)iscas.ac.cn>
---
drivers/net/wireless/broadcom/brcm80211/brcmfmac/usb.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/usb.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/usb.c
index 2821c27f317e..50dddac8a2ab 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/usb.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/usb.c
@@ -790,6 +790,7 @@ brcmf_usb_dlneeded(struct brcmf_usbdev_info *devinfo)
{
struct bootrom_id_le id;
u32 chipid, chiprev;
+ int err;
brcmf_dbg(USB, "Enter\n");
@@ -798,7 +799,11 @@ brcmf_usb_dlneeded(struct brcmf_usbdev_info *devinfo)
/* Check if firmware downloaded already by querying runtime ID */
id.chip = cpu_to_le32(0xDEAD);
- brcmf_usb_dl_cmd(devinfo, DL_GETVER, &id, sizeof(id));
+ err = brcmf_usb_dl_cmd(devinfo, DL_GETVER, &id, sizeof(id));
+ if (err) {
+ brcmf_err("DL_GETID Failed\n");
+ return false;
+ }
chipid = le32_to_cpu(id.chip);
chiprev = le32_to_cpu(id.chiprev);
--
2.42.0.windows.2
The of_find_device_by_node() function increments the reference count of
the embedded device, which should be released with put_device() when it
is no longer needed.
In ill_acc_of_setup(), put_device() is only called on error paths, but
not on the success path. Fix this by calling put_device() before
returning successfully.
Compile-tested only.
Cc: stable(a)vger.kernel.org
Fixes: 5433acd81e873 ("MIPS: ralink: add illegal access driver")
Signed-off-by: Thorsten Blum <thorsten.blum(a)linux.dev>
---
arch/mips/ralink/ill_acc.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/arch/mips/ralink/ill_acc.c b/arch/mips/ralink/ill_acc.c
index 25341b2319d0..6d1d829854b6 100644
--- a/arch/mips/ralink/ill_acc.c
+++ b/arch/mips/ralink/ill_acc.c
@@ -84,6 +84,7 @@ static int __init ill_acc_of_setup(void)
rt_memc_w32(ILL_INT_STATUS, REG_ILL_ACC_TYPE);
dev_info(&pdev->dev, "irq registered\n");
+ put_device(&pdev->dev);
return 0;
}
--
2.49.0
The RK3399 Puma SoM contains the internal Cypress CYUSB3304 USB
hub, that shows instability due to improper reset pin configuration.
Currently reset pin is modeled as a vcc5v0_host regulator, that
might result in too short reset pulse duration.
Starting with the v6.6, the Onboard USB hub driver (later renamed
to Onboard USB dev) contains support for Cypress HX3 hub family.
It can be now used to correctly model the RK3399 Puma SoM hardware.
The first commits in this series fix the onboard USB dev driver to
support all HX3 hub variants, including the CYUSB3304 found in
the RK3399 Puma SoM.
This allows to introduce fix for internal USB hub instability on
RK3399 Puma, by replacing the vcc5v0_host regulator with
cy3304_reset, used inside the hub node.
Please be aware that the patch that fixes USB hub instability in
arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi can me merged only
after updating the Onboard USB dev driver, otherwise the hub
will not work.
Two last commits in the series disable unrouted USB controllers
and PHYs on RK3399 Puma SOM and Haikou carrier board, with no
intended functional changes.
Signed-off-by: Lukasz Czechowski <lukasz.czechowski(a)thaumatec.com>
---
Lukasz Czechowski (3):
usb: misc: onboard_usb_dev: fix support for Cypress HX3 hubs
dt-bindings: usb: cypress,hx3: Add support for all variants
arm64: dts: rockchip: fix internal USB hub instability on RK3399 Puma
Quentin Schulz (2):
arm64: dts: rockchip: disable unrouted USB controllers and PHY on RK3399 Puma
arm64: dts: rockchip: disable unrouted USB controllers and PHY on RK3399 Puma with Haikou
.../devicetree/bindings/usb/cypress,hx3.yaml | 6 +++
.../arm64/boot/dts/rockchip/rk3399-puma-haikou.dts | 8 ----
arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi | 43 ++++++++++------------
drivers/usb/misc/onboard_usb_dev.c | 10 ++++-
drivers/usb/misc/onboard_usb_dev.h | 6 +++
5 files changed, 39 insertions(+), 34 deletions(-)
---
base-commit: 1e26c5e28ca5821a824e90dd359556f5e9e7b89f
change-id: 20250326-onboard_usb_dev-a7c063a8a515
Best regards,
--
Lukasz Czechowski <lukasz.czechowski(a)thaumatec.com>
Good day/night to everyone.
I am sorry about the duplicate post, I had missed that I was also supposed to CC the regressions list.
I have had S3 sleep broken for well over half a year at this point where the kernel simply crashes upon entering sleep, and the PC turns off (literally stops pulling power from the power supply), and turning it on, at first, causes to to run through 1 on/off cycle, with I believe is a diagnostic step on this board, as it happens whenever something causes the PC to unexpectedly turn off (unless the source is external, like a blackout, in which case once power is restored and I turn on the PC, it turns on normally).
At first I thought that it might be an AMDGPU issue, however removing my dGPU, and also, the rest of my hardware, changed nothing.
However I have had at one point had asus-wmi try to put the system into S0ix, despite the firmware lacking support for S0ix, and now FastFetch no longer prints out my motherboards model number on the Host line, instead just showing ASUS MB, so I have reason to suspect that asus-wmi may be the culprit for this regression.
Unfortunately, I am unable to procure logs, since the kernel crashes before anything meaningful is logged.
I know that this is not a hardware issue, since on Windows and older live-images (Solus is my distribution of choice) these issues do not exist.
I am unsure as to which kernel update broke S3 for me, however I believe it was either late in the 6.10.x cycle or the 6.11.x cycle, since the Solus 4.6 live-image has no such issues on 6.10.13, which was also our last 6.10.x kernel update, and I only started experienced it when we updated to 6.11.5, our first update to 6.11.x, and updating to 6.12.x did not fix anything. The current Solus 4.7 live-image has 6.12.9, and my installed system is currently on 6.12.21.
Hardware:
Motherboard: Asus Z97 Pro Gamer
CPU: i5-4570
dGPU: Sapphire Radeon 540 4GB
RAM: 2x Crucial 8GB DDR3L@1600MT/s, 1x Crucial 4GB DDR3L@1600MT/s
Storage: 500GB Western Digital WD5000AAKX SATA3 7200RPM HDD
DVD Drive: Lite-On DH16ABSH
Add-in cards: Intel 3168 PCIe+USB (by means of a simple adapter) VIA VT6315 Firewire 400 PCIe, MosChip MCS9865 Parallel PCI (over integrated ASMedia ASM1083/1085 PCIe to PCI bridge)
PSU: Seasonix SS-860XP² 860W
Good day/night to everyone.
I have had S3 sleep broken for well over half a year at this point where the kernel simply crashes upon entering sleep, and the PC turns off (literally stops pulling power from the power supply), and turning it on, at first, causes to to run through 1 on/off cycle, with I believe is a diagnostic step on this board, as it happens whenever something causes the PC to unexpectedly turn off (unless the source is external, like a blackout, in which case once power is restored and I turn on the PC, it turns on normally).
At first I thought that it might be an AMDGPU issue, however removing my dGPU, and also, the rest of my hardware, changed nothing.
However I have had at one point had asus-wmi try to put the system into S0ix, despite the firmware lacking support for S0ix, and now FastFetch no longer prints out my motherboards model number on the Host line, instead just showing ASUS MB, so I have reason to suspect that asus-wmi may be the culprit for this regression.
Unfortunately, I am unable to procure logs, since the kernel crashes before anything meaningful is logged.
I know that this is not a hardware issue, since on Windows and older live-images (Solus is my distribution of choice) these issues do not exist.
I am unsure as to which kernel update broke S3 for me, however I believe it was either late in the 6.10.x cycle or the 6.11.x cycle, since the Solus 4.6 live-image has no such issues on 6.10.13, which was also our last 6.10.x kernel update, and I only started experienced it when we updated to 6.11.5, our first update to 6.11.x, and updating to 6.12.x did not fix anything. The current Solus 4.7 live-image has 6.12.9, and my installed system is currently on 6.12.21.
Hardware:
Motherboard: Asus Z97 Pro Gamer
CPU: i5-4570
dGPU: Sapphire Radeon 540 4GB
RAM: 2x Crucial 8GB DDR3L@1600MT/s, 1x Crucial 4GB DDR3L@1600MT/s
Storage: 500GB Western Digital WD5000AAKX SATA3 7200RPM HDD
DVD Drive: Lite-On DH16ABSH
Add-in cards: Intel 3168 PCIe+USB (by means of a simple adapter) VIA VT6315 Firewire 400 PCIe, MosChip MCS9865 Parallel PCI (over integrated ASMedia ASM1083/1085 PCIe to PCI bridge)
PSU: Seasonix SS-860XP² 860W
The function brcmf_usb_dl_writeimage() calls the function
brcmf_usb_dl_cmd() but dose not check its return value. The
'state.state' and the 'state.bytes' are uninitialized if the
function brcmf_usb_dl_cmd() fails. It is dangerous to use
uninitialized variables in the conditions.
Add error handling for brcmf_usb_dl_cmd() to jump to error
handling path if the brcmf_usb_dl_cmd() fails and the
'state.state' and the 'state.bytes' are uninitialized.
Fixes: 71bb244ba2fd ("brcm80211: fmac: add USB support for bcm43235/6/8 chipsets")
Cc: stable(a)vger.kernel.org # v3.4+
Signed-off-by: Wentao Liang <vulab(a)iscas.ac.cn>
---
drivers/net/wireless/broadcom/brcm80211/brcmfmac/usb.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/usb.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/usb.c
index 50dddac8a2ab..1c97cd777225 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/usb.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/usb.c
@@ -901,7 +901,9 @@ brcmf_usb_dl_writeimage(struct brcmf_usbdev_info *devinfo, u8 *fw, int fwlen)
}
/* 1) Prepare USB boot loader for runtime image */
- brcmf_usb_dl_cmd(devinfo, DL_START, &state, sizeof(state));
+ err = brcmf_usb_dl_cmd(devinfo, DL_START, &state, sizeof(state));
+ if (err)
+ goto fail;
rdlstate = le32_to_cpu(state.state);
rdlbytes = le32_to_cpu(state.bytes);
--
2.42.0.windows.2
When a CPU chooses to call push_dl_task and picks a task to push to
another CPU's runqueue then it will call find_lock_later_rq method
which would take a double lock on both CPUs' runqueues. If one of the
locks aren't readily available, it may lead to dropping the current
runqueue lock and reacquiring both the locks at once. During this window
it is possible that the task is already migrated and is running on some
other CPU. These cases are already handled. However, if the task is
migrated and has already been executed and another CPU is now trying to
wake it up (ttwu) such that it is queued again on the runqeue
(on_rq is 1) and also if the task was run by the same CPU, then the
current checks will pass even though the task was migrated out and is no
longer in the pushable tasks list.
Please go through the original rt change for more details on the issue.
To fix this, after the lock is obtained inside the find_lock_later_rq,
it ensures that the task is still at the head of pushable tasks list.
Also removed some checks that are no longer needed with the addition of
this new check.
However, the new check of pushable tasks list only applies when
find_lock_later_rq is called by push_dl_task. For the other caller i.e.
dl_task_offline_migration, existing checks are used.
Signed-off-by: Harshit Agarwal <harshit(a)nutanix.com>
Cc: stable(a)vger.kernel.org
---
Changes in v3:
- Incorporated review comments from Juri around the commit message as
well as around the comment regarding checks in find_lock_later_rq.
- Link to v2:
https://lore.kernel.org/stable/20250317022325.52791-1-harshit@nutanix.com/
Changes in v2:
- As per Juri's suggestion, moved the check inside find_lock_later_rq
similar to rt change. Here we distinguish among the push_dl_task
caller vs dl_task_offline_migration by checking if the task is
throttled or not.
- Fixed the commit message to refer to the rt change by title.
- Link to v1:
https://lore.kernel.org/lkml/20250307204255.60640-1-harshit@nutanix.com/
---
kernel/sched/deadline.c | 73 +++++++++++++++++++++++++++--------------
1 file changed, 49 insertions(+), 24 deletions(-)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 38e4537790af..e0c95f33e1ed 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2621,6 +2621,25 @@ static int find_later_rq(struct task_struct *task)
return -1;
}
+static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
+{
+ struct task_struct *p;
+
+ if (!has_pushable_dl_tasks(rq))
+ return NULL;
+
+ p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root));
+
+ WARN_ON_ONCE(rq->cpu != task_cpu(p));
+ WARN_ON_ONCE(task_current(rq, p));
+ WARN_ON_ONCE(p->nr_cpus_allowed <= 1);
+
+ WARN_ON_ONCE(!task_on_rq_queued(p));
+ WARN_ON_ONCE(!dl_task(p));
+
+ return p;
+}
+
/* Locks the rq it finds */
static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
{
@@ -2648,12 +2667,37 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
/* Retry if something changed. */
if (double_lock_balance(rq, later_rq)) {
- if (unlikely(task_rq(task) != rq ||
+ /*
+ * double_lock_balance had to release rq->lock, in the
+ * meantime, task may no longer be fit to be migrated.
+ * Check the following to ensure that the task is
+ * still suitable for migration:
+ * 1. It is possible the task was scheduled,
+ * migrate_disabled was set and then got preempted,
+ * so we must check the task migration disable
+ * flag.
+ * 2. The CPU picked is in the task's affinity.
+ * 3. For throttled task (dl_task_offline_migration),
+ * check the following:
+ * - the task is not on the rq anymore (it was
+ * migrated)
+ * - the task is not on CPU anymore
+ * - the task is still a dl task
+ * - the task is not queued on the rq anymore
+ * 4. For the non-throttled task (push_dl_task), the
+ * check to ensure that this task is still at the
+ * head of the pushable tasks list is enough.
+ */
+ if (unlikely(is_migration_disabled(task) ||
!cpumask_test_cpu(later_rq->cpu, &task->cpus_mask) ||
- task_on_cpu(rq, task) ||
- !dl_task(task) ||
- is_migration_disabled(task) ||
- !task_on_rq_queued(task))) {
+ (task->dl.dl_throttled &&
+ (task_rq(task) != rq ||
+ task_on_cpu(rq, task) ||
+ !dl_task(task) ||
+ !task_on_rq_queued(task))) ||
+ (!task->dl.dl_throttled &&
+ task != pick_next_pushable_dl_task(rq)))) {
+
double_unlock_balance(rq, later_rq);
later_rq = NULL;
break;
@@ -2676,25 +2720,6 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
return later_rq;
}
-static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
-{
- struct task_struct *p;
-
- if (!has_pushable_dl_tasks(rq))
- return NULL;
-
- p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root));
-
- WARN_ON_ONCE(rq->cpu != task_cpu(p));
- WARN_ON_ONCE(task_current(rq, p));
- WARN_ON_ONCE(p->nr_cpus_allowed <= 1);
-
- WARN_ON_ONCE(!task_on_rq_queued(p));
- WARN_ON_ONCE(!dl_task(p));
-
- return p;
-}
-
/*
* See if the non running -deadline tasks on this rq
* can be sent to some other CPU where they can preempt
--
2.49.0.111.g5b97a56fa0
A recent optimization change in LLVM [1] aims to transform certain loop
idioms into calls to strlen() or wcslen(). This change transforms the
first while loop in UniStrcat() into a call to wcslen(), breaking the
build when UniStrcat() gets inlined into alloc_path_with_tree_prefix():
ld.lld: error: undefined symbol: wcslen
>>> referenced by nls_ucs2_utils.h:54 (fs/smb/client/../../nls/nls_ucs2_utils.h:54)
>>> vmlinux.o:(alloc_path_with_tree_prefix)
>>> referenced by nls_ucs2_utils.h:54 (fs/smb/client/../../nls/nls_ucs2_utils.h:54)
>>> vmlinux.o:(alloc_path_with_tree_prefix)
Disable this optimization with '-fno-builtin-wcslen', which prevents the
compiler from assuming that wcslen() is available in the kernel's C
library
Cc: stable(a)vger.kernel.org
Link: https://github.com/llvm/llvm-project/commit/9694844d7e36fd5e01011ab56b64f27… [1]
Signed-off-by: Nathan Chancellor <nathan(a)kernel.org>
---
Makefile | 3 +++
1 file changed, 3 insertions(+)
diff --git a/Makefile b/Makefile
index 38689a0c3605..f42418556507 100644
--- a/Makefile
+++ b/Makefile
@@ -1068,6 +1068,9 @@ ifdef CONFIG_CC_IS_GCC
KBUILD_CFLAGS += -fconserve-stack
endif
+# Ensure compilers do not transform certain loops into calls to wcslen()
+KBUILD_CFLAGS += -fno-builtin-wcslen
+
# change __FILE__ to the relative path to the source directory
ifdef building_out_of_srctree
KBUILD_CPPFLAGS += $(call cc-option,-ffile-prefix-map=$(srcroot)/=)
---
base-commit: 0af2f6be1b4281385b618cb86ad946eded089ac8
change-id: 20250407-fno-builtin-wcslen-90a858ae7d54
Best regards,
--
Nathan Chancellor <nathan(a)kernel.org>
Recently, during a debugging session using local MPTCP connections, I
noticed MPJoinAckHMacFailure was strangely not zero on the server side.
The first patch fixes this issue -- present since v5.9 -- and the second
one validates it in the selftests.
Signed-off-by: Matthieu Baerts (NGI0) <matttbe(a)kernel.org>
---
Matthieu Baerts (NGI0) (2):
mptcp: only inc MPJoinAckHMacFailure for HMAC failures
selftests: mptcp: validate MPJoin HMacFailure counters
net/mptcp/subflow.c | 8 ++++++--
tools/testing/selftests/net/mptcp/mptcp_join.sh | 18 ++++++++++++++++++
2 files changed, 24 insertions(+), 2 deletions(-)
---
base-commit: 61f96e684edd28ca40555ec49ea1555df31ba619
change-id: 20250407-net-mptcp-hmac-failure-mib-66f599305ff3
Best regards,
--
Matthieu Baerts (NGI0) <matttbe(a)kernel.org>
The patch titled
Subject: mm: protect kernel pgtables in apply_to_pte_range()
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-protect-kernel-pgtables-in-apply_to_pte_range.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Alexander Gordeev <agordeev(a)linux.ibm.com>
Subject: mm: protect kernel pgtables in apply_to_pte_range()
Date: Tue, 8 Apr 2025 18:07:32 +0200
The lazy MMU mode can only be entered and left under the protection of the
page table locks for all page tables which may be modified. Yet, when it
comes to kernel mappings apply_to_pte_range() does not take any locks.
That does not conform arch_enter|leave_lazy_mmu_mode() semantics and could
potentially lead to re-schedulling a process while in lazy MMU mode or
racing on a kernel page table updates.
Link: https://lkml.kernel.org/r/ef8f6538b83b7fc3372602f90375348f9b4f3596.17441281…
Fixes: 38e0edb15bd0 ("mm/apply_to_range: call pte function with lazy updates")
Signed-off-by: Alexander Gordeev <agordeev(a)linux.ibm.com>
Cc: <stable(a)vger.kernel.org>
Cc: Andrey Ryabinin <ryabinin.a.a(a)gmail.com>
Cc: Guenetr Roeck <linux(a)roeck-us.net>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Jeremy Fitzhardinge <jeremy(a)goop.org>
Cc: Juegren Gross <jgross(a)suse.com>
Cc: Nicholas Piggin <npiggin(a)gmail.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/kasan/shadow.c | 7 ++-----
mm/memory.c | 5 ++++-
2 files changed, 6 insertions(+), 6 deletions(-)
--- a/mm/kasan/shadow.c~mm-protect-kernel-pgtables-in-apply_to_pte_range
+++ a/mm/kasan/shadow.c
@@ -308,14 +308,14 @@ static int kasan_populate_vmalloc_pte(pt
__memset((void *)page, KASAN_VMALLOC_INVALID, PAGE_SIZE);
pte = pfn_pte(PFN_DOWN(__pa(page)), PAGE_KERNEL);
- spin_lock(&init_mm.page_table_lock);
if (likely(pte_none(ptep_get(ptep)))) {
set_pte_at(&init_mm, addr, ptep, pte);
page = 0;
}
- spin_unlock(&init_mm.page_table_lock);
+
if (page)
free_page(page);
+
return 0;
}
@@ -401,13 +401,10 @@ static int kasan_depopulate_vmalloc_pte(
page = (unsigned long)__va(pte_pfn(ptep_get(ptep)) << PAGE_SHIFT);
- spin_lock(&init_mm.page_table_lock);
-
if (likely(!pte_none(ptep_get(ptep)))) {
pte_clear(&init_mm, addr, ptep);
free_page(page);
}
- spin_unlock(&init_mm.page_table_lock);
return 0;
}
--- a/mm/memory.c~mm-protect-kernel-pgtables-in-apply_to_pte_range
+++ a/mm/memory.c
@@ -2926,6 +2926,7 @@ static int apply_to_pte_range(struct mm_
pte = pte_offset_kernel(pmd, addr);
if (!pte)
return err;
+ spin_lock(&init_mm.page_table_lock);
} else {
if (create)
pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
@@ -2951,7 +2952,9 @@ static int apply_to_pte_range(struct mm_
arch_leave_lazy_mmu_mode();
- if (mm != &init_mm)
+ if (mm == &init_mm)
+ spin_unlock(&init_mm.page_table_lock);
+ else
pte_unmap_unlock(mapped_pte, ptl);
*mask |= PGTBL_PTE_MODIFIED;
_
Patches currently in -mm which might be from agordeev(a)linux.ibm.com are
kasan-avoid-sleepable-page-allocation-from-atomic-context.patch
mm-cleanup-apply_to_pte_range-routine.patch
mm-protect-kernel-pgtables-in-apply_to_pte_range.patch
The patch titled
Subject: mm: clean up apply_to_pte_range()
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-cleanup-apply_to_pte_range-routine.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Alexander Gordeev <agordeev(a)linux.ibm.com>
Subject: mm: clean up apply_to_pte_range()
Date: Tue, 8 Apr 2025 18:07:31 +0200
Reverse 'create' vs 'mm == &init_mm' conditions and move page table mask
modification out of the atomic context. This is a prerequisite for fixing
missing kernel page tables lock.
Link: https://lkml.kernel.org/r/0c65bc334f17ff1d7d92d31c69d7065769bbce4e.17441281…
Fixes: 38e0edb15bd0 ("mm/apply_to_range: call pte function with lazy updates")
Signed-off-by: Alexander Gordeev <agordeev(a)linux.ibm.com>
Cc: <stable(a)vger.kernel.org>
Cc: Andrey Ryabinin <ryabinin.a.a(a)gmail.com>
Cc: Guenetr Roeck <linux(a)roeck-us.net>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Jeremy Fitzhardinge <jeremy(a)goop.org>
Cc: Juegren Gross <jgross(a)suse.com>
Cc: Nicholas Piggin <npiggin(a)gmail.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/memory.c | 28 +++++++++++++++++-----------
1 file changed, 17 insertions(+), 11 deletions(-)
--- a/mm/memory.c~mm-cleanup-apply_to_pte_range-routine
+++ a/mm/memory.c
@@ -2915,24 +2915,28 @@ static int apply_to_pte_range(struct mm_
pte_fn_t fn, void *data, bool create,
pgtbl_mod_mask *mask)
{
+ int err = create ? -ENOMEM : -EINVAL;
pte_t *pte, *mapped_pte;
- int err = 0;
spinlock_t *ptl;
- if (create) {
- mapped_pte = pte = (mm == &init_mm) ?
- pte_alloc_kernel_track(pmd, addr, mask) :
- pte_alloc_map_lock(mm, pmd, addr, &ptl);
+ if (mm == &init_mm) {
+ if (create)
+ pte = pte_alloc_kernel_track(pmd, addr, mask);
+ else
+ pte = pte_offset_kernel(pmd, addr);
if (!pte)
- return -ENOMEM;
+ return err;
} else {
- mapped_pte = pte = (mm == &init_mm) ?
- pte_offset_kernel(pmd, addr) :
- pte_offset_map_lock(mm, pmd, addr, &ptl);
+ if (create)
+ pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
+ else
+ pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
if (!pte)
- return -EINVAL;
+ return err;
+ mapped_pte = pte;
}
+ err = 0;
arch_enter_lazy_mmu_mode();
if (fn) {
@@ -2944,12 +2948,14 @@ static int apply_to_pte_range(struct mm_
}
} while (addr += PAGE_SIZE, addr != end);
}
- *mask |= PGTBL_PTE_MODIFIED;
arch_leave_lazy_mmu_mode();
if (mm != &init_mm)
pte_unmap_unlock(mapped_pte, ptl);
+
+ *mask |= PGTBL_PTE_MODIFIED;
+
return err;
}
_
Patches currently in -mm which might be from agordeev(a)linux.ibm.com are
kasan-avoid-sleepable-page-allocation-from-atomic-context.patch
mm-cleanup-apply_to_pte_range-routine.patch
mm-protect-kernel-pgtables-in-apply_to_pte_range.patch
The patch titled
Subject: kasan: avoid sleepable page allocation from atomic context
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
kasan-avoid-sleepable-page-allocation-from-atomic-context.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Alexander Gordeev <agordeev(a)linux.ibm.com>
Subject: kasan: avoid sleepable page allocation from atomic context
Date: Tue, 8 Apr 2025 18:07:30 +0200
Patch series "mm: Fix apply_to_pte_range() vs lazy MMU mode", v2.
This series is an attempt to fix the violation of lazy MMU mode context
requirement as described for arch_enter_lazy_mmu_mode():
This mode can only be entered and left under the protection of
the page table locks for all page tables which may be modified.
On s390 if I make arch_enter_lazy_mmu_mode() -> preempt_enable() and
arch_leave_lazy_mmu_mode() -> preempt_disable() I am getting this:
[ 553.332108] preempt_count: 1, expected: 0
[ 553.332117] no locks held by multipathd/2116.
[ 553.332128] CPU: 24 PID: 2116 Comm: multipathd Kdump: loaded Tainted:
[ 553.332139] Hardware name: IBM 3931 A01 701 (LPAR)
[ 553.332146] Call Trace:
[ 553.332152] [<00000000158de23a>] dump_stack_lvl+0xfa/0x150
[ 553.332167] [<0000000013e10d12>] __might_resched+0x57a/0x5e8
[ 553.332178] [<00000000144eb6c2>] __alloc_pages+0x2ba/0x7c0
[ 553.332189] [<00000000144d5cdc>] __get_free_pages+0x2c/0x88
[ 553.332198] [<00000000145663f6>] kasan_populate_vmalloc_pte+0x4e/0x110
[ 553.332207] [<000000001447625c>] apply_to_pte_range+0x164/0x3c8
[ 553.332218] [<000000001448125a>] apply_to_pmd_range+0xda/0x318
[ 553.332226] [<000000001448181c>] __apply_to_page_range+0x384/0x768
[ 553.332233] [<0000000014481c28>] apply_to_page_range+0x28/0x38
[ 553.332241] [<00000000145665da>] kasan_populate_vmalloc+0x82/0x98
[ 553.332249] [<00000000144c88d0>] alloc_vmap_area+0x590/0x1c90
[ 553.332257] [<00000000144ca108>] __get_vm_area_node.constprop.0+0x138/0x260
[ 553.332265] [<00000000144d17fc>] __vmalloc_node_range+0x134/0x360
[ 553.332274] [<0000000013d5dbf2>] alloc_thread_stack_node+0x112/0x378
[ 553.332284] [<0000000013d62726>] dup_task_struct+0x66/0x430
[ 553.332293] [<0000000013d63962>] copy_process+0x432/0x4b80
[ 553.332302] [<0000000013d68300>] kernel_clone+0xf0/0x7d0
[ 553.332311] [<0000000013d68bd6>] __do_sys_clone+0xae/0xc8
[ 553.332400] [<0000000013d68dee>] __s390x_sys_clone+0xd6/0x118
[ 553.332410] [<0000000013c9d34c>] do_syscall+0x22c/0x328
[ 553.332419] [<00000000158e7366>] __do_syscall+0xce/0xf0
[ 553.332428] [<0000000015913260>] system_call+0x70/0x98
This exposes a KASAN issue fixed with patch 1 and apply_to_pte_range()
issue fixed with patch 3, while patch 2 is a prerequisite.
Commit b9ef323ea168 ("powerpc/64s: Disable preemption in hash lazy mmu
mode") looks like powerpc-only fix, yet not entirely conforming to the
above provided requirement (page tables itself are still not protected).
If I am not mistaken, xen and sparc are alike.
This patch (of 3):
apply_to_page_range() enters lazy MMU mode and then invokes
kasan_populate_vmalloc_pte() callback on each page table walk iteration.
The lazy MMU mode may only be entered only under protection of the page
table lock. However, the callback can go into sleep when trying to
allocate a single page.
Change __get_free_page() allocation mode from GFP_KERNEL to GFP_ATOMIC to
avoid scheduling out while in atomic context.
Link: https://lkml.kernel.org/r/cover.1744128123.git.agordeev@linux.ibm.com
Link: https://lkml.kernel.org/r/2d9f4ac4528701b59d511a379a60107fa608ad30.17441281…
Fixes: 3c5c3cfb9ef4 ("kasan: support backing vmalloc space with real shadow memory")
Signed-off-by: Alexander Gordeev <agordeev(a)linux.ibm.com>
Cc: Andrey Ryabinin <ryabinin.a.a(a)gmail.com>
Cc: Guenetr Roeck <linux(a)roeck-us.net>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Jeremy Fitzhardinge <jeremy(a)goop.org>
Cc: Juegren Gross <jgross(a)suse.com>
Cc: Nicholas Piggin <npiggin(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/kasan/shadow.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/mm/kasan/shadow.c~kasan-avoid-sleepable-page-allocation-from-atomic-context
+++ a/mm/kasan/shadow.c
@@ -301,7 +301,7 @@ static int kasan_populate_vmalloc_pte(pt
if (likely(!pte_none(ptep_get(ptep))))
return 0;
- page = __get_free_page(GFP_KERNEL);
+ page = __get_free_page(GFP_ATOMIC);
if (!page)
return -ENOMEM;
_
Patches currently in -mm which might be from agordeev(a)linux.ibm.com are
kasan-avoid-sleepable-page-allocation-from-atomic-context.patch
mm-cleanup-apply_to_pte_range-routine.patch
mm-protect-kernel-pgtables-in-apply_to_pte_range.patch
Hi folks,
This series fixes support for correctly saving and restoring fltcon0
and fltcon1 registers on gs101 for non-alive banks where the fltcon
register offset is not at a fixed offset (unlike previous SoCs).
This is done by adding a eint_fltcon_offset and providing GS101
specific pin macros that take an additional parameter (similar to
how exynosautov920 handles it's eint_con_offset).
Additionally the SoC specific suspend and resume callbacks are
re-factored so that each SoC variant has it's own callback containing
the peculiarities for that SoC.
Finally support for filter selection on alive banks is added, this is
currently only enabled for gs101. The code path can be excercised using
`echo mem > /sys/power/state`
regards,
Peter
To: Krzysztof Kozlowski <krzk(a)kernel.org>
To: Sylwester Nawrocki <s.nawrocki(a)samsung.com>
To: Alim Akhtar <alim.akhtar(a)samsung.com>
To: Linus Walleij <linus.walleij(a)linaro.org>
Cc: linux-arm-kernel(a)lists.infradead.org
Cc: linux-samsung-soc(a)vger.kernel.org
Cc: linux-gpio(a)vger.kernel.org
Cc: linux-kernel(a)vger.kernel.org
Cc: andre.draszik(a)linaro.org
Cc: tudor.ambarus(a)linaro.org
Cc: willmcvicker(a)google.com
Cc: semen.protsenko(a)linaro.org
Cc: kernel-team(a)android.com
Cc: jaewon02.kim(a)samsung.com
Signed-off-by: Peter Griffin <peter.griffin(a)linaro.org>
---
Changes in v6:
- Make drvdata->suspend/resume symmetrically reversed (Krzysztof)
- rebase on linux-next
- Link to v5: https://lore.kernel.org/r/20250312-pinctrl-fltcon-suspend-v5-0-d98d5b271242…
Changes in v5:
- Split drvdata suspend & resume callbacks into a dedicated patch (Krzysztof)
- Add comment about stable dependency (Krzysztof)
- Add back in {} braces (Krzysztof)
- Link to v4: https://lore.kernel.org/r/20250307-pinctrl-fltcon-suspend-v4-0-2d775e486036…
Changes in v4:
- save->eint_fltcon1 is an argument to pr_debug(), not readl() change alignment accordingly (Andre)
- Link to v3: https://lore.kernel.org/r/20250306-pinctrl-fltcon-suspend-v3-0-f9ab4ff6a24e…
Changes in v3:
- Ensure EXYNOS_FLTCON_DIGITAL bit is cleared (Andre)
- Make it obvious that exynos_eint_set_filter() is conditional on bank type (Andre)
- Make it obvious exynos_set_wakeup() is conditional on bank type (Andre)
- Align style where the '+' is placed first (Andre)
- Remove unnecessary braces (Andre)
- Link to v2: https://lore.kernel.org/r/20250301-pinctrl-fltcon-suspend-v2-0-a7eef9bb443b…
Changes in v2:
- Remove eint_flt_selectable bool as it can be deduced from EINT_TYPE_WKUP (Peter)
- Move filter config register comment to header file (Andre)
- Rename EXYNOS_FLTCON_DELAY to EXYNOS_FLTCON_ANALOG (Andre)
- Remove misleading old comment (Andre)
- Refactor exynos_eint_update_flt_reg() into a loop (Andre)
- Split refactor of suspend/resume callbacks & gs101 parts into separate patches (Andre)
- Link to v1: https://lore.kernel.org/r/20250120-pinctrl-fltcon-suspend-v1-0-e77900b2a854…
---
Peter Griffin (4):
pinctrl: samsung: refactor drvdata suspend & resume callbacks
pinctrl: samsung: add dedicated SoC eint suspend/resume callbacks
pinctrl: samsung: add gs101 specific eint suspend/resume callbacks
pinctrl: samsung: Add filter selection support for alive bank on gs101
drivers/pinctrl/samsung/pinctrl-exynos-arm64.c | 52 ++---
drivers/pinctrl/samsung/pinctrl-exynos.c | 294 +++++++++++++++----------
drivers/pinctrl/samsung/pinctrl-exynos.h | 28 ++-
drivers/pinctrl/samsung/pinctrl-samsung.c | 21 +-
drivers/pinctrl/samsung/pinctrl-samsung.h | 8 +-
5 files changed, 252 insertions(+), 151 deletions(-)
---
base-commit: cd37a617b4bfb43f84dbbf8058317b487f5203ae
change-id: 20250120-pinctrl-fltcon-suspend-2333a137c4d4
Best regards,
--
Peter Griffin <peter.griffin(a)linaro.org>
Overview
========
When a CPU chooses to call push_rt_task and picks a task to push to
another CPU's runqueue then it will call find_lock_lowest_rq method
which would take a double lock on both CPUs' runqueues. If one of the
locks aren't readily available, it may lead to dropping the current
runqueue lock and reacquiring both the locks at once. During this window
it is possible that the task is already migrated and is running on some
other CPU. These cases are already handled. However, if the task is
migrated and has already been executed and another CPU is now trying to
wake it up (ttwu) such that it is queued again on the runqeue
(on_rq is 1) and also if the task was run by the same CPU, then the
current checks will pass even though the task was migrated out and is no
longer in the pushable tasks list.
Crashes
=======
This bug resulted in quite a few flavors of crashes triggering kernel
panics with various crash signatures such as assert failures, page
faults, null pointer dereferences, and queue corruption errors all
coming from scheduler itself.
Some of the crashes:
-> kernel BUG at kernel/sched/rt.c:1616! BUG_ON(idx >= MAX_RT_PRIO)
Call Trace:
? __die_body+0x1a/0x60
? die+0x2a/0x50
? do_trap+0x85/0x100
? pick_next_task_rt+0x6e/0x1d0
? do_error_trap+0x64/0xa0
? pick_next_task_rt+0x6e/0x1d0
? exc_invalid_op+0x4c/0x60
? pick_next_task_rt+0x6e/0x1d0
? asm_exc_invalid_op+0x12/0x20
? pick_next_task_rt+0x6e/0x1d0
__schedule+0x5cb/0x790
? update_ts_time_stats+0x55/0x70
schedule_idle+0x1e/0x40
do_idle+0x15e/0x200
cpu_startup_entry+0x19/0x20
start_secondary+0x117/0x160
secondary_startup_64_no_verify+0xb0/0xbb
-> BUG: kernel NULL pointer dereference, address: 00000000000000c0
Call Trace:
? __die_body+0x1a/0x60
? no_context+0x183/0x350
? __warn+0x8a/0xe0
? exc_page_fault+0x3d6/0x520
? asm_exc_page_fault+0x1e/0x30
? pick_next_task_rt+0xb5/0x1d0
? pick_next_task_rt+0x8c/0x1d0
__schedule+0x583/0x7e0
? update_ts_time_stats+0x55/0x70
schedule_idle+0x1e/0x40
do_idle+0x15e/0x200
cpu_startup_entry+0x19/0x20
start_secondary+0x117/0x160
secondary_startup_64_no_verify+0xb0/0xbb
-> BUG: unable to handle page fault for address: ffff9464daea5900
kernel BUG at kernel/sched/rt.c:1861! BUG_ON(rq->cpu != task_cpu(p))
-> kernel BUG at kernel/sched/rt.c:1055! BUG_ON(!rq->nr_running)
Call Trace:
? __die_body+0x1a/0x60
? die+0x2a/0x50
? do_trap+0x85/0x100
? dequeue_top_rt_rq+0xa2/0xb0
? do_error_trap+0x64/0xa0
? dequeue_top_rt_rq+0xa2/0xb0
? exc_invalid_op+0x4c/0x60
? dequeue_top_rt_rq+0xa2/0xb0
? asm_exc_invalid_op+0x12/0x20
? dequeue_top_rt_rq+0xa2/0xb0
dequeue_rt_entity+0x1f/0x70
dequeue_task_rt+0x2d/0x70
__schedule+0x1a8/0x7e0
? blk_finish_plug+0x25/0x40
schedule+0x3c/0xb0
futex_wait_queue_me+0xb6/0x120
futex_wait+0xd9/0x240
do_futex+0x344/0xa90
? get_mm_exe_file+0x30/0x60
? audit_exe_compare+0x58/0x70
? audit_filter_rules.constprop.26+0x65e/0x1220
__x64_sys_futex+0x148/0x1f0
do_syscall_64+0x30/0x80
entry_SYSCALL_64_after_hwframe+0x62/0xc7
-> BUG: unable to handle page fault for address: ffff8cf3608bc2c0
Call Trace:
? __die_body+0x1a/0x60
? no_context+0x183/0x350
? spurious_kernel_fault+0x171/0x1c0
? exc_page_fault+0x3b6/0x520
? plist_check_list+0x15/0x40
? plist_check_list+0x2e/0x40
? asm_exc_page_fault+0x1e/0x30
? _cond_resched+0x15/0x30
? futex_wait_queue_me+0xc8/0x120
? futex_wait+0xd9/0x240
? try_to_wake_up+0x1b8/0x490
? futex_wake+0x78/0x160
? do_futex+0xcd/0xa90
? plist_check_list+0x15/0x40
? plist_check_list+0x2e/0x40
? plist_del+0x6a/0xd0
? plist_check_list+0x15/0x40
? plist_check_list+0x2e/0x40
? dequeue_pushable_task+0x20/0x70
? __schedule+0x382/0x7e0
? asm_sysvec_reschedule_ipi+0xa/0x20
? schedule+0x3c/0xb0
? exit_to_user_mode_prepare+0x9e/0x150
? irqentry_exit_to_user_mode+0x5/0x30
? asm_sysvec_reschedule_ipi+0x12/0x20
Above are some of the common examples of the crashes that were observed
due to this issue.
Details
=======
Let's look at the following scenario to understand this race.
1) CPU A enters push_rt_task
a) CPU A has chosen next_task = task p.
b) CPU A calls find_lock_lowest_rq(Task p, CPU Z’s rq).
c) CPU A identifies CPU X as a destination CPU (X < Z).
d) CPU A enters double_lock_balance(CPU Z’s rq, CPU X’s rq).
e) Since X is lower than Z, CPU A unlocks CPU Z’s rq. Someone else has
locked CPU X’s rq, and thus, CPU A must wait.
2) At CPU Z
a) Previous task has completed execution and thus, CPU Z enters
schedule, locks its own rq after CPU A releases it.
b) CPU Z dequeues previous task and begins executing task p.
c) CPU Z unlocks its rq.
d) Task p yields the CPU (ex. by doing IO or waiting to acquire a
lock) which triggers the schedule function on CPU Z.
e) CPU Z enters schedule again, locks its own rq, and dequeues task p.
f) As part of dequeue, it sets p.on_rq = 0 and unlocks its rq.
3) At CPU B
a) CPU B enters try_to_wake_up with input task p.
b) Since CPU Z dequeued task p, p.on_rq = 0, and CPU B updates
B.state = WAKING.
c) CPU B via select_task_rq determines CPU Y as the target CPU.
4) The race
a) CPU A acquires CPU X’s lock and relocks CPU Z.
b) CPU A reads task p.cpu = Z and incorrectly concludes task p is
still on CPU Z.
c) CPU A failed to notice task p had been dequeued from CPU Z while
CPU A was waiting for locks in double_lock_balance. If CPU A knew
that task p had been dequeued, it would return NULL forcing
push_rt_task to give up the task p's migration.
d) CPU B updates task p.cpu = Y and calls ttwu_queue.
e) CPU B locks Ys rq. CPU B enqueues task p onto Y and sets task
p.on_rq = 1.
f) CPU B unlocks CPU Y, triggering memory synchronization.
g) CPU A reads task p.on_rq = 1, cementing its assumption that task p
has not migrated.
h) CPU A decides to migrate p to CPU X.
This leads to A dequeuing p from Y's queue and various crashes down the
line.
Solution
========
The solution here is fairly simple. After obtaining the lock (at 4a),
the check is enhanced to make sure that the task is still at the head of
the pushable tasks list. If not, then it is anyway not suitable for
being pushed out.
Testing
=======
The fix is tested on a cluster of 3 nodes, where the panics due to this
are hit every couple of days. A fix similar to this was deployed on such
cluster and was stable for more than 30 days.
Co-developed-by: Jon Kohler <jon(a)nutanix.com>
Signed-off-by: Jon Kohler <jon(a)nutanix.com>
Co-developed-by: Gauri Patwardhan <gauri.patwardhan(a)nutanix.com>
Signed-off-by: Gauri Patwardhan <gauri.patwardhan(a)nutanix.com>
Co-developed-by: Rahul Chunduru <rahul.chunduru(a)nutanix.com>
Signed-off-by: Rahul Chunduru <rahul.chunduru(a)nutanix.com>
Signed-off-by: Harshit Agarwal <harshit(a)nutanix.com>
Tested-by: Will Ton <william.ton(a)nutanix.com>
Reviewed-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
Cc: stable(a)vger.kernel.org
---
Changes in v2:
- As per Steve's suggestion, removed some checks that are done after
obtaining the lock that are no longer needed with the addition of new
check.
- Moved up is_migration_disabled check.
- Link to v1:
https://lore.kernel.org/lkml/20250211054646.23987-1-harshit@nutanix.com/
Changes in v3:
- Updated commit message to add stable maintainers and reviewed-by tag.
- Link to v2:
https://lore.kernel.org/lkml/20250214170844.201692-1-harshit@nutanix.com/
---
kernel/sched/rt.c | 54 +++++++++++++++++++++++------------------------
1 file changed, 26 insertions(+), 28 deletions(-)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 4b8e33c615b1..4762dd3f50c5 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1885,6 +1885,27 @@ static int find_lowest_rq(struct task_struct *task)
return -1;
}
+static struct task_struct *pick_next_pushable_task(struct rq *rq)
+{
+ struct task_struct *p;
+
+ if (!has_pushable_tasks(rq))
+ return NULL;
+
+ p = plist_first_entry(&rq->rt.pushable_tasks,
+ struct task_struct, pushable_tasks);
+
+ BUG_ON(rq->cpu != task_cpu(p));
+ BUG_ON(task_current(rq, p));
+ BUG_ON(task_current_donor(rq, p));
+ BUG_ON(p->nr_cpus_allowed <= 1);
+
+ BUG_ON(!task_on_rq_queued(p));
+ BUG_ON(!rt_task(p));
+
+ return p;
+}
+
/* Will lock the rq it finds */
static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
{
@@ -1915,18 +1936,16 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
/*
* We had to unlock the run queue. In
* the mean time, task could have
- * migrated already or had its affinity changed.
- * Also make sure that it wasn't scheduled on its rq.
+ * migrated already or had its affinity changed,
+ * therefore check if the task is still at the
+ * head of the pushable tasks list.
* It is possible the task was scheduled, set
* "migrate_disabled" and then got preempted, so we must
* check the task migration disable flag here too.
*/
- if (unlikely(task_rq(task) != rq ||
+ if (unlikely(is_migration_disabled(task) ||
!cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) ||
- task_on_cpu(rq, task) ||
- !rt_task(task) ||
- is_migration_disabled(task) ||
- !task_on_rq_queued(task))) {
+ task != pick_next_pushable_task(rq))) {
double_unlock_balance(rq, lowest_rq);
lowest_rq = NULL;
@@ -1946,27 +1965,6 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
return lowest_rq;
}
-static struct task_struct *pick_next_pushable_task(struct rq *rq)
-{
- struct task_struct *p;
-
- if (!has_pushable_tasks(rq))
- return NULL;
-
- p = plist_first_entry(&rq->rt.pushable_tasks,
- struct task_struct, pushable_tasks);
-
- BUG_ON(rq->cpu != task_cpu(p));
- BUG_ON(task_current(rq, p));
- BUG_ON(task_current_donor(rq, p));
- BUG_ON(p->nr_cpus_allowed <= 1);
-
- BUG_ON(!task_on_rq_queued(p));
- BUG_ON(!rt_task(p));
-
- return p;
-}
-
/*
* If the current CPU has more than one RT task, see if the non
* running task can migrate over to a CPU that is running a task
--
2.22.3
Replace kzalloc with kvzalloc for the exit_dump buffer allocation, which
can require large contiguous memory depending on the implementation.
This change prevents allocation failures by allowing the system to fall
back to vmalloc when contiguous memory allocation fails.
Since this buffer is only used for debugging purposes, physical memory
contiguity is not required, making vmalloc a suitable alternative.
Cc: stable(a)vger.kernel.org
Fixes: 07814a9439a3b0 ("sched_ext: Print debug dump after an error exit")
Suggested-by: Rik van Riel <riel(a)surriel.com>
Signed-off-by: Breno Leitao <leitao(a)debian.org>
Acked-by: Andrea Righi <arighi(a)nvidia.com>
---
Changes in v3:
- Rewording the patch message
- Link to v2: https://lore.kernel.org/r/20250408-scx-v2-1-1979fc040903@debian.org
Changes in v2:
- Use kvfree() on the free path as well.
- Link to v1: https://lore.kernel.org/r/20250407-scx-v1-1-774ba74a2c17@debian.org
---
kernel/sched/ext.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 66bcd40a28ca1..db9af6a3c04fd 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4623,7 +4623,7 @@ static void scx_ops_bypass(bool bypass)
static void free_exit_info(struct scx_exit_info *ei)
{
- kfree(ei->dump);
+ kvfree(ei->dump);
kfree(ei->msg);
kfree(ei->bt);
kfree(ei);
@@ -4639,7 +4639,7 @@ static struct scx_exit_info *alloc_exit_info(size_t exit_dump_len)
ei->bt = kcalloc(SCX_EXIT_BT_LEN, sizeof(ei->bt[0]), GFP_KERNEL);
ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL);
- ei->dump = kzalloc(exit_dump_len, GFP_KERNEL);
+ ei->dump = kvzalloc(exit_dump_len, GFP_KERNEL);
if (!ei->bt || !ei->msg || !ei->dump) {
free_exit_info(ei);
---
base-commit: 0af2f6be1b4281385b618cb86ad946eded089ac8
change-id: 20250407-scx-11dbf94803c3
Best regards,
--
Breno Leitao <leitao(a)debian.org>
This fix is the deadline version of the change made to the rt scheduler
titled: "sched/rt: Fix race in push_rt_task".
Here is the summary of the issue:
When a CPU chooses to call push_dl_task and picks a task to push to
another CPU's runqueue then it will call find_lock_later_rq method
which would take a double lock on both CPUs' runqueues. If one of the
locks aren't readily available, it may lead to dropping the current
runqueue lock and reacquiring both the locks at once. During this window
it is possible that the task is already migrated and is running on some
other CPU. These cases are already handled. However, if the task is
migrated and has already been executed and another CPU is now trying to
wake it up (ttwu) such that it is queued again on the runqeue
(on_rq is 1) and also if the task was run by the same CPU, then the
current checks will pass even though the task was migrated out and is no
longer in the pushable tasks list.
Please go through the original change for more details on the issue.
In this fix, after the lock is obtained inside the find_lock_later_rq we
ensure that the task is still at the head of pushable tasks list. Also
removed some checks that are no longer needed with the addition this new
check.
However, the check of pushable tasks list only applies when
find_lock_later_rq is called by push_dl_task. For the other caller i.e.
dl_task_offline_migration, we use the existing checks.
Signed-off-by: Harshit Agarwal <harshit(a)nutanix.com>
Cc: stable(a)vger.kernel.org
---
Changes in v2:
- As per Juri's suggestion, moved the check inside find_lock_later_rq
similar to rt change. Here we distinguish among the push_dl_task
caller vs dl_task_offline_migration by checking if the task is
throttled or not.
- Fixed the commit message to refer to the rt change by title.
- Link to v1:
https://lore.kernel.org/lkml/20250307204255.60640-1-harshit@nutanix.com/
---
kernel/sched/deadline.c | 66 ++++++++++++++++++++++++++---------------
1 file changed, 42 insertions(+), 24 deletions(-)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 38e4537790af..2366801b4557 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2621,6 +2621,25 @@ static int find_later_rq(struct task_struct *task)
return -1;
}
+static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
+{
+ struct task_struct *p;
+
+ if (!has_pushable_dl_tasks(rq))
+ return NULL;
+
+ p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root));
+
+ WARN_ON_ONCE(rq->cpu != task_cpu(p));
+ WARN_ON_ONCE(task_current(rq, p));
+ WARN_ON_ONCE(p->nr_cpus_allowed <= 1);
+
+ WARN_ON_ONCE(!task_on_rq_queued(p));
+ WARN_ON_ONCE(!dl_task(p));
+
+ return p;
+}
+
/* Locks the rq it finds */
static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
{
@@ -2648,12 +2667,30 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
/* Retry if something changed. */
if (double_lock_balance(rq, later_rq)) {
- if (unlikely(task_rq(task) != rq ||
+ /*
+ * We had to unlock the run queue. In the meantime,
+ * task could have migrated already or had its affinity
+ * changed.
+ * It is possible the task was scheduled, set
+ * "migrate_disabled" and then got preempted, so we must
+ * check the task migration disable flag here too.
+ * For throttled task (dl_task_offline_migration), we
+ * check if the task is migrated to a different rq or
+ * is not a dl task anymore.
+ * For the non-throttled task (push_dl_task), the check
+ * to ensure that this task is still at the head of the
+ * pushable tasks list is enough.
+ */
+ if (unlikely(is_migration_disabled(task) ||
!cpumask_test_cpu(later_rq->cpu, &task->cpus_mask) ||
- task_on_cpu(rq, task) ||
- !dl_task(task) ||
- is_migration_disabled(task) ||
- !task_on_rq_queued(task))) {
+ (task->dl.dl_throttled &&
+ (task_rq(task) != rq ||
+ task_on_cpu(rq, task) ||
+ !dl_task(task) ||
+ !task_on_rq_queued(task))) ||
+ (!task->dl.dl_throttled &&
+ task != pick_next_pushable_dl_task(rq)))) {
+
double_unlock_balance(rq, later_rq);
later_rq = NULL;
break;
@@ -2676,25 +2713,6 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
return later_rq;
}
-static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
-{
- struct task_struct *p;
-
- if (!has_pushable_dl_tasks(rq))
- return NULL;
-
- p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root));
-
- WARN_ON_ONCE(rq->cpu != task_cpu(p));
- WARN_ON_ONCE(task_current(rq, p));
- WARN_ON_ONCE(p->nr_cpus_allowed <= 1);
-
- WARN_ON_ONCE(!task_on_rq_queued(p));
- WARN_ON_ONCE(!dl_task(p));
-
- return p;
-}
-
/*
* See if the non running -deadline tasks on this rq
* can be sent to some other CPU where they can preempt
--
2.39.3
Some peripheral subsystems request IRQ_TYPE_EDGE_BOTH interrupt type and
report request failures on LIOINTC. To avoid such failures we support to
set IRQ_TYPE_EDGE_BOTH type on LIOINTC, by setting LIOINTC_REG_INTC_EDGE
to true and keep LIOINTC_REG_INTC_POL as is.
Cc: stable(a)vger.kernel.org
Signed-off-by: Yinbo Zhu <zhuyinbo(a)loongson.cn>
Signed-off-by: Huacai Chen <chenhuacai(a)loongson.cn>
---
drivers/irqchip/irq-loongson-liointc.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/drivers/irqchip/irq-loongson-liointc.c b/drivers/irqchip/irq-loongson-liointc.c
index 2b1bd4a96665..c0c8ef8d27cf 100644
--- a/drivers/irqchip/irq-loongson-liointc.c
+++ b/drivers/irqchip/irq-loongson-liointc.c
@@ -128,6 +128,10 @@ static int liointc_set_type(struct irq_data *data, unsigned int type)
liointc_set_bit(gc, LIOINTC_REG_INTC_EDGE, mask, false);
liointc_set_bit(gc, LIOINTC_REG_INTC_POL, mask, true);
break;
+ case IRQ_TYPE_EDGE_BOTH:
+ liointc_set_bit(gc, LIOINTC_REG_INTC_EDGE, mask, true);
+ /* Requester need "both", keep LIOINTC_REG_INTC_POL as is */
+ break;
case IRQ_TYPE_EDGE_RISING:
liointc_set_bit(gc, LIOINTC_REG_INTC_EDGE, mask, true);
liointc_set_bit(gc, LIOINTC_REG_INTC_POL, mask, false);
--
2.47.1
tpm2_start_auth_session() does not mask TPM RC correctly from the callers:
[ 28.766528] tpm tpm0: A TPM error (2307) occurred start auth session
Process TPM RCs inside tpm2_start_auth_session(), and map them to POSIX
error codes.
Cc: stable(a)vger.kernel.org # v6.10+
Fixes: 699e3efd6c64 ("tpm: Add HMAC session start and end functions")
Reported-by: Herbert Xu <herbert(a)gondor.apana.org.au>
Closes: https://lore.kernel.org/linux-integrity/Z_NgdRHuTKP6JK--@gondor.apana.org.a…
Signed-off-by: Jarkko Sakkinen <jarkko(a)kernel.org>
---
drivers/char/tpm/tpm2-sessions.c | 34 ++++++++++++++++----------------
include/linux/tpm.h | 1 +
2 files changed, 18 insertions(+), 17 deletions(-)
diff --git a/drivers/char/tpm/tpm2-sessions.c b/drivers/char/tpm/tpm2-sessions.c
index 3f89635ba5e8..1ed23375e4cb 100644
--- a/drivers/char/tpm/tpm2-sessions.c
+++ b/drivers/char/tpm/tpm2-sessions.c
@@ -40,11 +40,6 @@
*
* These are the usage functions:
*
- * tpm2_start_auth_session() which allocates the opaque auth structure
- * and gets a session from the TPM. This must be called before
- * any of the following functions. The session is protected by a
- * session_key which is derived from a random salt value
- * encrypted to the NULL seed.
* tpm2_end_auth_session() kills the session and frees the resources.
* Under normal operation this function is done by
* tpm_buf_check_hmac_response(), so this is only to be used on
@@ -963,16 +958,13 @@ static int tpm2_load_null(struct tpm_chip *chip, u32 *null_key)
}
/**
- * tpm2_start_auth_session() - create a HMAC authentication session with the TPM
- * @chip: the TPM chip structure to create the session with
+ * tpm2_start_auth_session() - Create an a HMAC authentication session
+ * @chip: A TPM chip
*
- * This function loads the NULL seed from its saved context and starts
- * an authentication session on the null seed, fills in the
- * @chip->auth structure to contain all the session details necessary
- * for performing the HMAC, encrypt and decrypt operations and
- * returns. The NULL seed is flushed before this function returns.
+ * Loads the ephemeral key (null seed), and starts an HMAC authenticated
+ * session. The null seed is flushed before the return.
*
- * Return: zero on success or actual error encountered.
+ * Returns zero on success, or a POSIX error code.
*/
int tpm2_start_auth_session(struct tpm_chip *chip)
{
@@ -1024,11 +1016,19 @@ int tpm2_start_auth_session(struct tpm_chip *chip)
/* hash algorithm for session */
tpm_buf_append_u16(&buf, TPM_ALG_SHA256);
- rc = tpm_transmit_cmd(chip, &buf, 0, "start auth session");
+ rc = tpm_transmit_cmd(chip, &buf, 0, "StartAuthSession");
tpm2_flush_context(chip, null_key);
-
- if (rc == TPM2_RC_SUCCESS)
- rc = tpm2_parse_start_auth_session(auth, &buf);
+ switch (rc) {
+ case TPM2_RC_SUCCESS:
+ break;
+ case TPM2_RC_SESSION_MEMORY:
+ rc = -ENOMEM;
+ goto out;
+ default:
+ rc = -EFAULT;
+ goto out;
+ }
+ rc = tpm2_parse_start_auth_session(auth, &buf);
tpm_buf_destroy(&buf);
diff --git a/include/linux/tpm.h b/include/linux/tpm.h
index 6c3125300c00..c1d3d60b416f 100644
--- a/include/linux/tpm.h
+++ b/include/linux/tpm.h
@@ -257,6 +257,7 @@ enum tpm2_return_codes {
TPM2_RC_TESTING = 0x090A, /* RC_WARN */
TPM2_RC_REFERENCE_H0 = 0x0910,
TPM2_RC_RETRY = 0x0922,
+ TPM2_RC_SESSION_MEMORY = 0x0903,
};
enum tpm2_command_codes {
--
2.39.5
From: Wenlin Kang <wenlin.kang(a)windriver.com>
The selftest tpdir2 terminated with a 'Segmentation fault' during loading.
root@localhost:~# cd linux-kenel/tools/testing/selftests/arm64/abi && make
root@localhost:~/linux-kernel/tools/testing/selftests/arm64/abi# ./tpidr2
Segmentation fault
The cause of this is the __arch_clear_user() failure.
load_elf_binary() [fs/binfmt_elf.c]
-> if (likely(elf_bss != elf_brk) && unlikely(padzero(elf_bes)))
-> padzero()
-> clear_user() [arch/arm64/include/asm/uaccess.h]
-> __arch_clear_user() [arch/arm64/lib/clear_user.S]
For more details, please see:
https://lore.kernel.org/lkml/1d0342f3-0474-482b-b6db-81ca7820a462@t-8ch.de/…
This issue has been fixed in the mainline. Here I have backported
the relevant commits for the linux-6.6.y branch and attached them.
With these patches, tpdir2 works as:
root@localhost:~/linux-kernel/tools/testing/selftests/arm64/abi# ./tpidr2
TAP version 13
1..5
ok 0 skipped, TPIDR2 not supported
ok 1 skipped, TPIDR2 not supported
ok 2 skipped, TPIDR2 not supported
ok 3 skipped, TPIDR2 not supported
ok 4 skipped, TPIDR2 not supported
This issue is resolved by the first patch. However, to ensure
functional completeness, all related patches were backported
according to the following link.
https://lore.kernel.org/all/20230929031716.it.155-kees@kernel.org/#t
Eric W. Biederman (1):
binfmt_elf: Support segments with 0 filesz and misaligned starts
Kees Cook (5):
binfmt_elf: elf_bss no longer used by load_elf_binary()
binfmt_elf: Use elf_load() for interpreter
binfmt_elf: Use elf_load() for library
binfmt_elf: Only report padzero() errors when PROT_WRITE
mm: Remove unused vm_brk()
fs/binfmt_elf.c | 215 ++++++++++++++++-----------------------------
include/linux/mm.h | 3 +-
mm/mmap.c | 6 --
mm/nommu.c | 5 --
4 files changed, 76 insertions(+), 153 deletions(-)
--
2.43.0
Hello,
New build issue found on stable-rc/linux-5.15.y:
---
variable 'base_clk' is used uninitialized whenever 'if' condition is
true [-Werror,-Wsometimes-uninitialized] in
drivers/mmc/host/sdhci-brcmstb.o (drivers/mmc/host/sdhci-brcmstb.c)
[logspec:kbuild,kbuild.compiler.error]
---
- dashboard: https://d.kernelci.org/i/maestro:eb9b0da83cc077e6176b9903d98f0f78704ac17f
- giturl: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git
- commit HEAD: 0b4857306c618d2052f6455b90747ef1df364ecd
Log excerpt:
=====================================================
drivers/mmc/host/sdhci-brcmstb.c:303:6: error: variable 'base_clk' is
used uninitialized whenever 'if' condition is true
[-Werror,-Wsometimes-uninitialized]
303 | if (res)
| ^~~
drivers/mmc/host/sdhci-brcmstb.c:377:24: note: uninitialized use occurs here
377 | clk_disable_unprepare(base_clk);
| ^~~~~~~~
drivers/mmc/host/sdhci-brcmstb.c:303:2: note: remove the 'if' if its
condition is always false
303 | if (res)
| ^~~~~~~~
304 | goto err;
| ~~~~~~~~
drivers/mmc/host/sdhci-brcmstb.c:296:6: error: variable 'base_clk' is
used uninitialized whenever 'if' condition is true
[-Werror,-Wsometimes-uninitialized]
296 | if (IS_ERR(priv->cfg_regs)) {
| ^~~~~~~~~~~~~~~~~~~~~~
drivers/mmc/host/sdhci-brcmstb.c:377:24: note: uninitialized use occurs here
377 | clk_disable_unprepare(base_clk);
| ^~~~~~~~
drivers/mmc/host/sdhci-brcmstb.c:296:2: note: remove the 'if' if its
condition is always false
296 | if (IS_ERR(priv->cfg_regs)) {
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
297 | res = PTR_ERR(priv->cfg_regs);
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
298 | goto err;
| ~~~~~~~~~
299 | }
| ~
drivers/mmc/host/sdhci-brcmstb.c:281:6: error: variable 'base_clk' is
used uninitialized whenever 'if' condition is true
[-Werror,-Wsometimes-uninitialized]
281 | if (IS_ERR(host)) {
| ^~~~~~~~~~~~
drivers/mmc/host/sdhci-brcmstb.c:377:24: note: uninitialized use occurs here
377 | clk_disable_unprepare(base_clk);
| ^~~~~~~~
drivers/mmc/host/sdhci-brcmstb.c:281:2: note: remove the 'if' if its
condition is always false
281 | if (IS_ERR(host)) {
| ^~~~~~~~~~~~~~~~~~~
282 | res = PTR_ERR(host);
| ~~~~~~~~~~~~~~~~~~~~
283 | goto err_clk;
| ~~~~~~~~~~~~~
284 | }
| ~
drivers/mmc/host/sdhci-brcmstb.c:260:22: note: initialize the variable
'base_clk' to silence this warning
260 | struct clk *base_clk;
| ^
| = NULL
3 errors generated.
CC [M] drivers/gpu/drm/drm_gem.o
CC [M] drivers/staging/rtl8723bs/hal/odm_EdcaTurboCheck.o
CC [M] drivers/net/ethernet/rocker/rocker_tlv.o
CC [M] drivers/staging/nvec/nvec.o
CC [M] drivers/staging/media/zoran/zoran_card.o
=====================================================
# Builds where the incident occurred:
## defconfig+allmodconfig+CONFIG_FRAME_WARN=2048 on (arm):
- compiler: clang-17
- dashboard: https://d.kernelci.org/build/maestro:67f509a66fa43d168f278a2b
#kernelci issue maestro:eb9b0da83cc077e6176b9903d98f0f78704ac17f
Reported-by: kernelci.org bot <bot(a)kernelci.org>
--
This is an experimental report format. Please send feedback in!
Talk to us at kernelci(a)lists.linux.dev
Made with love by the KernelCI team - https://kernelci.org
Replace kzalloc with kvzalloc for the exit_dump buffer allocation, which
can require large contiguous memory (up to order=9) depending on the
implementation. This change prevents allocation failures by allowing the
system to fall back to vmalloc when contiguous memory allocation fails.
Since this buffer is only used for debugging purposes, physical memory
contiguity is not required, making vmalloc a suitable alternative.
Cc: stable(a)vger.kernel.org
Fixes: 07814a9439a3b0 ("sched_ext: Print debug dump after an error exit")
Suggested-by: Rik van Riel <riel(a)surriel.com>
Signed-off-by: Breno Leitao <leitao(a)debian.org>
Acked-by: Andrea Righi <arighi(a)nvidia.com>
---
Changes in v2:
- Use kvfree() on the free path as well.
- Link to v1: https://lore.kernel.org/r/20250407-scx-v1-1-774ba74a2c17@debian.org
---
kernel/sched/ext.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 66bcd40a28ca1..db9af6a3c04fd 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4623,7 +4623,7 @@ static void scx_ops_bypass(bool bypass)
static void free_exit_info(struct scx_exit_info *ei)
{
- kfree(ei->dump);
+ kvfree(ei->dump);
kfree(ei->msg);
kfree(ei->bt);
kfree(ei);
@@ -4639,7 +4639,7 @@ static struct scx_exit_info *alloc_exit_info(size_t exit_dump_len)
ei->bt = kcalloc(SCX_EXIT_BT_LEN, sizeof(ei->bt[0]), GFP_KERNEL);
ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL);
- ei->dump = kzalloc(exit_dump_len, GFP_KERNEL);
+ ei->dump = kvzalloc(exit_dump_len, GFP_KERNEL);
if (!ei->bt || !ei->msg || !ei->dump) {
free_exit_info(ei);
---
base-commit: 0af2f6be1b4281385b618cb86ad946eded089ac8
change-id: 20250407-scx-11dbf94803c3
Best regards,
--
Breno Leitao <leitao(a)debian.org>
Hello,
New build issue found on stable-rc/linux-6.12.y:
---
in vmlinux (vmlinux.lds) [logspec:kbuild,kbuild.compiler]
---
- dashboard: https://d.kernelci.org/i/maestro:9938a6d051bfcd7063bdcf21603c29bf8822a3a5
- giturl: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git
- commit HEAD: 8e9508dd93587658f8f8116bc709aeb272144427
Log excerpt:
=====================================================
arm-linux-gnueabihf-ld:./arch/arm/kernel/vmlinux.lds:30: syntax error
=====================================================
# Builds where the incident occurred:
## multi_v5_defconfig on (arm):
- compiler: gcc-12
- dashboard: https://d.kernelci.org/build/maestro:67f50c196fa43d168f278cea
#kernelci issue maestro:9938a6d051bfcd7063bdcf21603c29bf8822a3a5
Reported-by: kernelci.org bot <bot(a)kernelci.org>
--
This is an experimental report format. Please send feedback in!
Talk to us at kernelci(a)lists.linux.dev
Made with love by the KernelCI team - https://kernelci.org
The fsl-mc bus associated to the root DPRC in a DPAA2 system exports a
device file for userspace access to the MC firmware. In case the DPRC's
local MC portal (DPMCP) is currently in use, a new DPMCP device is
allocated through the fsl_mc_portal_allocate() function.
In this case, the call to fsl_mc_portal_allocate() will fail with -EINVAL
when trying to add a device link between the root DPRC (consumer) and
the newly allocated DPMCP device (supplier). This is because the DPMCP
is a dependent of the DPRC device (the bus).
Fix this by not adding a device link in case the DPMCP is allocated for
the root DPRC's usage.
Fixes: afb77422819f ("bus: fsl-mc: automatically add a device_link on fsl_mc_[portal,object]_allocate")
Signed-off-by: Ioana Ciornei <ioana.ciornei(a)nxp.com>
Cc: stable(a)vger.kernel.org
---
drivers/bus/fsl-mc/mc-io.c | 19 +++++++++++++------
1 file changed, 13 insertions(+), 6 deletions(-)
diff --git a/drivers/bus/fsl-mc/mc-io.c b/drivers/bus/fsl-mc/mc-io.c
index a0ad7866cbfc..cd8754763f40 100644
--- a/drivers/bus/fsl-mc/mc-io.c
+++ b/drivers/bus/fsl-mc/mc-io.c
@@ -214,12 +214,19 @@ int __must_check fsl_mc_portal_allocate(struct fsl_mc_device *mc_dev,
if (error < 0)
goto error_cleanup_resource;
- dpmcp_dev->consumer_link = device_link_add(&mc_dev->dev,
- &dpmcp_dev->dev,
- DL_FLAG_AUTOREMOVE_CONSUMER);
- if (!dpmcp_dev->consumer_link) {
- error = -EINVAL;
- goto error_cleanup_mc_io;
+ /* If the DPRC device itself tries to allocate a portal (usually for
+ * UAPI interaction), don't add a device link between them since the
+ * DPMCP device is an actual child device of the DPRC and a reverse
+ * dependency is not allowed.
+ */
+ if (mc_dev != mc_bus_dev) {
+ dpmcp_dev->consumer_link = device_link_add(&mc_dev->dev,
+ &dpmcp_dev->dev,
+ DL_FLAG_AUTOREMOVE_CONSUMER);
+ if (!dpmcp_dev->consumer_link) {
+ error = -EINVAL;
+ goto error_cleanup_mc_io;
+ }
}
*new_mc_io = mc_io;
--
2.34.1
The patch below does not apply to the 6.12-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.12.y
git checkout FETCH_HEAD
git cherry-pick -x cbef7442fba510b7eb229dcc9f39d3dde4a159a4
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040846-bonsai-tackiness-6500@gregkh' --subject-prefix 'PATCH 6.12.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From cbef7442fba510b7eb229dcc9f39d3dde4a159a4 Mon Sep 17 00:00:00 2001
From: Tudor Ambarus <tudor.ambarus(a)linaro.org>
Date: Fri, 17 Jan 2025 14:18:51 +0000
Subject: [PATCH] mmc: sdhci-msm: fix dev reference leaked through
of_qcom_ice_get
The driver leaks the device reference taken with
of_find_device_by_node(). Fix the leak by using devm_of_qcom_ice_get().
Fixes: c7eed31e235c ("mmc: sdhci-msm: Switch to the new ICE API")
Cc: stable(a)vger.kernel.org
Signed-off-by: Tudor Ambarus <tudor.ambarus(a)linaro.org>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski(a)linaro.org>
Acked-by: Ulf Hansson <ulf.hansson(a)linaro.org>
Reviewed-by: Abel Vesa <abel.vesa(a)linaro.org>
Reviewed-by: Manivannan Sadhasivam <manivannan.sadhasivam(a)linaro.org>
Link: https://lore.kernel.org/r/20250117-qcom-ice-fix-dev-leak-v2-2-1ffa5b6884cb@…
Signed-off-by: Bjorn Andersson <andersson(a)kernel.org>
diff --git a/drivers/mmc/host/sdhci-msm.c b/drivers/mmc/host/sdhci-msm.c
index e3d39311fdc7..3fd898647237 100644
--- a/drivers/mmc/host/sdhci-msm.c
+++ b/drivers/mmc/host/sdhci-msm.c
@@ -1873,7 +1873,7 @@ static int sdhci_msm_ice_init(struct sdhci_msm_host *msm_host,
if (!(cqhci_readl(cq_host, CQHCI_CAP) & CQHCI_CAP_CS))
return 0;
- ice = of_qcom_ice_get(dev);
+ ice = devm_of_qcom_ice_get(dev);
if (ice == ERR_PTR(-EOPNOTSUPP)) {
dev_warn(dev, "Disabling inline encryption support\n");
ice = NULL;
The patch below does not apply to the 6.13-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.13.y
git checkout FETCH_HEAD
git cherry-pick -x cbef7442fba510b7eb229dcc9f39d3dde4a159a4
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040846-elude-harpist-dd96@gregkh' --subject-prefix 'PATCH 6.13.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From cbef7442fba510b7eb229dcc9f39d3dde4a159a4 Mon Sep 17 00:00:00 2001
From: Tudor Ambarus <tudor.ambarus(a)linaro.org>
Date: Fri, 17 Jan 2025 14:18:51 +0000
Subject: [PATCH] mmc: sdhci-msm: fix dev reference leaked through
of_qcom_ice_get
The driver leaks the device reference taken with
of_find_device_by_node(). Fix the leak by using devm_of_qcom_ice_get().
Fixes: c7eed31e235c ("mmc: sdhci-msm: Switch to the new ICE API")
Cc: stable(a)vger.kernel.org
Signed-off-by: Tudor Ambarus <tudor.ambarus(a)linaro.org>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski(a)linaro.org>
Acked-by: Ulf Hansson <ulf.hansson(a)linaro.org>
Reviewed-by: Abel Vesa <abel.vesa(a)linaro.org>
Reviewed-by: Manivannan Sadhasivam <manivannan.sadhasivam(a)linaro.org>
Link: https://lore.kernel.org/r/20250117-qcom-ice-fix-dev-leak-v2-2-1ffa5b6884cb@…
Signed-off-by: Bjorn Andersson <andersson(a)kernel.org>
diff --git a/drivers/mmc/host/sdhci-msm.c b/drivers/mmc/host/sdhci-msm.c
index e3d39311fdc7..3fd898647237 100644
--- a/drivers/mmc/host/sdhci-msm.c
+++ b/drivers/mmc/host/sdhci-msm.c
@@ -1873,7 +1873,7 @@ static int sdhci_msm_ice_init(struct sdhci_msm_host *msm_host,
if (!(cqhci_readl(cq_host, CQHCI_CAP) & CQHCI_CAP_CS))
return 0;
- ice = of_qcom_ice_get(dev);
+ ice = devm_of_qcom_ice_get(dev);
if (ice == ERR_PTR(-EOPNOTSUPP)) {
dev_warn(dev, "Disabling inline encryption support\n");
ice = NULL;
The patch below does not apply to the 6.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.14.y
git checkout FETCH_HEAD
git cherry-pick -x cbef7442fba510b7eb229dcc9f39d3dde4a159a4
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040845-carving-viscous-42cf@gregkh' --subject-prefix 'PATCH 6.14.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From cbef7442fba510b7eb229dcc9f39d3dde4a159a4 Mon Sep 17 00:00:00 2001
From: Tudor Ambarus <tudor.ambarus(a)linaro.org>
Date: Fri, 17 Jan 2025 14:18:51 +0000
Subject: [PATCH] mmc: sdhci-msm: fix dev reference leaked through
of_qcom_ice_get
The driver leaks the device reference taken with
of_find_device_by_node(). Fix the leak by using devm_of_qcom_ice_get().
Fixes: c7eed31e235c ("mmc: sdhci-msm: Switch to the new ICE API")
Cc: stable(a)vger.kernel.org
Signed-off-by: Tudor Ambarus <tudor.ambarus(a)linaro.org>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski(a)linaro.org>
Acked-by: Ulf Hansson <ulf.hansson(a)linaro.org>
Reviewed-by: Abel Vesa <abel.vesa(a)linaro.org>
Reviewed-by: Manivannan Sadhasivam <manivannan.sadhasivam(a)linaro.org>
Link: https://lore.kernel.org/r/20250117-qcom-ice-fix-dev-leak-v2-2-1ffa5b6884cb@…
Signed-off-by: Bjorn Andersson <andersson(a)kernel.org>
diff --git a/drivers/mmc/host/sdhci-msm.c b/drivers/mmc/host/sdhci-msm.c
index e3d39311fdc7..3fd898647237 100644
--- a/drivers/mmc/host/sdhci-msm.c
+++ b/drivers/mmc/host/sdhci-msm.c
@@ -1873,7 +1873,7 @@ static int sdhci_msm_ice_init(struct sdhci_msm_host *msm_host,
if (!(cqhci_readl(cq_host, CQHCI_CAP) & CQHCI_CAP_CS))
return 0;
- ice = of_qcom_ice_get(dev);
+ ice = devm_of_qcom_ice_get(dev);
if (ice == ERR_PTR(-EOPNOTSUPP)) {
dev_warn(dev, "Disabling inline encryption support\n");
ice = NULL;
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 8446a4deb6b6bc998f1d8d2a85d1a0c64b9e3a71
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040857-displace-theorize-0492@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 8446a4deb6b6bc998f1d8d2a85d1a0c64b9e3a71 Mon Sep 17 00:00:00 2001
From: David Laight <david.laight(a)aculab.com>
Date: Thu, 7 Sep 2023 12:42:20 +0000
Subject: [PATCH] slab: kmalloc_size_roundup() must not return 0 for non-zero
size
The typical use of kmalloc_size_roundup() is:
ptr = kmalloc(sz = kmalloc_size_roundup(size), ...);
if (!ptr) return -ENOMEM.
This means it is vitally important that the returned value isn't less
than the argument even if the argument is insane.
In particular if kmalloc_slab() fails or the value is above
(MAX_ULONG - PAGE_SIZE) zero is returned and kmalloc() will return
its single zero-length buffer ZERO_SIZE_PTR.
Fix this by returning the input size if the size exceeds
KMALLOC_MAX_SIZE. kmalloc() will then return NULL as the size really is
too big.
kmalloc_slab() should not normally return NULL, unless called too early.
Again, returning zero is not the correct action as it can be in some
usage scenarios stored to a variable and only later cause kmalloc()
return ZERO_SIZE_PTR and subsequent crashes on access. Instead we can
simply stop checking the kmalloc_slab() result completely, as calling
kmalloc_size_roundup() too early would then result in an immediate crash
during boot and the developer noticing an issue in their code.
[vbabka(a)suse.cz: remove kmalloc_slab() result check, tweak comments and
commit log]
Fixes: 05a940656e1e ("slab: Introduce kmalloc_size_roundup()")
Signed-off-by: David Laight <david.laight(a)aculab.com>
Signed-off-by: Vlastimil Babka <vbabka(a)suse.cz>
diff --git a/mm/slab_common.c b/mm/slab_common.c
index e99e821065c3..306e6f0074ff 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -745,24 +745,24 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags, unsigned long caller)
size_t kmalloc_size_roundup(size_t size)
{
- struct kmem_cache *c;
+ if (size && size <= KMALLOC_MAX_CACHE_SIZE) {
+ /*
+ * The flags don't matter since size_index is common to all.
+ * Neither does the caller for just getting ->object_size.
+ */
+ return kmalloc_slab(size, GFP_KERNEL, 0)->object_size;
+ }
- /* Short-circuit the 0 size case. */
- if (unlikely(size == 0))
- return 0;
- /* Short-circuit saturated "too-large" case. */
- if (unlikely(size == SIZE_MAX))
- return SIZE_MAX;
/* Above the smaller buckets, size is a multiple of page size. */
- if (size > KMALLOC_MAX_CACHE_SIZE)
+ if (size && size <= KMALLOC_MAX_SIZE)
return PAGE_SIZE << get_order(size);
/*
- * The flags don't matter since size_index is common to all.
- * Neither does the caller for just getting ->object_size.
+ * Return 'size' for 0 - kmalloc() returns ZERO_SIZE_PTR
+ * and very large size - kmalloc() may fail.
*/
- c = kmalloc_slab(size, GFP_KERNEL, 0);
- return c ? c->object_size : 0;
+ return size;
+
}
EXPORT_SYMBOL(kmalloc_size_roundup);
The patch below does not apply to the 6.12-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.12.y
git checkout FETCH_HEAD
git cherry-pick -x 5b1122fc4995f308b21d7cfc64ef9880ac834d20
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040859-vocalist-germproof-0dee@gregkh' --subject-prefix 'PATCH 6.12.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 5b1122fc4995f308b21d7cfc64ef9880ac834d20 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter(a)linaro.org>
Date: Mon, 10 Mar 2025 22:48:29 +0300
Subject: [PATCH] platform/x86/amd/pmf: fix cleanup in amd_pmf_init_smart_pc()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
There are a few problems in this code:
First, if amd_pmf_tee_init() fails then the function returns directly
instead of cleaning up. We cannot simply do a "goto error;" because
the amd_pmf_tee_init() cleanup calls tee_shm_free(dev->fw_shm_pool);
and amd_pmf_tee_deinit() calls it as well leading to a double free.
I have re-written this code to use an unwind ladder to free the
allocations.
Second, if amd_pmf_start_policy_engine() fails on every iteration though
the loop then the code calls amd_pmf_tee_deinit() twice which is also a
double free. Call amd_pmf_tee_deinit() inside the loop for each failed
iteration. Also on that path the error codes are not necessarily
negative kernel error codes. Set the error code to -EINVAL.
There is a very subtle third bug which is that if the call to
input_register_device() in amd_pmf_register_input_device() fails then
we call input_unregister_device() on an input device that wasn't
registered. This will lead to a reference counting underflow
because of the device_del(&dev->dev) in __input_unregister_device().
It's unlikely that anyone would ever hit this bug in real life.
Fixes: 376a8c2a1443 ("platform/x86/amd/pmf: Update PMF Driver for Compatibility with new PMF-TA")
Signed-off-by: Dan Carpenter <dan.carpenter(a)linaro.org>
Link: https://lore.kernel.org/r/232231fc-6a71-495e-971b-be2a76f6db4c@stanley.moun…
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen(a)linux.intel.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen(a)linux.intel.com>
diff --git a/drivers/platform/x86/amd/pmf/tee-if.c b/drivers/platform/x86/amd/pmf/tee-if.c
index ceaff1ebb7b9..a1e43873a07b 100644
--- a/drivers/platform/x86/amd/pmf/tee-if.c
+++ b/drivers/platform/x86/amd/pmf/tee-if.c
@@ -510,18 +510,18 @@ int amd_pmf_init_smart_pc(struct amd_pmf_dev *dev)
ret = amd_pmf_set_dram_addr(dev, true);
if (ret)
- goto error;
+ goto err_cancel_work;
dev->policy_base = devm_ioremap_resource(dev->dev, dev->res);
if (IS_ERR(dev->policy_base)) {
ret = PTR_ERR(dev->policy_base);
- goto error;
+ goto err_free_dram_buf;
}
dev->policy_buf = kzalloc(dev->policy_sz, GFP_KERNEL);
if (!dev->policy_buf) {
ret = -ENOMEM;
- goto error;
+ goto err_free_dram_buf;
}
memcpy_fromio(dev->policy_buf, dev->policy_base, dev->policy_sz);
@@ -531,13 +531,13 @@ int amd_pmf_init_smart_pc(struct amd_pmf_dev *dev)
dev->prev_data = kzalloc(sizeof(*dev->prev_data), GFP_KERNEL);
if (!dev->prev_data) {
ret = -ENOMEM;
- goto error;
+ goto err_free_policy;
}
for (i = 0; i < ARRAY_SIZE(amd_pmf_ta_uuid); i++) {
ret = amd_pmf_tee_init(dev, &amd_pmf_ta_uuid[i]);
if (ret)
- return ret;
+ goto err_free_prev_data;
ret = amd_pmf_start_policy_engine(dev);
switch (ret) {
@@ -550,27 +550,41 @@ int amd_pmf_init_smart_pc(struct amd_pmf_dev *dev)
status = false;
break;
default:
- goto error;
+ ret = -EINVAL;
+ amd_pmf_tee_deinit(dev);
+ goto err_free_prev_data;
}
if (status)
break;
}
- if (!status && !pb_side_load)
- goto error;
+ if (!status && !pb_side_load) {
+ ret = -EINVAL;
+ goto err_free_prev_data;
+ }
if (pb_side_load)
amd_pmf_open_pb(dev, dev->dbgfs_dir);
ret = amd_pmf_register_input_device(dev);
if (ret)
- goto error;
+ goto err_pmf_remove_pb;
return 0;
-error:
- amd_pmf_deinit_smart_pc(dev);
+err_pmf_remove_pb:
+ if (pb_side_load && dev->esbin)
+ amd_pmf_remove_pb(dev);
+ amd_pmf_tee_deinit(dev);
+err_free_prev_data:
+ kfree(dev->prev_data);
+err_free_policy:
+ kfree(dev->policy_buf);
+err_free_dram_buf:
+ kfree(dev->buf);
+err_cancel_work:
+ cancel_delayed_work_sync(&dev->pb_work);
return ret;
}
Polling mode transactions wait for a reply busy-looping without holding a
spinlock, but currently the timeout checks are based only on elapsed time:
as a result we could hit a false positive whenever our busy-looping thread
is pre-empted and scheduled out for a time greater than the polling
timeout.
Change the checks at the end of the busy-loop to make sure that the polling
wasn't indeed successful or an out-of-order reply caused the polling to be
forcibly terminated.
Fixes: 31d2f803c19c ("firmware: arm_scmi: Add sync_cmds_completed_on_ret transport flag")
Reported-by: Huangjie <huangjie1663(a)phytium.com.cn>
Closes: https://lore.kernel.org/arm-scmi/20250123083323.2363749-1-jackhuang021@gmai…
Signed-off-by: Cristian Marussi <cristian.marussi(a)arm.com>
Cc: <stable(a)vger.kernel.org> # 5.18.x
---
This fix got to be backported to 5.4/5.10./5.15 due to small changes in the
context
---
drivers/firmware/arm_scmi/driver.c | 13 ++++++++-----
1 file changed, 8 insertions(+), 5 deletions(-)
diff --git a/drivers/firmware/arm_scmi/driver.c b/drivers/firmware/arm_scmi/driver.c
index 60050da54bf2..e6cf83950875 100644
--- a/drivers/firmware/arm_scmi/driver.c
+++ b/drivers/firmware/arm_scmi/driver.c
@@ -1248,7 +1248,8 @@ static void xfer_put(const struct scmi_protocol_handle *ph,
}
static bool scmi_xfer_done_no_timeout(struct scmi_chan_info *cinfo,
- struct scmi_xfer *xfer, ktime_t stop)
+ struct scmi_xfer *xfer, ktime_t stop,
+ bool *ooo)
{
struct scmi_info *info = handle_to_scmi_info(cinfo->handle);
@@ -1257,7 +1258,7 @@ static bool scmi_xfer_done_no_timeout(struct scmi_chan_info *cinfo,
* in case of out-of-order receptions of delayed responses
*/
return info->desc->ops->poll_done(cinfo, xfer) ||
- try_wait_for_completion(&xfer->done) ||
+ (*ooo = try_wait_for_completion(&xfer->done)) ||
ktime_after(ktime_get(), stop);
}
@@ -1274,15 +1275,17 @@ static int scmi_wait_for_reply(struct device *dev, const struct scmi_desc *desc,
* itself to support synchronous commands replies.
*/
if (!desc->sync_cmds_completed_on_ret) {
+ bool ooo = false;
+
/*
* Poll on xfer using transport provided .poll_done();
* assumes no completion interrupt was available.
*/
ktime_t stop = ktime_add_ms(ktime_get(), timeout_ms);
- spin_until_cond(scmi_xfer_done_no_timeout(cinfo,
- xfer, stop));
- if (ktime_after(ktime_get(), stop)) {
+ spin_until_cond(scmi_xfer_done_no_timeout(cinfo, xfer,
+ stop, &ooo));
+ if (!ooo && !info->desc->ops->poll_done(cinfo, xfer)) {
dev_err(dev,
"timed out in resp(caller: %pS) - polling\n",
(void *)_RET_IP_);
--
2.47.0
The patch below does not apply to the 6.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.14.y
git checkout FETCH_HEAD
git cherry-pick -x 1b0449544c6482179ac84530b61fc192a6527bfd
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040821-frostlike-grandly-ca1b@gregkh' --subject-prefix 'PATCH 6.14.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 1b0449544c6482179ac84530b61fc192a6527bfd Mon Sep 17 00:00:00 2001
From: Jinjiang Tu <tujinjiang(a)huawei.com>
Date: Tue, 18 Mar 2025 16:39:39 +0800
Subject: [PATCH] mm/vmscan: don't try to reclaim hwpoison folio
Syzkaller reports a bug as follows:
Injecting memory failure for pfn 0x18b00e at process virtual address 0x20ffd000
Memory failure: 0x18b00e: dirty swapcache page still referenced by 2 users
Memory failure: 0x18b00e: recovery action for dirty swapcache page: Failed
page: refcount:2 mapcount:0 mapping:0000000000000000 index:0x20ffd pfn:0x18b00e
memcg:ffff0000dd6d9000
anon flags: 0x5ffffe00482011(locked|dirty|arch_1|swapbacked|hwpoison|node=0|zone=2|lastcpupid=0xfffff)
raw: 005ffffe00482011 dead000000000100 dead000000000122 ffff0000e232a7c9
raw: 0000000000020ffd 0000000000000000 00000002ffffffff ffff0000dd6d9000
page dumped because: VM_BUG_ON_FOLIO(!folio_test_uptodate(folio))
------------[ cut here ]------------
kernel BUG at mm/swap_state.c:184!
Internal error: Oops - BUG: 00000000f2000800 [#1] SMP
Modules linked in:
CPU: 0 PID: 60 Comm: kswapd0 Not tainted 6.6.0-gcb097e7de84e #3
Hardware name: linux,dummy-virt (DT)
pstate: 80400005 (Nzcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
pc : add_to_swap+0xbc/0x158
lr : add_to_swap+0xbc/0x158
sp : ffff800087f37340
x29: ffff800087f37340 x28: fffffc00052c0380 x27: ffff800087f37780
x26: ffff800087f37490 x25: ffff800087f37c78 x24: ffff800087f377a0
x23: ffff800087f37c50 x22: 0000000000000000 x21: fffffc00052c03b4
x20: 0000000000000000 x19: fffffc00052c0380 x18: 0000000000000000
x17: 296f696c6f662865 x16: 7461646f7470755f x15: 747365745f6f696c
x14: 6f6621284f494c4f x13: 0000000000000001 x12: ffff600036d8b97b
x11: 1fffe00036d8b97a x10: ffff600036d8b97a x9 : dfff800000000000
x8 : 00009fffc9274686 x7 : ffff0001b6c5cbd3 x6 : 0000000000000001
x5 : ffff0000c25896c0 x4 : 0000000000000000 x3 : 0000000000000000
x2 : 0000000000000000 x1 : ffff0000c25896c0 x0 : 0000000000000000
Call trace:
add_to_swap+0xbc/0x158
shrink_folio_list+0x12ac/0x2648
shrink_inactive_list+0x318/0x948
shrink_lruvec+0x450/0x720
shrink_node_memcgs+0x280/0x4a8
shrink_node+0x128/0x978
balance_pgdat+0x4f0/0xb20
kswapd+0x228/0x438
kthread+0x214/0x230
ret_from_fork+0x10/0x20
I can reproduce this issue with the following steps:
1) When a dirty swapcache page is isolated by reclaim process and the
page isn't locked, inject memory failure for the page.
me_swapcache_dirty() clears uptodate flag and tries to delete from lru,
but fails. Reclaim process will put the hwpoisoned page back to lru.
2) The process that maps the hwpoisoned page exits, the page is deleted
the page will never be freed and will be in the lru forever.
3) If we trigger a reclaim again and tries to reclaim the page,
add_to_swap() will trigger VM_BUG_ON_FOLIO due to the uptodate flag is
cleared.
To fix it, skip the hwpoisoned page in shrink_folio_list(). Besides, the
hwpoison folio may not be unmapped by hwpoison_user_mappings() yet, unmap
it in shrink_folio_list(), otherwise the folio will fail to be unmaped by
hwpoison_user_mappings() since the folio isn't in lru list.
Link: https://lkml.kernel.org/r/20250318083939.987651-3-tujinjiang@huawei.com
Signed-off-by: Jinjiang Tu <tujinjiang(a)huawei.com>
Acked-by: Miaohe Lin <linmiaohe(a)huawei.com>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Kefeng Wang <wangkefeng.wang(a)huawei.com>
Cc: Nanyong Sun <sunnanyong(a)huawei.com>
Cc: Naoya Horiguchi <nao.horiguchi(a)gmail.com>
Cc: <stable(a)vger,kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 98e6ac82e428..2b2ab386cab5 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1127,6 +1127,13 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
if (!folio_trylock(folio))
goto keep;
+ if (folio_contain_hwpoisoned_page(folio)) {
+ unmap_poisoned_folio(folio, folio_pfn(folio), false);
+ folio_unlock(folio);
+ folio_put(folio);
+ continue;
+ }
+
VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
nr_pages = folio_nr_pages(folio);
The patch below does not apply to the 6.13-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.13.y
git checkout FETCH_HEAD
git cherry-pick -x 1b0449544c6482179ac84530b61fc192a6527bfd
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040845-curler-serve-797b@gregkh' --subject-prefix 'PATCH 6.13.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 1b0449544c6482179ac84530b61fc192a6527bfd Mon Sep 17 00:00:00 2001
From: Jinjiang Tu <tujinjiang(a)huawei.com>
Date: Tue, 18 Mar 2025 16:39:39 +0800
Subject: [PATCH] mm/vmscan: don't try to reclaim hwpoison folio
Syzkaller reports a bug as follows:
Injecting memory failure for pfn 0x18b00e at process virtual address 0x20ffd000
Memory failure: 0x18b00e: dirty swapcache page still referenced by 2 users
Memory failure: 0x18b00e: recovery action for dirty swapcache page: Failed
page: refcount:2 mapcount:0 mapping:0000000000000000 index:0x20ffd pfn:0x18b00e
memcg:ffff0000dd6d9000
anon flags: 0x5ffffe00482011(locked|dirty|arch_1|swapbacked|hwpoison|node=0|zone=2|lastcpupid=0xfffff)
raw: 005ffffe00482011 dead000000000100 dead000000000122 ffff0000e232a7c9
raw: 0000000000020ffd 0000000000000000 00000002ffffffff ffff0000dd6d9000
page dumped because: VM_BUG_ON_FOLIO(!folio_test_uptodate(folio))
------------[ cut here ]------------
kernel BUG at mm/swap_state.c:184!
Internal error: Oops - BUG: 00000000f2000800 [#1] SMP
Modules linked in:
CPU: 0 PID: 60 Comm: kswapd0 Not tainted 6.6.0-gcb097e7de84e #3
Hardware name: linux,dummy-virt (DT)
pstate: 80400005 (Nzcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
pc : add_to_swap+0xbc/0x158
lr : add_to_swap+0xbc/0x158
sp : ffff800087f37340
x29: ffff800087f37340 x28: fffffc00052c0380 x27: ffff800087f37780
x26: ffff800087f37490 x25: ffff800087f37c78 x24: ffff800087f377a0
x23: ffff800087f37c50 x22: 0000000000000000 x21: fffffc00052c03b4
x20: 0000000000000000 x19: fffffc00052c0380 x18: 0000000000000000
x17: 296f696c6f662865 x16: 7461646f7470755f x15: 747365745f6f696c
x14: 6f6621284f494c4f x13: 0000000000000001 x12: ffff600036d8b97b
x11: 1fffe00036d8b97a x10: ffff600036d8b97a x9 : dfff800000000000
x8 : 00009fffc9274686 x7 : ffff0001b6c5cbd3 x6 : 0000000000000001
x5 : ffff0000c25896c0 x4 : 0000000000000000 x3 : 0000000000000000
x2 : 0000000000000000 x1 : ffff0000c25896c0 x0 : 0000000000000000
Call trace:
add_to_swap+0xbc/0x158
shrink_folio_list+0x12ac/0x2648
shrink_inactive_list+0x318/0x948
shrink_lruvec+0x450/0x720
shrink_node_memcgs+0x280/0x4a8
shrink_node+0x128/0x978
balance_pgdat+0x4f0/0xb20
kswapd+0x228/0x438
kthread+0x214/0x230
ret_from_fork+0x10/0x20
I can reproduce this issue with the following steps:
1) When a dirty swapcache page is isolated by reclaim process and the
page isn't locked, inject memory failure for the page.
me_swapcache_dirty() clears uptodate flag and tries to delete from lru,
but fails. Reclaim process will put the hwpoisoned page back to lru.
2) The process that maps the hwpoisoned page exits, the page is deleted
the page will never be freed and will be in the lru forever.
3) If we trigger a reclaim again and tries to reclaim the page,
add_to_swap() will trigger VM_BUG_ON_FOLIO due to the uptodate flag is
cleared.
To fix it, skip the hwpoisoned page in shrink_folio_list(). Besides, the
hwpoison folio may not be unmapped by hwpoison_user_mappings() yet, unmap
it in shrink_folio_list(), otherwise the folio will fail to be unmaped by
hwpoison_user_mappings() since the folio isn't in lru list.
Link: https://lkml.kernel.org/r/20250318083939.987651-3-tujinjiang@huawei.com
Signed-off-by: Jinjiang Tu <tujinjiang(a)huawei.com>
Acked-by: Miaohe Lin <linmiaohe(a)huawei.com>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Kefeng Wang <wangkefeng.wang(a)huawei.com>
Cc: Nanyong Sun <sunnanyong(a)huawei.com>
Cc: Naoya Horiguchi <nao.horiguchi(a)gmail.com>
Cc: <stable(a)vger,kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 98e6ac82e428..2b2ab386cab5 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1127,6 +1127,13 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
if (!folio_trylock(folio))
goto keep;
+ if (folio_contain_hwpoisoned_page(folio)) {
+ unmap_poisoned_folio(folio, folio_pfn(folio), false);
+ folio_unlock(folio);
+ folio_put(folio);
+ continue;
+ }
+
VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
nr_pages = folio_nr_pages(folio);
The patch below does not apply to the 6.12-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.12.y
git checkout FETCH_HEAD
git cherry-pick -x 1b0449544c6482179ac84530b61fc192a6527bfd
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040803-degraded-tastiness-6579@gregkh' --subject-prefix 'PATCH 6.12.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 1b0449544c6482179ac84530b61fc192a6527bfd Mon Sep 17 00:00:00 2001
From: Jinjiang Tu <tujinjiang(a)huawei.com>
Date: Tue, 18 Mar 2025 16:39:39 +0800
Subject: [PATCH] mm/vmscan: don't try to reclaim hwpoison folio
Syzkaller reports a bug as follows:
Injecting memory failure for pfn 0x18b00e at process virtual address 0x20ffd000
Memory failure: 0x18b00e: dirty swapcache page still referenced by 2 users
Memory failure: 0x18b00e: recovery action for dirty swapcache page: Failed
page: refcount:2 mapcount:0 mapping:0000000000000000 index:0x20ffd pfn:0x18b00e
memcg:ffff0000dd6d9000
anon flags: 0x5ffffe00482011(locked|dirty|arch_1|swapbacked|hwpoison|node=0|zone=2|lastcpupid=0xfffff)
raw: 005ffffe00482011 dead000000000100 dead000000000122 ffff0000e232a7c9
raw: 0000000000020ffd 0000000000000000 00000002ffffffff ffff0000dd6d9000
page dumped because: VM_BUG_ON_FOLIO(!folio_test_uptodate(folio))
------------[ cut here ]------------
kernel BUG at mm/swap_state.c:184!
Internal error: Oops - BUG: 00000000f2000800 [#1] SMP
Modules linked in:
CPU: 0 PID: 60 Comm: kswapd0 Not tainted 6.6.0-gcb097e7de84e #3
Hardware name: linux,dummy-virt (DT)
pstate: 80400005 (Nzcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
pc : add_to_swap+0xbc/0x158
lr : add_to_swap+0xbc/0x158
sp : ffff800087f37340
x29: ffff800087f37340 x28: fffffc00052c0380 x27: ffff800087f37780
x26: ffff800087f37490 x25: ffff800087f37c78 x24: ffff800087f377a0
x23: ffff800087f37c50 x22: 0000000000000000 x21: fffffc00052c03b4
x20: 0000000000000000 x19: fffffc00052c0380 x18: 0000000000000000
x17: 296f696c6f662865 x16: 7461646f7470755f x15: 747365745f6f696c
x14: 6f6621284f494c4f x13: 0000000000000001 x12: ffff600036d8b97b
x11: 1fffe00036d8b97a x10: ffff600036d8b97a x9 : dfff800000000000
x8 : 00009fffc9274686 x7 : ffff0001b6c5cbd3 x6 : 0000000000000001
x5 : ffff0000c25896c0 x4 : 0000000000000000 x3 : 0000000000000000
x2 : 0000000000000000 x1 : ffff0000c25896c0 x0 : 0000000000000000
Call trace:
add_to_swap+0xbc/0x158
shrink_folio_list+0x12ac/0x2648
shrink_inactive_list+0x318/0x948
shrink_lruvec+0x450/0x720
shrink_node_memcgs+0x280/0x4a8
shrink_node+0x128/0x978
balance_pgdat+0x4f0/0xb20
kswapd+0x228/0x438
kthread+0x214/0x230
ret_from_fork+0x10/0x20
I can reproduce this issue with the following steps:
1) When a dirty swapcache page is isolated by reclaim process and the
page isn't locked, inject memory failure for the page.
me_swapcache_dirty() clears uptodate flag and tries to delete from lru,
but fails. Reclaim process will put the hwpoisoned page back to lru.
2) The process that maps the hwpoisoned page exits, the page is deleted
the page will never be freed and will be in the lru forever.
3) If we trigger a reclaim again and tries to reclaim the page,
add_to_swap() will trigger VM_BUG_ON_FOLIO due to the uptodate flag is
cleared.
To fix it, skip the hwpoisoned page in shrink_folio_list(). Besides, the
hwpoison folio may not be unmapped by hwpoison_user_mappings() yet, unmap
it in shrink_folio_list(), otherwise the folio will fail to be unmaped by
hwpoison_user_mappings() since the folio isn't in lru list.
Link: https://lkml.kernel.org/r/20250318083939.987651-3-tujinjiang@huawei.com
Signed-off-by: Jinjiang Tu <tujinjiang(a)huawei.com>
Acked-by: Miaohe Lin <linmiaohe(a)huawei.com>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Kefeng Wang <wangkefeng.wang(a)huawei.com>
Cc: Nanyong Sun <sunnanyong(a)huawei.com>
Cc: Naoya Horiguchi <nao.horiguchi(a)gmail.com>
Cc: <stable(a)vger,kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 98e6ac82e428..2b2ab386cab5 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1127,6 +1127,13 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
if (!folio_trylock(folio))
goto keep;
+ if (folio_contain_hwpoisoned_page(folio)) {
+ unmap_poisoned_folio(folio, folio_pfn(folio), false);
+ folio_unlock(folio);
+ folio_put(folio);
+ continue;
+ }
+
VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
nr_pages = folio_nr_pages(folio);
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 1b0449544c6482179ac84530b61fc192a6527bfd
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040837-caption-feminist-e877@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 1b0449544c6482179ac84530b61fc192a6527bfd Mon Sep 17 00:00:00 2001
From: Jinjiang Tu <tujinjiang(a)huawei.com>
Date: Tue, 18 Mar 2025 16:39:39 +0800
Subject: [PATCH] mm/vmscan: don't try to reclaim hwpoison folio
Syzkaller reports a bug as follows:
Injecting memory failure for pfn 0x18b00e at process virtual address 0x20ffd000
Memory failure: 0x18b00e: dirty swapcache page still referenced by 2 users
Memory failure: 0x18b00e: recovery action for dirty swapcache page: Failed
page: refcount:2 mapcount:0 mapping:0000000000000000 index:0x20ffd pfn:0x18b00e
memcg:ffff0000dd6d9000
anon flags: 0x5ffffe00482011(locked|dirty|arch_1|swapbacked|hwpoison|node=0|zone=2|lastcpupid=0xfffff)
raw: 005ffffe00482011 dead000000000100 dead000000000122 ffff0000e232a7c9
raw: 0000000000020ffd 0000000000000000 00000002ffffffff ffff0000dd6d9000
page dumped because: VM_BUG_ON_FOLIO(!folio_test_uptodate(folio))
------------[ cut here ]------------
kernel BUG at mm/swap_state.c:184!
Internal error: Oops - BUG: 00000000f2000800 [#1] SMP
Modules linked in:
CPU: 0 PID: 60 Comm: kswapd0 Not tainted 6.6.0-gcb097e7de84e #3
Hardware name: linux,dummy-virt (DT)
pstate: 80400005 (Nzcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
pc : add_to_swap+0xbc/0x158
lr : add_to_swap+0xbc/0x158
sp : ffff800087f37340
x29: ffff800087f37340 x28: fffffc00052c0380 x27: ffff800087f37780
x26: ffff800087f37490 x25: ffff800087f37c78 x24: ffff800087f377a0
x23: ffff800087f37c50 x22: 0000000000000000 x21: fffffc00052c03b4
x20: 0000000000000000 x19: fffffc00052c0380 x18: 0000000000000000
x17: 296f696c6f662865 x16: 7461646f7470755f x15: 747365745f6f696c
x14: 6f6621284f494c4f x13: 0000000000000001 x12: ffff600036d8b97b
x11: 1fffe00036d8b97a x10: ffff600036d8b97a x9 : dfff800000000000
x8 : 00009fffc9274686 x7 : ffff0001b6c5cbd3 x6 : 0000000000000001
x5 : ffff0000c25896c0 x4 : 0000000000000000 x3 : 0000000000000000
x2 : 0000000000000000 x1 : ffff0000c25896c0 x0 : 0000000000000000
Call trace:
add_to_swap+0xbc/0x158
shrink_folio_list+0x12ac/0x2648
shrink_inactive_list+0x318/0x948
shrink_lruvec+0x450/0x720
shrink_node_memcgs+0x280/0x4a8
shrink_node+0x128/0x978
balance_pgdat+0x4f0/0xb20
kswapd+0x228/0x438
kthread+0x214/0x230
ret_from_fork+0x10/0x20
I can reproduce this issue with the following steps:
1) When a dirty swapcache page is isolated by reclaim process and the
page isn't locked, inject memory failure for the page.
me_swapcache_dirty() clears uptodate flag and tries to delete from lru,
but fails. Reclaim process will put the hwpoisoned page back to lru.
2) The process that maps the hwpoisoned page exits, the page is deleted
the page will never be freed and will be in the lru forever.
3) If we trigger a reclaim again and tries to reclaim the page,
add_to_swap() will trigger VM_BUG_ON_FOLIO due to the uptodate flag is
cleared.
To fix it, skip the hwpoisoned page in shrink_folio_list(). Besides, the
hwpoison folio may not be unmapped by hwpoison_user_mappings() yet, unmap
it in shrink_folio_list(), otherwise the folio will fail to be unmaped by
hwpoison_user_mappings() since the folio isn't in lru list.
Link: https://lkml.kernel.org/r/20250318083939.987651-3-tujinjiang@huawei.com
Signed-off-by: Jinjiang Tu <tujinjiang(a)huawei.com>
Acked-by: Miaohe Lin <linmiaohe(a)huawei.com>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Kefeng Wang <wangkefeng.wang(a)huawei.com>
Cc: Nanyong Sun <sunnanyong(a)huawei.com>
Cc: Naoya Horiguchi <nao.horiguchi(a)gmail.com>
Cc: <stable(a)vger,kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 98e6ac82e428..2b2ab386cab5 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1127,6 +1127,13 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
if (!folio_trylock(folio))
goto keep;
+ if (folio_contain_hwpoisoned_page(folio)) {
+ unmap_poisoned_folio(folio, folio_pfn(folio), false);
+ folio_unlock(folio);
+ folio_put(folio);
+ continue;
+ }
+
VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
nr_pages = folio_nr_pages(folio);
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 1b0449544c6482179ac84530b61fc192a6527bfd
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040820-basil-afoot-09de@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 1b0449544c6482179ac84530b61fc192a6527bfd Mon Sep 17 00:00:00 2001
From: Jinjiang Tu <tujinjiang(a)huawei.com>
Date: Tue, 18 Mar 2025 16:39:39 +0800
Subject: [PATCH] mm/vmscan: don't try to reclaim hwpoison folio
Syzkaller reports a bug as follows:
Injecting memory failure for pfn 0x18b00e at process virtual address 0x20ffd000
Memory failure: 0x18b00e: dirty swapcache page still referenced by 2 users
Memory failure: 0x18b00e: recovery action for dirty swapcache page: Failed
page: refcount:2 mapcount:0 mapping:0000000000000000 index:0x20ffd pfn:0x18b00e
memcg:ffff0000dd6d9000
anon flags: 0x5ffffe00482011(locked|dirty|arch_1|swapbacked|hwpoison|node=0|zone=2|lastcpupid=0xfffff)
raw: 005ffffe00482011 dead000000000100 dead000000000122 ffff0000e232a7c9
raw: 0000000000020ffd 0000000000000000 00000002ffffffff ffff0000dd6d9000
page dumped because: VM_BUG_ON_FOLIO(!folio_test_uptodate(folio))
------------[ cut here ]------------
kernel BUG at mm/swap_state.c:184!
Internal error: Oops - BUG: 00000000f2000800 [#1] SMP
Modules linked in:
CPU: 0 PID: 60 Comm: kswapd0 Not tainted 6.6.0-gcb097e7de84e #3
Hardware name: linux,dummy-virt (DT)
pstate: 80400005 (Nzcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
pc : add_to_swap+0xbc/0x158
lr : add_to_swap+0xbc/0x158
sp : ffff800087f37340
x29: ffff800087f37340 x28: fffffc00052c0380 x27: ffff800087f37780
x26: ffff800087f37490 x25: ffff800087f37c78 x24: ffff800087f377a0
x23: ffff800087f37c50 x22: 0000000000000000 x21: fffffc00052c03b4
x20: 0000000000000000 x19: fffffc00052c0380 x18: 0000000000000000
x17: 296f696c6f662865 x16: 7461646f7470755f x15: 747365745f6f696c
x14: 6f6621284f494c4f x13: 0000000000000001 x12: ffff600036d8b97b
x11: 1fffe00036d8b97a x10: ffff600036d8b97a x9 : dfff800000000000
x8 : 00009fffc9274686 x7 : ffff0001b6c5cbd3 x6 : 0000000000000001
x5 : ffff0000c25896c0 x4 : 0000000000000000 x3 : 0000000000000000
x2 : 0000000000000000 x1 : ffff0000c25896c0 x0 : 0000000000000000
Call trace:
add_to_swap+0xbc/0x158
shrink_folio_list+0x12ac/0x2648
shrink_inactive_list+0x318/0x948
shrink_lruvec+0x450/0x720
shrink_node_memcgs+0x280/0x4a8
shrink_node+0x128/0x978
balance_pgdat+0x4f0/0xb20
kswapd+0x228/0x438
kthread+0x214/0x230
ret_from_fork+0x10/0x20
I can reproduce this issue with the following steps:
1) When a dirty swapcache page is isolated by reclaim process and the
page isn't locked, inject memory failure for the page.
me_swapcache_dirty() clears uptodate flag and tries to delete from lru,
but fails. Reclaim process will put the hwpoisoned page back to lru.
2) The process that maps the hwpoisoned page exits, the page is deleted
the page will never be freed and will be in the lru forever.
3) If we trigger a reclaim again and tries to reclaim the page,
add_to_swap() will trigger VM_BUG_ON_FOLIO due to the uptodate flag is
cleared.
To fix it, skip the hwpoisoned page in shrink_folio_list(). Besides, the
hwpoison folio may not be unmapped by hwpoison_user_mappings() yet, unmap
it in shrink_folio_list(), otherwise the folio will fail to be unmaped by
hwpoison_user_mappings() since the folio isn't in lru list.
Link: https://lkml.kernel.org/r/20250318083939.987651-3-tujinjiang@huawei.com
Signed-off-by: Jinjiang Tu <tujinjiang(a)huawei.com>
Acked-by: Miaohe Lin <linmiaohe(a)huawei.com>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Kefeng Wang <wangkefeng.wang(a)huawei.com>
Cc: Nanyong Sun <sunnanyong(a)huawei.com>
Cc: Naoya Horiguchi <nao.horiguchi(a)gmail.com>
Cc: <stable(a)vger,kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 98e6ac82e428..2b2ab386cab5 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1127,6 +1127,13 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
if (!folio_trylock(folio))
goto keep;
+ if (folio_contain_hwpoisoned_page(folio)) {
+ unmap_poisoned_folio(folio, folio_pfn(folio), false);
+ folio_unlock(folio);
+ folio_put(folio);
+ continue;
+ }
+
VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
nr_pages = folio_nr_pages(folio);
The function efx_devlink_info_board_cfg() calls the function
devlink_info_serial_number_put(), but does not check its return
value.
Return the error code if either the devlink_info_serial_number_put()
or the efx_mcdi_get_board_cfg() fails.The control flow of the code is
changed a little bit to simplify the code. The functionality of the
code remain the same.
Fixes: 14743ddd2495 ("sfc: add devlink info support for ef100")
Cc: stable(a)vger.kernel.org # v6.3+
Signed-off-by: Wentao Liang <vulab(a)iscas.ac.cn>
---
v2: Simplify code logic.
drivers/net/ethernet/sfc/efx_devlink.c | 11 ++++++-----
1 file changed, 6 insertions(+), 5 deletions(-)
diff --git a/drivers/net/ethernet/sfc/efx_devlink.c b/drivers/net/ethernet/sfc/efx_devlink.c
index 3cd750820fdd..53b17cd252c8 100644
--- a/drivers/net/ethernet/sfc/efx_devlink.c
+++ b/drivers/net/ethernet/sfc/efx_devlink.c
@@ -584,11 +584,12 @@ static int efx_devlink_info_board_cfg(struct efx_nic *efx,
int rc;
rc = efx_mcdi_get_board_cfg(efx, (u8 *)mac_address, NULL, NULL);
- if (!rc) {
- snprintf(sn, EFX_MAX_SERIALNUM_LEN, "%pm", mac_address);
- devlink_info_serial_number_put(req, sn);
- }
- return rc;
+ if (rc)
+ return rc;
+
+ snprintf(sn, EFX_MAX_SERIALNUM_LEN, "%pm", mac_address);
+
+ return devlink_info_serial_number_put(req, sn);
}
static int efx_devlink_info_get(struct devlink *devlink,
--
2.42.0.windows.2
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x d5e206778e96e8667d3bde695ad372c296dc9353
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040828-secrecy-alibi-685d@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From d5e206778e96e8667d3bde695ad372c296dc9353 Mon Sep 17 00:00:00 2001
From: "Acs, Jakub" <acsjakub(a)amazon.de>
Date: Thu, 20 Mar 2025 15:46:49 +0000
Subject: [PATCH] ext4: fix OOB read when checking dotdot dir
Mounting a corrupted filesystem with directory which contains '.' dir
entry with rec_len == block size results in out-of-bounds read (later
on, when the corrupted directory is removed).
ext4_empty_dir() assumes every ext4 directory contains at least '.'
and '..' as directory entries in the first data block. It first loads
the '.' dir entry, performs sanity checks by calling ext4_check_dir_entry()
and then uses its rec_len member to compute the location of '..' dir
entry (in ext4_next_entry). It assumes the '..' dir entry fits into the
same data block.
If the rec_len of '.' is precisely one block (4KB), it slips through the
sanity checks (it is considered the last directory entry in the data
block) and leaves "struct ext4_dir_entry_2 *de" point exactly past the
memory slot allocated to the data block. The following call to
ext4_check_dir_entry() on new value of de then dereferences this pointer
which results in out-of-bounds mem access.
Fix this by extending __ext4_check_dir_entry() to check for '.' dir
entries that reach the end of data block. Make sure to ignore the phony
dir entries for checksum (by checking name_len for non-zero).
Note: This is reported by KASAN as use-after-free in case another
structure was recently freed from the slot past the bound, but it is
really an OOB read.
This issue was found by syzkaller tool.
Call Trace:
[ 38.594108] BUG: KASAN: slab-use-after-free in __ext4_check_dir_entry+0x67e/0x710
[ 38.594649] Read of size 2 at addr ffff88802b41a004 by task syz-executor/5375
[ 38.595158]
[ 38.595288] CPU: 0 UID: 0 PID: 5375 Comm: syz-executor Not tainted 6.14.0-rc7 #1
[ 38.595298] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014
[ 38.595304] Call Trace:
[ 38.595308] <TASK>
[ 38.595311] dump_stack_lvl+0xa7/0xd0
[ 38.595325] print_address_description.constprop.0+0x2c/0x3f0
[ 38.595339] ? __ext4_check_dir_entry+0x67e/0x710
[ 38.595349] print_report+0xaa/0x250
[ 38.595359] ? __ext4_check_dir_entry+0x67e/0x710
[ 38.595368] ? kasan_addr_to_slab+0x9/0x90
[ 38.595378] kasan_report+0xab/0xe0
[ 38.595389] ? __ext4_check_dir_entry+0x67e/0x710
[ 38.595400] __ext4_check_dir_entry+0x67e/0x710
[ 38.595410] ext4_empty_dir+0x465/0x990
[ 38.595421] ? __pfx_ext4_empty_dir+0x10/0x10
[ 38.595432] ext4_rmdir.part.0+0x29a/0xd10
[ 38.595441] ? __dquot_initialize+0x2a7/0xbf0
[ 38.595455] ? __pfx_ext4_rmdir.part.0+0x10/0x10
[ 38.595464] ? __pfx___dquot_initialize+0x10/0x10
[ 38.595478] ? down_write+0xdb/0x140
[ 38.595487] ? __pfx_down_write+0x10/0x10
[ 38.595497] ext4_rmdir+0xee/0x140
[ 38.595506] vfs_rmdir+0x209/0x670
[ 38.595517] ? lookup_one_qstr_excl+0x3b/0x190
[ 38.595529] do_rmdir+0x363/0x3c0
[ 38.595537] ? __pfx_do_rmdir+0x10/0x10
[ 38.595544] ? strncpy_from_user+0x1ff/0x2e0
[ 38.595561] __x64_sys_unlinkat+0xf0/0x130
[ 38.595570] do_syscall_64+0x5b/0x180
[ 38.595583] entry_SYSCALL_64_after_hwframe+0x76/0x7e
Fixes: ac27a0ec112a0 ("[PATCH] ext4: initial copy of files from ext3")
Signed-off-by: Jakub Acs <acsjakub(a)amazon.de>
Cc: Theodore Ts'o <tytso(a)mit.edu>
Cc: Andreas Dilger <adilger.kernel(a)dilger.ca>
Cc: linux-ext4(a)vger.kernel.org
Cc: linux-kernel(a)vger.kernel.org
Cc: Mahmoud Adam <mngyadam(a)amazon.com>
Cc: stable(a)vger.kernel.org
Cc: security(a)kernel.org
Link: https://patch.msgid.link/b3ae36a6794c4a01944c7d70b403db5b@amazon.de
Signed-off-by: Theodore Ts'o <tytso(a)mit.edu>
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index d671b2c9eba2..d4164c507a90 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -104,6 +104,9 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
else if (unlikely(le32_to_cpu(de->inode) >
le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)))
error_msg = "inode out of bounds";
+ else if (unlikely(next_offset == size && de->name_len == 1 &&
+ de->name[0] == '.'))
+ error_msg = "'.' directory cannot be the last in data block";
else
return 0;
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 370345b4bd184a49ac68d6591801e5e3605b355a
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040850-carwash-detention-d475@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 370345b4bd184a49ac68d6591801e5e3605b355a Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever(a)oracle.com>
Date: Sun, 26 Jan 2025 16:50:18 -0500
Subject: [PATCH] NFSD: Never return NFS4ERR_FILE_OPEN when removing a
directory
RFC 8881 Section 18.25.4 paragraph 5 tells us that the server
should return NFS4ERR_FILE_OPEN only if the target object is an
opened file. This suggests that returning this status when removing
a directory will confuse NFS clients.
This is a version-specific issue; nfsd_proc_remove/rmdir() and
nfsd3_proc_remove/rmdir() already return nfserr_access as
appropriate.
Unfortunately there is no quick way for nfsd4_remove() to determine
whether the target object is a file or not, so the check is done in
in nfsd_unlink() for now.
Reported-by: Trond Myklebust <trondmy(a)hammerspace.com>
Fixes: 466e16f0920f ("nfsd: check for EBUSY from vfs_rmdir/vfs_unink.")
Reviewed-by: Jeff Layton <jlayton(a)kernel.org>
Cc: stable(a)vger.kernel.org
Signed-off-by: Chuck Lever <chuck.lever(a)oracle.com>
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 749dd84bdb41..4e0a2c0549c7 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1930,9 +1930,17 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
return err;
}
-/*
- * Unlink a file or directory
- * N.B. After this call fhp needs an fh_put
+/**
+ * nfsd_unlink - remove a directory entry
+ * @rqstp: RPC transaction context
+ * @fhp: the file handle of the parent directory to be modified
+ * @type: enforced file type of the object to be removed
+ * @fname: the name of directory entry to be removed
+ * @flen: length of @fname in octets
+ *
+ * After this call fhp needs an fh_put.
+ *
+ * Returns a generic NFS status code in network byte-order.
*/
__be32
nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
@@ -2006,10 +2014,14 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
fh_drop_write(fhp);
out_nfserr:
if (host_err == -EBUSY) {
- /* name is mounted-on. There is no perfect
- * error status.
+ /*
+ * See RFC 8881 Section 18.25.4 para 4: NFSv4 REMOVE
+ * wants a status unique to the object type.
*/
- err = nfserr_file_open;
+ if (type != S_IFDIR)
+ err = nfserr_file_open;
+ else
+ err = nfserr_acces;
}
out:
return err != nfs_ok ? err : nfserrno(host_err);
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 370345b4bd184a49ac68d6591801e5e3605b355a
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040849-frays-herald-f892@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 370345b4bd184a49ac68d6591801e5e3605b355a Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever(a)oracle.com>
Date: Sun, 26 Jan 2025 16:50:18 -0500
Subject: [PATCH] NFSD: Never return NFS4ERR_FILE_OPEN when removing a
directory
RFC 8881 Section 18.25.4 paragraph 5 tells us that the server
should return NFS4ERR_FILE_OPEN only if the target object is an
opened file. This suggests that returning this status when removing
a directory will confuse NFS clients.
This is a version-specific issue; nfsd_proc_remove/rmdir() and
nfsd3_proc_remove/rmdir() already return nfserr_access as
appropriate.
Unfortunately there is no quick way for nfsd4_remove() to determine
whether the target object is a file or not, so the check is done in
in nfsd_unlink() for now.
Reported-by: Trond Myklebust <trondmy(a)hammerspace.com>
Fixes: 466e16f0920f ("nfsd: check for EBUSY from vfs_rmdir/vfs_unink.")
Reviewed-by: Jeff Layton <jlayton(a)kernel.org>
Cc: stable(a)vger.kernel.org
Signed-off-by: Chuck Lever <chuck.lever(a)oracle.com>
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 749dd84bdb41..4e0a2c0549c7 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1930,9 +1930,17 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
return err;
}
-/*
- * Unlink a file or directory
- * N.B. After this call fhp needs an fh_put
+/**
+ * nfsd_unlink - remove a directory entry
+ * @rqstp: RPC transaction context
+ * @fhp: the file handle of the parent directory to be modified
+ * @type: enforced file type of the object to be removed
+ * @fname: the name of directory entry to be removed
+ * @flen: length of @fname in octets
+ *
+ * After this call fhp needs an fh_put.
+ *
+ * Returns a generic NFS status code in network byte-order.
*/
__be32
nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
@@ -2006,10 +2014,14 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
fh_drop_write(fhp);
out_nfserr:
if (host_err == -EBUSY) {
- /* name is mounted-on. There is no perfect
- * error status.
+ /*
+ * See RFC 8881 Section 18.25.4 para 4: NFSv4 REMOVE
+ * wants a status unique to the object type.
*/
- err = nfserr_file_open;
+ if (type != S_IFDIR)
+ err = nfserr_file_open;
+ else
+ err = nfserr_acces;
}
out:
return err != nfs_ok ? err : nfserrno(host_err);
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 370345b4bd184a49ac68d6591801e5e3605b355a
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040849-skimming-calamari-53b9@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 370345b4bd184a49ac68d6591801e5e3605b355a Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever(a)oracle.com>
Date: Sun, 26 Jan 2025 16:50:18 -0500
Subject: [PATCH] NFSD: Never return NFS4ERR_FILE_OPEN when removing a
directory
RFC 8881 Section 18.25.4 paragraph 5 tells us that the server
should return NFS4ERR_FILE_OPEN only if the target object is an
opened file. This suggests that returning this status when removing
a directory will confuse NFS clients.
This is a version-specific issue; nfsd_proc_remove/rmdir() and
nfsd3_proc_remove/rmdir() already return nfserr_access as
appropriate.
Unfortunately there is no quick way for nfsd4_remove() to determine
whether the target object is a file or not, so the check is done in
in nfsd_unlink() for now.
Reported-by: Trond Myklebust <trondmy(a)hammerspace.com>
Fixes: 466e16f0920f ("nfsd: check for EBUSY from vfs_rmdir/vfs_unink.")
Reviewed-by: Jeff Layton <jlayton(a)kernel.org>
Cc: stable(a)vger.kernel.org
Signed-off-by: Chuck Lever <chuck.lever(a)oracle.com>
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 749dd84bdb41..4e0a2c0549c7 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1930,9 +1930,17 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
return err;
}
-/*
- * Unlink a file or directory
- * N.B. After this call fhp needs an fh_put
+/**
+ * nfsd_unlink - remove a directory entry
+ * @rqstp: RPC transaction context
+ * @fhp: the file handle of the parent directory to be modified
+ * @type: enforced file type of the object to be removed
+ * @fname: the name of directory entry to be removed
+ * @flen: length of @fname in octets
+ *
+ * After this call fhp needs an fh_put.
+ *
+ * Returns a generic NFS status code in network byte-order.
*/
__be32
nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
@@ -2006,10 +2014,14 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
fh_drop_write(fhp);
out_nfserr:
if (host_err == -EBUSY) {
- /* name is mounted-on. There is no perfect
- * error status.
+ /*
+ * See RFC 8881 Section 18.25.4 para 4: NFSv4 REMOVE
+ * wants a status unique to the object type.
*/
- err = nfserr_file_open;
+ if (type != S_IFDIR)
+ err = nfserr_file_open;
+ else
+ err = nfserr_acces;
}
out:
return err != nfs_ok ? err : nfserrno(host_err);
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 370345b4bd184a49ac68d6591801e5e3605b355a
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040848-resize-avert-7b05@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 370345b4bd184a49ac68d6591801e5e3605b355a Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever(a)oracle.com>
Date: Sun, 26 Jan 2025 16:50:18 -0500
Subject: [PATCH] NFSD: Never return NFS4ERR_FILE_OPEN when removing a
directory
RFC 8881 Section 18.25.4 paragraph 5 tells us that the server
should return NFS4ERR_FILE_OPEN only if the target object is an
opened file. This suggests that returning this status when removing
a directory will confuse NFS clients.
This is a version-specific issue; nfsd_proc_remove/rmdir() and
nfsd3_proc_remove/rmdir() already return nfserr_access as
appropriate.
Unfortunately there is no quick way for nfsd4_remove() to determine
whether the target object is a file or not, so the check is done in
in nfsd_unlink() for now.
Reported-by: Trond Myklebust <trondmy(a)hammerspace.com>
Fixes: 466e16f0920f ("nfsd: check for EBUSY from vfs_rmdir/vfs_unink.")
Reviewed-by: Jeff Layton <jlayton(a)kernel.org>
Cc: stable(a)vger.kernel.org
Signed-off-by: Chuck Lever <chuck.lever(a)oracle.com>
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 749dd84bdb41..4e0a2c0549c7 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1930,9 +1930,17 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
return err;
}
-/*
- * Unlink a file or directory
- * N.B. After this call fhp needs an fh_put
+/**
+ * nfsd_unlink - remove a directory entry
+ * @rqstp: RPC transaction context
+ * @fhp: the file handle of the parent directory to be modified
+ * @type: enforced file type of the object to be removed
+ * @fname: the name of directory entry to be removed
+ * @flen: length of @fname in octets
+ *
+ * After this call fhp needs an fh_put.
+ *
+ * Returns a generic NFS status code in network byte-order.
*/
__be32
nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
@@ -2006,10 +2014,14 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
fh_drop_write(fhp);
out_nfserr:
if (host_err == -EBUSY) {
- /* name is mounted-on. There is no perfect
- * error status.
+ /*
+ * See RFC 8881 Section 18.25.4 para 4: NFSv4 REMOVE
+ * wants a status unique to the object type.
*/
- err = nfserr_file_open;
+ if (type != S_IFDIR)
+ err = nfserr_file_open;
+ else
+ err = nfserr_acces;
}
out:
return err != nfs_ok ? err : nfserrno(host_err);
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x d7d8e3169b56e7696559a2427c922c0d55debcec
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040835-legroom-backshift-766c@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From d7d8e3169b56e7696559a2427c922c0d55debcec Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever(a)oracle.com>
Date: Sun, 26 Jan 2025 16:50:17 -0500
Subject: [PATCH] NFSD: nfsd_unlink() clobbers non-zero status returned from
fh_fill_pre_attrs()
If fh_fill_pre_attrs() returns a non-zero status, the error flow
takes it through out_unlock, which then overwrites the returned
status code with
err = nfserrno(host_err);
Fixes: a332018a91c4 ("nfsd: handle failure to collect pre/post-op attrs more sanely")
Reviewed-by: Jeff Layton <jlayton(a)kernel.org>
Cc: stable(a)vger.kernel.org
Signed-off-by: Chuck Lever <chuck.lever(a)oracle.com>
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 188c978a0c79..749dd84bdb41 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -2010,11 +2010,9 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
* error status.
*/
err = nfserr_file_open;
- } else {
- err = nfserrno(host_err);
}
out:
- return err;
+ return err != nfs_ok ? err : nfserrno(host_err);
out_unlock:
inode_unlock(dirp);
goto out_drop_write;
The patch below does not apply to the 6.13-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.13.y
git checkout FETCH_HEAD
git cherry-pick -x 930b64ca0c511521f0abdd1d57ce52b2a6e3476b
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040838-darling-scooter-0a13@gregkh' --subject-prefix 'PATCH 6.13.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 930b64ca0c511521f0abdd1d57ce52b2a6e3476b Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton(a)kernel.org>
Date: Thu, 6 Feb 2025 13:12:13 -0500
Subject: [PATCH] nfsd: don't ignore the return code of svc_proc_register()
Currently, nfsd_proc_stat_init() ignores the return value of
svc_proc_register(). If the procfile creation fails, then the kernel
will WARN when it tries to remove the entry later.
Fix nfsd_proc_stat_init() to return the same type of pointer as
svc_proc_register(), and fix up nfsd_net_init() to check that and fail
the nfsd_net construction if it occurs.
svc_proc_register() can fail if the dentry can't be allocated, or if an
identical dentry already exists. The second case is pretty unlikely in
the nfsd_net construction codepath, so if this happens, return -ENOMEM.
Reported-by: syzbot+e34ad04f27991521104c(a)syzkaller.appspotmail.com
Closes: https://lore.kernel.org/linux-nfs/67a47501.050a0220.19061f.05f9.GAE@google.…
Cc: stable(a)vger.kernel.org # v6.9
Signed-off-by: Jeff Layton <jlayton(a)kernel.org>
Signed-off-by: Chuck Lever <chuck.lever(a)oracle.com>
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index cca60a33697f..ac265d6fde35 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -2202,8 +2202,14 @@ static __net_init int nfsd_net_init(struct net *net)
NFSD_STATS_COUNTERS_NUM);
if (retval)
goto out_repcache_error;
+
memset(&nn->nfsd_svcstats, 0, sizeof(nn->nfsd_svcstats));
nn->nfsd_svcstats.program = &nfsd_programs[0];
+ if (!nfsd_proc_stat_init(net)) {
+ retval = -ENOMEM;
+ goto out_proc_error;
+ }
+
for (i = 0; i < sizeof(nn->nfsd_versions); i++)
nn->nfsd_versions[i] = nfsd_support_version(i);
for (i = 0; i < sizeof(nn->nfsd4_minorversions); i++)
@@ -2213,13 +2219,14 @@ static __net_init int nfsd_net_init(struct net *net)
nfsd4_init_leases_net(nn);
get_random_bytes(&nn->siphash_key, sizeof(nn->siphash_key));
seqlock_init(&nn->writeverf_lock);
- nfsd_proc_stat_init(net);
#if IS_ENABLED(CONFIG_NFS_LOCALIO)
spin_lock_init(&nn->local_clients_lock);
INIT_LIST_HEAD(&nn->local_clients);
#endif
return 0;
+out_proc_error:
+ percpu_counter_destroy_many(nn->counter, NFSD_STATS_COUNTERS_NUM);
out_repcache_error:
nfsd_idmap_shutdown(net);
out_idmap_error:
diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c
index bb22893f1157..f7eaf95e20fc 100644
--- a/fs/nfsd/stats.c
+++ b/fs/nfsd/stats.c
@@ -73,11 +73,11 @@ static int nfsd_show(struct seq_file *seq, void *v)
DEFINE_PROC_SHOW_ATTRIBUTE(nfsd);
-void nfsd_proc_stat_init(struct net *net)
+struct proc_dir_entry *nfsd_proc_stat_init(struct net *net)
{
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
- svc_proc_register(net, &nn->nfsd_svcstats, &nfsd_proc_ops);
+ return svc_proc_register(net, &nn->nfsd_svcstats, &nfsd_proc_ops);
}
void nfsd_proc_stat_shutdown(struct net *net)
diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h
index 04aacb6c36e2..e4efb0e4e56d 100644
--- a/fs/nfsd/stats.h
+++ b/fs/nfsd/stats.h
@@ -10,7 +10,7 @@
#include <uapi/linux/nfsd/stats.h>
#include <linux/percpu_counter.h>
-void nfsd_proc_stat_init(struct net *net);
+struct proc_dir_entry *nfsd_proc_stat_init(struct net *net);
void nfsd_proc_stat_shutdown(struct net *net);
static inline void nfsd_stats_rc_hits_inc(struct nfsd_net *nn)
The patch below does not apply to the 6.12-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.12.y
git checkout FETCH_HEAD
git cherry-pick -x 930b64ca0c511521f0abdd1d57ce52b2a6e3476b
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040838-january-snooper-9ce0@gregkh' --subject-prefix 'PATCH 6.12.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 930b64ca0c511521f0abdd1d57ce52b2a6e3476b Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton(a)kernel.org>
Date: Thu, 6 Feb 2025 13:12:13 -0500
Subject: [PATCH] nfsd: don't ignore the return code of svc_proc_register()
Currently, nfsd_proc_stat_init() ignores the return value of
svc_proc_register(). If the procfile creation fails, then the kernel
will WARN when it tries to remove the entry later.
Fix nfsd_proc_stat_init() to return the same type of pointer as
svc_proc_register(), and fix up nfsd_net_init() to check that and fail
the nfsd_net construction if it occurs.
svc_proc_register() can fail if the dentry can't be allocated, or if an
identical dentry already exists. The second case is pretty unlikely in
the nfsd_net construction codepath, so if this happens, return -ENOMEM.
Reported-by: syzbot+e34ad04f27991521104c(a)syzkaller.appspotmail.com
Closes: https://lore.kernel.org/linux-nfs/67a47501.050a0220.19061f.05f9.GAE@google.…
Cc: stable(a)vger.kernel.org # v6.9
Signed-off-by: Jeff Layton <jlayton(a)kernel.org>
Signed-off-by: Chuck Lever <chuck.lever(a)oracle.com>
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index cca60a33697f..ac265d6fde35 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -2202,8 +2202,14 @@ static __net_init int nfsd_net_init(struct net *net)
NFSD_STATS_COUNTERS_NUM);
if (retval)
goto out_repcache_error;
+
memset(&nn->nfsd_svcstats, 0, sizeof(nn->nfsd_svcstats));
nn->nfsd_svcstats.program = &nfsd_programs[0];
+ if (!nfsd_proc_stat_init(net)) {
+ retval = -ENOMEM;
+ goto out_proc_error;
+ }
+
for (i = 0; i < sizeof(nn->nfsd_versions); i++)
nn->nfsd_versions[i] = nfsd_support_version(i);
for (i = 0; i < sizeof(nn->nfsd4_minorversions); i++)
@@ -2213,13 +2219,14 @@ static __net_init int nfsd_net_init(struct net *net)
nfsd4_init_leases_net(nn);
get_random_bytes(&nn->siphash_key, sizeof(nn->siphash_key));
seqlock_init(&nn->writeverf_lock);
- nfsd_proc_stat_init(net);
#if IS_ENABLED(CONFIG_NFS_LOCALIO)
spin_lock_init(&nn->local_clients_lock);
INIT_LIST_HEAD(&nn->local_clients);
#endif
return 0;
+out_proc_error:
+ percpu_counter_destroy_many(nn->counter, NFSD_STATS_COUNTERS_NUM);
out_repcache_error:
nfsd_idmap_shutdown(net);
out_idmap_error:
diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c
index bb22893f1157..f7eaf95e20fc 100644
--- a/fs/nfsd/stats.c
+++ b/fs/nfsd/stats.c
@@ -73,11 +73,11 @@ static int nfsd_show(struct seq_file *seq, void *v)
DEFINE_PROC_SHOW_ATTRIBUTE(nfsd);
-void nfsd_proc_stat_init(struct net *net)
+struct proc_dir_entry *nfsd_proc_stat_init(struct net *net)
{
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
- svc_proc_register(net, &nn->nfsd_svcstats, &nfsd_proc_ops);
+ return svc_proc_register(net, &nn->nfsd_svcstats, &nfsd_proc_ops);
}
void nfsd_proc_stat_shutdown(struct net *net)
diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h
index 04aacb6c36e2..e4efb0e4e56d 100644
--- a/fs/nfsd/stats.h
+++ b/fs/nfsd/stats.h
@@ -10,7 +10,7 @@
#include <uapi/linux/nfsd/stats.h>
#include <linux/percpu_counter.h>
-void nfsd_proc_stat_init(struct net *net);
+struct proc_dir_entry *nfsd_proc_stat_init(struct net *net);
void nfsd_proc_stat_shutdown(struct net *net);
static inline void nfsd_stats_rc_hits_inc(struct nfsd_net *nn)
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x f656cfbc7a293a039d6a0c7100e1c846845148c1
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040826-roving-harmony-dcd6@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f656cfbc7a293a039d6a0c7100e1c846845148c1 Mon Sep 17 00:00:00 2001
From: Murad Masimov <m.masimov(a)mt-integration.ru>
Date: Mon, 13 Jan 2025 13:51:30 +0300
Subject: [PATCH] media: streamzap: fix race between device disconnection and
urb callback
Syzkaller has reported a general protection fault at function
ir_raw_event_store_with_filter(). This crash is caused by a NULL pointer
dereference of dev->raw pointer, even though it is checked for NULL in
the same function, which means there is a race condition. It occurs due
to the incorrect order of actions in the streamzap_disconnect() function:
rc_unregister_device() is called before usb_kill_urb(). The dev->raw
pointer is freed and set to NULL in rc_unregister_device(), and only
after that usb_kill_urb() waits for in-progress requests to finish.
If rc_unregister_device() is called while streamzap_callback() handler is
not finished, this can lead to accessing freed resources. Thus
rc_unregister_device() should be called after usb_kill_urb().
Found by Linux Verification Center (linuxtesting.org) with Syzkaller.
Fixes: 8e9e60640067 ("V4L/DVB: staging/lirc: port lirc_streamzap to ir-core")
Cc: stable(a)vger.kernel.org
Reported-by: syzbot+34008406ee9a31b13c73(a)syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=34008406ee9a31b13c73
Signed-off-by: Murad Masimov <m.masimov(a)mt-integration.ru>
Signed-off-by: Sean Young <sean(a)mess.org>
Signed-off-by: Hans Verkuil <hverkuil(a)xs4all.nl>
diff --git a/drivers/media/rc/streamzap.c b/drivers/media/rc/streamzap.c
index 9b209e687f25..2ce62fe5d60f 100644
--- a/drivers/media/rc/streamzap.c
+++ b/drivers/media/rc/streamzap.c
@@ -385,8 +385,8 @@ static void streamzap_disconnect(struct usb_interface *interface)
if (!sz)
return;
- rc_unregister_device(sz->rdev);
usb_kill_urb(sz->urb_in);
+ rc_unregister_device(sz->rdev);
usb_free_urb(sz->urb_in);
usb_free_coherent(usbdev, sz->buf_in_len, sz->buf_in, sz->dma_in);
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x f656cfbc7a293a039d6a0c7100e1c846845148c1
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040825-lankiness-posh-a49b@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f656cfbc7a293a039d6a0c7100e1c846845148c1 Mon Sep 17 00:00:00 2001
From: Murad Masimov <m.masimov(a)mt-integration.ru>
Date: Mon, 13 Jan 2025 13:51:30 +0300
Subject: [PATCH] media: streamzap: fix race between device disconnection and
urb callback
Syzkaller has reported a general protection fault at function
ir_raw_event_store_with_filter(). This crash is caused by a NULL pointer
dereference of dev->raw pointer, even though it is checked for NULL in
the same function, which means there is a race condition. It occurs due
to the incorrect order of actions in the streamzap_disconnect() function:
rc_unregister_device() is called before usb_kill_urb(). The dev->raw
pointer is freed and set to NULL in rc_unregister_device(), and only
after that usb_kill_urb() waits for in-progress requests to finish.
If rc_unregister_device() is called while streamzap_callback() handler is
not finished, this can lead to accessing freed resources. Thus
rc_unregister_device() should be called after usb_kill_urb().
Found by Linux Verification Center (linuxtesting.org) with Syzkaller.
Fixes: 8e9e60640067 ("V4L/DVB: staging/lirc: port lirc_streamzap to ir-core")
Cc: stable(a)vger.kernel.org
Reported-by: syzbot+34008406ee9a31b13c73(a)syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=34008406ee9a31b13c73
Signed-off-by: Murad Masimov <m.masimov(a)mt-integration.ru>
Signed-off-by: Sean Young <sean(a)mess.org>
Signed-off-by: Hans Verkuil <hverkuil(a)xs4all.nl>
diff --git a/drivers/media/rc/streamzap.c b/drivers/media/rc/streamzap.c
index 9b209e687f25..2ce62fe5d60f 100644
--- a/drivers/media/rc/streamzap.c
+++ b/drivers/media/rc/streamzap.c
@@ -385,8 +385,8 @@ static void streamzap_disconnect(struct usb_interface *interface)
if (!sz)
return;
- rc_unregister_device(sz->rdev);
usb_kill_urb(sz->urb_in);
+ rc_unregister_device(sz->rdev);
usb_free_urb(sz->urb_in);
usb_free_coherent(usbdev, sz->buf_in_len, sz->buf_in, sz->dma_in);
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x f656cfbc7a293a039d6a0c7100e1c846845148c1
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040825-taunt-stencil-d364@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f656cfbc7a293a039d6a0c7100e1c846845148c1 Mon Sep 17 00:00:00 2001
From: Murad Masimov <m.masimov(a)mt-integration.ru>
Date: Mon, 13 Jan 2025 13:51:30 +0300
Subject: [PATCH] media: streamzap: fix race between device disconnection and
urb callback
Syzkaller has reported a general protection fault at function
ir_raw_event_store_with_filter(). This crash is caused by a NULL pointer
dereference of dev->raw pointer, even though it is checked for NULL in
the same function, which means there is a race condition. It occurs due
to the incorrect order of actions in the streamzap_disconnect() function:
rc_unregister_device() is called before usb_kill_urb(). The dev->raw
pointer is freed and set to NULL in rc_unregister_device(), and only
after that usb_kill_urb() waits for in-progress requests to finish.
If rc_unregister_device() is called while streamzap_callback() handler is
not finished, this can lead to accessing freed resources. Thus
rc_unregister_device() should be called after usb_kill_urb().
Found by Linux Verification Center (linuxtesting.org) with Syzkaller.
Fixes: 8e9e60640067 ("V4L/DVB: staging/lirc: port lirc_streamzap to ir-core")
Cc: stable(a)vger.kernel.org
Reported-by: syzbot+34008406ee9a31b13c73(a)syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=34008406ee9a31b13c73
Signed-off-by: Murad Masimov <m.masimov(a)mt-integration.ru>
Signed-off-by: Sean Young <sean(a)mess.org>
Signed-off-by: Hans Verkuil <hverkuil(a)xs4all.nl>
diff --git a/drivers/media/rc/streamzap.c b/drivers/media/rc/streamzap.c
index 9b209e687f25..2ce62fe5d60f 100644
--- a/drivers/media/rc/streamzap.c
+++ b/drivers/media/rc/streamzap.c
@@ -385,8 +385,8 @@ static void streamzap_disconnect(struct usb_interface *interface)
if (!sz)
return;
- rc_unregister_device(sz->rdev);
usb_kill_urb(sz->urb_in);
+ rc_unregister_device(sz->rdev);
usb_free_urb(sz->urb_in);
usb_free_coherent(usbdev, sz->buf_in_len, sz->buf_in, sz->dma_in);
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x f87d3af7419307ae26e705a2b2db36140db367a2
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040822-saline-starring-eabe@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f87d3af7419307ae26e705a2b2db36140db367a2 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso(a)mit.edu>
Date: Fri, 14 Mar 2025 00:38:42 -0400
Subject: [PATCH] ext4: don't over-report free space or inodes in statvfs
This fixes an analogus bug that was fixed in xfs in commit
4b8d867ca6e2 ("xfs: don't over-report free space or inodes in
statvfs") where statfs can report misleading / incorrect information
where project quota is enabled, and the free space is less than the
remaining quota.
This commit will resolve a test failure in generic/762 which tests for
this bug.
Cc: stable(a)kernel.org
Fixes: 689c958cbe6b ("ext4: add project quota support")
Signed-off-by: Theodore Ts'o <tytso(a)mit.edu>
Reviewed-by: "Darrick J. Wong" <djwong(a)kernel.org>
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 4768770715ca..8cafcd3e9f5f 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -6820,22 +6820,29 @@ static int ext4_statfs_project(struct super_block *sb,
dquot->dq_dqb.dqb_bhardlimit);
limit >>= sb->s_blocksize_bits;
- if (limit && buf->f_blocks > limit) {
+ if (limit) {
+ uint64_t remaining = 0;
+
curblock = (dquot->dq_dqb.dqb_curspace +
dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits;
- buf->f_blocks = limit;
- buf->f_bfree = buf->f_bavail =
- (buf->f_blocks > curblock) ?
- (buf->f_blocks - curblock) : 0;
+ if (limit > curblock)
+ remaining = limit - curblock;
+
+ buf->f_blocks = min(buf->f_blocks, limit);
+ buf->f_bfree = min(buf->f_bfree, remaining);
+ buf->f_bavail = min(buf->f_bavail, remaining);
}
limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit,
dquot->dq_dqb.dqb_ihardlimit);
- if (limit && buf->f_files > limit) {
- buf->f_files = limit;
- buf->f_ffree =
- (buf->f_files > dquot->dq_dqb.dqb_curinodes) ?
- (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0;
+ if (limit) {
+ uint64_t remaining = 0;
+
+ if (limit > dquot->dq_dqb.dqb_curinodes)
+ remaining = limit - dquot->dq_dqb.dqb_curinodes;
+
+ buf->f_files = min(buf->f_files, limit);
+ buf->f_ffree = min(buf->f_ffree, remaining);
}
spin_unlock(&dquot->dq_dqb_lock);
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x ea8d7647f9ddf1f81e2027ed305299797299aa03
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040813-truffle-chitchat-1344@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From ea8d7647f9ddf1f81e2027ed305299797299aa03 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt(a)goodmis.org>
Date: Thu, 27 Mar 2025 19:53:11 -0400
Subject: [PATCH] tracing: Verify event formats that have "%*p.."
The trace event verifier checks the formats of trace events to make sure
that they do not point at memory that is not in the trace event itself or
in data that will never be freed. If an event references data that was
allocated when the event triggered and that same data is freed before the
event is read, then the kernel can crash by reading freed memory.
The verifier runs at boot up (or module load) and scans the print formats
of the events and checks their arguments to make sure that dereferenced
pointers are safe. If the format uses "%*p.." the verifier will ignore it,
and that could be dangerous. Cover this case as well.
Also add to the sample code a use case of "%*pbl".
Link: https://lore.kernel.org/all/bcba4d76-2c3f-4d11-baf0-02905db953dd@oracle.com/
Cc: stable(a)vger.kernel.org
Cc: Masami Hiramatsu <mhiramat(a)kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers(a)efficios.com>
Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers")
Link: https://lore.kernel.org/20250327195311.2d89ec66@gandalf.local.home
Reported-by: Libo Chen <libo.chen(a)oracle.com>
Reviewed-by: Libo Chen <libo.chen(a)oracle.com>
Tested-by: Libo Chen <libo.chen(a)oracle.com>
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 8638b7f7ff85..069e92856bda 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -470,6 +470,7 @@ static void test_event_printk(struct trace_event_call *call)
case '%':
continue;
case 'p':
+ do_pointer:
/* Find dereferencing fields */
switch (fmt[i + 1]) {
case 'B': case 'R': case 'r':
@@ -498,6 +499,12 @@ static void test_event_printk(struct trace_event_call *call)
continue;
if (fmt[i + j] == '*') {
star = true;
+ /* Handle %*pbl case */
+ if (!j && fmt[i + 1] == 'p') {
+ arg++;
+ i++;
+ goto do_pointer;
+ }
continue;
}
if ((fmt[i + j] == 's')) {
diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h
index 999f78d380ae..1a05fc153353 100644
--- a/samples/trace_events/trace-events-sample.h
+++ b/samples/trace_events/trace-events-sample.h
@@ -319,7 +319,8 @@ TRACE_EVENT(foo_bar,
__assign_cpumask(cpum, cpumask_bits(mask));
),
- TP_printk("foo %s %d %s %s %s %s %s %s (%s) (%s) %s", __entry->foo, __entry->bar,
+ TP_printk("foo %s %d %s %s %s %s %s %s (%s) (%s) %s [%d] %*pbl",
+ __entry->foo, __entry->bar,
/*
* Notice here the use of some helper functions. This includes:
@@ -370,7 +371,10 @@ TRACE_EVENT(foo_bar,
__get_str(str), __get_str(lstr),
__get_bitmask(cpus), __get_cpumask(cpum),
- __get_str(vstr))
+ __get_str(vstr),
+ __get_dynamic_array_len(cpus),
+ __get_dynamic_array_len(cpus),
+ __get_dynamic_array(cpus))
);
/*
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x ea8d7647f9ddf1f81e2027ed305299797299aa03
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040812-pulse-unsaid-621c@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From ea8d7647f9ddf1f81e2027ed305299797299aa03 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt(a)goodmis.org>
Date: Thu, 27 Mar 2025 19:53:11 -0400
Subject: [PATCH] tracing: Verify event formats that have "%*p.."
The trace event verifier checks the formats of trace events to make sure
that they do not point at memory that is not in the trace event itself or
in data that will never be freed. If an event references data that was
allocated when the event triggered and that same data is freed before the
event is read, then the kernel can crash by reading freed memory.
The verifier runs at boot up (or module load) and scans the print formats
of the events and checks their arguments to make sure that dereferenced
pointers are safe. If the format uses "%*p.." the verifier will ignore it,
and that could be dangerous. Cover this case as well.
Also add to the sample code a use case of "%*pbl".
Link: https://lore.kernel.org/all/bcba4d76-2c3f-4d11-baf0-02905db953dd@oracle.com/
Cc: stable(a)vger.kernel.org
Cc: Masami Hiramatsu <mhiramat(a)kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers(a)efficios.com>
Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers")
Link: https://lore.kernel.org/20250327195311.2d89ec66@gandalf.local.home
Reported-by: Libo Chen <libo.chen(a)oracle.com>
Reviewed-by: Libo Chen <libo.chen(a)oracle.com>
Tested-by: Libo Chen <libo.chen(a)oracle.com>
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 8638b7f7ff85..069e92856bda 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -470,6 +470,7 @@ static void test_event_printk(struct trace_event_call *call)
case '%':
continue;
case 'p':
+ do_pointer:
/* Find dereferencing fields */
switch (fmt[i + 1]) {
case 'B': case 'R': case 'r':
@@ -498,6 +499,12 @@ static void test_event_printk(struct trace_event_call *call)
continue;
if (fmt[i + j] == '*') {
star = true;
+ /* Handle %*pbl case */
+ if (!j && fmt[i + 1] == 'p') {
+ arg++;
+ i++;
+ goto do_pointer;
+ }
continue;
}
if ((fmt[i + j] == 's')) {
diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h
index 999f78d380ae..1a05fc153353 100644
--- a/samples/trace_events/trace-events-sample.h
+++ b/samples/trace_events/trace-events-sample.h
@@ -319,7 +319,8 @@ TRACE_EVENT(foo_bar,
__assign_cpumask(cpum, cpumask_bits(mask));
),
- TP_printk("foo %s %d %s %s %s %s %s %s (%s) (%s) %s", __entry->foo, __entry->bar,
+ TP_printk("foo %s %d %s %s %s %s %s %s (%s) (%s) %s [%d] %*pbl",
+ __entry->foo, __entry->bar,
/*
* Notice here the use of some helper functions. This includes:
@@ -370,7 +371,10 @@ TRACE_EVENT(foo_bar,
__get_str(str), __get_str(lstr),
__get_bitmask(cpus), __get_cpumask(cpum),
- __get_str(vstr))
+ __get_str(vstr),
+ __get_dynamic_array_len(cpus),
+ __get_dynamic_array_len(cpus),
+ __get_dynamic_array(cpus))
);
/*
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x ea8d7647f9ddf1f81e2027ed305299797299aa03
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040812-upscale-denture-306f@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From ea8d7647f9ddf1f81e2027ed305299797299aa03 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt(a)goodmis.org>
Date: Thu, 27 Mar 2025 19:53:11 -0400
Subject: [PATCH] tracing: Verify event formats that have "%*p.."
The trace event verifier checks the formats of trace events to make sure
that they do not point at memory that is not in the trace event itself or
in data that will never be freed. If an event references data that was
allocated when the event triggered and that same data is freed before the
event is read, then the kernel can crash by reading freed memory.
The verifier runs at boot up (or module load) and scans the print formats
of the events and checks their arguments to make sure that dereferenced
pointers are safe. If the format uses "%*p.." the verifier will ignore it,
and that could be dangerous. Cover this case as well.
Also add to the sample code a use case of "%*pbl".
Link: https://lore.kernel.org/all/bcba4d76-2c3f-4d11-baf0-02905db953dd@oracle.com/
Cc: stable(a)vger.kernel.org
Cc: Masami Hiramatsu <mhiramat(a)kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers(a)efficios.com>
Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers")
Link: https://lore.kernel.org/20250327195311.2d89ec66@gandalf.local.home
Reported-by: Libo Chen <libo.chen(a)oracle.com>
Reviewed-by: Libo Chen <libo.chen(a)oracle.com>
Tested-by: Libo Chen <libo.chen(a)oracle.com>
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 8638b7f7ff85..069e92856bda 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -470,6 +470,7 @@ static void test_event_printk(struct trace_event_call *call)
case '%':
continue;
case 'p':
+ do_pointer:
/* Find dereferencing fields */
switch (fmt[i + 1]) {
case 'B': case 'R': case 'r':
@@ -498,6 +499,12 @@ static void test_event_printk(struct trace_event_call *call)
continue;
if (fmt[i + j] == '*') {
star = true;
+ /* Handle %*pbl case */
+ if (!j && fmt[i + 1] == 'p') {
+ arg++;
+ i++;
+ goto do_pointer;
+ }
continue;
}
if ((fmt[i + j] == 's')) {
diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h
index 999f78d380ae..1a05fc153353 100644
--- a/samples/trace_events/trace-events-sample.h
+++ b/samples/trace_events/trace-events-sample.h
@@ -319,7 +319,8 @@ TRACE_EVENT(foo_bar,
__assign_cpumask(cpum, cpumask_bits(mask));
),
- TP_printk("foo %s %d %s %s %s %s %s %s (%s) (%s) %s", __entry->foo, __entry->bar,
+ TP_printk("foo %s %d %s %s %s %s %s %s (%s) (%s) %s [%d] %*pbl",
+ __entry->foo, __entry->bar,
/*
* Notice here the use of some helper functions. This includes:
@@ -370,7 +371,10 @@ TRACE_EVENT(foo_bar,
__get_str(str), __get_str(lstr),
__get_bitmask(cpus), __get_cpumask(cpum),
- __get_str(vstr))
+ __get_str(vstr),
+ __get_dynamic_array_len(cpus),
+ __get_dynamic_array_len(cpus),
+ __get_dynamic_array(cpus))
);
/*
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 4d38328eb442dc06aec4350fd9594ffa6488af02
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040851-snowstorm-anyhow-9887@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 4d38328eb442dc06aec4350fd9594ffa6488af02 Mon Sep 17 00:00:00 2001
From: Douglas Raillard <douglas.raillard(a)arm.com>
Date: Tue, 25 Mar 2025 16:52:02 +0000
Subject: [PATCH] tracing: Fix synth event printk format for str fields
The printk format for synth event uses "%.*s" to print string fields,
but then only passes the pointer part as var arg.
Replace %.*s with %s as the C string is guaranteed to be null-terminated.
The output in print fmt should never have been updated as __get_str()
handles the string limit because it can access the length of the string in
the string meta data that is saved in the ring buffer.
Cc: stable(a)vger.kernel.org
Cc: Masami Hiramatsu <mhiramat(a)kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers(a)efficios.com>
Fixes: 8db4d6bfbbf92 ("tracing: Change synthetic event string format to limit printed length")
Link: https://lore.kernel.org/20250325165202.541088-1-douglas.raillard@arm.com
Signed-off-by: Douglas Raillard <douglas.raillard(a)arm.com>
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index a5c5f34c207a..6d592cbc38e4 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -305,7 +305,7 @@ static const char *synth_field_fmt(char *type)
else if (strcmp(type, "gfp_t") == 0)
fmt = "%x";
else if (synth_field_is_string(type))
- fmt = "%.*s";
+ fmt = "%s";
else if (synth_field_is_stack(type))
fmt = "%s";
sctp_sendmsg() re-uses associations and transports when possible by
doing a lookup based on the socket endpoint and the message destination
address, and then sctp_sendmsg_to_asoc() sets the selected transport in
all the message chunks to be sent.
There's a possible race condition if another thread triggers the removal
of that selected transport, for instance, by explicitly unbinding an
address with setsockopt(SCTP_SOCKOPT_BINDX_REM), after the chunks have
been set up and before the message is sent. This can happen if the send
buffer is full, during the period when the sender thread temporarily
releases the socket lock in sctp_wait_for_sndbuf().
This causes the access to the transport data in
sctp_outq_select_transport(), when the association outqueue is flushed,
to result in a use-after-free read.
This change avoids this scenario by having sctp_transport_free() signal
the freeing of the transport, tagging it as "dead". In order to do this,
the patch restores the "dead" bit in struct sctp_transport, which was
removed in
commit 47faa1e4c50e ("sctp: remove the dead field of sctp_transport").
Then, in the scenario where the sender thread has released the socket
lock in sctp_wait_for_sndbuf(), the bit is checked again after
re-acquiring the socket lock to detect the deletion. This is done while
holding a reference to the transport to prevent it from being freed in
the process.
If the transport was deleted while the socket lock was relinquished,
sctp_sendmsg_to_asoc() will return -EAGAIN to let userspace retry the
send.
The bug was found by a private syzbot instance (see the error report [1]
and the C reproducer that triggers it [2]).
Link: https://people.igalia.com/rcn/kernel_logs/20250402__KASAN_slab-use-after-fr… [1]
Link: https://people.igalia.com/rcn/kernel_logs/20250402__KASAN_slab-use-after-fr… [2]
Cc: stable(a)vger.kernel.org
Fixes: df132eff4638 ("sctp: clear the transport of some out_chunk_list chunks in sctp_assoc_rm_peer")
Suggested-by: Xin Long <lucien.xin(a)gmail.com>
Signed-off-by: Ricardo Cañuelo Navarro <rcn(a)igalia.com>
---
This patch supersedes this one I sent a few days ago, which proposed a
different solution:
https://lore.kernel.org/all/20250402-kasan_slab-use-after-free_read_in_sctp…
As Xin Long pointed out in the discussion on that patch, that solution
would have a significant performance impact, so this alternative was
proposed instead. Although the purpose is the same, the patch
implementation is completely different, hence the new patch instead of a
v2.
Cheers,
Ricardo
---
include/net/sctp/structs.h | 3 ++-
net/sctp/socket.c | 22 ++++++++++++++--------
net/sctp/transport.c | 2 ++
3 files changed, 18 insertions(+), 9 deletions(-)
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 31248cfdfb235f1e6008c4a6b64a1103d2f355ef..dcd288fa1bb6fbbb33bb639a790d8edcbba7c389 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -775,6 +775,7 @@ struct sctp_transport {
/* Reference counting. */
refcount_t refcnt;
+ __u32 dead:1,
/* RTO-Pending : A flag used to track if one of the DATA
* chunks sent to this address is currently being
* used to compute a RTT. If this flag is 0,
@@ -784,7 +785,7 @@ struct sctp_transport {
* calculation completes (i.e. the DATA chunk
* is SACK'd) clear this flag.
*/
- __u32 rto_pending:1,
+ rto_pending:1,
/*
* hb_sent : a flag that signals that we have a pending
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 36ee34f483d703ffcfe5ca9e6cc554fba24c75ef..53725ee7ba06d780e220c3a184b4f611a7cb5e51 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -72,8 +72,9 @@
/* Forward declarations for internal helper functions. */
static bool sctp_writeable(const struct sock *sk);
static void sctp_wfree(struct sk_buff *skb);
-static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p,
- size_t msg_len);
+static int sctp_wait_for_sndbuf(struct sctp_association *asoc,
+ struct sctp_transport *transport,
+ long *timeo_p, size_t msg_len);
static int sctp_wait_for_packet(struct sock *sk, int *err, long *timeo_p);
static int sctp_wait_for_connect(struct sctp_association *, long *timeo_p);
static int sctp_wait_for_accept(struct sock *sk, long timeo);
@@ -1828,7 +1829,7 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc,
if (sctp_wspace(asoc) <= 0 || !sk_wmem_schedule(sk, msg_len)) {
timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
- err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len);
+ err = sctp_wait_for_sndbuf(asoc, transport, &timeo, msg_len);
if (err)
goto err;
if (unlikely(sinfo->sinfo_stream >= asoc->stream.outcnt)) {
@@ -9214,8 +9215,9 @@ void sctp_sock_rfree(struct sk_buff *skb)
/* Helper function to wait for space in the sndbuf. */
-static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p,
- size_t msg_len)
+static int sctp_wait_for_sndbuf(struct sctp_association *asoc,
+ struct sctp_transport *transport,
+ long *timeo_p, size_t msg_len)
{
struct sock *sk = asoc->base.sk;
long current_timeo = *timeo_p;
@@ -9225,7 +9227,9 @@ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p,
pr_debug("%s: asoc:%p, timeo:%ld, msg_len:%zu\n", __func__, asoc,
*timeo_p, msg_len);
- /* Increment the association's refcnt. */
+ /* Increment the transport and association's refcnt. */
+ if (transport)
+ sctp_transport_hold(transport);
sctp_association_hold(asoc);
/* Wait on the association specific sndbuf space. */
@@ -9234,7 +9238,7 @@ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p,
TASK_INTERRUPTIBLE);
if (asoc->base.dead)
goto do_dead;
- if (!*timeo_p)
+ if ((!*timeo_p) || (transport && transport->dead))
goto do_nonblock;
if (sk->sk_err || asoc->state >= SCTP_STATE_SHUTDOWN_PENDING)
goto do_error;
@@ -9259,7 +9263,9 @@ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p,
out:
finish_wait(&asoc->wait, &wait);
- /* Release the association's refcnt. */
+ /* Release the transport and association's refcnt. */
+ if (transport)
+ sctp_transport_put(transport);
sctp_association_put(asoc);
return err;
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 2abe45af98e7c6efd5baffb88a8687e595cf3f24..31eca29b6cfbfb146c389cc643126ce87620fccd 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -117,6 +117,8 @@ struct sctp_transport *sctp_transport_new(struct net *net,
*/
void sctp_transport_free(struct sctp_transport *transport)
{
+ transport->dead = 1;
+
/* Try to delete the heartbeat timer. */
if (del_timer(&transport->hb_timer))
sctp_transport_put(transport);
---
base-commit: 38fec10eb60d687e30c8c6b5420d86e8149f7557
change-id: 20250404-kasan_slab-use-after-free_read_in_sctp_outq_select_transport__20250404-9e1ff370061d
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x a5951389e58d2e816eed3dbec5877de9327fd881
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040857-whinny-coziness-437c@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From a5951389e58d2e816eed3dbec5877de9327fd881 Mon Sep 17 00:00:00 2001
From: Douglas Anderson <dianders(a)chromium.org>
Date: Tue, 7 Jan 2025 12:06:02 -0800
Subject: [PATCH] arm64: errata: Add newer ARM cores to the
spectre_bhb_loop_affected() lists
When comparing to the ARM list [1], it appears that several ARM cores
were missing from the lists in spectre_bhb_loop_affected(). Add them.
NOTE: for some of these cores it may not matter since other ways of
clearing the BHB may be used (like the CLRBHB instruction or ECBHB),
but it still seems good to have all the info from ARM's whitepaper
included.
[1] https://developer.arm.com/Arm%20Security%20Center/Spectre-BHB
Fixes: 558c303c9734 ("arm64: Mitigate spectre style branch history side channels")
Cc: stable(a)vger.kernel.org
Signed-off-by: Douglas Anderson <dianders(a)chromium.org>
Reviewed-by: James Morse <james.morse(a)arm.com>
Link: https://lore.kernel.org/r/20250107120555.v4.5.I4a9a527e03f663040721c5401c41…
Signed-off-by: Catalin Marinas <catalin.marinas(a)arm.com>
diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c
index 89405be53d8f..0f51fd10b4b0 100644
--- a/arch/arm64/kernel/proton-pack.c
+++ b/arch/arm64/kernel/proton-pack.c
@@ -876,6 +876,14 @@ static u8 spectre_bhb_loop_affected(void)
{
u8 k = 0;
+ static const struct midr_range spectre_bhb_k132_list[] = {
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_X3),
+ MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V2),
+ };
+ static const struct midr_range spectre_bhb_k38_list[] = {
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A715),
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A720),
+ };
static const struct midr_range spectre_bhb_k32_list[] = {
MIDR_ALL_VERSIONS(MIDR_CORTEX_A78),
MIDR_ALL_VERSIONS(MIDR_CORTEX_A78AE),
@@ -889,6 +897,7 @@ static u8 spectre_bhb_loop_affected(void)
};
static const struct midr_range spectre_bhb_k24_list[] = {
MIDR_ALL_VERSIONS(MIDR_CORTEX_A76),
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A76AE),
MIDR_ALL_VERSIONS(MIDR_CORTEX_A77),
MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1),
MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_GOLD),
@@ -904,7 +913,11 @@ static u8 spectre_bhb_loop_affected(void)
{},
};
- if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k32_list))
+ if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k132_list))
+ k = 132;
+ else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k38_list))
+ k = 38;
+ else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k32_list))
k = 32;
else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k24_list))
k = 24;
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x a5951389e58d2e816eed3dbec5877de9327fd881
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040855-shingle-handcraft-3ca3@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From a5951389e58d2e816eed3dbec5877de9327fd881 Mon Sep 17 00:00:00 2001
From: Douglas Anderson <dianders(a)chromium.org>
Date: Tue, 7 Jan 2025 12:06:02 -0800
Subject: [PATCH] arm64: errata: Add newer ARM cores to the
spectre_bhb_loop_affected() lists
When comparing to the ARM list [1], it appears that several ARM cores
were missing from the lists in spectre_bhb_loop_affected(). Add them.
NOTE: for some of these cores it may not matter since other ways of
clearing the BHB may be used (like the CLRBHB instruction or ECBHB),
but it still seems good to have all the info from ARM's whitepaper
included.
[1] https://developer.arm.com/Arm%20Security%20Center/Spectre-BHB
Fixes: 558c303c9734 ("arm64: Mitigate spectre style branch history side channels")
Cc: stable(a)vger.kernel.org
Signed-off-by: Douglas Anderson <dianders(a)chromium.org>
Reviewed-by: James Morse <james.morse(a)arm.com>
Link: https://lore.kernel.org/r/20250107120555.v4.5.I4a9a527e03f663040721c5401c41…
Signed-off-by: Catalin Marinas <catalin.marinas(a)arm.com>
diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c
index 89405be53d8f..0f51fd10b4b0 100644
--- a/arch/arm64/kernel/proton-pack.c
+++ b/arch/arm64/kernel/proton-pack.c
@@ -876,6 +876,14 @@ static u8 spectre_bhb_loop_affected(void)
{
u8 k = 0;
+ static const struct midr_range spectre_bhb_k132_list[] = {
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_X3),
+ MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V2),
+ };
+ static const struct midr_range spectre_bhb_k38_list[] = {
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A715),
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A720),
+ };
static const struct midr_range spectre_bhb_k32_list[] = {
MIDR_ALL_VERSIONS(MIDR_CORTEX_A78),
MIDR_ALL_VERSIONS(MIDR_CORTEX_A78AE),
@@ -889,6 +897,7 @@ static u8 spectre_bhb_loop_affected(void)
};
static const struct midr_range spectre_bhb_k24_list[] = {
MIDR_ALL_VERSIONS(MIDR_CORTEX_A76),
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A76AE),
MIDR_ALL_VERSIONS(MIDR_CORTEX_A77),
MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1),
MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_GOLD),
@@ -904,7 +913,11 @@ static u8 spectre_bhb_loop_affected(void)
{},
};
- if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k32_list))
+ if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k132_list))
+ k = 132;
+ else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k38_list))
+ k = 38;
+ else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k32_list))
k = 32;
else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k24_list))
k = 24;
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x a5951389e58d2e816eed3dbec5877de9327fd881
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040854-depict-predict-4b0d@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From a5951389e58d2e816eed3dbec5877de9327fd881 Mon Sep 17 00:00:00 2001
From: Douglas Anderson <dianders(a)chromium.org>
Date: Tue, 7 Jan 2025 12:06:02 -0800
Subject: [PATCH] arm64: errata: Add newer ARM cores to the
spectre_bhb_loop_affected() lists
When comparing to the ARM list [1], it appears that several ARM cores
were missing from the lists in spectre_bhb_loop_affected(). Add them.
NOTE: for some of these cores it may not matter since other ways of
clearing the BHB may be used (like the CLRBHB instruction or ECBHB),
but it still seems good to have all the info from ARM's whitepaper
included.
[1] https://developer.arm.com/Arm%20Security%20Center/Spectre-BHB
Fixes: 558c303c9734 ("arm64: Mitigate spectre style branch history side channels")
Cc: stable(a)vger.kernel.org
Signed-off-by: Douglas Anderson <dianders(a)chromium.org>
Reviewed-by: James Morse <james.morse(a)arm.com>
Link: https://lore.kernel.org/r/20250107120555.v4.5.I4a9a527e03f663040721c5401c41…
Signed-off-by: Catalin Marinas <catalin.marinas(a)arm.com>
diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c
index 89405be53d8f..0f51fd10b4b0 100644
--- a/arch/arm64/kernel/proton-pack.c
+++ b/arch/arm64/kernel/proton-pack.c
@@ -876,6 +876,14 @@ static u8 spectre_bhb_loop_affected(void)
{
u8 k = 0;
+ static const struct midr_range spectre_bhb_k132_list[] = {
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_X3),
+ MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V2),
+ };
+ static const struct midr_range spectre_bhb_k38_list[] = {
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A715),
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A720),
+ };
static const struct midr_range spectre_bhb_k32_list[] = {
MIDR_ALL_VERSIONS(MIDR_CORTEX_A78),
MIDR_ALL_VERSIONS(MIDR_CORTEX_A78AE),
@@ -889,6 +897,7 @@ static u8 spectre_bhb_loop_affected(void)
};
static const struct midr_range spectre_bhb_k24_list[] = {
MIDR_ALL_VERSIONS(MIDR_CORTEX_A76),
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A76AE),
MIDR_ALL_VERSIONS(MIDR_CORTEX_A77),
MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1),
MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_GOLD),
@@ -904,7 +913,11 @@ static u8 spectre_bhb_loop_affected(void)
{},
};
- if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k32_list))
+ if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k132_list))
+ k = 132;
+ else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k38_list))
+ k = 38;
+ else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k32_list))
k = 32;
else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k24_list))
k = 24;
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x a5951389e58d2e816eed3dbec5877de9327fd881
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040852-tweezers-roving-1cf8@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From a5951389e58d2e816eed3dbec5877de9327fd881 Mon Sep 17 00:00:00 2001
From: Douglas Anderson <dianders(a)chromium.org>
Date: Tue, 7 Jan 2025 12:06:02 -0800
Subject: [PATCH] arm64: errata: Add newer ARM cores to the
spectre_bhb_loop_affected() lists
When comparing to the ARM list [1], it appears that several ARM cores
were missing from the lists in spectre_bhb_loop_affected(). Add them.
NOTE: for some of these cores it may not matter since other ways of
clearing the BHB may be used (like the CLRBHB instruction or ECBHB),
but it still seems good to have all the info from ARM's whitepaper
included.
[1] https://developer.arm.com/Arm%20Security%20Center/Spectre-BHB
Fixes: 558c303c9734 ("arm64: Mitigate spectre style branch history side channels")
Cc: stable(a)vger.kernel.org
Signed-off-by: Douglas Anderson <dianders(a)chromium.org>
Reviewed-by: James Morse <james.morse(a)arm.com>
Link: https://lore.kernel.org/r/20250107120555.v4.5.I4a9a527e03f663040721c5401c41…
Signed-off-by: Catalin Marinas <catalin.marinas(a)arm.com>
diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c
index 89405be53d8f..0f51fd10b4b0 100644
--- a/arch/arm64/kernel/proton-pack.c
+++ b/arch/arm64/kernel/proton-pack.c
@@ -876,6 +876,14 @@ static u8 spectre_bhb_loop_affected(void)
{
u8 k = 0;
+ static const struct midr_range spectre_bhb_k132_list[] = {
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_X3),
+ MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V2),
+ };
+ static const struct midr_range spectre_bhb_k38_list[] = {
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A715),
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A720),
+ };
static const struct midr_range spectre_bhb_k32_list[] = {
MIDR_ALL_VERSIONS(MIDR_CORTEX_A78),
MIDR_ALL_VERSIONS(MIDR_CORTEX_A78AE),
@@ -889,6 +897,7 @@ static u8 spectre_bhb_loop_affected(void)
};
static const struct midr_range spectre_bhb_k24_list[] = {
MIDR_ALL_VERSIONS(MIDR_CORTEX_A76),
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A76AE),
MIDR_ALL_VERSIONS(MIDR_CORTEX_A77),
MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1),
MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_GOLD),
@@ -904,7 +913,11 @@ static u8 spectre_bhb_loop_affected(void)
{},
};
- if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k32_list))
+ if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k132_list))
+ k = 132;
+ else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k38_list))
+ k = 38;
+ else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k32_list))
k = 32;
else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k24_list))
k = 24;
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x a5951389e58d2e816eed3dbec5877de9327fd881
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040850-subatomic-sake-782d@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From a5951389e58d2e816eed3dbec5877de9327fd881 Mon Sep 17 00:00:00 2001
From: Douglas Anderson <dianders(a)chromium.org>
Date: Tue, 7 Jan 2025 12:06:02 -0800
Subject: [PATCH] arm64: errata: Add newer ARM cores to the
spectre_bhb_loop_affected() lists
When comparing to the ARM list [1], it appears that several ARM cores
were missing from the lists in spectre_bhb_loop_affected(). Add them.
NOTE: for some of these cores it may not matter since other ways of
clearing the BHB may be used (like the CLRBHB instruction or ECBHB),
but it still seems good to have all the info from ARM's whitepaper
included.
[1] https://developer.arm.com/Arm%20Security%20Center/Spectre-BHB
Fixes: 558c303c9734 ("arm64: Mitigate spectre style branch history side channels")
Cc: stable(a)vger.kernel.org
Signed-off-by: Douglas Anderson <dianders(a)chromium.org>
Reviewed-by: James Morse <james.morse(a)arm.com>
Link: https://lore.kernel.org/r/20250107120555.v4.5.I4a9a527e03f663040721c5401c41…
Signed-off-by: Catalin Marinas <catalin.marinas(a)arm.com>
diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c
index 89405be53d8f..0f51fd10b4b0 100644
--- a/arch/arm64/kernel/proton-pack.c
+++ b/arch/arm64/kernel/proton-pack.c
@@ -876,6 +876,14 @@ static u8 spectre_bhb_loop_affected(void)
{
u8 k = 0;
+ static const struct midr_range spectre_bhb_k132_list[] = {
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_X3),
+ MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V2),
+ };
+ static const struct midr_range spectre_bhb_k38_list[] = {
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A715),
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A720),
+ };
static const struct midr_range spectre_bhb_k32_list[] = {
MIDR_ALL_VERSIONS(MIDR_CORTEX_A78),
MIDR_ALL_VERSIONS(MIDR_CORTEX_A78AE),
@@ -889,6 +897,7 @@ static u8 spectre_bhb_loop_affected(void)
};
static const struct midr_range spectre_bhb_k24_list[] = {
MIDR_ALL_VERSIONS(MIDR_CORTEX_A76),
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A76AE),
MIDR_ALL_VERSIONS(MIDR_CORTEX_A77),
MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1),
MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_GOLD),
@@ -904,7 +913,11 @@ static u8 spectre_bhb_loop_affected(void)
{},
};
- if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k32_list))
+ if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k132_list))
+ k = 132;
+ else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k38_list))
+ k = 38;
+ else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k32_list))
k = 32;
else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k24_list))
k = 24;
The patch below does not apply to the 6.12-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.12.y
git checkout FETCH_HEAD
git cherry-pick -x a5951389e58d2e816eed3dbec5877de9327fd881
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040848-lukewarm-footprint-06c8@gregkh' --subject-prefix 'PATCH 6.12.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From a5951389e58d2e816eed3dbec5877de9327fd881 Mon Sep 17 00:00:00 2001
From: Douglas Anderson <dianders(a)chromium.org>
Date: Tue, 7 Jan 2025 12:06:02 -0800
Subject: [PATCH] arm64: errata: Add newer ARM cores to the
spectre_bhb_loop_affected() lists
When comparing to the ARM list [1], it appears that several ARM cores
were missing from the lists in spectre_bhb_loop_affected(). Add them.
NOTE: for some of these cores it may not matter since other ways of
clearing the BHB may be used (like the CLRBHB instruction or ECBHB),
but it still seems good to have all the info from ARM's whitepaper
included.
[1] https://developer.arm.com/Arm%20Security%20Center/Spectre-BHB
Fixes: 558c303c9734 ("arm64: Mitigate spectre style branch history side channels")
Cc: stable(a)vger.kernel.org
Signed-off-by: Douglas Anderson <dianders(a)chromium.org>
Reviewed-by: James Morse <james.morse(a)arm.com>
Link: https://lore.kernel.org/r/20250107120555.v4.5.I4a9a527e03f663040721c5401c41…
Signed-off-by: Catalin Marinas <catalin.marinas(a)arm.com>
diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c
index 89405be53d8f..0f51fd10b4b0 100644
--- a/arch/arm64/kernel/proton-pack.c
+++ b/arch/arm64/kernel/proton-pack.c
@@ -876,6 +876,14 @@ static u8 spectre_bhb_loop_affected(void)
{
u8 k = 0;
+ static const struct midr_range spectre_bhb_k132_list[] = {
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_X3),
+ MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V2),
+ };
+ static const struct midr_range spectre_bhb_k38_list[] = {
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A715),
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A720),
+ };
static const struct midr_range spectre_bhb_k32_list[] = {
MIDR_ALL_VERSIONS(MIDR_CORTEX_A78),
MIDR_ALL_VERSIONS(MIDR_CORTEX_A78AE),
@@ -889,6 +897,7 @@ static u8 spectre_bhb_loop_affected(void)
};
static const struct midr_range spectre_bhb_k24_list[] = {
MIDR_ALL_VERSIONS(MIDR_CORTEX_A76),
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A76AE),
MIDR_ALL_VERSIONS(MIDR_CORTEX_A77),
MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1),
MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_GOLD),
@@ -904,7 +913,11 @@ static u8 spectre_bhb_loop_affected(void)
{},
};
- if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k32_list))
+ if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k132_list))
+ k = 132;
+ else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k38_list))
+ k = 38;
+ else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k32_list))
k = 32;
else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k24_list))
k = 24;
The patch below does not apply to the 6.13-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.13.y
git checkout FETCH_HEAD
git cherry-pick -x a5951389e58d2e816eed3dbec5877de9327fd881
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040846-certified-entering-89c3@gregkh' --subject-prefix 'PATCH 6.13.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From a5951389e58d2e816eed3dbec5877de9327fd881 Mon Sep 17 00:00:00 2001
From: Douglas Anderson <dianders(a)chromium.org>
Date: Tue, 7 Jan 2025 12:06:02 -0800
Subject: [PATCH] arm64: errata: Add newer ARM cores to the
spectre_bhb_loop_affected() lists
When comparing to the ARM list [1], it appears that several ARM cores
were missing from the lists in spectre_bhb_loop_affected(). Add them.
NOTE: for some of these cores it may not matter since other ways of
clearing the BHB may be used (like the CLRBHB instruction or ECBHB),
but it still seems good to have all the info from ARM's whitepaper
included.
[1] https://developer.arm.com/Arm%20Security%20Center/Spectre-BHB
Fixes: 558c303c9734 ("arm64: Mitigate spectre style branch history side channels")
Cc: stable(a)vger.kernel.org
Signed-off-by: Douglas Anderson <dianders(a)chromium.org>
Reviewed-by: James Morse <james.morse(a)arm.com>
Link: https://lore.kernel.org/r/20250107120555.v4.5.I4a9a527e03f663040721c5401c41…
Signed-off-by: Catalin Marinas <catalin.marinas(a)arm.com>
diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c
index 89405be53d8f..0f51fd10b4b0 100644
--- a/arch/arm64/kernel/proton-pack.c
+++ b/arch/arm64/kernel/proton-pack.c
@@ -876,6 +876,14 @@ static u8 spectre_bhb_loop_affected(void)
{
u8 k = 0;
+ static const struct midr_range spectre_bhb_k132_list[] = {
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_X3),
+ MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V2),
+ };
+ static const struct midr_range spectre_bhb_k38_list[] = {
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A715),
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A720),
+ };
static const struct midr_range spectre_bhb_k32_list[] = {
MIDR_ALL_VERSIONS(MIDR_CORTEX_A78),
MIDR_ALL_VERSIONS(MIDR_CORTEX_A78AE),
@@ -889,6 +897,7 @@ static u8 spectre_bhb_loop_affected(void)
};
static const struct midr_range spectre_bhb_k24_list[] = {
MIDR_ALL_VERSIONS(MIDR_CORTEX_A76),
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A76AE),
MIDR_ALL_VERSIONS(MIDR_CORTEX_A77),
MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1),
MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_GOLD),
@@ -904,7 +913,11 @@ static u8 spectre_bhb_loop_affected(void)
{},
};
- if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k32_list))
+ if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k132_list))
+ k = 132;
+ else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k38_list))
+ k = 38;
+ else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k32_list))
k = 32;
else if (is_midr_in_range_list(read_cpuid_id(), spectre_bhb_k24_list))
k = 24;
The patch below does not apply to the 6.12-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.12.y
git checkout FETCH_HEAD
git cherry-pick -x e7607f7d6d81af71dcc5171278aadccc94d277cd
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040805-goal-richness-0c23@gregkh' --subject-prefix 'PATCH 6.12.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From e7607f7d6d81af71dcc5171278aadccc94d277cd Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <nathan(a)kernel.org>
Date: Thu, 20 Mar 2025 22:33:49 +0100
Subject: [PATCH] ARM: 9443/1: Require linker to support KEEP within OVERLAY
for DCE
ld.lld prior to 21.0.0 does not support using the KEEP keyword within an
overlay description, which may be needed to avoid discarding necessary
sections within an overlay with '--gc-sections', which can be enabled
for the kernel via CONFIG_LD_DEAD_CODE_DATA_ELIMINATION.
Disallow CONFIG_LD_DEAD_CODE_DATA_ELIMINATION without support for KEEP
within OVERLAY and introduce a macro, OVERLAY_KEEP, that can be used to
conditionally add KEEP when it is properly supported to avoid breaking
old versions of ld.lld.
Cc: stable(a)vger.kernel.org
Link: https://github.com/llvm/llvm-project/commit/381599f1fe973afad3094e55ec99b16…
Reviewed-by: Linus Walleij <linus.walleij(a)linaro.org>
Signed-off-by: Nathan Chancellor <nathan(a)kernel.org>
Signed-off-by: Russell King (Oracle) <rmk+kernel(a)armlinux.org.uk>
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 202dbd17ad2f..25ed6f1a7c7a 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -121,7 +121,7 @@ config ARM
select HAVE_KERNEL_XZ
select HAVE_KPROBES if !XIP_KERNEL && !CPU_ENDIAN_BE32 && !CPU_V7M
select HAVE_KRETPROBES if HAVE_KPROBES
- select HAVE_LD_DEAD_CODE_DATA_ELIMINATION if (LD_VERSION >= 23600 || LD_IS_LLD)
+ select HAVE_LD_DEAD_CODE_DATA_ELIMINATION if (LD_VERSION >= 23600 || LD_CAN_USE_KEEP_IN_OVERLAY)
select HAVE_MOD_ARCH_SPECIFIC
select HAVE_NMI
select HAVE_OPTPROBES if !THUMB2_KERNEL
diff --git a/arch/arm/include/asm/vmlinux.lds.h b/arch/arm/include/asm/vmlinux.lds.h
index 89697f204715..a54db342653a 100644
--- a/arch/arm/include/asm/vmlinux.lds.h
+++ b/arch/arm/include/asm/vmlinux.lds.h
@@ -34,6 +34,12 @@
#define NOCROSSREFS
#endif
+#ifdef CONFIG_LD_CAN_USE_KEEP_IN_OVERLAY
+#define OVERLAY_KEEP(x) KEEP(x)
+#else
+#define OVERLAY_KEEP(x) x
+#endif
+
/* Set start/end symbol names to the LMA for the section */
#define ARM_LMA(sym, section) \
sym##_start = LOADADDR(section); \
diff --git a/init/Kconfig b/init/Kconfig
index d0d021b3fa3b..fc994f5cd5db 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -129,6 +129,11 @@ config CC_HAS_COUNTED_BY
# https://github.com/llvm/llvm-project/pull/112636
depends on !(CC_IS_CLANG && CLANG_VERSION < 190103)
+config LD_CAN_USE_KEEP_IN_OVERLAY
+ # ld.lld prior to 21.0.0 did not support KEEP within an overlay description
+ # https://github.com/llvm/llvm-project/pull/130661
+ def_bool LD_IS_BFD || LLD_VERSION >= 210000
+
config RUSTC_HAS_COERCE_POINTEE
def_bool RUSTC_VERSION >= 108400
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x dd4f730b557ce701a2cd4f604bf1e57667bd8b6e
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040805-untaken-baggage-498c@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From dd4f730b557ce701a2cd4f604bf1e57667bd8b6e Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <nathan(a)kernel.org>
Date: Mon, 10 Feb 2025 21:28:25 -0500
Subject: [PATCH] ACPI: platform-profile: Fix CFI violation when accessing
sysfs files
When an attribute group is created with sysfs_create_group(), the
->sysfs_ops() callback is set to kobj_sysfs_ops, which sets the ->show()
and ->store() callbacks to kobj_attr_show() and kobj_attr_store()
respectively. These functions use container_of() to get the respective
callback from the passed attribute, meaning that these callbacks need to
be of the same type as the callbacks in 'struct kobj_attribute'.
However, ->show() and ->store() in the platform_profile driver are
defined for struct device_attribute with the help of DEVICE_ATTR_RO()
and DEVICE_ATTR_RW(), which results in a CFI violation when accessing
platform_profile or platform_profile_choices under /sys/firmware/acpi
because the types do not match:
CFI failure at kobj_attr_show+0x19/0x30 (target: platform_profile_choices_show+0x0/0x140; expected type: 0x7a69590c)
There is no functional issue from the type mismatch because the layout
of 'struct kobj_attribute' and 'struct device_attribute' are the same,
so the container_of() cast does not break anything aside from CFI.
Change the type of platform_profile_choices_show() and
platform_profile_{show,store}() to match the callbacks in
'struct kobj_attribute' and update the attribute variables to
match, which resolves the CFI violation.
Cc: All applicable <stable(a)vger.kernel.org>
Fixes: a2ff95e018f1 ("ACPI: platform: Add platform profile support")
Reported-by: John Rowley <lkml(a)johnrowley.me>
Closes: https://github.com/ClangBuiltLinux/linux/issues/2047
Tested-by: John Rowley <lkml(a)johnrowley.me>
Reviewed-by: Sami Tolvanen <samitolvanen(a)google.com>
Signed-off-by: Nathan Chancellor <nathan(a)kernel.org>
Acked-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Reviewed-by: Mark Pearson <mpearson-lenovo(a)squebb.ca>
Tested-by: Mark Pearson <mpearson-lenovo(a)squebb.ca>
Link: https://patch.msgid.link/20250210-acpi-platform_profile-fix-cfi-violation-v…
[ rjw: Changelog edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki(a)intel.com>
diff --git a/drivers/acpi/platform_profile.c b/drivers/acpi/platform_profile.c
index fc92e43d0fe9..1b6317f759f9 100644
--- a/drivers/acpi/platform_profile.c
+++ b/drivers/acpi/platform_profile.c
@@ -260,14 +260,14 @@ static int _aggregate_choices(struct device *dev, void *data)
/**
* platform_profile_choices_show - Show the available profile choices for legacy sysfs interface
- * @dev: The device
+ * @kobj: The kobject
* @attr: The attribute
* @buf: The buffer to write to
*
* Return: The number of bytes written
*/
-static ssize_t platform_profile_choices_show(struct device *dev,
- struct device_attribute *attr,
+static ssize_t platform_profile_choices_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
char *buf)
{
unsigned long aggregate[BITS_TO_LONGS(PLATFORM_PROFILE_LAST)];
@@ -333,14 +333,14 @@ static int _store_and_notify(struct device *dev, void *data)
/**
* platform_profile_show - Show the current profile for legacy sysfs interface
- * @dev: The device
+ * @kobj: The kobject
* @attr: The attribute
* @buf: The buffer to write to
*
* Return: The number of bytes written
*/
-static ssize_t platform_profile_show(struct device *dev,
- struct device_attribute *attr,
+static ssize_t platform_profile_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
char *buf)
{
enum platform_profile_option profile = PLATFORM_PROFILE_LAST;
@@ -362,15 +362,15 @@ static ssize_t platform_profile_show(struct device *dev,
/**
* platform_profile_store - Set the profile for legacy sysfs interface
- * @dev: The device
+ * @kobj: The kobject
* @attr: The attribute
* @buf: The buffer to read from
* @count: The number of bytes to read
*
* Return: The number of bytes read
*/
-static ssize_t platform_profile_store(struct device *dev,
- struct device_attribute *attr,
+static ssize_t platform_profile_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
const char *buf, size_t count)
{
unsigned long choices[BITS_TO_LONGS(PLATFORM_PROFILE_LAST)];
@@ -401,12 +401,12 @@ static ssize_t platform_profile_store(struct device *dev,
return count;
}
-static DEVICE_ATTR_RO(platform_profile_choices);
-static DEVICE_ATTR_RW(platform_profile);
+static struct kobj_attribute attr_platform_profile_choices = __ATTR_RO(platform_profile_choices);
+static struct kobj_attribute attr_platform_profile = __ATTR_RW(platform_profile);
static struct attribute *platform_profile_attrs[] = {
- &dev_attr_platform_profile_choices.attr,
- &dev_attr_platform_profile.attr,
+ &attr_platform_profile_choices.attr,
+ &attr_platform_profile.attr,
NULL
};
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x dd4f730b557ce701a2cd4f604bf1e57667bd8b6e
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040805-murmuring-number-bbb8@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From dd4f730b557ce701a2cd4f604bf1e57667bd8b6e Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <nathan(a)kernel.org>
Date: Mon, 10 Feb 2025 21:28:25 -0500
Subject: [PATCH] ACPI: platform-profile: Fix CFI violation when accessing
sysfs files
When an attribute group is created with sysfs_create_group(), the
->sysfs_ops() callback is set to kobj_sysfs_ops, which sets the ->show()
and ->store() callbacks to kobj_attr_show() and kobj_attr_store()
respectively. These functions use container_of() to get the respective
callback from the passed attribute, meaning that these callbacks need to
be of the same type as the callbacks in 'struct kobj_attribute'.
However, ->show() and ->store() in the platform_profile driver are
defined for struct device_attribute with the help of DEVICE_ATTR_RO()
and DEVICE_ATTR_RW(), which results in a CFI violation when accessing
platform_profile or platform_profile_choices under /sys/firmware/acpi
because the types do not match:
CFI failure at kobj_attr_show+0x19/0x30 (target: platform_profile_choices_show+0x0/0x140; expected type: 0x7a69590c)
There is no functional issue from the type mismatch because the layout
of 'struct kobj_attribute' and 'struct device_attribute' are the same,
so the container_of() cast does not break anything aside from CFI.
Change the type of platform_profile_choices_show() and
platform_profile_{show,store}() to match the callbacks in
'struct kobj_attribute' and update the attribute variables to
match, which resolves the CFI violation.
Cc: All applicable <stable(a)vger.kernel.org>
Fixes: a2ff95e018f1 ("ACPI: platform: Add platform profile support")
Reported-by: John Rowley <lkml(a)johnrowley.me>
Closes: https://github.com/ClangBuiltLinux/linux/issues/2047
Tested-by: John Rowley <lkml(a)johnrowley.me>
Reviewed-by: Sami Tolvanen <samitolvanen(a)google.com>
Signed-off-by: Nathan Chancellor <nathan(a)kernel.org>
Acked-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Reviewed-by: Mark Pearson <mpearson-lenovo(a)squebb.ca>
Tested-by: Mark Pearson <mpearson-lenovo(a)squebb.ca>
Link: https://patch.msgid.link/20250210-acpi-platform_profile-fix-cfi-violation-v…
[ rjw: Changelog edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki(a)intel.com>
diff --git a/drivers/acpi/platform_profile.c b/drivers/acpi/platform_profile.c
index fc92e43d0fe9..1b6317f759f9 100644
--- a/drivers/acpi/platform_profile.c
+++ b/drivers/acpi/platform_profile.c
@@ -260,14 +260,14 @@ static int _aggregate_choices(struct device *dev, void *data)
/**
* platform_profile_choices_show - Show the available profile choices for legacy sysfs interface
- * @dev: The device
+ * @kobj: The kobject
* @attr: The attribute
* @buf: The buffer to write to
*
* Return: The number of bytes written
*/
-static ssize_t platform_profile_choices_show(struct device *dev,
- struct device_attribute *attr,
+static ssize_t platform_profile_choices_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
char *buf)
{
unsigned long aggregate[BITS_TO_LONGS(PLATFORM_PROFILE_LAST)];
@@ -333,14 +333,14 @@ static int _store_and_notify(struct device *dev, void *data)
/**
* platform_profile_show - Show the current profile for legacy sysfs interface
- * @dev: The device
+ * @kobj: The kobject
* @attr: The attribute
* @buf: The buffer to write to
*
* Return: The number of bytes written
*/
-static ssize_t platform_profile_show(struct device *dev,
- struct device_attribute *attr,
+static ssize_t platform_profile_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
char *buf)
{
enum platform_profile_option profile = PLATFORM_PROFILE_LAST;
@@ -362,15 +362,15 @@ static ssize_t platform_profile_show(struct device *dev,
/**
* platform_profile_store - Set the profile for legacy sysfs interface
- * @dev: The device
+ * @kobj: The kobject
* @attr: The attribute
* @buf: The buffer to read from
* @count: The number of bytes to read
*
* Return: The number of bytes read
*/
-static ssize_t platform_profile_store(struct device *dev,
- struct device_attribute *attr,
+static ssize_t platform_profile_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
const char *buf, size_t count)
{
unsigned long choices[BITS_TO_LONGS(PLATFORM_PROFILE_LAST)];
@@ -401,12 +401,12 @@ static ssize_t platform_profile_store(struct device *dev,
return count;
}
-static DEVICE_ATTR_RO(platform_profile_choices);
-static DEVICE_ATTR_RW(platform_profile);
+static struct kobj_attribute attr_platform_profile_choices = __ATTR_RO(platform_profile_choices);
+static struct kobj_attribute attr_platform_profile = __ATTR_RW(platform_profile);
static struct attribute *platform_profile_attrs[] = {
- &dev_attr_platform_profile_choices.attr,
- &dev_attr_platform_profile.attr,
+ &attr_platform_profile_choices.attr,
+ &attr_platform_profile.attr,
NULL
};
The patch below does not apply to the 6.12-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.12.y
git checkout FETCH_HEAD
git cherry-pick -x dd4f730b557ce701a2cd4f604bf1e57667bd8b6e
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040804-submitter-spur-cf1a@gregkh' --subject-prefix 'PATCH 6.12.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From dd4f730b557ce701a2cd4f604bf1e57667bd8b6e Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <nathan(a)kernel.org>
Date: Mon, 10 Feb 2025 21:28:25 -0500
Subject: [PATCH] ACPI: platform-profile: Fix CFI violation when accessing
sysfs files
When an attribute group is created with sysfs_create_group(), the
->sysfs_ops() callback is set to kobj_sysfs_ops, which sets the ->show()
and ->store() callbacks to kobj_attr_show() and kobj_attr_store()
respectively. These functions use container_of() to get the respective
callback from the passed attribute, meaning that these callbacks need to
be of the same type as the callbacks in 'struct kobj_attribute'.
However, ->show() and ->store() in the platform_profile driver are
defined for struct device_attribute with the help of DEVICE_ATTR_RO()
and DEVICE_ATTR_RW(), which results in a CFI violation when accessing
platform_profile or platform_profile_choices under /sys/firmware/acpi
because the types do not match:
CFI failure at kobj_attr_show+0x19/0x30 (target: platform_profile_choices_show+0x0/0x140; expected type: 0x7a69590c)
There is no functional issue from the type mismatch because the layout
of 'struct kobj_attribute' and 'struct device_attribute' are the same,
so the container_of() cast does not break anything aside from CFI.
Change the type of platform_profile_choices_show() and
platform_profile_{show,store}() to match the callbacks in
'struct kobj_attribute' and update the attribute variables to
match, which resolves the CFI violation.
Cc: All applicable <stable(a)vger.kernel.org>
Fixes: a2ff95e018f1 ("ACPI: platform: Add platform profile support")
Reported-by: John Rowley <lkml(a)johnrowley.me>
Closes: https://github.com/ClangBuiltLinux/linux/issues/2047
Tested-by: John Rowley <lkml(a)johnrowley.me>
Reviewed-by: Sami Tolvanen <samitolvanen(a)google.com>
Signed-off-by: Nathan Chancellor <nathan(a)kernel.org>
Acked-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Reviewed-by: Mark Pearson <mpearson-lenovo(a)squebb.ca>
Tested-by: Mark Pearson <mpearson-lenovo(a)squebb.ca>
Link: https://patch.msgid.link/20250210-acpi-platform_profile-fix-cfi-violation-v…
[ rjw: Changelog edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki(a)intel.com>
diff --git a/drivers/acpi/platform_profile.c b/drivers/acpi/platform_profile.c
index fc92e43d0fe9..1b6317f759f9 100644
--- a/drivers/acpi/platform_profile.c
+++ b/drivers/acpi/platform_profile.c
@@ -260,14 +260,14 @@ static int _aggregate_choices(struct device *dev, void *data)
/**
* platform_profile_choices_show - Show the available profile choices for legacy sysfs interface
- * @dev: The device
+ * @kobj: The kobject
* @attr: The attribute
* @buf: The buffer to write to
*
* Return: The number of bytes written
*/
-static ssize_t platform_profile_choices_show(struct device *dev,
- struct device_attribute *attr,
+static ssize_t platform_profile_choices_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
char *buf)
{
unsigned long aggregate[BITS_TO_LONGS(PLATFORM_PROFILE_LAST)];
@@ -333,14 +333,14 @@ static int _store_and_notify(struct device *dev, void *data)
/**
* platform_profile_show - Show the current profile for legacy sysfs interface
- * @dev: The device
+ * @kobj: The kobject
* @attr: The attribute
* @buf: The buffer to write to
*
* Return: The number of bytes written
*/
-static ssize_t platform_profile_show(struct device *dev,
- struct device_attribute *attr,
+static ssize_t platform_profile_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
char *buf)
{
enum platform_profile_option profile = PLATFORM_PROFILE_LAST;
@@ -362,15 +362,15 @@ static ssize_t platform_profile_show(struct device *dev,
/**
* platform_profile_store - Set the profile for legacy sysfs interface
- * @dev: The device
+ * @kobj: The kobject
* @attr: The attribute
* @buf: The buffer to read from
* @count: The number of bytes to read
*
* Return: The number of bytes read
*/
-static ssize_t platform_profile_store(struct device *dev,
- struct device_attribute *attr,
+static ssize_t platform_profile_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
const char *buf, size_t count)
{
unsigned long choices[BITS_TO_LONGS(PLATFORM_PROFILE_LAST)];
@@ -401,12 +401,12 @@ static ssize_t platform_profile_store(struct device *dev,
return count;
}
-static DEVICE_ATTR_RO(platform_profile_choices);
-static DEVICE_ATTR_RW(platform_profile);
+static struct kobj_attribute attr_platform_profile_choices = __ATTR_RO(platform_profile_choices);
+static struct kobj_attribute attr_platform_profile = __ATTR_RW(platform_profile);
static struct attribute *platform_profile_attrs[] = {
- &dev_attr_platform_profile_choices.attr,
- &dev_attr_platform_profile.attr,
+ &attr_platform_profile_choices.attr,
+ &attr_platform_profile.attr,
NULL
};
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x dd4f730b557ce701a2cd4f604bf1e57667bd8b6e
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040804-level-crystal-ca73@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From dd4f730b557ce701a2cd4f604bf1e57667bd8b6e Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <nathan(a)kernel.org>
Date: Mon, 10 Feb 2025 21:28:25 -0500
Subject: [PATCH] ACPI: platform-profile: Fix CFI violation when accessing
sysfs files
When an attribute group is created with sysfs_create_group(), the
->sysfs_ops() callback is set to kobj_sysfs_ops, which sets the ->show()
and ->store() callbacks to kobj_attr_show() and kobj_attr_store()
respectively. These functions use container_of() to get the respective
callback from the passed attribute, meaning that these callbacks need to
be of the same type as the callbacks in 'struct kobj_attribute'.
However, ->show() and ->store() in the platform_profile driver are
defined for struct device_attribute with the help of DEVICE_ATTR_RO()
and DEVICE_ATTR_RW(), which results in a CFI violation when accessing
platform_profile or platform_profile_choices under /sys/firmware/acpi
because the types do not match:
CFI failure at kobj_attr_show+0x19/0x30 (target: platform_profile_choices_show+0x0/0x140; expected type: 0x7a69590c)
There is no functional issue from the type mismatch because the layout
of 'struct kobj_attribute' and 'struct device_attribute' are the same,
so the container_of() cast does not break anything aside from CFI.
Change the type of platform_profile_choices_show() and
platform_profile_{show,store}() to match the callbacks in
'struct kobj_attribute' and update the attribute variables to
match, which resolves the CFI violation.
Cc: All applicable <stable(a)vger.kernel.org>
Fixes: a2ff95e018f1 ("ACPI: platform: Add platform profile support")
Reported-by: John Rowley <lkml(a)johnrowley.me>
Closes: https://github.com/ClangBuiltLinux/linux/issues/2047
Tested-by: John Rowley <lkml(a)johnrowley.me>
Reviewed-by: Sami Tolvanen <samitolvanen(a)google.com>
Signed-off-by: Nathan Chancellor <nathan(a)kernel.org>
Acked-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Reviewed-by: Mark Pearson <mpearson-lenovo(a)squebb.ca>
Tested-by: Mark Pearson <mpearson-lenovo(a)squebb.ca>
Link: https://patch.msgid.link/20250210-acpi-platform_profile-fix-cfi-violation-v…
[ rjw: Changelog edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki(a)intel.com>
diff --git a/drivers/acpi/platform_profile.c b/drivers/acpi/platform_profile.c
index fc92e43d0fe9..1b6317f759f9 100644
--- a/drivers/acpi/platform_profile.c
+++ b/drivers/acpi/platform_profile.c
@@ -260,14 +260,14 @@ static int _aggregate_choices(struct device *dev, void *data)
/**
* platform_profile_choices_show - Show the available profile choices for legacy sysfs interface
- * @dev: The device
+ * @kobj: The kobject
* @attr: The attribute
* @buf: The buffer to write to
*
* Return: The number of bytes written
*/
-static ssize_t platform_profile_choices_show(struct device *dev,
- struct device_attribute *attr,
+static ssize_t platform_profile_choices_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
char *buf)
{
unsigned long aggregate[BITS_TO_LONGS(PLATFORM_PROFILE_LAST)];
@@ -333,14 +333,14 @@ static int _store_and_notify(struct device *dev, void *data)
/**
* platform_profile_show - Show the current profile for legacy sysfs interface
- * @dev: The device
+ * @kobj: The kobject
* @attr: The attribute
* @buf: The buffer to write to
*
* Return: The number of bytes written
*/
-static ssize_t platform_profile_show(struct device *dev,
- struct device_attribute *attr,
+static ssize_t platform_profile_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
char *buf)
{
enum platform_profile_option profile = PLATFORM_PROFILE_LAST;
@@ -362,15 +362,15 @@ static ssize_t platform_profile_show(struct device *dev,
/**
* platform_profile_store - Set the profile for legacy sysfs interface
- * @dev: The device
+ * @kobj: The kobject
* @attr: The attribute
* @buf: The buffer to read from
* @count: The number of bytes to read
*
* Return: The number of bytes read
*/
-static ssize_t platform_profile_store(struct device *dev,
- struct device_attribute *attr,
+static ssize_t platform_profile_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
const char *buf, size_t count)
{
unsigned long choices[BITS_TO_LONGS(PLATFORM_PROFILE_LAST)];
@@ -401,12 +401,12 @@ static ssize_t platform_profile_store(struct device *dev,
return count;
}
-static DEVICE_ATTR_RO(platform_profile_choices);
-static DEVICE_ATTR_RW(platform_profile);
+static struct kobj_attribute attr_platform_profile_choices = __ATTR_RO(platform_profile_choices);
+static struct kobj_attribute attr_platform_profile = __ATTR_RW(platform_profile);
static struct attribute *platform_profile_attrs[] = {
- &dev_attr_platform_profile_choices.attr,
- &dev_attr_platform_profile.attr,
+ &attr_platform_profile_choices.attr,
+ &attr_platform_profile.attr,
NULL
};
On 07/04/2025 15:57, Sasha Levin wrote:
> This is a note to let you know that I've just added the patch titled
>
> sfc: rip out MDIO support
>
> to the 6.14-stable tree which can be found at:
> http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
...
> Stable-dep-of: 8241ecec1cdc ("sfc: fix NULL dereferences in ef100_process_design_param()")
I wouldn't say it's really a dependency; it's completely unrelated, just
happens to create a textual diff conflict which should be pretty trivial
to resolve (it's only the contexts that overlap, not the actual changes).
Obviously you can take this in if you want but I would've said this was a
bit big for -stable.
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 2fa87c71d2adb4b82c105f9191e6120340feff00
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040812-alumni-tightness-cb66@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 2fa87c71d2adb4b82c105f9191e6120340feff00 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede(a)redhat.com>
Date: Tue, 25 Mar 2025 22:04:50 +0100
Subject: [PATCH] ACPI: x86: Extend Lenovo Yoga Tab 3 quirk with skip GPIO
event-handlers
Depending on the secureboot signature on EFI\BOOT\BOOTX86.EFI the
Lenovo Yoga Tab 3 UEFI will switch its OSID ACPI variable between
1 (Windows) and 4 (Android(GMIN)).
In Windows mode a GPIO event handler gets installed for GPO1 pin 5,
causing Linux' x86-android-tables code which deals with the general
brokenness of this device's ACPI tables to fail to probe with:
[ 17.853705] x86_android_tablets: error -16 getting GPIO INT33FF:01 5
[ 17.859623] x86_android_tablets x86_android_tablets: probe with driver
which renders sound, the touchscreen, charging-management,
battery-monitoring and more non functional.
Add ACPI_QUIRK_SKIP_GPIO_EVENT_HANDLERS to the existing quirks for this
device to fix this.
Reported-by: Agoston Lorincz <pipacsba(a)gmail.com>
Closes: https://lore.kernel.org/platform-driver-x86/CAMEzqD+DNXrAvUOHviB2O2bjtcbmo3…
Cc: All applicable <stable(a)kernel.org>
Fixes: fe820db35275 ("ACPI: x86: Add skip i2c clients quirk for Lenovo Yoga Tab 3 Pro (YT3-X90F)")
Signed-off-by: Hans de Goede <hdegoede(a)redhat.com>
Link: https://patch.msgid.link/20250325210450.358506-1-hdegoede@redhat.com
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki(a)intel.com>
diff --git a/drivers/acpi/x86/utils.c b/drivers/acpi/x86/utils.c
index 068c1612660b..4ee30c2897a2 100644
--- a/drivers/acpi/x86/utils.c
+++ b/drivers/acpi/x86/utils.c
@@ -374,7 +374,8 @@ static const struct dmi_system_id acpi_quirk_skip_dmi_ids[] = {
DMI_MATCH(DMI_PRODUCT_VERSION, "Blade3-10A-001"),
},
.driver_data = (void *)(ACPI_QUIRK_SKIP_I2C_CLIENTS |
- ACPI_QUIRK_SKIP_ACPI_AC_AND_BATTERY),
+ ACPI_QUIRK_SKIP_ACPI_AC_AND_BATTERY |
+ ACPI_QUIRK_SKIP_GPIO_EVENT_HANDLERS),
},
{
/* Medion Lifetab S10346 */
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x f9bdf1f953392c9edd69a7f884f78c0390127029
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040839-deputize-undertook-8a56@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f9bdf1f953392c9edd69a7f884f78c0390127029 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang(a)linux.intel.com>
Date: Tue, 21 Jan 2025 07:23:01 -0800
Subject: [PATCH] perf/x86/intel: Avoid disable PMU if !cpuc->enabled in sample
read
The WARN_ON(this_cpu_read(cpu_hw_events.enabled)) in the
intel_pmu_save_and_restart_reload() is triggered, when sampling read
topdown events.
In a NMI handler, the cpu_hw_events.enabled is set and used to indicate
the status of core PMU. The generic pmu->pmu_disable_count, updated in
the perf_pmu_disable/enable pair, is not touched.
However, the perf_pmu_disable/enable pair is invoked when sampling read
in a NMI handler. The cpuc->enabled is mistakenly set by the
perf_pmu_enable().
Avoid disabling PMU if the core PMU is already disabled.
Merge the logic together.
Fixes: 7b2c05a15d29 ("perf/x86/intel: Generic support for hardware TopDown metrics")
Suggested-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Signed-off-by: Kan Liang <kan.liang(a)linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Cc: stable(a)vger.kernel.org
Link: https://lkml.kernel.org/r/20250121152303.3128733-2-kan.liang@linux.intel.com
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 2acea83526c6..1ccc961f8182 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2785,28 +2785,33 @@ static u64 icl_update_topdown_event(struct perf_event *event)
DEFINE_STATIC_CALL(intel_pmu_update_topdown_event, x86_perf_event_update);
-static void intel_pmu_read_topdown_event(struct perf_event *event)
-{
- struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-
- /* Only need to call update_topdown_event() once for group read. */
- if ((cpuc->txn_flags & PERF_PMU_TXN_READ) &&
- !is_slots_event(event))
- return;
-
- perf_pmu_disable(event->pmu);
- static_call(intel_pmu_update_topdown_event)(event);
- perf_pmu_enable(event->pmu);
-}
-
static void intel_pmu_read_event(struct perf_event *event)
{
- if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)
- intel_pmu_auto_reload_read(event);
- else if (is_topdown_count(event))
- intel_pmu_read_topdown_event(event);
- else
- x86_perf_event_update(event);
+ if (event->hw.flags & (PERF_X86_EVENT_AUTO_RELOAD | PERF_X86_EVENT_TOPDOWN)) {
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ bool pmu_enabled = cpuc->enabled;
+
+ /* Only need to call update_topdown_event() once for group read. */
+ if (is_metric_event(event) && (cpuc->txn_flags & PERF_PMU_TXN_READ))
+ return;
+
+ cpuc->enabled = 0;
+ if (pmu_enabled)
+ intel_pmu_disable_all();
+
+ if (is_topdown_event(event))
+ static_call(intel_pmu_update_topdown_event)(event);
+ else
+ intel_pmu_drain_pebs_buffer();
+
+ cpuc->enabled = pmu_enabled;
+ if (pmu_enabled)
+ intel_pmu_enable_all(0);
+
+ return;
+ }
+
+ x86_perf_event_update(event);
}
static void intel_pmu_enable_fixed(struct perf_event *event)
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 322963b02a91..eb14b46423e5 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -953,7 +953,7 @@ int intel_pmu_drain_bts_buffer(void)
return 1;
}
-static inline void intel_pmu_drain_pebs_buffer(void)
+void intel_pmu_drain_pebs_buffer(void)
{
struct perf_sample_data data;
@@ -2094,15 +2094,6 @@ get_next_pebs_record_by_bit(void *base, void *top, int bit)
return NULL;
}
-void intel_pmu_auto_reload_read(struct perf_event *event)
-{
- WARN_ON(!(event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD));
-
- perf_pmu_disable(event->pmu);
- intel_pmu_drain_pebs_buffer();
- perf_pmu_enable(event->pmu);
-}
-
/*
* Special variant of intel_pmu_save_and_restart() for auto-reload.
*/
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 084e9196b458..536a112f6353 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -1644,7 +1644,7 @@ void intel_pmu_pebs_disable_all(void);
void intel_pmu_pebs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in);
-void intel_pmu_auto_reload_read(struct perf_event *event);
+void intel_pmu_drain_pebs_buffer(void);
void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr);
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 9462e74c5c983cce34019bfb27f734552bebe59f
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040821-underdone-luster-6e41@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 9462e74c5c983cce34019bfb27f734552bebe59f Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada(a)linux.intel.com>
Date: Fri, 28 Mar 2025 15:47:49 -0700
Subject: [PATCH] platform/x86: ISST: Correct command storage data length
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
After resume/online turbo limit ratio (TRL) is restored partially if
the admin explicitly changed TRL from user space.
A hash table is used to store SST mail box and MSR settings when modified
to restore those settings after resume or online. This uses a struct
isst_cmd field "data" to store these settings. This is a 64 bit field.
But isst_store_new_cmd() is only assigning as u32. This results in
truncation of 32 bits.
Change the argument to u64 from u32.
Fixes: f607874f35cb ("platform/x86: ISST: Restore state on resume")
Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada(a)linux.intel.com>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20250328224749.2691272-1-srinivas.pandruvada@linu…
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen(a)linux.intel.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen(a)linux.intel.com>
diff --git a/drivers/platform/x86/intel/speed_select_if/isst_if_common.c b/drivers/platform/x86/intel/speed_select_if/isst_if_common.c
index dbcd3087aaa4..31239a93dd71 100644
--- a/drivers/platform/x86/intel/speed_select_if/isst_if_common.c
+++ b/drivers/platform/x86/intel/speed_select_if/isst_if_common.c
@@ -84,7 +84,7 @@ static DECLARE_HASHTABLE(isst_hash, 8);
static DEFINE_MUTEX(isst_hash_lock);
static int isst_store_new_cmd(int cmd, u32 cpu, int mbox_cmd_type, u32 param,
- u32 data)
+ u64 data)
{
struct isst_cmd *sst_cmd;
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 9462e74c5c983cce34019bfb27f734552bebe59f
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040821-pleading-cone-6de8@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 9462e74c5c983cce34019bfb27f734552bebe59f Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada(a)linux.intel.com>
Date: Fri, 28 Mar 2025 15:47:49 -0700
Subject: [PATCH] platform/x86: ISST: Correct command storage data length
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
After resume/online turbo limit ratio (TRL) is restored partially if
the admin explicitly changed TRL from user space.
A hash table is used to store SST mail box and MSR settings when modified
to restore those settings after resume or online. This uses a struct
isst_cmd field "data" to store these settings. This is a 64 bit field.
But isst_store_new_cmd() is only assigning as u32. This results in
truncation of 32 bits.
Change the argument to u64 from u32.
Fixes: f607874f35cb ("platform/x86: ISST: Restore state on resume")
Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada(a)linux.intel.com>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20250328224749.2691272-1-srinivas.pandruvada@linu…
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen(a)linux.intel.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen(a)linux.intel.com>
diff --git a/drivers/platform/x86/intel/speed_select_if/isst_if_common.c b/drivers/platform/x86/intel/speed_select_if/isst_if_common.c
index dbcd3087aaa4..31239a93dd71 100644
--- a/drivers/platform/x86/intel/speed_select_if/isst_if_common.c
+++ b/drivers/platform/x86/intel/speed_select_if/isst_if_common.c
@@ -84,7 +84,7 @@ static DECLARE_HASHTABLE(isst_hash, 8);
static DEFINE_MUTEX(isst_hash_lock);
static int isst_store_new_cmd(int cmd, u32 cpu, int mbox_cmd_type, u32 param,
- u32 data)
+ u64 data)
{
struct isst_cmd *sst_cmd;
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 9f98a4f4e7216dbe366010b4cdcab6b220f229c4
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040846-deafening-unmanaged-f966@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 9f98a4f4e7216dbe366010b4cdcab6b220f229c4 Mon Sep 17 00:00:00 2001
From: Vishal Annapurve <vannapurve(a)google.com>
Date: Fri, 28 Feb 2025 01:44:15 +0000
Subject: [PATCH] x86/tdx: Fix arch_safe_halt() execution for TDX VMs
Direct HLT instruction execution causes #VEs for TDX VMs which is routed
to hypervisor via TDCALL. If HLT is executed in STI-shadow, resulting #VE
handler will enable interrupts before TDCALL is routed to hypervisor
leading to missed wakeup events, as current TDX spec doesn't expose
interruptibility state information to allow #VE handler to selectively
enable interrupts.
Commit bfe6ed0c6727 ("x86/tdx: Add HLT support for TDX guests")
prevented the idle routines from executing HLT instruction in STI-shadow.
But it missed the paravirt routine which can be reached via this path
as an example:
kvm_wait() =>
safe_halt() =>
raw_safe_halt() =>
arch_safe_halt() =>
irq.safe_halt() =>
pv_native_safe_halt()
To reliably handle arch_safe_halt() for TDX VMs, introduce explicit
dependency on CONFIG_PARAVIRT and override paravirt halt()/safe_halt()
routines with TDX-safe versions that execute direct TDCALL and needed
interrupt flag updates. Executing direct TDCALL brings in additional
benefit of avoiding HLT related #VEs altogether.
As tested by Ryan Afranji:
"Tested with the specjbb2015 benchmark. It has heavy lock contention which leads
to many halt calls. TDX VMs suffered a poor score before this patchset.
Verified the major performance improvement with this patchset applied."
Fixes: bfe6ed0c6727 ("x86/tdx: Add HLT support for TDX guests")
Signed-off-by: Vishal Annapurve <vannapurve(a)google.com>
Signed-off-by: Ingo Molnar <mingo(a)kernel.org>
Reviewed-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Tested-by: Ryan Afranji <afranji(a)google.com>
Cc: Andy Lutomirski <luto(a)kernel.org>
Cc: Brian Gerst <brgerst(a)gmail.com>
Cc: Juergen Gross <jgross(a)suse.com>
Cc: H. Peter Anvin <hpa(a)zytor.com>
Cc: Linus Torvalds <torvalds(a)linux-foundation.org>
Cc: Josh Poimboeuf <jpoimboe(a)redhat.com>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20250228014416.3925664-3-vannapurve@google.com
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 05b4eca156cf..f614c0522a0b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -878,6 +878,7 @@ config INTEL_TDX_GUEST
depends on X86_64 && CPU_SUP_INTEL
depends on X86_X2APIC
depends on EFI_STUB
+ depends on PARAVIRT
select ARCH_HAS_CC_PLATFORM
select X86_MEM_ENCRYPT
select X86_MCE
diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index 7772b01ab738..aa0eb4057226 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -14,6 +14,7 @@
#include <asm/ia32.h>
#include <asm/insn.h>
#include <asm/insn-eval.h>
+#include <asm/paravirt_types.h>
#include <asm/pgtable.h>
#include <asm/set_memory.h>
#include <asm/traps.h>
@@ -398,7 +399,7 @@ static int handle_halt(struct ve_info *ve)
return ve_instr_len(ve);
}
-void __cpuidle tdx_safe_halt(void)
+void __cpuidle tdx_halt(void)
{
const bool irq_disabled = false;
@@ -409,6 +410,16 @@ void __cpuidle tdx_safe_halt(void)
WARN_ONCE(1, "HLT instruction emulation failed\n");
}
+static void __cpuidle tdx_safe_halt(void)
+{
+ tdx_halt();
+ /*
+ * "__cpuidle" section doesn't support instrumentation, so stick
+ * with raw_* variant that avoids tracing hooks.
+ */
+ raw_local_irq_enable();
+}
+
static int read_msr(struct pt_regs *regs, struct ve_info *ve)
{
struct tdx_module_args args = {
@@ -1109,6 +1120,19 @@ void __init tdx_early_init(void)
x86_platform.guest.enc_kexec_begin = tdx_kexec_begin;
x86_platform.guest.enc_kexec_finish = tdx_kexec_finish;
+ /*
+ * Avoid "sti;hlt" execution in TDX guests as HLT induces a #VE that
+ * will enable interrupts before HLT TDCALL invocation if executed
+ * in STI-shadow, possibly resulting in missed wakeup events.
+ *
+ * Modify all possible HLT execution paths to use TDX specific routines
+ * that directly execute TDCALL and toggle the interrupt state as
+ * needed after TDCALL completion. This also reduces HLT related #VEs
+ * in addition to having a reliable halt logic execution.
+ */
+ pv_ops.irq.safe_halt = tdx_safe_halt;
+ pv_ops.irq.halt = tdx_halt;
+
/*
* TDX intercepts the RDMSR to read the X2APIC ID in the parallel
* bringup low level code. That raises #VE which cannot be handled
diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index 65394aa9b49f..4a1922ec80cf 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -58,7 +58,7 @@ void tdx_get_ve_info(struct ve_info *ve);
bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve);
-void tdx_safe_halt(void);
+void tdx_halt(void);
bool tdx_early_handle_ve(struct pt_regs *regs);
@@ -72,7 +72,7 @@ void __init tdx_dump_td_ctls(u64 td_ctls);
#else
static inline void tdx_early_init(void) { };
-static inline void tdx_safe_halt(void) { };
+static inline void tdx_halt(void) { };
static inline bool tdx_early_handle_ve(struct pt_regs *regs) { return false; }
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 91f6ff618852..962c3ce39323 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -939,7 +939,7 @@ void __init select_idle_routine(void)
static_call_update(x86_idle, mwait_idle);
} else if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) {
pr_info("using TDX aware idle routine\n");
- static_call_update(x86_idle, tdx_safe_halt);
+ static_call_update(x86_idle, tdx_halt);
} else {
static_call_update(x86_idle, default_idle);
}
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 9f98a4f4e7216dbe366010b4cdcab6b220f229c4
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040845-quantum-situation-f9b0@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 9f98a4f4e7216dbe366010b4cdcab6b220f229c4 Mon Sep 17 00:00:00 2001
From: Vishal Annapurve <vannapurve(a)google.com>
Date: Fri, 28 Feb 2025 01:44:15 +0000
Subject: [PATCH] x86/tdx: Fix arch_safe_halt() execution for TDX VMs
Direct HLT instruction execution causes #VEs for TDX VMs which is routed
to hypervisor via TDCALL. If HLT is executed in STI-shadow, resulting #VE
handler will enable interrupts before TDCALL is routed to hypervisor
leading to missed wakeup events, as current TDX spec doesn't expose
interruptibility state information to allow #VE handler to selectively
enable interrupts.
Commit bfe6ed0c6727 ("x86/tdx: Add HLT support for TDX guests")
prevented the idle routines from executing HLT instruction in STI-shadow.
But it missed the paravirt routine which can be reached via this path
as an example:
kvm_wait() =>
safe_halt() =>
raw_safe_halt() =>
arch_safe_halt() =>
irq.safe_halt() =>
pv_native_safe_halt()
To reliably handle arch_safe_halt() for TDX VMs, introduce explicit
dependency on CONFIG_PARAVIRT and override paravirt halt()/safe_halt()
routines with TDX-safe versions that execute direct TDCALL and needed
interrupt flag updates. Executing direct TDCALL brings in additional
benefit of avoiding HLT related #VEs altogether.
As tested by Ryan Afranji:
"Tested with the specjbb2015 benchmark. It has heavy lock contention which leads
to many halt calls. TDX VMs suffered a poor score before this patchset.
Verified the major performance improvement with this patchset applied."
Fixes: bfe6ed0c6727 ("x86/tdx: Add HLT support for TDX guests")
Signed-off-by: Vishal Annapurve <vannapurve(a)google.com>
Signed-off-by: Ingo Molnar <mingo(a)kernel.org>
Reviewed-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Tested-by: Ryan Afranji <afranji(a)google.com>
Cc: Andy Lutomirski <luto(a)kernel.org>
Cc: Brian Gerst <brgerst(a)gmail.com>
Cc: Juergen Gross <jgross(a)suse.com>
Cc: H. Peter Anvin <hpa(a)zytor.com>
Cc: Linus Torvalds <torvalds(a)linux-foundation.org>
Cc: Josh Poimboeuf <jpoimboe(a)redhat.com>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20250228014416.3925664-3-vannapurve@google.com
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 05b4eca156cf..f614c0522a0b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -878,6 +878,7 @@ config INTEL_TDX_GUEST
depends on X86_64 && CPU_SUP_INTEL
depends on X86_X2APIC
depends on EFI_STUB
+ depends on PARAVIRT
select ARCH_HAS_CC_PLATFORM
select X86_MEM_ENCRYPT
select X86_MCE
diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index 7772b01ab738..aa0eb4057226 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -14,6 +14,7 @@
#include <asm/ia32.h>
#include <asm/insn.h>
#include <asm/insn-eval.h>
+#include <asm/paravirt_types.h>
#include <asm/pgtable.h>
#include <asm/set_memory.h>
#include <asm/traps.h>
@@ -398,7 +399,7 @@ static int handle_halt(struct ve_info *ve)
return ve_instr_len(ve);
}
-void __cpuidle tdx_safe_halt(void)
+void __cpuidle tdx_halt(void)
{
const bool irq_disabled = false;
@@ -409,6 +410,16 @@ void __cpuidle tdx_safe_halt(void)
WARN_ONCE(1, "HLT instruction emulation failed\n");
}
+static void __cpuidle tdx_safe_halt(void)
+{
+ tdx_halt();
+ /*
+ * "__cpuidle" section doesn't support instrumentation, so stick
+ * with raw_* variant that avoids tracing hooks.
+ */
+ raw_local_irq_enable();
+}
+
static int read_msr(struct pt_regs *regs, struct ve_info *ve)
{
struct tdx_module_args args = {
@@ -1109,6 +1120,19 @@ void __init tdx_early_init(void)
x86_platform.guest.enc_kexec_begin = tdx_kexec_begin;
x86_platform.guest.enc_kexec_finish = tdx_kexec_finish;
+ /*
+ * Avoid "sti;hlt" execution in TDX guests as HLT induces a #VE that
+ * will enable interrupts before HLT TDCALL invocation if executed
+ * in STI-shadow, possibly resulting in missed wakeup events.
+ *
+ * Modify all possible HLT execution paths to use TDX specific routines
+ * that directly execute TDCALL and toggle the interrupt state as
+ * needed after TDCALL completion. This also reduces HLT related #VEs
+ * in addition to having a reliable halt logic execution.
+ */
+ pv_ops.irq.safe_halt = tdx_safe_halt;
+ pv_ops.irq.halt = tdx_halt;
+
/*
* TDX intercepts the RDMSR to read the X2APIC ID in the parallel
* bringup low level code. That raises #VE which cannot be handled
diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index 65394aa9b49f..4a1922ec80cf 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -58,7 +58,7 @@ void tdx_get_ve_info(struct ve_info *ve);
bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve);
-void tdx_safe_halt(void);
+void tdx_halt(void);
bool tdx_early_handle_ve(struct pt_regs *regs);
@@ -72,7 +72,7 @@ void __init tdx_dump_td_ctls(u64 td_ctls);
#else
static inline void tdx_early_init(void) { };
-static inline void tdx_safe_halt(void) { };
+static inline void tdx_halt(void) { };
static inline bool tdx_early_handle_ve(struct pt_regs *regs) { return false; }
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 91f6ff618852..962c3ce39323 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -939,7 +939,7 @@ void __init select_idle_routine(void)
static_call_update(x86_idle, mwait_idle);
} else if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) {
pr_info("using TDX aware idle routine\n");
- static_call_update(x86_idle, tdx_safe_halt);
+ static_call_update(x86_idle, tdx_halt);
} else {
static_call_update(x86_idle, default_idle);
}
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 1a15bb8303b6b104e78028b6c68f76a0d4562134
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040811-capable-unblock-8997@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 1a15bb8303b6b104e78028b6c68f76a0d4562134 Mon Sep 17 00:00:00 2001
From: Shuai Xue <xueshuai(a)linux.alibaba.com>
Date: Wed, 12 Mar 2025 19:28:50 +0800
Subject: [PATCH] x86/mce: use is_copy_from_user() to determine copy-from-user
context
Patch series "mm/hwpoison: Fix regressions in memory failure handling",
v4.
## 1. What am I trying to do:
This patchset resolves two critical regressions related to memory failure
handling that have appeared in the upstream kernel since version 5.17, as
compared to 5.10 LTS.
- copyin case: poison found in user page while kernel copying from user space
- instr case: poison found while instruction fetching in user space
## 2. What is the expected outcome and why
- For copyin case:
Kernel can recover from poison found where kernel is doing get_user() or
copy_from_user() if those places get an error return and the kernel return
-EFAULT to the process instead of crashing. More specifily, MCE handler
checks the fixup handler type to decide whether an in kernel #MC can be
recovered. When EX_TYPE_UACCESS is found, the PC jumps to recovery code
specified in _ASM_EXTABLE_FAULT() and return a -EFAULT to user space.
- For instr case:
If a poison found while instruction fetching in user space, full recovery
is possible. User process takes #PF, Linux allocates a new page and fills
by reading from storage.
## 3. What actually happens and why
- For copyin case: kernel panic since v5.17
Commit 4c132d1d844a ("x86/futex: Remove .fixup usage") introduced a new
extable fixup type, EX_TYPE_EFAULT_REG, and later patches updated the
extable fixup type for copy-from-user operations, changing it from
EX_TYPE_UACCESS to EX_TYPE_EFAULT_REG. It breaks previous EX_TYPE_UACCESS
handling when posion found in get_user() or copy_from_user().
- For instr case: user process is killed by a SIGBUS signal due to #CMCI
and #MCE race
When an uncorrected memory error is consumed there is a race between the
CMCI from the memory controller reporting an uncorrected error with a UCNA
signature, and the core reporting and SRAR signature machine check when
the data is about to be consumed.
### Background: why *UN*corrected errors tied to *C*MCI in Intel platform [1]
Prior to Icelake memory controllers reported patrol scrub events that
detected a previously unseen uncorrected error in memory by signaling a
broadcast machine check with an SRAO (Software Recoverable Action
Optional) signature in the machine check bank. This was overkill because
it's not an urgent problem that no core is on the verge of consuming that
bad data. It's also found that multi SRAO UCE may cause nested MCE
interrupts and finally become an IERR.
Hence, Intel downgrades the machine check bank signature of patrol scrub
from SRAO to UCNA (Uncorrected, No Action required), and signal changed to
#CMCI. Just to add to the confusion, Linux does take an action (in
uc_decode_notifier()) to try to offline the page despite the UC*NA*
signature name.
### Background: why #CMCI and #MCE race when poison is consuming in
Intel platform [1]
Having decided that CMCI/UCNA is the best action for patrol scrub errors,
the memory controller uses it for reads too. But the memory controller is
executing asynchronously from the core, and can't tell the difference
between a "real" read and a speculative read. So it will do CMCI/UCNA if
an error is found in any read.
Thus:
1) Core is clever and thinks address A is needed soon, issues a
speculative read.
2) Core finds it is going to use address A soon after sending the read
request
3) The CMCI from the memory controller is in a race with MCE from the
core that will soon try to retire the load from address A.
Quite often (because speculation has got better) the CMCI from the memory
controller is delivered before the core is committed to the instruction
reading address A, so the interrupt is taken, and Linux offlines the page
(marking it as poison).
## Why user process is killed for instr case
Commit 046545a661af ("mm/hwpoison: fix error page recovered but reported
"not recovered"") tries to fix noise message "Memory error not recovered"
and skips duplicate SIGBUSs due to the race. But it also introduced a bug
that kill_accessing_process() return -EHWPOISON for instr case, as result,
kill_me_maybe() send a SIGBUS to user process.
# 4. The fix, in my opinion, should be:
- For copyin case:
The key point is whether the error context is in a read from user memory.
We do not care about the ex-type if we know its a MOV reading from
userspace.
is_copy_from_user() return true when both of the following two checks are
true:
- the current instruction is copy
- source address is user memory
If copy_user is true, we set
m->kflags |= MCE_IN_KERNEL_COPYIN | MCE_IN_KERNEL_RECOV;
Then do_machine_check() will try fixup_exception() first.
- For instr case: let kill_accessing_process() return 0 to prevent a SIGBUS.
- For patch 3:
The return value of memory_failure() is quite important while discussed
instr case regression with Tony and Miaohe for patch 2, so add comment
about the return value.
This patch (of 3):
Commit 4c132d1d844a ("x86/futex: Remove .fixup usage") introduced a new
extable fixup type, EX_TYPE_EFAULT_REG, and commit 4c132d1d844a
("x86/futex: Remove .fixup usage") updated the extable fixup type for
copy-from-user operations, changing it from EX_TYPE_UACCESS to
EX_TYPE_EFAULT_REG. The error context for copy-from-user operations no
longer functions as an in-kernel recovery context. Consequently, the
error context for copy-from-user operations no longer functions as an
in-kernel recovery context, resulting in kernel panics with the message:
"Machine check: Data load in unrecoverable area of kernel."
To address this, it is crucial to identify if an error context involves a
read operation from user memory. The function is_copy_from_user() can be
utilized to determine:
- the current operation is copy
- when reading user memory
When these conditions are met, is_copy_from_user() will return true,
confirming that it is indeed a direct copy from user memory. This check
is essential for correctly handling the context of errors in these
operations without relying on the extable fixup types that previously
allowed for in-kernel recovery.
So, use is_copy_from_user() to determine if a context is copy user directly.
Link: https://lkml.kernel.org/r/20250312112852.82415-1-xueshuai@linux.alibaba.com
Link: https://lkml.kernel.org/r/20250312112852.82415-2-xueshuai@linux.alibaba.com
Fixes: 4c132d1d844a ("x86/futex: Remove .fixup usage")
Signed-off-by: Shuai Xue <xueshuai(a)linux.alibaba.com>
Suggested-by: Peter Zijlstra <peterz(a)infradead.org>
Acked-by: Borislav Petkov (AMD) <bp(a)alien8.de>
Tested-by: Tony Luck <tony.luck(a)intel.com>
Cc: Baolin Wang <baolin.wang(a)linux.alibaba.com>
Cc: Borislav Betkov <bp(a)alien8.de>
Cc: Catalin Marinas <catalin.marinas(a)arm.com>
Cc: Dave Hansen <dave.hansen(a)linux.intel.com>
Cc: "H. Peter Anvin" <hpa(a)zytor.com>
Cc: Ingo Molnar <mingo(a)redhat.com>
Cc: Josh Poimboeuf <jpoimboe(a)kernel.org>
Cc: Miaohe Lin <linmiaohe(a)huawei.com>
Cc: Naoya Horiguchi <nao.horiguchi(a)gmail.com>
Cc: Ruidong Tian <tianruidong(a)linux.alibaba.com>
Cc: Thomas Gleinxer <tglx(a)linutronix.de>
Cc: Yazen Ghannam <yazen.ghannam(a)amd.com>
Cc: Jane Chu <jane.chu(a)oracle.com>
Cc: Jarkko Sakkinen <jarkko(a)kernel.org>
Cc: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c
index dac4d64dfb2a..2235a7477436 100644
--- a/arch/x86/kernel/cpu/mce/severity.c
+++ b/arch/x86/kernel/cpu/mce/severity.c
@@ -300,13 +300,12 @@ static noinstr int error_context(struct mce *m, struct pt_regs *regs)
copy_user = is_copy_from_user(regs);
instrumentation_end();
- switch (fixup_type) {
- case EX_TYPE_UACCESS:
- if (!copy_user)
- return IN_KERNEL;
- m->kflags |= MCE_IN_KERNEL_COPYIN;
- fallthrough;
+ if (copy_user) {
+ m->kflags |= MCE_IN_KERNEL_COPYIN | MCE_IN_KERNEL_RECOV;
+ return IN_KERNEL_RECOV;
+ }
+ switch (fixup_type) {
case EX_TYPE_FAULT_MCE_SAFE:
case EX_TYPE_DEFAULT_MCE_SAFE:
m->kflags |= MCE_IN_KERNEL_RECOV;
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 1a15bb8303b6b104e78028b6c68f76a0d4562134
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040808-ammonia-petal-1583@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 1a15bb8303b6b104e78028b6c68f76a0d4562134 Mon Sep 17 00:00:00 2001
From: Shuai Xue <xueshuai(a)linux.alibaba.com>
Date: Wed, 12 Mar 2025 19:28:50 +0800
Subject: [PATCH] x86/mce: use is_copy_from_user() to determine copy-from-user
context
Patch series "mm/hwpoison: Fix regressions in memory failure handling",
v4.
## 1. What am I trying to do:
This patchset resolves two critical regressions related to memory failure
handling that have appeared in the upstream kernel since version 5.17, as
compared to 5.10 LTS.
- copyin case: poison found in user page while kernel copying from user space
- instr case: poison found while instruction fetching in user space
## 2. What is the expected outcome and why
- For copyin case:
Kernel can recover from poison found where kernel is doing get_user() or
copy_from_user() if those places get an error return and the kernel return
-EFAULT to the process instead of crashing. More specifily, MCE handler
checks the fixup handler type to decide whether an in kernel #MC can be
recovered. When EX_TYPE_UACCESS is found, the PC jumps to recovery code
specified in _ASM_EXTABLE_FAULT() and return a -EFAULT to user space.
- For instr case:
If a poison found while instruction fetching in user space, full recovery
is possible. User process takes #PF, Linux allocates a new page and fills
by reading from storage.
## 3. What actually happens and why
- For copyin case: kernel panic since v5.17
Commit 4c132d1d844a ("x86/futex: Remove .fixup usage") introduced a new
extable fixup type, EX_TYPE_EFAULT_REG, and later patches updated the
extable fixup type for copy-from-user operations, changing it from
EX_TYPE_UACCESS to EX_TYPE_EFAULT_REG. It breaks previous EX_TYPE_UACCESS
handling when posion found in get_user() or copy_from_user().
- For instr case: user process is killed by a SIGBUS signal due to #CMCI
and #MCE race
When an uncorrected memory error is consumed there is a race between the
CMCI from the memory controller reporting an uncorrected error with a UCNA
signature, and the core reporting and SRAR signature machine check when
the data is about to be consumed.
### Background: why *UN*corrected errors tied to *C*MCI in Intel platform [1]
Prior to Icelake memory controllers reported patrol scrub events that
detected a previously unseen uncorrected error in memory by signaling a
broadcast machine check with an SRAO (Software Recoverable Action
Optional) signature in the machine check bank. This was overkill because
it's not an urgent problem that no core is on the verge of consuming that
bad data. It's also found that multi SRAO UCE may cause nested MCE
interrupts and finally become an IERR.
Hence, Intel downgrades the machine check bank signature of patrol scrub
from SRAO to UCNA (Uncorrected, No Action required), and signal changed to
#CMCI. Just to add to the confusion, Linux does take an action (in
uc_decode_notifier()) to try to offline the page despite the UC*NA*
signature name.
### Background: why #CMCI and #MCE race when poison is consuming in
Intel platform [1]
Having decided that CMCI/UCNA is the best action for patrol scrub errors,
the memory controller uses it for reads too. But the memory controller is
executing asynchronously from the core, and can't tell the difference
between a "real" read and a speculative read. So it will do CMCI/UCNA if
an error is found in any read.
Thus:
1) Core is clever and thinks address A is needed soon, issues a
speculative read.
2) Core finds it is going to use address A soon after sending the read
request
3) The CMCI from the memory controller is in a race with MCE from the
core that will soon try to retire the load from address A.
Quite often (because speculation has got better) the CMCI from the memory
controller is delivered before the core is committed to the instruction
reading address A, so the interrupt is taken, and Linux offlines the page
(marking it as poison).
## Why user process is killed for instr case
Commit 046545a661af ("mm/hwpoison: fix error page recovered but reported
"not recovered"") tries to fix noise message "Memory error not recovered"
and skips duplicate SIGBUSs due to the race. But it also introduced a bug
that kill_accessing_process() return -EHWPOISON for instr case, as result,
kill_me_maybe() send a SIGBUS to user process.
# 4. The fix, in my opinion, should be:
- For copyin case:
The key point is whether the error context is in a read from user memory.
We do not care about the ex-type if we know its a MOV reading from
userspace.
is_copy_from_user() return true when both of the following two checks are
true:
- the current instruction is copy
- source address is user memory
If copy_user is true, we set
m->kflags |= MCE_IN_KERNEL_COPYIN | MCE_IN_KERNEL_RECOV;
Then do_machine_check() will try fixup_exception() first.
- For instr case: let kill_accessing_process() return 0 to prevent a SIGBUS.
- For patch 3:
The return value of memory_failure() is quite important while discussed
instr case regression with Tony and Miaohe for patch 2, so add comment
about the return value.
This patch (of 3):
Commit 4c132d1d844a ("x86/futex: Remove .fixup usage") introduced a new
extable fixup type, EX_TYPE_EFAULT_REG, and commit 4c132d1d844a
("x86/futex: Remove .fixup usage") updated the extable fixup type for
copy-from-user operations, changing it from EX_TYPE_UACCESS to
EX_TYPE_EFAULT_REG. The error context for copy-from-user operations no
longer functions as an in-kernel recovery context. Consequently, the
error context for copy-from-user operations no longer functions as an
in-kernel recovery context, resulting in kernel panics with the message:
"Machine check: Data load in unrecoverable area of kernel."
To address this, it is crucial to identify if an error context involves a
read operation from user memory. The function is_copy_from_user() can be
utilized to determine:
- the current operation is copy
- when reading user memory
When these conditions are met, is_copy_from_user() will return true,
confirming that it is indeed a direct copy from user memory. This check
is essential for correctly handling the context of errors in these
operations without relying on the extable fixup types that previously
allowed for in-kernel recovery.
So, use is_copy_from_user() to determine if a context is copy user directly.
Link: https://lkml.kernel.org/r/20250312112852.82415-1-xueshuai@linux.alibaba.com
Link: https://lkml.kernel.org/r/20250312112852.82415-2-xueshuai@linux.alibaba.com
Fixes: 4c132d1d844a ("x86/futex: Remove .fixup usage")
Signed-off-by: Shuai Xue <xueshuai(a)linux.alibaba.com>
Suggested-by: Peter Zijlstra <peterz(a)infradead.org>
Acked-by: Borislav Petkov (AMD) <bp(a)alien8.de>
Tested-by: Tony Luck <tony.luck(a)intel.com>
Cc: Baolin Wang <baolin.wang(a)linux.alibaba.com>
Cc: Borislav Betkov <bp(a)alien8.de>
Cc: Catalin Marinas <catalin.marinas(a)arm.com>
Cc: Dave Hansen <dave.hansen(a)linux.intel.com>
Cc: "H. Peter Anvin" <hpa(a)zytor.com>
Cc: Ingo Molnar <mingo(a)redhat.com>
Cc: Josh Poimboeuf <jpoimboe(a)kernel.org>
Cc: Miaohe Lin <linmiaohe(a)huawei.com>
Cc: Naoya Horiguchi <nao.horiguchi(a)gmail.com>
Cc: Ruidong Tian <tianruidong(a)linux.alibaba.com>
Cc: Thomas Gleinxer <tglx(a)linutronix.de>
Cc: Yazen Ghannam <yazen.ghannam(a)amd.com>
Cc: Jane Chu <jane.chu(a)oracle.com>
Cc: Jarkko Sakkinen <jarkko(a)kernel.org>
Cc: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c
index dac4d64dfb2a..2235a7477436 100644
--- a/arch/x86/kernel/cpu/mce/severity.c
+++ b/arch/x86/kernel/cpu/mce/severity.c
@@ -300,13 +300,12 @@ static noinstr int error_context(struct mce *m, struct pt_regs *regs)
copy_user = is_copy_from_user(regs);
instrumentation_end();
- switch (fixup_type) {
- case EX_TYPE_UACCESS:
- if (!copy_user)
- return IN_KERNEL;
- m->kflags |= MCE_IN_KERNEL_COPYIN;
- fallthrough;
+ if (copy_user) {
+ m->kflags |= MCE_IN_KERNEL_COPYIN | MCE_IN_KERNEL_RECOV;
+ return IN_KERNEL_RECOV;
+ }
+ switch (fixup_type) {
case EX_TYPE_FAULT_MCE_SAFE:
case EX_TYPE_DEFAULT_MCE_SAFE:
m->kflags |= MCE_IN_KERNEL_RECOV;
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 1a15bb8303b6b104e78028b6c68f76a0d4562134
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040805-enlisted-regular-7bac@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 1a15bb8303b6b104e78028b6c68f76a0d4562134 Mon Sep 17 00:00:00 2001
From: Shuai Xue <xueshuai(a)linux.alibaba.com>
Date: Wed, 12 Mar 2025 19:28:50 +0800
Subject: [PATCH] x86/mce: use is_copy_from_user() to determine copy-from-user
context
Patch series "mm/hwpoison: Fix regressions in memory failure handling",
v4.
## 1. What am I trying to do:
This patchset resolves two critical regressions related to memory failure
handling that have appeared in the upstream kernel since version 5.17, as
compared to 5.10 LTS.
- copyin case: poison found in user page while kernel copying from user space
- instr case: poison found while instruction fetching in user space
## 2. What is the expected outcome and why
- For copyin case:
Kernel can recover from poison found where kernel is doing get_user() or
copy_from_user() if those places get an error return and the kernel return
-EFAULT to the process instead of crashing. More specifily, MCE handler
checks the fixup handler type to decide whether an in kernel #MC can be
recovered. When EX_TYPE_UACCESS is found, the PC jumps to recovery code
specified in _ASM_EXTABLE_FAULT() and return a -EFAULT to user space.
- For instr case:
If a poison found while instruction fetching in user space, full recovery
is possible. User process takes #PF, Linux allocates a new page and fills
by reading from storage.
## 3. What actually happens and why
- For copyin case: kernel panic since v5.17
Commit 4c132d1d844a ("x86/futex: Remove .fixup usage") introduced a new
extable fixup type, EX_TYPE_EFAULT_REG, and later patches updated the
extable fixup type for copy-from-user operations, changing it from
EX_TYPE_UACCESS to EX_TYPE_EFAULT_REG. It breaks previous EX_TYPE_UACCESS
handling when posion found in get_user() or copy_from_user().
- For instr case: user process is killed by a SIGBUS signal due to #CMCI
and #MCE race
When an uncorrected memory error is consumed there is a race between the
CMCI from the memory controller reporting an uncorrected error with a UCNA
signature, and the core reporting and SRAR signature machine check when
the data is about to be consumed.
### Background: why *UN*corrected errors tied to *C*MCI in Intel platform [1]
Prior to Icelake memory controllers reported patrol scrub events that
detected a previously unseen uncorrected error in memory by signaling a
broadcast machine check with an SRAO (Software Recoverable Action
Optional) signature in the machine check bank. This was overkill because
it's not an urgent problem that no core is on the verge of consuming that
bad data. It's also found that multi SRAO UCE may cause nested MCE
interrupts and finally become an IERR.
Hence, Intel downgrades the machine check bank signature of patrol scrub
from SRAO to UCNA (Uncorrected, No Action required), and signal changed to
#CMCI. Just to add to the confusion, Linux does take an action (in
uc_decode_notifier()) to try to offline the page despite the UC*NA*
signature name.
### Background: why #CMCI and #MCE race when poison is consuming in
Intel platform [1]
Having decided that CMCI/UCNA is the best action for patrol scrub errors,
the memory controller uses it for reads too. But the memory controller is
executing asynchronously from the core, and can't tell the difference
between a "real" read and a speculative read. So it will do CMCI/UCNA if
an error is found in any read.
Thus:
1) Core is clever and thinks address A is needed soon, issues a
speculative read.
2) Core finds it is going to use address A soon after sending the read
request
3) The CMCI from the memory controller is in a race with MCE from the
core that will soon try to retire the load from address A.
Quite often (because speculation has got better) the CMCI from the memory
controller is delivered before the core is committed to the instruction
reading address A, so the interrupt is taken, and Linux offlines the page
(marking it as poison).
## Why user process is killed for instr case
Commit 046545a661af ("mm/hwpoison: fix error page recovered but reported
"not recovered"") tries to fix noise message "Memory error not recovered"
and skips duplicate SIGBUSs due to the race. But it also introduced a bug
that kill_accessing_process() return -EHWPOISON for instr case, as result,
kill_me_maybe() send a SIGBUS to user process.
# 4. The fix, in my opinion, should be:
- For copyin case:
The key point is whether the error context is in a read from user memory.
We do not care about the ex-type if we know its a MOV reading from
userspace.
is_copy_from_user() return true when both of the following two checks are
true:
- the current instruction is copy
- source address is user memory
If copy_user is true, we set
m->kflags |= MCE_IN_KERNEL_COPYIN | MCE_IN_KERNEL_RECOV;
Then do_machine_check() will try fixup_exception() first.
- For instr case: let kill_accessing_process() return 0 to prevent a SIGBUS.
- For patch 3:
The return value of memory_failure() is quite important while discussed
instr case regression with Tony and Miaohe for patch 2, so add comment
about the return value.
This patch (of 3):
Commit 4c132d1d844a ("x86/futex: Remove .fixup usage") introduced a new
extable fixup type, EX_TYPE_EFAULT_REG, and commit 4c132d1d844a
("x86/futex: Remove .fixup usage") updated the extable fixup type for
copy-from-user operations, changing it from EX_TYPE_UACCESS to
EX_TYPE_EFAULT_REG. The error context for copy-from-user operations no
longer functions as an in-kernel recovery context. Consequently, the
error context for copy-from-user operations no longer functions as an
in-kernel recovery context, resulting in kernel panics with the message:
"Machine check: Data load in unrecoverable area of kernel."
To address this, it is crucial to identify if an error context involves a
read operation from user memory. The function is_copy_from_user() can be
utilized to determine:
- the current operation is copy
- when reading user memory
When these conditions are met, is_copy_from_user() will return true,
confirming that it is indeed a direct copy from user memory. This check
is essential for correctly handling the context of errors in these
operations without relying on the extable fixup types that previously
allowed for in-kernel recovery.
So, use is_copy_from_user() to determine if a context is copy user directly.
Link: https://lkml.kernel.org/r/20250312112852.82415-1-xueshuai@linux.alibaba.com
Link: https://lkml.kernel.org/r/20250312112852.82415-2-xueshuai@linux.alibaba.com
Fixes: 4c132d1d844a ("x86/futex: Remove .fixup usage")
Signed-off-by: Shuai Xue <xueshuai(a)linux.alibaba.com>
Suggested-by: Peter Zijlstra <peterz(a)infradead.org>
Acked-by: Borislav Petkov (AMD) <bp(a)alien8.de>
Tested-by: Tony Luck <tony.luck(a)intel.com>
Cc: Baolin Wang <baolin.wang(a)linux.alibaba.com>
Cc: Borislav Betkov <bp(a)alien8.de>
Cc: Catalin Marinas <catalin.marinas(a)arm.com>
Cc: Dave Hansen <dave.hansen(a)linux.intel.com>
Cc: "H. Peter Anvin" <hpa(a)zytor.com>
Cc: Ingo Molnar <mingo(a)redhat.com>
Cc: Josh Poimboeuf <jpoimboe(a)kernel.org>
Cc: Miaohe Lin <linmiaohe(a)huawei.com>
Cc: Naoya Horiguchi <nao.horiguchi(a)gmail.com>
Cc: Ruidong Tian <tianruidong(a)linux.alibaba.com>
Cc: Thomas Gleinxer <tglx(a)linutronix.de>
Cc: Yazen Ghannam <yazen.ghannam(a)amd.com>
Cc: Jane Chu <jane.chu(a)oracle.com>
Cc: Jarkko Sakkinen <jarkko(a)kernel.org>
Cc: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c
index dac4d64dfb2a..2235a7477436 100644
--- a/arch/x86/kernel/cpu/mce/severity.c
+++ b/arch/x86/kernel/cpu/mce/severity.c
@@ -300,13 +300,12 @@ static noinstr int error_context(struct mce *m, struct pt_regs *regs)
copy_user = is_copy_from_user(regs);
instrumentation_end();
- switch (fixup_type) {
- case EX_TYPE_UACCESS:
- if (!copy_user)
- return IN_KERNEL;
- m->kflags |= MCE_IN_KERNEL_COPYIN;
- fallthrough;
+ if (copy_user) {
+ m->kflags |= MCE_IN_KERNEL_COPYIN | MCE_IN_KERNEL_RECOV;
+ return IN_KERNEL_RECOV;
+ }
+ switch (fixup_type) {
case EX_TYPE_FAULT_MCE_SAFE:
case EX_TYPE_DEFAULT_MCE_SAFE:
m->kflags |= MCE_IN_KERNEL_RECOV;
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 60f3caff1492e5b8616b9578c4bedb5c0a88ed14
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025040845-repeater-uninvited-9d3b@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 60f3caff1492e5b8616b9578c4bedb5c0a88ed14 Mon Sep 17 00:00:00 2001
From: Hengqi Chen <hengqi.chen(a)gmail.com>
Date: Sun, 30 Mar 2025 16:31:09 +0800
Subject: [PATCH] LoongArch: BPF: Don't override subprog's return value
The verifier test `calls: div by 0 in subprog` triggers a panic at the
ld.bu instruction. The ld.bu insn is trying to load byte from memory
address returned by the subprog. The subprog actually set the correct
address at the a5 register (dedicated register for BPF return values).
But at commit 73c359d1d356 ("LoongArch: BPF: Sign-extend return values")
we also sign extended a5 to the a0 register (return value in LoongArch).
For function call insn, we later propagate the a0 register back to a5
register. This is right for native calls but wrong for bpf2bpf calls
which expect zero-extended return value in a5 register. So only move a0
to a5 for native calls (i.e. non-BPF_PSEUDO_CALL).
Cc: stable(a)vger.kernel.org
Fixes: 73c359d1d356 ("LoongArch: BPF: Sign-extend return values")
Signed-off-by: Hengqi Chen <hengqi.chen(a)gmail.com>
Signed-off-by: Huacai Chen <chenhuacai(a)loongson.cn>
diff --git a/arch/loongarch/net/bpf_jit.c b/arch/loongarch/net/bpf_jit.c
index a06bf89fed67..fa1500d4aa3e 100644
--- a/arch/loongarch/net/bpf_jit.c
+++ b/arch/loongarch/net/bpf_jit.c
@@ -907,7 +907,10 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, bool ext
move_addr(ctx, t1, func_addr);
emit_insn(ctx, jirl, LOONGARCH_GPR_RA, t1, 0);
- move_reg(ctx, regmap[BPF_REG_0], LOONGARCH_GPR_A0);
+
+ if (insn->src_reg != BPF_PSEUDO_CALL)
+ move_reg(ctx, regmap[BPF_REG_0], LOONGARCH_GPR_A0);
+
break;
/* tail call */
On 4/7/25 17:25, Sasha Levin wrote:
> This is a note to let you know that I've just added the patch titled
>
> wifi: mac80211: remove debugfs dir for virtual monitor
>
> to the 6.13-stable tree which can be found at:
> http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
>
> The filename of the patch is:
> wifi-mac80211-remove-debugfs-dir-for-virtual-monitor.patch
> and it can be found in the queue-6.13 subdirectory.
>
> If you, or anyone else, feels it should not be added to the stable tree,
> please let <stable(a)vger.kernel.org> know about it.
>
The commit here is causing a sparse warning. Please always add commit
861d0445e72e ("wifi: mac80211: Fix sparse warning for monitor_sdata")
when you backport this patch to avoid that. (So far I got the backport
notification for 6.12 and 6.13.)
It's also no big deal when you decide to not backport this patch.
>
>
> commit 193bafe31a2a08b5631a98ecf148fa0d18e94b0d
> Author: Alexander Wetzel <Alexander(a)wetzel-home.de>
> Date: Tue Feb 4 17:42:40 2025 +0100
>
> wifi: mac80211: remove debugfs dir for virtual monitor
>
> [ Upstream commit 646262c71aca87bb66945933abe4e620796d6c5a ]
>
> Don't call ieee80211_debugfs_recreate_netdev() for virtual monitor
> interface when deleting it.
>
> The virtual monitor interface shouldn't have debugfs entries and trying
> to update them will *create* them on deletion.
>
> And when the virtual monitor interface is created/destroyed multiple
> times we'll get warnings about debugfs name conflicts.
>
> Signed-off-by: Alexander Wetzel <Alexander(a)wetzel-home.de>
> Link: https://patch.msgid.link/20250204164240.370153-1-Alexander@wetzel-home.de
> Signed-off-by: Johannes Berg <johannes.berg(a)intel.com>
> Signed-off-by: Sasha Levin <sashal(a)kernel.org>
>
> diff --git a/net/mac80211/driver-ops.c b/net/mac80211/driver-ops.c
> index 299d38e9e8630..2fc60e1e77a55 100644
> --- a/net/mac80211/driver-ops.c
> +++ b/net/mac80211/driver-ops.c
> @@ -116,8 +116,14 @@ void drv_remove_interface(struct ieee80211_local *local,
>
> sdata->flags &= ~IEEE80211_SDATA_IN_DRIVER;
>
> - /* Remove driver debugfs entries */
> - ieee80211_debugfs_recreate_netdev(sdata, sdata->vif.valid_links);
> + /*
> + * Remove driver debugfs entries.
> + * The virtual monitor interface doesn't get a debugfs
> + * entry, so it's exempt here.
> + */
> + if (sdata != local->monitor_sdata)
> + ieee80211_debugfs_recreate_netdev(sdata,
> + sdata->vif.valid_links);
>
> trace_drv_remove_interface(local, sdata);
> local->ops->remove_interface(&local->hw, &sdata->vif);
> diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
> index 806dffa48ef92..04b3626387309 100644
> --- a/net/mac80211/iface.c
> +++ b/net/mac80211/iface.c
> @@ -1212,16 +1212,17 @@ void ieee80211_del_virtual_monitor(struct ieee80211_local *local)
> return;
> }
>
> - RCU_INIT_POINTER(local->monitor_sdata, NULL);
> - mutex_unlock(&local->iflist_mtx);
> -
> - synchronize_net();
> -
> + clear_bit(SDATA_STATE_RUNNING, &sdata->state);
> ieee80211_link_release_channel(&sdata->deflink);
>
> if (ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF))
> drv_remove_interface(local, sdata);
>
> + RCU_INIT_POINTER(local->monitor_sdata, NULL);
> + mutex_unlock(&local->iflist_mtx);
> +
> + synchronize_net();
> +
> kfree(sdata);
> }
>
The sdio_read32() calls sd_read(), but does not handle the error if
sd_read() fails. This could lead to subsequent operations processing
invalid data. A proper implementation can be found in sdio_readN(),
which has an error handling for the sd_read().
Add error handling for the sd_read() to free tmpbuf and return error
code if sd_read() fails. This ensure that the memcpy() is only performed
when the read operation is successful.
Since none of the callers check for the errors, there is no need to
return the error code propagated from sd_read(). Returning SDIO_ERR_VAL32
might be a better choice, which is a specialized error code for SDIO.
Another problem of returning propagated error code is that the error
code is a s32 type value, which is not fit with the u32 type return value
of the sdio_read32().
An practical option would be to go through all the callers and add error
handling, which need to pass a pointer to u32 *val and return zero on
success or negative on failure. It is not a better choice since will cost
unnecessary effort on the error code.
The other opion is to replace sd_read() by sd_read32(), which return an
u32 type error code that can be directly used as the return value of
sdio_read32(). But, it is also a bad choice to use sd_read32() in a
alignment failed branch.
Fixes: 554c0a3abf21 ("staging: Add rtl8723bs sdio wifi driver")
Cc: stable(a)vger.kernel.org # v4.12+
Signed-off-by: Wentao Liang <vulab(a)iscas.ac.cn>
---
v7: Fix error code and add patch explanation
v6: Fix improper code to propagate error code
v5: Fix error code
v4: Add change log and fix error code
v3: Add Cc flag
v2: Change code to initialize val
drivers/staging/rtl8723bs/hal/sdio_ops.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/drivers/staging/rtl8723bs/hal/sdio_ops.c b/drivers/staging/rtl8723bs/hal/sdio_ops.c
index 21e9f1858745..d79d41727042 100644
--- a/drivers/staging/rtl8723bs/hal/sdio_ops.c
+++ b/drivers/staging/rtl8723bs/hal/sdio_ops.c
@@ -185,7 +185,12 @@ static u32 sdio_read32(struct intf_hdl *intfhdl, u32 addr)
return SDIO_ERR_VAL32;
ftaddr &= ~(u16)0x3;
- sd_read(intfhdl, ftaddr, 8, tmpbuf);
+ err = sd_read(intfhdl, ftaddr, 8, tmpbuf);
+ if (err) {
+ kfree(tmpbuf);
+ return SDIO_ERR_VAL32;
+ }
+
memcpy(&le_tmp, tmpbuf + shift, 4);
val = le32_to_cpu(le_tmp);
--
2.42.0.windows.2
In tpacpi_battery_init(), the return value of tpacpi_check_quirks() needs
to be checked. The battery should not be hooked if there is no matched
battery information in quirk table.
Add an error check and return -ENODEV immediately if the device fail
the check.
Fixes: 1a32ebb26ba9 ("platform/x86: thinkpad_acpi: Support battery quirk")
Cc: stable(a)vger.kernel.org
Signed-off-by: Wentao Liang <vulab(a)iscas.ac.cn>
---
v3: Fix error code
v2: Fix double assignment error
drivers/platform/x86/thinkpad_acpi.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c
index 2cfb2ac3f465..ab538bf53716 100644
--- a/drivers/platform/x86/thinkpad_acpi.c
+++ b/drivers/platform/x86/thinkpad_acpi.c
@@ -9974,6 +9974,8 @@ static int __init tpacpi_battery_init(struct ibm_init_struct *ibm)
tp_features.battery_force_primary = tpacpi_check_quirks(
battery_quirk_table,
ARRAY_SIZE(battery_quirk_table));
+ if (!tp_features.battery_force_primary)
+ return -ENODEV;
battery_hook_register(&battery_hook);
return 0;
--
2.42.0.windows.2
From: Alex Hung <alex.hung(a)amd.com>
[ Upstream commit 8aa2864044b9d13e95fe224f32e808afbf79ecdf ]
[WHY & HOW]
dc->links[] has max size of MAX_LINKS and NULL is return when trying to
access with out-of-bound index.
This fixes 3 OVERRUN and 1 RESOURCE_LEAK issues reported by Coverity.
Reviewed-by: Harry Wentland <harry.wentland(a)amd.com>
Acked-by: Tom Chung <chiahsuan.chung(a)amd.com>
Signed-off-by: Alex Hung <alex.hung(a)amd.com>
Tested-by: Daniel Wheeler <daniel.wheeler(a)amd.com>
Signed-off-by: Alex Deucher <alexander.deucher(a)amd.com>
[The macro MAX_LINKS is introduced by Commit 60df5628144b ("drm/amd/display:
handle invalid connector indices") after 6.10. So here we still use the
original array length MAX_PIPES * 2]
Signed-off-by: Jianqi Ren <jianqi.ren.cn(a)windriver.com>
Signed-off-by: He Zhe <zhe.he(a)windriver.com>
---
Verified the build test
---
drivers/gpu/drm/amd/display/dc/core/dc_link_exports.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_link_exports.c b/drivers/gpu/drm/amd/display/dc/core/dc_link_exports.c
index f365773d5714..e9b3c1c7a931 100644
--- a/drivers/gpu/drm/amd/display/dc/core/dc_link_exports.c
+++ b/drivers/gpu/drm/amd/display/dc/core/dc_link_exports.c
@@ -37,6 +37,9 @@
#include "dce/dce_i2c.h"
struct dc_link *dc_get_link_at_index(struct dc *dc, uint32_t link_index)
{
+ if (link_index >= (MAX_PIPES * 2))
+ return NULL;
+
return dc->links[link_index];
}
--
2.34.1
On destroy, we should set each node dead. But current code miss this
when the maple tree has only the root node.
The reason is mt_destroy_walk() leverage mte_destroy_descend() to set
node dead, but this is skipped since the only root node is a leaf.
Fixes this by setting the node dead if it is a leaf.
Fixes: 54a611b60590 ("Maple Tree: add new data structure")
Signed-off-by: Wei Yang <richard.weiyang(a)gmail.com>
CC: Liam R. Howlett <Liam.Howlett(a)Oracle.com>
Cc: <stable(a)vger.kernel.org>
---
v2:
* move the operation into mt_destroy_walk()
* adjust the title accordingly
---
lib/maple_tree.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index 4bd5a5be1440..0696e8d1c4e9 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -5284,6 +5284,7 @@ static void mt_destroy_walk(struct maple_enode *enode, struct maple_tree *mt,
struct maple_enode *start;
if (mte_is_leaf(enode)) {
+ mte_set_node_dead(enode);
node->type = mte_node_type(enode);
goto free_leaf;
}
--
2.34.1
Replace kzalloc with kvzalloc for the exit_dump buffer allocation, which
can require large contiguous memory (up to order=9) depending on the
implementation. This change prevents allocation failures by allowing the
system to fall back to vmalloc when contiguous memory allocation fails.
Since this buffer is only used for debugging purposes, physical memory
contiguity is not required, making vmalloc a suitable alternative.
Cc: stable(a)vger.kernel.org
Fixes: 07814a9439a3b0 ("sched_ext: Print debug dump after an error exit")
Suggested-by: Rik van Riel <riel(a)surriel.com>
Signed-off-by: Breno Leitao <leitao(a)debian.org>
---
kernel/sched/ext.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 66bcd40a28ca1..c82725f9b0559 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4639,7 +4639,7 @@ static struct scx_exit_info *alloc_exit_info(size_t exit_dump_len)
ei->bt = kcalloc(SCX_EXIT_BT_LEN, sizeof(ei->bt[0]), GFP_KERNEL);
ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL);
- ei->dump = kzalloc(exit_dump_len, GFP_KERNEL);
+ ei->dump = kvzalloc(exit_dump_len, GFP_KERNEL);
if (!ei->bt || !ei->msg || !ei->dump) {
free_exit_info(ei);
---
base-commit: 0af2f6be1b4281385b618cb86ad946eded089ac8
change-id: 20250407-scx-11dbf94803c3
Best regards,
--
Breno Leitao <leitao(a)debian.org>
On Mon, Apr 07 2025 at 10:07, Sasha Levin wrote:
> This is a note to let you know that I've just added the patch titled
>
> timekeeping: Fix possible inconsistencies in _COARSE clockids
>
> to the 6.14-stable tree which can be found at:
> http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
>
> The filename of the patch is:
> timekeeping-fix-possible-inconsistencies-in-_coarse-.patch
> and it can be found in the queue-6.14 subdirectory.
>
> If you, or anyone else, feels it should not be added to the stable tree,
> please let <stable(a)vger.kernel.org> know about it.
As I asked on the stable list already, please do not add that to any
stable tree. It has been reverted in Linus tree and the problem will be
fixed differently.
Thanks,
tglx
From: Oleg Nesterov <oleg(a)redhat.com>
[ Upstream commit 975776841e689dd8ba36df9fa72ac3eca3c2957a ]
kernel/sched/isolation.c obviously makes no sense without CONFIG_SMP, but
the Kconfig entry we have right now:
config CPU_ISOLATION
bool "CPU isolation"
depends on SMP || COMPILE_TEST
allows the creation of pointless .config's which cause
build failures.
Reported-by: kernel test robot <lkp(a)intel.com>
Signed-off-by: Oleg Nesterov <oleg(a)redhat.com>
Signed-off-by: Ingo Molnar <mingo(a)kernel.org>
Link: https://lore.kernel.org/r/20250330134955.GA7910@redhat.com
Closes: https://lore.kernel.org/oe-kbuild-all/202503260646.lrUqD3j5-lkp@intel.com/
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
init/Kconfig | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/init/Kconfig b/init/Kconfig
index f641518f4ac5c..01beb047aff2f 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -559,7 +559,7 @@ endmenu # "CPU/Task time and stats accounting"
config CPU_ISOLATION
bool "CPU isolation"
- depends on SMP || COMPILE_TEST
+ depends on SMP
default y
help
Make sure that CPUs running critical tasks are not disturbed by
--
2.39.5
From: Oleg Nesterov <oleg(a)redhat.com>
[ Upstream commit 975776841e689dd8ba36df9fa72ac3eca3c2957a ]
kernel/sched/isolation.c obviously makes no sense without CONFIG_SMP, but
the Kconfig entry we have right now:
config CPU_ISOLATION
bool "CPU isolation"
depends on SMP || COMPILE_TEST
allows the creation of pointless .config's which cause
build failures.
Reported-by: kernel test robot <lkp(a)intel.com>
Signed-off-by: Oleg Nesterov <oleg(a)redhat.com>
Signed-off-by: Ingo Molnar <mingo(a)kernel.org>
Link: https://lore.kernel.org/r/20250330134955.GA7910@redhat.com
Closes: https://lore.kernel.org/oe-kbuild-all/202503260646.lrUqD3j5-lkp@intel.com/
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
init/Kconfig | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/init/Kconfig b/init/Kconfig
index 9807c66b24bb6..d9dd8cbe99b11 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -656,7 +656,7 @@ endmenu # "CPU/Task time and stats accounting"
config CPU_ISOLATION
bool "CPU isolation"
- depends on SMP || COMPILE_TEST
+ depends on SMP
default y
help
Make sure that CPUs running critical tasks are not disturbed by
--
2.39.5
From: Oleg Nesterov <oleg(a)redhat.com>
[ Upstream commit 975776841e689dd8ba36df9fa72ac3eca3c2957a ]
kernel/sched/isolation.c obviously makes no sense without CONFIG_SMP, but
the Kconfig entry we have right now:
config CPU_ISOLATION
bool "CPU isolation"
depends on SMP || COMPILE_TEST
allows the creation of pointless .config's which cause
build failures.
Reported-by: kernel test robot <lkp(a)intel.com>
Signed-off-by: Oleg Nesterov <oleg(a)redhat.com>
Signed-off-by: Ingo Molnar <mingo(a)kernel.org>
Link: https://lore.kernel.org/r/20250330134955.GA7910@redhat.com
Closes: https://lore.kernel.org/oe-kbuild-all/202503260646.lrUqD3j5-lkp@intel.com/
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
init/Kconfig | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/init/Kconfig b/init/Kconfig
index dafc3ba6fa7a1..c7d81fc823964 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -674,7 +674,7 @@ endmenu # "CPU/Task time and stats accounting"
config CPU_ISOLATION
bool "CPU isolation"
- depends on SMP || COMPILE_TEST
+ depends on SMP
default y
help
Make sure that CPUs running critical tasks are not disturbed by
--
2.39.5
From: Oleg Nesterov <oleg(a)redhat.com>
[ Upstream commit 975776841e689dd8ba36df9fa72ac3eca3c2957a ]
kernel/sched/isolation.c obviously makes no sense without CONFIG_SMP, but
the Kconfig entry we have right now:
config CPU_ISOLATION
bool "CPU isolation"
depends on SMP || COMPILE_TEST
allows the creation of pointless .config's which cause
build failures.
Reported-by: kernel test robot <lkp(a)intel.com>
Signed-off-by: Oleg Nesterov <oleg(a)redhat.com>
Signed-off-by: Ingo Molnar <mingo(a)kernel.org>
Link: https://lore.kernel.org/r/20250330134955.GA7910@redhat.com
Closes: https://lore.kernel.org/oe-kbuild-all/202503260646.lrUqD3j5-lkp@intel.com/
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
init/Kconfig | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/init/Kconfig b/init/Kconfig
index b6786ddc88a80..8b6a2848da4a5 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -678,7 +678,7 @@ endmenu # "CPU/Task time and stats accounting"
config CPU_ISOLATION
bool "CPU isolation"
- depends on SMP || COMPILE_TEST
+ depends on SMP
default y
help
Make sure that CPUs running critical tasks are not disturbed by
--
2.39.5
From: Oleg Nesterov <oleg(a)redhat.com>
[ Upstream commit 975776841e689dd8ba36df9fa72ac3eca3c2957a ]
kernel/sched/isolation.c obviously makes no sense without CONFIG_SMP, but
the Kconfig entry we have right now:
config CPU_ISOLATION
bool "CPU isolation"
depends on SMP || COMPILE_TEST
allows the creation of pointless .config's which cause
build failures.
Reported-by: kernel test robot <lkp(a)intel.com>
Signed-off-by: Oleg Nesterov <oleg(a)redhat.com>
Signed-off-by: Ingo Molnar <mingo(a)kernel.org>
Link: https://lore.kernel.org/r/20250330134955.GA7910@redhat.com
Closes: https://lore.kernel.org/oe-kbuild-all/202503260646.lrUqD3j5-lkp@intel.com/
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
init/Kconfig | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/init/Kconfig b/init/Kconfig
index 1105cb53f391a..8b630143c720f 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -689,7 +689,7 @@ endmenu # "CPU/Task time and stats accounting"
config CPU_ISOLATION
bool "CPU isolation"
- depends on SMP || COMPILE_TEST
+ depends on SMP
default y
help
Make sure that CPUs running critical tasks are not disturbed by
--
2.39.5
From: Thomas Weißschuh <linux(a)weissschuh.net>
[ Upstream commit dbdffaf50ff9cee3259a7cef8a7bd9e0f0ba9f13 ]
Remap source path prefixes in all output, including compiler
diagnostics, debug information, macro expansions, etc.
This removes a few absolute paths from the binary and also makes it
possible to use core::panic::Location properly.
Equivalent to the same configuration done for C sources in
commit 1d3730f0012f ("kbuild: support -fmacro-prefix-map for external
modules") and commit a73619a845d5 ("kbuild: use -fmacro-prefix-map to
make __FILE__ a relative path").
Link: https://doc.rust-lang.org/rustc/command-line-arguments.html#--remap-path-pr…
Acked-by: Miguel Ojeda <ojeda(a)kernel.org>
Signed-off-by: Thomas Weißschuh <linux(a)weissschuh.net>
Tested-by: Gary Guo <gary(a)garyguo.net>
Signed-off-by: Masahiro Yamada <masahiroy(a)kernel.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
Makefile | 1 +
1 file changed, 1 insertion(+)
diff --git a/Makefile b/Makefile
index 21a34e8991ac6..9c1b66bf4bf6e 100644
--- a/Makefile
+++ b/Makefile
@@ -1067,6 +1067,7 @@ endif
# change __FILE__ to the relative path to the source directory
ifdef building_out_of_srctree
KBUILD_CPPFLAGS += $(call cc-option,-fmacro-prefix-map=$(srcroot)/=)
+KBUILD_RUSTFLAGS += --remap-path-prefix=$(srcroot)/=
endif
# include additional Makefiles when needed
--
2.39.5
From: Thomas Weißschuh <linux(a)weissschuh.net>
[ Upstream commit dbdffaf50ff9cee3259a7cef8a7bd9e0f0ba9f13 ]
Remap source path prefixes in all output, including compiler
diagnostics, debug information, macro expansions, etc.
This removes a few absolute paths from the binary and also makes it
possible to use core::panic::Location properly.
Equivalent to the same configuration done for C sources in
commit 1d3730f0012f ("kbuild: support -fmacro-prefix-map for external
modules") and commit a73619a845d5 ("kbuild: use -fmacro-prefix-map to
make __FILE__ a relative path").
Link: https://doc.rust-lang.org/rustc/command-line-arguments.html#--remap-path-pr…
Acked-by: Miguel Ojeda <ojeda(a)kernel.org>
Signed-off-by: Thomas Weißschuh <linux(a)weissschuh.net>
Tested-by: Gary Guo <gary(a)garyguo.net>
Signed-off-by: Masahiro Yamada <masahiroy(a)kernel.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
Makefile | 1 +
1 file changed, 1 insertion(+)
diff --git a/Makefile b/Makefile
index 8b6764d44a610..c4eeb97ab3df2 100644
--- a/Makefile
+++ b/Makefile
@@ -1068,6 +1068,7 @@ endif
# change __FILE__ to the relative path to the source directory
ifdef building_out_of_srctree
KBUILD_CPPFLAGS += $(call cc-option,-fmacro-prefix-map=$(srcroot)/=)
+KBUILD_RUSTFLAGS += --remap-path-prefix=$(srcroot)/=
endif
# include additional Makefiles when needed
--
2.39.5
Asus laptops with sound PCI subsystem ID 1043:1f43 have the DMICs
connected to the host instead of the CS42L43 so need the
SOC_SDW_CODEC_MIC quirk.
Link: https://github.com/thesofproject/sof/issues/9930
Fixes: 084344970808 ("ASoC: Intel: sof_sdw: Add quirk for Asus Zenbook S14")
Signed-off-by: Peter Ujfalusi <peter.ujfalusi(a)linux.intel.com>
Reviewed-by: Bard Liao <yung-chuan.liao(a)linux.intel.com>
Reviewed-by: Simon Trimmer <simont(a)opensource.cirrus.com>
Cc: stable(a)vger.kernel.org
---
Hi,
The S16 variant of the Zenbook has 4 PCH connected DMICs and the new
topology needed for it is released alongside with the -2ch variant for
the S14.
Add a Fixes tag so this is backported to 6.14 since we have at least one
user with this laptop without audio support.
Regards,
Peter
sound/soc/intel/boards/sof_sdw.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/sound/soc/intel/boards/sof_sdw.c b/sound/soc/intel/boards/sof_sdw.c
index 90dafa810b2e..095d08b3fc82 100644
--- a/sound/soc/intel/boards/sof_sdw.c
+++ b/sound/soc/intel/boards/sof_sdw.c
@@ -764,6 +764,7 @@ static const struct dmi_system_id sof_sdw_quirk_table[] = {
static const struct snd_pci_quirk sof_sdw_ssid_quirk_table[] = {
SND_PCI_QUIRK(0x1043, 0x1e13, "ASUS Zenbook S14", SOC_SDW_CODEC_MIC),
+ SND_PCI_QUIRK(0x1043, 0x1f43, "ASUS Zenbook S16", SOC_SDW_CODEC_MIC),
{}
};
--
2.49.0
Case values introduced in commit
5f78e1fb7a3e ("ASoC: qcom: Add driver support for audioreach solution")
cause out of bounds access in arrays of sc7280 driver data (e.g. in case
of RX_CODEC_DMA_RX_0 in sc7280_snd_hw_params()).
Redefine LPASS_MAX_PORTS to consider the maximum possible port id for
q6dsp as sc7280 driver utilizes some of those values.
Found by Linux Verification Center (linuxtesting.org) with SVACE.
Fixes: 77d0ffef793d ("ASoC: qcom: Add macro for lpass DAI id's max limit")
Cc: stable(a)vger.kernel.org # v6.0+
Suggested-by: Mikhail Kobuk <m.kobuk(a)ispras.ru>
Suggested-by: Alexey Khoroshilov <khoroshilov(a)ispras.ru>
Signed-off-by: Evgeny Pimenov <pimenoveu12(a)gmail.com>
---
sound/soc/qcom/lpass.h | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/sound/soc/qcom/lpass.h b/sound/soc/qcom/lpass.h
index 27a2bf9a6613..de3ec6f594c1 100644
--- a/sound/soc/qcom/lpass.h
+++ b/sound/soc/qcom/lpass.h
@@ -13,10 +13,11 @@
#include <linux/platform_device.h>
#include <linux/regmap.h>
#include <dt-bindings/sound/qcom,lpass.h>
+#include <dt-bindings/sound/qcom,q6afe.h>
#include "lpass-hdmi.h"
#define LPASS_AHBIX_CLOCK_FREQUENCY 131072000
-#define LPASS_MAX_PORTS (LPASS_CDC_DMA_VA_TX8 + 1)
+#define LPASS_MAX_PORTS (DISPLAY_PORT_RX_7 + 1)
#define LPASS_MAX_MI2S_PORTS (8)
#define LPASS_MAX_DMA_CHANNELS (8)
#define LPASS_MAX_HDMI_DMA_CHANNELS (4)
--
2.39.5
Commit a8091f039c1e ("maple_tree: add MAS_UNDERFLOW and MAS_OVERFLOW
states") adds more status during maple tree walk. But it introduce a
typo on the status check during walk.
It expects to mean neither active nor start, we would restart the walk,
while current code means we would always restart the walk.
Fixes: a8091f039c1e ("maple_tree: add MAS_UNDERFLOW and MAS_OVERFLOW states")
Signed-off-by: Wei Yang <richard.weiyang(a)gmail.com>
CC: Liam R. Howlett <Liam.Howlett(a)Oracle.com>
CC: <stable(a)vger.kernel.org>
Reviewed-by: Liam R. Howlett <Liam.Howlett(a)Oracle.com>
---
lib/maple_tree.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index 0696e8d1c4e9..81970b3a6af7 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -4895,7 +4895,7 @@ void *mas_walk(struct ma_state *mas)
{
void *entry;
- if (!mas_is_active(mas) || !mas_is_start(mas))
+ if (!mas_is_active(mas) && !mas_is_start(mas))
mas->status = ma_start;
retry:
entry = mas_state_walk(mas);
--
2.34.1
The patch titled
Subject: mm: fix filemap_get_folios_contig returning batches of identical folios
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-fix-filemap_get_folios_contig-returning-batches-of-identical-folios.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: "Vishal Moola (Oracle)" <vishal.moola(a)gmail.com>
Subject: mm: fix filemap_get_folios_contig returning batches of identical folios
Date: Thu, 3 Apr 2025 16:54:17 -0700
filemap_get_folios_contig() is supposed to return distinct folios found
within [start, end]. Large folios in the Xarray become multi-index
entries. xas_next() can iterate through the sub-indexes before finding a
sibling entry and breaking out of the loop.
This can result in a returned folio_batch containing an indeterminate
number of duplicate folios, which forces the callers to skeptically handle
the returned batch. This is inefficient and incurs a large maintenance
overhead.
We can fix this by calling xas_advance() after we have successfully adding
a folio to the batch to ensure our Xarray is positioned such that it will
correctly find the next folio - similar to filemap_get_read_batch().
Link: https://lkml.kernel.org/r/Z-8s1-kiIDkzgRbc@fedora
Fixes: 35b471467f88 ("filemap: add filemap_get_folios_contig()")
Signed-off-by: Vishal Moola (Oracle) <vishal.moola(a)gmail.com>
Reported-by: Qu Wenruo <quwenruo.btrfs(a)gmx.com>
Closes: https://lkml.kernel.org/r/b714e4de-2583-4035-b829-72cfb5eb6fc6@gmx.com
Tested-by: Qu Wenruo <quwenruo.btrfs(a)gmx.com>
Cc: Matthew Wilcow (Oracle) <willy(a)infradead.org>
Cc: Vivek Kasireddy <vivek.kasireddy(a)intel.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/filemap.c | 1 +
1 file changed, 1 insertion(+)
--- a/mm/filemap.c~mm-fix-filemap_get_folios_contig-returning-batches-of-identical-folios
+++ a/mm/filemap.c
@@ -2244,6 +2244,7 @@ unsigned filemap_get_folios_contig(struc
*start = folio->index + nr;
goto out;
}
+ xas_advance(&xas, folio_next_index(folio) - 1);
continue;
put_folio:
folio_put(folio);
_
Patches currently in -mm which might be from vishal.moola(a)gmail.com are
mm-compaction-fix-bug-in-hugetlb-handling-pathway.patch
mm-fix-filemap_get_folios_contig-returning-batches-of-identical-folios.patch
mm-compaction-use-folio-in-hugetlb-pathway.patch
From: Miguel Ojeda <ojeda(a)kernel.org>
Similar to what was done for `Zeroable<NonNull<T>>` in commit
df27cef15360 ("rust: init: fix `Zeroable` implementation for
`Option<NonNull<T>>` and `Option<KBox<T>>`"), the latest Rust
documentation [1] says it guarantees that `transmute::<_,
Option<T>>([0u8; size_of::<T>()])` is sound and produces
`Option::<T>::None` only in some cases. In particular, it says:
`Box<U>` (specifically, only `Box<U, Global>`) when `U: Sized`
Thus restrict the `impl` to `Sized`, and use similar wording as in that
commit too.
Link: https://doc.rust-lang.org/stable/std/option/index.html#representation [1]
Signed-off-by: Miguel Ojeda <ojeda(a)kernel.org>
Link: https://github.com/Rust-for-Linux/pin-init/pull/32/commits/a6007cf555e5946b…
Fixes: 9b2299af3b92 ("rust: pin-init: add `std` and `alloc` support from the user-space version")
Cc: stable(a)vger.kernel.org
[ Adjust mentioned commit to the one from the kernel. - Benno ]
Signed-off-by: Benno Lossin <benno.lossin(a)proton.me>
---
rust/pin-init/src/alloc.rs | 8 +++-----
1 file changed, 3 insertions(+), 5 deletions(-)
diff --git a/rust/pin-init/src/alloc.rs b/rust/pin-init/src/alloc.rs
index e16baa3b434e..5017f57442d8 100644
--- a/rust/pin-init/src/alloc.rs
+++ b/rust/pin-init/src/alloc.rs
@@ -17,11 +17,9 @@
pub extern crate alloc;
-// SAFETY: All zeros is equivalent to `None` (option layout optimization guarantee).
-//
-// In this case we are allowed to use `T: ?Sized`, since all zeros is the `None` variant and there
-// is no problem with a VTABLE pointer being null.
-unsafe impl<T: ?Sized> ZeroableOption for Box<T> {}
+// SAFETY: All zeros is equivalent to `None` (option layout optimization guarantee:
+// <https://doc.rust-lang.org/stable/std/option/index.html#representation>).
+unsafe impl<T> ZeroableOption for Box<T> {}
/// Smart pointer that can initialize memory in-place.
pub trait InPlaceInit<T>: Sized {
--
2.48.1
From: Al Viro <viro(a)zeniv.linux.org.uk>
[ Upstream commit bdb43af4fdb39f844ede401bdb1258f67a580a27 ]
failure to allocate inode => leaked dentry...
this one had been there since the initial merge; to be fair,
if we are that far OOM, the odds of failing at that particular
allocation are low...
Signed-off-by: Al Viro <viro(a)zeniv.linux.org.uk>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/infiniband/hw/qib/qib_fs.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/infiniband/hw/qib/qib_fs.c b/drivers/infiniband/hw/qib/qib_fs.c
index e336d778e076e..5ec67e3c2d03c 100644
--- a/drivers/infiniband/hw/qib/qib_fs.c
+++ b/drivers/infiniband/hw/qib/qib_fs.c
@@ -56,6 +56,7 @@ static int qibfs_mknod(struct inode *dir, struct dentry *dentry,
struct inode *inode = new_inode(dir->i_sb);
if (!inode) {
+ dput(dentry);
error = -EPERM;
goto bail;
}
--
2.39.5
From: Al Viro <viro(a)zeniv.linux.org.uk>
[ Upstream commit bdb43af4fdb39f844ede401bdb1258f67a580a27 ]
failure to allocate inode => leaked dentry...
this one had been there since the initial merge; to be fair,
if we are that far OOM, the odds of failing at that particular
allocation are low...
Signed-off-by: Al Viro <viro(a)zeniv.linux.org.uk>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/infiniband/hw/qib/qib_fs.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/infiniband/hw/qib/qib_fs.c b/drivers/infiniband/hw/qib/qib_fs.c
index e336d778e076e..5ec67e3c2d03c 100644
--- a/drivers/infiniband/hw/qib/qib_fs.c
+++ b/drivers/infiniband/hw/qib/qib_fs.c
@@ -56,6 +56,7 @@ static int qibfs_mknod(struct inode *dir, struct dentry *dentry,
struct inode *inode = new_inode(dir->i_sb);
if (!inode) {
+ dput(dentry);
error = -EPERM;
goto bail;
}
--
2.39.5
From: Al Viro <viro(a)zeniv.linux.org.uk>
[ Upstream commit bdb43af4fdb39f844ede401bdb1258f67a580a27 ]
failure to allocate inode => leaked dentry...
this one had been there since the initial merge; to be fair,
if we are that far OOM, the odds of failing at that particular
allocation are low...
Signed-off-by: Al Viro <viro(a)zeniv.linux.org.uk>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/infiniband/hw/qib/qib_fs.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/infiniband/hw/qib/qib_fs.c b/drivers/infiniband/hw/qib/qib_fs.c
index 8665e506404f9..774037ea44a92 100644
--- a/drivers/infiniband/hw/qib/qib_fs.c
+++ b/drivers/infiniband/hw/qib/qib_fs.c
@@ -56,6 +56,7 @@ static int qibfs_mknod(struct inode *dir, struct dentry *dentry,
struct inode *inode = new_inode(dir->i_sb);
if (!inode) {
+ dput(dentry);
error = -EPERM;
goto bail;
}
--
2.39.5
From: Al Viro <viro(a)zeniv.linux.org.uk>
[ Upstream commit bdb43af4fdb39f844ede401bdb1258f67a580a27 ]
failure to allocate inode => leaked dentry...
this one had been there since the initial merge; to be fair,
if we are that far OOM, the odds of failing at that particular
allocation are low...
Signed-off-by: Al Viro <viro(a)zeniv.linux.org.uk>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/infiniband/hw/qib/qib_fs.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/infiniband/hw/qib/qib_fs.c b/drivers/infiniband/hw/qib/qib_fs.c
index 182a89bb24ef4..caade796bc3cb 100644
--- a/drivers/infiniband/hw/qib/qib_fs.c
+++ b/drivers/infiniband/hw/qib/qib_fs.c
@@ -55,6 +55,7 @@ static int qibfs_mknod(struct inode *dir, struct dentry *dentry,
struct inode *inode = new_inode(dir->i_sb);
if (!inode) {
+ dput(dentry);
error = -EPERM;
goto bail;
}
--
2.39.5
From: Al Viro <viro(a)zeniv.linux.org.uk>
[ Upstream commit bdb43af4fdb39f844ede401bdb1258f67a580a27 ]
failure to allocate inode => leaked dentry...
this one had been there since the initial merge; to be fair,
if we are that far OOM, the odds of failing at that particular
allocation are low...
Signed-off-by: Al Viro <viro(a)zeniv.linux.org.uk>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/infiniband/hw/qib/qib_fs.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/infiniband/hw/qib/qib_fs.c b/drivers/infiniband/hw/qib/qib_fs.c
index 11155e0fb8395..35d777976c295 100644
--- a/drivers/infiniband/hw/qib/qib_fs.c
+++ b/drivers/infiniband/hw/qib/qib_fs.c
@@ -55,6 +55,7 @@ static int qibfs_mknod(struct inode *dir, struct dentry *dentry,
struct inode *inode = new_inode(dir->i_sb);
if (!inode) {
+ dput(dentry);
error = -EPERM;
goto bail;
}
--
2.39.5
From: Al Viro <viro(a)zeniv.linux.org.uk>
[ Upstream commit bdb43af4fdb39f844ede401bdb1258f67a580a27 ]
failure to allocate inode => leaked dentry...
this one had been there since the initial merge; to be fair,
if we are that far OOM, the odds of failing at that particular
allocation are low...
Signed-off-by: Al Viro <viro(a)zeniv.linux.org.uk>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/infiniband/hw/qib/qib_fs.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/infiniband/hw/qib/qib_fs.c b/drivers/infiniband/hw/qib/qib_fs.c
index b27791029fa93..b9f4a2937c3ac 100644
--- a/drivers/infiniband/hw/qib/qib_fs.c
+++ b/drivers/infiniband/hw/qib/qib_fs.c
@@ -55,6 +55,7 @@ static int qibfs_mknod(struct inode *dir, struct dentry *dentry,
struct inode *inode = new_inode(dir->i_sb);
if (!inode) {
+ dput(dentry);
error = -EPERM;
goto bail;
}
--
2.39.5
From: Al Viro <viro(a)zeniv.linux.org.uk>
[ Upstream commit bdb43af4fdb39f844ede401bdb1258f67a580a27 ]
failure to allocate inode => leaked dentry...
this one had been there since the initial merge; to be fair,
if we are that far OOM, the odds of failing at that particular
allocation are low...
Signed-off-by: Al Viro <viro(a)zeniv.linux.org.uk>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/infiniband/hw/qib/qib_fs.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/infiniband/hw/qib/qib_fs.c b/drivers/infiniband/hw/qib/qib_fs.c
index b27791029fa93..b9f4a2937c3ac 100644
--- a/drivers/infiniband/hw/qib/qib_fs.c
+++ b/drivers/infiniband/hw/qib/qib_fs.c
@@ -55,6 +55,7 @@ static int qibfs_mknod(struct inode *dir, struct dentry *dentry,
struct inode *inode = new_inode(dir->i_sb);
if (!inode) {
+ dput(dentry);
error = -EPERM;
goto bail;
}
--
2.39.5
From: Al Viro <viro(a)zeniv.linux.org.uk>
[ Upstream commit bdb43af4fdb39f844ede401bdb1258f67a580a27 ]
failure to allocate inode => leaked dentry...
this one had been there since the initial merge; to be fair,
if we are that far OOM, the odds of failing at that particular
allocation are low...
Signed-off-by: Al Viro <viro(a)zeniv.linux.org.uk>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/infiniband/hw/qib/qib_fs.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/infiniband/hw/qib/qib_fs.c b/drivers/infiniband/hw/qib/qib_fs.c
index b27791029fa93..b9f4a2937c3ac 100644
--- a/drivers/infiniband/hw/qib/qib_fs.c
+++ b/drivers/infiniband/hw/qib/qib_fs.c
@@ -55,6 +55,7 @@ static int qibfs_mknod(struct inode *dir, struct dentry *dentry,
struct inode *inode = new_inode(dir->i_sb);
if (!inode) {
+ dput(dentry);
error = -EPERM;
goto bail;
}
--
2.39.5
The patch titled
Subject: mm: page_alloc: speed up fallbacks in rmqueue_bulk()
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-page_alloc-speed-up-fallbacks-in-rmqueue_bulk.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Johannes Weiner <hannes(a)cmpxchg.org>
Subject: mm: page_alloc: speed up fallbacks in rmqueue_bulk()
Date: Mon, 7 Apr 2025 14:01:53 -0400
The test robot identified c2f6ea38fc1b ("mm: page_alloc: don't steal
single pages from biggest buddy") as the root cause of a 56.4% regression
in vm-scalability::lru-file-mmap-read.
Carlos reports an earlier patch, c0cd6f557b90 ("mm: page_alloc: fix
freelist movement during block conversion"), as the root cause for a
regression in worst-case zone->lock+irqoff hold times.
Both of these patches modify the page allocator's fallback path to be less
greedy in an effort to stave off fragmentation. The flip side of this is
that fallbacks are also less productive each time around, which means the
fallback search can run much more frequently.
Carlos' traces point to rmqueue_bulk() specifically, which tries to refill
the percpu cache by allocating a large batch of pages in a loop. It
highlights how once the native freelists are exhausted, the fallback code
first scans orders top-down for whole blocks to claim, then falls back to
a bottom-up search for the smallest buddy to steal. For the next batch
page, it goes through the same thing again.
This can be made more efficient. Since rmqueue_bulk() holds the
zone->lock over the entire batch, the freelists are not subject to outside
changes; when the search for a block to claim has already failed, there is
no point in trying again for the next page.
Modify __rmqueue() to remember the last successful fallback mode, and
restart directly from there on the next rmqueue_bulk() iteration.
Oliver confirms that this improves beyond the regression that the test
robot reported against c2f6ea38fc1b:
commit:
f3b92176f4 ("tools/selftests: add guard region test for /proc/$pid/pagemap")
c2f6ea38fc ("mm: page_alloc: don't steal single pages from biggest buddy")
acc4d5ff0b ("Merge tag 'net-6.15-rc0' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net")
2c847f27c3 ("mm: page_alloc: speed up fallbacks in rmqueue_bulk()") <--- your patch
f3b92176f4f7100f c2f6ea38fc1b640aa7a2e155cc1 acc4d5ff0b61eb1715c498b6536 2c847f27c37da65a93d23c237c5
---------------- --------------------------- --------------------------- ---------------------------
%stddev %change %stddev %change %stddev %change %stddev
\ | \ | \ | \
25525364 �� 3% -56.4% 11135467 -57.8% 10779336 +31.6% 33581409 vm-scalability.throughput
Carlos confirms that worst-case times are almost fully recovered
compared to before the earlier culprit patch:
2dd482ba627d (before freelist hygiene): 1ms
c0cd6f557b90 (after freelist hygiene): 90ms
next-20250319 (steal smallest buddy): 280ms
this patch : 8ms
Link: https://lkml.kernel.org/r/20250407180154.63348-1-hannes@cmpxchg.org
Fixes: c0cd6f557b90 ("mm: page_alloc: fix freelist movement during block conversion")
Fixes: c2f6ea38fc1b ("mm: page_alloc: don't steal single pages from biggest buddy")
Signed-off-by: Johannes Weiner <hannes(a)cmpxchg.org>
Reported-by: kernel test robot <oliver.sang(a)intel.com>
Reported-by: Carlos Song <carlos.song(a)nxp.com>
Tested-by: kernel test robot <oliver.sang(a)intel.com>
Closes: https://lore.kernel.org/oe-lkp/202503271547.fc08b188-lkp@intel.com
Cc: <stable(a)vger.kernel.org> [6.10+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/page_alloc.c | 100 ++++++++++++++++++++++++++++++++++------------
1 file changed, 74 insertions(+), 26 deletions(-)
--- a/mm/page_alloc.c~mm-page_alloc-speed-up-fallbacks-in-rmqueue_bulk
+++ a/mm/page_alloc.c
@@ -2189,11 +2189,11 @@ try_to_claim_block(struct zone *zone, st
* The use of signed ints for order and current_order is a deliberate
* deviation from the rest of this file, to make the for loop
* condition simpler.
- *
- * Return the stolen page, or NULL if none can be found.
*/
+
+/* Try to claim a whole foreign block, take a page, expand the remainder */
static __always_inline struct page *
-__rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
+__rmqueue_claim(struct zone *zone, int order, int start_migratetype,
unsigned int alloc_flags)
{
struct free_area *area;
@@ -2231,14 +2231,26 @@ __rmqueue_fallback(struct zone *zone, in
page = try_to_claim_block(zone, page, current_order, order,
start_migratetype, fallback_mt,
alloc_flags);
- if (page)
- goto got_one;
+ if (page) {
+ trace_mm_page_alloc_extfrag(page, order, current_order,
+ start_migratetype, fallback_mt);
+ return page;
+ }
}
- if (alloc_flags & ALLOC_NOFRAGMENT)
- return NULL;
+ return NULL;
+}
+
+/* Try to steal a single page from a foreign block */
+static __always_inline struct page *
+__rmqueue_steal(struct zone *zone, int order, int start_migratetype)
+{
+ struct free_area *area;
+ int current_order;
+ struct page *page;
+ int fallback_mt;
+ bool claim_block;
- /* No luck claiming pageblock. Find the smallest fallback page */
for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
@@ -2248,25 +2260,28 @@ __rmqueue_fallback(struct zone *zone, in
page = get_page_from_free_area(area, fallback_mt);
page_del_and_expand(zone, page, order, current_order, fallback_mt);
- goto got_one;
+ trace_mm_page_alloc_extfrag(page, order, current_order,
+ start_migratetype, fallback_mt);
+ return page;
}
return NULL;
-
-got_one:
- trace_mm_page_alloc_extfrag(page, order, current_order,
- start_migratetype, fallback_mt);
-
- return page;
}
+enum rmqueue_mode {
+ RMQUEUE_NORMAL,
+ RMQUEUE_CMA,
+ RMQUEUE_CLAIM,
+ RMQUEUE_STEAL,
+};
+
/*
* Do the hard work of removing an element from the buddy allocator.
* Call me with the zone->lock already held.
*/
static __always_inline struct page *
__rmqueue(struct zone *zone, unsigned int order, int migratetype,
- unsigned int alloc_flags)
+ unsigned int alloc_flags, enum rmqueue_mode *mode)
{
struct page *page;
@@ -2285,16 +2300,47 @@ __rmqueue(struct zone *zone, unsigned in
}
}
- page = __rmqueue_smallest(zone, order, migratetype);
- if (unlikely(!page)) {
- if (alloc_flags & ALLOC_CMA)
+ /*
+ * Try the different freelists, native then foreign.
+ *
+ * The fallback logic is expensive and rmqueue_bulk() calls in
+ * a loop with the zone->lock held, meaning the freelists are
+ * not subject to any outside changes. Remember in *mode where
+ * we found pay dirt, to save us the search on the next call.
+ */
+ switch (*mode) {
+ case RMQUEUE_NORMAL:
+ page = __rmqueue_smallest(zone, order, migratetype);
+ if (page)
+ return page;
+ fallthrough;
+ case RMQUEUE_CMA:
+ if (alloc_flags & ALLOC_CMA) {
page = __rmqueue_cma_fallback(zone, order);
-
- if (!page)
- page = __rmqueue_fallback(zone, order, migratetype,
- alloc_flags);
+ if (page) {
+ *mode = RMQUEUE_CMA;
+ return page;
+ }
+ }
+ fallthrough;
+ case RMQUEUE_CLAIM:
+ page = __rmqueue_claim(zone, order, migratetype, alloc_flags);
+ if (page) {
+ /* Replenished native freelist, back to normal mode */
+ *mode = RMQUEUE_NORMAL;
+ return page;
+ }
+ fallthrough;
+ case RMQUEUE_STEAL:
+ if (!(alloc_flags & ALLOC_NOFRAGMENT)) {
+ page = __rmqueue_steal(zone, order, migratetype);
+ if (page) {
+ *mode = RMQUEUE_STEAL;
+ return page;
+ }
+ }
}
- return page;
+ return NULL;
}
/*
@@ -2306,6 +2352,7 @@ static int rmqueue_bulk(struct zone *zon
unsigned long count, struct list_head *list,
int migratetype, unsigned int alloc_flags)
{
+ enum rmqueue_mode rmqm = RMQUEUE_NORMAL;
unsigned long flags;
int i;
@@ -2317,7 +2364,7 @@ static int rmqueue_bulk(struct zone *zon
}
for (i = 0; i < count; ++i) {
struct page *page = __rmqueue(zone, order, migratetype,
- alloc_flags);
+ alloc_flags, &rmqm);
if (unlikely(page == NULL))
break;
@@ -2930,6 +2977,7 @@ struct page *rmqueue_buddy(struct zone *
{
struct page *page;
unsigned long flags;
+ enum rmqueue_mode rmqm = RMQUEUE_NORMAL;
do {
page = NULL;
@@ -2942,7 +2990,7 @@ struct page *rmqueue_buddy(struct zone *
if (alloc_flags & ALLOC_HIGHATOMIC)
page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
if (!page) {
- page = __rmqueue(zone, order, migratetype, alloc_flags);
+ page = __rmqueue(zone, order, migratetype, alloc_flags, &rmqm);
/*
* If the allocation fails, allow OOM handling and
_
Patches currently in -mm which might be from hannes(a)cmpxchg.org are
mm-page_alloc-speed-up-fallbacks-in-rmqueue_bulk.patch