From: Marc Zyngier <maz(a)kernel.org>
[ Upstream commit ca88ecdce5f51874a7c151809bd2c936ee0d3805 ]
We currently have two ways to identify CPUs that only implement FEAT_VHE
and not FEAT_E2H0:
- either they advertise it via ID_AA64MMFR4_EL1.E2H0,
- or the HCR_EL2.E2H bit is RAO/WI
However, there is a third category of "cpus" that fall between these
two cases: on CPUs that do not implement FEAT_FGT, it is IMPDEF whether
an access to ID_AA64MMFR4_EL1 can trap to EL2 when the register value
is zero.
A consequence of this is that on systems such as Neoverse V2, a NV
guest cannot reliably detect that it is in a VHE-only configuration
(E2H is writable, and ID_AA64MMFR0_EL1 is 0), despite the hypervisor's
best effort to repaint the id register.
Replace the RAO/WI test by a sequence that makes use of the VHE
register remnapping between EL1 and EL2 to detect this situation,
and work out whether we get the VHE behaviour even after having
set HCR_EL2.E2H to 0.
This solves the NV problem, and provides a more reliable acid test
for CPUs that do not completely follow the letter of the architecture
while providing a RES1 behaviour for HCR_EL2.E2H.
Suggested-by: Mark Rutland <mark.rutland(a)arm.com>
Acked-by: Mark Rutland <mark.rutland(a)arm.com>
Acked-by: Catalin Marinas <catalin.marinas(a)arm.com>
Reviewed-by: Oliver Upton <oliver.upton(a)linux.dev>
Tested-by: Jan Kotas <jank(a)cadence.com>
Signed-off-by: Marc Zyngier <maz(a)kernel.org>
Link: https://lore.kernel.org/r/15A85F2B-1A0C-4FA7-9FE4-EEC2203CC09E@global.caden…
Signed-off-by: Marc Zyngier <maz(a)kernel.org>
Signed-off-by: Wei-Lin Chang <weilin.chang(a)arm.com>
---
arch/arm64/include/asm/el2_setup.h | 38 +++++++++++++++++++++++++-----
1 file changed, 32 insertions(+), 6 deletions(-)
diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h
index 859aa1a3b996..9d1c88536ee4 100644
--- a/arch/arm64/include/asm/el2_setup.h
+++ b/arch/arm64/include/asm/el2_setup.h
@@ -24,22 +24,48 @@
* ID_AA64MMFR4_EL1.E2H0 < 0. On such CPUs HCR_EL2.E2H is RES1, but it
* can reset into an UNKNOWN state and might not read as 1 until it has
* been initialized explicitly.
- *
- * Fruity CPUs seem to have HCR_EL2.E2H set to RAO/WI, but
- * don't advertise it (they predate this relaxation).
- *
* Initalize HCR_EL2.E2H so that later code can rely upon HCR_EL2.E2H
* indicating whether the CPU is running in E2H mode.
*/
mrs_s x1, SYS_ID_AA64MMFR4_EL1
sbfx x1, x1, #ID_AA64MMFR4_EL1_E2H0_SHIFT, #ID_AA64MMFR4_EL1_E2H0_WIDTH
cmp x1, #0
- b.ge .LnVHE_\@
+ b.lt .LnE2H0_\@
+ /*
+ * Unfortunately, HCR_EL2.E2H can be RES1 even if not advertised
+ * as such via ID_AA64MMFR4_EL1.E2H0:
+ *
+ * - Fruity CPUs predate the !FEAT_E2H0 relaxation, and seem to
+ * have HCR_EL2.E2H implemented as RAO/WI.
+ *
+ * - On CPUs that lack FEAT_FGT, a hypervisor can't trap guest
+ * reads of ID_AA64MMFR4_EL1 to advertise !FEAT_E2H0. NV
+ * guests on these hosts can write to HCR_EL2.E2H without
+ * trapping to the hypervisor, but these writes have no
+ * functional effect.
+ *
+ * Handle both cases by checking for an essential VHE property
+ * (system register remapping) to decide whether we're
+ * effectively VHE-only or not.
+ */
+ msr hcr_el2, x0 // Setup HCR_EL2 as nVHE
+ isb
+ mov x1, #1 // Write something to FAR_EL1
+ msr far_el1, x1
+ isb
+ mov x1, #2 // Try to overwrite it via FAR_EL2
+ msr far_el2, x1
+ isb
+ mrs x1, far_el1 // If we see the latest write in FAR_EL1,
+ cmp x1, #2 // we can safely assume we are VHE only.
+ b.ne .LnVHE_\@ // Otherwise, we know that nVHE works.
+
+.LnE2H0_\@:
orr x0, x0, #HCR_E2H
-.LnVHE_\@:
msr hcr_el2, x0
isb
+.LnVHE_\@:
.endm
.macro __init_el2_sctlr
--
2.43.0
From: Ahmed Genidi <ahmed.genidi(a)arm.com>
[ Upstream commit 3855a7b91d42ebf3513b7ccffc44807274978b3d ]
When KVM is in protected mode, host calls to PSCI are proxied via EL2,
and cold entries from CPU_ON, CPU_SUSPEND, and SYSTEM_SUSPEND bounce
through __kvm_hyp_init_cpu() at EL2 before entering the host kernel's
entry point at EL1. While __kvm_hyp_init_cpu() initializes SPSR_EL2 for
the exception return to EL1, it does not initialize SCTLR_EL1.
Due to this, it's possible to enter EL1 with SCTLR_EL1 in an UNKNOWN
state. In practice this has been seen to result in kernel crashes after
CPU_ON as a result of SCTLR_EL1.M being 1 in violation of the initial
core configuration specified by PSCI.
Fix this by initializing SCTLR_EL1 for cold entry to the host kernel.
As it's necessary to write to SCTLR_EL12 in VHE mode, this
initialization is moved into __kvm_host_psci_cpu_entry() where we can
use write_sysreg_el1().
The remnants of the '__init_el2_nvhe_prepare_eret' macro are folded into
its only caller, as this is clearer than having the macro.
Fixes: cdf367192766ad11 ("KVM: arm64: Intercept host's CPU_ON SMCs")
Reported-by: Leo Yan <leo.yan(a)arm.com>
Signed-off-by: Ahmed Genidi <ahmed.genidi(a)arm.com>
[ Mark: clarify commit message, handle E2H, move to C, remove macro ]
Signed-off-by: Mark Rutland <mark.rutland(a)arm.com>
Cc: Ahmed Genidi <ahmed.genidi(a)arm.com>
Cc: Ben Horgan <ben.horgan(a)arm.com>
Cc: Catalin Marinas <catalin.marinas(a)arm.com>
Cc: Leo Yan <leo.yan(a)arm.com>
Cc: Marc Zyngier <maz(a)kernel.org>
Cc: Oliver Upton <oliver.upton(a)linux.dev>
Cc: Will Deacon <will(a)kernel.org>
Reviewed-by: Leo Yan <leo.yan(a)arm.com>
Link: https://lore.kernel.org/r/20250227180526.1204723-3-mark.rutland@arm.com
Signed-off-by: Marc Zyngier <maz(a)kernel.org>
Signed-off-by: Wei-Lin Chang <weilin.chang(a)arm.com>
---
arch/arm64/include/asm/el2_setup.h | 5 -----
arch/arm64/kernel/head.S | 3 ++-
arch/arm64/kvm/hyp/nvhe/hyp-init.S | 2 --
arch/arm64/kvm/hyp/nvhe/psci-relay.c | 3 +++
4 files changed, 5 insertions(+), 8 deletions(-)
diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h
index 00b27c8ed9a2..859aa1a3b996 100644
--- a/arch/arm64/include/asm/el2_setup.h
+++ b/arch/arm64/include/asm/el2_setup.h
@@ -265,11 +265,6 @@
.Lskip_fgt2_\@:
.endm
-.macro __init_el2_nvhe_prepare_eret
- mov x0, #INIT_PSTATE_EL1
- msr spsr_el2, x0
-.endm
-
/**
* Initialize EL2 registers to sane values. This should be called early on all
* cores that were booted in EL2. Note that everything gets initialised as
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index 4d28c1e56cb5..25c08a0d228a 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -319,7 +319,8 @@ SYM_INNER_LABEL(init_el2, SYM_L_LOCAL)
msr sctlr_el1, x1
mov x2, xzr
3:
- __init_el2_nvhe_prepare_eret
+ mov x0, #INIT_PSTATE_EL1
+ msr spsr_el2, x0
mov w0, #BOOT_CPU_MODE_EL2
orr x0, x0, x2
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
index 3fb5504a7d7f..f8af11189572 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
@@ -214,8 +214,6 @@ SYM_CODE_START_LOCAL(__kvm_hyp_init_cpu)
bl __kvm_init_el2_state
- __init_el2_nvhe_prepare_eret
-
/* Enable MMU, set vectors and stack. */
mov x0, x28
bl ___kvm_hyp_init // Clobbers x0..x2
diff --git a/arch/arm64/kvm/hyp/nvhe/psci-relay.c b/arch/arm64/kvm/hyp/nvhe/psci-relay.c
index dfe8fe0f7eaf..bd0f308b694c 100644
--- a/arch/arm64/kvm/hyp/nvhe/psci-relay.c
+++ b/arch/arm64/kvm/hyp/nvhe/psci-relay.c
@@ -218,6 +218,9 @@ asmlinkage void __noreturn __kvm_host_psci_cpu_entry(bool is_cpu_on)
if (is_cpu_on)
release_boot_args(boot_args);
+ write_sysreg_el1(INIT_SCTLR_EL1_MMU_OFF, SYS_SCTLR);
+ write_sysreg(INIT_PSTATE_EL1, SPSR_EL2);
+
__host_enter(host_ctxt);
}
--
2.43.0
From: Mark Rutland <mark.rutland(a)arm.com>
[ Upstream commit 7a68b55ff39b0a1638acb1694c185d49f6077a0d ]
On CPUs without FEAT_E2H0, HCR_EL2.E2H is RES1, but may reset to an
UNKNOWN value out of reset and consequently may not read as 1 unless it
has been explicitly initialized.
We handled this for the head.S boot code in commits:
3944382fa6f22b54 ("arm64: Treat HCR_EL2.E2H as RES1 when ID_AA64MMFR4_EL1.E2H0 is negative")
b3320142f3db9b3f ("arm64: Fix early handling of FEAT_E2H0 not being implemented")
Unfortunately, we forgot to apply a similar fix to the KVM PSCI entry
points used when relaying CPU_ON, CPU_SUSPEND, and SYSTEM SUSPEND. When
KVM is entered via these entry points, the value of HCR_EL2.E2H may be
consumed before it has been initialized (e.g. by the 'init_el2_state'
macro).
Initialize HCR_EL2.E2H early in these paths such that it can be consumed
reliably. The existing code in head.S is factored out into a new
'init_el2_hcr' macro, and this is used in the __kvm_hyp_init_cpu()
function common to all the relevant PSCI entry points.
For clarity, I've tweaked the assembly used to check whether
ID_AA64MMFR4_EL1.E2H0 is negative. The bitfield is extracted as a signed
value, and this is checked with a signed-greater-or-equal (GE) comparison.
As the hyp code will reconfigure HCR_EL2 later in ___kvm_hyp_init(), all
bits other than E2H are initialized to zero in __kvm_hyp_init_cpu().
Fixes: 3944382fa6f22b54 ("arm64: Treat HCR_EL2.E2H as RES1 when ID_AA64MMFR4_EL1.E2H0 is negative")
Fixes: b3320142f3db9b3f ("arm64: Fix early handling of FEAT_E2H0 not being implemented")
Signed-off-by: Mark Rutland <mark.rutland(a)arm.com>
Cc: Ahmed Genidi <ahmed.genidi(a)arm.com>
Cc: Ben Horgan <ben.horgan(a)arm.com>
Cc: Catalin Marinas <catalin.marinas(a)arm.com>
Cc: Leo Yan <leo.yan(a)arm.com>
Cc: Marc Zyngier <maz(a)kernel.org>
Cc: Oliver Upton <oliver.upton(a)linux.dev>
Cc: Will Deacon <will(a)kernel.org>
Link: https://lore.kernel.org/r/20250227180526.1204723-2-mark.rutland@arm.com
[maz: fixed LT->GE thinko]
Signed-off-by: Marc Zyngier <maz(a)kernel.org>
Signed-off-by: Wei-Lin Chang <weilin.chang(a)arm.com>
---
arch/arm64/include/asm/el2_setup.h | 26 ++++++++++++++++++++++++++
arch/arm64/kernel/head.S | 19 +------------------
arch/arm64/kvm/hyp/nvhe/hyp-init.S | 8 +++++++-
3 files changed, 34 insertions(+), 19 deletions(-)
diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h
index bdbe9e08664a..00b27c8ed9a2 100644
--- a/arch/arm64/include/asm/el2_setup.h
+++ b/arch/arm64/include/asm/el2_setup.h
@@ -16,6 +16,32 @@
#include <asm/sysreg.h>
#include <linux/irqchip/arm-gic-v3.h>
+.macro init_el2_hcr val
+ mov_q x0, \val
+
+ /*
+ * Compliant CPUs advertise their VHE-onlyness with
+ * ID_AA64MMFR4_EL1.E2H0 < 0. On such CPUs HCR_EL2.E2H is RES1, but it
+ * can reset into an UNKNOWN state and might not read as 1 until it has
+ * been initialized explicitly.
+ *
+ * Fruity CPUs seem to have HCR_EL2.E2H set to RAO/WI, but
+ * don't advertise it (they predate this relaxation).
+ *
+ * Initalize HCR_EL2.E2H so that later code can rely upon HCR_EL2.E2H
+ * indicating whether the CPU is running in E2H mode.
+ */
+ mrs_s x1, SYS_ID_AA64MMFR4_EL1
+ sbfx x1, x1, #ID_AA64MMFR4_EL1_E2H0_SHIFT, #ID_AA64MMFR4_EL1_E2H0_WIDTH
+ cmp x1, #0
+ b.ge .LnVHE_\@
+
+ orr x0, x0, #HCR_E2H
+.LnVHE_\@:
+ msr hcr_el2, x0
+ isb
+.endm
+
.macro __init_el2_sctlr
mov_q x0, INIT_SCTLR_EL2_MMU_OFF
msr sctlr_el2, x0
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index cb68adcabe07..4d28c1e56cb5 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -295,25 +295,8 @@ SYM_INNER_LABEL(init_el2, SYM_L_LOCAL)
msr sctlr_el2, x0
isb
0:
- mov_q x0, HCR_HOST_NVHE_FLAGS
-
- /*
- * Compliant CPUs advertise their VHE-onlyness with
- * ID_AA64MMFR4_EL1.E2H0 < 0. HCR_EL2.E2H can be
- * RES1 in that case. Publish the E2H bit early so that
- * it can be picked up by the init_el2_state macro.
- *
- * Fruity CPUs seem to have HCR_EL2.E2H set to RAO/WI, but
- * don't advertise it (they predate this relaxation).
- */
- mrs_s x1, SYS_ID_AA64MMFR4_EL1
- tbz x1, #(ID_AA64MMFR4_EL1_E2H0_SHIFT + ID_AA64MMFR4_EL1_E2H0_WIDTH - 1), 1f
-
- orr x0, x0, #HCR_E2H
-1:
- msr hcr_el2, x0
- isb
+ init_el2_hcr HCR_HOST_NVHE_FLAGS
init_el2_state
/* Hypervisor stub */
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
index fc1866226067..3fb5504a7d7f 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
@@ -73,8 +73,12 @@ __do_hyp_init:
eret
SYM_CODE_END(__kvm_hyp_init)
+/*
+ * Initialize EL2 CPU state to sane values.
+ *
+ * HCR_EL2.E2H must have been initialized already.
+ */
SYM_CODE_START_LOCAL(__kvm_init_el2_state)
- /* Initialize EL2 CPU state to sane values. */
init_el2_state // Clobbers x0..x2
finalise_el2_state
ret
@@ -206,6 +210,8 @@ SYM_CODE_START_LOCAL(__kvm_hyp_init_cpu)
2: msr SPsel, #1 // We want to use SP_EL{1,2}
+ init_el2_hcr 0
+
bl __kvm_init_el2_state
__init_el2_nvhe_prepare_eret
--
2.43.0
From: Ilya Maximets <i.maximets(a)ovn.org>
[ Upstream commit 5ace7ef87f059d68b5f50837ef3e8a1a4870c36e ]
The push_nsh() action structure looks like this:
OVS_ACTION_ATTR_PUSH_NSH(OVS_KEY_ATTR_NSH(OVS_NSH_KEY_ATTR_BASE,...))
The outermost OVS_ACTION_ATTR_PUSH_NSH attribute is OK'ed by the
nla_for_each_nested() inside __ovs_nla_copy_actions(). The innermost
OVS_NSH_KEY_ATTR_BASE/MD1/MD2 are OK'ed by the nla_for_each_nested()
inside nsh_key_put_from_nlattr(). But nothing checks if the attribute
in the middle is OK. We don't even check that this attribute is the
OVS_KEY_ATTR_NSH. We just do a double unwrap with a pair of nla_data()
calls - first time directly while calling validate_push_nsh() and the
second time as part of the nla_for_each_nested() macro, which isn't
safe, potentially causing invalid memory access if the size of this
attribute is incorrect. The failure may not be noticed during
validation due to larger netlink buffer, but cause trouble later during
action execution where the buffer is allocated exactly to the size:
BUG: KASAN: slab-out-of-bounds in nsh_hdr_from_nlattr+0x1dd/0x6a0 [openvswitch]
Read of size 184 at addr ffff88816459a634 by task a.out/22624
CPU: 8 UID: 0 PID: 22624 6.18.0-rc7+ #115 PREEMPT(voluntary)
Call Trace:
<TASK>
dump_stack_lvl+0x51/0x70
print_address_description.constprop.0+0x2c/0x390
kasan_report+0xdd/0x110
kasan_check_range+0x35/0x1b0
__asan_memcpy+0x20/0x60
nsh_hdr_from_nlattr+0x1dd/0x6a0 [openvswitch]
push_nsh+0x82/0x120 [openvswitch]
do_execute_actions+0x1405/0x2840 [openvswitch]
ovs_execute_actions+0xd5/0x3b0 [openvswitch]
ovs_packet_cmd_execute+0x949/0xdb0 [openvswitch]
genl_family_rcv_msg_doit+0x1d6/0x2b0
genl_family_rcv_msg+0x336/0x580
genl_rcv_msg+0x9f/0x130
netlink_rcv_skb+0x11f/0x370
genl_rcv+0x24/0x40
netlink_unicast+0x73e/0xaa0
netlink_sendmsg+0x744/0xbf0
__sys_sendto+0x3d6/0x450
do_syscall_64+0x79/0x2c0
entry_SYSCALL_64_after_hwframe+0x76/0x7e
</TASK>
Let's add some checks that the attribute is properly sized and it's
the only one attribute inside the action. Technically, there is no
real reason for OVS_KEY_ATTR_NSH to be there, as we know that we're
pushing an NSH header already, it just creates extra nesting, but
that's how uAPI works today. So, keeping as it is.
Fixes: b2d0f5d5dc53 ("openvswitch: enable NSH support")
Reported-by: Junvy Yang <zhuque(a)tencent.com>
Signed-off-by: Ilya Maximets <i.maximets(a)ovn.org>
Acked-by: Eelco Chaudron echaudro(a)redhat.com
Reviewed-by: Aaron Conole <aconole(a)redhat.com>
Link: https://patch.msgid.link/20251204105334.900379-1-i.maximets@ovn.org
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
(cherry picked from commit 5ace7ef87f059d68b5f50837ef3e8a1a4870c36e)
Signed-off-by: Adrian Yip <adrian.ytw(a)gmail.com>
---
net/openvswitch/flow_netlink.c | 13 ++++++++++---
1 file changed, 10 insertions(+), 3 deletions(-)
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index d0b6e5872081..d4c8b4aa98b1 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -2786,13 +2786,20 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
return err;
}
-static bool validate_push_nsh(const struct nlattr *attr, bool log)
+static bool validate_push_nsh(const struct nlattr *a, bool log)
{
+ struct nlattr *nsh_key = nla_data(a);
struct sw_flow_match match;
struct sw_flow_key key;
+ /* There must be one and only one NSH header. */
+ if (!nla_ok(nsh_key, nla_len(a)) ||
+ nla_total_size(nla_len(nsh_key)) != nla_len(a) ||
+ nla_type(nsh_key) != OVS_KEY_ATTR_NSH)
+ return false;
+
ovs_match_init(&match, &key, true, NULL);
- return !nsh_key_put_from_nlattr(attr, &match, false, true, log);
+ return !nsh_key_put_from_nlattr(nsh_key, &match, false, true, log);
}
/* Return false if there are any non-masked bits set.
@@ -3346,7 +3353,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
return -EINVAL;
}
mac_proto = MAC_PROTO_NONE;
- if (!validate_push_nsh(nla_data(a), log))
+ if (!validate_push_nsh(a, log))
return -EINVAL;
break;
--
2.52.0
If SMT is disabled or a partial SMT state is enabled, when a new kernel
image is loaded for kexec, on reboot the following warning is observed:
kexec: Waking offline cpu 228.
WARNING: CPU: 0 PID: 9062 at arch/powerpc/kexec/core_64.c:223 kexec_prepare_cpus+0x1b0/0x1bc
[snip]
NIP kexec_prepare_cpus+0x1b0/0x1bc
LR kexec_prepare_cpus+0x1a0/0x1bc
Call Trace:
kexec_prepare_cpus+0x1a0/0x1bc (unreliable)
default_machine_kexec+0x160/0x19c
machine_kexec+0x80/0x88
kernel_kexec+0xd0/0x118
__do_sys_reboot+0x210/0x2c4
system_call_exception+0x124/0x320
system_call_vectored_common+0x15c/0x2ec
This occurs as add_cpu() fails due to cpu_bootable() returning false for
CPUs that fail the cpu_smt_thread_allowed() check or non primary
threads if SMT is disabled.
Fix the issue by enabling SMT and resetting the number of SMT threads to
the number of threads per core, before attempting to wake up all present
CPUs.
Fixes: 38253464bc82 ("cpu/SMT: Create topology_smt_thread_allowed()")
Reported-by: Sachin P Bappalige <sachinpb(a)linux.ibm.com>
Cc: stable(a)vger.kernel.org # v6.6+
Signed-off-by: Nysal Jan K.A. <nysal(a)linux.ibm.com>
---
arch/powerpc/kexec/core_64.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/arch/powerpc/kexec/core_64.c b/arch/powerpc/kexec/core_64.c
index 222aa326dace..ff6df43720c4 100644
--- a/arch/powerpc/kexec/core_64.c
+++ b/arch/powerpc/kexec/core_64.c
@@ -216,6 +216,11 @@ static void wake_offline_cpus(void)
{
int cpu = 0;
+ lock_device_hotplug();
+ cpu_smt_num_threads = threads_per_core;
+ cpu_smt_control = CPU_SMT_ENABLED;
+ unlock_device_hotplug();
+
for_each_present_cpu(cpu) {
if (!cpu_online(cpu)) {
printk(KERN_INFO "kexec: Waking offline cpu %d.\n",
--
2.51.0
erofs readahead could fail with ENOMEM under the memory pressure because
it tries to alloc_page with GFP_NOWAIT | GFP_NORETRY, while GFP_KERNEL
for a regular read. And if readahead fails (with non-uptodate folios),
the original request will then fall back to synchronous read, and
`.read_folio()` should return appropriate errnos.
However, in scenarios where readahead and read operations compete,
read operation could return an unintended EIO because of an incorrect
error propagation.
To resolve this, this patch modifies the behavior so that, when the
PCL is for read(which means pcl.besteffort is true), it attempts actual
decompression instead of propagating the privios error except initial EIO.
- Page size: 4K
- The original size of FileA: 16K
- Compress-ratio per PCL: 50% (Uncompressed 8K -> Compressed 4K)
[page0, page1] [page2, page3]
[PCL0]---------[PCL1]
- functions declaration:
. pread(fd, buf, count, offset)
. readahead(fd, offset, count)
- Thread A tries to read the last 4K
- Thread B tries to do readahead 8K from 4K
- RA, besteffort == false
- R, besteffort == true
<process A> <process B>
pread(FileA, buf, 4K, 12K)
do readahead(page3) // failed with ENOMEM
wait_lock(page3)
if (!uptodate(page3))
goto do_read
readahead(FileA, 4K, 8K)
// Here create PCL-chain like below:
// [null, page1] [page2, null]
// [PCL0:RA]-----[PCL1:RA]
...
do read(page3) // found [PCL1:RA] and add page3 into it,
// and then, change PCL1 from RA to R
...
// Now, PCL-chain is as below:
// [null, page1] [page2, page3]
// [PCL0:RA]-----[PCL1:R]
// try to decompress PCL-chain...
z_erofs_decompress_queue
err = 0;
// failed with ENOMEM, so page 1
// only for RA will not be uptodated.
// it's okay.
err = decompress([PCL0:RA], err)
// However, ENOMEM propagated to next
// PCL, even though PCL is not only
// for RA but also for R. As a result,
// it just failed with ENOMEM without
// trying any decompression, so page2
// and page3 will not be uptodated.
** BUG HERE ** --> err = decompress([PCL1:R], err)
return err as ENOMEM
...
wait_lock(page3)
if (!uptodate(page3))
return EIO <-- Return an unexpected EIO!
...
Fixes: 2349d2fa02db ("erofs: sunset unneeded NOFAILs")
Cc: stable(a)vger.kernel.org
Reviewed-by: Jaewook Kim <jw5454.kim(a)samsung.com>
Reviewed-by: Sungjong Seo <sj1557.seo(a)samsung.com>
Signed-off-by: Junbeom Yeom <junbeom.yeom(a)samsung.com>
---
fs/erofs/zdata.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 27b1f44d10ce..86bf6e087d34 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -1414,11 +1414,15 @@ static int z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
};
struct z_erofs_pcluster *next;
int err = io->eio ? -EIO : 0;
+ int io_err = err;
for (; be.pcl != Z_EROFS_PCLUSTER_TAIL; be.pcl = next) {
+ int propagate_err;
+
DBG_BUGON(!be.pcl);
next = READ_ONCE(be.pcl->next);
- err = z_erofs_decompress_pcluster(&be, err) ?: err;
+ propagate_err = READ_ONCE(be.pcl->besteffort) ? io_err : err;
+ err = z_erofs_decompress_pcluster(&be, propagate_err) ?: err;
}
return err;
}
--
2.34.1
Hi all,
Roland Schwarzkopf reported to the Debian mailing list a problem which
he encountered once updating in Debian from 5.10.244 to 5.10.247. The
report is quoted below and found in
https://lists.debian.org/debian-kernel/2025/12/msg00223.html
Roland did bisect the changes between 5.10.244 to 5.10.247 and found
that the issue is introduced with 1550f3673972 ("net: rtnetlink: add
bulk delete support flag") which is the backport to the 5.10.y series.
On Thu, Dec 18, 2025 at 02:59:55PM +0100, Roland Schwarzkopf wrote:
> Hi Salvatore,
>
> On 12/17/25 20:28, Salvatore Bonaccorso wrote:
> > Hi Roland,
> >
> > I'm CC'ing Ben Hutchings directly as well as he takes care of the
> > Debian LTS kernel updates. Idellly we make this as well a proper bug
> > for easier tracking.
> >
> > On Wed, Dec 17, 2025 at 01:35:54PM +0100, Roland Schwarzkopf wrote:
> > > Hi there,
> > >
> > > after upgrading to the latest kernel on Debian 11
> > > (linux-image-5.10.0-37-amd64) I have an issue using libvirt with qemu/kvm
> > > virtual machines and macvtap networking. When a machine is shut down,
> > > libvirt can not delete the corresponding macvtap device. Thus, starting the
> > > machine again is not possible. After manually removing the macvtap device
> > > using `ip link delete` the vm can be started again.
> > >
> > > In the journal the following message is shown:
> > >
> > > Dec 17 13:19:27 iblis libvirtd[535]: error destroying network device macvtap0: Operation not supported
> > >
> > > After downgrading the kernel to linux-image-5.10.0-36-amd64, the problem
> > > disappears. I tested this on a fresh minimal install of Debian 11 - to
> > > exclude that anything else on my production machines is causing this issue.
> > >
> > > Since the older kernel does not have this issue, I assume this is related to
> > > the kernel and not to libvirt?
> > >
> > > I tried to check for bug reports of the kernel package, but the bug tracker
> > > finds no reports and even states that the package does not exist (I used the
> > > "Bug reports" link on
> > > https://packages.debian.org/bullseye/linux-image-5.10.0-37-amd64). This left
> > > me a bit puzzled. Since I don't have experience with the debian bug
> > > reporting process, I had no other idea than writing to this list.
> > You would need to search for inhttps://bugs.debian.org/src:linux ,
> > but that said I'm not aware of any bug reports in that direction.
> >
> > Would you be in the position of bisecting the problem as you can say
> > that 5.10.244 is good and 5.10.247 is bad and regressed? If you can do
> > that that would involve compiling a couple of kernels to narrow down
> > where the problem is introduced:
> >
> > git clone --single-branch -b linux-5.10.yhttps://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-st…
> > cd linux-stable
> > git checkout v5.10.244
> > cp /boot/config-$(uname -r) .config
> > yes '' | make localmodconfig
> > make savedefconfig
> > mv defconfig arch/x86/configs/my_defconfig
> >
> > # test 5.10.244 to ensure this is "good"
> > make my_defconfig
> > make -j $(nproc) bindeb-pkg
> > ... install the resulting .deb package and confirm it successfully boots / problem does not exist
> >
> > # test 5.10.247 to ensure this is "bad"
> > git checkout v5.10.247
> > make my_defconfig
> > make -j $(nproc) bindeb-pkg
> > ... install the resulting .deb package and confirm it fails to boot / problem exists
> >
> > With that confirmed, the bisection can start:
> >
> > git bisect start
> > git bisect good v5.10.244
> > git bisect bad v5.10.247
> >
> > In each bisection step git checks out a state between the oldest
> > known-bad and the newest known-good commit. In each step test using:
> >
> > make my_defconfig
> > make -j $(nproc) bindeb-pkg
> > ... install, try to boot / verify if problem exists
> >
> > and if the problem is hit run:
> >
> > git bisect bad
> >
> > and if the problem doesn't trigger run:
> >
> > git bisect good
> >
> > . Please pay attention to always select the just built kernel for
> > booting, it won't always be the default kernel picked up by grub.
> >
> > Iterate until git announces to have identified the first bad commit.
> >
> > Then provide the output of
> >
> > git bisect log
> >
> > In the course of the bisection you might have to uninstall previous
> > kernels again to not exhaust the disk space in /boot. Also in the end
> > uninstall all self-built kernels again.
>
> I just did my first bisection \o/ (sorry)
>
> Here are the results:
>
> git bisect start
> # bad: [f964b940099f9982d723d4c77988d4b0dda9c165] Linux 5.10.247
> git bisect bad f964b940099f9982d723d4c77988d4b0dda9c165
> # good: [863b76df7d1e327979946a2d3893479c3275bfa4] Linux 5.10.244
> git bisect good f52ee6ea810273e527a5d319e5f400be8c8424c1
> # good: [dc9fdb7586b90e33c766eac52b6f3d1c9ec365a1] net: usb: lan78xx: Add error handling to lan78xx_init_mac_address
> git bisect good dc9fdb7586b90e33c766eac52b6f3d1c9ec365a1
> # bad: [2272d5757ce5d3fb416d9f2497b015678eb85c0d] phy: cadence: cdns-dphy: Enable lower resolutions in dphy
> git bisect bad 2272d5757ce5d3fb416d9f2497b015678eb85c0d
> # bad: [547539f08b9e3629ce68479889813e58c8087e70] ALSA: usb-audio: fix control pipe direction
> git bisect bad 547539f08b9e3629ce68479889813e58c8087e70
> # bad: [3509c748e79435d09e730673c8c100b7f0ebc87c] most: usb: hdm_probe: Fix calling put_device() before device initialization
> git bisect bad 3509c748e79435d09e730673c8c100b7f0ebc87c
> # bad: [a6ebcafc2f5ff7f0d1ce0c6dc38ac09a16a56ec0] net: add ndo_fdb_del_bulk
> git bisect bad a6ebcafc2f5ff7f0d1ce0c6dc38ac09a16a56ec0
> # good: [b8a72692aa42b7dcd179a96b90bc2763ac74576a] hfsplus: fix KMSAN uninit-value issue in __hfsplus_ext_cache_extent()
> git bisect good b8a72692aa42b7dcd179a96b90bc2763ac74576a
> # good: [2b42a595863556b394bd702d46f4a9d0d2985aaa] m68k: bitops: Fix find_*_bit() signatures
> git bisect good 2b42a595863556b394bd702d46f4a9d0d2985aaa
> # good: [9d9f7d71d46cff3491a443a3cf452cecf87d51ef] net: rtnetlink: use BIT for flag values
> git bisect good 9d9f7d71d46cff3491a443a3cf452cecf87d51ef
> # bad: [1550f3673972c5cfba714135f8bf26784e6f2b0f] net: rtnetlink: add bulk delete support flag
> git bisect bad 1550f3673972c5cfba714135f8bf26784e6f2b0f
> # good: [c8879afa24169e504f78c9ca43a4d0d7397049eb] net: netlink: add NLM_F_BULK delete request modifier
> git bisect good c8879afa24169e504f78c9ca43a4d0d7397049eb
> # first bad commit: [1550f3673972c5cfba714135f8bf26784e6f2b0f] net: rtnetlink: add bulk delete support flag
>
> Is there anything else I can do to help?
Is there soemthing missing?
Roland I think it would be helpful if you can test as well more recent
stable series versions to confirm if the issue is present there as
well or not, which might indicate a 5.10.y specific backporting
problem.
#regzbot introduced: 1550f3673972c5cfba714135f8bf26784e6f2b0f
Regards,
Salvatore
In the non-RT kernel, local_bh_disable() merely disables preemption,
whereas it maps to an actual spin lock in the RT kernel. Consequently,
when attempting to refill RX buffers via netdev_alloc_skb() in
macb_mac_link_up(), a deadlock scenario arises as follows:
Chain caused by macb_mac_link_up():
&bp->lock --> (softirq_ctrl.lock)
Chain caused by macb_start_xmit():
(softirq_ctrl.lock) --> _xmit_ETHER#2 --> &bp->lock
Notably, invoking the mog_init_rings() callback upon link establishment
is unnecessary. Instead, we can exclusively call mog_init_rings() within
the ndo_open() callback. This adjustment resolves the deadlock issue.
Given that mog_init_rings() is only applicable to
non-MACB_CAPS_MACB_IS_EMAC cases, we can simply move it to macb_open()
and simultaneously eliminate the MACB_CAPS_MACB_IS_EMAC check.
Fixes: 633e98a711ac ("net: macb: use resolved link config in mac_link_up()")
Cc: stable(a)vger.kernel.org
Suggested-by: Kevin Hao <kexin.hao(a)windriver.com>
Signed-off-by: Xiaolei Wang <xiaolei.wang(a)windriver.com>
---
V1: https://patchwork.kernel.org/project/netdevbpf/patch/20251128103647.351259-…
V2: Update the correct lock dependency chain and add the Fix tag.
drivers/net/ethernet/cadence/macb_main.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
index ca2386b83473..064fccdcf699 100644
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -744,7 +744,6 @@ static void macb_mac_link_up(struct phylink_config *config,
/* Initialize rings & buffers as clearing MACB_BIT(TE) in link down
* cleared the pipeline and control registers.
*/
- bp->macbgem_ops.mog_init_rings(bp);
macb_init_buffers(bp);
for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue)
@@ -2991,6 +2990,8 @@ static int macb_open(struct net_device *dev)
goto pm_exit;
}
+ bp->macbgem_ops.mog_init_rings(bp);
+
for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) {
napi_enable(&queue->napi_rx);
napi_enable(&queue->napi_tx);
--
2.43.0
The local variable 'i' is initialized with -EINVAL, but the for loop
immediately overwrites it and -EINVAL is never returned.
If no empty compression mode can be found, the function would return the
out-of-bounds index IAA_COMP_MODES_MAX, which would cause an invalid
array access in add_iaa_compression_mode().
Fix both issues by returning either a valid index or -EINVAL.
Cc: stable(a)vger.kernel.org
Fixes: b190447e0fa3 ("crypto: iaa - Add compression mode management along with fixed mode")
Signed-off-by: Thorsten Blum <thorsten.blum(a)linux.dev>
---
drivers/crypto/intel/iaa/iaa_crypto_main.c | 12 +++++-------
1 file changed, 5 insertions(+), 7 deletions(-)
diff --git a/drivers/crypto/intel/iaa/iaa_crypto_main.c b/drivers/crypto/intel/iaa/iaa_crypto_main.c
index 23f585219fb4..8ee2a55ec449 100644
--- a/drivers/crypto/intel/iaa/iaa_crypto_main.c
+++ b/drivers/crypto/intel/iaa/iaa_crypto_main.c
@@ -221,15 +221,13 @@ static struct iaa_compression_mode *iaa_compression_modes[IAA_COMP_MODES_MAX];
static int find_empty_iaa_compression_mode(void)
{
- int i = -EINVAL;
+ int i;
- for (i = 0; i < IAA_COMP_MODES_MAX; i++) {
- if (iaa_compression_modes[i])
- continue;
- break;
- }
+ for (i = 0; i < IAA_COMP_MODES_MAX; i++)
+ if (!iaa_compression_modes[i])
+ return i;
- return i;
+ return -EINVAL;
}
static struct iaa_compression_mode *find_iaa_compression_mode(const char *name, int *idx)
--
Thorsten Blum <thorsten.blum(a)linux.dev>
GPG: 1D60 735E 8AEF 3BE4 73B6 9D84 7336 78FD 8DFE EAD4