The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 76d2e3890fb169168c73f2e4f8375c7cc24a765e
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025082224-submersed-commend-be4c@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 76d2e3890fb169168c73f2e4f8375c7cc24a765e Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust(a)hammerspace.com>
Date: Sat, 16 Aug 2025 07:25:20 -0700
Subject: [PATCH] NFS: Fix a race when updating an existing write
After nfs_lock_and_join_requests() tests for whether the request is
still attached to the mapping, nothing prevents a call to
nfs_inode_remove_request() from succeeding until we actually lock the
page group.
The reason is that whoever called nfs_inode_remove_request() doesn't
necessarily have a lock on the page group head.
So in order to avoid races, let's take the page group lock earlier in
nfs_lock_and_join_requests(), and hold it across the removal of the
request in nfs_inode_remove_request().
Reported-by: Jeff Layton <jlayton(a)kernel.org>
Tested-by: Joe Quanaim <jdq(a)meta.com>
Tested-by: Andrew Steffen <aksteffen(a)meta.com>
Reviewed-by: Jeff Layton <jlayton(a)kernel.org>
Fixes: bd37d6fce184 ("NFSv4: Convert nfs_lock_and_join_requests() to use nfs_page_find_head_request()")
Cc: stable(a)vger.kernel.org
Signed-off-by: Trond Myklebust <trond.myklebust(a)hammerspace.com>
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 11968dcb7243..6e69ce43a13f 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -253,13 +253,14 @@ nfs_page_group_unlock(struct nfs_page *req)
nfs_page_clear_headlock(req);
}
-/*
- * nfs_page_group_sync_on_bit_locked
+/**
+ * nfs_page_group_sync_on_bit_locked - Test if all requests have @bit set
+ * @req: request in page group
+ * @bit: PG_* bit that is used to sync page group
*
* must be called with page group lock held
*/
-static bool
-nfs_page_group_sync_on_bit_locked(struct nfs_page *req, unsigned int bit)
+bool nfs_page_group_sync_on_bit_locked(struct nfs_page *req, unsigned int bit)
{
struct nfs_page *head = req->wb_head;
struct nfs_page *tmp;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index fa5c41d0989a..8b7c04737967 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -153,20 +153,10 @@ nfs_page_set_inode_ref(struct nfs_page *req, struct inode *inode)
}
}
-static int
-nfs_cancel_remove_inode(struct nfs_page *req, struct inode *inode)
+static void nfs_cancel_remove_inode(struct nfs_page *req, struct inode *inode)
{
- int ret;
-
- if (!test_bit(PG_REMOVE, &req->wb_flags))
- return 0;
- ret = nfs_page_group_lock(req);
- if (ret)
- return ret;
if (test_and_clear_bit(PG_REMOVE, &req->wb_flags))
nfs_page_set_inode_ref(req, inode);
- nfs_page_group_unlock(req);
- return 0;
}
/**
@@ -585,19 +575,18 @@ retry:
}
}
+ ret = nfs_page_group_lock(head);
+ if (ret < 0)
+ goto out_unlock;
+
/* Ensure that nobody removed the request before we locked it */
if (head != folio->private) {
+ nfs_page_group_unlock(head);
nfs_unlock_and_release_request(head);
goto retry;
}
- ret = nfs_cancel_remove_inode(head, inode);
- if (ret < 0)
- goto out_unlock;
-
- ret = nfs_page_group_lock(head);
- if (ret < 0)
- goto out_unlock;
+ nfs_cancel_remove_inode(head, inode);
/* lock each request in the page group */
for (subreq = head->wb_this_page;
@@ -786,7 +775,8 @@ static void nfs_inode_remove_request(struct nfs_page *req)
{
struct nfs_inode *nfsi = NFS_I(nfs_page_to_inode(req));
- if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) {
+ nfs_page_group_lock(req);
+ if (nfs_page_group_sync_on_bit_locked(req, PG_REMOVE)) {
struct folio *folio = nfs_page_to_folio(req->wb_head);
struct address_space *mapping = folio->mapping;
@@ -798,6 +788,7 @@ static void nfs_inode_remove_request(struct nfs_page *req)
}
spin_unlock(&mapping->i_private_lock);
}
+ nfs_page_group_unlock(req);
if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) {
atomic_long_dec(&nfsi->nrequests);
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index 169b4ae30ff4..9aed39abc94b 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -160,6 +160,7 @@ extern void nfs_join_page_group(struct nfs_page *head,
extern int nfs_page_group_lock(struct nfs_page *);
extern void nfs_page_group_unlock(struct nfs_page *);
extern bool nfs_page_group_sync_on_bit(struct nfs_page *, unsigned int);
+extern bool nfs_page_group_sync_on_bit_locked(struct nfs_page *, unsigned int);
extern int nfs_page_set_headlock(struct nfs_page *req);
extern void nfs_page_clear_headlock(struct nfs_page *req);
extern bool nfs_async_iocounter_wait(struct rpc_task *, struct nfs_lock_context *);
Prior to the topology parsing rewrite and the switchover to the new
parsing logic for AMD processors in commit c749ce393b8f ("x86/cpu: Use
common topology code for AMD"), the "initial_apicid" on these platforms
was:
- First initialized to the LocalApicId from CPUID leaf 0x1 EBX[31:24].
- Then overwritten by the ExtendedLocalApicId in CPUID leaf 0xb
EDX[31:0] on processors that supported topoext.
With the new parsing flow introduced in commit f7fb3b2dd92c ("x86/cpu:
Provide an AMD/HYGON specific topology parser"), parse_8000_001e() now
unconditionally overwrites the "initial_apicid" already parsed during
cpu_parse_topology_ext().
Although this has not been a problem on bare-metal platforms, on
virtualized AMD guests that feature more than 255 cores, QEMU zeroes out
CPUID leaf 0x8000001e on CPUs with "CoreID" > 255 to prevent collision
of these IDs in EBX[7:0], which can only represent a maximum of
255 cores [1].
This results in the following FW_BUG being logged when booting a guest
with more than 255 cores:
[Firmware Bug]: CPU 512: APIC ID mismatch. CPUID: 0x0000 APIC: 0x0200
AMD64 Architecture Programmer's Manual Volume 2: System Programming Pub.
24593 Rev. 3.42 [2] Section 16.12 "x2APIC_ID" mentions that the Extended
Topology Enumeration leaf 0xb (which was later superseded by the extended
leaf 0x80000026) provides the full x2APIC ID under all circumstances,
unlike the one reported by CPUID leaf 0x8000001e EAX, which depends on
the mode in which the APIC is configured.
Rely on the APIC ID parsed during cpu_parse_topology_ext() from CPUID
leaf 0x80000026 or 0xb and only use the APIC ID from leaf 0x8000001e if
cpu_parse_topology_ext() failed (has_topoext is false).
On platforms that support the 0xb leaf (Zen2 or later, AMD guests on
QEMU) or the extended leaf 0x80000026 (Zen4 or later), the
"initial_apicid" is now set to the value parsed from EDX[31:0].
On older AMD/Hygon platforms that do not support the 0xb leaf but
support the TOPOEXT extension (Fam 0x15, 0x16, 0x17[Zen1], and Hygon),
the current behavior is retained, where "initial_apicid" is set using
the 0x8000001e leaf.
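For illustration only (not part of this patch), here is a minimal
user-space probe, assuming an x86 machine and a GCC/Clang toolchain that
provides <cpuid.h>, which dumps the two APIC ID sources compared above
so the mismatch can be observed on an affected guest:

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* Extended topology leaf 0xb: EDX[31:0] is the full x2APIC ID. */
	if (__get_cpuid_count(0xb, 0, &eax, &ebx, &ecx, &edx))
		printf("leaf 0xb        x2APIC ID  : %#x\n", edx);

	/*
	 * Leaf 0x8000001e: EAX is ext_apic_id, EBX[7:0] is the CoreID.
	 * QEMU zeroes this whole leaf for vCPUs with CoreID > 255.
	 */
	if (__get_cpuid_count(0x8000001e, 0, &eax, &ebx, &ecx, &edx))
		printf("leaf 0x8000001e ext_apic_id: %#x (EBX[7:0] CoreID: %#x)\n",
		       eax, ebx & 0xff);

	return 0;
}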
Cc: stable(a)vger.kernel.org
Link: https://github.com/qemu/qemu/commit/35ac5dfbcaa4b [1]
Link: https://bugzilla.kernel.org/show_bug.cgi?id=206537 [2]
Debugged-by: Naveen N Rao (AMD) <naveen(a)kernel.org>
Debugged-by: Sairaj Kodilkar <sarunkod(a)amd.com>
Fixes: c749ce393b8f ("x86/cpu: Use common topology code for AMD")
Suggested-by: Thomas Gleixner <tglx(a)linutronix.de>
Tested-by: Naveen N Rao (AMD) <naveen(a)kernel.org>
Signed-off-by: K Prateek Nayak <kprateek.nayak(a)amd.com>
---
Changelog v3..v4:
o Refreshed the diff based on Thomas' suggestion. The tags have been
retained since there are no functional changes - only comments around
the code have changed.
o Quoted relevant section of APM justifying the changes.
o Moved this patch up ahead.
o Cc'd stable.
---
arch/x86/kernel/cpu/topology_amd.c | 23 ++++++++++++++---------
1 file changed, 14 insertions(+), 9 deletions(-)
diff --git a/arch/x86/kernel/cpu/topology_amd.c b/arch/x86/kernel/cpu/topology_amd.c
index 843b1655ab45..827dd0dbb6e9 100644
--- a/arch/x86/kernel/cpu/topology_amd.c
+++ b/arch/x86/kernel/cpu/topology_amd.c
@@ -81,20 +81,25 @@ static bool parse_8000_001e(struct topo_scan *tscan, bool has_topoext)
cpuid_leaf(0x8000001e, &leaf);
- tscan->c->topo.initial_apicid = leaf.ext_apic_id;
-
/*
- * If leaf 0xb is available, then the domain shifts are set
- * already and nothing to do here. Only valid for family >= 0x17.
+ * If leaf 0xb/0x26 is available, then the APIC ID and the domain
+ * shifts are set already.
*/
- if (!has_topoext && tscan->c->x86 >= 0x17) {
+ if (!has_topoext) {
+ tscan->c->topo.initial_apicid = leaf.ext_apic_id;
+
/*
- * Leaf 0x80000008 set the CORE domain shift already.
- * Update the SMT domain, but do not propagate it.
+ * Leaf 0x8000008 sets the CORE domain shift but not the
+ * SMT domain shift. On CPUs with family >= 0x17, there
+ * might be hyperthreads.
*/
- unsigned int nthreads = leaf.core_nthreads + 1;
+ if (tscan->c->x86 >= 0x17) {
+ /* Update the SMT domain, but do not propagate it. */
+ unsigned int nthreads = leaf.core_nthreads + 1;
- topology_update_dom(tscan, TOPO_SMT_DOMAIN, get_count_order(nthreads), nthreads);
+ topology_update_dom(tscan, TOPO_SMT_DOMAIN,
+ get_count_order(nthreads), nthreads);
+ }
}
store_node(tscan, leaf.nnodes_per_socket + 1, leaf.node_id);
--
2.34.1
When the host actively triggers SSR and collects coredump data,
the Bluetooth stack sends a reset command to the controller. However, due
to the inability to clear the QCA_SSR_TRIGGERED and QCA_IBS_DISABLED bits,
the reset command times out.
To address this, this patch clears the QCA_SSR_TRIGGERED and
QCA_IBS_DISABLED flags and adds a 50ms delay after SSR, but only when
HCI_QUIRK_NON_PERSISTENT_SETUP is not set. This ensures the controller
completes the SSR process when BT_EN is always high due to hardware.
For the purpose of HCI_QUIRK_NON_PERSISTENT_SETUP, please refer to
the comment in `include/net/bluetooth/hci.h`.
The HCI_QUIRK_NON_PERSISTENT_SETUP quirk is associated with BT_EN,
and its presence can be used to determine whether BT_EN is defined in DTS.
After SSR, the host will not download the firmware, causing the
controller to remain in the IBS_WAKE state. The host needs
to synchronize with the controller to maintain proper operation.
Multiple triggers of SSR only generate a coredump file the first time,
because the memcoredump_flag is not cleared,
so clear the coredump flag when SSR completes.
When the SSR duration exceeds 2 seconds, it triggers the
host tx_idle_timeout, which sets the host TX state to sleep. Due to the
hardware pulling up bt_en, the firmware is not downloaded after the SSR.
As a result, the controller does not enter sleep mode. Consequently,
when the host sends a command afterward, it sends 0xFD to the controller,
but the controller does not respond, leading to a command timeout.
So reset the tx_idle_timer after SSR to prevent the host from entering
the TX IBS_Sleep mode.
---
Changes since v8-v9:
-- Update base patch to the latest patch.
-- Add Cc: stable(a)vger.kernel.org next to the Signed-off-by.
Changes since v6-v7:
- Merge the changes into a single patch.
- Update the commit message.
Changes since v1-v5:
- Add an explanation for HCI_QUIRK_NON_PERSISTENT_SETUP.
- Add comments for msleep(50).
- Update format and commit message.
Signed-off-by: Shuai Zhang <quic_shuaz(a)quicinc.com>
Cc: stable(a)vger.kernel.org
---
drivers/bluetooth/hci_qca.c | 33 +++++++++++++++++++++++++++++++++
1 file changed, 33 insertions(+)
diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c
index 4cff4d9be..97aaf4985 100644
--- a/drivers/bluetooth/hci_qca.c
+++ b/drivers/bluetooth/hci_qca.c
@@ -1653,6 +1653,39 @@ static void qca_hw_error(struct hci_dev *hdev, u8 code)
skb_queue_purge(&qca->rx_memdump_q);
}
+ /*
+ * If the BT chip's bt_en pin is connected to a 3.3V power supply via
+ * hardware and always stays high, driver cannot control the bt_en pin.
+ * As a result, during SSR (SubSystem Restart), QCA_SSR_TRIGGERED and
+ * QCA_IBS_DISABLED flags cannot be cleared, which leads to a reset
+ * command timeout.
+ * Add an msleep delay to ensure controller completes the SSR process.
+ *
+ * Host will not download the firmware after SSR, controller to remain
+ * in the IBS_WAKE state, and the host needs to synchronize with it
+ *
+ * Since the bluetooth chip has been reset, clear the memdump state.
+ */
+ if (!test_bit(HCI_QUIRK_NON_PERSISTENT_SETUP, &hdev->quirks)) {
+ /*
+ * When the SSR (SubSystem Restart) duration exceeds 2 seconds,
+ * it triggers host tx_idle_delay, which sets host TX state
+ * to sleep. Reset tx_idle_timer after SSR to prevent
+ * host enter TX IBS_Sleep mode.
+ */
+ mod_timer(&qca->tx_idle_timer, jiffies +
+ msecs_to_jiffies(qca->tx_idle_delay));
+
+ /* Controller reset completion time is 50ms */
+ msleep(50);
+
+ clear_bit(QCA_SSR_TRIGGERED, &qca->flags);
+ clear_bit(QCA_IBS_DISABLED, &qca->flags);
+
+ qca->tx_ibs_state = HCI_IBS_TX_AWAKE;
+ qca->memdump_state = QCA_MEMDUMP_IDLE;
+ }
+
clear_bit(QCA_HW_ERROR_EVENT, &qca->flags);
}
--
2.34.1
The following commit has been merged into the x86/urgent branch of tip:
Commit-ID: 24963ae1b0b6596dc36e352c18593800056251d8
Gitweb: https://git.kernel.org/tip/24963ae1b0b6596dc36e352c18593800056251d8
Author: Suchit Karunakaran <suchitkarunakaran(a)gmail.com>
AuthorDate: Sat, 16 Aug 2025 12:21:26 +05:30
Committer: Dave Hansen <dave.hansen(a)linux.intel.com>
CommitterDate: Mon, 25 Aug 2025 08:23:37 -07:00
x86/cpu/intel: Fix the constant_tsc model check for Pentium 4
Pentium 4s which are INTEL_P4_PRESCOTT (model 0x03) and later have
a constant TSC. This was correctly captured until commit fadb6f569b10
("x86/cpu/intel: Limit the non-architectural constant_tsc model checks").
In that commit, an error was introduced while selecting the last P4
model (0x06) as the upper bound. Model 0x06 was transposed to
INTEL_P4_WILLAMETTE, which is just plain wrong. That was presumably a
simple typo, probably just copying and pasting the wrong P4 model.
Fix the constant TSC logic to cover all later P4 models. End at
INTEL_P4_CEDARMILL which accurately corresponds to the last P4 model.
Fixes: fadb6f569b10 ("x86/cpu/intel: Limit the non-architectural constant_tsc model checks")
Signed-off-by: Suchit Karunakaran <suchitkarunakaran(a)gmail.com>
Signed-off-by: Dave Hansen <dave.hansen(a)linux.intel.com>
Reviewed-by: Sohil Mehta <sohil.mehta(a)intel.com>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/all/20250816065126.5000-1-suchitkarunakaran%40gmail…
---
arch/x86/kernel/cpu/intel.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 076eaa4..98ae4c3 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -262,7 +262,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)
if (c->x86_power & (1 << 8)) {
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
- } else if ((c->x86_vfm >= INTEL_P4_PRESCOTT && c->x86_vfm <= INTEL_P4_WILLAMETTE) ||
+ } else if ((c->x86_vfm >= INTEL_P4_PRESCOTT && c->x86_vfm <= INTEL_P4_CEDARMILL) ||
(c->x86_vfm >= INTEL_CORE_YONAH && c->x86_vfm <= INTEL_IVYBRIDGE)) {
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
}
Introduce and use {pgd,p4d}_populate_kernel() in core MM code when
populating PGD and P4D entries for the kernel address space.
These helpers ensure proper synchronization of page tables when
updating the kernel portion of top-level page tables.
Until now, the kernel has relied on each architecture to handle
synchronization of top-level page tables in an ad-hoc manner.
For example, see commit 9b861528a801 ("x86-64, mem: Update all PGDs for
direct mapping and vmemmap mapping changes").
However, this approach has proven fragile for following reasons:
1) It is easy to forget to perform the necessary page table
synchronization when introducing new changes.
For instance, commit 4917f55b4ef9 ("mm/sparse-vmemmap: improve memory
savings for compound devmaps") overlooked the need to synchronize
page tables for the vmemmap area.
2) It is also easy to overlook that the vmemmap and direct mapping areas
must not be accessed before explicit page table synchronization.
For example, commit 8d400913c231 ("x86/vmemmap: handle unpopulated
sub-pmd ranges")) caused crashes by accessing the vmemmap area
before calling sync_global_pgds().
To address this, as suggested by Dave Hansen, introduce _kernel() variants
of the page table population helpers, which invoke architecture-specific
hooks to properly synchronize page tables. These are introduced in a new
header file, include/linux/pgalloc.h, so they can be called from common code.
They reuse existing infrastructure for vmalloc and ioremap.
Synchronization requirements are determined by ARCH_PAGE_TABLE_SYNC_MASK,
and the actual synchronization is performed by arch_sync_kernel_mappings().
This change currently targets only x86_64, so only PGD and P4D level
helpers are introduced. Currently, these helpers are no-ops since no
architecture sets PGTBL_{PGD,P4D}_MODIFIED in ARCH_PAGE_TABLE_SYNC_MASK.
In theory, PUD and PMD level helpers can be added later if needed by
other architectures. For now, 32-bit architectures (x86-32 and arm) only
handle PGTBL_PMD_MODIFIED, so p*d_populate_kernel() will never affect
them unless we introduce a PMD level helper.
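For illustration only, here is a hypothetical sketch of how an
architecture could opt in to these hooks later. As noted above, no
architecture sets these bits today, so this is not taken from any
existing arch code; sync_global_pgds() is named only because it is what
x86-64 uses for this purpose at the moment.

/* Hypothetical opt-in, e.g. in arch/<arch>/include/asm/pgtable.h: */
#define ARCH_PAGE_TABLE_SYNC_MASK	(PGTBL_PGD_MODIFIED | PGTBL_P4D_MODIFIED)

/* ... and somewhere in arch code, e.g. arch/<arch>/mm/init.c: */
void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
{
	/*
	 * Propagate the newly populated top-level kernel entries to every
	 * PGD in the system, roughly what x86-64 does today via
	 * sync_global_pgds().
	 */
	sync_global_pgds(start, end);
}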
Cc: <stable(a)vger.kernel.org>
Fixes: 8d400913c231 ("x86/vmemmap: handle unpopulated sub-pmd ranges")
Suggested-by: Dave Hansen <dave.hansen(a)linux.intel.com>
Acked-by: Kiryl Shutsemau <kas(a)kernel.org>
Reviewed-by: Mike Rapoport (Microsoft) <rppt(a)kernel.org>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
Signed-off-by: Harry Yoo <harry.yoo(a)oracle.com>
---
include/linux/pgalloc.h | 24 ++++++++++++++++++++++++
include/linux/pgtable.h | 13 +++++++------
mm/kasan/init.c | 12 ++++++------
mm/percpu.c | 6 +++---
mm/sparse-vmemmap.c | 6 +++---
5 files changed, 43 insertions(+), 18 deletions(-)
create mode 100644 include/linux/pgalloc.h
diff --git a/include/linux/pgalloc.h b/include/linux/pgalloc.h
new file mode 100644
index 000000000000..290ab864320f
--- /dev/null
+++ b/include/linux/pgalloc.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_PGALLOC_H
+#define _LINUX_PGALLOC_H
+
+#include <linux/pgtable.h>
+#include <asm/pgalloc.h>
+
+static inline void pgd_populate_kernel(unsigned long addr, pgd_t *pgd,
+ p4d_t *p4d)
+{
+ pgd_populate(&init_mm, pgd, p4d);
+ if (ARCH_PAGE_TABLE_SYNC_MASK & PGTBL_PGD_MODIFIED)
+ arch_sync_kernel_mappings(addr, addr);
+}
+
+static inline void p4d_populate_kernel(unsigned long addr, p4d_t *p4d,
+ pud_t *pud)
+{
+ p4d_populate(&init_mm, p4d, pud);
+ if (ARCH_PAGE_TABLE_SYNC_MASK & PGTBL_P4D_MODIFIED)
+ arch_sync_kernel_mappings(addr, addr);
+}
+
+#endif /* _LINUX_PGALLOC_H */
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index ba699df6ef69..2b80fd456c8b 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1469,8 +1469,8 @@ static inline void modify_prot_commit_ptes(struct vm_area_struct *vma, unsigned
/*
* Architectures can set this mask to a combination of PGTBL_P?D_MODIFIED values
- * and let generic vmalloc and ioremap code know when arch_sync_kernel_mappings()
- * needs to be called.
+ * and let generic vmalloc, ioremap and page table update code know when
+ * arch_sync_kernel_mappings() needs to be called.
*/
#ifndef ARCH_PAGE_TABLE_SYNC_MASK
#define ARCH_PAGE_TABLE_SYNC_MASK 0
@@ -1954,10 +1954,11 @@ static inline bool arch_has_pfn_modify_check(void)
/*
* Page Table Modification bits for pgtbl_mod_mask.
*
- * These are used by the p?d_alloc_track*() set of functions an in the generic
- * vmalloc/ioremap code to track at which page-table levels entries have been
- * modified. Based on that the code can better decide when vmalloc and ioremap
- * mapping changes need to be synchronized to other page-tables in the system.
+ * These are used by the p?d_alloc_track*() and p*d_populate_kernel()
+ * functions in the generic vmalloc, ioremap and page table update code
+ * to track at which page-table levels entries have been modified.
+ * Based on that the code can better decide when page table changes need
+ * to be synchronized to other page-tables in the system.
*/
#define __PGTBL_PGD_MODIFIED 0
#define __PGTBL_P4D_MODIFIED 1
diff --git a/mm/kasan/init.c b/mm/kasan/init.c
index ced6b29fcf76..8fce3370c84e 100644
--- a/mm/kasan/init.c
+++ b/mm/kasan/init.c
@@ -13,9 +13,9 @@
#include <linux/mm.h>
#include <linux/pfn.h>
#include <linux/slab.h>
+#include <linux/pgalloc.h>
#include <asm/page.h>
-#include <asm/pgalloc.h>
#include "kasan.h"
@@ -191,7 +191,7 @@ static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr,
pud_t *pud;
pmd_t *pmd;
- p4d_populate(&init_mm, p4d,
+ p4d_populate_kernel(addr, p4d,
lm_alias(kasan_early_shadow_pud));
pud = pud_offset(p4d, addr);
pud_populate(&init_mm, pud,
@@ -212,7 +212,7 @@ static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr,
} else {
p = early_alloc(PAGE_SIZE, NUMA_NO_NODE);
pud_init(p);
- p4d_populate(&init_mm, p4d, p);
+ p4d_populate_kernel(addr, p4d, p);
}
}
zero_pud_populate(p4d, addr, next);
@@ -251,10 +251,10 @@ int __ref kasan_populate_early_shadow(const void *shadow_start,
* puds,pmds, so pgd_populate(), pud_populate()
* is noops.
*/
- pgd_populate(&init_mm, pgd,
+ pgd_populate_kernel(addr, pgd,
lm_alias(kasan_early_shadow_p4d));
p4d = p4d_offset(pgd, addr);
- p4d_populate(&init_mm, p4d,
+ p4d_populate_kernel(addr, p4d,
lm_alias(kasan_early_shadow_pud));
pud = pud_offset(p4d, addr);
pud_populate(&init_mm, pud,
@@ -273,7 +273,7 @@ int __ref kasan_populate_early_shadow(const void *shadow_start,
if (!p)
return -ENOMEM;
} else {
- pgd_populate(&init_mm, pgd,
+ pgd_populate_kernel(addr, pgd,
early_alloc(PAGE_SIZE, NUMA_NO_NODE));
}
}
diff --git a/mm/percpu.c b/mm/percpu.c
index d9cbaee92b60..a56f35dcc417 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -3108,7 +3108,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
#endif /* BUILD_EMBED_FIRST_CHUNK */
#ifdef BUILD_PAGE_FIRST_CHUNK
-#include <asm/pgalloc.h>
+#include <linux/pgalloc.h>
#ifndef P4D_TABLE_SIZE
#define P4D_TABLE_SIZE PAGE_SIZE
@@ -3134,13 +3134,13 @@ void __init __weak pcpu_populate_pte(unsigned long addr)
if (pgd_none(*pgd)) {
p4d = memblock_alloc_or_panic(P4D_TABLE_SIZE, P4D_TABLE_SIZE);
- pgd_populate(&init_mm, pgd, p4d);
+ pgd_populate_kernel(addr, pgd, p4d);
}
p4d = p4d_offset(pgd, addr);
if (p4d_none(*p4d)) {
pud = memblock_alloc_or_panic(PUD_TABLE_SIZE, PUD_TABLE_SIZE);
- p4d_populate(&init_mm, p4d, pud);
+ p4d_populate_kernel(addr, p4d, pud);
}
pud = pud_offset(p4d, addr);
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 41aa0493eb03..dbd8daccade2 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -27,9 +27,9 @@
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/sched.h>
+#include <linux/pgalloc.h>
#include <asm/dma.h>
-#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "hugetlb_vmemmap.h"
@@ -229,7 +229,7 @@ p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node)
if (!p)
return NULL;
pud_init(p);
- p4d_populate(&init_mm, p4d, p);
+ p4d_populate_kernel(addr, p4d, p);
}
return p4d;
}
@@ -241,7 +241,7 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
if (!p)
return NULL;
- pgd_populate(&init_mm, pgd, p);
+ pgd_populate_kernel(addr, pgd, p);
}
return pgd;
}
--
2.43.0
The patch titled
Subject: arm64: kexec: Initialize kexec_buf struct in image_load()
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
arm64-kexec-initialize-kexec_buf-struct-in-image_load.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Breno Leitao <leitao(a)debian.org>
Subject: arm64: kexec: Initialize kexec_buf struct in image_load()
Date: Tue, 26 Aug 2025 05:08:51 -0700
The kexec_buf structure was previously declared without initialization in
image_load(). This led to a UBSAN warning when the structure was expanded
and uninitialized fields were accessed [1].
Zero-initializing kexec_buf at declaration ensures all fields are cleanly
set, preventing future instances of uninitialized memory being used.
Fixes this UBSAN warning:
[ 32.362488] UBSAN: invalid-load in ./include/linux/kexec.h:210:10
[ 32.362649] load of value 252 is not a valid value for type '_Bool'
Andrew Morton suggested that this function is only called 3x a week [2],
so the memset() cost is inexpensive.
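For illustration only, a small user-space sketch (an assumption: built
with -fsanitize=bool; this is not kernel code) shows why reading a stale
byte through a _Bool member produces exactly this kind of report, and
why zero-initializing at declaration avoids it:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct kbuf_like {		/* stand-in for struct kexec_buf */
	unsigned long bufsz;
	bool random;		/* the newly added field */
};

int main(void)
{
	unsigned char stale[sizeof(struct kbuf_like)];
	struct kbuf_like kbuf;

	memset(stale, 0xfc, sizeof(stale));	/* pretend stack garbage, 0xfc == 252 */
	memcpy(&kbuf, stale, sizeof(kbuf));

	/*
	 * With -fsanitize=bool this load reports:
	 * "load of value 252 is not a valid value for type '_Bool'"
	 */
	printf("uninitialized:    %d\n", kbuf.random);

	struct kbuf_like fixed = {};	/* the fix: zero-initialize at declaration */
	printf("zero-initialized: %d\n", fixed.random);
	return 0;
}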
Link: https://lore.kernel.org/all/oninomspajhxp4omtdapxnckxydbk2nzmrix7rggmpukpnz… [1]
Link: https://lore.kernel.org/all/20250825180531.94bfb86a26a43127c0a1296f@linux-f… [2]
Link: https://lkml.kernel.org/r/20250826-akpm-v1-1-3c831f0e3799@debian.org
Fixes: bf454ec31add ("kexec_file: allow to place kexec_buf randomly")
Signed-off-by: Breno Leitao <leitao(a)debian.org>
Suggested-by: Andrew Morton <akpm(a)linux-foundation.org>
Cc: Mark Rutland <mark.rutland(a)arm.com>
Cc: Baoquan He <bhe(a)redhat.com>
Cc: Coiby Xu <coxu(a)redhat.com>
Cc: "Daniel P. Berrange" <berrange(a)redhat.com>
Cc: Dave Hansen <dave.hansen(a)intel.com>
Cc: Dave Young <dyoung(a)redhat.com>
Cc: Kairui Song <ryncsn(a)gmail.com>
Cc: Liu Pingfan <kernelfans(a)gmail.com>
Cc: Milan Broz <gmazyland(a)gmail.com>
Cc: Ondrej Kozina <okozina(a)redhat.com>
Cc: Vitaly Kuznetsov <vkuznets(a)redhat.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
arch/arm64/kernel/kexec_image.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/arch/arm64/kernel/kexec_image.c~arm64-kexec-initialize-kexec_buf-struct-in-image_load
+++ a/arch/arm64/kernel/kexec_image.c
@@ -41,7 +41,7 @@ static void *image_load(struct kimage *i
struct arm64_image_header *h;
u64 flags, value;
bool be_image, be_kernel;
- struct kexec_buf kbuf;
+ struct kexec_buf kbuf = {};
unsigned long text_offset, kernel_segment_number;
struct kexec_segment *kernel_segment;
int ret;
_
Patches currently in -mm which might be from leitao(a)debian.org are
arm64-kexec-initialize-kexec_buf-struct-in-image_load.patch
The quilt patch titled
Subject: kexec/arm64: initialize the random field of kbuf to zero in the image loader
has been removed from the -mm tree. Its filename was
kexec-arm64-initialize-the-random-field-of-kbuf-to-zero-in-the-image-loader.patch
This patch was dropped because an updated version will be issued
------------------------------------------------------
From: Breno Leitao <leitao(a)debian.org>
Subject: kexec/arm64: initialize the random field of kbuf to zero in the image loader
Date: Thu Aug 21 04:11:21 2025 -0700
Add an explicit initialization for the random member of the kbuf structure
within the image_load function in arch/arm64/kernel/kexec_image.c.
Setting kbuf.random to zero ensures a deterministic and clean starting
state for the buffer used during kernel image loading, avoiding this UBSAN
issue later, when kbuf.random is read.
[ 32.362488] UBSAN: invalid-load in ./include/linux/kexec.h:210:10
[ 32.362649] load of value 252 is not a valid value for type '_Bool'
Link: https://lkml.kernel.org/r/oninomspajhxp4omtdapxnckxydbk2nzmrix7rggmpukpnzad…
Fixes: bf454ec31add ("kexec_file: allow to place kexec_buf randomly")
Signed-off-by: Breno Leitao <leitao(a)debian.org>
Cc: Mark Rutland <mark.rutland(a)arm.com>
Cc: Baoquan He <bhe(a)redhat.com>
Cc: Coiby Xu <coxu(a)redhat.com>
Cc: "Daniel P. Berrange" <berrange(a)redhat.com>
Cc: Dave Hansen <dave.hansen(a)intel.com>
Cc: Dave Young <dyoung(a)redhat.com>
Cc: Kairui Song <ryncsn(a)gmail.com>
Cc: Liu Pingfan <kernelfans(a)gmail.com>
Cc: Milan Broz <gmazyland(a)gmail.com>
Cc: Ondrej Kozina <okozina(a)redhat.com>
Cc: Vitaly Kuznetsov <vkuznets(a)redhat.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
arch/arm64/kernel/kexec_image.c | 1 +
1 file changed, 1 insertion(+)
--- a/arch/arm64/kernel/kexec_image.c~kexec-arm64-initialize-the-random-field-of-kbuf-to-zero-in-the-image-loader
+++ a/arch/arm64/kernel/kexec_image.c
@@ -76,6 +76,7 @@ static void *image_load(struct kimage *i
kbuf.buf_min = 0;
kbuf.buf_max = ULONG_MAX;
kbuf.top_down = false;
+ kbuf.random = 0;
kbuf.buffer = kernel;
kbuf.bufsz = kernel_len;
_
Patches currently in -mm which might be from leitao(a)debian.org are
There are some AA deadlock issues in kmemleak, similar to the situation
reported by Breno [1]. The deadlock path is as follows:
mem_pool_alloc()
-> raw_spin_lock_irqsave(&kmemleak_lock, flags);
-> pr_warn()
-> netconsole subsystem
-> netpoll
-> __alloc_skb
-> __create_object
-> raw_spin_lock_irqsave(&kmemleak_lock, flags);
To solve this problem, switch to printk_safe mode before printing the
warning message; this redirects all printk()-s to a special per-CPU buffer,
which is flushed later from a safe context (irq work), so this
deadlock problem is avoided. The proper API to use is
printk_deferred_enter()/printk_deferred_exit() [2]. Another way is to
place the warning print after the kmemleak_lock is released.
[1]
https://lore.kernel.org/all/20250731-kmemleak_lock-v1-1-728fd470198f@debian…
[2]
https://lore.kernel.org/all/5ca375cd-4a20-4807-b897-68b289626550@redhat.com/
====================
Signed-off-by: Gu Bowen <gubowen5(a)huawei.com>
---
mm/kmemleak.c | 27 ++++++++++++++++++++-------
1 file changed, 20 insertions(+), 7 deletions(-)
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 84265983f239..1ac56ceb29b6 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -437,9 +437,15 @@ static struct kmemleak_object *__lookup_object(unsigned long ptr, int alias,
else if (untagged_objp == untagged_ptr || alias)
return object;
else {
+ /*
+ * Printk deferring due to the kmemleak_lock held.
+ * This is done to avoid deadlock.
+ */
+ printk_deferred_enter();
kmemleak_warn("Found object by alias at 0x%08lx\n",
ptr);
dump_object_info(object);
+ printk_deferred_exit();
break;
}
}
@@ -736,6 +742,11 @@ static int __link_object(struct kmemleak_object *object, unsigned long ptr,
else if (untagged_objp + parent->size <= untagged_ptr)
link = &parent->rb_node.rb_right;
else {
+ /*
+ * Printk deferring due to the kmemleak_lock held.
+ * This is done to avoid deadlock.
+ */
+ printk_deferred_enter();
kmemleak_stop("Cannot insert 0x%lx into the object search tree (overlaps existing)\n",
ptr);
/*
@@ -743,6 +754,7 @@ static int __link_object(struct kmemleak_object *object, unsigned long ptr,
* be freed while the kmemleak_lock is held.
*/
dump_object_info(parent);
+ printk_deferred_exit();
return -EEXIST;
}
}
@@ -856,13 +868,8 @@ static void delete_object_part(unsigned long ptr, size_t size,
raw_spin_lock_irqsave(&kmemleak_lock, flags);
object = __find_and_remove_object(ptr, 1, objflags);
- if (!object) {
-#ifdef DEBUG
- kmemleak_warn("Partially freeing unknown object at 0x%08lx (size %zu)\n",
- ptr, size);
-#endif
+ if (!object)
goto unlock;
- }
/*
* Create one or two objects that may result from the memory block
@@ -882,8 +889,14 @@ static void delete_object_part(unsigned long ptr, size_t size,
unlock:
raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
- if (object)
+ if (object) {
__delete_object(object);
+ } else {
+#ifdef DEBUG
+ kmemleak_warn("Partially freeing unknown object at 0x%08lx (size %zu)\n",
+ ptr, size);
+#endif
+ }
out:
if (object_l)
--
2.43.0
[BUG]
With 64K page size (aarch64 with 64K page size config) and 4K btrfs
block size, the following workload can easily lead to a corrupted read:
mkfs.btrfs -f -s 4k $dev > /dev/null
mount -o compress $dev $mnt
xfs_io -f -c "pwrite -S 0xff 0 64k" $mnt/base > /dev/null
echo "correct result:"
od -Ad -t x1 $mnt/base
xfs_io -f -c "reflink $mnt/base 32k 0 32k" \
-c "reflink $mnt/base 0 32k 32k" \
-c "pwrite -S 0xff 60k 4k" $mnt/new > /dev/null
echo "incorrect result:"
od -Ad -t x1 $mnt/new
umount $mnt
This shows the following result:
correct result:
0000000 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
*
0065536
incorrect result:
0000000 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
*
0032768 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
*
0061440 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
*
0065536
Notice the zero in the range [32K, 60K), which is incorrect.
[CAUSE]
With extra trace printk, it shows the following events during od:
(some unrelated info removed like CPU and context)
od-3457 btrfs_do_readpage: enter r/i=5/258 folio=0(65536) prev_em_start=0000000000000000
The "r/i" is indicating the root and inode number. In our case the file
"new" is using ino 258 from fs tree (root 5).
Here notice the @prev_em_start pointer is NULL. This means the
btrfs_do_readpage() is called from btrfs_read_folio(), not from
btrfs_readahead().
od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=0 got em start=0 len=32768
od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=4096 got em start=0 len=32768
od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=8192 got em start=0 len=32768
od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=12288 got em start=0 len=32768
od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=16384 got em start=0 len=32768
od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=20480 got em start=0 len=32768
od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=24576 got em start=0 len=32768
od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=28672 got em start=0 len=32768
These above 32K blocks will be read from the first half of the
compressed data extent.
od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=32768 got em start=32768 len=32768
Note here there is no btrfs_submit_compressed_read() call. Which is
incorrect now.
Although both extent maps at 0 and 32K are pointing to the same compressed
data, their offsets are different thus can not be merged into the same
read.
So this means the compressed data read merge check is doing something
wrong.
od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=36864 got em start=32768 len=32768
od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=40960 got em start=32768 len=32768
od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=45056 got em start=32768 len=32768
od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=49152 got em start=32768 len=32768
od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=53248 got em start=32768 len=32768
od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=57344 got em start=32768 len=32768
od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=61440 skip uptodate
od-3457 btrfs_submit_compressed_read: cb orig_bio: file off=0 len=61440
The function btrfs_submit_compressed_read() is only called at the end of
folio read. The compressed bio will only have an extent map of range [0,
32K), but the original bio passed in is for the whole 64K folio.
This will cause the decompression part to only fill the first 32K,
leaving the rest untouched (aka, filled with zero).
This incorrect compressed read merge leads to the above data corruption.
Similar problems have happened in the past; commit 808f80b46790
("Btrfs: update fix for read corruption of compressed and shared
extents") did pretty much the same fix for readahead.
But that was back in 2015, when btrfs still only supported bs (block size)
== ps (page size) cases.
This means btrfs_do_readpage() only needs to handle a folio which
contains exactly one block.
Only btrfs_readahead() can lead to a read covering multiple blocks.
Thus only btrfs_readahead() passes a non-NULL @prev_em_start pointer.
With the v5.15 kernel, btrfs introduced bs < ps support. This breaks the above
assumption that a folio can only contain one block.
Now btrfs_read_folio() can also read multiple blocks in one go.
But btrfs_read_folio() doesn't pass a @prev_em_start pointer, thus the
existing bio force submission check will never be triggered.
In theory, this can also happen for btrfs with large folios, but since
large folio support is still experimental, we don't need to bother with it,
thus only bs < ps support is affected for now.
[FIX]
Instead of passing @prev_em_start to do the proper compressed extent
check, introduce one new member, btrfs_bio_ctrl::last_em_start, so that
the existing bio force submission logic will always be triggered.
CC: stable(a)vger.kernel.org # 5.15+
Reviewed-by: Filipe Manana <fdmanana(a)suse.com>
Signed-off-by: Qu Wenruo <wqu(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
---
Changelog:
v4:
- Minor grammar fixes
- Update the subject to reflect the bug better
This is the merged version in the for-next branch.
However, for people who are not following our development branch but
still want to verify the incoming fstests test case, we need this fix
to be publicly available. Newer minor changes will only be done inside
the development branch.
v3:
- Use a single btrfs_bio_ctrl::last_em_start
Since the existing prev_em_start is using U64_MAX as the initial value
to indicate no em hit yet, we can use the same logic, which saves
another 8 bytes from btrfs_bio_ctrl.
- Update the reproducer
Previously I failed to reproduce using a minimal workload.
As regular reads from od/md5sum always trigger readahead, thus not
hitting the @prev_em_start == NULL path.
Will send out an fstests test case for it using the minimal reproducer.
But it will still need a 16K/64K page-sized system to reproduce.
Fix the problem by doing a block-aligned write into the folio, so
that the folio will be partially dirty and not go through the
readahead path.
- Update the analysis
This includes the trace events of the minimal reproducer, and
mentioning of previous similar fixes and why they do not work for
subpage cases.
- Update the CC tag
Since it's only affecting bs < ps cases (for non-experimental builds),
only need to fix kernels with subpage btrfs support.
v2:
- Only save extent_map::start/len to save memory for btrfs_bio_ctrl
It's using on-stack memory which is very limited inside the kernel.
- Remove the commit message mentioning of clearing last saved em
Since we're using em::start/len, there is no need to clear them.
Either we hit the same em::start/len, meaning hitting the same extent
map, or we hit a different em, which will have a different start/len.
---
fs/btrfs/extent_io.c | 40 ++++++++++++++++++++++++++++++----------
1 file changed, 30 insertions(+), 10 deletions(-)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 426a6791a0b2..ca7174fa0240 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -131,6 +131,24 @@ struct btrfs_bio_ctrl {
*/
unsigned long submit_bitmap;
struct readahead_control *ractl;
+
+ /*
+ * The start offset of the last used extent map by a read operation.
+ *
+ * This is for proper compressed read merge.
+ * U64_MAX means we are starting the read and have made no progress yet.
+ *
+ * The current btrfs_bio_is_contig() only uses disk_bytenr as
+ * the condition to check if the read can be merged with previous
+ * bio, which is not correct. E.g. two file extents pointing to the
+ * same extent but with different offset.
+ *
+ * So here we need to do extra checks to only merge reads that are
+ * covered by the same extent map.
+ * Just extent_map::start will be enough, as they are unique
+ * inside the same inode.
+ */
+ u64 last_em_start;
};
/*
@@ -965,7 +983,7 @@ static void btrfs_readahead_expand(struct readahead_control *ractl,
* return 0 on success, otherwise return error
*/
static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
- struct btrfs_bio_ctrl *bio_ctrl, u64 *prev_em_start)
+ struct btrfs_bio_ctrl *bio_ctrl)
{
struct inode *inode = folio->mapping->host;
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
@@ -1076,12 +1094,11 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
* non-optimal behavior (submitting 2 bios for the same extent).
*/
if (compress_type != BTRFS_COMPRESS_NONE &&
- prev_em_start && *prev_em_start != (u64)-1 &&
- *prev_em_start != em->start)
+ bio_ctrl->last_em_start != U64_MAX &&
+ bio_ctrl->last_em_start != em->start)
force_bio_submit = true;
- if (prev_em_start)
- *prev_em_start = em->start;
+ bio_ctrl->last_em_start = em->start;
em_gen = em->generation;
btrfs_free_extent_map(em);
@@ -1296,12 +1313,15 @@ int btrfs_read_folio(struct file *file, struct folio *folio)
const u64 start = folio_pos(folio);
const u64 end = start + folio_size(folio) - 1;
struct extent_state *cached_state = NULL;
- struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ };
+ struct btrfs_bio_ctrl bio_ctrl = {
+ .opf = REQ_OP_READ,
+ .last_em_start = U64_MAX,
+ };
struct extent_map *em_cached = NULL;
int ret;
lock_extents_for_read(inode, start, end, &cached_state);
- ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, NULL);
+ ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl);
btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state);
btrfs_free_extent_map(em_cached);
@@ -2641,7 +2661,8 @@ void btrfs_readahead(struct readahead_control *rac)
{
struct btrfs_bio_ctrl bio_ctrl = {
.opf = REQ_OP_READ | REQ_RAHEAD,
- .ractl = rac
+ .ractl = rac,
+ .last_em_start = U64_MAX,
};
struct folio *folio;
struct btrfs_inode *inode = BTRFS_I(rac->mapping->host);
@@ -2649,12 +2670,11 @@ void btrfs_readahead(struct readahead_control *rac)
const u64 end = start + readahead_length(rac) - 1;
struct extent_state *cached_state = NULL;
struct extent_map *em_cached = NULL;
- u64 prev_em_start = (u64)-1;
lock_extents_for_read(inode, start, end, &cached_state);
while ((folio = readahead_folio(rac)) != NULL)
- btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start);
+ btrfs_do_readpage(folio, &em_cached, &bio_ctrl);
btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state);
--
2.50.1
In early boot, Linux creates identity virtual->physical address mappings
so that it can enable the MMU before full memory management is ready.
To ensure some available physical memory to back these structures,
vmlinux.lds reserves some space (and defines marker symbols) in the
middle of the kernel image. However, because they are defined outside of
PROGBITS sections, they aren't pre-initialized -- at least as far as ELF
is concerned.
In the typical case, this isn't actually a problem: the boot image is
prepared with objcopy, which zero-fills the gaps, so these structures
are incidentally zero-initialized (an all-zeroes entry is considered
absent, so zero-initialization is appropriate).
However, that is just a happy accident: the `vmlinux` ELF output
authoritatively represents the state of memory at entry. If the ELF
says a region of memory isn't initialized, we must treat it as
uninitialized. Indeed, certain bootloaders (e.g. Broadcom CFE) ingest
the ELF directly -- sidestepping the objcopy-produced image entirely --
and therefore do not initialize the gaps. This results in the early boot
code crashing when it attempts to create identity mappings.
Therefore, add boot-time zero-initialization for the following:
- __pi_init_idmap_pg_dir..__pi_init_idmap_pg_end
- idmap_pg_dir
- reserved_pg_dir
- tramp_pg_dir # Already done, but this patch corrects the size
Note, swapper_pg_dir is already initialized (by copy from idmap_pg_dir)
before use, so this patch does not need to address it.
Cc: stable(a)vger.kernel.org
Signed-off-by: Sam Edwards <CFSworks(a)gmail.com>
---
arch/arm64/kernel/head.S | 12 ++++++++++++
arch/arm64/mm/mmu.c | 3 ++-
2 files changed, 14 insertions(+), 1 deletion(-)
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index ca04b338cb0d..0c3be11d0006 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -86,6 +86,18 @@ SYM_CODE_START(primary_entry)
bl record_mmu_state
bl preserve_boot_args
+ adrp x0, reserved_pg_dir
+ add x1, x0, #PAGE_SIZE
+0: str xzr, [x0], 8
+ cmp x0, x1
+ b.lo 0b
+
+ adrp x0, __pi_init_idmap_pg_dir
+ adrp x1, __pi_init_idmap_pg_end
+1: str xzr, [x0], 8
+ cmp x0, x1
+ b.lo 1b
+
adrp x1, early_init_stack
mov sp, x1
mov x29, xzr
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 34e5d78af076..aaf823565a65 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -761,7 +761,7 @@ static int __init map_entry_trampoline(void)
pgprot_val(prot) &= ~PTE_NG;
/* Map only the text into the trampoline page table */
- memset(tramp_pg_dir, 0, PGD_SIZE);
+ memset(tramp_pg_dir, 0, PAGE_SIZE);
__create_pgd_mapping(tramp_pg_dir, pa_start, TRAMP_VALIAS,
entry_tramp_text_size(), prot,
pgd_pgtable_alloc_init_mm, NO_BLOCK_MAPPINGS);
@@ -806,6 +806,7 @@ static void __init create_idmap(void)
u64 end = __pa_symbol(__idmap_text_end);
u64 ptep = __pa_symbol(idmap_ptes);
+ memset(idmap_pg_dir, 0, PAGE_SIZE);
__pi_map_range(&ptep, start, end, start, PAGE_KERNEL_ROX,
IDMAP_ROOT_LEVEL, (pte_t *)idmap_pg_dir, false,
__phys_to_virt(ptep) - ptep);
--
2.49.1
The seg6_hmac_info structure stores information related to SRv6 HMAC
configurations, including the secret key, HMAC ID, and hashing algorithm
used to authenticate and secure SRv6 packets.
When a seg6_hmac_info object is no longer needed, it is destroyed via
seg6_hmac_info_del(), which eventually calls seg6_hinfo_release(). This
function uses kfree_rcu() to safely deallocate memory after an RCU grace
period has elapsed.
The kfree_rcu() releases memory without sanitization (e.g., zeroing out
the memory). Consequently, sensitive information such as the HMAC secret
and its length may remain in freed memory, potentially leading to data
leaks.
To address this risk, we replaced kfree_rcu() with a custom RCU
callback, seg6_hinfo_free_callback_rcu(). Within this callback, we
explicitly sanitize the seg6_hmac_info object before deallocating it
safely using kfree_sensitive(). This approach ensures the memory is
securely freed and prevents potential HMAC info leaks.
Additionally, in the control path, we ensure proper cleanup of
seg6_hmac_info objects when seg6_hmac_info_add() fails: such objects are
freed using kfree_sensitive() instead of kfree().
Fixes: 4f4853dc1c9c ("ipv6: sr: implement API to control SR HMAC structure")
Fixes: bf355b8d2c30 ("ipv6: sr: add core files for SR HMAC support")
Cc: stable(a)vger.kernel.org
Signed-off-by: Andrea Mayer <andrea.mayer(a)uniroma2.it>
---
net/ipv6/seg6.c | 2 +-
net/ipv6/seg6_hmac.c | 10 +++++++++-
2 files changed, 10 insertions(+), 2 deletions(-)
diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c
index 180da19c148c..88782bdab843 100644
--- a/net/ipv6/seg6.c
+++ b/net/ipv6/seg6.c
@@ -215,7 +215,7 @@ static int seg6_genl_sethmac(struct sk_buff *skb, struct genl_info *info)
err = seg6_hmac_info_add(net, hmackeyid, hinfo);
if (err)
- kfree(hinfo);
+ kfree_sensitive(hinfo);
out_unlock:
mutex_unlock(&sdata->lock);
diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c
index fd58426f222b..19cdf3791ebf 100644
--- a/net/ipv6/seg6_hmac.c
+++ b/net/ipv6/seg6_hmac.c
@@ -57,9 +57,17 @@ static int seg6_hmac_cmpfn(struct rhashtable_compare_arg *arg, const void *obj)
return (hinfo->hmackeyid != *(__u32 *)arg->key);
}
+static void seg6_hinfo_free_callback_rcu(struct rcu_head *head)
+{
+ struct seg6_hmac_info *hinfo;
+
+ hinfo = container_of(head, struct seg6_hmac_info, rcu);
+ kfree_sensitive(hinfo);
+}
+
static inline void seg6_hinfo_release(struct seg6_hmac_info *hinfo)
{
- kfree_rcu(hinfo, rcu);
+ call_rcu(&hinfo->rcu, seg6_hinfo_free_callback_rcu);
}
static void seg6_free_hi(void *ptr, void *arg)
--
2.20.1
Commit 8ee53820edfd ("thp: mmu_notifier_test_young") introduced
mmu_notifier_test_young(), but we should pass the address we need to test.
In xxx_scan_pmd(), the actual iteration address is "_address", not
"address". We seem to have misused the variable from the very beginning.
Change it to the right one.
Fixes: 8ee53820edfd ("thp: mmu_notifier_test_young")
Signed-off-by: Wei Yang <richard.weiyang(a)gmail.com>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
Cc: Zi Yan <ziy(a)nvidia.com>
Cc: Baolin Wang <baolin.wang(a)linux.alibaba.com>
Cc: Liam R. Howlett <Liam.Howlett(a)oracle.com>
Cc: Nico Pache <npache(a)redhat.com>
Cc: Ryan Roberts <ryan.roberts(a)arm.com>
Cc: Dev Jain <dev.jain(a)arm.com>
Cc: Barry Song <baohua(a)kernel.org>
CC: <stable(a)vger.kernel.org>
---
The original commit 8ee53820edfd is from 2011.
The code was then moved to khugepaged.c in commit b46e756f5e470 ("thp:
extract khugepaged from mm/huge_memory.c") in 2016.
---
mm/khugepaged.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 24e18a7f8a93..b000942250d1 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1418,7 +1418,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
if (cc->is_khugepaged &&
(pte_young(pteval) || folio_test_young(folio) ||
folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm,
- address)))
+ _address)))
referenced++;
}
if (!writable) {
--
2.34.1
[BUG]
With 64K page size (aarch64 with 64K page size config) and 4K btrfs
block size, the following workload can easily lead to a corrupted read:
mkfs.btrfs -f -s 4k $dev > /dev/null
mount -o compress=zstd $dev $mnt
xfs_io -f -c "pwrite -S 0xff 0 64k" -c sync $mnt/base > /dev/null
echo "correct result:"
od -Ax -x $mnt/base
xfs_io -f -c "reflink $mnt/base 32k 0 32k" \
-c "reflink $mnt/base 0 32k 32k" \
-c "pwrite -S 0xff 60k 4k" $mnt/new > /dev/null
echo "incorrect result:"
od -Ax -x $mnt/new
umount $mnt
This shows the following result:
correct result:
000000 ffff ffff ffff ffff ffff ffff ffff ffff
*
010000
incorrect result:
000000 ffff ffff ffff ffff ffff ffff ffff ffff
*
008000 0000 0000 0000 0000 0000 0000 0000 0000
*
00f000 ffff ffff ffff ffff ffff ffff ffff ffff
*
010000
Notice the zero in the range [0x8000, 0xf000), which is incorrect.
[CAUSE]
With extra trace printk, it shows the following events during od:
(some unrelated info removed like CPU and context)
od-3457 btrfs_do_readpage: enter r/i=5/258 folio=0(65536) prev_em_start=0000000000000000
The "r/i" is indicating the root and inode number. In our case the file
"new" is using ino 258 from fs tree (root 5).
Here notice the @prev_em_start pointer is NULL. This means the
btrfs_do_readpage() is called from btrfs_read_folio(), not from
btrfs_readahead().
od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=0 got em start=0 len=32768
od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=4096 got em start=0 len=32768
od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=8192 got em start=0 len=32768
od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=12288 got em start=0 len=32768
od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=16384 got em start=0 len=32768
od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=20480 got em start=0 len=32768
od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=24576 got em start=0 len=32768
od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=28672 got em start=0 len=32768
These above 32K blocks will be read from the first half of the
compressed data extent.
od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=32768 got em start=32768 len=32768
Note here there is no btrfs_submit_compressed_read() call. Which is
incorrect now.
Although both extent maps at 0 and 32K are pointing to the same compressed
data, their offsets are different and thus can not be merged into the same
read.
So this means the compressed data read merge check is doing something
wrong.
od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=36864 got em start=32768 len=32768
od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=40960 got em start=32768 len=32768
od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=45056 got em start=32768 len=32768
od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=49152 got em start=32768 len=32768
od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=53248 got em start=32768 len=32768
od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=57344 got em start=32768 len=32768
od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=61440 skip uptodate
od-3457 btrfs_submit_compressed_read: cb orig_bio: file off=0 len=61440
The function btrfs_submit_compressed_read() is only called at the end of
folio read. The compressed bio will only have an extent map of range [0,
32K), but the original bio passed in is for the whole 64K folio.
This will cause the decompression part to only fill the first 32K,
leaving the rest untouched (aka, filled with zero).
This incorrect compressed read merge leads to the above data corruption.
Similar problems have happened in the past; commit 808f80b46790
("Btrfs: update fix for read corruption of compressed and shared
extents") did pretty much the same fix for readahead.
But that was back in 2015, when btrfs still only supported bs (block
size) == ps (page size) cases.
This means btrfs_do_readpage() only needs to handle a folio which
contains exactly one block.
Only btrfs_readahead() can lead to a read covering multiple blocks.
Thus only btrfs_readahead() passes a non-NULL @prev_em_start pointer.
With the v5.15 kernel, btrfs introduced bs < ps support. This breaks the above
assumption that a folio can only contain one block.
Now btrfs_read_folio() can also read multiple blocks in one go.
But btrfs_read_folio() doesn't pass a @prev_em_start pointer, thus the
existing bio force submission check will never be triggered.
In theory, this can also happen for btrfs with large folios, but since
large folio support is still experimental, we don't need to bother with it,
thus only bs < ps support is affected for now.
[FIX]
Instead of passing @prev_em_start to do the proper compressed extent
check, introduce one new member, btrfs_bio_ctrl::last_em_start, so that
the existing bio force submission logic will always be triggered.
CC: stable(a)vger.kernel.org #5.15+
Signed-off-by: Qu Wenruo <wqu(a)suse.com>
---
Changelog:
v3:
- Use a single btrfs_bio_ctrl::last_em_start
Since the existing prev_em_start is using U64_MAX as the initial value
to indicate no em hit yet, we can use the same logic, which saves
another 8 bytes from btrfs_bio_ctrl.
- Update the reproducer
Previously I failed to reproduce using a minimal workload.
As regular reads from od/md5sum always trigger readahead, thus not
hitting the @prev_em_start == NULL path.
Will send out an fstests test case for it using the minimal reproducer.
But it will still need a 16K/64K page-sized system to reproduce.
Fix the problem by doing a block-aligned write into the folio, so
that the folio will be partially dirty and not go through the
readahead path.
- Update the analysis
This includes the trace events of the minimal reproducer, and
mentioning of previous similar fixes and why they do not work for
subpage cases.
- Update the CC tag
Since it's only affecting bs < ps cases (for non-experimental builds),
only need to fix kernels with subpage btrfs support.
v2:
- Only save extent_map::start/len to save memory for btrfs_bio_ctrl
It's using on-stack memory which is very limited inside the kernel.
- Remove the commit message mentioning of clearing last saved em
Since we're using em::start/len, there is no need to clear them.
Either we hit the same em::start/len, meaning hitting the same extent
map, or we hit a different em, which will have a different start/len.
---
fs/btrfs/extent_io.c | 40 ++++++++++++++++++++++++++++++----------
1 file changed, 30 insertions(+), 10 deletions(-)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 0c12fd64a1f3..b219dbaaedc8 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -131,6 +131,24 @@ struct btrfs_bio_ctrl {
*/
unsigned long submit_bitmap;
struct readahead_control *ractl;
+
+ /*
+ * The start file offset of the last hit extent map.
+ *
+ * This is for proper compressed read merge.
+ * U64_MAX means no extent map hit yet.
+ *
+ * The current btrfs_bio_is_contig() only uses disk_bytenr as
+ * the condition to check if the read can be merged with previous
+ * bio, which is not correct. E.g. two file extents pointing to the
+ * same extent but with different offset.
+ *
+ * So here we need to do extra check to only merge reads that are
+ * covered by the same extent map.
+ * Just extent_map::start will be enough, as they are unique
+ * inside the same inode.
+ */
+ u64 last_em_start;
};
/*
@@ -965,7 +983,7 @@ static void btrfs_readahead_expand(struct readahead_control *ractl,
* return 0 on success, otherwise return error
*/
static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
- struct btrfs_bio_ctrl *bio_ctrl, u64 *prev_em_start)
+ struct btrfs_bio_ctrl *bio_ctrl)
{
struct inode *inode = folio->mapping->host;
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
@@ -1076,12 +1094,11 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
* non-optimal behavior (submitting 2 bios for the same extent).
*/
if (compress_type != BTRFS_COMPRESS_NONE &&
- prev_em_start && *prev_em_start != (u64)-1 &&
- *prev_em_start != em->start)
+ bio_ctrl->last_em_start != U64_MAX &&
+ bio_ctrl->last_em_start != em->start)
force_bio_submit = true;
- if (prev_em_start)
- *prev_em_start = em->start;
+ bio_ctrl->last_em_start = em->start;
em_gen = em->generation;
btrfs_free_extent_map(em);
@@ -1296,12 +1313,15 @@ int btrfs_read_folio(struct file *file, struct folio *folio)
const u64 start = folio_pos(folio);
const u64 end = start + folio_size(folio) - 1;
struct extent_state *cached_state = NULL;
- struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ };
+ struct btrfs_bio_ctrl bio_ctrl = {
+ .opf = REQ_OP_READ,
+ .last_em_start = U64_MAX,
+ };
struct extent_map *em_cached = NULL;
int ret;
lock_extents_for_read(inode, start, end, &cached_state);
- ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, NULL);
+ ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl);
btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state);
btrfs_free_extent_map(em_cached);
@@ -2641,7 +2661,8 @@ void btrfs_readahead(struct readahead_control *rac)
{
struct btrfs_bio_ctrl bio_ctrl = {
.opf = REQ_OP_READ | REQ_RAHEAD,
- .ractl = rac
+ .ractl = rac,
+ .last_em_start = U64_MAX,
};
struct folio *folio;
struct btrfs_inode *inode = BTRFS_I(rac->mapping->host);
@@ -2649,12 +2670,11 @@ void btrfs_readahead(struct readahead_control *rac)
const u64 end = start + readahead_length(rac) - 1;
struct extent_state *cached_state = NULL;
struct extent_map *em_cached = NULL;
- u64 prev_em_start = (u64)-1;
lock_extents_for_read(inode, start, end, &cached_state);
while ((folio = readahead_folio(rac)) != NULL)
- btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start);
+ btrfs_do_readpage(folio, &em_cached, &bio_ctrl);
btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state);
--
2.50.1
ENODATA (aka ENOATTR) has a very specific meaning in the xfs xattr code;
namely, that the requested attribute name could not be found.
However, a medium error from disk may also return ENODATA. At best,
this medium error may escape to userspace as "attribute not found"
when in fact it's an IO (disk) error.
At worst, we may oops in xfs_attr_leaf_get() when we do:
error = xfs_attr_leaf_hasname(args, &bp);
if (error == -ENOATTR) {
        xfs_trans_brelse(args->trans, bp);
        return error;
}
because an ENODATA/ENOATTR error from disk leaves us with a null bp,
and the xfs_trans_brelse will then null-deref it.
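For readers who have not run into it before, the reason the two cases
collide at all is that in the kernel's XFS code ENOATTR is defined as
an alias for ENODATA, so the errno value alone cannot distinguish a
disk medium error from a missing attribute. A minimal userspace
illustration (the #define mirrors the kernel's alias and is not part
of this patch; userspace headers on Linux typically do not define
ENOATTR):

#include <errno.h>
#include <stdio.h>

#ifndef ENOATTR
#define ENOATTR ENODATA         /* same alias the XFS code uses */
#endif

int main(void)
{
        /* Both print the same number, hence the ambiguity. */
        printf("ENODATA = %d, ENOATTR = %d\n", ENODATA, ENOATTR);
        return 0;
}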
As discussed on the list, we really need to modify the lower level
IO functions to trap all disk errors and ensure that we don't let
unique errors like this leak up into higher xfs functions - many
like this should be remapped to EIO.
However, this patch directly addresses a reported bug in the xattr
code, and should be safe to backport to stable kernels. A larger-scope
patch to handle more unique errors at lower levels can follow later.
(Note, prior to 07120f1abdff we did not oops, but we did return the
wrong error code to userspace.)
Signed-off-by: Eric Sandeen <sandeen(a)redhat.com>
Fixes: 07120f1abdff ("xfs: Add xfs_has_attr and subroutines")
Cc: <stable(a)vger.kernel.org> # v5.9+
---
V2: Remove the extraneous trap point as pointed out by djwong, oops.
(I get it that sprinkling this around in 2 places might have an ick
factor but I think it's necessary to make a surgical strike on this bug
before we address the general problem.)
Thanks,
-Eric
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index 4c44ce1c8a64..bff3dc226f81 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -435,6 +435,13 @@ xfs_attr_rmtval_get(
0, &bp, &xfs_attr3_rmt_buf_ops);
if (xfs_metadata_is_sick(error))
xfs_dirattr_mark_sick(args->dp, XFS_ATTR_FORK);
+ /*
+ * ENODATA from disk implies a disk medium failure;
+ * ENODATA for xattrs means attribute not found, so
+ * disambiguate that here.
+ */
+ if (error == -ENODATA)
+ error = -EIO;
if (error)
return error;
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 17d9e6154f19..723a0643b838 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -2833,6 +2833,12 @@ xfs_da_read_buf(
&bp, ops);
if (xfs_metadata_is_sick(error))
xfs_dirattr_mark_sick(dp, whichfork);
+ /*
+ * ENODATA from disk implies a disk medium failure; ENODATA for
+ * xattrs means attribute not found, so disambiguate that here.
+ */
+ if (error == -ENODATA && whichfork == XFS_ATTR_FORK)
+ error = -EIO;
if (error)
goto out_free;
When restoring a reservation for an anonymous page, we need to check
whether we are freeing a surplus page. However,
__unmap_hugepage_range() causes a data race because it reads
h->surplus_huge_pages without holding hugetlb_lock.
In addition, adjust_reservation is a boolean variable that indicates
whether the reservation for the anonymous page in each folio should be
restored, so it should be reset to false on every iteration of the
loop. However, it is currently initialized only once, at its
definition, outside the loop.
This means that once adjust_reservation is set to true in any
iteration, reservations for anonymous pages will be restored
unconditionally in all subsequent iterations, regardless of the
folio's state.
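A minimal, self-contained sketch of that stale-flag pattern (plain C,
entirely independent of the hugetlb code) shows why the flag must be
reset on every round:

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
        bool adjust;    /* declared once, like adjust_reservation before this fix */

        for (int i = 0; i < 3; i++) {
                /* The fix adds the equivalent of "adjust = false;" here. */
                if (i == 0)
                        adjust = true;  /* only round 0 actually qualifies */

                if (adjust)
                        printf("round %d: restoring reservation\n", i);
        }
        return 0;
}

Without the reset, every round after the first prints as well, which
is exactly the unconditional restore described above.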
To fix this, add the missing hugetlb_lock, unlock the page_table_lock
earlier so that hugetlb_lock is not taken while the page_table_lock is
held, and initialize adjust_reservation to false on each iteration of
the loop.
Cc: <stable(a)vger.kernel.org>
Reported-by: syzbot+417aeb05fd190f3a6da9(a)syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=417aeb05fd190f3a6da9
Fixes: df7a6d1f6405 ("mm/hugetlb: restore the reservation if needed")
Signed-off-by: Jeongjun Park <aha310510(a)gmail.com>
---
v2: Fix issues with changing the page_table_lock unlock location and initializing adjust_reservation
- Link to v1: https://lore.kernel.org/all/20250822055857.1142454-1-aha310510@gmail.com/
---
mm/hugetlb.c | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 753f99b4c718..eed59cfb5d21 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5851,7 +5851,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
spinlock_t *ptl;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
- bool adjust_reservation = false;
+ bool adjust_reservation;
unsigned long last_addr_mask;
bool force_flush = false;
@@ -5944,6 +5944,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
sz);
hugetlb_count_sub(pages_per_huge_page(h), mm);
hugetlb_remove_rmap(folio);
+ spin_unlock(ptl);
/*
* Restore the reservation for anonymous page, otherwise the
@@ -5951,14 +5952,16 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
* If there we are freeing a surplus, do not set the restore
* reservation bit.
*/
+ adjust_reservation = false;
+
+ spin_lock_irq(&hugetlb_lock);
if (!h->surplus_huge_pages && __vma_private_lock(vma) &&
folio_test_anon(folio)) {
folio_set_hugetlb_restore_reserve(folio);
/* Reservation to be adjusted after the spin lock */
adjust_reservation = true;
}
-
- spin_unlock(ptl);
+ spin_unlock_irq(&hugetlb_lock);
/*
* Adjust the reservation for the region that will have the
--
From: Lance Yang <lance.yang(a)linux.dev>
The blocker tracking mechanism assumes that lock pointers are at least
4-byte aligned to use their lower bits for type encoding.
However, as reported by Geert Uytterhoeven, some architectures like m68k
only guarantee 2-byte alignment of 32-bit values. This breaks the
assumption and causes two related WARN_ON_ONCE checks to trigger.
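To make the failure mode concrete, here is a minimal sketch of the
pointer-tagging idea the tracking relies on (hypothetical names, not
the actual hung_task code): the low two bits of the lock address carry
the lock type, which is only safe if those bits are guaranteed to be
zero, i.e. the lock is at least 4-byte aligned.

#include <assert.h>
#include <stdint.h>

#define TYPE_MASK 0x3UL         /* low two bits encode the lock type */

/* Valid only for pointers whose low two bits are clear.  On an
 * architecture that may place a 32-bit lock at a merely
 * 2-byte-aligned address (e.g. m68k), the assertion can fire; this is
 * the analogue of the WARN_ON_ONCE mentioned above. */
static uintptr_t encode_blocker(const void *lock, unsigned int type)
{
        uintptr_t addr = (uintptr_t)lock;

        assert((addr & TYPE_MASK) == 0);
        assert((type & ~TYPE_MASK) == 0);
        return addr | type;
}

static void *decode_blocker(uintptr_t enc, unsigned int *type)
{
        *type = enc & TYPE_MASK;
        return (void *)(enc & ~TYPE_MASK);
}

int main(void)
{
        /* Mimics the __aligned(4) annotation added by this patch. */
        static int lock __attribute__((aligned(4)));
        unsigned int type;
        uintptr_t enc = encode_blocker(&lock, 2);

        return (decode_blocker(enc, &type) == &lock && type == 2) ? 0 : 1;
}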
To fix this, enforce a minimum of 4-byte alignment on the core lock
structures supported by the blocker tracking mechanism. This ensures the
algorithm's alignment assumption now holds true on all architectures.
This patch adds __aligned(4) to the definitions of "struct mutex",
"struct semaphore", and "struct rw_semaphore", resolving the warnings.
Thanks to Geert for bisecting!
Reported-by: Geert Uytterhoeven <geert(a)linux-m68k.org>
Closes: https://lore.kernel.org/lkml/CAMuHMdW7Ab13DdGs2acMQcix5ObJK0O2dG_Fxzr8_g58R…
Fixes: e711faaafbe5 ("hung_task: replace blocker_mutex with encoded blocker")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Lance Yang <lance.yang(a)linux.dev>
---
include/linux/mutex_types.h | 2 +-
include/linux/rwsem.h | 2 +-
include/linux/semaphore.h | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/include/linux/mutex_types.h b/include/linux/mutex_types.h
index fdf7f515fde8..de798bfbc4c7 100644
--- a/include/linux/mutex_types.h
+++ b/include/linux/mutex_types.h
@@ -51,7 +51,7 @@ struct mutex {
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
#endif
-};
+} __aligned(4); /* For hung_task blocker tracking, which encodes type in LSBs */
#else /* !CONFIG_PREEMPT_RT */
/*
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index f1aaf676a874..f6ecf4a4710d 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -64,7 +64,7 @@ struct rw_semaphore {
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
#endif
-};
+} __aligned(4); /* For hung_task blocker tracking, which encodes type in LSBs */
#define RWSEM_UNLOCKED_VALUE 0UL
#define RWSEM_WRITER_LOCKED (1UL << 0)
diff --git a/include/linux/semaphore.h b/include/linux/semaphore.h
index 89706157e622..ac9b9c87bfb7 100644
--- a/include/linux/semaphore.h
+++ b/include/linux/semaphore.h
@@ -20,7 +20,7 @@ struct semaphore {
#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
unsigned long last_holder;
#endif
-};
+} __aligned(4); /* For hung_task blocker tracking, which encodes type in LSBs */
#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
#define __LAST_HOLDER_SEMAPHORE_INITIALIZER \
--
2.49.0