The hwprobe vDSO data for some keys, like MISALIGNED_VECTOR_PERF,
is determined by an asynchronous kthread. This can create a race
condition where the kthread finishes after the vDSO data has
already been populated, causing userspace to read stale values.

To fix this, a completion-based framework is introduced to robustly
synchronize the async probes with the vDSO data population. The
waiting function, init_hwprobe_vdso_data(), now blocks on
wait_for_completion() until all probes signal they are done.

Furthermore, to prevent this potential blocking from impacting boot
performance, the initialization is deferred to late_initcall. This
is safe as the data is only required by userspace (which starts
much later) and moves the synchronization delay off the critical
boot path.
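
For illustration, here is a minimal userspace sketch of the same
"sentinel-count" pattern, using C11 atomics and POSIX threads in place
of the kernel's completion API (all names are illustrative, not kernel
symbols; the counter starts at 1 so the waiter's own decrement acts as
the sentinel):

  #include <pthread.h>
  #include <stdatomic.h>
  #include <stdio.h>

  static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
  static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
  static atomic_int pending = 1;  /* 1 == the waiter's sentinel */
  static int completed;

  static void complete_async_probe(void)
  {
          if (atomic_fetch_sub(&pending, 1) == 1) {  /* last one signals */
                  pthread_mutex_lock(&lock);
                  completed = 1;
                  pthread_cond_signal(&cond);
                  pthread_mutex_unlock(&lock);
          }
  }

  static void *async_probe(void *arg)
  {
          (void)arg;
          /* ... measure unaligned access speed here ... */
          complete_async_probe();
          return NULL;
  }

  int main(void)
  {
          pthread_t t;

          atomic_fetch_add(&pending, 1);  /* register_async_probe() */
          pthread_create(&t, NULL, async_probe, NULL);

          /* Drop the sentinel; wait only if a probe is still pending. */
          if (atomic_fetch_sub(&pending, 1) != 1) {
                  pthread_mutex_lock(&lock);
                  while (!completed)
                          pthread_cond_wait(&cond, &lock);
                  pthread_mutex_unlock(&lock);
          }
          printf("all probes done; safe to populate vDSO data\n");
          pthread_join(t, NULL);
          return 0;
  }
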
Reported-by: Tsukasa OI <research_trasio(a)irq.a4lg.com>
Closes: https://lore.kernel.org/linux-riscv/760d637b-b13b-4518-b6bf-883d55d44e7f@ir…
Fixes: e7c9d66e313b ("RISC-V: Report vector unaligned access speed hwprobe")
Cc: Palmer Dabbelt <palmer(a)dabbelt.com>
Cc: Alexandre Ghiti <alexghiti(a)rivosinc.com>
Cc: Olof Johansson <olof(a)lixom.net>
Cc: stable(a)vger.kernel.org
Signed-off-by: Jingwei Wang <wangjingwei(a)iscas.ac.cn>
---
Changes in v5:
- Reworked the synchronization logic to a robust "sentinel-count"
pattern based on feedback from Alexandre.
- Fixed a "multiple definition" linker error for nommu builds by changing
the header-file stub functions to `static inline`, as pointed out by Olof.
- Updated the commit message to better explain the rationale for moving
the vDSO initialization to `late_initcall`.
Changes in v4:
- Reworked the synchronization mechanism based on feedback from Palmer
and Alexandre.
- Instead of a post-hoc refresh, this version introduces a robust
completion-based framework using an atomic counter to ensure async
probes are finished before populating the vDSO.
- Moved the vdso data initialization to a late_initcall to avoid
impacting boot time.
Changes in v3:
- Retained existing blank line.
Changes in v2:
- Addressed feedback from Yixun regarding #ifdef CONFIG_MMU usage.
- Updated commit message to provide a high-level summary.
- Added Fixes tag for commit e7c9d66e313b.
v1: https://lore.kernel.org/linux-riscv/20250521052754.185231-1-wangjingwei@isc…
arch/riscv/include/asm/hwprobe.h | 8 +++++++-
arch/riscv/kernel/sys_hwprobe.c | 20 +++++++++++++++++++-
arch/riscv/kernel/unaligned_access_speed.c | 9 +++++++--
3 files changed, 33 insertions(+), 4 deletions(-)
diff --git a/arch/riscv/include/asm/hwprobe.h b/arch/riscv/include/asm/hwprobe.h
index 7fe0a379474ae2c6..3b2888126e659ea1 100644
--- a/arch/riscv/include/asm/hwprobe.h
+++ b/arch/riscv/include/asm/hwprobe.h
@@ -40,5 +40,11 @@ static inline bool riscv_hwprobe_pair_cmp(struct riscv_hwprobe *pair,
return pair->value == other_pair->value;
}
-
+#ifdef CONFIG_MMU
+void riscv_hwprobe_register_async_probe(void);
+void riscv_hwprobe_complete_async_probe(void);
+#else
+static inline void riscv_hwprobe_register_async_probe(void) {}
+static inline void riscv_hwprobe_complete_async_probe(void) {}
+#endif
#endif
diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c
index 0b170e18a2beba57..ee02aeb03e7bd3d8 100644
--- a/arch/riscv/kernel/sys_hwprobe.c
+++ b/arch/riscv/kernel/sys_hwprobe.c
@@ -5,6 +5,8 @@
* more details.
*/
#include <linux/syscalls.h>
+#include <linux/completion.h>
+#include <linux/atomic.h>
#include <asm/cacheflush.h>
#include <asm/cpufeature.h>
#include <asm/hwprobe.h>
@@ -467,6 +469,20 @@ static int do_riscv_hwprobe(struct riscv_hwprobe __user *pairs,
#ifdef CONFIG_MMU
+static DECLARE_COMPLETION(boot_probes_done);
+static atomic_t pending_boot_probes = ATOMIC_INIT(1);
+
+void riscv_hwprobe_register_async_probe(void)
+{
+ atomic_inc(&pending_boot_probes);
+}
+
+void riscv_hwprobe_complete_async_probe(void)
+{
+ if (atomic_dec_and_test(&pending_boot_probes))
+ complete(&boot_probes_done);
+}
+
static int __init init_hwprobe_vdso_data(void)
{
struct vdso_arch_data *avd = vdso_k_arch_data;
@@ -474,6 +490,8 @@ static int __init init_hwprobe_vdso_data(void)
struct riscv_hwprobe pair;
int key;
+ if (unlikely(!atomic_dec_and_test(&pending_boot_probes)))
+ wait_for_completion(&boot_probes_done);
/*
* Initialize vDSO data with the answers for the "all CPUs" case, to
* save a syscall in the common case.
@@ -504,7 +522,7 @@ static int __init init_hwprobe_vdso_data(void)
return 0;
}
-arch_initcall_sync(init_hwprobe_vdso_data);
+late_initcall(init_hwprobe_vdso_data);
#endif /* CONFIG_MMU */
diff --git a/arch/riscv/kernel/unaligned_access_speed.c b/arch/riscv/kernel/unaligned_access_speed.c
index ae2068425fbcd207..4b8ad2673b0f7470 100644
--- a/arch/riscv/kernel/unaligned_access_speed.c
+++ b/arch/riscv/kernel/unaligned_access_speed.c
@@ -379,6 +379,7 @@ static void check_vector_unaligned_access(struct work_struct *work __always_unus
static int __init vec_check_unaligned_access_speed_all_cpus(void *unused __always_unused)
{
schedule_on_each_cpu(check_vector_unaligned_access);
+ riscv_hwprobe_complete_async_probe();
return 0;
}
@@ -473,8 +474,12 @@ static int __init check_unaligned_access_all_cpus(void)
per_cpu(vector_misaligned_access, cpu) = unaligned_vector_speed_param;
} else if (!check_vector_unaligned_access_emulated_all_cpus() &&
IS_ENABLED(CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS)) {
- kthread_run(vec_check_unaligned_access_speed_all_cpus,
- NULL, "vec_check_unaligned_access_speed_all_cpus");
+ riscv_hwprobe_register_async_probe();
+ if (IS_ERR(kthread_run(vec_check_unaligned_access_speed_all_cpus,
+ NULL, "vec_check_unaligned_access_speed_all_cpus"))) {
+ pr_warn("Failed to create vec_unalign_check kthread\n");
+ riscv_hwprobe_complete_async_probe();
+ }
}
/*
--
2.50.0
After a recent change in clang to expose uninitialized warnings from
const variables [1], there is a warning in cxacru_heavy_init():
drivers/usb/atm/cxacru.c:1104:6: error: variable 'bp' is used uninitialized whenever 'if' condition is false [-Werror,-Wsometimes-uninitialized]
1104 | if (instance->modem_type->boot_rom_patch) {
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
drivers/usb/atm/cxacru.c:1113:39: note: uninitialized use occurs here
1113 | cxacru_upload_firmware(instance, fw, bp);
| ^~
drivers/usb/atm/cxacru.c:1104:2: note: remove the 'if' if its condition is always true
1104 | if (instance->modem_type->boot_rom_patch) {
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
drivers/usb/atm/cxacru.c:1095:32: note: initialize the variable 'bp' to silence this warning
1095 | const struct firmware *fw, *bp;
| ^
| = NULL

This warning occurs in clang's frontend before inlining occurs, so it
cannot notice that bp is only used within cxacru_upload_firmware() under
the same condition that initializes it in cxacru_heavy_init().

Just initialize bp to NULL to silence the warning without functionally
changing the code, which is what happens with modern compilers when they
support '-ftrivial-auto-var-init=zero' (CONFIG_INIT_STACK_ALL_ZERO=y).
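
The shape of the diagnostic is easy to reproduce in isolation; a
hypothetical minimal example (not part of the driver), built with
something like "clang -Wsometimes-uninitialized -fsyntax-only repro.c":

  int use(const char *p);

  int f(int cond)
  {
          const char *p;         /* no initializer, like 'bp' above */

          if (cond)
                  p = "patched"; /* only assigned when the branch is taken */

          /* clang: variable 'p' is used uninitialized whenever 'if'
           * condition is false [-Wsometimes-uninitialized]
           */
          return use(p);
  }
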
Cc: stable(a)vger.kernel.org
Fixes: 1b0e61465234 ("[PATCH] USB ATM: driver for the Conexant AccessRunner chipset cxacru")
Closes: https://github.com/ClangBuiltLinux/linux/issues/2102
Link: https://github.com/llvm/llvm-project/commit/2464313eef01c5b1edf0eccf57a32cd… [1]
Signed-off-by: Nathan Chancellor <nathan(a)kernel.org>
---
drivers/usb/atm/cxacru.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/usb/atm/cxacru.c b/drivers/usb/atm/cxacru.c
index a12ab90b3db7..b7c3b224a759 100644
--- a/drivers/usb/atm/cxacru.c
+++ b/drivers/usb/atm/cxacru.c
@@ -1092,7 +1092,7 @@ static int cxacru_find_firmware(struct cxacru_data *instance,
static int cxacru_heavy_init(struct usbatm_data *usbatm_instance,
struct usb_interface *usb_intf)
{
- const struct firmware *fw, *bp;
+ const struct firmware *fw, *bp = NULL;
struct cxacru_data *instance = usbatm_instance->driver_data;
int ret = cxacru_find_firmware(instance, "fw", &fw);
---
base-commit: fdfa018c6962c86d2faa183187669569be4d513f
change-id: 20250715-usb-cxacru-fix-clang-21-uninit-warning-9430d96c6bc1
Best regards,
--
Nathan Chancellor <nathan(a)kernel.org>
mptcp_connect.sh can be executed manually with "-m <MODE>" and "-C" to
make sure everything works as expected when using "mmap" and "sendfile"
modes instead of "poll", and with the MPTCP checksum support.
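
For instance (assuming the script is run from its directory in the
selftests tree), these non-default setups can be exercised with:

  ./mptcp_connect.sh -m mmap      # mmap mode instead of poll
  ./mptcp_connect.sh -m sendfile  # sendfile mode
  ./mptcp_connect.sh -C           # MPTCP checksum support
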
These modes should be validated, but they are not when the selftests are
executed via the kselftest helpers. This means that most CIs validating
these selftests, like NIPA for the net development trees and LKFT for
the stable ones, are not covering these modes.

To fix that, new test programs have been added, simply calling
mptcp_connect.sh with the right parameters.

The first patch can be backported up to v5.6, and the second one up to
v5.14.
Signed-off-by: Matthieu Baerts (NGI0) <matttbe(a)kernel.org>
---
Changes in v2:
- force using a different prefix in the subtests to avoid having the
same test names in all mptcp_connect*.sh selftests.
- Link to v1: https://lore.kernel.org/r/20250714-net-mptcp-sft-connect-alt-v1-0-bf1c5abbe…
---
Matthieu Baerts (NGI0) (2):
selftests: mptcp: connect: also cover alt modes
selftests: mptcp: connect: also cover checksum
tools/testing/selftests/net/mptcp/Makefile | 3 ++-
tools/testing/selftests/net/mptcp/mptcp_connect_checksum.sh | 5 +++++
tools/testing/selftests/net/mptcp/mptcp_connect_mmap.sh | 5 +++++
tools/testing/selftests/net/mptcp/mptcp_connect_sendfile.sh | 5 +++++
4 files changed, 17 insertions(+), 1 deletion(-)
---
base-commit: b640daa2822a39ff76e70200cb2b7b892b896dce
change-id: 20250714-net-mptcp-sft-connect-alt-c1aaf073ef4e
Best regards,
--
Matthieu Baerts (NGI0) <matttbe(a)kernel.org>
From: Lance Yang <lance.yang(a)linux.dev>
As pointed out by David[1], the batched unmap logic in try_to_unmap_one()
may read past the end of a PTE table when a large folio's PTE mappings
are not fully contained within a single page table.

While this scenario might be rare, an issue triggerable from userspace must
be fixed regardless of its likelihood. This patch fixes the out-of-bounds
access by refactoring the logic into a new helper, folio_unmap_pte_batch().

The new helper correctly calculates the safe batch size by capping the scan
at both the VMA and PMD boundaries. To simplify the code, it also supports
partial batching (i.e., any number of pages from 1 up to the calculated
safe maximum), as there is no strong reason to special-case for fully
mapped folios.
[1] https://lore.kernel.org/linux-mm/a694398c-9f03-4737-81b9-7e49c857fcbe@redha…
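
To make the capping concrete, here is a small standalone sketch of the
boundary arithmetic (assuming 4 KiB pages and 2 MiB of address space per
PTE table, as on x86-64 or arm64 with 4K pages; the addresses below are
made up for illustration):

  #include <stdio.h>

  int main(void)
  {
          const unsigned long PMD_SIZE = 1UL << 21;  /* 2 MiB per PTE table */
          const unsigned long PMD_MASK = ~(PMD_SIZE - 1);
          const int PAGE_SHIFT = 12;                 /* 4 KiB pages */

          /* A 64-page folio mapped so only 16 PTEs remain in this table. */
          unsigned long addr   = 0x2001f0000UL;
          unsigned long vm_end = 0x200400000UL;

          /* Mirrors pmd_addr_end(): next PMD boundary, capped at vm_end. */
          unsigned long boundary = (addr + PMD_SIZE) & PMD_MASK;
          unsigned long end_addr = boundary < vm_end ? boundary : vm_end;
          unsigned int max_nr = (end_addr - addr) >> PAGE_SHIFT;

          /* Prints 16: the scan stops at the table's end, not at 64. */
          printf("max_nr = %u\n", max_nr);
          return 0;
  }
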
Cc: <stable(a)vger.kernel.org>
Reported-by: David Hildenbrand <david(a)redhat.com>
Closes: https://lore.kernel.org/linux-mm/a694398c-9f03-4737-81b9-7e49c857fcbe@redha…
Fixes: 354dffd29575 ("mm: support batched unmap for lazyfree large folios during reclamation")
Suggested-by: Barry Song <baohua(a)kernel.org>
Acked-by: Barry Song <baohua(a)kernel.org>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
Acked-by: David Hildenbrand <david(a)redhat.com>
Signed-off-by: Lance Yang <lance.yang(a)linux.dev>
---
v3 -> v4:
- Add Reported-by + Closes tags (per David)
- Pick RB from Lorenzo - thanks!
- Pick AB from David - thanks!
- https://lore.kernel.org/linux-mm/20250630011305.23754-1-lance.yang@linux.dev
v2 -> v3:
- Tweak changelog (per Barry and David)
- Pick AB from Barry - thanks!
- https://lore.kernel.org/linux-mm/20250627062319.84936-1-lance.yang@linux.dev
v1 -> v2:
- Update subject and changelog (per Barry)
- https://lore.kernel.org/linux-mm/20250627025214.30887-1-lance.yang@linux.dev
mm/rmap.c | 46 ++++++++++++++++++++++++++++------------------
1 file changed, 28 insertions(+), 18 deletions(-)
diff --git a/mm/rmap.c b/mm/rmap.c
index fb63d9256f09..1320b88fab74 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1845,23 +1845,32 @@ void folio_remove_rmap_pud(struct folio *folio, struct page *page,
#endif
}
-/* We support batch unmapping of PTEs for lazyfree large folios */
-static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
- struct folio *folio, pte_t *ptep)
+static inline unsigned int folio_unmap_pte_batch(struct folio *folio,
+ struct page_vma_mapped_walk *pvmw,
+ enum ttu_flags flags, pte_t pte)
{
const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
- int max_nr = folio_nr_pages(folio);
- pte_t pte = ptep_get(ptep);
+ unsigned long end_addr, addr = pvmw->address;
+ struct vm_area_struct *vma = pvmw->vma;
+ unsigned int max_nr;
+
+ if (flags & TTU_HWPOISON)
+ return 1;
+ if (!folio_test_large(folio))
+ return 1;
+ /* We may only batch within a single VMA and a single page table. */
+ end_addr = pmd_addr_end(addr, vma->vm_end);
+ max_nr = (end_addr - addr) >> PAGE_SHIFT;
+
+ /* We only support lazyfree batching for now ... */
if (!folio_test_anon(folio) || folio_test_swapbacked(folio))
- return false;
+ return 1;
if (pte_unused(pte))
- return false;
- if (pte_pfn(pte) != folio_pfn(folio))
- return false;
+ return 1;
- return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL,
- NULL, NULL) == max_nr;
+ return folio_pte_batch(folio, addr, pvmw->pte, pte, max_nr, fpb_flags,
+ NULL, NULL, NULL);
}
/*
@@ -2024,9 +2033,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
if (pte_dirty(pteval))
folio_mark_dirty(folio);
} else if (likely(pte_present(pteval))) {
- if (folio_test_large(folio) && !(flags & TTU_HWPOISON) &&
- can_batch_unmap_folio_ptes(address, folio, pvmw.pte))
- nr_pages = folio_nr_pages(folio);
+ nr_pages = folio_unmap_pte_batch(folio, &pvmw, flags, pteval);
end_addr = address + nr_pages * PAGE_SIZE;
flush_cache_range(vma, address, end_addr);
@@ -2206,13 +2213,16 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
hugetlb_remove_rmap(folio);
} else {
folio_remove_rmap_ptes(folio, subpage, nr_pages, vma);
- folio_ref_sub(folio, nr_pages - 1);
}
if (vma->vm_flags & VM_LOCKED)
mlock_drain_local();
- folio_put(folio);
- /* We have already batched the entire folio */
- if (nr_pages > 1)
+ folio_put_refs(folio, nr_pages);
+
+ /*
+ * If we are sure that we batched the entire folio and cleared
+ * all PTEs, we can just optimize and stop right here.
+ */
+ if (nr_pages == folio_nr_pages(folio))
goto walk_done;
continue;
walk_abort:
--
2.49.0