The value for some hwprobe keys, like MISALIGNED_VECTOR_PERF, is
determined by an asynchronous kthread. This kthread can finish after
the hwprobe vDSO data is populated, creating a race condition where
userspace can read stale values.
A completion-based framework is introduced to synchronize the async
probes with the vDSO population. The init_hwprobe_vdso_data()
function is deferred to `late_initcall` and now blocks until all
probes signal completion.
Reported-by: Tsukasa OI <research_trasio(a)irq.a4lg.com>
Closes: https://lore.kernel.org/linux-riscv/760d637b-b13b-4518-b6bf-883d55d44e7f@ir…
Fixes: e7c9d66e313b ("RISC-V: Report vector unaligned access speed hwprobe")
Cc: Palmer Dabbelt <palmer(a)dabbelt.com>
Cc: Alexandre Ghiti <alexghiti(a)rivosinc.com>
Cc: stable(a)vger.kernel.org
Signed-off-by: Jingwei Wang <wangjingwei(a)iscas.ac.cn>
---
Changes in v4:
- Reworked the synchronization mechanism based on feedback from Palmer
and Alexandre.
- Instead of a post-hoc refresh, this version introduces a robust
completion-based framework using an atomic counter to ensure async
probes are finished before populating the vDSO.
- Moved the vdso data initialization to a late_initcall to avoid
impacting boot time.
Changes in v3:
- Retained existing blank line.
Changes in v2:
- Addressed feedback from Yixun's regarding #ifdef CONFIG_MMU usage.
- Updated commit message to provide a high-level summary.
- Added Fixes tag for commit e7c9d66e313b.
v1: https://lore.kernel.org/linux-riscv/20250521052754.185231-1-wangjingwei@isc…
arch/riscv/include/asm/hwprobe.h | 8 +++++++-
arch/riscv/kernel/sys_hwprobe.c | 20 +++++++++++++++++++-
arch/riscv/kernel/unaligned_access_speed.c | 9 +++++++--
3 files changed, 33 insertions(+), 4 deletions(-)
diff --git a/arch/riscv/include/asm/hwprobe.h b/arch/riscv/include/asm/hwprobe.h
index 7fe0a379474ae2c6..87af186d92e75ddb 100644
--- a/arch/riscv/include/asm/hwprobe.h
+++ b/arch/riscv/include/asm/hwprobe.h
@@ -40,5 +40,11 @@ static inline bool riscv_hwprobe_pair_cmp(struct riscv_hwprobe *pair,
return pair->value == other_pair->value;
}
-
+#ifdef CONFIG_MMU
+void riscv_hwprobe_register_async_probe(void);
+void riscv_hwprobe_complete_async_probe(void);
+#else
+inline void riscv_hwprobe_register_async_probe(void) {}
+inline void riscv_hwprobe_complete_async_probe(void) {}
+#endif
#endif
diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c
index 0b170e18a2beba57..8c50dcec2b754c30 100644
--- a/arch/riscv/kernel/sys_hwprobe.c
+++ b/arch/riscv/kernel/sys_hwprobe.c
@@ -5,6 +5,8 @@
* more details.
*/
#include <linux/syscalls.h>
+#include <linux/completion.h>
+#include <linux/atomic.h>
#include <asm/cacheflush.h>
#include <asm/cpufeature.h>
#include <asm/hwprobe.h>
@@ -467,6 +469,20 @@ static int do_riscv_hwprobe(struct riscv_hwprobe __user *pairs,
#ifdef CONFIG_MMU
+static DECLARE_COMPLETION(boot_probes_done);
+static atomic_t pending_boot_probes = ATOMIC_INIT(0);
+
+void riscv_hwprobe_register_async_probe(void)
+{
+ atomic_inc(&pending_boot_probes);
+}
+
+void riscv_hwprobe_complete_async_probe(void)
+{
+ if (atomic_dec_and_test(&pending_boot_probes))
+ complete(&boot_probes_done);
+}
+
static int __init init_hwprobe_vdso_data(void)
{
struct vdso_arch_data *avd = vdso_k_arch_data;
@@ -474,6 +490,8 @@ static int __init init_hwprobe_vdso_data(void)
struct riscv_hwprobe pair;
int key;
+ if (unlikely(atomic_read(&pending_boot_probes) > 0))
+ wait_for_completion(&boot_probes_done);
/*
* Initialize vDSO data with the answers for the "all CPUs" case, to
* save a syscall in the common case.
@@ -504,7 +522,7 @@ static int __init init_hwprobe_vdso_data(void)
return 0;
}
-arch_initcall_sync(init_hwprobe_vdso_data);
+late_initcall(init_hwprobe_vdso_data);
#endif /* CONFIG_MMU */
diff --git a/arch/riscv/kernel/unaligned_access_speed.c b/arch/riscv/kernel/unaligned_access_speed.c
index ae2068425fbcd207..4b8ad2673b0f7470 100644
--- a/arch/riscv/kernel/unaligned_access_speed.c
+++ b/arch/riscv/kernel/unaligned_access_speed.c
@@ -379,6 +379,7 @@ static void check_vector_unaligned_access(struct work_struct *work __always_unus
static int __init vec_check_unaligned_access_speed_all_cpus(void *unused __always_unused)
{
schedule_on_each_cpu(check_vector_unaligned_access);
+ riscv_hwprobe_complete_async_probe();
return 0;
}
@@ -473,8 +474,12 @@ static int __init check_unaligned_access_all_cpus(void)
per_cpu(vector_misaligned_access, cpu) = unaligned_vector_speed_param;
} else if (!check_vector_unaligned_access_emulated_all_cpus() &&
IS_ENABLED(CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS)) {
- kthread_run(vec_check_unaligned_access_speed_all_cpus,
- NULL, "vec_check_unaligned_access_speed_all_cpus");
+ riscv_hwprobe_register_async_probe();
+ if (IS_ERR(kthread_run(vec_check_unaligned_access_speed_all_cpus,
+ NULL, "vec_check_unaligned_access_speed_all_cpus"))) {
+ pr_warn("Failed to create vec_unalign_check kthread\n");
+ riscv_hwprobe_complete_async_probe();
+ }
}
/*
--
2.50.0
This patch series introduces a few minor fixes on Exynos7870 devices.
These fix USB gadget problems and serious crashes on certain variants of
devices. More information is provided in respective commits.
This series has no dependencies. Would be nice to get them merged in
6.16 itself. I assume it's okay to cc stable as the -rc releases are
also owned by the "Stable Group" in git.kernel.org... [1] [2]
[1] https://git.kernel.org/?q=Stable+Group
[2] https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
Signed-off-by: Kaustabh Chakraborty <kauschluss(a)disroot.org>
---
Kaustabh Chakraborty (3):
arm64: dts: exynos7870: add quirk to disable USB2 LPM in gadget mode
arm64: dts: exynos7870-on7xelte: reduce memory ranges to base amount
arm64: dts: exynos7870-j6lte: reduce memory ranges to base amount
arch/arm64/boot/dts/exynos/exynos7870-j6lte.dts | 2 +-
arch/arm64/boot/dts/exynos/exynos7870-on7xelte.dts | 2 +-
arch/arm64/boot/dts/exynos/exynos7870.dtsi | 1 +
3 files changed, 3 insertions(+), 2 deletions(-)
---
base-commit: 1b152eeca84a02bdb648f16b82ef3394007a9dcf
change-id: 20250626-exynos7870-dts-fixes-e730f7086ddc
Best regards,
--
Kaustabh Chakraborty <kauschluss(a)disroot.org>
From: Ge Yang <yangge1116(a)126.com>
Since commit d228814b1913 ("efi/libstub: Add get_event_log() support
for CC platforms") reuses TPM2 support code for the CC platforms, when
launching a TDX virtual machine with coco measurement enabled, the
following error log is generated:
[Firmware Bug]: Failed to parse event in TPM Final Events Log
Call Trace:
efi_config_parse_tables()
efi_tpm_eventlog_init()
tpm2_calc_event_log_size()
__calc_tpm2_event_size()
The pcr_idx value in the Intel TDX log header is 1, causing the
function __calc_tpm2_event_size() to fail to recognize the log header,
ultimately leading to the "Failed to parse event in TPM Final Events
Log" error.
According to UEFI Spec 2.10 Section 38.4.1: For Tdx, TPM PCR 0 maps to
MRTD, so the log header uses TPM PCR 1. To successfully parse the TDX
event log header, the check for a pcr_idx value of 0 has been removed
here, and it appears that this will not affect other functionalities.
Link: https://uefi.org/specs/UEFI/2.10/38_Confidential_Computing.html#intel-trust…
Fixes: d228814b1913 ("efi/libstub: Add get_event_log() support for CC platforms")
Signed-off-by: Ge Yang <yangge1116(a)126.com>
Cc: stable(a)vger.kernel.org
---
include/linux/tpm_eventlog.h | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/include/linux/tpm_eventlog.h b/include/linux/tpm_eventlog.h
index 891368e..05c0ae5 100644
--- a/include/linux/tpm_eventlog.h
+++ b/include/linux/tpm_eventlog.h
@@ -202,8 +202,7 @@ static __always_inline u32 __calc_tpm2_event_size(struct tcg_pcr_event2_head *ev
event_type = event->event_type;
/* Verify that it's the log header */
- if (event_header->pcr_idx != 0 ||
- event_header->event_type != NO_ACTION ||
+ if (event_header->event_type != NO_ACTION ||
memcmp(event_header->digest, zero_digest, sizeof(zero_digest))) {
size = 0;
goto out;
--
2.7.4
Andy, Kees,
On Wed, Apr 16, 2025 at 09:45:52AM +0300, Andy Shevchenko wrote:
> On Tue, Apr 15, 2025 at 04:14:24PM -0700, Kees Cook wrote:
> > The 20 byte length of struct platform_device_id::name is not long enough
> > for many devices (especially regulators), where the string initialization
> > is getting truncated and missing the trailing NUL byte. This is seen
> > with GCC 15's -Wunterminated-string-initialization option:
> >
> > drivers/regulator/hi6421v530-regulator.c:189:19: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (21 chars into 20 available) [-Wunterminated-string-initialization]
> > 189 | { .name = "hi6421v530-regulator" },
> > | ^~~~~~~~~~~~~~~~~~~~~~
> > drivers/regulator/hi6421v600-regulator.c:278:19: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (21 chars into 20 available) [-Wunterminated-string-initialization]
> > 278 | { .name = "hi6421v600-regulator" },
> > | ^~~~~~~~~~~~~~~~~~~~~~
> > drivers/regulator/lp87565-regulator.c:233:11: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (21 chars into 20 available) [-Wunterminated-string-initialization]
> > 233 | { "lp87565-q1-regulator", },
> > | ^~~~~~~~~~~~~~~~~~~~~~
> > sound/soc/fsl/imx-pcm-rpmsg.c:818:19: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (21 chars into 20 available) [-Wunterminated-string-initialization]
> > 818 | { .name = "rpmsg-micfil-channel" },
> > | ^~~~~~~~~~~~~~~~~~~~~~
> > drivers/iio/light/hid-sensor-als.c:457:25: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (21 chars into 20 available) [-Wunterminated-string-initialization]
> > 457 | .name = "HID-SENSOR-LISS-0041",
> > | ^~~~~~~~~~~~~~~~~~~~~~
> > drivers/iio/light/hid-sensor-prox.c:366:25: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (21 chars into 20 available) [-Wunterminated-string-initialization]
> > 366 | .name = "HID-SENSOR-LISS-0226",
> > | ^~~~~~~~~~~~~~~~~~~~~~
> >
> > Increase the length to 24, slightly more than is currently being used by
> > the affected drivers. The string is used in '%s' format strings and via
> > the module code, which appears to do its own length encoding. This size
> > was chosen because there was already a 4 byte hole in the structure:
> >
> > struct platform_device_id {
> > char name[20]; /* 0 20 */
> >
> > /* XXX 4 bytes hole, try to pack */
> >
> > kernel_ulong_t driver_data; /* 24 8 */
> >
> > /* size: 32, cachelines: 1, members: 2 */
> > /* sum members: 28, holes: 1, sum holes: 4 */
> > /* last cacheline: 32 bytes */
> > };
>
> Since there is no even potential ABI breakage, I'm fine with the change.
> Reviewed-by: Andy Shevchenko <andriy.shevchenko(a)linux.intel.com>
This definitely breaks ABI on 32-bit architectures such as i586, because there
is no gap from alignment. Perhaps, this also make the commit not suitable for
backporting to stable branches?
I recently stumbled on build failure on v5.10.239 for i586:
make: Entering directory '/usr/src/kernel-source-5.10'
DEPMOD 5.10.239
depmod: FATAL: Module index: bad character '�'=0x80 - only 7-bit ASCII is supported:
platform:jsl_rt5682_max98360ax�
make: *** [Makefile:1786: modules_install] Error 1
make: Leaving directory '/usr/src/kernel-source-5.10'
With this patch not applied "jsl_rt5682_max98360a" have terminating '\0'
truncated due to PLATFORM_NAME_SIZE being same as the string length and
concatenated with the following binary data:
{
.name = "jsl_rt5682_max98360a",
.driver_data = (kernel_ulong_t)(SOF_RT5682_MCLK_EN |
SOF_RT5682_MCLK_24MHZ |
SOF_RT5682_SSP_CODEC(0) |
SOF_SPEAKER_AMP_PRESENT |
SOF_MAX98360A_SPEAKER_AMP_PRESENT |
SOF_RT5682_SSP_AMP(1)),
},
modpost then interprets it as an asciiz string concatenating with `driver_data`
resulting in bad characters.
static int do_platform_entry(const char *filename,
void *symval, char *alias)
{
DEF_FIELD_ADDR(symval, platform_device_id, name);
sprintf(alias, PLATFORM_MODULE_PREFIX "%s", *name);
return 1;
}
creating in an incorrect alias, and this somehow breaks depmod in kmod 34.2
(maybe earlier).
Old kmod 30 successfully adds incorrect alias:
$ modinfo snd-soc-sof_rt5682.ko | grep jsl_rt5682_max98360a
alias: platform:jsl_rt5682_max98360a
alias: platform:jsl_rt5682_max98360ax�
and
modules.alias:alias platform:jsl_rt5682_max98360ax� snd_soc_sof_rt5682
Perhaps, scripts/mod/file2alias.c should be updated with:
- sprintf(alias, PLATFORM_MODULE_PREFIX "%s", *name);
+ sprintf(alias, PLATFORM_MODULE_PREFIX "%.*s", PLATFORM_NAME_SIZE, *name);
(Or even producing an error if more serious truncation occurs.)
Thanks,
>
> --
> With Best Regards,
> Andy Shevchenko
>
>
The patch titled
Subject: mm/zsmalloc: do not pass __GFP_MOVABLE if CONFIG_COMPACTION=n
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-zsmalloc-do-not-pass-__gfp_movable-if-config_compaction=n.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Harry Yoo <harry.yoo(a)oracle.com>
Subject: mm/zsmalloc: do not pass __GFP_MOVABLE if CONFIG_COMPACTION=n
Date: Fri, 4 Jul 2025 19:30:53 +0900
Commit 48b4800a1c6a ("zsmalloc: page migration support") added support for
migrating zsmalloc pages using the movable_operations migration framework.
However, the commit did not take into account that zsmalloc supports
migration only when CONFIG_COMPACTION is enabled. Tracing shows that
zsmalloc was still passing the __GFP_MOVABLE flag even when compaction is
not supported.
This can result in unmovable pages being allocated from movable page
blocks (even without stealing page blocks), ZONE_MOVABLE and CMA area.
Possible user visible effects:
- Some ZONE_MOVABLE memory can be not actually movable
- CMA allocation can fail because of this
- Increased memory fragmentation due to ignoring the page mobility
grouping feature
I'm not really sure who uses kernels without compaction support, though :(
To fix this, clear the __GFP_MOVABLE flag when
!IS_ENABLED(CONFIG_COMPACTION).
Link: https://lkml.kernel.org/r/20250704103053.6913-1-harry.yoo@oracle.com
Fixes: 48b4800a1c6a ("zsmalloc: page migration support")
Signed-off-by: Harry Yoo <harry.yoo(a)oracle.com>
Acked-by: David Hildenbrand <david(a)redhat.com>
Cc: Minchan Kim <minchan(a)kernel.org>
Cc: Sergey Senozhatsky <senozhatsky(a)chromium.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/zsmalloc.c | 3 +++
1 file changed, 3 insertions(+)
--- a/mm/zsmalloc.c~mm-zsmalloc-do-not-pass-__gfp_movable-if-config_compaction=n
+++ a/mm/zsmalloc.c
@@ -1043,6 +1043,9 @@ static struct zspage *alloc_zspage(struc
if (!zspage)
return NULL;
+ if (!IS_ENABLED(CONFIG_COMPACTION))
+ gfp &= ~__GFP_MOVABLE;
+
zspage->magic = ZSPAGE_MAGIC;
zspage->pool = pool;
zspage->class = class->index;
_
Patches currently in -mm which might be from harry.yoo(a)oracle.com are
lib-alloc_tag-do-not-acquire-non-existent-lock-in-alloc_tag_top_users.patch
lib-alloc_tag-do-not-acquire-non-existent-lock-in-alloc_tag_top_users-v3.patch
mm-zsmalloc-do-not-pass-__gfp_movable-if-config_compaction=n.patch
The patch titled
Subject: mm/shmem, swap: improve cached mTHP handling and fix potential hang
has been added to the -mm mm-unstable branch. Its filename is
mm-shmem-swap-improve-cached-mthp-handling-and-fix-potential-hung.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Kairui Song <kasong(a)tencent.com>
Subject: mm/shmem, swap: improve cached mTHP handling and fix potential hang
Date: Sat, 5 Jul 2025 02:17:40 +0800
Patch series "mm/shmem, swap: bugfix and improvement of mTHP swap-in", v4.
The current mTHP swapin path have several problems. It may potentially
hang, may cause redundant faults due to false positive swap cache lookup,
and it will involve at least 4 Xarray tree walks (get order, get order
again, confirm swap, insert folio). And for !CONFIG_TRANSPARENT_HUGEPAGE
builds, it will performs some mTHP related checks.
This series fixes all of the mentioned issues, and the code should be more
robust and prepared for the swap table series. Now tree walks is reduced
to twice (get order & confirm, insert folio), !CONFIG_TRANSPARENT_HUGEPAGE
build overhead is also minimized, and comes with a sanity check now.
The performance is slightly better after this series, sequential swap in
of 24G data from ZRAM, using transparent_hugepage_tmpfs=always (24 samples
each):
Before: 11.02, stddev: 0.06
After patch 1: 10.74, stddev: 0.03
After patch 2: 10.72, stddev: 0.01
After patch 3: 10.73, stddev: 0.04
After patch 4: 10.72, stddev: 0.02
After patch 5: 10.74, stddev: 0.01
After patch 6: 10.13, stddev: 0.09
After patch 7: 9.95, stddev: 0.02
After patch 8: 9.88, stddev: 0.04
Each patch improves the performance by a little, which is about ~10%
faster in total.
Build kernel test showed very slightly improvement, testing with make -j24
with defconfig in a 256M memcg also using ZRAM as swap, and
transparent_hugepage_tmpfs=always (6 test runs):
Before: system time avg: 3911.80s
After: system time avg: 3863.76s
This patch (of 9):
The current swap-in code assumes that, when a swap entry in shmem mapping
is order 0, its cached folios (if present) must be order 0 too, which
turns out not always correct.
The problem is shmem_split_large_entry is called before verifying the
folio will eventually be swapped in, one possible race is:
CPU1 CPU2
shmem_swapin_folio
/* swap in of order > 0 swap entry S1 */
folio = swap_cache_get_folio
/* folio = NULL */
order = xa_get_order
/* order > 0 */
folio = shmem_swap_alloc_folio
/* mTHP alloc failure, folio = NULL */
<... Interrupted ...>
shmem_swapin_folio
/* S1 is swapped in */
shmem_writeout
/* S1 is swapped out, folio cached */
shmem_split_large_entry(..., S1)
/* S1 is split, but the folio covering it has order > 0 now */
Now any following swapin of S1 will hang: `xa_get_order` returns 0, and
folio lookup will return a folio with order > 0. The
`xa_get_order(&mapping->i_pages, index) != folio_order(folio)` will always
return false causing swap-in to return -EEXIST.
And this looks fragile. So fix this up by allowing seeing a larger folio
in swap cache, and check the whole shmem mapping range covered by the
swapin have the right swap value upon inserting the folio. And drop the
redundant tree walks before the insertion.
This will actually improve performance, as it avoids two redundant Xarray
tree walks in the hot path, and the only side effect is that in the
failure path, shmem may redundantly reallocate a few folios causing
temporary slight memory pressure.
And worth noting, it may seem the order and value check before inserting
might help reducing the lock contention, which is not true. The swap
cache layer ensures raced swapin will either see a swap cache folio or
failed to do a swapin (we have SWAP_HAS_CACHE bit even if swap cache is
bypassed), so holding the folio lock and checking the folio flag is
already good enough for avoiding the lock contention. The chance that a
folio passes the swap entry value check but the shmem mapping slot has
changed should be very low.
Link: https://lkml.kernel.org/r/20250704181748.63181-1-ryncsn@gmail.com
Link: https://lkml.kernel.org/r/20250704181748.63181-2-ryncsn@gmail.com
Fixes: 809bc86517cc ("mm: shmem: support large folio swap out")
Signed-off-by: Kairui Song <kasong(a)tencent.com>
Reviewed-by: Kemeng Shi <shikemeng(a)huaweicloud.com>
Reviewed-by: Baolin Wang <baolin.wang(a)linux.alibaba.com>
Tested-by: Baolin Wang <baolin.wang(a)linux.alibaba.com>
Cc: <stable(a)vger.kernel.org>
Cc: Baoquan He <bhe(a)redhat.com>
Cc: Barry Song <baohua(a)kernel.org>
Cc: Chris Li <chrisl(a)kernel.org>
Cc: Dev Jain <dev.jain(a)arm.com>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Nhat Pham <nphamcs(a)gmail.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/shmem.c | 30 +++++++++++++++++++++---------
1 file changed, 21 insertions(+), 9 deletions(-)
--- a/mm/shmem.c~mm-shmem-swap-improve-cached-mthp-handling-and-fix-potential-hung
+++ a/mm/shmem.c
@@ -884,7 +884,9 @@ static int shmem_add_to_page_cache(struc
pgoff_t index, void *expected, gfp_t gfp)
{
XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio));
- long nr = folio_nr_pages(folio);
+ unsigned long nr = folio_nr_pages(folio);
+ swp_entry_t iter, swap;
+ void *entry;
VM_BUG_ON_FOLIO(index != round_down(index, nr), folio);
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
@@ -896,14 +898,24 @@ static int shmem_add_to_page_cache(struc
gfp &= GFP_RECLAIM_MASK;
folio_throttle_swaprate(folio, gfp);
+ swap = iter = radix_to_swp_entry(expected);
do {
xas_lock_irq(&xas);
- if (expected != xas_find_conflict(&xas)) {
- xas_set_err(&xas, -EEXIST);
- goto unlock;
+ xas_for_each_conflict(&xas, entry) {
+ /*
+ * The range must either be empty, or filled with
+ * expected swap entries. Shmem swap entries are never
+ * partially freed without split of both entry and
+ * folio, so there shouldn't be any holes.
+ */
+ if (!expected || entry != swp_to_radix_entry(iter)) {
+ xas_set_err(&xas, -EEXIST);
+ goto unlock;
+ }
+ iter.val += 1 << xas_get_order(&xas);
}
- if (expected && xas_find_conflict(&xas)) {
+ if (expected && iter.val - nr != swap.val) {
xas_set_err(&xas, -EEXIST);
goto unlock;
}
@@ -2323,7 +2335,7 @@ static int shmem_swapin_folio(struct ino
error = -ENOMEM;
goto failed;
}
- } else if (order != folio_order(folio)) {
+ } else if (order > folio_order(folio)) {
/*
* Swap readahead may swap in order 0 folios into swapcache
* asynchronously, while the shmem mapping can still stores
@@ -2348,15 +2360,15 @@ static int shmem_swapin_folio(struct ino
swap = swp_entry(swp_type(swap), swp_offset(swap) + offset);
}
+ } else if (order < folio_order(folio)) {
+ swap.val = round_down(swap.val, 1 << folio_order(folio));
}
alloced:
/* We have to do this with folio locked to prevent races */
folio_lock(folio);
if ((!skip_swapcache && !folio_test_swapcache(folio)) ||
- folio->swap.val != swap.val ||
- !shmem_confirm_swap(mapping, index, swap) ||
- xa_get_order(&mapping->i_pages, index) != folio_order(folio)) {
+ folio->swap.val != swap.val) {
error = -EEXIST;
goto unlock;
}
_
Patches currently in -mm which might be from kasong(a)tencent.com are
mm-list_lru-refactor-the-locking-code.patch
mm-shmem-swap-improve-cached-mthp-handling-and-fix-potential-hung.patch
mm-shmem-swap-avoid-redundant-xarray-lookup-during-swapin.patch
mm-shmem-swap-tidy-up-thp-swapin-checks.patch
mm-shmem-swap-tidy-up-swap-entry-splitting.patch
mm-shmem-swap-avoid-false-positive-swap-cache-lookup.patch
mm-shmem-swap-never-use-swap-cache-and-readahead-for-swp_synchronous_io.patch
mm-shmem-swap-simplify-swapin-path-and-result-handling.patch
mm-shmem-swap-simplify-swap-entry-and-index-calculation-of-large-swapin.patch
mm-shmem-swap-fix-major-fault-counting.patch
From: Kairui Song <kasong(a)tencent.com>
The current swap-in code assumes that, when a swap entry in shmem mapping
is order 0, its cached folios (if present) must be order 0 too, which
turns out not always correct.
The problem is shmem_split_large_entry is called before verifying the
folio will eventually be swapped in, one possible race is:
CPU1 CPU2
shmem_swapin_folio
/* swap in of order > 0 swap entry S1 */
folio = swap_cache_get_folio
/* folio = NULL */
order = xa_get_order
/* order > 0 */
folio = shmem_swap_alloc_folio
/* mTHP alloc failure, folio = NULL */
<... Interrupted ...>
shmem_swapin_folio
/* S1 is swapped in */
shmem_writeout
/* S1 is swapped out, folio cached */
shmem_split_large_entry(..., S1)
/* S1 is split, but the folio covering it has order > 0 now */
Now any following swapin of S1 will hang: `xa_get_order` returns 0, and
folio lookup will return a folio with order > 0. The
`xa_get_order(&mapping->i_pages, index) != folio_order(folio)` will always
return false causing swap-in to return -EEXIST.
And this looks fragile. So fix this up by allowing seeing a larger folio
in swap cache, and check the whole shmem mapping range covered by the
swapin have the right swap value upon inserting the folio. And drop the
redundant tree walks before the insertion.
This will actually improve performance, as it avoids two redundant Xarray
tree walks in the hot path, and the only side effect is that in the
failure path, shmem may redundantly reallocate a few folios causing
temporary slight memory pressure.
And worth noting, it may seems the order and value check before inserting
might help reducing the lock contention, which is not true. The swap
cache layer ensures raced swapin will either see a swap cache folio or
failed to do a swapin (we have SWAP_HAS_CACHE bit even if swap cache is
bypassed), so holding the folio lock and checking the folio flag is
already good enough for avoiding the lock contention. The chance that a
folio passes the swap entry value check but the shmem mapping slot has
changed should be very low.
Fixes: 809bc86517cc ("mm: shmem: support large folio swap out")
Signed-off-by: Kairui Song <kasong(a)tencent.com>
Reviewed-by: Kemeng Shi <shikemeng(a)huaweicloud.com>
Reviewed-by: Baolin Wang <baolin.wang(a)linux.alibaba.com>
Tested-by: Baolin Wang <baolin.wang(a)linux.alibaba.com>
Cc: <stable(a)vger.kernel.org>
---
mm/shmem.c | 30 +++++++++++++++++++++---------
1 file changed, 21 insertions(+), 9 deletions(-)
diff --git a/mm/shmem.c b/mm/shmem.c
index 334b7b4a61a0..e3c9a1365ff4 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -884,7 +884,9 @@ static int shmem_add_to_page_cache(struct folio *folio,
pgoff_t index, void *expected, gfp_t gfp)
{
XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio));
- long nr = folio_nr_pages(folio);
+ unsigned long nr = folio_nr_pages(folio);
+ swp_entry_t iter, swap;
+ void *entry;
VM_BUG_ON_FOLIO(index != round_down(index, nr), folio);
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
@@ -896,14 +898,24 @@ static int shmem_add_to_page_cache(struct folio *folio,
gfp &= GFP_RECLAIM_MASK;
folio_throttle_swaprate(folio, gfp);
+ swap = iter = radix_to_swp_entry(expected);
do {
xas_lock_irq(&xas);
- if (expected != xas_find_conflict(&xas)) {
- xas_set_err(&xas, -EEXIST);
- goto unlock;
+ xas_for_each_conflict(&xas, entry) {
+ /*
+ * The range must either be empty, or filled with
+ * expected swap entries. Shmem swap entries are never
+ * partially freed without split of both entry and
+ * folio, so there shouldn't be any holes.
+ */
+ if (!expected || entry != swp_to_radix_entry(iter)) {
+ xas_set_err(&xas, -EEXIST);
+ goto unlock;
+ }
+ iter.val += 1 << xas_get_order(&xas);
}
- if (expected && xas_find_conflict(&xas)) {
+ if (expected && iter.val - nr != swap.val) {
xas_set_err(&xas, -EEXIST);
goto unlock;
}
@@ -2323,7 +2335,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
error = -ENOMEM;
goto failed;
}
- } else if (order != folio_order(folio)) {
+ } else if (order > folio_order(folio)) {
/*
* Swap readahead may swap in order 0 folios into swapcache
* asynchronously, while the shmem mapping can still stores
@@ -2348,15 +2360,15 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
swap = swp_entry(swp_type(swap), swp_offset(swap) + offset);
}
+ } else if (order < folio_order(folio)) {
+ swap.val = round_down(swap.val, 1 << folio_order(folio));
}
alloced:
/* We have to do this with folio locked to prevent races */
folio_lock(folio);
if ((!skip_swapcache && !folio_test_swapcache(folio)) ||
- folio->swap.val != swap.val ||
- !shmem_confirm_swap(mapping, index, swap) ||
- xa_get_order(&mapping->i_pages, index) != folio_order(folio)) {
+ folio->swap.val != swap.val) {
error = -EEXIST;
goto unlock;
}
--
2.50.0