Hello.
We have observed a huge latency increase using `fork()` after ingesting the CVE-2025-38085 fix, which leads to the commit `1013af4f585f: mm/hugetlb: fix huge_pmd_unshare() vs GUP-fast race`. On large machines with 1.5TB of memory and 196 cores, when a process mmaps 1.2TB of shared memory and forks itself dozens or hundreds of times, we see execution times increase by a factor of 4. The reproducer is at the end of the email.
Comparing a kernel without this patch with a kernel with this patch applied, when spawning 1000 children we see these execution times:
Patched kernel:
$ time make stress
...
real 0m11.275s
user 0m0.177s
sys 0m23.905s
Original kernel :
$ time make stress
...
real 0m2.475s
user 0m1.398s
sys 0m2.501s
The patch in question: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id…
My observation/assumption is:
each child touches 100 random pages and despawns
on each despawn `huge_pmd_unshare()` is called
each call to `huge_pmd_unshare()` synchronizes all threads using `tlb_remove_table_sync_one()`, leading to the regression
I'm happy to provide more information.
Thank you
Stanislav Uschakow
=== Reproducer ===
Setup:
#!/bin/bash
# Reserve the 2MB hugepages needed by the reproducer, disable transparent
# hugepages, and print the resulting hugepage state from /proc/meminfo.

nr_hugepages_file=/proc/sys/vm/nr_hugepages

echo "Setting up hugepages for reproduction..."

# Pages needed to back the mapping: 1.2TB / 2MB = 614400 pages
required=614400

# Report what is reserved right now
current=$(cat "$nr_hugepages_file")
echo "Current hugepages: $current"

# Only ask for more pages when the current reservation is too small
if [ "$current" -lt "$required" ]; then
    echo "Allocating $required hugepages..."
    echo "$required" | sudo tee "$nr_hugepages_file"

    # Re-read the counter: the kernel may have granted fewer pages than requested
    allocated=$(cat "$nr_hugepages_file")
    echo "Allocated hugepages: $allocated"
    if [ "$allocated" -lt "$required" ]; then
        echo "Warning: Could not allocate all required hugepages"
        echo "Available: $allocated, Required: $required"
    fi
fi

# Turn transparent hugepages off
echo never | sudo tee /sys/kernel/mm/transparent_hugepage/enabled

printf '\nHugepage information:\n'
grep -i huge /proc/meminfo
printf '\nSetup complete. You can now run the reproduction test.\n'
Makefile:
# Build and run helpers for the hugepage fork() reproducer.
CXX = gcc
CXXFLAGS = -O2 -Wall
TARGET = hugepage_repro
SOURCE = hugepage_repro.c

# Default goal: compile the reproducer from its single source file.
$(TARGET): $(SOURCE)
	$(CXX) $(CXXFLAGS) -o $@ $<

# Remove the built binary.
clean:
	rm -f $(TARGET)

# Make the hugepage setup script executable and run it.
setup:
	chmod +x setup_hugepages.sh
	./setup_hugepages.sh

# Quick run: 20 processes per iteration, 3 iterations.
test: $(TARGET)
	./$(TARGET) 20 3

# Heavy run: 1000 processes in a single iteration.
stress: $(TARGET)
	./$(TARGET) 1000 1

.PHONY: clean setup test stress
hugepage_repro.c:
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <stdio.h>
/* Size in bytes of one hugepage; offsets into the mapping are multiples of this. */
#define HUGEPAGE_SIZE (2 * 1024 * 1024) // 2MB
/* Size in bytes of the shared mapping (1200 * 2^30); the ULL suffix keeps the product out of int range. */
#define TOTAL_SIZE (1200ULL * 1024 * 1024 * 1024) // 1.2TB
/* Number of hugepages that make up the mapping: TOTAL_SIZE / HUGEPAGE_SIZE = 614400. */
#define NUM_HUGEPAGES (TOTAL_SIZE / HUGEPAGE_SIZE)
/*
 * Map TOTAL_SIZE bytes of readable and writable memory that is shared,
 * anonymous and backed by hugetlb pages.
 *
 * Returns the start address of the new mapping. On mmap failure the error
 * is reported with perror() and the process exits with status 1, so the
 * function never returns MAP_FAILED to its caller.
 */
void* create_hugepage_mapping() {
    const int protection = PROT_READ | PROT_WRITE;
    const int map_flags = MAP_SHARED | MAP_ANONYMOUS | MAP_HUGETLB;

    void* region = mmap(NULL, TOTAL_SIZE, protection, map_flags, -1, 0);
    if (region != MAP_FAILED) {
        return region;
    }

    perror("mmap hugepages failed");
    exit(1);
}
/*
 * Read the first byte of num_touches hugepages chosen with rand() inside the
 * mapping that starts at addr.
 *
 * addr         start of a mapping that spans NUM_HUGEPAGES hugepages
 * num_touches  number of reads to perform; zero or negative performs none
 */
void touch_random_pages(void* addr, int num_touches) {
    char* mapping = (char*)addr;

    for (int left = num_touches; left > 0; --left) {
        size_t page_index = rand() % NUM_HUGEPAGES;
        size_t byte_offset = page_index * HUGEPAGE_SIZE;

        /* Storing into a volatile keeps the compiler from dropping the read. */
        volatile char sink = mapping[byte_offset];
        (void)sink;
    }
}
/*
 * Work done by one forked child: touch 100 random hugepages of shared_mem
 * and print how long that took, measured with CLOCK_MONOTONIC.
 *
 * shared_mem  start of the hugepage mapping inherited from the parent
 * child_id    number printed in the completion message
 */
void child_process(void* shared_mem, int child_id) {
    struct timespec t_begin, t_finish;

    clock_gettime(CLOCK_MONOTONIC, &t_begin);
    touch_random_pages(shared_mem, 100);
    clock_gettime(CLOCK_MONOTONIC, &t_finish);

    /* Whole seconds scaled up to microseconds plus nanoseconds scaled down. */
    const time_t sec_delta = t_finish.tv_sec - t_begin.tv_sec;
    const long nsec_delta = t_finish.tv_nsec - t_begin.tv_nsec;
    long duration = sec_delta * 1000000 + nsec_delta / 1000;

    printf("Child %d completed in %ld μs\n", child_id, duration);
}
/*
 * Entry point of the reproducer.
 *
 * Usage: hugepage_repro [num_processes] [iterations]
 *   num_processes  children forked per iteration (default 50, must be >= 1)
 *   iterations     number of fork/wait rounds (default 5)
 *
 * Creates the shared hugepage mapping once, then for every iteration forks
 * num_processes children that each run child_process(), waits for all of
 * them, and prints the wall-clock time of the iteration in milliseconds.
 *
 * Returns 0 on success and 1 on an invalid process count or when the pid
 * table cannot be allocated.
 */
int main(int argc, char* argv[]) {
    int num_processes = argc > 1 ? atoi(argv[1]) : 50;
    int iterations = argc > 2 ? atoi(argv[2]) : 5;

    /* A non-positive count cannot size the pid table and has nothing to run. */
    if (num_processes < 1) {
        fprintf(stderr, "Usage: %s [num_processes >= 1] [iterations]\n",
                argc > 0 ? argv[0] : "hugepage_repro");
        return 1;
    }

    /* Heap allocation: the count is user-controlled, so keep it off the stack. */
    pid_t* children = calloc((size_t)num_processes, sizeof *children);
    if (children == NULL) {
        perror("calloc children failed");
        return 1;
    }

    printf("Creating %lluGB hugepage mapping...\n", TOTAL_SIZE / (1024*1024*1024));
    void* shared_mem = create_hugepage_mapping();

    for (int iter = 0; iter < iterations; ++iter) {
        printf("\nIteration %d: Forking %d processes\n", iter + 1, num_processes);
        /*
         * Flush before forking: each child inherits a copy of the stdio
         * buffer and would otherwise print the parent's pending text again
         * when it exits (visible when stdout is a pipe or a file).
         */
        fflush(stdout);

        struct timespec iter_start, iter_end;
        clock_gettime(CLOCK_MONOTONIC, &iter_start);

        for (int i = 0; i < num_processes; ++i) {
            pid_t pid = fork();
            if (pid == 0) {
                child_process(shared_mem, i);
                exit(0);
            }
            if (pid < 0) {
                perror("fork failed");
            }
            /* Failed forks are stored as -1 so the wait loop can skip them. */
            children[i] = pid;
        }

        for (int i = 0; i < num_processes; ++i) {
            if (children[i] > 0) {
                waitpid(children[i], NULL, 0);
            }
        }

        clock_gettime(CLOCK_MONOTONIC, &iter_end);
        long iter_duration = (iter_end.tv_sec - iter_start.tv_sec) * 1000 +
                             (iter_end.tv_nsec - iter_start.tv_nsec) / 1000000;
        printf("Iteration completed in %ld ms\n", iter_duration);
    }

    free(children);
    munmap(shared_mem, TOTAL_SIZE);
    return 0;
}
Amazon Web Services Development Center Germany GmbH
Tamara-Danz-Str. 13
10243 Berlin
Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss
Eingetragen am Amtsgericht Charlottenburg unter HRB 257764 B
Sitz: Berlin
Ust-ID: DE 365 538 597
This is the start of the stable review cycle for the 5.10.242 release.
There are 34 patches in this series, all will be posted as a response
to this one. If anyone has any issues with these being applied, please
let me know.
Responses should be made by Thu, 04 Sep 2025 13:19:14 +0000.
Anything received after that time might be too late.
The whole patch series can be found in one patch at:
https://www.kernel.org/pub/linux/kernel/v5.x/stable-review/patch-5.10.242-r…
or in the git tree and branch at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-5.10.y
and the diffstat can be found below.
thanks,
greg k-h
-------------
Pseudo-Shortlog of commits:
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Linux 5.10.242-rc1
Eric Sandeen <sandeen(a)redhat.com>
xfs: do not propagate ENODATA disk errors into xattr code
Brent Lu <brent.lu(a)intel.com>
ASoC: Intel: sof_da7219_mx98360a: fail to initialize soundcard
Pierre-Louis Bossart <pierre-louis.bossart(a)linux.intel.com>
ASoC: Intel: glk_rt5682_max98357a: shrink platform_id below 20 characters
Pierre-Louis Bossart <pierre-louis.bossart(a)linux.intel.com>
ASoC: Intel: sof_rt5682: shrink platform_id names below 20 characters
Pierre-Louis Bossart <pierre-louis.bossart(a)linux.intel.com>
ASoC: Intel: bxt_da7219_max98357a: shrink platform_id below 20 characters
Imre Deak <imre.deak(a)intel.com>
Revert "drm/dp: Change AUX DPCD probe address from DPCD_REV to LANE0_1_STATUS"
Hamish Martin <hamish.martin(a)alliedtelesis.co.nz>
HID: mcp2221: Handle reads greater than 60 bytes
Hamish Martin <hamish.martin(a)alliedtelesis.co.nz>
HID: mcp2221: Don't set bus speed on every transfer
James Jones <jajones(a)nvidia.com>
drm/nouveau/disp: Always accept linear modifier
Fabio Porcedda <fabio.porcedda(a)gmail.com>
net: usb: qmi_wwan: add Telit Cinterion LE910C4-WWX new compositions
Shanker Donthineni <sdonthineni(a)nvidia.com>
dma/pool: Ensure DMA_DIRECT_REMAP allocations are decrypted
Alex Deucher <alexander.deucher(a)amd.com>
Revert "drm/amdgpu: fix incorrect vm flags to map bo"
Minjong Kim <minbell.kim(a)samsung.com>
HID: hid-ntrig: fix unable to handle page fault in ntrig_report_version()
Ping Cheng <pinglinux(a)gmail.com>
HID: wacom: Add a new Art Pen 2
Qasim Ijaz <qasdev00(a)gmail.com>
HID: asus: fix UAF via HID_CLAIMED_INPUT validation
Thijs Raymakers <thijs(a)raymakers.nl>
KVM: x86: use array_index_nospec with indices that come from guest
Li Nan <linan122(a)huawei.com>
efivarfs: Fix slab-out-of-bounds in efivarfs_d_compare
Eric Dumazet <edumazet(a)google.com>
sctp: initialize more fields in sctp_v6_from_sk()
Rohan G Thomas <rohan.g.thomas(a)altera.com>
net: stmmac: xgmac: Do not enable RX FIFO Overflow interrupts
Alexei Lazar <alazar(a)nvidia.com>
net/mlx5e: Set local Xoff after FW update
Alexei Lazar <alazar(a)nvidia.com>
net/mlx5e: Update and set Xon/Xoff upon port speed set
Alexei Lazar <alazar(a)nvidia.com>
net/mlx5e: Update and set Xon/Xoff upon MTU set
Yeounsu Moon <yyyynoom(a)gmail.com>
net: dlink: fix multicast stats being counted incorrectly
Kuniyuki Iwashima <kuniyu(a)google.com>
atm: atmtcp: Prevent arbitrary write in atmtcp_recv_control().
Luiz Augusto von Dentz <luiz.von.dentz(a)intel.com>
Bluetooth: hci_event: Detect if HCI_EV_NUM_COMP_PKTS is unbalanced
Madhavan Srinivasan <maddy(a)linux.ibm.com>
powerpc/kvm: Fix ifdef to remove build warning
Oscar Maes <oscmaes92(a)gmail.com>
net: ipv4: fix regression in local-broadcast routes
Nikolay Kuratov <kniv(a)yandex-team.ru>
vhost/net: Protect ubufs with rcu read lock in vhost_net_ubuf_put()
Trond Myklebust <trond.myklebust(a)hammerspace.com>
NFS: Fix a race when updating an existing write
Christoph Hellwig <hch(a)lst.de>
nfs: fold nfs_page_group_lock_subrequests into nfs_lock_and_join_requests
Tianxiang Peng <txpeng(a)tencent.com>
x86/cpu/hygon: Add missing resctrl_cpu_detect() in bsp_init helper
Damien Le Moal <dlemoal(a)kernel.org>
scsi: core: sysfs: Correct sysfs attributes access rights
Tengda Wu <wutengda(a)huaweicloud.com>
ftrace: Fix potential warning in trace_printk_seq during ftrace_dump
Randy Dunlap <rdunlap(a)infradead.org>
pinctrl: STMFX: add missing HAS_IOMEM dependency
-------------
Diffstat:
Makefile | 4 +-
arch/powerpc/kernel/kvm.c | 8 +-
arch/x86/kernel/cpu/hygon.c | 3 +
arch/x86/kvm/lapic.c | 2 +
arch/x86/kvm/x86.c | 7 +-
drivers/atm/atmtcp.c | 17 ++-
drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c | 4 +-
drivers/gpu/drm/drm_dp_helper.c | 2 +-
drivers/gpu/drm/nouveau/dispnv50/wndw.c | 4 +
drivers/hid/hid-asus.c | 8 +-
drivers/hid/hid-mcp2221.c | 71 +++++++----
drivers/hid/hid-ntrig.c | 3 +
drivers/hid/wacom_wac.c | 1 +
drivers/net/ethernet/dlink/dl2k.c | 2 +-
.../ethernet/mellanox/mlx5/core/en/port_buffer.c | 3 +-
.../ethernet/mellanox/mlx5/core/en/port_buffer.h | 12 ++
drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 19 ++-
drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c | 4 -
drivers/net/usb/qmi_wwan.c | 3 +
drivers/pinctrl/Kconfig | 1 +
drivers/scsi/scsi_sysfs.c | 4 +-
drivers/vhost/net.c | 9 +-
fs/efivarfs/super.c | 4 +
fs/nfs/pagelist.c | 86 +------------
fs/nfs/write.c | 142 +++++++++++++--------
fs/xfs/libxfs/xfs_attr_remote.c | 7 +
fs/xfs/libxfs/xfs_da_btree.c | 6 +
include/linux/atmdev.h | 1 +
include/linux/nfs_page.h | 2 +-
kernel/dma/pool.c | 4 +-
kernel/trace/trace.c | 4 +-
net/atm/common.c | 15 ++-
net/bluetooth/hci_event.c | 12 +-
net/ipv4/route.c | 10 +-
net/sctp/ipv6.c | 2 +
sound/soc/intel/boards/bxt_da7219_max98357a.c | 12 +-
sound/soc/intel/boards/glk_rt5682_max98357a.c | 4 +-
sound/soc/intel/boards/sof_da7219_max98373.c | 2 +-
sound/soc/intel/boards/sof_rt5682.c | 12 +-
sound/soc/intel/common/soc-acpi-intel-bxt-match.c | 2 +-
sound/soc/intel/common/soc-acpi-intel-cml-match.c | 2 +-
sound/soc/intel/common/soc-acpi-intel-glk-match.c | 4 +-
sound/soc/intel/common/soc-acpi-intel-jsl-match.c | 2 +-
sound/soc/intel/common/soc-acpi-intel-tgl-match.c | 4 +-
44 files changed, 317 insertions(+), 213 deletions(-)
The 4 patches in this series make the JMP_NOSPEC and CALL_NOSPEC macros used
in the kernel consistent with what is generated by the compiler.
("x86,nospec: Simplify {JMP,CALL}_NOSPEC") was merged in v6.0 and the remaining
3 patches in this series were merged in v6.15. All 4 were included in kernels
v5.15+ as prerequisites for the backport of the ITS mitigations [1].
None of these patches were included in the backport of the ITS mitigations to
the 5.10 kernel [2]. They all apply cleanly and are applicable to the 5.10
kernel. Thus I see no reason that they weren't applied here, unless someone can
correct me?
I am sending them for inclusion in the 5.10 kernel as this kernel is still
actively maintained for these kinds of vulnerability mitigations. Having
these patches will unify the handling of these cases with subsequent
kernel versions, easing code understanding and simplifying future
backports.
[1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…
[2] https://lore.kernel.org/stable/20250617-its-5-10-v2-0-3e925a1512a1@linux.in…
Pawan Gupta (3):
x86/speculation: Simplify and make CALL_NOSPEC consistent
x86/speculation: Add a conditional CS prefix to CALL_NOSPEC
x86/speculation: Remove the extra #ifdef around CALL_NOSPEC
Peter Zijlstra (1):
x86,nospec: Simplify {JMP,CALL}_NOSPEC
arch/x86/include/asm/nospec-branch.h | 46 ++++++++++++++++++----------
1 file changed, 30 insertions(+), 16 deletions(-)
--
2.34.1
Backport of AMD's TSA mitigation to 5.15 did not set CPUID bits that are
passed to a guest correctly (commit c334ae4a545a "KVM: SVM: Advertise
TSA CPUID bits to guests").
This series attempts to address this:
* The first patch from Kim allows us to properly use cpuid caps.
* The second patch is a combination of fixes to c334ae4a545a and f3f9deccfc68,
which is a stable-only patch for 6.12.y. (Not sure what to do with
attribution)
Alternatively, we can opencode all of this (the way it's currently done in
__do_cpuid_func()'s 0x80000021 case) and do everything in a single patch.
Boris Ostrovsky (1):
KVM: SVM: Properly advertise TSA CPUID bits to guests
Kim Phillips (1):
KVM: x86: Move open-coded CPUID leaf 0x80000021 EAX bit propagation
code
arch/x86/kvm/cpuid.c | 31 ++++++++++++++++++-------------
1 file changed, 18 insertions(+), 13 deletions(-)
--
2.43.5
In cpuset hotplug handling, temporary cpumasks are allocated only when
running under cgroup v2. The current code unconditionally frees these
masks, which can lead to a crash in the cgroup v1 case.
Free the temporary cpumasks only when they were actually allocated.
Fixes: 4b842da276a8 ("cpuset: Make CPU hotplug work with partition")
Cc: stable(a)vger.kernel.org
Signed-off-by: Ashay Jaiswal <quic_ashayj(a)quicinc.com>
---
kernel/cgroup/cpuset.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index a78ccd11ce9b43c2e8b0e2c454a8ee845ebdc808..a4f908024f3c0a22628a32f8a5b0ae96c7dccbb9 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -4019,7 +4019,8 @@ static void cpuset_handle_hotplug(void)
if (force_sd_rebuild)
rebuild_sched_domains_cpuslocked();
- free_tmpmasks(ptmp);
+ if (on_dfl && ptmp)
+ free_tmpmasks(ptmp);
}
void cpuset_update_active_cpus(void)
---
base-commit: 33bcf93b9a6b028758105680f8b538a31bc563cf
change-id: 20250902-cpuset-free-on-condition-85cf4eadb18c
Best regards,
--
Ashay Jaiswal <quic_ashayj(a)quicinc.com>
According to documentation, the DP PHY on x1e80100 has another clock
called refclk. Rework the driver to allow a different number of clocks.
Fix the dt-bindings schema and add the clock to the DT node as well.
Signed-off-by: Abel Vesa <abel.vesa(a)linaro.org>
---
Changes in v2:
- Fix schema by adding the minItems, as suggested by Krzysztof.
- Use devm_clk_bulk_get_all, as suggested by Konrad.
- Rephrase the commit messages to reflect the flexible number of clocks.
- Link to v1: https://lore.kernel.org/r/20250730-phy-qcom-edp-add-missing-refclk-v1-0-6f7…
---
Abel Vesa (3):
dt-bindings: phy: qcom-edp: Add missing clock for X Elite
phy: qcom: edp: Make the number of clocks flexible
arm64: dts: qcom: Add missing TCSR refclk to the DP PHYs
.../devicetree/bindings/phy/qcom,edp-phy.yaml | 28 +++++++++++++++++++++-
arch/arm64/boot/dts/qcom/x1e80100.dtsi | 12 ++++++----
drivers/phy/qualcomm/phy-qcom-edp.c | 18 +++++++-------
3 files changed, 45 insertions(+), 13 deletions(-)
---
base-commit: 5d50cf9f7cf20a17ac469c20a2e07c29c1f6aab7
change-id: 20250730-phy-qcom-edp-add-missing-refclk-5ab82828f8e7
Best regards,
--
Abel Vesa <abel.vesa(a)linaro.org>
When building powerpc configurations in linux-5.4.y with binutils 2.43
or newer, there is an assembler error in arch/powerpc/boot/util.S:
arch/powerpc/boot/util.S: Assembler messages:
arch/powerpc/boot/util.S:44: Error: junk at end of line, first unrecognized character is `0'
arch/powerpc/boot/util.S:49: Error: syntax error; found `b', expected `,'
arch/powerpc/boot/util.S:49: Error: junk at end of line: `b'
binutils 2.43 contains stricter parsing of certain labels [1], namely
that leading zeros are no longer allowed. The GNU assembler
documentation already somewhat forbade this construct:
To define a local label, write a label of the form 'N:' (where N
represents any non-negative integer).
Eliminate the leading zero in the label to fix the syntax error. This is
only needed in linux-5.4.y because commit 8b14e1dff067 ("powerpc: Remove
support for PowerPC 601") removed this code altogether in 5.10.
Link: https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=226749d5a6ff0d5c6… [1]
Signed-off-by: Nathan Chancellor <nathan(a)kernel.org>
---
v1 -> v2:
- Adjust commit message to make it clearer this construct was already
incorrect under the existing GNU assembler documentation (Segher)
v1: https://lore.kernel.org/20250902235234.2046667-1-nathan@kernel.org/
---
arch/powerpc/boot/util.S | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/arch/powerpc/boot/util.S b/arch/powerpc/boot/util.S
index f11f0589a669..5ab2bc864e66 100644
--- a/arch/powerpc/boot/util.S
+++ b/arch/powerpc/boot/util.S
@@ -41,12 +41,12 @@ udelay:
srwi r4,r4,16
cmpwi 0,r4,1 /* 601 ? */
bne .Ludelay_not_601
-00: li r0,86 /* Instructions / microsecond? */
+0: li r0,86 /* Instructions / microsecond? */
mtctr r0
10: addi r0,r0,0 /* NOP */
bdnz 10b
subic. r3,r3,1
- bne 00b
+ bne 0b
blr
.Ludelay_not_601:
base-commit: c25f780e491e4734eb27d65aa58e0909fd78ad9f
--
2.51.0