Hello.
We have observed a huge latency increase when using `fork()` after ingesting the CVE-2025-38085 fix, which corresponds to commit `1013af4f585f: mm/hugetlb: fix huge_pmd_unshare() vs GUP-fast race`. On large machines with 1.5TB of memory and 196 cores, mmapping 1.2TB of shared memory and forking dozens or hundreds of times, we see execution times increase by a factor of 4. The reproducer is at the end of this email.
Comparing a kernel without this patch to one with this patch applied, we see the following execution times when spawning 1000 children:
Patched kernel:

$ time make stress
...
real    0m11.275s
user    0m0.177s
sys     0m23.905s

Original kernel:

$ time make stress
...
real    0m2.475s
user    0m1.398s
sys     0m2.501s
The patch in question: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
My observation/assumption is:
- each child touches 100 random pages and then exits
- on each exit, `huge_pmd_unshare()` is called
- each call to `huge_pmd_unshare()` synchronizes all CPUs using `tlb_remove_table_sync_one()`, leading to the regression
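For context, our reading of the synchronization primitive (a simplified sketch paraphrased from mm/mmu_gather.c, not an exact quote) is that each call issues a synchronous broadcast IPI to all other CPUs and only returns once every CPU has taken the interrupt. Issued once per unshared PMD per exiting child, this would add up quickly on 196 cores:

/* Simplified sketch of the primitive we believe is hot. The callback is
 * intentionally empty: delivering the IPI is the point, since it forces
 * every CPU out of any GUP-fast walk (which runs with IRQs disabled)
 * before the shared page-table page can be freed. */
static void tlb_remove_table_smp_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

void tlb_remove_table_sync_one(void)
{
	/* Synchronous broadcast to all other CPUs. */
	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
}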
I'm happy to provide more information.
Thank you,

Stanislav Uschakow
=== Reproducer ===
Setup:
#!/bin/bash
echo "Setting up hugepages for reproduction..."

# hugepages (1.2TB / 2MB = 614400 pages)
REQUIRED_PAGES=614400

# Check current hugepage allocation
CURRENT_PAGES=$(cat /proc/sys/vm/nr_hugepages)
echo "Current hugepages: $CURRENT_PAGES"

if [ "$CURRENT_PAGES" -lt "$REQUIRED_PAGES" ]; then
    echo "Allocating $REQUIRED_PAGES hugepages..."
    echo $REQUIRED_PAGES | sudo tee /proc/sys/vm/nr_hugepages

    ALLOCATED=$(cat /proc/sys/vm/nr_hugepages)
    echo "Allocated hugepages: $ALLOCATED"
    if [ "$ALLOCATED" -lt "$REQUIRED_PAGES" ]; then
        echo "Warning: Could not allocate all required hugepages"
        echo "Available: $ALLOCATED, Required: $REQUIRED_PAGES"
    fi
fi

echo never | sudo tee /sys/kernel/mm/transparent_hugepage/enabled

echo -e "\nHugepage information:"
cat /proc/meminfo | grep -i huge

echo -e "\nSetup complete. You can now run the reproduction test."
Makefile:
CXX = gcc
CXXFLAGS = -O2 -Wall
TARGET = hugepage_repro
SOURCE = hugepage_repro.c

$(TARGET): $(SOURCE)
	$(CXX) $(CXXFLAGS) -o $(TARGET) $(SOURCE)

clean:
	rm -f $(TARGET)

setup:
	chmod +x setup_hugepages.sh
	./setup_hugepages.sh

test: $(TARGET)
	./$(TARGET) 20 3

stress: $(TARGET)
	./$(TARGET) 1000 1

.PHONY: clean setup test stress
hugepage_repro.c:
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <stdio.h>

#define HUGEPAGE_SIZE (2 * 1024 * 1024)              // 2MB
#define TOTAL_SIZE    (1200ULL * 1024 * 1024 * 1024) // 1.2TB
#define NUM_HUGEPAGES (TOTAL_SIZE / HUGEPAGE_SIZE)

void* create_hugepage_mapping() {
    void* addr = mmap(NULL, TOTAL_SIZE, PROT_READ | PROT_WRITE,
                      MAP_SHARED | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
    if (addr == MAP_FAILED) {
        perror("mmap hugepages failed");
        exit(1);
    }
    return addr;
}

void touch_random_pages(void* addr, int num_touches) {
    char* base = (char*)addr;
    for (int i = 0; i < num_touches; ++i) {
        size_t offset = (rand() % NUM_HUGEPAGES) * HUGEPAGE_SIZE;
        volatile char val = base[offset];
        (void)val;
    }
}

void child_process(void* shared_mem, int child_id) {
    struct timespec start, end;
    clock_gettime(CLOCK_MONOTONIC, &start);
    touch_random_pages(shared_mem, 100);
    clock_gettime(CLOCK_MONOTONIC, &end);
    long duration = (end.tv_sec - start.tv_sec) * 1000000 +
                    (end.tv_nsec - start.tv_nsec) / 1000;
    printf("Child %d completed in %ld μs\n", child_id, duration);
}

int main(int argc, char* argv[]) {
    int num_processes = argc > 1 ? atoi(argv[1]) : 50;
    int iterations = argc > 2 ? atoi(argv[2]) : 5;

    printf("Creating %lluGB hugepage mapping...\n",
           TOTAL_SIZE / (1024 * 1024 * 1024));
    void* shared_mem = create_hugepage_mapping();

    for (int iter = 0; iter < iterations; ++iter) {
        printf("\nIteration %d: Forking %d processes\n", iter + 1, num_processes);
        pid_t children[num_processes];
        struct timespec iter_start, iter_end;
        clock_gettime(CLOCK_MONOTONIC, &iter_start);

        for (int i = 0; i < num_processes; ++i) {
            pid_t pid = fork();
            if (pid == 0) {
                child_process(shared_mem, i);
                exit(0);
            } else if (pid > 0) {
                children[i] = pid;
            }
        }

        for (int i = 0; i < num_processes; ++i) {
            waitpid(children[i], NULL, 0);
        }

        clock_gettime(CLOCK_MONOTONIC, &iter_end);
        long iter_duration = (iter_end.tv_sec - iter_start.tv_sec) * 1000 +
                             (iter_end.tv_nsec - iter_start.tv_nsec) / 1000000;
        printf("Iteration completed in %ld ms\n", iter_duration);
    }

    munmap(shared_mem, TOTAL_SIZE);
    return 0;
}
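One control experiment that could support or refute the assumption above: hugetlb PMD sharing only applies to shared mappings, so a MAP_PRIVATE variant should never reach `huge_pmd_unshare()` on child exit. A minimal, hypothetical variant of create_hugepage_mapping() for such a run (assuming the hugepage pool can absorb the extra per-child faults):

/* Hypothetical control: with MAP_PRIVATE instead of MAP_SHARED, hugetlb
 * PMD sharing does not apply, so child exit should not call
 * huge_pmd_unshare(). If the regression disappears here, that points at
 * the tlb_remove_table_sync_one() synchronization as the cause. */
void* create_private_mapping(void) {
    void* addr = mmap(NULL, TOTAL_SIZE, PROT_READ | PROT_WRITE,
                      MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
    if (addr == MAP_FAILED) {
        perror("mmap private hugepages failed");
        exit(1);
    }
    return addr;
}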