Bug: Performance regression in 1013af4f585f: mm/hugetlb: fix huge_pmd_unshare() vs GUP-fast race - Linux-stable-mirror

29 Aug 2025

      Hello.
We have observed a huge latency increase using `fork()` after ingesting the CVE-2025-38085 fix which leads to the commit `1013af4f585f: mm/hugetlb: fix huge_pmd_unshare() vs GUP-fast race`. On large machines with 1.5TB of memory with 196 cores, we identified mmapping of 1.2TB of shared memory and forking itself dozens or hundreds of times we see a increase of execution times of a factor of 4. The reproducer is at the end of the email.
Comparing the a kernel without this patch with a kernel with this patch applied when spawning 1000 children we see those execution times:
Patched kernel: 
$ time make stress
...
real    0m11.275s
user    0m0.177s
sys     0m23.905s
Original kernel :
$ time make stress
...real    0m2.475s
user    0m1.398s
sys     0m2.501s
The patch in question: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
My observation/assumption is:
each child touches 100 random pages and despawns
on each despawn `huge_pmd_unshare()` is called
each call to `huge_pmd_unshare()` syncrhonizes all threads using `tlb_remove_table_sync_one()` leading to the regression
I'm happy to provide more information.
Thank you
Stanislav Uschakow
=== Reproducer ===
Setup:
#!/bin/bash
echo "Setting up hugepages for reproduction..."
# hugepages (1.2TB / 2MB = 614400 pages)
REQUIRED_PAGES=614400
# Check current hugepage allocation
CURRENT_PAGES=$(cat /proc/sys/vm/nr_hugepages)
echo "Current hugepages: $CURRENT_PAGES"
if [ "$CURRENT_PAGES" -lt "$REQUIRED_PAGES" ]; then
    echo "Allocating $REQUIRED_PAGES hugepages..."
    echo $REQUIRED_PAGES | sudo tee /proc/sys/vm/nr_hugepages
ALLOCATED=$(cat /proc/sys/vm/nr_hugepages)
    echo "Allocated hugepages: $ALLOCATED"

    if [ "$ALLOCATED" -lt "$REQUIRED_PAGES" ]; then
        echo "Warning: Could not allocate all required hugepages"
        echo "Available: $ALLOCATED, Required: $REQUIRED_PAGES"
    fi
fi
echo never | sudo tee /sys/kernel/mm/transparent_hugepage/enabled
echo -e "\nHugepage information:"
cat /proc/meminfo | grep -i huge
echo -e "\nSetup complete. You can now run the reproduction test."
Makefile:
CXX = gcc
CXXFLAGS = -O2 -Wall
TARGET = hugepage_repro
SOURCE = hugepage_repro.c
$(TARGET): $(SOURCE)
    $(CXX) $(CXXFLAGS) -o $(TARGET) $(SOURCE)
clean:
    rm -f $(TARGET)
setup:
    chmod +x setup_hugepages.sh
    ./setup_hugepages.sh
test: $(TARGET)
    ./$(TARGET) 20 3
stress: $(TARGET)
    ./$(TARGET) 1000 1
.PHONY: clean setup test stress
hugepage_repro.c:
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <stdio.h>
#define HUGEPAGE_SIZE (2 * 1024 * 1024) // 2MB
#define TOTAL_SIZE (1200ULL * 1024 * 1024 * 1024) // 1.2TB
#define NUM_HUGEPAGES (TOTAL_SIZE / HUGEPAGE_SIZE)
void* create_hugepage_mapping() {
    void* addr = mmap(NULL, TOTAL_SIZE, PROT_READ | PROT_WRITE,
                      MAP_SHARED | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
    if (addr == MAP_FAILED) {
        perror("mmap hugepages failed");
        exit(1);
    }
    return addr;
}
void touch_random_pages(void* addr, int num_touches) {
    char* base = (char*)addr;
    for (int i = 0; i < num_touches; ++i) {
        size_t offset = (rand() % NUM_HUGEPAGES) * HUGEPAGE_SIZE;
        volatile char val = base[offset];
        (void)val;
    }
}
void child_process(void* shared_mem, int child_id) {
    struct timespec start, end;
    clock_gettime(CLOCK_MONOTONIC, &start);

    touch_random_pages(shared_mem, 100);

    clock_gettime(CLOCK_MONOTONIC, &end);
    long duration = (end.tv_sec - start.tv_sec) * 1000000 + 
                   (end.tv_nsec - start.tv_nsec) / 1000;

    printf("Child %d completed in %ld μs\n", child_id, duration);
}
int main(int argc, char* argv[]) {
    int num_processes = argc > 1 ? atoi(argv[1]) : 50;
    int iterations = argc > 2 ? atoi(argv[2]) : 5;

    printf("Creating %lluGB hugepage mapping...\n", TOTAL_SIZE / (1024*1024*1024));
    void* shared_mem = create_hugepage_mapping();

    for (int iter = 0; iter < iterations; ++iter) {
        printf("\nIteration %d: Forking %d processes\n", iter + 1, num_processes);

        pid_t children[num_processes];
        struct timespec iter_start, iter_end;
        clock_gettime(CLOCK_MONOTONIC, &iter_start);

        for (int i = 0; i < num_processes; ++i) {
            pid_t pid = fork();
            if (pid == 0) {
                child_process(shared_mem, i);
                exit(0);
            } else if (pid > 0) {
                children[i] = pid;
            }
        }

        for (int i = 0; i < num_processes; ++i) {
            waitpid(children[i], NULL, 0);
        }

        clock_gettime(CLOCK_MONOTONIC, &iter_end);
        long iter_duration = (iter_end.tv_sec - iter_start.tv_sec) * 1000 + 
                            (iter_end.tv_nsec - iter_start.tv_nsec) / 1000000;
        printf("Iteration completed in %ld ms\n", iter_duration);
    }

    munmap(shared_mem, TOTAL_SIZE);
    return 0;
}
Amazon Web Services Development Center Germany GmbH
Tamara-Danz-Str. 13
10243 Berlin
Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss
Eingetragen am Amtsgericht Charlottenburg unter HRB 257764 B
Sitz: Berlin
Ust-ID: DE 365 538 597