The first patch is a fix with an explanation of the issue, you should read that first. The second patch adds a comment to document the rules because figuring this out from scratch causes brain pain.
Accidentally hitting this issue and getting negative consequences from it would require several stars to line up just right; but if someone out there is using a malloc() implementation that uses lockless data structures across threads or such, this could actually be a problem.
In case someone wants a testcase, here's a very artificial one:
``` #include <pthread.h> #include <err.h> #include <stdio.h> #include <unistd.h> #include <sys/syscall.h> #include <sys/uio.h> #include <sys/mman.h> #include <sys/wait.h> #include <linux/io_uring.h>
#define SYSCHK(x) ({ \ typeof(x) __res = (x); \ if (__res == (typeof(x))-1) \ err(1, "SYSCHK(" #x ")"); \ __res; \ })
#define NUM_SQ_PAGES 4 static int uring_init(struct io_uring_sqe **sqesp, void **cqesp) { struct io_uring_sqe *sqes = SYSCHK(mmap(NULL, NUM_SQ_PAGES*0x1000, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0)); void *cqes = SYSCHK(mmap(NULL, NUM_SQ_PAGES*0x1000, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0)); *(volatile unsigned int *)(cqes+4) = 64 * NUM_SQ_PAGES; struct io_uring_params params = { .flags = IORING_SETUP_NO_MMAP|IORING_SETUP_NO_SQARRAY, .sq_off = { .user_addr = (unsigned long)sqes }, .cq_off = { .user_addr = (unsigned long)cqes } }; int uring_fd = SYSCHK(syscall(__NR_io_uring_setup, /*entries=*/10, ¶ms)); if (sqesp) *sqesp = sqes; if (cqesp) *cqesp = cqes; return uring_fd; }
static char *bufmem[0x3000] __attribute__((aligned(0x1000)));
static void *thread_fn(void *dummy) { unsigned long i = 0; while (1) { *(volatile unsigned long *)(bufmem + 0x0000) = i; *(volatile unsigned long *)(bufmem + 0x0f00) = i; *(volatile unsigned long *)(bufmem + 0x1000) = i; *(volatile unsigned long *)(bufmem + 0x1f00) = i; *(volatile unsigned long *)(bufmem + 0x2000) = i; *(volatile unsigned long *)(bufmem + 0x2f00) = i; i++; } }
int main(void) { #if 1 int uring_fd = uring_init(NULL, NULL); struct iovec reg_iov = { .iov_base = bufmem, .iov_len = 0x2000 }; SYSCHK(syscall(__NR_io_uring_register, uring_fd, IORING_REGISTER_BUFFERS, ®_iov, 1)); #endif
pthread_t thread; if (pthread_create(&thread, NULL, thread_fn, NULL)) errx(1, "pthread_create");
sleep(1); int child = SYSCHK(fork()); if (child == 0) { printf("bufmem values:\n"); printf(" 0x0000: 0x%lx\n", *(volatile unsigned long *)(bufmem + 0x0000)); printf(" 0x0f00: 0x%lx\n", *(volatile unsigned long *)(bufmem + 0x0f00)); printf(" 0x1000: 0x%lx\n", *(volatile unsigned long *)(bufmem + 0x1000)); printf(" 0x1f00: 0x%lx\n", *(volatile unsigned long *)(bufmem + 0x1f00)); printf(" 0x2000: 0x%lx\n", *(volatile unsigned long *)(bufmem + 0x2000)); printf(" 0x2f00: 0x%lx\n", *(volatile unsigned long *)(bufmem + 0x2f00)); return 0; } int wstatus; SYSCHK(wait(&wstatus)); return 0; } ```
Without this series, the child will usually print results that are apart by more than 1, which is not a state that ever occurred in the parent; in my opinion, that counts as a bug. If you change the "#if 1" to "#if 0", the bug won't manifest.
Signed-off-by: Jann Horn jannh@google.com --- Jann Horn (2): mm/memory: ensure fork child sees coherent memory snapshot mm/memory: Document how we make a coherent memory snapshot
kernel/fork.c | 34 ++++++++++++++++++++++++++++++++++ mm/memory.c | 18 ++++++++++++++++++ 2 files changed, 52 insertions(+) --- base-commit: 8477ab143069c6b05d6da4a8184ded8b969240f5 change-id: 20250530-fork-tearing-71da211a50cf