Add tests for process_madvise(), focusing on verifying behavior under various conditions including valid usage and error cases.
Signed-off-by: wang lian lianux.mm@gmail.com Suggested-by: Lorenzo Stoakes lorenzo.stoakes@oracle.com Suggested-by: David Hildenbrand david@redhat.com Suggested-by: Zi Yan ziy@nvidia.com Suggested-by: Mark Brown broonie@kernel.org Acked-by: SeongJae Park sj@kernel.org
--- Changelog v5: - Refactor the remote_collapse test to concentrate on its primary goal confirming the successful remote invocation of process_madvise() on a child process. - Split the validation logic for invalid pidfds out of the remote test and into two new (`exited_process_pidfd` and `bad_pidfd`). - Based mm-new branch, can ensure clean application
Changelog v4: https://lore.kernel.org/lkml/20250710112249.58722-1-lianux.mm@gmail.com/ - Refine resource cleanup logic in test teardown to be more robust. - Improve remote_collapse test to correctly handle different THP (Transparent Huge Page) policies ('always', 'madvise', 'never'), including handling race conditions with khugepaged. - Resolve build errors
Changelog v3: https://lore.kernel.org/lkml/20250703044326.65061-1-lianux.mm@gmail.com/ - Rebased onto the latest mm-stable branch to ensure clean application. - Refactor common signal handling logic into vm_util to reduce code duplication. - Improve test robustness and diagnostics based on community feedback. - Address minor code style and script corrections.
Changelog v2: https://lore.kernel.org/lkml/20250630140957.4000-1-lianux.mm@gmail.com/ - Drop MADV_DONTNEED tests based on feedback. - Focus solely on process_madvise() syscall. - Improve error handling and structure. - Add future-proof flag test. - Style and comment cleanups.
-V1: https://lore.kernel.org/lkml/20250621133003.4733-1-lianux.mm@gmail.com/
tools/testing/selftests/mm/.gitignore | 1 + tools/testing/selftests/mm/Makefile | 1 + tools/testing/selftests/mm/process_madv.c | 304 ++++++++++++++++++++++ tools/testing/selftests/mm/run_vmtests.sh | 5 + 4 files changed, 311 insertions(+) create mode 100644 tools/testing/selftests/mm/process_madv.c
diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore index f2dafa0b700b..e7b23a8a05fe 100644 --- a/tools/testing/selftests/mm/.gitignore +++ b/tools/testing/selftests/mm/.gitignore @@ -21,6 +21,7 @@ on-fault-limit transhuge-stress pagemap_ioctl pfnmap +process_madv *.tmp* protection_keys protection_keys_32 diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index ae6f994d3add..d13b3cef2a2b 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -85,6 +85,7 @@ TEST_GEN_FILES += mseal_test TEST_GEN_FILES += on-fault-limit TEST_GEN_FILES += pagemap_ioctl TEST_GEN_FILES += pfnmap +TEST_GEN_FILES += process_madv TEST_GEN_FILES += thuge-gen TEST_GEN_FILES += transhuge-stress TEST_GEN_FILES += uffd-stress diff --git a/tools/testing/selftests/mm/process_madv.c b/tools/testing/selftests/mm/process_madv.c new file mode 100644 index 000000000000..249e2ed8dfe9 --- /dev/null +++ b/tools/testing/selftests/mm/process_madv.c @@ -0,0 +1,304 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#define _GNU_SOURCE +#include "../kselftest_harness.h" +#include <errno.h> +#include <setjmp.h> +#include <signal.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <linux/mman.h> +#include <sys/syscall.h> +#include <unistd.h> +#include <sched.h> +#include "vm_util.h" + +#include "../pidfd/pidfd.h" + +FIXTURE(process_madvise) +{ + unsigned long page_size; + pid_t child_pid; + int pidfd; +}; + +FIXTURE_SETUP(process_madvise) +{ + self->page_size = (unsigned long)sysconf(_SC_PAGESIZE); + self->pidfd = PIDFD_SELF; + self->child_pid = -1; +}; + +FIXTURE_TEARDOWN_PARENT(process_madvise) +{ + if (self->child_pid > 0) { + kill(self->child_pid, SIGKILL); + waitpid(self->child_pid, NULL, 0); + } +} + +static ssize_t sys_process_madvise(int pidfd, const struct iovec *iovec, + size_t vlen, int advice, unsigned int flags) +{ + return syscall(__NR_process_madvise, pidfd, iovec, vlen, advice, flags); +} + +/* + * This test uses PIDFD_SELF to target the current process. The main + * goal is to verify the basic behavior of process_madvise() with + * a vector of non-contiguous memory ranges, not its cross-process + * capabilities. + */ +TEST_F(process_madvise, basic) +{ + const unsigned long pagesize = self->page_size; + const int madvise_pages = 4; + struct iovec vec[madvise_pages]; + int pidfd = self->pidfd; + ssize_t ret; + char *map; + + /* + * Create a single large mapping. We will pick pages from this + * mapping to advise on. This ensures we test non-contiguous iovecs. + */ + map = mmap(NULL, pagesize * 10, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (map == MAP_FAILED) + SKIP(return, "mmap failed, not enough memory.\n"); + + /* Fill the entire region with a known pattern. */ + memset(map, 'A', pagesize * 10); + + /* + * Setup the iovec to point to 4 non-contiguous pages + * within the mapping. + */ + vec[0].iov_base = &map[0 * pagesize]; + vec[0].iov_len = pagesize; + vec[1].iov_base = &map[3 * pagesize]; + vec[1].iov_len = pagesize; + vec[2].iov_base = &map[5 * pagesize]; + vec[2].iov_len = pagesize; + vec[3].iov_base = &map[8 * pagesize]; + vec[3].iov_len = pagesize; + + ret = sys_process_madvise(pidfd, vec, madvise_pages, MADV_DONTNEED, 0); + if (ret == -1 && errno == EPERM) + SKIP(return, + "process_madvise() unsupported or permission denied, try running as root.\n"); + else if (errno == EINVAL) + SKIP(return, + "process_madvise() unsupported or parameter invalid, please check arguments.\n"); + + /* The call should succeed and report the total bytes processed. */ + ASSERT_EQ(ret, madvise_pages * pagesize); + + /* Check that advised pages are now zero. */ + for (int i = 0; i < madvise_pages; i++) { + char *advised_page = (char *)vec[i].iov_base; + + /* Content must be 0, not 'A'. */ + ASSERT_EQ(*advised_page, '\0'); + } + + /* Check that an un-advised page in between is still 'A'. */ + char *unadvised_page = &map[1 * pagesize]; + + for (int i = 0; i < pagesize; i++) + ASSERT_EQ(unadvised_page[i], 'A'); + + /* Cleanup. */ + ASSERT_EQ(munmap(map, pagesize * 10), 0); +} + +/* + * This test deterministically validates process_madvise() with MADV_COLLAPSE + * on a remote process, other advices are difficult to verify reliably. + * + * The test verifies that a memory region in a child process, + * focus on process_madv remote result, only check addresses and lengths. + * The correctness of the MADV_COLLAPSE can be found in the relevant test examples in khugepaged. + */ +TEST_F(process_madvise, remote_collapse) +{ + const unsigned long pagesize = self->page_size; + int pidfd; + long huge_page_size; + int pipe_info[2]; + ssize_t ret; + struct iovec vec; + + struct child_info { + pid_t pid; + void *map_addr; + } info; + + huge_page_size = default_huge_page_size(); + if (huge_page_size <= 0) + SKIP(return, "Could not determine a valid huge page size.\n"); + + ASSERT_EQ(pipe(pipe_info), 0); + + self->child_pid = fork(); + ASSERT_NE(self->child_pid, -1); + + if (self->child_pid == 0) { + char *map; + size_t map_size = 2 * huge_page_size; + + close(pipe_info[0]); + + map = mmap(NULL, map_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(map, MAP_FAILED); + + /* Fault in as small pages */ + for (size_t i = 0; i < map_size; i += pagesize) + map[i] = 'A'; + + /* Send info and pause */ + info.pid = getpid(); + info.map_addr = map; + ret = write(pipe_info[1], &info, sizeof(info)); + ASSERT_EQ(ret, sizeof(info)); + close(pipe_info[1]); + + pause(); + exit(0); + } + + close(pipe_info[1]); + + /* Receive child info */ + ret = read(pipe_info[0], &info, sizeof(info)); + if (ret <= 0) { + waitpid(self->child_pid, NULL, 0); + SKIP(return, "Failed to read child info from pipe.\n"); + } + ASSERT_EQ(ret, sizeof(info)); + close(pipe_info[0]); + self->child_pid = info.pid; + + pidfd = syscall(__NR_pidfd_open, self->child_pid, 0); + ASSERT_GE(pidfd, 0); + + vec.iov_base = info.map_addr; + vec.iov_len = huge_page_size; + + ret = sys_process_madvise(pidfd, &vec, 1, MADV_COLLAPSE, 0); + if (ret == -1) { + if (errno == EINVAL) + SKIP(return, "PROCESS_MADV_ADVISE is not supported.\n"); + else if (errno == EPERM) + SKIP(return, + "No process_madvise() permissions, try running as root.\n"); + goto cleanup; + } + + ASSERT_EQ(ret, huge_page_size); + +cleanup: + /* Cleanup */ + kill(self->child_pid, SIGKILL); + waitpid(self->child_pid, NULL, 0); + if (pidfd >= 0) + close(pidfd); +} + +/* + * Test process_madvise() with a pidfd for a process that has already + * exited to ensure correct error handling. + */ +TEST_F(process_madvise, exited_process_pidfd) +{ + struct iovec vec; + ssize_t ret; + int pidfd; + + vec.iov_base = (void *)0x1234; + vec.iov_len = 4096; + + /* + * Using a pidfd for a process that has already exited should fail + * with ESRCH. + */ + self->child_pid = fork(); + ASSERT_NE(self->child_pid, -1); + + if (self->child_pid == 0) + exit(0); + + pidfd = syscall(__NR_pidfd_open, self->child_pid, 0); + ASSERT_GE(pidfd, 0); + + /* Wait for the child to ensure it has terminated. */ + waitpid(self->child_pid, NULL, 0); + + ret = sys_process_madvise(pidfd, &vec, 1, MADV_DONTNEED, 0); + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, ESRCH); + close(pidfd); +} + +/* + * Test process_madvise() with bad pidfds to ensure correct error + * handling. + */ +TEST_F(process_madvise, bad_pidfd) +{ + struct iovec vec; + ssize_t ret; + + vec.iov_base = (void *)0x1234; + vec.iov_len = 4096; + + /* Using an invalid fd number (-1) should fail with EBADF. */ + ret = sys_process_madvise(-1, &vec, 1, MADV_DONTNEED, 0); + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, EBADF); + + /* + * Using a valid fd that is not a pidfd (e.g. stdin) should fail + * with EBADF. + */ + ret = sys_process_madvise(STDIN_FILENO, &vec, 1, MADV_DONTNEED, 0); + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, EBADF); +} + +/* + * Test process_madvise() with an invalid flag value. Currently, only a flag + * value of 0 is supported. This test is reserved for the future, e.g., if + * synchronous flags are added. + */ +TEST_F(process_madvise, flag) +{ + const unsigned long pagesize = self->page_size; + unsigned int invalid_flag; + int pidfd = self->pidfd; + struct iovec vec; + char *map; + ssize_t ret; + + map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, + 0); + if (map == MAP_FAILED) + SKIP(return, "mmap failed, not enough memory.\n"); + + vec.iov_base = map; + vec.iov_len = pagesize; + + invalid_flag = 0x80000000; + + ret = sys_process_madvise(pidfd, &vec, 1, MADV_DONTNEED, invalid_flag); + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, EINVAL); + + /* Cleanup. */ + ASSERT_EQ(munmap(map, pagesize), 0); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index a38c984103ce..471e539d82b8 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -65,6 +65,8 @@ separated by spaces: test pagemap_scan IOCTL - pfnmap tests for VM_PFNMAP handling +- process_madv + test for process_madv - cow test copy-on-write semantics - thp @@ -425,6 +427,9 @@ CATEGORY="madv_guard" run_test ./guard-regions # MADV_POPULATE_READ and MADV_POPULATE_WRITE tests CATEGORY="madv_populate" run_test ./madv_populate
+# PROCESS_MADV test +CATEGORY="process_madv" run_test ./process_madv + CATEGORY="vma_merge" run_test ./merge
if [ -x ./memfd_secret ]