Re: [PATCH v6] selftests/mm: add process_madvise() tests

23 Jul 2025

On 21 Jul 2025, at 7:46, wang lian wrote:
...
Add tests for process_madvise(), focusing on verifying behavior under
various conditions including valid usage and error cases.
Signed-off-by: wang lian lianux.mm@gmail.com
Suggested-by: Lorenzo Stoakes lorenzo.stoakes@oracle.com
Suggested-by: David Hildenbrand david@redhat.com
Suggested-by: Zi Yan ziy@nvidia.com
Suggested-by: Mark Brown broonie@kernel.org
Acked-by: SeongJae Park sj@kernel.org

Changelog v6:

Refactor child process and pidfd management to use the kselftest
fixture's setup and teardown mechanism. This ensures that child
processes are reliably terminated and file descriptors are closed, even
when a test is aborted by an ASSERT or SKIP macro. This resolves the
issue where a failed assertion could lead to a leaked child process.

Changelog v5: https://lore.kernel.org/lkml/20250714122533.3135-1-lianux.mm@gmail.com/

Refactor the remote_collapse test to concentrate on its primary goal
confirming the successful remote invocation of process_madvise() on a child process.
Split the validation logic for invalid pidfds out of the remote test and into two new
(`exited_process_pidfd` and `bad_pidfd`).
Based mm-new branch, can ensure clean application

Changelog v4: https://lore.kernel.org/lkml/20250710112249.58722-1-lianux.mm@gmail.com/

Refine resource cleanup logic in test teardown to be more robust.
Improve remote_collapse test to correctly handle different THP
(Transparent Huge Page) policies ('always', 'madvise', 'never'),
including handling race conditions with khugepaged.
Resolve build errors

Changelog v3: https://lore.kernel.org/lkml/20250703044326.65061-1-lianux.mm@gmail.com/

Rebased onto the latest mm-stable branch to ensure clean application.
Refactor common signal handling logic into vm_util to reduce code duplication.
Improve test robustness and diagnostics based on community feedback.
Address minor code style and script corrections.

Changelog v2: https://lore.kernel.org/lkml/20250630140957.4000-1-lianux.mm@gmail.com/

Drop MADV_DONTNEED tests based on feedback.
Focus solely on process_madvise() syscall.
Improve error handling and structure.
Add future-proof flag test.
Style and comment cleanups.

-V1: https://lore.kernel.org/lkml/20250621133003.4733-1-lianux.mm@gmail.com/
 tools/testing/selftests/mm/.gitignore     |   1 +
 tools/testing/selftests/mm/Makefile       |   1 +
 tools/testing/selftests/mm/process_madv.c | 302 ++++++++++++++++++++++
 tools/testing/selftests/mm/run_vmtests.sh |   5 +
 4 files changed, 309 insertions(+)
 create mode 100644 tools/testing/selftests/mm/process_madv.c

diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore
index f2dafa0b700b..e7b23a8a05fe
--- a/tools/testing/selftests/mm/.gitignore
+++ b/tools/testing/selftests/mm/.gitignore
@@ -21,6 +21,7 @@ on-fault-limit
 transhuge-stress
 pagemap_ioctl
 pfnmap
+process_madv
 *.tmp*
 protection_keys
 protection_keys_32
diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
index ae6f994d3add..d13b3cef2a2b 100644
--- a/tools/testing/selftests/mm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -85,6 +85,7 @@ TEST_GEN_FILES += mseal_test
 TEST_GEN_FILES += on-fault-limit
 TEST_GEN_FILES += pagemap_ioctl
 TEST_GEN_FILES += pfnmap
+TEST_GEN_FILES += process_madv
 TEST_GEN_FILES += thuge-gen
 TEST_GEN_FILES += transhuge-stress
 TEST_GEN_FILES += uffd-stress
diff --git a/tools/testing/selftests/mm/process_madv.c b/tools/testing/selftests/mm/process_madv.c
new file mode 100644
index 000000000000..8a83eac3bfab
--- /dev/null
+++ b/tools/testing/selftests/mm/process_madv.c
@@ -0,0 +1,302 @@
+// SPDX-License-Identifier: GPL-2.0-or-later



+#define _GNU_SOURCE
+#include "../kselftest_harness.h"
+#include <errno.h>
+#include <setjmp.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <linux/mman.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <sched.h>
+#include "vm_util.h"



+#include "../pidfd/pidfd.h"



+FIXTURE(process_madvise)
+{

unsigned long page_size;
pid_t child_pid;
int remote_pidfd;
int pidfd;

+};



+FIXTURE_SETUP(process_madvise)
+{

self->page_size = (unsigned long)sysconf(_SC_PAGESIZE);
self->pidfd = PIDFD_SELF;
self->remote_pidfd = -1;
self->child_pid = -1;

+};



+FIXTURE_TEARDOWN_PARENT(process_madvise)
+{

/* This teardown is guaranteed to run, even if tests SKIP or ASSERT */
if (self->child_pid > 0) {
kill(self->child_pid, SIGKILL);


waitpid(self->child_pid, NULL, 0);


}

if (self->remote_pidfd >= 0)
close(self->remote_pidfd);



+}



+static ssize_t sys_process_madvise(int pidfd, const struct iovec *iovec,

		   size_t vlen, int advice, unsigned int flags)



+{

return syscall(__NR_process_madvise, pidfd, iovec, vlen, advice, flags);

+}



+/*


This test uses PIDFD_SELF to target the current process. The main



goal is to verify the basic behavior of process_madvise() with



a vector of non-contiguous memory ranges, not its cross-process



capabilities.


*/

+TEST_F(process_madvise, basic)
+{

const unsigned long pagesize = self->page_size;
const int madvise_pages = 4;
struct iovec vec[madvise_pages];
int pidfd = self->pidfd;
ssize_t ret;
char *map;

/*
* Create a single large mapping. We will pick pages from this


* mapping to advise on. This ensures we test non-contiguous iovecs.


*/


map = mmap(NULL, pagesize * 10, PROT_READ | PROT_WRITE,
   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);


if (map == MAP_FAILED)
SKIP(return, "mmap failed, not enough memory.\n");



/* Fill the entire region with a known pattern. */
memset(map, 'A', pagesize * 10);

/*
* Setup the iovec to point to 4 non-contiguous pages


* within the mapping.


*/


vec[0].iov_base = &map[0 * pagesize];
vec[0].iov_len = pagesize;
vec[1].iov_base = &map[3 * pagesize];
vec[1].iov_len = pagesize;
vec[2].iov_base = &map[5 * pagesize];
vec[2].iov_len = pagesize;
vec[3].iov_base = &map[8 * pagesize];
vec[3].iov_len = pagesize;

ret = sys_process_madvise(pidfd, vec, madvise_pages, MADV_DONTNEED, 0);
if (ret == -1 && errno == EPERM)
SKIP(return,


	   "process_madvise() unsupported or permission denied, try running as root.\n");


else if (errno == EINVAL)
SKIP(return,


	   "process_madvise() unsupported or parameter invalid, please check arguments.\n");



/* The call should succeed and report the total bytes processed. */
ASSERT_EQ(ret, madvise_pages * pagesize);

/* Check that advised pages are now zero. */
for (int i = 0; i < madvise_pages; i++) {
char *advised_page = (char *)vec[i].iov_base;



/* Content must be 0, not 'A'. */


ASSERT_EQ(*advised_page, '\0');


}

/* Check that an un-advised page in between is still 'A'. */
char *unadvised_page = &map[1 * pagesize];

for (int i = 0; i < pagesize; i++)
ASSERT_EQ(unadvised_page[i], 'A');



/* Cleanup. */
ASSERT_EQ(munmap(map, pagesize * 10), 0);

+}



+/*


This test deterministically validates process_madvise() with MADV_COLLAPSE



on a remote process, other advices are difficult to verify reliably.







The test verifies that a memory region in a child process,



focus on process_madv remote result, only check addresses and lengths.



The correctness of the MADV_COLLAPSE can be found in the relevant test examples in khugepaged.


*/

+TEST_F(process_madvise, remote_collapse)
+{

const unsigned long pagesize = self->page_size;
long huge_page_size;
int pipe_info[2];
ssize_t ret;
struct iovec vec;

struct child_info {
pid_t pid;


void *map_addr;


} info;

huge_page_size = default_huge_page_size();

You should use read_pmd_pagesize() here, since default_huge_page_size()
is used for hugetlb. See Documentation/admin-guide/mm/hugetlbpage.rst.
MADV_COLLAPSE is for THP. The test passes if HugeTLB size is 2M and
fails otherwise.
...

if (huge_page_size <= 0)
SKIP(return, "Could not determine a valid huge page size.\n");



ASSERT_EQ(pipe(pipe_info), 0);

self->child_pid = fork();
ASSERT_NE(self->child_pid, -1);

if (self->child_pid == 0) {
char *map;


size_t map_size = 2 * huge_page_size;



close(pipe_info[0]);



map = mmap(NULL, map_size, PROT_READ | PROT_WRITE,


	   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);


ASSERT_NE(map, MAP_FAILED);



/* Fault in as small pages */


for (size_t i = 0; i < map_size; i += pagesize)


	map[i] = 'A';



/* Send info and pause */


info.pid = getpid();


info.map_addr = map;


ret = write(pipe_info[1], &info, sizeof(info));


ASSERT_EQ(ret, sizeof(info));


close(pipe_info[1]);



pause();


exit(0);


}

close(pipe_info[1]);

/* Receive child info */
ret = read(pipe_info[0], &info, sizeof(info));
if (ret <= 0) {
waitpid(self->child_pid, NULL, 0);


SKIP(return, "Failed to read child info from pipe.\n");


}
ASSERT_EQ(ret, sizeof(info));
close(pipe_info[0]);
self->child_pid = info.pid;

self->remote_pidfd = syscall(__NR_pidfd_open, self->child_pid, 0);
ASSERT_GE(self->remote_pidfd, 0);

vec.iov_base = info.map_addr;
vec.iov_len = huge_page_size;

ret = sys_process_madvise(self->remote_pidfd, &vec, 1, MADV_COLLAPSE, 0);
if (ret == -1) {
if (errno == EINVAL)


	SKIP(return, "PROCESS_MADV_ADVISE is not supported.\n");


else if (errno == EPERM)


	SKIP(return,


		   "No process_madvise() permissions, try running as root.\n");


return;


}

ASSERT_EQ(ret, huge_page_size);

+}



+/*


Test process_madvise() with a pidfd for a process that has already



exited to ensure correct error handling.


*/

+TEST_F(process_madvise, exited_process_pidfd)
+{

struct iovec vec;
ssize_t ret;
int pidfd;

vec.iov_base = (void *)0x1234;
vec.iov_len = 4096;

Here an invalid address range is provided, since pid is checked before
address ranges are checked.
BTW, the size of iovec array cannot be bigger than IOV_MAX. It might be
worth testing as well, if you want to.
With default_huge_page_size() -> read_pmd_pagesize() fix, feel free to
add:
Reviewed-by: Zi Yan ziy@nvidia.com
Tested-by: Zi Yan ziy@nvidia.com
I am able to compile and run the test on arm64. Thanks.
Best Regards,
Yan, Zi

    

2025

2024

2023

2022

2021

2020

2019

2018

2017

Re: [PATCH v6] selftests/mm: add process_madvise() tests