With /proc/pid/maps now being read under per-vma lock protection, we can reuse parts of that code to execute the PROCMAP_QUERY ioctl without taking the mmap_lock as well. The change is designed to reduce mmap_lock contention and prevent PROCMAP_QUERY ioctl calls from blocking address space updates.
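For readers who haven't used the interface: PROCMAP_QUERY is issued against an open /proc/<pid>/maps file descriptor and looks up the VMA covering a given address. A minimal userspace sketch of the call pattern (it mirrors the query_addr_at() helper the selftest patch below adds; it is illustrative only and not part of this series):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <linux/fs.h>		/* struct procmap_query, PROCMAP_QUERY */
#include <sys/ioctl.h>

int main(void)
{
	struct procmap_query q;
	int maps_fd = open("/proc/self/maps", O_RDONLY);

	if (maps_fd < 0)
		return 1;

	memset(&q, 0, sizeof(q));
	q.size = sizeof(q);			/* mandatory; lets the kernel check the ABI version */
	q.query_addr = (unsigned long long)&q;	/* find the VMA covering the stack */
	q.query_flags = 0;			/* only match a VMA that covers query_addr */

	if (ioctl(maps_fd, PROCMAP_QUERY, &q) == 0)
		printf("vma: %llx-%llx\n",
		       (unsigned long long)q.vma_start,
		       (unsigned long long)q.vma_end);
	return 0;
}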
This patchset was split out of the original patchset [1] that introduced per-vma lock usage for /proc/pid/maps reading. It contains PROCMAP_QUERY tests, a code refactoring patch that simplifies the main change, and the actual transition to per-vma locks.
Changes since v3 [2]
- change lock_vma_range()/unlock_vma_range() parameters, per Lorenzo Stoakes
- minimize priv->lock_ctx dereferences by storing it in a local variable, per Lorenzo Stoakes
- rename unlock_vma to unlock_ctx_vma, per Lorenzo Stoakes
- factored out reset_lock_ctx(), per Lorenzo Stoakes
- reset lock_ctx->mmap_locked inside query_vma_teardown(), per Lorenzo Stoakes
- add clarifying comments in query_vma_find_by_addr() and procfs_procmap_ioctl(), per Lorenzo Stoakes
- refactored error handling code inside query_vma_find_by_addr(), per Lorenzo Stoakes
- add Acked-by as changes were cosmetic, per SeongJae Park
[1] https://lore.kernel.org/all/20250704060727.724817-1-surenb@google.com/
[2] https://lore.kernel.org/all/20250806155905.824388-1-surenb@google.com/
Suren Baghdasaryan (3):
  selftests/proc: test PROCMAP_QUERY ioctl while vma is concurrently modified
  fs/proc/task_mmu: factor out proc_maps_private fields used by PROCMAP_QUERY
  fs/proc/task_mmu: execute PROCMAP_QUERY ioctl under per-vma locks
 fs/proc/internal.h                            |  15 +-
 fs/proc/task_mmu.c                            | 184 ++++++++++++------
 fs/proc/task_nommu.c                          |  14 +-
 tools/testing/selftests/proc/proc-maps-race.c |  65 +++++++
 4 files changed, 210 insertions(+), 68 deletions(-)
base-commit: c2144e09b922d422346a44d72b674bf61dbd84c0
Extend /proc/pid/maps tearing tests to verify PROCMAP_QUERY ioctl operation correctness while the vma is being concurrently modified.
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Tested-by: SeongJae Park <sj@kernel.org>
Acked-by: SeongJae Park <sj@kernel.org>
---
 tools/testing/selftests/proc/proc-maps-race.c | 65 +++++++++++++++++++
 1 file changed, 65 insertions(+)
diff --git a/tools/testing/selftests/proc/proc-maps-race.c b/tools/testing/selftests/proc/proc-maps-race.c
index 94bba4553130..a546475db550 100644
--- a/tools/testing/selftests/proc/proc-maps-race.c
+++ b/tools/testing/selftests/proc/proc-maps-race.c
@@ -32,6 +32,8 @@
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
+#include <linux/fs.h>
+#include <sys/ioctl.h>
 #include <sys/mman.h>
 #include <sys/stat.h>
 #include <sys/types.h>
@@ -317,6 +319,25 @@ static bool capture_mod_pattern(FIXTURE_DATA(proc_maps_race) *self,
 	       strcmp(restored_first_line->text, self->first_line.text) == 0;
 }

+static bool query_addr_at(int maps_fd, void *addr,
+			  unsigned long *vma_start, unsigned long *vma_end)
+{
+	struct procmap_query q;
+
+	memset(&q, 0, sizeof(q));
+	q.size = sizeof(q);
+	/* Find the VMA at the split address */
+	q.query_addr = (unsigned long long)addr;
+	q.query_flags = 0;
+	if (ioctl(maps_fd, PROCMAP_QUERY, &q))
+		return false;
+
+	*vma_start = q.vma_start;
+	*vma_end = q.vma_end;
+
+	return true;
+}
+
 static inline bool split_vma(FIXTURE_DATA(proc_maps_race) *self)
 {
 	return mmap(self->mod_info->addr, self->page_size, self->mod_info->prot | PROT_EXEC,
@@ -559,6 +580,8 @@ TEST_F(proc_maps_race, test_maps_tearing_from_split)
 	do {
 		bool last_line_changed;
 		bool first_line_changed;
+		unsigned long vma_start;
+		unsigned long vma_end;

 		ASSERT_TRUE(read_boundary_lines(self, &new_last_line, &new_first_line));

@@ -595,6 +618,19 @@ TEST_F(proc_maps_race, test_maps_tearing_from_split)
 		first_line_changed = strcmp(new_first_line.text, self->first_line.text) != 0;
 		ASSERT_EQ(last_line_changed, first_line_changed);

+		/* Check if PROCMAP_QUERY ioclt() finds the right VMA */
+		ASSERT_TRUE(query_addr_at(self->maps_fd, mod_info->addr + self->page_size,
+					  &vma_start, &vma_end));
+		/*
+		 * The vma at the split address can be either the same as
+		 * original one (if read before the split) or the same as the
+		 * first line in the second page (if read after the split).
+		 */
+		ASSERT_TRUE((vma_start == self->last_line.start_addr &&
+			     vma_end == self->last_line.end_addr) ||
+			    (vma_start == split_first_line.start_addr &&
+			     vma_end == split_first_line.end_addr));
+
 		clock_gettime(CLOCK_MONOTONIC_COARSE, &end_ts);
 		end_test_iteration(&end_ts, self->verbose);
 	} while (end_ts.tv_sec - start_ts.tv_sec < self->duration_sec);
@@ -636,6 +672,9 @@ TEST_F(proc_maps_race, test_maps_tearing_from_resize)
 	clock_gettime(CLOCK_MONOTONIC_COARSE, &start_ts);
 	start_test_loop(&start_ts, self->verbose);
 	do {
+		unsigned long vma_start;
+		unsigned long vma_end;
+
 		ASSERT_TRUE(read_boundary_lines(self, &new_last_line, &new_first_line));

 		/* Check if we read vmas after shrinking it */
@@ -662,6 +701,16 @@ TEST_F(proc_maps_race, test_maps_tearing_from_resize)
 						    "Expand result invalid", self));
 		}

+		/* Check if PROCMAP_QUERY ioclt() finds the right VMA */
+		ASSERT_TRUE(query_addr_at(self->maps_fd, mod_info->addr, &vma_start, &vma_end));
+		/*
+		 * The vma should stay at the same address and have either the
+		 * original size of 3 pages or 1 page if read after shrinking.
+		 */
+		ASSERT_TRUE(vma_start == self->last_line.start_addr &&
+			    (vma_end - vma_start == self->page_size * 3 ||
+			     vma_end - vma_start == self->page_size));
+
 		clock_gettime(CLOCK_MONOTONIC_COARSE, &end_ts);
 		end_test_iteration(&end_ts, self->verbose);
 	} while (end_ts.tv_sec - start_ts.tv_sec < self->duration_sec);
@@ -703,6 +752,9 @@ TEST_F(proc_maps_race, test_maps_tearing_from_remap)
 	clock_gettime(CLOCK_MONOTONIC_COARSE, &start_ts);
 	start_test_loop(&start_ts, self->verbose);
 	do {
+		unsigned long vma_start;
+		unsigned long vma_end;
+
 		ASSERT_TRUE(read_boundary_lines(self, &new_last_line, &new_first_line));

 		/* Check if we read vmas after remapping it */
@@ -729,6 +781,19 @@ TEST_F(proc_maps_race, test_maps_tearing_from_remap)
 						   "Remap restore result invalid", self));
 		}

+		/* Check if PROCMAP_QUERY ioclt() finds the right VMA */
+		ASSERT_TRUE(query_addr_at(self->maps_fd, mod_info->addr + self->page_size,
+					  &vma_start, &vma_end));
+		/*
+		 * The vma should either stay at the same address and have the
+		 * original size of 3 pages or we should find the remapped vma
+		 * at the remap destination address with size of 1 page.
+		 */
+		ASSERT_TRUE((vma_start == self->last_line.start_addr &&
+			     vma_end - vma_start == self->page_size * 3) ||
+			    (vma_start == self->last_line.start_addr + self->page_size &&
+			     vma_end - vma_start == self->page_size));
+
 		clock_gettime(CLOCK_MONOTONIC_COARSE, &end_ts);
 		end_test_iteration(&end_ts, self->verbose);
 	} while (end_ts.tv_sec - start_ts.tv_sec < self->duration_sec);
On Fri, Aug 08, 2025 at 08:28:47AM -0700, Suren Baghdasaryan wrote:
Extend /proc/pid/maps tearing tests to verify PROCMAP_QUERY ioctl operation correctness while the vma is being concurrently modified.
General comment, but I really feel like this stuff is mm-specific. Yes it uses proc, but it's using it to check for mm functionality.
I mean I'd love for these to be in the mm self tests but I get obviously why they're in the proc ones...
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Tested-by: SeongJae Park <sj@kernel.org>
Acked-by: SeongJae Park <sj@kernel.org>
The tests themselves look good, had a good look through. But I've given you some nice ASCII diagrams to sprinkle liberally around :)
Anyway for tests themselves:
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
 tools/testing/selftests/proc/proc-maps-race.c | 65 +++++++++++++++++++
 1 file changed, 65 insertions(+)

diff --git a/tools/testing/selftests/proc/proc-maps-race.c b/tools/testing/selftests/proc/proc-maps-race.c
index 94bba4553130..a546475db550 100644
--- a/tools/testing/selftests/proc/proc-maps-race.c
+++ b/tools/testing/selftests/proc/proc-maps-race.c
@@ -32,6 +32,8 @@
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
+#include <linux/fs.h>
+#include <sys/ioctl.h>
 #include <sys/mman.h>
 #include <sys/stat.h>
 #include <sys/types.h>
@@ -317,6 +319,25 @@ static bool capture_mod_pattern(FIXTURE_DATA(proc_maps_race) *self,
 	       strcmp(restored_first_line->text, self->first_line.text) == 0;
 }

+static bool query_addr_at(int maps_fd, void *addr,
+			  unsigned long *vma_start, unsigned long *vma_end)
+{
+	struct procmap_query q;
+
+	memset(&q, 0, sizeof(q));
+	q.size = sizeof(q);
+	/* Find the VMA at the split address */
+	q.query_addr = (unsigned long long)addr;
+	q.query_flags = 0;
+	if (ioctl(maps_fd, PROCMAP_QUERY, &q))
+		return false;
+
+	*vma_start = q.vma_start;
+	*vma_end = q.vma_end;
+
+	return true;
+}
+
 static inline bool split_vma(FIXTURE_DATA(proc_maps_race) *self)
 {
 	return mmap(self->mod_info->addr, self->page_size, self->mod_info->prot | PROT_EXEC,
@@ -559,6 +580,8 @@ TEST_F(proc_maps_race, test_maps_tearing_from_split)
 	do {
 		bool last_line_changed;
 		bool first_line_changed;
+		unsigned long vma_start;
+		unsigned long vma_end;

 		ASSERT_TRUE(read_boundary_lines(self, &new_last_line, &new_first_line));

@@ -595,6 +618,19 @@ TEST_F(proc_maps_race, test_maps_tearing_from_split)
 		first_line_changed = strcmp(new_first_line.text, self->first_line.text) != 0;
 		ASSERT_EQ(last_line_changed, first_line_changed);

+		/* Check if PROCMAP_QUERY ioclt() finds the right VMA */
Typo ioclt -> ioctl.
I think this is a little misleading; we're just testing whether we find a VMA at mod_info->addr + self->page_size.
+		ASSERT_TRUE(query_addr_at(self->maps_fd, mod_info->addr + self->page_size,
+					  &vma_start, &vma_end));
+		/*
+		 * The vma at the split address can be either the same as
+		 * original one (if read before the split) or the same as the
+		 * first line in the second page (if read after the split).
+		 */
+		ASSERT_TRUE((vma_start == self->last_line.start_addr &&
+			     vma_end == self->last_line.end_addr) ||
+			    (vma_start == split_first_line.start_addr &&
+			     vma_end == split_first_line.end_addr));
So I'd make things clearer here with a comment like:
We are mmap()'ing a distinct VMA over the start of a 3 page mapping, which will cause the first page to be unmapped, and we can observe two states:
     read
      |
      v
  |---------|------------------|
  |         |                  |
  |    A    |         B        |
  |         |                  |
  |---------|------------------|

or:

      |
      v
  |----------------------------|
  |                            |
  |              A             |
  |                            |
  |----------------------------|
If we see entries in /proc/$pid/maps it'll be:
7fa86aa15000-7fa86aa16000 rw-p 00000000 00:00 0    (A)
7fa86aa16000-7fa86aa18000 rw-p 00000000 00:00 0    (B)
Or:
7fa86aa15000-7fa86aa18000 rw-p 00000000 00:00 0 (A)
So we assert that the reported range is equivalent to one of these.
Obviously you can mix this in where you feel it makes sense.
 		clock_gettime(CLOCK_MONOTONIC_COARSE, &end_ts);
 		end_test_iteration(&end_ts, self->verbose);
 	} while (end_ts.tv_sec - start_ts.tv_sec < self->duration_sec);

@@ -636,6 +672,9 @@ TEST_F(proc_maps_race, test_maps_tearing_from_resize)
 	clock_gettime(CLOCK_MONOTONIC_COARSE, &start_ts);
 	start_test_loop(&start_ts, self->verbose);
 	do {
+		unsigned long vma_start;
+		unsigned long vma_end;
+
 		ASSERT_TRUE(read_boundary_lines(self, &new_last_line, &new_first_line));
/* Check if we read vmas after shrinking it */
@@ -662,6 +701,16 @@ TEST_F(proc_maps_race, test_maps_tearing_from_resize)
 						    "Expand result invalid", self));
 		}
+		/* Check if PROCMAP_QUERY ioclt() finds the right VMA */
+		ASSERT_TRUE(query_addr_at(self->maps_fd, mod_info->addr, &vma_start, &vma_end));
Same comments as above.
+		/*
+		 * The vma should stay at the same address and have either the
+		 * original size of 3 pages or 1 page if read after shrinking.
+		 */
+		ASSERT_TRUE(vma_start == self->last_line.start_addr &&
+			    (vma_end - vma_start == self->page_size * 3 ||
+			     vma_end - vma_start == self->page_size));
So I'd make things clearer here with a comment like:
We are shrinking and expanding a VMA from 1 page to 3 pages:
     read
      |
      v
  |---------|
  |         |
  |    A    |
  |         |
  |---------|

      |
      v
  |----------------------------|
  |                            |
  |              A             |
  |                            |
  |----------------------------|
If we see entries in /proc/$pid/maps it'll be:
7fa86aa15000-7fa86aa16000 rw-p 00000000 00:00 0 (A)
Or:
7fa86aa15000-7fa86aa18000 rw-p 00000000 00:00 0 (A)
So we assert that the reported range is equivalent to one of these.
 		clock_gettime(CLOCK_MONOTONIC_COARSE, &end_ts);
 		end_test_iteration(&end_ts, self->verbose);
 	} while (end_ts.tv_sec - start_ts.tv_sec < self->duration_sec);

@@ -703,6 +752,9 @@ TEST_F(proc_maps_race, test_maps_tearing_from_remap)
 	clock_gettime(CLOCK_MONOTONIC_COARSE, &start_ts);
 	start_test_loop(&start_ts, self->verbose);
 	do {
+		unsigned long vma_start;
+		unsigned long vma_end;
+
 		ASSERT_TRUE(read_boundary_lines(self, &new_last_line, &new_first_line));
/* Check if we read vmas after remapping it */
@@ -729,6 +781,19 @@ TEST_F(proc_maps_race, test_maps_tearing_from_remap)
 						   "Remap restore result invalid", self));
 		}
+		/* Check if PROCMAP_QUERY ioclt() finds the right VMA */
+		ASSERT_TRUE(query_addr_at(self->maps_fd, mod_info->addr + self->page_size,
+					  &vma_start, &vma_end));
Same comments as above.
+		/*
+		 * The vma should either stay at the same address and have the
+		 * original size of 3 pages or we should find the remapped vma
+		 * at the remap destination address with size of 1 page.
+		 */
+		ASSERT_TRUE((vma_start == self->last_line.start_addr &&
+			     vma_end - vma_start == self->page_size * 3) ||
+			    (vma_start == self->last_line.start_addr + self->page_size &&
+			     vma_end - vma_start == self->page_size));
Again it'd be good to have more explanation here, similar comments to above.
We are mremap()'ing the last page of the next VMA (B) into the middle of the current one (A) (using MREMAP_DONTUNMAP, leaving the last page of the original VMA zapped but in place):
     read
      |
      v
  R/W                               R/O
  |----------------------------|    |------------------.---------|
  |                            |    |                  .         |
  |              A             |    |        B         .         |
  |                            |    |                  .         |
  |----------------------------|    |------------------.---------|
This will unmap the middle of A, splitting it in two, before placing a copy of B there (which has different prot bits from A):
      |
      v
  R/W       R/O       R/W           R/O
  |---------|---------|--------|    |----------------------------|
  |         |         |        |    |                            |
  |   A1    |   B2    |   A2   |    |              B             |
  |         |         |        |    |                            |
  |---------|---------|--------|    |----------------------------|
But then we 'patch' B2 back to R/W prot bits, causing B2 to get merged:
      |
      v
  R/W                               R/O
  |----------------------------|    |----------------------------|
  |                            |    |                            |
  |              A             |    |              B             |
  |                            |    |                            |
  |----------------------------|    |----------------------------|
If we see entries in /proc/$pid/maps it'll be:
7fa86aa15000-7fa86aa18000 rw-p 00000000 00:00 0    (A)
7fa86aa19000-7fa86aa20000 r--p 00000000 00:00 0    (B)
Or:
7fa86aa15000-7fa86aa16000 rw-p 00000000 00:00 0    (A1)
7fa86aa16000-7fa86aa17000 r--p 00000000 00:00 0    (B2)
7fa86aa17000-7fa86aa18000 rw-p 00000000 00:00 0    (A2)
7fa86aa19000-7fa86aa20000 r--p 00000000 00:00 0    (B)
We are always examining the first line, so we simply assert that this remains in place and we observe 1 page or 3 pages.
 		clock_gettime(CLOCK_MONOTONIC_COARSE, &end_ts);
 		end_test_iteration(&end_ts, self->verbose);
 	} while (end_ts.tv_sec - start_ts.tv_sec < self->duration_sec);
--
2.50.1.703.g449372360f-goog
On Wed, Aug 13, 2025 at 1:39 PM Lorenzo Stoakes lorenzo.stoakes@oracle.com wrote:
On Fri, Aug 08, 2025 at 08:28:47AM -0700, Suren Baghdasaryan wrote:
Extend /proc/pid/maps tearing tests to verify PROCMAP_QUERY ioctl operation correctness while the vma is being concurrently modified.
General comment, but I really feel like this stuff is mm-specific. Yes it uses proc, but it's using it to check for mm functionality.
I mean I'd love for these to be in the mm self tests but I get obviously why they're in the proc ones...
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Tested-by: SeongJae Park <sj@kernel.org>
Acked-by: SeongJae Park <sj@kernel.org>
The tests themselves look good, had a good look through. But I've given you some nice ASCII diagrams to sprinkle liberally around :)
Thanks for the commentary, Lorenzo, it is great! I think I'll post them as a follow-up patch since they do not change the functionality of the test.
Anyway for tests themselves:
Reviewed-by: Lorenzo Stoakes lorenzo.stoakes@oracle.com
Thanks!
[...]
On Wed, Aug 13, 2025 at 04:21:47PM +0000, Suren Baghdasaryan wrote:
On Wed, Aug 13, 2025 at 1:39 PM Lorenzo Stoakes lorenzo.stoakes@oracle.com wrote:
On Fri, Aug 08, 2025 at 08:28:47AM -0700, Suren Baghdasaryan wrote:
Extend /proc/pid/maps tearing tests to verify PROCMAP_QUERY ioctl operation correctness while the vma is being concurrently modified.
General comment, but I really feel like this stuff is mm-specific. Yes it uses proc, but it's using it to check for mm functionality.
I mean I'd love for these to be in the mm self tests but I get obviously why they're in the proc ones...
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Tested-by: SeongJae Park <sj@kernel.org>
Acked-by: SeongJae Park <sj@kernel.org>
The tests themselves look good, had a good look through. But I've given you some nice ASCII diagrams to sprinkle liberally around :)
Thanks for the commentary, Lorenzo, it is great! I think I'll post them as a follow-up patch since they do not change the functionality of the test.
No problem, and sure that's fine it's not critical stuff obviously :)
Anyway for tests themselves:
Reviewed-by: Lorenzo Stoakes lorenzo.stoakes@oracle.com
Thanks!
Cheers, Lorenzo
Refactor struct proc_maps_private so that the fields used by PROCMAP_QUERY ioctl are moved into a separate structure. In the next patch this allows ioctl to reuse some of the functions used for reading /proc/pid/maps without using file->private_data. This prevents concurrent modification of file->private_data members by ioctl and /proc/pid/maps readers.
The change is pure code refactoring and has no functional changes.
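The resulting layout, condensed from the diff below (the field comments here are added for readability and are not in the patch):

struct proc_maps_locking_ctx {
	struct mm_struct *mm;			/* used by both seq_file and ioctl paths */
#ifdef CONFIG_PER_VMA_LOCK
	bool mmap_locked;			/* true after falling back to mmap_lock */
	struct vm_area_struct *locked_vma;	/* vma currently held under per-vma lock */
#endif
};

struct proc_maps_private {
	struct inode *inode;
	struct task_struct *task;
	struct vma_iterator iter;		/* seq_file iteration state only */
	loff_t last_pos;
	struct proc_maps_locking_ctx lock_ctx;	/* everything PROCMAP_QUERY needs */
#ifdef CONFIG_NUMA
	struct mempolicy *task_mempolicy;
#endif
};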
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: SeongJae Park <sj@kernel.org>
---
 fs/proc/internal.h   | 15 +++++---
 fs/proc/task_mmu.c   | 87 +++++++++++++++++++++++---------------------
 fs/proc/task_nommu.c | 14 +++----
 3 files changed, 63 insertions(+), 53 deletions(-)
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index e737401d7383..d1598576506c 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -378,16 +378,21 @@ extern void proc_self_init(void);
  * task_[no]mmu.c
  */
 struct mem_size_stats;
-struct proc_maps_private {
-	struct inode *inode;
-	struct task_struct *task;
+
+struct proc_maps_locking_ctx {
 	struct mm_struct *mm;
-	struct vma_iterator iter;
-	loff_t last_pos;
 #ifdef CONFIG_PER_VMA_LOCK
 	bool mmap_locked;
 	struct vm_area_struct *locked_vma;
 #endif
+};
+
+struct proc_maps_private {
+	struct inode *inode;
+	struct task_struct *task;
+	struct vma_iterator iter;
+	loff_t last_pos;
+	struct proc_maps_locking_ctx lock_ctx;
 #ifdef CONFIG_NUMA
 	struct mempolicy *task_mempolicy;
 #endif
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 29cca0e6d0ff..c0968d293b61 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -132,18 +132,18 @@ static void release_task_mempolicy(struct proc_maps_private *priv)

 #ifdef CONFIG_PER_VMA_LOCK

-static void unlock_vma(struct proc_maps_private *priv)
+static void unlock_ctx_vma(struct proc_maps_locking_ctx *lock_ctx)
 {
-	if (priv->locked_vma) {
-		vma_end_read(priv->locked_vma);
-		priv->locked_vma = NULL;
+	if (lock_ctx->locked_vma) {
+		vma_end_read(lock_ctx->locked_vma);
+		lock_ctx->locked_vma = NULL;
 	}
 }

 static const struct seq_operations proc_pid_maps_op;

 static inline bool lock_vma_range(struct seq_file *m,
-				  struct proc_maps_private *priv)
+				  struct proc_maps_locking_ctx *lock_ctx)
 {
 	/*
 	 * smaps and numa_maps perform page table walk, therefore require
@@ -151,25 +151,25 @@ static inline bool lock_vma_range(struct seq_file *m,
 	 * walking the vma tree under rcu read protection.
 	 */
 	if (m->op != &proc_pid_maps_op) {
-		if (mmap_read_lock_killable(priv->mm))
+		if (mmap_read_lock_killable(lock_ctx->mm))
 			return false;

-		priv->mmap_locked = true;
+		lock_ctx->mmap_locked = true;
 	} else {
 		rcu_read_lock();
-		priv->locked_vma = NULL;
-		priv->mmap_locked = false;
+		lock_ctx->locked_vma = NULL;
+		lock_ctx->mmap_locked = false;
 	}

 	return true;
 }

-static inline void unlock_vma_range(struct proc_maps_private *priv)
+static inline void unlock_vma_range(struct proc_maps_locking_ctx *lock_ctx)
 {
-	if (priv->mmap_locked) {
-		mmap_read_unlock(priv->mm);
+	if (lock_ctx->mmap_locked) {
+		mmap_read_unlock(lock_ctx->mm);
 	} else {
-		unlock_vma(priv);
+		unlock_ctx_vma(lock_ctx);
 		rcu_read_unlock();
 	}
 }
@@ -177,15 +177,16 @@ static inline void unlock_vma_range(struct proc_maps_private *priv)
 static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv,
 					   loff_t last_pos)
 {
+	struct proc_maps_locking_ctx *lock_ctx = &priv->lock_ctx;
 	struct vm_area_struct *vma;

-	if (priv->mmap_locked)
+	if (lock_ctx->mmap_locked)
 		return vma_next(&priv->iter);

-	unlock_vma(priv);
-	vma = lock_next_vma(priv->mm, &priv->iter, last_pos);
+	unlock_ctx_vma(lock_ctx);
+	vma = lock_next_vma(lock_ctx->mm, &priv->iter, last_pos);
 	if (!IS_ERR_OR_NULL(vma))
-		priv->locked_vma = vma;
+		lock_ctx->locked_vma = vma;

 	return vma;
 }
@@ -193,14 +194,16 @@ static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv,
 static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv,
 					 loff_t pos)
 {
-	if (priv->mmap_locked)
+	struct proc_maps_locking_ctx *lock_ctx = &priv->lock_ctx;
+
+	if (lock_ctx->mmap_locked)
 		return false;

 	rcu_read_unlock();
-	mmap_read_lock(priv->mm);
+	mmap_read_lock(lock_ctx->mm);
 	/* Reinitialize the iterator after taking mmap_lock */
 	vma_iter_set(&priv->iter, pos);
-	priv->mmap_locked = true;
+	lock_ctx->mmap_locked = true;

 	return true;
 }
@@ -208,14 +211,14 @@ static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv,
 #else /* CONFIG_PER_VMA_LOCK */

 static inline bool lock_vma_range(struct seq_file *m,
-				  struct proc_maps_private *priv)
+				  struct proc_maps_locking_ctx *lock_ctx)
 {
-	return mmap_read_lock_killable(priv->mm) == 0;
+	return mmap_read_lock_killable(lock_ctx->mm) == 0;
 }

-static inline void unlock_vma_range(struct proc_maps_private *priv)
+static inline void unlock_vma_range(struct proc_maps_locking_ctx *lock_ctx)
 {
-	mmap_read_unlock(priv->mm);
+	mmap_read_unlock(lock_ctx->mm);
 }

 static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv,
@@ -258,7 +261,7 @@ static struct vm_area_struct *proc_get_vma(struct seq_file *m, loff_t *ppos)
 		*ppos = vma->vm_end;
 	} else {
 		*ppos = SENTINEL_VMA_GATE;
-		vma = get_gate_vma(priv->mm);
+		vma = get_gate_vma(priv->lock_ctx.mm);
 	}

 	return vma;
@@ -267,6 +270,7 @@ static struct vm_area_struct *proc_get_vma(struct seq_file *m, loff_t *ppos)
 static void *m_start(struct seq_file *m, loff_t *ppos)
 {
 	struct proc_maps_private *priv = m->private;
+	struct proc_maps_locking_ctx *lock_ctx;
 	loff_t last_addr = *ppos;
 	struct mm_struct *mm;

@@ -278,14 +282,15 @@ static void *m_start(struct seq_file *m, loff_t *ppos)
 	if (!priv->task)
 		return ERR_PTR(-ESRCH);

-	mm = priv->mm;
+	lock_ctx = &priv->lock_ctx;
+	mm = lock_ctx->mm;
 	if (!mm || !mmget_not_zero(mm)) {
 		put_task_struct(priv->task);
 		priv->task = NULL;
 		return NULL;
 	}

-	if (!lock_vma_range(m, priv)) {
+	if (!lock_vma_range(m, lock_ctx)) {
 		mmput(mm);
 		put_task_struct(priv->task);
 		priv->task = NULL;
@@ -318,13 +323,13 @@ static void *m_next(struct seq_file *m, void *v, loff_t *ppos)
 static void m_stop(struct seq_file *m, void *v)
 {
 	struct proc_maps_private *priv = m->private;
-	struct mm_struct *mm = priv->mm;
+	struct mm_struct *mm = priv->lock_ctx.mm;

 	if (!priv->task)
 		return;

 	release_task_mempolicy(priv);
-	unlock_vma_range(priv);
+	unlock_vma_range(&priv->lock_ctx);
 	mmput(mm);
 	put_task_struct(priv->task);
 	priv->task = NULL;
@@ -339,9 +344,9 @@ static int proc_maps_open(struct inode *inode, struct file *file,
 		return -ENOMEM;

 	priv->inode = inode;
-	priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
-	if (IS_ERR(priv->mm)) {
-		int err = PTR_ERR(priv->mm);
+	priv->lock_ctx.mm = proc_mem_open(inode, PTRACE_MODE_READ);
+	if (IS_ERR(priv->lock_ctx.mm)) {
+		int err = PTR_ERR(priv->lock_ctx.mm);

 		seq_release_private(inode, file);
 		return err;
@@ -355,8 +360,8 @@ static int proc_map_release(struct inode *inode, struct file *file)
 	struct seq_file *seq = file->private_data;
 	struct proc_maps_private *priv = seq->private;

-	if (priv->mm)
-		mmdrop(priv->mm);
+	if (priv->lock_ctx.mm)
+		mmdrop(priv->lock_ctx.mm);

 	return seq_release_private(inode, file);
 }
@@ -610,7 +615,7 @@ static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg)
 	if (!!karg.build_id_size != !!karg.build_id_addr)
 		return -EINVAL;

-	mm = priv->mm;
+	mm = priv->lock_ctx.mm;
 	if (!mm || !mmget_not_zero(mm))
 		return -ESRCH;

@@ -1311,7 +1316,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
 {
 	struct proc_maps_private *priv = m->private;
 	struct mem_size_stats mss = {};
-	struct mm_struct *mm = priv->mm;
+	struct mm_struct *mm = priv->lock_ctx.mm;
 	struct vm_area_struct *vma;
 	unsigned long vma_start = 0, last_vma_end = 0;
 	int ret = 0;
@@ -1456,9 +1461,9 @@ static int smaps_rollup_open(struct inode *inode, struct file *file)
 		goto out_free;

 	priv->inode = inode;
-	priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
-	if (IS_ERR_OR_NULL(priv->mm)) {
-		ret = priv->mm ? PTR_ERR(priv->mm) : -ESRCH;
+	priv->lock_ctx.mm = proc_mem_open(inode, PTRACE_MODE_READ);
+	if (IS_ERR_OR_NULL(priv->lock_ctx.mm)) {
+		ret = priv->lock_ctx.mm ? PTR_ERR(priv->lock_ctx.mm) : -ESRCH;

 		single_release(inode, file);
 		goto out_free;
@@ -1476,8 +1481,8 @@ static int smaps_rollup_release(struct inode *inode, struct file *file)
 	struct seq_file *seq = file->private_data;
 	struct proc_maps_private *priv = seq->private;

-	if (priv->mm)
-		mmdrop(priv->mm);
+	if (priv->lock_ctx.mm)
+		mmdrop(priv->lock_ctx.mm);

 	kfree(priv);
 	return single_release(inode, file);
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 59bfd61d653a..d362919f4f68 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -204,7 +204,7 @@ static void *m_start(struct seq_file *m, loff_t *ppos)
 	if (!priv->task)
 		return ERR_PTR(-ESRCH);

-	mm = priv->mm;
+	mm = priv->lock_ctx.mm;
 	if (!mm || !mmget_not_zero(mm)) {
 		put_task_struct(priv->task);
 		priv->task = NULL;
@@ -226,7 +226,7 @@ static void *m_start(struct seq_file *m, loff_t *ppos)
 static void m_stop(struct seq_file *m, void *v)
 {
 	struct proc_maps_private *priv = m->private;
-	struct mm_struct *mm = priv->mm;
+	struct mm_struct *mm = priv->lock_ctx.mm;

 	if (!priv->task)
 		return;
@@ -259,9 +259,9 @@ static int maps_open(struct inode *inode, struct file *file,
 		return -ENOMEM;

 	priv->inode = inode;
-	priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
-	if (IS_ERR_OR_NULL(priv->mm)) {
-		int err = priv->mm ? PTR_ERR(priv->mm) : -ESRCH;
+	priv->lock_ctx.mm = proc_mem_open(inode, PTRACE_MODE_READ);
+	if (IS_ERR_OR_NULL(priv->lock_ctx.mm)) {
+		int err = priv->lock_ctx.mm ? PTR_ERR(priv->lock_ctx.mm) : -ESRCH;

 		seq_release_private(inode, file);
 		return err;
@@ -276,8 +276,8 @@ static int map_release(struct inode *inode, struct file *file)
 	struct seq_file *seq = file->private_data;
 	struct proc_maps_private *priv = seq->private;

-	if (priv->mm)
-		mmdrop(priv->mm);
+	if (priv->lock_ctx.mm)
+		mmdrop(priv->lock_ctx.mm);

 	return seq_release_private(inode, file);
 }
On Fri, Aug 08, 2025 at 08:28:48AM -0700, Suren Baghdasaryan wrote:
Refactor struct proc_maps_private so that the fields used by PROCMAP_QUERY ioctl are moved into a separate structure. In the next patch this allows ioctl to reuse some of the functions used for reading /proc/pid/maps without using file->private_data. This prevents concurrent modification of file->private_data members by ioctl and /proc/pid/maps readers.
The change is pure code refactoring and has no functional changes.
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: SeongJae Park <sj@kernel.org>
LGTM, so:
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
 fs/proc/internal.h   | 15 +++++---
 fs/proc/task_mmu.c   | 87 +++++++++++++++++++++++---------------------
 fs/proc/task_nommu.c | 14 +++----
 3 files changed, 63 insertions(+), 53 deletions(-)
[...]

@@ -259,9 +259,9 @@ static int maps_open(struct inode *inode, struct file *file,
 		return -ENOMEM;

 	priv->inode = inode;
-	priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
-	if (IS_ERR_OR_NULL(priv->mm)) {
-		int err = priv->mm ? PTR_ERR(priv->mm) : -ESRCH;
+	priv->lock_ctx.mm = proc_mem_open(inode, PTRACE_MODE_READ);
+	if (IS_ERR_OR_NULL(priv->lock_ctx.mm)) {
+		int err = priv->lock_ctx.mm ? PTR_ERR(priv->lock_ctx.mm) : -ESRCH;
We could abstract out lock_ctx here also, but I'm not going to be picky about it.
[...]
--
2.50.1.703.g449372360f-goog
Utilize per-vma locks to stabilize the vma after lookup without taking mmap_lock during PROCMAP_QUERY ioctl execution. If the vma lock is contended, we fall back to mmap_lock but take it only momentarily to lock the vma, then release the mmap_lock. In the very unlikely case of vm_refcnt overflow, this fallback path will fail and the ioctl is done under mmap_lock protection.
This change is designed to reduce mmap_lock contention and prevent PROCMAP_QUERY ioctl calls from blocking address space updates.
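Distilled, the new CONFIG_PER_VMA_LOCK lookup path in query_vma_find_by_addr() behaves as follows (a summary of the diff below, not a verbatim excerpt; the momentary mmap_lock acquisition on vma lock contention happens inside lock_next_vma()):

	if (lock_ctx->mmap_locked)		/* already fell back earlier */
		return find_vma(mm, addr);

	unlock_ctx_vma(lock_ctx);		/* drop any previously locked vma */
	rcu_read_lock();
	vma = lock_next_vma(mm, &vmi, addr);	/* per-vma read lock under RCU */
	rcu_read_unlock();

	if (PTR_ERR(vma) == -EAGAIN) {		/* vm_refcnt overflow, very unlikely */
		mmap_read_lock(mm);		/* keep mmap_lock for the rest of the ioctl */
		vma = find_vma(mm, addr);
		lock_ctx->mmap_locked = true;	/* query_vma_teardown() will drop it */
	}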
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Acked-by: SeongJae Park <sj@kernel.org>
---
 fs/proc/task_mmu.c | 103 +++++++++++++++++++++++++++++++++++++--------
 1 file changed, 85 insertions(+), 18 deletions(-)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index c0968d293b61..e64cf40ce9c4 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -132,6 +132,12 @@ static void release_task_mempolicy(struct proc_maps_private *priv)

 #ifdef CONFIG_PER_VMA_LOCK

+static void reset_lock_ctx(struct proc_maps_locking_ctx *lock_ctx)
+{
+	lock_ctx->locked_vma = NULL;
+	lock_ctx->mmap_locked = false;
+}
+
 static void unlock_ctx_vma(struct proc_maps_locking_ctx *lock_ctx)
 {
 	if (lock_ctx->locked_vma) {
@@ -157,8 +163,7 @@ static inline bool lock_vma_range(struct seq_file *m,
 		lock_ctx->mmap_locked = true;
 	} else {
 		rcu_read_lock();
-		lock_ctx->locked_vma = NULL;
-		lock_ctx->mmap_locked = false;
+		reset_lock_ctx(lock_ctx);
 	}

 	return true;
@@ -522,28 +527,90 @@ static int pid_maps_open(struct inode *inode, struct file *file)
 		PROCMAP_QUERY_VMA_FLAGS				\
 )

-static int query_vma_setup(struct mm_struct *mm)
+#ifdef CONFIG_PER_VMA_LOCK
+
+static int query_vma_setup(struct proc_maps_locking_ctx *lock_ctx)
 {
-	return mmap_read_lock_killable(mm);
+	reset_lock_ctx(lock_ctx);
+
+	return 0;
 }

-static void query_vma_teardown(struct mm_struct *mm, struct vm_area_struct *vma)
+static void query_vma_teardown(struct proc_maps_locking_ctx *lock_ctx)
 {
-	mmap_read_unlock(mm);
+	if (lock_ctx->mmap_locked) {
+		mmap_read_unlock(lock_ctx->mm);
+		lock_ctx->mmap_locked = false;
+	} else {
+		unlock_ctx_vma(lock_ctx);
+	}
+}
+
+static struct vm_area_struct *query_vma_find_by_addr(struct proc_maps_locking_ctx *lock_ctx,
+						     unsigned long addr)
+{
+	struct mm_struct *mm = lock_ctx->mm;
+	struct vm_area_struct *vma;
+	struct vma_iterator vmi;
+
+	if (lock_ctx->mmap_locked)
+		return find_vma(mm, addr);
+
+	/* Unlock previously locked VMA and find the next one under RCU */
+	unlock_ctx_vma(lock_ctx);
+	rcu_read_lock();
+	vma_iter_init(&vmi, mm, addr);
+	vma = lock_next_vma(mm, &vmi, addr);
+	rcu_read_unlock();
+
+	if (!vma)
+		return NULL;
+
+	if (!IS_ERR(vma)) {
+		lock_ctx->locked_vma = vma;
+		return vma;
+	}
+
+	if (PTR_ERR(vma) == -EAGAIN) {
+		/* Fallback to mmap_lock on vma->vm_refcnt overflow */
+		mmap_read_lock(mm);
+		vma = find_vma(mm, addr);
+		lock_ctx->mmap_locked = true;
+	}
+
+	return vma;
 }

-static struct vm_area_struct *query_vma_find_by_addr(struct mm_struct *mm, unsigned long addr)
+#else /* CONFIG_PER_VMA_LOCK */
+
+static int query_vma_setup(struct proc_maps_locking_ctx *lock_ctx)
+{
+	return mmap_read_lock_killable(lock_ctx->mm);
+}
+
+static void query_vma_teardown(struct proc_maps_locking_ctx *lock_ctx)
+{
+	mmap_read_unlock(lock_ctx->mm);
+}
+
+static struct vm_area_struct *query_vma_find_by_addr(struct proc_maps_locking_ctx *lock_ctx,
+						     unsigned long addr)
 {
-	return find_vma(mm, addr);
+	return find_vma(lock_ctx->mm, addr);
 }

-static struct vm_area_struct *query_matching_vma(struct mm_struct *mm,
+#endif /* CONFIG_PER_VMA_LOCK */
+
+static struct vm_area_struct *query_matching_vma(struct proc_maps_locking_ctx *lock_ctx,
 						 unsigned long addr, u32 flags)
 {
 	struct vm_area_struct *vma;

 next_vma:
-	vma = query_vma_find_by_addr(mm, addr);
+	vma = query_vma_find_by_addr(lock_ctx, addr);
+	if (IS_ERR(vma))
+		return vma;
+
 	if (!vma)
 		goto no_vma;

@@ -584,11 +651,11 @@ static struct vm_area_struct *query_matching_vma(struct mm_struct *mm,
 	return ERR_PTR(-ENOENT);
 }

-static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg)
+static int do_procmap_query(struct mm_struct *mm, void __user *uarg)
 {
+	struct proc_maps_locking_ctx lock_ctx = { .mm = mm };
 	struct procmap_query karg;
 	struct vm_area_struct *vma;
-	struct mm_struct *mm;
 	const char *name = NULL;
 	char build_id_buf[BUILD_ID_SIZE_MAX], *name_buf = NULL;
 	__u64 usize;
@@ -615,17 +682,16 @@ static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg)
 	if (!!karg.build_id_size != !!karg.build_id_addr)
 		return -EINVAL;

-	mm = priv->lock_ctx.mm;
 	if (!mm || !mmget_not_zero(mm))
 		return -ESRCH;

-	err = query_vma_setup(mm);
+	err = query_vma_setup(&lock_ctx);
 	if (err) {
 		mmput(mm);
 		return err;
 	}

-	vma = query_matching_vma(mm, karg.query_addr, karg.query_flags);
+	vma = query_matching_vma(&lock_ctx, karg.query_addr, karg.query_flags);
 	if (IS_ERR(vma)) {
 		err = PTR_ERR(vma);
 		vma = NULL;
@@ -710,7 +776,7 @@ static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg)
 	}

 	/* unlock vma or mmap_lock, and put mm_struct before copying data to user */
-	query_vma_teardown(mm, vma);
+	query_vma_teardown(&lock_ctx);
 	mmput(mm);

 	if (karg.vma_name_size && copy_to_user(u64_to_user_ptr(karg.vma_name_addr),
@@ -730,7 +796,7 @@ static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg)
 	return 0;

 out:
-	query_vma_teardown(mm, vma);
+	query_vma_teardown(&lock_ctx);
 	mmput(mm);
 	kfree(name_buf);
 	return err;
@@ -743,7 +809,8 @@ static long procfs_procmap_ioctl(struct file *file, unsigned int cmd, unsigned l

 	switch (cmd) {
 	case PROCMAP_QUERY:
-		return do_procmap_query(priv, (void __user *)arg);
+		/* priv->lock_ctx.mm is set during file open operation */
+		return do_procmap_query(priv->lock_ctx.mm, (void __user *)arg);
 	default:
 		return -ENOIOCTLCMD;
 	}
On 8/8/25 17:28, Suren Baghdasaryan wrote:
Utilize per-vma locks to stabilize vma after lookup without taking mmap_lock during PROCMAP_QUERY ioctl execution. If vma lock is contended, we fall back to mmap_lock but take it only momentarily to lock the vma and release the mmap_lock. In a very unlikely case of vm_refcnt overflow, this fall back path will fail and ioctl is done under mmap_lock protection.
This change is designed to reduce mmap_lock contention and prevent PROCMAP_QUERY ioctl calls from blocking address space updates.
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Acked-by: SeongJae Park <sj@kernel.org>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
On Fri, Aug 8, 2025 at 8:29 AM Suren Baghdasaryan surenb@google.com wrote:
Utilize per-vma locks to stabilize vma after lookup without taking mmap_lock during PROCMAP_QUERY ioctl execution. If vma lock is contended, we fall back to mmap_lock but take it only momentarily to lock the vma and release the mmap_lock. In a very unlikely case of vm_refcnt overflow, this fall back path will fail and ioctl is done under mmap_lock protection.
This change is designed to reduce mmap_lock contention and prevent PROCMAP_QUERY ioctl calls from blocking address space updates.
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Acked-by: SeongJae Park <sj@kernel.org>
 fs/proc/task_mmu.c | 103 +++++++++++++++++++++++++++++++++++++--------
 1 file changed, 85 insertions(+), 18 deletions(-)
LGTM
Acked-by: Andrii Nakryiko <andrii@kernel.org>
[...]
On Fri, Aug 08, 2025 at 08:28:49AM -0700, Suren Baghdasaryan wrote:
Utilize per-vma locks to stabilize vma after lookup without taking mmap_lock during PROCMAP_QUERY ioctl execution. If vma lock is contended, we fall back to mmap_lock but take it only momentarily to lock the vma and release the mmap_lock. In a very unlikely case of vm_refcnt overflow, this fall back path will fail and ioctl is done under mmap_lock protection.
This change is designed to reduce mmap_lock contention and prevent PROCMAP_QUERY ioctl calls from blocking address space updates.
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Acked-by: SeongJae Park <sj@kernel.org>
All LGTM, so:
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
 fs/proc/task_mmu.c | 103 +++++++++++++++++++++++++++++++++++++--------
 1 file changed, 85 insertions(+), 18 deletions(-)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index c0968d293b61..e64cf40ce9c4 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -132,6 +132,12 @@ static void release_task_mempolicy(struct proc_maps_private *priv)

 #ifdef CONFIG_PER_VMA_LOCK

+static void reset_lock_ctx(struct proc_maps_locking_ctx *lock_ctx)
+{
+	lock_ctx->locked_vma = NULL;
+	lock_ctx->mmap_locked = false;
+}
+
Thanks!
[...]

+static struct vm_area_struct *query_vma_find_by_addr(struct proc_maps_locking_ctx *lock_ctx,
+						     unsigned long addr)
+{
+	struct mm_struct *mm = lock_ctx->mm;
+	struct vm_area_struct *vma;
+	struct vma_iterator vmi;
+
+	if (lock_ctx->mmap_locked)
+		return find_vma(mm, addr);
+
+	/* Unlock previously locked VMA and find the next one under RCU */
+	unlock_ctx_vma(lock_ctx);
+	rcu_read_lock();
+	vma_iter_init(&vmi, mm, addr);
+	vma = lock_next_vma(mm, &vmi, addr);
+	rcu_read_unlock();
+
+	if (!vma)
+		return NULL;
+
+	if (!IS_ERR(vma)) {
+		lock_ctx->locked_vma = vma;
+		return vma;
+	}
+
+	if (PTR_ERR(vma) == -EAGAIN) {
+		/* Fallback to mmap_lock on vma->vm_refcnt overflow */
+		mmap_read_lock(mm);
+		vma = find_vma(mm, addr);
+		lock_ctx->mmap_locked = true;
+	}
Thanks this is great!
+
+	return vma;
 }
[...]
@@ -743,7 +809,8 @@ static long procfs_procmap_ioctl(struct file *file, unsigned int cmd, unsigned l

 	switch (cmd) {
 	case PROCMAP_QUERY:
-		return do_procmap_query(priv, (void __user *)arg);
+		/* priv->lock_ctx.mm is set during file open operation */
+		return do_procmap_query(priv->lock_ctx.mm, (void __user *)arg);
Thanks!
 	default:
 		return -ENOIOCTLCMD;
 	}
-- 2.50.1.703.g449372360f-goog