With /proc/pid/maps now being read under per-vma lock protection, we can reuse parts of that code to execute the PROCMAP_QUERY ioctl without taking mmap_lock as well. The change is designed to reduce mmap_lock contention and prevent PROCMAP_QUERY ioctl calls from blocking address space updates.
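For readers who have not used the interface, a minimal userspace sketch of a PROCMAP_QUERY call looks roughly like this (error handling trimmed; the selftest patch below adds a very similar query_addr_at() helper):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(void)
{
	struct procmap_query q;
	int fd = open("/proc/self/maps", O_RDONLY);

	if (fd < 0)
		return 1;

	memset(&q, 0, sizeof(q));
	q.size = sizeof(q);
	q.query_flags = 0;
	/* look up whichever vma covers this on-stack variable */
	q.query_addr = (unsigned long long)(unsigned long)&q;

	if (ioctl(fd, PROCMAP_QUERY, &q))
		return 1;

	printf("vma: %llx-%llx\n",
	       (unsigned long long)q.vma_start,
	       (unsigned long long)q.vma_end);
	return 0;
}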
This patchset was split out of the original patchset [1] that introduced per-vma lock usage for /proc/pid/maps reading. It contains PROCMAP_QUERY tests, a code refactoring patch to simplify the main change, and the actual transition to per-vma locks.
Changes since v1 [2]:
- Added Tested-by and Acked-by, per SeongJae Park
- Fixed NOMMU case, per Vlastimil Babka
- Renamed proc_maps_query_data to proc_maps_locking_ctx, per Vlastimil Babka
[1] https://lore.kernel.org/all/20250704060727.724817-1-surenb@google.com/
[2] https://lore.kernel.org/all/20250731220024.702621-1-surenb@google.com/
Suren Baghdasaryan (3):
  selftests/proc: test PROCMAP_QUERY ioctl while vma is concurrently modified
  fs/proc/task_mmu: factor out proc_maps_private fields used by PROCMAP_QUERY
  fs/proc/task_mmu: execute PROCMAP_QUERY ioctl under per-vma locks
 fs/proc/internal.h                            |  15 +-
 fs/proc/task_mmu.c                            | 149 ++++++++++++------
 fs/proc/task_nommu.c                          |  14 +-
 tools/testing/selftests/proc/proc-maps-race.c |  65 ++++++++
 4 files changed, 181 insertions(+), 62 deletions(-)
base-commit: 01da54f10fddf3b01c5a3b80f6b16bbad390c302
Extend /proc/pid/maps tearing tests to verify PROCMAP_QUERY ioctl operation correctness while the vma is being concurrently modified.
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Tested-by: SeongJae Park <sj@kernel.org>
Acked-by: SeongJae Park <sj@kernel.org>
---
 tools/testing/selftests/proc/proc-maps-race.c | 65 +++++++++++++++++++
 1 file changed, 65 insertions(+)
diff --git a/tools/testing/selftests/proc/proc-maps-race.c b/tools/testing/selftests/proc/proc-maps-race.c
index 66773685a047..d40854a07ec1 100644
--- a/tools/testing/selftests/proc/proc-maps-race.c
+++ b/tools/testing/selftests/proc/proc-maps-race.c
@@ -32,6 +32,8 @@
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
+#include <linux/fs.h>
+#include <sys/ioctl.h>
 #include <sys/mman.h>
 #include <sys/stat.h>
 #include <sys/types.h>
@@ -317,6 +319,25 @@ static bool capture_mod_pattern(FIXTURE_DATA(proc_maps_race) *self,
 		strcmp(restored_first_line->text, self->first_line.text) == 0;
 }

+static bool query_addr_at(int maps_fd, void *addr,
+			  unsigned long *vma_start, unsigned long *vma_end)
+{
+	struct procmap_query q;
+
+	memset(&q, 0, sizeof(q));
+	q.size = sizeof(q);
+	/* Find the VMA at the split address */
+	q.query_addr = (unsigned long long)addr;
+	q.query_flags = 0;
+	if (ioctl(maps_fd, PROCMAP_QUERY, &q))
+		return false;
+
+	*vma_start = q.vma_start;
+	*vma_end = q.vma_end;
+
+	return true;
+}
+
 static inline bool split_vma(FIXTURE_DATA(proc_maps_race) *self)
 {
 	return mmap(self->mod_info->addr, self->page_size, self->mod_info->prot | PROT_EXEC,
@@ -559,6 +580,8 @@ TEST_F(proc_maps_race, test_maps_tearing_from_split)
 	do {
 		bool last_line_changed;
 		bool first_line_changed;
+		unsigned long vma_start;
+		unsigned long vma_end;

 		ASSERT_TRUE(read_boundary_lines(self, &new_last_line, &new_first_line));

@@ -595,6 +618,19 @@ TEST_F(proc_maps_race, test_maps_tearing_from_split)
 		first_line_changed = strcmp(new_first_line.text, self->first_line.text) != 0;
 		ASSERT_EQ(last_line_changed, first_line_changed);

+		/* Check if PROCMAP_QUERY ioctl() finds the right VMA */
+		ASSERT_TRUE(query_addr_at(self->maps_fd, mod_info->addr + self->page_size,
+					  &vma_start, &vma_end));
+		/*
+		 * The vma at the split address can be either the same as
+		 * original one (if read before the split) or the same as the
+		 * first line in the second page (if read after the split).
+		 */
+		ASSERT_TRUE((vma_start == self->last_line.start_addr &&
+			     vma_end == self->last_line.end_addr) ||
+			    (vma_start == split_first_line.start_addr &&
+			     vma_end == split_first_line.end_addr));
+
 		clock_gettime(CLOCK_MONOTONIC_COARSE, &end_ts);
 		end_test_iteration(&end_ts, self->verbose);
 	} while (end_ts.tv_sec - start_ts.tv_sec < self->duration_sec);
@@ -636,6 +672,9 @@ TEST_F(proc_maps_race, test_maps_tearing_from_resize)
 	clock_gettime(CLOCK_MONOTONIC_COARSE, &start_ts);
 	start_test_loop(&start_ts, self->verbose);
 	do {
+		unsigned long vma_start;
+		unsigned long vma_end;
+
 		ASSERT_TRUE(read_boundary_lines(self, &new_last_line, &new_first_line));

 		/* Check if we read vmas after shrinking it */
@@ -662,6 +701,16 @@ TEST_F(proc_maps_race, test_maps_tearing_from_resize)
 					   "Expand result invalid", self));
 		}

+		/* Check if PROCMAP_QUERY ioctl() finds the right VMA */
+		ASSERT_TRUE(query_addr_at(self->maps_fd, mod_info->addr, &vma_start, &vma_end));
+		/*
+		 * The vma should stay at the same address and have either the
+		 * original size of 3 pages or 1 page if read after shrinking.
+		 */
+		ASSERT_TRUE(vma_start == self->last_line.start_addr &&
+			    (vma_end - vma_start == self->page_size * 3 ||
+			     vma_end - vma_start == self->page_size));
+
 		clock_gettime(CLOCK_MONOTONIC_COARSE, &end_ts);
 		end_test_iteration(&end_ts, self->verbose);
 	} while (end_ts.tv_sec - start_ts.tv_sec < self->duration_sec);
@@ -703,6 +752,9 @@ TEST_F(proc_maps_race, test_maps_tearing_from_remap)
 	clock_gettime(CLOCK_MONOTONIC_COARSE, &start_ts);
 	start_test_loop(&start_ts, self->verbose);
 	do {
+		unsigned long vma_start;
+		unsigned long vma_end;
+
 		ASSERT_TRUE(read_boundary_lines(self, &new_last_line, &new_first_line));

 		/* Check if we read vmas after remapping it */
@@ -729,6 +781,19 @@ TEST_F(proc_maps_race, test_maps_tearing_from_remap)
 					   "Remap restore result invalid", self));
 		}

+		/* Check if PROCMAP_QUERY ioctl() finds the right VMA */
+		ASSERT_TRUE(query_addr_at(self->maps_fd, mod_info->addr + self->page_size,
+					  &vma_start, &vma_end));
+		/*
+		 * The vma should either stay at the same address and have the
+		 * original size of 3 pages or we should find the remapped vma
+		 * at the remap destination address with size of 1 page.
+		 */
+		ASSERT_TRUE((vma_start == self->last_line.start_addr &&
+			     vma_end - vma_start == self->page_size * 3) ||
+			    (vma_start == self->last_line.start_addr + self->page_size &&
+			     vma_end - vma_start == self->page_size));
+
 		clock_gettime(CLOCK_MONOTONIC_COARSE, &end_ts);
 		end_test_iteration(&end_ts, self->verbose);
 	} while (end_ts.tv_sec - start_ts.tv_sec < self->duration_sec);
Refactor struct proc_maps_private so that the fields used by PROCMAP_QUERY ioctl are moved into a separate structure. In the next patch this allows ioctl to reuse some of the functions used for reading /proc/pid/maps without using file->private_data. This prevents concurrent modification of file->private_data members by ioctl and /proc/pid/maps readers.
The change is pure code refactoring and has no functional changes.
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
 fs/proc/internal.h   | 15 ++++++----
 fs/proc/task_mmu.c   | 70 ++++++++++++++++++++++----------------------
 fs/proc/task_nommu.c | 14 ++++-----
 3 files changed, 52 insertions(+), 47 deletions(-)
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 7c235451c5ea..663dc0510315 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -379,16 +379,21 @@ extern void proc_self_init(void);
  * task_[no]mmu.c
  */
 struct mem_size_stats;
-struct proc_maps_private {
-	struct inode *inode;
-	struct task_struct *task;
+
+struct proc_maps_locking_ctx {
 	struct mm_struct *mm;
-	struct vma_iterator iter;
-	loff_t last_pos;
 #ifdef CONFIG_PER_VMA_LOCK
 	bool mmap_locked;
 	struct vm_area_struct *locked_vma;
 #endif
+};
+
+struct proc_maps_private {
+	struct inode *inode;
+	struct task_struct *task;
+	struct vma_iterator iter;
+	loff_t last_pos;
+	struct proc_maps_locking_ctx lock_ctx;
 #ifdef CONFIG_NUMA
 	struct mempolicy *task_mempolicy;
 #endif
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 3d6d8a9f13fc..843577aa7a32 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -132,11 +132,11 @@ static void release_task_mempolicy(struct proc_maps_private *priv)

 #ifdef CONFIG_PER_VMA_LOCK

-static void unlock_vma(struct proc_maps_private *priv)
+static void unlock_vma(struct proc_maps_locking_ctx *lock_ctx)
 {
-	if (priv->locked_vma) {
-		vma_end_read(priv->locked_vma);
-		priv->locked_vma = NULL;
+	if (lock_ctx->locked_vma) {
+		vma_end_read(lock_ctx->locked_vma);
+		lock_ctx->locked_vma = NULL;
 	}
 }

@@ -151,14 +151,14 @@ static inline bool lock_vma_range(struct seq_file *m,
 	 * walking the vma tree under rcu read protection.
 	 */
 	if (m->op != &proc_pid_maps_op) {
-		if (mmap_read_lock_killable(priv->mm))
+		if (mmap_read_lock_killable(priv->lock_ctx.mm))
 			return false;

-		priv->mmap_locked = true;
+		priv->lock_ctx.mmap_locked = true;
 	} else {
 		rcu_read_lock();
-		priv->locked_vma = NULL;
-		priv->mmap_locked = false;
+		priv->lock_ctx.locked_vma = NULL;
+		priv->lock_ctx.mmap_locked = false;
 	}

 	return true;
@@ -166,10 +166,10 @@ static inline bool lock_vma_range(struct seq_file *m,

 static inline void unlock_vma_range(struct proc_maps_private *priv)
 {
-	if (priv->mmap_locked) {
-		mmap_read_unlock(priv->mm);
+	if (priv->lock_ctx.mmap_locked) {
+		mmap_read_unlock(priv->lock_ctx.mm);
 	} else {
-		unlock_vma(priv);
+		unlock_vma(&priv->lock_ctx);
 		rcu_read_unlock();
 	}
 }
@@ -179,13 +179,13 @@ static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv,
 {
 	struct vm_area_struct *vma;

-	if (priv->mmap_locked)
+	if (priv->lock_ctx.mmap_locked)
 		return vma_next(&priv->iter);

-	unlock_vma(priv);
-	vma = lock_next_vma(priv->mm, &priv->iter, last_pos);
+	unlock_vma(&priv->lock_ctx);
+	vma = lock_next_vma(priv->lock_ctx.mm, &priv->iter, last_pos);
 	if (!IS_ERR_OR_NULL(vma))
-		priv->locked_vma = vma;
+		priv->lock_ctx.locked_vma = vma;

 	return vma;
 }
@@ -193,14 +193,14 @@ static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv,
 static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv,
 					 loff_t pos)
 {
-	if (priv->mmap_locked)
+	if (priv->lock_ctx.mmap_locked)
 		return false;

 	rcu_read_unlock();
-	mmap_read_lock(priv->mm);
+	mmap_read_lock(priv->lock_ctx.mm);
 	/* Reinitialize the iterator after taking mmap_lock */
 	vma_iter_set(&priv->iter, pos);
-	priv->mmap_locked = true;
+	priv->lock_ctx.mmap_locked = true;

 	return true;
 }
@@ -210,12 +210,12 @@ static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv,
 static inline bool lock_vma_range(struct seq_file *m,
 				  struct proc_maps_private *priv)
 {
-	return mmap_read_lock_killable(priv->mm) == 0;
+	return mmap_read_lock_killable(priv->lock_ctx.mm) == 0;
 }

 static inline void unlock_vma_range(struct proc_maps_private *priv)
 {
-	mmap_read_unlock(priv->mm);
+	mmap_read_unlock(priv->lock_ctx.mm);
 }

 static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv,
@@ -258,7 +258,7 @@ static struct vm_area_struct *proc_get_vma(struct seq_file *m, loff_t *ppos)
 		*ppos = vma->vm_end;
 	} else {
 		*ppos = SENTINEL_VMA_GATE;
-		vma = get_gate_vma(priv->mm);
+		vma = get_gate_vma(priv->lock_ctx.mm);
 	}

 	return vma;
@@ -278,7 +278,7 @@ static void *m_start(struct seq_file *m, loff_t *ppos)
 	if (!priv->task)
 		return ERR_PTR(-ESRCH);

-	mm = priv->mm;
+	mm = priv->lock_ctx.mm;
 	if (!mm || !mmget_not_zero(mm)) {
 		put_task_struct(priv->task);
 		priv->task = NULL;
@@ -318,7 +318,7 @@ static void *m_next(struct seq_file *m, void *v, loff_t *ppos)
 static void m_stop(struct seq_file *m, void *v)
 {
 	struct proc_maps_private *priv = m->private;
-	struct mm_struct *mm = priv->mm;
+	struct mm_struct *mm = priv->lock_ctx.mm;

 	if (!priv->task)
 		return;
@@ -339,9 +339,9 @@ static int proc_maps_open(struct inode *inode, struct file *file,
 		return -ENOMEM;

 	priv->inode = inode;
-	priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
-	if (IS_ERR_OR_NULL(priv->mm)) {
-		int err = priv->mm ? PTR_ERR(priv->mm) : -ESRCH;
+	priv->lock_ctx.mm = proc_mem_open(inode, PTRACE_MODE_READ);
+	if (IS_ERR_OR_NULL(priv->lock_ctx.mm)) {
+		int err = priv->lock_ctx.mm ? PTR_ERR(priv->lock_ctx.mm) : -ESRCH;

 		seq_release_private(inode, file);
 		return err;
@@ -355,8 +355,8 @@ static int proc_map_release(struct inode *inode, struct file *file)
 	struct seq_file *seq = file->private_data;
 	struct proc_maps_private *priv = seq->private;

-	if (priv->mm)
-		mmdrop(priv->mm);
+	if (priv->lock_ctx.mm)
+		mmdrop(priv->lock_ctx.mm);

 	return seq_release_private(inode, file);
 }
@@ -610,7 +610,7 @@ static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg)
 	if (!!karg.build_id_size != !!karg.build_id_addr)
 		return -EINVAL;

-	mm = priv->mm;
+	mm = priv->lock_ctx.mm;
 	if (!mm || !mmget_not_zero(mm))
 		return -ESRCH;

@@ -1307,7 +1307,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
 {
 	struct proc_maps_private *priv = m->private;
 	struct mem_size_stats mss = {};
-	struct mm_struct *mm = priv->mm;
+	struct mm_struct *mm = priv->lock_ctx.mm;
 	struct vm_area_struct *vma;
 	unsigned long vma_start = 0, last_vma_end = 0;
 	int ret = 0;
@@ -1452,9 +1452,9 @@ static int smaps_rollup_open(struct inode *inode, struct file *file)
 		goto out_free;

 	priv->inode = inode;
-	priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
-	if (IS_ERR_OR_NULL(priv->mm)) {
-		ret = priv->mm ? PTR_ERR(priv->mm) : -ESRCH;
+	priv->lock_ctx.mm = proc_mem_open(inode, PTRACE_MODE_READ);
+	if (IS_ERR_OR_NULL(priv->lock_ctx.mm)) {
+		ret = priv->lock_ctx.mm ? PTR_ERR(priv->lock_ctx.mm) : -ESRCH;

 		single_release(inode, file);
 		goto out_free;
@@ -1472,8 +1472,8 @@ static int smaps_rollup_release(struct inode *inode, struct file *file)
 	struct seq_file *seq = file->private_data;
 	struct proc_maps_private *priv = seq->private;

-	if (priv->mm)
-		mmdrop(priv->mm);
+	if (priv->lock_ctx.mm)
+		mmdrop(priv->lock_ctx.mm);

 	kfree(priv);
 	return single_release(inode, file);
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 59bfd61d653a..d362919f4f68 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -204,7 +204,7 @@ static void *m_start(struct seq_file *m, loff_t *ppos)
 	if (!priv->task)
 		return ERR_PTR(-ESRCH);

-	mm = priv->mm;
+	mm = priv->lock_ctx.mm;
 	if (!mm || !mmget_not_zero(mm)) {
 		put_task_struct(priv->task);
 		priv->task = NULL;
@@ -226,7 +226,7 @@ static void *m_start(struct seq_file *m, loff_t *ppos)
 static void m_stop(struct seq_file *m, void *v)
 {
 	struct proc_maps_private *priv = m->private;
-	struct mm_struct *mm = priv->mm;
+	struct mm_struct *mm = priv->lock_ctx.mm;

 	if (!priv->task)
 		return;
@@ -259,9 +259,9 @@ static int maps_open(struct inode *inode, struct file *file,
 		return -ENOMEM;

 	priv->inode = inode;
-	priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
-	if (IS_ERR_OR_NULL(priv->mm)) {
-		int err = priv->mm ? PTR_ERR(priv->mm) : -ESRCH;
+	priv->lock_ctx.mm = proc_mem_open(inode, PTRACE_MODE_READ);
+	if (IS_ERR_OR_NULL(priv->lock_ctx.mm)) {
+		int err = priv->lock_ctx.mm ? PTR_ERR(priv->lock_ctx.mm) : -ESRCH;

 		seq_release_private(inode, file);
 		return err;
@@ -276,8 +276,8 @@ static int map_release(struct inode *inode, struct file *file)
 	struct seq_file *seq = file->private_data;
 	struct proc_maps_private *priv = seq->private;

-	if (priv->mm)
-		mmdrop(priv->mm);
+	if (priv->lock_ctx.mm)
+		mmdrop(priv->lock_ctx.mm);

 	return seq_release_private(inode, file);
 }
On 8/5/25 1:15 AM, Suren Baghdasaryan wrote:
> Refactor struct proc_maps_private so that the fields used by PROCMAP_QUERY
> ioctl are moved into a separate structure. In the next patch this allows
> ioctl to reuse some of the functions used for reading /proc/pid/maps
> without using file->private_data. This prevents concurrent modification of
> file->private_data members by ioctl and /proc/pid/maps readers.
>
> The change is pure code refactoring and has no functional changes.
>
> Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Utilize per-vma locks to stabilize the vma after lookup without taking mmap_lock during PROCMAP_QUERY ioctl execution. If the vma lock is contended, we fall back to mmap_lock but take it only momentarily to lock the vma and then release the mmap_lock. In the very unlikely case of vm_refcnt overflow, this fallback path will fail and the ioctl is done under mmap_lock protection.
This change is designed to reduce mmap_lock contention and prevent PROCMAP_QUERY ioctl calls from blocking address space updates.
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
 fs/proc/task_mmu.c | 81 +++++++++++++++++++++++++++++++++++++---------
 1 file changed, 65 insertions(+), 16 deletions(-)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 843577aa7a32..1d06ecdbef6f 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -517,28 +517,78 @@ static int pid_maps_open(struct inode *inode, struct file *file)
 		PROCMAP_QUERY_VMA_FLAGS				\
 )

-static int query_vma_setup(struct mm_struct *mm)
+#ifdef CONFIG_PER_VMA_LOCK
+
+static int query_vma_setup(struct proc_maps_locking_ctx *lock_ctx)
 {
-	return mmap_read_lock_killable(mm);
+	lock_ctx->locked_vma = NULL;
+	lock_ctx->mmap_locked = false;
+
+	return 0;
 }

-static void query_vma_teardown(struct mm_struct *mm, struct vm_area_struct *vma)
+static void query_vma_teardown(struct proc_maps_locking_ctx *lock_ctx)
 {
-	mmap_read_unlock(mm);
+	if (lock_ctx->mmap_locked)
+		mmap_read_unlock(lock_ctx->mm);
+	else
+		unlock_vma(lock_ctx);
 }

-static struct vm_area_struct *query_vma_find_by_addr(struct mm_struct *mm, unsigned long addr)
+static struct vm_area_struct *query_vma_find_by_addr(struct proc_maps_locking_ctx *lock_ctx,
+						      unsigned long addr)
 {
-	return find_vma(mm, addr);
+	struct vm_area_struct *vma;
+	struct vma_iterator vmi;
+
+	unlock_vma(lock_ctx);
+	rcu_read_lock();
+	vma_iter_init(&vmi, lock_ctx->mm, addr);
+	vma = lock_next_vma(lock_ctx->mm, &vmi, addr);
+	rcu_read_unlock();
+
+	if (!IS_ERR_OR_NULL(vma)) {
+		lock_ctx->locked_vma = vma;
+	} else if (PTR_ERR(vma) == -EAGAIN) {
+		/* Fallback to mmap_lock on vma->vm_refcnt overflow */
+		mmap_read_lock(lock_ctx->mm);
+		vma = find_vma(lock_ctx->mm, addr);
+		lock_ctx->mmap_locked = true;
+	}
+
+	return vma;
 }

-static struct vm_area_struct *query_matching_vma(struct mm_struct *mm,
+#else /* CONFIG_PER_VMA_LOCK */
+
+static int query_vma_setup(struct proc_maps_locking_ctx *lock_ctx)
+{
+	return mmap_read_lock_killable(lock_ctx->mm);
+}
+
+static void query_vma_teardown(struct proc_maps_locking_ctx *lock_ctx)
+{
+	mmap_read_unlock(lock_ctx->mm);
+}
+
+static struct vm_area_struct *query_vma_find_by_addr(struct proc_maps_locking_ctx *lock_ctx,
+						      unsigned long addr)
+{
+	return find_vma(lock_ctx->mm, addr);
+}
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
+static struct vm_area_struct *query_matching_vma(struct proc_maps_locking_ctx *lock_ctx,
 						 unsigned long addr, u32 flags)
 {
 	struct vm_area_struct *vma;

 next_vma:
-	vma = query_vma_find_by_addr(mm, addr);
+	vma = query_vma_find_by_addr(lock_ctx, addr);
+	if (IS_ERR(vma))
+		return vma;
+
 	if (!vma)
 		goto no_vma;

@@ -579,11 +629,11 @@ static struct vm_area_struct *query_matching_vma(struct mm_struct *mm,
 	return ERR_PTR(-ENOENT);
 }

-static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg)
+static int do_procmap_query(struct mm_struct *mm, void __user *uarg)
 {
+	struct proc_maps_locking_ctx lock_ctx = { .mm = mm };
 	struct procmap_query karg;
 	struct vm_area_struct *vma;
-	struct mm_struct *mm;
 	const char *name = NULL;
 	char build_id_buf[BUILD_ID_SIZE_MAX], *name_buf = NULL;
 	__u64 usize;
@@ -610,17 +660,16 @@ static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg)
 	if (!!karg.build_id_size != !!karg.build_id_addr)
 		return -EINVAL;

-	mm = priv->lock_ctx.mm;
 	if (!mm || !mmget_not_zero(mm))
 		return -ESRCH;

-	err = query_vma_setup(mm);
+	err = query_vma_setup(&lock_ctx);
 	if (err) {
 		mmput(mm);
 		return err;
 	}

-	vma = query_matching_vma(mm, karg.query_addr, karg.query_flags);
+	vma = query_matching_vma(&lock_ctx, karg.query_addr, karg.query_flags);
 	if (IS_ERR(vma)) {
 		err = PTR_ERR(vma);
 		vma = NULL;
@@ -705,7 +754,7 @@ static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg)
 	}

 	/* unlock vma or mmap_lock, and put mm_struct before copying data to user */
-	query_vma_teardown(mm, vma);
+	query_vma_teardown(&lock_ctx);
 	mmput(mm);

 	if (karg.vma_name_size && copy_to_user(u64_to_user_ptr(karg.vma_name_addr),
@@ -725,7 +774,7 @@ static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg)
 	return 0;

 out:
-	query_vma_teardown(mm, vma);
+	query_vma_teardown(&lock_ctx);
 	mmput(mm);
 	kfree(name_buf);
 	return err;
@@ -738,7 +787,7 @@ static long procfs_procmap_ioctl(struct file *file, unsigned int cmd, unsigned l

 	switch (cmd) {
 	case PROCMAP_QUERY:
-		return do_procmap_query(priv, (void __user *)arg);
+		return do_procmap_query(priv->lock_ctx.mm, (void __user *)arg);
 	default:
 		return -ENOIOCTLCMD;
 	}
On 8/5/25 1:15 AM, Suren Baghdasaryan wrote:
> Utilize per-vma locks to stabilize the vma after lookup without taking
> mmap_lock during PROCMAP_QUERY ioctl execution. If the vma lock is
> contended, we fall back to mmap_lock but take it only momentarily to
> lock the vma and then release the mmap_lock. In the very unlikely case
> of vm_refcnt overflow, this fallback path will fail and the ioctl is
> done under mmap_lock protection.
>
> This change is designed to reduce mmap_lock contention and prevent
> PROCMAP_QUERY ioctl calls from blocking address space updates.
>
> Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> ---
>  fs/proc/task_mmu.c | 81 +++++++++++++++++++++++++++++++++++++---------
>  1 file changed, 65 insertions(+), 16 deletions(-)
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index 843577aa7a32..1d06ecdbef6f 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -517,28 +517,78 @@ static int pid_maps_open(struct inode *inode, struct file *file)
>  		PROCMAP_QUERY_VMA_FLAGS				\
>  )
>
> -static int query_vma_setup(struct mm_struct *mm)
> +#ifdef CONFIG_PER_VMA_LOCK
> +
> +static int query_vma_setup(struct proc_maps_locking_ctx *lock_ctx)
>  {
> -	return mmap_read_lock_killable(mm);
> +	lock_ctx->locked_vma = NULL;
> +	lock_ctx->mmap_locked = false;
> +
> +	return 0;
>  }
>
> -static void query_vma_teardown(struct mm_struct *mm, struct vm_area_struct *vma)
> +static void query_vma_teardown(struct proc_maps_locking_ctx *lock_ctx)
>  {
> -	mmap_read_unlock(mm);
> +	if (lock_ctx->mmap_locked)
> +		mmap_read_unlock(lock_ctx->mm);
> +	else
> +		unlock_vma(lock_ctx);
>  }
>
> -static struct vm_area_struct *query_vma_find_by_addr(struct mm_struct *mm, unsigned long addr)
> +static struct vm_area_struct *query_vma_find_by_addr(struct proc_maps_locking_ctx *lock_ctx,
> +						      unsigned long addr)
>  {
> -	return find_vma(mm, addr);
> +	struct vm_area_struct *vma;
> +	struct vma_iterator vmi;

Hm I think we can reach here with lock_ctx->mmap_locked being true via "goto
next_vma" in query_matching_vma(). In that case we should just "return
find_vma()" and doing the below is wrong, no?

> +
> +	unlock_vma(lock_ctx);
> +	rcu_read_lock();
> +	vma_iter_init(&vmi, lock_ctx->mm, addr);
> +	vma = lock_next_vma(lock_ctx->mm, &vmi, addr);
> +	rcu_read_unlock();
> +
> +	if (!IS_ERR_OR_NULL(vma)) {
> +		lock_ctx->locked_vma = vma;
> +	} else if (PTR_ERR(vma) == -EAGAIN) {
> +		/* Fallback to mmap_lock on vma->vm_refcnt overflow */
> +		mmap_read_lock(lock_ctx->mm);
> +		vma = find_vma(lock_ctx->mm, addr);
> +		lock_ctx->mmap_locked = true;
> +	}
> +
> +	return vma;
>  }
On Tue, Aug 5, 2025 at 7:18 AM Vlastimil Babka <vbabka@suse.cz> wrote:
> On 8/5/25 1:15 AM, Suren Baghdasaryan wrote:
> > Utilize per-vma locks to stabilize the vma after lookup without taking
> > mmap_lock during PROCMAP_QUERY ioctl execution. If the vma lock is
> > contended, we fall back to mmap_lock but take it only momentarily to
> > lock the vma and then release the mmap_lock. In the very unlikely case
> > of vm_refcnt overflow, this fallback path will fail and the ioctl is
> > done under mmap_lock protection.
> >
> > This change is designed to reduce mmap_lock contention and prevent
> > PROCMAP_QUERY ioctl calls from blocking address space updates.
> >
> > Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> > ---
> >  fs/proc/task_mmu.c | 81 +++++++++++++++++++++++++++++++++++++---------
> >  1 file changed, 65 insertions(+), 16 deletions(-)
> >
> > diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> > index 843577aa7a32..1d06ecdbef6f 100644
> > --- a/fs/proc/task_mmu.c
> > +++ b/fs/proc/task_mmu.c
> > @@ -517,28 +517,78 @@ static int pid_maps_open(struct inode *inode, struct file *file)
> >  		PROCMAP_QUERY_VMA_FLAGS				\
> >  )
> >
> > -static int query_vma_setup(struct mm_struct *mm)
> > +#ifdef CONFIG_PER_VMA_LOCK
> > +
> > +static int query_vma_setup(struct proc_maps_locking_ctx *lock_ctx)
> >  {
> > -	return mmap_read_lock_killable(mm);
> > +	lock_ctx->locked_vma = NULL;
> > +	lock_ctx->mmap_locked = false;
> > +
> > +	return 0;
> >  }
> >
> > -static void query_vma_teardown(struct mm_struct *mm, struct vm_area_struct *vma)
> > +static void query_vma_teardown(struct proc_maps_locking_ctx *lock_ctx)
> >  {
> > -	mmap_read_unlock(mm);
> > +	if (lock_ctx->mmap_locked)
> > +		mmap_read_unlock(lock_ctx->mm);
> > +	else
> > +		unlock_vma(lock_ctx);
> >  }
> >
> > -static struct vm_area_struct *query_vma_find_by_addr(struct mm_struct *mm, unsigned long addr)
> > +static struct vm_area_struct *query_vma_find_by_addr(struct proc_maps_locking_ctx *lock_ctx,
> > +						      unsigned long addr)
> >  {
> > -	return find_vma(mm, addr);
> > +	struct vm_area_struct *vma;
> > +	struct vma_iterator vmi;
>
> Hm I think we can reach here with lock_ctx->mmap_locked being true via "goto
> next_vma" in query_matching_vma(). In that case we should just "return
> find_vma()" and doing the below is wrong, no?
Ah, you are quite right. I should handle mmap_locked differently in query_vma_find_by_addr(). I will post the fix shortly. (See the sketch after the quoted hunk below for the general shape of such a change.)
> > +
> > +	unlock_vma(lock_ctx);
> > +	rcu_read_lock();
> > +	vma_iter_init(&vmi, lock_ctx->mm, addr);
> > +	vma = lock_next_vma(lock_ctx->mm, &vmi, addr);
> > +	rcu_read_unlock();
> > +
> > +	if (!IS_ERR_OR_NULL(vma)) {
> > +		lock_ctx->locked_vma = vma;
> > +	} else if (PTR_ERR(vma) == -EAGAIN) {
> > +		/* Fallback to mmap_lock on vma->vm_refcnt overflow */
> > +		mmap_read_lock(lock_ctx->mm);
> > +		vma = find_vma(lock_ctx->mm, addr);
> > +		lock_ctx->mmap_locked = true;
> > +	}
> > +
> > +	return vma;
> >  }
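For reference, a rough sketch of the kind of fix being discussed here, i.e. short-circuiting to find_vma() when an earlier lookup in query_matching_vma() has already fallen back to mmap_lock (an illustration of the idea only, not necessarily the fix that was actually posted):

static struct vm_area_struct *query_vma_find_by_addr(struct proc_maps_locking_ctx *lock_ctx,
						      unsigned long addr)
{
	struct vm_area_struct *vma;
	struct vma_iterator vmi;

	/*
	 * Illustration only: if a previous lookup already fell back to
	 * mmap_lock, the tree is stable under it, so just use find_vma().
	 */
	if (lock_ctx->mmap_locked)
		return find_vma(lock_ctx->mm, addr);

	unlock_vma(lock_ctx);
	rcu_read_lock();
	vma_iter_init(&vmi, lock_ctx->mm, addr);
	vma = lock_next_vma(lock_ctx->mm, &vmi, addr);
	rcu_read_unlock();

	if (!IS_ERR_OR_NULL(vma)) {
		lock_ctx->locked_vma = vma;
	} else if (PTR_ERR(vma) == -EAGAIN) {
		/* Fallback to mmap_lock on vma->vm_refcnt overflow */
		mmap_read_lock(lock_ctx->mm);
		vma = find_vma(lock_ctx->mm, addr);
		lock_ctx->mmap_locked = true;
	}

	return vma;
}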