On Wed, Aug 06, 2025 at 08:59:04AM -0700, Suren Baghdasaryan wrote:
Utilize per-vma locks to stabilize the vma after lookup without taking mmap_lock during PROCMAP_QUERY ioctl execution. If the vma lock is contended, we fall back to mmap_lock, but take it only momentarily to lock the vma and then release the mmap_lock. In the very unlikely case of vm_refcnt overflow, this fallback path will fail and the ioctl is done under mmap_lock protection.
This change is designed to reduce mmap_lock contention and prevent PROCMAP_QUERY ioctl calls from blocking address space updates.
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
A lot of nits but nothing's really standing out as broken, AFAICT...
 fs/proc/task_mmu.c | 84 +++++++++++++++++++++++++++++++++++++---------
 1 file changed, 68 insertions(+), 16 deletions(-)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 45134335e086..0396315dfaee 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -517,28 +517,81 @@ static int pid_maps_open(struct inode *inode, struct file *file)
 		PROCMAP_QUERY_VMA_FLAGS \
 )
-static int query_vma_setup(struct mm_struct *mm)
+#ifdef CONFIG_PER_VMA_LOCK
+static int query_vma_setup(struct proc_maps_locking_ctx *lock_ctx)
 {
-	return mmap_read_lock_killable(mm);
+	lock_ctx->locked_vma = NULL;
+	lock_ctx->mmap_locked = false;
We also do this in lock_vma_range(), so it seems sensible to factor it out, e.g.:

static void ctx_clear_vma(struct proc_maps_locking_ctx *lock_ctx)
{
	lock_ctx->locked_vma = NULL;
	lock_ctx->mmap_locked = false;
}
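query_vma_setup() then becomes trivial - something like this sketch, with lock_vma_range() invoking the same helper in place of its two assignments:

static int query_vma_setup(struct proc_maps_locking_ctx *lock_ctx)
{
	ctx_clear_vma(lock_ctx);

	return 0;
}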
+
+	return 0;
 }
-static void query_vma_teardown(struct mm_struct *mm, struct vm_area_struct *vma)
+static void query_vma_teardown(struct proc_maps_locking_ctx *lock_ctx)
 {
-	mmap_read_unlock(mm);
+	if (lock_ctx->mmap_locked)
+		mmap_read_unlock(lock_ctx->mm);
Maybe worth a comment as to why we leave lock_ctx->mmap_locked set here?
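Something like this, perhaps (the rationale is my reading of the code - teardown is terminal, and the ctx is only reused after query_vma_setup() reinitialises both fields):

	if (lock_ctx->mmap_locked) {
		/*
		 * No need to clear mmap_locked here - the ctx is not reused
		 * until query_vma_setup() reinitialises it.
		 */
		mmap_read_unlock(lock_ctx->mm);
	} else {
		unlock_vma(lock_ctx);
	}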
+	else
+		unlock_vma(lock_ctx);
Should have said on 2/3, but I wonder if we should prefix with ctx_, as 'unlock_vma()' and 'lock_vma()' seem like core functions... esp. since we have vma_start_read/write() rather than functions that reference locking.
So - ctx_unlock_vma() and ctx_lock_vma() or unlock_ctx_vma() / lock_ctx_vma()?
+}
+static struct vm_area_struct *query_vma_find_by_addr(struct proc_maps_locking_ctx *lock_ctx,
+						     unsigned long addr)
+{
+	struct vm_area_struct *vma;
+	struct vma_iterator vmi;
+
+	if (lock_ctx->mmap_locked)
+		return find_vma(lock_ctx->mm, addr);
+
+	unlock_vma(lock_ctx);
+	rcu_read_lock();
+	vma_iter_init(&vmi, lock_ctx->mm, addr);
+	vma = lock_next_vma(lock_ctx->mm, &vmi, addr);
+	rcu_read_unlock();
I think a comment at the top of this block would be useful, something like 'We unlock any previously locked VMA and find the next under RCU'.
+	if (!IS_ERR_OR_NULL(vma)) {
Is the NULL bit here really necessary? Presumably lock_ctx->locked_vma is expected to be NULL already, so we're not overwriting anything here.

In fact, we could get rid of the horrid if/else here with a guard clause like:

	if (!IS_ERR(vma) || PTR_ERR(vma) != -EAGAIN)
		return vma;

(the !IS_ERR() bit is probably a bit redundant, but makes things clearer vs. just the PTR_ERR() check)

Then do the rest below.
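For illustration, the whole function in that shape might read as below - note I've split the guard in two so that an error pointer is never stored in lock_ctx->locked_vma, and folded in the comment suggested above:

static struct vm_area_struct *query_vma_find_by_addr(struct proc_maps_locking_ctx *lock_ctx,
						     unsigned long addr)
{
	struct vm_area_struct *vma;
	struct vma_iterator vmi;

	if (lock_ctx->mmap_locked)
		return find_vma(lock_ctx->mm, addr);

	/* Unlock any previously locked VMA and find the next one under RCU. */
	unlock_vma(lock_ctx);
	rcu_read_lock();
	vma_iter_init(&vmi, lock_ctx->mm, addr);
	vma = lock_next_vma(lock_ctx->mm, &vmi, addr);
	rcu_read_unlock();

	if (!IS_ERR(vma)) {
		/* vma may be NULL here; locked_vma is NULL already anyway. */
		lock_ctx->locked_vma = vma;
		return vma;
	}
	if (PTR_ERR(vma) != -EAGAIN)
		return vma;

	/* Fallback to mmap_lock on vma->vm_refcnt overflow */
	mmap_read_lock(lock_ctx->mm);
	vma = find_vma(lock_ctx->mm, addr);
	lock_ctx->mmap_locked = true;

	return vma;
}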
+		lock_ctx->locked_vma = vma;
+	} else if (PTR_ERR(vma) == -EAGAIN) {
+		/* Fallback to mmap_lock on vma->vm_refcnt overflow */
+		mmap_read_lock(lock_ctx->mm);
+		vma = find_vma(lock_ctx->mm, addr);
+		lock_ctx->mmap_locked = true;
Kinda sucks we have two separate ways of doing fallback now, this open-coded thing and fallback_to_mmap_lock().
Sort of hard to combine them, since we have subtly different semantics - the RCU read lock is being held in the /proc/$pid/maps case, but here we've released it already.
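If we really wanted to unify them, one hypothetical shape - ctx_fallback_to_mmap_lock() and its rcu_held parameter are made up for illustration; the maps path would pass true, this path false:

static struct vm_area_struct *ctx_fallback_to_mmap_lock(struct proc_maps_locking_ctx *lock_ctx,
							unsigned long addr, bool rcu_held)
{
	struct vm_area_struct *vma;

	/* mmap_read_lock() sleeps, so exit the RCU read-side section around it. */
	if (rcu_held)
		rcu_read_unlock();
	mmap_read_lock(lock_ctx->mm);
	if (rcu_held)
		rcu_read_lock();
	vma = find_vma(lock_ctx->mm, addr);
	lock_ctx->mmap_locked = true;

	return vma;
}

Probably not worth the extra flag though.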
+	}
+
+	return vma;
+}
+#else /* CONFIG_PER_VMA_LOCK */
+static int query_vma_setup(struct proc_maps_locking_ctx *lock_ctx)
+{
+	return mmap_read_lock_killable(lock_ctx->mm);
+}
+
+static void query_vma_teardown(struct proc_maps_locking_ctx *lock_ctx)
+{
+	mmap_read_unlock(lock_ctx->mm);
+}
+
-static struct vm_area_struct *query_vma_find_by_addr(struct mm_struct *mm, unsigned long addr)
+static struct vm_area_struct *query_vma_find_by_addr(struct proc_maps_locking_ctx *lock_ctx,
+						     unsigned long addr)
 {
-	return find_vma(mm, addr);
+	return find_vma(lock_ctx->mm, addr);
 }
+#endif /* CONFIG_PER_VMA_LOCK */

-static struct vm_area_struct *query_matching_vma(struct mm_struct *mm,
+static struct vm_area_struct *query_matching_vma(struct proc_maps_locking_ctx *lock_ctx,
 						 unsigned long addr, u32 flags)
 {
 	struct vm_area_struct *vma;
 next_vma:
-	vma = query_vma_find_by_addr(mm, addr);
+	vma = query_vma_find_by_addr(lock_ctx, addr);
 	if (IS_ERR(vma))
 		return vma;

 	if (!vma)
 		goto no_vma;
@@ -579,11 +632,11 @@ static struct vm_area_struct *query_matching_vma(struct mm_struct *mm,
 	return ERR_PTR(-ENOENT);
 }
-static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg)
+static int do_procmap_query(struct mm_struct *mm, void __user *uarg)
 {
+	struct proc_maps_locking_ctx lock_ctx = { .mm = mm };
 	struct procmap_query karg;
 	struct vm_area_struct *vma;
-	struct mm_struct *mm;
 	const char *name = NULL;
 	char build_id_buf[BUILD_ID_SIZE_MAX], *name_buf = NULL;
 	__u64 usize;
@@ -610,17 +663,16 @@ static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg)
 	if (!!karg.build_id_size != !!karg.build_id_addr)
 		return -EINVAL;

-	mm = priv->lock_ctx.mm;
 	if (!mm || !mmget_not_zero(mm))
 		return -ESRCH;

-	err = query_vma_setup(mm);
+	err = query_vma_setup(&lock_ctx);
 	if (err) {
 		mmput(mm);
 		return err;
 	}
-	vma = query_matching_vma(mm, karg.query_addr, karg.query_flags);
+	vma = query_matching_vma(&lock_ctx, karg.query_addr, karg.query_flags);
 	if (IS_ERR(vma)) {
 		err = PTR_ERR(vma);
 		vma = NULL;
@@ -705,7 +757,7 @@ static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg)
 	}

 	/* unlock vma or mmap_lock, and put mm_struct before copying data to user */
-	query_vma_teardown(mm, vma);
+	query_vma_teardown(&lock_ctx);
 	mmput(mm);

 	if (karg.vma_name_size && copy_to_user(u64_to_user_ptr(karg.vma_name_addr),
@@ -725,7 +777,7 @@ static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg)
 	return 0;

 out:
-	query_vma_teardown(mm, vma);
+	query_vma_teardown(&lock_ctx);
 	mmput(mm);
 	kfree(name_buf);
 	return err;
@@ -738,7 +790,7 @@ static long procfs_procmap_ioctl(struct file *file, unsigned int cmd, unsigned l

 	switch (cmd) {
 	case PROCMAP_QUERY:
-		return do_procmap_query(priv, (void __user *)arg);
+		return do_procmap_query(priv->lock_ctx.mm, (void __user *)arg);
OK this confused me until I worked it through.
We set priv->lock_ctx.mm in:
pid_maps_open() -> do_maps_open() -> proc_maps_open()
It gets that from proc_mem_open(), which figures out the mm.
Maybe one for 2/3, but it'd be nice to have a comment saying something about how this is set, since it being part of lock_ctx makes it seem like it's something that would be set elsewhere.
Since we have fallback stuff and want to thread this new lock context type through, I guess it makes sense to put it here; but given that's the case, let's maybe just add a comment here to clarify.
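Maybe something along these lines (wording just a suggestion):

	case PROCMAP_QUERY:
		/*
		 * lock_ctx.mm is set at open time, via proc_maps_open() ->
		 * proc_mem_open() - it is not populated by the locking
		 * helpers themselves.
		 */
		return do_procmap_query(priv->lock_ctx.mm, (void __user *)arg);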
 	default:
 		return -ENOIOCTLCMD;
 	}
--
2.50.1.565.gc32cd1483b-goog
Cheers, Lorenzo