On Wed, Aug 06, 2025 at 08:59:04AM -0700, Suren Baghdasaryan wrote:
Utilize per-vma locks to stabilize the vma after lookup without taking mmap_lock during PROCMAP_QUERY ioctl execution. If the vma lock is contended, we fall back to mmap_lock, but take it only momentarily to lock the vma and then release the mmap_lock. In the very unlikely case of vm_refcnt overflow, this fallback path will fail and the ioctl is done under mmap_lock protection.
This change is designed to reduce mmap_lock contention and prevent PROCMAP_QUERY ioctl calls from blocking address space updates.
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
A lot of nits but nothing's really standing out as broken, AFAICT...
 fs/proc/task_mmu.c | 84 +++++++++++++++++++++++++++++++++++++---------
 1 file changed, 68 insertions(+), 16 deletions(-)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 45134335e086..0396315dfaee 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -517,28 +517,81 @@ static int pid_maps_open(struct inode *inode, struct file *file)
 		PROCMAP_QUERY_VMA_FLAGS \
 )
-static int query_vma_setup(struct mm_struct *mm)
+#ifdef CONFIG_PER_VMA_LOCK
+static int query_vma_setup(struct proc_maps_locking_ctx *lock_ctx)
 {
-	return mmap_read_lock_killable(mm);
+	lock_ctx->locked_vma = NULL;
+	lock_ctx->mmap_locked = false;
We also do this in lock_vma_range(), so it seems sensible to factor it out, e.g.:

static void ctx_clear_vma(struct proc_maps_locking_ctx *lock_ctx)
{
	lock_ctx->locked_vma = NULL;
	lock_ctx->mmap_locked = false;
}
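query_vma_setup() then becomes trivial - something like this sketch, with lock_vma_range() invoking the same helper in place of its two assignments:

static int query_vma_setup(struct proc_maps_locking_ctx *lock_ctx)
{
	ctx_clear_vma(lock_ctx);

	return 0;
}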
+
+	return 0;
 }
-static void query_vma_teardown(struct mm_struct *mm, struct vm_area_struct *vma)
+static void query_vma_teardown(struct proc_maps_locking_ctx *lock_ctx)
 {
-	mmap_read_unlock(mm);
+	if (lock_ctx->mmap_locked)
+		mmap_read_unlock(lock_ctx->mm);
Maybe worth a comment as to why we leave lock_ctx->mmap_locked set here?
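Something like this, perhaps (the rationale is my reading of the code - teardown is terminal, and the ctx is only reused after query_vma_setup() reinitialises both fields):

	if (lock_ctx->mmap_locked) {
		/*
		 * No need to clear mmap_locked here - the ctx is not reused
		 * until query_vma_setup() reinitialises it.
		 */
		mmap_read_unlock(lock_ctx->mm);
	} else {
		unlock_vma(lock_ctx);
	}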
+	else
+		unlock_vma(lock_ctx);
Should have said on 2/3, but I wonder if we should prefix with ctx_, as 'unlock_vma()' and 'lock_vma()' seem like core functions... esp. since we have vma_start_read/write() rather than functions that reference locking.
So - ctx_unlock_vma() and ctx_lock_vma() or unlock_ctx_vma() / lock_ctx_vma()?
+}
+static struct vm_area_struct *query_vma_find_by_addr(struct proc_maps_locking_ctx *lock_ctx,
+						     unsigned long addr)
+{
+	struct vm_area_struct *vma;
+	struct vma_iterator vmi;
+
+	if (lock_ctx->mmap_locked)
+		return find_vma(lock_ctx->mm, addr);
+
+	unlock_vma(lock_ctx);
+	rcu_read_lock();
+	vma_iter_init(&vmi, lock_ctx->mm, addr);
+	vma = lock_next_vma(lock_ctx->mm, &vmi, addr);
+	rcu_read_unlock();
I think a comment at the top of this block would be useful, something like 'We unlock any previously locked VMA and find the next under RCU'.
+	if (!IS_ERR_OR_NULL(vma)) {
Is the NULL bit here really necessary? Presumably lock_ctx->locked_vma is expected to be NULL already, so we're not overwriting anything here.

In fact, we could get rid of the horrid if/else here with a guard clause like:

	if (!IS_ERR(vma) || PTR_ERR(vma) != -EAGAIN)
		return vma;

(the !IS_ERR() bit is probably a bit redundant, but makes things clearer vs. just the PTR_ERR() check)

Then do the rest below.
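For illustration, the whole function in that shape might read as below - note I've split the guard in two so that an error pointer is never stored in lock_ctx->locked_vma, and folded in the comment suggested above:

static struct vm_area_struct *query_vma_find_by_addr(struct proc_maps_locking_ctx *lock_ctx,
						     unsigned long addr)
{
	struct vm_area_struct *vma;
	struct vma_iterator vmi;

	if (lock_ctx->mmap_locked)
		return find_vma(lock_ctx->mm, addr);

	/* Unlock any previously locked VMA and find the next one under RCU. */
	unlock_vma(lock_ctx);
	rcu_read_lock();
	vma_iter_init(&vmi, lock_ctx->mm, addr);
	vma = lock_next_vma(lock_ctx->mm, &vmi, addr);
	rcu_read_unlock();

	if (!IS_ERR(vma)) {
		/* vma may be NULL here; locked_vma is NULL already anyway. */
		lock_ctx->locked_vma = vma;
		return vma;
	}
	if (PTR_ERR(vma) != -EAGAIN)
		return vma;

	/* Fallback to mmap_lock on vma->vm_refcnt overflow */
	mmap_read_lock(lock_ctx->mm);
	vma = find_vma(lock_ctx->mm, addr);
	lock_ctx->mmap_locked = true;

	return vma;
}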
+		lock_ctx->locked_vma = vma;
+	} else if (PTR_ERR(vma) == -EAGAIN) {
+		/* Fallback to mmap_lock on vma->vm_refcnt overflow */
+		mmap_read_lock(lock_ctx->mm);
+		vma = find_vma(lock_ctx->mm, addr);
+		lock_ctx->mmap_locked = true;
Kinda sucks we have two separate ways of doing fallback now, this open-coded thing and fallback_to_mmap_lock().
Sort of hard to combine them, since we have subtly different semantics - the RCU read lock is being held in the /proc/$pid/maps case, but here we've released it already.
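If we really wanted to unify them, one hypothetical shape - ctx_fallback_to_mmap_lock() and its rcu_held parameter are made up for illustration; the maps path would pass true, this path false:

static struct vm_area_struct *ctx_fallback_to_mmap_lock(struct proc_maps_locking_ctx *lock_ctx,
							unsigned long addr, bool rcu_held)
{
	struct vm_area_struct *vma;

	/* mmap_read_lock() sleeps, so exit the RCU read-side section around it. */
	if (rcu_held)
		rcu_read_unlock();
	mmap_read_lock(lock_ctx->mm);
	if (rcu_held)
		rcu_read_lock();
	vma = find_vma(lock_ctx->mm, addr);
	lock_ctx->mmap_locked = true;

	return vma;
}

Probably not worth the extra flag though.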
+	}
+
+	return vma;
+}
+#else /* CONFIG_PER_VMA_LOCK */
+static int query_vma_setup(struct proc_maps_locking_ctx *lock_ctx)
+{
+	return mmap_read_lock_killable(lock_ctx->mm);
+}
+
+static void query_vma_teardown(struct proc_maps_locking_ctx *lock_ctx)
+{
+	mmap_read_unlock(lock_ctx->mm);
+}
+
-static struct vm_area_struct *query_vma_find_by_addr(struct mm_struct *mm, unsigned long addr)
+static struct vm_area_struct *query_vma_find_by_addr(struct proc_maps_locking_ctx *lock_ctx,
+						     unsigned long addr)
 {
-	return find_vma(mm, addr);
+	return find_vma(lock_ctx->mm, addr);
 }
+#endif /* CONFIG_PER_VMA_LOCK */

-static struct vm_area_struct *query_matching_vma(struct mm_struct *mm,
+static struct vm_area_struct *query_matching_vma(struct proc_maps_locking_ctx *lock_ctx,
 						 unsigned long addr, u32 flags)
 {
 	struct vm_area_struct *vma;
 next_vma:
-	vma = query_vma_find_by_addr(mm, addr);
+	vma = query_vma_find_by_addr(lock_ctx, addr);
 	if (IS_ERR(vma))
 		return vma;

 	if (!vma)
 		goto no_vma;
@@ -579,11 +632,11 @@ static struct vm_area_struct *query_matching_vma(struct mm_struct *mm,
 	return ERR_PTR(-ENOENT);
 }
-static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg)
+static int do_procmap_query(struct mm_struct *mm, void __user *uarg)
 {
+	struct proc_maps_locking_ctx lock_ctx = { .mm = mm };
 	struct procmap_query karg;
 	struct vm_area_struct *vma;
-	struct mm_struct *mm;
 	const char *name = NULL;
 	char build_id_buf[BUILD_ID_SIZE_MAX], *name_buf = NULL;
 	__u64 usize;
@@ -610,17 +663,16 @@ static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg)
 	if (!!karg.build_id_size != !!karg.build_id_addr)
 		return -EINVAL;

-	mm = priv->lock_ctx.mm;
 	if (!mm || !mmget_not_zero(mm))
 		return -ESRCH;

-	err = query_vma_setup(mm);
+	err = query_vma_setup(&lock_ctx);
 	if (err) {
 		mmput(mm);
 		return err;
 	}
-	vma = query_matching_vma(mm, karg.query_addr, karg.query_flags);
+	vma = query_matching_vma(&lock_ctx, karg.query_addr, karg.query_flags);
 	if (IS_ERR(vma)) {
 		err = PTR_ERR(vma);
 		vma = NULL;
@@ -705,7 +757,7 @@ static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg)
 	}

 	/* unlock vma or mmap_lock, and put mm_struct before copying data to user */
-	query_vma_teardown(mm, vma);
+	query_vma_teardown(&lock_ctx);
 	mmput(mm);

 	if (karg.vma_name_size && copy_to_user(u64_to_user_ptr(karg.vma_name_addr),
@@ -725,7 +777,7 @@ static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg)
 	return 0;

 out:
-	query_vma_teardown(mm, vma);
+	query_vma_teardown(&lock_ctx);
 	mmput(mm);
 	kfree(name_buf);
 	return err;
@@ -738,7 +790,7 @@ static long procfs_procmap_ioctl(struct file *file, unsigned int cmd, unsigned l

 	switch (cmd) {
 	case PROCMAP_QUERY:
-		return do_procmap_query(priv, (void __user *)arg);
+		return do_procmap_query(priv->lock_ctx.mm, (void __user *)arg);
OK this confused me until I worked it through.
We set priv->lock_ctx.mm in:
pid_maps_open() -> do_maps_open() -> proc_maps_open()
It gets that from proc_mem_open(), which figures out the mm.
Maybe one for 2/3, but it'd be nice to have a comment saying something about how this is set, since it being part of lock_ctx makes it seem like it's something that would be set elsewhere.
Since we have fallback stuff and want to thread this new lock context type through, I guess it makes sense to put it here; but given that's the case, let's maybe just add a comment here to clarify.
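Maybe something along these lines (wording just a suggestion):

	case PROCMAP_QUERY:
		/*
		 * lock_ctx.mm is set at open time, via proc_maps_open() ->
		 * proc_mem_open() - it is not populated by the locking
		 * helpers themselves.
		 */
		return do_procmap_query(priv->lock_ctx.mm, (void __user *)arg);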
 	default:
 		return -ENOIOCTLCMD;
 	}
--
2.50.1.565.gc32cd1483b-goog
Cheers, Lorenzo