Re: [PATCH 2/3] userfaultfd: UFFDIO_REMAP uABI

14 Sep 2023

On Thu, Sep 14, 2023 at 12:28 PM Jann Horn jannh@google.com wrote:
...
On Thu, Sep 14, 2023 at 5:26 PM Suren Baghdasaryan surenb@google.com wrote:
...
From: Andrea Arcangeli aarcange@redhat.com
This implements the uABI of UFFDIO_REMAP.
Notably one mode bitflag is also forwarded (and in turn known) by the
lowlevel remap_pages method.
Signed-off-by: Andrea Arcangeli aarcange@redhat.com
Signed-off-by: Suren Baghdasaryan surenb@google.com

fs/userfaultfd.c                 |  49 +++
 include/linux/rmap.h             |   5 +
 include/linux/userfaultfd_k.h    |  17 +
 include/uapi/linux/userfaultfd.h |  22 ++
 mm/huge_memory.c                 | 118 +++++++
 mm/khugepaged.c                  |   3 +
 mm/userfaultfd.c                 | 586 +++++++++++++++++++++++++++++++
 7 files changed, 800 insertions(+)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 56eaae9dac1a..7bf64e7541c1 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -2027,6 +2027,52 @@ static inline unsigned int uffd_ctx_features(__u64 user_features)
        return (unsigned int)user_features | UFFD_FEATURE_INITIALIZED;
 }
+static int userfaultfd_remap(struct userfaultfd_ctx *ctx,

                       unsigned long arg)



+{

  __s64 ret;


  struct uffdio_remap uffdio_remap;


  struct uffdio_remap __user *user_uffdio_remap;


  struct userfaultfd_wake_range range;



  user_uffdio_remap = (struct uffdio_remap __user *) arg;



  ret = -EFAULT;


  if (copy_from_user(&uffdio_remap, user_uffdio_remap,


                     /* don't copy "remap" last field */


                     sizeof(uffdio_remap)-sizeof(__s64)))


          goto out;



  ret = validate_range(ctx->mm, uffdio_remap.dst, uffdio_remap.len);


  if (ret)


          goto out;


  ret = validate_range(current->mm, uffdio_remap.src, uffdio_remap.len);


  if (ret)


          goto out;


  ret = -EINVAL;


  if (uffdio_remap.mode & ~(UFFDIO_REMAP_MODE_ALLOW_SRC_HOLES|


                            UFFDIO_REMAP_MODE_DONTWAKE))


          goto out;



Do you not need mmget_not_zero(ctx->mm) to make sure the MM can't be
concurrently torn down while remap_pages() is running, similar to what
the other userfaultfd ioctl handlers do?
...

  ret = remap_pages(ctx->mm, current->mm,


                    uffdio_remap.dst, uffdio_remap.src,


                    uffdio_remap.len, uffdio_remap.mode);


  if (unlikely(put_user(ret, &user_uffdio_remap->remap)))


          return -EFAULT;


  if (ret < 0)


          goto out;


  /* len == 0 would wake all */


  BUG_ON(!ret);


  range.len = ret;


  if (!(uffdio_remap.mode & UFFDIO_REMAP_MODE_DONTWAKE)) {


          range.start = uffdio_remap.dst;


          wake_userfault(ctx, &range);


  }


  ret = range.len == uffdio_remap.len ? 0 : -EAGAIN;



+out:

  return ret;



+}
[...]
...
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 064fbd90822b..c7a9880a1f6a 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1932,6 +1932,124 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
        return ret;
 }
+#ifdef CONFIG_USERFAULTFD
+/*


The PT lock for src_pmd and the mmap_lock for reading are held by



the caller, but it must return after releasing the



page_table_lock. We're guaranteed the src_pmd is a pmd_trans_huge



until the PT lock of the src_pmd is released. Just move the page



from src_pmd to dst_pmd if possible. Return zero if succeeded in



moving the page, -EAGAIN if it needs to be repeated by the caller,



or other errors in case of failure.


*/

+int remap_pages_huge_pmd(struct mm_struct *dst_mm,

                   struct mm_struct *src_mm,


                   pmd_t *dst_pmd, pmd_t *src_pmd,


                   pmd_t dst_pmdval,


                   struct vm_area_struct *dst_vma,


                   struct vm_area_struct *src_vma,


                   unsigned long dst_addr,


                   unsigned long src_addr)



+{

  pmd_t _dst_pmd, src_pmdval;


  struct page *src_page;


  struct anon_vma *src_anon_vma, *dst_anon_vma;


  spinlock_t *src_ptl, *dst_ptl;


  pgtable_t pgtable;


  struct mmu_notifier_range range;



  src_pmdval = *src_pmd;


  src_ptl = pmd_lockptr(src_mm, src_pmd);



  BUG_ON(!pmd_trans_huge(src_pmdval));


  BUG_ON(!pmd_none(dst_pmdval));



Why can we assert that pmd_none(dst_pmdval) is true here? Can we not
have concurrent faults (or userfaultfd operations) populating that
PMD?
...

  BUG_ON(!spin_is_locked(src_ptl));


  mmap_assert_locked(src_mm);


  mmap_assert_locked(dst_mm);


  BUG_ON(src_addr & ~HPAGE_PMD_MASK);


  BUG_ON(dst_addr & ~HPAGE_PMD_MASK);



  src_page = pmd_page(src_pmdval);


  BUG_ON(!PageHead(src_page));


  BUG_ON(!PageAnon(src_page));


  if (unlikely(page_mapcount(src_page) != 1)) {


          spin_unlock(src_ptl);


          return -EBUSY;


  }



  get_page(src_page);


  spin_unlock(src_ptl);



  mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src_mm, src_addr,


                          src_addr + HPAGE_PMD_SIZE);


  mmu_notifier_invalidate_range_start(&range);



  /* block all concurrent rmap walks */


  lock_page(src_page);



  /*


   * split_huge_page walks the anon_vma chain without the page


   * lock. Serialize against it with the anon_vma lock, the page


   * lock is not enough.


   */


  src_anon_vma = folio_get_anon_vma(page_folio(src_page));


  if (!src_anon_vma) {


          unlock_page(src_page);


          put_page(src_page);


          mmu_notifier_invalidate_range_end(&range);


          return -EAGAIN;


  }


  anon_vma_lock_write(src_anon_vma);



  dst_ptl = pmd_lockptr(dst_mm, dst_pmd);


  double_pt_lock(src_ptl, dst_ptl);


  if (unlikely(!pmd_same(*src_pmd, src_pmdval) ||


               !pmd_same(*dst_pmd, dst_pmdval) ||


               page_mapcount(src_page) != 1)) {


          double_pt_unlock(src_ptl, dst_ptl);


          anon_vma_unlock_write(src_anon_vma);


          put_anon_vma(src_anon_vma);


          unlock_page(src_page);


          put_page(src_page);


          mmu_notifier_invalidate_range_end(&range);


          return -EAGAIN;


  }



  BUG_ON(!PageHead(src_page));


  BUG_ON(!PageAnon(src_page));


  /* the PT lock is enough to keep the page pinned now */


  put_page(src_page);



  dst_anon_vma = (void *) dst_vma->anon_vma + PAGE_MAPPING_ANON;


  WRITE_ONCE(src_page->mapping, (struct address_space *) dst_anon_vma);


  WRITE_ONCE(src_page->index, linear_page_index(dst_vma, dst_addr));



  if (!pmd_same(pmdp_huge_clear_flush(src_vma, src_addr, src_pmd),


                src_pmdval))


          BUG_ON(1);



I'm not sure we can assert that the PMDs are exactly equal; the CPU
might have changed the A/D bits under us?
...

  _dst_pmd = mk_huge_pmd(src_page, dst_vma->vm_page_prot);


  _dst_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma);


  set_pmd_at(dst_mm, dst_addr, dst_pmd, _dst_pmd);



  pgtable = pgtable_trans_huge_withdraw(src_mm, src_pmd);


  pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);



Are we allowed to move page tables between mm_structs on all
architectures? The first example I found that looks a bit dodgy,
looking through various architectures' pte_alloc_one(), is s390's
page_table_alloc() which looks like page tables are tied to per-MM
lists sometimes.
If that's not allowed, we might have to allocate a new deposit table
and free the old one or something like that.
...

  if (dst_mm != src_mm) {


          add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);


          add_mm_counter(src_mm, MM_ANONPAGES, -HPAGE_PMD_NR);


  }


  double_pt_unlock(src_ptl, dst_ptl);



  anon_vma_unlock_write(src_anon_vma);


  put_anon_vma(src_anon_vma);



  /* unblock rmap walks */


  unlock_page(src_page);



  mmu_notifier_invalidate_range_end(&range);


  return 0;



+}
+#endif /* CONFIG_USERFAULTFD */



/*

Returns page table lock pointer if a given pmd maps a thp, NULL otherwise.


[...]
...
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 96d9eae5c7cc..0cca60dfa8f8 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
[...]
...
+ssize_t remap_pages(struct mm_struct *dst_mm, struct mm_struct *src_mm,

              unsigned long dst_start, unsigned long src_start,


              unsigned long len, __u64 mode)



+{
[...]
...


  if (pgprot_val(src_vma->vm_page_prot) !=


      pgprot_val(dst_vma->vm_page_prot))


          goto out;



Does this check intentionally allow moving pages from a
PROT_READ|PROT_WRITE anonymous private VMA into a PROT_READ anonymous
private VMA (on architectures like x86 and arm64 where CoW memory has
the same protection flags as read-only memory), but forbid moving them
from a PROT_READ|PROT_EXEC VMA into a PROT_READ VMA? I think this
check needs at least a comment to explain what's going on here.
...

  /* only allow remapping if both are mlocked or both aren't */


  if ((src_vma->vm_flags & VM_LOCKED) ^ (dst_vma->vm_flags & VM_LOCKED))


          goto out;



  /*


   * Be strict and only allow remap_pages if either the src or


   * dst range is registered in the userfaultfd to prevent


   * userland errors going unnoticed. As far as the VM


   * consistency is concerned, it would be perfectly safe to


   * remove this check, but there's no useful usage for


   * remap_pages ouside of userfaultfd registered ranges. This


   * is after all why it is an ioctl belonging to the


   * userfaultfd and not a syscall.


   *


   * Allow both vmas to be registered in the userfaultfd, just


   * in case somebody finds a way to make such a case useful.


   * Normally only one of the two vmas would be registered in


   * the userfaultfd.


   */


  if (!dst_vma->vm_userfaultfd_ctx.ctx &&


      !src_vma->vm_userfaultfd_ctx.ctx)


          goto out;



  /*


   * FIXME: only allow remapping across anonymous vmas,


   * tmpfs should be added.


   */


  if (src_vma->vm_ops || dst_vma->vm_ops)


          goto out;



I don't think it's okay to check for anonymous VMAs by checking
->vm_ops. There are some weird drivers whose ->mmap helpers don't set
->vm_ops and instead just shove all the necessary PTEs into the VMA
right on ->mmap, so I think they end up with ->vm_ops==NULL. For
example, kcov_mmap() looks that way. I'm not sure how common this is.
Though, uuuuuh, I guess if that's true, the existing
vma_is_anonymous() is broken, since that also just checks ->vm_ops?
I'm not sure what the consequences of that would be... Either way,
vma_is_anonymous() might be the better way to check for anonymous VMAs
here, and someone should figure out whether vma_is_anonymous() needs
to be fixed.
...

  /*


   * Ensure the dst_vma has a anon_vma or this page


   * would get a NULL anon_vma when moved in the


   * dst_vma.


   */


  err = -ENOMEM;


  if (unlikely(anon_vma_prepare(dst_vma)))


          goto out;



  for (src_addr = src_start, dst_addr = dst_start;


       src_addr < src_start + len;) {


          spinlock_t *ptl;


          pmd_t dst_pmdval;



          BUG_ON(dst_addr >= dst_start + len);


          src_pmd = mm_find_pmd(src_mm, src_addr);



(this would blow up pretty badly if we could have transparent huge PUD
in the region but I think that's limited to file VMAs so it's fine as
it currently is)
...

          if (unlikely(!src_pmd)) {


                  if (!(mode & UFFDIO_REMAP_MODE_ALLOW_SRC_HOLES)) {


                          err = -ENOENT;


                          break;


                  }


                  src_pmd = mm_alloc_pmd(src_mm, src_addr);


                  if (unlikely(!src_pmd)) {


                          err = -ENOMEM;


                          break;


                  }


          }


          dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);


          if (unlikely(!dst_pmd)) {


                  err = -ENOMEM;


                  break;


          }



          dst_pmdval = pmdp_get_lockless(dst_pmd);


          /*


           * If the dst_pmd is mapped as THP don't


           * override it and just be strict.


           */


          if (unlikely(pmd_trans_huge(dst_pmdval))) {


                  err = -EEXIST;


                  break;


          }



This check is racy because the dst_pmd can still change at this point,
from previously pointing to a zeroed PMD to now pointing to a
hugepage, right? And we rely on remap_pages_pte() and
remap_pages_huge_pmd() to recheck for that?
If yes, maybe add a comment noting this and explaining why we want this check.
...

          ptl = pmd_trans_huge_lock(src_pmd, src_vma);


          if (ptl) {


                  /*


                   * Check if we can move the pmd without


                   * splitting it. First check the address


                   * alignment to be the same in src/dst.  These


                   * checks don't actually need the PT lock but


                   * it's good to do it here to optimize this


                   * block away at build time if


                   * CONFIG_TRANSPARENT_HUGEPAGE is not set.


                   */


                  if (thp_aligned == -1)


                          thp_aligned = ((src_addr & ~HPAGE_PMD_MASK) ==


                                         (dst_addr & ~HPAGE_PMD_MASK));


                  if (!thp_aligned || (src_addr & ~HPAGE_PMD_MASK) ||



This seems overly complicated, the only case when you can move a huge
PMD is if both addresses are hugepage-aligned and you have enough
length for one hugepage:
(src_addr & ~HPAGE_PMD_MASK) == 0 && (dst_addr & ~HPAGE_PMD_MASK) == 0
&& (src_start + len - src_addr >= HPAGE_PMD_SIZE).
...

                      !pmd_none(dst_pmdval) ||


                      src_start + len - src_addr < HPAGE_PMD_SIZE) {


                          spin_unlock(ptl);


                          /* Fall through */


                          split_huge_pmd(src_vma, src_pmd, src_addr);


                  } else {


                          err = remap_pages_huge_pmd(dst_mm,


                                                     src_mm,


                                                     dst_pmd,


                                                     src_pmd,


                                                     dst_pmdval,


                                                     dst_vma,


                                                     src_vma,


                                                     dst_addr,


                                                     src_addr);


                          cond_resched();



                          if (!err) {


                                  dst_addr += HPAGE_PMD_SIZE;


                                  src_addr += HPAGE_PMD_SIZE;


                                  moved += HPAGE_PMD_SIZE;


                          }



                          if ((!err || err == -EAGAIN) &&


                              fatal_signal_pending(current))


                                  err = -EINTR;



                          if (err && err != -EAGAIN)


                                  break;



                          continue;


                  }


          }



          if (pmd_none(*src_pmd)) {


                  if (!(mode & UFFDIO_REMAP_MODE_ALLOW_SRC_HOLES)) {


                          err = -ENOENT;


                          break;


                  }


                  if (unlikely(__pte_alloc(src_mm, src_pmd))) {


                          err = -ENOMEM;


                          break;


                  }


          }



          if (unlikely(pmd_none(dst_pmdval)) &&


              unlikely(__pte_alloc(dst_mm, dst_pmd))) {



Maybe just use pte_alloc() here?
...

                  err = -ENOMEM;


                  break;


          }



          err = remap_pages_pte(dst_mm, src_mm,


                                dst_pmd, src_pmd,


                                dst_vma, src_vma,


                                dst_addr, src_addr,


                                mode);



          cond_resched();



          if (!err) {


                  dst_addr += PAGE_SIZE;


                  src_addr += PAGE_SIZE;


                  moved += PAGE_SIZE;


          }



          if ((!err || err == -EAGAIN) &&


              fatal_signal_pending(current))


                  err = -EINTR;



          if (err && err != -EAGAIN)


                  break;


  }




+out:

  mmap_read_unlock(dst_mm);


  if (dst_mm != src_mm)


          mmap_read_unlock(src_mm);


  BUG_ON(moved < 0);


  BUG_ON(err > 0);


  BUG_ON(!moved && !err);


  return moved ? moved : err;



+}
Maybe you could try whether this function would look simpler with a
shape roughly like:
for (src_addr = ...; src_addr < ...;) {
  unsigned long step_size;
if (hugepage case) {
    if (have to split) {
      split it;
      continue;
    }
    step_size = HPAGE_PMD_SIZE;
    ...
  } else {
    ... 4k case ...
    step_size = PAGE_SIZE;
  }
  ...
  cond_resched();
  if (!err) {
    dst_addr += step_size;
    src_addr += step_size;
    moved += step_size;
  }
  ...
}
I'll need some time to gather the answers to all your questions and
will reply once I have them ready.
Thanks for reviewing, Jann!

    

2026

2025

2024

2023

2022

2021

2020

2019

2018

2017

Re: [PATCH 2/3] userfaultfd: UFFDIO_REMAP uABI