+cc linux-api - see the description of the new behavior below
On 7/11/25 13:38, Lorenzo Stoakes wrote:
Historically we've made it a uAPI requirement that mremap() may only operate on a single VMA at a time.
For instances where VMAs need to be resized, this makes sense, as it becomes very difficult to determine what a user actually wants should they indicate a desire to expand or shrink the size of multiple VMAs (truncate? Adjust sizes individually? Some other strategy?).
However, in instances where a user is moving VMAs, it is restrictive to disallow this.
This is especially the case for anonymous mappings, where a remap may or may not be mergeable depending on whether the VMAs have been faulted, due to anon_vma assignment and folio index alignment with vma->vm_pgoff.
Often this results in surprising behaviour where a region is moved, faulted, then moved back, and the user fails to observe a merge of otherwise compatible, adjacent VMAs.
This change allows such cases to work without the user having to be cognizant of whether a prior mremap() move or other VMA operations have resulted in VMA fragmentation.
We only permit this for mremap() operations that do NOT change the size of the VMA and DO specify MREMAP_MAYMOVE | MREMAP_FIXED.
Should no VMA exist in the range, -EFAULT is returned as usual.
If a VMA move spans a single VMA - then there is no functional change.
Otherwise, we place additional requirements upon VMAs:
They must not have a userfaultfd context associated with them - this requires dropping the lock to notify users, and we want to perform the operation with the mmap write lock held throughout.
If file-backed, they cannot have a custom get_unmapped_area handler - this might result in MREMAP_FIXED not being honoured, which could result in unexpected positioning of VMAs in the moved region.
There may be gaps in the range of VMAs that are moved:
           X          Y                         X          Y
         <--->      <->                       <--->      <->
|-------|     |-----|   |-----|      |-------|     |-----|   |-----|
|   A   |     |  B  |   |  C  | ---> |   A'  |     |  B' |   |  C' |
|-------|     |-----|   |-----|      |-------|     |-----|   |-----|
addr                                 new_addr
The move will preserve the gaps between each VMA.
AFAIU "moving a gap" doesn't mean we unmap anything pre-existing where the moved gap's range falls to, right? Worth pointing out explicitly.
Note that any failures encountered will result in a partial move. Since an mremap() can fail at any time, this might result in only some of the VMAs being moved.
Note that failures are very rare and typically require an out-of-memory condition or a mapping limit to be hit, assuming the VMAs being moved are valid.
We don't try to assess ahead of time whether VMAs are valid according to the multi VMA rules, as it would be rather unusual for a user to mix uffd-enabled VMAs and/or VMAs which map unusual driver mappings that specify custom get_unmapped_area() handlers in an aggregate operation.
So we optimise for the far, far more likely case of the operation being entirely permissible.
Guess it's the sanest thing to do given all the circumstances.
In the case of the move of a single VMA, the above conditions are permitted. This makes the behaviour identical for a single VMA as before.
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Some nits:
 mm/mremap.c | 157 +++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 150 insertions(+), 7 deletions(-)

diff --git a/mm/mremap.c b/mm/mremap.c
index 8cb08ccea6ad..59f49de0f84e 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -69,6 +69,8 @@ struct vma_remap_struct {
        enum mremap_type remap_type;    /* expand, shrink, etc. */
        bool mmap_locked;               /* Is mm currently write-locked? */
        unsigned long charged;          /* If VM_ACCOUNT, # pages to account. */
+       bool seen_vma;                  /* Is >1 VMA being moved? */
Seems this could be a local variable of remap_move().
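I.e. a rough, untested sketch of what I mean:

static unsigned long remap_move(struct vma_remap_struct *vrm)
{
        bool seen_vma = false;  /* local, rather than vrm->seen_vma */
        /* ... */

        for_each_vma_range(vmi, vma, end) {
                /* ... */
                /* No gap permitted at the start of the range. */
                if (!seen_vma && start < vma->vm_start)
                        return -EFAULT;
                /* ... */
                seen_vma = true;
                target_addr = res_vma + vrm->new_len;
        }

        return res;
}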
+       bool vmi_needs_reset;           /* Was the VMA iterator invalidated? */
 };

 static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr)
@@ -1188,6 +1190,9 @@ static int copy_vma_and_data(struct vma_remap_struct *vrm,
                *new_vma_ptr = NULL;
                return -ENOMEM;
        }
A newline here?
+       if (vma != vrm->vma)
+               vrm->vmi_needs_reset = true;
A comment on what this condition means wouldn't hurt? Is it when "Source vma may have been merged into new_vma" in copy_vma(), or when not?
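If it is indeed the merge case, perhaps something along these lines (just a suggestion, the wording may well be off):

        /*
         * copy_vma() may have merged the source VMA into an adjacent one
         * ("Source vma may have been merged into new_vma"), in which case
         * the VMA tree has changed underneath a multi-VMA move and the
         * iterator in remap_move() must be reset.
         */
        if (vma != vrm->vma)
                vrm->vmi_needs_reset = true;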
(no comments below, remaining quoted part left for linux-api reference)
+
        vrm->vma = vma;
        pmc.old = vma;
        pmc.new = new_vma;
@@ -1583,6 +1588,18 @@ static bool vrm_will_map_new(struct vma_remap_struct *vrm)
        return false;
 }

+/* Does this remap ONLY move mappings? */
+static bool vrm_move_only(struct vma_remap_struct *vrm)
+{
+       if (!(vrm->flags & MREMAP_FIXED))
+               return false;
+
+       if (vrm->old_len != vrm->new_len)
+               return false;
+
+       return true;
+}
+
 static void notify_uffd(struct vma_remap_struct *vrm, bool failed)
 {
        struct mm_struct *mm = current->mm;
@@ -1597,6 +1614,32 @@ static void notify_uffd(struct vma_remap_struct *vrm, bool failed)
        userfaultfd_unmap_complete(mm, vrm->uf_unmap);
 }

+static bool vma_multi_allowed(struct vm_area_struct *vma)
+{
+       struct file *file;
+
+       /*
+        * We can't support moving multiple uffd VMAs as notify requires
+        * mmap lock to be dropped.
+        */
+       if (userfaultfd_armed(vma))
+               return false;
+
+       /*
+        * Custom get unmapped area might result in MREMAP_FIXED not
+        * being obeyed.
+        */
+       file = vma->vm_file;
+       if (file && !vma_is_shmem(vma) && !is_vm_hugetlb_page(vma)) {
+               const struct file_operations *fop = file->f_op;
+
+               if (fop->get_unmapped_area)
+                       return false;
+       }
+
+       return true;
+}
+
 static int check_prep_vma(struct vma_remap_struct *vrm)
 {
        struct vm_area_struct *vma = vrm->vma;
@@ -1646,7 +1689,19 @@ static int check_prep_vma(struct vma_remap_struct *vrm)
            (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)))
                return -EINVAL;

-       /* We can't remap across vm area boundaries */
+       /*
+        * We can't remap across the end of VMAs, as another VMA may be
+        * adjacent:
+        *
+        *       addr    vma->vm_end
+        *       |-----.----------|
+        *       |     .          |
+        *       |-----.----------|
+        *             .<--------->xxx>
+        *                old_len
+        *
+        * We also require that vma->vm_start <= addr < vma->vm_end.
+        */
        if (old_len > vma->vm_end - addr)
                return -EFAULT;
@@ -1746,6 +1801,90 @@ static unsigned long check_mremap_params(struct vma_remap_struct *vrm)
        return 0;
 }

+static unsigned long remap_move(struct vma_remap_struct *vrm)
+{
+       struct vm_area_struct *vma;
+       unsigned long start = vrm->addr;
+       unsigned long end = vrm->addr + vrm->old_len;
+       unsigned long new_addr = vrm->new_addr;
+       unsigned long target_addr = new_addr;
+       unsigned long res = -EFAULT;
+       unsigned long last_end;
+       bool allowed = true;
+       VMA_ITERATOR(vmi, current->mm, start);
+
+       /*
+        * When moving VMAs we allow for batched moves across multiple VMAs,
+        * with all VMAs in the input range [addr, addr + old_len) being moved
+        * (and split as necessary).
+        */
+       for_each_vma_range(vmi, vma, end) {
+               /* Account for start, end not aligned with VMA start, end. */
+               unsigned long addr = max(vma->vm_start, start);
+               unsigned long len = min(end, vma->vm_end) - addr;
+               unsigned long offset, res_vma;
+
+               if (!allowed)
+                       return -EFAULT;
+
+               /* No gap permitted at the start of the range. */
+               if (!vrm->seen_vma && start < vma->vm_start)
+                       return -EFAULT;
+
+               /*
+                * To sensibly move multiple VMAs, accounting for the fact that
+                * get_unmapped_area() may align even MAP_FIXED moves, we simply
+                * attempt to move such that the gaps between source VMAs remain
+                * consistent in destination VMAs, e.g.:
+                *
+                *            X          Y                         X          Y
+                *          <--->      <->                       <--->      <->
+                * |-------|     |-----|   |-----|      |-------|     |-----|   |-----|
+                * |   A   |     |  B  |   |  C  | ---> |   A'  |     |  B' |   |  C' |
+                * |-------|     |-----|   |-----|      |-------|     |-----|   |-----|
+                *                                      new_addr
+                *
+                * So we map B' at A'->vm_end + X, and C' at B'->vm_end + Y.
+                */
+               offset = vrm->seen_vma ? vma->vm_start - last_end : 0;
+               last_end = vma->vm_end;
+
+               vrm->vma = vma;
+               vrm->addr = addr;
+               vrm->new_addr = target_addr + offset;
+               vrm->old_len = vrm->new_len = len;
+
+               allowed = vma_multi_allowed(vma);
+               if (vrm->seen_vma && !allowed)
+                       return -EFAULT;
+
+               res_vma = check_prep_vma(vrm);
+               if (!res_vma)
+                       res_vma = mremap_to(vrm);
+               if (IS_ERR_VALUE(res_vma))
+                       return res_vma;
+
+               if (!vrm->seen_vma) {
+                       VM_WARN_ON_ONCE(allowed && res_vma != new_addr);
+                       res = res_vma;
+               }
+
+               /* mmap lock is only dropped on shrink. */
+               VM_WARN_ON_ONCE(!vrm->mmap_locked);
+               /* This is a move, no expand should occur. */
+               VM_WARN_ON_ONCE(vrm->populate_expand);
+
+               if (vrm->vmi_needs_reset) {
+                       vma_iter_reset(&vmi);
+                       vrm->vmi_needs_reset = false;
+               }
+
+               vrm->seen_vma = true;
+               target_addr = res_vma + vrm->new_len;
+       }
+
+       return res;
+}
+
 static unsigned long do_mremap(struct vma_remap_struct *vrm)
 {
        struct mm_struct *mm = current->mm;
@@ -1763,13 +1902,17 @@ static unsigned long do_mremap(struct vma_remap_struct *vrm)
                return -EINTR;
        vrm->mmap_locked = true;

-       vrm->vma = vma_lookup(current->mm, vrm->addr);
-       res = check_prep_vma(vrm);
-       if (res)
-               goto out;
+       if (vrm_move_only(vrm)) {
+               res = remap_move(vrm);
+       } else {
+               vrm->vma = vma_lookup(current->mm, vrm->addr);
+               res = check_prep_vma(vrm);
+               if (res)
+                       goto out;

-       /* Actually execute mremap. */
-       res = vrm_implies_new_addr(vrm) ? mremap_to(vrm) : mremap_at(vrm);
+               /* Actually execute mremap. */
+               res = vrm_implies_new_addr(vrm) ? mremap_to(vrm) : mremap_at(vrm);
+       }

 out:
        failed = IS_ERR_VALUE(res);