On Thu, Sep 14, 2023 at 5:26 PM Suren Baghdasaryan surenb@google.com wrote:
+/*
- The mmap_lock for reading is held by the caller. Just move the page
- from src_pmd to dst_pmd if possible, and return true if succeeded
- in moving the page.
- */
+static int remap_pages_pte(struct mm_struct *dst_mm,
struct mm_struct *src_mm,
pmd_t *dst_pmd,
pmd_t *src_pmd,
struct vm_area_struct *dst_vma,
struct vm_area_struct *src_vma,
unsigned long dst_addr,
unsigned long src_addr,
__u64 mode)
+{
swp_entry_t entry;
pte_t orig_src_pte, orig_dst_pte;
spinlock_t *src_ptl, *dst_ptl;
pte_t *src_pte = NULL;
pte_t *dst_pte = NULL;
struct folio *src_folio = NULL;
struct anon_vma *src_anon_vma = NULL;
struct anon_vma *dst_anon_vma;
struct mmu_notifier_range range;
int err = 0;
+retry:
dst_pte = pte_offset_map_nolock(dst_mm, dst_pmd, dst_addr, &dst_ptl);
/* If an huge pmd materialized from under us fail */
if (unlikely(!dst_pte)) {
err = -EFAULT;
goto out;
}
src_pte = pte_offset_map_nolock(src_mm, src_pmd, src_addr, &src_ptl);
/*
* We held the mmap_lock for reading so MADV_DONTNEED
* can zap transparent huge pages under us, or the
* transparent huge page fault can establish new
* transparent huge pages under us.
*/
if (unlikely(!src_pte)) {
err = -EFAULT;
goto out;
}
BUG_ON(pmd_none(*dst_pmd));
BUG_ON(pmd_none(*src_pmd));
BUG_ON(pmd_trans_huge(*dst_pmd));
BUG_ON(pmd_trans_huge(*src_pmd));
spin_lock(dst_ptl);
orig_dst_pte = *dst_pte;
spin_unlock(dst_ptl);
if (!pte_none(orig_dst_pte)) {
err = -EEXIST;
goto out;
}
spin_lock(src_ptl);
orig_src_pte = *src_pte;
spin_unlock(src_ptl);
if (pte_none(orig_src_pte)) {
if (!(mode & UFFDIO_REMAP_MODE_ALLOW_SRC_HOLES))
err = -ENOENT;
else /* nothing to do to remap a hole */
err = 0;
goto out;
}
if (pte_present(orig_src_pte)) {
if (!src_folio) {
struct folio *folio;
/*
* Pin the page while holding the lock to be sure the
* page isn't freed under us
*/
spin_lock(src_ptl);
if (!pte_same(orig_src_pte, *src_pte)) {
spin_unlock(src_ptl);
err = -EAGAIN;
goto out;
}
folio = vm_normal_folio(src_vma, src_addr, orig_src_pte);
if (!folio || !folio_test_anon(folio) ||
folio_estimated_sharers(folio) != 1) {
spin_unlock(src_ptl);
err = -EBUSY;
goto out;
}
src_folio = folio;
folio_get(src_folio);
spin_unlock(src_ptl);
/* try to block all concurrent rmap walks */
if (!folio_trylock(src_folio)) {
pte_unmap(&orig_src_pte);
pte_unmap(&orig_dst_pte);
src_pte = dst_pte = NULL;
folio_lock(src_folio);
goto retry;
}
}
if (!src_anon_vma) {
/*
* folio_referenced walks the anon_vma chain
* without the folio lock. Serialize against it with
* the anon_vma lock, the folio lock is not enough.
*/
src_anon_vma = folio_get_anon_vma(src_folio);
if (!src_anon_vma) {
/* page was unmapped from under us */
err = -EAGAIN;
goto out;
}
if (!anon_vma_trylock_write(src_anon_vma)) {
pte_unmap(&orig_src_pte);
pte_unmap(&orig_dst_pte);
src_pte = dst_pte = NULL;
anon_vma_lock_write(src_anon_vma);
goto retry;
}
}
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src_mm,
src_addr, src_addr + PAGE_SIZE);
mmu_notifier_invalidate_range_start_nonblock(&range);
I'm pretty sure you're not allowed to use mmu_notifier_invalidate_range_start_nonblock(). Use mmu_notifier_invalidate_range_start() at a point where it's fine to block; perhaps all the way up in remap_pages() around the whole loop or something like that. mmu_notifier_invalidate_range_start_nonblock() is special magic sauce for OOM reaping (with deceptively inviting naming and no documentation that explains that this is evil hazmat code), and if you chase down what various users of MMU notifiers do when you just hand them a random notification where mmu_notifier_range_blockable() is false, you end up in fun paths like in KVM's kvm_mmu_notifier_invalidate_range_start(), which calls gfn_to_pfn_cache_invalidate_start(), which does this:
/* * If the OOM reaper is active, then all vCPUs should have * been stopped already, so perform the request without * KVM_REQUEST_WAIT and be sad if any needed to be IPI'd. */ if (!may_block) req &= ~KVM_REQUEST_WAIT;
called = kvm_make_vcpus_request_mask(kvm, req, vcpu_bitmap);
WARN_ON_ONCE(called && !may_block);
Which means if you hit this codepath from a place like userfaultfd where the process is probably still running (and not being OOM reaped in a context where it's guaranteed to have been killed and stopped), you could probably get KVM into a situation where it needs to wait for vCPUs to acknowledge that they've processed a message before it's safe to continue, but it can't wait because we're in nonsleepable context, and then it just continues without waiting and presumably the KVM TLB gets out of sync or something?
double_pt_lock(dst_ptl, src_ptl);
if (!pte_same(*src_pte, orig_src_pte) ||
!pte_same(*dst_pte, orig_dst_pte) ||
folio_estimated_sharers(src_folio) != 1) {
double_pt_unlock(dst_ptl, src_ptl);
err = -EAGAIN;
goto out;
}
BUG_ON(!folio_test_anon(src_folio));
/* the PT lock is enough to keep the page pinned now */
folio_put(src_folio);
dst_anon_vma = (void *) dst_vma->anon_vma + PAGE_MAPPING_ANON;
WRITE_ONCE(src_folio->mapping,
(struct address_space *) dst_anon_vma);
WRITE_ONCE(src_folio->index, linear_page_index(dst_vma,
dst_addr));
if (!pte_same(ptep_clear_flush(src_vma, src_addr, src_pte),
orig_src_pte))
BUG_ON(1);
orig_dst_pte = mk_pte(&src_folio->page, dst_vma->vm_page_prot);
orig_dst_pte = maybe_mkwrite(pte_mkdirty(orig_dst_pte),
dst_vma);
set_pte_at(dst_mm, dst_addr, dst_pte, orig_dst_pte);
if (dst_mm != src_mm) {
inc_mm_counter(dst_mm, MM_ANONPAGES);
dec_mm_counter(src_mm, MM_ANONPAGES);
}
double_pt_unlock(dst_ptl, src_ptl);
anon_vma_unlock_write(src_anon_vma);
mmu_notifier_invalidate_range_end(&range);
put_anon_vma(src_anon_vma);
src_anon_vma = NULL;
/* unblock rmap walks */
folio_unlock(src_folio);
src_folio = NULL;
} else {
struct swap_info_struct *si;
int swap_count;
entry = pte_to_swp_entry(orig_src_pte);
if (non_swap_entry(entry)) {
if (is_migration_entry(entry)) {
pte_unmap(&orig_src_pte);
pte_unmap(&orig_dst_pte);
src_pte = dst_pte = NULL;
migration_entry_wait(src_mm, src_pmd,
src_addr);
err = -EAGAIN;
} else
err = -EFAULT;
goto out;
}
/*
* COUNT_CONTINUE to be returned is fine here, no need
* of follow all swap continuation to check against
* number 1.
*/
si = get_swap_device(entry);
if (!si) {
err = -EBUSY;
goto out;
}
swap_count = swap_swapcount(si, entry);
put_swap_device(si);
if (swap_count != 1) {
err = -EBUSY;
goto out;
}
double_pt_lock(dst_ptl, src_ptl);
if (!pte_same(*src_pte, orig_src_pte) ||
!pte_same(*dst_pte, orig_dst_pte) ||
swp_swapcount(entry) != 1) {
double_pt_unlock(dst_ptl, src_ptl);
err = -EAGAIN;
goto out;
}
if (pte_val(ptep_get_and_clear(src_mm, src_addr, src_pte)) !=
pte_val(orig_src_pte))
BUG_ON(1);
set_pte_at(dst_mm, dst_addr, dst_pte, orig_src_pte);
if (dst_mm != src_mm) {
inc_mm_counter(dst_mm, MM_ANONPAGES);
dec_mm_counter(src_mm, MM_ANONPAGES);
}
double_pt_unlock(dst_ptl, src_ptl);
}
+out:
if (src_anon_vma) {
anon_vma_unlock_write(src_anon_vma);
put_anon_vma(src_anon_vma);
}
if (src_folio) {
folio_unlock(src_folio);
folio_put(src_folio);
}
if (dst_pte)
pte_unmap(dst_pte);
if (src_pte)
pte_unmap(src_pte);
return err;
+}