Hi, Mike,
On Sat, Oct 22, 2022 at 07:50:47PM -0700, Mike Kravetz wrote:
[...]
-void __unmap_hugepage_range_final(struct mmu_gather *tlb, +static void __unmap_hugepage_range_locking(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page,
zap_flags_t zap_flags)
zap_flags_t zap_flags, bool final)
{ hugetlb_vma_lock_write(vma); i_mmap_lock_write(vma->vm_file->f_mapping); __unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags);
- /*
* Unlock and free the vma lock before releasing i_mmap_rwsem. When
* the vma_lock is freed, this makes the vma ineligible for pmd
* sharing. And, i_mmap_rwsem is required to set up pmd sharing.
* This is important as page tables for this unmapped range will
* be asynchrously deleted. If the page tables are shared, there
* will be issues when accessed by someone else.
*/
- __hugetlb_vma_unlock_write_free(vma);
- if (final) {
/*
* Unlock and free the vma lock before releasing i_mmap_rwsem.
* When the vma_lock is freed, this makes the vma ineligible
* for pmd sharing. And, i_mmap_rwsem is required to set up
* pmd sharing. This is important as page tables for this
* unmapped range will be asynchrously deleted. If the page
* tables are shared, there will be issues when accessed by
* someone else.
*/
__hugetlb_vma_unlock_write_free(vma);
i_mmap_unlock_write(vma->vm_file->f_mapping);
Pure question: can we rely on hugetlb_vm_op_close() to destroy the hugetlb vma lock?
I read the comment above, it seems we are trying to avoid racing with pmd sharing, but I don't see how that could ever happen, since iiuc there should only be two places that unmaps the vma (final==true):
(1) munmap: we're holding write lock, so no page fault possible (2) exit_mmap: we've already reset current->mm so no page fault possible
- } else {
i_mmap_unlock_write(vma->vm_file->f_mapping);
hugetlb_vma_unlock_write(vma);
- }
+}
- i_mmap_unlock_write(vma->vm_file->f_mapping);
+void __unmap_hugepage_range_final(struct mmu_gather *tlb,
struct vm_area_struct *vma, unsigned long start,
unsigned long end, struct page *ref_page,
zap_flags_t zap_flags)
+{
- __unmap_hugepage_range_locking(tlb, vma, start, end, ref_page,
zap_flags, true);
} +#ifdef CONFIG_ADVISE_SYSCALLS +/*
- Similar setup as in zap_page_range(). madvise(MADV_DONTNEED) can not call
- zap_page_range for hugetlb vmas as __unmap_hugepage_range_final will delete
- the associated vma_lock.
- */
+void clear_hugetlb_page_range(struct vm_area_struct *vma, unsigned long start,
unsigned long end)
+{
- struct mmu_notifier_range range;
- struct mmu_gather tlb;
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
start, end);
Is mmu_notifier_invalidate_range_start() missing here?
- tlb_gather_mmu(&tlb, vma->vm_mm);
- update_hiwater_rss(vma->vm_mm);
- __unmap_hugepage_range_locking(&tlb, vma, start, end, NULL, 0, false);
- mmu_notifier_invalidate_range_end(&range);
- tlb_finish_mmu(&tlb);
+} +#endif
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page, zap_flags_t zap_flags) diff --git a/mm/madvise.c b/mm/madvise.c index 2baa93ca2310..90577a669635 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -790,7 +790,10 @@ static int madvise_free_single_vma(struct vm_area_struct *vma, static long madvise_dontneed_single_vma(struct vm_area_struct *vma, unsigned long start, unsigned long end) {
- zap_page_range(vma, start, end - start);
- if (!is_vm_hugetlb_page(vma))
zap_page_range(vma, start, end - start);
- else
return 0;clear_hugetlb_page_range(vma, start, end);
}
This does look a bit unfortunate - zap_page_range() contains yet another is_vm_hugetlb_page() check (further down in unmap_single_vma), it can be very confusing on which code path is really handling hugetlb.
The other mm_users check in v3 doesn't need this change, but was a bit hackish to me, because IIUC we're clear on the call paths to trigger this (unmap_vmas), so it seems clean to me to pass that info from the upper stack.
Maybe we can have a new zap_flags passed into unmap_single_vma() showing that it's destroying the vma?
Thanks,