- Linux-stable-mirror - lists.linaro.org

+ hugetlbfs-remove-unnecessary-code-after-i_mmap_rwsem-synchronization.patch added to -mm tree

by akpm＠linux-foundation.org

The patch titled Subject: hugetlbfs: remove unnecessary code after i_mmap_rwsem synchronization has been added to the -mm tree. Its filename is hugetlbfs-remove-unnecessary-code-after-i_mmap_rwsem-synchronization.patch This patch should soon appear at http://ozlabs.org/~akpm/mmots/broken-out/hugetlbfs-remove-unnecessary-code-… and later at http://ozlabs.org/~akpm/mmotm/broken-out/hugetlbfs-remove-unnecessary-code-… Before you just go and hit "reply", please: a) Consider who else should be cc'ed b) Prefer to cc a suitable mailing list as well c) Ideally: find the original patch on the mailing list and do a reply-to-all to that, adding suitable additional cc's *** Remember to use Documentation/process/submit-checklist.rst when testing your code *** The -mm tree is included into linux-next and is updated there every 3-4 working days ------------------------------------------------------ From: Mike Kravetz <mike.kravetz(a)oracle.com> Subject: hugetlbfs: remove unnecessary code after i_mmap_rwsem synchronization After expanding i_mmap_rwsem use for better shared pmd and page fault/ truncation synchronization, remove code that is no longer necessary. Link: http://lkml.kernel.org/r/20181203200850.6460-4-mike.kravetz@oracle.com Fixes: ebed4bfc8da8 ("hugetlb: fix absurd HugePages_Rsvd") Signed-off-by: Mike Kravetz <mike.kravetz(a)oracle.com> Cc: Andrea Arcangeli <aarcange(a)redhat.com> Cc: "Aneesh Kumar K . V" <aneesh.kumar(a)linux.vnet.ibm.com> Cc: Davidlohr Bueso <dave(a)stgolabs.net> Cc: Hugh Dickins <hughd(a)google.com> Cc: "Kirill A . Shutemov" <kirill.shutemov(a)linux.intel.com> Cc: Michal Hocko <mhocko(a)kernel.org> Cc: Naoya Horiguchi <n-horiguchi(a)ah.jp.nec.com> Cc: Prakash Sangappa <prakash.sangappa(a)oracle.com> Cc: <stable(a)vger.kernel.org> Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org> --- fs/hugetlbfs/inode.c | 46 +++++++++++++---------------------------- mm/hugetlb.c | 21 ++++++++---------- 2 files changed, 25 insertions(+), 42 deletions(-) --- a/fs/hugetlbfs/inode.c~hugetlbfs-remove-unnecessary-code-after-i_mmap_rwsem-synchronization +++ a/fs/hugetlbfs/inode.c @@ -383,17 +383,16 @@ hugetlb_vmdelete_list(struct rb_root_cac * truncation is indicated by end of range being LLONG_MAX * In this case, we first scan the range and release found pages. * After releasing pages, hugetlb_unreserve_pages cleans up region/reserv - * maps and global counts. Page faults can not race with truncation - * in this routine. hugetlb_no_page() prevents page faults in the - * truncated range. It checks i_size before allocation, and again after - * with the page table lock for the page held. The same lock must be - * acquired to unmap a page. + * maps and global counts. * hole punch is indicated if end is not LLONG_MAX * In the hole punch case we scan the range and release found pages. * Only when releasing a page is the associated region/reserv map * deleted. The region/reserv map for ranges without associated - * pages are not modified. Page faults can race with hole punch. - * This is indicated if we find a mapped page. + * pages are not modified. + * + * Callers of this routine must hold the i_mmap_rwsem in write mode to prevent + * races with page faults. + * * Note: If the passed end of range value is beyond the end of file, but * not LLONG_MAX this routine still performs a hole punch operation. */ @@ -423,32 +422,14 @@ static void remove_inode_hugepages(struc for (i = 0; i < pagevec_count(&pvec); ++i) { struct page *page = pvec.pages[i]; - u32 hash; index = page->index; - hash = hugetlb_fault_mutex_hash(h, current->mm, - &pseudo_vma, - mapping, index, 0); - mutex_lock(&hugetlb_fault_mutex_table[hash]); - /* - * If page is mapped, it was faulted in after being - * unmapped in caller. Unmap (again) now after taking - * the fault mutex. The mutex will prevent faults - * until we finish removing the page. - * - * This race can only happen in the hole punch case. - * Getting here in a truncate operation is a bug. + * A mapped page is impossible as callers should unmap + * all references before calling. And, i_mmap_rwsem + * prevents the creation of additional mappings. */ - if (unlikely(page_mapped(page))) { - BUG_ON(truncate_op); - - i_mmap_lock_write(mapping); - hugetlb_vmdelete_list(&mapping->i_mmap, - index * pages_per_huge_page(h), - (index + 1) * pages_per_huge_page(h)); - i_mmap_unlock_write(mapping); - } + VM_BUG_ON(page_mapped(page)); lock_page(page); /* @@ -470,7 +451,6 @@ static void remove_inode_hugepages(struc } unlock_page(page); - mutex_unlock(&hugetlb_fault_mutex_table[hash]); } huge_pagevec_release(&pvec); cond_resched(); @@ -624,7 +604,11 @@ static long hugetlbfs_fallocate(struct f /* addr is the offset within the file (zero based) */ addr = index * hpage_size; - /* mutex taken here, fault path and hole punch */ + /* + * fault mutex taken here, protects against fault path + * and hole punch. inode_lock previously taken protects + * against truncation. + */ hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping, index, addr); mutex_lock(&hugetlb_fault_mutex_table[hash]); --- a/mm/hugetlb.c~hugetlbfs-remove-unnecessary-code-after-i_mmap_rwsem-synchronization +++ a/mm/hugetlb.c @@ -3760,16 +3760,16 @@ static vm_fault_t hugetlb_no_page(struct } /* - * Use page lock to guard against racing truncation - * before we get page_table_lock. + * We can not race with truncation due to holding i_mmap_rwsem. + * Check once here for faults beyond end of file. */ + size = i_size_read(mapping->host) >> huge_page_shift(h); + if (idx >= size) + goto out; + retry: page = find_lock_page(mapping, idx); if (!page) { - size = i_size_read(mapping->host) >> huge_page_shift(h); - if (idx >= size) - goto out; - /* * Check for page in userfault range */ @@ -3859,9 +3859,6 @@ retry: } ptl = huge_pte_lock(h, mm, ptep); - size = i_size_read(mapping->host) >> huge_page_shift(h); - if (idx >= size) - goto backout; ret = 0; if (!huge_pte_none(huge_ptep_get(ptep))) @@ -3964,8 +3961,10 @@ vm_fault_t hugetlb_fault(struct mm_struc /* * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold - * until finished with ptep. This prevents huge_pmd_unshare from - * being called elsewhere and making the ptep no longer valid. + * until finished with ptep. This serves two purposes: + * 1) It prevents huge_pmd_unshare from being called elsewhere + * and making the ptep no longer valid. + * 2) It synchronizes us with file truncation. * * ptep could have already be assigned via huge_pte_offset. That * is OK, as huge_pte_alloc will return the same value unless _ Patches currently in -mm which might be from mike.kravetz(a)oracle.com are hugetlbfs-use-i_mmap_rwsem-for-more-pmd-sharing-synchronization.patch hugetlbfs-use-i_mmap_rwsem-to-fix-page-fault-truncate-race.patch hugetlbfs-remove-unnecessary-code-after-i_mmap_rwsem-synchronization.patch

6 years, 10 months

1
0
0 0

+ hugetlbfs-use-i_mmap_rwsem-to-fix-page-fault-truncate-race.patch added to -mm tree

by akpm＠linux-foundation.org

The patch titled Subject: hugetlbfs: Use i_mmap_rwsem to fix page fault/truncate race has been added to the -mm tree. Its filename is hugetlbfs-use-i_mmap_rwsem-to-fix-page-fault-truncate-race.patch This patch should soon appear at http://ozlabs.org/~akpm/mmots/broken-out/hugetlbfs-use-i_mmap_rwsem-to-fix-… and later at http://ozlabs.org/~akpm/mmotm/broken-out/hugetlbfs-use-i_mmap_rwsem-to-fix-… Before you just go and hit "reply", please: a) Consider who else should be cc'ed b) Prefer to cc a suitable mailing list as well c) Ideally: find the original patch on the mailing list and do a reply-to-all to that, adding suitable additional cc's *** Remember to use Documentation/process/submit-checklist.rst when testing your code *** The -mm tree is included into linux-next and is updated there every 3-4 working days ------------------------------------------------------ From: Mike Kravetz <mike.kravetz(a)oracle.com> Subject: hugetlbfs: Use i_mmap_rwsem to fix page fault/truncate race hugetlbfs page faults can race with truncate and hole punch operations. Current code in the page fault path attempts to handle this by 'backing out' operations if we encounter the race. One obvious omission in the current code is removing a page newly added to the page cache. This is pretty straight forward to address, but there is a more subtle and difficult issue of backing out hugetlb reservations. To handle this correctly, the 'reservation state' before page allocation needs to be noted so that it can be properly backed out. There are four distinct possibilities for reservation state: shared/reserved, shared/no-resv, private/reserved and private/no-resv. Backing out a reservation may require memory allocation which could fail so that needs to be taken into account as well. Instead of writing the required complicated code for this rare occurrence, just eliminate the race. i_mmap_rwsem is now held in read mode for the duration of page fault processing. Hold i_mmap_rwsem longer in truncation and hold punch code to cover the call to remove_inode_hugepages. Link: http://lkml.kernel.org/r/20181203200850.6460-3-mike.kravetz@oracle.com Fixes: ebed4bfc8da8 ("hugetlb: fix absurd HugePages_Rsvd") Signed-off-by: Mike Kravetz <mike.kravetz(a)oracle.com> Cc: Andrea Arcangeli <aarcange(a)redhat.com> Cc: "Aneesh Kumar K . V" <aneesh.kumar(a)linux.vnet.ibm.com> Cc: Davidlohr Bueso <dave(a)stgolabs.net> Cc: Hugh Dickins <hughd(a)google.com> Cc: "Kirill A . Shutemov" <kirill.shutemov(a)linux.intel.com> Cc: Michal Hocko <mhocko(a)kernel.org> Cc: Naoya Horiguchi <n-horiguchi(a)ah.jp.nec.com> Cc: Prakash Sangappa <prakash.sangappa(a)oracle.com> Cc: <stable(a)vger.kernel.org> Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org> --- fs/hugetlbfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) --- a/fs/hugetlbfs/inode.c~hugetlbfs-use-i_mmap_rwsem-to-fix-page-fault-truncate-race +++ a/fs/hugetlbfs/inode.c @@ -505,8 +505,8 @@ static int hugetlb_vmtruncate(struct ino i_mmap_lock_write(mapping); if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)) hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0); - i_mmap_unlock_write(mapping); remove_inode_hugepages(inode, offset, LLONG_MAX); + i_mmap_unlock_write(mapping); return 0; } @@ -540,8 +540,8 @@ static long hugetlbfs_punch_hole(struct hugetlb_vmdelete_list(&mapping->i_mmap, hole_start >> PAGE_SHIFT, hole_end >> PAGE_SHIFT); - i_mmap_unlock_write(mapping); remove_inode_hugepages(inode, hole_start, hole_end); + i_mmap_unlock_write(mapping); inode_unlock(inode); } _ Patches currently in -mm which might be from mike.kravetz(a)oracle.com are hugetlbfs-use-i_mmap_rwsem-for-more-pmd-sharing-synchronization.patch hugetlbfs-use-i_mmap_rwsem-to-fix-page-fault-truncate-race.patch hugetlbfs-remove-unnecessary-code-after-i_mmap_rwsem-synchronization.patch

6 years, 10 months

1
0
0 0

+ hugetlbfs-use-i_mmap_rwsem-for-more-pmd-sharing-synchronization.patch added to -mm tree

by akpm＠linux-foundation.org

The patch titled Subject: hugetlbfs: use i_mmap_rwsem for more pmd sharing synchronization has been added to the -mm tree. Its filename is hugetlbfs-use-i_mmap_rwsem-for-more-pmd-sharing-synchronization.patch This patch should soon appear at http://ozlabs.org/~akpm/mmots/broken-out/hugetlbfs-use-i_mmap_rwsem-for-mor… and later at http://ozlabs.org/~akpm/mmotm/broken-out/hugetlbfs-use-i_mmap_rwsem-for-mor… Before you just go and hit "reply", please: a) Consider who else should be cc'ed b) Prefer to cc a suitable mailing list as well c) Ideally: find the original patch on the mailing list and do a reply-to-all to that, adding suitable additional cc's *** Remember to use Documentation/process/submit-checklist.rst when testing your code *** The -mm tree is included into linux-next and is updated there every 3-4 working days ------------------------------------------------------ From: Mike Kravetz <mike.kravetz(a)oracle.com> Subject: hugetlbfs: use i_mmap_rwsem for more pmd sharing synchronization Patch series "hugetlbfs: use i_mmap_rwsem for better synchronization". These patches are a follow up to the RFC, http://lkml.kernel.org/r/20181024045053.1467-1-mike.kravetz@oracle.com Comments made by Naoya were addressed. There are two primary issues addressed here: 1) For shared pmds, huge PE pointers returned by huge_pte_alloc can become invalid via a call to huge_pmd_unshare by another thread. 2) hugetlbfs page faults can race with truncation causing invalid global reserve counts and state. Both issues are addressed by expanding the use of i_mmap_rwsem. These issues have existed for a long time. They can be recreated with a test program that causes page fault/truncation races. For simple mappings, this results in a negative HugePages_Rsvd count. If racing with mappings that contain shared pmds, we can hit "BUG at fs/hugetlbfs/inode.c:444!" or Oops! as the result of an invalid memory reference. I broke up the larger RFC into separate patches addressing each issue. Hopefully, this is easier to understand/review. This patch (of 3): While looking at BUGs associated with invalid huge page map counts, it was discovered and observed that a huge pte pointer could become 'invalid' and point to another task's page table. Consider the following: A task takes a page fault on a shared hugetlbfs file and calls huge_pte_alloc to get a ptep. Suppose the returned ptep points to a shared pmd. Now, another task truncates the hugetlbfs file. As part of truncation, it unmaps everyone who has the file mapped. If the range being truncated is covered by a shared pmd, huge_pmd_unshare will be called. For all but the last user of the shared pmd, huge_pmd_unshare will clear the pud pointing to the pmd. If the task in the middle of the page fault is not the last user, the ptep returned by huge_pte_alloc now points to another task's page table or worse. This leads to bad things such as incorrect page map/reference counts or invalid memory references. To fix, expand the use of i_mmap_rwsem as follows: - i_mmap_rwsem is held in read mode whenever huge_pmd_share is called. huge_pmd_share is only called via huge_pte_alloc, so callers of huge_pte_alloc take i_mmap_rwsem before calling. In addition, callers of huge_pte_alloc continue to hold the semaphore until finished with the ptep. - i_mmap_rwsem is held in write mode whenever huge_pmd_unshare is called. Link: http://lkml.kernel.org/r/20181203200850.6460-2-mike.kravetz@oracle.com Fixes: 39dde65c9940 ("shared page table for hugetlb page") Signed-off-by: Mike Kravetz <mike.kravetz(a)oracle.com> Cc: Michal Hocko <mhocko(a)kernel.org> Cc: Hugh Dickins <hughd(a)google.com> Cc: Naoya Horiguchi <n-horiguchi(a)ah.jp.nec.com> Cc: "Aneesh Kumar K . V" <aneesh.kumar(a)linux.vnet.ibm.com> Cc: Andrea Arcangeli <aarcange(a)redhat.com> Cc: "Kirill A . Shutemov" <kirill.shutemov(a)linux.intel.com> Cc: Davidlohr Bueso <dave(a)stgolabs.net> Cc: Prakash Sangappa <prakash.sangappa(a)oracle.com> Cc: <stable(a)vger.kernel.org> Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org> --- mm/hugetlb.c | 70 ++++++++++++++++++++++++++++++++---------- mm/memory-failure.c | 14 +++++++- mm/migrate.c | 13 +++++++ mm/rmap.c | 3 + mm/userfaultfd.c | 11 +++++- 5 files changed, 91 insertions(+), 20 deletions(-) --- a/mm/hugetlb.c~hugetlbfs-use-i_mmap_rwsem-for-more-pmd-sharing-synchronization +++ a/mm/hugetlb.c @@ -3239,6 +3239,7 @@ int copy_hugetlb_page_range(struct mm_st int cow; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); + struct address_space *mapping = vma->vm_file->f_mapping; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ int ret = 0; @@ -3252,11 +3253,23 @@ int copy_hugetlb_page_range(struct mm_st for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { spinlock_t *src_ptl, *dst_ptl; + src_pte = huge_pte_offset(src, addr, sz); if (!src_pte) continue; + + /* + * i_mmap_rwsem must be held to call huge_pte_alloc. + * Continue to hold until finished with dst_pte, otherwise + * it could go away if part of a shared pmd. + * + * Technically, i_mmap_rwsem is only needed in the non-cow + * case as cow mappings are not shared. + */ + i_mmap_lock_read(mapping); dst_pte = huge_pte_alloc(dst, addr, sz); if (!dst_pte) { + i_mmap_unlock_read(mapping); ret = -ENOMEM; break; } @@ -3271,8 +3284,10 @@ int copy_hugetlb_page_range(struct mm_st * after taking the lock below. */ dst_entry = huge_ptep_get(dst_pte); - if ((dst_pte == src_pte) || !huge_pte_none(dst_entry)) + if ((dst_pte == src_pte) || !huge_pte_none(dst_entry)) { + i_mmap_unlock_read(mapping); continue; + } dst_ptl = huge_pte_lock(h, dst, dst_pte); src_ptl = huge_pte_lockptr(h, src, src_pte); @@ -3321,6 +3336,8 @@ int copy_hugetlb_page_range(struct mm_st } spin_unlock(src_ptl); spin_unlock(dst_ptl); + + i_mmap_unlock_read(mapping); } if (cow) @@ -3772,14 +3789,18 @@ retry: }; /* - * hugetlb_fault_mutex must be dropped before - * handling userfault. Reacquire after handling - * fault to make calling code simpler. + * hugetlb_fault_mutex and i_mmap_rwsem must be + * dropped before handling userfault. Reacquire + * after handling fault to make calling code simpler. */ hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr); mutex_unlock(&hugetlb_fault_mutex_table[hash]); + i_mmap_unlock_read(mapping); + ret = handle_userfault(&vmf, VM_UFFD_MISSING); + + i_mmap_lock_read(mapping); mutex_lock(&hugetlb_fault_mutex_table[hash]); goto out; } @@ -3927,6 +3948,11 @@ vm_fault_t hugetlb_fault(struct mm_struc ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); if (ptep) { + /* + * Since we hold no locks, ptep could be stale. That is + * OK as we are only making decisions based on content and + * not actually modifying content here. + */ entry = huge_ptep_get(ptep); if (unlikely(is_hugetlb_entry_migration(entry))) { migration_entry_wait_huge(vma, mm, ptep); @@ -3934,20 +3960,31 @@ vm_fault_t hugetlb_fault(struct mm_struc } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) return VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); - } else { - ptep = huge_pte_alloc(mm, haddr, huge_page_size(h)); - if (!ptep) - return VM_FAULT_OOM; } + /* + * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold + * until finished with ptep. This prevents huge_pmd_unshare from + * being called elsewhere and making the ptep no longer valid. + * + * ptep could have already be assigned via huge_pte_offset. That + * is OK, as huge_pte_alloc will return the same value unless + * something changed. + */ mapping = vma->vm_file->f_mapping; - idx = vma_hugecache_offset(h, vma, haddr); + i_mmap_lock_read(mapping); + ptep = huge_pte_alloc(mm, haddr, huge_page_size(h)); + if (!ptep) { + i_mmap_unlock_read(mapping); + return VM_FAULT_OOM; + } /* * Serialize hugepage allocation and instantiation, so that we don't * get spurious allocation failures if two CPUs race to instantiate * the same page in the page cache. */ + idx = vma_hugecache_offset(h, vma, haddr); hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr); mutex_lock(&hugetlb_fault_mutex_table[hash]); @@ -4035,6 +4072,7 @@ out_ptl: } out_mutex: mutex_unlock(&hugetlb_fault_mutex_table[hash]); + i_mmap_unlock_read(mapping); /* * Generally it's safe to hold refcount during waiting page lock. But * here we just wait to defer the next page fault to avoid busy loop and @@ -4639,10 +4677,12 @@ void adjust_range_if_pmd_sharing_possibl * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() * and returns the corresponding pte. While this is not necessary for the * !shared pmd case because we can allocate the pmd later as well, it makes the - * code much cleaner. pmd allocation is essential for the shared case because - * pud has to be populated inside the same i_mmap_rwsem section - otherwise - * racing tasks could either miss the sharing (see huge_pte_offset) or select a - * bad pmd for sharing. + * code much cleaner. + * + * This routine must be called with i_mmap_rwsem held in at least read mode. + * For hugetlbfs, this prevents removal of any page table entries associated + * with the address space. This is important as we are setting up sharing + * based on existing page table entries (mappings). */ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) { @@ -4659,7 +4699,6 @@ pte_t *huge_pmd_share(struct mm_struct * if (!vma_shareable(vma, addr)) return (pte_t *)pmd_alloc(mm, pud, addr); - i_mmap_lock_write(mapping); vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { if (svma == vma) continue; @@ -4689,7 +4728,6 @@ pte_t *huge_pmd_share(struct mm_struct * spin_unlock(ptl); out: pte = (pte_t *)pmd_alloc(mm, pud, addr); - i_mmap_unlock_write(mapping); return pte; } @@ -4700,7 +4738,7 @@ out: * indicated by page_count > 1, unmap is achieved by clearing pud and * decrementing the ref count. If count == 1, the pte page is not shared. * - * called with page table lock held. + * Called with page table lock held and i_mmap_rwsem held in write mode. * * returns: 1 successfully unmapped a shared pte page * 0 the underlying pte page is not shared, or it is the last user --- a/mm/memory-failure.c~hugetlbfs-use-i_mmap_rwsem-for-more-pmd-sharing-synchronization +++ a/mm/memory-failure.c @@ -1028,7 +1028,19 @@ static bool hwpoison_user_mappings(struc if (kill) collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED); - unmap_success = try_to_unmap(hpage, ttu); + if (!PageHuge(hpage)) { + unmap_success = try_to_unmap(hpage, ttu); + } else { + /* + * For hugetlb pages, try_to_unmap could potentially call + * huge_pmd_unshare. Because of this, take semaphore in + * write mode here and set TTU_RMAP_LOCKED to indicate we + * have taken the lock at this higer level. + */ + i_mmap_lock_write(mapping); + unmap_success = try_to_unmap(hpage, ttu|TTU_RMAP_LOCKED); + i_mmap_unlock_write(mapping); + } if (!unmap_success) pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n", pfn, page_mapcount(hpage)); --- a/mm/migrate.c~hugetlbfs-use-i_mmap_rwsem-for-more-pmd-sharing-synchronization +++ a/mm/migrate.c @@ -1297,8 +1297,19 @@ static int unmap_and_move_huge_page(new_ goto put_anon; if (page_mapped(hpage)) { + struct address_space *mapping = page_mapping(hpage); + + /* + * try_to_unmap could potentially call huge_pmd_unshare. + * Because of this, take semaphore in write mode here and + * set TTU_RMAP_LOCKED to let lower levels know we have + * taken the lock. + */ + i_mmap_lock_write(mapping); try_to_unmap(hpage, - TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); + TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS| + TTU_RMAP_LOCKED); + i_mmap_unlock_write(mapping); page_was_mapped = 1; } --- a/mm/rmap.c~hugetlbfs-use-i_mmap_rwsem-for-more-pmd-sharing-synchronization +++ a/mm/rmap.c @@ -1374,6 +1374,9 @@ static bool try_to_unmap_one(struct page /* * If sharing is possible, start and end will be adjusted * accordingly. + * + * If called for a huge page, caller must hold i_mmap_rwsem + * in write mode as it is possible to call huge_pmd_unshare. */ adjust_range_if_pmd_sharing_possible(vma, &start, &end); } --- a/mm/userfaultfd.c~hugetlbfs-use-i_mmap_rwsem-for-more-pmd-sharing-synchronization +++ a/mm/userfaultfd.c @@ -267,10 +267,14 @@ retry: VM_BUG_ON(dst_addr & ~huge_page_mask(h)); /* - * Serialize via hugetlb_fault_mutex + * Serialize via i_mmap_rwsem and hugetlb_fault_mutex. + * i_mmap_rwsem ensures the dst_pte remains valid even + * in the case of shared pmds. fault mutex prevents + * races with other faulting threads. */ - idx = linear_page_index(dst_vma, dst_addr); mapping = dst_vma->vm_file->f_mapping; + i_mmap_lock_read(mapping); + idx = linear_page_index(dst_vma, dst_addr); hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping, idx, dst_addr); mutex_lock(&hugetlb_fault_mutex_table[hash]); @@ -279,6 +283,7 @@ retry: dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h)); if (!dst_pte) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); + i_mmap_unlock_read(mapping); goto out_unlock; } @@ -286,6 +291,7 @@ retry: dst_pteval = huge_ptep_get(dst_pte); if (!huge_pte_none(dst_pteval)) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); + i_mmap_unlock_read(mapping); goto out_unlock; } @@ -293,6 +299,7 @@ retry: dst_addr, src_addr, &page); mutex_unlock(&hugetlb_fault_mutex_table[hash]); + i_mmap_unlock_read(mapping); vm_alloc_shared = vm_shared; cond_resched(); _ Patches currently in -mm which might be from mike.kravetz(a)oracle.com are hugetlbfs-use-i_mmap_rwsem-for-more-pmd-sharing-synchronization.patch hugetlbfs-use-i_mmap_rwsem-to-fix-page-fault-truncate-race.patch hugetlbfs-remove-unnecessary-code-after-i_mmap_rwsem-synchronization.patch

6 years, 10 months

1
0
0 0

[PATCH for-rc 0/2] Patches for 4.20 rc cycle

by Dennis Dalessandro

Hi Doug and Jason, Here are two patches that would be good to land in the RC. One fixes a bug found by KASAN that was marked stable. The other fixes something that was introduced in the last release cycle. Both patches are pretty small. --- Michael J. Ruhl (1): IB/hfi1: Fix a latency issue for small messages Piotr Stankiewicz (1): IB/hfi1: Fix an out-of-bounds access in get_hw_stats drivers/infiniband/hw/hfi1/chip.c | 3 ++- drivers/infiniband/hw/hfi1/hfi.h | 2 ++ drivers/infiniband/hw/hfi1/qp.c | 7 +++++++ drivers/infiniband/hw/hfi1/verbs.c | 2 +- 4 files changed, 12 insertions(+), 2 deletions(-) -- -Denny

6 years, 10 months

2
2
0 0

[PATCH 1/3] hugetlbfs: use i_mmap_rwsem for more pmd sharing synchronization

by Mike Kravetz

While looking at BUGs associated with invalid huge page map counts, it was discovered and observed that a huge pte pointer could become 'invalid' and point to another task's page table. Consider the following: A task takes a page fault on a shared hugetlbfs file and calls huge_pte_alloc to get a ptep. Suppose the returned ptep points to a shared pmd. Now, another task truncates the hugetlbfs file. As part of truncation, it unmaps everyone who has the file mapped. If the range being truncated is covered by a shared pmd, huge_pmd_unshare will be called. For all but the last user of the shared pmd, huge_pmd_unshare will clear the pud pointing to the pmd. If the task in the middle of the page fault is not the last user, the ptep returned by huge_pte_alloc now points to another task's page table or worse. This leads to bad things such as incorrect page map/reference counts or invalid memory references. To fix, expand the use of i_mmap_rwsem as follows: - i_mmap_rwsem is held in read mode whenever huge_pmd_share is called. huge_pmd_share is only called via huge_pte_alloc, so callers of huge_pte_alloc take i_mmap_rwsem before calling. In addition, callers of huge_pte_alloc continue to hold the semaphore until finished with the ptep. - i_mmap_rwsem is held in write mode whenever huge_pmd_unshare is called. Cc: <stable(a)vger.kernel.org> Fixes: 39dde65c9940 ("shared page table for hugetlb page") Signed-off-by: Mike Kravetz <mike.kravetz(a)oracle.com> --- mm/hugetlb.c | 70 ++++++++++++++++++++++++++++++++++----------- mm/memory-failure.c | 14 ++++++++- mm/migrate.c | 13 ++++++++- mm/rmap.c | 3 ++ mm/userfaultfd.c | 11 +++++-- 5 files changed, 91 insertions(+), 20 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1931a3d9b282..362601b69c56 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3239,6 +3239,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, int cow; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); + struct address_space *mapping = vma->vm_file->f_mapping; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ int ret = 0; @@ -3252,11 +3253,23 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { spinlock_t *src_ptl, *dst_ptl; + src_pte = huge_pte_offset(src, addr, sz); if (!src_pte) continue; + + /* + * i_mmap_rwsem must be held to call huge_pte_alloc. + * Continue to hold until finished with dst_pte, otherwise + * it could go away if part of a shared pmd. + * + * Technically, i_mmap_rwsem is only needed in the non-cow + * case as cow mappings are not shared. + */ + i_mmap_lock_read(mapping); dst_pte = huge_pte_alloc(dst, addr, sz); if (!dst_pte) { + i_mmap_unlock_read(mapping); ret = -ENOMEM; break; } @@ -3271,8 +3284,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, * after taking the lock below. */ dst_entry = huge_ptep_get(dst_pte); - if ((dst_pte == src_pte) || !huge_pte_none(dst_entry)) + if ((dst_pte == src_pte) || !huge_pte_none(dst_entry)) { + i_mmap_unlock_read(mapping); continue; + } dst_ptl = huge_pte_lock(h, dst, dst_pte); src_ptl = huge_pte_lockptr(h, src, src_pte); @@ -3321,6 +3336,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, } spin_unlock(src_ptl); spin_unlock(dst_ptl); + + i_mmap_unlock_read(mapping); } if (cow) @@ -3772,14 +3789,18 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, }; /* - * hugetlb_fault_mutex must be dropped before - * handling userfault. Reacquire after handling - * fault to make calling code simpler. + * hugetlb_fault_mutex and i_mmap_rwsem must be + * dropped before handling userfault. Reacquire + * after handling fault to make calling code simpler. */ hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr); mutex_unlock(&hugetlb_fault_mutex_table[hash]); + i_mmap_unlock_read(mapping); + ret = handle_userfault(&vmf, VM_UFFD_MISSING); + + i_mmap_lock_read(mapping); mutex_lock(&hugetlb_fault_mutex_table[hash]); goto out; } @@ -3927,6 +3948,11 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); if (ptep) { + /* + * Since we hold no locks, ptep could be stale. That is + * OK as we are only making decisions based on content and + * not actually modifying content here. + */ entry = huge_ptep_get(ptep); if (unlikely(is_hugetlb_entry_migration(entry))) { migration_entry_wait_huge(vma, mm, ptep); @@ -3934,20 +3960,31 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) return VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); - } else { - ptep = huge_pte_alloc(mm, haddr, huge_page_size(h)); - if (!ptep) - return VM_FAULT_OOM; } + /* + * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold + * until finished with ptep. This prevents huge_pmd_unshare from + * being called elsewhere and making the ptep no longer valid. + * + * ptep could have already be assigned via huge_pte_offset. That + * is OK, as huge_pte_alloc will return the same value unless + * something changed. + */ mapping = vma->vm_file->f_mapping; - idx = vma_hugecache_offset(h, vma, haddr); + i_mmap_lock_read(mapping); + ptep = huge_pte_alloc(mm, haddr, huge_page_size(h)); + if (!ptep) { + i_mmap_unlock_read(mapping); + return VM_FAULT_OOM; + } /* * Serialize hugepage allocation and instantiation, so that we don't * get spurious allocation failures if two CPUs race to instantiate * the same page in the page cache. */ + idx = vma_hugecache_offset(h, vma, haddr); hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr); mutex_lock(&hugetlb_fault_mutex_table[hash]); @@ -4035,6 +4072,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, } out_mutex: mutex_unlock(&hugetlb_fault_mutex_table[hash]); + i_mmap_unlock_read(mapping); /* * Generally it's safe to hold refcount during waiting page lock. But * here we just wait to defer the next page fault to avoid busy loop and @@ -4639,10 +4677,12 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() * and returns the corresponding pte. While this is not necessary for the * !shared pmd case because we can allocate the pmd later as well, it makes the - * code much cleaner. pmd allocation is essential for the shared case because - * pud has to be populated inside the same i_mmap_rwsem section - otherwise - * racing tasks could either miss the sharing (see huge_pte_offset) or select a - * bad pmd for sharing. + * code much cleaner. + * + * This routine must be called with i_mmap_rwsem held in at least read mode. + * For hugetlbfs, this prevents removal of any page table entries associated + * with the address space. This is important as we are setting up sharing + * based on existing page table entries (mappings). */ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) { @@ -4659,7 +4699,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) if (!vma_shareable(vma, addr)) return (pte_t *)pmd_alloc(mm, pud, addr); - i_mmap_lock_write(mapping); vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { if (svma == vma) continue; @@ -4689,7 +4728,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) spin_unlock(ptl); out: pte = (pte_t *)pmd_alloc(mm, pud, addr); - i_mmap_unlock_write(mapping); return pte; } @@ -4700,7 +4738,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) * indicated by page_count > 1, unmap is achieved by clearing pud and * decrementing the ref count. If count == 1, the pte page is not shared. * - * called with page table lock held. + * Called with page table lock held and i_mmap_rwsem held in write mode. * * returns: 1 successfully unmapped a shared pte page * 0 the underlying pte page is not shared, or it is the last user diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 0cd3de3550f0..b992d1295578 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1028,7 +1028,19 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, if (kill) collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED); - unmap_success = try_to_unmap(hpage, ttu); + if (!PageHuge(hpage)) { + unmap_success = try_to_unmap(hpage, ttu); + } else { + /* + * For hugetlb pages, try_to_unmap could potentially call + * huge_pmd_unshare. Because of this, take semaphore in + * write mode here and set TTU_RMAP_LOCKED to indicate we + * have taken the lock at this higer level. + */ + i_mmap_lock_write(mapping); + unmap_success = try_to_unmap(hpage, ttu|TTU_RMAP_LOCKED); + i_mmap_unlock_write(mapping); + } if (!unmap_success) pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n", pfn, page_mapcount(hpage)); diff --git a/mm/migrate.c b/mm/migrate.c index 84381b55b2bd..725edaef238a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1307,8 +1307,19 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, goto put_anon; if (page_mapped(hpage)) { + struct address_space *mapping = page_mapping(hpage); + + /* + * try_to_unmap could potentially call huge_pmd_unshare. + * Because of this, take semaphore in write mode here and + * set TTU_RMAP_LOCKED to let lower levels know we have + * taken the lock. + */ + i_mmap_lock_write(mapping); try_to_unmap(hpage, - TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); + TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS| + TTU_RMAP_LOCKED); + i_mmap_unlock_write(mapping); page_was_mapped = 1; } diff --git a/mm/rmap.c b/mm/rmap.c index 1e79fac3186b..aed241f69fcf 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1374,6 +1374,9 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, /* * If sharing is possible, start and end will be adjusted * accordingly. + * + * If called for a huge page, caller must hold i_mmap_rwsem + * in write mode as it is possible to call huge_pmd_unshare. */ adjust_range_if_pmd_sharing_possible(vma, &start, &end); } diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 5029f241908f..7cf4d8f7494b 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -244,10 +244,14 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, VM_BUG_ON(dst_addr & ~huge_page_mask(h)); /* - * Serialize via hugetlb_fault_mutex + * Serialize via i_mmap_rwsem and hugetlb_fault_mutex. + * i_mmap_rwsem ensures the dst_pte remains valid even + * in the case of shared pmds. fault mutex prevents + * races with other faulting threads. */ - idx = linear_page_index(dst_vma, dst_addr); mapping = dst_vma->vm_file->f_mapping; + i_mmap_lock_read(mapping); + idx = linear_page_index(dst_vma, dst_addr); hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping, idx, dst_addr); mutex_lock(&hugetlb_fault_mutex_table[hash]); @@ -256,6 +260,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h)); if (!dst_pte) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); + i_mmap_unlock_read(mapping); goto out_unlock; } @@ -263,6 +268,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, dst_pteval = huge_ptep_get(dst_pte); if (!huge_pte_none(dst_pteval)) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); + i_mmap_unlock_read(mapping); goto out_unlock; } @@ -270,6 +276,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, dst_addr, src_addr, &page); mutex_unlock(&hugetlb_fault_mutex_table[hash]); + i_mmap_unlock_read(mapping); vm_alloc_shared = vm_shared; cond_resched(); -- 2.17.2

6 years, 10 months

1
0
0 0

[git:media_tree/fixes] media: vb2: don't call __vb2_queue_cancel if vb2_start_streaming failed

by Mauro Carvalho Chehab

This is an automatic generated email to let you know that the following patch were queued: Subject: media: vb2: don't call __vb2_queue_cancel if vb2_start_streaming failed Author: Hans Verkuil <hverkuil-cisco(a)xs4all.nl> Date: Wed Nov 28 03:37:43 2018 -0500 vb2_start_streaming() already rolls back the buffers, so there is no need to call __vb2_queue_cancel(). Especially since __vb2_queue_cancel() does too much, such as zeroing the q->queued_count value, causing vb2 to think that no buffers have been queued. It appears that this call to __vb2_queue_cancel() is a left-over from before commit b3379c6201bb3. Fixes: b3379c6201bb3 ('vb2: only call start_streaming if sufficient buffers are queued') Signed-off-by: Hans Verkuil <hverkuil-cisco(a)xs4all.nl> Cc: <stable(a)vger.kernel.org> # for v4.16 and up Acked-by: Sakari Ailus <sakari.ailus(a)linux.intel.com> Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung(a)kernel.org> drivers/media/common/videobuf2/videobuf2-core.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) --- diff --git a/drivers/media/common/videobuf2/videobuf2-core.c b/drivers/media/common/videobuf2/videobuf2-core.c index 975ff5669f72..e006698807fa 100644 --- a/drivers/media/common/videobuf2/videobuf2-core.c +++ b/drivers/media/common/videobuf2/videobuf2-core.c @@ -1940,10 +1940,8 @@ int vb2_core_streamon(struct vb2_queue *q, unsigned int type) if (ret) return ret; ret = vb2_start_streaming(q); - if (ret) { - __vb2_queue_cancel(q); + if (ret) return ret; - } } q->streaming = 1;

6 years, 10 months

1
0
0 0

[git:media_tree/master] media: vb2: don't call __vb2_queue_cancel if vb2_start_streaming failed

by Mauro Carvalho Chehab

This is an automatic generated email to let you know that the following patch were queued: Subject: media: vb2: don't call __vb2_queue_cancel if vb2_start_streaming failed Author: Hans Verkuil <hverkuil-cisco(a)xs4all.nl> Date: Wed Nov 28 03:37:43 2018 -0500 vb2_start_streaming() already rolls back the buffers, so there is no need to call __vb2_queue_cancel(). Especially since __vb2_queue_cancel() does too much, such as zeroing the q->queued_count value, causing vb2 to think that no buffers have been queued. It appears that this call to __vb2_queue_cancel() is a left-over from before commit b3379c6201bb3. Fixes: b3379c6201bb3 ('vb2: only call start_streaming if sufficient buffers are queued') Signed-off-by: Hans Verkuil <hverkuil-cisco(a)xs4all.nl> Cc: <stable(a)vger.kernel.org> # for v4.16 and up Acked-by: Sakari Ailus <sakari.ailus(a)linux.intel.com> Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung(a)kernel.org> drivers/media/common/videobuf2/videobuf2-core.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) --- diff --git a/drivers/media/common/videobuf2/videobuf2-core.c b/drivers/media/common/videobuf2/videobuf2-core.c index 975ff5669f72..e006698807fa 100644 --- a/drivers/media/common/videobuf2/videobuf2-core.c +++ b/drivers/media/common/videobuf2/videobuf2-core.c @@ -1940,10 +1940,8 @@ int vb2_core_streamon(struct vb2_queue *q, unsigned int type) if (ret) return ret; ret = vb2_start_streaming(q); - if (ret) { - __vb2_queue_cancel(q); + if (ret) return ret; - } } q->streaming = 1;

6 years, 10 months

1
0
0 0

[merged] mm-khugepaged-collapse_shmem-do-not-crash-on-compound.patch removed from -mm tree

by akpm＠linux-foundation.org

The patch titled Subject: mm/khugepaged: collapse_shmem() do not crash on Compound has been removed from the -mm tree. Its filename was mm-khugepaged-collapse_shmem-do-not-crash-on-compound.patch This patch was dropped because it was merged into mainline or a subsystem tree ------------------------------------------------------ From: Hugh Dickins <hughd(a)google.com> Subject: mm/khugepaged: collapse_shmem() do not crash on Compound collapse_shmem()'s VM_BUG_ON_PAGE(PageTransCompound) was unsafe: before it holds page lock of the first page, racing truncation then extension might conceivably have inserted a hugepage there already. Fail with the SCAN_PAGE_COMPOUND result, instead of crashing (CONFIG_DEBUG_VM=y) or otherwise mishandling the unexpected hugepage - though later we might code up a more constructive way of handling it, with SCAN_SUCCESS. Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1811261529310.2275@eggly.anvils Fixes: f3f0e1d2150b2 ("khugepaged: add support of collapse for tmpfs/shmem pages") Signed-off-by: Hugh Dickins <hughd(a)google.com> Cc: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com> Cc: Jerome Glisse <jglisse(a)redhat.com> Cc: Konstantin Khlebnikov <khlebnikov(a)yandex-team.ru> Cc: Matthew Wilcox <willy(a)infradead.org> Cc: <stable(a)vger.kernel.org> [4.8+] Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org> --- mm/khugepaged.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) --- a/mm/khugepaged.c~mm-khugepaged-collapse_shmem-do-not-crash-on-compound +++ a/mm/khugepaged.c @@ -1399,7 +1399,15 @@ static void collapse_shmem(struct mm_str */ VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageUptodate(page), page); - VM_BUG_ON_PAGE(PageTransCompound(page), page); + + /* + * If file was truncated then extended, or hole-punched, before + * we locked the first page, then a THP might be there already. + */ + if (PageTransCompound(page)) { + result = SCAN_PAGE_COMPOUND; + goto out_unlock; + } if (page_mapping(page) != mapping) { result = SCAN_TRUNCATED; _ Patches currently in -mm which might be from hughd(a)google.com are mm-put_and_wait_on_page_locked-while-page-is-migrated.patch

6 years, 10 months

1
0
0 0

[merged] mm-khugepaged-collapse_shmem-without-freezing-new_page.patch removed from -mm tree

by akpm＠linux-foundation.org

The patch titled Subject: mm/khugepaged: collapse_shmem() without freezing new_page has been removed from the -mm tree. Its filename was mm-khugepaged-collapse_shmem-without-freezing-new_page.patch This patch was dropped because it was merged into mainline or a subsystem tree ------------------------------------------------------ From: Hugh Dickins <hughd(a)google.com> Subject: mm/khugepaged: collapse_shmem() without freezing new_page khugepaged's collapse_shmem() does almost all of its work, to assemble the huge new_page from 512 scattered old pages, with the new_page's refcount frozen to 0 (and refcounts of all old pages so far also frozen to 0). Including shmem_getpage() to read in any which were out on swap, memory reclaim if necessary to allocate their intermediate pages, and copying over all the data from old to new. Imagine the frozen refcount as a spinlock held, but without any lock debugging to highlight the abuse: it's not good, and under serious load heads into lockups - speculative getters of the page are not expecting to spin while khugepaged is rescheduled. One can get a little further under load by hacking around elsewhere; but fortunately, freezing the new_page turns out to have been entirely unnecessary, with no hacks needed elsewhere. The huge new_page lock is already held throughout, and guards all its subpages as they are brought one by one into the page cache tree; and anything reading the data in that page, without the lock, before it has been marked PageUptodate, would already be in the wrong. So simply eliminate the freezing of the new_page. Each of the old pages remains frozen with refcount 0 after it has been replaced by a new_page subpage in the page cache tree, until they are all unfrozen on success or failure: just as before. They could be unfrozen sooner, but cause no problem once no longer visible to find_get_entry(), filemap_map_pages() and other speculative lookups. Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1811261527570.2275@eggly.anvils Fixes: f3f0e1d2150b2 ("khugepaged: add support of collapse for tmpfs/shmem pages") Signed-off-by: Hugh Dickins <hughd(a)google.com> Acked-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com> Cc: Jerome Glisse <jglisse(a)redhat.com> Cc: Konstantin Khlebnikov <khlebnikov(a)yandex-team.ru> Cc: Matthew Wilcox <willy(a)infradead.org> Cc: <stable(a)vger.kernel.org> [4.8+] Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org> --- mm/khugepaged.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) --- a/mm/khugepaged.c~mm-khugepaged-collapse_shmem-without-freezing-new_page +++ a/mm/khugepaged.c @@ -1287,7 +1287,7 @@ static void retract_page_tables(struct a * collapse_shmem - collapse small tmpfs/shmem pages into huge one. * * Basic scheme is simple, details are more complex: - * - allocate and freeze a new huge page; + * - allocate and lock a new huge page; * - scan page cache replacing old pages with the new one * + swap in pages if necessary; * + fill in gaps; @@ -1295,11 +1295,11 @@ static void retract_page_tables(struct a * - if replacing succeeds: * + copy data over; * + free old pages; - * + unfreeze huge page; + * + unlock huge page; * - if replacing failed; * + put all pages back and unfreeze them; * + restore gaps in the page cache; - * + free huge page; + * + unlock and free huge page; */ static void collapse_shmem(struct mm_struct *mm, struct address_space *mapping, pgoff_t start, @@ -1333,13 +1333,11 @@ static void collapse_shmem(struct mm_str __SetPageSwapBacked(new_page); new_page->index = start; new_page->mapping = mapping; - BUG_ON(!page_ref_freeze(new_page, 1)); /* - * At this point the new_page is 'frozen' (page_count() is zero), - * locked and not up-to-date. It's safe to insert it into the page - * cache, because nobody would be able to map it or use it in other - * way until we unfreeze it. + * At this point the new_page is locked and not up-to-date. + * It's safe to insert it into the page cache, because nobody would + * be able to map it or use it in another way until we unlock it. */ /* This will be less messy when we use multi-index entries */ @@ -1491,9 +1489,8 @@ xa_unlocked: index++; } - /* Everything is ready, let's unfreeze the new_page */ SetPageUptodate(new_page); - page_ref_unfreeze(new_page, HPAGE_PMD_NR); + page_ref_add(new_page, HPAGE_PMD_NR - 1); set_page_dirty(new_page); mem_cgroup_commit_charge(new_page, memcg, false, true); lru_cache_add_anon(new_page); @@ -1541,8 +1538,6 @@ xa_unlocked: VM_BUG_ON(nr_none); xas_unlock_irq(&xas); - /* Unfreeze new_page, caller would take care about freeing it */ - page_ref_unfreeze(new_page, 1); mem_cgroup_cancel_charge(new_page, memcg, true); new_page->mapping = NULL; } _ Patches currently in -mm which might be from hughd(a)google.com are mm-put_and_wait_on_page_locked-while-page-is-migrated.patch

6 years, 10 months

1
0
0 0

[merged] mm-khugepaged-minor-reorderings-in-collapse_shmem.patch removed from -mm tree

by akpm＠linux-foundation.org

The patch titled Subject: mm/khugepaged: minor reorderings in collapse_shmem() has been removed from the -mm tree. Its filename was mm-khugepaged-minor-reorderings-in-collapse_shmem.patch This patch was dropped because it was merged into mainline or a subsystem tree ------------------------------------------------------ From: Hugh Dickins <hughd(a)google.com> Subject: mm/khugepaged: minor reorderings in collapse_shmem() Several cleanups in collapse_shmem(): most of which probably do not really matter, beyond doing things in a more familiar and reassuring order. Simplify the failure gotos in the main loop, and on success update stats while interrupts still disabled from the last iteration. Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1811261526400.2275@eggly.anvils Fixes: f3f0e1d2150b2 ("khugepaged: add support of collapse for tmpfs/shmem pages") Signed-off-by: Hugh Dickins <hughd(a)google.com> Acked-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com> Cc: Jerome Glisse <jglisse(a)redhat.com> Cc: Konstantin Khlebnikov <khlebnikov(a)yandex-team.ru> Cc: Matthew Wilcox <willy(a)infradead.org> Cc: <stable(a)vger.kernel.org> [4.8+] Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org> --- mm/khugepaged.c | 72 ++++++++++++++++++++-------------------------- 1 file changed, 32 insertions(+), 40 deletions(-) --- a/mm/khugepaged.c~mm-khugepaged-minor-reorderings-in-collapse_shmem +++ a/mm/khugepaged.c @@ -1329,10 +1329,10 @@ static void collapse_shmem(struct mm_str goto out; } + __SetPageLocked(new_page); + __SetPageSwapBacked(new_page); new_page->index = start; new_page->mapping = mapping; - __SetPageSwapBacked(new_page); - __SetPageLocked(new_page); BUG_ON(!page_ref_freeze(new_page, 1)); /* @@ -1366,13 +1366,13 @@ static void collapse_shmem(struct mm_str if (index == start) { if (!xas_next_entry(&xas, end - 1)) { result = SCAN_TRUNCATED; - break; + goto xa_locked; } xas_set(&xas, index); } if (!shmem_charge(mapping->host, 1)) { result = SCAN_FAIL; - break; + goto xa_locked; } xas_store(&xas, new_page + (index % HPAGE_PMD_NR)); nr_none++; @@ -1387,13 +1387,12 @@ static void collapse_shmem(struct mm_str result = SCAN_FAIL; goto xa_unlocked; } - xas_lock_irq(&xas); - xas_set(&xas, index); } else if (trylock_page(page)) { get_page(page); + xas_unlock_irq(&xas); } else { result = SCAN_PAGE_LOCK; - break; + goto xa_locked; } /* @@ -1408,11 +1407,10 @@ static void collapse_shmem(struct mm_str result = SCAN_TRUNCATED; goto out_unlock; } - xas_unlock_irq(&xas); if (isolate_lru_page(page)) { result = SCAN_DEL_PAGE_LRU; - goto out_isolate_failed; + goto out_unlock; } if (page_mapped(page)) @@ -1432,7 +1430,9 @@ static void collapse_shmem(struct mm_str */ if (!page_ref_freeze(page, 3)) { result = SCAN_PAGE_COUNT; - goto out_lru; + xas_unlock_irq(&xas); + putback_lru_page(page); + goto out_unlock; } /* @@ -1444,24 +1444,26 @@ static void collapse_shmem(struct mm_str /* Finally, replace with the new page. */ xas_store(&xas, new_page + (index % HPAGE_PMD_NR)); continue; -out_lru: - xas_unlock_irq(&xas); - putback_lru_page(page); -out_isolate_failed: - unlock_page(page); - put_page(page); - goto xa_unlocked; out_unlock: unlock_page(page); put_page(page); - break; + goto xa_unlocked; + } + + __inc_node_page_state(new_page, NR_SHMEM_THPS); + if (nr_none) { + struct zone *zone = page_zone(new_page); + + __mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none); + __mod_node_page_state(zone->zone_pgdat, NR_SHMEM, nr_none); } - xas_unlock_irq(&xas); +xa_locked: + xas_unlock_irq(&xas); xa_unlocked: + if (result == SCAN_SUCCEED) { struct page *page, *tmp; - struct zone *zone = page_zone(new_page); /* * Replacing old pages with new one has succeeded, now we @@ -1476,11 +1478,11 @@ xa_unlocked: copy_highpage(new_page + (page->index % HPAGE_PMD_NR), page); list_del(&page->lru); - unlock_page(page); - page_ref_unfreeze(page, 1); page->mapping = NULL; + page_ref_unfreeze(page, 1); ClearPageActive(page); ClearPageUnevictable(page); + unlock_page(page); put_page(page); index++; } @@ -1489,28 +1491,17 @@ xa_unlocked: index++; } - local_irq_disable(); - __inc_node_page_state(new_page, NR_SHMEM_THPS); - if (nr_none) { - __mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none); - __mod_node_page_state(zone->zone_pgdat, NR_SHMEM, nr_none); - } - local_irq_enable(); - - /* - * Remove pte page tables, so we can re-fault - * the page as huge. - */ - retract_page_tables(mapping, start); - /* Everything is ready, let's unfreeze the new_page */ - set_page_dirty(new_page); SetPageUptodate(new_page); page_ref_unfreeze(new_page, HPAGE_PMD_NR); + set_page_dirty(new_page); mem_cgroup_commit_charge(new_page, memcg, false, true); lru_cache_add_anon(new_page); - unlock_page(new_page); + /* + * Remove pte page tables, so we can re-fault the page as huge. + */ + retract_page_tables(mapping, start); *hpage = NULL; khugepaged_pages_collapsed++; @@ -1543,8 +1534,8 @@ xa_unlocked: xas_store(&xas, page); xas_pause(&xas); xas_unlock_irq(&xas); - putback_lru_page(page); unlock_page(page); + putback_lru_page(page); xas_lock_irq(&xas); } VM_BUG_ON(nr_none); @@ -1553,9 +1544,10 @@ xa_unlocked: /* Unfreeze new_page, caller would take care about freeing it */ page_ref_unfreeze(new_page, 1); mem_cgroup_cancel_charge(new_page, memcg, true); - unlock_page(new_page); new_page->mapping = NULL; } + + unlock_page(new_page); out: VM_BUG_ON(!list_empty(&pagelist)); /* TODO: tracepoints */ _ Patches currently in -mm which might be from hughd(a)google.com are mm-put_and_wait_on_page_locked-while-page-is-migrated.patch

6 years, 10 months

1
0
0 0

2025

2024

2023

2022

2021

2020

2019

2018

2017

Linux-stable-mirror