- Linux-stable-mirror - lists.linaro.org

[patch 27/31] mm/khugepaged: minor reorderings in collapse_shmem()

by akpm＠linux-foundation.org

From: Hugh Dickins <hughd(a)google.com> Subject: mm/khugepaged: minor reorderings in collapse_shmem() Several cleanups in collapse_shmem(): most of which probably do not really matter, beyond doing things in a more familiar and reassuring order. Simplify the failure gotos in the main loop, and on success update stats while interrupts still disabled from the last iteration. Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1811261526400.2275@eggly.anvils Fixes: f3f0e1d2150b2 ("khugepaged: add support of collapse for tmpfs/shmem pages") Signed-off-by: Hugh Dickins <hughd(a)google.com> Acked-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com> Cc: Jerome Glisse <jglisse(a)redhat.com> Cc: Konstantin Khlebnikov <khlebnikov(a)yandex-team.ru> Cc: Matthew Wilcox <willy(a)infradead.org> Cc: <stable(a)vger.kernel.org> [4.8+] Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org> --- mm/khugepaged.c | 72 ++++++++++++++++++++-------------------------- 1 file changed, 32 insertions(+), 40 deletions(-) --- a/mm/khugepaged.c~mm-khugepaged-minor-reorderings-in-collapse_shmem +++ a/mm/khugepaged.c @@ -1329,10 +1329,10 @@ static void collapse_shmem(struct mm_str goto out; } + __SetPageLocked(new_page); + __SetPageSwapBacked(new_page); new_page->index = start; new_page->mapping = mapping; - __SetPageSwapBacked(new_page); - __SetPageLocked(new_page); BUG_ON(!page_ref_freeze(new_page, 1)); /* @@ -1366,13 +1366,13 @@ static void collapse_shmem(struct mm_str if (index == start) { if (!xas_next_entry(&xas, end - 1)) { result = SCAN_TRUNCATED; - break; + goto xa_locked; } xas_set(&xas, index); } if (!shmem_charge(mapping->host, 1)) { result = SCAN_FAIL; - break; + goto xa_locked; } xas_store(&xas, new_page + (index % HPAGE_PMD_NR)); nr_none++; @@ -1387,13 +1387,12 @@ static void collapse_shmem(struct mm_str result = SCAN_FAIL; goto xa_unlocked; } - xas_lock_irq(&xas); - xas_set(&xas, index); } else if (trylock_page(page)) { get_page(page); + xas_unlock_irq(&xas); } else { result = SCAN_PAGE_LOCK; - break; + goto xa_locked; } /* @@ -1408,11 +1407,10 @@ static void collapse_shmem(struct mm_str result = SCAN_TRUNCATED; goto out_unlock; } - xas_unlock_irq(&xas); if (isolate_lru_page(page)) { result = SCAN_DEL_PAGE_LRU; - goto out_isolate_failed; + goto out_unlock; } if (page_mapped(page)) @@ -1432,7 +1430,9 @@ static void collapse_shmem(struct mm_str */ if (!page_ref_freeze(page, 3)) { result = SCAN_PAGE_COUNT; - goto out_lru; + xas_unlock_irq(&xas); + putback_lru_page(page); + goto out_unlock; } /* @@ -1444,24 +1444,26 @@ static void collapse_shmem(struct mm_str /* Finally, replace with the new page. */ xas_store(&xas, new_page + (index % HPAGE_PMD_NR)); continue; -out_lru: - xas_unlock_irq(&xas); - putback_lru_page(page); -out_isolate_failed: - unlock_page(page); - put_page(page); - goto xa_unlocked; out_unlock: unlock_page(page); put_page(page); - break; + goto xa_unlocked; + } + + __inc_node_page_state(new_page, NR_SHMEM_THPS); + if (nr_none) { + struct zone *zone = page_zone(new_page); + + __mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none); + __mod_node_page_state(zone->zone_pgdat, NR_SHMEM, nr_none); } - xas_unlock_irq(&xas); +xa_locked: + xas_unlock_irq(&xas); xa_unlocked: + if (result == SCAN_SUCCEED) { struct page *page, *tmp; - struct zone *zone = page_zone(new_page); /* * Replacing old pages with new one has succeeded, now we @@ -1476,11 +1478,11 @@ xa_unlocked: copy_highpage(new_page + (page->index % HPAGE_PMD_NR), page); list_del(&page->lru); - unlock_page(page); - page_ref_unfreeze(page, 1); page->mapping = NULL; + page_ref_unfreeze(page, 1); ClearPageActive(page); ClearPageUnevictable(page); + unlock_page(page); put_page(page); index++; } @@ -1489,28 +1491,17 @@ xa_unlocked: index++; } - local_irq_disable(); - __inc_node_page_state(new_page, NR_SHMEM_THPS); - if (nr_none) { - __mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none); - __mod_node_page_state(zone->zone_pgdat, NR_SHMEM, nr_none); - } - local_irq_enable(); - - /* - * Remove pte page tables, so we can re-fault - * the page as huge. - */ - retract_page_tables(mapping, start); - /* Everything is ready, let's unfreeze the new_page */ - set_page_dirty(new_page); SetPageUptodate(new_page); page_ref_unfreeze(new_page, HPAGE_PMD_NR); + set_page_dirty(new_page); mem_cgroup_commit_charge(new_page, memcg, false, true); lru_cache_add_anon(new_page); - unlock_page(new_page); + /* + * Remove pte page tables, so we can re-fault the page as huge. + */ + retract_page_tables(mapping, start); *hpage = NULL; khugepaged_pages_collapsed++; @@ -1543,8 +1534,8 @@ xa_unlocked: xas_store(&xas, page); xas_pause(&xas); xas_unlock_irq(&xas); - putback_lru_page(page); unlock_page(page); + putback_lru_page(page); xas_lock_irq(&xas); } VM_BUG_ON(nr_none); @@ -1553,9 +1544,10 @@ xa_unlocked: /* Unfreeze new_page, caller would take care about freeing it */ page_ref_unfreeze(new_page, 1); mem_cgroup_cancel_charge(new_page, memcg, true); - unlock_page(new_page); new_page->mapping = NULL; } + + unlock_page(new_page); out: VM_BUG_ON(!list_empty(&pagelist)); /* TODO: tracepoints */ _

6 years, 10 months

1
0
0 0

[patch 26/31] mm/khugepaged: collapse_shmem() remember to clear holes

by akpm＠linux-foundation.org

From: Hugh Dickins <hughd(a)google.com> Subject: mm/khugepaged: collapse_shmem() remember to clear holes Huge tmpfs testing reminds us that there is no __GFP_ZERO in the gfp flags khugepaged uses to allocate a huge page - in all common cases it would just be a waste of effort - so collapse_shmem() must remember to clear out any holes that it instantiates. The obvious place to do so, where they are put into the page cache tree, is not a good choice: because interrupts are disabled there. Leave it until further down, once success is assured, where the other pages are copied (before setting PageUptodate). Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1811261525080.2275@eggly.anvils Fixes: f3f0e1d2150b2 ("khugepaged: add support of collapse for tmpfs/shmem pages") Signed-off-by: Hugh Dickins <hughd(a)google.com> Acked-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com> Cc: Jerome Glisse <jglisse(a)redhat.com> Cc: Konstantin Khlebnikov <khlebnikov(a)yandex-team.ru> Cc: Matthew Wilcox <willy(a)infradead.org> Cc: <stable(a)vger.kernel.org> [4.8+] Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org> --- mm/khugepaged.c | 10 ++++++++++ 1 file changed, 10 insertions(+) --- a/mm/khugepaged.c~mm-khugepaged-collapse_shmem-remember-to-clear-holes +++ a/mm/khugepaged.c @@ -1467,7 +1467,12 @@ xa_unlocked: * Replacing old pages with new one has succeeded, now we * need to copy the content and free the old pages. */ + index = start; list_for_each_entry_safe(page, tmp, &pagelist, lru) { + while (index < page->index) { + clear_highpage(new_page + (index % HPAGE_PMD_NR)); + index++; + } copy_highpage(new_page + (page->index % HPAGE_PMD_NR), page); list_del(&page->lru); @@ -1477,6 +1482,11 @@ xa_unlocked: ClearPageActive(page); ClearPageUnevictable(page); put_page(page); + index++; + } + while (index < end) { + clear_highpage(new_page + (index % HPAGE_PMD_NR)); + index++; } local_irq_disable(); _

6 years, 10 months

1
0
0 0

[patch 25/31] mm/khugepaged: fix crashes due to misaccounted holes

by akpm＠linux-foundation.org

From: Hugh Dickins <hughd(a)google.com> Subject: mm/khugepaged: fix crashes due to misaccounted holes Huge tmpfs testing on a shortish file mapped into a pmd-rounded extent hit shmem_evict_inode()'s WARN_ON(inode->i_blocks) followed by clear_inode()'s BUG_ON(inode->i_data.nrpages) when the file was later closed and unlinked. khugepaged's collapse_shmem() was forgetting to update mapping->nrpages on the rollback path, after it had added but then needs to undo some holes. There is indeed an irritating asymmetry between shmem_charge(), whose callers want it to increment nrpages after successfully accounting blocks, and shmem_uncharge(), when __delete_from_page_cache() already decremented nrpages itself: oh well, just add a comment on that to them both. And shmem_recalc_inode() is supposed to be called when the accounting is expected to be in balance (so it can deduce from imbalance that reclaim discarded some pages): so change shmem_charge() to update nrpages earlier (though it's rare for the difference to matter at all). Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1811261523450.2275@eggly.anvils Fixes: 800d8c63b2e98 ("shmem: add huge pages support") Fixes: f3f0e1d2150b2 ("khugepaged: add support of collapse for tmpfs/shmem pages") Signed-off-by: Hugh Dickins <hughd(a)google.com> Acked-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com> Cc: Jerome Glisse <jglisse(a)redhat.com> Cc: Konstantin Khlebnikov <khlebnikov(a)yandex-team.ru> Cc: Matthew Wilcox <willy(a)infradead.org> Cc: <stable(a)vger.kernel.org> [4.8+] Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org> --- mm/khugepaged.c | 5 ++++- mm/shmem.c | 6 +++++- 2 files changed, 9 insertions(+), 2 deletions(-) --- a/mm/khugepaged.c~mm-khugepaged-fix-crashes-due-to-misaccounted-holes +++ a/mm/khugepaged.c @@ -1506,9 +1506,12 @@ xa_unlocked: khugepaged_pages_collapsed++; } else { struct page *page; + /* Something went wrong: roll back page cache changes */ - shmem_uncharge(mapping->host, nr_none); xas_lock_irq(&xas); + mapping->nrpages -= nr_none; + shmem_uncharge(mapping->host, nr_none); + xas_set(&xas, start); xas_for_each(&xas, page, end - 1) { page = list_first_entry_or_null(&pagelist, --- a/mm/shmem.c~mm-khugepaged-fix-crashes-due-to-misaccounted-holes +++ a/mm/shmem.c @@ -297,12 +297,14 @@ bool shmem_charge(struct inode *inode, l if (!shmem_inode_acct_block(inode, pages)) return false; + /* nrpages adjustment first, then shmem_recalc_inode() when balanced */ + inode->i_mapping->nrpages += pages; + spin_lock_irqsave(&info->lock, flags); info->alloced += pages; inode->i_blocks += pages * BLOCKS_PER_PAGE; shmem_recalc_inode(inode); spin_unlock_irqrestore(&info->lock, flags); - inode->i_mapping->nrpages += pages; return true; } @@ -312,6 +314,8 @@ void shmem_uncharge(struct inode *inode, struct shmem_inode_info *info = SHMEM_I(inode); unsigned long flags; + /* nrpages adjustment done by __delete_from_page_cache() or caller */ + spin_lock_irqsave(&info->lock, flags); info->alloced -= pages; inode->i_blocks -= pages * BLOCKS_PER_PAGE; _

6 years, 10 months

1
0
0 0

[patch 24/31] mm/khugepaged: collapse_shmem() stop if punched or truncated

by akpm＠linux-foundation.org

From: Hugh Dickins <hughd(a)google.com> Subject: mm/khugepaged: collapse_shmem() stop if punched or truncated Huge tmpfs testing showed that although collapse_shmem() recognizes a concurrently truncated or hole-punched page correctly, its handling of holes was liable to refill an emptied extent. Add check to stop that. Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1811261522040.2275@eggly.anvils Fixes: f3f0e1d2150b2 ("khugepaged: add support of collapse for tmpfs/shmem pages") Signed-off-by: Hugh Dickins <hughd(a)google.com> Reviewed-by: Matthew Wilcox <willy(a)infradead.org> Cc: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com> Cc: Jerome Glisse <jglisse(a)redhat.com> Cc: Konstantin Khlebnikov <khlebnikov(a)yandex-team.ru> Cc: <stable(a)vger.kernel.org> [4.8+] Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org> --- mm/khugepaged.c | 11 +++++++++++ 1 file changed, 11 insertions(+) --- a/mm/khugepaged.c~mm-khugepaged-collapse_shmem-stop-if-punched-or-truncated +++ a/mm/khugepaged.c @@ -1359,6 +1359,17 @@ static void collapse_shmem(struct mm_str VM_BUG_ON(index != xas.xa_index); if (!page) { + /* + * Stop if extent has been truncated or hole-punched, + * and is now completely empty. + */ + if (index == start) { + if (!xas_next_entry(&xas, end - 1)) { + result = SCAN_TRUNCATED; + break; + } + xas_set(&xas, index); + } if (!shmem_charge(mapping->host, 1)) { result = SCAN_FAIL; break; _

6 years, 10 months

1
0
0 0

[patch 23/31] mm/huge_memory: fix lockdep complaint on 32-bit i_size_read()

by akpm＠linux-foundation.org

From: Hugh Dickins <hughd(a)google.com> Subject: mm/huge_memory: fix lockdep complaint on 32-bit i_size_read() Huge tmpfs testing, on 32-bit kernel with lockdep enabled, showed that __split_huge_page() was using i_size_read() while holding the irq-safe lru_lock and page tree lock, but the 32-bit i_size_read() uses an irq-unsafe seqlock which should not be nested inside them. Instead, read the i_size earlier in split_huge_page_to_list(), and pass the end offset down to __split_huge_page(): all while holding head page lock, which is enough to prevent truncation of that extent before the page tree lock has been taken. Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1811261520070.2275@eggly.anvils Fixes: baa355fd33142 ("thp: file pages support for split_huge_page()") Signed-off-by: Hugh Dickins <hughd(a)google.com> Acked-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com> Cc: Jerome Glisse <jglisse(a)redhat.com> Cc: Konstantin Khlebnikov <khlebnikov(a)yandex-team.ru> Cc: Matthew Wilcox <willy(a)infradead.org> Cc: <stable(a)vger.kernel.org> [4.8+] Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org> --- mm/huge_memory.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) --- a/mm/huge_memory.c~mm-huge_memory-fix-lockdep-complaint-on-32-bit-i_size_read +++ a/mm/huge_memory.c @@ -2439,12 +2439,11 @@ static void __split_huge_page_tail(struc } static void __split_huge_page(struct page *page, struct list_head *list, - unsigned long flags) + pgoff_t end, unsigned long flags) { struct page *head = compound_head(page); struct zone *zone = page_zone(head); struct lruvec *lruvec; - pgoff_t end = -1; int i; lruvec = mem_cgroup_page_lruvec(head, zone->zone_pgdat); @@ -2452,9 +2451,6 @@ static void __split_huge_page(struct pag /* complete memcg works before add pages to LRU */ mem_cgroup_split_huge_fixup(head); - if (!PageAnon(page)) - end = DIV_ROUND_UP(i_size_read(head->mapping->host), PAGE_SIZE); - for (i = HPAGE_PMD_NR - 1; i >= 1; i--) { __split_huge_page_tail(head, i, lruvec, list); /* Some pages can be beyond i_size: drop them from page cache */ @@ -2626,6 +2622,7 @@ int split_huge_page_to_list(struct page int count, mapcount, extra_pins, ret; bool mlocked; unsigned long flags; + pgoff_t end; VM_BUG_ON_PAGE(is_huge_zero_page(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); @@ -2648,6 +2645,7 @@ int split_huge_page_to_list(struct page ret = -EBUSY; goto out; } + end = -1; mapping = NULL; anon_vma_lock_write(anon_vma); } else { @@ -2661,6 +2659,15 @@ int split_huge_page_to_list(struct page anon_vma = NULL; i_mmap_lock_read(mapping); + + /* + *__split_huge_page() may need to trim off pages beyond EOF: + * but on 32-bit, i_size_read() takes an irq-unsafe seqlock, + * which cannot be nested inside the page tree lock. So note + * end now: i_size itself may be changed at any moment, but + * head page lock is good enough to serialize the trimming. + */ + end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); } /* @@ -2707,7 +2714,7 @@ int split_huge_page_to_list(struct page if (mapping) __dec_node_page_state(page, NR_SHMEM_THPS); spin_unlock(&pgdata->split_queue_lock); - __split_huge_page(page, list, flags); + __split_huge_page(page, list, end, flags); if (PageSwapCache(head)) { swp_entry_t entry = { .val = page_private(head) }; _

6 years, 10 months

1
0
0 0

[patch 22/31] mm/huge_memory: splitting set mapping+index before unfreeze

by akpm＠linux-foundation.org

From: Hugh Dickins <hughd(a)google.com> Subject: mm/huge_memory: splitting set mapping+index before unfreeze Huge tmpfs stress testing has occasionally hit shmem_undo_range()'s VM_BUG_ON_PAGE(page_to_pgoff(page) != index, page). Move the setting of mapping and index up before the page_ref_unfreeze() in __split_huge_page_tail() to fix this: so that a page cache lookup cannot get a reference while the tail's mapping and index are unstable. In fact, might as well move them up before the smp_wmb(): I don't see an actual need for that, but if I'm missing something, this way round is safer than the other, and no less efficient. You might argue that VM_BUG_ON_PAGE(page_to_pgoff(page) != index, page) is misplaced, and should be left until after the trylock_page(); but left as is has not crashed since, and gives more stringent assurance. Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1811261516380.2275@eggly.anvils Fixes: e9b61f19858a5 ("thp: reintroduce split_huge_page()") Requires: 605ca5ede764 ("mm/huge_memory.c: reorder operations in __split_huge_page_tail()") Signed-off-by: Hugh Dickins <hughd(a)google.com> Acked-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com> Cc: Konstantin Khlebnikov <khlebnikov(a)yandex-team.ru> Cc: Jerome Glisse <jglisse(a)redhat.com> Cc: Matthew Wilcox <willy(a)infradead.org> Cc: <stable(a)vger.kernel.org> [4.8+] Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org> --- mm/huge_memory.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) --- a/mm/huge_memory.c~mm-huge_memory-splitting-set-mappingindex-before-unfreeze +++ a/mm/huge_memory.c @@ -2402,6 +2402,12 @@ static void __split_huge_page_tail(struc (1L << PG_unevictable) | (1L << PG_dirty))); + /* ->mapping in first tail page is compound_mapcount */ + VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, + page_tail); + page_tail->mapping = head->mapping; + page_tail->index = head->index + tail; + /* Page flags must be visible before we make the page non-compound. */ smp_wmb(); @@ -2422,12 +2428,6 @@ static void __split_huge_page_tail(struc if (page_is_idle(head)) set_page_idle(page_tail); - /* ->mapping in first tail page is compound_mapcount */ - VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, - page_tail); - page_tail->mapping = head->mapping; - - page_tail->index = head->index + tail; page_cpupid_xchg_last(page_tail, page_cpupid_last(head)); /* _

6 years, 10 months

1
0
0 0

[patch 21/31] mm/huge_memory: rename freeze_page() to unmap_page()

by akpm＠linux-foundation.org

From: Hugh Dickins <hughd(a)google.com> Subject: mm/huge_memory: rename freeze_page() to unmap_page() The term "freeze" is used in several ways in the kernel, and in mm it has the particular meaning of forcing page refcount temporarily to 0. freeze_page() is just too confusing a name for a function that unmaps a page: rename it unmap_page(), and rename unfreeze_page() remap_page(). Went to change the mention of freeze_page() added later in mm/rmap.c, but found it to be incorrect: ordinary page reclaim reaches there too; but the substance of the comment still seems correct, so edit it down. Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1811261514080.2275@eggly.anvils Fixes: e9b61f19858a5 ("thp: reintroduce split_huge_page()") Signed-off-by: Hugh Dickins <hughd(a)google.com> Acked-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com> Cc: Jerome Glisse <jglisse(a)redhat.com> Cc: Konstantin Khlebnikov <khlebnikov(a)yandex-team.ru> Cc: Matthew Wilcox <willy(a)infradead.org> Cc: <stable(a)vger.kernel.org> [4.8+] Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org> --- mm/huge_memory.c | 12 ++++++------ mm/rmap.c | 13 +++---------- 2 files changed, 9 insertions(+), 16 deletions(-) --- a/mm/huge_memory.c~mm-huge_memory-rename-freeze_page-to-unmap_page +++ a/mm/huge_memory.c @@ -2350,7 +2350,7 @@ void vma_adjust_trans_huge(struct vm_are } } -static void freeze_page(struct page *page) +static void unmap_page(struct page *page) { enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS | TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD; @@ -2365,7 +2365,7 @@ static void freeze_page(struct page *pag VM_BUG_ON_PAGE(!unmap_success, page); } -static void unfreeze_page(struct page *page) +static void remap_page(struct page *page) { int i; if (PageTransHuge(page)) { @@ -2483,7 +2483,7 @@ static void __split_huge_page(struct pag spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags); - unfreeze_page(head); + remap_page(head); for (i = 0; i < HPAGE_PMD_NR; i++) { struct page *subpage = head + i; @@ -2664,7 +2664,7 @@ int split_huge_page_to_list(struct page } /* - * Racy check if we can split the page, before freeze_page() will + * Racy check if we can split the page, before unmap_page() will * split PMDs */ if (!can_split_huge_page(head, &extra_pins)) { @@ -2673,7 +2673,7 @@ int split_huge_page_to_list(struct page } mlocked = PageMlocked(page); - freeze_page(head); + unmap_page(head); VM_BUG_ON_PAGE(compound_mapcount(head), head); /* Make sure the page is not on per-CPU pagevec as it takes pin */ @@ -2727,7 +2727,7 @@ int split_huge_page_to_list(struct page fail: if (mapping) xa_unlock(&mapping->i_pages); spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags); - unfreeze_page(head); + remap_page(head); ret = -EBUSY; } --- a/mm/rmap.c~mm-huge_memory-rename-freeze_page-to-unmap_page +++ a/mm/rmap.c @@ -1627,16 +1627,9 @@ static bool try_to_unmap_one(struct page address + PAGE_SIZE); } else { /* - * We should not need to notify here as we reach this - * case only from freeze_page() itself only call from - * split_huge_page_to_list() so everything below must - * be true: - * - page is not anonymous - * - page is locked - * - * So as it is a locked file back page thus it can not - * be remove from the page cache and replace by a new - * page before mmu_notifier_invalidate_range_end so no + * This is a locked file-backed page, thus it cannot + * be removed from the page cache and replaced by a new + * page before mmu_notifier_invalidate_range_end, so no * concurrent thread might update its page table to * point at new page while a device still is using this * page. _

6 years, 10 months

1
0
0 0

[patch 14/31] userfaultfd: shmem: add i_size checks

by akpm＠linux-foundation.org

From: Andrea Arcangeli <aarcange(a)redhat.com> Subject: userfaultfd: shmem: add i_size checks With MAP_SHARED: recheck the i_size after taking the PT lock, to serialize against truncate with the PT lock. Delete the page from the pagecache if the i_size_read check fails. With MAP_PRIVATE: check the i_size after the PT lock before mapping anonymous memory or zeropages into the MAP_PRIVATE shmem mapping. A mostly irrelevant cleanup: like we do the delete_from_page_cache() pagecache removal after dropping the PT lock, the PT lock is a spinlock so drop it before the sleepable page lock. Link: http://lkml.kernel.org/r/20181126173452.26955-5-aarcange@redhat.com Fixes: 4c27fe4c4c84 ("userfaultfd: shmem: add shmem_mcopy_atomic_pte for userfaultfd support") Signed-off-by: Andrea Arcangeli <aarcange(a)redhat.com> Reviewed-by: Mike Rapoport <rppt(a)linux.ibm.com> Reviewed-by: Hugh Dickins <hughd(a)google.com> Reported-by: Jann Horn <jannh(a)google.com> Cc: <stable(a)vger.kernel.org> Cc: "Dr. David Alan Gilbert" <dgilbert(a)redhat.com> Cc: Mike Kravetz <mike.kravetz(a)oracle.com> Cc: Peter Xu <peterx(a)redhat.com> Cc: stable(a)vger.kernel.org Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org> --- mm/shmem.c | 18 ++++++++++++++++-- mm/userfaultfd.c | 26 ++++++++++++++++++++++++-- 2 files changed, 40 insertions(+), 4 deletions(-) --- a/mm/shmem.c~userfaultfd-shmem-add-i_size-checks +++ a/mm/shmem.c @@ -2216,6 +2216,7 @@ static int shmem_mfill_atomic_pte(struct struct page *page; pte_t _dst_pte, *dst_pte; int ret; + pgoff_t offset, max_off; ret = -ENOMEM; if (!shmem_inode_acct_block(inode, 1)) @@ -2253,6 +2254,12 @@ static int shmem_mfill_atomic_pte(struct __SetPageSwapBacked(page); __SetPageUptodate(page); + ret = -EFAULT; + offset = linear_page_index(dst_vma, dst_addr); + max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + if (unlikely(offset >= max_off)) + goto out_release; + ret = mem_cgroup_try_charge_delay(page, dst_mm, gfp, &memcg, false); if (ret) goto out_release; @@ -2268,8 +2275,14 @@ static int shmem_mfill_atomic_pte(struct if (dst_vma->vm_flags & VM_WRITE) _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte)); - ret = -EEXIST; dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); + + ret = -EFAULT; + max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + if (unlikely(offset >= max_off)) + goto out_release_uncharge_unlock; + + ret = -EEXIST; if (!pte_none(*dst_pte)) goto out_release_uncharge_unlock; @@ -2287,13 +2300,14 @@ static int shmem_mfill_atomic_pte(struct /* No need to invalidate - it was non-present before */ update_mmu_cache(dst_vma, dst_addr, dst_pte); - unlock_page(page); pte_unmap_unlock(dst_pte, ptl); + unlock_page(page); ret = 0; out: return ret; out_release_uncharge_unlock: pte_unmap_unlock(dst_pte, ptl); + delete_from_page_cache(page); out_release_uncharge: mem_cgroup_cancel_charge(page, memcg, false); out_release: --- a/mm/userfaultfd.c~userfaultfd-shmem-add-i_size-checks +++ a/mm/userfaultfd.c @@ -33,6 +33,8 @@ static int mcopy_atomic_pte(struct mm_st void *page_kaddr; int ret; struct page *page; + pgoff_t offset, max_off; + struct inode *inode; if (!*pagep) { ret = -ENOMEM; @@ -73,8 +75,17 @@ static int mcopy_atomic_pte(struct mm_st if (dst_vma->vm_flags & VM_WRITE) _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte)); - ret = -EEXIST; dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); + if (dst_vma->vm_file) { + /* the shmem MAP_PRIVATE case requires checking the i_size */ + inode = dst_vma->vm_file->f_inode; + offset = linear_page_index(dst_vma, dst_addr); + max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + ret = -EFAULT; + if (unlikely(offset >= max_off)) + goto out_release_uncharge_unlock; + } + ret = -EEXIST; if (!pte_none(*dst_pte)) goto out_release_uncharge_unlock; @@ -108,11 +119,22 @@ static int mfill_zeropage_pte(struct mm_ pte_t _dst_pte, *dst_pte; spinlock_t *ptl; int ret; + pgoff_t offset, max_off; + struct inode *inode; _dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr), dst_vma->vm_page_prot)); - ret = -EEXIST; dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); + if (dst_vma->vm_file) { + /* the shmem MAP_PRIVATE case requires checking the i_size */ + inode = dst_vma->vm_file->f_inode; + offset = linear_page_index(dst_vma, dst_addr); + max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + ret = -EFAULT; + if (unlikely(offset >= max_off)) + goto out_unlock; + } + ret = -EEXIST; if (!pte_none(*dst_pte)) goto out_unlock; set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); _

6 years, 10 months

1
0
0 0

[patch 13/31] userfaultfd: shmem/hugetlbfs: only allow to register VM_MAYWRITE vmas

by akpm＠linux-foundation.org

From: Andrea Arcangeli <aarcange(a)redhat.com> Subject: userfaultfd: shmem/hugetlbfs: only allow to register VM_MAYWRITE vmas After the VMA to register the uffd onto is found, check that it has VM_MAYWRITE set before allowing registration. This way we inherit all common code checks before allowing to fill file holes in shmem and hugetlbfs with UFFDIO_COPY. The userfaultfd memory model is not applicable for readonly files unless it's a MAP_PRIVATE. Link: http://lkml.kernel.org/r/20181126173452.26955-4-aarcange@redhat.com Fixes: ff62a3421044 ("hugetlb: implement memfd sealing") Signed-off-by: Andrea Arcangeli <aarcange(a)redhat.com> Reviewed-by: Mike Rapoport <rppt(a)linux.ibm.com> Reviewed-by: Hugh Dickins <hughd(a)google.com> Reported-by: Jann Horn <jannh(a)google.com> Fixes: 4c27fe4c4c84 ("userfaultfd: shmem: add shmem_mcopy_atomic_pte for userfaultfd support") Cc: <stable(a)vger.kernel.org> Cc: "Dr. David Alan Gilbert" <dgilbert(a)redhat.com> Cc: Mike Kravetz <mike.kravetz(a)oracle.com> Cc: Peter Xu <peterx(a)redhat.com> Cc: stable(a)vger.kernel.org Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org> --- fs/userfaultfd.c | 15 +++++++++++++++ mm/userfaultfd.c | 15 ++++++--------- 2 files changed, 21 insertions(+), 9 deletions(-) --- a/fs/userfaultfd.c~userfaultfd-shmem-hugetlbfs-only-allow-to-register-vm_maywrite-vmas +++ a/fs/userfaultfd.c @@ -1361,6 +1361,19 @@ static int userfaultfd_register(struct u ret = -EINVAL; if (!vma_can_userfault(cur)) goto out_unlock; + + /* + * UFFDIO_COPY will fill file holes even without + * PROT_WRITE. This check enforces that if this is a + * MAP_SHARED, the process has write permission to the backing + * file. If VM_MAYWRITE is set it also enforces that on a + * MAP_SHARED vma: there is no F_WRITE_SEAL and no further + * F_WRITE_SEAL can be taken until the vma is destroyed. + */ + ret = -EPERM; + if (unlikely(!(cur->vm_flags & VM_MAYWRITE))) + goto out_unlock; + /* * If this vma contains ending address, and huge pages * check alignment. @@ -1406,6 +1419,7 @@ static int userfaultfd_register(struct u BUG_ON(!vma_can_userfault(vma)); BUG_ON(vma->vm_userfaultfd_ctx.ctx && vma->vm_userfaultfd_ctx.ctx != ctx); + WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); /* * Nothing to do: this vma is already registered into this @@ -1552,6 +1566,7 @@ static int userfaultfd_unregister(struct cond_resched(); BUG_ON(!vma_can_userfault(vma)); + WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); /* * Nothing to do: this vma is already registered into this --- a/mm/userfaultfd.c~userfaultfd-shmem-hugetlbfs-only-allow-to-register-vm_maywrite-vmas +++ a/mm/userfaultfd.c @@ -205,8 +205,9 @@ retry: if (!dst_vma || !is_vm_hugetlb_page(dst_vma)) goto out_unlock; /* - * Only allow __mcopy_atomic_hugetlb on userfaultfd - * registered ranges. + * Check the vma is registered in uffd, this is + * required to enforce the VM_MAYWRITE check done at + * uffd registration time. */ if (!dst_vma->vm_userfaultfd_ctx.ctx) goto out_unlock; @@ -459,13 +460,9 @@ retry: if (!dst_vma) goto out_unlock; /* - * Be strict and only allow __mcopy_atomic on userfaultfd - * registered ranges to prevent userland errors going - * unnoticed. As far as the VM consistency is concerned, it - * would be perfectly safe to remove this check, but there's - * no useful usage for __mcopy_atomic ouside of userfaultfd - * registered ranges. This is after all why these are ioctls - * belonging to the userfaultfd and not syscalls. + * Check the vma is registered in uffd, this is required to + * enforce the VM_MAYWRITE check done at uffd registration + * time. */ if (!dst_vma->vm_userfaultfd_ctx.ctx) goto out_unlock; _

6 years, 10 months

1
0
0 0

[patch 12/31] userfaultfd: shmem: allocate anonymous memory for MAP_PRIVATE shmem

by akpm＠linux-foundation.org

From: Andrea Arcangeli <aarcange(a)redhat.com> Subject: userfaultfd: shmem: allocate anonymous memory for MAP_PRIVATE shmem Userfaultfd did not create private memory when UFFDIO_COPY was invoked on a MAP_PRIVATE shmem mapping. Instead it wrote to the shmem file, even when that had not been opened for writing. Though, fortunately, that could only happen where there was a hole in the file. Fix the shmem-backed implementation of UFFDIO_COPY to create private memory for MAP_PRIVATE mappings. The hugetlbfs-backed implementation was already correct. This change is visible to userland, if userfaultfd has been used in unintended ways: so it introduces a small risk of incompatibility, but is necessary in order to respect file permissions. An app that uses UFFDIO_COPY for anything like postcopy live migration won't notice the difference, and in fact it'll run faster because there will be no copy-on-write and memory waste in the tmpfs pagecache anymore. Userfaults on MAP_PRIVATE shmem keep triggering only on file holes like before. The real zeropage can also be built on a MAP_PRIVATE shmem mapping through UFFDIO_ZEROPAGE and that's safe because the zeropage pte is never dirty, in turn even an mprotect upgrading the vma permission from PROT_READ to PROT_READ|PROT_WRITE won't make the zeropage pte writable. Link: http://lkml.kernel.org/r/20181126173452.26955-3-aarcange@redhat.com Fixes: 4c27fe4c4c84 ("userfaultfd: shmem: add shmem_mcopy_atomic_pte for userfaultfd support") Signed-off-by: Andrea Arcangeli <aarcange(a)redhat.com> Reported-by: Mike Rapoport <rppt(a)linux.ibm.com> Reviewed-by: Hugh Dickins <hughd(a)google.com> Cc: <stable(a)vger.kernel.org> Cc: "Dr. David Alan Gilbert" <dgilbert(a)redhat.com> Cc: Jann Horn <jannh(a)google.com> Cc: Mike Kravetz <mike.kravetz(a)oracle.com> Cc: Peter Xu <peterx(a)redhat.com> Cc: stable(a)vger.kernel.org Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org> --- mm/userfaultfd.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) --- a/mm/userfaultfd.c~userfaultfd-shmem-allocate-anonymous-memory-for-map_private-shmem +++ a/mm/userfaultfd.c @@ -380,7 +380,17 @@ static __always_inline ssize_t mfill_ato { ssize_t err; - if (vma_is_anonymous(dst_vma)) { + /* + * The normal page fault path for a shmem will invoke the + * fault, fill the hole in the file and COW it right away. The + * result generates plain anonymous memory. So when we are + * asked to fill an hole in a MAP_PRIVATE shmem mapping, we'll + * generate anonymous memory directly without actually filling + * the hole. For the MAP_PRIVATE case the robustness check + * only happens in the pagetable (to verify it's still none) + * and not in the radix tree. + */ + if (!(dst_vma->vm_flags & VM_SHARED)) { if (!zeropage) err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, src_addr, page); @@ -489,7 +499,8 @@ retry: * dst_vma. */ err = -ENOMEM; - if (vma_is_anonymous(dst_vma) && unlikely(anon_vma_prepare(dst_vma))) + if (!(dst_vma->vm_flags & VM_SHARED) && + unlikely(anon_vma_prepare(dst_vma))) goto out_unlock; while (src_addr < src_start + len) { _

6 years, 10 months

1
0
0 0

2025

2024

2023

2022

2021

2020

2019

2018

2017

Linux-stable-mirror