The patch titled
Subject: mm: page_vma_mapped_walk(): settle PageHuge on entry
has been added to the -mm tree. Its filename is
mm-page_vma_mapped_walk-settle-pagehuge-on-entry.patch
This patch should soon appear at
https://ozlabs.org/~akpm/mmots/broken-out/mm-page_vma_mapped_walk-settle-pa…
and later at
https://ozlabs.org/~akpm/mmotm/broken-out/mm-page_vma_mapped_walk-settle-pa…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Hugh Dickins <hughd(a)google.com>
Subject: mm: page_vma_mapped_walk(): settle PageHuge on entry
page_vma_mapped_walk() cleanup: get the hugetlbfs PageHuge case out of the
way at the start, so no need to worry about it later.
Link: https://lkml.kernel.org/r/e31a483c-6d73-a6bb-26c5-43c3b880a2@google.com
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Reviewed-by: Peter Xu <peterx(a)redhat.com>
Cc: Alistair Popple <apopple(a)nvidia.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov(a)linux.intel.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Ralph Campbell <rcampbell(a)nvidia.com>
Cc: Wang Yugui <wangyugui(a)e16-tech.com>
Cc: Will Deacon <will(a)kernel.org>
Cc: Yang Shi <shy828301(a)gmail.com>
Cc: Zi Yan <ziy(a)nvidia.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/page_vma_mapped.c | 12 ++++++++----
1 file changed, 8 insertions(+), 4 deletions(-)
--- a/mm/page_vma_mapped.c~mm-page_vma_mapped_walk-settle-pagehuge-on-entry
+++ a/mm/page_vma_mapped.c
@@ -153,10 +153,11 @@ bool page_vma_mapped_walk(struct page_vm
if (pvmw->pmd && !pvmw->pte)
return not_found(pvmw);
- if (pvmw->pte)
- goto next_pte;
-
if (unlikely(PageHuge(page))) {
+ /* The only possible mapping was handled on last iteration */
+ if (pvmw->pte)
+ return not_found(pvmw);
+
/* when pud is not present, pte will be NULL */
pvmw->pte = huge_pte_offset(mm, pvmw->address, page_size(page));
if (!pvmw->pte)
@@ -168,6 +169,9 @@ bool page_vma_mapped_walk(struct page_vm
return not_found(pvmw);
return true;
}
+
+ if (pvmw->pte)
+ goto next_pte;
restart:
pgd = pgd_offset(mm, pvmw->address);
if (!pgd_present(*pgd))
@@ -233,7 +237,7 @@ restart:
return true;
next_pte:
/* Seek to next pte only makes sense for THP */
- if (!PageTransHuge(page) || PageHuge(page))
+ if (!PageTransHuge(page))
return not_found(pvmw);
end = vma_address_end(page, pvmw->vma);
do {
_
Patches currently in -mm which might be from hughd(a)google.com are
mm-thp-fix-__split_huge_pmd_locked-on-shmem-migration-entry.patch
mm-thp-make-is_huge_zero_pmd-safe-and-quicker.patch
mm-thp-try_to_unmap-use-ttu_sync-for-safe-splitting.patch
mm-thp-fix-vma_address-if-virtual-address-below-file-offset.patch
mm-thp-unmap_mapping_page-to-fix-thp-truncate_cleanup_page.patch
mm-page_vma_mapped_walk-use-page-for-pvmw-page.patch
mm-page_vma_mapped_walk-settle-pagehuge-on-entry.patch
mm-page_vma_mapped_walk-use-pmd_read_atomic.patch
mm-page_vma_mapped_walk-use-pmde-for-pvmw-pmd.patch
mm-page_vma_mapped_walk-prettify-pvmw_migration-block.patch
mm-page_vma_mapped_walk-crossing-page-table-boundary.patch
mm-page_vma_mapped_walk-add-a-level-of-indentation.patch
mm-page_vma_mapped_walk-use-goto-instead-of-while-1.patch
mm-page_vma_mapped_walk-get-vma_address_end-earlier.patch
mm-thp-fix-page_vma_mapped_walk-if-thp-mapped-by-ptes.patch
mm-thp-another-pvmw_sync-fix-in-page_vma_mapped_walk.patch
mm-thp-remap_page-is-only-needed-on-anonymous-thp.patch
mm-hwpoison_user_mappings-try_to_unmap-with-ttu_sync.patch
The patch titled
Subject: mm: page_vma_mapped_walk(): use page for pvmw->page
has been added to the -mm tree. Its filename is
mm-page_vma_mapped_walk-use-page-for-pvmw-page.patch
This patch should soon appear at
https://ozlabs.org/~akpm/mmots/broken-out/mm-page_vma_mapped_walk-use-page-…
and later at
https://ozlabs.org/~akpm/mmotm/broken-out/mm-page_vma_mapped_walk-use-page-…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Hugh Dickins <hughd(a)google.com>
Subject: mm: page_vma_mapped_walk(): use page for pvmw->page
Patch series "mm: page_vma_mapped_walk() cleanup and THP fixes".
I've marked all of these for stable: many are merely cleanups,
but I think they are much better before the main fix than after.
This patch (of 11):
page_vma_mapped_walk() cleanup: sometimes the local copy of pvmw->page was
used, sometimes pvmw->page itself: use the local copy "page" throughout.
Link: https://lkml.kernel.org/r/589b358c-febc-c88e-d4c2-7834b37fa7bf@google.com
Link: https://lkml.kernel.org/r/88e67645-f467-c279-bf5e-af4b5c6b13eb@google.com
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Reviewed-by: Alistair Popple <apopple(a)nvidia.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Reviewed-by: Peter Xu <peterx(a)redhat.com>
Cc: Yang Shi <shy828301(a)gmail.com>
Cc: Wang Yugui <wangyugui(a)e16-tech.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Ralph Campbell <rcampbell(a)nvidia.com>
Cc: Zi Yan <ziy(a)nvidia.com>
Cc: Will Deacon <will(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/page_vma_mapped.c | 9 ++++-----
1 file changed, 4 insertions(+), 5 deletions(-)
--- a/mm/page_vma_mapped.c~mm-page_vma_mapped_walk-use-page-for-pvmw-page
+++ a/mm/page_vma_mapped.c
@@ -156,7 +156,7 @@ bool page_vma_mapped_walk(struct page_vm
if (pvmw->pte)
goto next_pte;
- if (unlikely(PageHuge(pvmw->page))) {
+ if (unlikely(PageHuge(page))) {
/* when pud is not present, pte will be NULL */
pvmw->pte = huge_pte_offset(mm, pvmw->address, page_size(page));
if (!pvmw->pte)
@@ -217,8 +217,7 @@ restart:
* cannot return prematurely, while zap_huge_pmd() has
* cleared *pmd but not decremented compound_mapcount().
*/
- if ((pvmw->flags & PVMW_SYNC) &&
- PageTransCompound(pvmw->page)) {
+ if ((pvmw->flags & PVMW_SYNC) && PageTransCompound(page)) {
spinlock_t *ptl = pmd_lock(mm, pvmw->pmd);
spin_unlock(ptl);
@@ -234,9 +233,9 @@ restart:
return true;
next_pte:
/* Seek to next pte only makes sense for THP */
- if (!PageTransHuge(pvmw->page) || PageHuge(pvmw->page))
+ if (!PageTransHuge(page) || PageHuge(page))
return not_found(pvmw);
- end = vma_address_end(pvmw->page, pvmw->vma);
+ end = vma_address_end(page, pvmw->vma);
do {
pvmw->address += PAGE_SIZE;
if (pvmw->address >= end)
_
Patches currently in -mm which might be from hughd(a)google.com are
mm-thp-fix-__split_huge_pmd_locked-on-shmem-migration-entry.patch
mm-thp-make-is_huge_zero_pmd-safe-and-quicker.patch
mm-thp-try_to_unmap-use-ttu_sync-for-safe-splitting.patch
mm-thp-fix-vma_address-if-virtual-address-below-file-offset.patch
mm-thp-unmap_mapping_page-to-fix-thp-truncate_cleanup_page.patch
mm-page_vma_mapped_walk-use-page-for-pvmw-page.patch
mm-page_vma_mapped_walk-settle-pagehuge-on-entry.patch
mm-page_vma_mapped_walk-use-pmd_read_atomic.patch
mm-page_vma_mapped_walk-use-pmde-for-pvmw-pmd.patch
mm-page_vma_mapped_walk-prettify-pvmw_migration-block.patch
mm-page_vma_mapped_walk-crossing-page-table-boundary.patch
mm-page_vma_mapped_walk-add-a-level-of-indentation.patch
mm-page_vma_mapped_walk-use-goto-instead-of-while-1.patch
mm-page_vma_mapped_walk-get-vma_address_end-earlier.patch
mm-thp-fix-page_vma_mapped_walk-if-thp-mapped-by-ptes.patch
mm-thp-another-pvmw_sync-fix-in-page_vma_mapped_walk.patch
mm-thp-remap_page-is-only-needed-on-anonymous-thp.patch
mm-hwpoison_user_mappings-try_to_unmap-with-ttu_sync.patch
The patch titled
Subject: mm: thp: replace DEBUG_VM BUG with VM_WARN when unmap fails for split
has been added to the -mm tree. Its filename is
mm-thp-replace-debug_vm-bug-with-vm_warn-when-unmap-fails-for-split.patch
This patch should soon appear at
https://ozlabs.org/~akpm/mmots/broken-out/mm-thp-replace-debug_vm-bug-with-…
and later at
https://ozlabs.org/~akpm/mmotm/broken-out/mm-thp-replace-debug_vm-bug-with-…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Yang Shi <shy828301(a)gmail.com>
Subject: mm: thp: replace DEBUG_VM BUG with VM_WARN when unmap fails for split
When debugging the bug reported by Wang Yugui [1], try_to_unmap() may
fail, but the first VM_BUG_ON_PAGE() only checks page_mapcount(), so it
may miss the failure when the head page is unmapped but another subpage
is still mapped; the second DEBUG_VM BUG(), which checks total mapcount,
would then catch it. This can be confusing, and it is not a fatal issue,
so consolidate the two DEBUG_VM checks into one VM_WARN_ON_ONCE_PAGE().
[1] https://lore.kernel.org/linux-mm/20210412180659.B9E3.409509F4@e16-tech.com/
Link: https://lkml.kernel.org/r/d0f0db68-98b8-ebfb-16dc-f29df24cf012@google.com
Signed-off-by: Yang Shi <shy828301(a)gmail.com>
Reviewed-by: Zi Yan <ziy(a)nvidia.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Cc: <stable(a)vger.kernel.org>
Cc: Alistair Popple <apopple(a)nvidia.com>
Cc: Jan Kara <jack(a)suse.cz>
Cc: Jue Wang <juew(a)google.com>
Cc: "Matthew Wilcox (Oracle)" <willy(a)infradead.org>
Cc: Miaohe Lin <linmiaohe(a)huawei.com>
Cc: Minchan Kim <minchan(a)kernel.org>
Cc: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Cc: Oscar Salvador <osalvador(a)suse.de>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: Ralph Campbell <rcampbell(a)nvidia.com>
Cc: Shakeel Butt <shakeelb(a)google.com>
Cc: Wang Yugui <wangyugui(a)e16-tech.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/huge_memory.c | 24 +++++++-----------------
1 file changed, 7 insertions(+), 17 deletions(-)
--- a/mm/huge_memory.c~mm-thp-replace-debug_vm-bug-with-vm_warn-when-unmap-fails-for-split
+++ a/mm/huge_memory.c
@@ -2352,15 +2352,15 @@ static void unmap_page(struct page *page
{
enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_SYNC |
TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
- bool unmap_success;
VM_BUG_ON_PAGE(!PageHead(page), page);
if (PageAnon(page))
ttu_flags |= TTU_SPLIT_FREEZE;
- unmap_success = try_to_unmap(page, ttu_flags);
- VM_BUG_ON_PAGE(!unmap_success, page);
+ try_to_unmap(page, ttu_flags);
+
+ VM_WARN_ON_ONCE_PAGE(page_mapped(page), page);
}
static void remap_page(struct page *page, unsigned int nr)
@@ -2671,7 +2671,7 @@ int split_huge_page_to_list(struct page
struct deferred_split *ds_queue = get_deferred_split_queue(head);
struct anon_vma *anon_vma = NULL;
struct address_space *mapping = NULL;
- int count, mapcount, extra_pins, ret;
+ int extra_pins, ret;
pgoff_t end;
VM_BUG_ON_PAGE(is_huge_zero_page(head), head);
@@ -2730,7 +2730,6 @@ int split_huge_page_to_list(struct page
}
unmap_page(head);
- VM_BUG_ON_PAGE(compound_mapcount(head), head);
/* block interrupt reentry in xa_lock and spinlock */
local_irq_disable();
@@ -2748,9 +2747,7 @@ int split_huge_page_to_list(struct page
/* Prevent deferred_split_scan() touching ->_refcount */
spin_lock(&ds_queue->split_queue_lock);
- count = page_count(head);
- mapcount = total_mapcount(head);
- if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) {
+ if (page_ref_freeze(head, 1 + extra_pins)) {
if (!list_empty(page_deferred_list(head))) {
ds_queue->split_queue_len--;
list_del(page_deferred_list(head));
@@ -2770,16 +2767,9 @@ int split_huge_page_to_list(struct page
__split_huge_page(page, list, end);
ret = 0;
} else {
- if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
- pr_alert("total_mapcount: %u, page_count(): %u\n",
- mapcount, count);
- if (PageTail(page))
- dump_page(head, NULL);
- dump_page(page, "total_mapcount(head) > 0");
- BUG();
- }
spin_unlock(&ds_queue->split_queue_lock);
-fail: if (mapping)
+fail:
+ if (mapping)
xa_unlock(&mapping->i_pages);
local_irq_enable();
remap_page(head, thp_nr_pages(head));
_
Patches currently in -mm which might be from shy828301(a)gmail.com are
mm-thp-replace-debug_vm-bug-with-vm_warn-when-unmap-fails-for-split.patch
mm-mempolicy-dont-have-to-split-pmd-for-huge-zero-page.patch
mm-memory-add-orig_pmd-to-struct-vm_fault.patch
mm-memory-make-numa_migrate_prep-non-static.patch
mm-thp-refactor-numa-fault-handling.patch
mm-migrate-account-thp-numa-migration-counters-correctly.patch
mm-migrate-dont-split-thp-for-misplaced-numa-page.patch
mm-migrate-check-mapcount-for-thp-instead-of-refcount.patch
mm-thp-skip-make-pmd-prot_none-if-thp-migration-is-not-supported.patch
mm-rmap-make-try_to_unmap-void-function.patch
The patch titled
Subject: mm/thp: unmap_mapping_page() to fix THP truncate_cleanup_page()
has been added to the -mm tree. Its filename is
mm-thp-unmap_mapping_page-to-fix-thp-truncate_cleanup_page.patch
This patch should soon appear at
https://ozlabs.org/~akpm/mmots/broken-out/mm-thp-unmap_mapping_page-to-fix-…
and later at
https://ozlabs.org/~akpm/mmotm/broken-out/mm-thp-unmap_mapping_page-to-fix-…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Hugh Dickins <hughd(a)google.com>
Subject: mm/thp: unmap_mapping_page() to fix THP truncate_cleanup_page()
There is a race between THP unmapping and truncation, when truncate sees
pmd_none() and skips the entry, after munmap's zap_huge_pmd() cleared it,
but before its page_remove_rmap() gets to decrement compound_mapcount:
generating false "BUG: Bad page cache" reports that the page is still
mapped when deleted. This commit fixes that, but not in the way I hoped.
The first attempt used try_to_unmap(page, TTU_SYNC|TTU_IGNORE_MLOCK)
instead of unmap_mapping_range() in truncate_cleanup_page(): it has often
been an annoyance that we usually call unmap_mapping_range() with no pages
locked, but there we apply it to a single locked page. try_to_unmap() looks
more suitable for a single locked page.
However, try_to_unmap_one() contains a VM_BUG_ON_PAGE(!pvmw.pte,page): it
is used to insert THP migration entries, but not used to unmap THPs. Copy
zap_huge_pmd() and add THP handling now? Perhaps, but their TLB needs are
different, I'm too ignorant of the DAX cases, and couldn't decide how far
to go for anon+swap. Set that aside.
The second attempt took a different tack: make no change in truncate.c,
but modify zap_huge_pmd() to insert an invalidated huge pmd instead of
clearing it initially, then pmd_clear() between page_remove_rmap() and
unlocking at the end. Nice. But powerpc blows that approach out of the
water, with its serialize_against_pte_lookup(), and interesting pgtable
usage. It would need serious help to get working on powerpc (with a minor
optimization issue on s390 too). Set that aside.
Just add an "if (page_mapped(page)) synchronize_rcu();" or other such
delay, after unmapping in truncate_cleanup_page()? Perhaps, but though
that's likely to reduce or eliminate the number of incidents, it would
give less assurance of whether we had identified the problem correctly.
This successful iteration introduces "unmap_mapping_page(page)" instead of
try_to_unmap(), and goes the usual unmap_mapping_range_tree() route, with
an addition to details. Then zap_pmd_range() watches for this case, and
does spin_unlock(pmd_lock) if so - just like page_vma_mapped_walk() now
does in the PVMW_SYNC case. Not pretty, but safe.
Note that unmap_mapping_page() is doing a VM_BUG_ON(!PageLocked) to assert
its interface; but currently that's only used to make sure that
page->mapping is stable, and zap_pmd_range() doesn't care if the page is
locked or not. Along these lines, in invalidate_inode_pages2_range() move
the initial unmap_mapping_range() out from under page lock, before then
calling unmap_mapping_page() under page lock if still mapped.
Link: https://lkml.kernel.org/r/a2a4a148-cdd8-942c-4ef8-51b77f643dbe@google.com
Fixes: fc127da085c2 ("truncate: handle file thp")
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Reviewed-by: Yang Shi <shy828301(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Cc: Alistair Popple <apopple(a)nvidia.com>
Cc: Jan Kara <jack(a)suse.cz>
Cc: Jue Wang <juew(a)google.com>
Cc: "Matthew Wilcox (Oracle)" <willy(a)infradead.org>
Cc: Miaohe Lin <linmiaohe(a)huawei.com>
Cc: Minchan Kim <minchan(a)kernel.org>
Cc: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Cc: Oscar Salvador <osalvador(a)suse.de>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: Ralph Campbell <rcampbell(a)nvidia.com>
Cc: Shakeel Butt <shakeelb(a)google.com>
Cc: Wang Yugui <wangyugui(a)e16-tech.com>
Cc: Zi Yan <ziy(a)nvidia.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/mm.h | 3 +++
mm/memory.c | 41 +++++++++++++++++++++++++++++++++++++++++
mm/truncate.c | 43 +++++++++++++++++++------------------------
3 files changed, 63 insertions(+), 24 deletions(-)
--- a/include/linux/mm.h~mm-thp-unmap_mapping_page-to-fix-thp-truncate_cleanup_page
+++ a/include/linux/mm.h
@@ -1719,6 +1719,7 @@ struct zap_details {
struct address_space *check_mapping; /* Check page->mapping if set */
pgoff_t first_index; /* Lowest page->index to unmap */
pgoff_t last_index; /* Highest page->index to unmap */
+ struct page *single_page; /* Locked page to be unmapped */
};
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
@@ -1766,6 +1767,7 @@ extern vm_fault_t handle_mm_fault(struct
extern int fixup_user_fault(struct mm_struct *mm,
unsigned long address, unsigned int fault_flags,
bool *unlocked);
+void unmap_mapping_page(struct page *page);
void unmap_mapping_pages(struct address_space *mapping,
pgoff_t start, pgoff_t nr, bool even_cows);
void unmap_mapping_range(struct address_space *mapping,
@@ -1786,6 +1788,7 @@ static inline int fixup_user_fault(struc
BUG();
return -EFAULT;
}
+static inline void unmap_mapping_page(struct page *page) { }
static inline void unmap_mapping_pages(struct address_space *mapping,
pgoff_t start, pgoff_t nr, bool even_cows) { }
static inline void unmap_mapping_range(struct address_space *mapping,
--- a/mm/memory.c~mm-thp-unmap_mapping_page-to-fix-thp-truncate_cleanup_page
+++ a/mm/memory.c
@@ -1361,7 +1361,18 @@ static inline unsigned long zap_pmd_rang
else if (zap_huge_pmd(tlb, vma, pmd, addr))
goto next;
/* fall through */
+ } else if (details && details->single_page &&
+ PageTransCompound(details->single_page) &&
+ next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) {
+ spinlock_t *ptl = pmd_lock(tlb->mm, pmd);
+ /*
+ * Take and drop THP pmd lock so that we cannot return
+ * prematurely, while zap_huge_pmd() has cleared *pmd,
+ * but not yet decremented compound_mapcount().
+ */
+ spin_unlock(ptl);
}
+
/*
* Here there can be other concurrent MADV_DONTNEED or
* trans huge page faults running, and if the pmd is
@@ -3237,6 +3248,36 @@ static inline void unmap_mapping_range_t
}
/**
+ * unmap_mapping_page() - Unmap single page from processes.
+ * @page: The locked page to be unmapped.
+ *
+ * Unmap this page from any userspace process which still has it mmaped.
+ * Typically, for efficiency, the range of nearby pages has already been
+ * unmapped by unmap_mapping_pages() or unmap_mapping_range(). But once
+ * truncation or invalidation holds the lock on a page, it may find that
+ * the page has been remapped again: and then uses unmap_mapping_page()
+ * to unmap it finally.
+ */
+void unmap_mapping_page(struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+ struct zap_details details = { };
+
+ VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON(PageTail(page));
+
+ details.check_mapping = mapping;
+ details.first_index = page->index;
+ details.last_index = page->index + thp_nr_pages(page) - 1;
+ details.single_page = page;
+
+ i_mmap_lock_write(mapping);
+ if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
+ unmap_mapping_range_tree(&mapping->i_mmap, &details);
+ i_mmap_unlock_write(mapping);
+}
+
+/**
* unmap_mapping_pages() - Unmap pages from processes.
* @mapping: The address space containing pages to be unmapped.
* @start: Index of first page to be unmapped.
--- a/mm/truncate.c~mm-thp-unmap_mapping_page-to-fix-thp-truncate_cleanup_page
+++ a/mm/truncate.c
@@ -167,13 +167,10 @@ void do_invalidatepage(struct page *page
* its lock, b) when a concurrent invalidate_mapping_pages got there first and
* c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
*/
-static void
-truncate_cleanup_page(struct address_space *mapping, struct page *page)
+static void truncate_cleanup_page(struct page *page)
{
- if (page_mapped(page)) {
- unsigned int nr = thp_nr_pages(page);
- unmap_mapping_pages(mapping, page->index, nr, false);
- }
+ if (page_mapped(page))
+ unmap_mapping_page(page);
if (page_has_private(page))
do_invalidatepage(page, 0, thp_size(page));
@@ -218,7 +215,7 @@ int truncate_inode_page(struct address_s
if (page->mapping != mapping)
return -EIO;
- truncate_cleanup_page(mapping, page);
+ truncate_cleanup_page(page);
delete_from_page_cache(page);
return 0;
}
@@ -325,7 +322,7 @@ void truncate_inode_pages_range(struct a
index = indices[pagevec_count(&pvec) - 1] + 1;
truncate_exceptional_pvec_entries(mapping, &pvec, indices);
for (i = 0; i < pagevec_count(&pvec); i++)
- truncate_cleanup_page(mapping, pvec.pages[i]);
+ truncate_cleanup_page(pvec.pages[i]);
delete_from_page_cache_batch(mapping, &pvec);
for (i = 0; i < pagevec_count(&pvec); i++)
unlock_page(pvec.pages[i]);
@@ -639,6 +636,16 @@ int invalidate_inode_pages2_range(struct
continue;
}
+ if (!did_range_unmap && page_mapped(page)) {
+ /*
+ * If page is mapped, before taking its lock,
+ * zap the rest of the file in one hit.
+ */
+ unmap_mapping_pages(mapping, index,
+ (1 + end - index), false);
+ did_range_unmap = 1;
+ }
+
lock_page(page);
WARN_ON(page_to_index(page) != index);
if (page->mapping != mapping) {
@@ -646,23 +653,11 @@ int invalidate_inode_pages2_range(struct
continue;
}
wait_on_page_writeback(page);
- if (page_mapped(page)) {
- if (!did_range_unmap) {
- /*
- * Zap the rest of the file in one hit.
- */
- unmap_mapping_pages(mapping, index,
- (1 + end - index), false);
- did_range_unmap = 1;
- } else {
- /*
- * Just zap this page
- */
- unmap_mapping_pages(mapping, index,
- 1, false);
- }
- }
+
+ if (page_mapped(page))
+ unmap_mapping_page(page);
BUG_ON(page_mapped(page));
+
ret2 = do_launder_page(mapping, page);
if (ret2 == 0) {
if (!invalidate_complete_page2(mapping, page))
_
Patches currently in -mm which might be from hughd(a)google.com are
mm-thp-fix-__split_huge_pmd_locked-on-shmem-migration-entry.patch
mm-thp-make-is_huge_zero_pmd-safe-and-quicker.patch
mm-thp-try_to_unmap-use-ttu_sync-for-safe-splitting.patch
mm-thp-fix-vma_address-if-virtual-address-below-file-offset.patch
mm-thp-unmap_mapping_page-to-fix-thp-truncate_cleanup_page.patch
mm-page_vma_mapped_walk-use-page-for-pvmw-page.patch
mm-page_vma_mapped_walk-settle-pagehuge-on-entry.patch
mm-page_vma_mapped_walk-use-pmd_read_atomic.patch
mm-page_vma_mapped_walk-use-pmde-for-pvmw-pmd.patch
mm-page_vma_mapped_walk-prettify-pvmw_migration-block.patch
mm-page_vma_mapped_walk-crossing-page-table-boundary.patch
mm-page_vma_mapped_walk-add-a-level-of-indentation.patch
mm-page_vma_mapped_walk-use-goto-instead-of-while-1.patch
mm-page_vma_mapped_walk-get-vma_address_end-earlier.patch
mm-thp-fix-page_vma_mapped_walk-if-thp-mapped-by-ptes.patch
mm-thp-another-pvmw_sync-fix-in-page_vma_mapped_walk.patch
mm-thp-remap_page-is-only-needed-on-anonymous-thp.patch
mm-hwpoison_user_mappings-try_to_unmap-with-ttu_sync.patch
The patch titled
Subject: mm/thp: fix page_address_in_vma() on file THP tails
has been added to the -mm tree. Its filename is
mm-thp-fix-page_address_in_vma-on-file-thp-tails.patch
This patch should soon appear at
https://ozlabs.org/~akpm/mmots/broken-out/mm-thp-fix-page_address_in_vma-on…
and later at
https://ozlabs.org/~akpm/mmotm/broken-out/mm-thp-fix-page_address_in_vma-on…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Jue Wang <juew(a)google.com>
Subject: mm/thp: fix page_address_in_vma() on file THP tails
Anon THP tails were already supported, but memory-failure may need to use
page_address_in_vma() on file THP tails, which its page->mapping check did
not permit: fix it.
hughd adds: no current usage is known to hit the issue, but this does fix
a subtle trap in a general helper: best fixed in stable sooner than later.
Link: https://lkml.kernel.org/r/a0d9b53-bf5d-8bab-ac5-759dc61819c1@google.com
Fixes: 800d8c63b2e9 ("shmem: add huge pages support")
Signed-off-by: Jue Wang <juew(a)google.com>
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Reviewed-by: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Reviewed-by: Yang Shi <shy828301(a)gmail.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Cc: <stable(a)vger.kernel.org>
Cc: Alistair Popple <apopple(a)nvidia.com>
Cc: Jan Kara <jack(a)suse.cz>
Cc: Miaohe Lin <linmiaohe(a)huawei.com>
Cc: Minchan Kim <minchan(a)kernel.org>
Cc: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Cc: Oscar Salvador <osalvador(a)suse.de>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: Ralph Campbell <rcampbell(a)nvidia.com>
Cc: Shakeel Butt <shakeelb(a)google.com>
Cc: Wang Yugui <wangyugui(a)e16-tech.com>
Cc: Zi Yan <ziy(a)nvidia.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/rmap.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
--- a/mm/rmap.c~mm-thp-fix-page_address_in_vma-on-file-thp-tails
+++ a/mm/rmap.c
@@ -716,11 +716,11 @@ unsigned long page_address_in_vma(struct
if (!vma->anon_vma || !page__anon_vma ||
vma->anon_vma->root != page__anon_vma->root)
return -EFAULT;
- } else if (page->mapping) {
- if (!vma->vm_file || vma->vm_file->f_mapping != page->mapping)
- return -EFAULT;
- } else
+ } else if (!vma->vm_file) {
+ return -EFAULT;
+ } else if (vma->vm_file->f_mapping != compound_head(page)->mapping) {
return -EFAULT;
+ }
return vma_address(page, vma);
}
_
Patches currently in -mm which might be from juew(a)google.com are
mm-thp-fix-page_address_in_vma-on-file-thp-tails.patch
The patch titled
Subject: mm/thp: fix vma_address() if virtual address below file offset
has been added to the -mm tree. Its filename is
mm-thp-fix-vma_address-if-virtual-address-below-file-offset.patch
This patch should soon appear at
https://ozlabs.org/~akpm/mmots/broken-out/mm-thp-fix-vma_address-if-virtual…
and later at
https://ozlabs.org/~akpm/mmotm/broken-out/mm-thp-fix-vma_address-if-virtual…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Hugh Dickins <hughd(a)google.com>
Subject: mm/thp: fix vma_address() if virtual address below file offset
Running certain tests with a DEBUG_VM kernel would crash within hours, on
the total_mapcount BUG() in split_huge_page_to_list(), while trying to
free up some memory by punching a hole in a shmem huge page: split's
try_to_unmap() was unable to find all the mappings of the page (which, on
a !DEBUG_VM kernel, would then keep the huge page pinned in memory).
When that BUG() was changed to a WARN(), it would later crash on the
VM_BUG_ON_VMA(end < vma->vm_start || start >= vma->vm_end, vma) in
mm/internal.h:vma_address(), used by rmap_walk_file() for try_to_unmap().
vma_address() is usually correct, but there's a wraparound case when the
vm_start address is unusually low, but vm_pgoff not so low: vma_address()
chooses max(start, vma->vm_start), but that decides on the wrong address,
because start has become almost ULONG_MAX.
Rewrite vma_address() to be more careful about vm_pgoff; move the
VM_BUG_ON_VMA() out of it, returning -EFAULT for errors, so that it can be
safely used from page_mapped_in_vma() and page_address_in_vma() too.
Add vma_address_end() to apply similar care to end address calculation, in
page_vma_mapped_walk() and page_mkclean_one() and try_to_unmap_one();
though it raises a question of whether callers would do better to supply
pvmw->end to page_vma_mapped_walk() - I chose not, for a smaller patch.
An irritation is that their apparent generality breaks down on KSM pages,
which cannot be located by the page->index that page_to_pgoff() uses: as
4b0ece6fa016 ("mm: migrate: fix remove_migration_pte() for ksm pages")
once discovered. I dithered over the best thing to do about that, and
have ended up with a VM_BUG_ON_PAGE(PageKsm) in both vma_address() and
vma_address_end(); though the only place in danger of using it on them was
try_to_unmap_one().
Sidenote: vma_address() and vma_address_end() now use compound_nr() on a
head page, instead of thp_size(): to make the right calculation on a
hugetlbfs page, whether or not THPs are configured. try_to_unmap() is
used on hugetlbfs pages, but perhaps the wrong calculation never mattered.
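A minimal user-space model of that wraparound, for illustration only (not
kernel code): the VMA layout, page offsets and the 512-page compound size
are invented, and old_vma_address()/new_vma_address() are hypothetical
condensations of the old __vma_address()+max() calculation and of the
bounds-checked vma_address() introduced by this patch.

#include <stdio.h>

#define PAGE_SHIFT 12
#define HPAGE_NR   512UL	/* pages in a PMD-sized THP, assumed */

struct toy_vma { unsigned long vm_start, vm_end, vm_pgoff; };

/* Old calculation: __vma_address() then max(start, vm_start), no bounds check.
 * (The old VM_BUG_ON_VMA(end < vm_start || start >= vm_end) fires on the
 * input below; without DEBUG_VM, max() just returns the wrapped address.) */
static unsigned long old_vma_address(unsigned long pgoff, const struct toy_vma *v)
{
	unsigned long start = v->vm_start + ((pgoff - v->vm_pgoff) << PAGE_SHIFT);

	return start > v->vm_start ? start : v->vm_start;
}

/* New calculation: careful about vm_pgoff, return -1UL (-EFAULT in the kernel)
 * when the page is entirely outside the vma, and clamp a compound head that
 * straddles vm_start to vm_start itself. */
static unsigned long new_vma_address(unsigned long pgoff, const struct toy_vma *v)
{
	if (pgoff >= v->vm_pgoff) {
		unsigned long addr = v->vm_start +
				     ((pgoff - v->vm_pgoff) << PAGE_SHIFT);
		return (addr < v->vm_start || addr >= v->vm_end) ? -1UL : addr;
	}
	if (pgoff + HPAGE_NR - 1 >= v->vm_pgoff)	/* head page overlaps vma */
		return v->vm_start;
	return -1UL;
}

int main(void)
{
	/* vm_start unusually low, vm_pgoff not so low: file offset 2MB mapped at 64KB */
	struct toy_vma v = { .vm_start = 0x10000, .vm_end = 0x400000, .vm_pgoff = 0x200 };
	unsigned long pgoff = 0x100;	/* THP head 1MB into the file, below vm_pgoff */

	printf("old: %#lx (wrapped, almost ULONG_MAX)\n", old_vma_address(pgoff, &v));
	printf("new: %#lx (clamped to vm_start)\n", new_vma_address(pgoff, &v));
	return 0;
}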
Link: https://lkml.kernel.org/r/caf1c1a3-7cfb-7f8f-1beb-ba816e932825@google.com
Fixes: a8fa41ad2f6f ("mm, rmap: check all VMAs that PTE-mapped THP can be part of")
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Cc: <stable(a)vger.kernel.org>
Cc: Alistair Popple <apopple(a)nvidia.com>
Cc: Jan Kara <jack(a)suse.cz>
Cc: Jue Wang <juew(a)google.com>
Cc: "Matthew Wilcox (Oracle)" <willy(a)infradead.org>
Cc: Miaohe Lin <linmiaohe(a)huawei.com>
Cc: Minchan Kim <minchan(a)kernel.org>
Cc: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Cc: Oscar Salvador <osalvador(a)suse.de>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: Ralph Campbell <rcampbell(a)nvidia.com>
Cc: Shakeel Butt <shakeelb(a)google.com>
Cc: Wang Yugui <wangyugui(a)e16-tech.com>
Cc: Yang Shi <shy828301(a)gmail.com>
Cc: Zi Yan <ziy(a)nvidia.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/internal.h | 51 ++++++++++++++++++++++++++++++-----------
mm/page_vma_mapped.c | 16 ++++--------
mm/rmap.c | 16 ++++++------
3 files changed, 52 insertions(+), 31 deletions(-)
--- a/mm/internal.h~mm-thp-fix-vma_address-if-virtual-address-below-file-offset
+++ a/mm/internal.h
@@ -384,27 +384,52 @@ static inline void mlock_migrate_page(st
extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
/*
- * At what user virtual address is page expected in @vma?
+ * At what user virtual address is page expected in vma?
+ * Returns -EFAULT if all of the page is outside the range of vma.
+ * If page is a compound head, the entire compound page is considered.
*/
static inline unsigned long
-__vma_address(struct page *page, struct vm_area_struct *vma)
+vma_address(struct page *page, struct vm_area_struct *vma)
{
- pgoff_t pgoff = page_to_pgoff(page);
- return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+ pgoff_t pgoff;
+ unsigned long address;
+
+ VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */
+ pgoff = page_to_pgoff(page);
+ if (pgoff >= vma->vm_pgoff) {
+ address = vma->vm_start +
+ ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+ /* Check for address beyond vma (or wrapped through 0?) */
+ if (address < vma->vm_start || address >= vma->vm_end)
+ address = -EFAULT;
+ } else if (PageHead(page) &&
+ pgoff + compound_nr(page) - 1 >= vma->vm_pgoff) {
+ /* Test above avoids possibility of wrap to 0 on 32-bit */
+ address = vma->vm_start;
+ } else {
+ address = -EFAULT;
+ }
+ return address;
}
+/*
+ * Then at what user virtual address will none of the page be found in vma?
+ * Assumes that vma_address() already returned a good starting address.
+ * If page is a compound head, the entire compound page is considered.
+ */
static inline unsigned long
-vma_address(struct page *page, struct vm_area_struct *vma)
+vma_address_end(struct page *page, struct vm_area_struct *vma)
{
- unsigned long start, end;
-
- start = __vma_address(page, vma);
- end = start + thp_size(page) - PAGE_SIZE;
-
- /* page should be within @vma mapping range */
- VM_BUG_ON_VMA(end < vma->vm_start || start >= vma->vm_end, vma);
+ pgoff_t pgoff;
+ unsigned long address;
- return max(start, vma->vm_start);
+ VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */
+ pgoff = page_to_pgoff(page) + compound_nr(page);
+ address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+ /* Check for address beyond vma (or wrapped through 0?) */
+ if (address < vma->vm_start || address > vma->vm_end)
+ address = vma->vm_end;
+ return address;
}
static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
--- a/mm/page_vma_mapped.c~mm-thp-fix-vma_address-if-virtual-address-below-file-offset
+++ a/mm/page_vma_mapped.c
@@ -228,18 +228,18 @@ restart:
if (!map_pte(pvmw))
goto next_pte;
while (1) {
+ unsigned long end;
+
if (check_pte(pvmw))
return true;
next_pte:
/* Seek to next pte only makes sense for THP */
if (!PageTransHuge(pvmw->page) || PageHuge(pvmw->page))
return not_found(pvmw);
+ end = vma_address_end(pvmw->page, pvmw->vma);
do {
pvmw->address += PAGE_SIZE;
- if (pvmw->address >= pvmw->vma->vm_end ||
- pvmw->address >=
- __vma_address(pvmw->page, pvmw->vma) +
- thp_size(pvmw->page))
+ if (pvmw->address >= end)
return not_found(pvmw);
/* Did we cross page table boundary? */
if (pvmw->address % PMD_SIZE == 0) {
@@ -277,14 +277,10 @@ int page_mapped_in_vma(struct page *page
.vma = vma,
.flags = PVMW_SYNC,
};
- unsigned long start, end;
-
- start = __vma_address(page, vma);
- end = start + thp_size(page) - PAGE_SIZE;
- if (unlikely(end < vma->vm_start || start >= vma->vm_end))
+ pvmw.address = vma_address(page, vma);
+ if (pvmw.address == -EFAULT)
return 0;
- pvmw.address = max(start, vma->vm_start);
if (!page_vma_mapped_walk(&pvmw))
return 0;
page_vma_mapped_walk_done(&pvmw);
--- a/mm/rmap.c~mm-thp-fix-vma_address-if-virtual-address-below-file-offset
+++ a/mm/rmap.c
@@ -707,7 +707,6 @@ static bool should_defer_flush(struct mm
*/
unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
{
- unsigned long address;
if (PageAnon(page)) {
struct anon_vma *page__anon_vma = page_anon_vma(page);
/*
@@ -722,10 +721,8 @@ unsigned long page_address_in_vma(struct
return -EFAULT;
} else
return -EFAULT;
- address = __vma_address(page, vma);
- if (unlikely(address < vma->vm_start || address >= vma->vm_end))
- return -EFAULT;
- return address;
+
+ return vma_address(page, vma);
}
pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
@@ -919,7 +916,7 @@ static bool page_mkclean_one(struct page
*/
mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
0, vma, vma->vm_mm, address,
- min(vma->vm_end, address + page_size(page)));
+ vma_address_end(page, vma));
mmu_notifier_invalidate_range_start(&range);
while (page_vma_mapped_walk(&pvmw)) {
@@ -1435,9 +1432,10 @@ static bool try_to_unmap_one(struct page
* Note that the page can not be free in this function as call of
* try_to_unmap() must hold a reference on the page.
*/
+ range.end = PageKsm(page) ?
+ address + PAGE_SIZE : vma_address_end(page, vma);
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
- address,
- min(vma->vm_end, address + page_size(page)));
+ address, range.end);
if (PageHuge(page)) {
/*
* If sharing is possible, start and end will be adjusted
@@ -1889,6 +1887,7 @@ static void rmap_walk_anon(struct page *
struct vm_area_struct *vma = avc->vma;
unsigned long address = vma_address(page, vma);
+ VM_BUG_ON_VMA(address == -EFAULT, vma);
cond_resched();
if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
@@ -1943,6 +1942,7 @@ static void rmap_walk_file(struct page *
pgoff_start, pgoff_end) {
unsigned long address = vma_address(page, vma);
+ VM_BUG_ON_VMA(address == -EFAULT, vma);
cond_resched();
if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
_
Patches currently in -mm which might be from hughd(a)google.com are
mm-thp-fix-__split_huge_pmd_locked-on-shmem-migration-entry.patch
mm-thp-make-is_huge_zero_pmd-safe-and-quicker.patch
mm-thp-try_to_unmap-use-ttu_sync-for-safe-splitting.patch
mm-thp-fix-vma_address-if-virtual-address-below-file-offset.patch
mm-thp-unmap_mapping_page-to-fix-thp-truncate_cleanup_page.patch
mm-page_vma_mapped_walk-use-page-for-pvmw-page.patch
mm-page_vma_mapped_walk-settle-pagehuge-on-entry.patch
mm-page_vma_mapped_walk-use-pmd_read_atomic.patch
mm-page_vma_mapped_walk-use-pmde-for-pvmw-pmd.patch
mm-page_vma_mapped_walk-prettify-pvmw_migration-block.patch
mm-page_vma_mapped_walk-crossing-page-table-boundary.patch
mm-page_vma_mapped_walk-add-a-level-of-indentation.patch
mm-page_vma_mapped_walk-use-goto-instead-of-while-1.patch
mm-page_vma_mapped_walk-get-vma_address_end-earlier.patch
mm-thp-fix-page_vma_mapped_walk-if-thp-mapped-by-ptes.patch
mm-thp-another-pvmw_sync-fix-in-page_vma_mapped_walk.patch
mm-thp-remap_page-is-only-needed-on-anonymous-thp.patch
mm-hwpoison_user_mappings-try_to_unmap-with-ttu_sync.patch
The patch titled
Subject: mm/thp: try_to_unmap() use TTU_SYNC for safe splitting
has been added to the -mm tree. Its filename is
mm-thp-try_to_unmap-use-ttu_sync-for-safe-splitting.patch
This patch should soon appear at
https://ozlabs.org/~akpm/mmots/broken-out/mm-thp-try_to_unmap-use-ttu_sync-…
and later at
https://ozlabs.org/~akpm/mmotm/broken-out/mm-thp-try_to_unmap-use-ttu_sync-…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Hugh Dickins <hughd(a)google.com>
Subject: mm/thp: try_to_unmap() use TTU_SYNC for safe splitting
Stressing huge tmpfs often crashed on unmap_page()'s VM_BUG_ON_PAGE
(!unmap_success): with dump_page() showing mapcount:1, but then its raw
struct page output showing _mapcount ffffffff i.e. mapcount 0.
And even if that particular VM_BUG_ON_PAGE(!unmap_success) is removed, it
is immediately followed by a VM_BUG_ON_PAGE(compound_mapcount(head)), and
further down an IS_ENABLED(CONFIG_DEBUG_VM) total_mapcount BUG(): all
indicative of some mapcount difficulty in development here perhaps. But
the !CONFIG_DEBUG_VM path handles the failures correctly and silently.
I believe the problem is that once a racing unmap has cleared pte or pmd,
try_to_unmap_one() may skip taking the page table lock, and emerge from
try_to_unmap() before the racing task has reached decrementing mapcount.
Instead of abandoning the unsafe VM_BUG_ON_PAGE(), and the ones that
follow, use PVMW_SYNC in try_to_unmap_one() in this case: adding TTU_SYNC
to the options, and passing that from unmap_page().
When CONFIG_DEBUG_VM, or for non-debug too? Consensus is to do the same
for both: the slight overhead added should rarely matter, except perhaps
if splitting sparsely-populated multiply-mapped shmem. Once confident
that bugs are fixed, TTU_SYNC here can be removed, and the race tolerated.
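To picture the race and the waiting trick, here is a small pthread model,
purely illustrative and with invented names (ptl, pte, mapcount): a zapper
clears the entry and drops the mapcount inside one lock-protected section,
as zap_pte_range()/zap_huge_pmd() do under the page table lock, and an
observer that has already seen the cleared entry takes and drops that same
lock before trusting the mapcount, which is what TTU_SYNC/PVMW_SYNC arranges.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_mutex_t ptl = PTHREAD_MUTEX_INITIALIZER;	/* stands in for the page table lock */
static atomic_long pte = 1;		/* "mapped" page table entry */
static atomic_long mapcount = 1;	/* decremented only after the entry is cleared */

/* Models the racing zap: clear the entry and decrement the mapcount
 * within a single critical section, then release the lock. */
static void *zapper(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&ptl);
	atomic_store(&pte, 0);			/* like ptep_get_and_clear_full() */
	atomic_fetch_sub(&mapcount, 1);		/* like page_remove_rmap() */
	pthread_mutex_unlock(&ptl);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, zapper, NULL);

	/* Observe the cleared entry without taking the lock: at this point the
	 * mapcount may still read 1 - the window try_to_unmap() can fall into. */
	while (atomic_load(&pte) != 0)
		;

	/* The TTU_SYNC idea: take and drop the lock, so we cannot proceed until
	 * the zapper's critical section (including the decrement) has finished. */
	pthread_mutex_lock(&ptl);
	pthread_mutex_unlock(&ptl);

	printf("mapcount after sync: %ld\n", (long)atomic_load(&mapcount));	/* always 0 */

	pthread_join(t, NULL);
	return 0;
}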
Link: https://lkml.kernel.org/r/c1e95853-8bcd-d8fd-55fa-e7f2488e78f@google.com
Fixes: fec89c109f3a ("thp: rewrite freeze_page()/unfreeze_page() with generic rmap walkers")
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Cc: <stable(a)vger.kernel.org>
Cc: Alistair Popple <apopple(a)nvidia.com>
Cc: Jan Kara <jack(a)suse.cz>
Cc: Jue Wang <juew(a)google.com>
Cc: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Cc: "Matthew Wilcox (Oracle)" <willy(a)infradead.org>
Cc: Miaohe Lin <linmiaohe(a)huawei.com>
Cc: Minchan Kim <minchan(a)kernel.org>
Cc: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Cc: Oscar Salvador <osalvador(a)suse.de>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: Ralph Campbell <rcampbell(a)nvidia.com>
Cc: Shakeel Butt <shakeelb(a)google.com>
Cc: Wang Yugui <wangyugui(a)e16-tech.com>
Cc: Yang Shi <shy828301(a)gmail.com>
Cc: Zi Yan <ziy(a)nvidia.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/rmap.h | 1 +
mm/huge_memory.c | 2 +-
mm/page_vma_mapped.c | 11 +++++++++++
mm/rmap.c | 17 ++++++++++++++++-
4 files changed, 29 insertions(+), 2 deletions(-)
--- a/include/linux/rmap.h~mm-thp-try_to_unmap-use-ttu_sync-for-safe-splitting
+++ a/include/linux/rmap.h
@@ -91,6 +91,7 @@ enum ttu_flags {
TTU_SPLIT_HUGE_PMD = 0x4, /* split huge PMD if any */
TTU_IGNORE_MLOCK = 0x8, /* ignore mlock */
+ TTU_SYNC = 0x10, /* avoid racy checks with PVMW_SYNC */
TTU_IGNORE_HWPOISON = 0x20, /* corrupted page is recoverable */
TTU_BATCH_FLUSH = 0x40, /* Batch TLB flushes where possible
* and caller guarantees they will
--- a/mm/huge_memory.c~mm-thp-try_to_unmap-use-ttu_sync-for-safe-splitting
+++ a/mm/huge_memory.c
@@ -2350,7 +2350,7 @@ void vma_adjust_trans_huge(struct vm_are
static void unmap_page(struct page *page)
{
- enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK |
+ enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_SYNC |
TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
bool unmap_success;
--- a/mm/page_vma_mapped.c~mm-thp-try_to_unmap-use-ttu_sync-for-safe-splitting
+++ a/mm/page_vma_mapped.c
@@ -212,6 +212,17 @@ restart:
pvmw->ptl = NULL;
}
} else if (!pmd_present(pmde)) {
+ /*
+ * If PVMW_SYNC, take and drop THP pmd lock so that we
+ * cannot return prematurely, while zap_huge_pmd() has
+ * cleared *pmd but not decremented compound_mapcount().
+ */
+ if ((pvmw->flags & PVMW_SYNC) &&
+ PageTransCompound(pvmw->page)) {
+ spinlock_t *ptl = pmd_lock(mm, pvmw->pmd);
+
+ spin_unlock(ptl);
+ }
return false;
}
if (!map_pte(pvmw))
--- a/mm/rmap.c~mm-thp-try_to_unmap-use-ttu_sync-for-safe-splitting
+++ a/mm/rmap.c
@@ -1405,6 +1405,15 @@ static bool try_to_unmap_one(struct page
struct mmu_notifier_range range;
enum ttu_flags flags = (enum ttu_flags)(long)arg;
+ /*
+ * When racing against e.g. zap_pte_range() on another cpu,
+ * in between its ptep_get_and_clear_full() and page_remove_rmap(),
+ * try_to_unmap() may return false when it is about to become true,
+ * if page table locking is skipped: use TTU_SYNC to wait for that.
+ */
+ if (flags & TTU_SYNC)
+ pvmw.flags = PVMW_SYNC;
+
/* munlock has nothing to gain from examining un-locked vmas */
if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
return true;
@@ -1777,7 +1786,13 @@ bool try_to_unmap(struct page *page, enu
else
rmap_walk(page, &rwc);
- return !page_mapcount(page) ? true : false;
+ /*
+ * When racing against e.g. zap_pte_range() on another cpu,
+ * in between its ptep_get_and_clear_full() and page_remove_rmap(),
+ * try_to_unmap() may return false when it is about to become true,
+ * if page table locking is skipped: use TTU_SYNC to wait for that.
+ */
+ return !page_mapcount(page);
}
/**
_
Patches currently in -mm which might be from hughd(a)google.com are
mm-thp-fix-__split_huge_pmd_locked-on-shmem-migration-entry.patch
mm-thp-make-is_huge_zero_pmd-safe-and-quicker.patch
mm-thp-try_to_unmap-use-ttu_sync-for-safe-splitting.patch
mm-thp-fix-vma_address-if-virtual-address-below-file-offset.patch
mm-thp-unmap_mapping_page-to-fix-thp-truncate_cleanup_page.patch
mm-page_vma_mapped_walk-use-page-for-pvmw-page.patch
mm-page_vma_mapped_walk-settle-pagehuge-on-entry.patch
mm-page_vma_mapped_walk-use-pmd_read_atomic.patch
mm-page_vma_mapped_walk-use-pmde-for-pvmw-pmd.patch
mm-page_vma_mapped_walk-prettify-pvmw_migration-block.patch
mm-page_vma_mapped_walk-crossing-page-table-boundary.patch
mm-page_vma_mapped_walk-add-a-level-of-indentation.patch
mm-page_vma_mapped_walk-use-goto-instead-of-while-1.patch
mm-page_vma_mapped_walk-get-vma_address_end-earlier.patch
mm-thp-fix-page_vma_mapped_walk-if-thp-mapped-by-ptes.patch
mm-thp-another-pvmw_sync-fix-in-page_vma_mapped_walk.patch
mm-thp-remap_page-is-only-needed-on-anonymous-thp.patch
mm-hwpoison_user_mappings-try_to_unmap-with-ttu_sync.patch
The patch titled
Subject: mm/thp: make is_huge_zero_pmd() safe and quicker
has been added to the -mm tree. Its filename is
mm-thp-make-is_huge_zero_pmd-safe-and-quicker.patch
This patch should soon appear at
https://ozlabs.org/~akpm/mmots/broken-out/mm-thp-make-is_huge_zero_pmd-safe…
and later at
https://ozlabs.org/~akpm/mmotm/broken-out/mm-thp-make-is_huge_zero_pmd-safe…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Hugh Dickins <hughd(a)google.com>
Subject: mm/thp: make is_huge_zero_pmd() safe and quicker
Most callers of is_huge_zero_pmd() supply a pmd already verified present;
but a few (notably zap_huge_pmd()) do not - it might be a pmd migration
entry, in which the pfn is encoded differently from a present pmd: which
might pass the is_huge_zero_pmd() test (though not on x86, since L1TF
forced us to protect against that); or perhaps even crash in pmd_page()
applied to a swap-like entry.
Make it safe by adding pmd_present() check into is_huge_zero_pmd() itself;
and make it quicker by saving huge_zero_pfn, so that is_huge_zero_pmd()
will not need to do that pmd_page() lookup each time.
__split_huge_pmd_locked() checked pmd_trans_huge() before: that worked,
but is unnecessary now that is_huge_zero_pmd() checks present.
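For intuition only, a toy user-space model of why the pmd_present() check
matters: the bit layout below is invented and matches no real architecture,
and the toy_ helpers are hypothetical. It only shows the false-positive side
of the problem; in the real code the old check could also crash in pmd_page()
applied to a swap-like entry.

#include <stdbool.h>
#include <stdio.h>

#define TOY_PRESENT	0x1UL

/* Toy layout: a present pmd keeps the pfn in bits 12+; a non-present
 * migration entry reuses those same bits for swap type + offset. */
static unsigned long toy_pmd_pfn(unsigned long pmd)	{ return pmd >> 12; }
static bool toy_pmd_present(unsigned long pmd)		{ return pmd & TOY_PRESENT; }

static unsigned long toy_huge_zero_pfn = 0x1234;	/* cached, as the patch caches huge_zero_pfn */

/* Old-style check: trusts the pfn field even for a non-present entry. */
static bool old_check(unsigned long pmd)
{
	return toy_pmd_pfn(pmd) == toy_huge_zero_pfn;
}

/* New-style check: the pfn must match and the entry must be present. */
static bool new_check(unsigned long pmd)
{
	return toy_pmd_pfn(pmd) == toy_huge_zero_pfn && toy_pmd_present(pmd);
}

int main(void)
{
	/* A migration entry whose encoded payload happens to look like the
	 * huge zero page's pfn when misread as a present pmd. */
	unsigned long migration_entry = toy_huge_zero_pfn << 12;	/* TOY_PRESENT clear */

	printf("old check: %d (false positive)\n", old_check(migration_entry));
	printf("new check: %d\n", new_check(migration_entry));
	return 0;
}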
Link: https://lkml.kernel.org/r/21ea9ca-a1f5-8b90-5e88-95fb1c49bbfa@google.com
Fixes: e71769ae5260 ("mm: enable thp migration for shmem thp")
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Reviewed-by: Yang Shi <shy828301(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Cc: Alistair Popple <apopple(a)nvidia.com>
Cc: Jan Kara <jack(a)suse.cz>
Cc: Jue Wang <juew(a)google.com>
Cc: "Matthew Wilcox (Oracle)" <willy(a)infradead.org>
Cc: Miaohe Lin <linmiaohe(a)huawei.com>
Cc: Minchan Kim <minchan(a)kernel.org>
Cc: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Cc: Oscar Salvador <osalvador(a)suse.de>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: Ralph Campbell <rcampbell(a)nvidia.com>
Cc: Shakeel Butt <shakeelb(a)google.com>
Cc: Wang Yugui <wangyugui(a)e16-tech.com>
Cc: Zi Yan <ziy(a)nvidia.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/huge_mm.h | 8 +++++++-
mm/huge_memory.c | 5 ++++-
2 files changed, 11 insertions(+), 2 deletions(-)
--- a/include/linux/huge_mm.h~mm-thp-make-is_huge_zero_pmd-safe-and-quicker
+++ a/include/linux/huge_mm.h
@@ -286,6 +286,7 @@ struct page *follow_devmap_pud(struct vm
vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd);
extern struct page *huge_zero_page;
+extern unsigned long huge_zero_pfn;
static inline bool is_huge_zero_page(struct page *page)
{
@@ -294,7 +295,7 @@ static inline bool is_huge_zero_page(str
static inline bool is_huge_zero_pmd(pmd_t pmd)
{
- return is_huge_zero_page(pmd_page(pmd));
+ return READ_ONCE(huge_zero_pfn) == pmd_pfn(pmd) && pmd_present(pmd);
}
static inline bool is_huge_zero_pud(pud_t pud)
@@ -439,6 +440,11 @@ static inline bool is_huge_zero_page(str
{
return false;
}
+
+static inline bool is_huge_zero_pmd(pmd_t pmd)
+{
+ return false;
+}
static inline bool is_huge_zero_pud(pud_t pud)
{
--- a/mm/huge_memory.c~mm-thp-make-is_huge_zero_pmd-safe-and-quicker
+++ a/mm/huge_memory.c
@@ -62,6 +62,7 @@ static struct shrinker deferred_split_sh
static atomic_t huge_zero_refcount;
struct page *huge_zero_page __read_mostly;
+unsigned long huge_zero_pfn __read_mostly = ~0UL;
bool transparent_hugepage_enabled(struct vm_area_struct *vma)
{
@@ -98,6 +99,7 @@ retry:
__free_pages(zero_page, compound_order(zero_page));
goto retry;
}
+ WRITE_ONCE(huge_zero_pfn, page_to_pfn(zero_page));
/* We take additional reference here. It will be put back by shrinker */
atomic_set(&huge_zero_refcount, 2);
@@ -147,6 +149,7 @@ static unsigned long shrink_huge_zero_pa
if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
struct page *zero_page = xchg(&huge_zero_page, NULL);
BUG_ON(zero_page == NULL);
+ WRITE_ONCE(huge_zero_pfn, ~0UL);
__free_pages(zero_page, compound_order(zero_page));
return HPAGE_PMD_NR;
}
@@ -2071,7 +2074,7 @@ static void __split_huge_pmd_locked(stru
return;
}
- if (pmd_trans_huge(*pmd) && is_huge_zero_pmd(*pmd)) {
+ if (is_huge_zero_pmd(*pmd)) {
/*
* FIXME: Do we want to invalidate secondary mmu by calling
* mmu_notifier_invalidate_range() see comments below inside
_
Patches currently in -mm which might be from hughd(a)google.com are
mm-thp-fix-__split_huge_pmd_locked-on-shmem-migration-entry.patch
mm-thp-make-is_huge_zero_pmd-safe-and-quicker.patch
mm-thp-try_to_unmap-use-ttu_sync-for-safe-splitting.patch
mm-thp-fix-vma_address-if-virtual-address-below-file-offset.patch
mm-thp-unmap_mapping_page-to-fix-thp-truncate_cleanup_page.patch
mm-page_vma_mapped_walk-use-page-for-pvmw-page.patch
mm-page_vma_mapped_walk-settle-pagehuge-on-entry.patch
mm-page_vma_mapped_walk-use-pmd_read_atomic.patch
mm-page_vma_mapped_walk-use-pmde-for-pvmw-pmd.patch
mm-page_vma_mapped_walk-prettify-pvmw_migration-block.patch
mm-page_vma_mapped_walk-crossing-page-table-boundary.patch
mm-page_vma_mapped_walk-add-a-level-of-indentation.patch
mm-page_vma_mapped_walk-use-goto-instead-of-while-1.patch
mm-page_vma_mapped_walk-get-vma_address_end-earlier.patch
mm-thp-fix-page_vma_mapped_walk-if-thp-mapped-by-ptes.patch
mm-thp-another-pvmw_sync-fix-in-page_vma_mapped_walk.patch
mm-thp-remap_page-is-only-needed-on-anonymous-thp.patch
mm-hwpoison_user_mappings-try_to_unmap-with-ttu_sync.patch
The patch titled
Subject: mm/thp: fix __split_huge_pmd_locked() on shmem migration entry
has been added to the -mm tree. Its filename is
mm-thp-fix-__split_huge_pmd_locked-on-shmem-migration-entry.patch
This patch should soon appear at
https://ozlabs.org/~akpm/mmots/broken-out/mm-thp-fix-__split_huge_pmd_locke…
and later at
https://ozlabs.org/~akpm/mmotm/broken-out/mm-thp-fix-__split_huge_pmd_locke…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Hugh Dickins <hughd(a)google.com>
Subject: mm/thp: fix __split_huge_pmd_locked() on shmem migration entry
Patch series "mm/thp: fix THP splitting unmap BUGs and related", v10.
Here is a v2 batch of long-standing THP bug fixes that I had not got around
to sending before, but prompted now by Wang Yugui's report
https://lore.kernel.org/linux-mm/20210412180659.B9E3.409509F4@e16-tech.com/
Wang Yugui has tested a rollup of these fixes applied to 5.10.39, and they
have done no harm, but have *not* fixed that issue: something more is
needed and I have no idea of what.
This patch (of 7):
Stressing huge tmpfs page migration racing hole punch often crashed on the
VM_BUG_ON(!pmd_present) in pmdp_huge_clear_flush(), with a DEBUG_VM=y
kernel; or shortly afterwards, on a bad dereference in
__split_huge_pmd_locked() when DEBUG_VM=n. They forgot to allow for pmd
migration entries in the non-anonymous case.
Full disclosure: those particular experiments were on a kernel with more
relaxed mmap_lock and i_mmap_rwsem locking, and were not repeated on the
vanilla kernel: it is conceivable that stricter locking happens to avoid
those cases, or makes them less likely; but __split_huge_pmd_locked()
already allowed for pmd migration entries when handling anonymous THPs, so
this commit brings the shmem and file THP handling into line.
And while there: use old_pmd rather than _pmd, as in the following blocks;
and make it clearer to the eye that the !vma_is_anonymous() block is
self-contained, making an early return after accounting for unmapping.
Link: https://lkml.kernel.org/r/af88612-1473-2eaa-903-8d1a448b26@google.com
Link: https://lkml.kernel.org/r/dd221a99-efb3-cd1d-6256-7e646af29314@google.com
Fixes: e71769ae5260 ("mm: enable thp migration for shmem thp")
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Cc: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Cc: Yang Shi <shy828301(a)gmail.com>
Cc: Wang Yugui <wangyugui(a)e16-tech.com>
Cc: "Matthew Wilcox (Oracle)" <willy(a)infradead.org>
Cc: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Cc: Alistair Popple <apopple(a)nvidia.com>
Cc: Ralph Campbell <rcampbell(a)nvidia.com>
Cc: Zi Yan <ziy(a)nvidia.com>
Cc: Miaohe Lin <linmiaohe(a)huawei.com>
Cc: Minchan Kim <minchan(a)kernel.org>
Cc: Jue Wang <juew(a)google.com>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: Jan Kara <jack(a)suse.cz>
Cc: Shakeel Butt <shakeelb(a)google.com>
Cc: Oscar Salvador <osalvador(a)suse.de>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/huge_memory.c | 27 ++++++++++++++++++---------
mm/pgtable-generic.c | 5 ++---
2 files changed, 20 insertions(+), 12 deletions(-)
--- a/mm/huge_memory.c~mm-thp-fix-__split_huge_pmd_locked-on-shmem-migration-entry
+++ a/mm/huge_memory.c
@@ -2044,7 +2044,7 @@ static void __split_huge_pmd_locked(stru
count_vm_event(THP_SPLIT_PMD);
if (!vma_is_anonymous(vma)) {
- _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
+ old_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
/*
* We are going to unmap this huge page. So
* just go ahead and zap it
@@ -2053,16 +2053,25 @@ static void __split_huge_pmd_locked(stru
zap_deposited_table(mm, pmd);
if (vma_is_special_huge(vma))
return;
- page = pmd_page(_pmd);
- if (!PageDirty(page) && pmd_dirty(_pmd))
- set_page_dirty(page);
- if (!PageReferenced(page) && pmd_young(_pmd))
- SetPageReferenced(page);
- page_remove_rmap(page, true);
- put_page(page);
+ if (unlikely(is_pmd_migration_entry(old_pmd))) {
+ swp_entry_t entry;
+
+ entry = pmd_to_swp_entry(old_pmd);
+ page = migration_entry_to_page(entry);
+ } else {
+ page = pmd_page(old_pmd);
+ if (!PageDirty(page) && pmd_dirty(old_pmd))
+ set_page_dirty(page);
+ if (!PageReferenced(page) && pmd_young(old_pmd))
+ SetPageReferenced(page);
+ page_remove_rmap(page, true);
+ put_page(page);
+ }
add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
return;
- } else if (pmd_trans_huge(*pmd) && is_huge_zero_pmd(*pmd)) {
+ }
+
+ if (pmd_trans_huge(*pmd) && is_huge_zero_pmd(*pmd)) {
/*
* FIXME: Do we want to invalidate secondary mmu by calling
* mmu_notifier_invalidate_range() see comments below inside
--- a/mm/pgtable-generic.c~mm-thp-fix-__split_huge_pmd_locked-on-shmem-migration-entry
+++ a/mm/pgtable-generic.c
@@ -135,9 +135,8 @@ pmd_t pmdp_huge_clear_flush(struct vm_ar
{
pmd_t pmd;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
- VM_BUG_ON(!pmd_present(*pmdp));
- /* Below assumes pmd_present() is true */
- VM_BUG_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
+ VM_BUG_ON(pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) &&
+ !pmd_devmap(*pmdp));
pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
return pmd;
_
Patches currently in -mm which might be from hughd(a)google.com are
mm-thp-fix-__split_huge_pmd_locked-on-shmem-migration-entry.patch
mm-thp-make-is_huge_zero_pmd-safe-and-quicker.patch
mm-thp-try_to_unmap-use-ttu_sync-for-safe-splitting.patch
mm-thp-fix-vma_address-if-virtual-address-below-file-offset.patch
mm-thp-unmap_mapping_page-to-fix-thp-truncate_cleanup_page.patch
mm-page_vma_mapped_walk-use-page-for-pvmw-page.patch
mm-page_vma_mapped_walk-settle-pagehuge-on-entry.patch
mm-page_vma_mapped_walk-use-pmd_read_atomic.patch
mm-page_vma_mapped_walk-use-pmde-for-pvmw-pmd.patch
mm-page_vma_mapped_walk-prettify-pvmw_migration-block.patch
mm-page_vma_mapped_walk-crossing-page-table-boundary.patch
mm-page_vma_mapped_walk-add-a-level-of-indentation.patch
mm-page_vma_mapped_walk-use-goto-instead-of-while-1.patch
mm-page_vma_mapped_walk-get-vma_address_end-earlier.patch
mm-thp-fix-page_vma_mapped_walk-if-thp-mapped-by-ptes.patch
mm-thp-another-pvmw_sync-fix-in-page_vma_mapped_walk.patch
mm-thp-remap_page-is-only-needed-on-anonymous-thp.patch
mm-hwpoison_user_mappings-try_to_unmap-with-ttu_sync.patch
From: Ian Rogers <irogers(a)google.com>
[ Upstream commit fd7d55172d1e2e501e6da0a5c1de25f06612dc2e ]
Currently perf_rotate_context assumes that if the context's nr_events !=
nr_active a rotation is necessary for perf event multiplexing. With
cgroups, nr_events is the total count of events for all cgroups and
nr_active will not include events in a cgroup other than the current
task's. This makes rotation appear necessary for cgroups when it is not.
Add a perf_event_context flag that is set when rotation is necessary.
Clear the flag during sched_out and set it when a flexible sched_in
fails due to resources.
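As a rough illustration of the difference between the two heuristics
(nothing below is real perf code; the event fields, counter budget and
toy_ functions are invented): an event that is inactive only because it
belongs to another cgroup no longer makes the context look like it needs
rotating, whereas a genuine out-of-counters failure sets the new flag.

#include <stdbool.h>
#include <stdio.h>

struct toy_event { bool in_current_cgroup; };

struct toy_ctx {
	int nr_events;
	int nr_active;
	int rotate_necessary;	/* the flag this patch adds */
};

/* Models flexible sched_in: events of other cgroups are skipped outright,
 * and only a genuine shortage of hardware counters marks rotation necessary. */
static void toy_sched_in(struct toy_ctx *ctx, const struct toy_event *evs,
			 int n, int hw_counters)
{
	ctx->nr_events = n;
	ctx->nr_active = 0;
	ctx->rotate_necessary = 0;

	for (int i = 0; i < n; i++) {
		if (!evs[i].in_current_cgroup)
			continue;			/* not schedulable here: no rotation needed */
		if (hw_counters == 0) {
			ctx->rotate_necessary = 1;	/* real resource failure */
			break;
		}
		hw_counters--;
		ctx->nr_active++;
	}
}

int main(void)
{
	/* Three events, one owned by another cgroup, plenty of counters. */
	const struct toy_event evs[3] = { {true}, {false}, {true} };
	struct toy_ctx ctx;

	toy_sched_in(&ctx, evs, 3, 4);

	printf("old heuristic (nr_events != nr_active): %d\n",
	       ctx.nr_events != ctx.nr_active);
	printf("new rotate_necessary flag:              %d\n",
	       ctx.rotate_necessary);
	return 0;
}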
Signed-off-by: Ian Rogers <irogers(a)google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Cc: Alexander Shishkin <alexander.shishkin(a)linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme(a)kernel.org>
Cc: Arnaldo Carvalho de Melo <acme(a)redhat.com>
Cc: Borislav Petkov <bp(a)alien8.de>
Cc: Jiri Olsa <jolsa(a)redhat.com>
Cc: Kan Liang <kan.liang(a)linux.intel.com>
Cc: Linus Torvalds <torvalds(a)linux-foundation.org>
Cc: Namhyung Kim <namhyung(a)kernel.org>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Stephane Eranian <eranian(a)google.com>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Vince Weaver <vincent.weaver(a)maine.edu>
Link: https://lkml.kernel.org/r/20190601082722.44543-1-irogers@google.com
Signed-off-by: Ingo Molnar <mingo(a)kernel.org>
Cc: stable(a)vger.kernel.org # 4.19+
Signed-off-by: Wen Yang <wenyang(a)linux.alibaba.com>
---
include/linux/perf_event.h | 5 +++++
kernel/events/core.c | 42 ++++++++++++++++++++++--------------------
2 files changed, 27 insertions(+), 20 deletions(-)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index d8b4d31..efe30b9 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -747,6 +747,11 @@ struct perf_event_context {
int nr_stat;
int nr_freq;
int rotate_disable;
+ /*
+ * Set when nr_events != nr_active, except tolerant to events not
+ * necessary to be active due to scheduling constraints, such as cgroups.
+ */
+ int rotate_necessary;
atomic_t refcount;
struct task_struct *task;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b8b74a4..56e3789 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2952,6 +2952,12 @@ static void ctx_sched_out(struct perf_event_context *ctx,
if (!ctx->nr_active || !(is_active & EVENT_ALL))
return;
+ /*
+ * If we had been multiplexing, no rotations are necessary, now no events
+ * are active.
+ */
+ ctx->rotate_necessary = 0;
+
perf_pmu_disable(ctx->pmu);
if (is_active & EVENT_PINNED) {
list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
@@ -3319,10 +3325,13 @@ static int flexible_sched_in(struct perf_event *event, void *data)
return 0;
if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
- if (!group_sched_in(event, sid->cpuctx, sid->ctx))
- list_add_tail(&event->active_list, &sid->ctx->flexible_active);
- else
+ int ret = group_sched_in(event, sid->cpuctx, sid->ctx);
+ if (ret) {
sid->can_add_hw = 0;
+ sid->ctx->rotate_necessary = 1;
+ return 0;
+ }
+ list_add_tail(&event->active_list, &sid->ctx->flexible_active);
}
return 0;
@@ -3690,24 +3699,17 @@ static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
{
struct perf_event *cpu_event = NULL, *task_event = NULL;
- bool cpu_rotate = false, task_rotate = false;
- struct perf_event_context *ctx = NULL;
+ struct perf_event_context *task_ctx = NULL;
+ int cpu_rotate, task_rotate;
/*
* Since we run this from IRQ context, nobody can install new
* events, thus the event count values are stable.
*/
- if (cpuctx->ctx.nr_events) {
- if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
- cpu_rotate = true;
- }
-
- ctx = cpuctx->task_ctx;
- if (ctx && ctx->nr_events) {
- if (ctx->nr_events != ctx->nr_active)
- task_rotate = true;
- }
+ cpu_rotate = cpuctx->ctx.rotate_necessary;
+ task_ctx = cpuctx->task_ctx;
+ task_rotate = task_ctx ? task_ctx->rotate_necessary : 0;
if (!(cpu_rotate || task_rotate))
return false;
@@ -3716,7 +3718,7 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
perf_pmu_disable(cpuctx->ctx.pmu);
if (task_rotate)
- task_event = ctx_first_active(ctx);
+ task_event = ctx_first_active(task_ctx);
if (cpu_rotate)
cpu_event = ctx_first_active(&cpuctx->ctx);
@@ -3724,17 +3726,17 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
* As per the order given at ctx_resched() first 'pop' task flexible
* and then, if needed CPU flexible.
*/
- if (task_event || (ctx && cpu_event))
- ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
+ if (task_event || (task_ctx && cpu_event))
+ ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE);
if (cpu_event)
cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
if (task_event)
- rotate_ctx(ctx, task_event);
+ rotate_ctx(task_ctx, task_event);
if (cpu_event)
rotate_ctx(&cpuctx->ctx, cpu_event);
- perf_event_sched_in(cpuctx, ctx, current);
+ perf_event_sched_in(cpuctx, task_ctx, current);
perf_pmu_enable(cpuctx->ctx.pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
--
1.8.3.1