From: Hugh Dickins <hughd(a)google.com>
Subject: mm/khugepaged: fix crashes due to misaccounted holes
Huge tmpfs testing on a shortish file mapped into a pmd-rounded extent hit
shmem_evict_inode()'s WARN_ON(inode->i_blocks) followed by clear_inode()'s
BUG_ON(inode->i_data.nrpages) when the file was later closed and unlinked.
khugepaged's collapse_shmem() was forgetting to update mapping->nrpages on
the rollback path, after it had added holes that it then needed to undo.
There is indeed an irritating asymmetry between shmem_charge(), whose
callers want it to increment nrpages after successfully accounting blocks,
and shmem_uncharge(), which is called when __delete_from_page_cache() has
already decremented nrpages itself: oh well, just add a comment on that to
them both.
And shmem_recalc_inode() is supposed to be called when the accounting is
expected to be in balance (so it can deduce from imbalance that reclaim
discarded some pages): so change shmem_charge() to update nrpages earlier
(though it's rare for the difference to matter at all).
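For reference, shmem_recalc_inode() makes that deduction from the
imbalance; a sketch along the lines of the 4.x sources (not necessarily
the exact tree patched here):

	static void shmem_recalc_inode(struct inode *inode)
	{
		struct shmem_inode_info *info = SHMEM_I(inode);
		long freed;

		/*
		 * Whatever is alloced but no longer swapped out or in the
		 * pagecache must have been reclaimed: release its blocks.
		 */
		freed = info->alloced - info->swapped -
			inode->i_mapping->nrpages;
		if (freed > 0) {
			info->alloced -= freed;
			inode->i_blocks -= freed * BLOCKS_PER_PAGE;
			shmem_inode_unacct_blocks(inode, freed);
		}
	}

which is why nrpages must already reflect the added holes by the time
shmem_charge() calls it.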
Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1811261523450.2275@eggly.anvils
Fixes: 800d8c63b2e98 ("shmem: add huge pages support")
Fixes: f3f0e1d2150b2 ("khugepaged: add support of collapse for tmpfs/shmem pages")
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Cc: Jerome Glisse <jglisse(a)redhat.com>
Cc: Konstantin Khlebnikov <khlebnikov(a)yandex-team.ru>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: <stable(a)vger.kernel.org> [4.8+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/khugepaged.c | 5 ++++-
mm/shmem.c | 6 +++++-
2 files changed, 9 insertions(+), 2 deletions(-)
--- a/mm/khugepaged.c~mm-khugepaged-fix-crashes-due-to-misaccounted-holes
+++ a/mm/khugepaged.c
@@ -1506,9 +1506,12 @@ xa_unlocked:
khugepaged_pages_collapsed++;
} else {
struct page *page;
+
/* Something went wrong: roll back page cache changes */
- shmem_uncharge(mapping->host, nr_none);
xas_lock_irq(&xas);
+ mapping->nrpages -= nr_none;
+ shmem_uncharge(mapping->host, nr_none);
+
xas_set(&xas, start);
xas_for_each(&xas, page, end - 1) {
page = list_first_entry_or_null(&pagelist,
--- a/mm/shmem.c~mm-khugepaged-fix-crashes-due-to-misaccounted-holes
+++ a/mm/shmem.c
@@ -297,12 +297,14 @@ bool shmem_charge(struct inode *inode, l
if (!shmem_inode_acct_block(inode, pages))
return false;
+ /* nrpages adjustment first, then shmem_recalc_inode() when balanced */
+ inode->i_mapping->nrpages += pages;
+
spin_lock_irqsave(&info->lock, flags);
info->alloced += pages;
inode->i_blocks += pages * BLOCKS_PER_PAGE;
shmem_recalc_inode(inode);
spin_unlock_irqrestore(&info->lock, flags);
- inode->i_mapping->nrpages += pages;
return true;
}
@@ -312,6 +314,8 @@ void shmem_uncharge(struct inode *inode,
struct shmem_inode_info *info = SHMEM_I(inode);
unsigned long flags;
+ /* nrpages adjustment done by __delete_from_page_cache() or caller */
+
spin_lock_irqsave(&info->lock, flags);
info->alloced -= pages;
inode->i_blocks -= pages * BLOCKS_PER_PAGE;
_
From: Hugh Dickins <hughd(a)google.com>
Subject: mm/huge_memory: fix lockdep complaint on 32-bit i_size_read()
Huge tmpfs testing, on 32-bit kernel with lockdep enabled, showed that
__split_huge_page() was using i_size_read() while holding the irq-safe
lru_lock and page tree lock, but the 32-bit i_size_read() uses an
irq-unsafe seqlock which should not be nested inside them.
Instead, read the i_size earlier in split_huge_page_to_list(), and pass
the end offset down to __split_huge_page(): all while holding head page
lock, which is enough to prevent truncation of that extent before the page
tree lock has been taken.
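For context, the 32-bit SMP flavour of i_size_read() is a seqcount retry
loop; sketched from include/linux/fs.h (the exact branches vary with
CONFIG_SMP/CONFIG_PREEMPT):

	static inline loff_t i_size_read(const struct inode *inode)
	{
	#if BITS_PER_LONG == 32 && defined(CONFIG_SMP)
		loff_t i_size;
		unsigned int seq;

		do {
			seq = read_seqcount_begin(&inode->i_size_seqcount);
			i_size = inode->i_size;
		} while (read_seqcount_retry(&inode->i_size_seqcount, seq));
		return i_size;
	#else
		return inode->i_size;
	#endif
	}

The write side bumps i_size_seqcount without disabling interrupts, which
is what lockdep objects to when the read side is nested inside irq-safe
locks.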
Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1811261520070.2275@eggly.anvils
Fixes: baa355fd33142 ("thp: file pages support for split_huge_page()")
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Cc: Jerome Glisse <jglisse(a)redhat.com>
Cc: Konstantin Khlebnikov <khlebnikov(a)yandex-team.ru>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: <stable(a)vger.kernel.org> [4.8+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/huge_memory.c | 19 +++++++++++++------
1 file changed, 13 insertions(+), 6 deletions(-)
--- a/mm/huge_memory.c~mm-huge_memory-fix-lockdep-complaint-on-32-bit-i_size_read
+++ a/mm/huge_memory.c
@@ -2439,12 +2439,11 @@ static void __split_huge_page_tail(struc
}
static void __split_huge_page(struct page *page, struct list_head *list,
- unsigned long flags)
+ pgoff_t end, unsigned long flags)
{
struct page *head = compound_head(page);
struct zone *zone = page_zone(head);
struct lruvec *lruvec;
- pgoff_t end = -1;
int i;
lruvec = mem_cgroup_page_lruvec(head, zone->zone_pgdat);
@@ -2452,9 +2451,6 @@ static void __split_huge_page(struct pag
/* complete memcg works before add pages to LRU */
mem_cgroup_split_huge_fixup(head);
- if (!PageAnon(page))
- end = DIV_ROUND_UP(i_size_read(head->mapping->host), PAGE_SIZE);
-
for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
__split_huge_page_tail(head, i, lruvec, list);
/* Some pages can be beyond i_size: drop them from page cache */
@@ -2626,6 +2622,7 @@ int split_huge_page_to_list(struct page
int count, mapcount, extra_pins, ret;
bool mlocked;
unsigned long flags;
+ pgoff_t end;
VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
@@ -2648,6 +2645,7 @@ int split_huge_page_to_list(struct page
ret = -EBUSY;
goto out;
}
+ end = -1;
mapping = NULL;
anon_vma_lock_write(anon_vma);
} else {
@@ -2661,6 +2659,15 @@ int split_huge_page_to_list(struct page
anon_vma = NULL;
i_mmap_lock_read(mapping);
+
+ /*
+ *__split_huge_page() may need to trim off pages beyond EOF:
+ * but on 32-bit, i_size_read() takes an irq-unsafe seqlock,
+ * which cannot be nested inside the page tree lock. So note
+ * end now: i_size itself may be changed at any moment, but
+ * head page lock is good enough to serialize the trimming.
+ */
+ end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
}
/*
@@ -2707,7 +2714,7 @@ int split_huge_page_to_list(struct page
if (mapping)
__dec_node_page_state(page, NR_SHMEM_THPS);
spin_unlock(&pgdata->split_queue_lock);
- __split_huge_page(page, list, flags);
+ __split_huge_page(page, list, end, flags);
if (PageSwapCache(head)) {
swp_entry_t entry = { .val = page_private(head) };
_
From: Hugh Dickins <hughd(a)google.com>
Subject: mm/huge_memory: splitting set mapping+index before unfreeze
Huge tmpfs stress testing has occasionally hit shmem_undo_range()'s
VM_BUG_ON_PAGE(page_to_pgoff(page) != index, page).
Move the setting of mapping and index up before the page_ref_unfreeze() in
__split_huge_page_tail() to fix this: so that a page cache lookup cannot
get a reference while the tail's mapping and index are unstable.
In fact, might as well move them up before the smp_wmb(): I don't see an
actual need for that, but if I'm missing something, this way round is
safer than the other, and no less efficient.
You might argue that VM_BUG_ON_PAGE(page_to_pgoff(page) != index, page) is
misplaced, and should be left until after the trylock_page(); but left as
is, it has not crashed since, and gives more stringent assurance.
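To see the race, a schematic of the lookup side (a simplified sketch
after the pattern of find_get_entry(); speculative_lookup() is a
hypothetical name, and the page cache structure differs across releases):

	static struct page *speculative_lookup(struct address_space *mapping,
					       pgoff_t index)
	{
		struct page *page;

		rcu_read_lock();
		page = radix_tree_lookup(&mapping->i_pages, index);
		/*
		 * The speculative get succeeds the instant
		 * page_ref_unfreeze() restores the refcount, and the
		 * caller then trusts page->mapping and page->index:
		 * so the splitter must set both before unfreezing.
		 */
		if (page && !page_cache_get_speculative(page))
			page = NULL;	/* frozen: the real code retries */
		rcu_read_unlock();
		return page;
	}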
Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1811261516380.2275@eggly.anvils
Fixes: e9b61f19858a5 ("thp: reintroduce split_huge_page()")
Requires: 605ca5ede764 ("mm/huge_memory.c: reorder operations in __split_huge_page_tail()")
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Cc: Konstantin Khlebnikov <khlebnikov(a)yandex-team.ru>
Cc: Jerome Glisse <jglisse(a)redhat.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: <stable(a)vger.kernel.org> [4.8+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/huge_memory.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
--- a/mm/huge_memory.c~mm-huge_memory-splitting-set-mappingindex-before-unfreeze
+++ a/mm/huge_memory.c
@@ -2402,6 +2402,12 @@ static void __split_huge_page_tail(struc
(1L << PG_unevictable) |
(1L << PG_dirty)));
+ /* ->mapping in first tail page is compound_mapcount */
+ VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
+ page_tail);
+ page_tail->mapping = head->mapping;
+ page_tail->index = head->index + tail;
+
/* Page flags must be visible before we make the page non-compound. */
smp_wmb();
@@ -2422,12 +2428,6 @@ static void __split_huge_page_tail(struc
if (page_is_idle(head))
set_page_idle(page_tail);
- /* ->mapping in first tail page is compound_mapcount */
- VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
- page_tail);
- page_tail->mapping = head->mapping;
-
- page_tail->index = head->index + tail;
page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
/*
_
From: Hugh Dickins <hughd(a)google.com>
Subject: mm/huge_memory: rename freeze_page() to unmap_page()
The term "freeze" is used in several ways in the kernel, and in mm it has
the particular meaning of forcing page refcount temporarily to 0.
freeze_page() is just too confusing a name for a function that unmaps a
page: rename it unmap_page(), and rename unfreeze_page() to remap_page().
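(That mm sense of freezing is page_ref_freeze()/page_ref_unfreeze();
roughly, from include/linux/page_ref.h with tracepoint hooks omitted:

	static inline int page_ref_freeze(struct page *page, int count)
	{
		return likely(atomic_cmpxchg(&page->_refcount, count, 0) == count);
	}

	static inline void page_ref_unfreeze(struct page *page, int count)
	{
		VM_BUG_ON_PAGE(page_count(page) != 0, page);
		VM_BUG_ON(count == 0);
		atomic_set_release(&page->_refcount, count);
	}

nothing to do with unmapping.)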
I went to change the mention of freeze_page() in a comment added later in
mm/rmap.c, but found the comment to be incorrect: ordinary page reclaim
reaches there too. The substance of the comment still seems correct, so
edit it down.
Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1811261514080.2275@eggly.anvils
Fixes: e9b61f19858a5 ("thp: reintroduce split_huge_page()")
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Cc: Jerome Glisse <jglisse(a)redhat.com>
Cc: Konstantin Khlebnikov <khlebnikov(a)yandex-team.ru>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: <stable(a)vger.kernel.org> [4.8+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/huge_memory.c | 12 ++++++------
mm/rmap.c | 13 +++----------
2 files changed, 9 insertions(+), 16 deletions(-)
--- a/mm/huge_memory.c~mm-huge_memory-rename-freeze_page-to-unmap_page
+++ a/mm/huge_memory.c
@@ -2350,7 +2350,7 @@ void vma_adjust_trans_huge(struct vm_are
}
}
-static void freeze_page(struct page *page)
+static void unmap_page(struct page *page)
{
enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS |
TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
@@ -2365,7 +2365,7 @@ static void freeze_page(struct page *pag
VM_BUG_ON_PAGE(!unmap_success, page);
}
-static void unfreeze_page(struct page *page)
+static void remap_page(struct page *page)
{
int i;
if (PageTransHuge(page)) {
@@ -2483,7 +2483,7 @@ static void __split_huge_page(struct pag
spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
- unfreeze_page(head);
+ remap_page(head);
for (i = 0; i < HPAGE_PMD_NR; i++) {
struct page *subpage = head + i;
@@ -2664,7 +2664,7 @@ int split_huge_page_to_list(struct page
}
/*
- * Racy check if we can split the page, before freeze_page() will
+ * Racy check if we can split the page, before unmap_page() will
* split PMDs
*/
if (!can_split_huge_page(head, &extra_pins)) {
@@ -2673,7 +2673,7 @@ int split_huge_page_to_list(struct page
}
mlocked = PageMlocked(page);
- freeze_page(head);
+ unmap_page(head);
VM_BUG_ON_PAGE(compound_mapcount(head), head);
/* Make sure the page is not on per-CPU pagevec as it takes pin */
@@ -2727,7 +2727,7 @@ int split_huge_page_to_list(struct page
fail: if (mapping)
xa_unlock(&mapping->i_pages);
spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
- unfreeze_page(head);
+ remap_page(head);
ret = -EBUSY;
}
--- a/mm/rmap.c~mm-huge_memory-rename-freeze_page-to-unmap_page
+++ a/mm/rmap.c
@@ -1627,16 +1627,9 @@ static bool try_to_unmap_one(struct page
address + PAGE_SIZE);
} else {
/*
- * We should not need to notify here as we reach this
- * case only from freeze_page() itself only call from
- * split_huge_page_to_list() so everything below must
- * be true:
- * - page is not anonymous
- * - page is locked
- *
- * So as it is a locked file back page thus it can not
- * be remove from the page cache and replace by a new
- * page before mmu_notifier_invalidate_range_end so no
+ * This is a locked file-backed page, thus it cannot
+ * be removed from the page cache and replaced by a new
+ * page before mmu_notifier_invalidate_range_end, so no
* concurrent thread might update its page table to
* point at new page while a device still is using this
* page.
_
From: Andrea Arcangeli <aarcange(a)redhat.com>
Subject: userfaultfd: shmem/hugetlbfs: only allow to register VM_MAYWRITE vmas
After the VMA to register the uffd onto is found, check that it has
VM_MAYWRITE set before allowing registration. This way we inherit all
common code checks before allowing file holes in shmem and hugetlbfs to be
filled with UFFDIO_COPY.
The userfaultfd memory model is not applicable to readonly files unless
the mapping is MAP_PRIVATE.
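A userspace illustration of the new rule (a hypothetical standalone test,
not part of the patch; assumes a pre-created file on tmpfs):

	#include <fcntl.h>
	#include <linux/userfaultfd.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/dev/shm/testfile", O_RDONLY);
		long uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
		struct uffdio_api api = { .api = UFFD_API };
		struct uffdio_register reg = {
			.range.len = 4096,
			.mode = UFFDIO_REGISTER_MODE_MISSING,
		};
		void *p;

		ioctl(uffd, UFFDIO_API, &api);

		/* MAP_SHARED of a readonly file: no VM_MAYWRITE -> EPERM */
		p = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);
		reg.range.start = (unsigned long)p;
		if (ioctl(uffd, UFFDIO_REGISTER, &reg))
			perror("MAP_SHARED register");	/* expected now */

		/* MAP_PRIVATE keeps VM_MAYWRITE: still registerable */
		p = mmap(NULL, 4096, PROT_READ, MAP_PRIVATE, fd, 0);
		reg.range.start = (unsigned long)p;
		if (ioctl(uffd, UFFDIO_REGISTER, &reg) == 0)
			printf("MAP_PRIVATE register OK\n");
		return 0;
	}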
Link: http://lkml.kernel.org/r/20181126173452.26955-4-aarcange@redhat.com
Fixes: ff62a3421044 ("hugetlb: implement memfd sealing")
Signed-off-by: Andrea Arcangeli <aarcange(a)redhat.com>
Reviewed-by: Mike Rapoport <rppt(a)linux.ibm.com>
Reviewed-by: Hugh Dickins <hughd(a)google.com>
Reported-by: Jann Horn <jannh(a)google.com>
Fixes: 4c27fe4c4c84 ("userfaultfd: shmem: add shmem_mcopy_atomic_pte for userfaultfd support")
Cc: <stable(a)vger.kernel.org>
Cc: "Dr. David Alan Gilbert" <dgilbert(a)redhat.com>
Cc: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Peter Xu <peterx(a)redhat.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/userfaultfd.c | 15 +++++++++++++++
mm/userfaultfd.c | 15 ++++++---------
2 files changed, 21 insertions(+), 9 deletions(-)
--- a/fs/userfaultfd.c~userfaultfd-shmem-hugetlbfs-only-allow-to-register-vm_maywrite-vmas
+++ a/fs/userfaultfd.c
@@ -1361,6 +1361,19 @@ static int userfaultfd_register(struct u
ret = -EINVAL;
if (!vma_can_userfault(cur))
goto out_unlock;
+
+ /*
+ * UFFDIO_COPY will fill file holes even without
+ * PROT_WRITE. This check enforces that if this is a
+ * MAP_SHARED, the process has write permission to the backing
+ * file. If VM_MAYWRITE is set it also enforces that on a
+ * MAP_SHARED vma: there is no F_WRITE_SEAL and no further
+ * F_WRITE_SEAL can be taken until the vma is destroyed.
+ */
+ ret = -EPERM;
+ if (unlikely(!(cur->vm_flags & VM_MAYWRITE)))
+ goto out_unlock;
+
/*
* If this vma contains ending address, and huge pages
* check alignment.
@@ -1406,6 +1419,7 @@ static int userfaultfd_register(struct u
BUG_ON(!vma_can_userfault(vma));
BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
vma->vm_userfaultfd_ctx.ctx != ctx);
+ WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
/*
* Nothing to do: this vma is already registered into this
@@ -1552,6 +1566,7 @@ static int userfaultfd_unregister(struct
cond_resched();
BUG_ON(!vma_can_userfault(vma));
+ WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
/*
* Nothing to do: this vma is already registered into this
--- a/mm/userfaultfd.c~userfaultfd-shmem-hugetlbfs-only-allow-to-register-vm_maywrite-vmas
+++ a/mm/userfaultfd.c
@@ -205,8 +205,9 @@ retry:
if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
goto out_unlock;
/*
- * Only allow __mcopy_atomic_hugetlb on userfaultfd
- * registered ranges.
+ * Check the vma is registered in uffd, this is
+ * required to enforce the VM_MAYWRITE check done at
+ * uffd registration time.
*/
if (!dst_vma->vm_userfaultfd_ctx.ctx)
goto out_unlock;
@@ -459,13 +460,9 @@ retry:
if (!dst_vma)
goto out_unlock;
/*
- * Be strict and only allow __mcopy_atomic on userfaultfd
- * registered ranges to prevent userland errors going
- * unnoticed. As far as the VM consistency is concerned, it
- * would be perfectly safe to remove this check, but there's
- * no useful usage for __mcopy_atomic ouside of userfaultfd
- * registered ranges. This is after all why these are ioctls
- * belonging to the userfaultfd and not syscalls.
+ * Check the vma is registered in uffd, this is required to
+ * enforce the VM_MAYWRITE check done at uffd registration
+ * time.
*/
if (!dst_vma->vm_userfaultfd_ctx.ctx)
goto out_unlock;
_
From: Andrea Arcangeli <aarcange(a)redhat.com>
Subject: userfaultfd: shmem: allocate anonymous memory for MAP_PRIVATE shmem
Userfaultfd did not create private memory when UFFDIO_COPY was invoked on
a MAP_PRIVATE shmem mapping. Instead it wrote to the shmem file, even
when that had not been opened for writing. Though, fortunately, that
could only happen where there was a hole in the file.
Fix the shmem-backed implementation of UFFDIO_COPY to create private
memory for MAP_PRIVATE mappings. The hugetlbfs-backed implementation was
already correct.
This change is visible to userland, if userfaultfd has been used in
unintended ways: so it introduces a small risk of incompatibility, but is
necessary in order to respect file permissions.
An app that uses UFFDIO_COPY for anything like postcopy live migration
won't notice the difference, and in fact it'll run faster because there
will be no copy-on-write and memory waste in the tmpfs pagecache anymore.
Userfaults on MAP_PRIVATE shmem keep triggering only on file holes like
before.
The real zeropage can also be built on a MAP_PRIVATE shmem mapping through
UFFDIO_ZEROPAGE, and that's safe because the zeropage pte is never dirty,
so even an mprotect upgrading the vma permission from PROT_READ to
PROT_READ|PROT_WRITE won't make the zeropage pte writable.
Link: http://lkml.kernel.org/r/20181126173452.26955-3-aarcange@redhat.com
Fixes: 4c27fe4c4c84 ("userfaultfd: shmem: add shmem_mcopy_atomic_pte for userfaultfd support")
Signed-off-by: Andrea Arcangeli <aarcange(a)redhat.com>
Reported-by: Mike Rapoport <rppt(a)linux.ibm.com>
Reviewed-by: Hugh Dickins <hughd(a)google.com>
Cc: <stable(a)vger.kernel.org>
Cc: "Dr. David Alan Gilbert" <dgilbert(a)redhat.com>
Cc: Jann Horn <jannh(a)google.com>
Cc: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Peter Xu <peterx(a)redhat.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/userfaultfd.c | 15 +++++++++++++--
1 file changed, 13 insertions(+), 2 deletions(-)
--- a/mm/userfaultfd.c~userfaultfd-shmem-allocate-anonymous-memory-for-map_private-shmem
+++ a/mm/userfaultfd.c
@@ -380,7 +380,17 @@ static __always_inline ssize_t mfill_ato
{
ssize_t err;
- if (vma_is_anonymous(dst_vma)) {
+ /*
+ * The normal page fault path for a shmem will invoke the
+ * fault, fill the hole in the file and COW it right away. The
+ * result generates plain anonymous memory. So when we are
+ * asked to fill an hole in a MAP_PRIVATE shmem mapping, we'll
+ * generate anonymous memory directly without actually filling
+ * the hole. For the MAP_PRIVATE case the robustness check
+ * only happens in the pagetable (to verify it's still none)
+ * and not in the radix tree.
+ */
+ if (!(dst_vma->vm_flags & VM_SHARED)) {
if (!zeropage)
err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
dst_addr, src_addr, page);
@@ -489,7 +499,8 @@ retry:
* dst_vma.
*/
err = -ENOMEM;
- if (vma_is_anonymous(dst_vma) && unlikely(anon_vma_prepare(dst_vma)))
+ if (!(dst_vma->vm_flags & VM_SHARED) &&
+ unlikely(anon_vma_prepare(dst_vma)))
goto out_unlock;
while (src_addr < src_start + len) {
_
From: Andrea Arcangeli <aarcange(a)redhat.com>
Subject: userfaultfd: use ENOENT instead of EFAULT if the atomic copy user fails
Patch series "userfaultfd shmem updates".
Jann found two bugs in the userfaultfd shmem MAP_SHARED backend: the lack
of the VM_MAYWRITE check and the lack of i_size checks.
Then looking into the above we also fixed the MAP_PRIVATE case.
By source review, Hugh also found a source of data loss when UFFDIO_COPY
is used on shmem MAP_SHARED PROT_READ mappings (production usages such as
QEMU incidentally run with PROT_READ|PROT_WRITE, so the data loss couldn't
happen there).
The whole patchset is marked for stable.
We verified that QEMU postcopy live migration, with the guest running on
MAP_PRIVATE shmem, runs as well as before after the MAP_PRIVATE shmem fix.
Regardless of whether it's shmem or hugetlbfs, MAP_PRIVATE or MAP_SHARED,
QEMU unconditionally invokes a hole punch if the guest mapping is
file-backed, and a MADV_DONTNEED too (needed to get rid of the MAP_PRIVATE
COWs and for the anon backend).
This patch (of 5):
We internally used EFAULT to communicate with the caller; switch to
ENOENT, so that EFAULT can be used as a non-internal retval.
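The resulting pattern in __mcopy_atomic(), condensed from mm/userfaultfd.c:
-ENOENT from the pte helper now means "page allocated, redo the user copy
outside mmap_sem", while EFAULT is left to mean a genuine fault:

	err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
			       src_addr, &page, zeropage);
	cond_resched();

	if (unlikely(err == -ENOENT)) {
		void *page_kaddr;

		up_read(&dst_mm->mmap_sem);
		BUG_ON(!page);

		page_kaddr = kmap(page);
		err = copy_from_user(page_kaddr,
				     (const void __user *) src_addr,
				     PAGE_SIZE);
		kunmap(page);
		if (unlikely(err)) {
			err = -EFAULT;	/* now unambiguous */
			goto out;
		}
		goto retry;		/* *pagep carries the page back in */
	}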
Link: http://lkml.kernel.org/r/20181126173452.26955-2-aarcange@redhat.com
Fixes: 4c27fe4c4c84 ("userfaultfd: shmem: add shmem_mcopy_atomic_pte for userfaultfd support")
Signed-off-by: Andrea Arcangeli <aarcange(a)redhat.com>
Reviewed-by: Mike Rapoport <rppt(a)linux.ibm.com>
Reviewed-by: Hugh Dickins <hughd(a)google.com>
Cc: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Jann Horn <jannh(a)google.com>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: "Dr. David Alan Gilbert" <dgilbert(a)redhat.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/hugetlb.c | 2 +-
mm/shmem.c | 2 +-
mm/userfaultfd.c | 6 +++---
3 files changed, 5 insertions(+), 5 deletions(-)
--- a/mm/hugetlb.c~userfaultfd-use-enoent-instead-of-efault-if-the-atomic-copy-user-fails
+++ a/mm/hugetlb.c
@@ -4080,7 +4080,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_s
/* fallback to copy_from_user outside mmap_sem */
if (unlikely(ret)) {
- ret = -EFAULT;
+ ret = -ENOENT;
*pagep = page;
/* don't free the page */
goto out;
--- a/mm/shmem.c~userfaultfd-use-enoent-instead-of-efault-if-the-atomic-copy-user-fails
+++ a/mm/shmem.c
@@ -2238,7 +2238,7 @@ static int shmem_mfill_atomic_pte(struct
*pagep = page;
shmem_inode_unacct_blocks(inode, 1);
/* don't free the page */
- return -EFAULT;
+ return -ENOENT;
}
} else { /* mfill_zeropage_atomic */
clear_highpage(page);
--- a/mm/userfaultfd.c~userfaultfd-use-enoent-instead-of-efault-if-the-atomic-copy-user-fails
+++ a/mm/userfaultfd.c
@@ -48,7 +48,7 @@ static int mcopy_atomic_pte(struct mm_st
/* fallback to copy_from_user outside mmap_sem */
if (unlikely(ret)) {
- ret = -EFAULT;
+ ret = -ENOENT;
*pagep = page;
/* don't free the page */
goto out;
@@ -274,7 +274,7 @@ retry:
cond_resched();
- if (unlikely(err == -EFAULT)) {
+ if (unlikely(err == -ENOENT)) {
up_read(&dst_mm->mmap_sem);
BUG_ON(!page);
@@ -530,7 +530,7 @@ retry:
src_addr, &page, zeropage);
cond_resched();
- if (unlikely(err == -EFAULT)) {
+ if (unlikely(err == -ENOENT)) {
void *page_kaddr;
up_read(&dst_mm->mmap_sem);
_
From: Yu Zhao <yuzhao(a)google.com>
Subject: mm: use swp_offset as key in shmem_replace_page()
We changed the key of the swap cache tree from swp_entry_t.val to
swp_offset. We need to do the same in shmem_replace_page().
Hugh said:
: shmem_replace_page() has been wrong since the day I wrote it: good
: enough to work on swap "type" 0, which is all most people ever use
: (especially those few who need shmem_replace_page() at all), but broken
: once there are any non-0 swp_type bits set in the higher order bits.
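That is, swp_entry_t.val encodes both the swap device and the offset, and
only the offset is the cache key; a fragment using the swapops.h helpers:

	swp_entry_t entry = { .val = page_private(oldpage) };
	unsigned int type = swp_type(entry);	/* which swap device */
	pgoff_t offset = swp_offset(entry);	/* key in the swap cache */

so indexing by entry.val only happens to work while type == 0.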
Link: http://lkml.kernel.org/r/20181121215442.138545-1-yuzhao@google.com
Fixes: f6ab1f7f6b2d ("mm, swap: use offset of swap entry as key of swap cache")
Signed-off-by: Yu Zhao <yuzhao(a)google.com>
Reviewed-by: Matthew Wilcox <willy(a)infradead.org>
Acked-by: Hugh Dickins <hughd(a)google.com>
Cc: <stable(a)vger.kernel.org> [4.9+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/shmem.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
--- a/mm/shmem.c~mm-use-swp_offset-as-key-in-shmem_replace_page
+++ a/mm/shmem.c
@@ -1509,11 +1509,13 @@ static int shmem_replace_page(struct pag
{
struct page *oldpage, *newpage;
struct address_space *swap_mapping;
+ swp_entry_t entry;
pgoff_t swap_index;
int error;
oldpage = *pagep;
- swap_index = page_private(oldpage);
+ entry.val = page_private(oldpage);
+ swap_index = swp_offset(entry);
swap_mapping = page_mapping(oldpage);
/*
@@ -1532,7 +1534,7 @@ static int shmem_replace_page(struct pag
__SetPageLocked(newpage);
__SetPageSwapBacked(newpage);
SetPageUptodate(newpage);
- set_page_private(newpage, swap_index);
+ set_page_private(newpage, entry.val);
SetPageSwapCache(newpage);
/*
_
From: Pavel Tikhomirov <ptikhomirov(a)virtuozzo.com>
Subject: mm: cleancache: fix corruption on missed inode invalidation
If all pages are deleted from the mapping by memory reclaim and also
moved to the cleancache:
__delete_from_page_cache() (no shadow case)
  unaccount_page_cache_page()
    cleancache_put_page()
  page_cache_delete()
    mapping->nrpages -= nr
    (nrpages becomes 0)
We don't clean the cleancache for an inode after final file truncation
(removal).
truncate_inode_pages_final()
  check (nrpages || nrexceptional) is false
  no truncate_inode_pages()
  no cleancache_invalidate_inode(mapping)
This way, when reading a new file created with the same inode, we may get
these stale leftover pages from cleancache and see wrong data instead of
the contents of the new file.
Fix it by always calling truncate_inode_pages(), which already handles the
nrpages == 0 && nrexceptional == 0 case and just invalidates the inode.
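That works because truncate_inode_pages_range() already short-circuits to
its exit path, which does the cleancache invalidation; roughly, from
mm/truncate.c:

	void truncate_inode_pages_range(struct address_space *mapping,
					loff_t lstart, loff_t lend)
	{
		...
		if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
			goto out;	/* nothing to scan, but still: */
		...
	out:
		cleancache_invalidate_inode(mapping);
	}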
[akpm(a)linux-foundation.org: add comment, per Jan]
Link: http://lkml.kernel.org/r/20181112095734.17979-1-ptikhomirov@virtuozzo.com
Fixes: 91b0abe36a7b ("mm + fs: store shadow entries in page cache")
Signed-off-by: Pavel Tikhomirov <ptikhomirov(a)virtuozzo.com>
Reviewed-by: Vasily Averin <vvs(a)virtuozzo.com>
Reviewed-by: Andrey Ryabinin <aryabinin(a)virtuozzo.com>
Reviewed-by: Jan Kara <jack(a)suse.cz>
Cc: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Mel Gorman <mgorman(a)techsingularity.net>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Andi Kleen <ak(a)linux.intel.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/truncate.c | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
--- a/mm/truncate.c~mm-cleancache-fix-corruption-on-missed-inode-invalidation
+++ a/mm/truncate.c
@@ -517,9 +517,13 @@ void truncate_inode_pages_final(struct a
*/
xa_lock_irq(&mapping->i_pages);
xa_unlock_irq(&mapping->i_pages);
-
- truncate_inode_pages(mapping, 0);
}
+
+ /*
+ * Cleancache needs notification even if there are no pages or shadow
+ * entries.
+ */
+ truncate_inode_pages(mapping, 0);
}
EXPORT_SYMBOL(truncate_inode_pages_final);
_
From: Kevin Hilman <khilman(a)baylibre.com>
Change the default defconfig (used with 'make defconfig') to the ARCv2
nsim_hs_defconfig, and also switch the default Kconfig ISA selection to
ARCv2.
This allows several default defconfigs (e.g. make defconfig, make
allnoconfig, make tinyconfig) to all work with ARCv2 by default.
Note that since we change the default architecture from ARCompact to
ARCv2, the ARCompact defconfigs must now explicitly specify the
architecture type; otherwise ARCv2 will be implied and binaries will be
generated for ARCv2.
Signed-off-by: Kevin Hilman <khilman(a)baylibre.com>
Signed-off-by: Alexey Brodkin <abrodkin(a)synopsys.com>
Cc: <stable(a)vger.kernel.org> # 4.4.x
---
Changes v1 -> v2:
* Added CONFIG_ISA_ARCOMPACT to ARCompact defconfigs
arch/arc/Kconfig | 2 +-
arch/arc/Makefile | 2 +-
arch/arc/configs/axs101_defconfig | 1 +
arch/arc/configs/nps_defconfig | 1 +
arch/arc/configs/nsim_700_defconfig | 1 +
arch/arc/configs/nsimosci_defconfig | 1 +
arch/arc/configs/tb10x_defconfig | 1 +
7 files changed, 7 insertions(+), 2 deletions(-)
diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index 5fcbda6b37cc..6dd783557330 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -109,7 +109,7 @@ endmenu
choice
prompt "ARC Instruction Set"
- default ISA_ARCOMPACT
+ default ISA_ARCV2
config ISA_ARCOMPACT
bool "ARCompact ISA"
diff --git a/arch/arc/Makefile b/arch/arc/Makefile
index c64c505d966c..df00578c279d 100644
--- a/arch/arc/Makefile
+++ b/arch/arc/Makefile
@@ -6,7 +6,7 @@
# published by the Free Software Foundation.
#
-KBUILD_DEFCONFIG := nsim_700_defconfig
+KBUILD_DEFCONFIG := nsim_hs_defconfig
cflags-y += -fno-common -pipe -fno-builtin -mmedium-calls -D__linux__
cflags-$(CONFIG_ISA_ARCOMPACT) += -mA7
diff --git a/arch/arc/configs/axs101_defconfig b/arch/arc/configs/axs101_defconfig
index 8c23bd086cd0..020d4493edfd 100644
--- a/arch/arc/configs/axs101_defconfig
+++ b/arch/arc/configs/axs101_defconfig
@@ -14,6 +14,7 @@ CONFIG_PERF_EVENTS=y
# CONFIG_VM_EVENT_COUNTERS is not set
# CONFIG_SLUB_DEBUG is not set
# CONFIG_COMPAT_BRK is not set
+CONFIG_ISA_ARCOMPACT=y
CONFIG_MODULES=y
CONFIG_MODULE_FORCE_LOAD=y
CONFIG_MODULE_UNLOAD=y
diff --git a/arch/arc/configs/nps_defconfig b/arch/arc/configs/nps_defconfig
index ae7a0d8be98d..6e84060e7c90 100644
--- a/arch/arc/configs/nps_defconfig
+++ b/arch/arc/configs/nps_defconfig
@@ -15,6 +15,7 @@ CONFIG_SYSCTL_SYSCALL=y
CONFIG_EMBEDDED=y
CONFIG_PERF_EVENTS=y
# CONFIG_COMPAT_BRK is not set
+CONFIG_ISA_ARCOMPACT=y
CONFIG_KPROBES=y
CONFIG_MODULES=y
CONFIG_MODULE_FORCE_LOAD=y
diff --git a/arch/arc/configs/nsim_700_defconfig b/arch/arc/configs/nsim_700_defconfig
index 8e0b8b134cd9..219c2a65294b 100644
--- a/arch/arc/configs/nsim_700_defconfig
+++ b/arch/arc/configs/nsim_700_defconfig
@@ -15,6 +15,7 @@ CONFIG_EMBEDDED=y
CONFIG_PERF_EVENTS=y
# CONFIG_SLUB_DEBUG is not set
# CONFIG_COMPAT_BRK is not set
+CONFIG_ISA_ARCOMPACT=y
CONFIG_KPROBES=y
CONFIG_MODULES=y
# CONFIG_LBDAF is not set
diff --git a/arch/arc/configs/nsimosci_defconfig b/arch/arc/configs/nsimosci_defconfig
index ad77f20e5aa6..35dfc6491a09 100644
--- a/arch/arc/configs/nsimosci_defconfig
+++ b/arch/arc/configs/nsimosci_defconfig
@@ -15,6 +15,7 @@ CONFIG_EMBEDDED=y
CONFIG_PERF_EVENTS=y
# CONFIG_SLUB_DEBUG is not set
# CONFIG_COMPAT_BRK is not set
+CONFIG_ISA_ARCOMPACT=y
CONFIG_KPROBES=y
CONFIG_MODULES=y
# CONFIG_LBDAF is not set
diff --git a/arch/arc/configs/tb10x_defconfig b/arch/arc/configs/tb10x_defconfig
index a7f65313f84a..e71ade3cf9c8 100644
--- a/arch/arc/configs/tb10x_defconfig
+++ b/arch/arc/configs/tb10x_defconfig
@@ -19,6 +19,7 @@ CONFIG_KALLSYMS_ALL=y
# CONFIG_AIO is not set
CONFIG_EMBEDDED=y
# CONFIG_COMPAT_BRK is not set
+CONFIG_ISA_ARCOMPACT=y
CONFIG_SLAB=y
CONFIG_MODULES=y
CONFIG_MODULE_FORCE_LOAD=y
--
2.16.2
From: Josef Bacik <jbacik(a)fb.com>
With my delayed refs patches in place we started seeing a large number
of aborts in __btrfs_free_extent:
BTRFS error (device sdb1): unable to find ref byte nr 91947008 parent 0 root 35964 owner 1 offset 0
Call Trace:
? btrfs_merge_delayed_refs+0xaf/0x340
__btrfs_run_delayed_refs+0x6ea/0xfc0
? btrfs_set_path_blocking+0x31/0x60
btrfs_run_delayed_refs+0xeb/0x180
btrfs_commit_transaction+0x179/0x7f0
? btrfs_check_space_for_delayed_refs+0x30/0x50
? should_end_transaction.isra.19+0xe/0x40
btrfs_drop_snapshot+0x41c/0x7c0
btrfs_clean_one_deleted_snapshot+0xb5/0xd0
cleaner_kthread+0xf6/0x120
kthread+0xf8/0x130
? btree_invalidatepage+0x90/0x90
? kthread_bind+0x10/0x10
ret_from_fork+0x35/0x40
This was because btrfs_drop_snapshot depends on the root not being modified
while it's dropping the snapshot. It will unlock the root node (and really
every node) as it walks down the tree, only to re-lock it when it needs to do
something. This is a problem because if we modify the tree we could COW a block
in our path, which frees our reference to that block. Then once we get back to
that shared block we'll free our reference to it again, and get ENOENT when
trying to look up our extent reference to that block in __btrfs_free_extent.
This is ultimately happening because we have delayed items left to be processed
for our deleted snapshot _after_ all of the inodes are closed for the snapshot.
We only run the delayed inode item if we're deleting the inode, and even then we
do not run the delayed insertions or delayed removals. These can be run at any
point after our final inode does its last iput, which is what triggers the
snapshot deletion. We can end up with the snapshot deletion happening and then
have the delayed items run on that file system, resulting in the above problem.
This problem has existed forever; however, my patches made it much easier to
hit, as I wake up the cleaner much more often to deal with delayed iputs, which made
us more likely to start the snapshot dropping work before the transaction
commits, which is when the delayed items would generally be run. Before,
generally speaking, we would run the delayed items, commit the transaction, and
wakeup the cleaner thread to start deleting snapshots, which means we were less
likely to hit this problem. You could still hit it if you had multiple
snapshots to be deleted and ended up with lots of delayed items, but it was
definitely harder.
Fix for now by simply running all the delayed items before starting to drop the
snapshot. We could make this smarter in the future by making the delayed items
per-root, and then simply drop any delayed items for roots that we are going to
delete. But for now just a quick and easy solution is the safest.
Cc: stable(a)vger.kernel.org
Signed-off-by: Josef Bacik <josef(a)toxicpanda.com>
---
fs/btrfs/extent-tree.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index dcb699dd57f3..965702034b22 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -9330,6 +9330,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
goto out_free;
}
+ btrfs_run_delayed_items(trans);
+
if (block_rsv)
trans->block_rsv = block_rsv;
--
2.14.3
Hello,
We ran automated tests on a recent commit from this kernel tree:
Kernel repo: git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git
Commit: 048192fdfd84 Linux 4.19.6-rc2
The results of these automated tests are provided below.
Overall result: PASSED
Patch merge: OK
Compile: OK
Kernel tests: OK
Please reply to this email if you have any questions about the tests that we
ran or if you have any suggestions on how to make future tests more effective.
,-. ,-.
( C ) ( K ) Continuous
`-',-.`-' Kernel
( I ) Integration
`-'
______________________________________________________________________________
Compile testing
---------------
We compiled the kernel for 2 architectures:
x86_64:
make options: make INSTALL_MOD_STRIP=1 -j56 targz-pkg
configuration: https://artifacts.cki-project.org/builds/x86_64/048192fdfd8468e130937cd697f…
aarch64:
make options: make INSTALL_MOD_STRIP=1 -j56 targz-pkg
configuration: https://artifacts.cki-project.org/builds/aarch64/048192fdfd8468e130937cd697…
Hardware testing
----------------
We booted each kernel and ran the following tests:
x86_64:
/distribution/kpkginstall (boot test)
LTP lite - release 20180515
xfstests: ext4
xfstests: xfs
/kernel/misc/amtu
arm64:
/distribution/kpkginstall (boot test)
LTP lite - release 20180515
xfstests: ext4
xfstests: xfs
/kernel/misc/amtu
From: Adrian Hunter <adrian.hunter(a)intel.com>
thread__resolve() is used in the sample_addr_correlates_sym() cases
where 'addr' is a destination of a branch which does not necessarily
have the same cpumode as the 'ip'. Use the fallback function in that
case.
This patch depends on patch "perf tools: Add fallback functions for
cases where cpumode is insufficient".
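That dependency patch is not included in this extract; the fallback
presumably has roughly this shape (a sketch with assumed helper names,
not the actual implementation):

	static inline struct map *thread__find_map_fb(struct thread *thread,
						      u8 cpumode, u64 addr,
						      struct addr_location *al)
	{
		struct map *map = thread__find_map(thread, cpumode, addr, al);
		struct machine *machine = thread->mg->machine;

		/*
		 * On single-address-space arches, retry with the cpumode
		 * implied by the address itself (assumed helper).
		 */
		if (!map && machine->single_address_space)
			map = thread__find_map(thread,
					       machine__addr_cpumode(machine, addr),
					       addr, al);
		return map;
	}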
Signed-off-by: Adrian Hunter <adrian.hunter(a)intel.com>
Cc: Andi Kleen <ak(a)linux.intel.com>
Cc: David S. Miller <davem(a)davemloft.net>
Cc: Jiri Olsa <jolsa(a)redhat.com>
Cc: Leo Yan <leo.yan(a)linaro.org>
Cc: Mathieu Poirier <mathieu.poirier(a)linaro.org>
Cc: stable(a)vger.kernel.org
Link: http://lkml.kernel.org/r/20181106210712.12098-3-adrian.hunter@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme(a)redhat.com>
---
tools/perf/util/event.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index 9431b20c1337..24493200cf80 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -1706,7 +1706,7 @@ bool sample_addr_correlates_sym(struct perf_event_attr *attr)
void thread__resolve(struct thread *thread, struct addr_location *al,
struct perf_sample *sample)
{
- thread__find_map(thread, sample->cpumode, sample->addr, al);
+ thread__find_map_fb(thread, sample->cpumode, sample->addr, al);
al->cpu = sample->cpu;
al->sym = NULL;
--
2.19.1
From: Adrian Hunter <adrian.hunter(a)intel.com>
Some architectures have a single address space for kernel and user
addresses, which makes it possible to determine if an address is in
kernel space or user space. Some don't, e.g.: sparc.
Cache that info in perf_env so that, for instance, on single address space
arches, code can fall back from a failed kernel-space symbol lookup to a
userspace lookup.
Signed-off-by: Adrian Hunter <adrian.hunter(a)intel.com>
Cc: Andi Kleen <ak(a)linux.intel.com>
Cc: David S. Miller <davem(a)davemloft.net>
Cc: Jiri Olsa <jolsa(a)redhat.com>
Cc: Leo Yan <leo.yan(a)linaro.org>
Cc: Mathieu Poirier <mathieu.poirier(a)linaro.org>
Cc: stable(a)vger.kernel.org
Link: http://lkml.kernel.org/r/20181106210712.12098-2-adrian.hunter@intel.com
[ split from a larger patch ]
Signed-off-by: Arnaldo Carvalho de Melo <acme(a)redhat.com>
---
tools/perf/arch/common.c | 10 ++++++++++
tools/perf/arch/common.h | 1 +
tools/perf/util/machine.h | 1 +
tools/perf/util/session.c | 4 ++++
4 files changed, 16 insertions(+)
diff --git a/tools/perf/arch/common.c b/tools/perf/arch/common.c
index 82657c01a3b8..5f69fd0b745a 100644
--- a/tools/perf/arch/common.c
+++ b/tools/perf/arch/common.c
@@ -200,3 +200,13 @@ int perf_env__lookup_objdump(struct perf_env *env, const char **path)
return perf_env__lookup_binutils_path(env, "objdump", path);
}
+
+/*
+ * Some architectures have a single address space for kernel and user addresses,
+ * which makes it possible to determine if an address is in kernel space or user
+ * space.
+ */
+bool perf_env__single_address_space(struct perf_env *env)
+{
+ return strcmp(perf_env__arch(env), "sparc");
+}
diff --git a/tools/perf/arch/common.h b/tools/perf/arch/common.h
index 2167001b18c5..c298a446d1f6 100644
--- a/tools/perf/arch/common.h
+++ b/tools/perf/arch/common.h
@@ -5,5 +5,6 @@
#include "../util/env.h"
int perf_env__lookup_objdump(struct perf_env *env, const char **path);
+bool perf_env__single_address_space(struct perf_env *env);
#endif /* ARCH_PERF_COMMON_H */
diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h
index d856b85862e2..ca897a73014c 100644
--- a/tools/perf/util/machine.h
+++ b/tools/perf/util/machine.h
@@ -42,6 +42,7 @@ struct machine {
u16 id_hdr_size;
bool comm_exec;
bool kptr_restrict_warned;
+ bool single_address_space;
char *root_dir;
char *mmap_name;
struct threads threads[THREADS__TABLE_SIZE];
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 7d2c8ce6cfad..f8eab197f35c 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -24,6 +24,7 @@
#include "thread.h"
#include "thread-stack.h"
#include "stat.h"
+#include "arch/common.h"
static int perf_session__deliver_event(struct perf_session *session,
union perf_event *event,
@@ -150,6 +151,9 @@ struct perf_session *perf_session__new(struct perf_data *data,
session->machines.host.env = &perf_env;
}
+ session->machines.host.single_address_space =
+ perf_env__single_address_space(session->machines.host.env);
+
if (!data || perf_data__is_write(data)) {
/*
* In O_RDONLY mode this will be performed when reading the
--
2.19.1
From: Arnaldo Carvalho de Melo <acme(a)redhat.com>
We'll set a new machine field based on env->arch, which for live mode
(like with 'perf top') means we need to use uname() to figure out the name
of the arch. So fix perf_env__arch() to consider both (env == NULL) and
(env->arch == NULL) as local operation.
Cc: Adrian Hunter <adrian.hunter(a)intel.com>
Cc: Andi Kleen <ak(a)linux.intel.com>
Cc: David Ahern <dsahern(a)gmail.com>
Cc: David S. Miller <davem(a)davemloft.net>
Cc: Jiri Olsa <jolsa(a)redhat.com>
Cc: Leo Yan <leo.yan(a)linaro.org>
Cc: Mathieu Poirier <mathieu.poirier(a)linaro.org>
Cc: Namhyung Kim <namhyung(a)kernel.org>
Cc: Wang Nan <wangnan0(a)huawei.com>
Cc: stable(a)vger.kernel.org
Link: https://lkml.kernel.org/n/tip-vcz4ufzdon7cwy8dm2ua53xk@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme(a)redhat.com>
---
tools/perf/util/env.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c
index 59f38c7693f8..4c23779e271a 100644
--- a/tools/perf/util/env.c
+++ b/tools/perf/util/env.c
@@ -166,7 +166,7 @@ const char *perf_env__arch(struct perf_env *env)
struct utsname uts;
char *arch_name;
- if (!env) { /* Assume local operation */
+ if (!env || !env->arch) { /* Assume local operation */
if (uname(&uts) < 0)
return NULL;
arch_name = uts.machine;
--
2.19.1