From: Lino Sanfilippo <l.sanfilippo(a)kunbus.com>
The following sequence of operations results in a refcount warning:
1. Open device /dev/tpmrm
2. Remove module tpm_tis_spi
3. Write a TPM command to the file descriptor opened at step 1.
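For reference, a minimal userspace sketch of such a reproducer (hypothetical
and untested; the device node name and the command blob are assumptions, a
TPM2_GetRandom command is used as an arbitrary example):

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		/* TPM2_GetRandom(8): TPM_ST_NO_SESSIONS, size 12, CC 0x17b */
		static const unsigned char cmd[] = {
			0x80, 0x01,		/* tag: TPM_ST_NO_SESSIONS */
			0x00, 0x00, 0x00, 0x0c,	/* commandSize = 12 */
			0x00, 0x00, 0x01, 0x7b,	/* TPM_CC_GetRandom */
			0x00, 0x08,		/* bytesRequested = 8 */
		};
		int fd = open("/dev/tpmrm0", O_RDWR);	/* step 1 */

		if (fd < 0) {
			perror("open");
			return 1;
		}
		/* step 2: now remove the module from another shell */
		puts("rmmod tpm_tis_spi, then press enter");
		getchar();

		if (write(fd, cmd, sizeof(cmd)) < 0)	/* step 3 */
			perror("write");
		close(fd);
		return 0;
	}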
------------[ cut here ]------------
WARNING: CPU: 3 PID: 1161 at lib/refcount.c:25 kobject_get+0xa0/0xa4
refcount_t: addition on 0; use-after-free.
Modules linked in: tpm_tis_spi tpm_tis_core tpm mdio_bcm_unimac brcmfmac
sha256_generic libsha256 sha256_arm hci_uart btbcm bluetooth cfg80211 vc4
brcmutil ecdh_generic ecc snd_soc_core crc32_arm_ce libaes
raspberrypi_hwmon ac97_bus snd_pcm_dmaengine bcm2711_thermal snd_pcm
snd_timer genet snd phy_generic soundcore [last unloaded: spi_bcm2835]
CPU: 3 PID: 1161 Comm: hold_open Not tainted 5.10.0ls-main-dirty #2
Hardware name: BCM2711
[<c0410c3c>] (unwind_backtrace) from [<c040b580>] (show_stack+0x10/0x14)
[<c040b580>] (show_stack) from [<c1092174>] (dump_stack+0xc4/0xd8)
[<c1092174>] (dump_stack) from [<c0445a30>] (__warn+0x104/0x108)
[<c0445a30>] (__warn) from [<c0445aa8>] (warn_slowpath_fmt+0x74/0xb8)
[<c0445aa8>] (warn_slowpath_fmt) from [<c08435d0>] (kobject_get+0xa0/0xa4)
[<c08435d0>] (kobject_get) from [<bf0a715c>] (tpm_try_get_ops+0x14/0x54 [tpm])
[<bf0a715c>] (tpm_try_get_ops [tpm]) from [<bf0a7d6c>] (tpm_common_write+0x38/0x60 [tpm])
[<bf0a7d6c>] (tpm_common_write [tpm]) from [<c05a7ac0>] (vfs_write+0xc4/0x3c0)
[<c05a7ac0>] (vfs_write) from [<c05a7ee4>] (ksys_write+0x58/0xcc)
[<c05a7ee4>] (ksys_write) from [<c04001a0>] (ret_fast_syscall+0x0/0x4c)
Exception stack(0xc226bfa8 to 0xc226bff0)
bfa0: 00000000 000105b4 00000003 beafe664 00000014 00000000
bfc0: 00000000 000105b4 000103f8 00000004 00000000 00000000 b6f9c000 beafe684
bfe0: 0000006c beafe648 0001056c b6eb6944
---[ end trace d4b8409def9b8b1f ]---
The reason for this warning is the attempt to get the chip->dev reference
in tpm_common_write() although the reference counter is already zero.
Since commit 8979b02aaf1d ("tpm: Fix reference count to main device") the
extra reference used to prevent a premature zero counter is never taken,
because the required TPM_CHIP_FLAG_TPM2 flag is not yet set at the time
tpm_chip_alloc() runs.
Fix this by moving the TPM 2 character device handling from
tpm_chip_alloc() to tpm_add_char_device(), which is called at a later
point in time, when the flag has already been set for TPM2 chips.
Commit fdc915f7f719 ("tpm: expose spaces via a device link /dev/tpmrm<n>")
already introduced the function tpm_devs_release() to release the extra
reference, but did not implement the required put on chip->devs that would
result in this function being called.
Fix this by putting chip->devs in tpm_chip_unregister().
Finally, move the new implementation of the TPM 2 handling into a new
function to avoid multiple checks for the TPM_CHIP_FLAG_TPM2 flag in both
the good case and the error cases.
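For reference, the release callback that drops the extra reference looks
roughly like the following (sketch based on tpm_devs_release() in
drivers/char/tpm/tpm-chip.c; it is not changed by this patch):

	static void tpm_devs_release(struct device *dev)
	{
		struct tpm_chip *chip = container_of(dev, struct tpm_chip, devs);

		/* release the main device reference held on behalf of devs */
		put_device(&chip->dev);
	}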
Fixes: fdc915f7f719 ("tpm: expose spaces via a device link /dev/tpmrm<n>")
Fixes: 8979b02aaf1d ("tpm: Fix reference count to main device")
Co-developed-by: Jason Gunthorpe <jgg(a)ziepe.ca>
Signed-off-by: Jason Gunthorpe <jgg(a)ziepe.ca>
Signed-off-by: Lino Sanfilippo <l.sanfilippo(a)kunbus.com>
Cc: stable(a)vger.kernel.org
---
drivers/char/tpm/tpm-chip.c | 80 ++++++++++++++++++++++++++++-----------------
1 file changed, 50 insertions(+), 30 deletions(-)
diff --git a/drivers/char/tpm/tpm-chip.c b/drivers/char/tpm/tpm-chip.c
index ddaeceb..44cac3a 100644
--- a/drivers/char/tpm/tpm-chip.c
+++ b/drivers/char/tpm/tpm-chip.c
@@ -344,7 +344,6 @@ struct tpm_chip *tpm_chip_alloc(struct device *pdev,
chip->dev_num = rc;
device_initialize(&chip->dev);
- device_initialize(&chip->devs);
chip->dev.class = tpm_class;
chip->dev.class->shutdown_pre = tpm_class_shutdown;
@@ -352,39 +351,20 @@ struct tpm_chip *tpm_chip_alloc(struct device *pdev,
chip->dev.parent = pdev;
chip->dev.groups = chip->groups;
- chip->devs.parent = pdev;
- chip->devs.class = tpmrm_class;
- chip->devs.release = tpm_devs_release;
- /* get extra reference on main device to hold on
- * behalf of devs. This holds the chip structure
- * while cdevs is in use. The corresponding put
- * is in the tpm_devs_release (TPM2 only)
- */
- if (chip->flags & TPM_CHIP_FLAG_TPM2)
- get_device(&chip->dev);
-
if (chip->dev_num == 0)
chip->dev.devt = MKDEV(MISC_MAJOR, TPM_MINOR);
else
chip->dev.devt = MKDEV(MAJOR(tpm_devt), chip->dev_num);
- chip->devs.devt =
- MKDEV(MAJOR(tpm_devt), chip->dev_num + TPM_NUM_DEVICES);
-
rc = dev_set_name(&chip->dev, "tpm%d", chip->dev_num);
if (rc)
goto out;
- rc = dev_set_name(&chip->devs, "tpmrm%d", chip->dev_num);
- if (rc)
- goto out;
if (!pdev)
chip->flags |= TPM_CHIP_FLAG_VIRTUAL;
cdev_init(&chip->cdev, &tpm_fops);
- cdev_init(&chip->cdevs, &tpmrm_fops);
chip->cdev.owner = THIS_MODULE;
- chip->cdevs.owner = THIS_MODULE;
rc = tpm2_init_space(&chip->work_space, TPM2_SPACE_BUFFER_SIZE);
if (rc) {
@@ -396,7 +376,6 @@ struct tpm_chip *tpm_chip_alloc(struct device *pdev,
return chip;
out:
- put_device(&chip->devs);
put_device(&chip->dev);
return ERR_PTR(rc);
}
@@ -431,6 +410,46 @@ struct tpm_chip *tpmm_chip_alloc(struct device *pdev,
}
EXPORT_SYMBOL_GPL(tpmm_chip_alloc);
+static int tpm_add_tpm2_char_device(struct tpm_chip *chip)
+{
+ int rc;
+
+ device_initialize(&chip->devs);
+ chip->devs.parent = chip->dev.parent;
+ chip->devs.class = tpmrm_class;
+
+ rc = dev_set_name(&chip->devs, "tpmrm%d", chip->dev_num);
+ if (rc)
+ goto out_put_devs;
+ /*
+ * get extra reference on main device to hold on behalf of devs.
+ * This holds the chip structure while cdevs is in use. The
+ * corresponding put is in the tpm_devs_release.
+ */
+ get_device(&chip->dev);
+ chip->devs.release = tpm_devs_release;
+ chip->devs.devt =
+ MKDEV(MAJOR(tpm_devt), chip->dev_num + TPM_NUM_DEVICES);
+ cdev_init(&chip->cdevs, &tpmrm_fops);
+ chip->cdevs.owner = THIS_MODULE;
+
+ rc = cdev_device_add(&chip->cdevs, &chip->devs);
+ if (rc) {
+ dev_err(&chip->devs,
+ "unable to cdev_device_add() %s, major %d, minor %d, err=%d\n",
+ dev_name(&chip->devs), MAJOR(chip->devs.devt),
+ MINOR(chip->devs.devt), rc);
+ goto out_put_devs;
+ }
+
+ return 0;
+
+out_put_devs:
+ put_device(&chip->devs);
+
+ return rc;
+}
+
static int tpm_add_char_device(struct tpm_chip *chip)
{
int rc;
@@ -445,14 +464,9 @@ static int tpm_add_char_device(struct tpm_chip *chip)
}
if (chip->flags & TPM_CHIP_FLAG_TPM2) {
- rc = cdev_device_add(&chip->cdevs, &chip->devs);
- if (rc) {
- dev_err(&chip->devs,
- "unable to cdev_device_add() %s, major %d, minor %d, err=%d\n",
- dev_name(&chip->devs), MAJOR(chip->devs.devt),
- MINOR(chip->devs.devt), rc);
- return rc;
- }
+ rc = tpm_add_tpm2_char_device(chip);
+ if (rc)
+ goto del_cdev;
}
/* Make the chip available. */
@@ -460,6 +474,10 @@ static int tpm_add_char_device(struct tpm_chip *chip)
idr_replace(&dev_nums_idr, chip, chip->dev_num);
mutex_unlock(&idr_lock);
+ return 0;
+
+del_cdev:
+ cdev_device_del(&chip->cdev, &chip->dev);
return rc;
}
@@ -640,8 +658,10 @@ void tpm_chip_unregister(struct tpm_chip *chip)
if (IS_ENABLED(CONFIG_HW_RANDOM_TPM))
hwrng_unregister(&chip->hwrng);
tpm_bios_log_teardown(chip);
- if (chip->flags & TPM_CHIP_FLAG_TPM2)
+ if (chip->flags & TPM_CHIP_FLAG_TPM2) {
cdev_device_del(&chip->cdevs, &chip->devs);
+ put_device(&chip->devs);
+ }
tpm_del_char_device(chip);
}
EXPORT_SYMBOL_GPL(tpm_chip_unregister);
--
2.7.4
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 88bf56d04bc3564542049ec4ec168a8b60d0b48c Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs(a)linux.alibaba.com>
Date: Thu, 17 Dec 2020 23:41:18 +0800
Subject: [PATCH] kvm: check tlbs_dirty directly
In kvm_mmu_notifier_invalidate_range_start(), tlbs_dirty is used as:
need_tlb_flush |= kvm->tlbs_dirty;
with need_tlb_flush's type being int and tlbs_dirty's type being long.
This means that tlbs_dirty is effectively truncated to an int and its
higher 32 bits are discarded. tlbs_dirty therefore needs to be checked
in a correct way; this change checks it directly without propagating it
into need_tlb_flush.
Note: it's _extremely_ unlikely that this neglect of the higher 32 bits
can cause problems in practice. It would require encountering tlbs_dirty
on a 4 billion count boundary, and KVM would need to be using shadow
paging or be running a nested guest.
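The truncation can be demonstrated with a minimal userspace sketch
(illustrative only, not kernel code; assumes an LP64 target):

	#include <stdio.h>

	int main(void)
	{
		long tlbs_dirty = 1L << 32;	/* non-zero only in the upper 32 bits */
		int need_tlb_flush = 0;

		need_tlb_flush |= tlbs_dirty;	/* implicit conversion truncates to int */

		/* prints 0 although tlbs_dirty is non-zero */
		printf("%d\n", need_tlb_flush);
		return 0;
	}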
Cc: stable(a)vger.kernel.org
Fixes: a4ee1ca4a36e ("KVM: MMU: delay flush all tlbs on sync_page path")
Signed-off-by: Lai Jiangshan <laijs(a)linux.alibaba.com>
Message-Id: <20201217154118.16497-1-jiangshanlai(a)gmail.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 3abcb2ce5b7d..19dae28904f7 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -485,9 +485,8 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
kvm->mmu_notifier_count++;
need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end,
range->flags);
- need_tlb_flush |= kvm->tlbs_dirty;
/* we've to flush the tlb before the pages can be freed */
- if (need_tlb_flush)
+ if (need_tlb_flush || kvm->tlbs_dirty)
kvm_flush_remote_tlbs(kvm);
spin_unlock(&kvm->mmu_lock);
The patch titled
Subject: mm: memcontrol: fix swap undercounting in cgroup2
has been added to the -mm tree. Its filename is
mm-memcontrol-fix-swap-undercounting-in-cgroup2.patch
This patch should soon appear at
https://ozlabs.org/~akpm/mmots/broken-out/mm-memcontrol-fix-swap-undercount…
and later at
https://ozlabs.org/~akpm/mmotm/broken-out/mm-memcontrol-fix-swap-undercount…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Muchun Song <songmuchun(a)bytedance.com>
Subject: mm: memcontrol: fix swap undercounting in cgroup2
When pages are swapped in, the VM may retain the swap copy to avoid
repeated writes in the future. It's also retained if shared pages are
faulted back in some processes, but not in others. During that time we
have an in-memory copy of the page, as well as an on-swap copy. Cgroup1
and cgroup2 handle these overlapping lifetimes slightly differently due to
the nature of how they account memory and swap:
Cgroup1 has a unified memory+swap counter that tracks a data page
regardless whether it's in-core or swapped out. On swapin, we transfer
the charge from the swap entry to the newly allocated swapcache page, even
though the swap entry might stick around for a while. That's why we have
a mem_cgroup_uncharge_swap() call inside mem_cgroup_charge().
Cgroup2 tracks memory and swap as separate, independent resources and thus
has split memory and swap counters. On swapin, we charge the newly
allocated swapcache page as memory, while the swap slot in turn must
remain charged to the swap counter for as long as it is allocated, too.
The cgroup2 logic was broken by commit 2d1c498072de ("mm: memcontrol: make
swap tracking an integral part of memory control"), because it
accidentally removed the do_memsw_account() check in the branch inside
mem_cgroup_uncharge() that was supposed to tell the difference between the
charge transfer in cgroup1 and the separate counters in cgroup2.
As a result, cgroup2 currently undercounts retained swap to varying
degrees: swap slots are cached up to 50% of the configured limit or total
available swap space; partially faulted back shared pages are only limited
by physical capacity. This in turn allows cgroups to significantly
overconsume their allotted swap space.
Add the do_memsw_account() check back to fix this problem.
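For context, do_memsw_account() is what tells the two accounting modes
apart; in kernels of this era it reads roughly as follows (sketch from
mm/memcontrol.c, not part of this patch):

	static bool do_memsw_account(void)
	{
		/* true only for cgroup1's combined memory+swap accounting */
		return !cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
		       !cgroup_memory_noswap;
	}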
Link: https://lkml.kernel.org/r/20210217153237.92484-1-songmuchun@bytedance.com
Fixes: 2d1c498072de ("mm: memcontrol: make swap tracking an integral part of memory control")
Signed-off-by: Muchun Song <songmuchun(a)bytedance.com>
Acked-by: Johannes Weiner <hannes(a)cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb(a)google.com>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Cc: Vladimir Davydov <vdavydov.dev(a)gmail.com>
Cc: <stable(a)vger.kernel.org> [5.8+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/memcontrol.c | 14 +++++++++++++-
1 file changed, 13 insertions(+), 1 deletion(-)
--- a/mm/memcontrol.c~mm-memcontrol-fix-swap-undercounting-in-cgroup2
+++ a/mm/memcontrol.c
@@ -6748,7 +6748,19 @@ int mem_cgroup_charge(struct page *page,
memcg_check_events(memcg, page);
local_irq_enable();
- if (PageSwapCache(page)) {
+ /*
+ * Cgroup1's unified memory+swap counter has been charged with the
+ * new swapcache page, finish the transfer by uncharging the swap
+ * slot. The swap slot would also get uncharged when it dies, but
+ * it can stick around indefinitely and we'd count the page twice
+ * the entire time.
+ *
+ * Cgroup2 has separate resource counters for memory and swap,
+ * so this is a non-issue here. Memory and swap charge lifetimes
+ * correspond 1:1 to page and swap slot lifetimes: we charge the
+ * page to memory here, and uncharge swap when the slot is freed.
+ */
+ if (do_memsw_account() && PageSwapCache(page)) {
swp_entry_t entry = { .val = page_private(page) };
/*
* The swap entry might not get freed for a long time,
_
Patches currently in -mm which might be from songmuchun(a)bytedance.com are
mm-memcontrol-optimize-per-lruvec-stats-counter-memory-usage.patch
mm-memcontrol-fix-nr_anon_thps-accounting-in-charge-moving.patch
mm-memcontrol-convert-nr_anon_thps-account-to-pages.patch
mm-memcontrol-convert-nr_file_thps-account-to-pages.patch
mm-memcontrol-convert-nr_shmem_thps-account-to-pages.patch
mm-memcontrol-convert-nr_shmem_pmdmapped-account-to-pages.patch
mm-memcontrol-convert-nr_file_pmdmapped-account-to-pages.patch
mm-memcontrol-make-the-slab-calculation-consistent.patch
mm-memcontrol-replace-the-loop-with-a-list_for_each_entry.patch
mm-memcontrol-fix-swap-undercounting-in-cgroup2.patch
hugetlb-convert-page_huge_active-hpagemigratable-flag-fix.patch
The patch titled
Subject: mm, compaction: make fast_isolate_freepages() stay within zone
has been added to the -mm tree. Its filename is
mm-compaction-make-fast_isolate_freepages-stay-within-zone.patch
This patch should soon appear at
https://ozlabs.org/~akpm/mmots/broken-out/mm-compaction-make-fast_isolate_f…
and later at
https://ozlabs.org/~akpm/mmotm/broken-out/mm-compaction-make-fast_isolate_f…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Vlastimil Babka <vbabka(a)suse.cz>
Subject: mm, compaction: make fast_isolate_freepages() stay within zone
Compaction always operates on pages from a single given zone when
isolating both pages to migrate and freepages. Pageblock boundaries are
intersected with zone boundaries to be safe in case a zone starts or ends
in the middle of a pageblock. The use of pageblock_pfn_to_page() protects
against non-contiguous pageblocks.
The functions fast_isolate_freepages() and fast_isolate_around() don't
currently protect the fast freepage isolation thoroughly enough against
these corner cases, and can result in freepage isolation operating outside
of zone boundaries:
- in fast_isolate_freepages() if we get a pfn from the first pageblock
of a zone that starts in the middle of that pageblock, 'highest' can be
a pfn outside of the zone. If we fail to isolate anything in this
function, we may then call fast_isolate_around() on a pfn outside of the
zone and there effectively do a set_pageblock_skip(pfn_to_page(highest))
which may currently hit a VM_BUG_ON() in some configurations
- fast_isolate_around() checks only the zone end boundary and not the
beginning, nor that the pageblock is contiguous (with
pageblock_pfn_to_page()) so it's possible that we end up calling
isolate_freepages_block() on a range of pfn's from two different zones
and end up e.g. isolating freepages under the wrong zone's lock.
This patch should fix the above issues.
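To illustrate the first corner case with made-up numbers (assuming
512-page pageblocks):

	/*
	 * Hypothetical example: a zone starting at pfn 1234 begins in the
	 * middle of pageblock [1024, 1536). Rounding a pfn from that
	 * pageblock down to the pageblock start yields 1024, which lies
	 * outside the zone, so the result must be clamped:
	 */
	unsigned long pfn = 1300;
	unsigned long block_start = pageblock_start_pfn(pfn);	/* 1024 */
	unsigned long safe_start = max(block_start,
				       cc->zone->zone_start_pfn);	/* 1234 */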
Link: https://lkml.kernel.org/r/20210217173300.6394-1-vbabka@suse.cz
Fixes: 5a811889de10 ("mm, compaction: use free lists to quickly locate a migration target")
Signed-off-by: Vlastimil Babka <vbabka(a)suse.cz>
Acked-by: David Rientjes <rientjes(a)google.com>
Acked-by: Mel Gorman <mgorman(a)techsingularity.net>
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Michal Hocko <mhocko(a)kernel.org>
Cc: Mike Rapoport <rppt(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/compaction.c | 16 +++++++++++-----
1 file changed, 11 insertions(+), 5 deletions(-)
--- a/mm/compaction.c~mm-compaction-make-fast_isolate_freepages-stay-within-zone
+++ a/mm/compaction.c
@@ -1284,7 +1284,7 @@ static void
fast_isolate_around(struct compact_control *cc, unsigned long pfn, unsigned long nr_isolated)
{
unsigned long start_pfn, end_pfn;
- struct page *page = pfn_to_page(pfn);
+ struct page *page;
/* Do not search around if there are enough pages already */
if (cc->nr_freepages >= cc->nr_migratepages)
@@ -1295,8 +1295,12 @@ fast_isolate_around(struct compact_contr
return;
/* Pageblock boundaries */
- start_pfn = pageblock_start_pfn(pfn);
- end_pfn = min(pageblock_end_pfn(pfn), zone_end_pfn(cc->zone)) - 1;
+ start_pfn = max(pageblock_start_pfn(pfn), cc->zone->zone_start_pfn);
+ end_pfn = min(pageblock_end_pfn(pfn), zone_end_pfn(cc->zone));
+
+ page = pageblock_pfn_to_page(start_pfn, end_pfn, cc->zone);
+ if (!page)
+ return;
/* Scan before */
if (start_pfn != pfn) {
@@ -1398,7 +1402,8 @@ fast_isolate_freepages(struct compact_co
pfn = page_to_pfn(freepage);
if (pfn >= highest)
- highest = pageblock_start_pfn(pfn);
+ highest = max(pageblock_start_pfn(pfn),
+ cc->zone->zone_start_pfn);
if (pfn >= low_pfn) {
cc->fast_search_fail = 0;
@@ -1468,7 +1473,8 @@ fast_isolate_freepages(struct compact_co
} else {
if (cc->direct_compaction && pfn_valid(min_pfn)) {
page = pageblock_pfn_to_page(min_pfn,
- pageblock_end_pfn(min_pfn),
+ min(pageblock_end_pfn(min_pfn),
+ zone_end_pfn(cc->zone)),
cc->zone);
cc->free_pfn = min_pfn;
}
_
Patches currently in -mm which might be from vbabka(a)suse.cz are
mm-slub-stop-freeing-kmem_cache_node-structures-on-node-offline.patch
mm-slab-slub-stop-taking-memory-hotplug-lock.patch
mm-slab-slub-stop-taking-cpu-hotplug-lock.patch
mm-slub-splice-cpu-and-page-freelists-in-deactivate_slab.patch
mm-slub-remove-slub_memcg_sysfs-boot-param-and-config_slub_memcg_sysfs_on.patch
mm-compaction-make-fast_isolate_freepages-stay-within-zone.patch
maintainers-add-uapi-directories-to-api-abi-section.patch
The patch titled
Subject: hugetlb: fix copy_huge_page_from_user contig page struct assumption
has been added to the -mm tree. Its filename is
hugetlb-fix-copy_huge_page_from_user-contig-page-struct-assumption.patch
This patch should soon appear at
https://ozlabs.org/~akpm/mmots/broken-out/hugetlb-fix-copy_huge_page_from_u…
and later at
https://ozlabs.org/~akpm/mmotm/broken-out/hugetlb-fix-copy_huge_page_from_u…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Mike Kravetz <mike.kravetz(a)oracle.com>
Subject: hugetlb: fix copy_huge_page_from_user contig page struct assumption
page structs are not guaranteed to be contiguous for gigantic pages. The
routine copy_huge_page_from_user can encounter gigantic pages, yet it
assumes page structs are contiguous when copying pages from user space.
Since page structs for the target gigantic page are not contiguous, the
data copied from user space could overwrite other pages not associated
with the gigantic page and cause data corruption.
Non-contiguous page structs are generally not an issue. However, they can
exist with a specific kernel configuration and hotplug operations. For
example: Configure the kernel with CONFIG_SPARSEMEM and
!CONFIG_SPARSEMEM_VMEMMAP. Then, hotplug add memory for the area where
the gigantic page will be allocated.
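The fix below iterates with mem_map_next() instead of 'dst_page + i'.
For reference, that helper recomputes the struct page pointer from the
pfn whenever the iteration crosses a MAX_ORDER-sized boundary (sketch
based on mm/internal.h of this era):

	static inline struct page *mem_map_next(struct page *iter,
						struct page *base, int offset)
	{
		/* on a MAX_ORDER boundary, recompute the pointer via the pfn */
		if (unlikely((offset & (MAX_ORDER_NR_PAGES - 1)) == 0)) {
			unsigned long pfn = page_to_pfn(base) + offset;

			if (!pfn_valid(pfn))
				return NULL;
			return pfn_to_page(pfn);
		}
		return iter + 1;
	}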
Link: https://lkml.kernel.org/r/20210217184926.33567-2-mike.kravetz@oracle.com
Fixes: 8fb5debc5fcd ("userfaultfd: hugetlbfs: add hugetlb_mcopy_atomic_pte for userfaultfd support")
Signed-off-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Zi Yan <ziy(a)nvidia.com>
Cc: Davidlohr Bueso <dbueso(a)suse.de>
Cc: "Kirill A . Shutemov" <kirill.shutemov(a)linux.intel.com>
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Oscar Salvador <osalvador(a)suse.de>
Cc: Joao Martins <joao.m.martins(a)oracle.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/memory.c | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
--- a/mm/memory.c~hugetlb-fix-copy_huge_page_from_user-contig-page-struct-assumption
+++ a/mm/memory.c
@@ -5173,17 +5173,19 @@ long copy_huge_page_from_user(struct pag
void *page_kaddr;
unsigned long i, rc = 0;
unsigned long ret_val = pages_per_huge_page * PAGE_SIZE;
+ struct page *subpage = dst_page;
- for (i = 0; i < pages_per_huge_page; i++) {
+ for (i = 0; i < pages_per_huge_page;
+ i++, subpage = mem_map_next(subpage, dst_page, i)) {
if (allow_pagefault)
- page_kaddr = kmap(dst_page + i);
+ page_kaddr = kmap(subpage);
else
- page_kaddr = kmap_atomic(dst_page + i);
+ page_kaddr = kmap_atomic(subpage);
rc = copy_from_user(page_kaddr,
(const void __user *)(src + i * PAGE_SIZE),
PAGE_SIZE);
if (allow_pagefault)
- kunmap(dst_page + i);
+ kunmap(subpage);
else
kunmap_atomic(page_kaddr);
_
Patches currently in -mm which might be from mike.kravetz(a)oracle.com are
hugetlb-fix-update_and_free_page-contig-page-struct-assumption.patch
hugetlb-fix-copy_huge_page_from_user-contig-page-struct-assumption.patch
hugetlb-use-pageprivate-for-hugetlb-specific-page-flags.patch
hugetlb-convert-page_huge_active-hpagemigratable-flag.patch
hugetlb-convert-pagehugetemporary-to-hpagetemporary-flag.patch
hugetlb-convert-pagehugefreed-to-hpagefreed-flag.patch
mm-hugetlb-change-hugetlb_reserve_pages-to-type-bool.patch
hugetlbfs-remove-special-hugetlbfs_set_page_dirty.patch
The patch titled
Subject: hugetlb: fix update_and_free_page contig page struct assumption
has been added to the -mm tree. Its filename is
hugetlb-fix-update_and_free_page-contig-page-struct-assumption.patch
This patch should soon appear at
https://ozlabs.org/~akpm/mmots/broken-out/hugetlb-fix-update_and_free_page-…
and later at
https://ozlabs.org/~akpm/mmotm/broken-out/hugetlb-fix-update_and_free_page-…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Mike Kravetz <mike.kravetz(a)oracle.com>
Subject: hugetlb: fix update_and_free_page contig page struct assumption
page structs are not guaranteed to be contiguous for gigantic pages. The
routine update_and_free_page can encounter a gigantic page, yet it assumes
page structs are contiguous when setting page flags in subpages.
If update_and_free_page encounters non-contiguous page structs, we can see
“BUG: Bad page state in process …” errors.
Non-contiguous page structs are generally not an issue. However, they can
exist with a specific kernel configuration and hotplug operations. For
example: Configure the kernel with CONFIG_SPARSEMEM and
!CONFIG_SPARSEMEM_VMEMMAP. Then, hotplug add memory for the area where
the gigantic page will be allocated. Zi Yan outlined steps to reproduce
here [1].
[1] https://lore.kernel.org/linux-mm/16F7C58B-4D79-41C5-9B64-A1A1628F4AF2@nvidi…
Link: https://lkml.kernel.org/r/20210217184926.33567-1-mike.kravetz@oracle.com
Fixes: 944d9fec8d7a ("hugetlb: add support for gigantic page allocation at runtime")
Signed-off-by: Zi Yan <ziy(a)nvidia.com>
Signed-off-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Zi Yan <ziy(a)nvidia.com>
Cc: Davidlohr Bueso <dbueso(a)suse.de>
Cc: "Kirill A . Shutemov" <kirill.shutemov(a)linux.intel.com>
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Oscar Salvador <osalvador(a)suse.de>
Cc: Joao Martins <joao.m.martins(a)oracle.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/hugetlb.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
--- a/mm/hugetlb.c~hugetlb-fix-update_and_free_page-contig-page-struct-assumption
+++ a/mm/hugetlb.c
@@ -1321,14 +1321,16 @@ static inline void destroy_compound_giga
static void update_and_free_page(struct hstate *h, struct page *page)
{
int i;
+ struct page *subpage = page;
if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
return;
h->nr_huge_pages--;
h->nr_huge_pages_node[page_to_nid(page)]--;
- for (i = 0; i < pages_per_huge_page(h); i++) {
- page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
+ for (i = 0; i < pages_per_huge_page(h);
+ i++, subpage = mem_map_next(subpage, page, i)) {
+ subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
1 << PG_referenced | 1 << PG_dirty |
1 << PG_active | 1 << PG_private |
1 << PG_writeback);
_
Patches currently in -mm which might be from mike.kravetz(a)oracle.com are
hugetlb-fix-update_and_free_page-contig-page-struct-assumption.patch
hugetlb-fix-copy_huge_page_from_user-contig-page-struct-assumption.patch
hugetlb-use-pageprivate-for-hugetlb-specific-page-flags.patch
hugetlb-convert-page_huge_active-hpagemigratable-flag.patch
hugetlb-convert-pagehugetemporary-to-hpagetemporary-flag.patch
hugetlb-convert-pagehugefreed-to-hpagefreed-flag.patch
mm-hugetlb-change-hugetlb_reserve_pages-to-type-bool.patch
hugetlbfs-remove-special-hugetlbfs_set_page_dirty.patch