In sgx_init(), if misc_register() fails, or if misc_register() succeeds but
neither sgx_drv_init() nor sgx_vepc_init() succeeds, ksgxd is stopped
prematurely. This can leave some pages unsanitized, which does not matter,
because SGX will be disabled for the rest of the power cycle. It does,
however, trigger the WARN_ON(), because sgx_dirty_page_list ends up
non-empty, and dumps the call stack:
[ 0.268103] sgx: EPC section 0x40200000-0x45f7ffff
[ 0.268591] ------------[ cut here ]------------
[ 0.268592] WARNING: CPU: 6 PID: 83 at arch/x86/kernel/cpu/sgx/main.c:401 ksgxd+0x1b7/0x1d0
[ 0.268598] Modules linked in:
[ 0.268600] CPU: 6 PID: 83 Comm: ksgxd Not tainted 6.0.0-rc2 #382
[ 0.268603] Hardware name: Dell Inc. XPS 13 9370/0RMYH9, BIOS 1.21.0 07/06/2022
[ 0.268604] RIP: 0010:ksgxd+0x1b7/0x1d0
[ 0.268607] Code: ff e9 f2 fe ff ff 48 89 df e8 75 07 0e 00 84 c0 0f 84 c3 fe ff ff 31 ff e8 e6 07 0e 00 84 c0 0f 85 94 fe ff ff e9 af fe ff ff <0f> 0b e9 7f fe ff ff e8 dd 9c 95 00 66 66 2e 0f 1f 84 00 00 00 00
[ 0.268608] RSP: 0000:ffffb6c7404f3ed8 EFLAGS: 00010287
[ 0.268610] RAX: ffffb6c740431a10 RBX: ffff8dcd8117b400 RCX: 0000000000000000
[ 0.268612] RDX: 0000000080000000 RSI: ffffb6c7404319d0 RDI: 00000000ffffffff
[ 0.268613] RBP: ffff8dcd820a4d80 R08: ffff8dcd820a4180 R09: ffff8dcd820a4180
[ 0.268614] R10: 0000000000000000 R11: 0000000000000006 R12: ffffb6c74006bce0
[ 0.268615] R13: ffff8dcd80e63880 R14: ffffffffa8a60f10 R15: 0000000000000000
[ 0.268616] FS:  0000000000000000(0000) GS:ffff8dcf25580000(0000) knlGS:0000000000000000
[ 0.268617] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 0.268619] CR2: 0000000000000000 CR3: 0000000213410001 CR4: 00000000003706e0
[ 0.268620] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 0.268621] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 0.268622] Call Trace:
[ 0.268624]  <TASK>
[ 0.268627]  ? _raw_spin_lock_irqsave+0x24/0x60
[ 0.268632]  ? _raw_spin_unlock_irqrestore+0x23/0x40
[ 0.268634]  ? __kthread_parkme+0x36/0x90
[ 0.268637]  kthread+0xe5/0x110
[ 0.268639]  ? kthread_complete_and_exit+0x20/0x20
[ 0.268642]  ret_from_fork+0x1f/0x30
[ 0.268647]  </TASK>
[ 0.268648] ---[ end trace 0000000000000000 ]---
Ultimately this can crash the kernel if /proc/sys/kernel/panic_on_warn is
set.
On a premature stop, print nothing, as the number of remaining pages is
for all practical purposes a random number. Otherwise, a non-empty list
indicates a bug in the driver, so print the number of unsanitized pages
with pr_err().
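For reference, a simplified sketch of the sgx_init() error path that causes
the premature stop (the label names and exact ordering here are illustrative,
not the verbatim kernel code):

static int __init sgx_init(void)
{
	int ret;

	/* ... EPC sections set up, ksgxd started, sanitization begins ... */

	ret = misc_register(&sgx_dev_provision);
	if (ret)
		goto err_kthread;

	/* Error out only if both the native and KVM drivers fail. */
	ret = sgx_drv_init();
	if (sgx_vepc_init() && ret)
		goto err_provision;

	return 0;

err_provision:
	misc_deregister(&sgx_dev_provision);
err_kthread:
	/* Stops ksgxd, possibly before both sanitization passes finish. */
	kthread_stop(ksgxd_tsk);
	return ret;
}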
Link: https://lore.kernel.org/linux-sgx/20220825051827.246698-1-jarkko@kernel.org…
Fixes: 51ab30eb2ad4 ("x86/sgx: Replace section->init_laundry_list with sgx_dirty_page_list")
Cc: stable@vger.kernel.org # v5.13+
Reported-by: Paul Menzel <pmenzel@molgen.mpg.de>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
---
v6:
- Address Reinette's feedback:
https://lore.kernel.org/linux-sgx/Yw6%2FiTzSdSw%2FY%2FVO@kernel.org/
v5:
- Add the klog dump and sysctl option to the commit message.
v4:
- Explain expectations for dirty_page_list in the function header, instead
of an inline comment.
- Improve commit message to explain the conditions better.
- Return the number of pages left dirty to ksgxd() and print warning after
the 2nd call, if there are any.
v3:
- Remove WARN_ON().
- Tuned comments and the commit message a bit.
v2:
- Replaced WARN_ON() with optional pr_info() inside
__sgx_sanitize_pages().
- Rewrote the commit message.
- Added the fixes tag.
---
arch/x86/kernel/cpu/sgx/main.c | 42 ++++++++++++++++++++++++++++------
1 file changed, 35 insertions(+), 7 deletions(-)
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 515e2a5f25bb..bcd6b64961bd 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -49,17 +49,20 @@ static LIST_HEAD(sgx_dirty_page_list);
  * Reset post-kexec EPC pages to the uninitialized state. The pages are removed
  * from the input list, and made available for the page allocator. SECS pages
  * prepending their children in the input list are left intact.
+ *
+ * Contents of the @dirty_page_list must be thread-local, i.e.
+ * not shared by multiple threads.
  */
-static void __sgx_sanitize_pages(struct list_head *dirty_page_list)
+static long __sgx_sanitize_pages(struct list_head *dirty_page_list)
 {
 	struct sgx_epc_page *page;
+	long left_dirty = 0;
 	LIST_HEAD(dirty);
 	int ret;
 
-	/* dirty_page_list is thread-local, no need for a lock: */
 	while (!list_empty(dirty_page_list)) {
 		if (kthread_should_stop())
-			return;
+			return -ECANCELED;
 
 		page = list_first_entry(dirty_page_list, struct sgx_epc_page, list);
 
@@ -92,12 +95,14 @@ static void __sgx_sanitize_pages(struct list_head *dirty_page_list)
 		} else {
 			/* The page is not yet clean - move to the dirty list. */
 			list_move_tail(&page->list, &dirty);
+			left_dirty++;
 		}
 
 		cond_resched();
 	}
 
 	list_splice(&dirty, dirty_page_list);
+	return left_dirty;
 }
 
 static bool sgx_reclaimer_age(struct sgx_epc_page *epc_page)
@@ -388,17 +393,40 @@ void sgx_reclaim_direct(void)
 
 static int ksgxd(void *p)
 {
+	long ret;
+
 	set_freezable();
 
 	/*
 	 * Sanitize pages in order to recover from kexec(). The 2nd pass is
 	 * required for SECS pages, whose child pages blocked EREMOVE.
 	 */
-	__sgx_sanitize_pages(&sgx_dirty_page_list);
-	__sgx_sanitize_pages(&sgx_dirty_page_list);
+	ret = __sgx_sanitize_pages(&sgx_dirty_page_list);
+	if (ret == -ECANCELED)
+		/* kthread stopped */
+		return 0;
 
-	/* sanity check: */
-	WARN_ON(!list_empty(&sgx_dirty_page_list));
+	ret = __sgx_sanitize_pages(&sgx_dirty_page_list);
+	switch (ret) {
+	case 0:
+		/* success, no unsanitized pages */
+		break;
+
+	case -ECANCELED:
+		/* kthread stopped */
+		return 0;
+
+	default:
+		/*
+		 * Never expected to happen in a working driver. If it happens
+		 * the bug is expected to be in the sanitization process, but
+		 * successfully sanitized pages are still valid and driver can
+		 * be used and most importantly debugged without issues. To put
+		 * short, the global state of kernel is not corrupted so no
+		 * reason to do any more complicated rollback.
+		 */
+		pr_err("%ld unsanitized pages\n", ret);
+	}
 
 	while (!kthread_should_stop()) {
 		if (try_to_freeze())
--
2.37.2
The patch titled
Subject: mm/migrate_device.c: copy pte dirty bit to page
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-migrate_devicec-copy-pte-dirty-bit-to-page.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Alistair Popple <apopple@nvidia.com>
Subject: mm/migrate_device.c: copy pte dirty bit to page
Date: Fri, 2 Sep 2022 10:35:53 +1000
migrate_vma_setup() has a fast path in migrate_vma_collect_pmd() that
installs migration entries directly if it can lock the migrating page.
When removing a dirty pte, the dirty bit is supposed to be carried over to
the underlying page to prevent it from being lost.
Currently migrate_vma_*() can only be used for private anonymous mappings.
That means loss of the dirty bit usually doesn't result in data loss
because these pages are typically not file-backed. However, pages may be
backed by swap storage, which can result in data loss if an attempt is made
to migrate a dirty page that doesn't yet have the PageDirty flag set.
In this case migration will fail due to unexpected references, but the
dirty pte bit will be lost. If the page is subsequently reclaimed, data
won't be written back to swap storage, as it is considered uptodate,
resulting in data loss if the page is subsequently accessed.
Prevent this by copying the dirty bit to the page when removing the pte to
match what try_to_migrate_one() does.
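A minimal sketch of the resulting pattern inside migrate_vma_collect_pmd()
(simplified from the hunk below; the surrounding locking and
migration-entry setup are omitted):

	/* Capture the pte while clearing it so its dirty bit survives. */
	if (anon_exclusive)
		pte = ptep_clear_flush(vma, addr, ptep);
	else
		pte = ptep_get_and_clear(mm, addr, ptep);

	/* Carry the dirty bit over to the folio, as try_to_migrate_one() does. */
	if (pte_dirty(pte))
		folio_mark_dirty(page_folio(page));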
Link: https://lkml.kernel.org/r/dd48e4882ce859c295c1a77612f66d198b0403f9.16620785…
Fixes: 8c3328f1f36a ("mm/migrate: migrate_vma() unmap page from vma while collecting pages")
Signed-off-by: Alistair Popple <apopple@nvidia.com>
Acked-by: Peter Xu <peterx@redhat.com>
Reviewed-by: "Huang, Ying" <ying.huang@intel.com>
Reported-by: "Huang, Ying" <ying.huang@intel.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Alex Sierra <alex.sierra@amd.com>
Cc: Ben Skeggs <bskeggs@redhat.com>
Cc: Felix Kuehling <Felix.Kuehling@amd.com>
Cc: huang ying <huang.ying.caritas@gmail.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Karol Herbst <kherbst@redhat.com>
Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: Lyude Paul <lyude@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Paul Mackerras <paulus@ozlabs.org>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
mm/migrate_device.c | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
--- a/mm/migrate_device.c~mm-migrate_devicec-copy-pte-dirty-bit-to-page
+++ a/mm/migrate_device.c
@@ -7,6 +7,7 @@
 #include <linux/export.h>
 #include <linux/memremap.h>
 #include <linux/migrate.h>
+#include <linux/mm.h>
 #include <linux/mm_inline.h>
 #include <linux/mmu_notifier.h>
 #include <linux/oom.h>
@@ -196,7 +197,7 @@ again:
 			flush_cache_page(vma, addr, pte_pfn(*ptep));
 			anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
 			if (anon_exclusive) {
-				ptep_clear_flush(vma, addr, ptep);
+				pte = ptep_clear_flush(vma, addr, ptep);
 
 				if (page_try_share_anon_rmap(page)) {
 					set_pte_at(mm, addr, ptep, pte);
@@ -206,11 +207,15 @@ again:
 					goto next;
 				}
 			} else {
-				ptep_get_and_clear(mm, addr, ptep);
+				pte = ptep_get_and_clear(mm, addr, ptep);
 			}
 
 			migrate->cpages++;
 
+			/* Set the dirty flag on the folio now the pte is gone. */
+			if (pte_dirty(pte))
+				folio_mark_dirty(page_folio(page));
+
 			/* Setup special migration page table entry */
 			if (mpfn & MIGRATE_PFN_WRITE)
 				entry = make_writable_migration_entry(
_
Patches currently in -mm which might be from apopple@nvidia.com are
mm-migrate_devicec-flush-tlb-while-holding-ptl.patch
mm-migrate_devicec-add-missing-flush_cache_page.patch
mm-migrate_devicec-copy-pte-dirty-bit-to-page.patch
mm-gupc-simplify-and-fix-check_and_migrate_movable_pages-return-codes.patch
selftests-hmm-tests-add-test-for-dirty-bits.patch
mm-gupc-dont-pass-gup_flags-to-check_and_migrate_movable_pages.patch
mm-gupc-refactor-check_and_migrate_movable_pages.patch
mm-migrate_devicec-fix-a-misleading-and-out-dated-comment.patch
The patch titled
Subject: mm/migrate_device.c: add missing flush_cache_page()
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-migrate_devicec-add-missing-flush_cache_page.patch
------------------------------------------------------
From: Alistair Popple <apopple@nvidia.com>
Subject: mm/migrate_device.c: add missing flush_cache_page()
Date: Fri, 2 Sep 2022 10:35:52 +1000
Currently we only call flush_cache_page() for the anon_exclusive case;
however, the pte is cleared in both cases, so the cache should be flushed
in both as well.
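In other words, the hunk below hoists the flush so that both clearing paths
are covered (a simplified view, with the intervening rmap handling omitted):

	flush_cache_page(vma, addr, pte_pfn(*ptep));
	if (anon_exclusive)
		ptep_clear_flush(vma, addr, ptep);
	else
		ptep_get_and_clear(mm, addr, ptep);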
Link: https://lkml.kernel.org/r/5676f30436ab71d1a587ac73f835ed8bd2113ff5.16620785…
Fixes: 8c3328f1f36a ("mm/migrate: migrate_vma() unmap page from vma while collecting pages")
Signed-off-by: Alistair Popple <apopple@nvidia.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Cc: Alex Sierra <alex.sierra@amd.com>
Cc: Ben Skeggs <bskeggs@redhat.com>
Cc: Felix Kuehling <Felix.Kuehling@amd.com>
Cc: huang ying <huang.ying.caritas@gmail.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Karol Herbst <kherbst@redhat.com>
Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: Lyude Paul <lyude@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Paul Mackerras <paulus@ozlabs.org>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
mm/migrate_device.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/mm/migrate_device.c~mm-migrate_devicec-add-missing-flush_cache_page
+++ a/mm/migrate_device.c
@@ -193,9 +193,9 @@ again:
 			bool anon_exclusive;
 			pte_t swp_pte;
 
+			flush_cache_page(vma, addr, pte_pfn(*ptep));
 			anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
 			if (anon_exclusive) {
-				flush_cache_page(vma, addr, pte_pfn(*ptep));
 				ptep_clear_flush(vma, addr, ptep);
 
 				if (page_try_share_anon_rmap(page)) {
_
The patch titled
Subject: mm/migrate_device.c: flush TLB while holding PTL
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-migrate_devicec-flush-tlb-while-holding-ptl.patch
------------------------------------------------------
From: Alistair Popple <apopple@nvidia.com>
Subject: mm/migrate_device.c: flush TLB while holding PTL
Date: Fri, 2 Sep 2022 10:35:51 +1000
When clearing a PTE, the TLB should be flushed whilst still holding the PTL
to avoid a potential race with madvise/munmap/etc. For example, consider
the following sequence:
CPU0                                CPU1
----                                ----
migrate_vma_collect_pmd()
pte_unmap_unlock()
                                    madvise(MADV_DONTNEED)
                                    -> zap_pte_range()
                                    pte_offset_map_lock()
                                    [ PTE not present, TLB not flushed ]
                                    pte_unmap_unlock()
                                    [ page is still accessible via stale TLB ]
flush_tlb_range()
In this case the page may still be accessed via the stale TLB entry after
madvise returns. Fix this by flushing the TLB while holding the PTL.
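A sketch of the fixed tail of migrate_vma_collect_pmd() (simplified from
the hunk below): the flush now happens before the page table lock is
released, so a concurrent zap cannot observe a cleared pte while a stale
TLB entry is still live:

	/* Only flush the TLB if we actually modified any entries. */
	if (unmapped)
		flush_tlb_range(walk->vma, start, end);

	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(ptep - 1, ptl);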
Fixes: 8c3328f1f36a ("mm/migrate: migrate_vma() unmap page from vma while collecting pages")
Link: https://lkml.kernel.org/r/9f801e9d8d830408f2ca27821f606e09aa856899.16620785…
Signed-off-by: Alistair Popple <apopple@nvidia.com>
Reported-by: Nadav Amit <nadav.amit@gmail.com>
Reviewed-by: "Huang, Ying" <ying.huang@intel.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Alex Sierra <alex.sierra@amd.com>
Cc: Ben Skeggs <bskeggs@redhat.com>
Cc: Felix Kuehling <Felix.Kuehling@amd.com>
Cc: huang ying <huang.ying.caritas@gmail.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Karol Herbst <kherbst@redhat.com>
Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: Lyude Paul <lyude@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Paul Mackerras <paulus@ozlabs.org>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
mm/migrate_device.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
--- a/mm/migrate_device.c~mm-migrate_devicec-flush-tlb-while-holding-ptl
+++ a/mm/migrate_device.c
@@ -254,13 +254,14 @@ next:
 		migrate->dst[migrate->npages] = 0;
 		migrate->src[migrate->npages++] = mpfn;
 	}
-	arch_leave_lazy_mmu_mode();
-	pte_unmap_unlock(ptep - 1, ptl);
 
 	/* Only flush the TLB if we actually modified any entries */
 	if (unmapped)
 		flush_tlb_range(walk->vma, start, end);
 
+	arch_leave_lazy_mmu_mode();
+	pte_unmap_unlock(ptep - 1, ptl);
+
 	return 0;
 }
_