The patch titled
Subject: mm/damon/dbgfs: fix memory leak when using debugfs_lookup()
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-damon-dbgfs-fix-memory-leak-when-using.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Subject: mm/damon/dbgfs: fix memory leak when using debugfs_lookup()
Date: Fri, 2 Sep 2022 19:11:49 +0000
When calling debugfs_lookup(), the result must have dput() called on it; otherwise the memory will leak over time. Fix this up by properly calling dput().
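For reference, the fix follows the usual debugfs_lookup()/dput() pairing. A minimal sketch with illustrative names (not the actual DAMON code):

    #include <linux/debugfs.h>
    #include <linux/dcache.h>

    /* debugfs_lookup() returns a dentry with an elevated refcount, so
     * every successful lookup must be balanced with a dput(). */
    static int example_lookup(struct dentry *parent)
    {
            struct dentry *dir;

            dir = debugfs_lookup("ctx", parent);
            if (!dir)
                    return -ENOENT;

            /* ... use dir ... */

            dput(dir);      /* drop the reference taken by the lookup */
            return 0;
    }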
Link: https://lkml.kernel.org/r/20220902191149.112434-1-sj@kernel.org
Fixes: 75c1c2b53c78b ("mm/damon/dbgfs: support multiple contexts")
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Signed-off-by: SeongJae Park <sj(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/damon/dbgfs.c | 19 ++++++++++++++-----
1 file changed, 14 insertions(+), 5 deletions(-)
--- a/mm/damon/dbgfs.c~mm-damon-dbgfs-fix-memory-leak-when-using
+++ a/mm/damon/dbgfs.c
@@ -884,6 +884,7 @@ static int dbgfs_rm_context(char *name)
struct dentry *root, *dir, **new_dirs;
struct damon_ctx **new_ctxs;
int i, j;
+ int ret = 0;
if (damon_nr_running_ctxs())
return -EBUSY;
@@ -898,14 +899,16 @@ static int dbgfs_rm_context(char *name)
new_dirs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_dirs),
GFP_KERNEL);
- if (!new_dirs)
- return -ENOMEM;
+ if (!new_dirs) {
+ ret = -ENOMEM;
+ goto out_dput;
+ }
new_ctxs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_ctxs),
GFP_KERNEL);
if (!new_ctxs) {
- kfree(new_dirs);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out_new_dirs;
}
for (i = 0, j = 0; i < dbgfs_nr_ctxs; i++) {
@@ -925,7 +928,13 @@ static int dbgfs_rm_context(char *name)
dbgfs_ctxs = new_ctxs;
dbgfs_nr_ctxs--;
- return 0;
+ goto out_dput;
+
+out_new_dirs:
+ kfree(new_dirs);
+out_dput:
+ dput(dir);
+ return ret;
}
static ssize_t dbgfs_rm_context_write(struct file *file,
_
Patches currently in -mm which might be from gregkh(a)linuxfoundation.org are
mm-damon-dbgfs-fix-memory-leak-when-using.patch
In sgx_init(), if misc_register() fails or misc_register() succeeds but
neither sgx_drv_init() nor sgx_vepc_init() succeeds, then ksgxd will be
prematurely stopped. This may leave some unsanitized pages, which does
not matter, because SGX will be disabled for the whole power cycle.
This triggers WARN_ON() because sgx_dirty_page_list ends up being
non-empty, and dumps the call stack:
[ 0.268103] sgx: EPC section 0x40200000-0x45f7ffff
[ 0.268591] ------------[ cut here ]------------
[ 0.268592] WARNING: CPU: 6 PID: 83 at arch/x86/kernel/cpu/sgx/main.c:401 ksgxd+0x1b7/0x1d0
[ 0.268598] Modules linked in:
[ 0.268600] CPU: 6 PID: 83 Comm: ksgxd Not tainted 6.0.0-rc2 #382
[ 0.268603] Hardware name: Dell Inc. XPS 13 9370/0RMYH9, BIOS 1.21.0 07/06/2022
[ 0.268604] RIP: 0010:ksgxd+0x1b7/0x1d0
[ 0.268607] Code: ff e9 f2 fe ff ff 48 89 df e8 75 07 0e 00 84 c0 0f 84 c3 fe ff ff 31 ff e8 e6 07 0e 00 84 c0 0f 85 94 fe ff ff e9 af fe ff ff <0f> 0b e9 7f fe ff ff e8 dd 9c 95 00 66 66 2e 0f 1f 84 00 00 00 00
[ 0.268608] RSP: 0000:ffffb6c7404f3ed8 EFLAGS: 00010287
[ 0.268610] RAX: ffffb6c740431a10 RBX: ffff8dcd8117b400 RCX: 0000000000000000
[ 0.268612] RDX: 0000000080000000 RSI: ffffb6c7404319d0 RDI: 00000000ffffffff
[ 0.268613] RBP: ffff8dcd820a4d80 R08: ffff8dcd820a4180 R09: ffff8dcd820a4180
[ 0.268614] R10: 0000000000000000 R11: 0000000000000006 R12: ffffb6c74006bce0
[ 0.268615] R13: ffff8dcd80e63880 R14: ffffffffa8a60f10 R15: 0000000000000000
[ 0.268616] FS: 0000000000000000(0000) GS:ffff8dcf25580000(0000) knlGS:0000000000000000
[ 0.268617] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 0.268619] CR2: 0000000000000000 CR3: 0000000213410001 CR4: 00000000003706e0
[ 0.268620] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 0.268621] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 0.268622] Call Trace:
[ 0.268624] <TASK>
[ 0.268627] ? _raw_spin_lock_irqsave+0x24/0x60
[ 0.268632] ? _raw_spin_unlock_irqrestore+0x23/0x40
[ 0.268634] ? __kthread_parkme+0x36/0x90
[ 0.268637] kthread+0xe5/0x110
[ 0.268639] ? kthread_complete_and_exit+0x20/0x20
[ 0.268642] ret_from_fork+0x1f/0x30
[ 0.268647] </TASK>
[ 0.268648] ---[ end trace 0000000000000000 ]---
Ultimately this can crash the kernel if /proc/sys/kernel/panic_on_warn is set.
On a premature stop, print nothing, as the count is for all practical purposes a random number. Otherwise, it indicates a bug in the driver, so print the number of unsanitized pages with pr_err().
Link: https://lore.kernel.org/linux-sgx/20220825051827.246698-1-jarkko@kernel.org…
Fixes: 51ab30eb2ad4 ("x86/sgx: Replace section->init_laundry_list with sgx_dirty_page_list")
Cc: stable(a)vger.kernel.org # v5.13+
Reported-by: Paul Menzel <pmenzel(a)molgen.mpg.de>
Signed-off-by: Jarkko Sakkinen <jarkko(a)kernel.org>
---
v6:
- Address Reinette's feedback:
https://lore.kernel.org/linux-sgx/Yw6%2FiTzSdSw%2FY%2FVO@kernel.org/
v5:
- Add the klog dump and sysctl option to the commit message.
v4:
- Explain expectations for dirty_page_list in the function header, instead
of an inline comment.
- Improve commit message to explain the conditions better.
- Return the number of pages left dirty to ksgxd() and print warning after
the 2nd call, if there are any.
v3:
- Remove WARN_ON().
- Tuned comments and the commit message a bit.
v2:
- Replaced WARN_ON() with optional pr_info() inside
__sgx_sanitize_pages().
- Rewrote the commit message.
- Added the fixes tag.
---
arch/x86/kernel/cpu/sgx/main.c | 42 ++++++++++++++++++++++++++++------
1 file changed, 35 insertions(+), 7 deletions(-)
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 515e2a5f25bb..bcd6b64961bd 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -49,17 +49,20 @@ static LIST_HEAD(sgx_dirty_page_list);
* Reset post-kexec EPC pages to the uninitialized state. The pages are removed
* from the input list, and made available for the page allocator. SECS pages
* prepending their children in the input list are left intact.
+ *
+ * Contents of the @dirty_page_list must be thread-local, i.e.
+ * not shared by multiple threads.
*/
-static void __sgx_sanitize_pages(struct list_head *dirty_page_list)
+static long __sgx_sanitize_pages(struct list_head *dirty_page_list)
{
struct sgx_epc_page *page;
+ long left_dirty = 0;
LIST_HEAD(dirty);
int ret;
- /* dirty_page_list is thread-local, no need for a lock: */
while (!list_empty(dirty_page_list)) {
if (kthread_should_stop())
- return;
+ return -ECANCELED;
page = list_first_entry(dirty_page_list, struct sgx_epc_page, list);
@@ -92,12 +95,14 @@ static void __sgx_sanitize_pages(struct list_head *dirty_page_list)
} else {
/* The page is not yet clean - move to the dirty list. */
list_move_tail(&page->list, &dirty);
+ left_dirty++;
}
cond_resched();
}
list_splice(&dirty, dirty_page_list);
+ return left_dirty;
}
static bool sgx_reclaimer_age(struct sgx_epc_page *epc_page)
@@ -388,17 +393,40 @@ void sgx_reclaim_direct(void)
static int ksgxd(void *p)
{
+ long ret;
+
set_freezable();
/*
* Sanitize pages in order to recover from kexec(). The 2nd pass is
* required for SECS pages, whose child pages blocked EREMOVE.
*/
- __sgx_sanitize_pages(&sgx_dirty_page_list);
- __sgx_sanitize_pages(&sgx_dirty_page_list);
+ ret = __sgx_sanitize_pages(&sgx_dirty_page_list);
+ if (ret == -ECANCELED)
+ /* kthread stopped */
+ return 0;
- /* sanity check: */
- WARN_ON(!list_empty(&sgx_dirty_page_list));
+ ret = __sgx_sanitize_pages(&sgx_dirty_page_list);
+ switch (ret) {
+ case 0:
+ /* success, no unsanitized pages */
+ break;
+
+ case -ECANCELED:
+ /* kthread stopped */
+ return 0;
+
+ default:
+ /*
+ * Never expected to happen in a working driver. If it happens
+ * the bug is expected to be in the sanitization process, but
+ * successfully sanitized pages are still valid and driver can
+ * be used and most importantly debugged without issues. To put
+ * short, the global state of kernel is not corrupted so no
+ * reason to do any more complicated rollback.
+ */
+ pr_err("%ld unsanitized pages\n", ret);
+ }
while (!kthread_should_stop()) {
if (try_to_freeze())
--
2.37.2
The patch titled
Subject: mm/migrate_device.c: copy pte dirty bit to page
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-migrate_devicec-copy-pte-dirty-bit-to-page.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Alistair Popple <apopple(a)nvidia.com>
Subject: mm/migrate_device.c: copy pte dirty bit to page
Date: Fri, 2 Sep 2022 10:35:53 +1000
migrate_vma_setup() has a fast path in migrate_vma_collect_pmd() that
installs migration entries directly if it can lock the migrating page.
When removing a dirty pte the dirty bit is supposed to be carried over to
the underlying page to prevent it being lost.
Currently migrate_vma_*() can only be used for private anonymous mappings.
That means loss of the dirty bit usually doesn't result in data loss
because these pages are typically not file-backed. However pages may be
backed by swap storage which can result in data loss if an attempt is made
to migrate a dirty page that doesn't yet have the PageDirty flag set.
In this case migration will fail due to unexpected references but the
dirty pte bit will be lost. If the page is subsequently reclaimed data
won't be written back to swap storage as it is considered uptodate,
resulting in data loss if the page is subsequently accessed.
Prevent this by copying the dirty bit to the page when removing the pte to
match what try_to_migrate_one() does.
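Condensed from the hunk below, the fix boils down to capturing the old PTE value and propagating its dirty bit (a sketch, not the full context):

    pte = ptep_get_and_clear(mm, addr, ptep);
    ...
    /* Set the dirty flag on the folio now the pte is gone. */
    if (pte_dirty(pte))
            folio_mark_dirty(page_folio(page));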
Link: https://lkml.kernel.org/r/dd48e4882ce859c295c1a77612f66d198b0403f9.16620785…
Fixes: 8c3328f1f36a ("mm/migrate: migrate_vma() unmap page from vma while collecting pages")
Signed-off-by: Alistair Popple <apopple(a)nvidia.com>
Acked-by: Peter Xu <peterx(a)redhat.com>
Reviewed-by: "Huang, Ying" <ying.huang(a)intel.com>
Reported-by: "Huang, Ying" <ying.huang(a)intel.com>
Acked-by: David Hildenbrand <david(a)redhat.com>
Cc: Alex Sierra <alex.sierra(a)amd.com>
Cc: Ben Skeggs <bskeggs(a)redhat.com>
Cc: Felix Kuehling <Felix.Kuehling(a)amd.com>
Cc: huang ying <huang.ying.caritas(a)gmail.com>
Cc: Jason Gunthorpe <jgg(a)nvidia.com>
Cc: John Hubbard <jhubbard(a)nvidia.com>
Cc: Karol Herbst <kherbst(a)redhat.com>
Cc: Logan Gunthorpe <logang(a)deltatee.com>
Cc: Lyude Paul <lyude(a)redhat.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Nadav Amit <nadav.amit(a)gmail.com>
Cc: Paul Mackerras <paulus(a)ozlabs.org>
Cc: Ralph Campbell <rcampbell(a)nvidia.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/migrate_device.c | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
--- a/mm/migrate_device.c~mm-migrate_devicec-copy-pte-dirty-bit-to-page
+++ a/mm/migrate_device.c
@@ -7,6 +7,7 @@
#include <linux/export.h>
#include <linux/memremap.h>
#include <linux/migrate.h>
+#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/oom.h>
@@ -196,7 +197,7 @@ again:
flush_cache_page(vma, addr, pte_pfn(*ptep));
anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
if (anon_exclusive) {
- ptep_clear_flush(vma, addr, ptep);
+ pte = ptep_clear_flush(vma, addr, ptep);
if (page_try_share_anon_rmap(page)) {
set_pte_at(mm, addr, ptep, pte);
@@ -206,11 +207,15 @@ again:
goto next;
}
} else {
- ptep_get_and_clear(mm, addr, ptep);
+ pte = ptep_get_and_clear(mm, addr, ptep);
}
migrate->cpages++;
+ /* Set the dirty flag on the folio now the pte is gone. */
+ if (pte_dirty(pte))
+ folio_mark_dirty(page_folio(page));
+
/* Setup special migration page table entry */
if (mpfn & MIGRATE_PFN_WRITE)
entry = make_writable_migration_entry(
_
Patches currently in -mm which might be from apopple(a)nvidia.com are
mm-migrate_devicec-flush-tlb-while-holding-ptl.patch
mm-migrate_devicec-add-missing-flush_cache_page.patch
mm-migrate_devicec-copy-pte-dirty-bit-to-page.patch
mm-gupc-simplify-and-fix-check_and_migrate_movable_pages-return-codes.patch
selftests-hmm-tests-add-test-for-dirty-bits.patch
mm-gupc-dont-pass-gup_flags-to-check_and_migrate_movable_pages.patch
mm-gupc-refactor-check_and_migrate_movable_pages.patch
mm-migrate_devicec-fix-a-misleading-and-out-dated-comment.patch
The patch titled
Subject: mm/migrate_device.c: add missing flush_cache_page()
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-migrate_devicec-add-missing-flush_cache_page.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Alistair Popple <apopple(a)nvidia.com>
Subject: mm/migrate_device.c: add missing flush_cache_page()
Date: Fri, 2 Sep 2022 10:35:52 +1000
Currently we only call flush_cache_page() for the anon_exclusive case; however, in both cases we clear the pte, so we should flush the cache.
Link: https://lkml.kernel.org/r/5676f30436ab71d1a587ac73f835ed8bd2113ff5.16620785…
Fixes: 8c3328f1f36a ("mm/migrate: migrate_vma() unmap page from vma while collecting pages")
Signed-off-by: Alistair Popple <apopple(a)nvidia.com>
Reviewed-by: David Hildenbrand <david(a)redhat.com>
Cc: Alex Sierra <alex.sierra(a)amd.com>
Cc: Ben Skeggs <bskeggs(a)redhat.com>
Cc: Felix Kuehling <Felix.Kuehling(a)amd.com>
Cc: huang ying <huang.ying.caritas(a)gmail.com>
Cc: "Huang, Ying" <ying.huang(a)intel.com>
Cc: Jason Gunthorpe <jgg(a)nvidia.com>
Cc: John Hubbard <jhubbard(a)nvidia.com>
Cc: Karol Herbst <kherbst(a)redhat.com>
Cc: Logan Gunthorpe <logang(a)deltatee.com>
Cc: Lyude Paul <lyude(a)redhat.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Nadav Amit <nadav.amit(a)gmail.com>
Cc: Paul Mackerras <paulus(a)ozlabs.org>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: Ralph Campbell <rcampbell(a)nvidia.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/migrate_device.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/mm/migrate_device.c~mm-migrate_devicec-add-missing-flush_cache_page
+++ a/mm/migrate_device.c
@@ -193,9 +193,9 @@ again:
bool anon_exclusive;
pte_t swp_pte;
+ flush_cache_page(vma, addr, pte_pfn(*ptep));
anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
if (anon_exclusive) {
- flush_cache_page(vma, addr, pte_pfn(*ptep));
ptep_clear_flush(vma, addr, ptep);
if (page_try_share_anon_rmap(page)) {
_
Patches currently in -mm which might be from apopple(a)nvidia.com are
mm-migrate_devicec-flush-tlb-while-holding-ptl.patch
mm-migrate_devicec-add-missing-flush_cache_page.patch
mm-migrate_devicec-copy-pte-dirty-bit-to-page.patch
mm-gupc-simplify-and-fix-check_and_migrate_movable_pages-return-codes.patch
selftests-hmm-tests-add-test-for-dirty-bits.patch
mm-gupc-dont-pass-gup_flags-to-check_and_migrate_movable_pages.patch
mm-gupc-refactor-check_and_migrate_movable_pages.patch
mm-migrate_devicec-fix-a-misleading-and-out-dated-comment.patch
The patch titled
Subject: mm/migrate_device.c: flush TLB while holding PTL
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-migrate_devicec-flush-tlb-while-holding-ptl.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Alistair Popple <apopple(a)nvidia.com>
Subject: mm/migrate_device.c: flush TLB while holding PTL
Date: Fri, 2 Sep 2022 10:35:51 +1000
When clearing a PTE the TLB should be flushed whilst still holding the PTL
to avoid a potential race with madvise/munmap/etc. For example consider
the following sequence:
CPU0 CPU1
---- ----
migrate_vma_collect_pmd()
pte_unmap_unlock()
madvise(MADV_DONTNEED)
-> zap_pte_range()
pte_offset_map_lock()
[ PTE not present, TLB not flushed ]
pte_unmap_unlock()
[ page is still accessible via stale TLB ]
flush_tlb_range()
In this case the page may still be accessed via the stale TLB entry after
madvise returns. Fix this by flushing the TLB while holding the PTL.
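Condensed from the hunk below, the corrected tear-down order is (sketch):

    /* Only flush the TLB if we actually modified any entries */
    if (unmapped)
            flush_tlb_range(walk->vma, start, end);  /* still holding the PTL */

    arch_leave_lazy_mmu_mode();
    pte_unmap_unlock(ptep - 1, ptl);                 /* drop the PTL after the flush */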
Fixes: 8c3328f1f36a ("mm/migrate: migrate_vma() unmap page from vma while collecting pages")
Link: https://lkml.kernel.org/r/9f801e9d8d830408f2ca27821f606e09aa856899.16620785…
Signed-off-by: Alistair Popple <apopple(a)nvidia.com>
Reported-by: Nadav Amit <nadav.amit(a)gmail.com>
Reviewed-by: "Huang, Ying" <ying.huang(a)intel.com>
Acked-by: David Hildenbrand <david(a)redhat.com>
Cc: Alex Sierra <alex.sierra(a)amd.com>
Cc: Ben Skeggs <bskeggs(a)redhat.com>
Cc: Felix Kuehling <Felix.Kuehling(a)amd.com>
Cc: huang ying <huang.ying.caritas(a)gmail.com>
Cc: Jason Gunthorpe <jgg(a)nvidia.com>
Cc: John Hubbard <jhubbard(a)nvidia.com>
Cc: Karol Herbst <kherbst(a)redhat.com>
Cc: Logan Gunthorpe <logang(a)deltatee.com>
Cc: Lyude Paul <lyude(a)redhat.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Paul Mackerras <paulus(a)ozlabs.org>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: Ralph Campbell <rcampbell(a)nvidia.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/migrate_device.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
--- a/mm/migrate_device.c~mm-migrate_devicec-flush-tlb-while-holding-ptl
+++ a/mm/migrate_device.c
@@ -254,13 +254,14 @@ next:
migrate->dst[migrate->npages] = 0;
migrate->src[migrate->npages++] = mpfn;
}
- arch_leave_lazy_mmu_mode();
- pte_unmap_unlock(ptep - 1, ptl);
/* Only flush the TLB if we actually modified any entries */
if (unmapped)
flush_tlb_range(walk->vma, start, end);
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(ptep - 1, ptl);
+
return 0;
}
_
Patches currently in -mm which might be from apopple(a)nvidia.com are
mm-migrate_devicec-flush-tlb-while-holding-ptl.patch
mm-migrate_devicec-add-missing-flush_cache_page.patch
mm-migrate_devicec-copy-pte-dirty-bit-to-page.patch
mm-gupc-simplify-and-fix-check_and_migrate_movable_pages-return-codes.patch
selftests-hmm-tests-add-test-for-dirty-bits.patch
mm-gupc-dont-pass-gup_flags-to-check_and_migrate_movable_pages.patch
mm-gupc-refactor-check_and_migrate_movable_pages.patch
mm-migrate_devicec-fix-a-misleading-and-out-dated-comment.patch
The advertisement of the persistent grants feature (writing 'feature-persistent' to xenbus) should indicate only the availability of the feature, not the decision to use it. However, commit aac8a70db24b ("xen-blkback: add a parameter for disabling of persistent grants") made a field of blkback, which was a place for saving only the negotiation result, serve yet another purpose: caching the 'feature_persistent' parameter value. As a result, the advertisement, which should follow only the parameter value, becomes inconsistent.
This commit fixes the misuse of the semantics by making blkback save the parameter value in a separate place and advertise the support based only on that saved value.
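In code terms (condensed from the hunk below), the parameter value is cached at connect time and the negotiation result is derived from it:

    blkif->vbd.feature_gnt_persistent_parm = feature_persistent;
    blkif->vbd.feature_gnt_persistent =
            blkif->vbd.feature_gnt_persistent_parm &&
            xenbus_read_unsigned(dev->otherend, "feature-persistent", 0);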
Fixes: aac8a70db24b ("xen-blkback: add a parameter for disabling of persistent grants")
Cc: <stable(a)vger.kernel.org> # 5.10.x
Suggested-by: Juergen Gross <jgross(a)suse.com>
Signed-off-by: SeongJae Park <sj(a)kernel.org>
---
drivers/block/xen-blkback/common.h | 3 +++
drivers/block/xen-blkback/xenbus.c | 6 ++++--
2 files changed, 7 insertions(+), 2 deletions(-)
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
index bda5c815e441..a28473470e66 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -226,6 +226,9 @@ struct xen_vbd {
sector_t size;
unsigned int flush_support:1;
unsigned int discard_secure:1;
+ /* Connect-time cached feature_persistent parameter value */
+ unsigned int feature_gnt_persistent_parm:1;
+ /* Persistent grants feature negotiation result */
unsigned int feature_gnt_persistent:1;
unsigned int overflow_max_grants:1;
};
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index ee7ad2fb432d..c0227dfa4688 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -907,7 +907,7 @@ static void connect(struct backend_info *be)
xen_blkbk_barrier(xbt, be, be->blkif->vbd.flush_support);
err = xenbus_printf(xbt, dev->nodename, "feature-persistent", "%u",
- be->blkif->vbd.feature_gnt_persistent);
+ be->blkif->vbd.feature_gnt_persistent_parm);
if (err) {
xenbus_dev_fatal(dev, err, "writing %s/feature-persistent",
dev->nodename);
@@ -1085,7 +1085,9 @@ static int connect_ring(struct backend_info *be)
return -ENOSYS;
}
- blkif->vbd.feature_gnt_persistent = feature_persistent &&
+ blkif->vbd.feature_gnt_persistent_parm = feature_persistent;
+ blkif->vbd.feature_gnt_persistent =
+ blkif->vbd.feature_gnt_persistent_parm &&
xenbus_read_unsigned(dev->otherend, "feature-persistent", 0);
blkif->vbd.overflow_max_grants = 0;
--
2.25.1
From: Ville Syrjälä <ville.syrjala(a)linux.intel.com>
A lot of modern laptops use the Parade PS8461E MUX for eDP
switching. The MUX can operate in jitter cleaning mode or
redriver mode, the first one resulting in higher link
quality. The jitter cleaning mode needs to know the link
rate used and the MUX achieves this by snooping the
LINK_BW_SET, LINK_RATE_SELECT and SUPPORTED_LINK_RATES
DPCD accesses.
When the MUX is powered down (which seems to happen whenever the display is turned off) it loses track of the snooped link rates, so when we do the LINK_RATE_SELECT write it no longer knows which link rate we're selecting, and thus falls back to the lower quality redriver mode. This results in unstable high link rates (e.g. usually the 8.1Gbps link rate no longer works correctly).
In order to avoid all that let's re-snoop SUPPORTED_LINK_RATES
from the sink at the start of every link training.
Unfortunately we don't have a way to detect the presence of
the MUX. It looks like the set of laptops equipped with this
MUX is fairly large and contains devices from multiple
manufacturers. It may also still be growing with new models.
So a quirk doesn't seem like a very easily maintainable
option, thus we shall attempt to do this unconditionally on
all machines that use LINK_RATE_SELECT. Hopefully this extra
DPCD read doesn't cause issues for any unaffected machine.
If that turns out to be the case we'll need to convert this
into a quirk in the future.
Cc: stable(a)vger.kernel.org
Cc: Jason A. Donenfeld <Jason(a)zx2c4.com>
Cc: Ankit Nautiyal <ankit.k.nautiyal(a)intel.com>
Cc: Jani Nikula <jani.nikula(a)intel.com>
Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/6205
Signed-off-by: Ville Syrjälä <ville.syrjala(a)linux.intel.com>
---
.../drm/i915/display/intel_dp_link_training.c | 22 +++++++++++++++++++
1 file changed, 22 insertions(+)
diff --git a/drivers/gpu/drm/i915/display/intel_dp_link_training.c b/drivers/gpu/drm/i915/display/intel_dp_link_training.c
index 9feaf1a589f3..d213d8ad1ea5 100644
--- a/drivers/gpu/drm/i915/display/intel_dp_link_training.c
+++ b/drivers/gpu/drm/i915/display/intel_dp_link_training.c
@@ -671,6 +671,28 @@ intel_dp_prepare_link_train(struct intel_dp *intel_dp,
intel_dp_compute_rate(intel_dp, crtc_state->port_clock,
&link_bw, &rate_select);
+ /*
+ * WaEdpLinkRateDataReload
+ *
+ * Parade PS8461E MUX (used on various TGL+ laptops) needs
+ * to snoop the link rates reported by the sink when we
+ * use LINK_RATE_SET in order to operate in jitter cleaning
+ * mode (as opposed to redriver mode). Unfortunately it
+ * loses track of the snooped link rates when powered down,
+ * so we need to make it re-snoop often. Without this high
+ * link rates are not stable.
+ */
+ if (!link_bw) {
+ struct intel_connector *connector = intel_dp->attached_connector;
+ __le16 sink_rates[DP_MAX_SUPPORTED_RATES];
+
+ drm_dbg_kms(&i915->drm, "[CONNECTOR:%d:%s] Reloading eDP link rates\n",
+ connector->base.base.id, connector->base.name);
+
+ drm_dp_dpcd_read(&intel_dp->aux, DP_SUPPORTED_LINK_RATES,
+ sink_rates, sizeof(sink_rates));
+ }
+
if (link_bw)
drm_dbg_kms(&i915->drm,
"[ENCODER:%d:%s] Using LINK_BW_SET value %02x\n",
--
2.35.1
Hi,
A sysfs file, /sys/bus/thunderbolt/devices/domainX/iommu_dma_protection, is exported by the kernel and used by userspace to decide whether to automatically authorize PCIe tunnels for USB4 devices.
Before kernel 5.19 this file was only populated on Intel USB4 and TBT3 controllers, but starting with 5.19 it is populated for non-Intel controllers as well.
This is accomplished by an assertion from the IOMMU subsystem, via a combination of firmware and hardware, that it is safe to do so.
Here is the patch series on top of 5.15.64:
3f6634d997dba8140b3a7cba01776b9638d70dff
ed36d04e8f8d7b00db451b0fa56a54e8e02ec43e
d0be55fbeb6ac694d15af5d1aad19cdec8cd64e5
f316ba0a8814f4c91e80a435da3421baf0ddd24c
f1ca70717bcb4525e29da422f3d280acbddb36fe
bfb3ba32061da1a9217ef6d02fbcffb528e4c8df
418e0a3551bbef5b221705b0e5b8412cdc0afd39
acdb89b6c87a2d7b5c48a82756e6f5c6f599f60a
ea4692c75e1c63926e4fb0728f5775ef0d733888
86eaf4a5b4312bea8676fb79399d9e08b53d8e71
Can you please consider backporting them to 5.15.y+?
I've confirmed with this series of patches that a Lenovo Z13 will populate the sysfs file and automatically authorize docks that are plugged in.
Thanks,
By rounding down, we can end up setting a device timeout shorter than the one requested, so a space just below the requested timeout is reported as a timeout rather than a space. Round up instead, so that the effective timeout is never shorter than requested.
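For illustration (assuming MCE_TIME_UNIT is 50 microseconds, as defined in mceusb.c): for a requested timeout of 120us, DIV_ROUND_CLOSEST(120, 50) yields 2 units (100us), so a 110us space exceeds the device timeout and is wrongly reported as a timeout; DIV_ROUND_UP(120, 50) yields 3 units (150us), which is never shorter than the requested value.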
Fixes: 877f1a7cee3f ("media: rc: mceusb: allow the timeout to be configurable")
Cc: stable(a)vger.kernel.org
Signed-off-by: Sean Young <sean(a)mess.org>
---
drivers/media/rc/mceusb.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/media/rc/mceusb.c b/drivers/media/rc/mceusb.c
index 39d2b03e2631..c76ba24c1f55 100644
--- a/drivers/media/rc/mceusb.c
+++ b/drivers/media/rc/mceusb.c
@@ -1077,7 +1077,7 @@ static int mceusb_set_timeout(struct rc_dev *dev, unsigned int timeout)
struct mceusb_dev *ir = dev->priv;
unsigned int units;
- units = DIV_ROUND_CLOSEST(timeout, MCE_TIME_UNIT);
+ units = DIV_ROUND_UP(timeout, MCE_TIME_UNIT);
cmdbuf[2] = units >> 8;
cmdbuf[3] = units;
--
2.37.2
Hi Greg,
This 5.10.y backport series contains fixes from v5.18 and v5.19-rc1.
The patches in this series have already been applied to 5.15.y in Leah's
latest update [1], so this 5.10.y series is mostly catching up with 5.15.y.
Thanks,
Amir.
Changes since v2:
- Drop 2 patches not in 5.15.y yet
Changes since v1:
- Added ACKs
- CC stable
[1] https://lore.kernel.org/linux-xfs/20220819181431.4113819-1-leah.rumancik@gm…
Amir Goldstein (1):
xfs: remove infinite loop when reserving free block pool
Brian Foster (1):
xfs: fix soft lockup via spinning in filestream ag selection loop
Darrick J. Wong (2):
xfs: always succeed at setting the reserve pool size
xfs: fix overfilling of reserve pool
Eric Sandeen (1):
xfs: revert "xfs: actually bump warning counts when we send warnings"
fs/xfs/xfs_filestream.c | 7 +++---
fs/xfs/xfs_fsops.c | 52 ++++++++++++++++------------------------
fs/xfs/xfs_mount.h | 8 +++++++
fs/xfs/xfs_trans_dquot.c | 1 -
4 files changed, 33 insertions(+), 35 deletions(-)
--
2.25.1
commit 6c287605fd56 ("mm: remember exclusively mapped anonymous pages with
PG_anon_exclusive") made sure that when PageAnonExclusive() has to be
cleared during temporary unmapping of a page, the PTE is
cleared/invalidated and the TLB is flushed.
What we want to achieve in all cases is that we cannot end up with a pin on
an anonymous page that may be shared, because such pins would be
unreliable and could result in memory corruptions when the mapped page
and the pin go out of sync due to a write fault.
That TLB flush handling was inspired by an outdated comment in
mm/ksm.c:write_protect_page(), which similarly required the TLB flush in
the past to synchronize with GUP-fast. However, ever since general RCU GUP
fast was introduced in commit 2667f50e8b81 ("mm: introduce a general RCU
get_user_pages_fast()"), a TLB flush is no longer sufficient to handle
concurrent GUP-fast in all cases -- it only handles traditional IPI-based
GUP-fast correctly.
Peter Xu (thankfully) questioned whether that TLB flush is really
required. On architectures that send an IPI broadcast on TLB flush,
it works as expected. To synchronize with RCU GUP-fast properly, we're
conceptually fine; however, we have to enforce a certain memory order and
are currently missing memory barriers.
Let's document that, avoid the TLB flush where possible and use proper
explicit memory barriers where required. We shouldn't really care about the
additional memory barriers here, as we're not on extremely hot paths --
and we're getting rid of some TLB flushes.
We use a smp_mb() pair for handling concurrent pinning and a
smp_rmb()/smp_wmb() pair for handling the corner case of only temporary
PTE changes but permanent PageAnonExclusive changes.
One extreme example, whereby GUP-fast takes a R/O pin and KSM wants to
convert an exclusive anonymous page to a KSM page, and that page is already
mapped write-protected (-> no PTE change) would be:
Thread 0 (KSM) Thread 1 (GUP-fast)
(B1) Read the PTE
# (B2) skipped without FOLL_WRITE
(A1) Clear PTE
smp_mb()
(A2) Check pinned
(B3) Pin the mapped page
smp_mb()
(A3) Clear PageAnonExclusive
smp_wmb()
(A4) Restore PTE
(B4) Check if the PTE changed
smp_rmb()
(B5) Check PageAnonExclusive
Thread 1 will properly detect that PageAnonExclusive was cleared and
back off.
Note that we don't need a memory barrier between checking if the page is
pinned and clearing PageAnonExclusive, because stores are not
speculated.
The possible issues due to reordering are of theoretical nature so far
and attempts to reproduce the race failed.
Especially the "no PTE change" case isn't the common case, because we'd
need an exclusive anonymous page that's mapped R/O and the PTE is clean
in KSM code -- and using KSM with page pinning isn't extremely common.
Further, the clear+TLB flush we used for now implies a memory barrier.
So the problematic missing part should be the missing memory barrier
after pinning but before checking if the PTE changed.
Fixes: 6c287605fd56 ("mm: remember exclusively mapped anonymous pages with PG_anon_exclusive")
Cc: <stable(a)vger.kernel.org>
Cc: Andrew Morton <akpm(a)linux-foundation.org>
Cc: Jason Gunthorpe <jgg(a)nvidia.com>
Cc: John Hubbard <jhubbard(a)nvidia.com>
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: Alistair Popple <apopple(a)nvidia.com>
Cc: Nadav Amit <namit(a)vmware.com>
Cc: Yang Shi <shy828301(a)gmail.com>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: Michal Hocko <mhocko(a)kernel.org>
Cc: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Andrea Parri <parri.andrea(a)gmail.com>
Cc: Will Deacon <will(a)kernel.org>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: "Paul E. McKenney" <paulmck(a)kernel.org>
Cc: Christoph von Recklinghausen <crecklin(a)redhat.com>
Cc: Don Dutile <ddutile(a)redhat.com>
Signed-off-by: David Hildenbrand <david(a)redhat.com>
---
include/linux/mm.h | 9 ++++--
include/linux/rmap.h | 66 ++++++++++++++++++++++++++++++++++++++++----
mm/gup.c | 7 +++++
mm/huge_memory.c | 3 ++
mm/ksm.c | 1 +
mm/migrate_device.c | 23 +++++++--------
mm/rmap.c | 11 ++++----
7 files changed, 95 insertions(+), 25 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 21f8b27bd9fd..9641a2a488c1 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2975,8 +2975,8 @@ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags)
* PageAnonExclusive() has to protect against concurrent GUP:
* * Ordinary GUP: Using the PT lock
* * GUP-fast and fork(): mm->write_protect_seq
- * * GUP-fast and KSM or temporary unmapping (swap, migration):
- * clear/invalidate+flush of the page table entry
+ * * GUP-fast and KSM or temporary unmapping (swap, migration): see
+ * page_try_share_anon_rmap()
*
* Must be called with the (sub)page that's actually referenced via the
* page table entry, which might not necessarily be the head page for a
@@ -2997,6 +2997,11 @@ static inline bool gup_must_unshare(unsigned int flags, struct page *page)
*/
if (!PageAnon(page))
return false;
+
+ /* Paired with a memory barrier in page_try_share_anon_rmap(). */
+ if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
+ smp_rmb();
+
/*
* Note that PageKsm() pages cannot be exclusive, and consequently,
* cannot get pinned.
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index bf80adca980b..72b2bcc37f73 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -267,7 +267,7 @@ static inline int page_try_dup_anon_rmap(struct page *page, bool compound,
* @page: the exclusive anonymous page to try marking possibly shared
*
* The caller needs to hold the PT lock and has to have the page table entry
- * cleared/invalidated+flushed, to properly sync against GUP-fast.
+ * cleared/invalidated.
*
* This is similar to page_try_dup_anon_rmap(), however, not used during fork()
* to duplicate a mapping, but instead to prepare for KSM or temporarily
@@ -283,12 +283,68 @@ static inline int page_try_share_anon_rmap(struct page *page)
{
VM_BUG_ON_PAGE(!PageAnon(page) || !PageAnonExclusive(page), page);
- /* See page_try_dup_anon_rmap(). */
- if (likely(!is_device_private_page(page) &&
- unlikely(page_maybe_dma_pinned(page))))
- return -EBUSY;
+ /* device private pages cannot get pinned via GUP. */
+ if (unlikely(is_device_private_page(page))) {
+ ClearPageAnonExclusive(page);
+ return 0;
+ }
+ /*
+ * We have to make sure that when we clear PageAnonExclusive, that
+ * the page is not pinned and that concurrent GUP-fast won't succeed in
+ * concurrently pinning the page.
+ *
+ * Conceptually, PageAnonExclusive clearing consists of:
+ * (A1) Clear PTE
+ * (A2) Check if the page is pinned; back off if so.
+ * (A3) Clear PageAnonExclusive
+ * (A4) Restore PTE (optional, but certainly not writable)
+ *
+ * When clearing PageAnonExclusive, we cannot possibly map the page
+ * writable again, because anon pages that may be shared must never
+ * be writable. So in any case, if the PTE was writable it cannot
+ * be writable anymore afterwards and there would be a PTE change. Only
+ * if the PTE wasn't writable, there might not be a PTE change.
+ *
+ * Conceptually, GUP-fast pinning of an anon page consists of:
+ * (B1) Read the PTE
+ * (B2) FOLL_WRITE: check if the PTE is not writable; back off if so.
+ * (B3) Pin the mapped page
+ * (B4) Check if the PTE changed by re-reading it; back off if so.
+ * (B5) If the original PTE is not writable, check if
+ * PageAnonExclusive is not set; back off if so.
+ *
+ * If the PTE was writable, we only have to make sure that GUP-fast
+ * observes a PTE change and properly backs off.
+ *
+ * If the PTE was not writable, we have to make sure that GUP-fast either
+ * detects a (temporary) PTE change or that PageAnonExclusive is cleared
+ * and properly backs off.
+ *
+ * Consequently, when clearing PageAnonExclusive(), we have to make
+ * sure that (A1), (A2)/(A3) and (A4) happen in the right memory
+ * order. In GUP-fast pinning code, we have to make sure that (B3),(B4)
+ * and (B5) happen in the right memory order.
+ *
+ * We assume that there might not be a memory barrier after
+ * clearing/invalidating the PTE (A1) and before restoring the PTE (A4),
+ * so we use explicit ones here.
+ */
+
+ /* Paired with the memory barrier in try_grab_folio(). */
+ if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
+ smp_mb();
+
+ if (unlikely(page_maybe_dma_pinned(page)))
+ return -EBUSY;
ClearPageAnonExclusive(page);
+
+ /*
+ * This is conceptually a smp_wmb() paired with the smp_rmb() in
+ * gup_must_unshare().
+ */
+ if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
+ smp_mb__after_atomic();
return 0;
}
diff --git a/mm/gup.c b/mm/gup.c
index 5abdaf487460..750fb317a41a 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -158,6 +158,13 @@ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags)
else
folio_ref_add(folio,
refs * (GUP_PIN_COUNTING_BIAS - 1));
+ /*
+ * Adjust the pincount before re-checking the PTE for changes.
+ * This is essentially a smp_mb() and is paired with a memory
+ * barrier in page_try_share_anon_rmap().
+ */
+ smp_mb__after_atomic();
+
node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);
return folio;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e9414ee57c5b..2aef8d76fcf2 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2140,6 +2140,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
*
* In case we cannot clear PageAnonExclusive(), split the PMD
* only and let try_to_migrate_one() fail later.
+ *
+ * See page_try_share_anon_rmap(): invalidate PMD first.
*/
anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
if (freeze && anon_exclusive && page_try_share_anon_rmap(page))
@@ -3177,6 +3179,7 @@ int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
+ /* See page_try_share_anon_rmap(): invalidate PMD first. */
anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
if (anon_exclusive && page_try_share_anon_rmap(page)) {
set_pmd_at(mm, address, pvmw->pmd, pmdval);
diff --git a/mm/ksm.c b/mm/ksm.c
index 42ab153335a2..27e067f77097 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1095,6 +1095,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
goto out_unlock;
}
+ /* See page_try_share_anon_rmap(): clear PTE first. */
if (anon_exclusive && page_try_share_anon_rmap(page)) {
set_pte_at(mm, pvmw.address, pvmw.pte, entry);
goto out_unlock;
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 27fb37d65476..0858bb4ba584 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -193,20 +193,17 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
bool anon_exclusive;
pte_t swp_pte;
+ flush_cache_page(vma, addr, pte_pfn(*ptep));
+ ptep_get_and_clear(mm, addr, ptep);
+
+ /* See page_try_share_anon_rmap(): clear PTE first. */
anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
- if (anon_exclusive) {
- flush_cache_page(vma, addr, pte_pfn(*ptep));
- ptep_clear_flush(vma, addr, ptep);
-
- if (page_try_share_anon_rmap(page)) {
- set_pte_at(mm, addr, ptep, pte);
- unlock_page(page);
- put_page(page);
- mpfn = 0;
- goto next;
- }
- } else {
- ptep_get_and_clear(mm, addr, ptep);
+ if (anon_exclusive && page_try_share_anon_rmap(page)) {
+ set_pte_at(mm, addr, ptep, pte);
+ unlock_page(page);
+ put_page(page);
+ mpfn = 0;
+ goto next;
}
migrate->cpages++;
diff --git a/mm/rmap.c b/mm/rmap.c
index edc06c52bc82..b3a21ea9f3d0 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1579,11 +1579,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
} else {
flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
- /*
- * Nuke the page table entry. When having to clear
- * PageAnonExclusive(), we always have to flush.
- */
- if (should_defer_flush(mm, flags) && !anon_exclusive) {
+ /* Nuke the page table entry. */
+ if (should_defer_flush(mm, flags)) {
/*
* We clear the PTE but do not flush so potentially
* a remote CPU could still be writing to the folio.
@@ -1714,6 +1711,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
page_vma_mapped_walk_done(&pvmw);
break;
}
+
+ /* See page_try_share_anon_rmap(): clear PTE first. */
if (anon_exclusive &&
page_try_share_anon_rmap(subpage)) {
swap_free(entry);
@@ -2045,6 +2044,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
}
VM_BUG_ON_PAGE(pte_write(pteval) && folio_test_anon(folio) &&
!anon_exclusive, subpage);
+
+ /* See page_try_share_anon_rmap(): clear PTE first. */
if (anon_exclusive &&
page_try_share_anon_rmap(subpage)) {
if (folio_test_hugetlb(folio))
--
2.37.1
From: Omar Sandoval <osandov(a)fb.com>
commit ced8ecf026fd8084cf175530ff85c76d6085d715 upstream.
When testing space_cache v2 on a large set of machines, we encountered a
few symptoms:
1. "unable to add free space :-17" (EEXIST) errors.
2. Missing free space info items, sometimes caught with a "missing free
space info for X" error.
3. Double-accounted space: ranges that were allocated in the extent tree
and also marked as free in the free space tree, ranges that were
marked as allocated twice in the extent tree, or ranges that were
marked as free twice in the free space tree. If the latter made it
onto disk, the next reboot would hit the BUG_ON() in
add_new_free_space().
4. On some hosts with no on-disk corruption or error messages, the
in-memory space cache (dumped with drgn) disagreed with the free
space tree.
All of these symptoms have the same underlying cause: a race between
caching the free space for a block group and returning free space to the
in-memory space cache for pinned extents causes us to double-add a free
range to the space cache. This race exists when free space is cached
from the free space tree (space_cache=v2) or the extent tree
(nospace_cache, or space_cache=v1 if the cache needs to be regenerated).
struct btrfs_block_group::last_byte_to_unpin and struct
btrfs_block_group::progress are supposed to protect against this race,
but commit d0c2f4fa555e ("btrfs: make concurrent fsyncs wait less when
waiting for a transaction commit") subtly broke this by allowing
multiple transactions to be unpinning extents at the same time.
Specifically, the race is as follows:
1. An extent is deleted from an uncached block group in transaction A.
2. btrfs_commit_transaction() is called for transaction A.
3. btrfs_run_delayed_refs() -> __btrfs_free_extent() runs the delayed
ref for the deleted extent.
4. __btrfs_free_extent() -> do_free_extent_accounting() ->
add_to_free_space_tree() adds the deleted extent back to the free
space tree.
5. do_free_extent_accounting() -> btrfs_update_block_group() ->
btrfs_cache_block_group() queues up the block group to get cached.
block_group->progress is set to block_group->start.
6. btrfs_commit_transaction() for transaction A calls
switch_commit_roots(). It sets block_group->last_byte_to_unpin to
block_group->progress, which is block_group->start because the block
group hasn't been cached yet.
7. The caching thread gets to our block group. Since the commit roots
were already switched, load_free_space_tree() sees the deleted extent
as free and adds it to the space cache. It finishes caching and sets
block_group->progress to U64_MAX.
8. btrfs_commit_transaction() advances transaction A to
TRANS_STATE_SUPER_COMMITTED.
9. fsync calls btrfs_commit_transaction() for transaction B. Since
transaction A is already in TRANS_STATE_SUPER_COMMITTED and the
commit is for fsync, it advances.
10. btrfs_commit_transaction() for transaction B calls
switch_commit_roots(). This time, the block group has already been
cached, so it sets block_group->last_byte_to_unpin to U64_MAX.
11. btrfs_commit_transaction() for transaction A calls
btrfs_finish_extent_commit(), which calls unpin_extent_range() for
the deleted extent. It sees last_byte_to_unpin set to U64_MAX (by
transaction B!), so it adds the deleted extent to the space cache
again!
This explains all of our symptoms above:
* If the sequence of events is exactly as described above, when the free
space is re-added in step 11, it will fail with EEXIST.
* If another thread reallocates the deleted extent in between steps 7
and 11, then step 11 will silently re-add that space to the space
cache as free even though it is actually allocated. Then, if that
space is allocated *again*, the free space tree will be corrupted
(namely, the wrong item will be deleted).
* If we don't catch this free space tree corruption, it will continue
to get worse as extents are deleted and reallocated.
The v1 space_cache is synchronously loaded when an extent is deleted
(btrfs_update_block_group() with alloc=0 calls btrfs_cache_block_group()
with load_cache_only=1), so it is not normally affected by this bug.
However, as noted above, if we fail to load the space cache, we will
fall back to caching from the extent tree and may hit this bug.
The easiest fix for this race is to also make caching from the free
space tree or extent tree synchronous. Josef tested this and found no
performance regressions.
A few extra changes fall out of this change. Namely, this fix does the
following, with step 2 being the crucial fix:
1. Factor btrfs_caching_ctl_wait_done() out of
btrfs_wait_block_group_cache_done() to allow waiting on a caching_ctl
that we already hold a reference to.
2. Change the call in btrfs_cache_block_group() of
btrfs_wait_space_cache_v1_finished() to
btrfs_caching_ctl_wait_done(), which makes us wait regardless of the
space_cache option.
3. Delete the now unused btrfs_wait_space_cache_v1_finished() and
space_cache_v1_done().
4. Change btrfs_cache_block_group()'s `int load_cache_only` parameter to
`bool wait` to more accurately describe its new meaning.
5. Change a few callers which had a separate call to
btrfs_wait_block_group_cache_done() to use wait = true instead.
6. Make btrfs_wait_block_group_cache_done() static now that it's not
used outside of block-group.c anymore.
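Condensed from the diff below, the crucial change (step 2) amounts to making call sites cache synchronously instead of pairing a cache start with a separate wait (a sketch):

    ret = btrfs_cache_block_group(cache, true); /* wait regardless of the space_cache option */
    if (ret)
            goto out;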
Fixes: d0c2f4fa555e ("btrfs: make concurrent fsyncs wait less when waiting for a transaction commit")
CC: stable(a)vger.kernel.org # 5.12+
Reviewed-by: Filipe Manana <fdmanana(a)suse.com>
Signed-off-by: Omar Sandoval <osandov(a)fb.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
---
Hi,
This is the backport of commit ced8ecf026fd8084cf175530ff85c76d6085d715
to the 5.15 stable branch. Please consider it for the next 5.15 stable
release.
Thanks,
Omar
fs/btrfs/block-group.c | 47 ++++++++++++++----------------------------
fs/btrfs/block-group.h | 4 +---
fs/btrfs/ctree.h | 1 -
fs/btrfs/extent-tree.c | 30 ++++++---------------------
4 files changed, 22 insertions(+), 60 deletions(-)
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 909cc00ef5ce..474dcc0540a8 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -418,39 +418,26 @@ void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
btrfs_put_caching_control(caching_ctl);
}
-int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache)
+static int btrfs_caching_ctl_wait_done(struct btrfs_block_group *cache,
+ struct btrfs_caching_control *caching_ctl)
+{
+ wait_event(caching_ctl->wait, btrfs_block_group_done(cache));
+ return cache->cached == BTRFS_CACHE_ERROR ? -EIO : 0;
+}
+
+static int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache)
{
struct btrfs_caching_control *caching_ctl;
- int ret = 0;
+ int ret;
caching_ctl = btrfs_get_caching_control(cache);
if (!caching_ctl)
return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
-
- wait_event(caching_ctl->wait, btrfs_block_group_done(cache));
- if (cache->cached == BTRFS_CACHE_ERROR)
- ret = -EIO;
+ ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
btrfs_put_caching_control(caching_ctl);
return ret;
}
-static bool space_cache_v1_done(struct btrfs_block_group *cache)
-{
- bool ret;
-
- spin_lock(&cache->lock);
- ret = cache->cached != BTRFS_CACHE_FAST;
- spin_unlock(&cache->lock);
-
- return ret;
-}
-
-void btrfs_wait_space_cache_v1_finished(struct btrfs_block_group *cache,
- struct btrfs_caching_control *caching_ctl)
-{
- wait_event(caching_ctl->wait, space_cache_v1_done(cache));
-}
-
#ifdef CONFIG_BTRFS_DEBUG
static void fragment_free_space(struct btrfs_block_group *block_group)
{
@@ -727,9 +714,8 @@ static noinline void caching_thread(struct btrfs_work *work)
btrfs_put_block_group(block_group);
}
-int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only)
+int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait)
{
- DEFINE_WAIT(wait);
struct btrfs_fs_info *fs_info = cache->fs_info;
struct btrfs_caching_control *caching_ctl = NULL;
int ret = 0;
@@ -762,10 +748,7 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only
}
WARN_ON(cache->caching_ctl);
cache->caching_ctl = caching_ctl;
- if (btrfs_test_opt(fs_info, SPACE_CACHE))
- cache->cached = BTRFS_CACHE_FAST;
- else
- cache->cached = BTRFS_CACHE_STARTED;
+ cache->cached = BTRFS_CACHE_STARTED;
cache->has_caching_ctl = 1;
spin_unlock(&cache->lock);
@@ -778,8 +761,8 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only
btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
out:
- if (load_cache_only && caching_ctl)
- btrfs_wait_space_cache_v1_finished(cache, caching_ctl);
+ if (wait && caching_ctl)
+ ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
if (caching_ctl)
btrfs_put_caching_control(caching_ctl);
@@ -3200,7 +3183,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
* space back to the block group, otherwise we will leak space.
*/
if (!alloc && !btrfs_block_group_done(cache))
- btrfs_cache_block_group(cache, 1);
+ btrfs_cache_block_group(cache, true);
byte_in_group = bytenr - cache->start;
WARN_ON(byte_in_group > cache->length);
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index d73db0dfacb2..a15868d607a9 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -251,9 +251,7 @@ void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr);
void btrfs_wait_nocow_writers(struct btrfs_block_group *bg);
void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
u64 num_bytes);
-int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache);
-int btrfs_cache_block_group(struct btrfs_block_group *cache,
- int load_cache_only);
+int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait);
void btrfs_put_caching_control(struct btrfs_caching_control *ctl);
struct btrfs_caching_control *btrfs_get_caching_control(
struct btrfs_block_group *cache);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index d1838de0b39c..8dafb6bf62bd 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -462,7 +462,6 @@ struct btrfs_free_cluster {
enum btrfs_caching_type {
BTRFS_CACHE_NO,
BTRFS_CACHE_STARTED,
- BTRFS_CACHE_FAST,
BTRFS_CACHE_FINISHED,
BTRFS_CACHE_ERROR,
};
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 248ea15c9734..8c007abf81f0 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2572,17 +2572,10 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
return -EINVAL;
/*
- * pull in the free space cache (if any) so that our pin
- * removes the free space from the cache. We have load_only set
- * to one because the slow code to read in the free extents does check
- * the pinned extents.
+ * Fully cache the free space first so that our pin removes the free space
+ * from the cache.
*/
- btrfs_cache_block_group(cache, 1);
- /*
- * Make sure we wait until the cache is completely built in case it is
- * missing or is invalid and therefore needs to be rebuilt.
- */
- ret = btrfs_wait_block_group_cache_done(cache);
+ ret = btrfs_cache_block_group(cache, true);
if (ret)
goto out;
@@ -2605,12 +2598,7 @@ static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
if (!block_group)
return -EINVAL;
- btrfs_cache_block_group(block_group, 1);
- /*
- * Make sure we wait until the cache is completely built in case it is
- * missing or is invalid and therefore needs to be rebuilt.
- */
- ret = btrfs_wait_block_group_cache_done(block_group);
+ ret = btrfs_cache_block_group(block_group, true);
if (ret)
goto out;
@@ -4324,7 +4312,7 @@ static noinline int find_free_extent(struct btrfs_root *root,
ffe_ctl.cached = btrfs_block_group_done(block_group);
if (unlikely(!ffe_ctl.cached)) {
ffe_ctl.have_caching_bg = true;
- ret = btrfs_cache_block_group(block_group, 0);
+ ret = btrfs_cache_block_group(block_group, false);
/*
* If we get ENOMEM here or something else we want to
@@ -6066,13 +6054,7 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
if (end - start >= range->minlen) {
if (!btrfs_block_group_done(cache)) {
- ret = btrfs_cache_block_group(cache, 0);
- if (ret) {
- bg_failed++;
- bg_ret = ret;
- continue;
- }
- ret = btrfs_wait_block_group_cache_done(cache);
+ ret = btrfs_cache_block_group(cache, true);
if (ret) {
bg_failed++;
bg_ret = ret;
--
2.30.2
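As a rough summary of the API change above (a sketch of the caller pattern, not the full functions):

	/*
	 * Sketch: btrfs_cache_block_group() now takes a bool and, when
	 * wait == true, subsumes the old btrfs_cache_block_group() +
	 * btrfs_wait_block_group_cache_done() pair.
	 */
	ret = btrfs_cache_block_group(cache, true);	/* start caching and wait */
	if (ret)
		goto out;

	/* Callers that must not block, e.g. find_free_extent(), pass false: */
	ret = btrfs_cache_block_group(block_group, false);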
commit 2555283eb40df89945557273121e9393ef9b542b upstream.
anon_vma->degree tracks the combined number of child anon_vmas and VMAs
that use the anon_vma as their ->anon_vma.
anon_vma_clone() then assumes that for any anon_vma attached to
src->anon_vma_chain other than src->anon_vma, it is impossible for it to
be a leaf node of the VMA tree, meaning that for such VMAs ->degree is
elevated by 1 because of a child anon_vma, meaning that if ->degree
equals 1 there are no VMAs that use the anon_vma as their ->anon_vma.
This assumption is wrong because the ->degree optimization leads to leaf
nodes being abandoned on anon_vma_clone() - an existing anon_vma is
reused and no new parent-child relationship is created. So it is
possible to reuse an anon_vma for one VMA while it is still tied to
another VMA.
This is an issue because is_mergeable_anon_vma() and its callers assume
that if two VMAs have the same ->anon_vma, the list of anon_vmas
attached to the VMAs is guaranteed to be the same. When this assumption
is violated, vma_merge() can merge pages into a VMA that is not attached
to the corresponding anon_vma, leading to dangling page->mapping
pointers that will be dereferenced during rmap walks.
Fix it by separately tracking the number of child anon_vmas and the
number of VMAs using the anon_vma as their ->anon_vma.
Fixes: 7a3ef208e662 ("mm: prevent endless growth of anon_vma hierarchy")
Cc: stable(a)kernel.org
Acked-by: Michal Hocko <mhocko(a)suse.com>
Acked-by: Vlastimil Babka <vbabka(a)suse.cz>
Signed-off-by: Jann Horn <jannh(a)google.com>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
[manually fixed up different indentation in stable]
Signed-off-by: Jann Horn <jannh(a)google.com>
---
include/linux/rmap.h | 7 +++++--
mm/rmap.c | 31 +++++++++++++++++--------------
2 files changed, 22 insertions(+), 16 deletions(-)
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index b46bb5620a76d..9dc3617a5bfce 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -37,12 +37,15 @@ struct anon_vma {
atomic_t refcount;
/*
- * Count of child anon_vmas and VMAs which points to this anon_vma.
+ * Count of child anon_vmas. Equals to the count of all anon_vmas that
+ * have ->parent pointing to this one, including itself.
*
* This counter is used for making decision about reusing anon_vma
* instead of forking new one. See comments in function anon_vma_clone.
*/
- unsigned degree;
+ unsigned long num_children;
+ /* Count of VMAs whose ->anon_vma pointer points to this object. */
+ unsigned long num_active_vmas;
struct anon_vma *parent; /* Parent of this anon_vma */
diff --git a/mm/rmap.c b/mm/rmap.c
index 0a5310b76ec85..76064d9649186 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -78,7 +78,8 @@ static inline struct anon_vma *anon_vma_alloc(void)
anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
if (anon_vma) {
atomic_set(&anon_vma->refcount, 1);
- anon_vma->degree = 1; /* Reference for first vma */
+ anon_vma->num_children = 0;
+ anon_vma->num_active_vmas = 0;
anon_vma->parent = anon_vma;
/*
* Initialise the anon_vma root to point to itself. If called
@@ -187,6 +188,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
anon_vma = anon_vma_alloc();
if (unlikely(!anon_vma))
goto out_enomem_free_avc;
+ anon_vma->num_children++; /* self-parent link for new root */
allocated = anon_vma;
}
@@ -196,8 +198,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
if (likely(!vma->anon_vma)) {
vma->anon_vma = anon_vma;
anon_vma_chain_link(vma, avc, anon_vma);
- /* vma reference or self-parent link for new root */
- anon_vma->degree++;
+ anon_vma->num_active_vmas++;
allocated = NULL;
avc = NULL;
}
@@ -276,19 +277,19 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
anon_vma_chain_link(dst, avc, anon_vma);
/*
- * Reuse existing anon_vma if its degree lower than two,
- * that means it has no vma and only one anon_vma child.
+ * Reuse existing anon_vma if it has no vma and only one
+ * anon_vma child.
*
- * Do not chose parent anon_vma, otherwise first child
- * will always reuse it. Root anon_vma is never reused:
+ * Root anon_vma is never reused:
* it has self-parent reference and at least one child.
*/
- if (!dst->anon_vma && anon_vma != src->anon_vma &&
- anon_vma->degree < 2)
+ if (!dst->anon_vma &&
+ anon_vma->num_children < 2 &&
+ anon_vma->num_active_vmas == 0)
dst->anon_vma = anon_vma;
}
if (dst->anon_vma)
- dst->anon_vma->degree++;
+ dst->anon_vma->num_active_vmas++;
unlock_anon_vma_root(root);
return 0;
@@ -338,6 +339,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
anon_vma = anon_vma_alloc();
if (!anon_vma)
goto out_error;
+ anon_vma->num_active_vmas++;
avc = anon_vma_chain_alloc(GFP_KERNEL);
if (!avc)
goto out_error_free_anon_vma;
@@ -358,7 +360,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
vma->anon_vma = anon_vma;
anon_vma_lock_write(anon_vma);
anon_vma_chain_link(vma, avc, anon_vma);
- anon_vma->parent->degree++;
+ anon_vma->parent->num_children++;
anon_vma_unlock_write(anon_vma);
return 0;
@@ -390,7 +392,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
* to free them outside the lock.
*/
if (RB_EMPTY_ROOT(&anon_vma->rb_root)) {
- anon_vma->parent->degree--;
+ anon_vma->parent->num_children--;
continue;
}
@@ -398,7 +400,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
anon_vma_chain_free(avc);
}
if (vma->anon_vma)
- vma->anon_vma->degree--;
+ vma->anon_vma->num_active_vmas--;
unlock_anon_vma_root(root);
/*
@@ -409,7 +411,8 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
struct anon_vma *anon_vma = avc->anon_vma;
- VM_WARN_ON(anon_vma->degree);
+ VM_WARN_ON(anon_vma->num_children);
+ VM_WARN_ON(anon_vma->num_active_vmas);
put_anon_vma(anon_vma);
list_del(&avc->same_vma);
base-commit: c97531caf70f2b533fa1944bf64dd06ac20edad6
--
2.37.2.789.g6183377224-goog
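The reuse condition above can be summarized as follows; the helper name is hypothetical and only illustrates the invariant the patch establishes:

	/*
	 * Hypothetical helper restating the fixed reuse rule: reuse an
	 * anon_vma only if no VMA currently uses it as ->anon_vma and it
	 * has at most one child. The root anon_vma never qualifies: its
	 * self-parent reference plus at least one child keep
	 * num_children >= 2.
	 */
	static inline bool anon_vma_reusable(struct anon_vma *anon_vma)
	{
		return anon_vma->num_children < 2 &&
		       anon_vma->num_active_vmas == 0;
	}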
The patch titled
Subject: mm: fix PageAnonExclusive clearing racing with concurrent RCU GUP-fast
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-fix-pageanonexclusive-clearing-racing-with-concurrent-rcu-gup-fast.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: David Hildenbrand <david(a)redhat.com>
Subject: mm: fix PageAnonExclusive clearing racing with concurrent RCU GUP-fast
Date: Thu, 1 Sep 2022 10:35:59 +0200
commit 6c287605fd56 ("mm: remember exclusively mapped anonymous pages with
PG_anon_exclusive") made sure that when PageAnonExclusive() has to be
cleared during temporary unmapping of a page, that the PTE is
cleared/invalidated and that the TLB is flushed.
What we want to achieve in all cases is that we cannot end up with a pin on
an anonymous page that may be shared, because such pins would be
unreliable and could result in memory corruptions when the mapped page
and the pin go out of sync due to a write fault.
That TLB flush handling was inspired by an outdated comment in
mm/ksm.c:write_protect_page(), which similarly required the TLB flush in
the past to synchronize with GUP-fast. However, ever since general RCU GUP
fast was introduced in commit 2667f50e8b81 ("mm: introduce a general RCU
get_user_pages_fast()"), a TLB flush is no longer sufficient to handle
concurrent GUP-fast in all cases -- it only handles traditional IPI-based
GUP-fast correctly.
Peter Xu (thankfully) questioned whether that TLB flush is really
required. On architectures that send an IPI broadcast on TLB flush,
it works as expected. To synchronize with RCU GUP-fast properly, we're
conceptually fine, however, we have to enforce a certain memory order and
are missing memory barriers.
Let's document that, avoid the TLB flush where possible and use proper
explicit memory barriers where required. We shouldn't really care about the
additional memory barriers here, as we're not on extremely hot paths --
and we're getting rid of some TLB flushes.
We use a smp_mb() pair for handling concurrent pinning and a
smp_rmb()/smp_wmb() pair for handling the corner case of only temporary
PTE changes but permanent PageAnonExclusive changes.
One extreme example, whereby GUP-fast takes a R/O pin and KSM wants to
convert an exclusive anonymous page to a KSM page, and that page is already
mapped write-protected (-> no PTE change) would be:
Thread 0 (KSM)                   Thread 1 (GUP-fast)

                                 (B1) Read the PTE
                                 # (B2) skipped without FOLL_WRITE
(A1) Clear PTE
smp_mb()
(A2) Check pinned
                                 (B3) Pin the mapped page
                                 smp_mb()
(A3) Clear PageAnonExclusive
smp_wmb()
(A4) Restore PTE
                                 (B4) Check if the PTE changed
                                 smp_rmb()
                                 (B5) Check PageAnonExclusive
Thread 1 will properly detect that PageAnonExclusive was cleared and
back off.
Note that we don't need a memory barrier between checking if the page is
pinned and clearing PageAnonExclusive, because stores are not
speculated.
The possible issues due to reordering are of theoretical nature so far
and attempts to reproduce the race failed.
Especially the "no PTE change" case isn't the common case, because we'd
need an exclusive anonymous page that's mapped R/O and the PTE is clean
in KSM code -- and using KSM with page pinning isn't extremely common.
Further, the clear+TLB flush we used for now implies a memory barrier.
So the problematic missing part should be the missing memory barrier
after pinning but before checking if the PTE changed.
Link: https://lkml.kernel.org/r/20220901083559.67446-1-david@redhat.com
Fixes: 6c287605fd56 ("mm: remember exclusively mapped anonymous pages with PG_anon_exclusive")
Signed-off-by: David Hildenbrand <david(a)redhat.com>
Cc: Jason Gunthorpe <jgg(a)nvidia.com>
Cc: John Hubbard <jhubbard(a)nvidia.com>
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: Alistair Popple <apopple(a)nvidia.com>
Cc: Nadav Amit <namit(a)vmware.com>
Cc: Yang Shi <shy828301(a)gmail.com>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: Michal Hocko <mhocko(a)kernel.org>
Cc: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Andrea Parri <parri.andrea(a)gmail.com>
Cc: Will Deacon <will(a)kernel.org>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: "Paul E. McKenney" <paulmck(a)kernel.org>
Cc: Christoph von Recklinghausen <crecklin(a)redhat.com>
Cc: Don Dutile <ddutile(a)redhat.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/mm.h | 9 ++++-
include/linux/rmap.h | 66 +++++++++++++++++++++++++++++++++++++----
mm/gup.c | 7 ++++
mm/huge_memory.c | 3 +
mm/ksm.c | 1
mm/migrate_device.c | 23 ++++++--------
mm/rmap.c | 11 +++---
7 files changed, 95 insertions(+), 25 deletions(-)
--- a/include/linux/mm.h~mm-fix-pageanonexclusive-clearing-racing-with-concurrent-rcu-gup-fast
+++ a/include/linux/mm.h
@@ -2975,8 +2975,8 @@ static inline int vm_fault_to_errno(vm_f
* PageAnonExclusive() has to protect against concurrent GUP:
* * Ordinary GUP: Using the PT lock
* * GUP-fast and fork(): mm->write_protect_seq
- * * GUP-fast and KSM or temporary unmapping (swap, migration):
- * clear/invalidate+flush of the page table entry
+ * * GUP-fast and KSM or temporary unmapping (swap, migration): see
+ * page_try_share_anon_rmap()
*
* Must be called with the (sub)page that's actually referenced via the
* page table entry, which might not necessarily be the head page for a
@@ -2997,6 +2997,11 @@ static inline bool gup_must_unshare(unsi
*/
if (!PageAnon(page))
return false;
+
+ /* Paired with a memory barrier in page_try_share_anon_rmap(). */
+ if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
+ smp_rmb();
+
/*
* Note that PageKsm() pages cannot be exclusive, and consequently,
* cannot get pinned.
--- a/include/linux/rmap.h~mm-fix-pageanonexclusive-clearing-racing-with-concurrent-rcu-gup-fast
+++ a/include/linux/rmap.h
@@ -267,7 +267,7 @@ dup:
* @page: the exclusive anonymous page to try marking possibly shared
*
* The caller needs to hold the PT lock and has to have the page table entry
- * cleared/invalidated+flushed, to properly sync against GUP-fast.
+ * cleared/invalidated.
*
* This is similar to page_try_dup_anon_rmap(), however, not used during fork()
* to duplicate a mapping, but instead to prepare for KSM or temporarily
@@ -283,12 +283,68 @@ static inline int page_try_share_anon_rm
{
VM_BUG_ON_PAGE(!PageAnon(page) || !PageAnonExclusive(page), page);
- /* See page_try_dup_anon_rmap(). */
- if (likely(!is_device_private_page(page) &&
- unlikely(page_maybe_dma_pinned(page))))
- return -EBUSY;
+ /* device private pages cannot get pinned via GUP. */
+ if (unlikely(is_device_private_page(page))) {
+ ClearPageAnonExclusive(page);
+ return 0;
+ }
+
+ /*
+ * We have to make sure that when we clear PageAnonExclusive, that
+ * the page is not pinned and that concurrent GUP-fast won't succeed in
+ * concurrently pinning the page.
+ *
+ * Conceptually, PageAnonExclusive clearing consists of:
+ * (A1) Clear PTE
+ * (A2) Check if the page is pinned; back off if so.
+ * (A3) Clear PageAnonExclusive
+ * (A4) Restore PTE (optional, but certainly not writable)
+ *
+ * When clearing PageAnonExclusive, we cannot possibly map the page
+ * writable again, because anon pages that may be shared must never
+ * be writable. So in any case, if the PTE was writable it cannot
+ * be writable anymore afterwards and there would be a PTE change. Only
+ * if the PTE wasn't writable, there might not be a PTE change.
+ *
+ * Conceptually, GUP-fast pinning of an anon page consists of:
+ * (B1) Read the PTE
+ * (B2) FOLL_WRITE: check if the PTE is not writable; back off if so.
+ * (B3) Pin the mapped page
+ * (B4) Check if the PTE changed by re-reading it; back off if so.
+ * (B5) If the original PTE is not writable, check if
+ * PageAnonExclusive is not set; back off if so.
+ *
+ * If the PTE was writable, we only have to make sure that GUP-fast
+ * observes a PTE change and properly backs off.
+ *
+ * If the PTE was not writable, we have to make sure that GUP-fast either
+ * detects a (temporary) PTE change or that PageAnonExclusive is cleared
+ * and properly backs off.
+ *
+ * Consequently, when clearing PageAnonExclusive(), we have to make
+ * sure that (A1), (A2)/(A3) and (A4) happen in the right memory
+ * order. In GUP-fast pinning code, we have to make sure that (B3),(B4)
+ * and (B5) happen in the right memory order.
+ *
+ * We assume that there might not be a memory barrier after
+ * clearing/invalidating the PTE (A1) and before restoring the PTE (A4),
+ * so we use explicit ones here.
+ */
+
+ /* Paired with the memory barrier in try_grab_folio(). */
+ if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
+ smp_mb();
+ if (unlikely(page_maybe_dma_pinned(page)))
+ return -EBUSY;
ClearPageAnonExclusive(page);
+
+ /*
+ * This is conceptually a smp_wmb() paired with the smp_rmb() in
+ * gup_must_unshare().
+ */
+ if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
+ smp_mb__after_atomic();
return 0;
}
--- a/mm/gup.c~mm-fix-pageanonexclusive-clearing-racing-with-concurrent-rcu-gup-fast
+++ a/mm/gup.c
@@ -158,6 +158,13 @@ struct folio *try_grab_folio(struct page
else
folio_ref_add(folio,
refs * (GUP_PIN_COUNTING_BIAS - 1));
+ /*
+ * Adjust the pincount before re-checking the PTE for changes.
+ * This is essentially a smp_mb() and is paired with a memory
+ * barrier in page_try_share_anon_rmap().
+ */
+ smp_mb__after_atomic();
+
node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);
return folio;
--- a/mm/huge_memory.c~mm-fix-pageanonexclusive-clearing-racing-with-concurrent-rcu-gup-fast
+++ a/mm/huge_memory.c
@@ -2140,6 +2140,8 @@ static void __split_huge_pmd_locked(stru
*
* In case we cannot clear PageAnonExclusive(), split the PMD
* only and let try_to_migrate_one() fail later.
+ *
+ * See page_try_share_anon_rmap(): invalidate PMD first.
*/
anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
if (freeze && anon_exclusive && page_try_share_anon_rmap(page))
@@ -3177,6 +3179,7 @@ int set_pmd_migration_entry(struct page_
flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
+ /* See page_try_share_anon_rmap(): invalidate PMD first. */
anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
if (anon_exclusive && page_try_share_anon_rmap(page)) {
set_pmd_at(mm, address, pvmw->pmd, pmdval);
--- a/mm/ksm.c~mm-fix-pageanonexclusive-clearing-racing-with-concurrent-rcu-gup-fast
+++ a/mm/ksm.c
@@ -1095,6 +1095,7 @@ static int write_protect_page(struct vm_
goto out_unlock;
}
+ /* See page_try_share_anon_rmap(): clear PTE first. */
if (anon_exclusive && page_try_share_anon_rmap(page)) {
set_pte_at(mm, pvmw.address, pvmw.pte, entry);
goto out_unlock;
--- a/mm/migrate_device.c~mm-fix-pageanonexclusive-clearing-racing-with-concurrent-rcu-gup-fast
+++ a/mm/migrate_device.c
@@ -193,20 +193,17 @@ again:
bool anon_exclusive;
pte_t swp_pte;
- anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
- if (anon_exclusive) {
- flush_cache_page(vma, addr, pte_pfn(*ptep));
- ptep_clear_flush(vma, addr, ptep);
+ flush_cache_page(vma, addr, pte_pfn(*ptep));
+ ptep_get_and_clear(mm, addr, ptep);
- if (page_try_share_anon_rmap(page)) {
- set_pte_at(mm, addr, ptep, pte);
- unlock_page(page);
- put_page(page);
- mpfn = 0;
- goto next;
- }
- } else {
- ptep_get_and_clear(mm, addr, ptep);
+ /* See page_try_share_anon_rmap(): clear PTE first. */
+ anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
+ if (anon_exclusive && page_try_share_anon_rmap(page)) {
+ set_pte_at(mm, addr, ptep, pte);
+ unlock_page(page);
+ put_page(page);
+ mpfn = 0;
+ goto next;
}
migrate->cpages++;
--- a/mm/rmap.c~mm-fix-pageanonexclusive-clearing-racing-with-concurrent-rcu-gup-fast
+++ a/mm/rmap.c
@@ -1579,11 +1579,8 @@ static bool try_to_unmap_one(struct foli
pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
} else {
flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
- /*
- * Nuke the page table entry. When having to clear
- * PageAnonExclusive(), we always have to flush.
- */
- if (should_defer_flush(mm, flags) && !anon_exclusive) {
+ /* Nuke the page table entry. */
+ if (should_defer_flush(mm, flags)) {
/*
* We clear the PTE but do not flush so potentially
* a remote CPU could still be writing to the folio.
@@ -1714,6 +1711,8 @@ static bool try_to_unmap_one(struct foli
page_vma_mapped_walk_done(&pvmw);
break;
}
+
+ /* See page_try_share_anon_rmap(): clear PTE first. */
if (anon_exclusive &&
page_try_share_anon_rmap(subpage)) {
swap_free(entry);
@@ -2045,6 +2044,8 @@ static bool try_to_migrate_one(struct fo
}
VM_BUG_ON_PAGE(pte_write(pteval) && folio_test_anon(folio) &&
!anon_exclusive, subpage);
+
+ /* See page_try_share_anon_rmap(): clear PTE first. */
if (anon_exclusive &&
page_try_share_anon_rmap(subpage)) {
if (folio_test_hugetlb(folio))
_
Patches currently in -mm which might be from david(a)redhat.com are
mm-fix-pageanonexclusive-clearing-racing-with-concurrent-rcu-gup-fast.patch
mm-gup-replace-foll_numa-by-gup_can_follow_protnone.patch
mm-gup-use-gup_can_follow_protnone-also-in-gup-fast.patch
mm-fixup-documentation-regarding-pte_numa-and-prot_numa.patch
--
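The fence pairing above follows the classic "store, full barrier, load" pattern on both sides. A userspace C11 analogue of the (A*)/(B*) steps, with invented names and simplified state (this is not kernel code, and the PTE restore step (A4) is omitted), would be:

	#include <stdatomic.h>
	#include <stdbool.h>

	/* Shared state standing in for the PTE, pincount and the flag. */
	atomic_int pte_valid = 1;
	atomic_int pincount;
	atomic_bool anon_exclusive = true;

	/* Analogue of page_try_share_anon_rmap(): steps (A1)..(A3). */
	bool try_share(void)
	{
		atomic_store(&pte_valid, 0);               /* (A1) clear PTE */
		atomic_thread_fence(memory_order_seq_cst); /* pairs with try_pin() */
		if (atomic_load(&pincount) > 0)            /* (A2) check pinned */
			return false;                      /* back off */
		atomic_store(&anon_exclusive, false);      /* (A3) clear flag */
		return true;
	}

	/* Analogue of GUP-fast pinning: steps (B1), (B3)..(B5). */
	bool try_pin(void)
	{
		if (!atomic_load(&pte_valid))              /* (B1) read PTE */
			return false;
		atomic_fetch_add(&pincount, 1);            /* (B3) pin page */
		atomic_thread_fence(memory_order_seq_cst); /* pairs with try_share() */
		if (!atomic_load(&pte_valid) ||            /* (B4) PTE changed? */
		    !atomic_load(&anon_exclusive)) {       /* (B5) still exclusive? */
			atomic_fetch_sub(&pincount, 1);    /* back off */
			return false;
		}
		return true;
	}

With both seq_cst fences in place, the store-buffering outcome where try_share() misses the pin and try_pin() misses the cleared PTE/flag is forbidden, so at most one side succeeds.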
The patch titled
Subject: mm/hugetlb: fix races when looking up a CONT-PTE/PMD size hugetlb page
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-hugetlb-fix-races-when-looking-up-a-cont-pte-pmd-size-hugetlb-page.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Baolin Wang <baolin.wang(a)linux.alibaba.com>
Subject: mm/hugetlb: fix races when looking up a CONT-PTE/PMD size hugetlb page
Date: Thu, 1 Sep 2022 18:41:31 +0800
Some architectures (like ARM64) support CONT-PTE/PMD size hugetlb, which
means they can support not only PMD/PUD size hugetlb pages (2M and 1G),
but also CONT-PTE/PMD sizes (64K and 32M) if a 4K page size is specified.
So when looking up a CONT-PTE size hugetlb page by follow_page(), it will
use pte_offset_map_lock() to get the pte entry lock for the CONT-PTE size
hugetlb in follow_page_pte(). However, this pte entry lock is incorrect
for the CONT-PTE size hugetlb, since we should use huge_pte_lock() to get
the correct lock, which is mm->page_table_lock.
That means the pte entry of the CONT-PTE size hugetlb is unstable under
the current pte lock in follow_page_pte(): it can still be migrated or
poisoned concurrently, which can cause potential race issues even though
it is nominally held under the 'pte lock'.
For example, suppose thread A is trying to look up a CONT-PTE size hugetlb
page via the move_pages() syscall under the lock; another thread B can
migrate the CONT-PTE hugetlb page at the same time, causing thread A to
get an incorrect page. If thread A then also does page migration, a data
inconsistency error occurs.
Moreover, we have the same issue for CONT-PMD size hugetlb in
follow_huge_pmd().
To fix the above issues, rename follow_huge_pmd() to follow_huge_pmd_pte()
to handle both PMD and PTE level hugetlb sizes, and use huge_pte_lock()
there to get the correct pte entry lock so that the pte entry stays stable.
Link: https://lkml.kernel.org/r/635f43bdd85ac2615a58405da82b4d33c6e5eb05.16620175…
Signed-off-by: Baolin Wang <baolin.wang(a)linux.alibaba.com>
Suggested-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Muchun Song <songmuchun(a)bytedance.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/hugetlb.h | 8 ++++----
mm/gup.c | 14 +++++++++++++-
mm/hugetlb.c | 27 +++++++++++++--------------
3 files changed, 30 insertions(+), 19 deletions(-)
--- a/include/linux/hugetlb.h~mm-hugetlb-fix-races-when-looking-up-a-cont-pte-pmd-size-hugetlb-page
+++ a/include/linux/hugetlb.h
@@ -207,8 +207,8 @@ struct page *follow_huge_addr(struct mm_
struct page *follow_huge_pd(struct vm_area_struct *vma,
unsigned long address, hugepd_t hpd,
int flags, int pdshift);
-struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
- pmd_t *pmd, int flags);
+struct page *follow_huge_pmd_pte(struct vm_area_struct *vma, unsigned long address,
+ int flags);
struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
pud_t *pud, int flags);
struct page *follow_huge_pgd(struct mm_struct *mm, unsigned long address,
@@ -312,8 +312,8 @@ static inline struct page *follow_huge_p
return NULL;
}
-static inline struct page *follow_huge_pmd(struct mm_struct *mm,
- unsigned long address, pmd_t *pmd, int flags)
+static inline struct page *follow_huge_pmd_pte(struct vm_area_struct *vma,
+ unsigned long address, int flags)
{
return NULL;
}
--- a/mm/gup.c~mm-hugetlb-fix-races-when-looking-up-a-cont-pte-pmd-size-hugetlb-page
+++ a/mm/gup.c
@@ -530,6 +530,18 @@ static struct page *follow_page_pte(stru
if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
(FOLL_PIN | FOLL_GET)))
return ERR_PTR(-EINVAL);
+
+ /*
+ * Considering PTE level hugetlb, like continuous-PTE hugetlb on
+ * ARM64 architecture.
+ */
+ if (is_vm_hugetlb_page(vma)) {
+ page = follow_huge_pmd_pte(vma, address, flags);
+ if (page)
+ return page;
+ return no_page_table(vma, flags);
+ }
+
retry:
if (unlikely(pmd_bad(*pmd)))
return no_page_table(vma, flags);
@@ -662,7 +674,7 @@ static struct page *follow_pmd_mask(stru
if (pmd_none(pmdval))
return no_page_table(vma, flags);
if (pmd_huge(pmdval) && is_vm_hugetlb_page(vma)) {
- page = follow_huge_pmd(mm, address, pmd, flags);
+ page = follow_huge_pmd_pte(vma, address, flags);
if (page)
return page;
return no_page_table(vma, flags);
--- a/mm/hugetlb.c~mm-hugetlb-fix-races-when-looking-up-a-cont-pte-pmd-size-hugetlb-page
+++ a/mm/hugetlb.c
@@ -6944,12 +6944,13 @@ follow_huge_pd(struct vm_area_struct *vm
}
struct page * __weak
-follow_huge_pmd(struct mm_struct *mm, unsigned long address,
- pmd_t *pmd, int flags)
+follow_huge_pmd_pte(struct vm_area_struct *vma, unsigned long address, int flags)
{
+ struct hstate *h = hstate_vma(vma);
+ struct mm_struct *mm = vma->vm_mm;
struct page *page = NULL;
spinlock_t *ptl;
- pte_t pte;
+ pte_t *ptep, pte;
/*
* FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
@@ -6959,17 +6960,15 @@ follow_huge_pmd(struct mm_struct *mm, un
return NULL;
retry:
- ptl = pmd_lockptr(mm, pmd);
- spin_lock(ptl);
- /*
- * make sure that the address range covered by this pmd is not
- * unmapped from other threads.
- */
- if (!pmd_huge(*pmd))
- goto out;
- pte = huge_ptep_get((pte_t *)pmd);
+ ptep = huge_pte_offset(mm, address, huge_page_size(h));
+ if (!ptep)
+ return NULL;
+
+ ptl = huge_pte_lock(h, mm, ptep);
+ pte = huge_ptep_get(ptep);
if (pte_present(pte)) {
- page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
+ page = pte_page(pte) +
+ ((address & ~huge_page_mask(h)) >> PAGE_SHIFT);
/*
* try_grab_page() should always succeed here, because: a) we
* hold the pmd (ptl) lock, and b) we've just checked that the
@@ -6985,7 +6984,7 @@ retry:
} else {
if (is_hugetlb_entry_migration(pte)) {
spin_unlock(ptl);
- __migration_entry_wait_huge((pte_t *)pmd, ptl);
+ __migration_entry_wait_huge(ptep, ptl);
goto retry;
}
/*
_
Patches currently in -mm which might be from baolin.wang(a)linux.alibaba.com are
mm-hugetlb-fix-races-when-looking-up-a-cont-pte-pmd-size-hugetlb-page.patch
mm-migrate-do-not-retry-10-times-for-the-subpages-of-fail-to-migrate-thp.patch
mm-damon-validate-if-the-pmd-entry-is-present-before-accessing.patch
mm-damon-replace-pmd_huge-with-pmd_trans_huge-for-thp.patch
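The core of the fix is lock selection; a minimal sketch of the pattern (caller context such as h, mm and address assumed, as in the hunks above):

	/*
	 * Sketch: take the page-table lock that matches the hugetlb
	 * level. For CONT-PTE hugetlb, huge_pte_lock() resolves to
	 * mm->page_table_lock, whereas pte_offset_map_lock() would take
	 * a per-PMD lock that does not actually protect the entry.
	 */
	ptep = huge_pte_offset(mm, address, huge_page_size(h));
	if (!ptep)
		return NULL;
	ptl = huge_pte_lock(h, mm, ptep);
	pte = huge_ptep_get(ptep);	/* now stable vs. migration/poison */
	/* ... use the entry ... */
	spin_unlock(ptl);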
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 2555283eb40df89945557273121e9393ef9b542b Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh(a)google.com>
Date: Wed, 31 Aug 2022 19:06:00 +0200
Subject: [PATCH] mm/rmap: Fix anon_vma->degree ambiguity leading to
double-reuse
anon_vma->degree tracks the combined number of child anon_vmas and VMAs
that use the anon_vma as their ->anon_vma.
anon_vma_clone() then assumes that for any anon_vma attached to
src->anon_vma_chain other than src->anon_vma, it is impossible for it to
be a leaf node of the VMA tree, meaning that for such VMAs ->degree is
elevated by 1 because of a child anon_vma, meaning that if ->degree
equals 1 there are no VMAs that use the anon_vma as their ->anon_vma.
This assumption is wrong because the ->degree optimization leads to leaf
nodes being abandoned on anon_vma_clone() - an existing anon_vma is
reused and no new parent-child relationship is created. So it is
possible to reuse an anon_vma for one VMA while it is still tied to
another VMA.
This is an issue because is_mergeable_anon_vma() and its callers assume
that if two VMAs have the same ->anon_vma, the list of anon_vmas
attached to the VMAs is guaranteed to be the same. When this assumption
is violated, vma_merge() can merge pages into a VMA that is not attached
to the corresponding anon_vma, leading to dangling page->mapping
pointers that will be dereferenced during rmap walks.
Fix it by separately tracking the number of child anon_vmas and the
number of VMAs using the anon_vma as their ->anon_vma.
Fixes: 7a3ef208e662 ("mm: prevent endless growth of anon_vma hierarchy")
Cc: stable(a)kernel.org
Acked-by: Michal Hocko <mhocko(a)suse.com>
Acked-by: Vlastimil Babka <vbabka(a)suse.cz>
Signed-off-by: Jann Horn <jannh(a)google.com>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index bf80adca980b..b89b4b86951f 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -41,12 +41,15 @@ struct anon_vma {
atomic_t refcount;
/*
- * Count of child anon_vmas and VMAs which points to this anon_vma.
+ * Count of child anon_vmas. Equals to the count of all anon_vmas that
+ * have ->parent pointing to this one, including itself.
*
* This counter is used for making decision about reusing anon_vma
* instead of forking new one. See comments in function anon_vma_clone.
*/
- unsigned degree;
+ unsigned long num_children;
+ /* Count of VMAs whose ->anon_vma pointer points to this object. */
+ unsigned long num_active_vmas;
struct anon_vma *parent; /* Parent of this anon_vma */
diff --git a/mm/rmap.c b/mm/rmap.c
index edc06c52bc82..93d5a6f793d2 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -93,7 +93,8 @@ static inline struct anon_vma *anon_vma_alloc(void)
anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
if (anon_vma) {
atomic_set(&anon_vma->refcount, 1);
- anon_vma->degree = 1; /* Reference for first vma */
+ anon_vma->num_children = 0;
+ anon_vma->num_active_vmas = 0;
anon_vma->parent = anon_vma;
/*
* Initialise the anon_vma root to point to itself. If called
@@ -201,6 +202,7 @@ int __anon_vma_prepare(struct vm_area_struct *vma)
anon_vma = anon_vma_alloc();
if (unlikely(!anon_vma))
goto out_enomem_free_avc;
+ anon_vma->num_children++; /* self-parent link for new root */
allocated = anon_vma;
}
@@ -210,8 +212,7 @@ int __anon_vma_prepare(struct vm_area_struct *vma)
if (likely(!vma->anon_vma)) {
vma->anon_vma = anon_vma;
anon_vma_chain_link(vma, avc, anon_vma);
- /* vma reference or self-parent link for new root */
- anon_vma->degree++;
+ anon_vma->num_active_vmas++;
allocated = NULL;
avc = NULL;
}
@@ -296,19 +297,19 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
anon_vma_chain_link(dst, avc, anon_vma);
/*
- * Reuse existing anon_vma if its degree lower than two,
- * that means it has no vma and only one anon_vma child.
+ * Reuse existing anon_vma if it has no vma and only one
+ * anon_vma child.
*
- * Do not choose parent anon_vma, otherwise first child
- * will always reuse it. Root anon_vma is never reused:
+ * Root anon_vma is never reused:
* it has self-parent reference and at least one child.
*/
if (!dst->anon_vma && src->anon_vma &&
- anon_vma != src->anon_vma && anon_vma->degree < 2)
+ anon_vma->num_children < 2 &&
+ anon_vma->num_active_vmas == 0)
dst->anon_vma = anon_vma;
}
if (dst->anon_vma)
- dst->anon_vma->degree++;
+ dst->anon_vma->num_active_vmas++;
unlock_anon_vma_root(root);
return 0;
@@ -358,6 +359,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
anon_vma = anon_vma_alloc();
if (!anon_vma)
goto out_error;
+ anon_vma->num_active_vmas++;
avc = anon_vma_chain_alloc(GFP_KERNEL);
if (!avc)
goto out_error_free_anon_vma;
@@ -378,7 +380,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
vma->anon_vma = anon_vma;
anon_vma_lock_write(anon_vma);
anon_vma_chain_link(vma, avc, anon_vma);
- anon_vma->parent->degree++;
+ anon_vma->parent->num_children++;
anon_vma_unlock_write(anon_vma);
return 0;
@@ -410,7 +412,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
* to free them outside the lock.
*/
if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) {
- anon_vma->parent->degree--;
+ anon_vma->parent->num_children--;
continue;
}
@@ -418,7 +420,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
anon_vma_chain_free(avc);
}
if (vma->anon_vma) {
- vma->anon_vma->degree--;
+ vma->anon_vma->num_active_vmas--;
/*
* vma would still be needed after unlink, and anon_vma will be prepared
@@ -436,7 +438,8 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
struct anon_vma *anon_vma = avc->anon_vma;
- VM_WARN_ON(anon_vma->degree);
+ VM_WARN_ON(anon_vma->num_children);
+ VM_WARN_ON(anon_vma->num_active_vmas);
put_anon_vma(anon_vma);
list_del(&avc->same_vma);
We need to inform PCODE of the desired ring frequencies so that PCODE
updates the memory frequencies for us. rps->min_freq and rps->max_freq
are the frequencies used in that request. However, they were unset when
SLPC was enabled, so PCODE never updated the memory frequency.
v2 (as suggested by Ashutosh): if SLPC is in use, pick the right
frequencies in get_ia_constants() instead of relying on the fake init of
rps' min and max.
Fixes: 7ba79a671568 ("drm/i915/guc/slpc: Gate Host RPS when SLPC is enabled")
Cc: <stable(a)vger.kernel.org> # v5.15+
Cc: Ashutosh Dixit <ashutosh.dixit(a)intel.com>
Tested-by: Sushma Venkatesh Reddy <sushma.venkatesh.reddy(a)intel.com>
Signed-off-by: Rodrigo Vivi <rodrigo.vivi(a)intel.com>
---
drivers/gpu/drm/i915/gt/intel_llc.c | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/i915/gt/intel_llc.c b/drivers/gpu/drm/i915/gt/intel_llc.c
index 14fe65812e42..766f9526da99 100644
--- a/drivers/gpu/drm/i915/gt/intel_llc.c
+++ b/drivers/gpu/drm/i915/gt/intel_llc.c
@@ -49,6 +49,7 @@ static unsigned int cpu_max_MHz(void)
static bool get_ia_constants(struct intel_llc *llc,
struct ia_constants *consts)
{
+ struct intel_guc_slpc *slpc = &llc_to_gt(llc)->uc.guc.slpc;
struct drm_i915_private *i915 = llc_to_gt(llc)->i915;
struct intel_rps *rps = &llc_to_gt(llc)->rps;
@@ -65,8 +66,13 @@ static bool get_ia_constants(struct intel_llc *llc,
/* convert DDR frequency from units of 266.6MHz to bandwidth */
consts->min_ring_freq = mult_frac(consts->min_ring_freq, 8, 3);
- consts->min_gpu_freq = rps->min_freq;
- consts->max_gpu_freq = rps->max_freq;
+ if (intel_uc_uses_guc_slpc(&llc_to_gt(llc)->uc)) {
+ consts->min_gpu_freq = slpc->min_freq;
+ consts->max_gpu_freq = slpc->rp0_freq;
+ } else {
+ consts->min_gpu_freq = rps->min_freq;
+ consts->max_gpu_freq = rps->max_freq;
+ }
if (GRAPHICS_VER(i915) >= 9) {
/* Convert GT frequency to 50 HZ units */
consts->min_gpu_freq /= GEN9_FREQ_SCALER;
--
2.37.1
This patch fixes a possible use after free if tracing for the specific
event is enabled. To avoid the use after free, we introduce an out_put
label like in the other user lock specific requests, and save in a
boolean whether to do a put or not, depending on the execution path of
dlm_user_request().
Cc: stable(a)vger.kernel.org
Fixes: 7a3de7324c2b ("fs: dlm: trace user space callbacks")
Reported-by: Dan Carpenter <dan.carpenter(a)oracle.com>
Signed-off-by: Alexander Aring <aahringo(a)redhat.com>
---
fs/dlm/lock.c | 15 ++++++++-------
1 file changed, 8 insertions(+), 7 deletions(-)
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index c830feb26384..94a72ede5764 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -5835,6 +5835,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
{
struct dlm_lkb *lkb;
struct dlm_args args;
+ bool do_put = true;
int error;
dlm_lock_recovery(ls);
@@ -5851,9 +5852,8 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_NOFS);
if (!ua->lksb.sb_lvbptr) {
kfree(ua);
- __put_lkb(ls, lkb);
error = -ENOMEM;
- goto out_trace_end;
+ goto out_put;
}
}
#ifdef CONFIG_DLM_DEPRECATED_API
@@ -5867,8 +5867,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
kfree(ua->lksb.sb_lvbptr);
ua->lksb.sb_lvbptr = NULL;
kfree(ua);
- __put_lkb(ls, lkb);
- goto out_trace_end;
+ goto out_put;
}
/* After ua is attached to lkb it will be freed by dlm_free_lkb().
@@ -5887,8 +5886,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
error = 0;
fallthrough;
default:
- __put_lkb(ls, lkb);
- goto out_trace_end;
+ goto out_put;
}
/* add this new lkb to the per-process list of locks */
@@ -5896,8 +5894,11 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
hold_lkb(lkb);
list_add_tail(&lkb->lkb_ownqueue, &ua->proc->locks);
spin_unlock(&ua->proc->locks_spin);
- out_trace_end:
+ do_put = false;
+ out_put:
trace_dlm_lock_end(ls, lkb, name, namelen, mode, flags, error, false);
+ if (do_put)
+ __put_lkb(ls, lkb);
out:
dlm_unlock_recovery(ls);
return error;
--
2.31.1
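The shape of the fix is the usual single-exit cleanup pattern; a generic sketch with hypothetical names (not the dlm code itself):

	/*
	 * Generic single-exit sketch: trace while the object is still
	 * valid, then do the final put only on the paths that still own
	 * the object.
	 */
	int example_request(struct example_ls *ls, struct example_lkb *lkb)
	{
		bool do_put = true;
		int error;

		error = example_setup(lkb);
		if (error)
			goto out_put;

		example_commit(ls, lkb);	/* ownership handed off */
		do_put = false;
	out_put:
		trace_example_end(ls, lkb, error);	/* lkb still valid here */
		if (do_put)
			example_put(ls, lkb);	/* may free lkb, only after tracing */
		return error;
	}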
Hi Greg,
This 5.10.y backport series contains fixes from v5.18 and v5.19-rc1.
Patches 1-5 in this series have already been applied to 5.15.y in Leah's
latest update [1], so this 5.10.y update is mostly catching up with 5.15.y.
Patches 6-7 in this 5.10.y update have not been applied to 5.15.y yet.
I drew Leah's attention to these patches and she said she will
include them in a following 5.15.y update.
Thanks,
Amir.
Changes since v1:
- Added ACKs
- CC stable
[1] https://lore.kernel.org/linux-xfs/20220819181431.4113819-1-leah.rumancik@gm…
Amir Goldstein (1):
xfs: remove infinite loop when reserving free block pool
Brian Foster (1):
xfs: fix soft lockup via spinning in filestream ag selection loop
Darrick J. Wong (2):
xfs: always succeed at setting the reserve pool size
xfs: fix overfilling of reserve pool
Dave Chinner (2):
xfs: reorder iunlink remove operation in xfs_ifree
xfs: validate inode fork size against fork format
Eric Sandeen (1):
xfs: revert "xfs: actually bump warning counts when we send warnings"
fs/xfs/libxfs/xfs_inode_buf.c | 35 +++++++++++++++++------
fs/xfs/xfs_filestream.c | 7 +++--
fs/xfs/xfs_fsops.c | 52 ++++++++++++++---------------------
fs/xfs/xfs_inode.c | 22 ++++++++-------
fs/xfs/xfs_mount.h | 8 ++++++
fs/xfs/xfs_trans_dquot.c | 1 -
6 files changed, 71 insertions(+), 54 deletions(-)
--
2.25.1
This is a note to let you know that I've just added the patch titled
binder: fix alloc->vma_vm_mm null-ptr dereference
to my char-misc git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git
in the char-misc-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
From 1da52815d5f1b654c89044db0cdc6adce43da1f1 Mon Sep 17 00:00:00 2001
From: Carlos Llamas <cmllamas(a)google.com>
Date: Mon, 29 Aug 2022 20:12:48 +0000
Subject: binder: fix alloc->vma_vm_mm null-ptr dereference
Syzbot reported a couple issues introduced by commit 44e602b4e52f
("binder_alloc: add missing mmap_lock calls when using the VMA"), in
which we attempt to acquire the mmap_lock when alloc->vma_vm_mm has not
been initialized yet.
This can happen if a binder_proc receives a transaction without having
previously called mmap() to setup the binder_proc->alloc space in [1].
Also, a similar issue occurs via binder_alloc_print_pages() when we try
to dump the debugfs binder stats file in [2].
Sample of syzbot's crash report:
==================================================================
KASAN: null-ptr-deref in range [0x0000000000000128-0x000000000000012f]
CPU: 0 PID: 3755 Comm: syz-executor229 Not tainted 6.0.0-rc1-next-20220819-syzkaller #0
syz-executor229[3755] cmdline: ./syz-executor2294415195
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 07/22/2022
RIP: 0010:__lock_acquire+0xd83/0x56d0 kernel/locking/lockdep.c:4923
[...]
Call Trace:
<TASK>
lock_acquire kernel/locking/lockdep.c:5666 [inline]
lock_acquire+0x1ab/0x570 kernel/locking/lockdep.c:5631
down_read+0x98/0x450 kernel/locking/rwsem.c:1499
mmap_read_lock include/linux/mmap_lock.h:117 [inline]
binder_alloc_new_buf_locked drivers/android/binder_alloc.c:405 [inline]
binder_alloc_new_buf+0xa5/0x19e0 drivers/android/binder_alloc.c:593
binder_transaction+0x242e/0x9a80 drivers/android/binder.c:3199
binder_thread_write+0x664/0x3220 drivers/android/binder.c:3986
binder_ioctl_write_read drivers/android/binder.c:5036 [inline]
binder_ioctl+0x3470/0x6d00 drivers/android/binder.c:5323
vfs_ioctl fs/ioctl.c:51 [inline]
__do_sys_ioctl fs/ioctl.c:870 [inline]
__se_sys_ioctl fs/ioctl.c:856 [inline]
__x64_sys_ioctl+0x193/0x200 fs/ioctl.c:856
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x63/0xcd
[...]
==================================================================
Fix these issues by setting up alloc->vma_vm_mm pointer during open()
and caching directly from current->mm. This guarantees we have a valid
reference to take the mmap_lock during scenarios described above.
[1] https://syzkaller.appspot.com/bug?extid=f7dc54e5be28950ac459
[2] https://syzkaller.appspot.com/bug?extid=a75ebe0452711c9e56d9
Fixes: 44e602b4e52f ("binder_alloc: add missing mmap_lock calls when using the VMA")
Cc: <stable(a)vger.kernel.org> # v5.15+
Cc: Liam R. Howlett <Liam.Howlett(a)oracle.com>
Reported-by: syzbot+f7dc54e5be28950ac459(a)syzkaller.appspotmail.com
Reported-by: syzbot+a75ebe0452711c9e56d9(a)syzkaller.appspotmail.com
Reviewed-by: Liam R. Howlett <Liam.Howlett(a)oracle.com>
Acked-by: Todd Kjos <tkjos(a)google.com>
Signed-off-by: Carlos Llamas <cmllamas(a)google.com>
Link: https://lore.kernel.org/r/20220829201254.1814484-2-cmllamas@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/android/binder_alloc.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c
index 1014beb12802..a61df6e1103a 100644
--- a/drivers/android/binder_alloc.c
+++ b/drivers/android/binder_alloc.c
@@ -322,7 +322,6 @@ static inline void binder_alloc_set_vma(struct binder_alloc *alloc,
*/
if (vma) {
vm_start = vma->vm_start;
- alloc->vma_vm_mm = vma->vm_mm;
mmap_assert_write_locked(alloc->vma_vm_mm);
} else {
mmap_assert_locked(alloc->vma_vm_mm);
@@ -792,7 +791,6 @@ int binder_alloc_mmap_handler(struct binder_alloc *alloc,
binder_insert_free_buffer(alloc, buffer);
alloc->free_async_space = alloc->buffer_size / 2;
binder_alloc_set_vma(alloc, vma);
- mmgrab(alloc->vma_vm_mm);
return 0;
@@ -1080,6 +1078,8 @@ static struct shrinker binder_shrinker = {
void binder_alloc_init(struct binder_alloc *alloc)
{
alloc->pid = current->group_leader->pid;
+ alloc->vma_vm_mm = current->mm;
+ mmgrab(alloc->vma_vm_mm);
mutex_init(&alloc->mutex);
INIT_LIST_HEAD(&alloc->buffers);
}
--
2.37.3
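The underlying pattern is pinning the mm_struct for the lifetime of the object that caches it; a sketch with hypothetical names (mmgrab()/mmdrop() are the real APIs, the rest is invented):

	/* Sketch: cache and pin current->mm at init, drop it at teardown. */
	static void example_alloc_init(struct example_alloc *alloc)
	{
		alloc->mm = current->mm;	/* init runs in process context */
		mmgrab(alloc->mm);	/* pins the mm_struct, not the mappings */
	}

	static void example_alloc_destroy(struct example_alloc *alloc)
	{
		mmdrop(alloc->mm);	/* pairs with mmgrab() above */
	}

With this, taking mmap_read_lock(alloc->mm) later is always done on a valid pointer, even if the transaction arrives before any mmap().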
Both __device_attach_driver() and __driver_attach() check the return
code of the bus_type.match() function to see if the device needs to be
added to the deferred probe list. After adding the device to the list,
the logic attempts to bind the device to the driver anyway, as if the
device had matched with the driver, which is not correct.
If __device_attach_driver() detects that the device in question is not
ready to match with a driver on the bus, then it doesn't make sense for
the device to attempt to bind with the current driver or continue
attempting to match with any of the other drivers on the bus. So, update
the logic in __device_attach_driver() to reflect this.
If __driver_attach() detects that a driver tried to match with a device
that is not ready to match yet, then the driver should not attempt to bind
with the device. However, the driver can still attempt to match and bind
with other devices on the bus, as drivers can be bound to multiple
devices. So, update the logic in __driver_attach() to reflect this.
Cc: stable(a)vger.kernel.org
Cc: Saravana Kannan <saravanak(a)google.com>
Fixes: 656b8035b0ee ("ARM: 8524/1: driver cohandle -EPROBE_DEFER from bus_type.match()")
Reported-by: Guenter Roeck <linux(a)roeck-us.net>
Signed-off-by: Isaac J. Manjarres <isaacmanjarres(a)google.com>
Tested-by: Guenter Roeck <linux(a)roeck-us.net>
Reviewed-by: Saravana Kannan <saravanak(a)google.com>
---
drivers/base/dd.c | 10 ++++++++++
1 file changed, 10 insertions(+)
v1 -> v2:
- Fixed the logic in __driver_attach() to allow a driver to continue
attempting to match and bind with devices in case of any error, not
just probe deferral.
v2 -> v3:
- Restored the patch back to v1.
- Added Guenter's Tested-by tag.
- Added Saravana's Reviewed-by tag.
- Cc'd stable(a)vger.kernel.org
Greg,
This is the final version of this patch. Can you please pick this up?
Thanks,
Isaac
diff --git a/drivers/base/dd.c b/drivers/base/dd.c
index 70f79fc71539..90b31fb141a5 100644
--- a/drivers/base/dd.c
+++ b/drivers/base/dd.c
@@ -881,6 +881,11 @@ static int __device_attach_driver(struct device_driver *drv, void *_data)
dev_dbg(dev, "Device match requests probe deferral\n");
dev->can_match = true;
driver_deferred_probe_add(dev);
+ /*
+ * Device can't match with a driver right now, so don't attempt
+ * to match or bind with other drivers on the bus.
+ */
+ return ret;
} else if (ret < 0) {
dev_dbg(dev, "Bus failed to match device: %d\n", ret);
return ret;
@@ -1120,6 +1125,11 @@ static int __driver_attach(struct device *dev, void *data)
dev_dbg(dev, "Device match requests probe deferral\n");
dev->can_match = true;
driver_deferred_probe_add(dev);
+ /*
+ * Driver could not match with device, but may match with
+ * another device on the bus.
+ */
+ return 0;
} else if (ret < 0) {
dev_dbg(dev, "Bus failed to match device: %d\n", ret);
return ret;
--
2.37.1.595.g718a3a8f04-goog
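For context, the deferral originates from a bus .match() callback returning -EPROBE_DEFER; a hypothetical example of such a callback (names invented, not from any real bus):

	/* Hypothetical bus match callback that triggers the deferral path. */
	static int example_bus_match(struct device *dev, struct device_driver *drv)
	{
		/*
		 * The information needed to decide the match is not
		 * available yet (e.g. firmware data still loading):
		 * ask the core to defer this device.
		 */
		if (!example_match_data_ready(dev))
			return -EPROBE_DEFER;

		return strcmp(dev_name(dev), drv->name) == 0;
	}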
Like crashk_res, calling the crash_exclude_mem_range() function with the
crashk_low_res area needs an extra crash_mem range too.
Add one more cmem slot in case crashk_low_res is used.
Signed-off-by: Levi Yun <ppbuk5246(a)gmail.com>
Fixes: 944a45abfabc ("arm64: kdump: Reimplement crashkernel=X")
Cc: <stable(a)vger.kernel.org> # 5.19.x
Acked-by: Baoquan He <bhe(a)redhat.com>
Reviewed-by: Catalin Marinas <catalin.marinas(a)arm.com>
---
arch/arm64/kernel/machine_kexec_file.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c
index 889951291cc0..a11a6e14ba89 100644
--- a/arch/arm64/kernel/machine_kexec_file.c
+++ b/arch/arm64/kernel/machine_kexec_file.c
@@ -47,7 +47,7 @@ static int prepare_elf_headers(void **addr, unsigned long *sz)
u64 i;
phys_addr_t start, end;
- nr_ranges = 1; /* for exclusion of crashkernel region */
+ nr_ranges = 2; /* for exclusion of crashkernel region */
for_each_mem_range(i, &start, &end)
nr_ranges++;
--
2.35.1
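The arithmetic behind the extra slot, as a worked hypothetical example:

	/*
	 * Each crash_exclude_mem_range() call can split one existing
	 * range in two, growing the range count by at most one:
	 *
	 *   [0x0000 - 0xffff]                            1 range
	 *   exclude crashk_res     [0x4000 - 0x4fff]  -> 2 ranges
	 *   exclude crashk_low_res [0x8000 - 0x8fff]  -> 3 ranges
	 *
	 * hence prepare_elf_headers() must reserve nr_ranges = 2 extra
	 * slots on top of the memblock ranges, instead of 1.
	 */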
From: Jing Leng <jleng(a)ambarella.com>
commit 23a0cb8e3225122496bfa79172005c587c2d64bf upstream.
When building an external module, if users don't need to separate the
compilation output and source code, they run the following command:
"make -C $(LINUX_SRC_DIR) M=$(PWD)". At this point, "$(KBUILD_EXTMOD)"
and "$(src)" are the same.
If they need to separate them, they run "make -C $(KERNEL_SRC_DIR)
O=$(KERNEL_OUT_DIR) M=$(OUT_DIR) src=$(PWD)". Before running the
command, they need to copy "Kbuild" or "Makefile" to "$(OUT_DIR)" to
prevent compilation failure.
So the kernel should change the included path to avoid the copy operation.
Signed-off-by: Jing Leng <jleng(a)ambarella.com>
[masahiro: I do not think "M=$(OUT_DIR) src=$(PWD)" is the official way,
but this patch is a nice clean up anyway.]
Signed-off-by: Masahiro Yamada <masahiroy(a)kernel.org>
[nsc: updated context for v5.4]
Signed-off-by: Nicolas Schier <n.schier(a)avm.de>
---
scripts/Makefile.modpost | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/scripts/Makefile.modpost b/scripts/Makefile.modpost
index 48585c4d04ad..0273bf7375e2 100644
--- a/scripts/Makefile.modpost
+++ b/scripts/Makefile.modpost
@@ -87,8 +87,7 @@ obj := $(KBUILD_EXTMOD)
src := $(obj)
# Include the module's Makefile to find KBUILD_EXTRA_SYMBOLS
-include $(if $(wildcard $(KBUILD_EXTMOD)/Kbuild), \
- $(KBUILD_EXTMOD)/Kbuild, $(KBUILD_EXTMOD)/Makefile)
+include $(if $(wildcard $(src)/Kbuild), $(src)/Kbuild, $(src)/Makefile)
# modpost option for external modules
MODPOST += -e
--
2.37.2
From: Jing Leng <jleng(a)ambarella.com>
commit 23a0cb8e3225122496bfa79172005c587c2d64bf upstream.
When building an external module, if users don't need to separate the
compilation output and source code, they run the following command:
"make -C $(LINUX_SRC_DIR) M=$(PWD)". At this point, "$(KBUILD_EXTMOD)"
and "$(src)" are the same.
If they need to separate them, they run "make -C $(KERNEL_SRC_DIR)
O=$(KERNEL_OUT_DIR) M=$(OUT_DIR) src=$(PWD)". Before running the
command, they need to copy "Kbuild" or "Makefile" to "$(OUT_DIR)" to
prevent compilation failure.
So the kernel should change the included path to avoid the copy operation.
Signed-off-by: Jing Leng <jleng(a)ambarella.com>
[masahiro: I do not think "M=$(OUT_DIR) src=$(PWD)" is the official way,
but this patch is a nice clean up anyway.]
Signed-off-by: Masahiro Yamada <masahiroy(a)kernel.org>
[nsc: updated context for v4.19]
Signed-off-by: Nicolas Schier <n.schier(a)avm.de>
---
scripts/Makefile.modpost | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/scripts/Makefile.modpost b/scripts/Makefile.modpost
index 51884c7b8069..4eac2ecb35fb 100644
--- a/scripts/Makefile.modpost
+++ b/scripts/Makefile.modpost
@@ -51,8 +51,7 @@ obj := $(KBUILD_EXTMOD)
src := $(obj)
# Include the module's Makefile to find KBUILD_EXTRA_SYMBOLS
-include $(if $(wildcard $(KBUILD_EXTMOD)/Kbuild), \
- $(KBUILD_EXTMOD)/Kbuild, $(KBUILD_EXTMOD)/Makefile)
+include $(if $(wildcard $(src)/Kbuild), $(src)/Kbuild, $(src)/Makefile)
endif
include scripts/Makefile.lib
--
2.37.2
From: Jing Leng <jleng(a)ambarella.com>
commit 23a0cb8e3225122496bfa79172005c587c2d64bf upstream.
When building an external module, if users don't need to separate the
compilation output and source code, they run the following command:
"make -C $(LINUX_SRC_DIR) M=$(PWD)". At this point, "$(KBUILD_EXTMOD)"
and "$(src)" are the same.
If they need to separate them, they run "make -C $(KERNEL_SRC_DIR)
O=$(KERNEL_OUT_DIR) M=$(OUT_DIR) src=$(PWD)". Before running the
command, they need to copy "Kbuild" or "Makefile" to "$(OUT_DIR)" to
prevent compilation failure.
So the kernel should change the included path to avoid the copy operation.
Signed-off-by: Jing Leng <jleng(a)ambarella.com>
[masahiro: I do not think "M=$(OUT_DIR) src=$(PWD)" is the official way,
but this patch is a nice clean up anyway.]
Signed-off-by: Masahiro Yamada <masahiroy(a)kernel.org>
Signed-off-by: Nicolas Schier <n.schier(a)avm.de>
---
scripts/Makefile.modpost | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/scripts/Makefile.modpost b/scripts/Makefile.modpost
index 12a87be0fb44..42154b6df652 100644
--- a/scripts/Makefile.modpost
+++ b/scripts/Makefile.modpost
@@ -87,8 +87,7 @@ obj := $(KBUILD_EXTMOD)
src := $(obj)
# Include the module's Makefile to find KBUILD_EXTRA_SYMBOLS
-include $(if $(wildcard $(KBUILD_EXTMOD)/Kbuild), \
- $(KBUILD_EXTMOD)/Kbuild, $(KBUILD_EXTMOD)/Makefile)
+include $(if $(wildcard $(src)/Kbuild), $(src)/Kbuild, $(src)/Makefile)
# modpost option for external modules
MODPOST += -e
--
2.37.2
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 41ac42f137080bc230b5882e3c88c392ab7f2d32 Mon Sep 17 00:00:00 2001
From: Gerald Schaefer <gerald.schaefer(a)linux.ibm.com>
Date: Wed, 17 Aug 2022 15:26:03 +0200
Subject: [PATCH] s390/mm: do not trigger write fault when vma does not allow
VM_WRITE
For non-protection pXd_none() page faults in do_dat_exception(), we
call do_exception() with access == (VM_READ | VM_WRITE | VM_EXEC).
In do_exception(), vma->vm_flags is checked against that before
calling handle_mm_fault().
Since commit 92f842eac7ee3 ("[S390] store indication fault optimization"),
we call handle_mm_fault() with FAULT_FLAG_WRITE, when recognizing that
it was a write access. However, the vma flags check is still only
checking against (VM_READ | VM_WRITE | VM_EXEC), and therefore also
calling handle_mm_fault() with FAULT_FLAG_WRITE in cases where the vma
does not allow VM_WRITE.
Fix this by changing access check in do_exception() to VM_WRITE only,
when recognizing write access.
Link: https://lkml.kernel.org/r/20220811103435.188481-3-david@redhat.com
Fixes: 92f842eac7ee3 ("[S390] store indication fault optimization")
Cc: <stable(a)vger.kernel.org>
Reported-by: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Heiko Carstens <hca(a)linux.ibm.com>
Signed-off-by: Gerald Schaefer <gerald.schaefer(a)linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor(a)linux.ibm.com>
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 13449941516c..09b6e756d521 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -379,7 +379,9 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
flags = FAULT_FLAG_DEFAULT;
if (user_mode(regs))
flags |= FAULT_FLAG_USER;
- if (access == VM_WRITE || is_write)
+ if (is_write)
+ access = VM_WRITE;
+ if (access == VM_WRITE)
flags |= FAULT_FLAG_WRITE;
mmap_read_lock(mm);
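In context, the fixed flow looks roughly like this (the surrounding do_exception() logic is paraphrased, label and check names are illustrative, not quoted verbatim):

	/* Paraphrased sketch of the fixed access handling. */
	if (is_write)
		access = VM_WRITE;	/* a write fault must require VM_WRITE */
	if (access == VM_WRITE)
		flags |= FAULT_FLAG_WRITE;
	/* ... */
	if (unlikely(!(vma->vm_flags & access)))
		goto out_error;	/* now rejects writes into !VM_WRITE vmas
				 * instead of calling handle_mm_fault()
				 * with FAULT_FLAG_WRITE */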
From: James Morse <james.morse(a)arm.com>
commit 39fdb65f52e9a53d32a6ba719f96669fd300ae78 upstream.
Cortex-A510 is affected by an erratum where in rare circumstances the
CPUs may not handle a race between a break-before-make sequence on one
CPU, and another CPU accessing the same page. This could allow a store
to a page that has been unmapped.
Work around this by adding the affected CPUs to the list that needs
TLB sequences to be done twice.
Cc: stable(a)vger.kernel.org # 5.15.x
Signed-off-by: James Morse <james.morse(a)arm.com>
Link: https://lore.kernel.org/r/20220704155732.21216-1-james.morse@arm.com
Signed-off-by: Will Deacon <will(a)kernel.org>
Signed-off-by: Lucas Wei <lucaswei(a)google.com>
---
Documentation/arm64/silicon-errata.rst | 2 ++
arch/arm64/Kconfig | 17 +++++++++++++++++
arch/arm64/kernel/cpu_errata.c | 8 +++++++-
3 files changed, 26 insertions(+), 1 deletion(-)
diff --git a/Documentation/arm64/silicon-errata.rst b/Documentation/arm64/silicon-errata.rst
index 7c1750bcc5bd..46644736e583 100644
--- a/Documentation/arm64/silicon-errata.rst
+++ b/Documentation/arm64/silicon-errata.rst
@@ -92,6 +92,8 @@ stable kernels.
+----------------+-----------------+-----------------+-----------------------------+
| ARM | Cortex-A77 | #1508412 | ARM64_ERRATUM_1508412 |
+----------------+-----------------+-----------------+-----------------------------+
+| ARM | Cortex-A510 | #2441009 | ARM64_ERRATUM_2441009 |
++----------------+-----------------+-----------------+-----------------------------+
| ARM | Neoverse-N1 | #1188873,1418040| ARM64_ERRATUM_1418040 |
+----------------+-----------------+-----------------+-----------------------------+
| ARM | Neoverse-N1 | #1349291 | N/A |
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 69e7e293f72e..9d80c783142f 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -666,6 +666,23 @@ config ARM64_ERRATUM_1508412
If unsure, say Y.
+config ARM64_ERRATUM_2441009
+ bool "Cortex-A510: Completion of affected memory accesses might not be guaranteed by completion of a TLBI"
+ default y
+ select ARM64_WORKAROUND_REPEAT_TLBI
+ help
+ This option adds a workaround for ARM Cortex-A510 erratum #2441009.
+
+ Under very rare circumstances, affected Cortex-A510 CPUs
+ may not handle a race between a break-before-make sequence on one
+ CPU, and another CPU accessing the same page. This could allow a
+ store to a page that has been unmapped.
+
+ Work around this by adding the affected CPUs to the list that needs
+ TLB sequences to be done twice.
+
+ If unsure, say Y.
+
config CAVIUM_ERRATUM_22375
bool "Cavium erratum 22375, 24313"
default y
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index c67c19d70159..e1be45fc7f5b 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -211,6 +211,12 @@ static const struct arm64_cpu_capabilities arm64_repeat_tlbi_list[] = {
/* Kryo4xx Gold (rcpe to rfpe) => (r0p0 to r3p0) */
ERRATA_MIDR_RANGE(MIDR_QCOM_KRYO_4XX_GOLD, 0xc, 0xe, 0xf, 0xe),
},
+#endif
+#ifdef CONFIG_ARM64_ERRATUM_2441009
+ {
+ /* Cortex-A510 r0p0 -> r1p1. Fixed in r1p2 */
+ ERRATA_MIDR_RANGE(MIDR_CORTEX_A510, 0, 0, 1, 1),
+ },
#endif
{},
};
@@ -427,7 +433,7 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
#endif
#ifdef CONFIG_ARM64_WORKAROUND_REPEAT_TLBI
{
- .desc = "Qualcomm erratum 1009, or ARM erratum 1286807",
+ .desc = "Qualcomm erratum 1009, or ARM erratum 1286807, 2441009",
.capability = ARM64_WORKAROUND_REPEAT_TLBI,
.type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM,
.matches = cpucap_multi_entry_cap_matches,
base-commit: addc9003c2e895fe8a068a66de1de6fdb4c6ac60
--
2.37.2.672.g94769d06f0-goog
Hi,
I think we also want this patch in 5.15 stable. Without this patch, the
hv_balloon driver will report incorrect memory usage to the hypervisor
when running on ARM64 with PAGE_SIZE > 4k. Only 5.15 needs this because
ARM64 support for Hyper-V guests arrived in 5.15.
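For reference, a minimal sketch of the unit mismatch, assuming the usual
definition NR_HV_HYP_PAGES_IN_PAGE == PAGE_SIZE / HV_HYP_PAGE_SIZE (with
HV_HYP_PAGE_SIZE fixed at 4k by the Hyper-V ABI):

	/*
	 * Illustration only: on ARM64 with 64k guest pages,
	 * NR_HV_HYP_PAGES_IN_PAGE == 64k / 4k == 16, so reporting raw
	 * guest-page counts understates memory usage by a factor of 16.
	 */
	status.num_avail = num_pages_avail * NR_HV_HYP_PAGES_IN_PAGE;
	status.num_committed = num_pages_committed * NR_HV_HYP_PAGES_IN_PAGE;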
Upstream id b3d6dd09ff00fdcf4f7c0cb54700ffd5dd343502
Cc: <stable(a)vger.kernel.org> # 5.15.x
Thanks!
Regards,
Boqun
On Fri, Mar 25, 2022 at 10:32:11AM +0800, Boqun Feng wrote:
> DM_STATUS_REPORT expects the numbers of pages in the unit of 4k pages
> (HV_HYP_PAGE) instead of guest pages, so to make it work when guest page
> sizes are larger than 4k, convert the numbers of guest pages into the
> numbers of HV_HYP_PAGEs.
>
> Note that the numbers of guest pages are still used for tracing because
> tracing is internal to the guest kernel.
>
> Reported-by: Vitaly Kuznetsov <vkuznets(a)redhat.com>
> Signed-off-by: Boqun Feng <boqun.feng(a)gmail.com>
> Reviewed-by: Michael Kelley <mikelley(a)microsoft.com>
> ---
> drivers/hv/hv_balloon.c | 13 ++++++++++---
> 1 file changed, 10 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
> index f2d05bff4245..062156b88a87 100644
> --- a/drivers/hv/hv_balloon.c
> +++ b/drivers/hv/hv_balloon.c
> @@ -17,6 +17,7 @@
> #include <linux/slab.h>
> #include <linux/kthread.h>
> #include <linux/completion.h>
> +#include <linux/count_zeros.h>
> #include <linux/memory_hotplug.h>
> #include <linux/memory.h>
> #include <linux/notifier.h>
> @@ -1130,6 +1131,7 @@ static void post_status(struct hv_dynmem_device *dm)
> struct dm_status status;
> unsigned long now = jiffies;
> unsigned long last_post = last_post_time;
> + unsigned long num_pages_avail, num_pages_committed;
>
> if (pressure_report_delay > 0) {
> --pressure_report_delay;
> @@ -1154,16 +1156,21 @@ static void post_status(struct hv_dynmem_device *dm)
> * num_pages_onlined) as committed to the host, otherwise it can try
> * asking us to balloon them out.
> */
> - status.num_avail = si_mem_available();
> - status.num_committed = vm_memory_committed() +
> + num_pages_avail = si_mem_available();
> + num_pages_committed = vm_memory_committed() +
> dm->num_pages_ballooned +
> (dm->num_pages_added > dm->num_pages_onlined ?
> dm->num_pages_added - dm->num_pages_onlined : 0) +
> compute_balloon_floor();
>
> - trace_balloon_status(status.num_avail, status.num_committed,
> + trace_balloon_status(num_pages_avail, num_pages_committed,
> vm_memory_committed(), dm->num_pages_ballooned,
> dm->num_pages_added, dm->num_pages_onlined);
> +
> + /* Convert numbers of pages into numbers of HV_HYP_PAGEs. */
> + status.num_avail = num_pages_avail * NR_HV_HYP_PAGES_IN_PAGE;
> + status.num_committed = num_pages_committed * NR_HV_HYP_PAGES_IN_PAGE;
> +
> /*
> * If our transaction ID is no longer current, just don't
> * send the status. This can happen if we were interrupted
> --
> 2.35.1
>
The soc/fsl/dpio driver will perform a soc_device_match()
to determine the optimal cache settings for a given CPU core.
If FSL_GUTS is not enabled, this search will fail and
the driver will not configure cache stashing for the given
DPIO, and a string of "unknown SoC" messages will appear:
fsl_mc_dpio dpio.7: unknown SoC version
fsl_mc_dpio dpio.6: unknown SoC version
fsl_mc_dpio dpio.5: unknown SoC version
Signed-off-by: Mathew McBride <matt(a)traverse.com.au>
Fixes: 51da14e96e9b ("soc: fsl: dpio: configure cache stashing destination")
Cc: stable(a)vger.kernel.org
Reviewed-by: Ioana Ciornei <ioana.ciornei(a)nxp.com>
---
drivers/soc/fsl/Kconfig | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/soc/fsl/Kconfig b/drivers/soc/fsl/Kconfig
index 07d52cafbb31..fcec6ed83d5e 100644
--- a/drivers/soc/fsl/Kconfig
+++ b/drivers/soc/fsl/Kconfig
@@ -24,6 +24,7 @@ config FSL_MC_DPIO
tristate "QorIQ DPAA2 DPIO driver"
depends on FSL_MC_BUS
select SOC_BUS
+ select FSL_GUTS
select DIMLIB
help
Driver for the DPAA2 DPIO object. A DPIO provides queue and
--
2.30.1
__mkroute_input() uses fib_validate_source() to decide whether to trigger
an ICMP redirect.
My understanding is that fib_validate_source() is used to tell whether the
src address and the gateway address are on the same link. For that,
fib_validate_source() returns 1 (same link) or 0 (not the same network).
__mkroute_input() is the only user of these positive values; all other
callers only check whether the returned value is negative.
Since the commit cited below, fib_validate_source() no longer returns 1
when both addresses are on the same network, because the route lookup
returns RT_SCOPE_LINK instead of RT_SCOPE_HOST. That lookup result is, in
fact, correct.
Let's adapt the test so that it returns 1 again when both addresses are on
the same link.
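For context, the scope values are ordered so that narrower scopes have
larger numeric values (from include/uapi/linux/rtnetlink.h), which is why
'>= RT_SCOPE_LINK' accepts both link and host scope while
'>= RT_SCOPE_HOST' accepts host scope only:

	enum rt_scope_t {
		RT_SCOPE_UNIVERSE = 0,
		/* User defined values */
		RT_SCOPE_SITE = 200,
		RT_SCOPE_LINK = 253,
		RT_SCOPE_HOST = 254,
		RT_SCOPE_NOWHERE = 255
	};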
CC: stable(a)vger.kernel.org
Fixes: 747c14307214 ("ip: fix dflt addr selection for connected nexthop")
Reported-by: kernel test robot <yujie.liu(a)intel.com>
Reported-by: Heng Qi <hengqi(a)linux.alibaba.com>
Signed-off-by: Nicolas Dichtel <nicolas.dichtel(a)6wind.com>
---
This code has existed for more than two decades:
https://git.kernel.org/pub/scm/linux/kernel/git/davem/netdev-vger-cvs.git/c…
Please, feel free to comment if my analysis seems wrong.
Regards,
Nicolas
net/ipv4/fib_frontend.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index f361d3d56be2..943edf4ad4db 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -389,7 +389,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
dev_match = dev_match || (res.type == RTN_LOCAL &&
dev == net->loopback_dev);
if (dev_match) {
- ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST;
+ ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_LINK;
return ret;
}
if (no_addr)
@@ -401,7 +401,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
ret = 0;
if (fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE) == 0) {
if (res.type == RTN_UNICAST)
- ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST;
+ ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_LINK;
}
return ret;
--
2.33.0
commit b67fbebd4cf980aecbcc750e1462128bffe8ae15 upstream.
Some drivers rely on having all VMAs through which a PFN might be
accessible listed in the rmap for correctness.
However, on X86, it was possible for a VMA with stale TLB entries
to not be listed in the rmap.
This was fixed in mainline with
commit b67fbebd4cf9 ("mmu_gather: Force tlb-flush VM_PFNMAP vmas"),
but that commit relies on preceding refactoring in
commit 18ba064e42df3 ("mmu_gather: Let there be one tlb_{start,end}_vma()
implementation") and commit 1e9fdf21a4339 ("mmu_gather: Remove per arch
tlb_{start,end}_vma()").
This patch provides equivalent protection without needing that
refactoring, by forcing a TLB flush between removing PTEs in
unmap_vmas() and the call to unlink_file_vma() in free_pgtables().
[This is a stable-specific rewrite of the upstream commit!]
Signed-off-by: Jann Horn <jannh(a)google.com>
---
mm/mmap.c | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/mm/mmap.c b/mm/mmap.c
index 031fca1a7c65e..ce13266a55c69 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2639,6 +2639,18 @@ static void unmap_region(struct mm_struct *mm,
tlb_gather_mmu(&tlb, mm);
update_hiwater_rss(mm);
unmap_vmas(&tlb, vma, start, end);
+
+ /*
+ * Ensure we have no stale TLB entries by the time this mapping is
+ * removed from the rmap.
+ * Note that we don't have to worry about nested flushes here because
+ * we're holding the mm semaphore for removing the mapping - so any
+ * concurrent flush in this region has to be coming through the rmap,
+ * and we synchronize against that using the rmap lock.
+ */
+ if ((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) != 0)
+ tlb_flush_mmu(&tlb);
+
free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
next ? next->vm_start : USER_PGTABLES_CEILING);
tlb_finish_mmu(&tlb);
base-commit: addc9003c2e895fe8a068a66de1de6fdb4c6ac60
--
2.37.2.672.g94769d06f0-goog
Xen blkfront advertises its support of the persistent grants feature when
it is first set up and when resuming, in 'talk_to_blkback()'. Blkback
then reads the advertised value when it connects with blkfront, decides
whether it will use the persistent grants feature, and advertises its
decision to blkfront. Blkfront reads blkback's decision and likewise
makes its own decision about using the feature.
Commit 402c43ea6b34 ("xen-blkfront: Apply 'feature_persistent' parameter
when connect"), however, moved blkfront's read of the parameter for
disabling the advertisement, namely 'feature_persistent', to negotiation
time instead of advertisement time. Blkfront therefore advertises without
reading the parameter. As the field caching the parameter value is
zero-initialized, blkfront always advertises the feature as disabled, so
the persistent grants feature ends up always disabled.
This commit fixes the issue by making blkfront cache the parameter value
just before the advertisement.
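Schematically, the intended logic is (a sketch of the semantics, not the
literal driver code; see the diff below for the actual change):

	/* Advertise availability: depends only on our module parameter. */
	info->feature_persistent_parm = feature_persistent;
	xenbus_printf(xbt, dev->nodename, "feature-persistent", "%u",
		      info->feature_persistent_parm);

	/* Use the feature only if we want it AND the other end offers it. */
	info->feature_persistent = info->feature_persistent_parm &&
		xenbus_read_unsigned(info->xbdev->otherend,
				     "feature-persistent", 0);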
Fixes: 402c43ea6b34 ("xen-blkfront: Apply 'feature_persistent' parameter when connect")
Cc: <stable(a)vger.kernel.org> # 5.10.x
Reported-by: Marek Marczykowski-Górecki <marmarek(a)invisiblethingslab.com>
Signed-off-by: SeongJae Park <sj(a)kernel.org>
---
drivers/block/xen-blkfront.c | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index dfae08115450..35b9bcad9db9 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -1759,6 +1759,12 @@ static int write_per_ring_nodes(struct xenbus_transaction xbt,
return err;
}
+/* Enable the persistent grants feature. */
+static bool feature_persistent = true;
+module_param(feature_persistent, bool, 0644);
+MODULE_PARM_DESC(feature_persistent,
+ "Enables the persistent grants feature");
+
/* Common code used when first setting up, and when resuming. */
static int talk_to_blkback(struct xenbus_device *dev,
struct blkfront_info *info)
@@ -1850,6 +1856,7 @@ static int talk_to_blkback(struct xenbus_device *dev,
message = "writing protocol";
goto abort_transaction;
}
+ info->feature_persistent_parm = feature_persistent;
err = xenbus_printf(xbt, dev->nodename, "feature-persistent", "%u",
info->feature_persistent_parm);
if (err)
@@ -1919,12 +1926,6 @@ static int negotiate_mq(struct blkfront_info *info)
return 0;
}
-/* Enable the persistent grants feature. */
-static bool feature_persistent = true;
-module_param(feature_persistent, bool, 0644);
-MODULE_PARM_DESC(feature_persistent,
- "Enables the persistent grants feature");
-
/*
* Entry point to this code when a new device is created. Allocate the basic
* structures and the ring buffer for communication with the backend, and
@@ -2284,7 +2285,6 @@ static void blkfront_gather_backend_features(struct blkfront_info *info)
if (xenbus_read_unsigned(info->xbdev->otherend, "feature-discard", 0))
blkfront_setup_discard(info);
- info->feature_persistent_parm = feature_persistent;
if (info->feature_persistent_parm)
info->feature_persistent =
!!xenbus_read_unsigned(info->xbdev->otherend,
--
2.25.1
The advertisement of the persistent grants feature (writing
'feature-persistent' to xenbus) should indicate only the availability of
the feature, not the decision to use it. However, commit 74a852479c68
("xen-blkfront: add a parameter for disabling of persistent grants") made
a field of blkfront, which was meant to hold only the negotiation result,
serve a second purpose: caching the 'feature_persistent' parameter value.
As a result the advertisement, which should depend only on the parameter
value, became inconsistent.
This commit fixes the misuse of the semantics by making blkfront save the
parameter value in a separate place and advertise the support based only
on the saved value.
Fixes: 74a852479c68 ("xen-blkfront: add a parameter for disabling of persistent grants")
Cc: <stable(a)vger.kernel.org> # 5.10.x
Suggested-by: Juergen Gross <jgross(a)suse.com>
Signed-off-by: SeongJae Park <sj(a)kernel.org>
---
drivers/block/xen-blkfront.c | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 8e56e69fb4c4..dfae08115450 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -213,6 +213,9 @@ struct blkfront_info
unsigned int feature_fua:1;
unsigned int feature_discard:1;
unsigned int feature_secdiscard:1;
+ /* Connect-time cached feature_persistent parameter */
+ unsigned int feature_persistent_parm:1;
+ /* Persistent grants feature negotiation result */
unsigned int feature_persistent:1;
unsigned int bounce:1;
unsigned int discard_granularity;
@@ -1848,7 +1851,7 @@ static int talk_to_blkback(struct xenbus_device *dev,
goto abort_transaction;
}
err = xenbus_printf(xbt, dev->nodename, "feature-persistent", "%u",
- info->feature_persistent);
+ info->feature_persistent_parm);
if (err)
dev_warn(&dev->dev,
"writing persistent grants feature to xenbus");
@@ -2281,7 +2284,8 @@ static void blkfront_gather_backend_features(struct blkfront_info *info)
if (xenbus_read_unsigned(info->xbdev->otherend, "feature-discard", 0))
blkfront_setup_discard(info);
- if (feature_persistent)
+ info->feature_persistent_parm = feature_persistent;
+ if (info->feature_persistent_parm)
info->feature_persistent =
!!xenbus_read_unsigned(info->xbdev->otherend,
"feature-persistent", 0);
--
2.25.1
If the VDUSE application provides a smaller config space
than the driver expects, the driver may use uninitialized
memory from the stack.
This patch prevents it by initializing the buffer passed by
the driver to store the config value.
This fix addresses CVE-2022-2308.
Cc: stable(a)vger.kernel.org # v5.15+
Fixes: c8a6153b6c59 ("vduse: Introduce VDUSE - vDPA Device in Userspace")
Reviewed-by: Xie Yongji <xieyongji(a)bytedance.com>
Acked-by: Jason Wang <jasowang(a)redhat.com>
Signed-off-by: Maxime Coquelin <maxime.coquelin(a)redhat.com>
---
drivers/vdpa/vdpa_user/vduse_dev.c | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
index 41c0b29739f1..35dceee3ed56 100644
--- a/drivers/vdpa/vdpa_user/vduse_dev.c
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -673,10 +673,15 @@ static void vduse_vdpa_get_config(struct vdpa_device *vdpa, unsigned int offset,
{
struct vduse_dev *dev = vdpa_to_vduse(vdpa);
- if (offset > dev->config_size ||
- len > dev->config_size - offset)
+ /* Initialize the buffer in case of partial copy. */
+ memset(buf, 0, len);
+
+ if (offset > dev->config_size)
return;
+ if (len > dev->config_size - offset)
+ len = dev->config_size - offset;
+
memcpy(buf, dev->config + offset, len);
}
--
2.37.2
Commit e94c6101e151 ("xen-blkback: Apply 'feature_persistent' parameter
when connect") made blkback advertise its support of the persistent
grants feature only if the user set the 'feature_persistent' parameter of
the driver and the frontend advertised its support of the feature.
However, the subsequent commit 402c43ea6b34 ("xen-blkfront: Apply
'feature_persistent' parameter when connect") made blkfront work in the
same way. That is, blkfront also advertises its support of the persistent
grants feature only if the user set the 'feature_persistent' parameter of
the driver and the backend advertised its support of the feature.
Hence blkback and blkfront never advertise their support of the feature;
each waits until the other advertises it, even when users set the
'feature_persistent' parameters of both drivers. As a result, the
persistent grants feature is always disabled, regardless of the
'feature_persistent' values[1].
The problem comes from a misuse of the semantics of the feature
advertisement. The advertisement should indicate only the availability of
the feature, not the decision to use it. The current behavior gets this
wrong.
This commit fixes the issue by making blkfront advertise its support of
the feature as the user requested via the 'feature_persistent' parameter,
regardless of the other end's support of the feature.
[1] https://lore.kernel.org/xen-devel/bd818aba-4857-bc07-dc8a-e9b2f8c5f7cd@suse…
Fixes: 402c43ea6b34 ("xen-blkfront: Apply 'feature_persistent' parameter when connect")
Cc: <stable(a)vger.kernel.org> # 5.10.x
Reported-by: Marek Marczykowski-Górecki <marmarek(a)invisiblethingslab.com>
Suggested-by: Juergen Gross <jgross(a)suse.com>
Signed-off-by: SeongJae Park <sj(a)kernel.org>
---
drivers/block/xen-blkfront.c | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 8e56e69fb4c4..dfae08115450 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -213,6 +213,9 @@ struct blkfront_info
unsigned int feature_fua:1;
unsigned int feature_discard:1;
unsigned int feature_secdiscard:1;
+ /* Connect-time cached feature_persistent parameter */
+ unsigned int feature_persistent_parm:1;
+ /* Persistent grants feature negotiation result */
unsigned int feature_persistent:1;
unsigned int bounce:1;
unsigned int discard_granularity;
@@ -1848,7 +1851,7 @@ static int talk_to_blkback(struct xenbus_device *dev,
goto abort_transaction;
}
err = xenbus_printf(xbt, dev->nodename, "feature-persistent", "%u",
- info->feature_persistent);
+ info->feature_persistent_parm);
if (err)
dev_warn(&dev->dev,
"writing persistent grants feature to xenbus");
@@ -2281,7 +2284,8 @@ static void blkfront_gather_backend_features(struct blkfront_info *info)
if (xenbus_read_unsigned(info->xbdev->otherend, "feature-discard", 0))
blkfront_setup_discard(info);
- if (feature_persistent)
+ info->feature_persistent_parm = feature_persistent;
+ if (info->feature_persistent_parm)
info->feature_persistent =
!!xenbus_read_unsigned(info->xbdev->otherend,
"feature-persistent", 0);
--
2.25.1
Commit e94c6101e151 ("xen-blkback: Apply 'feature_persistent' parameter
when connect") made blkback advertise its support of the persistent
grants feature only if the user set the 'feature_persistent' parameter of
the driver and the frontend advertised its support of the feature.
However, the subsequent commit 402c43ea6b34 ("xen-blkfront: Apply
'feature_persistent' parameter when connect") made blkfront work in the
same way. That is, blkfront also advertises its support of the persistent
grants feature only if the user set the 'feature_persistent' parameter of
the driver and the backend advertised its support of the feature.
Hence blkback and blkfront never advertise their support of the feature;
each waits until the other advertises it, even when users set the
'feature_persistent' parameters of both drivers. As a result, the
persistent grants feature is always disabled, regardless of the
'feature_persistent' values[1].
The problem comes from a misuse of the semantics of the feature
advertisement. The advertisement should indicate only the availability of
the feature, not the decision to use it. The current behavior gets this
wrong.
This commit fixes the issue by making blkback advertise its support of
the feature as the user requested via the 'feature_persistent' parameter,
regardless of the other end's support of the feature.
[1] https://lore.kernel.org/xen-devel/bd818aba-4857-bc07-dc8a-e9b2f8c5f7cd@suse…
Fixes: e94c6101e151 ("xen-blkback: Apply 'feature_persistent' parameter when connect")
Cc: <stable(a)vger.kernel.org> # 5.10.x
Reported-by: Marek Marczykowski-Górecki <marmarek(a)invisiblethingslab.com>
Suggested-by: Juergen Gross <jgross(a)suse.com>
Signed-off-by: SeongJae Park <sj(a)kernel.org>
---
drivers/block/xen-blkback/common.h | 3 +++
drivers/block/xen-blkback/xenbus.c | 6 ++++--
2 files changed, 7 insertions(+), 2 deletions(-)
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
index bda5c815e441..a28473470e66 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -226,6 +226,9 @@ struct xen_vbd {
sector_t size;
unsigned int flush_support:1;
unsigned int discard_secure:1;
+ /* Connect-time cached feature_persistent parameter value */
+ unsigned int feature_gnt_persistent_parm:1;
+ /* Persistent grants feature negotiation result */
unsigned int feature_gnt_persistent:1;
unsigned int overflow_max_grants:1;
};
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index ee7ad2fb432d..c0227dfa4688 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -907,7 +907,7 @@ static void connect(struct backend_info *be)
xen_blkbk_barrier(xbt, be, be->blkif->vbd.flush_support);
err = xenbus_printf(xbt, dev->nodename, "feature-persistent", "%u",
- be->blkif->vbd.feature_gnt_persistent);
+ be->blkif->vbd.feature_gnt_persistent_parm);
if (err) {
xenbus_dev_fatal(dev, err, "writing %s/feature-persistent",
dev->nodename);
@@ -1085,7 +1085,9 @@ static int connect_ring(struct backend_info *be)
return -ENOSYS;
}
- blkif->vbd.feature_gnt_persistent = feature_persistent &&
+ blkif->vbd.feature_gnt_persistent_parm = feature_persistent;
+ blkif->vbd.feature_gnt_persistent =
+ blkif->vbd.feature_gnt_persistent_parm &&
xenbus_read_unsigned(dev->otherend, "feature-persistent", 0);
blkif->vbd.overflow_max_grants = 0;
--
2.25.1
If the VDUSE application provides a smaller config space
than the driver expects, the driver may use uninitialized
memory from the stack.
This patch prevents it by initializing the buffer passed by
the driver to store the config value.
This fix addresses CVE-2022-2308.
Cc: xieyongji(a)bytedance.com
Cc: stable(a)vger.kernel.org # v5.15+
Fixes: c8a6153b6c59 ("vduse: Introduce VDUSE - vDPA Device in Userspace")
Acked-by: Jason Wang <jasowang(a)redhat.com>
Signed-off-by: Maxime Coquelin <maxime.coquelin(a)redhat.com>
---
drivers/vdpa/vdpa_user/vduse_dev.c | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
index 41c0b29739f1..35dceee3ed56 100644
--- a/drivers/vdpa/vdpa_user/vduse_dev.c
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -673,10 +673,15 @@ static void vduse_vdpa_get_config(struct vdpa_device *vdpa, unsigned int offset,
{
struct vduse_dev *dev = vdpa_to_vduse(vdpa);
- if (offset > dev->config_size ||
- len > dev->config_size - offset)
+ /* Initialize the buffer in case of partial copy. */
+ memset(buf, 0, len);
+
+ if (offset > dev->config_size)
return;
+ if (len > dev->config_size - offset)
+ len = dev->config_size - offset;
+
memcpy(buf, dev->config + offset, len);
}
--
2.37.2
Disable LCR updates for pre-0x30 devices which use a different (unknown)
protocol for line control and where the current register write causes
the next received character to be lost.
Note that updating LCR using the INIT command has no effect on these
devices either.
Reported-by: Jonathan Woithe <jwoithe(a)just42.net>
Link: https://lore.kernel.org/r/Ys1iPTfiZRWj2gXs@marvin.atrad.com.au
Fixes: 4e46c410e050 ("USB: serial: ch341: reinitialize chip on reconfiguration")
Fixes: 55fa15b5987d ("USB: serial: ch341: fix baud rate and line-control handling")
Cc: stable(a)vger.kernel.org # 4.10
Signed-off-by: Johan Hovold <johan(a)kernel.org>
---
drivers/usb/serial/ch341.c | 10 +++++++++-
1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/drivers/usb/serial/ch341.c b/drivers/usb/serial/ch341.c
index 2798fca71261..2bcce172355b 100644
--- a/drivers/usb/serial/ch341.c
+++ b/drivers/usb/serial/ch341.c
@@ -97,7 +97,10 @@ struct ch341_private {
u8 mcr;
u8 msr;
u8 lcr;
+
unsigned long quirks;
+ u8 version;
+
unsigned long break_end;
};
@@ -265,6 +268,9 @@ static int ch341_set_baudrate_lcr(struct usb_device *dev,
* (stop bits, parity and word length). Version 0x30 and above use
* CH341_REG_LCR only and CH341_REG_LCR2 is always set to zero.
*/
+ if (priv->version < 0x30)
+ return 0;
+
r = ch341_control_out(dev, CH341_REQ_WRITE_REG,
CH341_REG_LCR2 << 8 | CH341_REG_LCR, lcr);
if (r)
@@ -308,7 +314,9 @@ static int ch341_configure(struct usb_device *dev, struct ch341_private *priv)
r = ch341_control_in(dev, CH341_REQ_READ_VERSION, 0, 0, buffer, size);
if (r)
return r;
- dev_dbg(&dev->dev, "Chip version: 0x%02x\n", buffer[0]);
+
+ priv->version = buffer[0];
+ dev_dbg(&dev->dev, "Chip version: 0x%02x\n", priv->version);
r = ch341_control_out(dev, CH341_REQ_SERIAL_INIT, 0, 0);
if (r < 0)
--
2.35.1
At least one older CH341 appears to have the RX timer enable bit
inverted so that setting it disables the RX timer and prevents the FIFO
from emptying until it is full.
Only set the RX timer enable bit for devices with version newer than
0x27 (even though this probably affects all pre-0x30 devices).
Reported-by: Jonathan Woithe <jwoithe(a)just42.net>
Link: https://lore.kernel.org/r/Ys1iPTfiZRWj2gXs@marvin.atrad.com.au
Fixes: 4e46c410e050 ("USB: serial: ch341: reinitialize chip on reconfiguration")
Cc: stable(a)vger.kernel.org # 4.10
Signed-off-by: Johan Hovold <johan(a)kernel.org>
---
drivers/usb/serial/ch341.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/drivers/usb/serial/ch341.c b/drivers/usb/serial/ch341.c
index 2bcce172355b..af01a462cc43 100644
--- a/drivers/usb/serial/ch341.c
+++ b/drivers/usb/serial/ch341.c
@@ -253,8 +253,12 @@ static int ch341_set_baudrate_lcr(struct usb_device *dev,
/*
* CH341A buffers data until a full endpoint-size packet (32 bytes)
* has been received unless bit 7 is set.
+ *
+ * At least one device with version 0x27 appears to have this bit
+ * inverted.
*/
- val |= BIT(7);
+ if (priv->version > 0x27)
+ val |= BIT(7);
r = ch341_control_out(dev, CH341_REQ_WRITE_REG,
CH341_REG_DIVISOR << 8 | CH341_REG_PRESCALER,
--
2.35.1
From: Paolo Valerio <pvalerio(a)redhat.com>
Given a sufficiently large number of actions, while copying and reserving
memory for a new action of a new flow, if next_offset is greater than
MAX_ACTIONS_BUFSIZE, the function reserve_sfa_size() does not return
-EMSGSIZE as expected; instead it allocates MAX_ACTIONS_BUFSIZE bytes
while still increasing actions_len by req_size. This can then lead to an
out-of-bounds write, especially when further actions need to be copied.
Fix it by rearranging the flow action size check.
KASAN splat below:
==================================================================
BUG: KASAN: slab-out-of-bounds in reserve_sfa_size+0x1ba/0x380 [openvswitch]
Write of size 65360 at addr ffff888147e4001c by task handler15/836
CPU: 1 PID: 836 Comm: handler15 Not tainted 5.18.0-rc1+ #27
...
Call Trace:
<TASK>
dump_stack_lvl+0x45/0x5a
print_report.cold+0x5e/0x5db
? __lock_text_start+0x8/0x8
? reserve_sfa_size+0x1ba/0x380 [openvswitch]
kasan_report+0xb5/0x130
? reserve_sfa_size+0x1ba/0x380 [openvswitch]
kasan_check_range+0xf5/0x1d0
memcpy+0x39/0x60
reserve_sfa_size+0x1ba/0x380 [openvswitch]
__add_action+0x24/0x120 [openvswitch]
ovs_nla_add_action+0xe/0x20 [openvswitch]
ovs_ct_copy_action+0x29d/0x1130 [openvswitch]
? __kernel_text_address+0xe/0x30
? unwind_get_return_address+0x56/0xa0
? create_prof_cpu_mask+0x20/0x20
? ovs_ct_verify+0xf0/0xf0 [openvswitch]
? prep_compound_page+0x198/0x2a0
? __kasan_check_byte+0x10/0x40
? kasan_unpoison+0x40/0x70
? ksize+0x44/0x60
? reserve_sfa_size+0x75/0x380 [openvswitch]
__ovs_nla_copy_actions+0xc26/0x2070 [openvswitch]
? __zone_watermark_ok+0x420/0x420
? validate_set.constprop.0+0xc90/0xc90 [openvswitch]
? __alloc_pages+0x1a9/0x3e0
? __alloc_pages_slowpath.constprop.0+0x1da0/0x1da0
? unwind_next_frame+0x991/0x1e40
? __mod_node_page_state+0x99/0x120
? __mod_lruvec_page_state+0x2e3/0x470
? __kasan_kmalloc_large+0x90/0xe0
ovs_nla_copy_actions+0x1b4/0x2c0 [openvswitch]
ovs_flow_cmd_new+0x3cd/0xb10 [openvswitch]
...
Cc: stable(a)vger.kernel.org
Fixes: f28cd2af22a0 ("openvswitch: fix flow actions reallocation")
Signed-off-by: Paolo Valerio <pvalerio(a)redhat.com>
Acked-by: Eelco Chaudron <echaudro(a)redhat.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
---
net/openvswitch/flow_netlink.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 7176156d3844..4c09cf8a0ab2 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -2465,7 +2465,7 @@ static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa,
new_acts_size = max(next_offset + req_size, ksize(*sfa) * 2);
if (new_acts_size > MAX_ACTIONS_BUFSIZE) {
- if ((MAX_ACTIONS_BUFSIZE - next_offset) < req_size) {
+ if ((next_offset + req_size) > MAX_ACTIONS_BUFSIZE) {
OVS_NLERR(log, "Flow action size exceeds max %u",
MAX_ACTIONS_BUFSIZE);
return ERR_PTR(-EMSGSIZE);
--
2.25.1
Syzbot reported a couple of issues introduced by commit 44e602b4e52f
("binder_alloc: add missing mmap_lock calls when using the VMA"), in
which we attempt to acquire the mmap_lock when alloc->vma_vm_mm has not
been initialized yet.
This can happen if a binder_proc receives a transaction without having
previously called mmap() to set up the binder_proc->alloc space in [1].
A similar issue also occurs via binder_alloc_print_pages() when we try
to dump the debugfs binder stats file in [2].
Sample of syzbot's crash report:
==================================================================
KASAN: null-ptr-deref in range [0x0000000000000128-0x000000000000012f]
CPU: 0 PID: 3755 Comm: syz-executor229 Not tainted 6.0.0-rc1-next-20220819-syzkaller #0
syz-executor229[3755] cmdline: ./syz-executor2294415195
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 07/22/2022
RIP: 0010:__lock_acquire+0xd83/0x56d0 kernel/locking/lockdep.c:4923
[...]
Call Trace:
<TASK>
lock_acquire kernel/locking/lockdep.c:5666 [inline]
lock_acquire+0x1ab/0x570 kernel/locking/lockdep.c:5631
down_read+0x98/0x450 kernel/locking/rwsem.c:1499
mmap_read_lock include/linux/mmap_lock.h:117 [inline]
binder_alloc_new_buf_locked drivers/android/binder_alloc.c:405 [inline]
binder_alloc_new_buf+0xa5/0x19e0 drivers/android/binder_alloc.c:593
binder_transaction+0x242e/0x9a80 drivers/android/binder.c:3199
binder_thread_write+0x664/0x3220 drivers/android/binder.c:3986
binder_ioctl_write_read drivers/android/binder.c:5036 [inline]
binder_ioctl+0x3470/0x6d00 drivers/android/binder.c:5323
vfs_ioctl fs/ioctl.c:51 [inline]
__do_sys_ioctl fs/ioctl.c:870 [inline]
__se_sys_ioctl fs/ioctl.c:856 [inline]
__x64_sys_ioctl+0x193/0x200 fs/ioctl.c:856
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x63/0xcd
[...]
==================================================================
Fix these issues by setting up the alloc->vma_vm_mm pointer during open()
and caching it directly from current->mm. This guarantees we hold a valid
reference for taking the mmap_lock in the scenarios described above.
[1] https://syzkaller.appspot.com/bug?extid=f7dc54e5be28950ac459
[2] https://syzkaller.appspot.com/bug?extid=a75ebe0452711c9e56d9
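For context, a schematic sketch of the mm pinning pattern this relies on
(illustration only; the example_* names are made up and this is not the
driver code itself):

	static struct mm_struct *pinned_mm;

	static void example_open(void)
	{
		pinned_mm = current->mm;
		mmgrab(pinned_mm);	/* pin the mm_struct itself */
	}

	static void example_use(void)
	{
		mmap_read_lock(pinned_mm);	/* mm is guaranteed to exist */
		mmap_read_unlock(pinned_mm);
	}

	static void example_teardown(void)
	{
		mmdrop(pinned_mm);	/* drop the pin */
	}

Note that mmgrab() keeps the mm_struct (and thus its mmap_lock) allocated
but, unlike mmget(), does not pin the address space itself.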
Fixes: 44e602b4e52f ("binder_alloc: add missing mmap_lock calls when using the VMA")
Reported-by: syzbot+f7dc54e5be28950ac459(a)syzkaller.appspotmail.com
Reported-by: syzbot+a75ebe0452711c9e56d9(a)syzkaller.appspotmail.com
Cc: <stable(a)vger.kernel.org> # v5.15+
Cc: Liam R. Howlett <Liam.Howlett(a)oracle.com>
Signed-off-by: Carlos Llamas <cmllamas(a)google.com>
---
drivers/android/binder_alloc.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c
index 51f4e1c5cd01..9b1778c00610 100644
--- a/drivers/android/binder_alloc.c
+++ b/drivers/android/binder_alloc.c
@@ -322,7 +322,6 @@ static inline void binder_alloc_set_vma(struct binder_alloc *alloc,
*/
if (vma) {
vm_start = vma->vm_start;
- alloc->vma_vm_mm = vma->vm_mm;
mmap_assert_write_locked(alloc->vma_vm_mm);
} else {
mmap_assert_locked(alloc->vma_vm_mm);
@@ -795,7 +794,6 @@ int binder_alloc_mmap_handler(struct binder_alloc *alloc,
binder_insert_free_buffer(alloc, buffer);
alloc->free_async_space = alloc->buffer_size / 2;
binder_alloc_set_vma(alloc, vma);
- mmgrab(alloc->vma_vm_mm);
return 0;
@@ -1091,6 +1089,8 @@ static struct shrinker binder_shrinker = {
void binder_alloc_init(struct binder_alloc *alloc)
{
alloc->pid = current->group_leader->pid;
+ alloc->vma_vm_mm = current->mm;
+ mmgrab(alloc->vma_vm_mm);
mutex_init(&alloc->mutex);
INIT_LIST_HEAD(&alloc->buffers);
}
--
2.37.2.672.g94769d06f0-goog
__setup() handlers should return 1 to obsolete_checksetup() in
init/main.c to indicate that the boot option has been handled.
A return of 0 causes the boot option/value to be listed as an Unknown
kernel parameter and added to init's (limited) argument or environment
strings. Also, error return codes don't mean anything to
obsolete_checksetup() -- only non-zero (usually 1) or zero.
So return 1 from nmi_debug_setup().
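For illustration, a minimal handler following this convention (the
example_param/example_setup names are made up for this sketch):

	static unsigned long example_param;

	static int __init example_setup(char *str)
	{
		unsigned long val;

		/* Even malformed input counts as "handled": return 1. */
		if (!kstrtoul(str, 10, &val))
			example_param = val;
		return 1;
	}
	__setup("example=", example_setup);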
Fixes: 1e1030dccb10 ("sh: nmi_debug support.")
Signed-off-by: Randy Dunlap <rdunlap(a)infradead.org>
Cc: Yoshinori Sato <ysato(a)users.sourceforge.jp>
Cc: Rich Felker <dalias(a)libc.org>
Cc: linux-sh(a)vger.kernel.org
Cc: Andrew Morton <akpm(a)linux-foundation.org>
Cc: stable(a)vger.kernel.org
---
v2: add more Cc's;
refresh and resend;
arch/sh/kernel/nmi_debug.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
--- a/arch/sh/kernel/nmi_debug.c
+++ b/arch/sh/kernel/nmi_debug.c
@@ -49,7 +49,7 @@ static int __init nmi_debug_setup(char *
register_die_notifier(&nmi_debug_nb);
if (*str != '=')
- return 0;
+ return 1;
for (p = str + 1; *p; p = sep + 1) {
sep = strchr(p, ',');
@@ -70,6 +70,6 @@ static int __init nmi_debug_setup(char *
break;
}
- return 0;
+ return 1;
}
__setup("nmi_debug", nmi_debug_setup);
__setup() handlers should return 1 to obsolete_checksetup() in
init/main.c to indicate that the boot option has been handled.
A return of 0 causes the boot option/value to be listed as an Unknown
kernel parameter and added to init's (limited) argument or environment
strings. Also, error return codes don't mean anything to
obsolete_checksetup() -- only non-zero (usually 1) or zero.
So return 1 from vdso_setup().
Fixes: 9a08862a5d2e ("vDSO for sparc")
Signed-off-by: Randy Dunlap <rdunlap(a)infradead.org>
Reported-by: Igor Zhbanov <izh1979(a)gmail.com>
Link: lore.kernel.org/r/64644a2f-4a20-bab3-1e15-3b2cdd0defe3@omprussia.ru
Cc: "David S. Miller" <davem(a)davemloft.net>
Cc: sparclinux(a)vger.kernel.org
Cc: Dan Carpenter <dan.carpenter(a)oracle.com>
Cc: Nick Alcock <nick.alcock(a)oracle.com>
Cc: Sam Ravnborg <sam(a)ravnborg.org>
Cc: Andrew Morton <akpm(a)linux-foundation.org>
Cc: stable(a)vger.kernel.org
---
v2: correct the Fixes: tag (Dan Carpenter)
v3: add more Cc's;
correct Igor's email address;
change From: Igor to Reported-by: Igor;
arch/sparc/vdso/vma.c | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
--- a/arch/sparc/vdso/vma.c
+++ b/arch/sparc/vdso/vma.c
@@ -449,9 +449,8 @@ static __init int vdso_setup(char *s)
unsigned long val;
err = kstrtoul(s, 10, &val);
- if (err)
- return err;
- vdso_enabled = val;
- return 0;
+ if (!err)
+ vdso_enabled = val;
+ return 1;
}
__setup("vdso=", vdso_setup);
__setup() handlers should return 1 to obsolete_checksetup() in
init/main.c to indicate that the boot option has been handled.
A return of 0 causes the boot option/value to be listed as an Unknown
kernel parameter and added to init's (limited) argument or environment
strings. Also, error return codes don't mean anything to
obsolete_checksetup() -- only non-zero (usually 1) or zero.
So return 1 from setup_nmi_watchdog().
Fixes: e5553a6d0442 ("sparc64: Implement NMI watchdog on capable cpus.")
Signed-off-by: Randy Dunlap <rdunlap(a)infradead.org>
Reported-by: Igor Zhbanov <i.zhbanov(a)omprussia.ru>
Link: lore.kernel.org/r/64644a2f-4a20-bab3-1e15-3b2cdd0defe3@omprussia.ru
Cc: "David S. Miller" <davem(a)davemloft.net>
Cc: sparclinux(a)vger.kernel.org
Cc: Sam Ravnborg <sam(a)ravnborg.org>
Cc: Andrew Morton <akpm(a)linux-foundation.org>
Cc: stable(a)vger.kernel.org
---
v2: change From" Igor to Reported-by:
add more Cc's
arch/sparc/kernel/nmi.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/arch/sparc/kernel/nmi.c
+++ b/arch/sparc/kernel/nmi.c
@@ -274,7 +274,7 @@ static int __init setup_nmi_watchdog(cha
if (!strncmp(str, "panic", 5))
panic_on_timeout = 1;
- return 0;
+ return 1;
}
__setup("nmi_watchdog=", setup_nmi_watchdog);
From: James Morse <james.morse(a)arm.com>
Cortex-A510 is affected by an erratum where in rare circumstances the
CPUs may not handle a race between a break-before-make sequence on one
CPU, and another CPU accessing the same page. This could allow a store
to a page that has been unmapped.
Work around this by adding the affected CPUs to the list that needs
TLB sequences to be done twice.
Signed-off-by: James Morse <james.morse(a)arm.com>
Link: https://lore.kernel.org/r/20220704155732.21216-1-james.morse@arm.com
Signed-off-by: Will Deacon <will(a)kernel.org>
---
Documentation/arm64/silicon-errata.rst | 2 ++
arch/arm64/Kconfig | 17 +++++++++++++++++
arch/arm64/kernel/cpu_errata.c | 8 +++++++-
3 files changed, 26 insertions(+), 1 deletion(-)
diff --git a/Documentation/arm64/silicon-errata.rst b/Documentation/arm64/silicon-errata.rst
index 0b4235b1f8c4..33b04db8408f 100644
--- a/Documentation/arm64/silicon-errata.rst
+++ b/Documentation/arm64/silicon-errata.rst
@@ -106,6 +106,8 @@ stable kernels.
+----------------+-----------------+-----------------+-----------------------------+
| ARM | Cortex-A510 | #2077057 | ARM64_ERRATUM_2077057 |
+----------------+-----------------+-----------------+-----------------------------+
+| ARM | Cortex-A510 | #2441009 | ARM64_ERRATUM_2441009 |
++----------------+-----------------+-----------------+-----------------------------+
| ARM | Cortex-A710 | #2119858 | ARM64_ERRATUM_2119858 |
+----------------+-----------------+-----------------+-----------------------------+
| ARM | Cortex-A710 | #2054223 | ARM64_ERRATUM_2054223 |
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index a5d1b561ed53..001eaba5a6b4 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -838,6 +838,23 @@ config ARM64_ERRATUM_2224489
If unsure, say Y.
+config ARM64_ERRATUM_2441009
+ bool "Cortex-A510: Completion of affected memory accesses might not be guaranteed by completion of a TLBI"
+ default y
+ select ARM64_WORKAROUND_REPEAT_TLBI
+ help
+ This option adds a workaround for ARM Cortex-A510 erratum #2441009.
+
+ Under very rare circumstances, affected Cortex-A510 CPUs
+ may not handle a race between a break-before-make sequence on one
+ CPU, and another CPU accessing the same page. This could allow a
+ store to a page that has been unmapped.
+
+ Work around this by adding the affected CPUs to the list that needs
+ TLB sequences to be done twice.
+
+ If unsure, say Y.
+
config ARM64_ERRATUM_2064142
bool "Cortex-A510: 2064142: workaround TRBE register writes while disabled"
depends on CORESIGHT_TRBE
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index 6b92989f4cc2..aa9609e6ca67 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -211,6 +211,12 @@ static const struct arm64_cpu_capabilities arm64_repeat_tlbi_list[] = {
/* Kryo4xx Gold (rcpe to rfpe) => (r0p0 to r3p0) */
ERRATA_MIDR_RANGE(MIDR_QCOM_KRYO_4XX_GOLD, 0xc, 0xe, 0xf, 0xe),
},
+#endif
+#ifdef CONFIG_ARM64_ERRATUM_2441009
+ {
+ /* Cortex-A510 r0p0 -> r1p1. Fixed in r1p2 */
+ ERRATA_MIDR_RANGE(MIDR_CORTEX_A510, 0, 0, 1, 1),
+ },
#endif
{},
};
@@ -488,7 +494,7 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
#endif
#ifdef CONFIG_ARM64_WORKAROUND_REPEAT_TLBI
{
- .desc = "Qualcomm erratum 1009, or ARM erratum 1286807",
+ .desc = "Qualcomm erratum 1009, or ARM erratum 1286807, 2441009",
.capability = ARM64_WORKAROUND_REPEAT_TLBI,
.type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM,
.matches = cpucap_multi_entry_cap_matches,
--
If the GuC CTs are full and we need to stall the request submission
while waiting for space, we save the stalled request and where the stall
occurred; when the CTs have space again we pick up the request submission
from where we left off.
If a full GT reset occurs, the state of all contexts is cleared and all
non-guilty requests are unsubmitted, therefore we need to restart the
stalled request submission from scratch. To make sure that we do so,
clear the saved request after a reset.
Fixes note: the patch that introduced the bug is in 5.15, but no
officially supported platform had GuC submission enabled by default
in that kernel, so the backport to that particular version (and only
that one) can potentially be skipped.
Fixes: 925dc1cf58ed ("drm/i915/guc: Implement GuC submission tasklet")
Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio(a)intel.com>
Cc: Matthew Brost <matthew.brost(a)intel.com>
Cc: John Harrison <john.c.harrison(a)intel.com>
Cc: <stable(a)vger.kernel.org> # v5.15+
---
drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index 0d17da77e787..0d56b615bf78 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -4002,6 +4002,13 @@ static inline void guc_init_lrc_mapping(struct intel_guc *guc)
/* make sure all descriptors are clean... */
xa_destroy(&guc->context_lookup);
+ /*
+ * A reset might have occurred while we had a pending stalled request,
+ * so make sure we clean that up.
+ */
+ guc->stalled_request = NULL;
+ guc->submission_stall_reason = STALL_NONE;
+
/*
* Some contexts might have been pinned before we enabled GuC
* submission, so we need to add them to the GuC bookeeping.
--
2.25.1
From: Ben Hutchings <benh(a)debian.org>
The mitigation for PBRSB includes adding LFENCE instructions to the
RSB filling sequence. However, RSB filling is done on some older CPUs
that don't support the LFENCE instruction.
Define and use a BARRIER_NOSPEC macro which makes the LFENCE
conditional on X86_FEATURE_LFENCE_RDTSC, like the barrier_nospec()
macro defined for C code in <asm/barrier.h>.
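Schematically (a simplified description of the alternatives mechanism;
the macro itself appears in the diff below):

	/*
	 * ALTERNATIVE(oldinstr, newinstr, feature) emits oldinstr at build
	 * time; at boot, newinstr is patched in on CPUs that have `feature`.
	 * With an empty oldinstr, CPUs lacking LFENCE simply execute nothing
	 * here instead of hitting an unsupported instruction.
	 */
	#define BARRIER_NOSPEC ALTERNATIVE "", "lfence", X86_FEATURE_LFENCE_RDTSC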
Reported-by: Martin-Éric Racine <martin-eric.racine(a)iki.fi>
References: https://bugs.debian.org/1017425
Cc: stable(a)vger.kernel.org
Cc: regressions(a)lists.linux.dev
Cc: Daniel Sneddon <daniel.sneddon(a)linux.intel.com>
Cc: Pawan Gupta <pawan.kumar.gupta(a)linux.intel.com>
Fixes: 2b1299322016 ("x86/speculation: Add RSB VM Exit protections")
Fixes: ba6e31af2be9 ("x86/speculation: Add LFENCE to RSB fill sequence")
Signed-off-by: Ben Hutchings <benh(a)debian.org>
---
Re-sending this with properly matched From address and server.
Apologies if you got 2 copies.
Ben.
arch/x86/include/asm/nospec-branch.h | 11 +++++++----
1 file changed, 7 insertions(+), 4 deletions(-)
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index e64fd20778b6..b1029fd88474 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -34,6 +34,11 @@
#define RSB_CLEAR_LOOPS 32 /* To forcibly overwrite all entries */
+#ifdef __ASSEMBLY__
+
+/* Prevent speculative execution past this barrier. */
+#define BARRIER_NOSPEC ALTERNATIVE "", "lfence", X86_FEATURE_LFENCE_RDTSC
+
/*
* Google experimented with loop-unrolling and this turned out to be
* the optimal version - two calls, each with their own speculation
@@ -62,9 +67,7 @@
dec reg; \
jnz 771b; \
/* barrier for jnz misprediction */ \
- lfence;
-
-#ifdef __ASSEMBLY__
+ BARRIER_NOSPEC;
/*
* This should be used immediately before an indirect jump/call. It tells
@@ -138,7 +141,7 @@
int3
.Lunbalanced_ret_guard_\@:
add $(BITS_PER_LONG/8), %_ASM_SP
- lfence
+ BARRIER_NOSPEC
.endm
/*
This is a note to let you know that I've just added the patch titled
usb: cdns3: fix incorrect handling TRB_SMM flag for ISOC transfer
to my usb git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git
in the usb-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
From d5dcc33677d7415c5f23b3c052f9e80cbab9ea4e Mon Sep 17 00:00:00 2001
From: Pawel Laszczak <pawell(a)cadence.com>
Date: Thu, 25 Aug 2022 08:22:07 +0200
Subject: usb: cdns3: fix incorrect handling TRB_SMM flag for ISOC transfer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The TRB_SMM flag indicates that the DMA has completed the TD service with
this TRB; usually it is the last TRB in the TD. In the case of an ISOC
transfer with bInterval > 1, each ISOC transfer contains more than one TD
associated with the usb request (one TD per ITP). In such a case the
TRB_SMM flag will be set in every TD, and the driver will recognize the
end of the transfer after processing the first TD with TRB_SMM. As a
result, the driver stops updating request->actual and returns an
incorrect actual length.
To fix this issue, the driver must additionally check TRB_CHAIN, which is
not used for isochronous transfers.
Fixes: 249f0a25e8be ("usb: cdns3: gadget: handle sg list use case at completion correctly")
cc: <stable(a)vger.kernel.org>
Acked-by: Peter Chen <peter.chen(a)kernel.org>
Signed-off-by: Pawel Laszczak <pawell(a)cadence.com>
Link: https://lore.kernel.org/r/20220825062207.5824-1-pawell@cadence.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/usb/cdns3/cdns3-gadget.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/usb/cdns3/cdns3-gadget.c b/drivers/usb/cdns3/cdns3-gadget.c
index d21b69997e75..4f11c311acaf 100644
--- a/drivers/usb/cdns3/cdns3-gadget.c
+++ b/drivers/usb/cdns3/cdns3-gadget.c
@@ -1530,7 +1530,8 @@ static void cdns3_transfer_completed(struct cdns3_device *priv_dev,
TRB_LEN(le32_to_cpu(trb->length));
if (priv_req->num_of_trb > 1 &&
- le32_to_cpu(trb->control) & TRB_SMM)
+ le32_to_cpu(trb->control) & TRB_SMM &&
+ le32_to_cpu(trb->control) & TRB_CHAIN)
transfer_end = true;
cdns3_ep_inc_deq(priv_ep);
--
2.37.2
This is a note to let you know that I've just added the patch titled
usb: cdns3: fix issue with rearming ISO OUT endpoint
to my usb git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git
in the usb-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
From b46a6b09fa056042a302b181a1941f0056944603 Mon Sep 17 00:00:00 2001
From: Pawel Laszczak <pawell(a)cadence.com>
Date: Thu, 25 Aug 2022 08:21:37 +0200
Subject: usb: cdns3: fix issue with rearming ISO OUT endpoint
The ISO OUT endpoint is enabled when the first usb request is queued in
the transfer ring and disabled when TRBERR is reported by the controller.
After TRBERR, and before the next transfer is added to the transfer ring,
the driver must re-enable the endpoint, but it does not.
To solve this issue, the driver must set the EP_UPDATE_EP_TRBADDR flag in
priv_ep->flags while processing the TRBERR event.
Fixes: 7733f6c32e36 ("usb: cdns3: Add Cadence USB3 DRD Driver")
cc: <stable(a)vger.kernel.org>
Acked-by: Peter Chen <peter.chen(a)kernel.org>
Signed-off-by: Pawel Laszczak <pawell(a)cadence.com>
Link: https://lore.kernel.org/r/20220825062137.5766-1-pawell@cadence.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/usb/cdns3/cdns3-gadget.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/usb/cdns3/cdns3-gadget.c b/drivers/usb/cdns3/cdns3-gadget.c
index 4f11c311acaf..5adcb349718c 100644
--- a/drivers/usb/cdns3/cdns3-gadget.c
+++ b/drivers/usb/cdns3/cdns3-gadget.c
@@ -1691,6 +1691,7 @@ static int cdns3_check_ep_interrupt_proceed(struct cdns3_endpoint *priv_ep)
ep_cfg &= ~EP_CFG_ENABLE;
writel(ep_cfg, &priv_dev->regs->ep_cfg);
priv_ep->flags &= ~EP_QUIRK_ISO_OUT_EN;
+ priv_ep->flags |= EP_UPDATE_EP_TRBADDR;
}
cdns3_transfer_completed(priv_dev, priv_ep);
} else if (!(priv_ep->flags & EP_STALLED) &&
--
2.37.2
Dear Linux stable kernel maintainers,
I would like the patch below to be applied to the v5.15 stable kernel.
- Subject: arm64: errata: Add Cortex-A510 to the repeat tlbi list
- Upstream commit ID: 39fdb65f52e9a53d32a6ba719f96669fd300ae78
- Targeted LTS release: v5.15
This patch works around erratum #2441009. Since v5.15 is still in its
LTS lifecycle, I think it fits the rule that "new device IDs and quirks
are also accepted", so I would like to request that it be applied to
the v5.15 kernel.
Thanks!
This is an automatically generated email to let you know that the following patch was queued:
Subject: media: cedrus: Fix endless loop in cedrus_h265_skip_bits()
Author: Dmitry Osipenko <dmitry.osipenko(a)collabora.com>
Date: Thu Aug 18 22:33:08 2022 +0200
The busy status bit may never de-assert if the number of programmed skip
bits is incorrect, resulting in a kernel hang because the bit is polled
endlessly in the code. Fix it by adding a timeout to the bit-polling.
This problem is reproducible from userspace by setting the
data_bit_offset field of the HEVC slice params to a wrong value.
Cc: stable(a)vger.kernel.org
Fixes: 7678c5462680 ("media: cedrus: Fix decoding for some HEVC videos")
Reported-by: Nicolas Dufresne <nicolas.dufresne(a)collabora.com>
Signed-off-by: Dmitry Osipenko <dmitry.osipenko(a)collabora.com>
Signed-off-by: Nicolas Dufresne <nicolas.dufresne(a)collabora.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco(a)xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab(a)kernel.org>
drivers/staging/media/sunxi/cedrus/cedrus_h265.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
---
diff --git a/drivers/staging/media/sunxi/cedrus/cedrus_h265.c b/drivers/staging/media/sunxi/cedrus/cedrus_h265.c
index f703c585d91c..4952fc17f3e6 100644
--- a/drivers/staging/media/sunxi/cedrus/cedrus_h265.c
+++ b/drivers/staging/media/sunxi/cedrus/cedrus_h265.c
@@ -234,8 +234,9 @@ static void cedrus_h265_skip_bits(struct cedrus_dev *dev, int num)
cedrus_write(dev, VE_DEC_H265_TRIGGER,
VE_DEC_H265_TRIGGER_FLUSH_BITS |
VE_DEC_H265_TRIGGER_TYPE_N_BITS(tmp));
- while (cedrus_read(dev, VE_DEC_H265_STATUS) & VE_DEC_H265_STATUS_VLD_BUSY)
- udelay(1);
+
+ if (cedrus_wait_for(dev, VE_DEC_H265_STATUS, VE_DEC_H265_STATUS_VLD_BUSY))
+ dev_err_ratelimited(dev->dev, "timed out waiting to skip bits\n");
count += tmp;
}
This is an automatically generated email to let you know that the following patch was queued:
Subject: media: cedrus: Set the platform driver data earlier
Author: Dmitry Osipenko <dmitry.osipenko(a)collabora.com>
Date: Thu Aug 18 22:33:07 2022 +0200
cedrus_hw_resume() crashes with a NULL dereference on driver probe if
runtime PM is disabled, because it uses platform data that hasn't been
set up yet. Fix this by setting the platform data earlier during probe.
Cc: stable(a)vger.kernel.org
Fixes: 50e761516f2b ("media: platform: Add Cedrus VPU decoder driver")
Signed-off-by: Dmitry Osipenko <dmitry.osipenko(a)collabora.com>
Signed-off-by: Nicolas Dufresne <nicolas.dufresne(a)collabora.com>
Reviewed-by: Samuel Holland <samuel(a)sholland.org>
Acked-by: Paul Kocialkowski <paul.kocialkowski(a)bootlin.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco(a)xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab(a)kernel.org>
drivers/staging/media/sunxi/cedrus/cedrus.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
---
diff --git a/drivers/staging/media/sunxi/cedrus/cedrus.c b/drivers/staging/media/sunxi/cedrus/cedrus.c
index 960a0130cd62..55c54dfdc585 100644
--- a/drivers/staging/media/sunxi/cedrus/cedrus.c
+++ b/drivers/staging/media/sunxi/cedrus/cedrus.c
@@ -448,6 +448,8 @@ static int cedrus_probe(struct platform_device *pdev)
if (!dev)
return -ENOMEM;
+ platform_set_drvdata(pdev, dev);
+
dev->vfd = cedrus_video_device;
dev->dev = &pdev->dev;
dev->pdev = pdev;
@@ -521,8 +523,6 @@ static int cedrus_probe(struct platform_device *pdev)
goto err_m2m_mc;
}
- platform_set_drvdata(pdev, dev);
-
return 0;
err_m2m_mc:
This is an automatically generated email to let you know that the following patch was queued:
Subject: media: cedrus: Fix watchdog race condition
Author: Nicolas Dufresne <nicolas.dufresne(a)collabora.com>
Date: Thu Aug 18 22:33:06 2022 +0200
The watchdog needs to be scheduled before we trigger the decode
operation, otherwise there is a risk that the decoder IRQ will fire
before we have scheduled the watchdog. As a side effect, the watchdog
would never be cancelled and its function would be called at an
inappropriate time.
This was observed while running Fluster with GStreamer as a backend. A
programming error would cause the decoder IRQ to fire very quickly after
the trigger. Later calls into the driver would then deadlock due to the
unbalanced state.
Cc: stable(a)vger.kernel.org
Fixes: 7c38a551bda1 ("media: cedrus: Add watchdog for job completion")
Signed-off-by: Nicolas Dufresne <nicolas.dufresne(a)collabora.com>
Reviewed-by: Paul Kocialkowski <paul.kocialkowski(a)bootlin.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco(a)xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab(a)kernel.org>
drivers/staging/media/sunxi/cedrus/cedrus_dec.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
---
diff --git a/drivers/staging/media/sunxi/cedrus/cedrus_dec.c b/drivers/staging/media/sunxi/cedrus/cedrus_dec.c
index 3b6aa78a2985..e7f7602a5ab4 100644
--- a/drivers/staging/media/sunxi/cedrus/cedrus_dec.c
+++ b/drivers/staging/media/sunxi/cedrus/cedrus_dec.c
@@ -106,11 +106,11 @@ void cedrus_device_run(void *priv)
/* Trigger decoding if setup went well, bail out otherwise. */
if (!error) {
- dev->dec_ops[ctx->current_codec]->trigger(ctx);
-
/* Start the watchdog timer. */
schedule_delayed_work(&dev->watchdog_work,
msecs_to_jiffies(2000));
+
+ dev->dec_ops[ctx->current_codec]->trigger(ctx);
} else {
v4l2_m2m_buf_done_and_job_finish(ctx->dev->m2m_dev,
ctx->fh.m2m_ctx,
From: Ville Syrjälä <ville.syrjala(a)linux.intel.com>
Turns out the MIPI sequence block version number and
new block size fields are considered part of the block
header and are not included in the reported new block size
field itself. Bump up the block size appropriately so that
we'll copy over the last five bytes of the block as well.
For this particular machine those last five bytes included
parts of the GPIO op for the backlight on sequence, causing
the backlight no longer to turn back on:
Sequence 6 - MIPI_SEQ_BACKLIGHT_ON
Delay: 20000 us
- GPIO index 0, number 0, set 0 (0x00)
+ GPIO index 1, number 70, set 1 (0x01)
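For reference, the v3+ sequence block layout this implies is roughly
(an illustrative pseudo-struct, not the driver's actual types):

	struct mipi_seq_block_v3 {
		u8  version;		/* >= 3 */
		u32 new_block_size;	/* payload only, excludes these 5 bytes */
		u8  data[];		/* new_block_size bytes of sequences */
	} __packed;

get_blocksize() returns new_block_size, so copying only that many bytes
from the start of the block drops its final five bytes, hence the
block_size += 5 below.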
Cc: stable(a)vger.kernel.org
Fixes: e163cfb4c96d ("drm/i915/bios: Make copies of VBT data blocks")
Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/6652
Signed-off-by: Ville Syrjälä <ville.syrjala(a)linux.intel.com>
---
drivers/gpu/drm/i915/display/intel_bios.c | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/drivers/gpu/drm/i915/display/intel_bios.c b/drivers/gpu/drm/i915/display/intel_bios.c
index 81d6cfbd2615..d493d04f4049 100644
--- a/drivers/gpu/drm/i915/display/intel_bios.c
+++ b/drivers/gpu/drm/i915/display/intel_bios.c
@@ -479,6 +479,13 @@ init_bdb_block(struct drm_i915_private *i915,
block_size = get_blocksize(block);
+ /*
+ * Version number and new block size are considered
+ * part of the header for MIPI sequence block v3+.
+ */
+ if (section_id == BDB_MIPI_SEQUENCE && *(const u8 *)block >= 3)
+ block_size += 5;
+
entry = kzalloc(struct_size(entry, data, max(min_size, block_size) + 3),
GFP_KERNEL);
if (!entry) {
--
2.35.1
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From ab74ef708dc51df7cf2b8a890b9c6990fac5c0c6 Mon Sep 17 00:00:00 2001
From: Miaohe Lin <linmiaohe(a)huawei.com>
Date: Tue, 12 Jul 2022 21:05:42 +0800
Subject: [PATCH] mm/hugetlb: avoid corrupting page->mapping in
hugetlb_mcopy_atomic_pte
In MCOPY_ATOMIC_CONTINUE case with a non-shared VMA, pages in the page
cache are installed in the ptes. But hugepage_add_new_anon_rmap is called
for them mistakenly because they're not vm_shared. This will corrupt the
page->mapping used by page cache code.
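A sketch of how the corruption happens (simplified, based on the general
anon rmap scheme rather than code from this patch): page->mapping is an
overloaded field, so running the anon rmap setup on a page cache page
clobbers the address_space pointer that later page cache lookups depend
on:

	/* page cache page: mapping points at the file's address_space */
	page->mapping = file->f_mapping;

	/* what hugepage_add_new_anon_rmap() effectively does instead */
	page->mapping = (struct address_space *)
			((unsigned long)anon_vma | PAGE_MAPPING_ANON);

Hence the test must be whether the page is in the page cache, not
whether the VMA is shared.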
Link: https://lkml.kernel.org/r/20220712130542.18836-1-linmiaohe@huawei.com
Fixes: f619147104c8 ("userfaultfd: add UFFDIO_CONTINUE ioctl")
Signed-off-by: Miaohe Lin <linmiaohe(a)huawei.com>
Reviewed-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Axel Rasmussen <axelrasmussen(a)google.com>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 2480ba627aa5..e070b8593b37 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6041,7 +6041,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
if (!huge_pte_none_mostly(huge_ptep_get(dst_pte)))
goto out_release_unlock;
- if (vm_shared) {
+ if (page_in_pagecache) {
page_dup_file_rmap(page, true);
} else {
ClearHPageRestoreReserve(page);
From: Miaohe Lin <linmiaohe(a)huawei.com>
commit ab74ef708dc51df7cf2b8a890b9c6990fac5c0c6 upstream.
In MCOPY_ATOMIC_CONTINUE case with a non-shared VMA, pages in the page
cache are installed in the ptes. But hugepage_add_new_anon_rmap is called
for them mistakenly because they're not vm_shared. This will corrupt the
page->mapping used by page cache code.
Link: https://lkml.kernel.org/r/20220712130542.18836-1-linmiaohe@huawei.com
Fixes: f619147104c8 ("userfaultfd: add UFFDIO_CONTINUE ioctl")
Signed-off-by: Miaohe Lin <linmiaohe(a)huawei.com>
Reviewed-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Axel Rasmussen <axelrasmussen(a)google.com>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Axel Rasmussen <axelrasmussen(a)google.com>
---
mm/hugetlb.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 405793b8cf0d..d61b665c45d6 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5371,7 +5371,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
if (!huge_pte_none(huge_ptep_get(dst_pte)))
goto out_release_unlock;
- if (vm_shared) {
+ if (page_in_pagecache) {
page_dup_rmap(page, true);
} else {
ClearHPageRestoreReserve(page);
--
2.37.2.672.g94769d06f0-goog
commit b67fbebd4cf9 ("mmu_gather: Force tlb-flush VM_PFNMAP vmas")
fixes a TLB flushing bug that probably affects some x86 graphics
drivers, although hitting the bug might be fairly gnarly. Still, it'd
probably be a bad idea to leave it unfixed in the stable kernels that
things like Debian stable rely on.
Unfortunately the way the fix is written, it relies on refactoring
prep work in the three preceding commits, and trying to apply those to
older kernels will result in a bunch of merge conflicts.
Would it be acceptable here to fix the issue in a completely different
way in stable to minimize merge conflicts? Or should the refactoring
prep work and the fix commit all be backported?
A minimal but also completely different fix would be:
diff --git a/mm/mmap.c b/mm/mmap.c
index a50042918cc7..c453a1274305 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2665,6 +2665,18 @@ static void unmap_region(struct mm_struct *mm,
tlb_gather_mmu(&tlb, mm, start, end);
update_hiwater_rss(mm);
unmap_vmas(&tlb, vma, start, end);
+
+ /*
+ * Ensure we have no stale TLB entries by the time this mapping is
+ * removed from the rmap.
+ * Note that we don't have to worry about nested flushes here because
+ * we're holding the mm semaphore for removing the mapping - so any
+ * concurrent flush in this region has to be coming through the rmap,
+ * and we synchronize against that using the rmap lock.
+ */
+ if ((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) != 0)
+ tlb_flush_mmu(&tlb);
+
free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
next ? next->vm_start : USER_PGTABLES_CEILING);
tlb_finish_mmu(&tlb, start, end);
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 41ac42f137080bc230b5882e3c88c392ab7f2d32 Mon Sep 17 00:00:00 2001
From: Gerald Schaefer <gerald.schaefer(a)linux.ibm.com>
Date: Wed, 17 Aug 2022 15:26:03 +0200
Subject: [PATCH] s390/mm: do not trigger write fault when vma does not allow
VM_WRITE
For non-protection pXd_none() page faults in do_dat_exception(), we
call do_exception() with access == (VM_READ | VM_WRITE | VM_EXEC).
In do_exception(), vma->vm_flags is checked against that before
calling handle_mm_fault().
Since commit 92f842eac7ee3 ("[S390] store indication fault optimization"),
we call handle_mm_fault() with FAULT_FLAG_WRITE, when recognizing that
it was a write access. However, the vma flags check is still only
checking against (VM_READ | VM_WRITE | VM_EXEC), and therefore also
calling handle_mm_fault() with FAULT_FLAG_WRITE in cases where the vma
does not allow VM_WRITE.
Fix this by changing the access check in do_exception() to VM_WRITE
only when recognizing write access.
Link: https://lkml.kernel.org/r/20220811103435.188481-3-david@redhat.com
Fixes: 92f842eac7ee3 ("[S390] store indication fault optimization")
Cc: <stable(a)vger.kernel.org>
Reported-by: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Heiko Carstens <hca(a)linux.ibm.com>
Signed-off-by: Gerald Schaefer <gerald.schaefer(a)linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor(a)linux.ibm.com>
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 13449941516c..09b6e756d521 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -379,7 +379,9 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
flags = FAULT_FLAG_DEFAULT;
if (user_mode(regs))
flags |= FAULT_FLAG_USER;
- if (access == VM_WRITE || is_write)
+ if (is_write)
+ access = VM_WRITE;
+ if (access == VM_WRITE)
flags |= FAULT_FLAG_WRITE;
mmap_read_lock(mm);
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 41ac42f137080bc230b5882e3c88c392ab7f2d32 Mon Sep 17 00:00:00 2001
From: Gerald Schaefer <gerald.schaefer(a)linux.ibm.com>
Date: Wed, 17 Aug 2022 15:26:03 +0200
Subject: [PATCH] s390/mm: do not trigger write fault when vma does not allow
VM_WRITE
For non-protection pXd_none() page faults in do_dat_exception(), we
call do_exception() with access == (VM_READ | VM_WRITE | VM_EXEC).
In do_exception(), vma->vm_flags is checked against that before
calling handle_mm_fault().
Since commit 92f842eac7ee3 ("[S390] store indication fault optimization"),
we call handle_mm_fault() with FAULT_FLAG_WRITE, when recognizing that
it was a write access. However, the vma flags check is still only
checking against (VM_READ | VM_WRITE | VM_EXEC), and therefore also
calling handle_mm_fault() with FAULT_FLAG_WRITE in cases where the vma
does not allow VM_WRITE.
Fix this by changing the access check in do_exception() to VM_WRITE
only when recognizing write access.
Link: https://lkml.kernel.org/r/20220811103435.188481-3-david@redhat.com
Fixes: 92f842eac7ee3 ("[S390] store indication fault optimization")
Cc: <stable(a)vger.kernel.org>
Reported-by: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Heiko Carstens <hca(a)linux.ibm.com>
Signed-off-by: Gerald Schaefer <gerald.schaefer(a)linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor(a)linux.ibm.com>
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 13449941516c..09b6e756d521 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -379,7 +379,9 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
flags = FAULT_FLAG_DEFAULT;
if (user_mode(regs))
flags |= FAULT_FLAG_USER;
- if (access == VM_WRITE || is_write)
+ if (is_write)
+ access = VM_WRITE;
+ if (access == VM_WRITE)
flags |= FAULT_FLAG_WRITE;
mmap_read_lock(mm);
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 41ac42f137080bc230b5882e3c88c392ab7f2d32 Mon Sep 17 00:00:00 2001
From: Gerald Schaefer <gerald.schaefer(a)linux.ibm.com>
Date: Wed, 17 Aug 2022 15:26:03 +0200
Subject: [PATCH] s390/mm: do not trigger write fault when vma does not allow
VM_WRITE
For non-protection pXd_none() page faults in do_dat_exception(), we
call do_exception() with access == (VM_READ | VM_WRITE | VM_EXEC).
In do_exception(), vma->vm_flags is checked against that before
calling handle_mm_fault().
Since commit 92f842eac7ee3 ("[S390] store indication fault optimization"),
we call handle_mm_fault() with FAULT_FLAG_WRITE, when recognizing that
it was a write access. However, the vma flags check is still only
checking against (VM_READ | VM_WRITE | VM_EXEC), and therefore also
calling handle_mm_fault() with FAULT_FLAG_WRITE in cases where the vma
does not allow VM_WRITE.
Fix this by changing the access check in do_exception() to VM_WRITE
only when recognizing write access.
Link: https://lkml.kernel.org/r/20220811103435.188481-3-david@redhat.com
Fixes: 92f842eac7ee3 ("[S390] store indication fault optimization")
Cc: <stable(a)vger.kernel.org>
Reported-by: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Heiko Carstens <hca(a)linux.ibm.com>
Signed-off-by: Gerald Schaefer <gerald.schaefer(a)linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor(a)linux.ibm.com>
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 13449941516c..09b6e756d521 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -379,7 +379,9 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
flags = FAULT_FLAG_DEFAULT;
if (user_mode(regs))
flags |= FAULT_FLAG_USER;
- if (access == VM_WRITE || is_write)
+ if (is_write)
+ access = VM_WRITE;
+ if (access == VM_WRITE)
flags |= FAULT_FLAG_WRITE;
mmap_read_lock(mm);
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 41ac42f137080bc230b5882e3c88c392ab7f2d32 Mon Sep 17 00:00:00 2001
From: Gerald Schaefer <gerald.schaefer(a)linux.ibm.com>
Date: Wed, 17 Aug 2022 15:26:03 +0200
Subject: [PATCH] s390/mm: do not trigger write fault when vma does not allow
VM_WRITE
For non-protection pXd_none() page faults in do_dat_exception(), we
call do_exception() with access == (VM_READ | VM_WRITE | VM_EXEC).
In do_exception(), vma->vm_flags is checked against that before
calling handle_mm_fault().
Since commit 92f842eac7ee3 ("[S390] store indication fault optimization"),
we call handle_mm_fault() with FAULT_FLAG_WRITE, when recognizing that
it was a write access. However, the vma flags check is still only
checking against (VM_READ | VM_WRITE | VM_EXEC), and therefore also
calling handle_mm_fault() with FAULT_FLAG_WRITE in cases where the vma
does not allow VM_WRITE.
Fix this by changing the access check in do_exception() to VM_WRITE
only when recognizing write access.
Link: https://lkml.kernel.org/r/20220811103435.188481-3-david@redhat.com
Fixes: 92f842eac7ee3 ("[S390] store indication fault optimization")
Cc: <stable(a)vger.kernel.org>
Reported-by: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Heiko Carstens <hca(a)linux.ibm.com>
Signed-off-by: Gerald Schaefer <gerald.schaefer(a)linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor(a)linux.ibm.com>
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 13449941516c..09b6e756d521 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -379,7 +379,9 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
flags = FAULT_FLAG_DEFAULT;
if (user_mode(regs))
flags |= FAULT_FLAG_USER;
- if (access == VM_WRITE || is_write)
+ if (is_write)
+ access = VM_WRITE;
+ if (access == VM_WRITE)
flags |= FAULT_FLAG_WRITE;
mmap_read_lock(mm);
Fix Syzbot bug: kernel BUG in ntfs_lookup_inode_by_name
https://syzkaller.appspot.com/bug?id=32cf53b48c1846ffc25a185a2e92e170d1a95d…
Check whether $Extend is a directory or not (for NTFS 3.0+) while
loading system files. If it isn't (as in the case of this bug, where the
mft record for $Extend contains a regular file), load_system_files()
returns false.
Reported-by: syzbot+30b7f850c6d98ea461d2(a)syzkaller.appspotmail.com
CC: stable(a)vger.kernel.org # 4.9+
Signed-off-by: Soumya Negi <soumya.negi97(a)gmail.com>
---
Changes since v1:
* Added CC tag for stable
* Formatted changelog to fit within 72 cols
---
fs/ntfs/super.c | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 5ae8de09b271..18e2902531f9 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -2092,10 +2092,15 @@ static bool load_system_files(ntfs_volume *vol)
// TODO: Initialize security.
/* Get the extended system files' directory inode. */
vol->extend_ino = ntfs_iget(sb, FILE_Extend);
- if (IS_ERR(vol->extend_ino) || is_bad_inode(vol->extend_ino)) {
+ if (IS_ERR(vol->extend_ino) || is_bad_inode(vol->extend_ino) ||
+ !S_ISDIR(vol->extend_ino->i_mode)) {
+ static const char *es1 = "$Extend is not a directory";
+ static const char *es2 = "Failed to load $Extend";
+ const char *es = !S_ISDIR(vol->extend_ino->i_mode) ? es1 : es2;
+
if (!IS_ERR(vol->extend_ino))
iput(vol->extend_ino);
- ntfs_error(sb, "Failed to load $Extend.");
+ ntfs_error(sb, "%s.", es);
goto iput_sec_err_out;
}
#ifdef NTFS_RW
--
2.17.1
This is a backport of commit 34fc9cc3aebe to v5.15.
The "PolarFire SoC MSS Technical Reference Manual" documents the
following PLIC interrupts:
1 - L2 Cache Controller Signals when a metadata correction event occurs
2 - L2 Cache Controller Signals when an uncorrectable metadata event occurs
3 - L2 Cache Controller Signals when a data correction event occurs
4 - L2 Cache Controller Signals when an uncorrectable data event occurs
This differs from the SiFive FU540 which only has three L2 cache related
interrupts.
The sequence in the device tree is defined by an enum:
enum {
DIR_CORR = 0,
DATA_CORR,
DATA_UNCORR,
DIR_UNCORR,
};
So the correct sequence of the L2 cache interrupts is
interrupts = <1>, <3>, <4>, <2>;
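Spelled out, the enum-index-to-interrupt mapping this encodes (derived
from the TRM list and the enum above) is:

	DIR_CORR	(index 0) -> interrupt 1 (metadata correction)
	DATA_CORR	(index 1) -> interrupt 3 (data correction)
	DATA_UNCORR	(index 2) -> interrupt 4 (uncorrectable data)
	DIR_UNCORR	(index 3) -> interrupt 2 (uncorrectable metadata)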
This manifests as an unusable system if the l2-cache driver is enabled,
as the wrong interrupt gets cleared & the handler prints errors to the
console ad infinitum.
Fixes: 0fa6107eca41 ("RISC-V: Initial DTS for Microchip ICICLE board")
CC: stable(a)vger.kernel.org # 5.15: e35b07a7df9b: riscv: dts: microchip: mpfs: Group tuples in interrupt properties
Link: https://lore.kernel.org/all/20220817132521.3159388-1-heinrich.schuchardt@ca…
Signed-off-by: Heinrich Schuchardt <heinrich.schuchardt(a)canonical.com>
---
arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi b/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi
index 4ef4bcb74872..57989b2ac186 100644
--- a/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi
+++ b/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi
@@ -153,7 +153,7 @@ cache-controller@2010000 {
cache-size = <2097152>;
cache-unified;
interrupt-parent = <&plic>;
- interrupts = <1 2 3>;
+ interrupts = <1>, <3>, <4>, <2>;
reg = <0x0 0x2010000 0x0 0x1000>;
};
--
2.37.2
This bug is marked as fixed by commit:
net: core: netlink: add helper refcount dec and lock function
net: sched: add helper function to take reference to Qdisc
net: sched: extend Qdisc with rcu
net: sched: rename qdisc_destroy() to qdisc_put()
net: sched: use Qdisc rcu API instead of relying on rtnl lock
But I can't find it in any tested tree for more than 90 days.
Is it a correct commit? Please update it by replying:
#syz fix: exact-commit-title
Until then the bug is still considered open and
new crashes with the same signature are ignored.
Upon reception, a packet must be categorized: either its destination is
the host, or it is another host. A packet with no destination addressing
fields may be valid in two situations:
- the packet has no source field: only ACKs are built like that, so we
consider the host as the destination.
- the packet has a valid source field: it is directed to the PAN
coordinator; as for now we don't have this information, we consider we
are not the PAN coordinator.
There was likely a copy/paste error made during a previous cleanup,
because the if clause now contains exactly the same condition as the
switch case, which can never be true. In the past the destination
address was used in the switch and the source address was used in the
if, which matches what the spec says.
Cc: stable(a)vger.kernel.org
Fixes: ae531b9475f6 ("ieee802154: use ieee802154_addr instead of *_sa variants")
Signed-off-by: Miquel Raynal <miquel.raynal(a)bootlin.com>
---
net/mac802154/rx.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c
index b8ce84618a55..c439125ef2b9 100644
--- a/net/mac802154/rx.c
+++ b/net/mac802154/rx.c
@@ -44,7 +44,7 @@ ieee802154_subif_frame(struct ieee802154_sub_if_data *sdata,
switch (mac_cb(skb)->dest.mode) {
case IEEE802154_ADDR_NONE:
- if (mac_cb(skb)->dest.mode != IEEE802154_ADDR_NONE)
+ if (hdr->source.mode != IEEE802154_ADDR_NONE)
/* FIXME: check if we are PAN coordinator */
skb->pkt_type = PACKET_OTHERHOST;
else
--
2.34.1
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 15f7cfae912ea1739c8844b7edf3621ba981a37a Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean(a)nxp.com>
Date: Thu, 18 Aug 2022 19:48:09 +0300
Subject: [PATCH] net: dsa: microchip: make learning configurable and keep it
off while standalone
Address learning should initially be turned off by the driver for port
operation in standalone mode, then the DSA core handles changes to it
via ds->ops->port_bridge_flags().
Leaving address learning enabled while ports are standalone breaks any
kind of communication which involves port B receiving what port A has
sent. Notably it breaks the ksz9477 driver used with a (non offloaded,
ports act as if standalone) bonding interface in active-backup mode,
when the ports are connected together through external switches, for
redundancy purposes.
This fixes a major design flaw in the ksz9477 and ksz8795 drivers, which
unconditionally leave address learning enabled even while ports operate
as standalone.
Fixes: b987e98e50ab ("dsa: add DSA switch driver for Microchip KSZ9477")
Link: https://lore.kernel.org/netdev/CAFZh4h-JVWt80CrQWkFji7tZJahMfOToUJQgKS5s0_=…
Reported-by: Brian Hutchinson <b.hutchman(a)gmail.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean(a)nxp.com>
Link: https://lore.kernel.org/r/20220818164809.3198039-1-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c
index 7461272a6d41..6bd69a7e6809 100644
--- a/drivers/net/dsa/microchip/ksz_common.c
+++ b/drivers/net/dsa/microchip/ksz_common.c
@@ -968,6 +968,7 @@ static void ksz_update_port_member(struct ksz_device *dev, int port)
static int ksz_setup(struct dsa_switch *ds)
{
struct ksz_device *dev = ds->priv;
+ struct ksz_port *p;
const u16 *regs;
int ret;
@@ -1007,6 +1008,14 @@ static int ksz_setup(struct dsa_switch *ds)
return ret;
}
+ /* Start with learning disabled on standalone user ports, and enabled
+ * on the CPU port. In lack of other finer mechanisms, learning on the
+ * CPU port will avoid flooding bridge local addresses on the network
+ * in some cases.
+ */
+ p = &dev->ports[dev->cpu_port];
+ p->learning = true;
+
/* start switch */
regmap_update_bits(dev->regmap[0], regs[S_START_CTRL],
SW_START, SW_START);
@@ -1283,6 +1292,8 @@ void ksz_port_stp_state_set(struct dsa_switch *ds, int port, u8 state)
ksz_pread8(dev, port, regs[P_STP_CTRL], &data);
data &= ~(PORT_TX_ENABLE | PORT_RX_ENABLE | PORT_LEARN_DISABLE);
+ p = &dev->ports[port];
+
switch (state) {
case BR_STATE_DISABLED:
data |= PORT_LEARN_DISABLE;
@@ -1292,9 +1303,13 @@ void ksz_port_stp_state_set(struct dsa_switch *ds, int port, u8 state)
break;
case BR_STATE_LEARNING:
data |= PORT_RX_ENABLE;
+ if (!p->learning)
+ data |= PORT_LEARN_DISABLE;
break;
case BR_STATE_FORWARDING:
data |= (PORT_TX_ENABLE | PORT_RX_ENABLE);
+ if (!p->learning)
+ data |= PORT_LEARN_DISABLE;
break;
case BR_STATE_BLOCKING:
data |= PORT_LEARN_DISABLE;
@@ -1306,12 +1321,38 @@ void ksz_port_stp_state_set(struct dsa_switch *ds, int port, u8 state)
ksz_pwrite8(dev, port, regs[P_STP_CTRL], data);
- p = &dev->ports[port];
p->stp_state = state;
ksz_update_port_member(dev, port);
}
+static int ksz_port_pre_bridge_flags(struct dsa_switch *ds, int port,
+ struct switchdev_brport_flags flags,
+ struct netlink_ext_ack *extack)
+{
+ if (flags.mask & ~BR_LEARNING)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int ksz_port_bridge_flags(struct dsa_switch *ds, int port,
+ struct switchdev_brport_flags flags,
+ struct netlink_ext_ack *extack)
+{
+ struct ksz_device *dev = ds->priv;
+ struct ksz_port *p = &dev->ports[port];
+
+ if (flags.mask & BR_LEARNING) {
+ p->learning = !!(flags.val & BR_LEARNING);
+
+ /* Make the change take effect immediately */
+ ksz_port_stp_state_set(ds, port, p->stp_state);
+ }
+
+ return 0;
+}
+
static enum dsa_tag_protocol ksz_get_tag_protocol(struct dsa_switch *ds,
int port,
enum dsa_tag_protocol mp)
@@ -1725,6 +1766,8 @@ static const struct dsa_switch_ops ksz_switch_ops = {
.port_bridge_join = ksz_port_bridge_join,
.port_bridge_leave = ksz_port_bridge_leave,
.port_stp_state_set = ksz_port_stp_state_set,
+ .port_pre_bridge_flags = ksz_port_pre_bridge_flags,
+ .port_bridge_flags = ksz_port_bridge_flags,
.port_fast_age = ksz_port_fast_age,
.port_vlan_filtering = ksz_port_vlan_filtering,
.port_vlan_add = ksz_port_vlan_add,
diff --git a/drivers/net/dsa/microchip/ksz_common.h b/drivers/net/dsa/microchip/ksz_common.h
index 764ada3a0f42..0d9520dc6d2d 100644
--- a/drivers/net/dsa/microchip/ksz_common.h
+++ b/drivers/net/dsa/microchip/ksz_common.h
@@ -65,6 +65,7 @@ struct ksz_chip_data {
struct ksz_port {
bool remove_tag; /* Remove Tag flag set, for ksz8795 only */
+ bool learning;
int stp_state;
struct phy_device phydev;
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 15f7cfae912ea1739c8844b7edf3621ba981a37a Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean(a)nxp.com>
Date: Thu, 18 Aug 2022 19:48:09 +0300
Subject: [PATCH] net: dsa: microchip: make learning configurable and keep it
off while standalone
Address learning should initially be turned off by the driver for port
operation in standalone mode, then the DSA core handles changes to it
via ds->ops->port_bridge_flags().
Leaving address learning enabled while ports are standalone breaks any
kind of communication which involves port B receiving what port A has
sent. Notably it breaks the ksz9477 driver used with a (non offloaded,
ports act as if standalone) bonding interface in active-backup mode,
when the ports are connected together through external switches, for
redundancy purposes.
This fixes a major design flaw in the ksz9477 and ksz8795 drivers, which
unconditionally leave address learning enabled even while ports operate
as standalone.
Fixes: b987e98e50ab ("dsa: add DSA switch driver for Microchip KSZ9477")
Link: https://lore.kernel.org/netdev/CAFZh4h-JVWt80CrQWkFji7tZJahMfOToUJQgKS5s0_=…
Reported-by: Brian Hutchinson <b.hutchman(a)gmail.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean(a)nxp.com>
Link: https://lore.kernel.org/r/20220818164809.3198039-1-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c
index 7461272a6d41..6bd69a7e6809 100644
--- a/drivers/net/dsa/microchip/ksz_common.c
+++ b/drivers/net/dsa/microchip/ksz_common.c
@@ -968,6 +968,7 @@ static void ksz_update_port_member(struct ksz_device *dev, int port)
static int ksz_setup(struct dsa_switch *ds)
{
struct ksz_device *dev = ds->priv;
+ struct ksz_port *p;
const u16 *regs;
int ret;
@@ -1007,6 +1008,14 @@ static int ksz_setup(struct dsa_switch *ds)
return ret;
}
+ /* Start with learning disabled on standalone user ports, and enabled
+ * on the CPU port. In lack of other finer mechanisms, learning on the
+ * CPU port will avoid flooding bridge local addresses on the network
+ * in some cases.
+ */
+ p = &dev->ports[dev->cpu_port];
+ p->learning = true;
+
/* start switch */
regmap_update_bits(dev->regmap[0], regs[S_START_CTRL],
SW_START, SW_START);
@@ -1283,6 +1292,8 @@ void ksz_port_stp_state_set(struct dsa_switch *ds, int port, u8 state)
ksz_pread8(dev, port, regs[P_STP_CTRL], &data);
data &= ~(PORT_TX_ENABLE | PORT_RX_ENABLE | PORT_LEARN_DISABLE);
+ p = &dev->ports[port];
+
switch (state) {
case BR_STATE_DISABLED:
data |= PORT_LEARN_DISABLE;
@@ -1292,9 +1303,13 @@ void ksz_port_stp_state_set(struct dsa_switch *ds, int port, u8 state)
break;
case BR_STATE_LEARNING:
data |= PORT_RX_ENABLE;
+ if (!p->learning)
+ data |= PORT_LEARN_DISABLE;
break;
case BR_STATE_FORWARDING:
data |= (PORT_TX_ENABLE | PORT_RX_ENABLE);
+ if (!p->learning)
+ data |= PORT_LEARN_DISABLE;
break;
case BR_STATE_BLOCKING:
data |= PORT_LEARN_DISABLE;
@@ -1306,12 +1321,38 @@ void ksz_port_stp_state_set(struct dsa_switch *ds, int port, u8 state)
ksz_pwrite8(dev, port, regs[P_STP_CTRL], data);
- p = &dev->ports[port];
p->stp_state = state;
ksz_update_port_member(dev, port);
}
+static int ksz_port_pre_bridge_flags(struct dsa_switch *ds, int port,
+ struct switchdev_brport_flags flags,
+ struct netlink_ext_ack *extack)
+{
+ if (flags.mask & ~BR_LEARNING)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int ksz_port_bridge_flags(struct dsa_switch *ds, int port,
+ struct switchdev_brport_flags flags,
+ struct netlink_ext_ack *extack)
+{
+ struct ksz_device *dev = ds->priv;
+ struct ksz_port *p = &dev->ports[port];
+
+ if (flags.mask & BR_LEARNING) {
+ p->learning = !!(flags.val & BR_LEARNING);
+
+ /* Make the change take effect immediately */
+ ksz_port_stp_state_set(ds, port, p->stp_state);
+ }
+
+ return 0;
+}
+
static enum dsa_tag_protocol ksz_get_tag_protocol(struct dsa_switch *ds,
int port,
enum dsa_tag_protocol mp)
@@ -1725,6 +1766,8 @@ static const struct dsa_switch_ops ksz_switch_ops = {
.port_bridge_join = ksz_port_bridge_join,
.port_bridge_leave = ksz_port_bridge_leave,
.port_stp_state_set = ksz_port_stp_state_set,
+ .port_pre_bridge_flags = ksz_port_pre_bridge_flags,
+ .port_bridge_flags = ksz_port_bridge_flags,
.port_fast_age = ksz_port_fast_age,
.port_vlan_filtering = ksz_port_vlan_filtering,
.port_vlan_add = ksz_port_vlan_add,
diff --git a/drivers/net/dsa/microchip/ksz_common.h b/drivers/net/dsa/microchip/ksz_common.h
index 764ada3a0f42..0d9520dc6d2d 100644
--- a/drivers/net/dsa/microchip/ksz_common.h
+++ b/drivers/net/dsa/microchip/ksz_common.h
@@ -65,6 +65,7 @@ struct ksz_chip_data {
struct ksz_port {
bool remove_tag; /* Remove Tag flag set, for ksz8795 only */
+ bool learning;
int stp_state;
struct phy_device phydev;
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 15f7cfae912ea1739c8844b7edf3621ba981a37a Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean(a)nxp.com>
Date: Thu, 18 Aug 2022 19:48:09 +0300
Subject: [PATCH] net: dsa: microchip: make learning configurable and keep it
off while standalone
Address learning should initially be turned off by the driver for port
operation in standalone mode, then the DSA core handles changes to it
via ds->ops->port_bridge_flags().
Leaving address learning enabled while ports are standalone breaks any
kind of communication which involves port B receiving what port A has
sent. Notably it breaks the ksz9477 driver used with a (non offloaded,
ports act as if standalone) bonding interface in active-backup mode,
when the ports are connected together through external switches, for
redundancy purposes.
This fixes a major design flaw in the ksz9477 and ksz8795 drivers, which
unconditionally leave address learning enabled even while ports operate
as standalone.
Fixes: b987e98e50ab ("dsa: add DSA switch driver for Microchip KSZ9477")
Link: https://lore.kernel.org/netdev/CAFZh4h-JVWt80CrQWkFji7tZJahMfOToUJQgKS5s0_=…
Reported-by: Brian Hutchinson <b.hutchman(a)gmail.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean(a)nxp.com>
Link: https://lore.kernel.org/r/20220818164809.3198039-1-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c
index 7461272a6d41..6bd69a7e6809 100644
--- a/drivers/net/dsa/microchip/ksz_common.c
+++ b/drivers/net/dsa/microchip/ksz_common.c
@@ -968,6 +968,7 @@ static void ksz_update_port_member(struct ksz_device *dev, int port)
static int ksz_setup(struct dsa_switch *ds)
{
struct ksz_device *dev = ds->priv;
+ struct ksz_port *p;
const u16 *regs;
int ret;
@@ -1007,6 +1008,14 @@ static int ksz_setup(struct dsa_switch *ds)
return ret;
}
+ /* Start with learning disabled on standalone user ports, and enabled
+ * on the CPU port. In lack of other finer mechanisms, learning on the
+ * CPU port will avoid flooding bridge local addresses on the network
+ * in some cases.
+ */
+ p = &dev->ports[dev->cpu_port];
+ p->learning = true;
+
/* start switch */
regmap_update_bits(dev->regmap[0], regs[S_START_CTRL],
SW_START, SW_START);
@@ -1283,6 +1292,8 @@ void ksz_port_stp_state_set(struct dsa_switch *ds, int port, u8 state)
ksz_pread8(dev, port, regs[P_STP_CTRL], &data);
data &= ~(PORT_TX_ENABLE | PORT_RX_ENABLE | PORT_LEARN_DISABLE);
+ p = &dev->ports[port];
+
switch (state) {
case BR_STATE_DISABLED:
data |= PORT_LEARN_DISABLE;
@@ -1292,9 +1303,13 @@ void ksz_port_stp_state_set(struct dsa_switch *ds, int port, u8 state)
break;
case BR_STATE_LEARNING:
data |= PORT_RX_ENABLE;
+ if (!p->learning)
+ data |= PORT_LEARN_DISABLE;
break;
case BR_STATE_FORWARDING:
data |= (PORT_TX_ENABLE | PORT_RX_ENABLE);
+ if (!p->learning)
+ data |= PORT_LEARN_DISABLE;
break;
case BR_STATE_BLOCKING:
data |= PORT_LEARN_DISABLE;
@@ -1306,12 +1321,38 @@ void ksz_port_stp_state_set(struct dsa_switch *ds, int port, u8 state)
ksz_pwrite8(dev, port, regs[P_STP_CTRL], data);
- p = &dev->ports[port];
p->stp_state = state;
ksz_update_port_member(dev, port);
}
+static int ksz_port_pre_bridge_flags(struct dsa_switch *ds, int port,
+ struct switchdev_brport_flags flags,
+ struct netlink_ext_ack *extack)
+{
+ if (flags.mask & ~BR_LEARNING)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int ksz_port_bridge_flags(struct dsa_switch *ds, int port,
+ struct switchdev_brport_flags flags,
+ struct netlink_ext_ack *extack)
+{
+ struct ksz_device *dev = ds->priv;
+ struct ksz_port *p = &dev->ports[port];
+
+ if (flags.mask & BR_LEARNING) {
+ p->learning = !!(flags.val & BR_LEARNING);
+
+ /* Make the change take effect immediately */
+ ksz_port_stp_state_set(ds, port, p->stp_state);
+ }
+
+ return 0;
+}
+
static enum dsa_tag_protocol ksz_get_tag_protocol(struct dsa_switch *ds,
int port,
enum dsa_tag_protocol mp)
@@ -1725,6 +1766,8 @@ static const struct dsa_switch_ops ksz_switch_ops = {
.port_bridge_join = ksz_port_bridge_join,
.port_bridge_leave = ksz_port_bridge_leave,
.port_stp_state_set = ksz_port_stp_state_set,
+ .port_pre_bridge_flags = ksz_port_pre_bridge_flags,
+ .port_bridge_flags = ksz_port_bridge_flags,
.port_fast_age = ksz_port_fast_age,
.port_vlan_filtering = ksz_port_vlan_filtering,
.port_vlan_add = ksz_port_vlan_add,
diff --git a/drivers/net/dsa/microchip/ksz_common.h b/drivers/net/dsa/microchip/ksz_common.h
index 764ada3a0f42..0d9520dc6d2d 100644
--- a/drivers/net/dsa/microchip/ksz_common.h
+++ b/drivers/net/dsa/microchip/ksz_common.h
@@ -65,6 +65,7 @@ struct ksz_chip_data {
struct ksz_port {
bool remove_tag; /* Remove Tag flag set, for ksz8795 only */
+ bool learning;
int stp_state;
struct phy_device phydev;
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 15f7cfae912ea1739c8844b7edf3621ba981a37a Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean(a)nxp.com>
Date: Thu, 18 Aug 2022 19:48:09 +0300
Subject: [PATCH] net: dsa: microchip: make learning configurable and keep it
off while standalone
Address learning should initially be turned off by the driver for port
operation in standalone mode, then the DSA core handles changes to it
via ds->ops->port_bridge_flags().
Leaving address learning enabled while ports are standalone breaks any
kind of communication which involves port B receiving what port A has
sent. Notably it breaks the ksz9477 driver used with a (non offloaded,
ports act as if standalone) bonding interface in active-backup mode,
when the ports are connected together through external switches, for
redundancy purposes.
This fixes a major design flaw in the ksz9477 and ksz8795 drivers, which
unconditionally leave address learning enabled even while ports operate
as standalone.
Fixes: b987e98e50ab ("dsa: add DSA switch driver for Microchip KSZ9477")
Link: https://lore.kernel.org/netdev/CAFZh4h-JVWt80CrQWkFji7tZJahMfOToUJQgKS5s0_=…
Reported-by: Brian Hutchinson <b.hutchman(a)gmail.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean(a)nxp.com>
Link: https://lore.kernel.org/r/20220818164809.3198039-1-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c
index 7461272a6d41..6bd69a7e6809 100644
--- a/drivers/net/dsa/microchip/ksz_common.c
+++ b/drivers/net/dsa/microchip/ksz_common.c
@@ -968,6 +968,7 @@ static void ksz_update_port_member(struct ksz_device *dev, int port)
static int ksz_setup(struct dsa_switch *ds)
{
struct ksz_device *dev = ds->priv;
+ struct ksz_port *p;
const u16 *regs;
int ret;
@@ -1007,6 +1008,14 @@ static int ksz_setup(struct dsa_switch *ds)
return ret;
}
+ /* Start with learning disabled on standalone user ports, and enabled
+ * on the CPU port. In lack of other finer mechanisms, learning on the
+ * CPU port will avoid flooding bridge local addresses on the network
+ * in some cases.
+ */
+ p = &dev->ports[dev->cpu_port];
+ p->learning = true;
+
/* start switch */
regmap_update_bits(dev->regmap[0], regs[S_START_CTRL],
SW_START, SW_START);
@@ -1283,6 +1292,8 @@ void ksz_port_stp_state_set(struct dsa_switch *ds, int port, u8 state)
ksz_pread8(dev, port, regs[P_STP_CTRL], &data);
data &= ~(PORT_TX_ENABLE | PORT_RX_ENABLE | PORT_LEARN_DISABLE);
+ p = &dev->ports[port];
+
switch (state) {
case BR_STATE_DISABLED:
data |= PORT_LEARN_DISABLE;
@@ -1292,9 +1303,13 @@ void ksz_port_stp_state_set(struct dsa_switch *ds, int port, u8 state)
break;
case BR_STATE_LEARNING:
data |= PORT_RX_ENABLE;
+ if (!p->learning)
+ data |= PORT_LEARN_DISABLE;
break;
case BR_STATE_FORWARDING:
data |= (PORT_TX_ENABLE | PORT_RX_ENABLE);
+ if (!p->learning)
+ data |= PORT_LEARN_DISABLE;
break;
case BR_STATE_BLOCKING:
data |= PORT_LEARN_DISABLE;
@@ -1306,12 +1321,38 @@ void ksz_port_stp_state_set(struct dsa_switch *ds, int port, u8 state)
ksz_pwrite8(dev, port, regs[P_STP_CTRL], data);
- p = &dev->ports[port];
p->stp_state = state;
ksz_update_port_member(dev, port);
}
+static int ksz_port_pre_bridge_flags(struct dsa_switch *ds, int port,
+ struct switchdev_brport_flags flags,
+ struct netlink_ext_ack *extack)
+{
+ if (flags.mask & ~BR_LEARNING)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int ksz_port_bridge_flags(struct dsa_switch *ds, int port,
+ struct switchdev_brport_flags flags,
+ struct netlink_ext_ack *extack)
+{
+ struct ksz_device *dev = ds->priv;
+ struct ksz_port *p = &dev->ports[port];
+
+ if (flags.mask & BR_LEARNING) {
+ p->learning = !!(flags.val & BR_LEARNING);
+
+ /* Make the change take effect immediately */
+ ksz_port_stp_state_set(ds, port, p->stp_state);
+ }
+
+ return 0;
+}
+
static enum dsa_tag_protocol ksz_get_tag_protocol(struct dsa_switch *ds,
int port,
enum dsa_tag_protocol mp)
@@ -1725,6 +1766,8 @@ static const struct dsa_switch_ops ksz_switch_ops = {
.port_bridge_join = ksz_port_bridge_join,
.port_bridge_leave = ksz_port_bridge_leave,
.port_stp_state_set = ksz_port_stp_state_set,
+ .port_pre_bridge_flags = ksz_port_pre_bridge_flags,
+ .port_bridge_flags = ksz_port_bridge_flags,
.port_fast_age = ksz_port_fast_age,
.port_vlan_filtering = ksz_port_vlan_filtering,
.port_vlan_add = ksz_port_vlan_add,
diff --git a/drivers/net/dsa/microchip/ksz_common.h b/drivers/net/dsa/microchip/ksz_common.h
index 764ada3a0f42..0d9520dc6d2d 100644
--- a/drivers/net/dsa/microchip/ksz_common.h
+++ b/drivers/net/dsa/microchip/ksz_common.h
@@ -65,6 +65,7 @@ struct ksz_chip_data {
struct ksz_port {
bool remove_tag; /* Remove Tag flag set, for ksz8795 only */
+ bool learning;
int stp_state;
struct phy_device phydev;
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 15f7cfae912ea1739c8844b7edf3621ba981a37a Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean(a)nxp.com>
Date: Thu, 18 Aug 2022 19:48:09 +0300
Subject: [PATCH] net: dsa: microchip: make learning configurable and keep it
off while standalone
Address learning should initially be turned off by the driver for port
operation in standalone mode, then the DSA core handles changes to it
via ds->ops->port_bridge_flags().
Leaving address learning enabled while ports are standalone breaks any
kind of communication which involves port B receiving what port A has
sent. Notably it breaks the ksz9477 driver used with a (non offloaded,
ports act as if standalone) bonding interface in active-backup mode,
when the ports are connected together through external switches, for
redundancy purposes.
This fixes a major design flaw in the ksz9477 and ksz8795 drivers, which
unconditionally leave address learning enabled even while ports operate
as standalone.
Fixes: b987e98e50ab ("dsa: add DSA switch driver for Microchip KSZ9477")
Link: https://lore.kernel.org/netdev/CAFZh4h-JVWt80CrQWkFji7tZJahMfOToUJQgKS5s0_=…
Reported-by: Brian Hutchinson <b.hutchman(a)gmail.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean(a)nxp.com>
Link: https://lore.kernel.org/r/20220818164809.3198039-1-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c
index 7461272a6d41..6bd69a7e6809 100644
--- a/drivers/net/dsa/microchip/ksz_common.c
+++ b/drivers/net/dsa/microchip/ksz_common.c
@@ -968,6 +968,7 @@ static void ksz_update_port_member(struct ksz_device *dev, int port)
static int ksz_setup(struct dsa_switch *ds)
{
struct ksz_device *dev = ds->priv;
+ struct ksz_port *p;
const u16 *regs;
int ret;
@@ -1007,6 +1008,14 @@ static int ksz_setup(struct dsa_switch *ds)
return ret;
}
+ /* Start with learning disabled on standalone user ports, and enabled
+ * on the CPU port. In lack of other finer mechanisms, learning on the
+ * CPU port will avoid flooding bridge local addresses on the network
+ * in some cases.
+ */
+ p = &dev->ports[dev->cpu_port];
+ p->learning = true;
+
/* start switch */
regmap_update_bits(dev->regmap[0], regs[S_START_CTRL],
SW_START, SW_START);
@@ -1283,6 +1292,8 @@ void ksz_port_stp_state_set(struct dsa_switch *ds, int port, u8 state)
ksz_pread8(dev, port, regs[P_STP_CTRL], &data);
data &= ~(PORT_TX_ENABLE | PORT_RX_ENABLE | PORT_LEARN_DISABLE);
+ p = &dev->ports[port];
+
switch (state) {
case BR_STATE_DISABLED:
data |= PORT_LEARN_DISABLE;
@@ -1292,9 +1303,13 @@ void ksz_port_stp_state_set(struct dsa_switch *ds, int port, u8 state)
break;
case BR_STATE_LEARNING:
data |= PORT_RX_ENABLE;
+ if (!p->learning)
+ data |= PORT_LEARN_DISABLE;
break;
case BR_STATE_FORWARDING:
data |= (PORT_TX_ENABLE | PORT_RX_ENABLE);
+ if (!p->learning)
+ data |= PORT_LEARN_DISABLE;
break;
case BR_STATE_BLOCKING:
data |= PORT_LEARN_DISABLE;
@@ -1306,12 +1321,38 @@ void ksz_port_stp_state_set(struct dsa_switch *ds, int port, u8 state)
ksz_pwrite8(dev, port, regs[P_STP_CTRL], data);
- p = &dev->ports[port];
p->stp_state = state;
ksz_update_port_member(dev, port);
}
+static int ksz_port_pre_bridge_flags(struct dsa_switch *ds, int port,
+ struct switchdev_brport_flags flags,
+ struct netlink_ext_ack *extack)
+{
+ if (flags.mask & ~BR_LEARNING)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int ksz_port_bridge_flags(struct dsa_switch *ds, int port,
+ struct switchdev_brport_flags flags,
+ struct netlink_ext_ack *extack)
+{
+ struct ksz_device *dev = ds->priv;
+ struct ksz_port *p = &dev->ports[port];
+
+ if (flags.mask & BR_LEARNING) {
+ p->learning = !!(flags.val & BR_LEARNING);
+
+ /* Make the change take effect immediately */
+ ksz_port_stp_state_set(ds, port, p->stp_state);
+ }
+
+ return 0;
+}
+
static enum dsa_tag_protocol ksz_get_tag_protocol(struct dsa_switch *ds,
int port,
enum dsa_tag_protocol mp)
@@ -1725,6 +1766,8 @@ static const struct dsa_switch_ops ksz_switch_ops = {
.port_bridge_join = ksz_port_bridge_join,
.port_bridge_leave = ksz_port_bridge_leave,
.port_stp_state_set = ksz_port_stp_state_set,
+ .port_pre_bridge_flags = ksz_port_pre_bridge_flags,
+ .port_bridge_flags = ksz_port_bridge_flags,
.port_fast_age = ksz_port_fast_age,
.port_vlan_filtering = ksz_port_vlan_filtering,
.port_vlan_add = ksz_port_vlan_add,
diff --git a/drivers/net/dsa/microchip/ksz_common.h b/drivers/net/dsa/microchip/ksz_common.h
index 764ada3a0f42..0d9520dc6d2d 100644
--- a/drivers/net/dsa/microchip/ksz_common.h
+++ b/drivers/net/dsa/microchip/ksz_common.h
@@ -65,6 +65,7 @@ struct ksz_chip_data {
struct ksz_port {
bool remove_tag; /* Remove Tag flag set, for ksz8795 only */
+ bool learning;
int stp_state;
struct phy_device phydev;
The patch below does not apply to the 5.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 15f7cfae912ea1739c8844b7edf3621ba981a37a Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean(a)nxp.com>
Date: Thu, 18 Aug 2022 19:48:09 +0300
Subject: [PATCH] net: dsa: microchip: make learning configurable and keep it
off while standalone
Address learning should initially be turned off by the driver for port
operation in standalone mode, then the DSA core handles changes to it
via ds->ops->port_bridge_flags().
Leaving address learning enabled while ports are standalone breaks any
kind of communication which involves port B receiving what port A has
sent. Notably it breaks the ksz9477 driver used with a (non offloaded,
ports act as if standalone) bonding interface in active-backup mode,
when the ports are connected together through external switches, for
redundancy purposes.
This fixes a major design flaw in the ksz9477 and ksz8795 drivers, which
unconditionally leave address learning enabled even while ports operate
as standalone.
Fixes: b987e98e50ab ("dsa: add DSA switch driver for Microchip KSZ9477")
Link: https://lore.kernel.org/netdev/CAFZh4h-JVWt80CrQWkFji7tZJahMfOToUJQgKS5s0_=…
Reported-by: Brian Hutchinson <b.hutchman(a)gmail.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean(a)nxp.com>
Link: https://lore.kernel.org/r/20220818164809.3198039-1-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c
index 7461272a6d41..6bd69a7e6809 100644
--- a/drivers/net/dsa/microchip/ksz_common.c
+++ b/drivers/net/dsa/microchip/ksz_common.c
@@ -968,6 +968,7 @@ static void ksz_update_port_member(struct ksz_device *dev, int port)
static int ksz_setup(struct dsa_switch *ds)
{
struct ksz_device *dev = ds->priv;
+ struct ksz_port *p;
const u16 *regs;
int ret;
@@ -1007,6 +1008,14 @@ static int ksz_setup(struct dsa_switch *ds)
return ret;
}
+ /* Start with learning disabled on standalone user ports, and enabled
+ * on the CPU port. In lack of other finer mechanisms, learning on the
+ * CPU port will avoid flooding bridge local addresses on the network
+ * in some cases.
+ */
+ p = &dev->ports[dev->cpu_port];
+ p->learning = true;
+
/* start switch */
regmap_update_bits(dev->regmap[0], regs[S_START_CTRL],
SW_START, SW_START);
@@ -1283,6 +1292,8 @@ void ksz_port_stp_state_set(struct dsa_switch *ds, int port, u8 state)
ksz_pread8(dev, port, regs[P_STP_CTRL], &data);
data &= ~(PORT_TX_ENABLE | PORT_RX_ENABLE | PORT_LEARN_DISABLE);
+ p = &dev->ports[port];
+
switch (state) {
case BR_STATE_DISABLED:
data |= PORT_LEARN_DISABLE;
@@ -1292,9 +1303,13 @@ void ksz_port_stp_state_set(struct dsa_switch *ds, int port, u8 state)
break;
case BR_STATE_LEARNING:
data |= PORT_RX_ENABLE;
+ if (!p->learning)
+ data |= PORT_LEARN_DISABLE;
break;
case BR_STATE_FORWARDING:
data |= (PORT_TX_ENABLE | PORT_RX_ENABLE);
+ if (!p->learning)
+ data |= PORT_LEARN_DISABLE;
break;
case BR_STATE_BLOCKING:
data |= PORT_LEARN_DISABLE;
@@ -1306,12 +1321,38 @@ void ksz_port_stp_state_set(struct dsa_switch *ds, int port, u8 state)
ksz_pwrite8(dev, port, regs[P_STP_CTRL], data);
- p = &dev->ports[port];
p->stp_state = state;
ksz_update_port_member(dev, port);
}
+static int ksz_port_pre_bridge_flags(struct dsa_switch *ds, int port,
+ struct switchdev_brport_flags flags,
+ struct netlink_ext_ack *extack)
+{
+ if (flags.mask & ~BR_LEARNING)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int ksz_port_bridge_flags(struct dsa_switch *ds, int port,
+ struct switchdev_brport_flags flags,
+ struct netlink_ext_ack *extack)
+{
+ struct ksz_device *dev = ds->priv;
+ struct ksz_port *p = &dev->ports[port];
+
+ if (flags.mask & BR_LEARNING) {
+ p->learning = !!(flags.val & BR_LEARNING);
+
+ /* Make the change take effect immediately */
+ ksz_port_stp_state_set(ds, port, p->stp_state);
+ }
+
+ return 0;
+}
+
static enum dsa_tag_protocol ksz_get_tag_protocol(struct dsa_switch *ds,
int port,
enum dsa_tag_protocol mp)
@@ -1725,6 +1766,8 @@ static const struct dsa_switch_ops ksz_switch_ops = {
.port_bridge_join = ksz_port_bridge_join,
.port_bridge_leave = ksz_port_bridge_leave,
.port_stp_state_set = ksz_port_stp_state_set,
+ .port_pre_bridge_flags = ksz_port_pre_bridge_flags,
+ .port_bridge_flags = ksz_port_bridge_flags,
.port_fast_age = ksz_port_fast_age,
.port_vlan_filtering = ksz_port_vlan_filtering,
.port_vlan_add = ksz_port_vlan_add,
diff --git a/drivers/net/dsa/microchip/ksz_common.h b/drivers/net/dsa/microchip/ksz_common.h
index 764ada3a0f42..0d9520dc6d2d 100644
--- a/drivers/net/dsa/microchip/ksz_common.h
+++ b/drivers/net/dsa/microchip/ksz_common.h
@@ -65,6 +65,7 @@ struct ksz_chip_data {
struct ksz_port {
bool remove_tag; /* Remove Tag flag set, for ksz8795 only */
+ bool learning;
int stp_state;
struct phy_device phydev;
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From bf515f024e4c0ca46a1b08c4f31860c01781d8a5 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers(a)google.com>
Date: Mon, 22 Aug 2022 14:33:51 -0700
Subject: [PATCH] perf stat: Clear evsel->reset_group for each stat run
If a weak group is broken then the reset_group flag remains set for
the next run. Having reset_group set means the counter isn't created,
ultimately leading to a segfault.
A simple reproduction of this is:
# perf stat -r2 -e '{cycles,cycles,cycles,cycles,cycles,cycles,cycles,cycles,cycles,cycles}:W'
which will be added as a test in the next patch.
Fixes: 4804e0111662d7d8 ("perf stat: Use affinity for opening events")
Reviewed-by: Andi Kleen <ak(a)linux.intel.com>
Signed-off-by: Ian Rogers <irogers(a)google.com>
Tested-by: Arnaldo Carvalho de Melo <acme(a)redhat.com>
Tested-by: Xing Zhengjun <zhengjun.xing(a)linux.intel.com>
Cc: Alexander Shishkin <alexander.shishkin(a)linux.intel.com>
Cc: Andi Kleen <ak(a)linux.intel.com>
Cc: Ingo Molnar <mingo(a)redhat.com>
Cc: Jiri Olsa <jolsa(a)kernel.org>
Cc: Kan Liang <kan.liang(a)linux.intel.com>
Cc: Mark Rutland <mark.rutland(a)arm.com>
Cc: Namhyung Kim <namhyung(a)kernel.org>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Stephane Eranian <eranian(a)google.com>
Link: https://lore.kernel.org/r/20220822213352.75721-1-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme(a)redhat.com>
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 7fb81a44672d..54cd29d07ca8 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -826,6 +826,7 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
}
evlist__for_each_entry(evsel_list, counter) {
+ counter->reset_group = false;
if (bpf_counter__load(counter, &target))
return -1;
if (!evsel__is_bpf(counter))
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 65fac0d54f374625b43a9d6ad1f2c212bd41f518 Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3(a)huawei.com>
Date: Tue, 26 Jul 2022 20:22:24 +0800
Subject: [PATCH] blk-mq: fix io hung due to missing commit_rqs
Currently, in virtio_scsi, if 'bd->last' is not set to true while
dispatching a request, such io will stay in the driver's queue, and the
driver will wait for the block layer to dispatch more rqs. However, if
the block layer fails to dispatch more rqs, it should trigger
commit_rqs to inform the driver.
There is a problem in blk_mq_try_issue_list_directly() that commit_rqs
won't be called:
// assume that queue_depth is set to 1, list contains two rq
blk_mq_try_issue_list_directly
blk_mq_request_issue_directly
// dispatch first rq
// last is false
__blk_mq_try_issue_directly
blk_mq_get_dispatch_budget
// succeed to get first budget
__blk_mq_issue_directly
scsi_queue_rq
cmd->flags |= SCMD_LAST
virtscsi_queuecommand
kick = (sc->flags & SCMD_LAST) != 0
// kick is false, first rq won't issue to disk
queued++
blk_mq_request_issue_directly
// dispatch second rq
__blk_mq_try_issue_directly
blk_mq_get_dispatch_budget
// failed to get second budget
ret == BLK_STS_RESOURCE
blk_mq_request_bypass_insert
// errors is still 0
if (!list_empty(list) || errors && ...)
// won't pass, commit_rqs won't be called
In this situation, the first rq relied on the second rq to dispatch,
while the second rq relied on the first rq to complete; thus they will
both hang.
Fix the problem by also treating 'BLK_STS_*RESOURCE' as 'errors', since
it means that the request was not queued successfully.
The same problem exists in blk_mq_dispatch_rq_list(), where
'BLK_STS_*RESOURCE' can't be treated as 'errors'; fix it there by
calling commit_rqs if queue_rq returns 'BLK_STS_*RESOURCE'.
Fixes: d666ba98f849 ("blk-mq: add mq_ops->commit_rqs()")
Signed-off-by: Yu Kuai <yukuai3(a)huawei.com>
Reviewed-by: Ming Lei <ming.lei(a)redhat.com>
Link: https://lore.kernel.org/r/20220726122224.1790882-1-yukuai1@huaweicloud.com
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3c1e6b6d991d..c96c8c4f751b 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1931,7 +1931,8 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
/* If we didn't flush the entire list, we could have told the driver
* there was more coming, but that turned out to be a lie.
*/
- if ((!list_empty(list) || errors) && q->mq_ops->commit_rqs && queued)
+ if ((!list_empty(list) || errors || needs_resource ||
+ ret == BLK_STS_DEV_RESOURCE) && q->mq_ops->commit_rqs && queued)
q->mq_ops->commit_rqs(hctx);
/*
* Any items that need requeuing? Stuff them into hctx->dispatch,
@@ -2660,6 +2661,7 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
list_del_init(&rq->queuelist);
ret = blk_mq_request_issue_directly(rq, list_empty(list));
if (ret != BLK_STS_OK) {
+ errors++;
if (ret == BLK_STS_RESOURCE ||
ret == BLK_STS_DEV_RESOURCE) {
blk_mq_request_bypass_insert(rq, false,
@@ -2667,7 +2669,6 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
break;
}
blk_mq_end_request(rq, ret);
- errors++;
} else
queued++;
}
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 332924973725e8cdcc783c175f68cf7e162cb9e5 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz(a)infradead.org>
Date: Fri, 19 Aug 2022 13:01:35 +0200
Subject: [PATCH] x86/nospec: Fix i386 RSB stuffing
It turns out that i386 doesn't unconditionally have LFENCE; as such,
the loop in __FILL_RETURN_BUFFER isn't actually speculation-safe on
such chips.
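As a rough illustration (not part of the commit, operand details
simplified): for nr == 4 the loop-free i386 variant below unrolls to
four speculation-trap slots followed by a single stack adjustment:

	__FILL_RETURN_SLOT	/* slot 1: call + speculation trap */
	__FILL_RETURN_SLOT	/* slot 2 */
	__FILL_RETURN_SLOT	/* slot 3 */
	__FILL_RETURN_SLOT	/* slot 4 */
	add $(4 * 4), %esp	/* (BITS_PER_LONG/8) * nr; 4 * 4 on i386 */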
Fixes: ba6e31af2be9 ("x86/speculation: Add LFENCE to RSB fill sequence")
Reported-by: Ben Hutchings <ben(a)decadent.org.uk>
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Link: https://lkml.kernel.org/r/Yv9tj9vbQ9nNlXoY@worktop.programming.kicks-ass.net
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 10731ccfed37..c936ce9f0c47 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -50,6 +50,7 @@
* the optimal version - two calls, each with their own speculation
* trap should their return address end up getting used, in a loop.
*/
+#ifdef CONFIG_X86_64
#define __FILL_RETURN_BUFFER(reg, nr) \
mov $(nr/2), reg; \
771: \
@@ -60,6 +61,17 @@
jnz 771b; \
/* barrier for jnz misprediction */ \
lfence;
+#else
+/*
+ * i386 doesn't unconditionally have LFENCE, as such it can't
+ * do a loop.
+ */
+#define __FILL_RETURN_BUFFER(reg, nr) \
+ .rept nr; \
+ __FILL_RETURN_SLOT; \
+ .endr; \
+ add $(BITS_PER_LONG/8) * nr, %_ASM_SP;
+#endif
/*
* Stuff a single RSB slot.
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 332924973725e8cdcc783c175f68cf7e162cb9e5 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz(a)infradead.org>
Date: Fri, 19 Aug 2022 13:01:35 +0200
Subject: [PATCH] x86/nospec: Fix i386 RSB stuffing
It turns out that i386 doesn't unconditionally have LFENCE; as such,
the loop in __FILL_RETURN_BUFFER isn't actually speculation-safe on
such chips.
Fixes: ba6e31af2be9 ("x86/speculation: Add LFENCE to RSB fill sequence")
Reported-by: Ben Hutchings <ben(a)decadent.org.uk>
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Link: https://lkml.kernel.org/r/Yv9tj9vbQ9nNlXoY@worktop.programming.kicks-ass.net
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 10731ccfed37..c936ce9f0c47 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -50,6 +50,7 @@
* the optimal version - two calls, each with their own speculation
* trap should their return address end up getting used, in a loop.
*/
+#ifdef CONFIG_X86_64
#define __FILL_RETURN_BUFFER(reg, nr) \
mov $(nr/2), reg; \
771: \
@@ -60,6 +61,17 @@
jnz 771b; \
/* barrier for jnz misprediction */ \
lfence;
+#else
+/*
+ * i386 doesn't unconditionally have LFENCE, as such it can't
+ * do a loop.
+ */
+#define __FILL_RETURN_BUFFER(reg, nr) \
+ .rept nr; \
+ __FILL_RETURN_SLOT; \
+ .endr; \
+ add $(BITS_PER_LONG/8) * nr, %_ASM_SP;
+#endif
/*
* Stuff a single RSB slot.
The patch below does not apply to the 5.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f96f7a40874d7c746680c0b9f57cef2262ae551f Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david(a)redhat.com>
Date: Thu, 11 Aug 2022 12:34:34 +0200
Subject: [PATCH] mm/hugetlb: fix hugetlb not supporting softdirty tracking
Patch series "mm/hugetlb: fix write-fault handling for shared mappings", v2.
I observed that hugetlb does not support/expect write-faults in shared
mappings that would have to map the R/O-mapped page writable -- and I
found two cases where we could currently get such faults and would
erroneously map an anon page into a shared mapping.
Reproducers are part of the patches.
I propose to backport both fixes to stable trees. The first fix needs a
small adjustment.
This patch (of 2):
Staring at hugetlb_wp(), one might wonder where all the logic for shared
mappings is when stumbling over a write-protected page in a shared
mapping. In fact, there is none, and so far we thought we could get away
with that because e.g., mprotect() should always do the right thing and
map all pages directly writable.
Looks like we were wrong:
--------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <sys/mman.h>
#define HUGETLB_SIZE (2 * 1024 * 1024u)
static void clear_softdirty(void)
{
	int fd = open("/proc/self/clear_refs", O_WRONLY);
	const char *ctrl = "4";
	int ret;

	if (fd < 0) {
		fprintf(stderr, "open(clear_refs) failed\n");
		exit(1);
	}
	ret = write(fd, ctrl, strlen(ctrl));
	if (ret != strlen(ctrl)) {
		fprintf(stderr, "write(clear_refs) failed\n");
		exit(1);
	}
	close(fd);
}

int main(int argc, char **argv)
{
	char *map;
	int fd;

	/* O_CREAT needs a mode; failure is fd < 0, not !fd */
	fd = open("/dev/hugepages/tmp", O_RDWR | O_CREAT, 0600);
	if (fd < 0) {
		fprintf(stderr, "open() failed\n");
		return -errno;
	}
	if (ftruncate(fd, HUGETLB_SIZE)) {
		fprintf(stderr, "ftruncate() failed\n");
		return -errno;
	}
	map = mmap(NULL, HUGETLB_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED) {
		fprintf(stderr, "mmap() failed\n");
		return -errno;
	}
	*map = 0;
	if (mprotect(map, HUGETLB_SIZE, PROT_READ)) {
		fprintf(stderr, "mprotect() failed\n");
		return -errno;
	}
	clear_softdirty();
	if (mprotect(map, HUGETLB_SIZE, PROT_READ|PROT_WRITE)) {
		fprintf(stderr, "mprotect() failed\n");
		return -errno;
	}
	*map = 0;
	return 0;
}
--------------------------------------------------------------------------
The above test fails with SIGBUS when there is only a single free hugetlb page.
# echo 1 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
# ./test
Bus error (core dumped)
And worse, with sufficient free hugetlb pages it will map an anonymous page
into a shared mapping, for example, messing up accounting during unmap
and breaking MAP_SHARED semantics:
# echo 2 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
# ./test
# cat /proc/meminfo | grep HugePages_
HugePages_Total: 2
HugePages_Free: 1
HugePages_Rsvd: 18446744073709551615
HugePages_Surp: 0
The reason in this particular case is that vma_wants_writenotify() will
return "true", removing VM_SHARED in vma_set_page_prot() to map pages
write-protected. Let's teach vma_wants_writenotify() that hugetlb does not
support softdirty tracking.
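For reference, a simplified sketch of how vma_set_page_prot() consumes
that return value (close to mm/mmap.c of that era, trimmed for
illustration): when write notifications are wanted, VM_SHARED is
dropped from the flags used to compute the protection, so shared pages
get mapped R/O and the first write faults:

void vma_set_page_prot(struct vm_area_struct *vma)
{
	unsigned long vm_flags = vma->vm_flags;
	pgprot_t vm_page_prot;

	vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags);
	if (vma_wants_writenotify(vma, vm_page_prot)) {
		vm_flags &= ~VM_SHARED;
		vm_page_prot = vm_pgprot_modify(vm_page_prot, vm_flags);
	}
	/* vma->vm_page_prot is read locklessly elsewhere, hence WRITE_ONCE */
	WRITE_ONCE(vma->vm_page_prot, vm_page_prot);
}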
Link: https://lkml.kernel.org/r/20220811103435.188481-1-david@redhat.com
Link: https://lkml.kernel.org/r/20220811103435.188481-2-david@redhat.com
Fixes: 64e455079e1b ("mm: softdirty: enable write notifications on VMAs after VM_SOFTDIRTY cleared")
Signed-off-by: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Peter Feiner <pfeiner(a)google.com>
Cc: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Cc: Cyrill Gorcunov <gorcunov(a)openvz.org>
Cc: Pavel Emelyanov <xemul(a)parallels.com>
Cc: Jamie Liu <jamieliu(a)google.com>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Naoya Horiguchi <n-horiguchi(a)ah.jp.nec.com>
Cc: Bjorn Helgaas <bhelgaas(a)google.com>
Cc: Muchun Song <songmuchun(a)bytedance.com>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: <stable(a)vger.kernel.org> [3.18+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/mmap.c b/mm/mmap.c
index c035020d0c89..9d780f415be3 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1646,8 +1646,11 @@ int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
pgprot_val(vm_pgprot_modify(vm_page_prot, vm_flags)))
return 0;
- /* Do we need to track softdirty? */
- if (vma_soft_dirty_enabled(vma))
+ /*
+ * Do we need to track softdirty? hugetlb does not support softdirty
+ * tracking yet.
+ */
+ if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma))
return 1;
/* Specialty mapping? */
Dear Sasha, Dear Greg,
On Mon, Aug 15, 2022 at 02:12:23AM -0400, Sasha Levin wrote:
> This is a note to let you know that I've just added the patch titled
>
> usbnet: smsc95xx: Forward PHY interrupts to PHY driver to avoid polling
>
> to the 5.15-stable tree which can be found at:
Please consider reverting upstream commit
1ce8b37241ed291af56f7a49bbdbf20c08728e88
("usbnet: smsc95xx: Forward PHY interrupts to PHY driver to avoid polling")
in 5.15-stable and 5.18-stable.
I have received two independent reports (from Mark Brown and Phil Elwell)
that the commit breaks link detection on LAN95xx USB Ethernet adapters
in those trees.
The commit works fine in v5.19, but the backports to 5.15 and 5.18 are
apparently missing prerequisites. Until we've figured out what those are,
the commit should be reverted.
Thank you!
Lukas
The patch below does not apply to the 5.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 37887783b3fef877bf34b8992c9199864da4afcb Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby(a)kernel.org>
Date: Wed, 10 Aug 2022 09:06:09 +0200
Subject: [PATCH] Revert "zram: remove double compression logic"
This reverts commit e7be8d1dd983156b ("zram: remove double compression
logic") as it causes zram failures. It does not revert cleanly;
PTR_ERR handling was introduced in the meantime. This is handled by
appropriate IS_ERR checks.
When under memory pressure, zs_malloc() can fail. Before the above
commit, the allocation was retried with direct reclaim enabled (GFP_NOIO).
After the commit, it is not -- only __GFP_KSWAPD_RECLAIM is tried.
So when the failure occurs under memory pressure, the overlaying
filesystem such as ext2 (mounted by ext4 module in this case) can emit
failures, making the (file)system unusable:
EXT4-fs warning (device zram0): ext4_end_bio:343: I/O error 10 writing to inode 16386 starting block 159744)
Buffer I/O error on device zram0, logical block 159744
With direct reclaim, memory is really reclaimed and the allocation
eventually succeeds. In the worst case, the oom killer is invoked,
which is the proper outcome if the user sets up zram too large (in
comparison to the available RAM).
This very diff doesn't apply cleanly to 5.19 (stable); see the PTR_ERR
note above. Use a revert of e7be8d1dd983 directly.
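For reference, with this revert applied debug_stat goes back to the
two-column version-1 format changed in the first hunk below (the counts
shown here are illustrative only):

	# cat /sys/block/zram0/debug_stat
	version: 1
	       3        0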
Link: https://bugzilla.suse.com/show_bug.cgi?id=1202203
Link: https://lkml.kernel.org/r/20220810070609.14402-1-jslaby@suse.cz
Fixes: e7be8d1dd983 ("zram: remove double compression logic")
Signed-off-by: Jiri Slaby <jslaby(a)suse.cz>
Reviewed-by: Sergey Senozhatsky <senozhatsky(a)chromium.org>
Cc: Minchan Kim <minchan(a)kernel.org>
Cc: Nitin Gupta <ngupta(a)vflare.org>
Cc: Alexey Romanov <avromanov(a)sberdevices.ru>
Cc: Dmitry Rokosov <ddrokosov(a)sberdevices.ru>
Cc: Lukas Czerner <lczerner(a)redhat.com>
Cc: <stable(a)vger.kernel.org> [5.19]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 92cb929a45b7..226ea76cc819 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -1146,14 +1146,15 @@ static ssize_t bd_stat_show(struct device *dev,
static ssize_t debug_stat_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- int version = 2;
+ int version = 1;
struct zram *zram = dev_to_zram(dev);
ssize_t ret;
down_read(&zram->init_lock);
ret = scnprintf(buf, PAGE_SIZE,
- "version: %d\n%8llu\n",
+ "version: %d\n%8llu %8llu\n",
version,
+ (u64)atomic64_read(&zram->stats.writestall),
(u64)atomic64_read(&zram->stats.miss_free));
up_read(&zram->init_lock);
@@ -1351,7 +1352,7 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
{
int ret = 0;
unsigned long alloced_pages;
- unsigned long handle = 0;
+ unsigned long handle = -ENOMEM;
unsigned int comp_len = 0;
void *src, *dst, *mem;
struct zcomp_strm *zstrm;
@@ -1369,6 +1370,7 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
}
kunmap_atomic(mem);
+compress_again:
zstrm = zcomp_stream_get(zram->comp);
src = kmap_atomic(page);
ret = zcomp_compress(zstrm, src, &comp_len);
@@ -1377,20 +1379,39 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
if (unlikely(ret)) {
zcomp_stream_put(zram->comp);
pr_err("Compression failed! err=%d\n", ret);
+ zs_free(zram->mem_pool, handle);
return ret;
}
if (comp_len >= huge_class_size)
comp_len = PAGE_SIZE;
-
- handle = zs_malloc(zram->mem_pool, comp_len,
- __GFP_KSWAPD_RECLAIM |
- __GFP_NOWARN |
- __GFP_HIGHMEM |
- __GFP_MOVABLE);
-
+ /*
+ * handle allocation has 2 paths:
+ * a) fast path is executed with preemption disabled (for
+ * per-cpu streams) and has __GFP_DIRECT_RECLAIM bit clear,
+ * since we can't sleep;
+ * b) slow path enables preemption and attempts to allocate
+ * the page with __GFP_DIRECT_RECLAIM bit set. we have to
+ * put per-cpu compression stream and, thus, to re-do
+ * the compression once handle is allocated.
+ *
+ * if we have a 'non-null' handle here then we are coming
+ * from the slow path and handle has already been allocated.
+ */
+ if (IS_ERR((void *)handle))
+ handle = zs_malloc(zram->mem_pool, comp_len,
+ __GFP_KSWAPD_RECLAIM |
+ __GFP_NOWARN |
+ __GFP_HIGHMEM |
+ __GFP_MOVABLE);
if (IS_ERR((void *)handle)) {
zcomp_stream_put(zram->comp);
+ atomic64_inc(&zram->stats.writestall);
+ handle = zs_malloc(zram->mem_pool, comp_len,
+ GFP_NOIO | __GFP_HIGHMEM |
+ __GFP_MOVABLE);
+ if (!IS_ERR((void *)handle))
+ goto compress_again;
return PTR_ERR((void *)handle);
}
@@ -1948,6 +1969,7 @@ static int zram_add(void)
if (ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE)
blk_queue_max_write_zeroes_sectors(zram->disk->queue, UINT_MAX);
+ blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, zram->disk->queue);
ret = device_add_disk(NULL, zram->disk, zram_disk_groups);
if (ret)
goto out_cleanup_disk;
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index 158c91e54850..80c3b43b4828 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -81,6 +81,7 @@ struct zram_stats {
atomic64_t huge_pages_since; /* no. of huge pages since zram set up */
atomic64_t pages_stored; /* no. of pages currently stored */
atomic_long_t max_used_pages; /* no. of maximum pages stored */
+ atomic64_t writestall; /* no. of write slow paths */
atomic64_t miss_free; /* no. of missed free */
#ifdef CONFIG_ZRAM_WRITEBACK
atomic64_t bd_count; /* no. of pages in backing device */
Hi Greg,
Could you please pick these commits for >= linux-5.7.y
ef605e868212 ("cifs: skip trailing separators of prefix paths")
ee3c8019cce2 ("cifs: fix uninitialized pointer in error case in dfs_cache_get_tgt_share")
as part of a fix for
bacd704a95ad ("cifs: handle prefix paths in reconnect")
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 34fc9cc3aebe8b9e27d3bc821543dd482dc686ca Mon Sep 17 00:00:00 2001
From: Heinrich Schuchardt <heinrich.schuchardt(a)canonical.com>
Date: Wed, 17 Aug 2022 15:25:21 +0200
Subject: [PATCH] riscv: dts: microchip: correct L2 cache interrupts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The "PolarFire SoC MSS Technical Reference Manual" documents the
following PLIC interrupts:
1 - L2 Cache Controller Signals when a metadata correction event occurs
2 - L2 Cache Controller Signals when an uncorrectable metadata event occurs
3 - L2 Cache Controller Signals when a data correction event occurs
4 - L2 Cache Controller Signals when an uncorrectable data event occurs
This differs from the SiFive FU540 which only has three L2 cache related
interrupts.
The sequence in the device tree is defined by an enum:
enum {
	DIR_CORR = 0,
	DATA_CORR,
	DATA_UNCORR,
	DIR_UNCORR,
};
So the correct sequence of the L2 cache interrupts is
interrupts = <1>, <3>, <4>, <2>;
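Spelled out, combining the enum order with the TRM numbering above:
	index 0 DIR_CORR    -> <1> metadata correction
	index 1 DATA_CORR   -> <3> data correction
	index 2 DATA_UNCORR -> <4> uncorrectable data
	index 3 DIR_UNCORR  -> <2> uncorrectable metadata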
[Conor]
This manifests as an unusable system if the l2-cache driver is enabled,
as the wrong interrupt gets cleared & the handler prints errors to the
console ad infinitum.
Fixes: 0fa6107eca41 ("RISC-V: Initial DTS for Microchip ICICLE board")
CC: stable(a)vger.kernel.org # 5.15: e35b07a7df9b: riscv: dts: microchip: mpfs: Group tuples in interrupt properties
Signed-off-by: Heinrich Schuchardt <heinrich.schuchardt(a)canonical.com>
Signed-off-by: Conor Dooley <conor.dooley(a)microchip.com>
diff --git a/arch/riscv/boot/dts/microchip/mpfs.dtsi b/arch/riscv/boot/dts/microchip/mpfs.dtsi
index 499c2e63ad35..0a6ad5b9ff8d 100644
--- a/arch/riscv/boot/dts/microchip/mpfs.dtsi
+++ b/arch/riscv/boot/dts/microchip/mpfs.dtsi
@@ -193,7 +193,7 @@
cache-size = <2097152>;
cache-unified;
interrupt-parent = <&plic>;
- interrupts = <1>, <2>, <3>;
+ interrupts = <1>, <3>, <4>, <2>;
};
clint: clint@2000000 {
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b5c3aca86d2698c4850b6ee8b341938025d2780c Mon Sep 17 00:00:00 2001
From: Conor Dooley <conor.dooley(a)microchip.com>
Date: Sun, 14 Aug 2022 15:12:37 +0100
Subject: [PATCH] riscv: signal: fix missing prototype warning
Fix the warning:
arch/riscv/kernel/signal.c:316:27: warning: no previous prototype for function 'do_notify_resume' [-Wmissing-prototypes]
asmlinkage __visible void do_notify_resume(struct pt_regs *regs,
All other functions in the file are static & none of the existing
headers stood out as an obvious location. Create signal.h to hold the
declaration.
Fixes: e2c0cdfba7f6 ("RISC-V: User-facing API")
Signed-off-by: Conor Dooley <conor.dooley(a)microchip.com>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20220814141237.493457-4-mail@conchuod.ie
Signed-off-by: Palmer Dabbelt <palmer(a)rivosinc.com>
diff --git a/arch/riscv/include/asm/signal.h b/arch/riscv/include/asm/signal.h
new file mode 100644
index 000000000000..532c29ef0376
--- /dev/null
+++ b/arch/riscv/include/asm/signal.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef __ASM_SIGNAL_H
+#define __ASM_SIGNAL_H
+
+#include <uapi/asm/signal.h>
+#include <uapi/asm/ptrace.h>
+
+asmlinkage __visible
+void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags);
+
+#endif
diff --git a/arch/riscv/kernel/signal.c b/arch/riscv/kernel/signal.c
index 38b05ca6fe66..5a2de6b6f882 100644
--- a/arch/riscv/kernel/signal.c
+++ b/arch/riscv/kernel/signal.c
@@ -15,6 +15,7 @@
#include <asm/ucontext.h>
#include <asm/vdso.h>
+#include <asm/signal.h>
#include <asm/signal32.h>
#include <asm/switch_to.h>
#include <asm/csr.h>
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b5c3aca86d2698c4850b6ee8b341938025d2780c Mon Sep 17 00:00:00 2001
From: Conor Dooley <conor.dooley(a)microchip.com>
Date: Sun, 14 Aug 2022 15:12:37 +0100
Subject: [PATCH] riscv: signal: fix missing prototype warning
Fix the warning:
arch/riscv/kernel/signal.c:316:27: warning: no previous prototype for function 'do_notify_resume' [-Wmissing-prototypes]
asmlinkage __visible void do_notify_resume(struct pt_regs *regs,
All other functions in the file are static & none of the existing
headers stood out as an obvious location. Create signal.h to hold the
declaration.
Fixes: e2c0cdfba7f6 ("RISC-V: User-facing API")
Signed-off-by: Conor Dooley <conor.dooley(a)microchip.com>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20220814141237.493457-4-mail@conchuod.ie
Signed-off-by: Palmer Dabbelt <palmer(a)rivosinc.com>
diff --git a/arch/riscv/include/asm/signal.h b/arch/riscv/include/asm/signal.h
new file mode 100644
index 000000000000..532c29ef0376
--- /dev/null
+++ b/arch/riscv/include/asm/signal.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef __ASM_SIGNAL_H
+#define __ASM_SIGNAL_H
+
+#include <uapi/asm/signal.h>
+#include <uapi/asm/ptrace.h>
+
+asmlinkage __visible
+void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags);
+
+#endif
diff --git a/arch/riscv/kernel/signal.c b/arch/riscv/kernel/signal.c
index 38b05ca6fe66..5a2de6b6f882 100644
--- a/arch/riscv/kernel/signal.c
+++ b/arch/riscv/kernel/signal.c
@@ -15,6 +15,7 @@
#include <asm/ucontext.h>
#include <asm/vdso.h>
+#include <asm/signal.h>
#include <asm/signal32.h>
#include <asm/switch_to.h>
#include <asm/csr.h>
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b5c3aca86d2698c4850b6ee8b341938025d2780c Mon Sep 17 00:00:00 2001
From: Conor Dooley <conor.dooley(a)microchip.com>
Date: Sun, 14 Aug 2022 15:12:37 +0100
Subject: [PATCH] riscv: signal: fix missing prototype warning
Fix the warning:
arch/riscv/kernel/signal.c:316:27: warning: no previous prototype for function 'do_notify_resume' [-Wmissing-prototypes]
asmlinkage __visible void do_notify_resume(struct pt_regs *regs,
All other functions in the file are static & none of the existing
headers stood out as an obvious location. Create signal.h to hold the
declaration.
Fixes: e2c0cdfba7f6 ("RISC-V: User-facing API")
Signed-off-by: Conor Dooley <conor.dooley(a)microchip.com>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20220814141237.493457-4-mail@conchuod.ie
Signed-off-by: Palmer Dabbelt <palmer(a)rivosinc.com>
diff --git a/arch/riscv/include/asm/signal.h b/arch/riscv/include/asm/signal.h
new file mode 100644
index 000000000000..532c29ef0376
--- /dev/null
+++ b/arch/riscv/include/asm/signal.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef __ASM_SIGNAL_H
+#define __ASM_SIGNAL_H
+
+#include <uapi/asm/signal.h>
+#include <uapi/asm/ptrace.h>
+
+asmlinkage __visible
+void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags);
+
+#endif
diff --git a/arch/riscv/kernel/signal.c b/arch/riscv/kernel/signal.c
index 38b05ca6fe66..5a2de6b6f882 100644
--- a/arch/riscv/kernel/signal.c
+++ b/arch/riscv/kernel/signal.c
@@ -15,6 +15,7 @@
#include <asm/ucontext.h>
#include <asm/vdso.h>
+#include <asm/signal.h>
#include <asm/signal32.h>
#include <asm/switch_to.h>
#include <asm/csr.h>
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b5c3aca86d2698c4850b6ee8b341938025d2780c Mon Sep 17 00:00:00 2001
From: Conor Dooley <conor.dooley(a)microchip.com>
Date: Sun, 14 Aug 2022 15:12:37 +0100
Subject: [PATCH] riscv: signal: fix missing prototype warning
Fix the warning:
arch/riscv/kernel/signal.c:316:27: warning: no previous prototype for function 'do_notify_resume' [-Wmissing-prototypes]
asmlinkage __visible void do_notify_resume(struct pt_regs *regs,
All other functions in the file are static & none of the existing
headers stood out as an obvious location. Create signal.h to hold the
declaration.
Fixes: e2c0cdfba7f6 ("RISC-V: User-facing API")
Signed-off-by: Conor Dooley <conor.dooley(a)microchip.com>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20220814141237.493457-4-mail@conchuod.ie
Signed-off-by: Palmer Dabbelt <palmer(a)rivosinc.com>
diff --git a/arch/riscv/include/asm/signal.h b/arch/riscv/include/asm/signal.h
new file mode 100644
index 000000000000..532c29ef0376
--- /dev/null
+++ b/arch/riscv/include/asm/signal.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef __ASM_SIGNAL_H
+#define __ASM_SIGNAL_H
+
+#include <uapi/asm/signal.h>
+#include <uapi/asm/ptrace.h>
+
+asmlinkage __visible
+void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags);
+
+#endif
diff --git a/arch/riscv/kernel/signal.c b/arch/riscv/kernel/signal.c
index 38b05ca6fe66..5a2de6b6f882 100644
--- a/arch/riscv/kernel/signal.c
+++ b/arch/riscv/kernel/signal.c
@@ -15,6 +15,7 @@
#include <asm/ucontext.h>
#include <asm/vdso.h>
+#include <asm/signal.h>
#include <asm/signal32.h>
#include <asm/switch_to.h>
#include <asm/csr.h>
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From c5deb27895e017a0267de0a20d140ad5fcc55a54 Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross(a)suse.com>
Date: Thu, 25 Aug 2022 16:19:18 +0200
Subject: [PATCH] xen/privcmd: fix error exit of privcmd_ioctl_dm_op()
The error exit of privcmd_ioctl_dm_op() is calling unlock_pages()
potentially with pages being NULL, leading to a NULL dereference.
Additionally lock_pages() doesn't check for pin_user_pages_fast()
having been completely successful, resulting in potentially not
locking all pages into memory. This could result in sporadic failures
when using the related memory in user mode.
Fix all of that by always calling unlock_pages() with the real number
of pinned pages, which will be zero in case pages is NULL, and by
checking that the number of pages pinned by pin_user_pages_fast()
matches the expected number of pages.
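For context, unlock_pages() is essentially a thin wrapper (a sketch,
close to drivers/xen/privcmd.c), which is why passing the real pinned
count is safe even when it is zero:

static void unlock_pages(struct page *pages[], unsigned int nr_pages)
{
	/* unpin_user_pages() iterates nr_pages times: a no-op for zero */
	unpin_user_pages(pages, nr_pages);
}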
Cc: <stable(a)vger.kernel.org>
Fixes: ab520be8cd5d ("xen/privcmd: Add IOCTL_PRIVCMD_DM_OP")
Reported-by: Rustam Subkhankulov <subkhankulov(a)ispras.ru>
Signed-off-by: Juergen Gross <jgross(a)suse.com>
Reviewed-by: Jan Beulich <jbeulich(a)suse.com>
Reviewed-by: Oleksandr Tyshchenko <oleksandr_tyshchenko(a)epam.com>
Link: https://lore.kernel.org/r/20220825141918.3581-1-jgross@suse.com
Signed-off-by: Juergen Gross <jgross(a)suse.com>
diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
index 3369734108af..e88e8f6f0a33 100644
--- a/drivers/xen/privcmd.c
+++ b/drivers/xen/privcmd.c
@@ -581,27 +581,30 @@ static int lock_pages(
struct privcmd_dm_op_buf kbufs[], unsigned int num,
struct page *pages[], unsigned int nr_pages, unsigned int *pinned)
{
- unsigned int i;
+ unsigned int i, off = 0;
- for (i = 0; i < num; i++) {
+ for (i = 0; i < num; ) {
unsigned int requested;
int page_count;
requested = DIV_ROUND_UP(
offset_in_page(kbufs[i].uptr) + kbufs[i].size,
- PAGE_SIZE);
+ PAGE_SIZE) - off;
if (requested > nr_pages)
return -ENOSPC;
page_count = pin_user_pages_fast(
- (unsigned long) kbufs[i].uptr,
+ (unsigned long)kbufs[i].uptr + off * PAGE_SIZE,
requested, FOLL_WRITE, pages);
- if (page_count < 0)
- return page_count;
+ if (page_count <= 0)
+ return page_count ? : -EFAULT;
*pinned += page_count;
nr_pages -= page_count;
pages += page_count;
+
+ off = (requested == page_count) ? 0 : off + page_count;
+ i += !off;
}
return 0;
@@ -677,10 +680,8 @@ static long privcmd_ioctl_dm_op(struct file *file, void __user *udata)
}
rc = lock_pages(kbufs, kdata.num, pages, nr_pages, &pinned);
- if (rc < 0) {
- nr_pages = pinned;
+ if (rc < 0)
goto out;
- }
for (i = 0; i < kdata.num; i++) {
set_xen_guest_handle(xbufs[i].h, kbufs[i].uptr);
@@ -692,7 +693,7 @@ static long privcmd_ioctl_dm_op(struct file *file, void __user *udata)
xen_preemptible_hcall_end();
out:
- unlock_pages(pages, nr_pages);
+ unlock_pages(pages, pinned);
kfree(xbufs);
kfree(pages);
kfree(kbufs);
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From c5deb27895e017a0267de0a20d140ad5fcc55a54 Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross(a)suse.com>
Date: Thu, 25 Aug 2022 16:19:18 +0200
Subject: [PATCH] xen/privcmd: fix error exit of privcmd_ioctl_dm_op()
The error exit of privcmd_ioctl_dm_op() is calling unlock_pages()
potentially with pages being NULL, leading to a NULL dereference.
Additionally lock_pages() doesn't check for pin_user_pages_fast()
having been completely successful, resulting in potentially not
locking all pages into memory. This could result in sporadic failures
when using the related memory in user mode.
Fix all of that by always calling unlock_pages() with the real number
of pinned pages, which will be zero in case pages is NULL, and by
checking that the number of pages pinned by pin_user_pages_fast()
matches the expected number of pages.
Cc: <stable(a)vger.kernel.org>
Fixes: ab520be8cd5d ("xen/privcmd: Add IOCTL_PRIVCMD_DM_OP")
Reported-by: Rustam Subkhankulov <subkhankulov(a)ispras.ru>
Signed-off-by: Juergen Gross <jgross(a)suse.com>
Reviewed-by: Jan Beulich <jbeulich(a)suse.com>
Reviewed-by: Oleksandr Tyshchenko <oleksandr_tyshchenko(a)epam.com>
Link: https://lore.kernel.org/r/20220825141918.3581-1-jgross@suse.com
Signed-off-by: Juergen Gross <jgross(a)suse.com>
diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
index 3369734108af..e88e8f6f0a33 100644
--- a/drivers/xen/privcmd.c
+++ b/drivers/xen/privcmd.c
@@ -581,27 +581,30 @@ static int lock_pages(
struct privcmd_dm_op_buf kbufs[], unsigned int num,
struct page *pages[], unsigned int nr_pages, unsigned int *pinned)
{
- unsigned int i;
+ unsigned int i, off = 0;
- for (i = 0; i < num; i++) {
+ for (i = 0; i < num; ) {
unsigned int requested;
int page_count;
requested = DIV_ROUND_UP(
offset_in_page(kbufs[i].uptr) + kbufs[i].size,
- PAGE_SIZE);
+ PAGE_SIZE) - off;
if (requested > nr_pages)
return -ENOSPC;
page_count = pin_user_pages_fast(
- (unsigned long) kbufs[i].uptr,
+ (unsigned long)kbufs[i].uptr + off * PAGE_SIZE,
requested, FOLL_WRITE, pages);
- if (page_count < 0)
- return page_count;
+ if (page_count <= 0)
+ return page_count ? : -EFAULT;
*pinned += page_count;
nr_pages -= page_count;
pages += page_count;
+
+ off = (requested == page_count) ? 0 : off + page_count;
+ i += !off;
}
return 0;
@@ -677,10 +680,8 @@ static long privcmd_ioctl_dm_op(struct file *file, void __user *udata)
}
rc = lock_pages(kbufs, kdata.num, pages, nr_pages, &pinned);
- if (rc < 0) {
- nr_pages = pinned;
+ if (rc < 0)
goto out;
- }
for (i = 0; i < kdata.num; i++) {
set_xen_guest_handle(xbufs[i].h, kbufs[i].uptr);
@@ -692,7 +693,7 @@ static long privcmd_ioctl_dm_op(struct file *file, void __user *udata)
xen_preemptible_hcall_end();
out:
- unlock_pages(pages, nr_pages);
+ unlock_pages(pages, pinned);
kfree(xbufs);
kfree(pages);
kfree(kbufs);
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From c5deb27895e017a0267de0a20d140ad5fcc55a54 Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross(a)suse.com>
Date: Thu, 25 Aug 2022 16:19:18 +0200
Subject: [PATCH] xen/privcmd: fix error exit of privcmd_ioctl_dm_op()
The error exit of privcmd_ioctl_dm_op() is calling unlock_pages()
potentially with pages being NULL, leading to a NULL dereference.
Additionally lock_pages() doesn't check for pin_user_pages_fast()
having been completely successful, resulting in potentially not
locking all pages into memory. This could result in sporadic failures
when using the related memory in user mode.
Fix all of that by always calling unlock_pages() with the real number
of pinned pages, which will be zero in case pages is NULL, and by
checking that the number of pages pinned by pin_user_pages_fast()
matches the expected number of pages.
Cc: <stable(a)vger.kernel.org>
Fixes: ab520be8cd5d ("xen/privcmd: Add IOCTL_PRIVCMD_DM_OP")
Reported-by: Rustam Subkhankulov <subkhankulov(a)ispras.ru>
Signed-off-by: Juergen Gross <jgross(a)suse.com>
Reviewed-by: Jan Beulich <jbeulich(a)suse.com>
Reviewed-by: Oleksandr Tyshchenko <oleksandr_tyshchenko(a)epam.com>
Link: https://lore.kernel.org/r/20220825141918.3581-1-jgross@suse.com
Signed-off-by: Juergen Gross <jgross(a)suse.com>
diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
index 3369734108af..e88e8f6f0a33 100644
--- a/drivers/xen/privcmd.c
+++ b/drivers/xen/privcmd.c
@@ -581,27 +581,30 @@ static int lock_pages(
struct privcmd_dm_op_buf kbufs[], unsigned int num,
struct page *pages[], unsigned int nr_pages, unsigned int *pinned)
{
- unsigned int i;
+ unsigned int i, off = 0;
- for (i = 0; i < num; i++) {
+ for (i = 0; i < num; ) {
unsigned int requested;
int page_count;
requested = DIV_ROUND_UP(
offset_in_page(kbufs[i].uptr) + kbufs[i].size,
- PAGE_SIZE);
+ PAGE_SIZE) - off;
if (requested > nr_pages)
return -ENOSPC;
page_count = pin_user_pages_fast(
- (unsigned long) kbufs[i].uptr,
+ (unsigned long)kbufs[i].uptr + off * PAGE_SIZE,
requested, FOLL_WRITE, pages);
- if (page_count < 0)
- return page_count;
+ if (page_count <= 0)
+ return page_count ? : -EFAULT;
*pinned += page_count;
nr_pages -= page_count;
pages += page_count;
+
+ off = (requested == page_count) ? 0 : off + page_count;
+ i += !off;
}
return 0;
@@ -677,10 +680,8 @@ static long privcmd_ioctl_dm_op(struct file *file, void __user *udata)
}
rc = lock_pages(kbufs, kdata.num, pages, nr_pages, &pinned);
- if (rc < 0) {
- nr_pages = pinned;
+ if (rc < 0)
goto out;
- }
for (i = 0; i < kdata.num; i++) {
set_xen_guest_handle(xbufs[i].h, kbufs[i].uptr);
@@ -692,7 +693,7 @@ static long privcmd_ioctl_dm_op(struct file *file, void __user *udata)
xen_preemptible_hcall_end();
out:
- unlock_pages(pages, nr_pages);
+ unlock_pages(pages, pinned);
kfree(xbufs);
kfree(pages);
kfree(kbufs);