The patch titled
Subject: mm: memory-failure: update ttu flag inside unmap_poisoned_folio
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-memory-failure-update-ttu-flag-inside-unmap_poisoned_folio.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Ma Wupeng <mawupeng1(a)huawei.com>
Subject: mm: memory-failure: update ttu flag inside unmap_poisoned_folio
Date: Mon, 17 Feb 2025 09:43:27 +0800
Patch series "mm: memory_failure: unmap poisoned folio during migrate
properly", v3.
Fix two bugs during folio migration if the folio is poisoned.
This patch (of 3):
Commit 6da6b1d4a7df ("mm/hwpoison: convert TTU_IGNORE_HWPOISON to
TTU_HWPOISON") introduced TTU_HWPOISON to replace TTU_IGNORE_HWPOISON in
order to stop sending a SIGBUS signal when accessing an error page after a
memory error on a clean folio. However during page migration, anon folio
must be set with TTU_HWPOISON during unmap_*(). For pagecache we need
some policy just like the one in hwpoison_user_mappings to set this flag.
So move this policy from hwpoison_user_mappings to unmap_poisoned_folio to
handle this warning properly.
A warning is produced when unmapping a poisoned folio, with the following log:
------------[ cut here ]------------
WARNING: CPU: 1 PID: 365 at mm/rmap.c:1847 try_to_unmap_one+0x8fc/0xd3c
Modules linked in:
CPU: 1 UID: 0 PID: 365 Comm: bash Tainted: G W 6.13.0-rc1-00018-gacdb4bbda7ab #42
Tainted: [W]=WARN
Hardware name: QEMU QEMU Virtual Machine, BIOS 0.0.0 02/06/2015
pstate: 20400005 (nzCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
pc : try_to_unmap_one+0x8fc/0xd3c
lr : try_to_unmap_one+0x3dc/0xd3c
Call trace:
try_to_unmap_one+0x8fc/0xd3c (P)
try_to_unmap_one+0x3dc/0xd3c (L)
rmap_walk_anon+0xdc/0x1f8
rmap_walk+0x3c/0x58
try_to_unmap+0x88/0x90
unmap_poisoned_folio+0x30/0xa8
do_migrate_range+0x4a0/0x568
offline_pages+0x5a4/0x670
memory_block_action+0x17c/0x374
memory_subsys_offline+0x3c/0x78
device_offline+0xa4/0xd0
state_store+0x8c/0xf0
dev_attr_store+0x18/0x2c
sysfs_kf_write+0x44/0x54
kernfs_fop_write_iter+0x118/0x1a8
vfs_write+0x3a8/0x4bc
ksys_write+0x6c/0xf8
__arm64_sys_write+0x1c/0x28
invoke_syscall+0x44/0x100
el0_svc_common.constprop.0+0x40/0xe0
do_el0_svc+0x1c/0x28
el0_svc+0x30/0xd0
el0t_64_sync_handler+0xc8/0xcc
el0t_64_sync+0x198/0x19c
---[ end trace 0000000000000000 ]---
Link: https://lkml.kernel.org/r/20250217014329.3610326-1-mawupeng1@huawei.com
Link: https://lkml.kernel.org/r/20250217014329.3610326-2-mawupeng1@huawei.com
Fixes: 6da6b1d4a7df ("mm/hwpoison: convert TTU_IGNORE_HWPOISON to TTU_HWPOISON")
Signed-off-by: Ma Wupeng <mawupeng1(a)huawei.com>
Suggested-by: David Hildenbrand <david(a)redhat.com>
Acked-by: David Hildenbrand <david(a)redhat.com>
Cc: Miaohe Lin <linmiaohe(a)huawei.com>
Cc: Ma Wupeng <mawupeng1(a)huawei.com>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: Naoya Horiguchi <nao.horiguchi(a)gmail.com>
Cc: Oscar Salvador <osalvador(a)suse.de>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/internal.h | 5 ++-
mm/memory-failure.c | 61 +++++++++++++++++++++---------------------
mm/memory_hotplug.c | 3 +-
3 files changed, 36 insertions(+), 33 deletions(-)
--- a/mm/internal.h~mm-memory-failure-update-ttu-flag-inside-unmap_poisoned_folio
+++ a/mm/internal.h
@@ -1115,7 +1115,7 @@ static inline int find_next_best_node(in
* mm/memory-failure.c
*/
#ifdef CONFIG_MEMORY_FAILURE
-void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu);
+int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill);
void shake_folio(struct folio *folio);
extern int hwpoison_filter(struct page *p);
@@ -1138,8 +1138,9 @@ unsigned long page_mapped_in_vma(const s
struct vm_area_struct *vma);
#else
-static inline void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu)
+static inline int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill)
{
+ return -EBUSY;
}
#endif
--- a/mm/memory-failure.c~mm-memory-failure-update-ttu-flag-inside-unmap_poisoned_folio
+++ a/mm/memory-failure.c
@@ -1556,8 +1556,34 @@ static int get_hwpoison_page(struct page
return ret;
}
-void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu)
+int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill)
{
+ enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON;
+ struct address_space *mapping;
+
+ if (folio_test_swapcache(folio)) {
+ pr_err("%#lx: keeping poisoned page in swap cache\n", pfn);
+ ttu &= ~TTU_HWPOISON;
+ }
+
+ /*
+ * Propagate the dirty bit from PTEs to struct page first, because we
+ * need this to decide if we should kill or just drop the page.
+ * XXX: the dirty test could be racy: set_page_dirty() may not always
+ * be called inside page lock (it's recommended but not enforced).
+ */
+ mapping = folio_mapping(folio);
+ if (!must_kill && !folio_test_dirty(folio) && mapping &&
+ mapping_can_writeback(mapping)) {
+ if (folio_mkclean(folio)) {
+ folio_set_dirty(folio);
+ } else {
+ ttu &= ~TTU_HWPOISON;
+ pr_info("%#lx: corrupted page was clean: dropped without side effects\n",
+ pfn);
+ }
+ }
+
if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) {
struct address_space *mapping;
@@ -1572,7 +1598,7 @@ void unmap_poisoned_folio(struct folio *
if (!mapping) {
pr_info("%#lx: could not lock mapping for mapped hugetlb folio\n",
folio_pfn(folio));
- return;
+ return -EBUSY;
}
try_to_unmap(folio, ttu|TTU_RMAP_LOCKED);
@@ -1580,6 +1606,8 @@ void unmap_poisoned_folio(struct folio *
} else {
try_to_unmap(folio, ttu);
}
+
+ return folio_mapped(folio) ? -EBUSY : 0;
}
/*
@@ -1589,8 +1617,6 @@ void unmap_poisoned_folio(struct folio *
static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
unsigned long pfn, int flags)
{
- enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON;
- struct address_space *mapping;
LIST_HEAD(tokill);
bool unmap_success;
int forcekill;
@@ -1613,29 +1639,6 @@ static bool hwpoison_user_mappings(struc
if (!folio_mapped(folio))
return true;
- if (folio_test_swapcache(folio)) {
- pr_err("%#lx: keeping poisoned page in swap cache\n", pfn);
- ttu &= ~TTU_HWPOISON;
- }
-
- /*
- * Propagate the dirty bit from PTEs to struct page first, because we
- * need this to decide if we should kill or just drop the page.
- * XXX: the dirty test could be racy: set_page_dirty() may not always
- * be called inside page lock (it's recommended but not enforced).
- */
- mapping = folio_mapping(folio);
- if (!(flags & MF_MUST_KILL) && !folio_test_dirty(folio) && mapping &&
- mapping_can_writeback(mapping)) {
- if (folio_mkclean(folio)) {
- folio_set_dirty(folio);
- } else {
- ttu &= ~TTU_HWPOISON;
- pr_info("%#lx: corrupted page was clean: dropped without side effects\n",
- pfn);
- }
- }
-
/*
* First collect all the processes that have the page
* mapped in dirty form. This has to be done before try_to_unmap,
@@ -1643,9 +1646,7 @@ static bool hwpoison_user_mappings(struc
*/
collect_procs(folio, p, &tokill, flags & MF_ACTION_REQUIRED);
- unmap_poisoned_folio(folio, ttu);
-
- unmap_success = !folio_mapped(folio);
+ unmap_success = !unmap_poisoned_folio(folio, pfn, flags & MF_MUST_KILL);
if (!unmap_success)
pr_err("%#lx: failed to unmap page (folio mapcount=%d)\n",
pfn, folio_mapcount(folio));
--- a/mm/memory_hotplug.c~mm-memory-failure-update-ttu-flag-inside-unmap_poisoned_folio
+++ a/mm/memory_hotplug.c
@@ -1833,7 +1833,8 @@ static void do_migrate_range(unsigned lo
if (WARN_ON(folio_test_lru(folio)))
folio_isolate_lru(folio);
if (folio_mapped(folio))
- unmap_poisoned_folio(folio, TTU_IGNORE_MLOCK);
+ unmap_poisoned_folio(folio, pfn, false);
+
continue;
}
_
Patches currently in -mm which might be from mawupeng1(a)huawei.com are
mm-memory-failure-update-ttu-flag-inside-unmap_poisoned_folio.patch
mm-memory-hotplug-check-folio-ref-count-first-in-do_migrate_range.patch
hwpoison-memory_hotplug-lock-folio-before-unmap-hwpoisoned-folio.patch
The patch titled
Subject: arm: pgtable: fix NULL pointer dereference issue
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
arm-pgtable-fix-null-pointer-dereference-issue.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Qi Zheng <zhengqi.arch(a)bytedance.com>
Subject: arm: pgtable: fix NULL pointer dereference issue
Date: Mon, 17 Feb 2025 10:49:24 +0800
When update_mmu_cache_range() is called by update_mmu_cache(), the vmf
parameter is NULL, which will cause a NULL pointer dereference issue in
adjust_pte():
Unable to handle kernel NULL pointer dereference at virtual address 00000030 when read
Hardware name: Atmel AT91SAM9
PC is at update_mmu_cache_range+0x1e0/0x278
LR is at pte_offset_map_rw_nolock+0x18/0x2c
Call trace:
update_mmu_cache_range from remove_migration_pte+0x29c/0x2ec
remove_migration_pte from rmap_walk_file+0xcc/0x130
rmap_walk_file from remove_migration_ptes+0x90/0xa4
remove_migration_ptes from migrate_pages_batch+0x6d4/0x858
migrate_pages_batch from migrate_pages+0x188/0x488
migrate_pages from compact_zone+0x56c/0x954
compact_zone from compact_node+0x90/0xf0
compact_node from kcompactd+0x1d4/0x204
kcompactd from kthread+0x120/0x12c
kthread from ret_from_fork+0x14/0x38
Exception stack(0xc0d8bfb0 to 0xc0d8bff8)
To fix it, do not rely on whether 'ptl' equals 'vmf->ptl' to decide whether
to hold the pte lock, but decide it by whether CONFIG_SPLIT_PTE_PTLOCKS is
enabled. In addition, if two vmas map to the same PTE page, there is no
need to hold the pte lock again, otherwise a deadlock will occur. Just
add the need_lock parameter to let adjust_pte() know this information.
Link: https://lkml.kernel.org/r/20250217024924.57996-1-zhengqi.arch@bytedance.com
Fixes: fc9c45b71f43 ("arm: adjust_pte() use pte_offset_map_rw_nolock()")
Signed-off-by: Qi Zheng <zhengqi.arch(a)bytedance.com>
Reported-by: Ezra Buehler <ezra.buehler(a)husqvarnagroup.com>
Closes: https://lore.kernel.org/lkml/CAM1KZSmZ2T_riHvay+7cKEFxoPgeVpHkVFTzVVEQ1BO0c…
Acked-by: David Hildenbrand <david(a)redhat.com>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Muchun Song <muchun.song(a)linux.dev>
Cc: Qi Zheng <zhengqi.arch(a)bytedance.com>
Cc: Russell King <linux(a)armlinux.org.uk>
Cc: Ryan Roberts <ryan.roberts(a)arm.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
arch/arm/mm/fault-armv.c | 37 +++++++++++++++++++++++++------------
1 file changed, 25 insertions(+), 12 deletions(-)
--- a/arch/arm/mm/fault-armv.c~arm-pgtable-fix-null-pointer-dereference-issue
+++ a/arch/arm/mm/fault-armv.c
@@ -62,7 +62,7 @@ static int do_adjust_pte(struct vm_area_
}
static int adjust_pte(struct vm_area_struct *vma, unsigned long address,
- unsigned long pfn, struct vm_fault *vmf)
+ unsigned long pfn, bool need_lock)
{
spinlock_t *ptl;
pgd_t *pgd;
@@ -99,12 +99,11 @@ again:
if (!pte)
return 0;
- /*
- * If we are using split PTE locks, then we need to take the page
- * lock here. Otherwise we are using shared mm->page_table_lock
- * which is already locked, thus cannot take it.
- */
- if (ptl != vmf->ptl) {
+ if (need_lock) {
+ /*
+ * Use nested version here to indicate that we are already
+ * holding one similar spinlock.
+ */
spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
pte_unmap_unlock(pte, ptl);
@@ -114,7 +113,7 @@ again:
ret = do_adjust_pte(vma, address, pfn, pte);
- if (ptl != vmf->ptl)
+ if (need_lock)
spin_unlock(ptl);
pte_unmap(pte);
@@ -123,9 +122,10 @@ again:
static void
make_coherent(struct address_space *mapping, struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep, unsigned long pfn,
- struct vm_fault *vmf)
+ unsigned long addr, pte_t *ptep, unsigned long pfn)
{
+ const unsigned long pmd_start_addr = ALIGN_DOWN(addr, PMD_SIZE);
+ const unsigned long pmd_end_addr = pmd_start_addr + PMD_SIZE;
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *mpnt;
unsigned long offset;
@@ -142,6 +142,14 @@ make_coherent(struct address_space *mapp
flush_dcache_mmap_lock(mapping);
vma_interval_tree_foreach(mpnt, &mapping->i_mmap, pgoff, pgoff) {
/*
+ * If we are using split PTE locks, then we need to take the pte
+ * lock. Otherwise we are using shared mm->page_table_lock which
+ * is already locked, thus cannot take it.
+ */
+ bool need_lock = IS_ENABLED(CONFIG_SPLIT_PTE_PTLOCKS);
+ unsigned long mpnt_addr;
+
+ /*
* If this VMA is not in our MM, we can ignore it.
* Note that we intentionally mask out the VMA
* that we are fixing up.
@@ -151,7 +159,12 @@ make_coherent(struct address_space *mapp
if (!(mpnt->vm_flags & VM_MAYSHARE))
continue;
offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT;
- aliases += adjust_pte(mpnt, mpnt->vm_start + offset, pfn, vmf);
+ mpnt_addr = mpnt->vm_start + offset;
+
+ /* Avoid deadlocks by not grabbing the same PTE lock again. */
+ if (mpnt_addr >= pmd_start_addr && mpnt_addr < pmd_end_addr)
+ need_lock = false;
+ aliases += adjust_pte(mpnt, mpnt_addr, pfn, need_lock);
}
flush_dcache_mmap_unlock(mapping);
if (aliases)
@@ -194,7 +207,7 @@ void update_mmu_cache_range(struct vm_fa
__flush_dcache_folio(mapping, folio);
if (mapping) {
if (cache_is_vivt())
- make_coherent(mapping, vma, addr, ptep, pfn, vmf);
+ make_coherent(mapping, vma, addr, ptep, pfn);
else if (vma->vm_flags & VM_EXEC)
__flush_icache_all();
}
_
Patches currently in -mm which might be from zhengqi.arch(a)bytedance.com are
mm-pgtable-fix-incorrect-reclaim-of-non-empty-pte-pages.patch
arm-pgtable-fix-null-pointer-dereference-issue.patch
The patch titled
Subject: mm/hwpoison: fix incorrect "not recovered" report for recovered clean pages
has been added to the -mm mm-unstable branch. Its filename is
mm-hwpoison-fix-incorrect-not-recovered-report-for-recovered-clean-pages.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Shuai Xue <xueshuai(a)linux.alibaba.com>
Subject: mm/hwpoison: fix incorrect "not recovered" report for recovered clean pages
Date: Mon, 17 Feb 2025 14:33:34 +0800
When an uncorrected memory error is consumed there is a race between the
CMCI from the memory controller reporting an uncorrected error with a UCNA
signature, and the core reporting an SRAR signature machine check when
the data is about to be consumed.
If the CMCI wins that race, the page is marked poisoned when
uc_decode_notifier() calls memory_failure(). For dirty pages,
memory_failure() invokes try_to_unmap() with the TTU_HWPOISON flag,
converting the PTE to a hwpoison entry. As a result,
kill_accessing_process():
- call walk_page_range() and return 1 regardless of whether
try_to_unmap() succeeds or fails,
- call kill_proc() to make sure a SIGBUS is sent
- return -EHWPOISON to indicate that SIGBUS is already sent to the
process and kill_me_maybe() doesn't have to send it again.
However, for clean pages, the TTU_HWPOISON flag is cleared, leaving the
PTE unchanged and not converted to a hwpoison entry. Consequently, for
clean pages where PTE entries are not marked as hwpoison,
kill_accessing_process() returns -EFAULT, causing kill_me_maybe() to send
a SIGBUS.
Console log looks like this:
Memory failure: 0x827ca68: corrupted page was clean: dropped without side effects
Memory failure: 0x827ca68: recovery action for clean LRU page: Recovered
Memory failure: 0x827ca68: already hardware poisoned
mce: Memory error not recovered
To fix it, return 0 for "corrupted page was clean", preventing an
unnecessary SIGBUS.
Link: https://lkml.kernel.org/r/20250217063335.22257-5-xueshuai@linux.alibaba.com
Fixes: 046545a661af ("mm/hwpoison: fix error page recovered but reported "not recovered"")
Signed-off-by: Shuai Xue <xueshuai(a)linux.alibaba.com>
Cc: <stable(a)vger.kernel.org>
Cc: Acked-by:Thomas Gleixner <tglx(a)linutronix.de>
Cc: Baolin Wang <baolin.wang(a)linux.alibaba.com>
Cc: Borislav Petkov <bp(a)alien8.de>
Cc: Dave Hansen <dave.hansen(a)linux.intel.com>
Cc: "H. Peter Anvin" <hpa(a)zytor.com>
Cc: Ingo Molnar <mingo(a)redhat.com>
Cc: Jane Chu <jane.chu(a)oracle.com>
Cc: Jarkko Sakkinen <jarkko(a)kernel.org>
Cc: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
Cc: Josh Poimboeuf <jpoimboe(a)kernel.org>
Cc: linmiaohe <linmiaohe(a)huawei.com>
Cc: "Luck, Tony" <tony.luck(a)intel.com>
Cc: Naoya Horiguchi <nao.horiguchi(a)gmail.com>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Ruidong Tian <tianruidong(a)linux.alibaba.com>
Cc: Yazen Ghannam <yazen.ghannam(a)amd.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/memory-failure.c | 11 ++++++++---
1 file changed, 8 insertions(+), 3 deletions(-)
--- a/mm/memory-failure.c~mm-hwpoison-fix-incorrect-not-recovered-report-for-recovered-clean-pages
+++ a/mm/memory-failure.c
@@ -881,12 +881,17 @@ static int kill_accessing_process(struct
mmap_read_lock(p->mm);
ret = walk_page_range(p->mm, 0, TASK_SIZE, &hwpoison_walk_ops,
(void *)&priv);
+ /*
+ * ret = 1 when CMCI wins, regardless of whether try_to_unmap()
+ * succeeds or fails, then kill the process with SIGBUS.
+ * ret = 0 when poison page is a clean page and it's dropped, no
+ * SIGBUS is needed.
+ */
if (ret == 1 && priv.tk.addr)
kill_proc(&priv.tk, pfn, flags);
- else
- ret = 0;
mmap_read_unlock(p->mm);
- return ret > 0 ? -EHWPOISON : -EFAULT;
+
+ return ret > 0 ? -EHWPOISON : 0;
}
/*
_
Patches currently in -mm which might be from xueshuai(a)linux.alibaba.com are
x86-mce-collect-error-message-for-severities-below-mce_panic_severity.patch
x86-mce-dump-error-msg-from-severities.patch
x86-mce-add-ex_type_efault_reg-as-in-kernel-recovery-context-to-fix-copy-from-user-operations-regression.patch
mm-hwpoison-fix-incorrect-not-recovered-report-for-recovered-clean-pages.patch
mm-memory-failure-move-return-value-documentation-to-function-declaration.patch
The patch titled
Subject: x86/mce: add EX_TYPE_EFAULT_REG as in-kernel recovery context to fix copy-from-user operations regression
has been added to the -mm mm-unstable branch. Its filename is
x86-mce-add-ex_type_efault_reg-as-in-kernel-recovery-context-to-fix-copy-from-user-operations-regression.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Shuai Xue <xueshuai(a)linux.alibaba.com>
Subject: x86/mce: add EX_TYPE_EFAULT_REG as in-kernel recovery context to fix copy-from-user operations regression
Date: Mon, 17 Feb 2025 14:33:33 +0800
Commit 4c132d1d844a ("x86/futex: Remove .fixup usage") introduced a new
extable fixup type, EX_TYPE_EFAULT_REG, and later patches updated the
extable fixup type for copy-from-user operations, changing it from
EX_TYPE_UACCESS to EX_TYPE_EFAULT_REG.
Specifically, commit 99641e094d6c ("x86/uaccess: Remove .fixup usage")
altered the extable fixup type for the get_user family, while commit
4c132d1d844a ("x86/futex: Remove .fixup usage") addressed the futex
operations. This change inadvertently caused a regression where the error
context for some copy-from-user operations no longer functions as an
in-kernel recovery context, leading to kernel panics with the message:
"Machine check: Data load in unrecoverable area of kernel."
To fix the regression, add EX_TYPE_EFAULT_REG as an in-kernel recovery
context for copy-from-user operations.
Link: https://lkml.kernel.org/r/20250217063335.22257-4-xueshuai@linux.alibaba.com
Signed-off-by: Shuai Xue <xueshuai(a)linux.alibaba.com>
Fixes: 4c132d1d844a ("x86/futex: Remove .fixup usage")
Cc: <stable(a)vger.kernel.org>
Cc: Acked-by:Thomas Gleixner <tglx(a)linutronix.de>
Cc: Baolin Wang <baolin.wang(a)linux.alibaba.com>
Cc: Borislav Petkov <bp(a)alien8.de>
Cc: Dave Hansen <dave.hansen(a)linux.intel.com>
Cc: "H. Peter Anvin" <hpa(a)zytor.com>
Cc: Ingo Molnar <mingo(a)redhat.com>
Cc: Jane Chu <jane.chu(a)oracle.com>
Cc: Jarkko Sakkinen <jarkko(a)kernel.org>
Cc: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
Cc: Josh Poimboeuf <jpoimboe(a)kernel.org>
Cc: linmiaohe <linmiaohe(a)huawei.com>
Cc: "Luck, Tony" <tony.luck(a)intel.com>
Cc: Naoya Horiguchi <nao.horiguchi(a)gmail.com>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Ruidong Tian <tianruidong(a)linux.alibaba.com>
Cc: Yazen Ghannam <yazen.ghannam(a)amd.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
arch/x86/kernel/cpu/mce/severity.c | 21 ++++++++++++++++-----
1 file changed, 16 insertions(+), 5 deletions(-)
--- a/arch/x86/kernel/cpu/mce/severity.c~x86-mce-add-ex_type_efault_reg-as-in-kernel-recovery-context-to-fix-copy-from-user-operations-regression
+++ a/arch/x86/kernel/cpu/mce/severity.c
@@ -16,6 +16,7 @@
#include <asm/traps.h>
#include <asm/insn.h>
#include <asm/insn-eval.h>
+#include <linux/extable.h>
#include "internal.h"
@@ -285,7 +286,8 @@ static bool is_copy_from_user(struct pt_
*/
static noinstr int error_context(struct mce *m, struct pt_regs *regs)
{
- int fixup_type;
+ const struct exception_table_entry *e;
+ int fixup_type, imm;
bool copy_user;
if ((m->cs & 3) == 3)
@@ -294,9 +296,14 @@ static noinstr int error_context(struct
if (!mc_recoverable(m->mcgstatus))
return IN_KERNEL;
+ e = search_exception_tables(m->ip);
+ if (!e)
+ return IN_KERNEL;
+
/* Allow instrumentation around external facilities usage. */
instrumentation_begin();
- fixup_type = ex_get_fixup_type(m->ip);
+ fixup_type = FIELD_GET(EX_DATA_TYPE_MASK, e->data);
+ imm = FIELD_GET(EX_DATA_IMM_MASK, e->data);
copy_user = is_copy_from_user(regs);
instrumentation_end();
@@ -304,9 +311,13 @@ static noinstr int error_context(struct
case EX_TYPE_UACCESS:
if (!copy_user)
return IN_KERNEL;
- m->kflags |= MCE_IN_KERNEL_COPYIN;
- fallthrough;
-
+ m->kflags |= MCE_IN_KERNEL_COPYIN | MCE_IN_KERNEL_RECOV;
+ return IN_KERNEL_RECOV;
+ case EX_TYPE_IMM_REG:
+ if (!copy_user || imm != -EFAULT)
+ return IN_KERNEL;
+ m->kflags |= MCE_IN_KERNEL_COPYIN | MCE_IN_KERNEL_RECOV;
+ return IN_KERNEL_RECOV;
case EX_TYPE_FAULT_MCE_SAFE:
case EX_TYPE_DEFAULT_MCE_SAFE:
m->kflags |= MCE_IN_KERNEL_RECOV;
_
Patches currently in -mm which might be from xueshuai(a)linux.alibaba.com are
x86-mce-collect-error-message-for-severities-below-mce_panic_severity.patch
x86-mce-dump-error-msg-from-severities.patch
x86-mce-add-ex_type_efault_reg-as-in-kernel-recovery-context-to-fix-copy-from-user-operations-regression.patch
mm-hwpoison-fix-incorrect-not-recovered-report-for-recovered-clean-pages.patch
mm-memory-failure-move-return-value-documentation-to-function-declaration.patch
The patch titled
Subject: m68k: sun3: add check for __pgd_alloc()
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
m68k-sun3-add-check-for-__pgd_alloc.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Haoxiang Li <haoxiang_li2024(a)163.com>
Subject: m68k: sun3: add check for __pgd_alloc()
Date: Tue, 18 Feb 2025 00:00:17 +0800
Add check for the return value of __pgd_alloc() in pgd_alloc() to prevent
null pointer dereference.
Link: https://lkml.kernel.org/r/20250217160017.2375536-1-haoxiang_li2024@163.com
Fixes: a9b3c355c2e6 ("asm-generic: pgalloc: provide generic __pgd_{alloc,free}")
Signed-off-by: Haoxiang Li <haoxiang_li2024(a)163.com>
Cc: Dave Hansen <dave.hansen(a)linux.intel.com>
Cc: Geert Uytterhoeven <geert(a)linux-m68k.org>
Cc: Kevin Brodsky <kevin.brodsky(a)arm.com>
Cc: Qi Zheng <zhengqi.arch(a)bytedance.com>
Cc: Sam Creasey <sammy(a)sammy.net>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
arch/m68k/include/asm/sun3_pgalloc.h | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
--- a/arch/m68k/include/asm/sun3_pgalloc.h~m68k-sun3-add-check-for-__pgd_alloc
+++ a/arch/m68k/include/asm/sun3_pgalloc.h
@@ -44,8 +44,10 @@ static inline pgd_t * pgd_alloc(struct m
pgd_t *new_pgd;
new_pgd = __pgd_alloc(mm, 0);
- memcpy(new_pgd, swapper_pg_dir, PAGE_SIZE);
- memset(new_pgd, 0, (PAGE_OFFSET >> PGDIR_SHIFT));
+ if (likely(new_pgd != NULL)) {
+ memcpy(new_pgd, swapper_pg_dir, PAGE_SIZE);
+ memset(new_pgd, 0, (PAGE_OFFSET >> PGDIR_SHIFT));
+ }
return new_pgd;
}
_
Patches currently in -mm which might be from haoxiang_li2024(a)163.com are
m68k-sun3-add-check-for-__pgd_alloc.patch
The patch titled
Subject: selftests/damon/damos_quota_goal: handle minimum quota that cannot be further reduced
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
selftests-damon-damos_quota_goal-handle-minimum-quota-that-cannot-be-further-reduced.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: SeongJae Park <sj(a)kernel.org>
Subject: selftests/damon/damos_quota_goal: handle minimum quota that cannot be further reduced
Date: Mon, 17 Feb 2025 10:23:04 -0800
The damos_quota_goal.py selftest checks whether the DAMOS quota goals tuning
feature increases or reduces the effective size quota for a given score as
expected. The tuning feature sets the minimum quota size to one byte, so if
the effective size quota is already one, we cannot expect it to be reduced
further. However, the test is not aware of this edge case, and fails since
it sees no expected change of the effective quota. Handle the case by
updating the failure logic for no change to check whether it was this case,
and simply skip to the next test input.
Link: https://lkml.kernel.org/r/20250217182304.45215-1-sj@kernel.org
Fixes: f1c07c0a1662b ("selftests/damon: add a test for DAMOS quota goal")
Signed-off-by: SeongJae Park <sj(a)kernel.org>
Reported-by: kernel test robot <oliver.sang(a)intel.com>
Closes: https://lore.kernel.org/oe-lkp/202502171423.b28a918d-lkp@intel.com
Cc: Shuah Khan (Samsung OSG) <shuah(a)kernel.org>
Cc: <stable(a)vger.kernel.org> [6.10.x]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
tools/testing/selftests/damon/damos_quota_goal.py | 3 +++
1 file changed, 3 insertions(+)
--- a/tools/testing/selftests/damon/damos_quota_goal.py~selftests-damon-damos_quota_goal-handle-minimum-quota-that-cannot-be-further-reduced
+++ a/tools/testing/selftests/damon/damos_quota_goal.py
@@ -63,6 +63,9 @@ def main():
if last_effective_bytes != 0 else -1.0))
if last_effective_bytes == goal.effective_bytes:
+ # effective quota was already minimum that cannot be more reduced
+ if expect_increase is False and last_effective_bytes == 1:
+ continue
print('efective bytes not changed: %d' % goal.effective_bytes)
exit(1)
_
Patches currently in -mm which might be from sj(a)kernel.org are
selftests-damon-damos_quota_goal-handle-minimum-quota-that-cannot-be-further-reduced.patch
mm-madvise-split-out-mmap-locking-operations-for-madvise.patch
mm-madvise-split-out-madvise-input-validity-check.patch
mm-madvise-split-out-madvise-behavior-execution.patch
mm-madvise-remove-redundant-mmap_lock-operations-from-process_madvise.patch
mm-damon-avoid-applying-damos-action-to-same-entity-multiple-times.patch
mm-damon-core-unset-damos-walk_completed-after-confimed-set.patch
mm-damon-core-do-not-call-damos_walk_control-walk-if-walk-is-completed.patch
mm-damon-core-do-damos-walking-in-entire-regions-granularity.patch
In mii_nway_restart() during the line:
bmcr = mii->mdio_read(mii->dev, mii->phy_id, MII_BMCR);
The code attempts to call mii->mdio_read which is ch9200_mdio_read().
ch9200_mdio_read() utilises a local buffer, which is initialised
with control_read():
unsigned char buff[2];
However buff is conditionally initialised inside control_read():
if (err == size) {
memcpy(data, buf, size);
}
If the condition of "err == size" is not met, then buff remains
uninitialised. Once this happens the uninitialised buff is accessed
and returned during ch9200_mdio_read():
return (buff[0] | buff[1] << 8);
The problem stems from the fact that ch9200_mdio_read() ignores the
return value of control_read(), leading to uninit-access of buff.
To fix this we should check the return value of control_read()
and return early on error.
Signed-off-by: Qasim Ijaz <qasdev00(a)gmail.com>
Reported-by: syzbot <syzbot+3361c2d6f78a3e0892f9(a)syzkaller.appspotmail.com>
Tested-by: syzbot <syzbot+3361c2d6f78a3e0892f9(a)syzkaller.appspotmail.com>
Closes: https://syzkaller.appspot.com/bug?extid=3361c2d6f78a3e0892f9
Fixes: 4a476bd6d1d9 ("usbnet: New driver for QinHeng CH9200 devices")
Cc: stable(a)vger.kernel.org
---
drivers/net/mii.c | 2 ++
drivers/net/usb/ch9200.c | 7 +++++--
2 files changed, 7 insertions(+), 2 deletions(-)
diff --git a/drivers/net/mii.c b/drivers/net/mii.c
index 37bc3131d31a..e305bf0f1d04 100644
--- a/drivers/net/mii.c
+++ b/drivers/net/mii.c
@@ -464,6 +464,8 @@ int mii_nway_restart (struct mii_if_info *mii)
/* if autoneg is off, it's an error */
bmcr = mii->mdio_read(mii->dev, mii->phy_id, MII_BMCR);
+ if (bmcr < 0)
+ return bmcr;
if (bmcr & BMCR_ANENABLE) {
bmcr |= BMCR_ANRESTART;
diff --git a/drivers/net/usb/ch9200.c b/drivers/net/usb/ch9200.c
index f69d9b902da0..e32d3c282dc1 100644
--- a/drivers/net/usb/ch9200.c
+++ b/drivers/net/usb/ch9200.c
@@ -178,6 +178,7 @@ static int ch9200_mdio_read(struct net_device *netdev, int phy_id, int loc)
{
struct usbnet *dev = netdev_priv(netdev);
unsigned char buff[2];
+ int ret;
netdev_dbg(netdev, "%s phy_id:%02x loc:%02x\n",
__func__, phy_id, loc);
@@ -185,8 +186,10 @@ static int ch9200_mdio_read(struct net_device *netdev, int phy_id, int loc)
if (phy_id != 0)
return -ENODEV;
- control_read(dev, REQUEST_READ, 0, loc * 2, buff, 0x02,
- CONTROL_TIMEOUT_MS);
+ ret = control_read(dev, REQUEST_READ, 0, loc * 2, buff, 0x02,
+ CONTROL_TIMEOUT_MS);
+ if (ret != 2)
+ return ret < 0 ? ret : -EINVAL;
return (buff[0] | buff[1] << 8);
}
--
2.39.5
Christoph reports that their rk3399 system dies since we merged
773c05f417fa1 ("irqchip/gic-v3: Work around insecure GIC
integrations").
It appears that some rk3399 have some secure payloads, and that
the firmware sets SCR_EL3.FIQ==1. Obviously, disabling security
in that configuration leads to even more problems.
Let's revisit the workaround by:
- making it rk3399 specific
- checking whether Group-0 is available, which is a good proxy
for SCR_EL3.FIQ being 0
- either apply the workaround if Group-0 is available, or disable
pseudo-NMIs if not
Note that this doesn't mean that the secure side is able to receive
interrupts anyway, as we make all interrupts non-secure anyway.
Clearly, nobody ever tested secure interrupts on this platform.
With that, Christoph is able to use their rk3399.
Reported-by: Christoph Fritz <chf.fritz(a)googlemail.com>
Tested-by: Christoph Fritz <chf.fritz(a)googlemail.com>
Signed-off-by: Marc Zyngier <maz(a)kernel.org>
Cc: stable(a)vger.kernel.org
Fixes: 773c05f417fa1 ("irqchip/gic-v3: Work around insecure GIC integrations")
Link: https://lore.kernel.org/r/b1266652fb64857246e8babdf268d0df8f0c36d9.camel@go…
---
drivers/irqchip/irq-gic-v3.c | 53 +++++++++++++++++++++++++++---------
1 file changed, 40 insertions(+), 13 deletions(-)
diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c
index 76dce0aac2465..270d7a4d85a6d 100644
--- a/drivers/irqchip/irq-gic-v3.c
+++ b/drivers/irqchip/irq-gic-v3.c
@@ -44,6 +44,7 @@ static u8 dist_prio_nmi __ro_after_init = GICV3_PRIO_NMI;
#define FLAGS_WORKAROUND_GICR_WAKER_MSM8996 (1ULL << 0)
#define FLAGS_WORKAROUND_CAVIUM_ERRATUM_38539 (1ULL << 1)
#define FLAGS_WORKAROUND_ASR_ERRATUM_8601001 (1ULL << 2)
+#define FLAGS_WORKAROUND_INSECURE (1ULL << 3)
#define GIC_IRQ_TYPE_PARTITION (GIC_IRQ_TYPE_LPI + 1)
@@ -83,6 +84,8 @@ static DEFINE_STATIC_KEY_TRUE(supports_deactivate_key);
#define GIC_LINE_NR min(GICD_TYPER_SPIS(gic_data.rdists.gicd_typer), 1020U)
#define GIC_ESPI_NR GICD_TYPER_ESPIS(gic_data.rdists.gicd_typer)
+static bool nmi_support_forbidden;
+
/*
* There are 16 SGIs, though we only actually use 8 in Linux. The other 8 SGIs
* are potentially stolen by the secure side. Some code, especially code dealing
@@ -163,21 +166,27 @@ static void __init gic_prio_init(void)
{
bool ds;
- ds = gic_dist_security_disabled();
- if (!ds) {
- u32 val;
-
- val = readl_relaxed(gic_data.dist_base + GICD_CTLR);
- val |= GICD_CTLR_DS;
- writel_relaxed(val, gic_data.dist_base + GICD_CTLR);
+ cpus_have_group0 = gic_has_group0();
- ds = gic_dist_security_disabled();
- if (ds)
- pr_warn("Broken GIC integration, security disabled");
+ ds = gic_dist_security_disabled();
+ if ((gic_data.flags & FLAGS_WORKAROUND_INSECURE) && !ds) {
+ if (cpus_have_group0) {
+ u32 val;
+
+ val = readl_relaxed(gic_data.dist_base + GICD_CTLR);
+ val |= GICD_CTLR_DS;
+ writel_relaxed(val, gic_data.dist_base + GICD_CTLR);
+
+ ds = gic_dist_security_disabled();
+ if (ds)
+ pr_warn("Broken GIC integration, security disabled\n");
+ } else {
+ pr_warn("Broken GIC integration, pNMI forbidden\n");
+ nmi_support_forbidden = true;
+ }
}
cpus_have_security_disabled = ds;
- cpus_have_group0 = gic_has_group0();
/*
* How priority values are used by the GIC depends on two things:
@@ -209,7 +218,7 @@ static void __init gic_prio_init(void)
* be in the non-secure range, we program the non-secure values into
* the distributor to match the PMR values we want.
*/
- if (cpus_have_group0 & !cpus_have_security_disabled) {
+ if (cpus_have_group0 && !cpus_have_security_disabled) {
dist_prio_irq = __gicv3_prio_to_ns(dist_prio_irq);
dist_prio_nmi = __gicv3_prio_to_ns(dist_prio_nmi);
}
@@ -1922,6 +1931,18 @@ static bool gic_enable_quirk_arm64_2941627(void *data)
return true;
}
+static bool gic_enable_quirk_rk3399(void *data)
+{
+ struct gic_chip_data *d = data;
+
+ if (of_machine_is_compatible("rockchip,rk3399")) {
+ d->flags |= FLAGS_WORKAROUND_INSECURE;
+ return true;
+ }
+
+ return false;
+}
+
static bool rd_set_non_coherent(void *data)
{
struct gic_chip_data *d = data;
@@ -1996,6 +2017,12 @@ static const struct gic_quirk gic_quirks[] = {
.property = "dma-noncoherent",
.init = rd_set_non_coherent,
},
+ {
+ .desc = "GICv3: Insecure RK3399 integration",
+ .iidr = 0x0000043b,
+ .mask = 0xff000fff,
+ .init = gic_enable_quirk_rk3399,
+ },
{
}
};
@@ -2004,7 +2031,7 @@ static void gic_enable_nmi_support(void)
{
int i;
- if (!gic_prio_masking_enabled())
+ if (!gic_prio_masking_enabled() || nmi_support_forbidden)
return;
rdist_nmi_refs = kcalloc(gic_data.ppi_nr + SGI_NR,
--
2.39.2
The desired clock frequency was correctly set to 400MHz in the device tree
but was lowered by the driver to 300MHz breaking 4K 60Hz content playback.
Fix the issue by removing the driver call to clk_set_rate(), which reduces
the amount of board specific code.
Fixes: 003afda97c65 ("media: verisilicon: Enable AV1 decoder on rk3588")
Signed-off-by: Nicolas Dufresne <nicolas.dufresne(a)collabora.com>
---
This patch fixes user report of AV1 4K60 decoder not being fast enough
on RK3588 based SoC. This is a break from Hantro original authors
habit of coding the frequencies in the driver instead of specifying this
frequency in the device tree. The other calls to clk_set_rate() are left
since this would require modifying many dtsi files, which would then be
unsuitable for backport.
---
drivers/media/platform/verisilicon/rockchip_vpu_hw.c | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/drivers/media/platform/verisilicon/rockchip_vpu_hw.c b/drivers/media/platform/verisilicon/rockchip_vpu_hw.c
index 964122e7c355934cd80eb442219f6ba51bba8b71..9d8eab33556d62733ec7ec6b5e685c86ba7086e4 100644
--- a/drivers/media/platform/verisilicon/rockchip_vpu_hw.c
+++ b/drivers/media/platform/verisilicon/rockchip_vpu_hw.c
@@ -17,7 +17,6 @@
#define RK3066_ACLK_MAX_FREQ (300 * 1000 * 1000)
#define RK3288_ACLK_MAX_FREQ (400 * 1000 * 1000)
-#define RK3588_ACLK_MAX_FREQ (300 * 1000 * 1000)
#define ROCKCHIP_VPU981_MIN_SIZE 64
@@ -440,10 +439,9 @@ static int rk3066_vpu_hw_init(struct hantro_dev *vpu)
return 0;
}
+/* TODO just remove, the CLK are defined correctly in the DTS */
static int rk3588_vpu981_hw_init(struct hantro_dev *vpu)
{
- /* Bump ACLKs to max. possible freq. to improve performance. */
- clk_set_rate(vpu->clocks[0].clk, RK3588_ACLK_MAX_FREQ);
return 0;
}
@@ -807,7 +805,6 @@ const struct hantro_variant rk3588_vpu981_variant = {
.codec_ops = rk3588_vpu981_codec_ops,
.irqs = rk3588_vpu981_irqs,
.num_irqs = ARRAY_SIZE(rk3588_vpu981_irqs),
- .init = rk3588_vpu981_hw_init,
.clk_names = rk3588_vpu981_vpu_clk_names,
.num_clocks = ARRAY_SIZE(rk3588_vpu981_vpu_clk_names)
};
---
base-commit: 2014c95afecee3e76ca4a56956a936e23283f05b
change-id: 20250217-b4-hantro-av1-clock-rate-e5497f1499df
Best regards,
--
Nicolas Dufresne <nicolas.dufresne(a)collabora.com>
The damos_quota_goal.py selftest checks whether the DAMOS quota goals tuning
feature increases or reduces the effective size quota for a given score as
expected. The tuning feature sets the minimum quota size as one byte,
so if the effective size quota is already one, we cannot expect it
to be reduced further. However, the test is not aware of this edge case, and
fails since it sees no expected change of the effective quota. Handle
the case by updating the failure logic for no change to check whether this
was the case, and simply skip to the next test input.
Fixes: f1c07c0a1662b ("selftests/damon: add a test for DAMOS quota goal")
Cc: <stable(a)vger.kernel.org> # 6.10.x
Reported-by: kernel test robot <oliver.sang(a)intel.com>
Closes: https://lore.kernel.org/oe-lkp/202502171423.b28a918d-lkp@intel.com
Signed-off-by: SeongJae Park <sj(a)kernel.org>
---
tools/testing/selftests/damon/damos_quota_goal.py | 3 +++
1 file changed, 3 insertions(+)
diff --git a/tools/testing/selftests/damon/damos_quota_goal.py b/tools/testing/selftests/damon/damos_quota_goal.py
index 18246f3b62f7..f76e0412b564 100755
--- a/tools/testing/selftests/damon/damos_quota_goal.py
+++ b/tools/testing/selftests/damon/damos_quota_goal.py
@@ -63,6 +63,9 @@ def main():
if last_effective_bytes != 0 else -1.0))
if last_effective_bytes == goal.effective_bytes:
+ # effective quota was already minimum that cannot be more reduced
+ if expect_increase is False and last_effective_bytes == 1:
+ continue
print('efective bytes not changed: %d' % goal.effective_bytes)
exit(1)
base-commit: 20017459916819f8ae15ca3840e71fbf0ea8354e
--
2.39.5
From: Darrick J. Wong <djwong(a)kernel.org>
commit 07137e925fa951646325762bda6bd2503dfe64c6 upstream
Quota counter updates are tracked via incore objects which hang off the
xfs_trans object. These changes are then turned into dirty log items in
xfs_trans_apply_dquot_deltas just prior to committing the log items to
the CIL.
However, updating the incore deltas does not cause XFS_TRANS_DIRTY to be
set on the transaction. In other words, a pure quota counter update
will be silently discarded if there are no other dirty log items
attached to the transaction.
This is currently not the case anywhere in the filesystem because quota
updates always dirty at least one other metadata item, but a subsequent
bug fix will add dquot log item precommits, so we actually need a dirty
dquot log item prior to xfs_trans_run_precommits. Also let's not leave
a logic bomb.
Cc: <stable(a)vger.kernel.org> # v2.6.35
Fixes: 0924378a689ccb ("xfs: split out iclog writing from xfs_trans_commit()")
Signed-off-by: "Darrick J. Wong" <djwong(a)kernel.org>
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
---
fs/xfs/xfs_quota.h | 5 +++--
fs/xfs/xfs_trans.c | 10 +++-------
fs/xfs/xfs_trans_dquot.c | 31 ++++++++++++++++++++++++++-----
3 files changed, 32 insertions(+), 14 deletions(-)
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 23d71a55bbc006..032f3a70f21ddd 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -96,7 +96,8 @@ extern void xfs_trans_free_dqinfo(struct xfs_trans *);
extern void xfs_trans_mod_dquot_byino(struct xfs_trans *, struct xfs_inode *,
uint, int64_t);
extern void xfs_trans_apply_dquot_deltas(struct xfs_trans *);
-extern void xfs_trans_unreserve_and_mod_dquots(struct xfs_trans *);
+void xfs_trans_unreserve_and_mod_dquots(struct xfs_trans *tp,
+ bool already_locked);
int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp, struct xfs_inode *ip,
int64_t dblocks, int64_t rblocks, bool force);
extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
@@ -166,7 +167,7 @@ static inline void xfs_trans_mod_dquot_byino(struct xfs_trans *tp,
{
}
#define xfs_trans_apply_dquot_deltas(tp)
-#define xfs_trans_unreserve_and_mod_dquots(tp)
+#define xfs_trans_unreserve_and_mod_dquots(tp, a)
static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp,
struct xfs_inode *ip, int64_t dblocks, int64_t rblocks,
bool force)
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index ee46051db12dde..39cd11cbe21fcb 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -840,6 +840,7 @@ __xfs_trans_commit(
*/
if (tp->t_flags & XFS_TRANS_SB_DIRTY)
xfs_trans_apply_sb_deltas(tp);
+ xfs_trans_apply_dquot_deltas(tp);
error = xfs_trans_run_precommits(tp);
if (error)
@@ -868,11 +869,6 @@ __xfs_trans_commit(
ASSERT(tp->t_ticket != NULL);
- /*
- * If we need to update the superblock, then do it now.
- */
- xfs_trans_apply_dquot_deltas(tp);
-
xlog_cil_commit(log, tp, &commit_seq, regrant);
xfs_trans_free(tp);
@@ -898,7 +894,7 @@ __xfs_trans_commit(
* the dqinfo portion to be. All that means is that we have some
* (non-persistent) quota reservations that need to be unreserved.
*/
- xfs_trans_unreserve_and_mod_dquots(tp);
+ xfs_trans_unreserve_and_mod_dquots(tp, true);
if (tp->t_ticket) {
if (regrant && !xlog_is_shutdown(log))
xfs_log_ticket_regrant(log, tp->t_ticket);
@@ -992,7 +988,7 @@ xfs_trans_cancel(
}
#endif
xfs_trans_unreserve_and_mod_sb(tp);
- xfs_trans_unreserve_and_mod_dquots(tp);
+ xfs_trans_unreserve_and_mod_dquots(tp, false);
if (tp->t_ticket) {
xfs_log_ticket_ungrant(log, tp->t_ticket);
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index b368e13424c4f4..b92eeaa1a2a9e7 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -602,6 +602,24 @@ xfs_trans_apply_dquot_deltas(
ASSERT(dqp->q_blk.reserved >= dqp->q_blk.count);
ASSERT(dqp->q_ino.reserved >= dqp->q_ino.count);
ASSERT(dqp->q_rtb.reserved >= dqp->q_rtb.count);
+
+ /*
+ * We've applied the count changes and given back
+ * whatever reservation we didn't use. Zero out the
+ * dqtrx fields.
+ */
+ qtrx->qt_blk_res = 0;
+ qtrx->qt_bcount_delta = 0;
+ qtrx->qt_delbcnt_delta = 0;
+
+ qtrx->qt_rtblk_res = 0;
+ qtrx->qt_rtblk_res_used = 0;
+ qtrx->qt_rtbcount_delta = 0;
+ qtrx->qt_delrtb_delta = 0;
+
+ qtrx->qt_ino_res = 0;
+ qtrx->qt_ino_res_used = 0;
+ qtrx->qt_icount_delta = 0;
}
}
}
@@ -638,7 +656,8 @@ xfs_trans_unreserve_and_mod_dquots_hook(
*/
void
xfs_trans_unreserve_and_mod_dquots(
- struct xfs_trans *tp)
+ struct xfs_trans *tp,
+ bool already_locked)
{
int i, j;
struct xfs_dquot *dqp;
@@ -667,10 +686,12 @@ xfs_trans_unreserve_and_mod_dquots(
* about the number of blocks used field, or deltas.
* Also we don't bother to zero the fields.
*/
- locked = false;
+ locked = already_locked;
if (qtrx->qt_blk_res) {
- xfs_dqlock(dqp);
- locked = true;
+ if (!locked) {
+ xfs_dqlock(dqp);
+ locked = true;
+ }
dqp->q_blk.reserved -=
(xfs_qcnt_t)qtrx->qt_blk_res;
}
@@ -691,7 +712,7 @@ xfs_trans_unreserve_and_mod_dquots(
dqp->q_rtb.reserved -=
(xfs_qcnt_t)qtrx->qt_rtblk_res;
}
- if (locked)
+ if (locked && !already_locked)
xfs_dqunlock(dqp);
}
Hello,
This bug report was raised on an IOMMU regression found recently:
https://bugzilla.kernel.org/show_bug.cgi?id=219499
This has been fixed by the following commit in v6.14-rc3:
commit ef75966abf95 ("iommu/amd: Expicitly enable CNTRL.EPHEn bit in
resume path")
I confirmed it backports cleanly to the LTS 6.12.y kernel. Can we
please have it backported there and to 6.13.y?
Thanks!
Hi, all.
I recently ran into an issue: tc-flower does not work when dst_port
and a src_port range are configured in one rule.
The details are as follows:
$ tc qdisc add dev ens38 ingress
$ tc filter add dev ens38 ingress protocol ip flower ip_proto udp \
dst_port 5000 src_port 2000-3000 action drop
I try to find the root cause in kernel source code:
1. FLOW_DISSECTOR_KEY_PORTS and FLOW_DISSECTOR_KEY_PORTS_RANGE flag of
mask->dissector were set
in fl_classify from flow_dissector.c.
2. then skb_flow_dissect -> __skb_flow_dissect -> __skb_flow_dissect_ports.
3. FLOW_DISSECTOR_KEY_PORTS is handled but FLOW_DISSECTOR_KEY_PORTS_RANGE
is not handled
in __skb_flow_dissect_ports, so tp_range.tp.src is 0 here instead of
the expected actual skb source port.
By the way, the __skb_flow_bpf_to_target function may have the same issue.
Please help confirm and fix it, thank you.
source code of __skb_flow_dissect_ports in flow_dissector.c as below:
/*
 * Store the ports returned by __skb_flow_get_ports() into the target
 * of a single dissector key.
 *
 * The key is chosen by the if / else-if chain below, so at most one of
 * FLOW_DISSECTOR_KEY_PORTS and FLOW_DISSECTOR_KEY_PORTS_RANGE is ever
 * written. Nothing is written when the dissector uses neither key.
 */
static void
__skb_flow_dissect_ports(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
void *target_container, const void *data,
int nhoff, u8 ip_proto, int hlen)
{
enum flow_dissector_key_id dissector_ports = FLOW_DISSECTOR_KEY_MAX;
struct flow_dissector_key_ports *key_ports;
/*
 * FLOW_DISSECTOR_KEY_PORTS takes priority: because of the "else",
 * FLOW_DISSECTOR_KEY_PORTS_RANGE is selected only when
 * FLOW_DISSECTOR_KEY_PORTS is not in use. When both keys are in use,
 * the PORTS_RANGE target is therefore not filled in by this function.
 */
if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS))
dissector_ports = FLOW_DISSECTOR_KEY_PORTS;
else if (dissector_uses_key(flow_dissector,
FLOW_DISSECTOR_KEY_PORTS_RANGE))
dissector_ports = FLOW_DISSECTOR_KEY_PORTS_RANGE;
/* Neither key is in use: leave the target container untouched. */
if (dissector_ports == FLOW_DISSECTOR_KEY_MAX)
return;
/* Look up the target for the one selected key and store the ports there. */
key_ports = skb_flow_dissector_target(flow_dissector,
dissector_ports,
target_container);
key_ports->ports = __skb_flow_get_ports(skb, nhoff, ip_proto,
data, hlen);
}
Best regards.
Currently memremap(MEMREMAP_WB) can produce decrypted/shared mapping:
memremap(MEMREMAP_WB)
arch_memremap_wb()
ioremap_cache()
__ioremap_caller(.encrypted = false)
In such cases, the IORES_MAP_ENCRYPTED flag on the memory will determine
if the resulting mapping is encrypted or decrypted.
Creating a decrypted mapping without explicit request from the caller is
risky:
- It can inadvertently expose the guest's data and compromise the
guest.
- Accessing private memory via shared/decrypted mapping on TDX will
either trigger implicit conversion to shared or #VE (depending on
VMM implementation).
Implicit conversion is destructive: subsequent access to the same
memory via private mapping will trigger a hard-to-debug #VE crash.
The kernel already provides a way to request decrypted mapping
explicitly via the MEMREMAP_DEC flag.
Modify memremap(MEMREMAP_WB) to produce encrypted/private mapping by
default unless MEMREMAP_DEC is specified or if the kernel runs on
a machine with SME enabled.
It fixes the crash due to #VE on kexec in TDX guests if CONFIG_EISA is
enabled.
Signed-off-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Cc: stable(a)vger.kernel.org # 6.11+
Cc: Tom Lendacky <thomas.lendacky(a)amd.com>
Cc: Ashish Kalra <ashish.kalra(a)amd.com>
Cc: "Maciej W. Rozycki" <macro(a)orcam.me.uk>
---
arch/x86/include/asm/io.h | 3 +++
arch/x86/mm/ioremap.c | 8 ++++++++
2 files changed, 11 insertions(+)
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index ed580c7f9d0a..1a0dc2b2bf5b 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -175,6 +175,9 @@ extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, un
extern void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size);
#define ioremap_encrypted ioremap_encrypted
+void *arch_memremap_wb(phys_addr_t phys_addr, size_t size, unsigned long flags);
+#define arch_memremap_wb arch_memremap_wb
+
/**
* ioremap - map bus memory into CPU space
* @offset: bus address of the memory
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 8d29163568a7..a4b23d2e92d2 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -503,6 +503,14 @@ void iounmap(volatile void __iomem *addr)
}
EXPORT_SYMBOL(iounmap);
+void *arch_memremap_wb(phys_addr_t phys_addr, size_t size, unsigned long flags)
+{
+ if ((flags & MEMREMAP_DEC) || cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
+ return (void __force *)ioremap_cache(phys_addr, size);
+
+ return (void __force *)ioremap_encrypted(phys_addr, size);
+}
+
/*
* Convert a physical pointer to a virtual kernel pointer for /dev/mem
* access
--
2.47.2
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 1aaf8c122918aa8897605a9aa1e8ed6600d6f930
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025021008-recharger-fastball-ffab@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 1aaf8c122918aa8897605a9aa1e8ed6600d6f930 Mon Sep 17 00:00:00 2001
From: Zhaoyang Huang <zhaoyang.huang(a)unisoc.com>
Date: Tue, 21 Jan 2025 10:01:59 +0800
Subject: [PATCH] mm: gup: fix infinite loop within __get_longterm_locked
We can run into an infinite loop in __get_longterm_locked() when
collect_longterm_unpinnable_folios() finds only folios that are isolated
from the LRU or were never added to the LRU. This can happen when all
folios to be pinned are never added to the LRU, for example when
vm_ops->fault allocated pages using cma_alloc() and never added them to
the LRU.
Fix it by simply taking a look at the list in the single caller, to see if
anything was added.
[zhaoyang.huang(a)unisoc.com: move definition of local]
Link: https://lkml.kernel.org/r/20250122012604.3654667-1-zhaoyang.huang@unisoc.com
Link: https://lkml.kernel.org/r/20250121020159.3636477-1-zhaoyang.huang@unisoc.com
Fixes: 67e139b02d99 ("mm/gup.c: refactor check_and_migrate_movable_pages()")
Signed-off-by: Zhaoyang Huang <zhaoyang.huang(a)unisoc.com>
Reviewed-by: John Hubbard <jhubbard(a)nvidia.com>
Reviewed-by: David Hildenbrand <david(a)redhat.com>
Suggested-by: David Hildenbrand <david(a)redhat.com>
Acked-by: David Hildenbrand <david(a)redhat.com>
Cc: Aijun Sun <aijun.sun(a)unisoc.com>
Cc: Alistair Popple <apopple(a)nvidia.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/gup.c b/mm/gup.c
index 9aaf338cc1f4..3883b307780e 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2320,13 +2320,13 @@ static void pofs_unpin(struct pages_or_folios *pofs)
/*
* Returns the number of collected folios. Return value is always >= 0.
*/
-static unsigned long collect_longterm_unpinnable_folios(
+static void collect_longterm_unpinnable_folios(
struct list_head *movable_folio_list,
struct pages_or_folios *pofs)
{
- unsigned long i, collected = 0;
struct folio *prev_folio = NULL;
bool drain_allow = true;
+ unsigned long i;
for (i = 0; i < pofs->nr_entries; i++) {
struct folio *folio = pofs_get_folio(pofs, i);
@@ -2338,8 +2338,6 @@ static unsigned long collect_longterm_unpinnable_folios(
if (folio_is_longterm_pinnable(folio))
continue;
- collected++;
-
if (folio_is_device_coherent(folio))
continue;
@@ -2361,8 +2359,6 @@ static unsigned long collect_longterm_unpinnable_folios(
NR_ISOLATED_ANON + folio_is_file_lru(folio),
folio_nr_pages(folio));
}
-
- return collected;
}
/*
@@ -2439,11 +2435,9 @@ static long
check_and_migrate_movable_pages_or_folios(struct pages_or_folios *pofs)
{
LIST_HEAD(movable_folio_list);
- unsigned long collected;
- collected = collect_longterm_unpinnable_folios(&movable_folio_list,
- pofs);
- if (!collected)
+ collect_longterm_unpinnable_folios(&movable_folio_list, pofs);
+ if (list_empty(&movable_folio_list))
return 0;
return migrate_longterm_unpinnable_folios(&movable_folio_list, pofs);
The xHC resources allocated for USB devices are not released in the correct order after resuming in the case where a device was reconnected while the system was suspended.
This issue has been detected during the following scenario:
- connect hub HS to root port
- connect LS/FS device to hub port
- wait for enumeration to finish
- force DUT to suspend
- reconnect hub attached to root port
- wake DUT
In this scenario, during enumeration of the USB LS/FS device, the Cadence xHC reports a completion error code for xHCI commands because the device was not properly disconnected and, as a result, the xHC resources have not been correctly freed.
The xHCI specification doesn't say that devices can be reset in any order, so we should not treat this issue as a Cadence xHC controller bug.
As with disconnection, in this case the devices should be cleared starting from the last USB device in the tree and working toward the root hub.
To fix this issue, the usbcore driver should disconnect all USB devices connected to a hub which was reconnected while suspended.
Fixes: 3d82904559f4 ("usb: cdnsp: cdns3 Add main part of Cadence USBSSP DRD Driver")
cc: <stable(a)vger.kernel.org>
Signed-off-by: Pawel Laszczak <pawell(a)cadence.com>
---
drivers/usb/core/hub.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index 0cd44f1fd56d..2473cbf317a8 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c
@@ -3627,10 +3627,12 @@ static int finish_port_resume(struct usb_device *udev)
* the device will be rediscovered.
*/
retry_reset_resume:
- if (udev->quirks & USB_QUIRK_RESET)
+ if (udev->quirks & USB_QUIRK_RESET) {
status = -ENODEV;
- else
+ } else {
+ hub_disconnect_children(udev);
status = usb_reset_and_verify_device(udev);
+ }
}
/* 10.5.4.5 says be sure devices in the tree are still there.
--
2.43.0
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 318e8c339c9a0891c389298bb328ed0762a9935e
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025021722-poplar-spoilage-a69f@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 318e8c339c9a0891c389298bb328ed0762a9935e Mon Sep 17 00:00:00 2001
From: Patrick Bellasi <derkling(a)google.com>
Date: Wed, 5 Feb 2025 14:04:41 +0000
Subject: [PATCH] x86/cpu/kvm: SRSO: Fix possible missing IBPB on VM-Exit
In [1] the meaning of the synthetic IBPB flags has been redefined for a
better separation of concerns:
- ENTRY_IBPB -- issue IBPB on entry only
- IBPB_ON_VMEXIT -- issue IBPB on VM-Exit only
and the Retbleed mitigations have been updated to match this new
semantics.
Commit [2] was merged shortly before [1], and their interaction was not
handled properly. This resulted in IBPB not being triggered on VM-Exit
in all SRSO mitigation configs requesting an IBPB there.
Specifically, an IBPB on VM-Exit is triggered only when
X86_FEATURE_IBPB_ON_VMEXIT is set. However:
- X86_FEATURE_IBPB_ON_VMEXIT is not set for "spec_rstack_overflow=ibpb",
because before [1] having X86_FEATURE_ENTRY_IBPB was enough. Hence,
an IBPB is triggered on entry but the expected IBPB on VM-exit is
not.
- X86_FEATURE_IBPB_ON_VMEXIT is not set also when
"spec_rstack_overflow=ibpb-vmexit" if X86_FEATURE_ENTRY_IBPB is
already set.
That's because before [1] this was effectively redundant. Hence, e.g.
a "retbleed=ibpb spec_rstack_overflow=ibpb-vmexit" config mistakenly
reports the machine still vulnerable to SRSO, despite an IBPB being
triggered both on entry and VM-Exit, because of the Retbleed selected
mitigation config.
- UNTRAIN_RET_VM won't still actually do anything unless
CONFIG_MITIGATION_IBPB_ENTRY is set.
For "spec_rstack_overflow=ibpb", enable IBPB on both entry and VM-Exit
and clear X86_FEATURE_RSB_VMEXIT which is made superfluous by
X86_FEATURE_IBPB_ON_VMEXIT. This effectively makes this mitigation
option similar to the one for 'retbleed=ibpb', thus re-order the code
for the RETBLEED_MITIGATION_IBPB option to be less confusing by having
all features enabling before the disabling of the not needed ones.
For "spec_rstack_overflow=ibpb-vmexit", guard this mitigation setting
with CONFIG_MITIGATION_IBPB_ENTRY to ensure UNTRAIN_RET_VM sequence is
effectively compiled in. Drop instead the CONFIG_MITIGATION_SRSO guard,
since none of the SRSO compile cruft is required in this configuration.
Also, check only that the required microcode is present to effectively
enable the IBPB on VM-Exit.
Finally, update the KConfig description for CONFIG_MITIGATION_IBPB_ENTRY
to list also all SRSO config settings enabled by this guard.
Fixes: 864bcaa38ee4 ("x86/cpu/kvm: Provide UNTRAIN_RET_VM") [1]
Fixes: d893832d0e1e ("x86/srso: Add IBPB on VMEXIT") [2]
Reported-by: Yosry Ahmed <yosryahmed(a)google.com>
Signed-off-by: Patrick Bellasi <derkling(a)google.com>
Reviewed-by: Borislav Petkov (AMD) <bp(a)alien8.de>
Cc: stable(a)kernel.org
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 87198d957e2f..be2c311f5118 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2599,7 +2599,8 @@ config MITIGATION_IBPB_ENTRY
depends on CPU_SUP_AMD && X86_64
default y
help
- Compile the kernel with support for the retbleed=ibpb mitigation.
+ Compile the kernel with support for the retbleed=ibpb and
+ spec_rstack_overflow={ibpb,ibpb-vmexit} mitigations.
config MITIGATION_IBRS_ENTRY
bool "Enable IBRS on kernel entry"
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 5a505aa65489..a5d0998d7604 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -1115,6 +1115,8 @@ static void __init retbleed_select_mitigation(void)
case RETBLEED_MITIGATION_IBPB:
setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB);
+ setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT);
+ mitigate_smt = true;
/*
* IBPB on entry already obviates the need for
@@ -1124,9 +1126,6 @@ static void __init retbleed_select_mitigation(void)
setup_clear_cpu_cap(X86_FEATURE_UNRET);
setup_clear_cpu_cap(X86_FEATURE_RETHUNK);
- setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT);
- mitigate_smt = true;
-
/*
* There is no need for RSB filling: entry_ibpb() ensures
* all predictions, including the RSB, are invalidated,
@@ -2646,6 +2645,7 @@ static void __init srso_select_mitigation(void)
if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) {
if (has_microcode) {
setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB);
+ setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT);
srso_mitigation = SRSO_MITIGATION_IBPB;
/*
@@ -2655,6 +2655,13 @@ static void __init srso_select_mitigation(void)
*/
setup_clear_cpu_cap(X86_FEATURE_UNRET);
setup_clear_cpu_cap(X86_FEATURE_RETHUNK);
+
+ /*
+ * There is no need for RSB filling: entry_ibpb() ensures
+ * all predictions, including the RSB, are invalidated,
+ * regardless of IBPB implementation.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT);
}
} else {
pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n");
@@ -2663,8 +2670,8 @@ static void __init srso_select_mitigation(void)
ibpb_on_vmexit:
case SRSO_CMD_IBPB_ON_VMEXIT:
- if (IS_ENABLED(CONFIG_MITIGATION_SRSO)) {
- if (!boot_cpu_has(X86_FEATURE_ENTRY_IBPB) && has_microcode) {
+ if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) {
+ if (has_microcode) {
setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT);
srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT;
@@ -2676,8 +2683,8 @@ static void __init srso_select_mitigation(void)
setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT);
}
} else {
- pr_err("WARNING: kernel not compiled with MITIGATION_SRSO.\n");
- }
+ pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n");
+ }
break;
default:
break;