The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 0c4a13ba88594fd4a27292853e736c6b4349823d
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025121615-subplot-parachute-73bb@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 0c4a13ba88594fd4a27292853e736c6b4349823d Mon Sep 17 00:00:00 2001
From: Junrui Luo <moonafterrain(a)outlook.com>
Date: Thu, 6 Nov 2025 10:49:46 +0800
Subject: [PATCH] ALSA: wavefront: Fix integer overflow in sample size
validation
The wavefront_send_sample() function has an integer overflow issue
when validating sample size. The header->size field is u32 but gets
cast to int for comparison with dev->freemem.
Fix by using unsigned comparison to avoid integer overflow.
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Cc: stable(a)vger.kernel.org
Signed-off-by: Junrui Luo <moonafterrain(a)outlook.com>
Link: https://patch.msgid.link/SYBPR01MB7881B47789D1B060CE8BF4C3AFC2A@SYBPR01MB78…
Signed-off-by: Takashi Iwai <tiwai(a)suse.de>
diff --git a/sound/isa/wavefront/wavefront_synth.c b/sound/isa/wavefront/wavefront_synth.c
index cd5c177943aa..0d78533e1cfd 100644
--- a/sound/isa/wavefront/wavefront_synth.c
+++ b/sound/isa/wavefront/wavefront_synth.c
@@ -950,9 +950,9 @@ wavefront_send_sample (snd_wavefront_t *dev,
if (header->size) {
dev->freemem = wavefront_freemem (dev);
- if (dev->freemem < (int)header->size) {
+ if (dev->freemem < 0 || dev->freemem < header->size) {
dev_err(dev->card->dev,
- "insufficient memory to load %d byte sample.\n",
+ "insufficient memory to load %u byte sample.\n",
header->size);
return -ENOMEM;
}
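For reviewers preparing the backport, here is a standalone userspace sketch
(illustration only, not part of the patch) of why the old signed cast lets an
oversized header->size slip past the check on common ABIs, and why the
unsigned comparison does not:

/* Standalone illustration (not kernel code): values chosen to show the
 * failure mode.  "freemem" stands in for dev->freemem (int), "size"
 * for header->size (u32). */
#include <stdio.h>

int main(void)
{
	int freemem = 1000;              /* plenty small */
	unsigned int size = 0x80000000u; /* larger than INT_MAX */

	/* Old check: on common ABIs the cast wraps to a negative int,
	 * so the "insufficient memory" branch is never taken and the
	 * oversized sample is accepted. */
	if (freemem < (int)size)
		printf("old check: rejected\n");
	else
		printf("old check: wrongly accepted\n");

	/* New check: reject negative freemem, then compare unsigned. */
	if (freemem < 0 || (unsigned int)freemem < size)
		printf("new check: rejected\n");

	return 0;
}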
The patch below does not apply to the 6.17-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.17.y
git checkout FETCH_HEAD
git cherry-pick -x e11c5c13ce0ab2325d38fe63500be1dd88b81e38
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025121659-feline-king-9be3@gregkh' --subject-prefix 'PATCH 6.17.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From e11c5c13ce0ab2325d38fe63500be1dd88b81e38 Mon Sep 17 00:00:00 2001
From: Junrui Luo <moonafterrain(a)outlook.com>
Date: Thu, 6 Nov 2025 10:24:57 +0800
Subject: [PATCH] ALSA: wavefront: Clear substream pointers on close
Clear substream pointers in the close functions to avoid leaving dangling
pointers, improving code safety and preventing potential issues.
Reported-by: Yuhao Jiang <danisjiang(a)gmail.com>
Reported-by: Junrui Luo <moonafterrain(a)outlook.com>
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Cc: stable(a)vger.kernel.org
Signed-off-by: Junrui Luo <moonafterrain(a)outlook.com>
Link: https://patch.msgid.link/SYBPR01MB7881DF762CAB45EE42F6D812AFC2A@SYBPR01MB78…
Signed-off-by: Takashi Iwai <tiwai(a)suse.de>
diff --git a/sound/isa/wavefront/wavefront_midi.c b/sound/isa/wavefront/wavefront_midi.c
index 1250ecba659a..69d87c4cafae 100644
--- a/sound/isa/wavefront/wavefront_midi.c
+++ b/sound/isa/wavefront/wavefront_midi.c
@@ -278,6 +278,7 @@ static int snd_wavefront_midi_input_close(struct snd_rawmidi_substream *substrea
return -EIO;
guard(spinlock_irqsave)(&midi->open);
+ midi->substream_input[mpu] = NULL;
midi->mode[mpu] &= ~MPU401_MODE_INPUT;
return 0;
@@ -300,6 +301,7 @@ static int snd_wavefront_midi_output_close(struct snd_rawmidi_substream *substre
return -EIO;
guard(spinlock_irqsave)(&midi->open);
+ midi->substream_output[mpu] = NULL;
midi->mode[mpu] &= ~MPU401_MODE_OUTPUT;
return 0;
}
From: Johannes Berg <johannes.berg(a)intel.com>
During the transition to use channel contexts throughout, the
ability to do injection while in monitor mode concurrent with
another interface was lost, since the (virtual) monitor won't
have a chanctx assigned in this scenario.
It's harder to fix drivers that actually transitioned to using
channel contexts themselves, such as mt76, but it's easy to do
those that are (still) just using the emulation. Do that.
Cc: stable(a)vger.kernel.org
Link: https://bugzilla.kernel.org/show_bug.cgi?id=218763
Reported-and-tested-by: Oscar Alfonso Diaz <oscar.alfonso.diaz(a)gmail.com>
Fixes: 0a44dfc07074 ("wifi: mac80211: simplify non-chanctx drivers")
Signed-off-by: Johannes Berg <johannes.berg(a)intel.com>
---
net/mac80211/tx.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 9d8b0a25f73c..1b55e8340413 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -2397,6 +2397,8 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb,
if (chanctx_conf)
chandef = &chanctx_conf->def;
+ else if (local->emulate_chanctx)
+ chandef = &local->hw.conf.chandef;
else
goto fail_rcu;
--
2.52.0
As reported, ever since commit 1013af4f585f ("mm/hugetlb: fix
huge_pmd_unshare() vs GUP-fast race") we can end up in some situations
where we perform so many IPI broadcasts when unsharing hugetlb PMD page
tables that it severely regresses some workloads.
In particular, when we fork()+exit(), or when we munmap() a large
area backed by many shared PMD tables, we perform one IPI broadcast per
unshared PMD table.
There are two optimizations to be had:
(1) When we process (unshare) multiple such PMD tables, such as during
exit(), it is sufficient to send a single IPI broadcast (as long as
we respect locking rules) instead of one per PMD table.
Locking prevents any of these PMD tables from getting reused before
we drop the lock.
(2) When we are not the last sharer (> 2 users including us), there is
no need to send the IPI broadcast. The shared PMD tables cannot
become exclusive (fully unshared) before an IPI will be broadcasted
by the last sharer.
Concurrent GUP-fast could walk into a PMD table just before we
unshared it. It could then succeed in grabbing a page from the
shared page table even after munmap() etc. succeeded (and suppressed
an IPI). But there is no difference compared to GUP-fast just
sleeping for a while after grabbing the page and re-enabling IRQs.
Most importantly, GUP-fast will never walk into page tables that are
no longer shared, because the last sharer will issue an IPI
broadcast.
(if ever required, checking whether the PUD changed in GUP-fast
after grabbing the page like we do in the PTE case could handle
this)
So let's rework PMD sharing TLB flushing + IPI sync to use the mmu_gather
infrastructure so we can implement these optimizations and demystify the
code at least a bit. Extend the mmu_gather infrastructure to be able to
deal with our special hugetlb PMD table sharing implementation.
We'll consolidate the handling for (full) unsharing of PMD tables in
tlb_unshare_pmd_ptdesc() and tlb_flush_unshared_tables(), and track
in "struct mmu_gather" whether we had (full) unsharing of PMD tables.
Because locking is very special (concurrent unsharing+reuse must be
prevented), we disallow deferring flushing to tlb_finish_mmu() and instead
require an explicit earlier call to tlb_flush_unshared_tables().
From hugetlb code, we call huge_pmd_unshare_flush() where we make sure
that the expected lock protecting us from concurrent unsharing+reuse is
still held.
Check with a VM_WARN_ON_ONCE() in tlb_finish_mmu() that
tlb_flush_unshared_tables() was properly called earlier.
Document it all properly.
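To make the new calling convention concrete, the expected sequence looks
roughly like the following (a condensed sketch drawn from the
hugetlb_unshare_pmds() hunk further down; locking setup, mmu notifiers and
error handling omitted):

	struct mmu_gather tlb;

	tlb_gather_mmu(&tlb, mm);
	/* i_mmap_rwsem held in write mode, hugetlb VMA lock held. */
	for (address = start; address < end; address += sz) {
		...
		ptl = huge_pte_lock(h, mm, ptep);
		huge_pmd_unshare(&tlb, vma, address, ptep);
		spin_unlock(ptl);
	}
	/* Must happen before dropping i_mmap_rwsem: TLB flush plus at
	 * most one IPI broadcast if we were the last sharer of any
	 * PMD table. */
	huge_pmd_unshare_flush(&tlb, vma);
	/* ... drop locks ... */
	tlb_finish_mmu(&tlb);	/* warns if a needed flush above was skipped */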
Notes about tlb_remove_table_sync_one() interaction with unsharing:
There are two fairly tricky things:
(1) tlb_remove_table_sync_one() is a NOP on architectures without
CONFIG_MMU_GATHER_RCU_TABLE_FREE.
Here, the assumption is that the previous TLB flush would send an
IPI to all relevant CPUs. Careful: some architectures like x86 only
send IPIs to all relevant CPUs when tlb->freed_tables is set.
The relevant architectures should be selecting
MMU_GATHER_RCU_TABLE_FREE, but x86 might not do that in stable
kernels and it might have been problematic before this patch.
Also, the arch flushing behavior (independent of IPIs) is different
when tlb->freed_tables is set. Do we have to enlighten them to also
take care of tlb->unshared_tables? So far we didn't care, so
hopefully we are fine. Of course, we could be setting
tlb->freed_tables as well, but that might then unnecessarily flush
too much, because the semantics of tlb->freed_tables are a bit
fuzzy.
This patch changes nothing in this regard.
(2) tlb_remove_table_sync_one() is not a NOP on architectures with
CONFIG_MMU_GATHER_RCU_TABLE_FREE that actually don't need a sync.
Take x86 as an example: in the common case (!pv, !X86_FEATURE_INVLPGB)
we still issue IPIs during TLB flushes and don't actually need the
second tlb_remove_table_sync_one().
This optimization can be implemented on top of this, by checking, e.g., in
tlb_remove_table_sync_one() whether we really need IPIs. But as
described in (1), it really must honor tlb->freed_tables then to
send IPIs to all relevant CPUs.
Further note that the ptdesc_pmd_pts_dec() in huge_pmd_share() is not a
concern, as we are holding the i_mmap_lock the whole time, preventing
concurrent unsharing. That ptdesc_pmd_pts_dec() usage will be removed
separately as a cleanup later.
There are plenty more cleanups to be had, but they have to wait until
this is fixed.
Fixes: 1013af4f585f ("mm/hugetlb: fix huge_pmd_unshare() vs GUP-fast race")
Reported-by: "Uschakow, Stanislav" <suschako(a)amazon.de>
Closes: https://lore.kernel.org/all/4d3878531c76479d9f8ca9789dc6485d@amazon.de/
Tested-by: Laurence Oberman <loberman(a)redhat.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: David Hildenbrand (Red Hat) <david(a)kernel.org>
---
include/asm-generic/tlb.h | 74 ++++++++++++++++++++++-
include/linux/hugetlb.h | 19 +++---
mm/hugetlb.c | 121 ++++++++++++++++++++++----------------
mm/mmu_gather.c | 7 +++
mm/mprotect.c | 2 +-
mm/rmap.c | 25 +++++---
6 files changed, 179 insertions(+), 69 deletions(-)
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 1fff717cae510..706416babb3d6 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -364,6 +364,20 @@ struct mmu_gather {
unsigned int vma_huge : 1;
unsigned int vma_pfn : 1;
+ /*
+ * Did we unshare (unmap) any shared page tables? For now only
+ * used for hugetlb PMD table sharing.
+ */
+ unsigned int unshared_tables : 1;
+
+ /*
+ * Did we unshare any page tables such that they are now exclusive
+ * and could get reused+modified by the new owner? When setting this
+ * flag, "unshared_tables" will be set as well. For now only used
+ * for hugetlb PMD table sharing.
+ */
+ unsigned int fully_unshared_tables : 1;
+
unsigned int batch_count;
#ifndef CONFIG_MMU_GATHER_NO_GATHER
@@ -400,6 +414,7 @@ static inline void __tlb_reset_range(struct mmu_gather *tlb)
tlb->cleared_pmds = 0;
tlb->cleared_puds = 0;
tlb->cleared_p4ds = 0;
+ tlb->unshared_tables = 0;
/*
* Do not reset mmu_gather::vma_* fields here, we do not
* call into tlb_start_vma() again to set them if there is an
@@ -484,7 +499,7 @@ static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
* these bits.
*/
if (!(tlb->freed_tables || tlb->cleared_ptes || tlb->cleared_pmds ||
- tlb->cleared_puds || tlb->cleared_p4ds))
+ tlb->cleared_puds || tlb->cleared_p4ds || tlb->unshared_tables))
return;
tlb_flush(tlb);
@@ -773,6 +788,63 @@ static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd)
}
#endif
+#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
+static inline void tlb_unshare_pmd_ptdesc(struct mmu_gather *tlb, struct ptdesc *pt,
+ unsigned long addr)
+{
+ /*
+ * The caller must make sure that concurrent unsharing + exclusive
+ * reuse is impossible until tlb_flush_unshared_tables() was called.
+ */
+ VM_WARN_ON_ONCE(!ptdesc_pmd_is_shared(pt));
+ ptdesc_pmd_pts_dec(pt);
+
+ /* Clearing a PUD pointing at a PMD table with PMD leaves. */
+ tlb_flush_pmd_range(tlb, addr & PUD_MASK, PUD_SIZE);
+
+ /*
+ * If the page table is now exclusively owned, we fully unshared
+ * a page table.
+ */
+ if (!ptdesc_pmd_is_shared(pt))
+ tlb->fully_unshared_tables = true;
+ tlb->unshared_tables = true;
+}
+
+static inline void tlb_flush_unshared_tables(struct mmu_gather *tlb)
+{
+ /*
+ * As soon as the caller drops locks to allow for reuse of
+ * previously-shared tables, these tables could get modified and
+ * even reused outside of hugetlb context, so we have to make sure that
+ * any page table walkers (incl. TLB, GUP-fast) are aware of that
+ * change.
+ *
+ * Even if we are not fully unsharing a PMD table, we must
+ * flush the TLB for the unsharer now.
+ */
+ if (tlb->unshared_tables)
+ tlb_flush_mmu_tlbonly(tlb);
+
+ /*
+ * Similarly, we must make sure that concurrent GUP-fast will not
+ * walk previously-shared page tables that are getting modified+reused
+ * elsewhere. So broadcast an IPI to wait for any concurrent GUP-fast.
+ *
+ * We only perform this when we are the last sharer of a page table,
+ * as the IPI will reach all CPUs: any GUP-fast.
+ *
+ * Note that on configs where tlb_remove_table_sync_one() is a NOP,
+ * the expectation is that the tlb_flush_mmu_tlbonly() would have issued
+ * required IPIs already for us.
+ */
+ if (tlb->fully_unshared_tables) {
+ tlb_remove_table_sync_one();
+ tlb->fully_unshared_tables = false;
+ }
+}
+#endif /* CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */
+
#endif /* CONFIG_MMU */
#endif /* _ASM_GENERIC__TLB_H */
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 03c8725efa289..63b248c6bfd47 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -240,8 +240,9 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
pte_t *huge_pte_offset(struct mm_struct *mm,
unsigned long addr, unsigned long sz);
unsigned long hugetlb_mask_last_page(struct hstate *h);
-int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep);
+int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep);
+void huge_pmd_unshare_flush(struct mmu_gather *tlb, struct vm_area_struct *vma);
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
unsigned long *start, unsigned long *end);
@@ -271,7 +272,7 @@ void hugetlb_vma_unlock_write(struct vm_area_struct *vma);
int hugetlb_vma_trylock_write(struct vm_area_struct *vma);
void hugetlb_vma_assert_locked(struct vm_area_struct *vma);
void hugetlb_vma_lock_release(struct kref *kref);
-long hugetlb_change_protection(struct vm_area_struct *vma,
+long hugetlb_change_protection(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long address, unsigned long end, pgprot_t newprot,
unsigned long cp_flags);
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
@@ -300,13 +301,17 @@ static inline struct address_space *hugetlb_folio_mapping_lock_write(
return NULL;
}
-static inline int huge_pmd_unshare(struct mm_struct *mm,
- struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep)
+static inline int huge_pmd_unshare(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
{
return 0;
}
+static inline void huge_pmd_unshare_flush(struct mmu_gather *tlb,
+ struct vm_area_struct *vma)
+{
+}
+
static inline void adjust_range_if_pmd_sharing_possible(
struct vm_area_struct *vma,
unsigned long *start, unsigned long *end)
@@ -432,7 +437,7 @@ static inline void move_hugetlb_state(struct folio *old_folio,
{
}
-static inline long hugetlb_change_protection(
+static inline long hugetlb_change_protection(struct mmu_gather *tlb,
struct vm_area_struct *vma, unsigned long address,
unsigned long end, pgprot_t newprot,
unsigned long cp_flags)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3c77cdef12a32..7fef0b94b5d1e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5096,8 +5096,9 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
unsigned long last_addr_mask;
pte_t *src_pte, *dst_pte;
struct mmu_notifier_range range;
- bool shared_pmd = false;
+ struct mmu_gather tlb;
+ tlb_gather_mmu(&tlb, vma->vm_mm);
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, old_addr,
old_end);
adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
@@ -5122,12 +5123,12 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
if (huge_pte_none(huge_ptep_get(mm, old_addr, src_pte)))
continue;
- if (huge_pmd_unshare(mm, vma, old_addr, src_pte)) {
- shared_pmd = true;
+ if (huge_pmd_unshare(&tlb, vma, old_addr, src_pte)) {
old_addr |= last_addr_mask;
new_addr |= last_addr_mask;
continue;
}
+ tlb_remove_huge_tlb_entry(h, &tlb, src_pte, old_addr);
dst_pte = huge_pte_alloc(mm, new_vma, new_addr, sz);
if (!dst_pte)
@@ -5136,13 +5137,13 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte, sz);
}
- if (shared_pmd)
- flush_hugetlb_tlb_range(vma, range.start, range.end);
- else
- flush_hugetlb_tlb_range(vma, old_end - len, old_end);
+ tlb_flush_mmu_tlbonly(&tlb);
+ huge_pmd_unshare_flush(&tlb, vma);
+
mmu_notifier_invalidate_range_end(&range);
i_mmap_unlock_write(mapping);
hugetlb_vma_unlock_write(vma);
+ tlb_finish_mmu(&tlb);
return len + old_addr - old_end;
}
@@ -5161,7 +5162,6 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long sz = huge_page_size(h);
bool adjust_reservation;
unsigned long last_addr_mask;
- bool force_flush = false;
WARN_ON(!is_vm_hugetlb_page(vma));
BUG_ON(start & ~huge_page_mask(h));
@@ -5184,10 +5184,8 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
}
ptl = huge_pte_lock(h, mm, ptep);
- if (huge_pmd_unshare(mm, vma, address, ptep)) {
+ if (huge_pmd_unshare(tlb, vma, address, ptep)) {
spin_unlock(ptl);
- tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE);
- force_flush = true;
address |= last_addr_mask;
continue;
}
@@ -5303,14 +5301,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
}
tlb_end_vma(tlb, vma);
- /*
- * There is nothing protecting a previously-shared page table that we
- * unshared through huge_pmd_unshare() from getting freed after we
- * release i_mmap_rwsem, so flush the TLB now. If huge_pmd_unshare()
- * succeeded, flush the range corresponding to the pud.
- */
- if (force_flush)
- tlb_flush_mmu_tlbonly(tlb);
+ huge_pmd_unshare_flush(tlb, vma);
}
void __hugetlb_zap_begin(struct vm_area_struct *vma,
@@ -6399,7 +6390,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
}
#endif /* CONFIG_USERFAULTFD */
-long hugetlb_change_protection(struct vm_area_struct *vma,
+long hugetlb_change_protection(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long address, unsigned long end,
pgprot_t newprot, unsigned long cp_flags)
{
@@ -6409,7 +6400,6 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
pte_t pte;
struct hstate *h = hstate_vma(vma);
long pages = 0, psize = huge_page_size(h);
- bool shared_pmd = false;
struct mmu_notifier_range range;
unsigned long last_addr_mask;
bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
@@ -6452,7 +6442,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
}
}
ptl = huge_pte_lock(h, mm, ptep);
- if (huge_pmd_unshare(mm, vma, address, ptep)) {
+ if (huge_pmd_unshare(tlb, vma, address, ptep)) {
/*
* When uffd-wp is enabled on the vma, unshare
* shouldn't happen at all. Warn about it if it
@@ -6461,7 +6451,6 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
WARN_ON_ONCE(uffd_wp || uffd_wp_resolve);
pages++;
spin_unlock(ptl);
- shared_pmd = true;
address |= last_addr_mask;
continue;
}
@@ -6522,22 +6511,16 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
pte = huge_pte_clear_uffd_wp(pte);
huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
pages++;
+ tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
}
next:
spin_unlock(ptl);
cond_resched();
}
- /*
- * There is nothing protecting a previously-shared page table that we
- * unshared through huge_pmd_unshare() from getting freed after we
- * release i_mmap_rwsem, so flush the TLB now. If huge_pmd_unshare()
- * succeeded, flush the range corresponding to the pud.
- */
- if (shared_pmd)
- flush_hugetlb_tlb_range(vma, range.start, range.end);
- else
- flush_hugetlb_tlb_range(vma, start, end);
+
+ tlb_flush_mmu_tlbonly(tlb);
+ huge_pmd_unshare_flush(tlb, vma);
/*
* No need to call mmu_notifier_arch_invalidate_secondary_tlbs() we are
* downgrading page table protection not changing it to point to a new
@@ -6904,18 +6887,27 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
return pte;
}
-/*
- * unmap huge page backed by shared pte.
+/**
+ * huge_pmd_unshare - Unmap a pmd table if it is shared by multiple users
+ * @tlb: the current mmu_gather.
+ * @vma: the vma covering the pmd table.
+ * @addr: the address we are trying to unshare.
+ * @ptep: pointer into the (pmd) page table.
+ *
+ * Called with the page table lock held, the i_mmap_rwsem held in write mode
+ * and the hugetlb vma lock held in write mode.
*
- * Called with page table lock held.
+ * Note: The caller must call huge_pmd_unshare_flush() before dropping the
+ * i_mmap_rwsem.
*
- * returns: 1 successfully unmapped a shared pte page
- * 0 the underlying pte page is not shared, or it is the last user
+ * Returns: 1 if it was a shared PMD table and it got unmapped, or 0 if it
+ * was not a shared PMD table.
*/
-int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep)
+int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
{
unsigned long sz = huge_page_size(hstate_vma(vma));
+ struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd = pgd_offset(mm, addr);
p4d_t *p4d = p4d_offset(pgd, addr);
pud_t *pud = pud_offset(p4d, addr);
@@ -6927,18 +6919,36 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
i_mmap_assert_write_locked(vma->vm_file->f_mapping);
hugetlb_vma_assert_locked(vma);
pud_clear(pud);
- /*
- * Once our caller drops the rmap lock, some other process might be
- * using this page table as a normal, non-hugetlb page table.
- * Wait for pending gup_fast() in other threads to finish before letting
- * that happen.
- */
- tlb_remove_table_sync_one();
- ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep));
+
+ tlb_unshare_pmd_ptdesc(tlb, virt_to_ptdesc(ptep), addr);
+
mm_dec_nr_pmds(mm);
return 1;
}
+/*
+ * huge_pmd_unshare_flush - Complete a sequence of huge_pmd_unshare() calls
+ * @tlb: the current mmu_gather.
+ * @vma: the vma covering the pmd table.
+ *
+ * Perform necessary TLB flushes or IPI broadcasts to synchronize PMD table
+ * unsharing with concurrent page table walkers.
+ *
+ * This function must be called after a sequence of huge_pmd_unshare()
+ * calls while still holding the i_mmap_rwsem.
+ */
+void huge_pmd_unshare_flush(struct mmu_gather *tlb, struct vm_area_struct *vma)
+{
+ /*
+ * We must synchronize page table unsharing such that nobody will
+ * try reusing a previously-shared page table while it might still
+ * be in use by previous sharers (TLB, GUP_fast).
+ */
+ i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+
+ tlb_flush_unshared_tables(tlb);
+}
+
#else /* !CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */
pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -6947,12 +6957,16 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
return NULL;
}
-int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep)
+int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
{
return 0;
}
+void huge_pmd_unshare_flush(struct mmu_gather *tlb, struct vm_area_struct *vma)
+{
+}
+
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
unsigned long *start, unsigned long *end)
{
@@ -7219,6 +7233,7 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
unsigned long sz = huge_page_size(h);
struct mm_struct *mm = vma->vm_mm;
struct mmu_notifier_range range;
+ struct mmu_gather tlb;
unsigned long address;
spinlock_t *ptl;
pte_t *ptep;
@@ -7229,6 +7244,7 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
if (start >= end)
return;
+ tlb_gather_mmu(&tlb, mm);
flush_cache_range(vma, start, end);
/*
* No need to call adjust_range_if_pmd_sharing_possible(), because
@@ -7248,10 +7264,10 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
if (!ptep)
continue;
ptl = huge_pte_lock(h, mm, ptep);
- huge_pmd_unshare(mm, vma, address, ptep);
+ huge_pmd_unshare(&tlb, vma, address, ptep);
spin_unlock(ptl);
}
- flush_hugetlb_tlb_range(vma, start, end);
+ huge_pmd_unshare_flush(&tlb, vma);
if (take_locks) {
i_mmap_unlock_write(vma->vm_file->f_mapping);
hugetlb_vma_unlock_write(vma);
@@ -7261,6 +7277,7 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
* Documentation/mm/mmu_notifier.rst.
*/
mmu_notifier_invalidate_range_end(&range);
+ tlb_finish_mmu(&tlb);
}
/*
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 247e3f9db6c7a..030a162a263ba 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -426,6 +426,7 @@ static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
#endif
tlb->vma_pfn = 0;
+ tlb->fully_unshared_tables = 0;
__tlb_reset_range(tlb);
inc_tlb_flush_pending(tlb->mm);
}
@@ -468,6 +469,12 @@ void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm)
*/
void tlb_finish_mmu(struct mmu_gather *tlb)
{
+ /*
+ * We expect an earlier huge_pmd_unshare_flush() call to sort this out,
+ * due to complicated locking requirements with page table unsharing.
+ */
+ VM_WARN_ON_ONCE(tlb->fully_unshared_tables);
+
/*
* If there are parallel threads are doing PTE changes on same range
* under non-exclusive lock (e.g., mmap_lock read-side) but defer TLB
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 283889e4f1cec..5c330e817129e 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -652,7 +652,7 @@ long change_protection(struct mmu_gather *tlb,
#endif
if (is_vm_hugetlb_page(vma))
- pages = hugetlb_change_protection(vma, start, end, newprot,
+ pages = hugetlb_change_protection(tlb, vma, start, end, newprot,
cp_flags);
else
pages = change_protection_range(tlb, vma, start, end, newprot,
diff --git a/mm/rmap.c b/mm/rmap.c
index 748f48727a162..d6799afe11147 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -76,7 +76,7 @@
#include <linux/mm_inline.h>
#include <linux/oom.h>
-#include <asm/tlbflush.h>
+#include <asm/tlb.h>
#define CREATE_TRACE_POINTS
#include <trace/events/migrate.h>
@@ -2008,13 +2008,17 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
* if unsuccessful.
*/
if (!anon) {
+ struct mmu_gather tlb;
+
VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
if (!hugetlb_vma_trylock_write(vma))
goto walk_abort;
- if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
+
+ tlb_gather_mmu(&tlb, mm);
+ if (huge_pmd_unshare(&tlb, vma, address, pvmw.pte)) {
hugetlb_vma_unlock_write(vma);
- flush_tlb_range(vma,
- range.start, range.end);
+ huge_pmd_unshare_flush(&tlb, vma);
+ tlb_finish_mmu(&tlb);
/*
* The PMD table was unmapped,
* consequently unmapping the folio.
@@ -2022,6 +2026,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
goto walk_done;
}
hugetlb_vma_unlock_write(vma);
+ tlb_finish_mmu(&tlb);
}
pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
if (pte_dirty(pteval))
@@ -2398,17 +2403,20 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
* fail if unsuccessful.
*/
if (!anon) {
+ struct mmu_gather tlb;
+
VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
if (!hugetlb_vma_trylock_write(vma)) {
page_vma_mapped_walk_done(&pvmw);
ret = false;
break;
}
- if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
- hugetlb_vma_unlock_write(vma);
- flush_tlb_range(vma,
- range.start, range.end);
+ tlb_gather_mmu(&tlb, mm);
+ if (huge_pmd_unshare(&tlb, vma, address, pvmw.pte)) {
+ hugetlb_vma_unlock_write(vma);
+ huge_pmd_unshare_flush(&tlb, vma);
+ tlb_finish_mmu(&tlb);
/*
* The PMD table was unmapped,
* consequently unmapping the folio.
@@ -2417,6 +2425,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
break;
}
hugetlb_vma_unlock_write(vma);
+ tlb_finish_mmu(&tlb);
}
/* Nuke the hugetlb page table entry */
pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
--
2.52.0
As reported, ever since commit 1013af4f585f ("mm/hugetlb: fix
huge_pmd_unshare() vs GUP-fast race") we can end up in some situations
where we perform so many IPI broadcasts when unsharing hugetlb PMD page
tables that it severely regresses some workloads.
In particular, when we fork()+exit(), or when we munmap() a large
area backed by many shared PMD tables, we perform one IPI broadcast per
unshared PMD table.
There are two optimizations to be had:
(1) When we process (unshare) multiple such PMD tables, such as during
exit(), it is sufficient to send a single IPI broadcast (as long as
we respect locking rules) instead of one per PMD table.
Locking prevents any of these PMD tables from getting reused before
we drop the lock.
(2) When we are not the last sharer (> 2 users including us), there is
no need to send the IPI broadcast. The shared PMD tables cannot
become exclusive (fully unshared) before an IPI will be broadcasted
by the last sharer.
Concurrent GUP-fast could walk into a PMD table just before we
unshared it. It could then succeed in grabbing a page from the
shared page table even after munmap() etc. succeeded (and suppressed
an IPI). But there is no difference compared to GUP-fast just
sleeping for a while after grabbing the page and re-enabling IRQs.
Most importantly, GUP-fast will never walk into page tables that are
no longer shared, because the last sharer will issue an IPI
broadcast.
(if ever required, checking whether the PUD changed in GUP-fast
after grabbing the page like we do in the PTE case could handle
this)
So let's rework PMD sharing TLB flushing + IPI sync to use the mmu_gather
infrastructure so we can implement these optimizations and demystify the
code at least a bit. Extend the mmu_gather infrastructure to be able to
deal with our special hugetlb PMD table sharing implementation.
We'll consolidate the handling for (full) unsharing of PMD tables in
tlb_unshare_pmd_ptdesc() and tlb_flush_unshared_tables(), and track
in "struct mmu_gather" whether we had (full) unsharing of PMD tables.
Because locking is very special (concurrent unsharing+reuse must be
prevented), we disallow deferring flushing to tlb_finish_mmu() and instead
require an explicit earlier call to tlb_flush_unshared_tables().
From hugetlb code, we call huge_pmd_unshare_flush() where we make sure
that the expected lock protecting us from concurrent unsharing+reuse is
still held.
Check with a VM_WARN_ON_ONCE() in tlb_finish_mmu() that
tlb_flush_unshared_tables() was properly called earlier.
Document it all properly.
Notes about tlb_remove_table_sync_one() interaction with unsharing:
There are two fairly tricky things:
(1) tlb_remove_table_sync_one() is a NOP on architectures without
CONFIG_MMU_GATHER_RCU_TABLE_FREE.
Here, the assumption is that the previous TLB flush would send an
IPI to all relevant CPUs. Careful: some architectures like x86 only
send IPIs to all relevant CPUs when tlb->freed_tables is set.
The relevant architectures should be selecting
MMU_GATHER_RCU_TABLE_FREE, but x86 might not do that in stable
kernels and it might have been problematic before this patch.
Also, the arch flushing behavior (independent of IPIs) is different
when tlb->freed_tables is set. Do we have to enlighten them to also
take care of tlb->unshared_tables? So far we didn't care, so
hopefully we are fine. Of course, we could be setting
tlb->freed_tables as well, but that might then unnecessarily flush
too much, because the semantics of tlb->freed_tables are a bit
fuzzy.
This patch changes nothing in this regard.
(2) tlb_remove_table_sync_one() is not a NOP on architectures with
CONFIG_MMU_GATHER_RCU_TABLE_FREE that actually don't need a sync.
Take x86 as an example: in the common case (!pv, !X86_FEATURE_INVLPGB)
we still issue IPIs during TLB flushes and don't actually need the
second tlb_remove_table_sync_one().
This optimization can be implemented on top of this, by checking, e.g., in
tlb_remove_table_sync_one() whether we really need IPIs. But as
described in (1), it really must honor tlb->freed_tables then to
send IPIs to all relevant CPUs.
Further note that the ptdesc_pmd_pts_dec() in huge_pmd_share() is not a
concern, as we are holding the i_mmap_lock the whole time, preventing
concurrent unsharing. That ptdesc_pmd_pts_dec() usage will be removed
separately as a cleanup later.
There are plenty more cleanups to be had, but they have to wait until
this is fixed.
Fixes: 1013af4f585f ("mm/hugetlb: fix huge_pmd_unshare() vs GUP-fast race")
Reported-by: "Uschakow, Stanislav" <suschako(a)amazon.de>
Closes: https://lore.kernel.org/all/4d3878531c76479d9f8ca9789dc6485d@amazon.de/
Cc: <stable(a)vger.kernel.org>
Signed-off-by: David Hildenbrand (Red Hat) <david(a)kernel.org>
---
include/asm-generic/tlb.h | 69 +++++++++++++++++++++-
include/linux/hugetlb.h | 19 +++---
mm/hugetlb.c | 121 ++++++++++++++++++++++----------------
mm/mmu_gather.c | 6 ++
mm/mprotect.c | 2 +-
mm/rmap.c | 25 +++++---
6 files changed, 173 insertions(+), 69 deletions(-)
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 1fff717cae510..324a21f53b644 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -364,6 +364,17 @@ struct mmu_gather {
unsigned int vma_huge : 1;
unsigned int vma_pfn : 1;
+ /*
+ * Did we unshare (unmap) any shared page tables?
+ */
+ unsigned int unshared_tables : 1;
+
+ /*
+ * Did we unshare any page tables such that they are now exclusive
+ * and could get reused+modified by the new owner?
+ */
+ unsigned int fully_unshared_tables : 1;
+
unsigned int batch_count;
#ifndef CONFIG_MMU_GATHER_NO_GATHER
@@ -400,6 +411,7 @@ static inline void __tlb_reset_range(struct mmu_gather *tlb)
tlb->cleared_pmds = 0;
tlb->cleared_puds = 0;
tlb->cleared_p4ds = 0;
+ tlb->unshared_tables = 0;
/*
* Do not reset mmu_gather::vma_* fields here, we do not
* call into tlb_start_vma() again to set them if there is an
@@ -484,7 +496,7 @@ static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
* these bits.
*/
if (!(tlb->freed_tables || tlb->cleared_ptes || tlb->cleared_pmds ||
- tlb->cleared_puds || tlb->cleared_p4ds))
+ tlb->cleared_puds || tlb->cleared_p4ds || tlb->unshared_tables))
return;
tlb_flush(tlb);
@@ -773,6 +785,61 @@ static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd)
}
#endif
+#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
+static inline void tlb_unshare_pmd_ptdesc(struct mmu_gather *tlb, struct ptdesc *pt,
+ unsigned long addr)
+{
+ /*
+ * The caller must make sure that concurrent unsharing + exclusive
+ * reuse is impossible until tlb_flush_unshared_tables() was called.
+ */
+ VM_WARN_ON_ONCE(!ptdesc_pmd_is_shared(pt));
+ ptdesc_pmd_pts_dec(pt);
+
+ /* Clearing a PUD pointing at a PMD table with PMD leaves. */
+ tlb_flush_pmd_range(tlb, addr & PUD_MASK, PUD_SIZE);
+
+ /*
+ * If the page table is now exclusively owned, we fully unshared
+ * a page table.
+ */
+ if (!ptdesc_pmd_is_shared(pt))
+ tlb->fully_unshared_tables = true;
+ tlb->unshared_tables = true;
+}
+
+static inline void tlb_flush_unshared_tables(struct mmu_gather *tlb)
+{
+ /*
+ * As soon as the caller drops locks to allow for reuse of
+ * previously-shared tables, these tables could get modified and
+ * even reused outside of hugetlb context. So flush the TLB now.
+ *
+ * Note that we cannot defer the flush to a later point even if we are
+ * not the last sharer of the page table.
+ */
+ if (tlb->unshared_tables)
+ tlb_flush_mmu_tlbonly(tlb);
+
+ /*
+ * Similarly, we must make sure that concurrent GUP-fast will not
+ * walk previously-shared page tables that are getting modified+reused
+ * elsewhere. So broadcast an IPI to wait for any concurrent GUP-fast.
+ *
+ * We only perform this when we are the last sharer of a page table,
+ * as the IPI will reach all CPUs: any GUP-fast.
+ *
+ * Note that on configs where tlb_remove_table_sync_one() is a NOP,
+ * the expectation is that the tlb_flush_mmu_tlbonly() would have issued
+ * required IPIs already for us.
+ */
+ if (tlb->fully_unshared_tables) {
+ tlb_remove_table_sync_one();
+ tlb->fully_unshared_tables = false;
+ }
+}
+#endif /* CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */
+
#endif /* CONFIG_MMU */
#endif /* _ASM_GENERIC__TLB_H */
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 03c8725efa289..63b248c6bfd47 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -240,8 +240,9 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
pte_t *huge_pte_offset(struct mm_struct *mm,
unsigned long addr, unsigned long sz);
unsigned long hugetlb_mask_last_page(struct hstate *h);
-int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep);
+int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep);
+void huge_pmd_unshare_flush(struct mmu_gather *tlb, struct vm_area_struct *vma);
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
unsigned long *start, unsigned long *end);
@@ -271,7 +272,7 @@ void hugetlb_vma_unlock_write(struct vm_area_struct *vma);
int hugetlb_vma_trylock_write(struct vm_area_struct *vma);
void hugetlb_vma_assert_locked(struct vm_area_struct *vma);
void hugetlb_vma_lock_release(struct kref *kref);
-long hugetlb_change_protection(struct vm_area_struct *vma,
+long hugetlb_change_protection(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long address, unsigned long end, pgprot_t newprot,
unsigned long cp_flags);
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
@@ -300,13 +301,17 @@ static inline struct address_space *hugetlb_folio_mapping_lock_write(
return NULL;
}
-static inline int huge_pmd_unshare(struct mm_struct *mm,
- struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep)
+static inline int huge_pmd_unshare(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
{
return 0;
}
+static inline void huge_pmd_unshare_flush(struct mmu_gather *tlb,
+ struct vm_area_struct *vma)
+{
+}
+
static inline void adjust_range_if_pmd_sharing_possible(
struct vm_area_struct *vma,
unsigned long *start, unsigned long *end)
@@ -432,7 +437,7 @@ static inline void move_hugetlb_state(struct folio *old_folio,
{
}
-static inline long hugetlb_change_protection(
+static inline long hugetlb_change_protection(struct mmu_gather *tlb,
struct vm_area_struct *vma, unsigned long address,
unsigned long end, pgprot_t newprot,
unsigned long cp_flags)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3c77cdef12a32..3db94693a06fc 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5096,8 +5096,9 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
unsigned long last_addr_mask;
pte_t *src_pte, *dst_pte;
struct mmu_notifier_range range;
- bool shared_pmd = false;
+ struct mmu_gather tlb;
+ tlb_gather_mmu(&tlb, vma->vm_mm);
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, old_addr,
old_end);
adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
@@ -5122,12 +5123,12 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
if (huge_pte_none(huge_ptep_get(mm, old_addr, src_pte)))
continue;
- if (huge_pmd_unshare(mm, vma, old_addr, src_pte)) {
- shared_pmd = true;
+ if (huge_pmd_unshare(&tlb, vma, old_addr, src_pte)) {
old_addr |= last_addr_mask;
new_addr |= last_addr_mask;
continue;
}
+ tlb_remove_huge_tlb_entry(h, &tlb, src_pte, old_addr);
dst_pte = huge_pte_alloc(mm, new_vma, new_addr, sz);
if (!dst_pte)
@@ -5136,13 +5137,13 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte, sz);
}
- if (shared_pmd)
- flush_hugetlb_tlb_range(vma, range.start, range.end);
- else
- flush_hugetlb_tlb_range(vma, old_end - len, old_end);
+ tlb_flush_mmu_tlbonly(&tlb);
+ huge_pmd_unshare_flush(&tlb, vma);
+
mmu_notifier_invalidate_range_end(&range);
i_mmap_unlock_write(mapping);
hugetlb_vma_unlock_write(vma);
+ tlb_finish_mmu(&tlb);
return len + old_addr - old_end;
}
@@ -5161,7 +5162,6 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long sz = huge_page_size(h);
bool adjust_reservation;
unsigned long last_addr_mask;
- bool force_flush = false;
WARN_ON(!is_vm_hugetlb_page(vma));
BUG_ON(start & ~huge_page_mask(h));
@@ -5184,10 +5184,8 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
}
ptl = huge_pte_lock(h, mm, ptep);
- if (huge_pmd_unshare(mm, vma, address, ptep)) {
+ if (huge_pmd_unshare(tlb, vma, address, ptep)) {
spin_unlock(ptl);
- tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE);
- force_flush = true;
address |= last_addr_mask;
continue;
}
@@ -5303,14 +5301,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
}
tlb_end_vma(tlb, vma);
- /*
- * There is nothing protecting a previously-shared page table that we
- * unshared through huge_pmd_unshare() from getting freed after we
- * release i_mmap_rwsem, so flush the TLB now. If huge_pmd_unshare()
- * succeeded, flush the range corresponding to the pud.
- */
- if (force_flush)
- tlb_flush_mmu_tlbonly(tlb);
+ huge_pmd_unshare_flush(tlb, vma);
}
void __hugetlb_zap_begin(struct vm_area_struct *vma,
@@ -6399,7 +6390,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
}
#endif /* CONFIG_USERFAULTFD */
-long hugetlb_change_protection(struct vm_area_struct *vma,
+long hugetlb_change_protection(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long address, unsigned long end,
pgprot_t newprot, unsigned long cp_flags)
{
@@ -6409,7 +6400,6 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
pte_t pte;
struct hstate *h = hstate_vma(vma);
long pages = 0, psize = huge_page_size(h);
- bool shared_pmd = false;
struct mmu_notifier_range range;
unsigned long last_addr_mask;
bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
@@ -6452,7 +6442,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
}
}
ptl = huge_pte_lock(h, mm, ptep);
- if (huge_pmd_unshare(mm, vma, address, ptep)) {
+ if (huge_pmd_unshare(tlb, vma, address, ptep)) {
/*
* When uffd-wp is enabled on the vma, unshare
* shouldn't happen at all. Warn about it if it
@@ -6461,7 +6451,6 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
WARN_ON_ONCE(uffd_wp || uffd_wp_resolve);
pages++;
spin_unlock(ptl);
- shared_pmd = true;
address |= last_addr_mask;
continue;
}
@@ -6522,22 +6511,16 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
pte = huge_pte_clear_uffd_wp(pte);
huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
pages++;
+ tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
}
next:
spin_unlock(ptl);
cond_resched();
}
- /*
- * There is nothing protecting a previously-shared page table that we
- * unshared through huge_pmd_unshare() from getting freed after we
- * release i_mmap_rwsem, so flush the TLB now. If huge_pmd_unshare()
- * succeeded, flush the range corresponding to the pud.
- */
- if (shared_pmd)
- flush_hugetlb_tlb_range(vma, range.start, range.end);
- else
- flush_hugetlb_tlb_range(vma, start, end);
+
+ tlb_flush_mmu_tlbonly(tlb);
+ huge_pmd_unshare_flush(tlb, vma);
/*
* No need to call mmu_notifier_arch_invalidate_secondary_tlbs() we are
* downgrading page table protection not changing it to point to a new
@@ -6904,18 +6887,27 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
return pte;
}
-/*
- * unmap huge page backed by shared pte.
+/**
+ * huge_pmd_unshare - Unmap a pmd table if it is shared by multiple users
+ * @tlb: the current mmu_gather.
+ * @vma: the vma covering the pmd table.
+ * @addr: the address we are trying to unshare.
+ * @ptep: pointer into the (pmd) page table.
+ *
+ * Called with the page table lock held, the i_mmap_rwsem held in write mode
+ * and the hugetlb vma lock held in write mode.
*
- * Called with page table lock held.
+ * Note: The caller must call huge_pmd_unshare_flush() before dropping the
+ * i_mmap_rwsem.
*
- * returns: 1 successfully unmapped a shared pte page
- * 0 the underlying pte page is not shared, or it is the last user
+ * Returns: 1 if it was a shared PMD table and it got unmapped, or 0 if it
+ * was not a shared PMD table.
*/
-int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep)
+int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
{
unsigned long sz = huge_page_size(hstate_vma(vma));
+ struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd = pgd_offset(mm, addr);
p4d_t *p4d = p4d_offset(pgd, addr);
pud_t *pud = pud_offset(p4d, addr);
@@ -6927,18 +6919,36 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
i_mmap_assert_write_locked(vma->vm_file->f_mapping);
hugetlb_vma_assert_locked(vma);
pud_clear(pud);
- /*
- * Once our caller drops the rmap lock, some other process might be
- * using this page table as a normal, non-hugetlb page table.
- * Wait for pending gup_fast() in other threads to finish before letting
- * that happen.
- */
- tlb_remove_table_sync_one();
- ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep));
+
+ tlb_unshare_pmd_ptdesc(tlb, virt_to_ptdesc(ptep), addr);
+
mm_dec_nr_pmds(mm);
return 1;
}
+/*
+ * huge_pmd_unshare_flush - Complete a sequence of huge_pmd_unshare() calls
+ * @tlb: the current mmu_gather.
+ * @vma: the vma covering the pmd table.
+ *
+ * Perform necessary TLB flushes or IPI broadcasts to synchronize PMD table
+ * unsharing with concurrent page table walkers (TLB, GUP-fast, etc.).
+ *
+ * This function must be called after a sequence of huge_pmd_unshare()
+ * calls while still holding the i_mmap_rwsem.
+ */
+void huge_pmd_unshare_flush(struct mmu_gather *tlb, struct vm_area_struct *vma)
+{
+ /*
+ * We must synchronize page table unsharing such that nobody will
+ * try reusing a previously-shared page table while it might still
+ * be in use by previous sharers (TLB, GUP_fast).
+ */
+ i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+
+ tlb_flush_unshared_tables(tlb);
+}
+
#else /* !CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */
pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -6947,12 +6957,16 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
return NULL;
}
-int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep)
+int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
{
return 0;
}
+void huge_pmd_unshare_flush(struct mmu_gather *tlb, struct vm_area_struct *vma)
+{
+}
+
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
unsigned long *start, unsigned long *end)
{
@@ -7219,6 +7233,7 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
unsigned long sz = huge_page_size(h);
struct mm_struct *mm = vma->vm_mm;
struct mmu_notifier_range range;
+ struct mmu_gather tlb;
unsigned long address;
spinlock_t *ptl;
pte_t *ptep;
@@ -7229,6 +7244,7 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
if (start >= end)
return;
+ tlb_gather_mmu(&tlb, mm);
flush_cache_range(vma, start, end);
/*
* No need to call adjust_range_if_pmd_sharing_possible(), because
@@ -7248,10 +7264,10 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
if (!ptep)
continue;
ptl = huge_pte_lock(h, mm, ptep);
- huge_pmd_unshare(mm, vma, address, ptep);
+ huge_pmd_unshare(&tlb, vma, address, ptep);
spin_unlock(ptl);
}
- flush_hugetlb_tlb_range(vma, start, end);
+ huge_pmd_unshare_flush(&tlb, vma);
if (take_locks) {
i_mmap_unlock_write(vma->vm_file->f_mapping);
hugetlb_vma_unlock_write(vma);
@@ -7261,6 +7277,7 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
* Documentation/mm/mmu_notifier.rst.
*/
mmu_notifier_invalidate_range_end(&range);
+ tlb_finish_mmu(&tlb);
}
/*
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 247e3f9db6c7a..822a790127375 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -468,6 +468,12 @@ void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm)
*/
void tlb_finish_mmu(struct mmu_gather *tlb)
{
+ /*
+ * We expect an earlier huge_pmd_unshare_flush() call to sort this out,
+ * due to complicated locking requirements with page table unsharing.
+ */
+ VM_WARN_ON_ONCE(tlb->fully_unshared_tables);
+
/*
* If there are parallel threads are doing PTE changes on same range
* under non-exclusive lock (e.g., mmap_lock read-side) but defer TLB
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 283889e4f1cec..5c330e817129e 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -652,7 +652,7 @@ long change_protection(struct mmu_gather *tlb,
#endif
if (is_vm_hugetlb_page(vma))
- pages = hugetlb_change_protection(vma, start, end, newprot,
+ pages = hugetlb_change_protection(tlb, vma, start, end, newprot,
cp_flags);
else
pages = change_protection_range(tlb, vma, start, end, newprot,
diff --git a/mm/rmap.c b/mm/rmap.c
index 748f48727a162..d6799afe11147 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -76,7 +76,7 @@
#include <linux/mm_inline.h>
#include <linux/oom.h>
-#include <asm/tlbflush.h>
+#include <asm/tlb.h>
#define CREATE_TRACE_POINTS
#include <trace/events/migrate.h>
@@ -2008,13 +2008,17 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
* if unsuccessful.
*/
if (!anon) {
+ struct mmu_gather tlb;
+
VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
if (!hugetlb_vma_trylock_write(vma))
goto walk_abort;
- if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
+
+ tlb_gather_mmu(&tlb, mm);
+ if (huge_pmd_unshare(&tlb, vma, address, pvmw.pte)) {
hugetlb_vma_unlock_write(vma);
- flush_tlb_range(vma,
- range.start, range.end);
+ huge_pmd_unshare_flush(&tlb, vma);
+ tlb_finish_mmu(&tlb);
/*
* The PMD table was unmapped,
* consequently unmapping the folio.
@@ -2022,6 +2026,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
goto walk_done;
}
hugetlb_vma_unlock_write(vma);
+ tlb_finish_mmu(&tlb);
}
pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
if (pte_dirty(pteval))
@@ -2398,17 +2403,20 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
* fail if unsuccessful.
*/
if (!anon) {
+ struct mmu_gather tlb;
+
VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
if (!hugetlb_vma_trylock_write(vma)) {
page_vma_mapped_walk_done(&pvmw);
ret = false;
break;
}
- if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
- hugetlb_vma_unlock_write(vma);
- flush_tlb_range(vma,
- range.start, range.end);
+ tlb_gather_mmu(&tlb, mm);
+ if (huge_pmd_unshare(&tlb, vma, address, pvmw.pte)) {
+ hugetlb_vma_unlock_write(vma);
+ huge_pmd_unshare_flush(&tlb, vma);
+ tlb_finish_mmu(&tlb);
/*
* The PMD table was unmapped,
* consequently unmapping the folio.
@@ -2417,6 +2425,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
break;
}
hugetlb_vma_unlock_write(vma);
+ tlb_finish_mmu(&tlb);
}
/* Nuke the hugetlb page table entry */
pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
--
2.52.0
Hi Sasha,
On 13-Dec-25 10:35, Sasha Levin wrote:
> This is a note to let you know that I've just added the patch titled
>
> media: ov02c10: Fix default vertical flip
This fix is incomplete: it leads to wrong colors, and it causes
the image to be upside down on some Dell XPS models where it
currently is the right way up.
There is a series of fixes which applies on top of this to
fix both issues:
https://lore.kernel.org/linux-media/20251210112436.167212-1-johannes.goede@…
For now (without the fixes on top) we are better off not adding
this patch to the stable series. Can you drop this patch
please?
Same for 6.17 and other stable series.
Regards,
Hans
>
> to the 6.18-stable tree which can be found at:
> http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
>
> The filename of the patch is:
> media-ov02c10-fix-default-vertical-flip.patch
> and it can be found in the queue-6.18 subdirectory.
>
> If you, or anyone else, feels it should not be added to the stable tree,
> please let <stable(a)vger.kernel.org> know about it.
>
>
>
> commit 14cc4474799a595caeccdb8fdf2ca4b867cef972
> Author: Sebastian Reichel <sre(a)kernel.org>
> Date: Wed Aug 20 02:13:19 2025 +0200
>
> media: ov02c10: Fix default vertical flip
>
> [ Upstream commit d5ebe3f7d13d4cee3ff7e718de23564915aaf163 ]
>
> The driver right now defaults to setting the vertical flip bit. This
> conflicts with proper handling of the rotation property defined in
> ACPI or device tree, so drop the VFLIP bit. It should be handled via
> V4L2_CID_VFLIP instead.
>
> Reported-by: Frederic Stuyk <fstuyk(a)runbox.com>
> Closes: https://lore.kernel.org/all/b6df9ae7-ea9f-4e5a-8065-5b130f534f37@runbox.com/
> Fixes: 44f89010dae0 ("media: i2c: Add Omnivision OV02C10 sensor driver")
> Signed-off-by: Sebastian Reichel <sre(a)kernel.org>
> Reviewed-by: Bryan O'Donoghue <bod(a)kernel.org>
> Signed-off-by: Sakari Ailus <sakari.ailus(a)linux.intel.com>
> Signed-off-by: Hans Verkuil <hverkuil+cisco(a)kernel.org>
> Signed-off-by: Sasha Levin <sashal(a)kernel.org>
>
> diff --git a/drivers/media/i2c/ov02c10.c b/drivers/media/i2c/ov02c10.c
> index 8c4d85dc7922e..8e22ff446b0c4 100644
> --- a/drivers/media/i2c/ov02c10.c
> +++ b/drivers/media/i2c/ov02c10.c
> @@ -174,7 +174,7 @@ static const struct reg_sequence sensor_1928x1092_30fps_setting[] = {
> {0x3816, 0x01},
> {0x3817, 0x01},
>
> - {0x3820, 0xb0},
> + {0x3820, 0xa0},
> {0x3821, 0x00},
> {0x3822, 0x80},
> {0x3823, 0x08},
This is the start of the stable review cycle for the 6.12.62 release.
There are 49 patches in this series, all will be posted as a response
to this one. If anyone has any issues with these being applied, please
let me know.
Responses should be made by Fri, 12 Dec 2025 07:29:38 +0000.
Anything received after that time might be too late.
The whole patch series can be found in one patch at:
https://www.kernel.org/pub/linux/kernel/v6.x/stable-review/patch-6.12.62-rc…
or in the git tree and branch at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-6.12.y
and the diffstat can be found below.
thanks,
greg k-h
-------------
Pseudo-Shortlog of commits:
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Linux 6.12.62-rc1
Daniele Palmas <dnlplm(a)gmail.com>
bus: mhi: host: pci_generic: Add Telit FN990B40 modem support
Daniele Palmas <dnlplm(a)gmail.com>
bus: mhi: host: pci_generic: Add Telit FN920C04 modem support
Navaneeth K <knavaneeth786(a)gmail.com>
staging: rtl8723bs: fix out-of-bounds read in OnBeacon ESR IE parsing
Navaneeth K <knavaneeth786(a)gmail.com>
staging: rtl8723bs: fix stack buffer overflow in OnAssocReq IE parsing
Navaneeth K <knavaneeth786(a)gmail.com>
staging: rtl8723bs: fix out-of-bounds read in rtw_get_ie() parser
Nikita Zhandarovich <n.zhandarovich(a)fintech.ru>
comedi: check device's attached status in compat ioctls
Nikita Zhandarovich <n.zhandarovich(a)fintech.ru>
comedi: multiq3: sanitize config options in multiq3_attach()
Ian Abbott <abbotti(a)mev.co.uk>
comedi: c6xdigio: Fix invalid PNP driver unregistration
Zenm Chen <zenmchen(a)gmail.com>
wifi: rtw88: Add USB ID 2001:3329 for D-Link AC13U rev. A1
Zenm Chen <zenmchen(a)gmail.com>
wifi: rtl8xxxu: Add USB ID 2001:3328 for D-Link AN3U rev. A1
Linus Torvalds <torvalds(a)linux-foundation.org>
samples: work around glibc redefining some of our defines wrong
Huacai Chen <chenhuacai(a)kernel.org>
LoongArch: Mask all interrupts during kexec/kdump
Naoki Ueki <naoki25519(a)gmail.com>
HID: elecom: Add support for ELECOM M-XT3URBK (018F)
Antheas Kapenekakis <lkml(a)antheas.dev>
platform/x86/amd/pmc: Add spurious_8042 to Xbox Ally
Antheas Kapenekakis <lkml(a)antheas.dev>
platform/x86/amd: pmc: Add Lenovo Legion Go 2 to pmc quirk list
Jia Ston <ston.jia(a)outlook.com>
platform/x86: huawei-wmi: add keys for HONOR models
April Grimoire <april(a)aprilg.moe>
HID: apple: Add SONiX AK870 PRO to non_apple_keyboards quirk list
Armin Wolf <W_Armin(a)gmx.de>
platform/x86: acer-wmi: Ignore backlight event
Praveen Talari <praveen.talari(a)oss.qualcomm.com>
pinctrl: qcom: msm: Fix deadlock in pinmux configuration
Keith Busch <kbusch(a)kernel.org>
nvme: fix admin request_queue lifetime
Mario Limonciello (AMD) <superm1(a)kernel.org>
HID: hid-input: Extend Elan ignore battery quirk to USB
Tetsuo Handa <penguin-kernel(a)I-love.SAKURA.ne.jp>
bfs: Reconstruct file type when loading from disk
Lushih Hsieh <bruce(a)mail.kh.edu.tw>
ALSA: usb-audio: Add native DSD quirks for PureAudio DAC series
Harish Kasiviswanathan <Harish.Kasiviswanathan(a)amd.com>
drm/amdkfd: Fix GPU mappings for APU after prefetch
Yiqi Sun <sunyiqixm(a)gmail.com>
smb: fix invalid username check in smb3_fs_context_parse_param()
Max Chou <max.chou(a)realtek.com>
Bluetooth: btrtl: Avoid loading the config file on security chips
Ian Forbes <ian.forbes(a)broadcom.com>
drm/vmwgfx: Use kref in vmw_bo_dirty
Robin Gong <yibin.gong(a)nxp.com>
spi: imx: keep dma request disabled before dma transfer setup
Alvaro Gamez Machado <alvaro.gamez(a)hazent.com>
spi: xilinx: increase number of retries before declaring stall
Song Liu <song(a)kernel.org>
ftrace: bpf: Fix IPMODIFY + DIRECT in modify_ftrace_direct()
Johan Hovold <johan(a)kernel.org>
USB: serial: kobil_sct: fix TIOCMBIS and TIOCMBIC
Johan Hovold <johan(a)kernel.org>
USB: serial: belkin_sa: fix TIOCMBIS and TIOCMBIC
Magne Bruno <magne.bruno(a)addi-data.com>
serial: add support of CPCI cards
Johan Hovold <johan(a)kernel.org>
USB: serial: ftdi_sio: match on interface number for jtag
Fabio Porcedda <fabio.porcedda(a)gmail.com>
USB: serial: option: move Telit 0x10c7 composition in the right place
Fabio Porcedda <fabio.porcedda(a)gmail.com>
USB: serial: option: add Telit Cinterion FE910C04 new compositions
Slark Xiao <slark_xiao(a)163.com>
USB: serial: option: add Foxconn T99W760
Omar Sandoval <osandov(a)fb.com>
KVM: SVM: Don't skip unrelated instruction if INT3/INTO is replaced
Nikita Zhandarovich <n.zhandarovich(a)fintech.ru>
comedi: pcl818: fix null-ptr-deref in pcl818_ai_cancel()
Alexey Nepomnyashih <sdl(a)nppct.ru>
ext4: add i_data_sem protection in ext4_destroy_inline_data_nolock()
Alexander Sverdlin <alexander.sverdlin(a)siemens.com>
locking/spinlock/debug: Fix data-race in do_raw_write_lock
Qianchang Zhao <pioooooooooip(a)gmail.com>
ksmbd: ipc: fix use-after-free in ipc_msg_send_request
Deepanshu Kartikey <kartikey406(a)gmail.com>
ext4: refresh inline data size before write operations
Ye Bin <yebin10(a)huawei.com>
jbd2: avoid bug_on in jbd2_journal_get_create_access() when file system corrupted
Bagas Sanjaya <bagasdotme(a)gmail.com>
Documentation: process: Also mention Sasha Levin as stable tree maintainer
Sabrina Dubroca <sd(a)queasysnail.net>
xfrm: flush all states in xfrm_state_fini
Sabrina Dubroca <sd(a)queasysnail.net>
xfrm: also call xfrm_state_delete_tunnel at destroy time for states that were never added
Sabrina Dubroca <sd(a)queasysnail.net>
Revert "xfrm: destroy xfrm_state synchronously on net exit path"
Sabrina Dubroca <sd(a)queasysnail.net>
xfrm: delete x->tunnel as we delete x
-------------
Diffstat:
Documentation/process/2.Process.rst | 6 ++-
Makefile | 4 +-
arch/loongarch/kernel/machine_kexec.c | 2 +
arch/x86/include/asm/kvm_host.h | 9 ++++
arch/x86/kvm/svm/svm.c | 24 +++++----
arch/x86/kvm/x86.c | 21 ++++++++
drivers/bluetooth/btrtl.c | 24 +++++----
drivers/bus/mhi/host/pci_generic.c | 52 +++++++++++++++++++
drivers/comedi/comedi_fops.c | 42 ++++++++++++---
drivers/comedi/drivers/c6xdigio.c | 46 ++++++++++++----
drivers/comedi/drivers/multiq3.c | 9 ++++
drivers/comedi/drivers/pcl818.c | 5 +-
drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 2 +
drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c | 12 ++---
drivers/hid/hid-apple.c | 1 +
drivers/hid/hid-elecom.c | 6 ++-
drivers/hid/hid-ids.h | 3 +-
drivers/hid/hid-input.c | 5 +-
drivers/hid/hid-quirks.c | 3 +-
drivers/net/wireless/realtek/rtl8xxxu/core.c | 3 ++
drivers/net/wireless/realtek/rtw88/rtw8822cu.c | 2 +
drivers/nvme/host/core.c | 3 +-
drivers/pinctrl/qcom/pinctrl-msm.c | 2 +-
drivers/platform/x86/acer-wmi.c | 4 ++
drivers/platform/x86/amd/pmc/pmc-quirks.c | 25 +++++++++
drivers/platform/x86/huawei-wmi.c | 4 ++
drivers/spi/spi-imx.c | 15 ++++--
drivers/spi/spi-xilinx.c | 2 +-
drivers/staging/rtl8723bs/core/rtw_ieee80211.c | 14 ++---
drivers/staging/rtl8723bs/core/rtw_mlme_ext.c | 13 +++--
drivers/tty/serial/8250/8250_pci.c | 37 +++++++++++++
drivers/usb/serial/belkin_sa.c | 28 ++++++----
drivers/usb/serial/ftdi_sio.c | 72 +++++++++-----------------
drivers/usb/serial/kobil_sct.c | 18 +++----
drivers/usb/serial/option.c | 22 ++++++--
fs/bfs/inode.c | 19 ++++++-
fs/ext4/inline.c | 14 ++++-
fs/jbd2/transaction.c | 19 +++++--
fs/smb/client/fs_context.c | 2 +-
fs/smb/server/transport_ipc.c | 7 ++-
include/net/xfrm.h | 13 ++---
kernel/locking/spinlock_debug.c | 4 +-
kernel/trace/ftrace.c | 40 ++++++++++----
net/ipv4/ipcomp.c | 2 +
net/ipv6/ipcomp6.c | 2 +
net/ipv6/xfrm6_tunnel.c | 2 +-
net/key/af_key.c | 2 +-
net/xfrm/xfrm_ipcomp.c | 1 -
net/xfrm/xfrm_state.c | 41 ++++++---------
net/xfrm/xfrm_user.c | 2 +-
samples/vfs/test-statx.c | 6 +++
samples/watch_queue/watch_test.c | 6 +++
sound/usb/quirks.c | 6 +++
53 files changed, 521 insertions(+), 207 deletions(-)
In `scmi_devm_notifier_unregister()`, the notifier-block parameter
was unused and therefore never passed to `devres_release`. This causes
the function to always return -ENOENT and to fail to unregister the
notifier.
In drivers that rely on this function for cleanup, this causes
unexpected failures, including kernel panics.
This fix is not needed upstream because the bug was fixed there
in a refactor by commit 264a2c520628 ("firmware: arm_scmi: Simplify
scmi_devm_notifier_unregister"). It is needed for the 5.15, 6.1 and
6.6 kernels.
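For context, here is a minimal sketch of how a devres-based unregister is
expected to find the registered resource. The field names follow the dres
assignments in the diff below; the struct layout and the match helper are
illustrative assumptions, not the exact SCMI code:

#include <linux/device.h>
#include <linux/notifier.h>

/* Assumed shape of the per-registration devres payload (illustrative). */
struct scmi_notifier_devres {
	const void *handle;
	u8 proto_id;
	u8 evt_id;
	u32 __src_id;
	u32 *src_id;
	struct notifier_block *nb;
};

static int notifier_devres_match(struct device *dev, void *res, void *data)
{
	struct scmi_notifier_devres *dres = res;	/* stored at register time */
	struct scmi_notifier_devres *key = data;	/* built by the unregister path */

	/*
	 * If the unregister path never fills in key->nb, this comparison
	 * cannot match the stored resource, so devres_release() finds
	 * nothing and returns -ENOENT, which is the bug fixed below.
	 */
	return dres->handle == key->handle &&
	       dres->proto_id == key->proto_id &&
	       dres->evt_id == key->evt_id &&
	       dres->nb == key->nb;
}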
Cc: <stable(a)vger.kernel.org> # 5.15.x, 6.1.x, and 6.6.x
Fixes: 5ad3d1cf7d34 ("firmware: arm_scmi: Introduce new devres notification ops")
Reviewed-by: Dan Carpenter <dan.carpenter(a)linaro.org>
Reviewed-by: Cristian Marussi <cristian.marussi(a)arm.com>
Signed-off-by: Amitai Gottlieb <amitaig(a)hailo.ai>
---
drivers/firmware/arm_scmi/notify.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/firmware/arm_scmi/notify.c b/drivers/firmware/arm_scmi/notify.c
index 0efd20cd9d69..4782b115e6ec 100644
--- a/drivers/firmware/arm_scmi/notify.c
+++ b/drivers/firmware/arm_scmi/notify.c
@@ -1539,6 +1539,7 @@ static int scmi_devm_notifier_unregister(struct scmi_device *sdev,
dres.handle = sdev->handle;
dres.proto_id = proto_id;
dres.evt_id = evt_id;
+ dres.nb = nb;
if (src_id) {
dres.__src_id = *src_id;
dres.src_id = &dres.__src_id;
--
2.34.1
Previously, a sender thread in mbox_send_message() could be woken up at
the wrong time in blocking mode. This is because there was only a single
completion per channel, whereas messages from multiple threads could be
sent on the same channel in any order; since the shared completion could
be signalled in any order, it could wake up the wrong sender thread.
This commit resolves the false wake-up issue with the following changes:
- As many completions are created as there are concurrent sender threads
- A completion is created on a sender thread's stack
- Each slot of the message queue, i.e. `msg_data`, contains a pointer to
its target completion
- tx_tick() signals the completion of the currently active slot of the
message queue
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/all/1490809381-28869-1-git-send-email-jaswinder.sin…
Signed-off-by: Joonwon Kang <joonwonkang(a)google.com>
---
Link -> v1: The previous solution in the Link above used a per-message
completion, `tx_cmpl[MBOX_TX_QUEUE_LEN]`, where each completion belongs
to one slot of the message queue, `msg_data[i]`. Those completions take
up additional memory even when they are not used. Instead, this patch
uses a per-thread completion: each completion belongs to one sender
thread, and each slot of the message queue has a pointer to that
completion. `struct mbox_message` has the pointer field
`struct completion *tx_complete`, which points to the completion created
on the stack of the sender, instead of owning the completion as
`struct completion tx_complete`. This way, additional memory is used
only when a completion is actually needed. More importantly, it also
closes the window, still present in the previous solution, in which the
same completion could be reused by different sender threads.
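A minimal kernel-style sketch of the per-sender on-stack completion
pattern described above; the names (msg_slot, send_blocking,
slot_tx_done) are simplified stand-ins, not the actual mailbox.c code:

#include <linux/completion.h>
#include <linux/errno.h>

/* Simplified stand-in for one slot of the per-channel message queue. */
struct msg_slot {
	void *data;
	struct completion *tx_complete;	/* points into the sender's stack */
};

/* Blocking sender: each caller owns exactly one completion, on its stack. */
static int send_blocking(struct msg_slot *slot, void *data,
			 unsigned long timeout)
{
	struct completion done;

	init_completion(&done);
	slot->data = data;
	slot->tx_complete = &done;	/* queue the message with its completion */

	/* ... kick the controller to transmit slot->data ... */

	/* Only this sender wakes up when *its* slot is signalled. */
	if (!wait_for_completion_timeout(&done, timeout))
		return -ETIME;
	return 0;
}

/* TX-done path: signal the completion of the slot that just finished. */
static void slot_tx_done(struct msg_slot *active)
{
	if (active->tx_complete)
		complete(active->tx_complete);
}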
drivers/mailbox/mailbox.c | 43 +++++++++++++++++++-----------
drivers/mailbox/tegra-hsp.c | 2 +-
include/linux/mailbox_controller.h | 20 +++++++++-----
3 files changed, 43 insertions(+), 22 deletions(-)
diff --git a/drivers/mailbox/mailbox.c b/drivers/mailbox/mailbox.c
index 617ba505691d..0afe3ae3bfdc 100644
--- a/drivers/mailbox/mailbox.c
+++ b/drivers/mailbox/mailbox.c
@@ -23,7 +23,7 @@
static LIST_HEAD(mbox_cons);
static DEFINE_MUTEX(con_mutex);
-static int add_to_rbuf(struct mbox_chan *chan, void *mssg)
+static int add_to_rbuf(struct mbox_chan *chan, void *mssg, struct completion *tx_complete)
{
int idx;
@@ -34,7 +34,8 @@ static int add_to_rbuf(struct mbox_chan *chan, void *mssg)
return -ENOBUFS;
idx = chan->msg_free;
- chan->msg_data[idx] = mssg;
+ chan->msg_data[idx].data = mssg;
+ chan->msg_data[idx].tx_complete = tx_complete;
chan->msg_count++;
if (idx == MBOX_TX_QUEUE_LEN - 1)
@@ -52,7 +53,7 @@ static void msg_submit(struct mbox_chan *chan)
int err = -EBUSY;
scoped_guard(spinlock_irqsave, &chan->lock) {
- if (!chan->msg_count || chan->active_req)
+ if (!chan->msg_count || chan->active_req >= 0)
break;
count = chan->msg_count;
@@ -62,14 +63,14 @@ static void msg_submit(struct mbox_chan *chan)
else
idx += MBOX_TX_QUEUE_LEN - count;
- data = chan->msg_data[idx];
+ data = chan->msg_data[idx].data;
if (chan->cl->tx_prepare)
chan->cl->tx_prepare(chan->cl, data);
/* Try to submit a message to the MBOX controller */
err = chan->mbox->ops->send_data(chan, data);
if (!err) {
- chan->active_req = data;
+ chan->active_req = idx;
chan->msg_count--;
}
}
@@ -83,11 +84,17 @@ static void msg_submit(struct mbox_chan *chan)
static void tx_tick(struct mbox_chan *chan, int r)
{
- void *mssg;
+ int idx;
+ void *mssg = NULL;
+ struct completion *tx_complete = NULL;
scoped_guard(spinlock_irqsave, &chan->lock) {
- mssg = chan->active_req;
- chan->active_req = NULL;
+ idx = chan->active_req;
+ if (idx >= 0) {
+ mssg = chan->msg_data[idx].data;
+ tx_complete = chan->msg_data[idx].tx_complete;
+ chan->active_req = -1;
+ }
}
/* Submit next message */
@@ -101,7 +108,7 @@ static void tx_tick(struct mbox_chan *chan, int r)
chan->cl->tx_done(chan->cl, mssg, r);
if (r != -ETIME && chan->cl->tx_block)
- complete(&chan->tx_complete);
+ complete(tx_complete);
}
static enum hrtimer_restart txdone_hrtimer(struct hrtimer *hrtimer)
@@ -114,7 +121,7 @@ static enum hrtimer_restart txdone_hrtimer(struct hrtimer *hrtimer)
for (i = 0; i < mbox->num_chans; i++) {
struct mbox_chan *chan = &mbox->chans[i];
- if (chan->active_req && chan->cl) {
+ if (chan->active_req >= 0 && chan->cl) {
txdone = chan->mbox->ops->last_tx_done(chan);
if (txdone)
tx_tick(chan, 0);
@@ -245,11 +252,18 @@ EXPORT_SYMBOL_GPL(mbox_client_peek_data);
int mbox_send_message(struct mbox_chan *chan, void *mssg)
{
int t;
+ struct completion tx_complete;
if (!chan || !chan->cl)
return -EINVAL;
- t = add_to_rbuf(chan, mssg);
+ if (chan->cl->tx_block) {
+ init_completion(&tx_complete);
+ t = add_to_rbuf(chan, mssg, &tx_complete);
+ } else {
+ t = add_to_rbuf(chan, mssg, NULL);
+ }
+
if (t < 0) {
dev_err(chan->mbox->dev, "Try increasing MBOX_TX_QUEUE_LEN\n");
return t;
@@ -266,7 +280,7 @@ int mbox_send_message(struct mbox_chan *chan, void *mssg)
else
wait = msecs_to_jiffies(chan->cl->tx_tout);
- ret = wait_for_completion_timeout(&chan->tx_complete, wait);
+ ret = wait_for_completion_timeout(&tx_complete, wait);
if (ret == 0) {
t = -ETIME;
tx_tick(chan, t);
@@ -319,9 +333,8 @@ static int __mbox_bind_client(struct mbox_chan *chan, struct mbox_client *cl)
scoped_guard(spinlock_irqsave, &chan->lock) {
chan->msg_free = 0;
chan->msg_count = 0;
- chan->active_req = NULL;
+ chan->active_req = -1;
chan->cl = cl;
- init_completion(&chan->tx_complete);
if (chan->txdone_method == TXDONE_BY_POLL && cl->knows_txdone)
chan->txdone_method = TXDONE_BY_ACK;
@@ -477,7 +490,7 @@ void mbox_free_channel(struct mbox_chan *chan)
/* The queued TX requests are simply aborted, no callbacks are made */
scoped_guard(spinlock_irqsave, &chan->lock) {
chan->cl = NULL;
- chan->active_req = NULL;
+ chan->active_req = -1;
if (chan->txdone_method == TXDONE_BY_ACK)
chan->txdone_method = TXDONE_BY_POLL;
}
diff --git a/drivers/mailbox/tegra-hsp.c b/drivers/mailbox/tegra-hsp.c
index ed9a0bb2bcd8..de7494ce0a9f 100644
--- a/drivers/mailbox/tegra-hsp.c
+++ b/drivers/mailbox/tegra-hsp.c
@@ -497,7 +497,7 @@ static int tegra_hsp_mailbox_flush(struct mbox_chan *chan,
mbox_chan_txdone(chan, 0);
/* Wait until channel is empty */
- if (chan->active_req != NULL)
+ if (chan->active_req >= 0)
continue;
return 0;
diff --git a/include/linux/mailbox_controller.h b/include/linux/mailbox_controller.h
index 80a427c7ca29..67e08a440f5f 100644
--- a/include/linux/mailbox_controller.h
+++ b/include/linux/mailbox_controller.h
@@ -105,16 +105,25 @@ struct mbox_controller {
*/
#define MBOX_TX_QUEUE_LEN 20
+/**
+ * struct mbox_message - Internal representation of a mailbox message
+ * @data: Data packet
+ * @tx_complete: Pointer to the transmission completion
+ */
+struct mbox_message {
+ void *data;
+ struct completion *tx_complete;
+};
+
/**
* struct mbox_chan - s/w representation of a communication chan
* @mbox: Pointer to the parent/provider of this channel
* @txdone_method: Way to detect TXDone chosen by the API
* @cl: Pointer to the current owner of this channel
- * @tx_complete: Transmission completion
- * @active_req: Currently active request hook
+ * @active_req: Index of the currently active slot in the queue
* @msg_count: No. of mssg currently queued
* @msg_free: Index of next available mssg slot
- * @msg_data: Hook for data packet
+ * @msg_data: Queue of data packets
* @lock: Serialise access to the channel
* @con_priv: Hook for controller driver to attach private data
*/
@@ -122,10 +131,9 @@ struct mbox_chan {
struct mbox_controller *mbox;
unsigned txdone_method;
struct mbox_client *cl;
- struct completion tx_complete;
- void *active_req;
+ int active_req;
unsigned msg_count, msg_free;
- void *msg_data[MBOX_TX_QUEUE_LEN];
+ struct mbox_message msg_data[MBOX_TX_QUEUE_LEN];
spinlock_t lock; /* Serialise access to the channel */
void *con_priv;
};
--
2.52.0.239.gd5f0c6e74e-goog
Backport commit 5701875f9609 ("ext4: fix out-of-bound read in
ext4_xattr_inode_dec_ref_all()") to the linux-5.10 branch.
The fix depends on commit 69f3a3039b0d ("ext4: introduce ITAIL helper").
In order to make a clean backport to the stable kernel, both commits are
backported.
There is a single merge conflict, where static inline int changed
to static int.
To: stable(a)vger.kernel.org
Cc: Theodore Ts'o <tytso(a)mit.edu>
Cc: Ye Bin <yebin10(a)huawei.com>
Cc: Sasha Levin <sashal(a)kernel.org>
Signed-off-by: David Nyström <david.nystrom(a)est.tech>
---
Ye Bin (2):
ext4: introduce ITAIL helper
ext4: fix out-of-bound read in ext4_xattr_inode_dec_ref_all()
fs/ext4/inode.c | 5 +++++
fs/ext4/xattr.c | 32 ++++----------------------------
fs/ext4/xattr.h | 10 ++++++++++
3 files changed, 19 insertions(+), 28 deletions(-)
---
base-commit: f964b940099f9982d723d4c77988d4b0dda9c165
change-id: 20251215-ext4_splat-f59c1acd9e88
Best regards,
--
David Nyström <david.nystrom(a)est.tech>
Commit 9a7c987fb92b ("crypto: arm64/ghash - Use API partial block
handling") made ghash_finup() pass the wrong buffer to
ghash_do_simd_update(). As a result, ghash-neon now produces incorrect
outputs when the message length isn't divisible by 16 bytes. Fix this.
(I didn't notice this earlier because this code is reached only on CPUs
that support NEON but not PMULL. I haven't yet found a way to get
qemu-system-aarch64 to emulate that configuration.)
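For illustration, a small userspace sketch of the generic partial-block
pattern involved here (not the kernel's GHASH code): the trailing short
chunk must be copied into a zero-padded block, and that padded buffer
(not the original source pointer) handed to the per-block transform.
process_block() is a hypothetical stand-in for the real update function.

#include <stdio.h>
#include <string.h>

#define BLOCK_SIZE 16

/* Stand-in for the real per-block transform (e.g. one GHASH update). */
static void process_block(const unsigned char *block)
{
	for (int i = 0; i < BLOCK_SIZE; i++)
		printf("%02x", block[i]);
	printf("\n");
}

int main(void)
{
	const unsigned char msg[] = "a message whose length is not a multiple of sixteen";
	size_t len = sizeof(msg) - 1;
	size_t full = len / BLOCK_SIZE * BLOCK_SIZE;
	size_t off;

	for (off = 0; off < full; off += BLOCK_SIZE)
		process_block(msg + off);

	if (len % BLOCK_SIZE) {
		unsigned char buf[BLOCK_SIZE] = { 0 };

		memcpy(buf, msg + full, len % BLOCK_SIZE);
		process_block(buf);	/* pass buf, not msg + full */
	}
	return 0;
}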
Fixes: 9a7c987fb92b ("crypto: arm64/ghash - Use API partial block handling")
Cc: stable(a)vger.kernel.org
Reported-by: Diederik de Haas <diederik(a)cknow-tech.com>
Closes: https://lore.kernel.org/linux-crypto/DETXT7QI62KE.F3CGH2VWX1SC@cknow-tech.c…
Signed-off-by: Eric Biggers <ebiggers(a)kernel.org>
---
If it's okay, I'd like to just take this via libcrypto-fixes.
arch/arm64/crypto/ghash-ce-glue.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c
index 7951557a285a..ef249d06c92c 100644
--- a/arch/arm64/crypto/ghash-ce-glue.c
+++ b/arch/arm64/crypto/ghash-ce-glue.c
@@ -131,11 +131,11 @@ static int ghash_finup(struct shash_desc *desc, const u8 *src,
if (len) {
u8 buf[GHASH_BLOCK_SIZE] = {};
memcpy(buf, src, len);
- ghash_do_simd_update(1, ctx->digest, src, key, NULL,
+ ghash_do_simd_update(1, ctx->digest, buf, key, NULL,
pmull_ghash_update_p8);
memzero_explicit(buf, sizeof(buf));
}
return ghash_export(desc, dst);
}
base-commit: 7a3984bbd69055898add0fe22445f99435f33450
--
2.52.0
When the filesystem is being mounted, the kernel panics while the data
regarding slot map allocation to the local node is being written to the
disk. This occurs because the block number of the slot map buffer head,
which should be greater than or equal to `OCFS2_SUPER_BLOCK_BLKNO`
(which evaluates to 2), is smaller than it, indicating on-disk metadata
corruption. This triggers
BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO) in ocfs2_write_block(),
causing the kernel to panic.
This is fixed by introducing ocfs2_validate_slot_map_block() to validate
slot map blocks. It first checks, via BUG_ON(), that the buffer head
passed to it is up to date, panicking the kernel at that point otherwise.
It then checks whether `bh->b_blocknr` is smaller than
`OCFS2_SUPER_BLOCK_BLKNO`; if so, ocfs2_error() is called to log the
error for debugging purposes and its return value is propagated.
Otherwise, ocfs2_validate_slot_map_block() returns 0.
This function is used as the validate callback in the calls to
ocfs2_read_blocks() in ocfs2_refresh_slot_info() and
ocfs2_map_slot_buffers().
Reported-by: syzbot+c818e5c4559444f88aa0(a)syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=c818e5c4559444f88aa0
Tested-by: syzbot+c818e5c4559444f88aa0(a)syzkaller.appspotmail.com
Cc: stable(a)vger.kernel.org
Signed-off-by: Prithvi Tambewagh <activprithvi(a)gmail.com>
---
v3->v4:
- Remove if condition in ocfs2_validate_slot_map_block() which checks if
`rc` is zero
- Update commit log message
v3 link: https://lore.kernel.org/ocfs2-devel/tagu2npibmto5bgonhorg5krbvqho4zxsv5pulv…
v2->v3:
- Create new function ocfs2_validate_slot_map_block() to validate that
the block number of slot map blocks is greater than or equal to
OCFS2_SUPER_BLOCK_BLKNO
- Use ocfs2_validate_slot_map_block() in calls to ocfs2_read_blocks() in
ocfs2_refresh_slot_info() and ocfs2_map_slot_buffers()
- In addition to the previously formulated if block in
ocfs2_validate_slot_map_block(), also check whether the buffer head
passed to this function is up to date; if not, the kernel panics at
that point
- Update title of patch to 'ocfs2: Add validate function for slot map blocks'
v2 link: https://lore.kernel.org/ocfs2-devel/nwkfpkm2wlajswykywnpt4sc6gdkesakw2sw7et…
v1->v2:
- Remove usage of le16_to_cpu() from ocfs2_error()
- Cast bh->b_blocknr to unsigned long long
- Remove type casting for OCFS2_SUPER_BLOCK_BLKNO
- Fix Sparse warnings reported in v1 by kernel test robot
- Update title from 'ocfs2: Fix kernel BUG in ocfs2_write_block' to
'ocfs2: fix kernel BUG in ocfs2_write_block'
v1 link: https://lore.kernel.org/all/20251206154819.175479-1-activprithvi@gmail.com/…
fs/ocfs2/slot_map.c | 27 +++++++++++++++++++++++++--
1 file changed, 25 insertions(+), 2 deletions(-)
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index e544c704b583..ea4a68abc25b 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -44,6 +44,9 @@ struct ocfs2_slot_info {
static int __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
unsigned int node_num);
+static int ocfs2_validate_slot_map_block(struct super_block *sb,
+ struct buffer_head *bh);
+
static void ocfs2_invalidate_slot(struct ocfs2_slot_info *si,
int slot_num)
{
@@ -132,7 +135,8 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
* this is not true, the read of -1 (UINT64_MAX) will fail.
*/
ret = ocfs2_read_blocks(INODE_CACHE(si->si_inode), -1, si->si_blocks,
- si->si_bh, OCFS2_BH_IGNORE_CACHE, NULL);
+ si->si_bh, OCFS2_BH_IGNORE_CACHE,
+ ocfs2_validate_slot_map_block);
if (ret == 0) {
spin_lock(&osb->osb_lock);
ocfs2_update_slot_info(si);
@@ -332,6 +336,24 @@ int ocfs2_clear_slot(struct ocfs2_super *osb, int slot_num)
return ocfs2_update_disk_slot(osb, osb->slot_info, slot_num);
}
+static int ocfs2_validate_slot_map_block(struct super_block *sb,
+ struct buffer_head *bh)
+{
+ int rc;
+
+ BUG_ON(!buffer_uptodate(bh));
+
+ if (bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO) {
+ rc = ocfs2_error(sb,
+ "Invalid Slot Map Buffer Head "
+ "Block Number : %llu, Should be >= %d",
+ (unsigned long long)bh->b_blocknr,
+ OCFS2_SUPER_BLOCK_BLKNO);
+ return rc;
+ }
+ return 0;
+}
+
static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
struct ocfs2_slot_info *si)
{
@@ -383,7 +405,8 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
bh = NULL; /* Acquire a fresh bh */
status = ocfs2_read_blocks(INODE_CACHE(si->si_inode), blkno,
- 1, &bh, OCFS2_BH_IGNORE_CACHE, NULL);
+ 1, &bh, OCFS2_BH_IGNORE_CACHE,
+ ocfs2_validate_slot_map_block);
if (status < 0) {
mlog_errno(status);
goto bail;
base-commit: 24172e0d79900908cf5ebf366600616d29c9b417
--
2.43.0
`build_assert` relies on the compiler to optimize out its error path,
lest the build fail with the dreaded error:
ERROR: modpost: "rust_build_error" [path/to/module.ko] undefined!
It has been observed that very trivial code performing I/O accesses
(sometimes even using an immediate value) would seemingly randomly fail
with this error whenever `CLIPPY=1` was set. The same behavior was also
observed under different, very similar conditions [1][2].
The cause, as pointed out by Gary Guo [3], appears to be that the
failing function eventually uses `build_assert` with its argument,
but is only annotated with `#[inline]`. This gives the compiler the
freedom to leave the function out-of-line, which it notably did when
Clippy was active, triggering the error.
The fix is to annotate functions that pass their argument to
`build_assert` with `#[inline(always)]`, telling the compiler to be as
aggressive as possible with their inlining. This is also the right
annotation, since inlining is mandatory for correctness in these cases.
This series fixes all possible points of failure in the kernel crate,
and adds documentation to `build_assert` explaining how to properly
inline functions for which this behavior may arise.
[1] https://lore.kernel.org/all/DEEUYUOAEZU3.1J1HM2YQ10EX1@nvidia.com/
[2] https://lore.kernel.org/all/A1A280D4-836E-4D75-863E-30B1C276C80C@collabora.…
[3] https://lore.kernel.org/all/20251121143008.2f5acc33.gary@garyguo.net/
Signed-off-by: Alexandre Courbot <acourbot(a)nvidia.com>
---
Changes in v3:
- Add "Fixes:" tags.
- CC stable on fixup patches.
- Link to v2: https://patch.msgid.link/20251128-io-build-assert-v2-0-a9ea9ce7d45d@nvidia.…
Changes in v2:
- Turn into a series and address other similar cases in the kernel crate.
- Link to v1: https://patch.msgid.link/20251127-io-build-assert-v1-1-04237f2e5850@nvidia.…
---
Alexandre Courbot (7):
rust: build_assert: add instructions for use with function arguments
rust: io: always inline functions using build_assert with arguments
rust: cpufreq: always inline functions using build_assert with arguments
rust: bits: always inline functions using build_assert with arguments
rust: sync: refcount: always inline functions using build_assert with arguments
rust: irq: always inline functions using build_assert with arguments
rust: num: bounded: add missing comment for always inlined function
rust/kernel/bits.rs | 6 ++++--
rust/kernel/build_assert.rs | 7 ++++++-
rust/kernel/cpufreq.rs | 2 ++
rust/kernel/io.rs | 9 ++++++---
rust/kernel/io/resource.rs | 2 ++
rust/kernel/irq/flags.rs | 2 ++
rust/kernel/num/bounded.rs | 1 +
rust/kernel/sync/refcount.rs | 3 ++-
8 files changed, 25 insertions(+), 7 deletions(-)
---
base-commit: ba65a4e7120a616d9c592750d9147f6dcafedffa
change-id: 20251127-io-build-assert-3579a5bfb81c
Best regards,
--
Alexandre Courbot <acourbot(a)nvidia.com>
The patch titled
Subject: tools/mm/page_owner_sort: fix timestamp comparison for stable sorting
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
tools-mm-page_owner_sort-fix-timestamp-comparison-for-stable-sorting.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Kaushlendra Kumar <kaushlendra.kumar(a)intel.com>
Subject: tools/mm/page_owner_sort: fix timestamp comparison for stable sorting
Date: Tue, 9 Dec 2025 10:15:52 +0530
The ternary operator in compare_ts() returns 1 when timestamps are equal,
causing unstable sorting behavior. Replace it with an explicit three-way
comparison that returns 0 for equal timestamps, giving qsort() a
consistent comparator and deterministic output.
Link: https://lkml.kernel.org/r/20251209044552.3396468-1-kaushlendra.kumar@intel.…
Fixes: 8f9c447e2e2b ("tools/vm/page_owner_sort.c: support sorting pid and time")
Signed-off-by: Kaushlendra Kumar <kaushlendra.kumar(a)intel.com>
Cc: Chongxi Zhao <zhaochongxi2019(a)email.szu.edu.cn>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
tools/mm/page_owner_sort.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
--- a/tools/mm/page_owner_sort.c~tools-mm-page_owner_sort-fix-timestamp-comparison-for-stable-sorting
+++ a/tools/mm/page_owner_sort.c
@@ -181,7 +181,11 @@ static int compare_ts(const void *p1, co
{
const struct block_list *l1 = p1, *l2 = p2;
- return l1->ts_nsec < l2->ts_nsec ? -1 : 1;
+ if (l1->ts_nsec < l2->ts_nsec)
+ return -1;
+ if (l1->ts_nsec > l2->ts_nsec)
+ return 1;
+ return 0;
}
static int compare_cull_condition(const void *p1, const void *p2)
_
Patches currently in -mm which might be from kaushlendra.kumar(a)intel.com are
tools-mm-page_owner_sort-fix-timestamp-comparison-for-stable-sorting.patch
The patch titled
Subject: selftests/mm: fix thread state check in uffd-unit-tests
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
selftests-mm-fix-thread-state-check-in-uffd-unit-tests.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Wake Liu <wakel(a)google.com>
Subject: selftests/mm: fix thread state check in uffd-unit-tests
Date: Wed, 10 Dec 2025 17:14:08 +0800
In the thread_state_get() function, the logic to find the thread's state
character was using `sizeof(header) - 1` to calculate the offset from the
"State:\t" string.
The `header` variable is a `const char *` pointer. `sizeof()` on a
pointer returns the size of the pointer itself, not the length of the
string literal it points to. This makes the code's behavior dependent on
the architecture's pointer size.
This bug was identified on a 32-bit ARM build (`gsi_tv_arm`) for Android,
running on an ARMv8-based device, compiled with Clang 19.0.1.
On this 32-bit architecture, `sizeof(char *)` is 4. The expression
`sizeof(header) - 1` resulted in an incorrect offset of 3, causing the
test to read the wrong character from `/proc/[tid]/status` and fail.
On 64-bit architectures, `sizeof(char *)` is 8, so the expression
coincidentally evaluates to 7, which matches the length of "State:\t".
This is why the bug likely remained hidden on 64-bit builds.
To fix this and make the code portable and correct across all
architectures, this patch replaces `sizeof(header) - 1` with
`strlen(header)`. The `strlen()` function correctly calculates the
string's length, ensuring the correct offset is always used.
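A minimal userspace demonstration of the bug class (a hypothetical
example, not the selftest itself): sizeof() applied to a pointer yields
the pointer size, not the length of the string it points to.

#include <stdio.h>
#include <string.h>

int main(void)
{
	const char *header = "State:\t";
	const char line[] = "State:\tD (disk sleep)";
	const char *p = strstr(line, header);

	/* sizeof(header) is the pointer size: 4 on 32-bit, 8 on 64-bit. */
	printf("sizeof-based offset %zu reads '%c'\n",
	       sizeof(header) - 1, *(p + sizeof(header) - 1));

	/* strlen(header) is always 7, the length of "State:\t". */
	printf("strlen-based offset %zu reads '%c'\n",
	       strlen(header), *(p + strlen(header)));

	return 0;
}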
Link: https://lkml.kernel.org/r/20251210091408.3781445-1-wakel@google.com
Fixes: f60b6634cd88 ("mm/selftests: add a test to verify mmap_changing race with -EAGAIN")
Signed-off-by: Wake Liu <wakel(a)google.com>
Acked-by: Peter Xu <peterx(a)redhat.com>
Cc: Bill Wendling <morbo(a)google.com>
Cc: Justin Stitt <justinstitt(a)google.com>
Cc: Liam Howlett <liam.howlett(a)oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: Mike Rapoport <rppt(a)kernel.org>
Cc: Nathan Chancellor <nathan(a)kernel.org>
Cc: Shuah Khan <shuah(a)kernel.org>
Cc: Suren Baghdasaryan <surenb(a)google.com>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
tools/testing/selftests/mm/uffd-unit-tests.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/tools/testing/selftests/mm/uffd-unit-tests.c~selftests-mm-fix-thread-state-check-in-uffd-unit-tests
+++ a/tools/testing/selftests/mm/uffd-unit-tests.c
@@ -1317,7 +1317,7 @@ static thread_state thread_state_get(pid
p = strstr(tmp, header);
if (p) {
/* For example, "State:\tD (disk sleep)" */
- c = *(p + sizeof(header) - 1);
+ c = *(p + strlen(header));
return c == 'D' ?
THR_STATE_UNINTERRUPTIBLE : THR_STATE_UNKNOWN;
}
_
Patches currently in -mm which might be from wakel(a)google.com are
selftests-mm-fix-thread-state-check-in-uffd-unit-tests.patch
From: Yi Sun <yi.sun(a)intel.com>
[ Upstream commit f41c538881eec4dcf5961a242097d447f848cda6 ]
The call to idxd_free() introduces a duplicate put_device() leading to a
reference count underflow:
refcount_t: underflow; use-after-free.
WARNING: CPU: 15 PID: 4428 at lib/refcount.c:28 refcount_warn_saturate+0xbe/0x110
...
Call Trace:
<TASK>
idxd_remove+0xe4/0x120 [idxd]
pci_device_remove+0x3f/0xb0
device_release_driver_internal+0x197/0x200
driver_detach+0x48/0x90
bus_remove_driver+0x74/0xf0
pci_unregister_driver+0x2e/0xb0
idxd_exit_module+0x34/0x7a0 [idxd]
__do_sys_delete_module.constprop.0+0x183/0x280
do_syscall_64+0x54/0xd70
entry_SYSCALL_64_after_hwframe+0x76/0x7e
idxd_unregister_devices(), which is invoked at the very beginning of
idxd_remove(), already takes care of the necessary put_device() through
the following call path:
idxd_unregister_devices() -> device_unregister() -> put_device()
In addition, when CONFIG_DEBUG_KOBJECT_RELEASE is enabled, put_device() may
trigger asynchronous cleanup via schedule_delayed_work(). If idxd_free() is
called immediately after, it can result in a use-after-free.
Remove the improper idxd_free() to avoid both the refcount underflow and
potential memory corruption during module unload.
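As a minimal illustration of the failure mode (a hypothetical userspace
analogue, not the idxd code): releasing the same reference twice drives
the count below zero, at which point the object may already have been
freed.

#include <stdio.h>

struct obj {
	int refcount;
};

static void obj_put(struct obj *o)
{
	if (o->refcount <= 0) {
		printf("refcount underflow; use-after-free risk\n");
		return;
	}
	if (--o->refcount == 0)
		printf("last reference dropped, object released\n");
}

int main(void)
{
	struct obj o = { .refcount = 1 };

	obj_put(&o);	/* legitimate release, cf. the device_unregister() path */
	obj_put(&o);	/* duplicate release, cf. the removed idxd_free() call */
	return 0;
}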
Fixes: d5449ff1b04d ("dmaengine: idxd: Add missing idxd cleanup to fix memory leak in remove call")
Signed-off-by: Yi Sun <yi.sun(a)intel.com>
Tested-by: Shuai Xue <xueshuai(a)linux.alibaba.com>
Reviewed-by: Dave Jiang <dave.jiang(a)intel.com>
Acked-by: Vinicius Costa Gomes <vinicius.gomes(a)intel.com>
Link: https://lore.kernel.org/r/20250729150313.1934101-2-yi.sun@intel.com
Signed-off-by: Vinod Koul <vkoul(a)kernel.org>
[ Slightly adjust the context. ]
Signed-off-by: Bin Lan <lanbincn(a)139.com>
---
Without this patch, this issue can be reproduced in Linux-6.1.y
when the idxd module is removed.
---
drivers/dma/idxd/init.c | 1 -
1 file changed, 1 deletion(-)
diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c
index 127a6a302a5b..6059ffc08eac 100644
--- a/drivers/dma/idxd/init.c
+++ b/drivers/dma/idxd/init.c
@@ -816,7 +816,6 @@ static void idxd_remove(struct pci_dev *pdev)
destroy_workqueue(idxd->wq);
perfmon_pmu_remove(idxd);
put_device(idxd_confdev(idxd));
- idxd_free(idxd);
}
static struct pci_driver idxd_pci_driver = {
--
2.43.0
From: Chen Linxuan <me(a)black-desk.cn>
When using fsconfig(..., FSCONFIG_CMD_CREATE, ...), the filesystem
context is retrieved from the file descriptor. Since the file structure
persists across syscall restarts, the context state is preserved:
// fs/fsopen.c
SYSCALL_DEFINE5(fsconfig, ...)
{
...
fc = fd_file(f)->private_data;
...
ret = vfs_fsconfig_locked(fc, cmd, ¶m);
...
}
In vfs_cmd_create(), the context phase is transitioned to
FS_CONTEXT_CREATING before calling vfs_get_tree():
// fs/fsopen.c
static int vfs_cmd_create(struct fs_context *fc, bool exclusive)
{
...
fc->phase = FS_CONTEXT_CREATING;
...
ret = vfs_get_tree(fc);
...
}
However, vfs_get_tree() may return -ERESTARTNOINTR if the filesystem
implementation needs to restart the syscall. For example, cgroup v1 does
this when it encounters a race condition where the root is dying:
// kernel/cgroup/cgroup-v1.c
int cgroup1_get_tree(struct fs_context *fc)
{
...
if (unlikely(ret > 0)) {
msleep(10);
return restart_syscall();
}
return ret;
}
If the syscall is restarted, fsconfig() is called again and retrieves
the *same* fs_context. However, vfs_cmd_create() rejects the call
because the phase was left as FS_CONTEXT_CREATING during the first
attempt:
if (fc->phase != FS_CONTEXT_CREATE_PARAMS)
return -EBUSY;
Fix this by resetting fc->phase back to FS_CONTEXT_CREATE_PARAMS if
vfs_get_tree() returns -ERESTARTNOINTR.
Cc: stable(a)vger.kernel.org
Signed-off-by: Chen Linxuan <me(a)black-desk.cn>
---
fs/fsopen.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/fs/fsopen.c b/fs/fsopen.c
index f645c99204eb..8a7cb031af50 100644
--- a/fs/fsopen.c
+++ b/fs/fsopen.c
@@ -229,6 +229,10 @@ static int vfs_cmd_create(struct fs_context *fc, bool exclusive)
fc->exclusive = exclusive;
ret = vfs_get_tree(fc);
+ if (ret == -ERESTARTNOINTR) {
+ fc->phase = FS_CONTEXT_CREATE_PARAMS;
+ return ret;
+ }
if (ret) {
fc->phase = FS_CONTEXT_FAILED;
return ret;
---
base-commit: 187d0801404f415f22c0b31531982c7ea97fa341
change-id: 20251213-mount-ebusy-8ee3888a7e4f
Best regards,
--
Chen Linxuan <me(a)black-desk.cn>
The patch titled
Subject: kernel/kexec: fix IMA when allocation happens in CMA area
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
kernel-kexec-fix-ima-when-allocation-happens-in-cma-area.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Pingfan Liu <piliu(a)redhat.com>
Subject: kernel/kexec: fix IMA when allocation happens in CMA area
Date: Tue, 16 Dec 2025 09:48:52 +0800
*** Bug description ***
When I tested kexec with the latest kernel, I ran into the following warning:
[ 40.712410] ------------[ cut here ]------------
[ 40.712576] WARNING: CPU: 2 PID: 1562 at kernel/kexec_core.c:1001 kimage_map_segment+0x144/0x198
[...]
[ 40.816047] Call trace:
[ 40.818498] kimage_map_segment+0x144/0x198 (P)
[ 40.823221] ima_kexec_post_load+0x58/0xc0
[ 40.827246] __do_sys_kexec_file_load+0x29c/0x368
[...]
[ 40.855423] ---[ end trace 0000000000000000 ]---
*** How to reproduce ***
This bug is only triggered when the kexec target address is allocated in
the CMA area. If no CMA area is reserved in the kernel, use the "cma="
option in the kernel command line to reserve one.
*** Root cause ***
Commit 07d24902977e ("kexec: enable CMA based contiguous
allocation") allocates the kexec target address directly in the CMA area
to avoid copying during the jump. In this case, there are no IND_SOURCE
pages for the kexec segment. But the current implementation of
kimage_map_segment() assumes that IND_SOURCE pages exist and maps them
into a contiguous virtual address range with vmap().
*** Solution ***
If the IMA segment is allocated in the CMA area, use its page_address()
directly.
Link: https://lkml.kernel.org/r/20251216014852.8737-2-piliu@redhat.com
Fixes: 07d24902977e ("kexec: enable CMA based contiguous allocation")
Signed-off-by: Pingfan Liu <piliu(a)redhat.com>
Cc: Baoquan He <bhe(a)redhat.com>
Cc: Alexander Graf <graf(a)amazon.com>
Cc: Steven Chen <chenste(a)linux.microsoft.com>
Cc: Mimi Zohar <zohar(a)linux.ibm.com>
Cc: Roberto Sassu <roberto.sassu(a)huawei.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
kernel/kexec_core.c | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
--- a/kernel/kexec_core.c~kernel-kexec-fix-ima-when-allocation-happens-in-cma-area
+++ a/kernel/kexec_core.c
@@ -960,13 +960,17 @@ void *kimage_map_segment(struct kimage *
kimage_entry_t *ptr, entry;
struct page **src_pages;
unsigned int npages;
+ struct page *cma;
void *vaddr = NULL;
int i;
+ cma = image->segment_cma[idx];
+ if (cma)
+ return page_address(cma);
+
addr = image->segment[idx].mem;
size = image->segment[idx].memsz;
eaddr = addr + size;
-
/*
* Collect the source pages and map them in a contiguous VA range.
*/
@@ -1007,7 +1011,8 @@ void *kimage_map_segment(struct kimage *
void kimage_unmap_segment(void *segment_buffer)
{
- vunmap(segment_buffer);
+ if (is_vmalloc_addr(segment_buffer))
+ vunmap(segment_buffer);
}
struct kexec_load_limit {
_
Patches currently in -mm which might be from piliu(a)redhat.com are
kernel-kexec-change-the-prototype-of-kimage_map_segment.patch
kernel-kexec-fix-ima-when-allocation-happens-in-cma-area.patch
The patch titled
Subject: kernel/kexec: change the prototype of kimage_map_segment()
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
kernel-kexec-change-the-prototype-of-kimage_map_segment.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Pingfan Liu <piliu(a)redhat.com>
Subject: kernel/kexec: change the prototype of kimage_map_segment()
Date: Tue, 16 Dec 2025 09:48:51 +0800
kimage_map_segment() will need the kexec segment index to extract the
corresponding information for that segment. Additionally, kexec_segment
already holds the kexec relocation destination address and size.
Therefore, the prototype of kimage_map_segment() can be changed.
Link: https://lkml.kernel.org/r/20251216014852.8737-1-piliu@redhat.com
Fixes: 07d24902977e ("kexec: enable CMA based contiguous allocation")
Signed-off-by: Pingfan Liu <piliu(a)redhat.com>
Cc: Baoquan He <bhe(a)redhat.com>
Cc: Mimi Zohar <zohar(a)linux.ibm.com>
Cc: Roberto Sassu <roberto.sassu(a)huawei.com>
Cc: Alexander Graf <graf(a)amazon.com>
Cc: Steven Chen <chenste(a)linux.microsoft.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/kexec.h | 4 ++--
kernel/kexec_core.c | 9 ++++++---
security/integrity/ima/ima_kexec.c | 4 +---
3 files changed, 9 insertions(+), 8 deletions(-)
--- a/include/linux/kexec.h~kernel-kexec-change-the-prototype-of-kimage_map_segment
+++ a/include/linux/kexec.h
@@ -530,7 +530,7 @@ extern bool kexec_file_dbg_print;
#define kexec_dprintk(fmt, arg...) \
do { if (kexec_file_dbg_print) pr_info(fmt, ##arg); } while (0)
-extern void *kimage_map_segment(struct kimage *image, unsigned long addr, unsigned long size);
+extern void *kimage_map_segment(struct kimage *image, int idx);
extern void kimage_unmap_segment(void *buffer);
#else /* !CONFIG_KEXEC_CORE */
struct pt_regs;
@@ -540,7 +540,7 @@ static inline void __crash_kexec(struct
static inline void crash_kexec(struct pt_regs *regs) { }
static inline int kexec_should_crash(struct task_struct *p) { return 0; }
static inline int kexec_crash_loaded(void) { return 0; }
-static inline void *kimage_map_segment(struct kimage *image, unsigned long addr, unsigned long size)
+static inline void *kimage_map_segment(struct kimage *image, int idx)
{ return NULL; }
static inline void kimage_unmap_segment(void *buffer) { }
#define kexec_in_progress false
--- a/kernel/kexec_core.c~kernel-kexec-change-the-prototype-of-kimage_map_segment
+++ a/kernel/kexec_core.c
@@ -953,17 +953,20 @@ int kimage_load_segment(struct kimage *i
return result;
}
-void *kimage_map_segment(struct kimage *image,
- unsigned long addr, unsigned long size)
+void *kimage_map_segment(struct kimage *image, int idx)
{
+ unsigned long addr, size, eaddr;
unsigned long src_page_addr, dest_page_addr = 0;
- unsigned long eaddr = addr + size;
kimage_entry_t *ptr, entry;
struct page **src_pages;
unsigned int npages;
void *vaddr = NULL;
int i;
+ addr = image->segment[idx].mem;
+ size = image->segment[idx].memsz;
+ eaddr = addr + size;
+
/*
* Collect the source pages and map them in a contiguous VA range.
*/
--- a/security/integrity/ima/ima_kexec.c~kernel-kexec-change-the-prototype-of-kimage_map_segment
+++ a/security/integrity/ima/ima_kexec.c
@@ -250,9 +250,7 @@ void ima_kexec_post_load(struct kimage *
if (!image->ima_buffer_addr)
return;
- ima_kexec_buffer = kimage_map_segment(image,
- image->ima_buffer_addr,
- image->ima_buffer_size);
+ ima_kexec_buffer = kimage_map_segment(image, image->ima_segment_index);
if (!ima_kexec_buffer) {
pr_err("Could not map measurements buffer.\n");
return;
_
Patches currently in -mm which might be from piliu(a)redhat.com are
kernel-kexec-change-the-prototype-of-kimage_map_segment.patch
kernel-kexec-fix-ima-when-allocation-happens-in-cma-area.patch