Currently, kvfree_rcu_barrier() flushes RCU sheaves across all slab
caches when a cache is destroyed. This is unnecessary; only the RCU
sheaves belonging to the cache being destroyed need to be flushed.
As suggested by Vlastimil Babka, introduce a weaker form of
kvfree_rcu_barrier() that operates on a specific slab cache.
Factor out flush_rcu_sheaves_on_cache() from flush_all_rcu_sheaves() and
call it from flush_all_rcu_sheaves() and kvfree_rcu_barrier_on_cache().
Call kvfree_rcu_barrier_on_cache() instead of kvfree_rcu_barrier() on
cache destruction.
The performance benefit is evaluated on a 12 core 24 threads AMD Ryzen
5900X machine (1 socket), by loading slub_kunit module.
Before:
Total calls: 19
Average latency (us): 18127
Total time (us): 344414
After:
Total calls: 19
Average latency (us): 10066
Total time (us): 191264
Two performance regression have been reported:
- stress module loader test's runtime increases by 50-60% (Daniel)
- internal graphics test's runtime on Tegra23 increases by 35% (Jon)
They are fixed by this change.
Suggested-by: Vlastimil Babka <vbabka(a)suse.cz>
Fixes: ec66e0d59952 ("slab: add sheaf support for batching kfree_rcu() operations")
Cc: <stable(a)vger.kernel.org>
Link: https://lore.kernel.org/linux-mm/1bda09da-93be-4737-aef0-d47f8c5c9301@suse.…
Reported-and-tested-by: Daniel Gomez <da.gomez(a)samsung.com>
Closes: https://lore.kernel.org/linux-mm/0406562e-2066-4cf8-9902-b2b0616dd742@kerne…
Reported-and-tested-by: Jon Hunter <jonathanh(a)nvidia.com>
Closes: https://lore.kernel.org/linux-mm/e988eff6-1287-425e-a06c-805af5bbf262@nvidi…
Signed-off-by: Harry Yoo <harry.yoo(a)oracle.com>
---
v2 -> v3:
- Addressed Suren's comment [1], thanks!
[1] https://lore.kernel.org/linux-mm/CAJuCfpE+g65Dm8-r=psDJQf_O1rfBG62DOzx4mE1m…
include/linux/slab.h | 7 ++++++
mm/slab.h | 1 +
mm/slab_common.c | 52 +++++++++++++++++++++++++++++------------
mm/slub.c | 55 ++++++++++++++++++++++++--------------------
4 files changed, 75 insertions(+), 40 deletions(-)
diff --git a/include/linux/slab.h b/include/linux/slab.h
index cf443f064a66..2482992248dc 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -1150,10 +1150,17 @@ static inline void kvfree_rcu_barrier(void)
rcu_barrier();
}
+static inline void kvfree_rcu_barrier_on_cache(struct kmem_cache *s)
+{
+ rcu_barrier();
+}
+
static inline void kfree_rcu_scheduler_running(void) { }
#else
void kvfree_rcu_barrier(void);
+void kvfree_rcu_barrier_on_cache(struct kmem_cache *s);
+
void kfree_rcu_scheduler_running(void);
#endif
diff --git a/mm/slab.h b/mm/slab.h
index f730e012553c..e767aa7e91b0 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -422,6 +422,7 @@ static inline bool is_kmalloc_normal(struct kmem_cache *s)
bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj);
void flush_all_rcu_sheaves(void);
+void flush_rcu_sheaves_on_cache(struct kmem_cache *s);
#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \
SLAB_CACHE_DMA32 | SLAB_PANIC | \
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 84dfff4f7b1f..dd8a49d6f9cc 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -492,7 +492,7 @@ void kmem_cache_destroy(struct kmem_cache *s)
return;
/* in-flight kfree_rcu()'s may include objects from our cache */
- kvfree_rcu_barrier();
+ kvfree_rcu_barrier_on_cache(s);
if (IS_ENABLED(CONFIG_SLUB_RCU_DEBUG) &&
(s->flags & SLAB_TYPESAFE_BY_RCU)) {
@@ -2038,25 +2038,13 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr)
}
EXPORT_SYMBOL_GPL(kvfree_call_rcu);
-/**
- * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete.
- *
- * Note that a single argument of kvfree_rcu() call has a slow path that
- * triggers synchronize_rcu() following by freeing a pointer. It is done
- * before the return from the function. Therefore for any single-argument
- * call that will result in a kfree() to a cache that is to be destroyed
- * during module exit, it is developer's responsibility to ensure that all
- * such calls have returned before the call to kmem_cache_destroy().
- */
-void kvfree_rcu_barrier(void)
+static inline void __kvfree_rcu_barrier(void)
{
struct kfree_rcu_cpu_work *krwp;
struct kfree_rcu_cpu *krcp;
bool queued;
int i, cpu;
- flush_all_rcu_sheaves();
-
/*
* Firstly we detach objects and queue them over an RCU-batch
* for all CPUs. Finally queued works are flushed for each CPU.
@@ -2118,8 +2106,43 @@ void kvfree_rcu_barrier(void)
}
}
}
+
+/**
+ * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete.
+ *
+ * Note that a single argument of kvfree_rcu() call has a slow path that
+ * triggers synchronize_rcu() following by freeing a pointer. It is done
+ * before the return from the function. Therefore for any single-argument
+ * call that will result in a kfree() to a cache that is to be destroyed
+ * during module exit, it is developer's responsibility to ensure that all
+ * such calls have returned before the call to kmem_cache_destroy().
+ */
+void kvfree_rcu_barrier(void)
+{
+ flush_all_rcu_sheaves();
+ __kvfree_rcu_barrier();
+}
EXPORT_SYMBOL_GPL(kvfree_rcu_barrier);
+/**
+ * kvfree_rcu_barrier_on_cache - Wait for in-flight kvfree_rcu() calls on a
+ * specific slab cache.
+ * @s: slab cache to wait for
+ *
+ * See the description of kvfree_rcu_barrier() for details.
+ */
+void kvfree_rcu_barrier_on_cache(struct kmem_cache *s)
+{
+ if (s->cpu_sheaves)
+ flush_rcu_sheaves_on_cache(s);
+ /*
+ * TODO: Introduce a version of __kvfree_rcu_barrier() that works
+ * on a specific slab cache.
+ */
+ __kvfree_rcu_barrier();
+}
+EXPORT_SYMBOL_GPL(kvfree_rcu_barrier_on_cache);
+
static unsigned long
kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
@@ -2215,4 +2238,3 @@ void __init kvfree_rcu_init(void)
}
#endif /* CONFIG_KVFREE_RCU_BATCHED */
-
diff --git a/mm/slub.c b/mm/slub.c
index 785e25a14999..7cec2220712b 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -4118,42 +4118,47 @@ static void flush_rcu_sheaf(struct work_struct *w)
/* needed for kvfree_rcu_barrier() */
-void flush_all_rcu_sheaves(void)
+void flush_rcu_sheaves_on_cache(struct kmem_cache *s)
{
struct slub_flush_work *sfw;
- struct kmem_cache *s;
unsigned int cpu;
- cpus_read_lock();
- mutex_lock(&slab_mutex);
+ mutex_lock(&flush_lock);
- list_for_each_entry(s, &slab_caches, list) {
- if (!s->cpu_sheaves)
- continue;
+ for_each_online_cpu(cpu) {
+ sfw = &per_cpu(slub_flush, cpu);
- mutex_lock(&flush_lock);
+ /*
+ * we don't check if rcu_free sheaf exists - racing
+ * __kfree_rcu_sheaf() might have just removed it.
+ * by executing flush_rcu_sheaf() on the cpu we make
+ * sure the __kfree_rcu_sheaf() finished its call_rcu()
+ */
- for_each_online_cpu(cpu) {
- sfw = &per_cpu(slub_flush, cpu);
+ INIT_WORK(&sfw->work, flush_rcu_sheaf);
+ sfw->s = s;
+ queue_work_on(cpu, flushwq, &sfw->work);
+ }
- /*
- * we don't check if rcu_free sheaf exists - racing
- * __kfree_rcu_sheaf() might have just removed it.
- * by executing flush_rcu_sheaf() on the cpu we make
- * sure the __kfree_rcu_sheaf() finished its call_rcu()
- */
+ for_each_online_cpu(cpu) {
+ sfw = &per_cpu(slub_flush, cpu);
+ flush_work(&sfw->work);
+ }
- INIT_WORK(&sfw->work, flush_rcu_sheaf);
- sfw->s = s;
- queue_work_on(cpu, flushwq, &sfw->work);
- }
+ mutex_unlock(&flush_lock);
+}
- for_each_online_cpu(cpu) {
- sfw = &per_cpu(slub_flush, cpu);
- flush_work(&sfw->work);
- }
+void flush_all_rcu_sheaves(void)
+{
+ struct kmem_cache *s;
+
+ cpus_read_lock();
+ mutex_lock(&slab_mutex);
- mutex_unlock(&flush_lock);
+ list_for_each_entry(s, &slab_caches, list) {
+ if (!s->cpu_sheaves)
+ continue;
+ flush_rcu_sheaves_on_cache(s);
}
mutex_unlock(&slab_mutex);
--
2.43.0
The local variable 'curr_energy' was never clamped to
PAC_193X_MIN_POWER_ACC or PAC_193X_MAX_POWER_ACC because the return
value of clamp() was not used. Fix this by assigning the clamped value
back to 'curr_energy'.
Cc: stable(a)vger.kernel.org
Fixes: 0fb528c8255b ("iio: adc: adding support for PAC193x")
Signed-off-by: Thorsten Blum <thorsten.blum(a)linux.dev>
---
drivers/iio/adc/pac1934.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/iio/adc/pac1934.c b/drivers/iio/adc/pac1934.c
index 48df16509260..256488d3936b 100644
--- a/drivers/iio/adc/pac1934.c
+++ b/drivers/iio/adc/pac1934.c
@@ -665,7 +665,8 @@ static int pac1934_reg_snapshot(struct pac1934_chip_info *info,
/* add the power_acc field */
curr_energy += inc;
- clamp(curr_energy, PAC_193X_MIN_POWER_ACC, PAC_193X_MAX_POWER_ACC);
+ curr_energy = clamp(curr_energy, PAC_193X_MIN_POWER_ACC,
+ PAC_193X_MAX_POWER_ACC);
reg_data->energy_sec_acc[cnt] = curr_energy;
}
--
Thorsten Blum <thorsten.blum(a)linux.dev>
GPG: 1D60 735E 8AEF 3BE4 73B6 9D84 7336 78FD 8DFE EAD4
As reported, ever since commit 1013af4f585f ("mm/hugetlb: fix
huge_pmd_unshare() vs GUP-fast race") we can end up in some situations
where we perform so many IPI broadcasts when unsharing hugetlb PMD page
tables that it severely regresses some workloads.
In particular, when we fork()+exit(), or when we munmap() a large
area backed by many shared PMD tables, we perform one IPI broadcast per
unshared PMD table.
There are two optimizations to be had:
(1) When we process (unshare) multiple such PMD tables, such as during
exit(), it is sufficient to send a single IPI broadcast (as long as
we respect locking rules) instead of one per PMD table.
Locking prevents that any of these PMD tables could get reuse before
we drop the lock.
(2) When we are not the last sharer (> 2 users including us), there is
no need to send the IPI broadcast. The shared PMD tables cannot
become exclusive (fully unshared) before an IPI will be broadcasted
by the last sharer.
Concurrent GUP-fast could walk into a PMD table just before we
unshared it. It could then succeed in grabbing a page from the
shared page table even after munmap() etc succeeded (and supressed
an IPI). But there is not difference compared to GUP-fast just
sleeping for a while after grabbing the page and re-enabling IRQs.
Most importantly, GUP-fast will never walk into page tables that are
no-longer shared, because the last sharer will issue an IPI
broadcast.
(if ever required, checking whether the PUD changed in GUP-fast
after grabbing the page like we do in the PTE case could handle
this)
So let's rework PMD sharing TLB flushing + IPI sync to use the mmu_gather
infrastructure so we can implement these optimizations and demystify the
code at least a bit. Extend the mmu_gather infrastructure to be able to
deal with our special hugetlb PMD table sharing implementation.
We'll consolidate the handling for (full) unsharing of PMD tables in
tlb_unshare_pmd_ptdesc() and tlb_flush_unshared_tables(), and track
in "struct mmu_gather" whether we had (full) unsharing of PMD tables.
Because locking is very special (concurrent unsharing+reuse must be
prevented), we disallow deferring flushing to tlb_finish_mmu() and instead
require an explicit earlier call to tlb_flush_unshared_tables().
From hugetlb code, we call huge_pmd_unshare_flush() where we make sure
that the expected lock protecting us from concurrent unsharing+reuse is
still held.
Check with a VM_WARN_ON_ONCE() in tlb_finish_mmu() that
tlb_flush_unshared_tables() was properly called earlier.
Document it all properly.
Notes about tlb_remove_table_sync_one() interaction with unsharing:
There are two fairly tricky things:
(1) tlb_remove_table_sync_one() is a NOP on architectures without
CONFIG_MMU_GATHER_RCU_TABLE_FREE.
Here, the assumption is that the previous TLB flush would send an
IPI to all relevant CPUs. Careful: some architectures like x86 only
send IPIs to all relevant CPUs when tlb->freed_tables is set.
The relevant architectures should be selecting
MMU_GATHER_RCU_TABLE_FREE, but x86 might not do that in stable
kernels and it might have been problematic before this patch.
Also, the arch flushing behavior (independent of IPIs) is different
when tlb->freed_tables is set. Do we have to enlighten them to also
take care of tlb->unshared_tables? So far we didn't care, so
hopefully we are fine. Of course, we could be setting
tlb->freed_tables as well, but that might then unnecessarily flush
too much, because the semantics of tlb->freed_tables are a bit
fuzzy.
This patch changes nothing in this regard.
(2) tlb_remove_table_sync_one() is not a NOP on architectures with
CONFIG_MMU_GATHER_RCU_TABLE_FREE that actually don't need a sync.
Take x86 as an example: in the common case (!pv, !X86_FEATURE_INVLPGB)
we still issue IPIs during TLB flushes and don't actually need the
second tlb_remove_table_sync_one().
This optimized can be implemented on top of this, by checking e.g., in
tlb_remove_table_sync_one() whether we really need IPIs. But as
described in (1), it really must honor tlb->freed_tables then to
send IPIs to all relevant CPUs.
Further note that the ptdesc_pmd_pts_dec() in huge_pmd_share() is not a
concern, as we are holding the i_mmap_lock the whole time, preventing
concurrent unsharing. That ptdesc_pmd_pts_dec() usage will be removed
separately as a cleanup later.
There are plenty more cleanups to be had, but they have to wait until
this is fixed.
Fixes: 1013af4f585f ("mm/hugetlb: fix huge_pmd_unshare() vs GUP-fast race")
Reported-by: Uschakow, Stanislav" <suschako(a)amazon.de>
Closes: https://lore.kernel.org/all/4d3878531c76479d9f8ca9789dc6485d@amazon.de/
Cc: <stable(a)vger.kernel.org>
Signed-off-by: David Hildenbrand (Red Hat) <david(a)kernel.org>
---
include/asm-generic/tlb.h | 69 +++++++++++++++++++++-
include/linux/hugetlb.h | 19 +++---
mm/hugetlb.c | 121 ++++++++++++++++++++++----------------
mm/mmu_gather.c | 6 ++
mm/mprotect.c | 2 +-
mm/rmap.c | 25 +++++---
6 files changed, 173 insertions(+), 69 deletions(-)
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 1fff717cae510..324a21f53b644 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -364,6 +364,17 @@ struct mmu_gather {
unsigned int vma_huge : 1;
unsigned int vma_pfn : 1;
+ /*
+ * Did we unshare (unmap) any shared page tables?
+ */
+ unsigned int unshared_tables : 1;
+
+ /*
+ * Did we unshare any page tables such that they are now exclusive
+ * and could get reused+modified by the new owner?
+ */
+ unsigned int fully_unshared_tables : 1;
+
unsigned int batch_count;
#ifndef CONFIG_MMU_GATHER_NO_GATHER
@@ -400,6 +411,7 @@ static inline void __tlb_reset_range(struct mmu_gather *tlb)
tlb->cleared_pmds = 0;
tlb->cleared_puds = 0;
tlb->cleared_p4ds = 0;
+ tlb->unshared_tables = 0;
/*
* Do not reset mmu_gather::vma_* fields here, we do not
* call into tlb_start_vma() again to set them if there is an
@@ -484,7 +496,7 @@ static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
* these bits.
*/
if (!(tlb->freed_tables || tlb->cleared_ptes || tlb->cleared_pmds ||
- tlb->cleared_puds || tlb->cleared_p4ds))
+ tlb->cleared_puds || tlb->cleared_p4ds || tlb->unshared_tables))
return;
tlb_flush(tlb);
@@ -773,6 +785,61 @@ static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd)
}
#endif
+#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
+static inline void tlb_unshare_pmd_ptdesc(struct mmu_gather *tlb, struct ptdesc *pt,
+ unsigned long addr)
+{
+ /*
+ * The caller must make sure that concurrent unsharing + exclusive
+ * reuse is impossible until tlb_flush_unshared_tables() was called.
+ */
+ VM_WARN_ON_ONCE(!ptdesc_pmd_is_shared(pt));
+ ptdesc_pmd_pts_dec(pt);
+
+ /* Clearing a PUD pointing at a PMD table with PMD leaves. */
+ tlb_flush_pmd_range(tlb, addr & PUD_MASK, PUD_SIZE);
+
+ /*
+ * If the page table is now exclusively owned, we fully unshared
+ * a page table.
+ */
+ if (!ptdesc_pmd_is_shared(pt))
+ tlb->fully_unshared_tables = true;
+ tlb->unshared_tables = true;
+}
+
+static inline void tlb_flush_unshared_tables(struct mmu_gather *tlb)
+{
+ /*
+ * As soon as the caller drops locks to allow for reuse of
+ * previously-shared tables, these tables could get modified and
+ * even reused outside of hugetlb context. So flush the TLB now.
+ *
+ * Note that we cannot defer the flush to a later point even if we are
+ * not the last sharer of the page table.
+ */
+ if (tlb->unshared_tables)
+ tlb_flush_mmu_tlbonly(tlb);
+
+ /*
+ * Similarly, we must make sure that concurrent GUP-fast will not
+ * walk previously-shared page tables that are getting modified+reused
+ * elsewhere. So broadcast an IPI to wait for any concurrent GUP-fast.
+ *
+ * We only perform this when we are the last sharer of a page table,
+ * as the IPI will reach all CPUs: any GUP-fast.
+ *
+ * Note that on configs where tlb_remove_table_sync_one() is a NOP,
+ * the expectation is that the tlb_flush_mmu_tlbonly() would have issued
+ * required IPIs already for us.
+ */
+ if (tlb->fully_unshared_tables) {
+ tlb_remove_table_sync_one();
+ tlb->fully_unshared_tables = false;
+ }
+}
+#endif /* CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */
+
#endif /* CONFIG_MMU */
#endif /* _ASM_GENERIC__TLB_H */
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 03c8725efa289..63b248c6bfd47 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -240,8 +240,9 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
pte_t *huge_pte_offset(struct mm_struct *mm,
unsigned long addr, unsigned long sz);
unsigned long hugetlb_mask_last_page(struct hstate *h);
-int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep);
+int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep);
+void huge_pmd_unshare_flush(struct mmu_gather *tlb, struct vm_area_struct *vma);
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
unsigned long *start, unsigned long *end);
@@ -271,7 +272,7 @@ void hugetlb_vma_unlock_write(struct vm_area_struct *vma);
int hugetlb_vma_trylock_write(struct vm_area_struct *vma);
void hugetlb_vma_assert_locked(struct vm_area_struct *vma);
void hugetlb_vma_lock_release(struct kref *kref);
-long hugetlb_change_protection(struct vm_area_struct *vma,
+long hugetlb_change_protection(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long address, unsigned long end, pgprot_t newprot,
unsigned long cp_flags);
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
@@ -300,13 +301,17 @@ static inline struct address_space *hugetlb_folio_mapping_lock_write(
return NULL;
}
-static inline int huge_pmd_unshare(struct mm_struct *mm,
- struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep)
+static inline int huge_pmd_unshare(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
{
return 0;
}
+static inline void huge_pmd_unshare_flush(struct mmu_gather *tlb,
+ struct vm_area_struct *vma)
+{
+}
+
static inline void adjust_range_if_pmd_sharing_possible(
struct vm_area_struct *vma,
unsigned long *start, unsigned long *end)
@@ -432,7 +437,7 @@ static inline void move_hugetlb_state(struct folio *old_folio,
{
}
-static inline long hugetlb_change_protection(
+static inline long hugetlb_change_protection(struct mmu_gather *tlb,
struct vm_area_struct *vma, unsigned long address,
unsigned long end, pgprot_t newprot,
unsigned long cp_flags)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3c77cdef12a32..3db94693a06fc 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5096,8 +5096,9 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
unsigned long last_addr_mask;
pte_t *src_pte, *dst_pte;
struct mmu_notifier_range range;
- bool shared_pmd = false;
+ struct mmu_gather tlb;
+ tlb_gather_mmu(&tlb, vma->vm_mm);
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, old_addr,
old_end);
adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
@@ -5122,12 +5123,12 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
if (huge_pte_none(huge_ptep_get(mm, old_addr, src_pte)))
continue;
- if (huge_pmd_unshare(mm, vma, old_addr, src_pte)) {
- shared_pmd = true;
+ if (huge_pmd_unshare(&tlb, vma, old_addr, src_pte)) {
old_addr |= last_addr_mask;
new_addr |= last_addr_mask;
continue;
}
+ tlb_remove_huge_tlb_entry(h, &tlb, src_pte, old_addr);
dst_pte = huge_pte_alloc(mm, new_vma, new_addr, sz);
if (!dst_pte)
@@ -5136,13 +5137,13 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte, sz);
}
- if (shared_pmd)
- flush_hugetlb_tlb_range(vma, range.start, range.end);
- else
- flush_hugetlb_tlb_range(vma, old_end - len, old_end);
+ tlb_flush_mmu_tlbonly(&tlb);
+ huge_pmd_unshare_flush(&tlb, vma);
+
mmu_notifier_invalidate_range_end(&range);
i_mmap_unlock_write(mapping);
hugetlb_vma_unlock_write(vma);
+ tlb_finish_mmu(&tlb);
return len + old_addr - old_end;
}
@@ -5161,7 +5162,6 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long sz = huge_page_size(h);
bool adjust_reservation;
unsigned long last_addr_mask;
- bool force_flush = false;
WARN_ON(!is_vm_hugetlb_page(vma));
BUG_ON(start & ~huge_page_mask(h));
@@ -5184,10 +5184,8 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
}
ptl = huge_pte_lock(h, mm, ptep);
- if (huge_pmd_unshare(mm, vma, address, ptep)) {
+ if (huge_pmd_unshare(tlb, vma, address, ptep)) {
spin_unlock(ptl);
- tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE);
- force_flush = true;
address |= last_addr_mask;
continue;
}
@@ -5303,14 +5301,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
}
tlb_end_vma(tlb, vma);
- /*
- * There is nothing protecting a previously-shared page table that we
- * unshared through huge_pmd_unshare() from getting freed after we
- * release i_mmap_rwsem, so flush the TLB now. If huge_pmd_unshare()
- * succeeded, flush the range corresponding to the pud.
- */
- if (force_flush)
- tlb_flush_mmu_tlbonly(tlb);
+ huge_pmd_unshare_flush(tlb, vma);
}
void __hugetlb_zap_begin(struct vm_area_struct *vma,
@@ -6399,7 +6390,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
}
#endif /* CONFIG_USERFAULTFD */
-long hugetlb_change_protection(struct vm_area_struct *vma,
+long hugetlb_change_protection(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long address, unsigned long end,
pgprot_t newprot, unsigned long cp_flags)
{
@@ -6409,7 +6400,6 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
pte_t pte;
struct hstate *h = hstate_vma(vma);
long pages = 0, psize = huge_page_size(h);
- bool shared_pmd = false;
struct mmu_notifier_range range;
unsigned long last_addr_mask;
bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
@@ -6452,7 +6442,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
}
}
ptl = huge_pte_lock(h, mm, ptep);
- if (huge_pmd_unshare(mm, vma, address, ptep)) {
+ if (huge_pmd_unshare(tlb, vma, address, ptep)) {
/*
* When uffd-wp is enabled on the vma, unshare
* shouldn't happen at all. Warn about it if it
@@ -6461,7 +6451,6 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
WARN_ON_ONCE(uffd_wp || uffd_wp_resolve);
pages++;
spin_unlock(ptl);
- shared_pmd = true;
address |= last_addr_mask;
continue;
}
@@ -6522,22 +6511,16 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
pte = huge_pte_clear_uffd_wp(pte);
huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
pages++;
+ tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
}
next:
spin_unlock(ptl);
cond_resched();
}
- /*
- * There is nothing protecting a previously-shared page table that we
- * unshared through huge_pmd_unshare() from getting freed after we
- * release i_mmap_rwsem, so flush the TLB now. If huge_pmd_unshare()
- * succeeded, flush the range corresponding to the pud.
- */
- if (shared_pmd)
- flush_hugetlb_tlb_range(vma, range.start, range.end);
- else
- flush_hugetlb_tlb_range(vma, start, end);
+
+ tlb_flush_mmu_tlbonly(tlb);
+ huge_pmd_unshare_flush(tlb, vma);
/*
* No need to call mmu_notifier_arch_invalidate_secondary_tlbs() we are
* downgrading page table protection not changing it to point to a new
@@ -6904,18 +6887,27 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
return pte;
}
-/*
- * unmap huge page backed by shared pte.
+/**
+ * huge_pmd_unshare - Unmap a pmd table if it is shared by multiple users
+ * @tlb: the current mmu_gather.
+ * @vma: the vma covering the pmd table.
+ * @addr: the address we are trying to unshare.
+ * @ptep: pointer into the (pmd) page table.
+ *
+ * Called with the page table lock held, the i_mmap_rwsem held in write mode
+ * and the hugetlb vma lock held in write mode.
*
- * Called with page table lock held.
+ * Note: The caller must call huge_pmd_unshare_flush() before dropping the
+ * i_mmap_rwsem.
*
- * returns: 1 successfully unmapped a shared pte page
- * 0 the underlying pte page is not shared, or it is the last user
+ * Returns: 1 if it was a shared PMD table and it got unmapped, or 0 if it
+ * was not a shared PMD table.
*/
-int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep)
+int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
{
unsigned long sz = huge_page_size(hstate_vma(vma));
+ struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd = pgd_offset(mm, addr);
p4d_t *p4d = p4d_offset(pgd, addr);
pud_t *pud = pud_offset(p4d, addr);
@@ -6927,18 +6919,36 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
i_mmap_assert_write_locked(vma->vm_file->f_mapping);
hugetlb_vma_assert_locked(vma);
pud_clear(pud);
- /*
- * Once our caller drops the rmap lock, some other process might be
- * using this page table as a normal, non-hugetlb page table.
- * Wait for pending gup_fast() in other threads to finish before letting
- * that happen.
- */
- tlb_remove_table_sync_one();
- ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep));
+
+ tlb_unshare_pmd_ptdesc(tlb, virt_to_ptdesc(ptep), addr);
+
mm_dec_nr_pmds(mm);
return 1;
}
+/*
+ * huge_pmd_unshare_flush - Complete a sequence of huge_pmd_unshare() calls
+ * @tlb: the current mmu_gather.
+ * @vma: the vma covering the pmd table.
+ *
+ * Perform necessary TLB flushes or IPI broadcasts to synchronize PMD table
+ * unsharing with concurrent page table walkers (TLB, GUP-fast, etc.).
+ *
+ * This function must be called after a sequence of huge_pmd_unshare()
+ * calls while still holding the i_mmap_rwsem.
+ */
+void huge_pmd_unshare_flush(struct mmu_gather *tlb, struct vm_area_struct *vma)
+{
+ /*
+ * We must synchronize page table unsharing such that nobody will
+ * try reusing a previously-shared page table while it might still
+ * be in use by previous sharers (TLB, GUP_fast).
+ */
+ i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+
+ tlb_flush_unshared_tables(tlb);
+}
+
#else /* !CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */
pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -6947,12 +6957,16 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
return NULL;
}
-int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep)
+int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
{
return 0;
}
+void huge_pmd_unshare_flush(struct mmu_gather *tlb, struct vm_area_struct *vma)
+{
+}
+
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
unsigned long *start, unsigned long *end)
{
@@ -7219,6 +7233,7 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
unsigned long sz = huge_page_size(h);
struct mm_struct *mm = vma->vm_mm;
struct mmu_notifier_range range;
+ struct mmu_gather tlb;
unsigned long address;
spinlock_t *ptl;
pte_t *ptep;
@@ -7229,6 +7244,7 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
if (start >= end)
return;
+ tlb_gather_mmu(&tlb, mm);
flush_cache_range(vma, start, end);
/*
* No need to call adjust_range_if_pmd_sharing_possible(), because
@@ -7248,10 +7264,10 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
if (!ptep)
continue;
ptl = huge_pte_lock(h, mm, ptep);
- huge_pmd_unshare(mm, vma, address, ptep);
+ huge_pmd_unshare(&tlb, vma, address, ptep);
spin_unlock(ptl);
}
- flush_hugetlb_tlb_range(vma, start, end);
+ huge_pmd_unshare_flush(&tlb, vma);
if (take_locks) {
i_mmap_unlock_write(vma->vm_file->f_mapping);
hugetlb_vma_unlock_write(vma);
@@ -7261,6 +7277,7 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
* Documentation/mm/mmu_notifier.rst.
*/
mmu_notifier_invalidate_range_end(&range);
+ tlb_finish_mmu(&tlb);
}
/*
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 247e3f9db6c7a..822a790127375 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -468,6 +468,12 @@ void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm)
*/
void tlb_finish_mmu(struct mmu_gather *tlb)
{
+ /*
+ * We expect an earlier huge_pmd_unshare_flush() call to sort this out,
+ * due to complicated locking requirements with page table unsharing.
+ */
+ VM_WARN_ON_ONCE(tlb->fully_unshared_tables);
+
/*
* If there are parallel threads are doing PTE changes on same range
* under non-exclusive lock (e.g., mmap_lock read-side) but defer TLB
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 283889e4f1cec..5c330e817129e 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -652,7 +652,7 @@ long change_protection(struct mmu_gather *tlb,
#endif
if (is_vm_hugetlb_page(vma))
- pages = hugetlb_change_protection(vma, start, end, newprot,
+ pages = hugetlb_change_protection(tlb, vma, start, end, newprot,
cp_flags);
else
pages = change_protection_range(tlb, vma, start, end, newprot,
diff --git a/mm/rmap.c b/mm/rmap.c
index 748f48727a162..d6799afe11147 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -76,7 +76,7 @@
#include <linux/mm_inline.h>
#include <linux/oom.h>
-#include <asm/tlbflush.h>
+#include <asm/tlb.h>
#define CREATE_TRACE_POINTS
#include <trace/events/migrate.h>
@@ -2008,13 +2008,17 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
* if unsuccessful.
*/
if (!anon) {
+ struct mmu_gather tlb;
+
VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
if (!hugetlb_vma_trylock_write(vma))
goto walk_abort;
- if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
+
+ tlb_gather_mmu(&tlb, mm);
+ if (huge_pmd_unshare(&tlb, vma, address, pvmw.pte)) {
hugetlb_vma_unlock_write(vma);
- flush_tlb_range(vma,
- range.start, range.end);
+ huge_pmd_unshare_flush(&tlb, vma);
+ tlb_finish_mmu(&tlb);
/*
* The PMD table was unmapped,
* consequently unmapping the folio.
@@ -2022,6 +2026,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
goto walk_done;
}
hugetlb_vma_unlock_write(vma);
+ tlb_finish_mmu(&tlb);
}
pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
if (pte_dirty(pteval))
@@ -2398,17 +2403,20 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
* fail if unsuccessful.
*/
if (!anon) {
+ struct mmu_gather tlb;
+
VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
if (!hugetlb_vma_trylock_write(vma)) {
page_vma_mapped_walk_done(&pvmw);
ret = false;
break;
}
- if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
- hugetlb_vma_unlock_write(vma);
- flush_tlb_range(vma,
- range.start, range.end);
+ tlb_gather_mmu(&tlb, mm);
+ if (huge_pmd_unshare(&tlb, vma, address, pvmw.pte)) {
+ hugetlb_vma_unlock_write(vma);
+ huge_pmd_unshare_flush(&tlb, vma);
+ tlb_finish_mmu(&tlb);
/*
* The PMD table was unmapped,
* consequently unmapping the folio.
@@ -2417,6 +2425,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
break;
}
hugetlb_vma_unlock_write(vma);
+ tlb_finish_mmu(&tlb);
}
/* Nuke the hugetlb page table entry */
pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
--
2.52.0
Hi stable folks,
Please apply commit 5b1e38c0792c ("dpaa2-mac: bail if the dpmacs fwnode
is not found") to 5.15, where it addresses an instance of
-Wsometimes-uninitialized with clang-21 and newer, introduced by commit
3264f599c1a8 ("net: dpaa2-mac: Add ACPI support for DPAA2 MAC driver")
in 5.14.
drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c:54:13: error: variable 'parent' is used uninitialized whenever 'if' condition is false [-Werror,-Wsometimes-uninitialized]
54 | } else if (is_acpi_node(fwnode)) {
| ^~~~~~~~~~~~~~~~~~~~
drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c:58:29: note: uninitialized use occurs here
58 | fwnode_for_each_child_node(parent, child) {
| ^~~~~~
drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c:54:9: note: remove the 'if' if its condition is always true
54 | } else if (is_acpi_node(fwnode)) {
| ^~~~~~~~~~~~~~~~~~~~~~~~~
drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c:43:39: note: initialize the variable 'parent' to silence this warning
43 | struct fwnode_handle *fwnode, *parent, *child = NULL;
| ^
| = NULL
It applies and builds cleanly for me. If there are any issues, please
let me know.
Cheers,
Nathan
Hi stable folks,
Please apply commit fc7bf4c0d65a ("drm/i915/selftests: Fix inconsistent
IS_ERR and PTR_ERR") to 5.15, where it resolves a couple of instances of
-Wuninitialized with clang-21 or newer that were introduced by commit
cdb35d1ed6d2 ("drm/i915/gem: Migrate to system at dma-buf attach time
(v7)") in 5.15.
In file included from drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c:329:
drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c:105:18: error: variable 'dmabuf' is uninitialized when used here [-Werror,-Wuninitialized]
105 | PTR_ERR(dmabuf));
| ^~~~~~
drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c:94:24: note: initialize the variable 'dmabuf' to silence this warning
94 | struct dma_buf *dmabuf;
| ^
| = NULL
drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c:161:18: error: variable 'dmabuf' is uninitialized when used here [-Werror,-Wuninitialized]
161 | PTR_ERR(dmabuf));
| ^~~~~~
drivers/gpu/drm/i915/gem/selftests/i915_gem_dmabuf.c:149:24: note: initialize the variable 'dmabuf' to silence this warning
149 | struct dma_buf *dmabuf;
| ^
| = NULL
It applies and builds cleanly for me. If there are any issues, please
let me know.
Cheers,
Nathan
Hi stable folks,
Please apply commit ccc35ff2fd29 ("leds: spi-byte: Use
devm_led_classdev_register_ext()") to 5.15, 6.1, and 6.6. It
inadvertently addresses an instance of -Wuninitialized visible with
clang-21 and newer:
drivers/leds/leds-spi-byte.c:99:26: error: variable 'child' is uninitialized when used here [-Werror,-Wuninitialized]
99 | of_property_read_string(child, "label", &name);
| ^~~~~
drivers/leds/leds-spi-byte.c:83:27: note: initialize the variable 'child' to silence this warning
83 | struct device_node *child;
| ^
| = NULL
It applies cleanly to 6.6. I have attached a backport for 6.1 and 5.15,
which can be generated locally with:
$ git format-patch -1 --stdout ccc35ff2fd2911986b716a87fe65e03fac2312c9 | sed 's;strscpy;strlcpy;' | patch -p1
This change seems safe to me but if I am missing a massive dependency
chain, an alternative would be moving child's initialization up in these
stable branches.
Cheers,
Nathan
diff --git a/drivers/leds/leds-spi-byte.c b/drivers/leds/leds-spi-byte.c
index 82696e0607a5..7dd876df8b36 100644
--- a/drivers/leds/leds-spi-byte.c
+++ b/drivers/leds/leds-spi-byte.c
@@ -96,6 +96,7 @@ static int spi_byte_probe(struct spi_device *spi)
if (!led)
return -ENOMEM;
+ child = of_get_next_available_child(dev_of_node(dev), NULL);
of_property_read_string(child, "label", &name);
strlcpy(led->name, name, sizeof(led->name));
led->spi = spi;
@@ -106,7 +107,6 @@ static int spi_byte_probe(struct spi_device *spi)
led->ldev.max_brightness = led->cdef->max_value - led->cdef->off_value;
led->ldev.brightness_set_blocking = spi_byte_brightness_set_blocking;
- child = of_get_next_available_child(dev_of_node(dev), NULL);
state = of_get_property(child, "default-state", NULL);
if (state) {
if (!strcmp(state, "on")) {
Replace the RISCV_ISA_V dependency of the RISC-V crypto code with
RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS, which implies RISCV_ISA_V as
well as vector unaligned accesses being efficient.
This is necessary because this code assumes that vector unaligned
accesses are supported and are efficient. (It does so to avoid having
to use lots of extra vsetvli instructions to switch the element width
back and forth between 8 and either 32 or 64.)
This was omitted from the code originally just because the RISC-V kernel
support for detecting this feature didn't exist yet. Support has now
been added, but it's fragmented into per-CPU runtime detection, a
command-line parameter, and a kconfig option. The kconfig option is the
only reasonable way to do it, though, so let's just rely on that.
Fixes: eb24af5d7a05 ("crypto: riscv - add vector crypto accelerated AES-{ECB,CBC,CTR,XTS}")
Fixes: bb54668837a0 ("crypto: riscv - add vector crypto accelerated ChaCha20")
Fixes: 600a3853dfa0 ("crypto: riscv - add vector crypto accelerated GHASH")
Fixes: 8c8e40470ffe ("crypto: riscv - add vector crypto accelerated SHA-{256,224}")
Fixes: b3415925a08b ("crypto: riscv - add vector crypto accelerated SHA-{512,384}")
Fixes: 563a5255afa2 ("crypto: riscv - add vector crypto accelerated SM3")
Fixes: b8d06352bbf3 ("crypto: riscv - add vector crypto accelerated SM4")
Cc: stable(a)vger.kernel.org
Signed-off-by: Eric Biggers <ebiggers(a)kernel.org>
---
arch/riscv/crypto/Kconfig | 12 ++++++++----
lib/crypto/Kconfig | 9 ++++++---
2 files changed, 14 insertions(+), 7 deletions(-)
diff --git a/arch/riscv/crypto/Kconfig b/arch/riscv/crypto/Kconfig
index a75d6325607b..14c5acb935e9 100644
--- a/arch/riscv/crypto/Kconfig
+++ b/arch/riscv/crypto/Kconfig
@@ -2,11 +2,12 @@
menu "Accelerated Cryptographic Algorithms for CPU (riscv)"
config CRYPTO_AES_RISCV64
tristate "Ciphers: AES, modes: ECB, CBC, CTS, CTR, XTS"
- depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
+ depends on 64BIT && TOOLCHAIN_HAS_VECTOR_CRYPTO && \
+ RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS
select CRYPTO_ALGAPI
select CRYPTO_LIB_AES
select CRYPTO_SKCIPHER
help
Block cipher: AES cipher algorithms
@@ -18,21 +19,23 @@ config CRYPTO_AES_RISCV64
- Zvkb vector crypto extension (CTR)
- Zvkg vector crypto extension (XTS)
config CRYPTO_GHASH_RISCV64
tristate "Hash functions: GHASH"
- depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
+ depends on 64BIT && TOOLCHAIN_HAS_VECTOR_CRYPTO && \
+ RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS
select CRYPTO_GCM
help
GCM GHASH function (NIST SP 800-38D)
Architecture: riscv64 using:
- Zvkg vector crypto extension
config CRYPTO_SM3_RISCV64
tristate "Hash functions: SM3 (ShangMi 3)"
- depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
+ depends on 64BIT && TOOLCHAIN_HAS_VECTOR_CRYPTO && \
+ RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS
select CRYPTO_HASH
select CRYPTO_LIB_SM3
help
SM3 (ShangMi 3) secure hash function (OSCCA GM/T 0004-2012)
@@ -40,11 +43,12 @@ config CRYPTO_SM3_RISCV64
- Zvksh vector crypto extension
- Zvkb vector crypto extension
config CRYPTO_SM4_RISCV64
tristate "Ciphers: SM4 (ShangMi 4)"
- depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
+ depends on 64BIT && TOOLCHAIN_HAS_VECTOR_CRYPTO && \
+ RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS
select CRYPTO_ALGAPI
select CRYPTO_SM4
help
SM4 block cipher algorithm (OSCCA GB/T 32907-2016,
ISO/IEC 18033-3:2010/Amd 1:2021)
diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig
index a3647352bff6..6871a41e5069 100644
--- a/lib/crypto/Kconfig
+++ b/lib/crypto/Kconfig
@@ -59,11 +59,12 @@ config CRYPTO_LIB_CHACHA_ARCH
depends on CRYPTO_LIB_CHACHA && !UML && !KMSAN
default y if ARM
default y if ARM64 && KERNEL_MODE_NEON
default y if MIPS && CPU_MIPS32_R2
default y if PPC64 && CPU_LITTLE_ENDIAN && VSX
- default y if RISCV && 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
+ default y if RISCV && 64BIT && TOOLCHAIN_HAS_VECTOR_CRYPTO && \
+ RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS
default y if S390
default y if X86_64
config CRYPTO_LIB_CURVE25519
tristate
@@ -182,11 +183,12 @@ config CRYPTO_LIB_SHA256_ARCH
depends on CRYPTO_LIB_SHA256 && !UML
default y if ARM && !CPU_V7M
default y if ARM64
default y if MIPS && CPU_CAVIUM_OCTEON
default y if PPC && SPE
- default y if RISCV && 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
+ default y if RISCV && 64BIT && TOOLCHAIN_HAS_VECTOR_CRYPTO && \
+ RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS
default y if S390
default y if SPARC64
default y if X86_64
config CRYPTO_LIB_SHA512
@@ -200,11 +202,12 @@ config CRYPTO_LIB_SHA512_ARCH
bool
depends on CRYPTO_LIB_SHA512 && !UML
default y if ARM && !CPU_V7M
default y if ARM64
default y if MIPS && CPU_CAVIUM_OCTEON
- default y if RISCV && 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
+ default y if RISCV && 64BIT && TOOLCHAIN_HAS_VECTOR_CRYPTO && \
+ RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS
default y if S390
default y if SPARC64
default y if X86_64
config CRYPTO_LIB_SHA3
base-commit: 43dfc13ca972988e620a6edb72956981b75ab6b0
--
2.52.0
The patch titled
Subject: mm/hugetlb: fix excessive IPI broadcasts when unsharing PMD tables using mmu_gather
has been added to the -mm mm-unstable branch. Its filename is
mm-hugetlb-fix-excessive-ipi-broadcasts-when-unsharing-pmd-tables-using-mmu_gather.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: "David Hildenbrand (Red Hat)" <david(a)kernel.org>
Subject: mm/hugetlb: fix excessive IPI broadcasts when unsharing PMD tables using mmu_gather
Date: Fri, 5 Dec 2025 22:35:58 +0100
As reported, ever since commit 1013af4f585f ("mm/hugetlb: fix
huge_pmd_unshare() vs GUP-fast race") we can end up in some situations
where we perform so many IPI broadcasts when unsharing hugetlb PMD page
tables that it severely regresses some workloads.
In particular, when we fork()+exit(), or when we munmap() a large area
backed by many shared PMD tables, we perform one IPI broadcast per
unshared PMD table.
There are two optimizations to be had:
(1) When we process (unshare) multiple such PMD tables, such as during
exit(), it is sufficient to send a single IPI broadcast (as long as
we respect locking rules) instead of one per PMD table.
Locking prevents that any of these PMD tables could get reuse before
we drop the lock.
(2) When we are not the last sharer (> 2 users including us), there is
no need to send the IPI broadcast. The shared PMD tables cannot
become exclusive (fully unshared) before an IPI will be broadcasted
by the last sharer.
Concurrent GUP-fast could walk into a PMD table just before we
unshared it. It could then succeed in grabbing a page from the
shared page table even after munmap() etc succeeded (and supressed
an IPI). But there is not difference compared to GUP-fast just
sleeping for a while after grabbing the page and re-enabling IRQs.
Most importantly, GUP-fast will never walk into page tables that are
no-longer shared, because the last sharer will issue an IPI
broadcast.
(if ever required, checking whether the PUD changed in GUP-fast
after grabbing the page like we do in the PTE case could handle
this)
So let's rework PMD sharing TLB flushing + IPI sync to use the mmu_gather
infrastructure so we can implement these optimizations and demystify the
code at least a bit. Extend the mmu_gather infrastructure to be able to
deal with our special hugetlb PMD table sharing implementation.
We'll consolidate the handling for (full) unsharing of PMD tables in
tlb_unshare_pmd_ptdesc() and tlb_flush_unshared_tables(), and track in
"struct mmu_gather" whether we had (full) unsharing of PMD tables.
Because locking is very special (concurrent unsharing+reuse must be
prevented), we disallow deferring flushing to tlb_finish_mmu() and instead
require an explicit earlier call to tlb_flush_unshared_tables().
From hugetlb code, we call huge_pmd_unshare_flush() where we make sure
that the expected lock protecting us from concurrent unsharing+reuse is
still held.
Check with a VM_WARN_ON_ONCE() in tlb_finish_mmu() that
tlb_flush_unshared_tables() was properly called earlier.
Document it all properly.
Notes about tlb_remove_table_sync_one() interaction with unsharing:
There are two fairly tricky things:
(1) tlb_remove_table_sync_one() is a NOP on architectures without
CONFIG_MMU_GATHER_RCU_TABLE_FREE.
Here, the assumption is that the previous TLB flush would send an
IPI to all relevant CPUs. Careful: some architectures like x86 only
send IPIs to all relevant CPUs when tlb->freed_tables is set.
The relevant architectures should be selecting
MMU_GATHER_RCU_TABLE_FREE, but x86 might not do that in stable
kernels and it might have been problematic before this patch.
Also, the arch flushing behavior (independent of IPIs) is different
when tlb->freed_tables is set. Do we have to enlighten them to also
take care of tlb->unshared_tables? So far we didn't care, so
hopefully we are fine. Of course, we could be setting
tlb->freed_tables as well, but that might then unnecessarily flush
too much, because the semantics of tlb->freed_tables are a bit
fuzzy.
This patch changes nothing in this regard.
(2) tlb_remove_table_sync_one() is not a NOP on architectures with
CONFIG_MMU_GATHER_RCU_TABLE_FREE that actually don't need a sync.
Take x86 as an example: in the common case (!pv, !X86_FEATURE_INVLPGB)
we still issue IPIs during TLB flushes and don't actually need the
second tlb_remove_table_sync_one().
This optimized can be implemented on top of this, by checking e.g., in
tlb_remove_table_sync_one() whether we really need IPIs. But as
described in (1), it really must honor tlb->freed_tables then to
send IPIs to all relevant CPUs.
Further note that the ptdesc_pmd_pts_dec() in huge_pmd_share() is not a
concern, as we are holding the i_mmap_lock the whole time, preventing
concurrent unsharing. That ptdesc_pmd_pts_dec() usage will be removed
separately as a cleanup later.
There are plenty more cleanups to be had, but they have to wait until
this is fixed.
Link: https://lkml.kernel.org/r/20251205213558.2980480-5-david@kernel.org
Fixes: 1013af4f585f ("mm/hugetlb: fix huge_pmd_unshare() vs GUP-fast race")
Signed-off-by: David Hildenbrand (Red Hat) <david(a)kernel.org>
Reported-by: Uschakow, Stanislav" <suschako(a)amazon.de>
Closes: https://lore.kernel.org/all/4d3878531c76479d9f8ca9789dc6485d@amazon.de/
Tested-by: Laurence Oberman <loberman(a)redhat.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar(a)kernel.org>
Cc: Arnd Bergmann <arnd(a)arndb.de>
Cc: Jann Horn <jannh(a)google.com>
Cc: Liam Howlett <liam.howlett(a)oracle.com>
Cc: Liu Shixin <liushixin2(a)huawei.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
Cc: Muchun Song <muchun.song(a)linux.dev>
Cc: Nadav Amit <nadav.amit(a)gmail.com>
Cc: Nicholas Piggin <npiggin(a)gmail.com>
Cc: Oscar Salvador <osalvador(a)suse.de>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Prakash Sangappa <prakash.sangappa(a)oracle.com>
Cc: Rik van Riel <riel(a)surriel.com>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: Will Deacon <will(a)kernel.org>
Cc: Lance Yang <lance.yang(a)linux.dev>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/asm-generic/tlb.h | 69 ++++++++++++++++++++
include/linux/hugetlb.h | 19 +++--
mm/hugetlb.c | 121 ++++++++++++++++++++----------------
mm/mmu_gather.c | 6 +
mm/mprotect.c | 2
mm/rmap.c | 25 +++++--
6 files changed, 173 insertions(+), 69 deletions(-)
--- a/include/asm-generic/tlb.h~mm-hugetlb-fix-excessive-ipi-broadcasts-when-unsharing-pmd-tables-using-mmu_gather
+++ a/include/asm-generic/tlb.h
@@ -364,6 +364,17 @@ struct mmu_gather {
unsigned int vma_huge : 1;
unsigned int vma_pfn : 1;
+ /*
+ * Did we unshare (unmap) any shared page tables?
+ */
+ unsigned int unshared_tables : 1;
+
+ /*
+ * Did we unshare any page tables such that they are now exclusive
+ * and could get reused+modified by the new owner?
+ */
+ unsigned int fully_unshared_tables : 1;
+
unsigned int batch_count;
#ifndef CONFIG_MMU_GATHER_NO_GATHER
@@ -400,6 +411,7 @@ static inline void __tlb_reset_range(str
tlb->cleared_pmds = 0;
tlb->cleared_puds = 0;
tlb->cleared_p4ds = 0;
+ tlb->unshared_tables = 0;
/*
* Do not reset mmu_gather::vma_* fields here, we do not
* call into tlb_start_vma() again to set them if there is an
@@ -484,7 +496,7 @@ static inline void tlb_flush_mmu_tlbonly
* these bits.
*/
if (!(tlb->freed_tables || tlb->cleared_ptes || tlb->cleared_pmds ||
- tlb->cleared_puds || tlb->cleared_p4ds))
+ tlb->cleared_puds || tlb->cleared_p4ds || tlb->unshared_tables))
return;
tlb_flush(tlb);
@@ -773,6 +785,61 @@ static inline bool huge_pmd_needs_flush(
}
#endif
+#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
+static inline void tlb_unshare_pmd_ptdesc(struct mmu_gather *tlb, struct ptdesc *pt,
+ unsigned long addr)
+{
+ /*
+ * The caller must make sure that concurrent unsharing + exclusive
+ * reuse is impossible until tlb_flush_unshared_tables() was called.
+ */
+ VM_WARN_ON_ONCE(!ptdesc_pmd_is_shared(pt));
+ ptdesc_pmd_pts_dec(pt);
+
+ /* Clearing a PUD pointing at a PMD table with PMD leaves. */
+ tlb_flush_pmd_range(tlb, addr & PUD_MASK, PUD_SIZE);
+
+ /*
+ * If the page table is now exclusively owned, we fully unshared
+ * a page table.
+ */
+ if (!ptdesc_pmd_is_shared(pt))
+ tlb->fully_unshared_tables = true;
+ tlb->unshared_tables = true;
+}
+
+static inline void tlb_flush_unshared_tables(struct mmu_gather *tlb)
+{
+ /*
+ * As soon as the caller drops locks to allow for reuse of
+ * previously-shared tables, these tables could get modified and
+ * even reused outside of hugetlb context. So flush the TLB now.
+ *
+ * Note that we cannot defer the flush to a later point even if we are
+ * not the last sharer of the page table.
+ */
+ if (tlb->unshared_tables)
+ tlb_flush_mmu_tlbonly(tlb);
+
+ /*
+ * Similarly, we must make sure that concurrent GUP-fast will not
+ * walk previously-shared page tables that are getting modified+reused
+ * elsewhere. So broadcast an IPI to wait for any concurrent GUP-fast.
+ *
+ * We only perform this when we are the last sharer of a page table,
+ * as the IPI will reach all CPUs: any GUP-fast.
+ *
+ * Note that on configs where tlb_remove_table_sync_one() is a NOP,
+ * the expectation is that the tlb_flush_mmu_tlbonly() would have issued
+ * required IPIs already for us.
+ */
+ if (tlb->fully_unshared_tables) {
+ tlb_remove_table_sync_one();
+ tlb->fully_unshared_tables = false;
+ }
+}
+#endif /* CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */
+
#endif /* CONFIG_MMU */
#endif /* _ASM_GENERIC__TLB_H */
--- a/include/linux/hugetlb.h~mm-hugetlb-fix-excessive-ipi-broadcasts-when-unsharing-pmd-tables-using-mmu_gather
+++ a/include/linux/hugetlb.h
@@ -240,8 +240,9 @@ pte_t *huge_pte_alloc(struct mm_struct *
pte_t *huge_pte_offset(struct mm_struct *mm,
unsigned long addr, unsigned long sz);
unsigned long hugetlb_mask_last_page(struct hstate *h);
-int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep);
+int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep);
+void huge_pmd_unshare_flush(struct mmu_gather *tlb, struct vm_area_struct *vma);
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
unsigned long *start, unsigned long *end);
@@ -271,7 +272,7 @@ void hugetlb_vma_unlock_write(struct vm_
int hugetlb_vma_trylock_write(struct vm_area_struct *vma);
void hugetlb_vma_assert_locked(struct vm_area_struct *vma);
void hugetlb_vma_lock_release(struct kref *kref);
-long hugetlb_change_protection(struct vm_area_struct *vma,
+long hugetlb_change_protection(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long address, unsigned long end, pgprot_t newprot,
unsigned long cp_flags);
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
@@ -300,13 +301,17 @@ static inline struct address_space *huge
return NULL;
}
-static inline int huge_pmd_unshare(struct mm_struct *mm,
- struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep)
+static inline int huge_pmd_unshare(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
{
return 0;
}
+static inline void huge_pmd_unshare_flush(struct mmu_gather *tlb,
+ struct vm_area_struct *vma)
+{
+}
+
static inline void adjust_range_if_pmd_sharing_possible(
struct vm_area_struct *vma,
unsigned long *start, unsigned long *end)
@@ -432,7 +437,7 @@ static inline void move_hugetlb_state(st
{
}
-static inline long hugetlb_change_protection(
+static inline long hugetlb_change_protection(struct mmu_gather *tlb,
struct vm_area_struct *vma, unsigned long address,
unsigned long end, pgprot_t newprot,
unsigned long cp_flags)
--- a/mm/hugetlb.c~mm-hugetlb-fix-excessive-ipi-broadcasts-when-unsharing-pmd-tables-using-mmu_gather
+++ a/mm/hugetlb.c
@@ -5096,8 +5096,9 @@ int move_hugetlb_page_tables(struct vm_a
unsigned long last_addr_mask;
pte_t *src_pte, *dst_pte;
struct mmu_notifier_range range;
- bool shared_pmd = false;
+ struct mmu_gather tlb;
+ tlb_gather_mmu(&tlb, vma->vm_mm);
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, old_addr,
old_end);
adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
@@ -5122,12 +5123,12 @@ int move_hugetlb_page_tables(struct vm_a
if (huge_pte_none(huge_ptep_get(mm, old_addr, src_pte)))
continue;
- if (huge_pmd_unshare(mm, vma, old_addr, src_pte)) {
- shared_pmd = true;
+ if (huge_pmd_unshare(&tlb, vma, old_addr, src_pte)) {
old_addr |= last_addr_mask;
new_addr |= last_addr_mask;
continue;
}
+ tlb_remove_huge_tlb_entry(h, &tlb, src_pte, old_addr);
dst_pte = huge_pte_alloc(mm, new_vma, new_addr, sz);
if (!dst_pte)
@@ -5136,13 +5137,13 @@ int move_hugetlb_page_tables(struct vm_a
move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte, sz);
}
- if (shared_pmd)
- flush_hugetlb_tlb_range(vma, range.start, range.end);
- else
- flush_hugetlb_tlb_range(vma, old_end - len, old_end);
+ tlb_flush_mmu_tlbonly(&tlb);
+ huge_pmd_unshare_flush(&tlb, vma);
+
mmu_notifier_invalidate_range_end(&range);
i_mmap_unlock_write(mapping);
hugetlb_vma_unlock_write(vma);
+ tlb_finish_mmu(&tlb);
return len + old_addr - old_end;
}
@@ -5161,7 +5162,6 @@ void __unmap_hugepage_range(struct mmu_g
unsigned long sz = huge_page_size(h);
bool adjust_reservation;
unsigned long last_addr_mask;
- bool force_flush = false;
WARN_ON(!is_vm_hugetlb_page(vma));
BUG_ON(start & ~huge_page_mask(h));
@@ -5184,10 +5184,8 @@ void __unmap_hugepage_range(struct mmu_g
}
ptl = huge_pte_lock(h, mm, ptep);
- if (huge_pmd_unshare(mm, vma, address, ptep)) {
+ if (huge_pmd_unshare(tlb, vma, address, ptep)) {
spin_unlock(ptl);
- tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE);
- force_flush = true;
address |= last_addr_mask;
continue;
}
@@ -5303,14 +5301,7 @@ void __unmap_hugepage_range(struct mmu_g
}
tlb_end_vma(tlb, vma);
- /*
- * There is nothing protecting a previously-shared page table that we
- * unshared through huge_pmd_unshare() from getting freed after we
- * release i_mmap_rwsem, so flush the TLB now. If huge_pmd_unshare()
- * succeeded, flush the range corresponding to the pud.
- */
- if (force_flush)
- tlb_flush_mmu_tlbonly(tlb);
+ huge_pmd_unshare_flush(tlb, vma);
}
void __hugetlb_zap_begin(struct vm_area_struct *vma,
@@ -6399,7 +6390,7 @@ out_release_nounlock:
}
#endif /* CONFIG_USERFAULTFD */
-long hugetlb_change_protection(struct vm_area_struct *vma,
+long hugetlb_change_protection(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long address, unsigned long end,
pgprot_t newprot, unsigned long cp_flags)
{
@@ -6409,7 +6400,6 @@ long hugetlb_change_protection(struct vm
pte_t pte;
struct hstate *h = hstate_vma(vma);
long pages = 0, psize = huge_page_size(h);
- bool shared_pmd = false;
struct mmu_notifier_range range;
unsigned long last_addr_mask;
bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
@@ -6452,7 +6442,7 @@ long hugetlb_change_protection(struct vm
}
}
ptl = huge_pte_lock(h, mm, ptep);
- if (huge_pmd_unshare(mm, vma, address, ptep)) {
+ if (huge_pmd_unshare(tlb, vma, address, ptep)) {
/*
* When uffd-wp is enabled on the vma, unshare
* shouldn't happen at all. Warn about it if it
@@ -6461,7 +6451,6 @@ long hugetlb_change_protection(struct vm
WARN_ON_ONCE(uffd_wp || uffd_wp_resolve);
pages++;
spin_unlock(ptl);
- shared_pmd = true;
address |= last_addr_mask;
continue;
}
@@ -6522,22 +6511,16 @@ long hugetlb_change_protection(struct vm
pte = huge_pte_clear_uffd_wp(pte);
huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
pages++;
+ tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
}
next:
spin_unlock(ptl);
cond_resched();
}
- /*
- * There is nothing protecting a previously-shared page table that we
- * unshared through huge_pmd_unshare() from getting freed after we
- * release i_mmap_rwsem, so flush the TLB now. If huge_pmd_unshare()
- * succeeded, flush the range corresponding to the pud.
- */
- if (shared_pmd)
- flush_hugetlb_tlb_range(vma, range.start, range.end);
- else
- flush_hugetlb_tlb_range(vma, start, end);
+
+ tlb_flush_mmu_tlbonly(tlb);
+ huge_pmd_unshare_flush(tlb, vma);
/*
* No need to call mmu_notifier_arch_invalidate_secondary_tlbs() we are
* downgrading page table protection not changing it to point to a new
@@ -6904,18 +6887,27 @@ out:
return pte;
}
-/*
- * unmap huge page backed by shared pte.
+/**
+ * huge_pmd_unshare - Unmap a pmd table if it is shared by multiple users
+ * @tlb: the current mmu_gather.
+ * @vma: the vma covering the pmd table.
+ * @addr: the address we are trying to unshare.
+ * @ptep: pointer into the (pmd) page table.
+ *
+ * Called with the page table lock held, the i_mmap_rwsem held in write mode
+ * and the hugetlb vma lock held in write mode.
*
- * Called with page table lock held.
+ * Note: The caller must call huge_pmd_unshare_flush() before dropping the
+ * i_mmap_rwsem.
*
- * returns: 1 successfully unmapped a shared pte page
- * 0 the underlying pte page is not shared, or it is the last user
+ * Returns: 1 if it was a shared PMD table and it got unmapped, or 0 if it
+ * was not a shared PMD table.
*/
-int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep)
+int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
{
unsigned long sz = huge_page_size(hstate_vma(vma));
+ struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd = pgd_offset(mm, addr);
p4d_t *p4d = p4d_offset(pgd, addr);
pud_t *pud = pud_offset(p4d, addr);
@@ -6927,18 +6919,36 @@ int huge_pmd_unshare(struct mm_struct *m
i_mmap_assert_write_locked(vma->vm_file->f_mapping);
hugetlb_vma_assert_locked(vma);
pud_clear(pud);
- /*
- * Once our caller drops the rmap lock, some other process might be
- * using this page table as a normal, non-hugetlb page table.
- * Wait for pending gup_fast() in other threads to finish before letting
- * that happen.
- */
- tlb_remove_table_sync_one();
- ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep));
+
+ tlb_unshare_pmd_ptdesc(tlb, virt_to_ptdesc(ptep), addr);
+
mm_dec_nr_pmds(mm);
return 1;
}
+/*
+ * huge_pmd_unshare_flush - Complete a sequence of huge_pmd_unshare() calls
+ * @tlb: the current mmu_gather.
+ * @vma: the vma covering the pmd table.
+ *
+ * Perform necessary TLB flushes or IPI broadcasts to synchronize PMD table
+ * unsharing with concurrent page table walkers (TLB, GUP-fast, etc.).
+ *
+ * This function must be called after a sequence of huge_pmd_unshare()
+ * calls while still holding the i_mmap_rwsem.
+ */
+void huge_pmd_unshare_flush(struct mmu_gather *tlb, struct vm_area_struct *vma)
+{
+ /*
+ * We must synchronize page table unsharing such that nobody will
+ * try reusing a previously-shared page table while it might still
+ * be in use by previous sharers (TLB, GUP_fast).
+ */
+ i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+
+ tlb_flush_unshared_tables(tlb);
+}
+
#else /* !CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */
pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -6947,12 +6957,16 @@ pte_t *huge_pmd_share(struct mm_struct *
return NULL;
}
-int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep)
+int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
{
return 0;
}
+void huge_pmd_unshare_flush(struct mmu_gather *tlb, struct vm_area_struct *vma)
+{
+}
+
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
unsigned long *start, unsigned long *end)
{
@@ -7219,6 +7233,7 @@ static void hugetlb_unshare_pmds(struct
unsigned long sz = huge_page_size(h);
struct mm_struct *mm = vma->vm_mm;
struct mmu_notifier_range range;
+ struct mmu_gather tlb;
unsigned long address;
spinlock_t *ptl;
pte_t *ptep;
@@ -7229,6 +7244,7 @@ static void hugetlb_unshare_pmds(struct
if (start >= end)
return;
+ tlb_gather_mmu(&tlb, mm);
flush_cache_range(vma, start, end);
/*
* No need to call adjust_range_if_pmd_sharing_possible(), because
@@ -7248,10 +7264,10 @@ static void hugetlb_unshare_pmds(struct
if (!ptep)
continue;
ptl = huge_pte_lock(h, mm, ptep);
- huge_pmd_unshare(mm, vma, address, ptep);
+ huge_pmd_unshare(&tlb, vma, address, ptep);
spin_unlock(ptl);
}
- flush_hugetlb_tlb_range(vma, start, end);
+ huge_pmd_unshare_flush(&tlb, vma);
if (take_locks) {
i_mmap_unlock_write(vma->vm_file->f_mapping);
hugetlb_vma_unlock_write(vma);
@@ -7261,6 +7277,7 @@ static void hugetlb_unshare_pmds(struct
* Documentation/mm/mmu_notifier.rst.
*/
mmu_notifier_invalidate_range_end(&range);
+ tlb_finish_mmu(&tlb);
}
/*
--- a/mm/mmu_gather.c~mm-hugetlb-fix-excessive-ipi-broadcasts-when-unsharing-pmd-tables-using-mmu_gather
+++ a/mm/mmu_gather.c
@@ -469,6 +469,12 @@ void tlb_gather_mmu_fullmm(struct mmu_ga
void tlb_finish_mmu(struct mmu_gather *tlb)
{
/*
+ * We expect an earlier huge_pmd_unshare_flush() call to sort this out,
+ * due to complicated locking requirements with page table unsharing.
+ */
+ VM_WARN_ON_ONCE(tlb->fully_unshared_tables);
+
+ /*
* If there are parallel threads are doing PTE changes on same range
* under non-exclusive lock (e.g., mmap_lock read-side) but defer TLB
* flush by batching, one thread may end up seeing inconsistent PTEs
--- a/mm/mprotect.c~mm-hugetlb-fix-excessive-ipi-broadcasts-when-unsharing-pmd-tables-using-mmu_gather
+++ a/mm/mprotect.c
@@ -652,7 +652,7 @@ long change_protection(struct mmu_gather
#endif
if (is_vm_hugetlb_page(vma))
- pages = hugetlb_change_protection(vma, start, end, newprot,
+ pages = hugetlb_change_protection(tlb, vma, start, end, newprot,
cp_flags);
else
pages = change_protection_range(tlb, vma, start, end, newprot,
--- a/mm/rmap.c~mm-hugetlb-fix-excessive-ipi-broadcasts-when-unsharing-pmd-tables-using-mmu_gather
+++ a/mm/rmap.c
@@ -76,7 +76,7 @@
#include <linux/mm_inline.h>
#include <linux/oom.h>
-#include <asm/tlbflush.h>
+#include <asm/tlb.h>
#define CREATE_TRACE_POINTS
#include <trace/events/migrate.h>
@@ -2008,13 +2008,17 @@ static bool try_to_unmap_one(struct foli
* if unsuccessful.
*/
if (!anon) {
+ struct mmu_gather tlb;
+
VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
if (!hugetlb_vma_trylock_write(vma))
goto walk_abort;
- if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
+
+ tlb_gather_mmu(&tlb, mm);
+ if (huge_pmd_unshare(&tlb, vma, address, pvmw.pte)) {
hugetlb_vma_unlock_write(vma);
- flush_tlb_range(vma,
- range.start, range.end);
+ huge_pmd_unshare_flush(&tlb, vma);
+ tlb_finish_mmu(&tlb);
/*
* The PMD table was unmapped,
* consequently unmapping the folio.
@@ -2022,6 +2026,7 @@ static bool try_to_unmap_one(struct foli
goto walk_done;
}
hugetlb_vma_unlock_write(vma);
+ tlb_finish_mmu(&tlb);
}
pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
if (pte_dirty(pteval))
@@ -2398,17 +2403,20 @@ static bool try_to_migrate_one(struct fo
* fail if unsuccessful.
*/
if (!anon) {
+ struct mmu_gather tlb;
+
VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
if (!hugetlb_vma_trylock_write(vma)) {
page_vma_mapped_walk_done(&pvmw);
ret = false;
break;
}
- if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
- hugetlb_vma_unlock_write(vma);
- flush_tlb_range(vma,
- range.start, range.end);
+ tlb_gather_mmu(&tlb, mm);
+ if (huge_pmd_unshare(&tlb, vma, address, pvmw.pte)) {
+ hugetlb_vma_unlock_write(vma);
+ huge_pmd_unshare_flush(&tlb, vma);
+ tlb_finish_mmu(&tlb);
/*
* The PMD table was unmapped,
* consequently unmapping the folio.
@@ -2417,6 +2425,7 @@ static bool try_to_migrate_one(struct fo
break;
}
hugetlb_vma_unlock_write(vma);
+ tlb_finish_mmu(&tlb);
}
/* Nuke the hugetlb page table entry */
pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
_
Patches currently in -mm which might be from david(a)kernel.org are
mm-hugetlb-fix-hugetlb_pmd_shared.patch
mm-hugetlb-fix-two-comments-related-to-huge_pmd_unshare.patch
mm-rmap-fix-two-comments-related-to-huge_pmd_unshare.patch
mm-hugetlb-fix-excessive-ipi-broadcasts-when-unsharing-pmd-tables-using-mmu_gather.patch
The patch titled
Subject: mm/hugetlb: fix hugetlb_pmd_shared()
has been added to the -mm mm-unstable branch. Its filename is
mm-hugetlb-fix-hugetlb_pmd_shared.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: "David Hildenbrand (Red Hat)" <david(a)kernel.org>
Subject: mm/hugetlb: fix hugetlb_pmd_shared()
Date: Fri, 5 Dec 2025 22:35:55 +0100
Patch series "mm/hugetlb: fixes for PMD table sharing (incl. using
mmu_gather)".
One functional fix, one performance regression fix, and two related
comment fixes.
I cleaned up my prototype I recently shared [1] for the performance fix,
deferring most of the cleanups I had in the prototype to a later point.
While doing that I identified the other things.
The goal of this patch set is to be backported to stable trees "fairly"
easily. At least patch #1 and #4.
Patch #1 fixes hugetlb_pmd_shared() not detecting any sharing
Patch #2 + #3 are simple comment fixes that patch #4 interacts with.
Patch #4 is a fix for the reported performance regression due to excessive
IPI broadcasts during fork()+exit().
The last patch is all about TLB flushes, IPIs and mmu_gather. Read:
complicated
I added as much comments + description that I possibly could, and I am
hoping for review from Jann.
There are plenty of cleanups in the future to be had + one reasonable
optimization on x86. But that's all out of scope for this series.
This patch (of 4):
We switched from (wrongly) using the page count to an independent shared
count. Now, shared page tables have a refcount of 1 (excluding
speculative references) and instead use ptdesc->pt_share_count to identify
sharing.
We didn't convert hugetlb_pmd_shared(), so right now, we would never
detect a shared PMD table as such, because sharing/unsharing no longer
touches the refcount of a PMD table.
Page migration, like mbind() or migrate_pages() would allow for migrating
folios mapped into such shared PMD tables, even though the folios are not
exclusive. In smaps we would account them as "private" although they are
"shared", and we would be wrongly setting the PM_MMAP_EXCLUSIVE in the
pagemap interface.
Fix it by properly using ptdesc_pmd_is_shared() in hugetlb_pmd_shared().
Link: https://lkml.kernel.org/r/20251205213558.2980480-1-david@kernel.org
Link: https://lkml.kernel.org/r/20251205213558.2980480-2-david@kernel.org
Link: https://lore.kernel.org/all/8cab934d-4a56-44aa-b641-bfd7e23bd673@kernel.org/ [1]
Fixes: 59d9094df3d7 ("mm: hugetlb: independent PMD page table shared count")
Signed-off-by: David Hildenbrand (Red Hat) <david(a)kernel.org>
Tested-by: Laurence Oberman <loberman(a)redhat.com>
Reviewed-by: Rik van Riel <riel(a)surriel.com>
Reviewed-by: Lance Yang <lance.yang(a)linux.dev>
Cc: Liu Shixin <liushixin2(a)huawei.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar(a)kernel.org>
Cc: Arnd Bergmann <arnd(a)arndb.de>
Cc: Jann Horn <jannh(a)google.com>
Cc: Liam Howlett <liam.howlett(a)oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
Cc: Muchun Song <muchun.song(a)linux.dev>
Cc: Nadav Amit <nadav.amit(a)gmail.com>
Cc: Nicholas Piggin <npiggin(a)gmail.com>
Cc: Oscar Salvador <osalvador(a)suse.de>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Prakash Sangappa <prakash.sangappa(a)oracle.com>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: Will Deacon <will(a)kernel.org>
Cc: Uschakow, Stanislav" <suschako(a)amazon.de>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/hugetlb.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/include/linux/hugetlb.h~mm-hugetlb-fix-hugetlb_pmd_shared
+++ a/include/linux/hugetlb.h
@@ -1326,7 +1326,7 @@ static inline __init void hugetlb_cma_re
#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
static inline bool hugetlb_pmd_shared(pte_t *pte)
{
- return page_count(virt_to_page(pte)) > 1;
+ return ptdesc_pmd_is_shared(virt_to_ptdesc(pte));
}
#else
static inline bool hugetlb_pmd_shared(pte_t *pte)
_
Patches currently in -mm which might be from david(a)kernel.org are
mm-hugetlb-fix-hugetlb_pmd_shared.patch
mm-hugetlb-fix-two-comments-related-to-huge_pmd_unshare.patch
mm-rmap-fix-two-comments-related-to-huge_pmd_unshare.patch
mm-hugetlb-fix-excessive-ipi-broadcasts-when-unsharing-pmd-tables-using-mmu_gather.patch
Hi Ilpo,
I managed to get my hands on acpidumps for these models so this is
verified against those.
Thanks for all your latest reviews!
Signed-off-by: Kurt Borja <kuurtb(a)gmail.com>
---
Kurt Borja (3):
platform/x86: alienware-wmi-wmax: Add support for new Area-51 laptops
platform/x86: alienware-wmi-wmax: Add AWCC support for Alienware x16
platform/x86: alienware-wmi-wmax: Add support for Alienware 16X Aurora
drivers/platform/x86/dell/alienware-wmi-wmax.c | 32 ++++++++++++++++++++++++++
1 file changed, 32 insertions(+)
---
base-commit: 9b9c0adbc3f8a524d291baccc9d0c04097fb4869
change-id: 20251111-area-51-7e6c2501e4eb
--
~ Kurt
Hi,
We now have the verified International IAA Mobility 2025 Database with 501,479 attendees and 750 exhibitors.
Each contact includes: names, job titles, company details, locations, phone numbers, and verified email addresses — all carefully sourced and checked for accuracy.
delivered within 48 hours. Season-End Offer: 30% Discount on All Databases.
Reply “Send me the cost” to get pricing and more details.
Best regards,
Grace Taylor
Sr. Marketing Manager
P.S. Reply “Unfollow” to opt out.