The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x bc8169003b41e89fe7052e408cf9fdbecb4017fe
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025062040-daylight-scrubber-8837@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From bc8169003b41e89fe7052e408cf9fdbecb4017fe Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers(a)google.com>
Date: Tue, 20 May 2025 10:39:29 +0800
Subject: [PATCH] crypto: powerpc/poly1305 - add depends on BROKEN for now
As discussed in the thread containing
https://lore.kernel.org/linux-crypto/20250510053308.GB505731@sol/, the
Power10-optimized Poly1305 code is currently not safe to call in softirq
context. Disable it for now. It can be re-enabled once it is fixed.
Fixes: ba8f8624fde2 ("crypto: poly1305-p10 - Glue code for optmized Poly1305 implementation for ppc64le")
Cc: stable(a)vger.kernel.org
Signed-off-by: Eric Biggers <ebiggers(a)google.com>
Signed-off-by: Herbert Xu <herbert(a)gondor.apana.org.au>
diff --git a/arch/powerpc/lib/crypto/Kconfig b/arch/powerpc/lib/crypto/Kconfig
index ffa541ad6d5d..3f9e1bbd9905 100644
--- a/arch/powerpc/lib/crypto/Kconfig
+++ b/arch/powerpc/lib/crypto/Kconfig
@@ -10,6 +10,7 @@ config CRYPTO_CHACHA20_P10
config CRYPTO_POLY1305_P10
tristate
depends on PPC64 && CPU_LITTLE_ENDIAN && VSX
+ depends on BROKEN # Needs to be fixed to work in softirq context
default CRYPTO_LIB_POLY1305
select CRYPTO_ARCH_HAVE_LIB_POLY1305
select CRYPTO_LIB_POLY1305_GENERIC
The patch below does not apply to the 6.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.15.y
git checkout FETCH_HEAD
git cherry-pick -x bc8169003b41e89fe7052e408cf9fdbecb4017fe
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025062040-slinging-salvaging-1638@gregkh' --subject-prefix 'PATCH 6.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From bc8169003b41e89fe7052e408cf9fdbecb4017fe Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers(a)google.com>
Date: Tue, 20 May 2025 10:39:29 +0800
Subject: [PATCH] crypto: powerpc/poly1305 - add depends on BROKEN for now
As discussed in the thread containing
https://lore.kernel.org/linux-crypto/20250510053308.GB505731@sol/, the
Power10-optimized Poly1305 code is currently not safe to call in softirq
context. Disable it for now. It can be re-enabled once it is fixed.
Fixes: ba8f8624fde2 ("crypto: poly1305-p10 - Glue code for optmized Poly1305 implementation for ppc64le")
Cc: stable(a)vger.kernel.org
Signed-off-by: Eric Biggers <ebiggers(a)google.com>
Signed-off-by: Herbert Xu <herbert(a)gondor.apana.org.au>
diff --git a/arch/powerpc/lib/crypto/Kconfig b/arch/powerpc/lib/crypto/Kconfig
index ffa541ad6d5d..3f9e1bbd9905 100644
--- a/arch/powerpc/lib/crypto/Kconfig
+++ b/arch/powerpc/lib/crypto/Kconfig
@@ -10,6 +10,7 @@ config CRYPTO_CHACHA20_P10
config CRYPTO_POLY1305_P10
tristate
depends on PPC64 && CPU_LITTLE_ENDIAN && VSX
+ depends on BROKEN # Needs to be fixed to work in softirq context
default CRYPTO_LIB_POLY1305
select CRYPTO_ARCH_HAVE_LIB_POLY1305
select CRYPTO_LIB_POLY1305_GENERIC
The patch below does not apply to the 6.12-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.12.y
git checkout FETCH_HEAD
git cherry-pick -x bc8169003b41e89fe7052e408cf9fdbecb4017fe
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025062040-renderer-tremor-6a52@gregkh' --subject-prefix 'PATCH 6.12.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From bc8169003b41e89fe7052e408cf9fdbecb4017fe Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers(a)google.com>
Date: Tue, 20 May 2025 10:39:29 +0800
Subject: [PATCH] crypto: powerpc/poly1305 - add depends on BROKEN for now
As discussed in the thread containing
https://lore.kernel.org/linux-crypto/20250510053308.GB505731@sol/, the
Power10-optimized Poly1305 code is currently not safe to call in softirq
context. Disable it for now. It can be re-enabled once it is fixed.
Fixes: ba8f8624fde2 ("crypto: poly1305-p10 - Glue code for optmized Poly1305 implementation for ppc64le")
Cc: stable(a)vger.kernel.org
Signed-off-by: Eric Biggers <ebiggers(a)google.com>
Signed-off-by: Herbert Xu <herbert(a)gondor.apana.org.au>
diff --git a/arch/powerpc/lib/crypto/Kconfig b/arch/powerpc/lib/crypto/Kconfig
index ffa541ad6d5d..3f9e1bbd9905 100644
--- a/arch/powerpc/lib/crypto/Kconfig
+++ b/arch/powerpc/lib/crypto/Kconfig
@@ -10,6 +10,7 @@ config CRYPTO_CHACHA20_P10
config CRYPTO_POLY1305_P10
tristate
depends on PPC64 && CPU_LITTLE_ENDIAN && VSX
+ depends on BROKEN # Needs to be fixed to work in softirq context
default CRYPTO_LIB_POLY1305
select CRYPTO_ARCH_HAVE_LIB_POLY1305
select CRYPTO_LIB_POLY1305_GENERIC
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 0413bcf0fc460a68a2a7a8354aee833293d7d693
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025062023-fidgety-landslide-5061@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 0413bcf0fc460a68a2a7a8354aee833293d7d693 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert(a)gondor.apana.org.au>
Date: Thu, 8 May 2025 13:22:16 +0800
Subject: [PATCH] crypto: marvell/cesa - Do not chain submitted requests
This driver tries to chain requests together before submitting them
to hardware in order to reduce completion interrupts.
However, it even extends chains that have already been submitted
to hardware. This is dangerous because there is no way of knowing
whether the hardware has already read the DMA memory in question
or not.
Fix this by splitting the chain list into two. One for submitted
requests and one for requests that have not yet been submitted.
Only extend the latter.
Reported-by: Klaus Kudielka <klaus.kudielka(a)gmail.com>
Fixes: 85030c5168f1 ("crypto: marvell - Add support for chaining crypto requests in TDMA mode")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Herbert Xu <herbert(a)gondor.apana.org.au>
diff --git a/drivers/crypto/marvell/cesa/cesa.c b/drivers/crypto/marvell/cesa/cesa.c
index fa08f10e6f3f..9c21f5d835d2 100644
--- a/drivers/crypto/marvell/cesa/cesa.c
+++ b/drivers/crypto/marvell/cesa/cesa.c
@@ -94,7 +94,7 @@ static int mv_cesa_std_process(struct mv_cesa_engine *engine, u32 status)
static int mv_cesa_int_process(struct mv_cesa_engine *engine, u32 status)
{
- if (engine->chain.first && engine->chain.last)
+ if (engine->chain_hw.first && engine->chain_hw.last)
return mv_cesa_tdma_process(engine, status);
return mv_cesa_std_process(engine, status);
diff --git a/drivers/crypto/marvell/cesa/cesa.h b/drivers/crypto/marvell/cesa/cesa.h
index d215a6bed6bc..50ca1039fdaa 100644
--- a/drivers/crypto/marvell/cesa/cesa.h
+++ b/drivers/crypto/marvell/cesa/cesa.h
@@ -440,8 +440,10 @@ struct mv_cesa_dev {
* SRAM
* @queue: fifo of the pending crypto requests
* @load: engine load counter, useful for load balancing
- * @chain: list of the current tdma descriptors being processed
- * by this engine.
+ * @chain_hw: list of the current tdma descriptors being processed
+ * by the hardware.
+ * @chain_sw: list of the current tdma descriptors that will be
+ * submitted to the hardware.
* @complete_queue: fifo of the processed requests by the engine
*
* Structure storing CESA engine information.
@@ -463,7 +465,8 @@ struct mv_cesa_engine {
struct gen_pool *pool;
struct crypto_queue queue;
atomic_t load;
- struct mv_cesa_tdma_chain chain;
+ struct mv_cesa_tdma_chain chain_hw;
+ struct mv_cesa_tdma_chain chain_sw;
struct list_head complete_queue;
int irq;
};
diff --git a/drivers/crypto/marvell/cesa/tdma.c b/drivers/crypto/marvell/cesa/tdma.c
index 388a06e180d6..243305354420 100644
--- a/drivers/crypto/marvell/cesa/tdma.c
+++ b/drivers/crypto/marvell/cesa/tdma.c
@@ -38,6 +38,15 @@ void mv_cesa_dma_step(struct mv_cesa_req *dreq)
{
struct mv_cesa_engine *engine = dreq->engine;
+ spin_lock_bh(&engine->lock);
+ if (engine->chain_sw.first == dreq->chain.first) {
+ engine->chain_sw.first = NULL;
+ engine->chain_sw.last = NULL;
+ }
+ engine->chain_hw.first = dreq->chain.first;
+ engine->chain_hw.last = dreq->chain.last;
+ spin_unlock_bh(&engine->lock);
+
writel_relaxed(0, engine->regs + CESA_SA_CFG);
mv_cesa_set_int_mask(engine, CESA_SA_INT_ACC0_IDMA_DONE);
@@ -96,25 +105,27 @@ void mv_cesa_dma_prepare(struct mv_cesa_req *dreq,
void mv_cesa_tdma_chain(struct mv_cesa_engine *engine,
struct mv_cesa_req *dreq)
{
- if (engine->chain.first == NULL && engine->chain.last == NULL) {
- engine->chain.first = dreq->chain.first;
- engine->chain.last = dreq->chain.last;
- } else {
- struct mv_cesa_tdma_desc *last;
+ struct mv_cesa_tdma_desc *last = engine->chain_sw.last;
- last = engine->chain.last;
+ /*
+ * Break the DMA chain if the request being queued needs the IV
+ * regs to be set before lauching the request.
+ */
+ if (!last || dreq->chain.first->flags & CESA_TDMA_SET_STATE)
+ engine->chain_sw.first = dreq->chain.first;
+ else {
last->next = dreq->chain.first;
- engine->chain.last = dreq->chain.last;
-
- /*
- * Break the DMA chain if the CESA_TDMA_BREAK_CHAIN is set on
- * the last element of the current chain, or if the request
- * being queued needs the IV regs to be set before lauching
- * the request.
- */
- if (!(last->flags & CESA_TDMA_BREAK_CHAIN) &&
- !(dreq->chain.first->flags & CESA_TDMA_SET_STATE))
- last->next_dma = cpu_to_le32(dreq->chain.first->cur_dma);
+ last->next_dma = cpu_to_le32(dreq->chain.first->cur_dma);
+ }
+ last = dreq->chain.last;
+ engine->chain_sw.last = last;
+ /*
+ * Break the DMA chain if the CESA_TDMA_BREAK_CHAIN is set on
+ * the last element of the current chain.
+ */
+ if (last->flags & CESA_TDMA_BREAK_CHAIN) {
+ engine->chain_sw.first = NULL;
+ engine->chain_sw.last = NULL;
}
}
@@ -127,7 +138,7 @@ int mv_cesa_tdma_process(struct mv_cesa_engine *engine, u32 status)
tdma_cur = readl(engine->regs + CESA_TDMA_CUR);
- for (tdma = engine->chain.first; tdma; tdma = next) {
+ for (tdma = engine->chain_hw.first; tdma; tdma = next) {
spin_lock_bh(&engine->lock);
next = tdma->next;
spin_unlock_bh(&engine->lock);
@@ -149,12 +160,12 @@ int mv_cesa_tdma_process(struct mv_cesa_engine *engine, u32 status)
&backlog);
/* Re-chaining to the next request */
- engine->chain.first = tdma->next;
+ engine->chain_hw.first = tdma->next;
tdma->next = NULL;
/* If this is the last request, clear the chain */
- if (engine->chain.first == NULL)
- engine->chain.last = NULL;
+ if (engine->chain_hw.first == NULL)
+ engine->chain_hw.last = NULL;
spin_unlock_bh(&engine->lock);
ctx = crypto_tfm_ctx(req->tfm);
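The hw/sw chain split described above can be summarized in a few lines of
standalone C. The sketch below is a simplified model, not CESA driver code:
struct desc, struct engine, queue_request() and dma_step() are hypothetical
stand-ins for the driver's TDMA descriptors, mv_cesa_tdma_chain() and
mv_cesa_dma_step().

/*
 * Illustrative model of the chain_hw/chain_sw split: queuing only ever
 * extends the not-yet-submitted chain, and submission publishes that chain
 * as the hardware chain in one step.
 */
#include <stdio.h>

struct desc {
	int id;
	struct desc *next;
};

struct chain {
	struct desc *first, *last;
};

struct engine {
	struct chain hw;	/* descriptors already handed to "hardware" */
	struct chain sw;	/* descriptors queued but not yet submitted */
};

/* Software-side chaining: only the pending (sw) chain is extended. */
static void queue_request(struct engine *e, struct desc *d)
{
	if (!e->sw.first)
		e->sw.first = d;
	else
		e->sw.last->next = d;
	e->sw.last = d;
}

/* Submission: publish the pending chain as the hw chain and reset sw. */
static void dma_step(struct engine *e)
{
	e->hw = e->sw;
	e->sw.first = e->sw.last = NULL;
	for (struct desc *d = e->hw.first; d; d = d->next)
		printf("hw processes desc %d\n", d->id);
}

int main(void)
{
	struct engine e = { 0 };
	struct desc d1 = { .id = 1 }, d2 = { .id = 2 };

	queue_request(&e, &d1);
	queue_request(&e, &d2);	/* extends the sw chain only, never hw */
	dma_step(&e);		/* d1 and d2 are submitted together */
	return 0;
}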
The quilt patch titled
Subject: maple_tree: fix MA_STATE_PREALLOC flag in mas_preallocate()
has been removed from the -mm tree. Its filename was
maple_tree-fix-ma_state_prealloc-flag-in-mas_preallocate.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: "Liam R. Howlett" <Liam.Howlett(a)oracle.com>
Subject: maple_tree: fix MA_STATE_PREALLOC flag in mas_preallocate()
Date: Mon, 16 Jun 2025 14:45:20 -0400
Temporarily clear the preallocation flag when explicitly requesting
allocations. Pre-existing allocations are already counted against the
request through mas_node_count_gfp(), but the allocations will not happen
if the MA_STATE_PREALLOC flag is set. This flag is meant to avoid
re-allocating in bulk allocation mode, and to detect issues with
preallocation calculations.
The MA_STATE_PREALLOC flag should also always be set on zero allocations
so that detection of underflow allocations will print a WARN_ON() during
consumption.
The user-visible effect of this flaw is a WARN_ON() followed by a null
pointer dereference when a subsequent request for a larger number of nodes
is ignored, such as the vma merge retry in mmap_region() caused by drivers
altering the vma flags (which happens in v6.6, at least).
Link: https://lkml.kernel.org/r/20250616184521.3382795-3-Liam.Howlett@oracle.com
Fixes: 54a611b60590 ("Maple Tree: add new data structure")
Signed-off-by: Liam R. Howlett <Liam.Howlett(a)oracle.com>
Reported-by: Zhaoyang Huang <zhaoyang.huang(a)unisoc.com>
Reported-by: Hailong Liu <hailong.liu(a)oppo.com>
Link: https://lore.kernel.org/all/1652f7eb-a51b-4fee-8058-c73af63bacd1@oppo.com/
Link: https://lore.kernel.org/all/20250428184058.1416274-1-Liam.Howlett@oracle.co…
Link: https://lore.kernel.org/all/20250429014754.1479118-1-Liam.Howlett@oracle.co…
Cc: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
Cc: Suren Baghdasaryan <surenb(a)google.com>
Cc: Hailong Liu <hailong.liu(a)oppo.com>
Cc: zhangpeng.00(a)bytedance.com <zhangpeng.00(a)bytedance.com>
Cc: Steve Kang <Steve.Kang(a)unisoc.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Sidhartha Kumar <sidhartha.kumar(a)oracle.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
lib/maple_tree.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
--- a/lib/maple_tree.c~maple_tree-fix-ma_state_prealloc-flag-in-mas_preallocate
+++ a/lib/maple_tree.c
@@ -5527,8 +5527,9 @@ int mas_preallocate(struct ma_state *mas
mas->store_type = mas_wr_store_type(&wr_mas);
request = mas_prealloc_calc(&wr_mas, entry);
if (!request)
- return ret;
+ goto set_flag;
+ mas->mas_flags &= ~MA_STATE_PREALLOC;
mas_node_count_gfp(mas, request, gfp);
if (mas_is_err(mas)) {
mas_set_alloc_req(mas, 0);
@@ -5538,6 +5539,7 @@ int mas_preallocate(struct ma_state *mas
return ret;
}
+set_flag:
mas->mas_flags |= MA_STATE_PREALLOC;
return ret;
}
_
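As a rough standalone model of the flag handling this fix establishes: the
flag is cleared before the allocation request so the request is honoured,
and it is set again on every exit path, including the zero-size request, so
a later underflow can still be detected. The sketch below is illustrative
only, not maple tree code; PREALLOC and prealloc_nodes() are simplified
stand-ins for MA_STATE_PREALLOC and mas_node_count_gfp().

#include <stdio.h>

#define PREALLOC 0x1		/* stand-in for MA_STATE_PREALLOC */

struct ma_state_model {
	unsigned int flags;
	unsigned int node_count;	/* nodes already held */
};

/* Pretend allocator: tops the state up to "request" nodes. */
static void prealloc_nodes(struct ma_state_model *mas, unsigned int request)
{
	if (mas->node_count < request)
		mas->node_count = request;
}

static int mas_preallocate_model(struct ma_state_model *mas,
				 unsigned int request)
{
	if (!request)
		goto set_flag;	/* zero request: still set the flag below */

	/* Clear the flag so the allocation request is actually honoured. */
	mas->flags &= ~PREALLOC;
	prealloc_nodes(mas, request);

set_flag:
	mas->flags |= PREALLOC;	/* always set, so underflow is detectable */
	return 0;
}

int main(void)
{
	struct ma_state_model mas = { .flags = PREALLOC, .node_count = 1 };

	mas_preallocate_model(&mas, 3);
	printf("flags=%#x nodes=%u\n", mas.flags, mas.node_count);
	return 0;
}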
Patches currently in -mm which might be from Liam.Howlett(a)oracle.com are
tools-testing-radix-tree-test-maple-tree-chaining-mas_preallocate-calls.patch
The quilt patch titled
Subject: bcache: remove unnecessary select MIN_HEAP
has been removed from the -mm tree. Its filename was
bcache-remove-unnecessary-select-min_heap.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Kuan-Wei Chiu <visitorckw(a)gmail.com>
Subject: bcache: remove unnecessary select MIN_HEAP
Date: Sun, 15 Jun 2025 04:23:53 +0800
After reverting the transition to the generic min heap library, bcache no
longer depends on MIN_HEAP. The select entry can be removed to reduce
code size and shrink the kernel's attack surface.
This change effectively reverts the bcache-related part of commit
92a8b224b833 ("lib/min_heap: introduce non-inline versions of min heap API
functions").
This is part of a series of changes to address a performance regression
caused by the use of the generic min_heap implementation.
As reported by Robert, bcache now suffers from latency spikes, with P100
(max) latency increasing from 600 ms to 2.4 seconds every 5 minutes.
These regressions degrade bcache's effectiveness as a low-latency cache
layer and lead to frequent timeouts and application stalls in production
environments.
Link: https://lore.kernel.org/lkml/CAJhEC05+0S69z+3+FB2Cd0hD+pCRyWTKLEOsc8BOmH73p…
Link: https://lkml.kernel.org/r/20250614202353.1632957-4-visitorckw@gmail.com
Fixes: 866898efbb25 ("bcache: remove heap-related macros and switch to generic min_heap")
Fixes: 92a8b224b833 ("lib/min_heap: introduce non-inline versions of min heap API functions")
Signed-off-by: Kuan-Wei Chiu <visitorckw(a)gmail.com>
Reported-by: Robert Pang <robertpang(a)google.com>
Closes: https://lore.kernel.org/linux-bcache/CAJhEC06F_AtrPgw2-7CvCqZgeStgCtitbD-ry…
Acked-by: Coly Li <colyli(a)kernel.org>
Cc: Ching-Chun (Jim) Huang <jserv(a)ccns.ncku.edu.tw>
Cc: Kent Overstreet <kent.overstreet(a)linux.dev>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
drivers/md/bcache/Kconfig | 1 -
1 file changed, 1 deletion(-)
--- a/drivers/md/bcache/Kconfig~bcache-remove-unnecessary-select-min_heap
+++ a/drivers/md/bcache/Kconfig
@@ -5,7 +5,6 @@ config BCACHE
select BLOCK_HOLDER_DEPRECATED if SYSFS
select CRC64
select CLOSURES
- select MIN_HEAP
help
Allows a block device to be used as cache for other devices; uses
a btree for indexing and the layout is optimized for SSDs.
_
Patches currently in -mm which might be from visitorckw(a)gmail.com are
lib-math-gcd-use-static-key-to-select-implementation-at-runtime.patch
riscv-optimize-gcd-code-size-when-config_riscv_isa_zbb-is-disabled.patch
riscv-optimize-gcd-performance-on-risc-v-without-zbb-extension.patch
The quilt patch titled
Subject: mm: userfaultfd: fix race of userfaultfd_move and swap cache
has been removed from the -mm tree. Its filename was
mm-userfaultfd-fix-race-of-userfaultfd_move-and-swap-cache.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Kairui Song <kasong(a)tencent.com>
Subject: mm: userfaultfd: fix race of userfaultfd_move and swap cache
Date: Wed, 4 Jun 2025 23:10:38 +0800
This commit fixes two kinds of races; they may have different results:
Barry reported a BUG_ON in commit c50f8e6053b0; we may see the same
BUG_ON if the filemap lookup returns NULL and the folio is added to the
swap cache after that.
If another kind of race is triggered (the folio changed after the lookup),
we may see the RSS counter get corrupted:
[ 406.893936] BUG: Bad rss-counter state mm:ffff0000c5a9ddc0
type:MM_ANONPAGES val:-1
[ 406.894071] BUG: Bad rss-counter state mm:ffff0000c5a9ddc0
type:MM_SHMEMPAGES val:1
This happens because the folio is being accounted to the wrong VMA.
I'm not sure whether there will be any data corruption, though it seems not.
The issues above are already critical.
On seeing a swap entry PTE, userfaultfd_move does a lockless swap cache
lookup, and tries to move the found folio to the faulting vma. Currently,
it relies on checking the PTE value to ensure that the moved folio still
belongs to the src swap entry and that no new folio has been added to the
swap cache, which turns out to be unreliable.
While working on and reviewing the swap table series with Barry, the
following existing races were observed and reproduced [1]:
In the example below, move_pages_pte is moving src_pte to dst_pte, where
src_pte is a swap entry PTE holding swap entry S1, and S1 is not in the
swap cache:
CPU1 CPU2
userfaultfd_move
move_pages_pte()
entry = pte_to_swp_entry(orig_src_pte);
// Here it got entry = S1
... < interrupted> ...
<swapin src_pte, alloc and use folio A>
// folio A is a new allocated folio
// and get installed into src_pte
<frees swap entry S1>
// src_pte now points to folio A, S1
// has swap count == 0, it can be freed
// by folio_swap_swap or swap
// allocator's reclaim.
<try to swap out another folio B>
// folio B is a folio in another VMA.
<put folio B to swap cache using S1 >
// S1 is freed, folio B can use it
// for swap out with no problem.
...
folio = filemap_get_folio(S1)
// Got folio B here !!!
... < interrupted again> ...
<swapin folio B and free S1>
// Now S1 is free to be used again.
<swapout src_pte & folio A using S1>
// Now src_pte is a swap entry PTE
// holding S1 again.
folio_trylock(folio)
move_swap_pte
double_pt_lock
is_pte_pages_stable
// Check passed because src_pte == S1
folio_move_anon_rmap(...)
// Moved invalid folio B here !!!
The race window is very short and requires multiple collisions of multiple
rare events, so it's very unlikely to happen, but with a deliberately
constructed reproducer and increased time window, it can be reproduced
easily.
This can be fixed by checking if the folio returned by filemap is the
valid swap cache folio after acquiring the folio lock.
Another similar race is possible: filemap_get_folio may return NULL, but
folio (A) could be swapped in and then swapped out again using the same
swap entry after the lookup. In such a case, folio (A) may remain in the
swap cache, so it must be moved too:
CPU1 CPU2
userfaultfd_move
move_pages_pte()
entry = pte_to_swp_entry(orig_src_pte);
// Here it got entry = S1, and S1 is not in swap cache
folio = filemap_get_folio(S1)
// Got NULL
... < interrupted again> ...
<swapin folio A and free S1>
<swapout folio A re-using S1>
move_swap_pte
double_pt_lock
is_pte_pages_stable
// Check passed because src_pte == S1
folio_move_anon_rmap(...)
// folio A is ignored !!!
Fix this by checking the swap cache again after acquiring the src_pte
lock. And to avoid the filemap overhead, we check swap_map directly [2].
The SWP_SYNCHRONOUS_IO path does make the problem more complex, but so far
we don't need to worry about that, since folios can only be exposed to the
swap cache in the swap out path, and this is covered in this patch by
checking the swap cache again after acquiring the src_pte lock.
Testing with a simple C program that allocates and moves several GB of
memory did not show any observable performance change.
Link: https://lkml.kernel.org/r/20250604151038.21968-1-ryncsn@gmail.com
Fixes: adef440691ba ("userfaultfd: UFFDIO_MOVE uABI")
Signed-off-by: Kairui Song <kasong(a)tencent.com>
Closes: https://lore.kernel.org/linux-mm/CAMgjq7B1K=6OOrK2OUZ0-tqCzi+EJt+2_K97TPGoS… [1]
Link: https://lore.kernel.org/all/CAGsJ_4yJhJBo16XhiC-nUzSheyX-V3-nFE+tAi=8Y560K8… [2]
Reviewed-by: Lokesh Gidra <lokeshgidra(a)google.com>
Acked-by: Peter Xu <peterx(a)redhat.com>
Reviewed-by: Suren Baghdasaryan <surenb(a)google.com>
Reviewed-by: Barry Song <baohua(a)kernel.org>
Reviewed-by: Chris Li <chrisl(a)kernel.org>
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Kairui Song <kasong(a)tencent.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/userfaultfd.c | 33 +++++++++++++++++++++++++++++++--
1 file changed, 31 insertions(+), 2 deletions(-)
--- a/mm/userfaultfd.c~mm-userfaultfd-fix-race-of-userfaultfd_move-and-swap-cache
+++ a/mm/userfaultfd.c
@@ -1084,8 +1084,18 @@ static int move_swap_pte(struct mm_struc
pte_t orig_dst_pte, pte_t orig_src_pte,
pmd_t *dst_pmd, pmd_t dst_pmdval,
spinlock_t *dst_ptl, spinlock_t *src_ptl,
- struct folio *src_folio)
+ struct folio *src_folio,
+ struct swap_info_struct *si, swp_entry_t entry)
{
+ /*
+ * Check if the folio still belongs to the target swap entry after
+ * acquiring the lock. Folio can be freed in the swap cache while
+ * not locked.
+ */
+ if (src_folio && unlikely(!folio_test_swapcache(src_folio) ||
+ entry.val != src_folio->swap.val))
+ return -EAGAIN;
+
double_pt_lock(dst_ptl, src_ptl);
if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
@@ -1102,6 +1112,25 @@ static int move_swap_pte(struct mm_struc
if (src_folio) {
folio_move_anon_rmap(src_folio, dst_vma);
src_folio->index = linear_page_index(dst_vma, dst_addr);
+ } else {
+ /*
+ * Check if the swap entry is cached after acquiring the src_pte
+ * lock. Otherwise, we might miss a newly loaded swap cache folio.
+ *
+ * Check swap_map directly to minimize overhead, READ_ONCE is sufficient.
+ * We are trying to catch newly added swap cache, the only possible case is
+ * when a folio is swapped in and out again staying in swap cache, using the
+ * same entry before the PTE check above. The PTL is acquired and released
+ * twice, each time after updating the swap_map's flag. So holding
+ * the PTL here ensures we see the updated value. False positive is possible,
+ * e.g. SWP_SYNCHRONOUS_IO swapin may set the flag without touching the
+ * cache, or during the tiny synchronization window between swap cache and
+ * swap_map, but it will be gone very quickly, worst result is retry jitters.
+ */
+ if (READ_ONCE(si->swap_map[swp_offset(entry)]) & SWAP_HAS_CACHE) {
+ double_pt_unlock(dst_ptl, src_ptl);
+ return -EAGAIN;
+ }
}
orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
@@ -1412,7 +1441,7 @@ retry:
}
err = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte,
orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval,
- dst_ptl, src_ptl, src_folio);
+ dst_ptl, src_ptl, src_folio, si, entry);
}
out:
_
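Both checks added by this fix are instances of the general "validate again
after taking the lock" pattern: a lockless lookup is only trusted once the
lock is held and the state has been re-checked, and the caller retries on a
mismatch. A minimal standalone sketch of that pattern follows; it is
illustrative only, and slot, lockless_lookup() and move_model() are
hypothetical names, not mm/userfaultfd.c code.

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int slot = 1;		/* stand-in for the swap entry / folio state */

/* Lockless read, as in the filemap_get_folio() fast path. */
static int lockless_lookup(void)
{
	return slot;
}

static int move_model(void)
{
	int seen = lockless_lookup();

	pthread_mutex_lock(&lock);
	/* Re-check under the lock; the state may have changed concurrently. */
	if (slot != seen) {
		pthread_mutex_unlock(&lock);
		return -EAGAIN;	/* caller retries the whole operation */
	}
	printf("moving with validated state %d\n", slot);
	pthread_mutex_unlock(&lock);
	return 0;
}

int main(void)
{
	while (move_model() == -EAGAIN)
		;	/* retry loop, as the move path does on -EAGAIN */
	return 0;
}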
Patches currently in -mm which might be from kasong(a)tencent.com are
mm-list_lru-refactor-the-locking-code.patch
mm-shmem-swap-improve-cached-mthp-handling-and-fix-potential-hung.patch
mm-shmem-swap-avoid-redundant-xarray-lookup-during-swapin.patch
mm-shmem-swap-improve-mthp-swapin-process.patch
mm-shmem-swap-avoid-false-positive-swap-cache-lookup.patch
The quilt patch titled
Subject: mm/gup: revert "mm: gup: fix infinite loop within __get_longterm_locked"
has been removed from the -mm tree. Its filename was
mm-gup-revert-mm-gup-fix-infinite-loop-within-__get_longterm_locked.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: David Hildenbrand <david(a)redhat.com>
Subject: mm/gup: revert "mm: gup: fix infinite loop within __get_longterm_locked"
Date: Wed, 11 Jun 2025 15:13:14 +0200
After commit 1aaf8c122918 ("mm: gup: fix infinite loop within
__get_longterm_locked") we are able to longterm pin folios that are not
supposed to get longterm pinned, simply because they temporarily have the
LRU flag cleared (esp. temporarily isolated).
For example, two __get_longterm_locked() callers can race, or
__get_longterm_locked() can race with anything else that temporarily
isolates folios.
The introducing commit mentions the use case of a driver that uses
vm_ops->fault to insert pages allocated through cma_alloc() into the page
tables, assuming they can later get longterm pinned. These pages/folios
would never have the LRU flag set and consequently cannot get isolated.
There is no known in-tree user making use of that so far, fortunately.
To handle that in the future -- and avoid retrying forever to
isolate/migrate them -- we will need a different mechanism for the CMA
area *owner* to indicate that it actually already allocated the page and
is fine with longterm pinning it. The LRU flag is not suitable for that.
Probably we can look up the relevant CMA area and query the bitmap; we only
have to care about some races, probably. (If already allocated, we could
just allow longterm pinning.)
Anyhow, let's fix the "must not be longterm pinned" problem first by
reverting the original commit.
Link: https://lkml.kernel.org/r/20250611131314.594529-1-david@redhat.com
Fixes: 1aaf8c122918 ("mm: gup: fix infinite loop within __get_longterm_locked")
Signed-off-by: David Hildenbrand <david(a)redhat.com>
Closes: https://lore.kernel.org/all/20250522092755.GA3277597@tiffany/
Reported-by: Hyesoo Yu <hyesoo.yu(a)samsung.com>
Reviewed-by: John Hubbard <jhubbard(a)nvidia.com>
Cc: Jason Gunthorpe <jgg(a)ziepe.ca>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: Zhaoyang Huang <zhaoyang.huang(a)unisoc.com>
Cc: Aijun Sun <aijun.sun(a)unisoc.com>
Cc: Alistair Popple <apopple(a)nvidia.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/gup.c | 14 ++++++++++----
1 file changed, 10 insertions(+), 4 deletions(-)
--- a/mm/gup.c~mm-gup-revert-mm-gup-fix-infinite-loop-within-__get_longterm_locked
+++ a/mm/gup.c
@@ -2303,13 +2303,13 @@ static void pofs_unpin(struct pages_or_f
/*
* Returns the number of collected folios. Return value is always >= 0.
*/
-static void collect_longterm_unpinnable_folios(
+static unsigned long collect_longterm_unpinnable_folios(
struct list_head *movable_folio_list,
struct pages_or_folios *pofs)
{
+ unsigned long i, collected = 0;
struct folio *prev_folio = NULL;
bool drain_allow = true;
- unsigned long i;
for (i = 0; i < pofs->nr_entries; i++) {
struct folio *folio = pofs_get_folio(pofs, i);
@@ -2321,6 +2321,8 @@ static void collect_longterm_unpinnable_
if (folio_is_longterm_pinnable(folio))
continue;
+ collected++;
+
if (folio_is_device_coherent(folio))
continue;
@@ -2342,6 +2344,8 @@ static void collect_longterm_unpinnable_
NR_ISOLATED_ANON + folio_is_file_lru(folio),
folio_nr_pages(folio));
}
+
+ return collected;
}
/*
@@ -2418,9 +2422,11 @@ static long
check_and_migrate_movable_pages_or_folios(struct pages_or_folios *pofs)
{
LIST_HEAD(movable_folio_list);
+ unsigned long collected;
- collect_longterm_unpinnable_folios(&movable_folio_list, pofs);
- if (list_empty(&movable_folio_list))
+ collected = collect_longterm_unpinnable_folios(&movable_folio_list,
+ pofs);
+ if (!collected)
return 0;
return migrate_longterm_unpinnable_folios(&movable_folio_list, pofs);
_
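The restored behaviour can be modeled in a few lines: unpinnable folios are
counted even when they cannot be placed on the migration list, so the
caller never mistakes an empty list for "everything is pinnable". The
sketch below is illustrative only; the names are made up and it is not
mm/gup.c code.

#include <stdbool.h>
#include <stdio.h>

struct folio_model {
	bool pinnable;		/* longterm-pinnable as-is */
	bool isolatable;	/* can be moved onto the migration list now */
};

/* Returns how many entries are unpinnable; also counts listed entries. */
static unsigned long collect(const struct folio_model *f, unsigned long n,
			     unsigned long *to_migrate)
{
	unsigned long collected = 0;

	*to_migrate = 0;
	for (unsigned long i = 0; i < n; i++) {
		if (f[i].pinnable)
			continue;
		collected++;			/* counted even if not listed */
		if (f[i].isolatable)
			(*to_migrate)++;
	}
	return collected;
}

int main(void)
{
	/* One unpinnable folio that is temporarily not isolatable. */
	struct folio_model folios[] = {
		{ .pinnable = false, .isolatable = false },
	};
	unsigned long to_migrate;
	unsigned long collected = collect(folios, 1, &to_migrate);

	/* "list empty" (to_migrate == 0) would wrongly report success here. */
	printf("collected=%lu to_migrate=%lu -> %s\n", collected, to_migrate,
	       collected ? "retry/migrate" : "ok to pin");
	return 0;
}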
Patches currently in -mm which might be from david(a)redhat.com are
fs-proc-task_mmu-fix-page_is_pfnzero-detection-for-the-huge-zero-folio.patch
mm-gup-remove-vm_bug_ons.patch
mm-gup-remove-vm_bug_ons-fix.patch
mm-huge_memory-dont-ignore-queried-cachemode-in-vmf_insert_pfn_pud.patch
mm-huge_memory-dont-mark-refcounted-folios-special-in-vmf_insert_folio_pmd.patch
mm-huge_memory-dont-mark-refcounted-folios-special-in-vmf_insert_folio_pud.patch