The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable@vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 0453aad676ff99787124b9b3af4a5f59fbe808e2
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable@vger.kernel.org>' --in-reply-to '2024072924-robin-manger-e92b@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
0453aad676ff ("io_uring/io-wq: limit retrying worker initialisation")
8a565304927f ("io_uring/io-wq: Use set_bit() and test_bit() at worker->flags")
eb47943f2238 ("io-wq: Drop struct io_wqe")
dfd63baf892c ("io-wq: Move wq accounting to io_wq")
da64d6db3bd3 ("io_uring: One wqe per wq")
01e68ce08a30 ("io_uring/io-wq: stop setting PF_NO_SETAFFINITY on io-wq workers")
996d3efeb091 ("io-wq: Fix memory leak in worker creation")
024f15e033a5 ("io_uring: dedup io_run_task_work")
a6b21fbb4ce3 ("io_uring: move list helpers to a separate file")
ab1c84d855cf ("io_uring: make io_uring_types.h public")
48c13d898084 ("io_uring: explain io_wq_work::cancel_seq placement")
e418bbc97bff ("io_uring: move our reference counting into a header")
cd40cae29ef8 ("io_uring: split out open/close operations")
453b329be5ea ("io_uring: separate out file table handling code")
f4c163dd7d4b ("io_uring: split out fadvise/madvise operations")
0d5847274037 ("io_uring: split out fs related sync/fallocate functions")
531113bbd5bf ("io_uring: split out splice related operations")
11aeb71406dd ("io_uring: split out filesystem related operations")
e28683bdfc2f ("io_uring: move nop into its own file")
5e2a18d93fec ("io_uring: move xattr related opcodes to its own file")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 0453aad676ff99787124b9b3af4a5f59fbe808e2 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Wed, 10 Jul 2024 18:58:17 +0100
Subject: [PATCH] io_uring/io-wq: limit retrying worker initialisation
If io-wq worker creation fails, we retry it by queueing up a task_work.
task_work is needed because the retry must be done from the user process
context. The problem is that retries are not limited, and if queueing a
task_work is itself the reason for the failure, we might get into an
infinite loop.
It doesn't seem to happen now, but it would with a following patch that
executes task_work in the freezer's loop. For now, arbitrarily limit the
number of attempts to create a worker.
Cc: stable@vger.kernel.org
Fixes: 3146cba99aa28 ("io-wq: make worker creation resilient against signals")
Reported-by: Julian Orth <ju.orth@gmail.com>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/8280436925db88448c7c85c6656edee1a43029ea.17206341…
Signed-off-by: Jens Axboe <axboe@kernel.dk>
diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
index 913c92249522..f1e7c670add8 100644
--- a/io_uring/io-wq.c
+++ b/io_uring/io-wq.c
@@ -23,6 +23,7 @@
#include "io_uring.h"
#define WORKER_IDLE_TIMEOUT (5 * HZ)
+#define WORKER_INIT_LIMIT 3
enum {
IO_WORKER_F_UP = 0, /* up and active */
@@ -58,6 +59,7 @@ struct io_worker {
unsigned long create_state;
struct callback_head create_work;
+ int init_retries;
union {
struct rcu_head rcu;
@@ -745,7 +747,7 @@ static bool io_wq_work_match_all(struct io_wq_work *work, void *data)
return true;
}
-static inline bool io_should_retry_thread(long err)
+static inline bool io_should_retry_thread(struct io_worker *worker, long err)
{
/*
* Prevent perpetual task_work retry, if the task (or its group) is
@@ -753,6 +755,8 @@ static inline bool io_should_retry_thread(long err)
*/
if (fatal_signal_pending(current))
return false;
+ if (worker->init_retries++ >= WORKER_INIT_LIMIT)
+ return false;
switch (err) {
case -EAGAIN:
@@ -779,7 +783,7 @@ static void create_worker_cont(struct callback_head *cb)
io_init_new_worker(wq, worker, tsk);
io_worker_release(worker);
return;
- } else if (!io_should_retry_thread(PTR_ERR(tsk))) {
+ } else if (!io_should_retry_thread(worker, PTR_ERR(tsk))) {
struct io_wq_acct *acct = io_wq_get_acct(worker);
atomic_dec(&acct->nr_running);
@@ -846,7 +850,7 @@ static bool create_io_worker(struct io_wq *wq, int index)
tsk = create_io_thread(io_wq_worker, worker, NUMA_NO_NODE);
if (!IS_ERR(tsk)) {
io_init_new_worker(wq, worker, tsk);
- } else if (!io_should_retry_thread(PTR_ERR(tsk))) {
+ } else if (!io_should_retry_thread(worker, PTR_ERR(tsk))) {
kfree(worker);
goto fail;
} else {
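
For anyone preparing the backport, the essence of the change is the
classic bounded-retry pattern: a per-worker attempt counter checked
before the error-code switch. Below is a minimal, self-contained
user-space sketch of that pattern; only WORKER_INIT_LIMIT and the
counter check mirror the patch, the rest is hypothetical scaffolding:

/* Bounded-retry sketch; build with: cc -o retry retry.c */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

#define WORKER_INIT_LIMIT 3	/* same cap as the patch */

struct worker {
	int init_retries;	/* mirrors the new field in struct io_worker */
};

/* Mirrors the shape of io_should_retry_thread(): retry only transient
 * errors, and never more than WORKER_INIT_LIMIT times per worker.
 */
static bool should_retry(struct worker *w, long err)
{
	if (w->init_retries++ >= WORKER_INIT_LIMIT)
		return false;	/* the cap breaks any infinite retry loop */

	switch (err) {
	case -EAGAIN:
		return true;	/* transient: worth another attempt */
	default:
		return false;	/* permanent failure: give up immediately */
	}
}

int main(void)
{
	struct worker w = { 0 };

	/* Simulate creation that always fails with -EAGAIN: without the
	 * cap this loop would never terminate.
	 */
	while (should_retry(&w, -EAGAIN))
		printf("retrying worker creation, attempt %d\n",
		       w.init_retries);
	printf("gave up after %d attempts\n", WORKER_INIT_LIMIT);
	return 0;
}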
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable@vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 0453aad676ff99787124b9b3af4a5f59fbe808e2
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable@vger.kernel.org>' --in-reply-to '2024072923-bodacious-claw-442b@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
0453aad676ff ("io_uring/io-wq: limit retrying worker initialisation")
8a565304927f ("io_uring/io-wq: Use set_bit() and test_bit() at worker->flags")
eb47943f2238 ("io-wq: Drop struct io_wqe")
dfd63baf892c ("io-wq: Move wq accounting to io_wq")
da64d6db3bd3 ("io_uring: One wqe per wq")
01e68ce08a30 ("io_uring/io-wq: stop setting PF_NO_SETAFFINITY on io-wq workers")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 0453aad676ff99787124b9b3af4a5f59fbe808e2 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Wed, 10 Jul 2024 18:58:17 +0100
Subject: [PATCH] io_uring/io-wq: limit retrying worker initialisation
If io-wq worker creation fails, we retry it by queueing up a task_work.
task_work is needed because the retry must be done from the user process
context. The problem is that retries are not limited, and if queueing a
task_work is itself the reason for the failure, we might get into an
infinite loop.
It doesn't seem to happen now, but it would with a following patch that
executes task_work in the freezer's loop. For now, arbitrarily limit the
number of attempts to create a worker.
Cc: stable@vger.kernel.org
Fixes: 3146cba99aa28 ("io-wq: make worker creation resilient against signals")
Reported-by: Julian Orth <ju.orth@gmail.com>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/8280436925db88448c7c85c6656edee1a43029ea.17206341…
Signed-off-by: Jens Axboe <axboe@kernel.dk>
diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
index 913c92249522..f1e7c670add8 100644
--- a/io_uring/io-wq.c
+++ b/io_uring/io-wq.c
@@ -23,6 +23,7 @@
#include "io_uring.h"
#define WORKER_IDLE_TIMEOUT (5 * HZ)
+#define WORKER_INIT_LIMIT 3
enum {
IO_WORKER_F_UP = 0, /* up and active */
@@ -58,6 +59,7 @@ struct io_worker {
unsigned long create_state;
struct callback_head create_work;
+ int init_retries;
union {
struct rcu_head rcu;
@@ -745,7 +747,7 @@ static bool io_wq_work_match_all(struct io_wq_work *work, void *data)
return true;
}
-static inline bool io_should_retry_thread(long err)
+static inline bool io_should_retry_thread(struct io_worker *worker, long err)
{
/*
* Prevent perpetual task_work retry, if the task (or its group) is
@@ -753,6 +755,8 @@ static inline bool io_should_retry_thread(long err)
*/
if (fatal_signal_pending(current))
return false;
+ if (worker->init_retries++ >= WORKER_INIT_LIMIT)
+ return false;
switch (err) {
case -EAGAIN:
@@ -779,7 +783,7 @@ static void create_worker_cont(struct callback_head *cb)
io_init_new_worker(wq, worker, tsk);
io_worker_release(worker);
return;
- } else if (!io_should_retry_thread(PTR_ERR(tsk))) {
+ } else if (!io_should_retry_thread(worker, PTR_ERR(tsk))) {
struct io_wq_acct *acct = io_wq_get_acct(worker);
atomic_dec(&acct->nr_running);
@@ -846,7 +850,7 @@ static bool create_io_worker(struct io_wq *wq, int index)
tsk = create_io_thread(io_wq_worker, worker, NUMA_NO_NODE);
if (!IS_ERR(tsk)) {
io_init_new_worker(wq, worker, tsk);
- } else if (!io_should_retry_thread(PTR_ERR(tsk))) {
+ } else if (!io_should_retry_thread(worker, PTR_ERR(tsk))) {
kfree(worker);
goto fail;
} else {
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable@vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 30d77b7eef019fa4422980806e8b7cdc8674493e
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable@vger.kernel.org>' --in-reply-to '2024072912-during-vitalize-fe0c@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
30d77b7eef01 ("mm/mglru: fix ineffective protection calculation")
3f74e6bd3b84 ("mm/mglru: fix overshooting shrinker memory")
4acef5694e01 ("mm/mglru: improve swappiness handling")
745b13e647cd ("mm/mglru: remove CONFIG_MEMCG")
4376807bf2d5 ("mm/mglru: reclaim offlined memcgs harder")
8aa420617918 ("mm/mglru: respect min_ttl_ms with memcgs")
5095a2b23987 ("mm/mglru: try to stop at high watermarks")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 30d77b7eef019fa4422980806e8b7cdc8674493e Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Fri, 12 Jul 2024 17:29:56 -0600
Subject: [PATCH] mm/mglru: fix ineffective protection calculation
mem_cgroup_calculate_protection() is not stateless and should only be used
as part of a top-down tree traversal. shrink_one() traverses the per-node
memcg LRU instead of the root_mem_cgroup tree, and therefore it should not
call mem_cgroup_calculate_protection().
The existing misuse in shrink_one() can cause ineffective protection of
sub-trees that are grandchildren of root_mem_cgroup. Fix it by reusing
lru_gen_age_node(), which already traverses the root_mem_cgroup tree, to
calculate the protection.
Previously, lru_gen_age_node() opportunistically skipped the first pass,
i.e., when scan_control->priority is DEF_PRIORITY. On the second pass,
lruvec_is_sizable() used the appropriate scan_control->priority, set by
set_initial_priority() from lru_gen_shrink_node(), to decide whether a
memcg is too small to reclaim from.
Now lru_gen_age_node() unconditionally traverses the root_mem_cgroup
tree, so it should call set_initial_priority() upfront to make sure
lruvec_is_sizable() uses the appropriate scan_control->priority on the
first pass. Otherwise, lruvec_is_reclaimable() can return false
negatives and result in premature OOM kills when min_ttl_ms is used.
Link: https://lkml.kernel.org/r/20240712232956.1427127-1-yuzhao@google.com
Fixes: e4dde56cd208 ("mm: multi-gen LRU: per-node lru_gen_folio lists")
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reported-by: T.J. Mercier <tjmercier@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 6216d79edb7f..525d3ffa8451 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3915,6 +3915,32 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq,
* working set protection
******************************************************************************/
+static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc)
+{
+ int priority;
+ unsigned long reclaimable;
+
+ if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
+ return;
+ /*
+ * Determine the initial priority based on
+ * (total >> priority) * reclaimed_to_scanned_ratio = nr_to_reclaim,
+ * where reclaimed_to_scanned_ratio = inactive / total.
+ */
+ reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
+ if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc))
+ reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
+
+ /* round down reclaimable and round up sc->nr_to_reclaim */
+ priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
+
+ /*
+ * The estimation is based on LRU pages only, so cap it to prevent
+ * overshoots of shrinker objects by large margins.
+ */
+ sc->priority = clamp(priority, DEF_PRIORITY / 2, DEF_PRIORITY);
+}
+
static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc)
{
int gen, type, zone;
@@ -3948,19 +3974,17 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MIN_SEQ(lruvec);
- /* see the comment on lru_gen_folio */
- gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
- birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
-
- if (time_is_after_jiffies(birth + min_ttl))
+ if (mem_cgroup_below_min(NULL, memcg))
return false;
if (!lruvec_is_sizable(lruvec, sc))
return false;
- mem_cgroup_calculate_protection(NULL, memcg);
+ /* see the comment on lru_gen_folio */
+ gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
+ birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
- return !mem_cgroup_below_min(NULL, memcg);
+ return time_is_before_jiffies(birth + min_ttl);
}
/* to protect the working set of the last N jiffies */
@@ -3970,23 +3994,20 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
{
struct mem_cgroup *memcg;
unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);
+ bool reclaimable = !min_ttl;
VM_WARN_ON_ONCE(!current_is_kswapd());
- /* check the order to exclude compaction-induced reclaim */
- if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY)
- return;
+ set_initial_priority(pgdat, sc);
memcg = mem_cgroup_iter(NULL, NULL, NULL);
do {
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
- if (lruvec_is_reclaimable(lruvec, sc, min_ttl)) {
- mem_cgroup_iter_break(NULL, memcg);
- return;
- }
+ mem_cgroup_calculate_protection(NULL, memcg);
- cond_resched();
+ if (!reclaimable)
+ reclaimable = lruvec_is_reclaimable(lruvec, sc, min_ttl);
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
/*
@@ -3994,7 +4015,7 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
* younger than min_ttl. However, another possibility is all memcgs are
* either too small or below min.
*/
- if (mutex_trylock(&oom_lock)) {
+ if (!reclaimable && mutex_trylock(&oom_lock)) {
struct oom_control oc = {
.gfp_mask = sc->gfp_mask,
};
@@ -4786,8 +4807,7 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
- mem_cgroup_calculate_protection(NULL, memcg);
-
+ /* lru_gen_age_node() called mem_cgroup_calculate_protection() */
if (mem_cgroup_below_min(NULL, memcg))
return MEMCG_LRU_YOUNG;
@@ -4911,32 +4931,6 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
blk_finish_plug(&plug);
}
-static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc)
-{
- int priority;
- unsigned long reclaimable;
-
- if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
- return;
- /*
- * Determine the initial priority based on
- * (total >> priority) * reclaimed_to_scanned_ratio = nr_to_reclaim,
- * where reclaimed_to_scanned_ratio = inactive / total.
- */
- reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
- if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc))
- reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
-
- /* round down reclaimable and round up sc->nr_to_reclaim */
- priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
-
- /*
- * The estimation is based on LRU pages only, so cap it to prevent
- * overshoots of shrinker objects by large margins.
- */
- sc->priority = clamp(priority, DEF_PRIORITY / 2, DEF_PRIORITY);
-}
-
static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
{
struct blk_plug plug;
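
The priority formula that set_initial_priority() carries with it is
worth unpacking: it solves
(total >> priority) * reclaimed_to_scanned_ratio ~= nr_to_reclaim
for priority using bit lengths. A small user-space sketch, assuming
stand-ins for the kernel's fls_long() and clamp() and made-up input
values:

/* Sketch of set_initial_priority()'s estimate; inputs are illustrative. */
#include <stdio.h>

#define DEF_PRIORITY 12

/* Stand-in for the kernel's fls_long(): 1-based index of the MSB. */
static int fls_long(unsigned long x)
{
	int r = 0;

	while (x) {
		x >>= 1;
		r++;
	}
	return r;
}

/* Stand-in for the kernel's clamp(). */
static int clamp(int v, int lo, int hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
	/* Suppose ~1M reclaimable inactive pages and a 32-page target:
	 * priority ~= log2(reclaimable) - log2(nr_to_reclaim), so that
	 * (reclaimable >> priority) is on the order of nr_to_reclaim.
	 */
	unsigned long reclaimable = 1UL << 20;
	unsigned long nr_to_reclaim = 32;
	int priority;

	/* round down reclaimable and round up nr_to_reclaim */
	priority = fls_long(reclaimable) - 1 - fls_long(nr_to_reclaim - 1);

	/* cap to avoid overshooting shrinker objects by large margins */
	priority = clamp(priority, DEF_PRIORITY / 2, DEF_PRIORITY);

	printf("initial priority = %d\n", priority);	/* prints 12 here */
	return 0;
}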
commit 4811f7af6090e8f5a398fbdd766f903ef6c0d787 upstream.
Syzbot reported that a buffer state inconsistency was detected in
nilfs_btnode_create_block(), triggering a kernel bug.
It is not appropriate to treat this inconsistency as a bug; it can occur
if the argument block address (the buffer index of the newly created
block) is a virtual block number and has been reallocated due to
corruption of the bitmap used to manage its allocation state.
So, modify nilfs_btnode_create_block() and its callers to treat it as a
possible filesystem error, rather than triggering a kernel bug.
Link: https://lkml.kernel.org/r/20240725052007.4562-1-konishi.ryusuke@gmail.com
Fixes: a60be987d45d ("nilfs2: B-tree node cache")
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Reported-by: syzbot+89cc4f2324ed37988b60@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=89cc4f2324ed37988b60
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
Please apply this patch to the stable trees indicated by the subject
prefix instead of the failed patches or the one I asked you to drop.
This patch is tailored to take page/folio conversion into account and
can be applied from v4.11 to v6.7.
Also, all the builds and tests I did on each stable tree passed.
Thanks,
Ryusuke Konishi
fs/nilfs2/btnode.c | 25 ++++++++++++++++++++-----
fs/nilfs2/btree.c | 4 ++--
2 files changed, 22 insertions(+), 7 deletions(-)
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 5710833ac1cc..8fe348bceabe 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -51,12 +51,21 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr)
bh = nilfs_grab_buffer(inode, btnc, blocknr, BIT(BH_NILFS_Node));
if (unlikely(!bh))
- return NULL;
+ return ERR_PTR(-ENOMEM);
if (unlikely(buffer_mapped(bh) || buffer_uptodate(bh) ||
buffer_dirty(bh))) {
- brelse(bh);
- BUG();
+ /*
+ * The block buffer at the specified new address was already
+ * in use. This can happen if it is a virtual block number
+ * and has been reallocated due to corruption of the bitmap
+ * used to manage its allocation state (if not, the buffer
+ * clearing of an abandoned b-tree node is missing somewhere).
+ */
+ nilfs_error(inode->i_sb,
+ "state inconsistency probably due to duplicate use of b-tree node block address %llu (ino=%lu)",
+ (unsigned long long)blocknr, inode->i_ino);
+ goto failed;
}
memset(bh->b_data, 0, i_blocksize(inode));
bh->b_bdev = inode->i_sb->s_bdev;
@@ -67,6 +76,12 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr)
unlock_page(bh->b_page);
put_page(bh->b_page);
return bh;
+
+failed:
+ unlock_page(bh->b_page);
+ put_page(bh->b_page);
+ brelse(bh);
+ return ERR_PTR(-EIO);
}
int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
@@ -217,8 +232,8 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc,
}
nbh = nilfs_btnode_create_block(btnc, newkey);
- if (!nbh)
- return -ENOMEM;
+ if (IS_ERR(nbh))
+ return PTR_ERR(nbh);
BUG_ON(nbh == obh);
ctxt->newbh = nbh;
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 65659fa0372e..598f05867059 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -63,8 +63,8 @@ static int nilfs_btree_get_new_block(const struct nilfs_bmap *btree,
struct buffer_head *bh;
bh = nilfs_btnode_create_block(btnc, ptr);
- if (!bh)
- return -ENOMEM;
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
set_buffer_nilfs_volatile(bh);
*bhp = bh;
--
2.43.5
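
The nilfs2 fix above switches nilfs_btnode_create_block() from the
NULL-on-failure convention to the kernel's ERR_PTR encoding, so callers
can distinguish -ENOMEM from the new -EIO case without an extra out
parameter. A minimal user-space imitation of that convention; the
helpers approximate include/linux/err.h and create_block() is a
hypothetical stand-in for the fixed function:

/* ERR_PTR-convention sketch; build with: cc -o errptr errptr.c */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_ERRNO 4095

/* Encode a negative errno in the pointer value itself. */
static void *ERR_PTR(long err)
{
	return (void *)err;
}

/* Recover the errno from an error pointer. */
static long PTR_ERR(const void *ptr)
{
	return (long)ptr;
}

/* Error pointers occupy the top MAX_ERRNO values of the address space. */
static bool IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

/* Stand-in for nilfs_btnode_create_block() after the fix: allocation
 * failure and a detected inconsistency return distinct error pointers.
 */
static void *create_block(bool simulate_inconsistency)
{
	void *bh;

	if (simulate_inconsistency)
		return ERR_PTR(-EIO);	/* duplicate block address */

	bh = malloc(64);
	if (!bh)
		return ERR_PTR(-ENOMEM);
	return bh;
}

int main(void)
{
	void *bh = create_block(true);

	/* Caller pattern matching the fixed nilfs_btnode_prepare_change_key() */
	if (IS_ERR(bh)) {
		printf("create_block failed: %ld\n", PTR_ERR(bh));
		return 1;
	}
	free(bh);
	return 0;
}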