The patch below does not apply to the 6.12-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.12.y
git checkout FETCH_HEAD
git cherry-pick -x 935a20d1bebf6236076785fac3ff81e3931834e9
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025122946-opt-excuse-6662@gregkh' --subject-prefix 'PATCH 6.12.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 935a20d1bebf6236076785fac3ff81e3931834e9 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche(a)acm.org>
Date: Fri, 14 Nov 2025 13:04:07 -0800
Subject: [PATCH] block: Remove queue freezing from several sysfs store
callbacks
Freezing the request queue from inside sysfs store callbacks may cause a
deadlock in combination with the dm-multipath driver and the
queue_if_no_path option. Additionally, freezing the request queue slows
down system boot on systems where sysfs attributes are set synchronously.
Fix this by removing the blk_mq_freeze_queue() / blk_mq_unfreeze_queue()
calls from the store callbacks that do not strictly need these callbacks.
Add the __data_racy annotation to request_queue.rq_timeout to suppress
KCSAN data race reports about the rq_timeout reads.
This patch may cause a small delay in applying the new settings.
For all the attributes affected by this patch, I/O will complete
correctly whether the old or the new value of the attribute is used.
This patch affects the following sysfs attributes:
* io_poll_delay
* io_timeout
* nomerges
* read_ahead_kb
* rq_affinity
Here is an example of a deadlock triggered by running test srp/002
if this patch is not applied:
task:multipathd
Call Trace:
<TASK>
__schedule+0x8c1/0x1bf0
schedule+0xdd/0x270
schedule_preempt_disabled+0x1c/0x30
__mutex_lock+0xb89/0x1650
mutex_lock_nested+0x1f/0x30
dm_table_set_restrictions+0x823/0xdf0
__bind+0x166/0x590
dm_swap_table+0x2a7/0x490
do_resume+0x1b1/0x610
dev_suspend+0x55/0x1a0
ctl_ioctl+0x3a5/0x7e0
dm_ctl_ioctl+0x12/0x20
__x64_sys_ioctl+0x127/0x1a0
x64_sys_call+0xe2b/0x17d0
do_syscall_64+0x96/0x3a0
entry_SYSCALL_64_after_hwframe+0x4b/0x53
</TASK>
task:(udev-worker)
Call Trace:
<TASK>
__schedule+0x8c1/0x1bf0
schedule+0xdd/0x270
blk_mq_freeze_queue_wait+0xf2/0x140
blk_mq_freeze_queue_nomemsave+0x23/0x30
queue_ra_store+0x14e/0x290
queue_attr_store+0x23e/0x2c0
sysfs_kf_write+0xde/0x140
kernfs_fop_write_iter+0x3b2/0x630
vfs_write+0x4fd/0x1390
ksys_write+0xfd/0x230
__x64_sys_write+0x76/0xc0
x64_sys_call+0x276/0x17d0
do_syscall_64+0x96/0x3a0
entry_SYSCALL_64_after_hwframe+0x4b/0x53
</TASK>
Cc: Christoph Hellwig <hch(a)lst.de>
Cc: Ming Lei <ming.lei(a)redhat.com>
Cc: Nilay Shroff <nilay(a)linux.ibm.com>
Cc: Martin Wilck <mwilck(a)suse.com>
Cc: Benjamin Marzinski <bmarzins(a)redhat.com>
Cc: stable(a)vger.kernel.org
Fixes: af2814149883 ("block: freeze the queue in queue_attr_store")
Signed-off-by: Bart Van Assche <bvanassche(a)acm.org>
Reviewed-by: Nilay Shroff <nilay(a)linux.ibm.com>
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 76c47fe9b8d6..8684c57498cc 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -143,21 +143,22 @@ queue_ra_store(struct gendisk *disk, const char *page, size_t count)
{
unsigned long ra_kb;
ssize_t ret;
- unsigned int memflags;
struct request_queue *q = disk->queue;
ret = queue_var_store(&ra_kb, page, count);
if (ret < 0)
return ret;
/*
- * ->ra_pages is protected by ->limits_lock because it is usually
- * calculated from the queue limits by queue_limits_commit_update.
+ * The ->ra_pages change below is protected by ->limits_lock because it
+ * is usually calculated from the queue limits by
+ * queue_limits_commit_update().
+ *
+ * bdi->ra_pages reads are not serialized against bdi->ra_pages writes.
+ * Use WRITE_ONCE() to write bdi->ra_pages once.
*/
mutex_lock(&q->limits_lock);
- memflags = blk_mq_freeze_queue(q);
- disk->bdi->ra_pages = ra_kb >> (PAGE_SHIFT - 10);
+ WRITE_ONCE(disk->bdi->ra_pages, ra_kb >> (PAGE_SHIFT - 10));
mutex_unlock(&q->limits_lock);
- blk_mq_unfreeze_queue(q, memflags);
return ret;
}
@@ -375,21 +376,18 @@ static ssize_t queue_nomerges_store(struct gendisk *disk, const char *page,
size_t count)
{
unsigned long nm;
- unsigned int memflags;
struct request_queue *q = disk->queue;
ssize_t ret = queue_var_store(&nm, page, count);
if (ret < 0)
return ret;
- memflags = blk_mq_freeze_queue(q);
blk_queue_flag_clear(QUEUE_FLAG_NOMERGES, q);
blk_queue_flag_clear(QUEUE_FLAG_NOXMERGES, q);
if (nm == 2)
blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q);
else if (nm)
blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
- blk_mq_unfreeze_queue(q, memflags);
return ret;
}
@@ -409,7 +407,6 @@ queue_rq_affinity_store(struct gendisk *disk, const char *page, size_t count)
#ifdef CONFIG_SMP
struct request_queue *q = disk->queue;
unsigned long val;
- unsigned int memflags;
ret = queue_var_store(&val, page, count);
if (ret < 0)
@@ -421,7 +418,6 @@ queue_rq_affinity_store(struct gendisk *disk, const char *page, size_t count)
* are accessed individually using atomic test_bit operation. So we
* don't grab any lock while updating these flags.
*/
- memflags = blk_mq_freeze_queue(q);
if (val == 2) {
blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, q);
blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, q);
@@ -432,7 +428,6 @@ queue_rq_affinity_store(struct gendisk *disk, const char *page, size_t count)
blk_queue_flag_clear(QUEUE_FLAG_SAME_COMP, q);
blk_queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q);
}
- blk_mq_unfreeze_queue(q, memflags);
#endif
return ret;
}
@@ -446,11 +441,9 @@ static ssize_t queue_poll_delay_store(struct gendisk *disk, const char *page,
static ssize_t queue_poll_store(struct gendisk *disk, const char *page,
size_t count)
{
- unsigned int memflags;
ssize_t ret = count;
struct request_queue *q = disk->queue;
- memflags = blk_mq_freeze_queue(q);
if (!(q->limits.features & BLK_FEAT_POLL)) {
ret = -EINVAL;
goto out;
@@ -459,7 +452,6 @@ static ssize_t queue_poll_store(struct gendisk *disk, const char *page,
pr_info_ratelimited("writes to the poll attribute are ignored.\n");
pr_info_ratelimited("please use driver specific parameters instead.\n");
out:
- blk_mq_unfreeze_queue(q, memflags);
return ret;
}
@@ -472,7 +464,7 @@ static ssize_t queue_io_timeout_show(struct gendisk *disk, char *page)
static ssize_t queue_io_timeout_store(struct gendisk *disk, const char *page,
size_t count)
{
- unsigned int val, memflags;
+ unsigned int val;
int err;
struct request_queue *q = disk->queue;
@@ -480,9 +472,7 @@ static ssize_t queue_io_timeout_store(struct gendisk *disk, const char *page,
if (err || val == 0)
return -EINVAL;
- memflags = blk_mq_freeze_queue(q);
blk_queue_rq_timeout(q, msecs_to_jiffies(val));
- blk_mq_unfreeze_queue(q, memflags);
return count;
}
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2fff8a80dbd2..cb4ba09959ee 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -495,7 +495,7 @@ struct request_queue {
*/
unsigned long queue_flags;
- unsigned int rq_timeout;
+ unsigned int __data_racy rq_timeout;
unsigned int queue_depth;
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x ee5a977b4e771cc181f39d504426dbd31ed701cc
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025122958-ambitious-prenatal-99c2@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From ee5a977b4e771cc181f39d504426dbd31ed701cc Mon Sep 17 00:00:00 2001
From: Fedor Pchelkin <pchelkin(a)ispras.ru>
Date: Sat, 1 Nov 2025 19:04:28 +0300
Subject: [PATCH] ext4: fix string copying in parse_apply_sb_mount_options()
strscpy_pad() can't be used to copy a non-NUL-term string into a NUL-term
string of possibly bigger size. Commit 0efc5990bca5 ("string.h: Introduce
memtostr() and memtostr_pad()") provides additional information in that
regard. So if this happens, the following warning is observed:
strnlen: detected buffer overflow: 65 byte read of buffer size 64
WARNING: CPU: 0 PID: 28655 at lib/string_helpers.c:1032 __fortify_report+0x96/0xc0 lib/string_helpers.c:1032
Modules linked in:
CPU: 0 UID: 0 PID: 28655 Comm: syz-executor.3 Not tainted 6.12.54-syzkaller-00144-g5f0270f1ba00 #0
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
RIP: 0010:__fortify_report+0x96/0xc0 lib/string_helpers.c:1032
Call Trace:
<TASK>
__fortify_panic+0x1f/0x30 lib/string_helpers.c:1039
strnlen include/linux/fortify-string.h:235 [inline]
sized_strscpy include/linux/fortify-string.h:309 [inline]
parse_apply_sb_mount_options fs/ext4/super.c:2504 [inline]
__ext4_fill_super fs/ext4/super.c:5261 [inline]
ext4_fill_super+0x3c35/0xad00 fs/ext4/super.c:5706
get_tree_bdev_flags+0x387/0x620 fs/super.c:1636
vfs_get_tree+0x93/0x380 fs/super.c:1814
do_new_mount fs/namespace.c:3553 [inline]
path_mount+0x6ae/0x1f70 fs/namespace.c:3880
do_mount fs/namespace.c:3893 [inline]
__do_sys_mount fs/namespace.c:4103 [inline]
__se_sys_mount fs/namespace.c:4080 [inline]
__x64_sys_mount+0x280/0x300 fs/namespace.c:4080
do_syscall_x64 arch/x86/entry/common.c:52 [inline]
do_syscall_64+0x64/0x140 arch/x86/entry/common.c:83
entry_SYSCALL_64_after_hwframe+0x76/0x7e
Since userspace is expected to provide s_mount_opts field to be at most 63
characters long with the ending byte being NUL-term, use a 64-byte buffer
which matches the size of s_mount_opts, so that strscpy_pad() does its job
properly. Return with error if the user still managed to provide a
non-NUL-term string here.
Found by Linux Verification Center (linuxtesting.org) with Syzkaller.
Fixes: 8ecb790ea8c3 ("ext4: avoid potential buffer over-read in parse_apply_sb_mount_options()")
Cc: stable(a)vger.kernel.org
Signed-off-by: Fedor Pchelkin <pchelkin(a)ispras.ru>
Reviewed-by: Baokun Li <libaokun1(a)huawei.com>
Reviewed-by: Jan Kara <jack(a)suse.cz>
Message-ID: <20251101160430.222297-1-pchelkin(a)ispras.ru>
Signed-off-by: Theodore Ts'o <tytso(a)mit.edu>
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 7de15249e826..d1ba894c0e0a 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2476,7 +2476,7 @@ static int parse_apply_sb_mount_options(struct super_block *sb,
struct ext4_fs_context *m_ctx)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- char s_mount_opts[65];
+ char s_mount_opts[64];
struct ext4_fs_context *s_ctx = NULL;
struct fs_context *fc = NULL;
int ret = -ENOMEM;
@@ -2484,7 +2484,8 @@ static int parse_apply_sb_mount_options(struct super_block *sb,
if (!sbi->s_es->s_mount_opts[0])
return 0;
- strscpy_pad(s_mount_opts, sbi->s_es->s_mount_opts);
+ if (strscpy_pad(s_mount_opts, sbi->s_es->s_mount_opts) < 0)
+ return -E2BIG;
fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL);
if (!fc)
Hi Greg,
On 2025-12-29, <gregkh(a)linuxfoundation.org> wrote:
> This is a note to let you know that I've just added the patch titled
>
> printk: Avoid scheduling irq_work on suspend
>
> to the 6.18-stable tree which can be found at:
> http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
>
> The filename of the patch is:
> printk-avoid-scheduling-irq_work-on-suspend.patch
> and it can be found in the queue-6.18 subdirectory.
>
> If you, or anyone else, feels it should not be added to the stable tree,
> please let <stable(a)vger.kernel.org> know about it.
This patch should be accompanied with the preceeding commit:
d01ff281bd9b ("printk: Allow printk_trigger_flush() to flush all types")
and the later commit:
66e7c1e0ee08 ("printk: Avoid irq_work for printk_deferred() on suspend")
in order to completely avoid irq_work triggering during suspend for all
possible call paths.
John Ogness
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 1f73b8b56cf35de29a433aee7bfff26cea98be3f
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025120110-coastal-litigator-8952@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 1f73b8b56cf35de29a433aee7bfff26cea98be3f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=81ukasz=20Bartosik?= <ukaszb(a)chromium.org>
Date: Wed, 19 Nov 2025 21:29:09 +0000
Subject: [PATCH] xhci: dbgtty: fix device unregister
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
When DbC is disconnected then xhci_dbc_tty_unregister_device()
is called. However if there is any user space process blocked
on write to DbC terminal device then it will never be signalled
and thus stay blocked indifinitely.
This fix adds a tty_vhangup() call in xhci_dbc_tty_unregister_device().
The tty_vhangup() wakes up any blocked writers and causes subsequent
write attempts to DbC terminal device to fail.
Cc: stable <stable(a)kernel.org>
Fixes: dfba2174dc42 ("usb: xhci: Add DbC support in xHCI driver")
Signed-off-by: Łukasz Bartosik <ukaszb(a)chromium.org>
Link: https://patch.msgid.link/20251119212910.1245694-1-ukaszb@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/usb/host/xhci-dbgtty.c b/drivers/usb/host/xhci-dbgtty.c
index b7f95565524d..57cdda4e09c8 100644
--- a/drivers/usb/host/xhci-dbgtty.c
+++ b/drivers/usb/host/xhci-dbgtty.c
@@ -550,6 +550,12 @@ static void xhci_dbc_tty_unregister_device(struct xhci_dbc *dbc)
if (!port->registered)
return;
+ /*
+ * Hang up the TTY. This wakes up any blocked
+ * writers and causes subsequent writes to fail.
+ */
+ tty_vhangup(port->port.tty);
+
tty_unregister_device(dbc_tty_driver, port->minor);
xhci_dbc_tty_exit_port(port);
port->registered = false;
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 79f3f9bedd149ea438aaeb0fb6a083637affe205
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025122903-sterile-from-4520@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 79f3f9bedd149ea438aaeb0fb6a083637affe205 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz(a)infradead.org>
Date: Wed, 2 Apr 2025 20:07:34 +0200
Subject: [PATCH] sched/eevdf: Fix min_vruntime vs avg_vruntime
Basically, from the constraint that the sum of lag is zero, you can
infer that the 0-lag point is the weighted average of the individual
vruntime, which is what we're trying to compute:
\Sum w_i * v_i
avg = --------------
\Sum w_i
Now, since vruntime takes the whole u64 (worse, it wraps), this
multiplication term in the numerator is not something we can compute;
instead we do the min_vruntime (v0 henceforth) thing like:
v_i = (v_i - v0) + v0
This does two things:
- it keeps the key: (v_i - v0) 'small';
- it creates a relative 0-point in the modular space.
If you do that subtitution and work it all out, you end up with:
\Sum w_i * (v_i - v0)
avg = --------------------- + v0
\Sum w_i
Since you cannot very well track a ratio like that (and not suffer
terrible numerical problems) we simpy track the numerator and
denominator individually and only perform the division when strictly
needed.
Notably, the numerator lives in cfs_rq->avg_vruntime and the denominator
lives in cfs_rq->avg_load.
The one extra 'funny' is that these numbers track the entities in the
tree, and current is typically outside of the tree, so avg_vruntime()
adds current when needed before doing the division.
(vruntime_eligible() elides the division by cross-wise multiplication)
Anyway, as mentioned above, we currently use the CFS era min_vruntime
for this purpose. However, this thing can only move forward, while the
above avg can in fact move backward (when a non-eligible task leaves,
the average becomes smaller), this can cause trouble when through
happenstance (or construction) these values drift far enough apart to
wreck the game.
Replace cfs_rq::min_vruntime with cfs_rq::zero_vruntime which is kept
near/at avg_vruntime, following its motion.
The down-side is that this requires computing the avg more often.
Fixes: 147f3efaa241 ("sched/fair: Implement an EEVDF-like scheduling policy")
Reported-by: Zicheng Qu <quzicheng(a)huawei.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Link: https://patch.msgid.link/20251106111741.GC4068168@noisy.programming.kicks-a…
Cc: stable(a)vger.kernel.org
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 02e16b70a790..41caa22e0680 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -796,7 +796,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
{
- s64 left_vruntime = -1, min_vruntime, right_vruntime = -1, left_deadline = -1, spread;
+ s64 left_vruntime = -1, zero_vruntime, right_vruntime = -1, left_deadline = -1, spread;
struct sched_entity *last, *first, *root;
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
@@ -819,15 +819,15 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
last = __pick_last_entity(cfs_rq);
if (last)
right_vruntime = last->vruntime;
- min_vruntime = cfs_rq->min_vruntime;
+ zero_vruntime = cfs_rq->zero_vruntime;
raw_spin_rq_unlock_irqrestore(rq, flags);
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_deadline",
SPLIT_NS(left_deadline));
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_vruntime",
SPLIT_NS(left_vruntime));
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime",
- SPLIT_NS(min_vruntime));
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "zero_vruntime",
+ SPLIT_NS(zero_vruntime));
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime",
SPLIT_NS(avg_vruntime(cfs_rq)));
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "right_vruntime",
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4a11a832d63e..8d971d48669f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -554,7 +554,7 @@ static inline bool entity_before(const struct sched_entity *a,
static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- return (s64)(se->vruntime - cfs_rq->min_vruntime);
+ return (s64)(se->vruntime - cfs_rq->zero_vruntime);
}
#define __node_2_se(node) \
@@ -606,13 +606,13 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
*
* Which we track using:
*
- * v0 := cfs_rq->min_vruntime
+ * v0 := cfs_rq->zero_vruntime
* \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime
* \Sum w_i := cfs_rq->avg_load
*
- * Since min_vruntime is a monotonic increasing variable that closely tracks
- * the per-task service, these deltas: (v_i - v), will be in the order of the
- * maximal (virtual) lag induced in the system due to quantisation.
+ * Since zero_vruntime closely tracks the per-task service, these
+ * deltas: (v_i - v), will be in the order of the maximal (virtual) lag
+ * induced in the system due to quantisation.
*
* Also, we use scale_load_down() to reduce the size.
*
@@ -671,7 +671,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
avg = div_s64(avg, load);
}
- return cfs_rq->min_vruntime + avg;
+ return cfs_rq->zero_vruntime + avg;
}
/*
@@ -732,7 +732,7 @@ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
load += weight;
}
- return avg >= (s64)(vruntime - cfs_rq->min_vruntime) * load;
+ return avg >= (s64)(vruntime - cfs_rq->zero_vruntime) * load;
}
int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -740,42 +740,14 @@ int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
return vruntime_eligible(cfs_rq, se->vruntime);
}
-static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
+static void update_zero_vruntime(struct cfs_rq *cfs_rq)
{
- u64 min_vruntime = cfs_rq->min_vruntime;
- /*
- * open coded max_vruntime() to allow updating avg_vruntime
- */
- s64 delta = (s64)(vruntime - min_vruntime);
- if (delta > 0) {
- avg_vruntime_update(cfs_rq, delta);
- min_vruntime = vruntime;
- }
- return min_vruntime;
-}
+ u64 vruntime = avg_vruntime(cfs_rq);
+ s64 delta = (s64)(vruntime - cfs_rq->zero_vruntime);
-static void update_min_vruntime(struct cfs_rq *cfs_rq)
-{
- struct sched_entity *se = __pick_root_entity(cfs_rq);
- struct sched_entity *curr = cfs_rq->curr;
- u64 vruntime = cfs_rq->min_vruntime;
+ avg_vruntime_update(cfs_rq, delta);
- if (curr) {
- if (curr->on_rq)
- vruntime = curr->vruntime;
- else
- curr = NULL;
- }
-
- if (se) {
- if (!curr)
- vruntime = se->min_vruntime;
- else
- vruntime = min_vruntime(vruntime, se->min_vruntime);
- }
-
- /* ensure we never gain time by being placed backwards. */
- cfs_rq->min_vruntime = __update_min_vruntime(cfs_rq, vruntime);
+ cfs_rq->zero_vruntime = vruntime;
}
static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq)
@@ -848,6 +820,7 @@ RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity,
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
avg_vruntime_add(cfs_rq, se);
+ update_zero_vruntime(cfs_rq);
se->min_vruntime = se->vruntime;
se->min_slice = se->slice;
rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
@@ -859,6 +832,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
&min_vruntime_cb);
avg_vruntime_sub(cfs_rq, se);
+ update_zero_vruntime(cfs_rq);
}
struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq)
@@ -1226,7 +1200,6 @@ static void update_curr(struct cfs_rq *cfs_rq)
curr->vruntime += calc_delta_fair(delta_exec, curr);
resched = update_deadline(cfs_rq, curr);
- update_min_vruntime(cfs_rq);
if (entity_is_task(curr)) {
/*
@@ -3808,15 +3781,6 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
if (!curr)
__enqueue_entity(cfs_rq, se);
cfs_rq->nr_queued++;
-
- /*
- * The entity's vruntime has been adjusted, so let's check
- * whether the rq-wide min_vruntime needs updated too. Since
- * the calculations above require stable min_vruntime rather
- * than up-to-date one, we do the update at the end of the
- * reweight process.
- */
- update_min_vruntime(cfs_rq);
}
}
@@ -5429,15 +5393,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
update_cfs_group(se);
- /*
- * Now advance min_vruntime if @se was the entity holding it back,
- * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
- * put back on, and if we advance min_vruntime, we'll be placed back
- * further than we started -- i.e. we'll be penalized.
- */
- if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
- update_min_vruntime(cfs_rq);
-
if (flags & DEQUEUE_DELAYED)
finish_delayed_dequeue_entity(se);
@@ -9015,7 +8970,6 @@ static void yield_task_fair(struct rq *rq)
if (entity_eligible(cfs_rq, se)) {
se->vruntime = se->deadline;
se->deadline += calc_delta_fair(se->slice, se);
- update_min_vruntime(cfs_rq);
}
}
@@ -13078,23 +13032,6 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
* Which shows that S and s_i transform alike (which makes perfect sense
* given that S is basically the (weighted) average of s_i).
*
- * Then:
- *
- * x -> s_min := min{s_i} (8)
- *
- * to obtain:
- *
- * \Sum_i w_i (s_i - s_min)
- * S = s_min + ------------------------ (9)
- * \Sum_i w_i
- *
- * Which already looks familiar, and is the basis for our current
- * approximation:
- *
- * S ~= s_min (10)
- *
- * Now, obviously, (10) is absolute crap :-), but it sorta works.
- *
* So the thing to remember is that the above is strictly UP. It is
* possible to generalize to multiple runqueues -- however it gets really
* yuck when you have to add affinity support, as illustrated by our very
@@ -13116,23 +13053,23 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
* Let, for our runqueue 'k':
*
* T_k = \Sum_i w_i s_i
- * W_k = \Sum_i w_i ; for all i of k (11)
+ * W_k = \Sum_i w_i ; for all i of k (8)
*
* Then we can write (6) like:
*
* T_k
- * S_k = --- (12)
+ * S_k = --- (9)
* W_k
*
* From which immediately follows that:
*
* T_k + T_l
- * S_k+l = --------- (13)
+ * S_k+l = --------- (10)
* W_k + W_l
*
* On which we can define a combined lag:
*
- * lag_k+l(i) := S_k+l - s_i (14)
+ * lag_k+l(i) := S_k+l - s_i (11)
*
* And that gives us the tools to compare tasks across a combined runqueue.
*
@@ -13143,7 +13080,7 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
* using (7); this only requires storing single 'time'-stamps.
*
* b) when comparing tasks between 2 runqueues of which one is forced-idle,
- * compare the combined lag, per (14).
+ * compare the combined lag, per (11).
*
* Now, of course cgroups (I so hate them) make this more interesting in
* that a) seems to suggest we need to iterate all cgroup on a CPU at such
@@ -13191,12 +13128,11 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
* every tick. This limits the observed divergence due to the work
* conservancy.
*
- * On top of that, we can improve upon things by moving away from our
- * horrible (10) hack and moving to (9) and employing (13) here.
+ * On top of that, we can improve upon things by employing (10) here.
*/
/*
- * se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed.
+ * se_fi_update - Update the cfs_rq->zero_vruntime_fi in a CFS hierarchy if needed.
*/
static void se_fi_update(const struct sched_entity *se, unsigned int fi_seq,
bool forceidle)
@@ -13210,7 +13146,7 @@ static void se_fi_update(const struct sched_entity *se, unsigned int fi_seq,
cfs_rq->forceidle_seq = fi_seq;
}
- cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime;
+ cfs_rq->zero_vruntime_fi = cfs_rq->zero_vruntime;
}
}
@@ -13263,11 +13199,11 @@ bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b,
/*
* Find delta after normalizing se's vruntime with its cfs_rq's
- * min_vruntime_fi, which would have been updated in prior calls
+ * zero_vruntime_fi, which would have been updated in prior calls
* to se_fi_update().
*/
delta = (s64)(sea->vruntime - seb->vruntime) +
- (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);
+ (s64)(cfs_rqb->zero_vruntime_fi - cfs_rqa->zero_vruntime_fi);
return delta > 0;
}
@@ -13513,7 +13449,7 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
void init_cfs_rq(struct cfs_rq *cfs_rq)
{
cfs_rq->tasks_timeline = RB_ROOT_CACHED;
- cfs_rq->min_vruntime = (u64)(-(1LL << 20));
+ cfs_rq->zero_vruntime = (u64)(-(1LL << 20));
raw_spin_lock_init(&cfs_rq->removed.lock);
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 82e74e8ca2ea..5a3cf81c27be 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -681,10 +681,10 @@ struct cfs_rq {
s64 avg_vruntime;
u64 avg_load;
- u64 min_vruntime;
+ u64 zero_vruntime;
#ifdef CONFIG_SCHED_CORE
unsigned int forceidle_seq;
- u64 min_vruntime_fi;
+ u64 zero_vruntime_fi;
#endif
struct rb_root_cached tasks_timeline;
The patch below does not apply to the 6.12-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.12.y
git checkout FETCH_HEAD
git cherry-pick -x f5e1e5ec204da11fa87fdf006d451d80ce06e118
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025122951-giggle-reveler-2e6b@gregkh' --subject-prefix 'PATCH 6.12.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f5e1e5ec204da11fa87fdf006d451d80ce06e118 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj(a)kernel.org>
Date: Thu, 11 Dec 2025 15:45:04 -1000
Subject: [PATCH] sched_ext: Fix missing post-enqueue handling in
move_local_task_to_local_dsq()
move_local_task_to_local_dsq() is used when moving a task from a non-local
DSQ to a local DSQ on the same CPU. It directly manipulates the local DSQ
without going through dispatch_enqueue() and was missing the post-enqueue
handling that triggers preemption when SCX_ENQ_PREEMPT is set or the idle
task is running.
The function is used by move_task_between_dsqs() which backs
scx_bpf_dsq_move() and may be called while the CPU is busy.
Add local_dsq_post_enq() call to move_local_task_to_local_dsq(). As the
dispatch path doesn't need post-enqueue handling, add SCX_RQ_IN_BALANCE
early exit to keep consume_dispatch_q() behavior unchanged and avoid
triggering unnecessary resched when scx_bpf_dsq_move() is used from the
dispatch path.
Fixes: 4c30f5ce4f7a ("sched_ext: Implement scx_bpf_dispatch[_vtime]_from_dsq()")
Cc: stable(a)vger.kernel.org # v6.12+
Reviewed-by: Andrea Righi <arighi(a)nvidia.com>
Reviewed-by: Emil Tsalapatis <emil(a)etsalapatis.com>
Signed-off-by: Tejun Heo <tj(a)kernel.org>
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index c78efa99406f..695503a2f7d1 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -988,6 +988,14 @@ static void local_dsq_post_enq(struct scx_dispatch_q *dsq, struct task_struct *p
struct rq *rq = container_of(dsq, struct rq, scx.local_dsq);
bool preempt = false;
+ /*
+ * If @rq is in balance, the CPU is already vacant and looking for the
+ * next task to run. No need to preempt or trigger resched after moving
+ * @p into its local DSQ.
+ */
+ if (rq->scx.flags & SCX_RQ_IN_BALANCE)
+ return;
+
if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr &&
rq->curr->sched_class == &ext_sched_class) {
rq->curr->scx.slice = 0;
@@ -1636,6 +1644,8 @@ static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
dsq_mod_nr(dst_dsq, 1);
p->scx.dsq = dst_dsq;
+
+ local_dsq_post_enq(dst_dsq, p, enq_flags);
}
/**
The patch below does not apply to the 6.12-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.12.y
git checkout FETCH_HEAD
git cherry-pick -x 9f769637a93fac81689b80df6855f545839cf999
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025122940-sneak-unvocal-9de2@gregkh' --subject-prefix 'PATCH 6.12.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 9f769637a93fac81689b80df6855f545839cf999 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj(a)kernel.org>
Date: Tue, 9 Dec 2025 11:04:33 -1000
Subject: [PATCH] sched_ext: Fix bypass depth leak on scx_enable() failure
scx_enable() calls scx_bypass(true) to initialize in bypass mode and then
scx_bypass(false) on success to exit. If scx_enable() fails during task
initialization - e.g. scx_cgroup_init() or scx_init_task() returns an error -
it jumps to err_disable while bypass is still active. scx_disable_workfn()
then calls scx_bypass(true/false) for its own bypass, leaving the bypass depth
at 1 instead of 0. This causes the system to remain permanently in bypass mode
after a failed scx_enable().
Failures after task initialization is complete - e.g. scx_tryset_enable_state()
at the end - already call scx_bypass(false) before reaching the error path and
are not affected. This only affects a subset of failure modes.
Fix it by tracking whether scx_enable() called scx_bypass(true) in a bool and
having scx_disable_workfn() call an extra scx_bypass(false) to clear it. This
is a temporary measure as the bypass depth will be moved into the sched
instance, which will make this tracking unnecessary.
Fixes: 8c2090c504e9 ("sched_ext: Initialize in bypass mode")
Cc: stable(a)vger.kernel.org # v6.12+
Reported-by: Chris Mason <clm(a)meta.com>
Reviewed-by: Emil Tsalapatis <emil(a)etsalapatis.com>
Link: https://lore.kernel.org/stable/286e6f7787a81239e1ce2989b52391ce%40kernel.org
Signed-off-by: Tejun Heo <tj(a)kernel.org>
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index bd74b371f52d..c4465ccefea4 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -41,6 +41,13 @@ static bool scx_init_task_enabled;
static bool scx_switching_all;
DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
+/*
+ * Tracks whether scx_enable() called scx_bypass(true). Used to balance bypass
+ * depth on enable failure. Will be removed when bypass depth is moved into the
+ * sched instance.
+ */
+static bool scx_bypassed_for_enable;
+
static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0);
static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0);
@@ -4318,6 +4325,11 @@ static void scx_disable_workfn(struct kthread_work *work)
scx_dsp_max_batch = 0;
free_kick_syncs();
+ if (scx_bypassed_for_enable) {
+ scx_bypassed_for_enable = false;
+ scx_bypass(false);
+ }
+
mutex_unlock(&scx_enable_mutex);
WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
@@ -4970,6 +4982,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
* Init in bypass mode to guarantee forward progress.
*/
scx_bypass(true);
+ scx_bypassed_for_enable = true;
for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++)
if (((void (**)(void))ops)[i])
@@ -5067,6 +5080,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
scx_task_iter_stop(&sti);
percpu_up_write(&scx_fork_rwsem);
+ scx_bypassed_for_enable = false;
scx_bypass(false);
if (!scx_tryset_enable_state(SCX_ENABLED, SCX_ENABLING)) {