Limit the WiFi PCIe link speed to Gen2 speed (500 MB/s), which is the
speed that the boot firmware has brought up the link at (and that
Windows uses).
This is specifically needed to avoid a large amount of link errors when
restarting the link during boot (but which are currently not reported).
This also appears to fix intermittent failures to download the ath11k
firmware during boot which can be seen when there is a longer delay
between restarting the link and loading the WiFi driver (e.g. when using
full disk encryption).
Fixes: 123b30a75623 ("arm64: dts: qcom: sc8280xp-x13s: enable WiFi controller")
Cc: stable(a)vger.kernel.org # 6.2
Signed-off-by: Johan Hovold <johan+linaro(a)kernel.org>
---
arch/arm64/boot/dts/qcom/sc8280xp-lenovo-thinkpad-x13s.dts | 2 ++
1 file changed, 2 insertions(+)
diff --git a/arch/arm64/boot/dts/qcom/sc8280xp-lenovo-thinkpad-x13s.dts b/arch/arm64/boot/dts/qcom/sc8280xp-lenovo-thinkpad-x13s.dts
index 2c17e137563a..a67756ada990 100644
--- a/arch/arm64/boot/dts/qcom/sc8280xp-lenovo-thinkpad-x13s.dts
+++ b/arch/arm64/boot/dts/qcom/sc8280xp-lenovo-thinkpad-x13s.dts
@@ -768,6 +768,8 @@ &pcie3a_phy {
};
&pcie4 {
+ max-link-speed = <2>;
+
perst-gpios = <&tlmm 141 GPIO_ACTIVE_LOW>;
wake-gpios = <&tlmm 139 GPIO_ACTIVE_LOW>;
--
2.43.0
From: Paul E. McKenney <paulmck(a)kernel.org>
commit bc31e6cb27a9334140ff2f0a209d59b08bc0bc8c upstream.
Holding a mutex across synchronize_rcu_tasks() and acquiring
that same mutex in code called from do_exit() after its call to
exit_tasks_rcu_start() but before its call to exit_tasks_rcu_stop()
results in deadlock. This is by design, because tasks that are far
enough into do_exit() are no longer present on the tasks list, making
it a bit difficult for RCU Tasks to find them, let alone wait on them
to do a voluntary context switch. However, such deadlocks are becoming
more frequent. In addition, lockdep currently does not detect such
deadlocks and they can be difficult to reproduce.
In addition, if a task voluntarily context switches during that time
(for example, if it blocks acquiring a mutex), then this task is in an
RCU Tasks quiescent state. And with some adjustments, RCU Tasks could
just as well take advantage of that fact.
This commit therefore eliminates these deadlock by replacing the
SRCU-based wait for do_exit() completion with per-CPU lists of tasks
currently exiting. A given task will be on one of these per-CPU lists for
the same period of time that this task would previously have been in the
previous SRCU read-side critical section. These lists enable RCU Tasks
to find the tasks that have already been removed from the tasks list,
but that must nevertheless be waited upon.
The RCU Tasks grace period gathers any of these do_exit() tasks that it
must wait on, and adds them to the list of holdouts. Per-CPU locking
and get_task_struct() are used to synchronize addition to and removal
from these lists.
Link: https://lore.kernel.org/all/20240118021842.290665-1-chenzhongjin@huawei.com/
Reported-by: Chen Zhongjin <chenzhongjin(a)huawei.com>
Signed-off-by: Paul E. McKenney <paulmck(a)kernel.org>
Signed-off-by: Zqiang <qiang.zhang1211(a)gmail.com>
---
include/linux/sched.h | 1 +
init/init_task.c | 1 +
kernel/fork.c | 1 +
kernel/rcu/tasks.h | 54 ++++++++++++++++++++++++++++---------------
4 files changed, 39 insertions(+), 18 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index aa015416c569..80499f7ab39a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -740,6 +740,7 @@ struct task_struct {
u8 rcu_tasks_idx;
int rcu_tasks_idle_cpu;
struct list_head rcu_tasks_holdout_list;
+ struct list_head rcu_tasks_exit_list;
#endif /* #ifdef CONFIG_TASKS_RCU */
#ifdef CONFIG_TASKS_TRACE_RCU
diff --git a/init/init_task.c b/init/init_task.c
index 5fa18ed59d33..59454d6e2c2a 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -151,6 +151,7 @@ struct task_struct init_task
.rcu_tasks_holdout = false,
.rcu_tasks_holdout_list = LIST_HEAD_INIT(init_task.rcu_tasks_holdout_list),
.rcu_tasks_idle_cpu = -1,
+ .rcu_tasks_exit_list = LIST_HEAD_INIT(init_task.rcu_tasks_exit_list),
#endif
#ifdef CONFIG_TASKS_TRACE_RCU
.trc_reader_nesting = 0,
diff --git a/kernel/fork.c b/kernel/fork.c
index 633b0af1d1a7..86803165aa00 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1699,6 +1699,7 @@ static inline void rcu_copy_process(struct task_struct *p)
p->rcu_tasks_holdout = false;
INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
p->rcu_tasks_idle_cpu = -1;
+ INIT_LIST_HEAD(&p->rcu_tasks_exit_list);
#endif /* #ifdef CONFIG_TASKS_RCU */
#ifdef CONFIG_TASKS_TRACE_RCU
p->trc_reader_nesting = 0;
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index c5624ab0580c..901cd7bc78ed 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -81,9 +81,6 @@ static struct rcu_tasks rt_name = \
.kname = #rt_name, \
}
-/* Track exiting tasks in order to allow them to be waited for. */
-DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu);
-
/* Avoid IPIing CPUs early in the grace period. */
#define RCU_TASK_IPI_DELAY (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) ? HZ / 2 : 0)
static int rcu_task_ipi_delay __read_mostly = RCU_TASK_IPI_DELAY;
@@ -383,6 +380,9 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
// rates from multiple CPUs. If this is required, per-CPU callback lists
// will be needed.
+static LIST_HEAD(rtp_exit_list);
+static DEFINE_RAW_SPINLOCK(rtp_exit_list_lock);
+
/* Pre-grace-period preparation. */
static void rcu_tasks_pregp_step(void)
{
@@ -416,15 +416,18 @@ static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop)
/* Processing between scanning taskslist and draining the holdout list. */
static void rcu_tasks_postscan(struct list_head *hop)
{
+ unsigned long flags;
+ struct task_struct *t;
+
/*
* Exiting tasks may escape the tasklist scan. Those are vulnerable
* until their final schedule() with TASK_DEAD state. To cope with
* this, divide the fragile exit path part in two intersecting
* read side critical sections:
*
- * 1) An _SRCU_ read side starting before calling exit_notify(),
- * which may remove the task from the tasklist, and ending after
- * the final preempt_disable() call in do_exit().
+ * 1) A task_struct list addition before calling exit_notify(),
+ * which may remove the task from the tasklist, with the
+ * removal after the final preempt_disable() call in do_exit().
*
* 2) An _RCU_ read side starting with the final preempt_disable()
* call in do_exit() and ending with the final call to schedule()
@@ -433,7 +436,12 @@ static void rcu_tasks_postscan(struct list_head *hop)
* This handles the part 1). And postgp will handle part 2) with a
* call to synchronize_rcu().
*/
- synchronize_srcu(&tasks_rcu_exit_srcu);
+ raw_spin_lock_irqsave(&rtp_exit_list_lock, flags);
+ list_for_each_entry(t, &rtp_exit_list, rcu_tasks_exit_list) {
+ if (list_empty(&t->rcu_tasks_holdout_list))
+ rcu_tasks_pertask(t, hop);
+ }
+ raw_spin_unlock_irqrestore(&rtp_exit_list_lock, flags);
}
/* See if tasks are still holding out, complain if so. */
@@ -498,7 +506,6 @@ static void rcu_tasks_postgp(struct rcu_tasks *rtp)
*
* In addition, this synchronize_rcu() waits for exiting tasks
* to complete their final preempt_disable() region of execution,
- * cleaning up after synchronize_srcu(&tasks_rcu_exit_srcu),
* enforcing the whole region before tasklist removal until
* the final schedule() with TASK_DEAD state to be an RCU TASKS
* read side critical section.
@@ -591,25 +598,36 @@ static void show_rcu_tasks_classic_gp_kthread(void)
#endif /* #ifndef CONFIG_TINY_RCU */
/*
- * Contribute to protect against tasklist scan blind spot while the
- * task is exiting and may be removed from the tasklist. See
- * corresponding synchronize_srcu() for further details.
+ * Protect against tasklist scan blind spot while the task is exiting and
+ * may be removed from the tasklist. Do this by adding the task to yet
+ * another list.
*/
-void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu)
+void exit_tasks_rcu_start(void)
{
- current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu);
+ unsigned long flags;
+ struct task_struct *t = current;
+
+ WARN_ON_ONCE(!list_empty(&t->rcu_tasks_exit_list));
+ get_task_struct(t);
+ raw_spin_lock_irqsave(&rtp_exit_list_lock, flags);
+ list_add(&t->rcu_tasks_exit_list, &rtp_exit_list);
+ raw_spin_unlock_irqrestore(&rtp_exit_list_lock, flags);
}
/*
- * Contribute to protect against tasklist scan blind spot while the
- * task is exiting and may be removed from the tasklist. See
- * corresponding synchronize_srcu() for further details.
+ * Remove the task from the "yet another list" because do_exit() is now
+ * non-preemptible, allowing synchronize_rcu() to wait beyond this point.
*/
-void exit_tasks_rcu_stop(void) __releases(&tasks_rcu_exit_srcu)
+void exit_tasks_rcu_stop(void)
{
+ unsigned long flags;
struct task_struct *t = current;
- __srcu_read_unlock(&tasks_rcu_exit_srcu, t->rcu_tasks_idx);
+ WARN_ON_ONCE(list_empty(&t->rcu_tasks_exit_list));
+ raw_spin_lock_irqsave(&rtp_exit_list_lock, flags);
+ list_del_init(&t->rcu_tasks_exit_list);
+ raw_spin_unlock_irqrestore(&rtp_exit_list_lock, flags);
+ put_task_struct(t);
}
/*
--
2.17.1
Hi stable team - please don't take patches for fs/bcachefs/ except from
myself; I'll be doing backports and sending pull requests after stuff
has been tested by my CI.
Thanks, and let me know if there's any other workflow things I should
know about
-Kent
From: Alexander Sverdlin <alexander.sverdlin(a)siemens.com>
Fix link error:
ld.bfd: drivers/mfd/twl-core.o: in function `twl_probe':
git/drivers/mfd/twl-core.c:846: undefined reference to `devm_mfd_add_devices'
Cc: <stable(a)vger.kernel.org>
Fixes: 63416320419e ("mfd: twl-core: Add a clock subdevice for the TWL6032")
Signed-off-by: Alexander Sverdlin <alexander.sverdlin(a)siemens.com>
---
drivers/mfd/Kconfig | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 90ce58fd629e5..1195a27c881e4 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -1772,6 +1772,7 @@ config TWL4030_CORE
bool "TI TWL4030/TWL5030/TWL6030/TPS659x0 Support"
depends on I2C=y
select IRQ_DOMAIN
+ select MFD_CORE
select REGMAP_I2C
help
Say yes here if you have TWL4030 / TWL6030 family chip on your board.
--
2.43.0
From: Cyril Hrubis <chrubis(a)suse.cz>
[ Upstream commit c7fcb99877f9f542c918509b2801065adcaf46fa ]
There is a 10% rounding error in the intial value of the
sysctl_sched_rr_timeslice with CONFIG_HZ_300=y.
This was found with LTP test sched_rr_get_interval01:
sched_rr_get_interval01.c:57: TPASS: sched_rr_get_interval() passed
sched_rr_get_interval01.c:64: TPASS: Time quantum 0s 99999990ns
sched_rr_get_interval01.c:72: TFAIL: /proc/sys/kernel/sched_rr_timeslice_ms != 100 got 90
sched_rr_get_interval01.c:57: TPASS: sched_rr_get_interval() passed
sched_rr_get_interval01.c:64: TPASS: Time quantum 0s 99999990ns
sched_rr_get_interval01.c:72: TFAIL: /proc/sys/kernel/sched_rr_timeslice_ms != 100 got 90
What this test does is to compare the return value from the
sched_rr_get_interval() and the sched_rr_timeslice_ms sysctl file and
fails if they do not match.
The problem it found is the intial sysctl file value which was computed as:
static int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
which works fine as long as MSEC_PER_SEC is multiple of HZ, however it
introduces 10% rounding error for CONFIG_HZ_300:
(MSEC_PER_SEC / HZ) * (100 * HZ / 1000)
(1000 / 300) * (100 * 300 / 1000)
3 * 30 = 90
This can be easily fixed by reversing the order of the multiplication
and division. After this fix we get:
(MSEC_PER_SEC * (100 * HZ / 1000)) / HZ
(1000 * (100 * 300 / 1000)) / 300
(1000 * 30) / 300 = 100
Fixes: 975e155ed873 ("sched/rt: Show the 'sched_rr_timeslice' SCHED_RR timeslice tuning knob in milliseconds")
Signed-off-by: Cyril Hrubis <chrubis(a)suse.cz>
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Reviewed-by: Petr Vorel <pvorel(a)suse.cz>
Acked-by: Mel Gorman <mgorman(a)suse.de>
Tested-by: Petr Vorel <pvorel(a)suse.cz>
Link: https://lore.kernel.org/r/20230802151906.25258-2-chrubis@suse.cz
[ pvorel: rebased for 4.19 ]
Signed-off-by: Petr Vorel <pvorel(a)suse.cz>
---
kernel/sched/rt.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 394c66442cff..ce4594215728 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -8,7 +8,7 @@
#include "pelt.h"
int sched_rr_timeslice = RR_TIMESLICE;
-int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
+int sysctl_sched_rr_timeslice = (MSEC_PER_SEC * RR_TIMESLICE) / HZ;
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
--
2.35.3
From: Jan Kiszka <jan.kiszka(a)siemens.com>
commit afb2a4fb84555ef9e61061f6ea63ed7087b295d5 upstream.
The cflags for the RISC-V efistub were missing -mno-relax, thus were
under the risk that the compiler could use GP-relative addressing. That
happened for _edata with binutils-2.41 and kernel 6.1, causing the
relocation to fail due to an invalid kernel_size in handle_kernel_image.
It was not yet observed with newer versions, but that may just be luck.
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Jan Kiszka <jan.kiszka(a)siemens.com>
Signed-off-by: Ard Biesheuvel <ardb(a)kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/firmware/efi/libstub/Makefile | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile
index ef5045a53ce0..b6e1dcb98a64 100644
--- a/drivers/firmware/efi/libstub/Makefile
+++ b/drivers/firmware/efi/libstub/Makefile
@@ -25,7 +25,7 @@ cflags-$(CONFIG_ARM) := $(subst $(CC_FLAGS_FTRACE),,$(KBUILD_CFLAGS)) \
-fno-builtin -fpic \
$(call cc-option,-mno-single-pic-base)
cflags-$(CONFIG_RISCV) := $(subst $(CC_FLAGS_FTRACE),,$(KBUILD_CFLAGS)) \
- -fpic
+ -fpic -mno-relax
cflags-$(CONFIG_LOONGARCH) := $(subst $(CC_FLAGS_FTRACE),,$(KBUILD_CFLAGS)) \
-fpie
--
2.35.3
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 14db5f64a971fce3d8ea35de4dfc7f443a3efb92
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021942-driven-backhand-7edd@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
14db5f64a971 ("zonefs: Improve error handling")
77af13ba3c7f ("zonefs: Do not propagate iomap_dio_rw() ENOTBLK error to user space")
aa7f243f32e1 ("zonefs: Separate zone information from inode information")
34422914dc00 ("zonefs: Reduce struct zonefs_inode_info size")
46a9c526eef7 ("zonefs: Simplify IO error handling")
4008e2a0b01a ("zonefs: Reorganize code")
a608da3bd730 ("zonefs: Detect append writes at invalid locations")
db58653ce0c7 ("zonefs: Fix active zone accounting")
7dd12d65ac64 ("zonefs: fix zone report size in __zonefs_io_error()")
8745889a7fd0 ("Merge tag 'iomap-6.0-merge-2' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 14db5f64a971fce3d8ea35de4dfc7f443a3efb92 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal(a)kernel.org>
Date: Thu, 8 Feb 2024 17:26:59 +0900
Subject: [PATCH] zonefs: Improve error handling
Write error handling is racy and can sometime lead to the error recovery
path wrongly changing the inode size of a sequential zone file to an
incorrect value which results in garbage data being readable at the end
of a file. There are 2 problems:
1) zonefs_file_dio_write() updates a zone file write pointer offset
after issuing a direct IO with iomap_dio_rw(). This update is done
only if the IO succeed for synchronous direct writes. However, for
asynchronous direct writes, the update is done without waiting for
the IO completion so that the next asynchronous IO can be
immediately issued. However, if an asynchronous IO completes with a
failure right before the i_truncate_mutex lock protecting the update,
the update may change the value of the inode write pointer offset
that was corrected by the error path (zonefs_io_error() function).
2) zonefs_io_error() is called when a read or write error occurs. This
function executes a report zone operation using the callback function
zonefs_io_error_cb(), which does all the error recovery handling
based on the current zone condition, write pointer position and
according to the mount options being used. However, depending on the
zoned device being used, a report zone callback may be executed in a
context that is different from the context of __zonefs_io_error(). As
a result, zonefs_io_error_cb() may be executed without the inode
truncate mutex lock held, which can lead to invalid error processing.
Fix both problems as follows:
- Problem 1: Perform the inode write pointer offset update before a
direct write is issued with iomap_dio_rw(). This is safe to do as
partial direct writes are not supported (IOMAP_DIO_PARTIAL is not
set) and any failed IO will trigger the execution of zonefs_io_error()
which will correct the inode write pointer offset to reflect the
current state of the one on the device.
- Problem 2: Change zonefs_io_error_cb() into zonefs_handle_io_error()
and call this function directly from __zonefs_io_error() after
obtaining the zone information using blkdev_report_zones() with a
simple callback function that copies to a local stack variable the
struct blk_zone obtained from the device. This ensures that error
handling is performed holding the inode truncate mutex.
This change also simplifies error handling for conventional zone files
by bypassing the execution of report zones entirely. This is safe to
do because the condition of conventional zones cannot be read-only or
offline and conventional zone files are always fully mapped with a
constant file size.
Reported-by: Shin'ichiro Kawasaki <shinichiro.kawasaki(a)wdc.com>
Fixes: 8dcc1a9d90c1 ("fs: New zonefs file system")
Cc: stable(a)vger.kernel.org
Signed-off-by: Damien Le Moal <dlemoal(a)kernel.org>
Tested-by: Shin'ichiro Kawasaki <shinichiro.kawasaki(a)wdc.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn(a)wdc.com>
Reviewed-by: Himanshu Madhani <himanshu.madhani(a)oracle.com>
diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
index 6ab2318a9c8e..dba5dcb62bef 100644
--- a/fs/zonefs/file.c
+++ b/fs/zonefs/file.c
@@ -348,7 +348,12 @@ static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size,
struct zonefs_inode_info *zi = ZONEFS_I(inode);
if (error) {
- zonefs_io_error(inode, true);
+ /*
+ * For Sync IOs, error recovery is called from
+ * zonefs_file_dio_write().
+ */
+ if (!is_sync_kiocb(iocb))
+ zonefs_io_error(inode, true);
return error;
}
@@ -491,6 +496,14 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
ret = -EINVAL;
goto inode_unlock;
}
+ /*
+ * Advance the zone write pointer offset. This assumes that the
+ * IO will succeed, which is OK to do because we do not allow
+ * partial writes (IOMAP_DIO_PARTIAL is not set) and if the IO
+ * fails, the error path will correct the write pointer offset.
+ */
+ z->z_wpoffset += count;
+ zonefs_inode_account_active(inode);
mutex_unlock(&zi->i_truncate_mutex);
}
@@ -504,20 +517,19 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
if (ret == -ENOTBLK)
ret = -EBUSY;
- if (zonefs_zone_is_seq(z) &&
- (ret > 0 || ret == -EIOCBQUEUED)) {
- if (ret > 0)
- count = ret;
-
- /*
- * Update the zone write pointer offset assuming the write
- * operation succeeded. If it did not, the error recovery path
- * will correct it. Also do active seq file accounting.
- */
- mutex_lock(&zi->i_truncate_mutex);
- z->z_wpoffset += count;
- zonefs_inode_account_active(inode);
- mutex_unlock(&zi->i_truncate_mutex);
+ /*
+ * For a failed IO or partial completion, trigger error recovery
+ * to update the zone write pointer offset to a correct value.
+ * For asynchronous IOs, zonefs_file_write_dio_end_io() may already
+ * have executed error recovery if the IO already completed when we
+ * reach here. However, we cannot know that and execute error recovery
+ * again (that will not change anything).
+ */
+ if (zonefs_zone_is_seq(z)) {
+ if (ret > 0 && ret != count)
+ ret = -EIO;
+ if (ret < 0 && ret != -EIOCBQUEUED)
+ zonefs_io_error(inode, true);
}
inode_unlock:
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 93971742613a..b6e8e7c96251 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -246,16 +246,18 @@ static void zonefs_inode_update_mode(struct inode *inode)
z->z_mode = inode->i_mode;
}
-struct zonefs_ioerr_data {
- struct inode *inode;
- bool write;
-};
-
static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
void *data)
{
- struct zonefs_ioerr_data *err = data;
- struct inode *inode = err->inode;
+ struct blk_zone *z = data;
+
+ *z = *zone;
+ return 0;
+}
+
+static void zonefs_handle_io_error(struct inode *inode, struct blk_zone *zone,
+ bool write)
+{
struct zonefs_zone *z = zonefs_inode_zone(inode);
struct super_block *sb = inode->i_sb;
struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
@@ -270,8 +272,8 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
data_size = zonefs_check_zone_condition(sb, z, zone);
isize = i_size_read(inode);
if (!(z->z_flags & (ZONEFS_ZONE_READONLY | ZONEFS_ZONE_OFFLINE)) &&
- !err->write && isize == data_size)
- return 0;
+ !write && isize == data_size)
+ return;
/*
* At this point, we detected either a bad zone or an inconsistency
@@ -292,7 +294,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
* In all cases, warn about inode size inconsistency and handle the
* IO error according to the zone condition and to the mount options.
*/
- if (zonefs_zone_is_seq(z) && isize != data_size)
+ if (isize != data_size)
zonefs_warn(sb,
"inode %lu: invalid size %lld (should be %lld)\n",
inode->i_ino, isize, data_size);
@@ -352,8 +354,6 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
zonefs_i_size_write(inode, data_size);
z->z_wpoffset = data_size;
zonefs_inode_account_active(inode);
-
- return 0;
}
/*
@@ -367,23 +367,25 @@ void __zonefs_io_error(struct inode *inode, bool write)
{
struct zonefs_zone *z = zonefs_inode_zone(inode);
struct super_block *sb = inode->i_sb;
- struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
unsigned int noio_flag;
- unsigned int nr_zones = 1;
- struct zonefs_ioerr_data err = {
- .inode = inode,
- .write = write,
- };
+ struct blk_zone zone;
int ret;
/*
- * The only files that have more than one zone are conventional zone
- * files with aggregated conventional zones, for which the inode zone
- * size is always larger than the device zone size.
+ * Conventional zone have no write pointer and cannot become read-only
+ * or offline. So simply fake a report for a single or aggregated zone
+ * and let zonefs_handle_io_error() correct the zone inode information
+ * according to the mount options.
*/
- if (z->z_size > bdev_zone_sectors(sb->s_bdev))
- nr_zones = z->z_size >>
- (sbi->s_zone_sectors_shift + SECTOR_SHIFT);
+ if (!zonefs_zone_is_seq(z)) {
+ zone.start = z->z_sector;
+ zone.len = z->z_size >> SECTOR_SHIFT;
+ zone.wp = zone.start + zone.len;
+ zone.type = BLK_ZONE_TYPE_CONVENTIONAL;
+ zone.cond = BLK_ZONE_COND_NOT_WP;
+ zone.capacity = zone.len;
+ goto handle_io_error;
+ }
/*
* Memory allocations in blkdev_report_zones() can trigger a memory
@@ -394,12 +396,20 @@ void __zonefs_io_error(struct inode *inode, bool write)
* the GFP_NOIO context avoids both problems.
*/
noio_flag = memalloc_noio_save();
- ret = blkdev_report_zones(sb->s_bdev, z->z_sector, nr_zones,
- zonefs_io_error_cb, &err);
- if (ret != nr_zones)
+ ret = blkdev_report_zones(sb->s_bdev, z->z_sector, 1,
+ zonefs_io_error_cb, &zone);
+ memalloc_noio_restore(noio_flag);
+
+ if (ret != 1) {
zonefs_err(sb, "Get inode %lu zone information failed %d\n",
inode->i_ino, ret);
- memalloc_noio_restore(noio_flag);
+ zonefs_warn(sb, "remounting filesystem read-only\n");
+ sb->s_flags |= SB_RDONLY;
+ return;
+ }
+
+handle_io_error:
+ zonefs_handle_io_error(inode, &zone, write);
}
static struct kmem_cache *zonefs_inode_cachep;
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 67695f18d55924b2013534ef3bdc363bc9e14605
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021850-vaseline-mongrel-489e@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 67695f18d55924b2013534ef3bdc363bc9e14605 Mon Sep 17 00:00:00 2001
From: Lokesh Gidra <lokeshgidra(a)google.com>
Date: Wed, 17 Jan 2024 14:37:29 -0800
Subject: [PATCH] userfaultfd: fix mmap_changing checking in
mfill_atomic_hugetlb
In mfill_atomic_hugetlb(), mmap_changing isn't being checked
again if we drop mmap_lock and reacquire it. When the lock is not held,
mmap_changing could have been incremented. This is also inconsistent
with the behavior in mfill_atomic().
Link: https://lkml.kernel.org/r/20240117223729.1444522-1-lokeshgidra@google.com
Fixes: df2cc96e77011 ("userfaultfd: prevent non-cooperative events vs mcopy_atomic races")
Signed-off-by: Lokesh Gidra <lokeshgidra(a)google.com>
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: Mike Rapoport <rppt(a)kernel.org>
Cc: Axel Rasmussen <axelrasmussen(a)google.com>
Cc: Brian Geffon <bgeffon(a)google.com>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Jann Horn <jannh(a)google.com>
Cc: Kalesh Singh <kaleshsingh(a)google.com>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Nicolas Geoffray <ngeoffray(a)google.com>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: Suren Baghdasaryan <surenb(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 20e3b0d9cf7e..75fcf1f783bc 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -357,6 +357,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
+ atomic_t *mmap_changing,
uffd_flags_t flags)
{
struct mm_struct *dst_mm = dst_vma->vm_mm;
@@ -472,6 +473,15 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
goto out;
}
mmap_read_lock(dst_mm);
+ /*
+ * If memory mappings are changing because of non-cooperative
+ * operation (e.g. mremap) running in parallel, bail out and
+ * request the user to retry later
+ */
+ if (mmap_changing && atomic_read(mmap_changing)) {
+ err = -EAGAIN;
+ break;
+ }
dst_vma = NULL;
goto retry;
@@ -506,6 +516,7 @@ extern ssize_t mfill_atomic_hugetlb(struct vm_area_struct *dst_vma,
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
+ atomic_t *mmap_changing,
uffd_flags_t flags);
#endif /* CONFIG_HUGETLB_PAGE */
@@ -622,8 +633,8 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
* If this is a HUGETLB vma, pass off to appropriate routine
*/
if (is_vm_hugetlb_page(dst_vma))
- return mfill_atomic_hugetlb(dst_vma, dst_start,
- src_start, len, flags);
+ return mfill_atomic_hugetlb(dst_vma, dst_start, src_start,
+ len, mmap_changing, flags);
if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
goto out_unlock;
commit 5124a0a549857c4b87173280e192eea24dea72ad upstream.
If DAT metadata file block access fails due to corruption of the DAT file
or abnormal virtual block numbers held by b-trees or inodes, a kernel
warning is generated.
This replaces the WARN_ONs by error output, so that a kernel, booted with
panic_on_warn, does not panic. This patch also replaces the detected
return code -ENOENT with another internal code -EINVAL to notify the bmap
layer of metadata corruption. When the bmap layer sees -EINVAL, it
handles the abnormal situation with nilfs_bmap_convert_error() and finally
returns code -EIO as it should.
Link: https://lkml.kernel.org/r/0000000000005cc3d205ea23ddcf@google.com
Link: https://lkml.kernel.org/r/20230126164114.6911-1-konishi.ryusuke@gmail.com
Signed-off-by: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Reported-by: <syzbot+5d5d25f90f195a3cfcb4(a)syzkaller.appspotmail.com>
Tested-by: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
Please use this patch for these versions instead of the patch I asked
you to drop in the previous review comments.
This replacement patch uses an equivalent call using nilfs_msg()
instead of nilfs_err(), which does not exist in these versions.
Thanks,
Ryusuke Konishi
fs/nilfs2/dat.c | 27 +++++++++++++++++----------
1 file changed, 17 insertions(+), 10 deletions(-)
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index e2a5320f2718..b9c759addd50 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -40,8 +40,21 @@ static inline struct nilfs_dat_info *NILFS_DAT_I(struct inode *dat)
static int nilfs_dat_prepare_entry(struct inode *dat,
struct nilfs_palloc_req *req, int create)
{
- return nilfs_palloc_get_entry_block(dat, req->pr_entry_nr,
- create, &req->pr_entry_bh);
+ int ret;
+
+ ret = nilfs_palloc_get_entry_block(dat, req->pr_entry_nr,
+ create, &req->pr_entry_bh);
+ if (unlikely(ret == -ENOENT)) {
+ nilfs_msg(dat->i_sb, KERN_ERR,
+ "DAT doesn't have a block to manage vblocknr = %llu",
+ (unsigned long long)req->pr_entry_nr);
+ /*
+ * Return internal code -EINVAL to notify bmap layer of
+ * metadata corruption.
+ */
+ ret = -EINVAL;
+ }
+ return ret;
}
static void nilfs_dat_commit_entry(struct inode *dat,
@@ -123,11 +136,7 @@ static void nilfs_dat_commit_free(struct inode *dat,
int nilfs_dat_prepare_start(struct inode *dat, struct nilfs_palloc_req *req)
{
- int ret;
-
- ret = nilfs_dat_prepare_entry(dat, req, 0);
- WARN_ON(ret == -ENOENT);
- return ret;
+ return nilfs_dat_prepare_entry(dat, req, 0);
}
void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req,
@@ -154,10 +163,8 @@ int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req)
int ret;
ret = nilfs_dat_prepare_entry(dat, req, 0);
- if (ret < 0) {
- WARN_ON(ret == -ENOENT);
+ if (ret < 0)
return ret;
- }
kaddr = kmap_atomic(req->pr_entry_bh->b_page);
entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
--
2.39.3
When bpf_trace_printk is called without any args in a second depth level,
it will enable preemption without disabling it.
These patch series fix this for 5.15 and 6.1. The fix was introduced in
6.3, so later kernels already have it. And 5.10 and earlier did not have
the code that disabled preemption, so they are fine in that regard.
This was tested by attaching a bpf program doing a non-0 arguments
trace_printk at sys_enter and a 0 arguments snprintf at local_timer_entry.
Dave Marchevsky (1):
bpf: Merge printk and seq_printf VARARG max macros
Jiri Olsa (3):
bpf: Add struct for bin_args arg in bpf_bprintf_prepare
bpf: Do cleanup in bpf_bprintf_cleanup only when needed
bpf: Remove trace_printk_lock
include/linux/bpf.h | 14 ++++++--
kernel/bpf/helpers.c | 71 ++++++++++++++++++++++------------------
kernel/bpf/verifier.c | 3 +-
kernel/trace/bpf_trace.c | 39 ++++++++++------------
4 files changed, 72 insertions(+), 55 deletions(-)
--
2.34.1
There are indications that ASPM L0s is not working very well on this
machine so disable it also for the NVMe and modem controllers for now.
Note that this is done as a precaution based on problems with the Wi-Fi
on the X13s as well as the NVMe, modem and Wi-Fi on the sc8280xp-crd
reference design (the NVMe controller on my X13s does not support L0 and
the machine lacks a modem).
Fixes: 9f4f3dfad8cf ("PCI: qcom: Enable ASPM for platforms supporting 1.9.0 ops")
Cc: stable(a)vger.kernel.org # 6.7
Signed-off-by: Johan Hovold <johan+linaro(a)kernel.org>
---
arch/arm64/boot/dts/qcom/sc8280xp-lenovo-thinkpad-x13s.dts | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/arch/arm64/boot/dts/qcom/sc8280xp-lenovo-thinkpad-x13s.dts b/arch/arm64/boot/dts/qcom/sc8280xp-lenovo-thinkpad-x13s.dts
index 70824294108e..06fc88d5d025 100644
--- a/arch/arm64/boot/dts/qcom/sc8280xp-lenovo-thinkpad-x13s.dts
+++ b/arch/arm64/boot/dts/qcom/sc8280xp-lenovo-thinkpad-x13s.dts
@@ -730,6 +730,8 @@ keyboard@68 {
};
&pcie2a {
+ aspm-no-l0s;
+
perst-gpios = <&tlmm 143 GPIO_ACTIVE_LOW>;
wake-gpios = <&tlmm 145 GPIO_ACTIVE_LOW>;
@@ -749,6 +751,8 @@ &pcie2a_phy {
};
&pcie3a {
+ aspm-no-l0s;
+
perst-gpios = <&tlmm 151 GPIO_ACTIVE_LOW>;
wake-gpios = <&tlmm 148 GPIO_ACTIVE_LOW>;
--
2.43.0
There are indications that ASPM L0s is not working very well on this
machine so disable it also for the modem and Wi-Fi controllers for now.
This specifically avoids having the modem and Wi-Fi controllers bounce
in an out of L0s when not used (the modem still bounces in and out of
L1) as well as intermittent Correctable errors on the Wi-Fi link when
not used.
Fixes: 9f4f3dfad8cf ("PCI: qcom: Enable ASPM for platforms supporting 1.9.0 ops")
Cc: stable(a)vger.kernel.org # 6.7
Signed-off-by: Johan Hovold <johan+linaro(a)kernel.org>
---
arch/arm64/boot/dts/qcom/sc8280xp-crd.dts | 3 +++
1 file changed, 3 insertions(+)
diff --git a/arch/arm64/boot/dts/qcom/sc8280xp-crd.dts b/arch/arm64/boot/dts/qcom/sc8280xp-crd.dts
index 7e94a68d5d9f..8fc0380f65a0 100644
--- a/arch/arm64/boot/dts/qcom/sc8280xp-crd.dts
+++ b/arch/arm64/boot/dts/qcom/sc8280xp-crd.dts
@@ -546,6 +546,8 @@ &pcie2a_phy {
};
&pcie3a {
+ aspm-no-l0s;
+
perst-gpios = <&tlmm 151 GPIO_ACTIVE_LOW>;
wake-gpios = <&tlmm 148 GPIO_ACTIVE_LOW>;
@@ -566,6 +568,7 @@ &pcie3a_phy {
&pcie4 {
max-link-speed = <2>;
+ aspm-no-l0s;
perst-gpios = <&tlmm 141 GPIO_ACTIVE_LOW>;
wake-gpios = <&tlmm 139 GPIO_ACTIVE_LOW>;
--
2.43.0
While debugging the `X86_DECODER_SELFTEST` failure first reported in [1],
we noticed that the line numbers reported by the `insn_decoder_test` tool
do not correspond to the line in the output of `objdump_reformat.awk` that
was causing the failure:
# TEST posttest
llvm-objdump -d -j .text ./vmlinux | \
awk -f ./arch/x86/tools/objdump_reformat.awk | \
arch/x86/tools/insn_decoder_test -y -v
arch/x86/tools/insn_decoder_test: error: malformed line 1657116:
68db0
$ llvm-objdump -d -j .text ./vmlinux | \
awk -f ./arch/x86/tools/objdump_reformat.awk > objdump_reformat.txt
$ head -n `echo 1657116+2 | bc` objdump_reformat.txt | tail -n 5
ffffffff815430b1 41 8b 47 1c movl
ffffffff815430b5 89 c1 movl
ffffffff815430b7 81 c9 00 40 00 00 orl
ffffffff815430bd 41 89 4e 18 movl
ffffffff815430c1 a8 40 testb
These lines are perfectly fine. The reason is that the line count reported
by the tool only includes instruction lines, i.e., it does not count symbol
lines. This behavior was introduced in Commit 35039eb6b199 ("x86: Show
symbol name if insn decoder test failed"), which included symbol lines
in the output of the awk script. This broke the `instuction lines == total
lines` property without accounting for it in `insn_decoder_test.c`.
Add a new variable to count the combined (insn+symbol) line count and
report this in the error message. With this patch, the line reported by the
tool is the line causing the failure (long line wrapped at 75 chars):
# TEST posttest
llvm-objdump -d -j .text ./vmlinux | \
awk -f ./arch/x86/tools/objdump_reformat.awk | \
arch/x86/tools/insn_decoder_test -y -v
arch/x86/tools/insn_decoder_test: error: malformed line 1699686:
68db0
$ head -n ` echo 1699686+2 | bc` objdump_reformat.txt | tail -n 5
ffffffff81568dac c3 retq
<_RNvXsP_NtCs7qddEHlz8fK_4core3fmtRINtNtNtNtB7_4iter8adapters5chain5Chain
INtNtBA_7flatten7FlattenINtNtB7_6option8IntoIterNtNtB7_4char11EscapeDebug
EEINtB1a_7FlatMapNtNtNtB7_3str4iter5CharsB1T_NtB2D_23CharEscapeDebugConti
nueEENtB5_5Debug3fmtB7_>:ffffffff81568db0
ffffffff81568dad 0f 1f 00 nopl
ffffffff81568db0 f3 0f 1e fa endbr64
ffffffff81568db4 41 56 pushq
[In this case the line causing the failure is interpreted as two lines by
the tool (due to its length, but this is fixed by [1, 2]), and the second
line is reported. Still the spatial closeness between the reported line and
the line causing the failure would have made debugging a lot easier.]
Link: https://lore.kernel.org/lkml/Y9ES4UKl%2F+DtvAVS@gmail.com/T/ [1]
Link: https://lore.kernel.org/rust-for-linux/20231119180145.157455-1-sergio.colla… [2]
Fixes: 35039eb6b199 ("x86: Show symbol name if insn decoder test failed")
Reviewed-by: Miguel Ojeda <ojeda(a)kernel.org>
Tested-by: Miguel Ojeda <ojeda(a)kernel.org>
Reported-by: John Baublitz <john.m.baublitz(a)gmail.com>
Debugged-by: John Baublitz <john.m.baublitz(a)gmail.com>
Signed-off-by: Valentin Obst <kernel(a)valentinobst.de>
---
Changes in v2:
- Added tags 'Reviewed-by', 'Tested-by', 'Reported-by', 'Debugged-by',
'Link', and 'Fixes'.
- Explain why this patch fixes the commit mentioned in the 'Fixes' tag.
- CCed the stable list and sent to all x86 maintainers.
- Link to v1: https://lore.kernel.org/r/20240221-x86-insn-decoder-line-fix-v1-1-47cd5a171…
---
arch/x86/tools/insn_decoder_test.c | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/arch/x86/tools/insn_decoder_test.c b/arch/x86/tools/insn_decoder_test.c
index 472540aeabc2..727017a3c3c7 100644
--- a/arch/x86/tools/insn_decoder_test.c
+++ b/arch/x86/tools/insn_decoder_test.c
@@ -114,6 +114,7 @@ int main(int argc, char **argv)
unsigned char insn_buff[16];
struct insn insn;
int insns = 0;
+ int lines = 0;
int warnings = 0;
parse_args(argc, argv);
@@ -123,6 +124,8 @@ int main(int argc, char **argv)
int nb = 0, ret;
unsigned int b;
+ lines++;
+
if (line[0] == '<') {
/* Symbol line */
strcpy(sym, line);
@@ -134,12 +137,12 @@ int main(int argc, char **argv)
strcpy(copy, line);
tab1 = strchr(copy, '\t');
if (!tab1)
- malformed_line(line, insns);
+ malformed_line(line, lines);
s = tab1 + 1;
s += strspn(s, " ");
tab2 = strchr(s, '\t');
if (!tab2)
- malformed_line(line, insns);
+ malformed_line(line, lines);
*tab2 = '\0'; /* Characters beyond tab2 aren't examined */
while (s < tab2) {
if (sscanf(s, "%x", &b) == 1) {
---
base-commit: b401b621758e46812da61fa58a67c3fd8d91de0d
change-id: 20240221-x86-insn-decoder-line-fix-7b1f2e1732ff
Best regards,
--
Valentin Obst <kernel(a)valentinobst.de>
I'm announcing the release of the 5.10.210 kernel.
All users of the 5.10 kernel series must upgrade.
The updated 5.10.y git tree can be found at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git linux-5.10.y
and can be browsed at the normal kernel.org git web browser:
https://git.kernel.org/?p=linux/kernel/git/stable/linux-stable.git;a=summary
thanks,
greg k-h
------------
Makefile | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
Aaron Conole (1):
net: openvswitch: limit the number of recursions from action sets
Adrian Reber (1):
tty: allow TIOCSLCKTRMIOS with CAP_CHECKPOINT_RESTORE
Al Viro (2):
rename(): fix the locking of subdirectories
fast_dput(): handle underflows gracefully
Aleksander Mazur (1):
x86/Kconfig: Transmeta Crusoe is CPU family 5, not 6
Alex Henrie (1):
HID: apple: Add support for the 2021 Magic Keyboard
Alex Lyakas (1):
md: Whenassemble the array, consult the superblock of the freshest device
Alexander Stein (4):
ARM: dts: imx7d: Fix coresight funnel ports
ARM: dts: imx7s: Fix lcdif compatible
ARM: dts: imx7s: Fix nand-controller #size-cells
mmc: slot-gpio: Allow non-sleeping GPIO ro
Alexandra Winter (1):
s390/qeth: Fix potential loss of L3-IP@ in case of network issues
Alexey Dobriyan (1):
uapi: stddef.h: Fix __DECLARE_FLEX_ARRAY for C++
Alexey Khoroshilov (1):
ASoC: rt5645: Fix deadlock in rt5645_jack_detect_work()
Alfred Piccioni (1):
lsm: new security_file_ioctl_compat() hook
Amelie Delaunay (1):
dmaengine: fix NULL pointer in channel unregistration function
Andrii Nakryiko (1):
selftests/bpf: satisfy compiler by having explicit return in btf test
Andrii Staikov (1):
i40e: Fix VF disable behavior to block all traffic
Andrzej Hajda (1):
debugobjects: Stop accessing objects after releasing hash bucket lock
Andy Shevchenko (1):
mmc: mmc_spi: remove custom DMA mapped buffers
Anna Schumaker (1):
SUNRPC: Fix a suspicious RCU usage warning
Antoine Tenart (1):
tunnels: fix out of bounds access when building IPv6 PMTU error
Anton Ivanov (1):
um: Fix naming clash between UML and scheduler
Arjun Roy (1):
net-zerocopy: Refactor frag-is-remappable test.
Arnd Bergmann (1):
drm/exynos: fix accidental on-stack copy of exynos_drm_plane
Avri Altman (1):
mmc: core: Use mrq.sbc in close-ended ffu
Baokun Li (4):
ext4: unify the type of flexbg_size to unsigned int
ext4: remove unnecessary check from alloc_flex_gd()
ext4: avoid online resizing failures due to oversized flex bg
ext4: fix double-free of blocks due to wrong extents moved_len
Bart Van Assche (1):
scsi: core: Introduce enum scsi_disposition
Benjamin Berg (3):
wifi: cfg80211: free beacon_ies when overridden from hidden BSS
um: Don't use vfprintf() for os_info()
HID: apple: Add 2021 magic keyboard FN key mapping
Bernd Edlinger (1):
exec: Fix error handling in begin_new_exec()
Bjorn Helgaas (2):
PM: sleep: Use dev_printk() when possible
PCI/AER: Decode Requester ID when no error info found
Boris Burkov (2):
btrfs: forbid creating subvol qgroups
btrfs: forbid deleting live subvol qgroup
Breno Leitao (2):
net: sysfs: Fix /sys/class/net/<iface> path
net: sysfs: Fix /sys/class/net/<iface> path for statistics
Carlos Llamas (2):
binder: signal epoll threads of self-work
scripts/decode_stacktrace.sh: optionally use LLVM utilities
Chao Yu (1):
f2fs: fix to check return value of f2fs_reserve_new_block()
Charan Teja Kalla (1):
mm/sparsemem: fix race in accessing memory_section->usage
Chris Riches (1):
audit: Send netlink ACK before setting connection in auditd_set
Christian A. Ehrhardt (2):
of: unittest: Fix compile in the non-dynamic case
usb: ucsi_acpi: Fix command completion handling
Christoph Hellwig (1):
block: prevent an integer overflow in bvec_try_merge_hw_page
Christophe JAILLET (3):
ixgbe: Fix an error handling path in ixgbe_read_iosf_sb_reg_x550()
dmaengine: fsl-qdma: Fix a memory leak related to the status queue DMA
dmaengine: fsl-qdma: Fix a memory leak related to the queue command DMA
Chung-Chiang Cheng (1):
btrfs: tree-checker: fix inline ref size in error messages
Cristian Ciocaltea (1):
ASoC: doc: Fix undefined SND_SOC_DAPM_NOPM argument
Dan Carpenter (4):
drm/bridge: nxp-ptn3460: fix i2c_master_send() error checking
drm/bridge: nxp-ptn3460: simplify some error checking
netfilter: nf_tables: fix pointer math issue in nft_byteorder_eval()
PCI: dwc: Fix a 64bit bug in dw_pcie_ep_raise_msix_irq()
Daniel Basilio (1):
nfp: use correct macro for LengthSelect in BAR config
Daniel Lezcano (2):
units: change from 'L' to 'UL'
units: add the HZ macros
Daniel Stodden (1):
PCI: switchtec: Fix stdev_release() crash after surprise hot remove
Daniel Vacek (1):
IB/ipoib: Fix mcast list locking
Daniel de Villiers (1):
nfp: flower: prevent re-adding mac index for bonded port
Dave Airlie (1):
nouveau/vmm: don't set addr on the fail path to avoid warning
David Howells (2):
afs: Hide silly-rename files from userspace
rxrpc: Fix response to PING RESPONSE ACKs to a dead call
David Schiller (1):
staging: iio: ad5933: fix type mismatch regression
David Senoner (1):
ALSA: hda/realtek: Fix the external mic not being recognised for Acer Swift 1 SF114-32
David Sterba (2):
btrfs: don't warn if discard range is not aligned to sector
btrfs: send: return EOPNOTSUPP on unknown flags
Davidlohr Bueso (1):
hrtimer: Ignore slack time for RT tasks in schedule_hrtimeout_range()
Dmitry Antipov (1):
PNP: ACPI: fix fortify warning
Dmitry Baryshkov (1):
PM: runtime: add devm_pm_runtime_enable helper
Doug Berger (1):
irqchip/irq-brcmstb-l2: Add write memory barrier before exit
Douglas Anderson (2):
drm/exynos: Call drm_atomic_helper_shutdown() at shutdown/unbind time
PM: runtime: Have devm_pm_runtime_enable() handle pm_runtime_dont_use_autosuspend()
Edson Juliano Drosdeck (1):
ALSA: hda/realtek: Enable headset mic on Vaio VJFE-ADL
Edward Adam Davis (3):
jfs: fix uaf in jfs_evict_inode
jfs: fix array-index-out-of-bounds in diNewExt
wifi: cfg80211: fix RCU dereference in __cfg80211_bss_update
Ekansh Gupta (1):
misc: fastrpc: Mark all sessions as invalid in cb_remove
Emmanuel Grumbach (1):
wifi: iwlwifi: fix a memory corruption
Eric Dumazet (9):
llc: make llc_ui_sendmsg() more robust against bonding changes
ip6_tunnel: use dev_sw_netstats_rx_add()
ip6_tunnel: make sure to pull inner header in __ip6_tnl_rcv()
tcp: add sanity checks to rx zerocopy
llc: call sock_orphan() at release time
af_unix: fix lockdep positive in sk_diag_dump_icons()
inet: read sk->sk_family once in inet_recv_error()
ppp_async: limit MRU to 64K
net: prevent mss overflow in skb_segment()
Fabio Estevam (9):
ARM: dts: imx25/27-eukrea: Fix RTC node name
ARM: dts: imx: Use flash@0,0 pattern
ARM: dts: imx27: Fix sram node
ARM: dts: imx1: Fix sram node
ARM: dts: imx25: Fix the iim compatible string
ARM: dts: imx25/27: Pass timing0
ARM: dts: imx27-apf27dev: Fix LED name
ARM: dts: imx23-sansa: Use preferred i2c-gpios properties
ARM: dts: imx23/28: Fix the DMA controller node name
Fedor Pchelkin (3):
btrfs: ref-verify: free ref cache before clearing mount opt
drm/exynos: gsc: minor fix for loop iteration in gsc_runtime_resume
nfc: nci: free rx_data_reassembly skb on NCI device cleanup
Felix Kuehling (1):
drm/amdgpu: Let KFD sync with VM fences
Florian Fainelli (1):
net: bcmgenet: Fix EEE implementation
Florian Westphal (5):
netfilter: nf_tables: restrict anonymous set and map names to 16 bytes
netfilter: nf_tables: reject QUEUE/DROP verdict parameters
netfilter: nft_set_pipapo: store index in scratch maps
netfilter: nft_set_pipapo: add helper to release pcpu scratch area
netfilter: nft_set_pipapo: remove scratch_aligned pointer
Frank Li (5):
usb: cdns3: fix uvc failure work since sg support enabled
usb: cdns3: fix incorrect calculation of ep_buf_size when more than one config
usb: cdns3: fix iso transfer error when mult is not zero
usb: cdns3: Fix uvc fail when DMA cross 4k boundery since sg enabled
dmaengine: fix is_slave_direction() return false when DMA_DEV_TO_DEV
Frederic Weisbecker (1):
hrtimer: Report offline hrtimer enqueue
Frédéric Danis (1):
Bluetooth: L2CAP: Fix possible multiple reject send
Furong Xu (2):
net: stmmac: xgmac: fix handling of DPP safety error for DMA channels
net: stmmac: xgmac: fix a typo of register name in DPP safety handling
Gabriel Krisman Bertazi (1):
ecryptfs: Reject casefold directory inodes
Ghanshyam Agrawal (1):
media: stk1160: Fixed high volume of stk1160_dbg messages
Greg KH (1):
perf/core: Fix narrow startup race when creating the perf nr_addr_filters sysfs file
Greg Kroah-Hartman (1):
Linux 5.10.210
Guanhua Gao (1):
dmaengine: fsl-dpaa2-qdma: Fix the size of dma pools
Guenter Roeck (1):
MIPS: Add 'memory' clobber to csum_ipv6_magic() inline assembler
Guilherme G. Piccoli (1):
PCI: Only override AMD USB controller if required
Hannes Reinecke (2):
scsi: libfc: Don't schedule abort twice
scsi: libfc: Fix up timeout error in fc_fcp_rec_error()
Hans de Goede (1):
Input: atkbd - skip ATKBD_CMD_SETLEDS when skipping ATKBD_CMD_GETID
Hardik Gajjar (1):
usb: hub: Replace hardcoded quirk value with BIT() macro
Harshit Shah (1):
i3c: master: cdns: Update maximum prescaler value for i2c clock
Heiko Carstens (2):
s390/ptrace: handle setting of fpc register correctly
KVM: s390: fix setting of fpc register
Heiner Kallweit (2):
leds: trigger: panic: Don't register panic notifier if creating the trigger failed
i2c: i801: Remove i801_set_block_buffer_mode
Helge Deller (2):
parisc/firmware: Fix F-extend for PDC addresses
ipv6: Ensure natural alignment of const ipv6 loopback and router addresses
Herbert Xu (3):
crypto: api - Disallow identical driver names
hwrng: core - Fix page fault dead lock on mmap-ed hwrng
crypto: s390/aes - Fix buffer overread in CTR mode
Hongchen Zhang (1):
PM: hibernate: Enforce ordering during image compression/decompression
Hou Tao (2):
bpf: Add map and need_defer parameters to .map_fd_put_ptr()
bpf: Set uattr->batch.count as zero before batched update or deletion
Hugo Villeneuve (4):
serial: sc16is7xx: set safe default SPI clock frequency
serial: sc16is7xx: add check for unsupported SPI modes during probe
serial: max310x: set default value when reading clock ready bit
serial: max310x: improve crystal stable clock detection
Ian Rogers (1):
libsubcmd: Fix memory leak in uniq()
Ido Schimmel (1):
PCI: Add no PM reset quirk for NVIDIA Spectrum devices
Ilpo Järvinen (2):
serial: Add rs485_supported to uart_port
serial: 8250_exar: Fill in rs485_supported
Ilya Dryomov (1):
rbd: don't move requests to the running list on errors
Ivan Vecera (1):
i40e: Fix waiting for queues of all VSIs to be disabled
Jack Wang (1):
RDMA/IPoIB: Fix error code return in ipoib_mcast_join
JackBB Wu (1):
USB: serial: qcserial: add new usb-id for Dell Wireless DW5826e
Jaegeuk Kim (1):
f2fs: fix write pointers on zoned device after roll forward
Jai Luthra (1):
dmaengine: ti: k3-udma: Report short packet errors
Jakub Kicinski (1):
selftests: netdevsim: fix the udp_tunnel_nic test
Jan Beulich (1):
xen-netback: properly sync TX responses
Jason Gerecke (1):
HID: wacom: Do not register input devices until after hid_hw_start
Jean Delvare (1):
i2c: i801: Fix block process call transactions
Jedrzej Jagielski (2):
ixgbe: Refactor returning internal error codes
ixgbe: Refactor overtemp event handling
Jenishkumar Maheshbhai Patel (1):
net: mvpp2: clear BM pool before initialization
Jiangfeng Xiao (1):
powerpc/kasan: Fix addr error caused by page alignment
Jiri Wiesner (1):
clocksource: Skip watchdog check for large watchdog intervals
Johan Hovold (3):
arm64: dts: qcom: sdm845: fix USB wakeup interrupt types
arm64: dts: qcom: sdm845: fix USB DP/DM HS PHY interrupts
arm64: dts: qcom: sc7180: fix USB wakeup interrupt types
Johan Jonker (1):
ARM: dts: rockchip: fix rk3036 hdmi ports node
Johannes Berg (1):
wifi: mac80211: reload info pointer in ieee80211_tx_dequeue()
Jonathan Cameron (1):
iio:adc:ad7091r: Move exports into IIO_AD7091R namespace.
Jozsef Kadlecsik (2):
netfilter: ipset: fix performance regression in swap operation
netfilter: ipset: Missing gc cancellations fixed
Julian Wiedmann (1):
net/af_iucv: clean up a try_then_request_module()
Jun'ichi Nomura (1):
x86/boot: Ignore NMIs during very early boot
Junxiao Bi (1):
Revert "md/raid5: Wait for MD_SB_CHANGE_PENDING in raid5d"
Justin Tee (1):
scsi: lpfc: Fix possible file string name overflow when updating firmware
Kamal Dasu (1):
spi: bcm-qspi: fix SFDP BFPT read by usig mspi read
Kees Cook (3):
stddef: Introduce DECLARE_FLEX_ARRAY() helper
smb3: Replace smb2pdu 1-element arrays with flex-arrays
block/rnbd-srv: Check for unlikely string overflow
Kim Phillips (1):
crypto: ccp - Fix null pointer dereference in __sev_platform_shutdown_locked
Konrad Dybcio (2):
pmdomain: core: Move the unused cleanup to a _sync initcall
drm/msm/dsi: Enable runtime PM
Kuan-Wei Chiu (2):
clk: hi3620: Fix memory leak in hi3620_mmc_clk_init()
clk: mmp: pxa168: Fix memory leak in pxa168_clk_init()
Kuniyuki Iwashima (1):
llc: Drop support for ETH_P_TR_802_2.
Kunwu Chan (1):
powerpc/mm: Fix null-pointer dereference in pgtable_cache_add
Kuogee Hsieh (1):
drm/msm/dp: return correct Colorimetry for DP_TEST_DYNAMIC_RANGE_CEA case
Lee Duncan (1):
scsi: Revert "scsi: fcoe: Fix potential deadlock on &fip->ctlr_lock"
Leonard Dallmayr (1):
USB: serial: cp210x: add ID for IMST iM871A-USB
Li zeming (1):
PM: core: Remove unnecessary (void *) conversions
Lin Ma (1):
vlan: skip nested type that is not IFLA_VLAN_QOS_MAPPING
Lino Sanfilippo (1):
serial: 8250_exar: Set missing rs485_supported flag
Linus Torvalds (1):
sched/membarrier: reduce the ability to hammer on sys_membarrier
Loic Prylli (1):
hwmon: (aspeed-pwm-tacho) mutex for tach reading
Luka Guzenko (1):
ALSA: hda/realtek: Enable Mute LED on HP Laptop 14-fq0xxx
Lukas Schauer (1):
pipe: wakeup wr_wait after setting max_usage
Manas Ghandat (2):
jfs: fix slab-out-of-bounds Read in dtSearch
jfs: fix array-index-out-of-bounds in dbAdjTree
Mao Jinlong (2):
arm64: dts: qcom: msm8996: Fix 'in-ports' is a required property
arm64: dts: qcom: msm8998: Fix 'out-ports' is a required property
Marc Zyngier (1):
irqchip/gic-v3-its: Fix GICv4.1 VPE affinity update
Marcelo Schmitt (3):
iio: adc: ad7091r: Set alert bit in config register
iio: adc: ad7091r: Allow users to configure device events
iio: adc: ad7091r: Enable internal vref if external vref is not supplied
Mario Limonciello (3):
rtc: Adjust failure return code for cmos_set_alarm()
gpiolib: acpi: Ignore touchpad wakeup on GPD G1619-04
iio: accel: bma400: Fix a compilation problem
Mark Rutland (1):
drivers/perf: pmuv3: don't expose SW_INCR event in sysfs
Markus Niebel (1):
drm: panel-simple: add missing bus flags for Tianma tm070jvhg[30/33]
Masami Hiramatsu (Google) (1):
tracing/trigger: Fix to return error if failed to alloc snapshot
Matthew Wilcox (Oracle) (1):
block: Remove special-casing of compound pages
Max Kellermann (2):
fs/pipe: move check to pipe_has_watch_queue()
fs/kernfs/dir: obey S_ISGID
Meenakshikumar Somasundaram (1):
drm/amd/display: Fix tiled display misalignment
Michael Chan (1):
bnxt_en: Wait for FLR to complete during probe
Michael Ellerman (2):
powerpc: Fix build error due to is_valid_bugaddr()
powerpc/mm: Fix build failures due to arch_reserved_kernel_pages()
Michael Tretter (1):
media: rockchip: rga: fix swizzling for RGB formats
Miguel Ojeda (1):
scripts: decode_stacktrace: demangle Rust symbols
Mikulas Patocka (1):
dm: limit the number of targets and parameter size area
Ming Lei (3):
blk-mq: fix IO hang from sbitmap wakeup race
scsi: core: Move scsi_host_busy() out of host lock for waking up EH handler
scsi: core: Move scsi_host_busy() out of host lock if it is for per-command
Minsuk Kang (1):
wifi: ath9k: Fix potential array-index-out-of-bounds read in ath9k_htc_txstatus()
Mukesh Ojha (1):
PM / devfreq: Synchronize devfreq_monitor_[start/stop]
Nathan Chancellor (2):
um: net: Fix return type of uml_net_start_xmit()
kbuild: Fix changing ELF file type for output of gen_btf for big endian
Naveen N Rao (1):
powerpc/lib: Validate size for vector operations
Nikita Zhandarovich (1):
net: hsr: remove WARN_ONCE() in send_hsr_supervision_frame()
Niklas Cassel (1):
PCI: dwc: endpoint: Fix dw_pcie_ep_raise_msix_irq() alignment support
Nikolay Borisov (1):
btrfs: remove err variable from btrfs_delete_subvolume
Nuno Sa (1):
of: property: fix typo in io-channels
Oleg Nesterov (3):
afs: fix the usage of read_seqbegin_or_lock() in afs_lookup_volume_rcu()
afs: fix the usage of read_seqbegin_or_lock() in afs_find_server*()
rxrpc_find_service_conn_rcu: fix the usage of read_seqbegin_or_lock()
Oleksandr Tyshchenko (1):
xen/gntdev: Fix the abuse of underlying struct page in DMA-buf import
Oleksij Rempel (2):
spi: introduce SPI_MODE_X_MASK macro
can: j1939: Fix UAF in j1939_sk_match_filter during setsockopt(SO_J1939_FILTER)
Oliver Neukum (1):
USB: hub: check for alternate port before enabling A_ALT_HNP_SUPPORT
Omar Sandoval (2):
btrfs: don't abort filesystem when attempting to snapshot deleted subvolume
btrfs: avoid copying BTRFS_ROOT_SUBVOL_DEAD flag to snapshot of subvolume being deleted
Ondrej Mosnacek (1):
lsm: fix the logic in security_inode_getsecctx()
Osama Muhammad (2):
FS:JFS:UBSAN:array-index-out-of-bounds in dbAdjTree
UBSAN: array-index-out-of-bounds in dtSplitRoot
Pablo Neira Ayuso (8):
netfilter: nf_tables: validate NFPROTO_* family
netfilter: nft_chain_filter: handle NETDEV_UNREGISTER for inet/ingress basechain
netfilter: nf_log: replace BUG_ON by WARN_ON_ONCE when putting logger
netfilter: nft_ct: sanitize layer 3 and 4 protocol number in custom expectations
netfilter: nft_compat: reject unused compat flag
netfilter: nft_compat: restrict match/target protocol to u16
netfilter: nft_ct: reject direction for ct id
netfilter: nft_set_rbtree: skip end interval element from gc
Paolo Abeni (1):
selftests: net: avoid just another constant wait
Paolo Bonzini (2):
mm: vmalloc: introduce array allocation functions
KVM: use __vcalloc for very large allocations
Paul Cercueil (1):
ARM: dts: samsung: exynos4210-i9100: Unconditionally enable LDO12
Pawel Laszczak (1):
usb: cdns3: Fixes for sparse warnings
Peter Robinson (1):
mfd: ti_am335x_tscadc: Fix TI SoC dependencies
Peter Zijlstra (1):
perf: Fix the nr_addr_filters fix
Petr Pavlu (1):
tracing: Ensure visibility when inserting an element into tracing_map
Pierre-Louis Bossart (3):
PCI: add INTEL_HDA_ARL to pci_ids.h
ALSA: hda: Intel: add HDA_ARL PCI ID support
ALSA: hda: intel-dspcfg: add filters for ARL-S and ARL
Piotr Skajewski (1):
ixgbe: Remove non-inclusive language
Prarit Bhargava (1):
ACPI: extlog: fix NULL pointer dereference check
Prashanth K (1):
usb: host: xhci-plat: Add support for XHCI_SG_TRB_CACHE_SIZE_QUIRK
Prathu Baronia (1):
vhost: use kzalloc() instead of kmalloc() followed by memset()
Puliang Lu (1):
USB: serial: option: add Fibocom FM101-GL variant
Qiang Yu (1):
bus: mhi: host: Drop chan lock before queuing buffers
Qu Wenruo (2):
btrfs: defrag: reject unknown flags of btrfs_ioctl_defrag_range_args
btrfs: do not ASSERT() if the newly created subvolume already got read
Radek Krejci (1):
modpost: trim leading spaces when processing source files list
Rafael J. Wysocki (5):
async: Split async_schedule_node_domain()
async: Introduce async_schedule_dev_nocall()
PM: sleep: Avoid calling put_device() under dpm_list_mtx
PM: sleep: Fix possible deadlocks in core system-wide PM code
PM: sleep: Fix error handling in dpm_prepare()
Richard Palethorpe (1):
x86/entry/ia32: Ensure s32 is sign extended to s64
Rishabh Dave (1):
ceph: prevent use-after-free in encode_cap_msg()
Rob Clark (1):
drm/msm/dpu: Ratelimit framedone timeout msgs
Rolf Eike Beer (1):
mm: use __pfn_to_section() instead of open coding it
Rui Zhang (1):
regulator: core: Only increment use_count when enable_count changes
Ryusuke Konishi (4):
nilfs2: fix data corruption in dsync block recovery for small block sizes
nilfs2: fix hang in nilfs_lookup_dirty_data_buffers()
nilfs2: fix potential bug in end_buffer_async_write
nilfs2: replace WARN_ONs for invalid DAT metadata block requests
Salvatore Dipietro (1):
tcp: Add memory barrier to tcp_push()
Sandeep Maheswaram (1):
arm64: dts: qcom: sc7180: Use pdc interrupts for USB instead of GIC interrupts
Schspa Shi (1):
scripts/decode_stacktrace.sh: support old bash version
Sean Young (1):
media: rc: bpf attach/detach requires write permission
Serge Semin (1):
mips: Fix max_mapnr being uninitialized on early stages
Shannon Nelson (1):
ionic: pass opcode to devcmd_wait
Sharath Srinivasan (1):
net/rds: Fix UBSAN: array-index-out-of-bounds in rds_cmsg_recv
Shenwei Wang (1):
net: fec: fix the unhandled context fault from smmu
Shigeru Yoshida (1):
tipc: Check the bearer type before calling tipc_udp_nl_bearer_add()
Shiji Yang (1):
wifi: rt2x00: restart beacon queue when hardware reset
Shuai Xue (1):
ACPI: APEI: set memory failure flags as MF_ACTION_REQUIRED on synchronous events
Simon Horman (1):
net: stmmac: xgmac: use #define for string constants
Sjoerd Simons (1):
bus: moxtet: Add spi device table
Souradeep Chakrabarti (1):
hv_netvsc: Fix race condition between netvsc_probe and netvsc_remove
Srinivasan Shanmugam (3):
drm/amdgpu: Drop 'fence' check in 'to_amdgpu_amdkfd_fence()'
drm/amd/powerplay: Fix kzalloc parameter 'ATOM_Tonga_PPM_Table' in 'get_platform_power_management_table()'
drm/amdgpu: Release 'adev->pm.fw' before return in 'amdgpu_device_need_post()'
Stephen Boyd (1):
scripts/decode_stacktrace.sh: silence stderr messages from addr2line/nm
Stephen Rothwell (2):
powerpc: pmd_move_must_withdraw() is only needed for CONFIG_TRANSPARENT_HUGEPAGE
drm: using mul_u32_u32() requires linux/math64.h
Steve Wahl (1):
x86/mm/ident_map: Use gbpages only where full GB page should be mapped.
Steven Rostedt (Google) (2):
tracing: Fix wasted memory in saved_cmdlines logic
tracing: Inform kmemleak of saved_cmdlines allocation
Su Hui (3):
wifi: rtlwifi: rtl8723{be,ae}: using calculate_bit_shift()
media: ddbridge: fix an error code problem in ddb_probe
scsi: isci: Fix an error code problem in isci_io_request_build()
Suraj Jitindar Singh (1):
ext4: allow for the last group to be marked as trimmed
Takashi Iwai (1):
ALSA: hda: Refer to correct stream index at loops
Takashi Sakamoto (1):
firewire: core: correct documentation of fw_csr_string() kernel API
Tatsunosuke Tobita (1):
HID: wacom: generic: Avoid reporting a serial of '0' to userspace
Tejun Heo (1):
blk-iocost: Fix an UBSAN shift-out-of-bounds warning
Thomas Bourgoin (1):
crypto: stm32/crc32 - fix parsing list of devices
Tianjia Zhang (1):
crypto: lib/mpi - Fix unexpected pointer access in mpi_ec_init
Tim Chen (1):
tick/sched: Preserve number of idle sleeps across CPU hotplug events
Tobias Waldekranz (1):
net: dsa: mv88e6xxx: Fix mv88e6352_serdes_get_stats error path
Tomi Valkeinen (4):
drm/tidss: Fix atomic_flush check
drm/drm_file: fix use of uninitialized variable
drm/framebuffer: Fix use of uninitialized variable
drm/mipi-dsi: Fix detach call without attach
Tony Lindgren (1):
phy: ti: phy-omap-usb2: Fix NULL pointer dereference for SRP
Uwe Kleine-König (1):
spi: ppc4xx: Drop write-only variable
Vegard Nossum (1):
scripts/get_abi: fix source path leak
Ville Syrjälä (1):
drm: Don't unref the same fb many times by mistake due to deadlock handling
Vincent Donnefort (1):
ring-buffer: Clean ring_buffer_poll_wait() error return
Weichen Chen (1):
pstore/ram: Fix crash when setting number of cpus to an odd number
Wen Gu (1):
net/smc: fix illegal rmb_desc access in SMC-D connection dump
Wenhua Lin (1):
gpio: eic-sprd: Clear interrupt after set the interrupt type
Werner Fischer (1):
watchdog: it87_wdt: Keep WDTCTRL bit 3 unmodified for IT8784/IT8786
Werner Sembach (1):
Input: i8042 - fix strange behavior of touchpad on Clevo NS70PU
Xi Ruoyao (1):
mips: Call lose_fpu(0) before initializing fcr31 in mips_set_personality_nan
Xiang Yang (1):
Revert "arm64: Stash shadow stack pointer in the task struct on interrupt"
Xiaolei Wang (1):
rpmsg: virtio: Free driver_override when rpmsg_remove()
Xiubo Li (1):
ceph: fix deadlock or deadcode of misusing dget()
Ye Bin (1):
ext4: fix inconsistent between segment fstrim and full fstrim
Yevgeny Kliteynik (1):
net/mlx5: DR, Use the right GVMI number for drop action
Yonghong Song (1):
selftests/bpf: Fix pyperf180 compilation failure with clang18
Yoshihiro Shimoda (1):
phy: renesas: rcar-gen3-usb2: Fix returning wrong error code
Yuluo Qiu (1):
ACPI: video: Add quirk for the Colorful X15 AT 23 Laptop
Zach O'Keefe (1):
mm/writeback: fix possible divide-by-zero in wb_dirty_limits(), again
Zenm Chen (1):
wifi: rtl8xxxu: Add additional USB IDs for RTL8192EU devices
Zhang Rui (2):
hwmon: (coretemp) Fix out-of-bounds memory access
hwmon: (coretemp) Fix bogus core_id to attr name mapping
Zheng Wang (1):
media: mtk-jpeg: Fix use after free bug due to error path handling in mtk_jpeg_dec_device_run
Zhengchao Shao (5):
tcp: make sure init the accept_queue's spinlocks once
netlink: fix potential sleeping issue in mqueue_flush_file
ipv6: init the accept_queue's spinlocks in inet6_create
bonding: return -ENOMEM instead of BUG in alb_upper_dev_walk
bonding: remove print in bond_verify_device_path
Zhihao Cheng (1):
ubifs: ubifs_symlink: Fix memleak of inode->i_link in error path
Zhipeng Lu (5):
net/mlx5e: fix a double-free in arfs_create_groups
fjes: fix memleaks in fjes_hw_setup
net: ipv4: fix a memleak in ip_setup_cork
atm: idt77252: fix a memleak in open_card_ubr0
media: ir_toy: fix a memleak in irtoy_tx
Zhiquan Li (1):
x86/mce: Mark fatal MCE's page as poison to avoid panic in the kdump kernel
Zhu Yanjun (1):
virtio_net: Fix "‘%d’ directive writing between 1 and 11 bytes into a region of size 10" warnings
Zijun Hu (1):
Bluetooth: qca: Set both WIDEBAND_SPEECH and LE_STATES quirks for QCA2066
bo liu (1):
ALSA: hda/conexant: Add quirk for SWS JS201D
ching Huang (1):
scsi: arcmsr: Support new PCI device IDs 1883 and 1886
qizhong cheng (1):
PCI: mediatek: Clear interrupt status before dispatching handler
yuan linyu (1):
usb: f_mass_storage: forbid async queue when shutdown happen
zhili.liu (1):
iio: magnetometer: rm3100: add boundary check for the value read from RM3100_REG_TMRC
Hi,
Please include commit be80e9cdbca8 ("libbpf: Rename DECLARE_LIBBPF_OPTS
into LIBBPF_OPTS")
to the 5.15 stable branch.
Commit 3eefb2fbf4ec ("selftests/bpf: Test tail call counting with
bpf2bpf and data on stack")
introduced in v5.15.39 is dependent on it, and now building selftests
fails with:
...
linux/tools/testing/selftests/bpf/prog_tests/tailcalls.c: In function
‘test_tailcall_bpf2bpf_6’:
linux/tools/testing/selftests/bpf/prog_tests/tailcalls.c:822:9: warning:
implicit declaration of function ‘LIBBPF_OPTS’; did you mean
‘LIBBPF_API’? [-Wimplicit-function-declaration]
822 | LIBBPF_OPTS(bpf_test_run_opts, topts,
| ^~~~~~~~~~~
| LIBBPF_API
linux/tools/testing/selftests/bpf/prog_tests/tailcalls.c:822:21: error:
‘bpf_test_run_opts’ undeclared (first use in this function)
822 | LIBBPF_OPTS(bpf_test_run_opts, topts,
| ^~~~~~~~~~~~~~~~~
linux/tools/testing/selftests/bpf/prog_tests/tailcalls.c:822:21: note:
each undeclared identifier is reported only once for each function it
appears in
linux/tools/testing/selftests/bpf/prog_tests/tailcalls.c:822:40:
error: ‘topts’ undeclared (first use in this function)
822 | LIBBPF_OPTS(bpf_test_run_opts, topts,
| ^~~~~
tools/testing/selftests/bpf/prog_tests/tailcalls.c:823:17: error:
expected expression before ‘.’ token
823 | .data_in = &pkt_v4,
| ^
...
Thanks in advance,
Roxana
Fix kernel freeze reproduced when probing stmmac devices on kernel 4.19:
Upstream commit 474a31e13a4e9749fb3ee55794d69d0f17ee0998 to fix freeze and
upstream commit 8d72ab119f42f25abb393093472ae0ca275088b6 to apply the fix correctly.
Hi,
The commit `c1fc6484e1fb sched/rt: sysctl_sched_rr_timeslice show
default timeslice after reset` is a clean cherry-pick for v5.4+ kernels
and the commit `079be8fc6309 sched/rt: sysctl_sched_rr_timeslice show
default timeslice after reset` cleanly applicable for v6.1
These are trivial fixes which fix the parsing the negative values when
read from the userspace, but will also make LTP test `proc_sched_rt01.c`
happy instead of ignoring it.
-MNAdam
Due to a long-standing issue in driver core, drivers may not probe defer
after having registered child devices to avoid triggering a probe
deferral loop (see fbc35b45f9f6 ("Add documentation on meaning of
-EPROBE_DEFER")).
Move registration of the typec switch to after looking up clocks and
other resources.
Note that PHY creation can in theory also trigger a probe deferral when
a 'phy' supply is used. This does not seem to affect the QMP PHY driver
but the PHY subsystem should be reworked to address this (i.e. by
separating initialisation and registration of the PHY).
Fixes: 2851117f8f42 ("phy: qcom-qmp-combo: Introduce orientation switching")
Cc: stable(a)vger.kernel.org # 6.5
Cc: Bjorn Andersson <quic_bjorande(a)quicinc.com>
Signed-off-by: Johan Hovold <johan+linaro(a)kernel.org>
---
drivers/phy/qualcomm/phy-qcom-qmp-combo.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/drivers/phy/qualcomm/phy-qcom-qmp-combo.c b/drivers/phy/qualcomm/phy-qcom-qmp-combo.c
index e19d6a084f10..17c4ad7553a5 100644
--- a/drivers/phy/qualcomm/phy-qcom-qmp-combo.c
+++ b/drivers/phy/qualcomm/phy-qcom-qmp-combo.c
@@ -3562,10 +3562,6 @@ static int qmp_combo_probe(struct platform_device *pdev)
if (ret)
return ret;
- ret = qmp_combo_typec_switch_register(qmp);
- if (ret)
- return ret;
-
/* Check for legacy binding with child nodes. */
usb_np = of_get_child_by_name(dev->of_node, "usb3-phy");
if (usb_np) {
@@ -3585,6 +3581,10 @@ static int qmp_combo_probe(struct platform_device *pdev)
if (ret)
goto err_node_put;
+ ret = qmp_combo_typec_switch_register(qmp);
+ if (ret)
+ goto err_node_put;
+
ret = drm_aux_bridge_register(dev);
if (ret)
goto err_node_put;
--
2.43.0
Due to a long-standing issue in driver core, drivers may not probe defer
after having registered child devices to avoid triggering a probe
deferral loop (see fbc35b45f9f6 ("Add documentation on meaning of
-EPROBE_DEFER")).
This could potentially also trigger a bug in the DRM bridge
implementation which does not expect bridges to go away even if device
links may avoid triggering this (when enabled).
Move registration of the DRM aux bridge to after looking up clocks and
other resources.
Note that PHY creation can in theory also trigger a probe deferral when
a 'phy' supply is used. This does not seem to affect the QMP PHY driver
but the PHY subsystem should be reworked to address this (i.e. by
separating initialisation and registration of the PHY).
Fixes: 35921910bbd0 ("phy: qcom: qmp-combo: switch to DRM_AUX_BRIDGE")
Fixes: 1904c3f578dc ("phy: qcom-qmp-combo: Introduce drm_bridge")
Cc: stable(a)vger.kernel.org # 6.5
Cc: Bjorn Andersson <quic_bjorande(a)quicinc.com>
Cc: Dmitry Baryshkov <dmitry.baryshkov(a)linaro.org>
Signed-off-by: Johan Hovold <johan+linaro(a)kernel.org>
---
drivers/phy/qualcomm/phy-qcom-qmp-combo.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/drivers/phy/qualcomm/phy-qcom-qmp-combo.c b/drivers/phy/qualcomm/phy-qcom-qmp-combo.c
index 1ad10110dd25..e19d6a084f10 100644
--- a/drivers/phy/qualcomm/phy-qcom-qmp-combo.c
+++ b/drivers/phy/qualcomm/phy-qcom-qmp-combo.c
@@ -3566,10 +3566,6 @@ static int qmp_combo_probe(struct platform_device *pdev)
if (ret)
return ret;
- ret = drm_aux_bridge_register(dev);
- if (ret)
- return ret;
-
/* Check for legacy binding with child nodes. */
usb_np = of_get_child_by_name(dev->of_node, "usb3-phy");
if (usb_np) {
@@ -3589,6 +3585,10 @@ static int qmp_combo_probe(struct platform_device *pdev)
if (ret)
goto err_node_put;
+ ret = drm_aux_bridge_register(dev);
+ if (ret)
+ goto err_node_put;
+
pm_runtime_set_active(dev);
ret = devm_pm_runtime_enable(dev);
if (ret)
--
2.43.0
The PTDMA driver sets DMA masks in two different places for the same
device inconsistently. First call is in pt_pci_probe(), where it uses
48bit mask. The second call is in pt_dmaengine_register(), where it
uses a 64bit mask. Using 64bit dma mask causes IO_PAGE_FAULT errors
on DMA transfers between main memory and other devices.
Without the extra call it works fine. Additionally the second call
doesn't check the return value so it can silently fail.
Remove the superfluous dma_set_mask() call and only use 48bit mask.
Cc: stable(a)vger.kernel.org
Fixes: b0b4a6b10577 ("dmaengine: ptdma: register PTDMA controller as a DMA resource")
Signed-off-by: Tadeusz Struk <tstruk(a)gigaio.com>
---
drivers/dma/ptdma/ptdma-dmaengine.c | 2 --
1 file changed, 2 deletions(-)
diff --git a/drivers/dma/ptdma/ptdma-dmaengine.c b/drivers/dma/ptdma/ptdma-dmaengine.c
index 1aa65e5de0f3..f79240734807 100644
--- a/drivers/dma/ptdma/ptdma-dmaengine.c
+++ b/drivers/dma/ptdma/ptdma-dmaengine.c
@@ -385,8 +385,6 @@ int pt_dmaengine_register(struct pt_device *pt)
chan->vc.desc_free = pt_do_cleanup;
vchan_init(&chan->vc, dma_dev);
- dma_set_mask_and_coherent(pt->dev, DMA_BIT_MASK(64));
-
ret = dma_async_device_register(dma_dev);
if (ret)
goto err_reg;
--
2.43.2
Hello,
Do you need help with your ongoing or any new Project in the following?
*Followed are our key services*-
· Mobile app development (iOS and Android)
· Custom software development
Let me know if you need help with the above services and would be happy to
discuss over a no-obligation consultation on how you may plan your project
and how we may help.
Look forward to your reply.
Thanks & Regards
Anjali
For LUN crossing boundaries, it is handy to know what is the index of
the last page in a LUN. This helper will soon be reused. At the same
time I rename page_per_lun to ppl in the calling function to clarify the
lines.
Cc: stable(a)vger.kernel.org
Signed-off-by: Miquel Raynal <miquel.raynal(a)bootlin.com>
---
This is a dependency for the next patch, so I Cc'd stable on it as well.
---
drivers/mtd/nand/raw/nand_base.c | 16 +++++++++++-----
1 file changed, 11 insertions(+), 5 deletions(-)
diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c
index bcfd99a1699f..d6a27e08b112 100644
--- a/drivers/mtd/nand/raw/nand_base.c
+++ b/drivers/mtd/nand/raw/nand_base.c
@@ -1211,19 +1211,25 @@ static int nand_lp_exec_read_page_op(struct nand_chip *chip, unsigned int page,
return nand_exec_op(chip, &op);
}
+static unsigned int rawnand_last_page_of_lun(unsigned int pages_per_lun, unsigned int lun)
+{
+ /* lun is expected to be very small */
+ return (lun * pages_per_lun) + pages_per_lun - 1;
+}
+
static void rawnand_cap_cont_reads(struct nand_chip *chip)
{
struct nand_memory_organization *memorg;
- unsigned int pages_per_lun, first_lun, last_lun;
+ unsigned int ppl, first_lun, last_lun;
memorg = nanddev_get_memorg(&chip->base);
- pages_per_lun = memorg->pages_per_eraseblock * memorg->eraseblocks_per_lun;
- first_lun = chip->cont_read.first_page / pages_per_lun;
- last_lun = chip->cont_read.last_page / pages_per_lun;
+ ppl = memorg->pages_per_eraseblock * memorg->eraseblocks_per_lun;
+ first_lun = chip->cont_read.first_page / ppl;
+ last_lun = chip->cont_read.last_page / ppl;
/* Prevent sequential cache reads across LUN boundaries */
if (first_lun != last_lun)
- chip->cont_read.pause_page = first_lun * pages_per_lun + pages_per_lun - 1;
+ chip->cont_read.pause_page = rawnand_last_page_of_lun(ppl, first_lun);
else
chip->cont_read.pause_page = chip->cont_read.last_page;
}
--
2.34.1
From: Rob Clark <robdclark(a)chromium.org>
We need to bail out before adding/removing devices if we are going to
-EPROBE_DEFER. Otherwise boot can get stuck in a probe deferral loop due
to a long-standing issue in driver core (see fbc35b45f9f6 ("Add
documentation on meaning of -EPROBE_DEFER")).
Deregistering the altmode child device can potentially also trigger bugs
in the DRM bridge implementation, which does not expect bridges to go
away.
Suggested-by: Dmitry Baryshkov <dmitry.baryshkov(a)linaro.org>
Signed-off-by: Rob Clark <robdclark(a)chromium.org>
Link: https://lore.kernel.org/r/20231213210644.8702-1-robdclark@gmail.com
[ johan: rebase on 6.8-rc4, amend commit message and mention DRM ]
Fixes: 58ef4ece1e41 ("soc: qcom: pmic_glink: Introduce base PMIC GLINK driver")
Cc: stable(a)vger.kernel.org # 6.3
Cc: Bjorn Andersson <andersson(a)kernel.org>
Signed-off-by: Johan Hovold <johan+linaro(a)kernel.org>
---
drivers/soc/qcom/pmic_glink.c | 21 +++++++++++----------
1 file changed, 11 insertions(+), 10 deletions(-)
diff --git a/drivers/soc/qcom/pmic_glink.c b/drivers/soc/qcom/pmic_glink.c
index f4bfd24386f1..f913e9bd57ed 100644
--- a/drivers/soc/qcom/pmic_glink.c
+++ b/drivers/soc/qcom/pmic_glink.c
@@ -265,10 +265,17 @@ static int pmic_glink_probe(struct platform_device *pdev)
pg->client_mask = *match_data;
+ pg->pdr = pdr_handle_alloc(pmic_glink_pdr_callback, pg);
+ if (IS_ERR(pg->pdr)) {
+ ret = dev_err_probe(&pdev->dev, PTR_ERR(pg->pdr),
+ "failed to initialize pdr\n");
+ return ret;
+ }
+
if (pg->client_mask & BIT(PMIC_GLINK_CLIENT_UCSI)) {
ret = pmic_glink_add_aux_device(pg, &pg->ucsi_aux, "ucsi");
if (ret)
- return ret;
+ goto out_release_pdr_handle;
}
if (pg->client_mask & BIT(PMIC_GLINK_CLIENT_ALTMODE)) {
ret = pmic_glink_add_aux_device(pg, &pg->altmode_aux, "altmode");
@@ -281,17 +288,11 @@ static int pmic_glink_probe(struct platform_device *pdev)
goto out_release_altmode_aux;
}
- pg->pdr = pdr_handle_alloc(pmic_glink_pdr_callback, pg);
- if (IS_ERR(pg->pdr)) {
- ret = dev_err_probe(&pdev->dev, PTR_ERR(pg->pdr), "failed to initialize pdr\n");
- goto out_release_aux_devices;
- }
-
service = pdr_add_lookup(pg->pdr, "tms/servreg", "msm/adsp/charger_pd");
if (IS_ERR(service)) {
ret = dev_err_probe(&pdev->dev, PTR_ERR(service),
"failed adding pdr lookup for charger_pd\n");
- goto out_release_pdr_handle;
+ goto out_release_aux_devices;
}
mutex_lock(&__pmic_glink_lock);
@@ -300,8 +301,6 @@ static int pmic_glink_probe(struct platform_device *pdev)
return 0;
-out_release_pdr_handle:
- pdr_handle_release(pg->pdr);
out_release_aux_devices:
if (pg->client_mask & BIT(PMIC_GLINK_CLIENT_BATT))
pmic_glink_del_aux_device(pg, &pg->ps_aux);
@@ -311,6 +310,8 @@ static int pmic_glink_probe(struct platform_device *pdev)
out_release_ucsi_aux:
if (pg->client_mask & BIT(PMIC_GLINK_CLIENT_UCSI))
pmic_glink_del_aux_device(pg, &pg->ucsi_aux);
+out_release_pdr_handle:
+ pdr_handle_release(pg->pdr);
return ret;
}
--
2.43.0
On Wed, Feb 21, 2024 at 10:00 AM Valentin Obst <kernel(a)valentinobst.de> wrote:
>
> While debugging the `X86_DECODER_SELFTEST` failure first reported in [1],
> [In this case the line causing the failure is interpreted as two lines by
> the tool (due to its length, but this is fixed by [1, 2]), and the second
> line is reported. Still the spatial closeness between the reported line and
> the line causing the failure would have made debugging a lot easier.]
Thanks Valentin, John et al. for digging into this (and the related
issue) -- very much appreciated.
It looks good to me:
Reviewed-by: Miguel Ojeda <ojeda(a)kernel.org>
Tested-by: Miguel Ojeda <ojeda(a)kernel.org>
This should probably have a Fixes tag -- from a quick look, the
original test did not seem to have the problem because `insns` was
equivalent to the number of lines since there was no `if ... {
continue; }` for the symbol case. At some point that branch was added,
so that was not true anymore, thus that one should probably be the
Fixes tag, though please double-check:
Fixes: 35039eb6b199 ("x86: Show symbol name if insn decoder test failed")
It is a minor issue for sure, so perhaps not worth backporting, but
nevertheless the hash is in a very old kernel, and thus the issue
applies to all stable kernels. So it does not hurt flagging it to the
stable team:
Cc: stable(a)vger.kernel.org
In addition, John reported the original issue, but this one was found
due to that one, and I am not exactly sure who did what here. Please
consider whether one of the following (or similar) may be fair:
Reported-by: John Baublitz <john.m.baublitz(a)gmail.com>
Debugged-by: John Baublitz <john.m.baublitz(a)gmail.com>
An extra Link to the discussion in Zulip could be nice too:
Link: https://rust-for-linux.zulipchat.com/#narrow/stream/291565-Help/topic/insn_…
Finally, a nit: links are typically written like the following -- you
can still use bracket references at the end:
Link: https://lore.kernel.org/lkml/Y9ES4UKl%2F+DtvAVS@gmail.com/T/ [1]
Link: https://lore.kernel.org/rust-for-linux/20231119180145.157455-1-sergio.colla…
[2]
Cheers,
Miguel
The dwc3->gadget_driver is not initialized during the dwc3 probe
process. This leads to a warning when the runtime power management (PM)
attempts to suspend the gadget using dwc3_gadget_suspend().
This patch adds a check to prevent the warning.
Cc: stable(a)vger.kernel.org
Fixes: 61a348857e86 ("usb: dwc3: gadget: Fix NULL pointer dereference in dwc3_gadget_suspend")
Signed-off-by: Ray Chi <raychi(a)google.com>
---
drivers/usb/dwc3/gadget.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c
index 28f49400f3e8..de987cffe1ec 100644
--- a/drivers/usb/dwc3/gadget.c
+++ b/drivers/usb/dwc3/gadget.c
@@ -4708,6 +4708,9 @@ int dwc3_gadget_suspend(struct dwc3 *dwc)
unsigned long flags;
int ret;
+ if (!dwc->gadget_driver)
+ return 0;
+
ret = dwc3_gadget_soft_disconnect(dwc);
if (ret)
goto err;
--
2.44.0.rc0.258.g7320e95886-goog
psci_init_system_suspend() invokes suspend_set_ops() very early during
bootup even before kernel command line for mem_sleep_default is setup.
This leads to kernel command line mem_sleep_default=s2idle not working
as mem_sleep_current gets changed to deep via suspend_set_ops() and never
changes back to s2idle.
Move psci_init_system_suspend() to late_initcall() to make sure kernel
command line mem_sleep_default=s2idle sets up s2idle as default suspend
mode.
Fixes: faf7ec4a92c0 ("drivers: firmware: psci: add system suspend support")
CC: stable(a)vger.kernel.org # 5.15+
Signed-off-by: Maulik Shah <quic_mkshah(a)quicinc.com>
---
drivers/firmware/psci/psci.c | 13 ++++++++++---
1 file changed, 10 insertions(+), 3 deletions(-)
diff --git a/drivers/firmware/psci/psci.c b/drivers/firmware/psci/psci.c
index d9629ff87861..655a2db70a67 100644
--- a/drivers/firmware/psci/psci.c
+++ b/drivers/firmware/psci/psci.c
@@ -523,18 +523,26 @@ static void __init psci_init_system_reset2(void)
psci_system_reset2_supported = true;
}
-static void __init psci_init_system_suspend(void)
+static int __init psci_init_system_suspend(void)
{
int ret;
+ u32 ver;
if (!IS_ENABLED(CONFIG_SUSPEND))
- return;
+ return 0;
+
+ ver = psci_0_2_get_version();
+ if (PSCI_VERSION_MAJOR(ver) < 1)
+ return 0;
ret = psci_features(PSCI_FN_NATIVE(1_0, SYSTEM_SUSPEND));
if (ret != PSCI_RET_NOT_SUPPORTED)
suspend_set_ops(&psci_suspend_ops);
+
+ return ret;
}
+late_initcall(psci_init_system_suspend)
static void __init psci_init_cpu_suspend(void)
{
@@ -651,7 +659,6 @@ static int __init psci_probe(void)
if (PSCI_VERSION_MAJOR(ver) >= 1) {
psci_init_smccc();
psci_init_cpu_suspend();
- psci_init_system_suspend();
psci_init_system_reset2();
kvm_init_hyp_services();
}
---
base-commit: d37e1e4c52bc60578969f391fb81f947c3e83118
change-id: 20240219-suspend_ops_late_init-27fb0b15baee
Best regards,
--
Maulik Shah <quic_mkshah(a)quicinc.com>
I'm new here, first time reporting a regression, apologies in advance if
I'm doing something wrong of if this was already reported (I found some
CIFS issues but not exactly this one).
I'm using x86-64 Arch Linux and LTS kernel (6.1.71 as I write this) and
I noticed a regression that I could reproduce in other boxes with other
architectures as well (aarch64 with 6.1.70).
# mount.cifs //server/share /mnt
# mount
//server/share on /mnt type cifs (rw,relatime,vers=3.1.1...)
# cd /mnt
# df .
df: .: Resource temporarily unavailable
# ls -al
ls: .: Resource temporarily unavailable
ls: file1: Resource temporarily unavailable
ls: file2: Resource temporarily unavailable
[...then ls shows the listing...]
If I use strace with df, the problem is:
statfs(".", 0x.....) = -1 EAGAIN (Resource temporarily unavailable)
And with ls:
listxattr(".", 0x..., 152): -1 EAGAIN (Resource temporarily unavailable)
listxattr("file1", ..., 152): -1 EAGAIN (same as above)
...
Initially I thought the problem was with the Samba server and/or the
client mount flags, but I've spent a day trying a *lot* of different
combinations and nothing worked. This happens with any share that I try,
and I've tried mounting shares from multiple Linux boxes running
different Samba and kernel versions.
Then I tried changing kernel versions at my client box. I booted latest
6.6.9 and the problem simply disappeared. My Debian server with 6.5.11
also doesn't have it. I then started a VM and tried a "bisection" of
6.1.x versions, leading to kernel 6.1.70 when this started to happen.
6.1.69 and older look fine.
I hope that this is enough information to reproduce this issue. I will
be glad to provide more info if necessary.
// Leonardo.
When debugging issues with a workload using SysV shmem, Michal Hocko has
come up with a reproducer that shows how a series of mprotect()
operations can result in an elevated shm_nattch and thus leak of the
resource.
The problem is caused by wrong assumptions in vma_merge() commit
714965ca8252 ("mm/mmap: start distinguishing if vma can be removed in
mergeability test"). The shmem vmas have a vma_ops->close callback
that decrements shm_nattch, and we remove the vma without calling it.
vma_merge() has thus historically avoided merging vma's with
vma_ops->close and commit 714965ca8252 was supposed to keep it that way.
It relaxed the checks for vma_ops->close in can_vma_merge_after()
assuming that it is never called on a vma that would be a candidate for
removal. However, the vma_merge() code does also use the result of this
check in the decision to remove a different vma in the merge case 7.
A robust solution would be to refactor vma_merge() code in a way that
the vma_ops->close check is only done for vma's that are actually going
to be removed, and not as part of the preliminary checks. That would
both solve the existing bug, and also allow additional merges that the
checks currently prevent unnecessarily in some cases.
However to fix the existing bug first with a minimized risk, and for
easier stable backports, this patch only adds a vma_ops->close check to
the buggy case 7 specifically. All other cases of vma removal are
covered by the can_vma_merge_before() check that includes the test for
vma_ops->close.
The reproducer code, adapted from Michal Hocko's code:
int main(int argc, char *argv[]) {
int segment_id;
size_t segment_size = 20 * PAGE_SIZE;
char * sh_mem;
struct shmid_ds shmid_ds;
key_t key = 0x1234;
segment_id = shmget(key, segment_size,
IPC_CREAT | IPC_EXCL | S_IRUSR | S_IWUSR);
sh_mem = (char *)shmat(segment_id, NULL, 0);
mprotect(sh_mem + 2*PAGE_SIZE, PAGE_SIZE, PROT_NONE);
mprotect(sh_mem + PAGE_SIZE, PAGE_SIZE, PROT_WRITE);
mprotect(sh_mem + 2*PAGE_SIZE, PAGE_SIZE, PROT_WRITE);
shmdt(sh_mem);
shmctl(segment_id, IPC_STAT, &shmid_ds);
printf("nattch after shmdt(): %lu (expected: 0)\n", shmid_ds.shm_nattch);
if (shmctl(segment_id, IPC_RMID, 0))
printf("IPCRM failed %d\n", errno);
return (shmid_ds.shm_nattch) ? 1 : 0;
}
Fixes: 714965ca8252 ("mm/mmap: start distinguishing if vma can be removed in mergeability test")
Signed-off-by: Vlastimil Babka <vbabka(a)suse.cz>
Reported-by: Michal Hocko <mhocko(a)suse.com>
Cc: Liam R. Howlett <Liam.Howlett(a)oracle.com>
Cc: Lorenzo Stoakes <lstoakes(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
---
v2: deduplicate code, per Lorenzo
mm/mmap.c | 10 +++++++++-
1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/mm/mmap.c b/mm/mmap.c
index d89770eaab6b..3281287771c9 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -954,13 +954,21 @@ static struct vm_area_struct
} else if (merge_prev) { /* case 2 */
if (curr) {
vma_start_write(curr);
- err = dup_anon_vma(prev, curr, &anon_dup);
if (end == curr->vm_end) { /* case 7 */
+ /*
+ * can_vma_merge_after() assumed we would not be
+ * removing prev vma, so it skipped the check
+ * for vm_ops->close, but we are removing curr
+ */
+ if (curr->vm_ops && curr->vm_ops->close)
+ err = -EINVAL;
remove = curr;
} else { /* case 5 */
adjust = curr;
adj_start = (end - curr->vm_start);
}
+ if (!err)
+ err = dup_anon_vma(prev, curr, &anon_dup);
}
} else { /* merge_next */
vma_start_write(next);
--
2.43.1
From: Fabio Estevam <festevam(a)denx.de>
Since commit bfac19e239a7 ("fbdev: mx3fb: Remove the driver") backlight
is no longer functional.
The fbdev mx3fb driver used to automatically select
CONFIG_BACKLIGHT_CLASS_DEVICE.
Now that the mx3fb driver has been removed, enable the
CONFIG_BACKLIGHT_CLASS_DEVICE option so that backlight can still work
by default.
Tested on a imx6dl-sabresd board.
Cc: stable(a)vger.kernel.org
Fixes: bfac19e239a7 ("fbdev: mx3fb: Remove the driver")
Signed-off-by: Fabio Estevam <festevam(a)denx.de>
---
arch/arm/configs/imx_v6_v7_defconfig | 1 +
1 file changed, 1 insertion(+)
diff --git a/arch/arm/configs/imx_v6_v7_defconfig b/arch/arm/configs/imx_v6_v7_defconfig
index 0a90583f9f01..8f9dbe8d9029 100644
--- a/arch/arm/configs/imx_v6_v7_defconfig
+++ b/arch/arm/configs/imx_v6_v7_defconfig
@@ -297,6 +297,7 @@ CONFIG_FB_MODE_HELPERS=y
CONFIG_LCD_CLASS_DEVICE=y
CONFIG_LCD_L4F00242T03=y
CONFIG_LCD_PLATFORM=y
+CONFIG_BACKLIGHT_CLASS_DEVICE=y
CONFIG_BACKLIGHT_PWM=y
CONFIG_BACKLIGHT_GPIO=y
CONFIG_FRAMEBUFFER_CONSOLE=y
--
2.34.1
The quilt patch titled
Subject: fat: fix uninitialized field in nostale filehandles
has been removed from the -mm tree. Its filename was
fat-fix-uninitialized-field-in-nostale-filehandles.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Jan Kara <jack(a)suse.cz>
Subject: fat: fix uninitialized field in nostale filehandles
Date: Mon, 5 Feb 2024 13:26:26 +0100
When fat_encode_fh_nostale() encodes file handle without a parent it
stores only first 10 bytes of the file handle. However the length of the
file handle must be a multiple of 4 so the file handle is actually 12
bytes long and the last two bytes remain uninitialized. This is not
great at we potentially leak uninitialized information with the handle
to userspace. Properly initialize the full handle length.
Link: https://lkml.kernel.org/r/20240205122626.13701-1-jack@suse.cz
Reported-by: syzbot+3ce5dea5b1539ff36769(a)syzkaller.appspotmail.com
Fixes: ea3983ace6b7 ("fat: restructure export_operations")
Signed-off-by: Jan Kara <jack(a)suse.cz>
Acked-by: OGAWA Hirofumi <hirofumi(a)mail.parknet.co.jp>
Cc: Amir Goldstein <amir73il(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/fat/nfs.c | 6 ++++++
1 file changed, 6 insertions(+)
--- a/fs/fat/nfs.c~fat-fix-uninitialized-field-in-nostale-filehandles
+++ a/fs/fat/nfs.c
@@ -130,6 +130,12 @@ fat_encode_fh_nostale(struct inode *inod
fid->parent_i_gen = parent->i_generation;
type = FILEID_FAT_WITH_PARENT;
*lenp = FAT_FID_SIZE_WITH_PARENT;
+ } else {
+ /*
+ * We need to initialize this field because the fh is actually
+ * 12 bytes long
+ */
+ fid->parent_i_pos_hi = 0;
}
return type;
_
Patches currently in -mm which might be from jack(a)suse.cz are
shmem-properly-report-quota-mount-options.patch
The quilt patch titled
Subject: bounds: support non-power-of-two CONFIG_NR_CPUS
has been removed from the -mm tree. Its filename was
bounds-support-non-power-of-two-config_nr_cpus.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: "Matthew Wilcox (Oracle)" <willy(a)infradead.org>
Subject: bounds: support non-power-of-two CONFIG_NR_CPUS
Date: Tue, 10 Oct 2023 15:55:49 +0100
ilog2() rounds down, so for example when PowerPC 85xx sets CONFIG_NR_CPUS
to 24, we will only allocate 4 bits to store the number of CPUs instead of
5. Use bits_per() instead, which rounds up. Found by code inspection.
The effect of this would probably be a misaccounting when doing NUMA
balancing, so to a user, it would only be a performance penalty. The
effects may be more wide-spread; it's hard to tell.
Link: https://lkml.kernel.org/r/20231010145549.1244748-1-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Fixes: 90572890d202 ("mm: numa: Change page last {nid,pid} into {cpu,pid}")
Reviewed-by: Rik van Riel <riel(a)surriel.com>
Acked-by: Mel Gorman <mgorman(a)techsingularity.net>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Ingo Molnar <mingo(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
kernel/bounds.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/kernel/bounds.c~bounds-support-non-power-of-two-config_nr_cpus
+++ a/kernel/bounds.c
@@ -19,7 +19,7 @@ int main(void)
DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
#ifdef CONFIG_SMP
- DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
+ DEFINE(NR_CPUS_BITS, bits_per(CONFIG_NR_CPUS));
#endif
DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
#ifdef CONFIG_LRU_GEN
_
Patches currently in -mm which might be from willy(a)infradead.org are
writeback-remove-a-duplicate-prototype-for-tag_pages_for_writeback.patch
writeback-factor-folio_prepare_writeback-out-of-write_cache_pages.patch
writeback-factor-writeback_get_batch-out-of-write_cache_pages.patch
writeback-simplify-the-loops-in-write_cache_pages.patch
pagevec-add-ability-to-iterate-a-queue.patch
writeback-use-the-folio_batch-queue-iterator.patch
writeback-move-the-folio_prepare_writeback-loop-out-of-write_cache_pages.patch
writeback-remove-a-use-of-write_cache_pages-from-do_writepages.patch
Hi,
this series does basically two things:
1. Disables automatic load balancing as adviced by the hardware
workaround.
2. Assigns all the CCS slices to one single user engine. The user
will then be able to query only one CCS engine
Changelog
=========
- In Patch 1 use the correct workaround number (thanks Matt).
- In Patch 2 do not add the extra CCS engines to the exposed UABI
engine list and adapt the engine counting accordingly (thanks
Tvrtko).
- Reword the commit of Patch 2 (thanks John).
Andi Shyti (2):
drm/i915/gt: Disable HW load balancing for CCS
drm/i915/gt: Enable only one CCS for compute workload
drivers/gpu/drm/i915/gt/intel_engine_user.c | 9 +++++++++
drivers/gpu/drm/i915/gt/intel_gt.c | 11 +++++++++++
drivers/gpu/drm/i915/gt/intel_gt_regs.h | 3 +++
drivers/gpu/drm/i915/gt/intel_workarounds.c | 6 ++++++
drivers/gpu/drm/i915/i915_query.c | 1 +
5 files changed, 30 insertions(+)
--
2.43.0
The patch titled
Subject: mm, mmap: fix vma_merge() case 7 with vma_ops->close
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-mmap-fix-vma_merge-case-7-with-vma_ops-close.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Vlastimil Babka <vbabka(a)suse.cz>
Subject: mm, mmap: fix vma_merge() case 7 with vma_ops->close
Date: Thu, 22 Feb 2024 22:59:31 +0100
When debugging issues with a workload using SysV shmem, Michal Hocko has
come up with a reproducer that shows how a series of mprotect() operations
can result in an elevated shm_nattch and thus leak of the resource.
The problem is caused by wrong assumptions in vma_merge() commit
714965ca8252 ("mm/mmap: start distinguishing if vma can be removed in
mergeability test"). The shmem vmas have a vma_ops->close callback that
decrements shm_nattch, and we remove the vma without calling it.
vma_merge() has thus historically avoided merging vma's with
vma_ops->close and commit 714965ca8252 was supposed to keep it that way.
It relaxed the checks for vma_ops->close in can_vma_merge_after() assuming
that it is never called on a vma that would be a candidate for removal.
However, the vma_merge() code does also use the result of this check in
the decision to remove a different vma in the merge case 7.
A robust solution would be to refactor vma_merge() code in a way that the
vma_ops->close check is only done for vma's that are actually going to be
removed, and not as part of the preliminary checks. That would both solve
the existing bug, and also allow additional merges that the checks
currently prevent unnecessarily in some cases.
However to fix the existing bug first with a minimized risk, and for
easier stable backports, this patch only adds a vma_ops->close check to
the buggy case 7 specifically. All other cases of vma removal are covered
by the can_vma_merge_before() check that includes the test for
vma_ops->close.
The reproducer code, adapted from Michal Hocko's code:
int main(int argc, char *argv[]) {
int segment_id;
size_t segment_size = 20 * PAGE_SIZE;
char * sh_mem;
struct shmid_ds shmid_ds;
key_t key = 0x1234;
segment_id = shmget(key, segment_size,
IPC_CREAT | IPC_EXCL | S_IRUSR | S_IWUSR);
sh_mem = (char *)shmat(segment_id, NULL, 0);
mprotect(sh_mem + 2*PAGE_SIZE, PAGE_SIZE, PROT_NONE);
mprotect(sh_mem + PAGE_SIZE, PAGE_SIZE, PROT_WRITE);
mprotect(sh_mem + 2*PAGE_SIZE, PAGE_SIZE, PROT_WRITE);
shmdt(sh_mem);
shmctl(segment_id, IPC_STAT, &shmid_ds);
printf("nattch after shmdt(): %lu (expected: 0)\n", shmid_ds.shm_nattch);
if (shmctl(segment_id, IPC_RMID, 0))
printf("IPCRM failed %d\n", errno);
return (shmid_ds.shm_nattch) ? 1 : 0;
}
Link: https://lkml.kernel.org/r/20240222215930.14637-2-vbabka@suse.cz
Fixes: 714965ca8252 ("mm/mmap: start distinguishing if vma can be removed in mergeability test")
Signed-off-by: Vlastimil Babka <vbabka(a)suse.cz>
Reported-by: Michal Hocko <mhocko(a)suse.com>
Reviewed-by: Lorenzo Stoakes <lstoakes(a)gmail.com>
Cc: Liam R. Howlett <Liam.Howlett(a)oracle.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/mmap.c | 10 +++++++++-
1 file changed, 9 insertions(+), 1 deletion(-)
--- a/mm/mmap.c~mm-mmap-fix-vma_merge-case-7-with-vma_ops-close
+++ a/mm/mmap.c
@@ -954,13 +954,21 @@ static struct vm_area_struct
} else if (merge_prev) { /* case 2 */
if (curr) {
vma_start_write(curr);
- err = dup_anon_vma(prev, curr, &anon_dup);
if (end == curr->vm_end) { /* case 7 */
+ /*
+ * can_vma_merge_after() assumed we would not be
+ * removing prev vma, so it skipped the check
+ * for vm_ops->close, but we are removing curr
+ */
+ if (curr->vm_ops && curr->vm_ops->close)
+ err = -EINVAL;
remove = curr;
} else { /* case 5 */
adjust = curr;
adj_start = (end - curr->vm_start);
}
+ if (!err)
+ err = dup_anon_vma(prev, curr, &anon_dup);
}
} else { /* merge_next */
vma_start_write(next);
_
Patches currently in -mm which might be from vbabka(a)suse.cz are
mm-vmscan-prevent-infinite-loop-for-costly-gfp_noio-__gfp_retry_mayfail-allocations.patch
mm-mmap-fix-vma_merge-case-7-with-vma_ops-close.patch
When debugging issues with a workload using SysV shmem, Michal Hocko has
come up with a reproducer that shows how a series of mprotect()
operations can result in an elevated shm_nattch and thus leak of the
resource.
The problem is caused by wrong assumptions in vma_merge() commit
714965ca8252 ("mm/mmap: start distinguishing if vma can be removed in
mergeability test"). The shmem vmas have a vma_ops->close callback
that decrements shm_nattch, and we remove the vma without calling it.
vma_merge() has thus historically avoided merging vma's with
vma_ops->close and commit 714965ca8252 was supposed to keep it that way.
It relaxed the checks for vma_ops->close in can_vma_merge_after()
assuming that it is never called on a vma that would be a candidate for
removal. However, the vma_merge() code does also use the result of this
check in the decision to remove a different vma in the merge case 7.
A robust solution would be to refactor vma_merge() code in a way that
the vma_ops->close check is only done for vma's that are actually going
to be removed, and not as part of the preliminary checks. That would
both solve the existing bug, and also allow additional merges that the
checks currently prevent unnecessarily in some cases.
However to fix the existing bug first with a minimized risk, and for
easier stable backports, this patch only adds a vma_ops->close check to
the buggy case 7 specifically. All other cases of vma removal are
covered by the can_vma_merge_before() check that includes the test for
vma_ops->close.
The reproducer code, adapted from Michal Hocko's code:
int main(int argc, char *argv[]) {
int segment_id;
size_t segment_size = 20 * PAGE_SIZE;
char * sh_mem;
struct shmid_ds shmid_ds;
key_t key = 0x1234;
segment_id = shmget(key, segment_size,
IPC_CREAT | IPC_EXCL | S_IRUSR | S_IWUSR);
sh_mem = (char *)shmat(segment_id, NULL, 0);
mprotect(sh_mem + 2*PAGE_SIZE, PAGE_SIZE, PROT_NONE);
mprotect(sh_mem + PAGE_SIZE, PAGE_SIZE, PROT_WRITE);
mprotect(sh_mem + 2*PAGE_SIZE, PAGE_SIZE, PROT_WRITE);
shmdt(sh_mem);
shmctl(segment_id, IPC_STAT, &shmid_ds);
printf("nattch after shmdt(): %lu (expected: 0)\n", shmid_ds.shm_nattch);
if (shmctl(segment_id, IPC_RMID, 0))
printf("IPCRM failed %d\n", errno);
return (shmid_ds.shm_nattch) ? 1 : 0;
}
Fixes: 714965ca8252 ("mm/mmap: start distinguishing if vma can be removed in mergeability test")
Reported-by: Michal Hocko <mhocko(a)suse.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Vlastimil Babka <vbabka(a)suse.cz>
---
mm/mmap.c | 11 ++++++++++-
1 file changed, 10 insertions(+), 1 deletion(-)
diff --git a/mm/mmap.c b/mm/mmap.c
index d89770eaab6b..a4238373ee9b 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -954,10 +954,19 @@ static struct vm_area_struct
} else if (merge_prev) { /* case 2 */
if (curr) {
vma_start_write(curr);
- err = dup_anon_vma(prev, curr, &anon_dup);
if (end == curr->vm_end) { /* case 7 */
+ /*
+ * can_vma_merge_after() assumed we would not be
+ * removing prev vma, so it skipped the check
+ * for vm_ops->close, but we are removing curr
+ */
+ if (curr->vm_ops && curr->vm_ops->close)
+ err = -EINVAL;
+ else
+ err = dup_anon_vma(prev, curr, &anon_dup);
remove = curr;
} else { /* case 5 */
+ err = dup_anon_vma(prev, curr, &anon_dup);
adjust = curr;
adj_start = (end - curr->vm_start);
}
--
2.43.1
A recent DRM series purporting to simplify support for "transparent
bridges" and handling of probe deferrals ironically exposed a
use-after-free issue on pmic_glink_altmode probe deferral.
This has manifested itself as the display subsystem occasionally failing
to initialise and NULL-pointer dereferences during boot of machines like
the Lenovo ThinkPad X13s.
Specifically, the dp-hpd bridge is currently registered before all
resources have been acquired which means that it can also be
deregistered on probe deferrals.
In the meantime there is a race window where the new aux bridge driver
(or PHY driver previously) may have looked up the dp-hpd bridge and
stored a (non-reference-counted) pointer to the bridge which is about to
be deallocated.
When the display controller is later initialised, this triggers a
use-after-free when attaching the bridges:
dp -> aux -> dp-hpd (freed)
which may, for example, result in the freed bridge failing to attach:
[drm:drm_bridge_attach [drm]] *ERROR* failed to attach bridge /soc@0/phy@88eb000 to encoder TMDS-31: -16
or a NULL-pointer dereference:
Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000
...
Call trace:
drm_bridge_attach+0x70/0x1a8 [drm]
drm_aux_bridge_attach+0x24/0x38 [aux_bridge]
drm_bridge_attach+0x80/0x1a8 [drm]
dp_bridge_init+0xa8/0x15c [msm]
msm_dp_modeset_init+0x28/0xc4 [msm]
The DRM bridge implementation is clearly fragile and implicitly built on
the assumption that bridges may never go away. In this case, the fix is
to move the bridge registration in the pmic_glink_altmode driver to
after all resources have been looked up.
Incidentally, with the new dp-hpd bridge implementation, which registers
child devices, this is also a requirement due to a long-standing issue
in driver core that can otherwise lead to a probe deferral loop (see
fbc35b45f9f6 ("Add documentation on meaning of -EPROBE_DEFER")).
Fixes: 080b4e24852b ("soc: qcom: pmic_glink: Introduce altmode support")
Fixes: 2bcca96abfbf ("soc: qcom: pmic-glink: switch to DRM_AUX_HPD_BRIDGE")
Cc: stable(a)vger.kernel.org # 6.3
Cc: Bjorn Andersson <andersson(a)kernel.org>
Cc: Dmitry Baryshkov <dmitry.baryshkov(a)linaro.org>
Signed-off-by: Johan Hovold <johan+linaro(a)kernel.org>
---
drivers/soc/qcom/pmic_glink_altmode.c | 16 +++++++++++++---
1 file changed, 13 insertions(+), 3 deletions(-)
diff --git a/drivers/soc/qcom/pmic_glink_altmode.c b/drivers/soc/qcom/pmic_glink_altmode.c
index 5fcd0fdd2faa..b3808fc24c69 100644
--- a/drivers/soc/qcom/pmic_glink_altmode.c
+++ b/drivers/soc/qcom/pmic_glink_altmode.c
@@ -76,7 +76,7 @@ struct pmic_glink_altmode_port {
struct work_struct work;
- struct device *bridge;
+ struct auxiliary_device *bridge;
enum typec_orientation orientation;
u16 svid;
@@ -230,7 +230,7 @@ static void pmic_glink_altmode_worker(struct work_struct *work)
else
pmic_glink_altmode_enable_usb(altmode, alt_port);
- drm_aux_hpd_bridge_notify(alt_port->bridge,
+ drm_aux_hpd_bridge_notify(&alt_port->bridge->dev,
alt_port->hpd_state ?
connector_status_connected :
connector_status_disconnected);
@@ -454,7 +454,7 @@ static int pmic_glink_altmode_probe(struct auxiliary_device *adev,
alt_port->index = port;
INIT_WORK(&alt_port->work, pmic_glink_altmode_worker);
- alt_port->bridge = drm_dp_hpd_bridge_register(dev, to_of_node(fwnode));
+ alt_port->bridge = devm_drm_dp_hpd_bridge_alloc(dev, to_of_node(fwnode));
if (IS_ERR(alt_port->bridge)) {
fwnode_handle_put(fwnode);
return PTR_ERR(alt_port->bridge);
@@ -510,6 +510,16 @@ static int pmic_glink_altmode_probe(struct auxiliary_device *adev,
}
}
+ for (port = 0; port < ARRAY_SIZE(altmode->ports); port++) {
+ alt_port = &altmode->ports[port];
+ if (!alt_port->bridge)
+ continue;
+
+ ret = devm_drm_dp_hpd_bridge_add(dev, alt_port->bridge);
+ if (ret)
+ return ret;
+ }
+
altmode->client = devm_pmic_glink_register_client(dev,
altmode->owner_id,
pmic_glink_altmode_callback,
--
2.43.0
From: Elad Nachman <enachman(a)marvell.com>
AC5X spec says PHY init complete bit must be polled until zero.
We see cases in which timeout can take longer than the standard
calculation on AC5X, which is expected following the spec comment above.
According to the spec, we must wait as long as it takes for that bit to
toggle on AC5X.
Cap that with 100 delay loops so we won't get stuck forever.
Fixes: 06c8b667ff5b ("mmc: sdhci-xenon: Add support to PHYs of Marvell Xenon SDHC")
Acked-by: Adrian Hunter <adrian.hunter(a)intel.com>
Cc: stable(a)vger.kernel.org
Signed-off-by: Elad Nachman <enachman(a)marvell.com>
---
drivers/mmc/host/sdhci-xenon-phy.c | 29 ++++++++++++++++++++---------
1 file changed, 20 insertions(+), 9 deletions(-)
diff --git a/drivers/mmc/host/sdhci-xenon-phy.c b/drivers/mmc/host/sdhci-xenon-phy.c
index c3096230a969..cc9d28b75eb9 100644
--- a/drivers/mmc/host/sdhci-xenon-phy.c
+++ b/drivers/mmc/host/sdhci-xenon-phy.c
@@ -110,6 +110,8 @@
#define XENON_EMMC_PHY_LOGIC_TIMING_ADJUST (XENON_EMMC_PHY_REG_BASE + 0x18)
#define XENON_LOGIC_TIMING_VALUE 0x00AA8977
+#define XENON_MAX_PHY_TIMEOUT_LOOPS 100
+
/*
* List offset of PHY registers and some special register values
* in eMMC PHY 5.0 or eMMC PHY 5.1
@@ -278,18 +280,27 @@ static int xenon_emmc_phy_init(struct sdhci_host *host)
/* get the wait time */
wait /= clock;
wait++;
- /* wait for host eMMC PHY init completes */
- udelay(wait);
- reg = sdhci_readl(host, phy_regs->timing_adj);
- reg &= XENON_PHY_INITIALIZAION;
- if (reg) {
+ /*
+ * AC5X spec says bit must be polled until zero.
+ * We see cases in which timeout can take longer
+ * than the standard calculation on AC5X, which is
+ * expected following the spec comment above.
+ * According to the spec, we must wait as long as
+ * it takes for that bit to toggle on AC5X.
+ * Cap that with 100 delay loops so we won't get
+ * stuck here forever:
+ */
+
+ ret = read_poll_timeout(sdhci_readl, reg,
+ !(reg & XENON_PHY_INITIALIZAION),
+ wait, XENON_MAX_PHY_TIMEOUT_LOOPS * wait,
+ false, host, phy_regs->timing_adj);
+ if (ret)
dev_err(mmc_dev(host->mmc), "eMMC PHY init cannot complete after %d us\n",
- wait);
- return -ETIMEDOUT;
- }
+ wait * XENON_MAX_PHY_TIMEOUT_LOOPS);
- return 0;
+ return ret;
}
#define ARMADA_3700_SOC_PAD_1_8V 0x1
--
2.25.1
The patch titled
Subject: mm, mmap: fix vma_merge() case 7 with vma_ops->close
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-mmap-fix-vma_merge-case-7-with-vma_ops-close.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Vlastimil Babka <vbabka(a)suse.cz>
Subject: mm, mmap: fix vma_merge() case 7 with vma_ops->close
Date: Thu, 22 Feb 2024 17:55:50 +0100
When debugging issues with a workload using SysV shmem, Michal Hocko has
come up with a reproducer that shows how a series of mprotect() operations
can result in an elevated shm_nattch and thus leak of the resource.
The problem is caused by wrong assumptions in vma_merge() commit
714965ca8252 ("mm/mmap: start distinguishing if vma can be removed in
mergeability test"). The shmem vmas have a vma_ops->close callback that
decrements shm_nattch, and we remove the vma without calling it.
vma_merge() has thus historically avoided merging vma's with
vma_ops->close and commit 714965ca8252 was supposed to keep it that way.
It relaxed the checks for vma_ops->close in can_vma_merge_after() assuming
that it is never called on a vma that would be a candidate for removal.
However, the vma_merge() code does also use the result of this check in
the decision to remove a different vma in the merge case 7.
A robust solution would be to refactor vma_merge() code in a way that the
vma_ops->close check is only done for vma's that are actually going to be
removed, and not as part of the preliminary checks. That would both solve
the existing bug, and also allow additional merges that the checks
currently prevent unnecessarily in some cases.
However to fix the existing bug first with a minimized risk, and for
easier stable backports, this patch only adds a vma_ops->close check to
the buggy case 7 specifically. All other cases of vma removal are covered
by the can_vma_merge_before() check that includes the test for
vma_ops->close.
The reproducer code, adapted from Michal Hocko's code:
int main(int argc, char *argv[]) {
int segment_id;
size_t segment_size = 20 * PAGE_SIZE;
char * sh_mem;
struct shmid_ds shmid_ds;
key_t key = 0x1234;
segment_id = shmget(key, segment_size,
IPC_CREAT | IPC_EXCL | S_IRUSR | S_IWUSR);
sh_mem = (char *)shmat(segment_id, NULL, 0);
mprotect(sh_mem + 2*PAGE_SIZE, PAGE_SIZE, PROT_NONE);
mprotect(sh_mem + PAGE_SIZE, PAGE_SIZE, PROT_WRITE);
mprotect(sh_mem + 2*PAGE_SIZE, PAGE_SIZE, PROT_WRITE);
shmdt(sh_mem);
shmctl(segment_id, IPC_STAT, &shmid_ds);
printf("nattch after shmdt(): %lu (expected: 0)\n", shmid_ds.shm_nattch);
if (shmctl(segment_id, IPC_RMID, 0))
printf("IPCRM failed %d\n", errno);
return (shmid_ds.shm_nattch) ? 1 : 0;
}
Link: https://lkml.kernel.org/r/20240222165549.32753-2-vbabka@suse.cz
Fixes: 714965ca8252 ("mm/mmap: start distinguishing if vma can be removed in mergeability test")
Signed-off-by: Vlastimil Babka <vbabka(a)suse.cz>
Reported-by: Michal Hocko <mhocko(a)suse.com>
Cc: Liam R. Howlett <Liam.Howlett(a)oracle.com>
Cc: Lorenzo Stoakes <lstoakes(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/mmap.c | 11 ++++++++++-
1 file changed, 10 insertions(+), 1 deletion(-)
--- a/mm/mmap.c~mm-mmap-fix-vma_merge-case-7-with-vma_ops-close
+++ a/mm/mmap.c
@@ -954,10 +954,19 @@ static struct vm_area_struct
} else if (merge_prev) { /* case 2 */
if (curr) {
vma_start_write(curr);
- err = dup_anon_vma(prev, curr, &anon_dup);
if (end == curr->vm_end) { /* case 7 */
+ /*
+ * can_vma_merge_after() assumed we would not be
+ * removing prev vma, so it skipped the check
+ * for vm_ops->close, but we are removing curr
+ */
+ if (curr->vm_ops && curr->vm_ops->close)
+ err = -EINVAL;
+ else
+ err = dup_anon_vma(prev, curr, &anon_dup);
remove = curr;
} else { /* case 5 */
+ err = dup_anon_vma(prev, curr, &anon_dup);
adjust = curr;
adj_start = (end - curr->vm_start);
}
_
Patches currently in -mm which might be from vbabka(a)suse.cz are
mm-vmscan-prevent-infinite-loop-for-costly-gfp_noio-__gfp_retry_mayfail-allocations.patch
mm-mmap-fix-vma_merge-case-7-with-vma_ops-close.patch
Hi,
maybe you will not like introducing 'static int int_max = INT_MAX;' for
this old kernel which EOL in 10 months.
Cyril Hrubis (3):
sched/rt: Fix sysctl_sched_rr_timeslice intial value
sched/rt: sysctl_sched_rr_timeslice show default timeslice after reset
sched/rt: Disallow writing invalid values to sched_rt_period_us
kernel/sched/rt.c | 10 +++++-----
kernel/sysctl.c | 5 +++++
2 files changed, 10 insertions(+), 5 deletions(-)
--
2.35.3
From: Cyril Hrubis <chrubis(a)suse.cz>
[ Upstream commit 079be8fc630943d9fc70a97807feb73d169ee3fc ]
The validation of the value written to sched_rt_period_us was broken
because:
- the sysclt_sched_rt_period is declared as unsigned int
- parsed by proc_do_intvec()
- the range is asserted after the value parsed by proc_do_intvec()
Because of this negative values written to the file were written into a
unsigned integer that were later on interpreted as large positive
integers which did passed the check:
if (sysclt_sched_rt_period <= 0)
return EINVAL;
This commit fixes the parsing by setting explicit range for both
perid_us and runtime_us into the sched_rt_sysctls table and processes
the values with proc_dointvec_minmax() instead.
Alternatively if we wanted to use full range of unsigned int for the
period value we would have to split the proc_handler and use
proc_douintvec() for it however even the
Documentation/scheduller/sched-rt-group.rst describes the range as 1 to
INT_MAX.
As far as I can tell the only problem this causes is that the sysctl
file allows writing negative values which when read back may confuse
userspace.
There is also a LTP test being submitted for these sysctl files at:
http://patchwork.ozlabs.org/project/ltp/patch/20230901144433.2526-1-chrubis…
Signed-off-by: Cyril Hrubis <chrubis(a)suse.cz>
Signed-off-by: Ingo Molnar <mingo(a)kernel.org>
Link: https://lore.kernel.org/r/20231002115553.3007-2-chrubis@suse.cz
Signed-off-by: Petr Vorel <pvorel(a)suse.cz>
---
kernel/sched/rt.c | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 904dd8534597..4ac36eb4cdee 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -37,6 +37,8 @@ static struct ctl_table sched_rt_sysctls[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sched_rt_handler,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_INT_MAX,
},
{
.procname = "sched_rt_runtime_us",
@@ -44,6 +46,8 @@ static struct ctl_table sched_rt_sysctls[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = sched_rt_handler,
+ .extra1 = SYSCTL_NEG_ONE,
+ .extra2 = SYSCTL_INT_MAX,
},
{
.procname = "sched_rr_timeslice_ms",
@@ -2989,9 +2993,6 @@ static int sched_rt_global_constraints(void)
#ifdef CONFIG_SYSCTL
static int sched_rt_global_validate(void)
{
- if (sysctl_sched_rt_period <= 0)
- return -EINVAL;
-
if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
((sysctl_sched_rt_runtime > sysctl_sched_rt_period) ||
((u64)sysctl_sched_rt_runtime *
@@ -3022,7 +3023,7 @@ static int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
old_period = sysctl_sched_rt_period;
old_runtime = sysctl_sched_rt_runtime;
- ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (!ret && write) {
ret = sched_rt_global_validate();
--
2.35.3
From: Cyril Hrubis <chrubis(a)suse.cz>
[ Upstream commit c1fc6484e1fb7cc2481d169bfef129a1b0676abe ]
The sched_rr_timeslice can be reset to default by writing value that is
<= 0. However after reading from this file we always got the last value
written, which is not useful at all.
$ echo -1 > /proc/sys/kernel/sched_rr_timeslice_ms
$ cat /proc/sys/kernel/sched_rr_timeslice_ms
-1
Fix this by setting the variable that holds the sysctl file value to the
jiffies_to_msecs(RR_TIMESLICE) in case that <= 0 value was written.
Signed-off-by: Cyril Hrubis <chrubis(a)suse.cz>
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Reviewed-by: Petr Vorel <pvorel(a)suse.cz>
Acked-by: Mel Gorman <mgorman(a)suse.de>
Tested-by: Petr Vorel <pvorel(a)suse.cz>
Link: https://lore.kernel.org/r/20230802151906.25258-3-chrubis@suse.cz
Signed-off-by: Petr Vorel <pvorel(a)suse.cz>
---
kernel/sched/rt.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 76bafa8d331a..f79a6f36777a 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -3047,6 +3047,9 @@ static int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
sched_rr_timeslice =
sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
msecs_to_jiffies(sysctl_sched_rr_timeslice);
+
+ if (sysctl_sched_rr_timeslice <= 0)
+ sysctl_sched_rr_timeslice = jiffies_to_msecs(RR_TIMESLICE);
}
mutex_unlock(&mutex);
--
2.35.3
From: Cyril Hrubis <chrubis(a)suse.cz>
[ Upstream commit c7fcb99877f9f542c918509b2801065adcaf46fa ]
There is a 10% rounding error in the intial value of the
sysctl_sched_rr_timeslice with CONFIG_HZ_300=y.
This was found with LTP test sched_rr_get_interval01:
sched_rr_get_interval01.c:57: TPASS: sched_rr_get_interval() passed
sched_rr_get_interval01.c:64: TPASS: Time quantum 0s 99999990ns
sched_rr_get_interval01.c:72: TFAIL: /proc/sys/kernel/sched_rr_timeslice_ms != 100 got 90
sched_rr_get_interval01.c:57: TPASS: sched_rr_get_interval() passed
sched_rr_get_interval01.c:64: TPASS: Time quantum 0s 99999990ns
sched_rr_get_interval01.c:72: TFAIL: /proc/sys/kernel/sched_rr_timeslice_ms != 100 got 90
What this test does is to compare the return value from the
sched_rr_get_interval() and the sched_rr_timeslice_ms sysctl file and
fails if they do not match.
The problem it found is the intial sysctl file value which was computed as:
static int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
which works fine as long as MSEC_PER_SEC is multiple of HZ, however it
introduces 10% rounding error for CONFIG_HZ_300:
(MSEC_PER_SEC / HZ) * (100 * HZ / 1000)
(1000 / 300) * (100 * 300 / 1000)
3 * 30 = 90
This can be easily fixed by reversing the order of the multiplication
and division. After this fix we get:
(MSEC_PER_SEC * (100 * HZ / 1000)) / HZ
(1000 * (100 * 300 / 1000)) / 300
(1000 * 30) / 300 = 100
Fixes: 975e155ed873 ("sched/rt: Show the 'sched_rr_timeslice' SCHED_RR timeslice tuning knob in milliseconds")
Signed-off-by: Cyril Hrubis <chrubis(a)suse.cz>
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Reviewed-by: Petr Vorel <pvorel(a)suse.cz>
Acked-by: Mel Gorman <mgorman(a)suse.de>
Tested-by: Petr Vorel <pvorel(a)suse.cz>
Link: https://lore.kernel.org/r/20230802151906.25258-2-chrubis@suse.cz
[ pvorel: rebased for 5.4 ]
Signed-off-by: Petr Vorel <pvorel(a)suse.cz>
---
kernel/sched/rt.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 186e7d78ded5..e40f32e3ab06 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -8,7 +8,7 @@
#include "pelt.h"
int sched_rr_timeslice = RR_TIMESLICE;
-int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
+int sysctl_sched_rr_timeslice = (MSEC_PER_SEC * RR_TIMESLICE) / HZ;
/* More than 4 hours if BW_SHIFT equals 20. */
static const u64 max_rt_runtime = MAX_BW;
--
2.35.3
From: Cyril Hrubis <chrubis(a)suse.cz>
[ Upstream commit c7fcb99877f9f542c918509b2801065adcaf46fa ]
There is a 10% rounding error in the intial value of the
sysctl_sched_rr_timeslice with CONFIG_HZ_300=y.
This was found with LTP test sched_rr_get_interval01:
sched_rr_get_interval01.c:57: TPASS: sched_rr_get_interval() passed
sched_rr_get_interval01.c:64: TPASS: Time quantum 0s 99999990ns
sched_rr_get_interval01.c:72: TFAIL: /proc/sys/kernel/sched_rr_timeslice_ms != 100 got 90
sched_rr_get_interval01.c:57: TPASS: sched_rr_get_interval() passed
sched_rr_get_interval01.c:64: TPASS: Time quantum 0s 99999990ns
sched_rr_get_interval01.c:72: TFAIL: /proc/sys/kernel/sched_rr_timeslice_ms != 100 got 90
What this test does is to compare the return value from the
sched_rr_get_interval() and the sched_rr_timeslice_ms sysctl file and
fails if they do not match.
The problem it found is the intial sysctl file value which was computed as:
static int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
which works fine as long as MSEC_PER_SEC is multiple of HZ, however it
introduces 10% rounding error for CONFIG_HZ_300:
(MSEC_PER_SEC / HZ) * (100 * HZ / 1000)
(1000 / 300) * (100 * 300 / 1000)
3 * 30 = 90
This can be easily fixed by reversing the order of the multiplication
and division. After this fix we get:
(MSEC_PER_SEC * (100 * HZ / 1000)) / HZ
(1000 * (100 * 300 / 1000)) / 300
(1000 * 30) / 300 = 100
Fixes: 975e155ed873 ("sched/rt: Show the 'sched_rr_timeslice' SCHED_RR timeslice tuning knob in milliseconds")
Signed-off-by: Cyril Hrubis <chrubis(a)suse.cz>
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Reviewed-by: Petr Vorel <pvorel(a)suse.cz>
Acked-by: Mel Gorman <mgorman(a)suse.de>
Tested-by: Petr Vorel <pvorel(a)suse.cz>
Link: https://lore.kernel.org/r/20230802151906.25258-2-chrubis@suse.cz
[ pvorel: rebased for 5.15, 5.10 ]
Signed-off-by: Petr Vorel <pvorel(a)suse.cz>
---
kernel/sched/rt.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 7045595aacac..3394b7f923a0 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -8,7 +8,7 @@
#include "pelt.h"
int sched_rr_timeslice = RR_TIMESLICE;
-int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
+int sysctl_sched_rr_timeslice = (MSEC_PER_SEC * RR_TIMESLICE) / HZ;
/* More than 4 hours if BW_SHIFT equals 20. */
static const u64 max_rt_runtime = MAX_BW;
--
2.35.3
We observed a corruption during on-line resize of a file system that is
larger than 16 TiB with 4k block size. With having more then 2^32 blocks
resize_inode is turned off by default by mke2fs. The issue can be
reproduced on a smaller file system for convenience by explicitly
turning off resize_inode. An on-line resize across an 8 GiB boundary (the
size of a meta block group in this setup) then leads to a corruption:
dev=/dev/<some_dev> # should be >= 16 GiB
mkdir -p /corruption
/sbin/mke2fs -t ext4 -b 4096 -O ^resize_inode $dev $((2 * 2**21 - 2**15))
mount -t ext4 $dev /corruption
dd if=/dev/zero bs=4096 of=/corruption/test count=$((2*2**21 - 4*2**15))
sha1sum /corruption/test
# 79d2658b39dcfd77274e435b0934028adafaab11 /corruption/test
/sbin/resize2fs $dev $((2*2**21))
# drop page cache to force reload the block from disk
echo 1 > /proc/sys/vm/drop_caches
sha1sum /corruption/test
# 3c2abc63cbf1a94c9e6977e0fbd72cd832c4d5c3 /corruption/test
2^21 = 2^15*2^6 equals 8 GiB whereof 2^15 is the number of blocks per
block group and 2^6 are the number of block groups that make a meta
block group.
The last checksum might be different depending on how the file is laid
out across the physical blocks. The actual corruption occurs at physical
block 63*2^15 = 2064384 which would be the location of the backup of the
meta block group's block descriptor. During the on-line resize the file
system will be converted to meta_bg starting at s_first_meta_bg which is
2 in the example - meaning all block groups after 16 GiB. However, in
ext4_flex_group_add we might add block groups that are not part of the
first meta block group yet. In the reproducer we achieved this by
substracting the size of a whole block group from the point where the
meta block group would start. This must be considered when updating the
backup block group descriptors to follow the non-meta_bg layout. The fix
is to add a test whether the group to add is already part of the meta
block group or not.
Fixes: 01f795f9e0d67 ("ext4: add online resizing support for meta_bg and 64-bit file systems")
Cc: stable(a)vger.kernel.org
Signed-off-by: Maximilian Heyne <mheyne(a)amazon.de>
---
fs/ext4/resize.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 4d4a5a32e310..3c0d12382e06 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1602,7 +1602,8 @@ static int ext4_flex_group_add(struct super_block *sb,
int gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
int gdb_num_end = ((group + flex_gd->count - 1) /
EXT4_DESC_PER_BLOCK(sb));
- int meta_bg = ext4_has_feature_meta_bg(sb);
+ int meta_bg = ext4_has_feature_meta_bg(sb) &&
+ gdb_num >= le32_to_cpu(es->s_first_meta_bg);
sector_t padding_blocks = meta_bg ? 0 : sbi->s_sbh->b_blocknr -
ext4_group_first_block_no(sb, 0);
--
2.40.1
Amazon Development Center Germany GmbH
Krausenstr. 38
10117 Berlin
Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss
Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B
Sitz: Berlin
Ust-ID: DE 289 237 879
When the PCI device is surprise removed, requests won't complete from
the device. These IOs are never completed and disk deletion hangs
indefinitely.
Fix it by aborting the IOs which the device will never complete
when the VQ is broken.
With this fix now fio completes swiftly.
An alternative of IO timeout has been considered, however
when the driver knows about unresponsive block device, swiftly clearing
them enables users and upper layers to react quickly.
Verified with multiple device unplug cycles with pending IOs in virtio
used ring and some pending with device.
In future instead of VQ broken, a more elegant method can be used. At the
moment the patch is kept to its minimal changes given its urgency to fix
broken kernels.
Fixes: 43bb40c5b926 ("virtio_pci: Support surprise removal of virtio pci device")
Cc: stable(a)vger.kernel.org
Reported-by: lirongqing(a)baidu.com
Closes: https://lore.kernel.org/virtualization/c45dd68698cd47238c55fb73ca9b4741@bai…
Co-developed-by: Chaitanya Kulkarni <kch(a)nvidia.com>
Signed-off-by: Chaitanya Kulkarni <kch(a)nvidia.com>
Signed-off-by: Parav Pandit <parav(a)nvidia.com>
---
drivers/block/virtio_blk.c | 54 ++++++++++++++++++++++++++++++++++++++
1 file changed, 54 insertions(+)
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 2bf14a0e2815..59b49899b229 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -1562,10 +1562,64 @@ static int virtblk_probe(struct virtio_device *vdev)
return err;
}
+static bool virtblk_cancel_request(struct request *rq, void *data)
+{
+ struct virtblk_req *vbr = blk_mq_rq_to_pdu(rq);
+
+ vbr->in_hdr.status = VIRTIO_BLK_S_IOERR;
+ if (blk_mq_request_started(rq) && !blk_mq_request_completed(rq))
+ blk_mq_complete_request(rq);
+
+ return true;
+}
+
+static void virtblk_cleanup_reqs(struct virtio_blk *vblk)
+{
+ struct virtio_blk_vq *blk_vq;
+ struct request_queue *q;
+ struct virtqueue *vq;
+ unsigned long flags;
+ int i;
+
+ vq = vblk->vqs[0].vq;
+ if (!virtqueue_is_broken(vq))
+ return;
+
+ q = vblk->disk->queue;
+ /* Block upper layer to not get any new requests */
+ blk_mq_quiesce_queue(q);
+
+ for (i = 0; i < vblk->num_vqs; i++) {
+ blk_vq = &vblk->vqs[i];
+
+ /* Synchronize with any ongoing virtblk_poll() which may be
+ * completing the requests to uppper layer which has already
+ * crossed the broken vq check.
+ */
+ spin_lock_irqsave(&blk_vq->lock, flags);
+ spin_unlock_irqrestore(&blk_vq->lock, flags);
+ }
+
+ blk_sync_queue(q);
+
+ /* Complete remaining pending requests with error */
+ blk_mq_tagset_busy_iter(&vblk->tag_set, virtblk_cancel_request, vblk);
+ blk_mq_tagset_wait_completed_request(&vblk->tag_set);
+
+ /*
+ * Unblock any pending dispatch I/Os before we destroy device. From
+ * del_gendisk() -> __blk_mark_disk_dead(disk) will set GD_DEAD flag,
+ * that will make sure any new I/O from bio_queue_enter() to fail.
+ */
+ blk_mq_unquiesce_queue(q);
+}
+
static void virtblk_remove(struct virtio_device *vdev)
{
struct virtio_blk *vblk = vdev->priv;
+ virtblk_cleanup_reqs(vblk);
+
/* Make sure no work handler is accessing the device. */
flush_work(&vblk->config_work);
--
2.34.1
From: Michal Kazior <michal(a)plume.com>
[ Upstream commit a6e4f85d3820d00694ed10f581f4c650445dbcda ]
The nl80211_dump_interface() supports resumption
in case nl80211_send_iface() doesn't have the
resources to complete its work.
The logic would store the progress as iteration
offsets for rdev and wdev loops.
However the logic did not properly handle
resumption for non-last rdev. Assuming a system
with 2 rdevs, with 2 wdevs each, this could
happen:
dump(cb=[0, 0]):
if_start=cb[1] (=0)
send rdev0.wdev0 -> ok
send rdev0.wdev1 -> yield
cb[1] = 1
dump(cb=[0, 1]):
if_start=cb[1] (=1)
send rdev0.wdev1 -> ok
// since if_start=1 the rdev0.wdev0 got skipped
// through if_idx < if_start
send rdev1.wdev1 -> ok
The if_start needs to be reset back to 0 upon wdev
loop end.
The problem is actually hard to hit on a desktop,
and even on most routers. The prerequisites for
this manifesting was:
- more than 1 wiphy
- a few handful of interfaces
- dump without rdev or wdev filter
I was seeing this with 4 wiphys 9 interfaces each.
It'd miss 6 interfaces from the last wiphy
reported to userspace.
Signed-off-by: Michal Kazior <michal(a)plume.com>
Link: https://msgid.link/20240116142340.89678-1-kazikcz@gmail.com
Signed-off-by: Johannes Berg <johannes.berg(a)intel.com>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
net/wireless/nl80211.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 1cbbb11ea503..fbf95b7ff6b4 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -4008,6 +4008,7 @@ static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback *
}
wiphy_unlock(&rdev->wiphy);
+ if_start = 0;
wp_idx++;
}
out:
--
2.43.0
From: "Guilherme G. Piccoli" <gpiccoli(a)igalia.com>
[ Upstream commit e585a37e5061f6d5060517aed1ca4ccb2e56a34c ]
By running a Van Gogh device (Steam Deck), the following message
was noticed in the kernel log:
pci 0000:04:00.3: PCI class overridden (0x0c03fe -> 0x0c03fe) so dwc3 driver can claim this instead of xhci
Effectively this means the quirk executed but changed nothing, since the
class of this device was already the proper one (likely adjusted by newer
firmware versions).
Check and perform the override only if necessary.
Link: https://lore.kernel.org/r/20231120160531.361552-1-gpiccoli@igalia.com
Signed-off-by: Guilherme G. Piccoli <gpiccoli(a)igalia.com>
Signed-off-by: Bjorn Helgaas <bhelgaas(a)google.com>
Cc: Huang Rui <ray.huang(a)amd.com>
Cc: Vicki Pfau <vi(a)endrift.com>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/pci/quirks.c | 11 +++++++----
1 file changed, 7 insertions(+), 4 deletions(-)
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index 8765544bac35..75b297c15cf5 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -607,10 +607,13 @@ static void quirk_amd_dwc_class(struct pci_dev *pdev)
{
u32 class = pdev->class;
- /* Use "USB Device (not host controller)" class */
- pdev->class = PCI_CLASS_SERIAL_USB_DEVICE;
- pci_info(pdev, "PCI class overridden (%#08x -> %#08x) so dwc3 driver can claim this instead of xhci\n",
- class, pdev->class);
+ if (class != PCI_CLASS_SERIAL_USB_DEVICE) {
+ /* Use "USB Device (not host controller)" class */
+ pdev->class = PCI_CLASS_SERIAL_USB_DEVICE;
+ pci_info(pdev,
+ "PCI class overridden (%#08x -> %#08x) so dwc3 driver can claim this instead of xhci\n",
+ class, pdev->class);
+ }
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_NL_USB,
quirk_amd_dwc_class);
--
2.43.0
From: Michal Kazior <michal(a)plume.com>
[ Upstream commit a6e4f85d3820d00694ed10f581f4c650445dbcda ]
The nl80211_dump_interface() supports resumption
in case nl80211_send_iface() doesn't have the
resources to complete its work.
The logic would store the progress as iteration
offsets for rdev and wdev loops.
However the logic did not properly handle
resumption for non-last rdev. Assuming a system
with 2 rdevs, with 2 wdevs each, this could
happen:
dump(cb=[0, 0]):
if_start=cb[1] (=0)
send rdev0.wdev0 -> ok
send rdev0.wdev1 -> yield
cb[1] = 1
dump(cb=[0, 1]):
if_start=cb[1] (=1)
send rdev0.wdev1 -> ok
// since if_start=1 the rdev0.wdev0 got skipped
// through if_idx < if_start
send rdev1.wdev1 -> ok
The if_start needs to be reset back to 0 upon wdev
loop end.
The problem is actually hard to hit on a desktop,
and even on most routers. The prerequisites for
this manifesting was:
- more than 1 wiphy
- a few handful of interfaces
- dump without rdev or wdev filter
I was seeing this with 4 wiphys 9 interfaces each.
It'd miss 6 interfaces from the last wiphy
reported to userspace.
Signed-off-by: Michal Kazior <michal(a)plume.com>
Link: https://msgid.link/20240116142340.89678-1-kazikcz@gmail.com
Signed-off-by: Johannes Berg <johannes.berg(a)intel.com>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
net/wireless/nl80211.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 0ac829c8f188..279f4977e2ee 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -3595,6 +3595,7 @@ static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback *
if_idx++;
}
+ if_start = 0;
wp_idx++;
}
out:
--
2.43.0
From: Michal Kazior <michal(a)plume.com>
[ Upstream commit a6e4f85d3820d00694ed10f581f4c650445dbcda ]
The nl80211_dump_interface() supports resumption
in case nl80211_send_iface() doesn't have the
resources to complete its work.
The logic would store the progress as iteration
offsets for rdev and wdev loops.
However the logic did not properly handle
resumption for non-last rdev. Assuming a system
with 2 rdevs, with 2 wdevs each, this could
happen:
dump(cb=[0, 0]):
if_start=cb[1] (=0)
send rdev0.wdev0 -> ok
send rdev0.wdev1 -> yield
cb[1] = 1
dump(cb=[0, 1]):
if_start=cb[1] (=1)
send rdev0.wdev1 -> ok
// since if_start=1 the rdev0.wdev0 got skipped
// through if_idx < if_start
send rdev1.wdev1 -> ok
The if_start needs to be reset back to 0 upon wdev
loop end.
The problem is actually hard to hit on a desktop,
and even on most routers. The prerequisites for
this manifesting was:
- more than 1 wiphy
- a few handful of interfaces
- dump without rdev or wdev filter
I was seeing this with 4 wiphys 9 interfaces each.
It'd miss 6 interfaces from the last wiphy
reported to userspace.
Signed-off-by: Michal Kazior <michal(a)plume.com>
Link: https://msgid.link/20240116142340.89678-1-kazikcz@gmail.com
Signed-off-by: Johannes Berg <johannes.berg(a)intel.com>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
net/wireless/nl80211.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 70fb14b8bab0..c259d3227a9e 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -3960,6 +3960,7 @@ static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback *
if_idx++;
}
+ if_start = 0;
wp_idx++;
}
out:
--
2.43.0
From: Baokun Li <libaokun1(a)huawei.com>
[ Upstream commit 4530b3660d396a646aad91a787b6ab37cf604b53 ]
Determine if the group block bitmap is corrupted before using ac_b_ex in
ext4_mb_try_best_found() to avoid allocating blocks from a group with a
corrupted block bitmap in the following concurrency and making the
situation worse.
ext4_mb_regular_allocator
ext4_lock_group(sb, group)
ext4_mb_good_group
// check if the group bbitmap is corrupted
ext4_mb_complex_scan_group
// Scan group gets ac_b_ex but doesn't use it
ext4_unlock_group(sb, group)
ext4_mark_group_bitmap_corrupted(group)
// The block bitmap was corrupted during
// the group unlock gap.
ext4_mb_try_best_found
ext4_lock_group(ac->ac_sb, group)
ext4_mb_use_best_found
mb_mark_used
// Allocating blocks in block bitmap corrupted group
Signed-off-by: Baokun Li <libaokun1(a)huawei.com>
Reviewed-by: Jan Kara <jack(a)suse.cz>
Link: https://lore.kernel.org/r/20240104142040.2835097-7-libaokun1@huawei.com
Signed-off-by: Theodore Ts'o <tytso(a)mit.edu>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
fs/ext4/mballoc.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 3babc07ae613..d7724601f42b 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1854,6 +1854,9 @@ int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
return err;
ext4_lock_group(ac->ac_sb, group);
+ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
+ goto out;
+
max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex);
if (max > 0) {
@@ -1861,6 +1864,7 @@ int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
ext4_mb_use_best_found(ac, e4b);
}
+out:
ext4_unlock_group(ac->ac_sb, group);
ext4_mb_unload_buddy(e4b);
--
2.43.0
l2tp_ip6_sendmsg needs to avoid accounting for the transport header
twice when splicing more data into an already partially-occupied skbuff.
To manage this, we check whether the skbuff contains data using
skb_queue_empty when deciding how much data to append using
ip6_append_data.
However, the code which performed the calculation was incorrect:
ulen = len + skb_queue_empty(&sk->sk_write_queue) ? transhdrlen : 0;
...due to C operator precedence, this ends up setting ulen to
transhdrlen for messages with a non-zero length, which results in
corrupted packets on the wire.
Add parentheses to correct the calculation in line with the original
intent.
Fixes: 9d4c75800f61 ("ipv4, ipv6: Fix handling of transhdrlen in __ip{,6}_append_data()")
Cc: David Howells <dhowells(a)redhat.com>
Cc: stable(a)vger.kernel.org
Signed-off-by: Tom Parkin <tparkin(a)katalix.com>
---
This issue was uncovered by Debian build-testing for the
golang-github-katalix-go-l2tp package[1].
It seems 9d4c75800f61 has been backported to the linux-6.1.y stable
kernel (and possibly others), so I think this fix will also need
backporting.
The bug is currently seen on at least Debian Bookworm, Ubuntu Jammy, and
Debian testing/unstable.
Unfortunately tests using "ip l2tp" and which focus on dataplane
transport will not uncover this bug: it's necessary to send a packet
using an L2TPIP6 socket opened by userspace, and to verify the packet on
the wire. The l2tp-ktest[2] test suite has been extended to cover this.
[1]. https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1063746
[2]. https://github.com/katalix/l2tp-ktest
---
net/l2tp/l2tp_ip6.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index dd3153966173..7bf14cf9ffaa 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -627,7 +627,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
back_from_confirm:
lock_sock(sk);
- ulen = len + skb_queue_empty(&sk->sk_write_queue) ? transhdrlen : 0;
+ ulen = len + (skb_queue_empty(&sk->sk_write_queue) ? transhdrlen : 0);
err = ip6_append_data(sk, ip_generic_getfrag, msg,
ulen, transhdrlen, &ipc6,
&fl6, (struct rt6_info *)dst,
--
2.34.1
If caching mode change fails due to, for example, OOM we
free the allocated pages in a two-step process. First the pages
for which the caching change has already succeeded. Secondly
the pages for which a caching change did not succeed.
However the second step was incorrectly freeing the pages already
freed in the first step.
Fix.
Signed-off-by: Thomas Hellström <thomas.hellstrom(a)linux.intel.com>
Fixes: 379989e7cbdc ("drm/ttm/pool: Fix ttm_pool_alloc error path")
Cc: Christian König <christian.koenig(a)amd.com>
Cc: Dave Airlie <airlied(a)redhat.com>
Cc: Christian Koenig <christian.koenig(a)amd.com>
Cc: Huang Rui <ray.huang(a)amd.com>
Cc: dri-devel(a)lists.freedesktop.org
Cc: <stable(a)vger.kernel.org> # v6.4+
---
drivers/gpu/drm/ttm/ttm_pool.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c
index b62f420a9f96..112438d965ff 100644
--- a/drivers/gpu/drm/ttm/ttm_pool.c
+++ b/drivers/gpu/drm/ttm/ttm_pool.c
@@ -387,7 +387,7 @@ static void ttm_pool_free_range(struct ttm_pool *pool, struct ttm_tt *tt,
enum ttm_caching caching,
pgoff_t start_page, pgoff_t end_page)
{
- struct page **pages = tt->pages;
+ struct page **pages = &tt->pages[start_page];
unsigned int order;
pgoff_t i, nr;
--
2.43.0
From: Ard Biesheuvel <ardb(a)kernel.org>
The bit-sliced implementation of AES-CTR operates on blocks of 128
bytes, and will fall back to the plain NEON version for tail blocks or
inputs that are shorter than 128 bytes to begin with.
It will call straight into the plain NEON asm helper, which performs all
memory accesses in granules of 16 bytes (the size of a NEON register).
For this reason, the associated plain NEON glue code will copy inputs
shorter than 16 bytes into a temporary buffer, given that this is a rare
occurrence and it is not worth the effort to work around this in the asm
code.
The fallback from the bit-sliced NEON version fails to take this into
account, potentially resulting in out-of-bounds accesses. So clone the
same workaround, and use a temp buffer for short in/outputs.
Cc: <stable(a)vger.kernel.org>
Reported-by: syzbot+f1ceaa1a09ab891e1934(a)syzkaller.appspotmail.com
Tested-by: syzbot+f1ceaa1a09ab891e1934(a)syzkaller.appspotmail.com
Signed-off-by: Ard Biesheuvel <ardb(a)kernel.org>
---
arch/arm64/crypto/aes-neonbs-glue.c | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/arch/arm64/crypto/aes-neonbs-glue.c b/arch/arm64/crypto/aes-neonbs-glue.c
index bac4cabef607..849dc41320db 100644
--- a/arch/arm64/crypto/aes-neonbs-glue.c
+++ b/arch/arm64/crypto/aes-neonbs-glue.c
@@ -227,8 +227,19 @@ static int ctr_encrypt(struct skcipher_request *req)
src += blocks * AES_BLOCK_SIZE;
}
if (nbytes && walk.nbytes == walk.total) {
+ u8 buf[AES_BLOCK_SIZE];
+ u8 *d = dst;
+
+ if (unlikely(nbytes < AES_BLOCK_SIZE))
+ src = dst = memcpy(buf + sizeof(buf) - nbytes,
+ src, nbytes);
+
neon_aes_ctr_encrypt(dst, src, ctx->enc, ctx->key.rounds,
nbytes, walk.iv);
+
+ if (unlikely(nbytes < AES_BLOCK_SIZE))
+ memcpy(d, buf + sizeof(buf) - nbytes, nbytes);
+
nbytes = 0;
}
kernel_neon_end();
--
2.44.0.rc0.258.g7320e95886-goog
The is_psr_su parameter is a boolean flag indicating whether the Panel
Self Refresh Selective Update (PSR SU) feature is enabled which is a
power-saving feature that allows only the updated regions of the screen
to be refreshed, reducing the amount of data that needs to be sent to
the display.
Fixes the below with gcc W=1:
drivers/gpu/drm/amd/amdgpu/../display/amdgpu_dm/amdgpu_dm.c:5257: warning: Function parameter or member 'is_psr_su' not described in 'fill_dc_dirty_rects'
Fixes: 13d6b0812e58 ("drm/amdgpu: make damage clips support configurable")
Cc: stable(a)vger.kernel.org
Cc: Hamza Mahfooz <hamza.mahfooz(a)amd.com>
Cc: Mario Limonciello <mario.limonciello(a)amd.com>
Cc: Rodrigo Siqueira <Rodrigo.Siqueira(a)amd.com>
Cc: Aurabindo Pillai <aurabindo.pillai(a)amd.com>
Signed-off-by: Srinivasan Shanmugam <srinivasan.shanmugam(a)amd.com>
Reviewed-by: Rodrigo Siqueira <Rodrigo.Siqueira(a)amd.com>
---
drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index ed4873060da7..379836383ea9 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -5234,6 +5234,10 @@ static inline void fill_dc_dirty_rect(struct drm_plane *plane,
* @new_plane_state: New state of @plane
* @crtc_state: New state of CRTC connected to the @plane
* @flip_addrs: DC flip tracking struct, which also tracts dirty rects
+ * @is_psr_su: Flag indicating whether Panel Self Refresh Selective Update (PSR SU) is enabled.
+ * If PSR SU is enabled and damage clips are available, only the regions of the screen
+ * that have changed will be updated. If PSR SU is not enabled,
+ * or if damage clips are not available, the entire screen will be updated.
* @dirty_regions_changed: dirty regions changed
*
* For PSR SU, DC informs the DMUB uController of dirty rectangle regions
--
2.34.1
In erofs_find_target_block() when erofs_dirnamecmp() returns 0,
we do not assign the target metabuf. This causes the caller
erofs_namei()'s erofs_put_metabuf() at the end to be not effective
leaving the refcount on the page.
As the page from metabuf (buf->page) is never put, such page cannot be
migrated or reclaimed. Fix it now by putting the metabuf from
previous loop and assigning the current metabuf to target before
returning so caller erofs_namei() can do the final put as it was
intended.
Fixes: 500edd095648 ("erofs: use meta buffers for inode lookup")
Cc: stable(a)vger.kernel.org
Signed-off-by: Sandeep Dhavale <dhavale(a)google.com>
---
Changes since v1
- Rearrange the cases as suggested by Gao so there is less duplication
of the code and it is more readable
fs/erofs/namei.c | 28 ++++++++++++++--------------
1 file changed, 14 insertions(+), 14 deletions(-)
diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c
index d4f631d39f0f..f0110a78acb2 100644
--- a/fs/erofs/namei.c
+++ b/fs/erofs/namei.c
@@ -130,24 +130,24 @@ static void *erofs_find_target_block(struct erofs_buf *target,
/* string comparison without already matched prefix */
diff = erofs_dirnamecmp(name, &dname, &matched);
- if (!diff) {
- *_ndirents = 0;
- goto out;
- } else if (diff > 0) {
- head = mid + 1;
- startprfx = matched;
-
- if (!IS_ERR(candidate))
- erofs_put_metabuf(target);
- *target = buf;
- candidate = de;
- *_ndirents = ndirents;
- } else {
+ if (diff < 0) {
erofs_put_metabuf(&buf);
-
back = mid - 1;
endprfx = matched;
+ continue;
+ }
+
+ if (!IS_ERR(candidate))
+ erofs_put_metabuf(target);
+ *target = buf;
+ if (!diff) {
+ *_ndirents = 0;
+ return de;
}
+ head = mid + 1;
+ startprfx = matched;
+ candidate = de;
+ *_ndirents = ndirents;
continue;
}
out: /* free if the candidate is valid */
--
2.44.0.rc0.258.g7320e95886-goog
On Wed, Feb 21, 2024 at 10:52:41AM -0800, He Gao wrote:
> I used "git apply" and it required the change. But "patch" can work
> directly so yes the original patch works fine.
>
> In that case, I believe the original patch will also work for 6.6 and 6.7.
Great, all done, thanks!
greg k-h
commit 1a3e1f40962c445b997151a542314f3c6097f8c3 upstream.
NOTE: This is a partial backport since we only need the refcnt between
memcg and stock to fix the problem stated below, and in this way
multiple versions use the same code and align with each other.
---
There was a kernel panic happened on an in-house environment running
3.10, and the same problem was reproduced on 4.19:
general protection fault: 0000 [#1] SMP PTI
CPU: 1 PID: 2085 Comm: bash Kdump: loaded Tainted: G L 4.19.90+ #7
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.15.0-0-g2dd4b9b3f840-prebuilt.qemu.org 04/01/2014
RIP: 0010 drain_all_stock+0xad/0x140
Code: 00 00 4d 85 ff 74 2c 45 85 c9 74 27 4d 39 fc 74 42 41 80 bc 24 28 04 00 00 00 74 17 49 8b 04 24 49 8b 17 48 8b 88 90 02 00 00 <48> 39 8a 90 02 00 00 74 02 eb 86 48 63 88 3c 01 00 00 39 8a 3c 01
RSP: 0018:ffffa7efc5813d70 EFLAGS: 00010202
RAX: ffff8cb185548800 RBX: ffff8cb89f420160 RCX: ffff8cb1867b6000
RDX: babababababababa RSI: 0000000000000001 RDI: 0000000000231876
RBP: 0000000000000000 R08: 0000000000000415 R09: 0000000000000002
R10: 0000000000000000 R11: 0000000000000001 R12: ffff8cb186f89040
R13: 0000000000020160 R14: 0000000000000001 R15: ffff8cb186b27040
FS: 00007f4a308d3740(0000) GS:ffff8cb89f440000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007ffe4d634a68 CR3: 000000010b022000 CR4: 00000000000006e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
mem_cgroup_force_empty_write+0x31/0xb0
cgroup_file_write+0x60/0x140
? __check_object_size+0x136/0x147
kernfs_fop_write+0x10e/0x190
__vfs_write+0x37/0x1b0
? selinux_file_permission+0xe8/0x130
? security_file_permission+0x2e/0xb0
vfs_write+0xb6/0x1a0
ksys_write+0x57/0xd0
do_syscall_64+0x63/0x250
? async_page_fault+0x8/0x30
entry_SYSCALL_64_after_hwframe+0x5c/0xc1
Modules linked in: ...
It is found that in case of stock->nr_pages == 0, the memcg on
stock->cached could be freed due to its refcnt decreased to 0, which
made stock->cached become a dangling pointer. It could cause a UAF
problem in drain_all_stock() in the following concurrent scenario. Note
that drain_all_stock() doesn't disable irq but only preemption.
CPU1 CPU2
==============================================================================
stock->cached = memcgA (freed)
drain_all_stock(memcgB)
rcu_read_lock()
memcg = CPU1's stock->cached (memcgA)
(interrupted)
refill_stock(memcgC)
drain_stock(memcgA)
stock->cached = memcgC
stock->nr_pages += xxx (> 0)
stock->nr_pages > 0
mem_cgroup_is_descendant(memcgA, memcgB) [UAF]
rcu_read_unlock()
This problem is, unintentionally, fixed at 5.9, where commit
1a3e1f40962c ("mm: memcontrol: decouple reference counting from page
accounting") adds memcg refcnt for stock. Therefore affected LTS
versions include 4.19 and 5.4.
For 4.19, memcg's css offline process doesn't call drain_all_stock(). so
it's easier for the released memcg to be left on the stock. For 5.4,
although mem_cgroup_css_offline() does call drain_all_stock(), but the
flushing could be skipped when stock->nr_pages happens to be 0, and
besides the async draining could be delayed and take place after the UAF
problem has happened.
Fix this problem by adding (and decreasing) memcg's refcnt when memcg is
put onto (and removed from) stock, just like how commit 1a3e1f40962c
("mm: memcontrol: decouple reference counting from page accounting")
does. After all, "being on the stock" is a kind of reference with
regards to memcg. As such, it's guaranteed that a css on stock would not
be freed.
It's good to mention that refill_stock() is executed in an irq-disabled
context, so the drain_stock() patched with css_put() would not actually
free memcgA until the end of refill_stock(), since css_put() is an RCU
free and it's still in grace period. For CPU2, the access to CPU1's
stock->cached is protected by rcu_read_lock(), so in this case it gets
either NULL from stock->cached or a memcgA that is still good.
Cc: stable(a)vger.kernel.org # 4.19 5.4
Fixes: cdec2e4265df ("memcg: coalesce charging via percpu storage")
Signed-off-by: GONG, Ruiqi <gongruiqi1(a)huawei.com>
Acked-by: Michal Hocko <mhocko(a)suse.com>
---
v2:
- Add a statement of this patch being a partial backport
- Add a paragraph to mention the grace period in refill_stock()
mm/memcontrol.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5a366cf79821..8c04296df1c7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2015,6 +2015,9 @@ static void drain_stock(struct memcg_stock_pcp *stock)
{
struct mem_cgroup *old = stock->cached;
+ if (!old)
+ return;
+
if (stock->nr_pages) {
page_counter_uncharge(&old->memory, stock->nr_pages);
if (do_memsw_account())
@@ -2022,6 +2025,8 @@ static void drain_stock(struct memcg_stock_pcp *stock)
css_put_many(&old->css, stock->nr_pages);
stock->nr_pages = 0;
}
+
+ css_put(&old->css);
stock->cached = NULL;
}
@@ -2057,6 +2062,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
stock = this_cpu_ptr(&memcg_stock);
if (stock->cached != memcg) { /* reset if necessary */
drain_stock(stock);
+ css_get(&memcg->css);
stock->cached = memcg;
}
stock->nr_pages += nr_pages;
--
2.25.1
commit 1a3e1f40962c445b997151a542314f3c6097f8c3 upstream.
There was a kernel panic happened on an in-house environment running
3.10, and the same problem was reproduced on 4.19:
general protection fault: 0000 [#1] SMP PTI
CPU: 1 PID: 2085 Comm: bash Kdump: loaded Tainted: G L 4.19.90+ #7
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.15.0-0-g2dd4b9b3f840-prebuilt.qemu.org 04/01/2014
RIP: 0010 drain_all_stock+0xad/0x140
Code: 00 00 4d 85 ff 74 2c 45 85 c9 74 27 4d 39 fc 74 42 41 80 bc 24 28 04 00 00 00 74 17 49 8b 04 24 49 8b 17 48 8b 88 90 02 00 00 <48> 39 8a 90 02 00 00 74 02 eb 86 48 63 88 3c 01 00 00 39 8a 3c 01
RSP: 0018:ffffa7efc5813d70 EFLAGS: 00010202
RAX: ffff8cb185548800 RBX: ffff8cb89f420160 RCX: ffff8cb1867b6000
RDX: babababababababa RSI: 0000000000000001 RDI: 0000000000231876
RBP: 0000000000000000 R08: 0000000000000415 R09: 0000000000000002
R10: 0000000000000000 R11: 0000000000000001 R12: ffff8cb186f89040
R13: 0000000000020160 R14: 0000000000000001 R15: ffff8cb186b27040
FS: 00007f4a308d3740(0000) GS:ffff8cb89f440000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007ffe4d634a68 CR3: 000000010b022000 CR4: 00000000000006e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
mem_cgroup_force_empty_write+0x31/0xb0
cgroup_file_write+0x60/0x140
? __check_object_size+0x136/0x147
kernfs_fop_write+0x10e/0x190
__vfs_write+0x37/0x1b0
? selinux_file_permission+0xe8/0x130
? security_file_permission+0x2e/0xb0
vfs_write+0xb6/0x1a0
ksys_write+0x57/0xd0
do_syscall_64+0x63/0x250
? async_page_fault+0x8/0x30
entry_SYSCALL_64_after_hwframe+0x5c/0xc1
Modules linked in: ...
It is found that in case of stock->nr_pages == 0, the memcg on
stock->cached could be freed due to its refcnt decreased to 0, which
made stock->cached become a dangling pointer. It could cause a UAF
problem in drain_all_stock() in the following concurrent scenario. Note
that drain_all_stock() doesn't disable irq but only preemption.
CPU1 CPU2
==============================================================================
stock->cached = memcgA (freed)
drain_all_stock(memcgB)
rcu_read_lock()
memcg = CPU1's stock->cached (memcgA)
(interrupted)
refill_stock(memcgC)
drain_stock(memcgA)
stock->cached = memcgC
stock->nr_pages += xxx (> 0)
stock->nr_pages > 0
mem_cgroup_is_descendant(memcgA, memcgB) [UAF]
rcu_read_unlock()
This problem is, unintenionally, fixed at 5.9, where commit 1a3e1f40962c
("mm: memcontrol: decouple reference counting from page accounting")
adds memcg refcnt for stock. Therefore affected LTS versions include
4.19 and 5.4.
For 4.19, memcg's css offline process doesn't call drain_all_stock(). so
it's easier for the released memcg to be left on the stock. For 5.4,
although mem_cgroup_css_offline() does call drain_all_stock(), but the
flushing could be skipped when stock->nr_pages happens to be 0, and
besides the async draining could be delayed and take place after the UAF
problem has happened.
Fix this problem by adding (and decreasing) memcg's refcnt when memcg is
put onto (and removed from) stock, just like how commit 1a3e1f40962c
("mm: memcontrol: decouple reference counting from page accounting")
does. After all, "being on the stock" is a kind of reference with
regards to memcg. As such, it's guaranteed that a css on stock would not
be freed.
Cc: stable(a)vger.kernel.org # 4.19 5.4
Fixes: cdec2e4265df ("memcg: coalesce charging via percpu storage")
Signed-off-by: GONG, Ruiqi <gongruiqi1(a)huawei.com>
---
mm/memcontrol.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5a366cf79821..8c04296df1c7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2015,6 +2015,9 @@ static void drain_stock(struct memcg_stock_pcp *stock)
{
struct mem_cgroup *old = stock->cached;
+ if (!old)
+ return;
+
if (stock->nr_pages) {
page_counter_uncharge(&old->memory, stock->nr_pages);
if (do_memsw_account())
@@ -2022,6 +2025,8 @@ static void drain_stock(struct memcg_stock_pcp *stock)
css_put_many(&old->css, stock->nr_pages);
stock->nr_pages = 0;
}
+
+ css_put(&old->css);
stock->cached = NULL;
}
@@ -2057,6 +2062,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
stock = this_cpu_ptr(&memcg_stock);
if (stock->cached != memcg) { /* reset if necessary */
drain_stock(stock);
+ css_get(&memcg->css);
stock->cached = memcg;
}
stock->nr_pages += nr_pages;
--
2.25.1
The quilt patch titled
Subject: kasan/test: avoid gcc warning for intentional overflow
has been removed from the -mm tree. Its filename was
kasan-test-avoid-gcc-warning-for-intentional-overflow.patch
This patch was dropped because it was merged into the mm-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Arnd Bergmann <arnd(a)arndb.de>
Subject: kasan/test: avoid gcc warning for intentional overflow
Date: Mon, 12 Feb 2024 12:15:52 +0100
The out-of-bounds test allocates an object that is three bytes too short
in order to validate the bounds checking. Starting with gcc-14, this
causes a compile-time warning as gcc has grown smart enough to understand
the sizeof() logic:
mm/kasan/kasan_test.c: In function 'kmalloc_oob_16':
mm/kasan/kasan_test.c:443:14: error: allocation of insufficient size '13' for type 'struct <anonymous>' with size '16' [-Werror=alloc-size]
443 | ptr1 = kmalloc(sizeof(*ptr1) - 3, GFP_KERNEL);
| ^
Hide the actual computation behind a RELOC_HIDE() that ensures
the compiler misses the intentional bug.
Link: https://lkml.kernel.org/r/20240212111609.869266-1-arnd@kernel.org
Fixes: 3f15801cdc23 ("lib: add kasan test module")
Signed-off-by: Arnd Bergmann <arnd(a)arndb.de>
Reviewed-by: Andrey Konovalov <andreyknvl(a)gmail.com>
Cc: Alexander Potapenko <glider(a)google.com>
Cc: Andrey Ryabinin <ryabinin.a.a(a)gmail.com>
Cc: Arnd Bergmann <arnd(a)arndb.de>
Cc: Dmitry Vyukov <dvyukov(a)google.com>
Cc: Marco Elver <elver(a)google.com>
Cc: Vincenzo Frascino <vincenzo.frascino(a)arm.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/kasan/kasan_test.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
--- a/mm/kasan/kasan_test.c~kasan-test-avoid-gcc-warning-for-intentional-overflow
+++ a/mm/kasan/kasan_test.c
@@ -440,7 +440,8 @@ static void kmalloc_oob_16(struct kunit
/* This test is specifically crafted for the generic mode. */
KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC);
- ptr1 = kmalloc(sizeof(*ptr1) - 3, GFP_KERNEL);
+ /* RELOC_HIDE to prevent gcc from warning about short alloc */
+ ptr1 = RELOC_HIDE(kmalloc(sizeof(*ptr1) - 3, GFP_KERNEL), 0);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1);
ptr2 = kmalloc(sizeof(*ptr2), GFP_KERNEL);
_
Patches currently in -mm which might be from arnd(a)arndb.de are
mm-mmu_gather-add-tlb_remove_tlb_entries-fix.patch
From: Jan Kiszka <jan.kiszka(a)siemens.com>
commit afb2a4fb84555ef9e61061f6ea63ed7087b295d5 upstream.
The cflags for the RISC-V efistub were missing -mno-relax, thus were
under the risk that the compiler could use GP-relative addressing. That
happened for _edata with binutils-2.41 and kernel 6.1, causing the
relocation to fail due to an invalid kernel_size in handle_kernel_image.
It was not yet observed with newer versions, but that may just be luck.
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Jan Kiszka <jan.kiszka(a)siemens.com>
Signed-off-by: Ard Biesheuvel <ardb(a)kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/firmware/efi/libstub/Makefile | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile
index a1157c2a7170..f54715672d52 100644
--- a/drivers/firmware/efi/libstub/Makefile
+++ b/drivers/firmware/efi/libstub/Makefile
@@ -28,7 +28,7 @@ cflags-$(CONFIG_ARM) += -DEFI_HAVE_STRLEN -DEFI_HAVE_STRNLEN \
-DEFI_HAVE_MEMCHR -DEFI_HAVE_STRRCHR \
-DEFI_HAVE_STRCMP -fno-builtin -fpic \
$(call cc-option,-mno-single-pic-base)
-cflags-$(CONFIG_RISCV) += -fpic
+cflags-$(CONFIG_RISCV) += -fpic -mno-relax
cflags-$(CONFIG_LOONGARCH) += -fpie
cflags-$(CONFIG_EFI_PARAMS_FROM_FDT) += -I$(srctree)/scripts/dtc/libfdt
--
2.35.3
--
Siemens AG, Technology
Linux Expert Center
The patch titled
Subject: mm, vmscan: prevent infinite loop for costly GFP_NOIO | __GFP_RETRY_MAYFAIL allocations
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-vmscan-prevent-infinite-loop-for-costly-gfp_noio-__gfp_retry_mayfail-allocations.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Vlastimil Babka <vbabka(a)suse.cz>
Subject: mm, vmscan: prevent infinite loop for costly GFP_NOIO | __GFP_RETRY_MAYFAIL allocations
Date: Wed, 21 Feb 2024 12:43:58 +0100
Sven reports an infinite loop in __alloc_pages_slowpath() for costly order
__GFP_RETRY_MAYFAIL allocations that are also GFP_NOIO. Such combination
can happen in a suspend/resume context where a GFP_KERNEL allocation can
have __GFP_IO masked out via gfp_allowed_mask.
Quoting Sven:
1. try to do a "costly" allocation (order > PAGE_ALLOC_COSTLY_ORDER)
with __GFP_RETRY_MAYFAIL set.
2. page alloc's __alloc_pages_slowpath tries to get a page from the
freelist. This fails because there is nothing free of that costly
order.
3. page alloc tries to reclaim by calling __alloc_pages_direct_reclaim,
which bails out because a zone is ready to be compacted; it pretends
to have made a single page of progress.
4. page alloc tries to compact, but this always bails out early because
__GFP_IO is not set (it's not passed by the snd allocator, and even
if it were, we are suspending so the __GFP_IO flag would be cleared
anyway).
5. page alloc believes reclaim progress was made (because of the
pretense in item 3) and so it checks whether it should retry
compaction. The compaction retry logic thinks it should try again,
because:
a) reclaim is needed because of the early bail-out in item 4
b) a zonelist is suitable for compaction
6. goto 2. indefinite stall.
(end quote)
The immediate root cause is confusing the COMPACT_SKIPPED returned from
__alloc_pages_direct_compact() (step 4) due to lack of __GFP_IO to be
indicating a lack of order-0 pages, and in step 5 evaluating that in
should_compact_retry() as a reason to retry, before incrementing and
limiting the number of retries. There are however other places that
wrongly assume that compaction can happen while we lack __GFP_IO.
To fix this, introduce gfp_compaction_allowed() to abstract the __GFP_IO
evaluation and switch the open-coded test in try_to_compact_pages() to use
it.
Also use the new helper in:
- compaction_ready(), which will make reclaim not bail out in step 3, so
there's at least one attempt to actually reclaim, even if chances are
small for a costly order
- in_reclaim_compaction() which will make should_continue_reclaim()
return false and we don't over-reclaim unnecessarily
- in __alloc_pages_slowpath() to set a local variable can_compact,
which is then used to avoid retrying reclaim/compaction for costly
allocations (step 5) if we can't compact and also to skip the early
compaction attempt that we do in some cases
Link: https://lkml.kernel.org/r/20240221114357.13655-2-vbabka@suse.cz
Fixes: 3250845d0526 ("Revert "mm, oom: prevent premature OOM killer invocation for high order request"")
Signed-off-by: Vlastimil Babka <vbabka(a)suse.cz>
Reported-by: Sven van Ashbrook <svenva(a)chromium.org>
Closes: https://lore.kernel.org/all/CAG-rBihs_xMKb3wrMO1%2B-%2Bp4fowP9oy1pa_OTkfxBz…
Cc: Brian Geffon <bgeffon(a)google.com>
Cc: Curtis Malainey <cujomalainey(a)chromium.org>
Cc: Jaroslav Kysela <perex(a)perex.cz>
Cc: Mel Gorman <mgorman(a)techsingularity.net>
Cc: Michal Hocko <mhocko(a)kernel.org>
Cc: Takashi Iwai <tiwai(a)suse.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/gfp.h | 9 +++++++++
mm/compaction.c | 7 +------
mm/page_alloc.c | 10 ++++++----
mm/vmscan.c | 5 ++++-
4 files changed, 20 insertions(+), 11 deletions(-)
--- a/include/linux/gfp.h~mm-vmscan-prevent-infinite-loop-for-costly-gfp_noio-__gfp_retry_mayfail-allocations
+++ a/include/linux/gfp.h
@@ -353,6 +353,15 @@ static inline bool gfp_has_io_fs(gfp_t g
return (gfp & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS);
}
+/*
+ * Check if the gfp flags allow compaction - GFP_NOIO is a really
+ * tricky context because the migration might require IO.
+ */
+static inline bool gfp_compaction_allowed(gfp_t gfp_mask)
+{
+ return IS_ENABLED(CONFIG_COMPACTION) && (gfp_mask & __GFP_IO);
+}
+
extern gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma);
#ifdef CONFIG_CONTIG_ALLOC
--- a/mm/compaction.c~mm-vmscan-prevent-infinite-loop-for-costly-gfp_noio-__gfp_retry_mayfail-allocations
+++ a/mm/compaction.c
@@ -2723,16 +2723,11 @@ enum compact_result try_to_compact_pages
unsigned int alloc_flags, const struct alloc_context *ac,
enum compact_priority prio, struct page **capture)
{
- int may_perform_io = (__force int)(gfp_mask & __GFP_IO);
struct zoneref *z;
struct zone *zone;
enum compact_result rc = COMPACT_SKIPPED;
- /*
- * Check if the GFP flags allow compaction - GFP_NOIO is really
- * tricky context because the migration might require IO
- */
- if (!may_perform_io)
+ if (!gfp_compaction_allowed(gfp_mask))
return COMPACT_SKIPPED;
trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio);
--- a/mm/page_alloc.c~mm-vmscan-prevent-infinite-loop-for-costly-gfp_noio-__gfp_retry_mayfail-allocations
+++ a/mm/page_alloc.c
@@ -4041,6 +4041,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, u
struct alloc_context *ac)
{
bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
+ bool can_compact = gfp_compaction_allowed(gfp_mask);
const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
struct page *page = NULL;
unsigned int alloc_flags;
@@ -4111,7 +4112,7 @@ restart:
* Don't try this for allocations that are allowed to ignore
* watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen.
*/
- if (can_direct_reclaim &&
+ if (can_direct_reclaim && can_compact &&
(costly_order ||
(order > 0 && ac->migratetype != MIGRATE_MOVABLE))
&& !gfp_pfmemalloc_allowed(gfp_mask)) {
@@ -4209,9 +4210,10 @@ retry:
/*
* Do not retry costly high order allocations unless they are
- * __GFP_RETRY_MAYFAIL
+ * __GFP_RETRY_MAYFAIL and we can compact
*/
- if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
+ if (costly_order && (!can_compact ||
+ !(gfp_mask & __GFP_RETRY_MAYFAIL)))
goto nopage;
if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
@@ -4224,7 +4226,7 @@ retry:
* implementation of the compaction depends on the sufficient amount
* of free memory (see __compaction_suitable)
*/
- if (did_some_progress > 0 &&
+ if (did_some_progress > 0 && can_compact &&
should_compact_retry(ac, order, alloc_flags,
compact_result, &compact_priority,
&compaction_retries))
--- a/mm/vmscan.c~mm-vmscan-prevent-infinite-loop-for-costly-gfp_noio-__gfp_retry_mayfail-allocations
+++ a/mm/vmscan.c
@@ -5753,7 +5753,7 @@ static void shrink_lruvec(struct lruvec
/* Use reclaim/compaction for costly allocs or under memory pressure */
static bool in_reclaim_compaction(struct scan_control *sc)
{
- if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
+ if (gfp_compaction_allowed(sc->gfp_mask) && sc->order &&
(sc->order > PAGE_ALLOC_COSTLY_ORDER ||
sc->priority < DEF_PRIORITY - 2))
return true;
@@ -5998,6 +5998,9 @@ static inline bool compaction_ready(stru
{
unsigned long watermark;
+ if (!gfp_compaction_allowed(sc->gfp_mask))
+ return false;
+
/* Allocation can already succeed, nothing to do */
if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone),
sc->reclaim_idx, 0))
_
Patches currently in -mm which might be from vbabka(a)suse.cz are
mm-vmscan-prevent-infinite-loop-for-costly-gfp_noio-__gfp_retry_mayfail-allocations.patch
mm-document-memalloc_noreclaim_save-and-memalloc_pin_save.patch
mm-document-memalloc_noreclaim_save-and-memalloc_pin_save-v2.patch
We go through quite a bit of trouble to make sure we pick up any
preallocated LPI tables on the redistributors, as enabling LPIs is a
one-way switch. There is no such restriction for vLPIs, and for GICv4.1
we expect to allocate a new vPE table at boot.
This works as intended when initializing an ITS, however when setting up
a redistributor in its_cpu_init_lpis() the early return for preallocated
RD tables skips straight past GICv4 setup. This all comes to a head when
trying to kexec into a new kernel, as the new kernel silently fails to
set up GICv4, leading to a complete loss of SGIs and LPIs for KVM VMs
(ouch).
Slap a band-aid on the problem by ensuring its_cpu_init_lpis() always
initializes GICv4 on the way out, even if the other RD tables were
preallocated.
Cc: stable(a)vger.kernel.org
Fixes: 6479450f72c1 ("irqchip/gic-v4: Fix occasional VLPI drop")
Reported-by: George Cherian <gcherian(a)marvell.com>
Co-developed-by: Marc Zyngier <maz(a)kernel.org>
Signed-off-by: Marc Zyngier <maz(a)kernel.org>
Signed-off-by: Oliver Upton <oliver.upton(a)linux.dev>
---
I debated a bit on the fixes tag between the blamed commit and commit
5e5168461c22 ("irqchip/gic-v4.1: VPE table (aka GICR_VPROPBASER)
allocation"), although it would appear GICv4 could be left in an unknown
state after kexec too.
drivers/irqchip/irq-gic-v3-its.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index d097001c1e3e..0022852ce494 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -3172,6 +3172,7 @@ static void its_cpu_init_lpis(void)
val |= GICR_CTLR_ENABLE_LPIS;
writel_relaxed(val, rbase + GICR_CTLR);
+out:
if (gic_rdists->has_vlpis && !gic_rdists->has_rvpeid) {
void __iomem *vlpi_base = gic_data_rdist_vlpi_base();
@@ -3207,7 +3208,6 @@ static void its_cpu_init_lpis(void)
/* Make sure the GIC has seen the above */
dsb(sy);
-out:
gic_data_rdist()->flags |= RD_LOCAL_LPI_ENABLED;
pr_info("GICv3: CPU%d: using %s LPI pending table @%pa\n",
smp_processor_id(),
--
2.44.0.rc0.258.g7320e95886-goog
Hi all,
Eric reported that builds of LLVM with [1] (close to tip of tree) have
CONFIG_AS_HAS_OPTION_ARCH=n because the test for expected failure on
invalid input has started succeeding.
This Kconfig test was added because '.option arch' only causes an
assembler warning when it is unsupported, rather than a hard error,
which is what users of as-instr expect when something is unsupported.
This can be resolved by turning assembler warnings into errors with
'-Wa,--fatal-warnings' like we do with the compiler with '-Werror',
which is what the first patch does. The second patch removes the invalid
test, as the valid test is good enough with fatal warnings.
I have diffed several configurations for the different architectures
that use as-instr and I have found no issues.
I think this could go in through either the kbuild or RISC-V tree with
sufficient acks but I will let them fight over who takes it :)
[1]: https://github.com/llvm/llvm-project/commit/3ac9fe69f70a2b3541266daedbaaa7d…
---
Nathan Chancellor (2):
kbuild: Add -Wa,--fatal-warnings to as-instr invocation
RISC-V: Drop invalid test from CONFIG_AS_HAS_OPTION_ARCH
arch/riscv/Kconfig | 1 -
scripts/Kconfig.include | 2 +-
scripts/Makefile.compiler | 2 +-
3 files changed, 2 insertions(+), 3 deletions(-)
---
base-commit: 6613476e225e090cc9aad49be7fa504e290dd33d
change-id: 20240124-fix-riscv-option-arch-llvm-18-3cbe7b09a216
Best regards,
--
Nathan Chancellor <nathan(a)kernel.org>
From: Konstantin Komarov <almaz.alexandrovich(a)paragon-software.com>
[ Upstream commit 22457c047ed971f2f2e33be593ddfabd9639a409 ]
Unfortunately reparse attribute is used for many purposes (several dozens).
It is not possible here to know is this name symlink or not.
To get exactly the type of name we should to open inode (read mft).
getattr for opened file (fstat) correctly returns symlink.
Signed-off-by: Konstantin Komarov <almaz.alexandrovich(a)paragon-software.com>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
fs/ntfs3/dir.c | 30 +++++++++++++++++++++++++-----
1 file changed, 25 insertions(+), 5 deletions(-)
diff --git a/fs/ntfs3/dir.c b/fs/ntfs3/dir.c
index d4d9f4ffb6d9..c2fb76bb28f4 100644
--- a/fs/ntfs3/dir.c
+++ b/fs/ntfs3/dir.c
@@ -309,11 +309,31 @@ static inline int ntfs_filldir(struct ntfs_sb_info *sbi, struct ntfs_inode *ni,
return 0;
}
- /* NTFS: symlinks are "dir + reparse" or "file + reparse" */
- if (fname->dup.fa & FILE_ATTRIBUTE_REPARSE_POINT)
- dt_type = DT_LNK;
- else
- dt_type = (fname->dup.fa & FILE_ATTRIBUTE_DIRECTORY) ? DT_DIR : DT_REG;
+ /*
+ * NTFS: symlinks are "dir + reparse" or "file + reparse"
+ * Unfortunately reparse attribute is used for many purposes (several dozens).
+ * It is not possible here to know is this name symlink or not.
+ * To get exactly the type of name we should to open inode (read mft).
+ * getattr for opened file (fstat) correctly returns symlink.
+ */
+ dt_type = (fname->dup.fa & FILE_ATTRIBUTE_DIRECTORY) ? DT_DIR : DT_REG;
+
+ /*
+ * It is not reliable to detect the type of name using duplicated information
+ * stored in parent directory.
+ * The only correct way to get the type of name - read MFT record and find ATTR_STD.
+ * The code below is not good idea.
+ * It does additional locks/reads just to get the type of name.
+ * Should we use additional mount option to enable branch below?
+ */
+ if ((fname->dup.fa & FILE_ATTRIBUTE_REPARSE_POINT) &&
+ ino != ni->mi.rno) {
+ struct inode *inode = ntfs_iget5(sbi->sb, &e->ref, NULL);
+ if (!IS_ERR_OR_NULL(inode)) {
+ dt_type = fs_umode_to_dtype(inode->i_mode);
+ iput(inode);
+ }
+ }
return !dir_emit(ctx, (s8 *)name, name_len, ino, dt_type);
}
--
2.43.0
From: Damien Le Moal <dlemoal(a)kernel.org>
For regular system shutdown, ata_dev_power_set_standby() will be
executed twice: once the scsi device is removed and another when
ata_pci_shutdown_one() executes and EH completes unloading the devices.
Make the second call to ata_dev_power_set_standby() do nothing by using
ata_dev_power_is_active() and return if the device is already in
standby.
Fixes: 2da4c5e24e86 ("ata: libata-core: Improve ata_dev_power_set_active()")
Cc: stable(a)vger.kernel.org
Signed-off-by: Damien Le Moal <dlemoal(a)kernel.org>
Signed-off-by: Niklas Cassel <cassel(a)kernel.org>
---
Changes since V1: Move the function instead of using a forward declaration.
drivers/ata/libata-core.c | 59 ++++++++++++++++++++-------------------
1 file changed, 30 insertions(+), 29 deletions(-)
diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index d9f80f4f70f5..be3412cdb22e 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -2001,6 +2001,33 @@ bool ata_dev_power_init_tf(struct ata_device *dev, struct ata_taskfile *tf,
return true;
}
+static bool ata_dev_power_is_active(struct ata_device *dev)
+{
+ struct ata_taskfile tf;
+ unsigned int err_mask;
+
+ ata_tf_init(dev, &tf);
+ tf.flags |= ATA_TFLAG_DEVICE | ATA_TFLAG_ISADDR;
+ tf.protocol = ATA_PROT_NODATA;
+ tf.command = ATA_CMD_CHK_POWER;
+
+ err_mask = ata_exec_internal(dev, &tf, NULL, DMA_NONE, NULL, 0, 0);
+ if (err_mask) {
+ ata_dev_err(dev, "Check power mode failed (err_mask=0x%x)\n",
+ err_mask);
+ /*
+ * Assume we are in standby mode so that we always force a
+ * spinup in ata_dev_power_set_active().
+ */
+ return false;
+ }
+
+ ata_dev_dbg(dev, "Power mode: 0x%02x\n", tf.nsect);
+
+ /* Active or idle */
+ return tf.nsect == 0xff;
+}
+
/**
* ata_dev_power_set_standby - Set a device power mode to standby
* @dev: target device
@@ -2017,8 +2044,9 @@ void ata_dev_power_set_standby(struct ata_device *dev)
struct ata_taskfile tf;
unsigned int err_mask;
- /* If the device is already sleeping, do nothing. */
- if (dev->flags & ATA_DFLAG_SLEEPING)
+ /* If the device is already sleeping or in standby, do nothing. */
+ if ((dev->flags & ATA_DFLAG_SLEEPING) ||
+ !ata_dev_power_is_active(dev))
return;
/*
@@ -2046,33 +2074,6 @@ void ata_dev_power_set_standby(struct ata_device *dev)
err_mask);
}
-static bool ata_dev_power_is_active(struct ata_device *dev)
-{
- struct ata_taskfile tf;
- unsigned int err_mask;
-
- ata_tf_init(dev, &tf);
- tf.flags |= ATA_TFLAG_DEVICE | ATA_TFLAG_ISADDR;
- tf.protocol = ATA_PROT_NODATA;
- tf.command = ATA_CMD_CHK_POWER;
-
- err_mask = ata_exec_internal(dev, &tf, NULL, DMA_NONE, NULL, 0, 0);
- if (err_mask) {
- ata_dev_err(dev, "Check power mode failed (err_mask=0x%x)\n",
- err_mask);
- /*
- * Assume we are in standby mode so that we always force a
- * spinup in ata_dev_power_set_active().
- */
- return false;
- }
-
- ata_dev_dbg(dev, "Power mode: 0x%02x\n", tf.nsect);
-
- /* Active or idle */
- return tf.nsect == 0xff;
-}
-
/**
* ata_dev_power_set_active - Set a device power mode to active
* @dev: target device
--
2.43.2
The following commit has been merged into the irq/urgent branch of tip:
Commit-ID: fb33a46cd75e18773dd5a414744507d84ae90870
Gitweb: https://git.kernel.org/tip/fb33a46cd75e18773dd5a414744507d84ae90870
Author: Chen Jun <chenjun102(a)huawei.com>
AuthorDate: Tue, 20 Feb 2024 19:14:29 +08:00
Committer: Thomas Gleixner <tglx(a)linutronix.de>
CommitterDate: Wed, 21 Feb 2024 18:40:00 +01:00
irqchip/mbigen: Don't use bus_get_dev_root() to find the parent
bus_get_dev_root() returns sp->dev_root which is set in subsys_register(),
but subsys_register() is not called by platform_bus_init().
Therefor for the platform_bus_type, bus_get_dev_root() always returns NULL.
This makes mbigen_of_create_domain() always return -ENODEV.
Don't try to retrieve the parent via bus_get_dev_root() and
unconditionally hand a NULL pointer to of_platform_device_create() to
fix this.
Fixes: fea087fc291b ("irqchip/mbigen: move to use bus_get_dev_root()")
Signed-off-by: Chen Jun <chenjun102(a)huawei.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20240220111429.110666-1-chenjun102@huawei.com
---
drivers/irqchip/irq-mbigen.c | 8 +-------
1 file changed, 1 insertion(+), 7 deletions(-)
diff --git a/drivers/irqchip/irq-mbigen.c b/drivers/irqchip/irq-mbigen.c
index 5101a3f..58881d3 100644
--- a/drivers/irqchip/irq-mbigen.c
+++ b/drivers/irqchip/irq-mbigen.c
@@ -235,22 +235,17 @@ static const struct irq_domain_ops mbigen_domain_ops = {
static int mbigen_of_create_domain(struct platform_device *pdev,
struct mbigen_device *mgn_chip)
{
- struct device *parent;
struct platform_device *child;
struct irq_domain *domain;
struct device_node *np;
u32 num_pins;
int ret = 0;
- parent = bus_get_dev_root(&platform_bus_type);
- if (!parent)
- return -ENODEV;
-
for_each_child_of_node(pdev->dev.of_node, np) {
if (!of_property_read_bool(np, "interrupt-controller"))
continue;
- child = of_platform_device_create(np, NULL, parent);
+ child = of_platform_device_create(np, NULL, NULL);
if (!child) {
ret = -ENOMEM;
break;
@@ -273,7 +268,6 @@ static int mbigen_of_create_domain(struct platform_device *pdev,
}
}
- put_device(parent);
if (ret)
of_node_put(np);
The is_psr_su parameter is a boolean flag indicating whether the Panel
Self Refresh Selective Update (PSR SU) feature is enabled which is a
power-saving feature that allows only the updated regions of the screen
to be refreshed, reducing the amount of data that needs to be sent to
the display.
Fixes the below with gcc W=1:
drivers/gpu/drm/amd/amdgpu/../display/amdgpu_dm/amdgpu_dm.c:5257: warning: Function parameter or member 'is_psr_su' not described in 'fill_dc_dirty_rects'
Fixes: 13d6b0812e58 ("drm/amdgpu: make damage clips support configurable")
Cc: stable(a)vger.kernel.org
Cc: Hamza Mahfooz <hamza.mahfooz(a)amd.com>
Cc: Mario Limonciello <mario.limonciello(a)amd.com>
Cc: Rodrigo Siqueira <Rodrigo.Siqueira(a)amd.com>
Cc: Aurabindo Pillai <aurabindo.pillai(a)amd.com>
Signed-off-by: Srinivasan Shanmugam <srinivasan.shanmugam(a)amd.com>
---
drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index b9ac3d2f8029..1b51f7fb48ea 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -5234,6 +5234,10 @@ static inline void fill_dc_dirty_rect(struct drm_plane *plane,
* @new_plane_state: New state of @plane
* @crtc_state: New state of CRTC connected to the @plane
* @flip_addrs: DC flip tracking struct, which also tracts dirty rects
+ * @is_psr_su: Flag indicating whether Panel Self Refresh Selective Update (PSR SU) is enabled.
+ * If PSR SU is enabled and damage clips are available, only the regions of the screen
+ * that have changed will be updated. If PSR SU is not enabled,
+ * or if damage clips are not available, the entire screen will be updated.
* @dirty_regions_changed: dirty regions changed
*
* For PSR SU, DC informs the DMUB uController of dirty rectangle regions
--
2.34.1
Hi Greg,
I believe these patches are likely already on your radar. I just wanted to
inform you that it would be highly appreciated if we could see their inclusion
in the upcoming release.
e0526ec5360a 2024-01-30hv_netvsc: Fix race condition between
netvsc_probe and netvsc_remove [Jakub Kicinski]
We would like to even get:
9cae43da9867 hv_netvsc: Register VF in netvsc_probe if
NET_DEVICE_REGISTER missed
included, but the patch is still in netdev and has not made it into
Linus's tree. If it does come in,
could you please consider including it too.
Thanks,
- Allen
The current logic is probably fine but is a bit convoluted. Plus, we
don't want partial pages to be part of the sequential operation just in
case the core would optimize the page read with a subpage read (which
would break the sequence). This may happen on the first and last page
only, so if the start offset or the end offset is not aligned with a
page boundary, better avoid them to prevent any risk.
Cc: stable(a)vger.kernel.org
Fixes: 003fe4b9545b ("mtd: rawnand: Support for sequential cache reads")
Signed-off-by: Miquel Raynal <miquel.raynal(a)bootlin.com>
---
drivers/mtd/nand/raw/nand_base.c | 26 +++++++++++++++++---------
1 file changed, 17 insertions(+), 9 deletions(-)
diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c
index 139fdf3e58c0..bbdcfbe643f3 100644
--- a/drivers/mtd/nand/raw/nand_base.c
+++ b/drivers/mtd/nand/raw/nand_base.c
@@ -3460,21 +3460,29 @@ static void rawnand_enable_cont_reads(struct nand_chip *chip, unsigned int page,
u32 readlen, int col)
{
struct mtd_info *mtd = nand_to_mtd(chip);
+ unsigned int end_page, end_col;
+
+ chip->cont_read.ongoing = false;
if (!chip->controller->supported_op.cont_read)
return;
- if ((col && col + readlen < (3 * mtd->writesize)) ||
- (!col && readlen < (2 * mtd->writesize))) {
- chip->cont_read.ongoing = false;
- return;
- }
+ end_page = DIV_ROUND_UP(col + readlen, mtd->writesize);
+ end_col = (col + readlen) % mtd->writesize;
- chip->cont_read.ongoing = true;
- chip->cont_read.first_page = page;
if (col)
- chip->cont_read.first_page++;
- chip->cont_read.last_page = page + ((readlen >> chip->page_shift) & chip->pagemask);
+ page++;
+
+ if (end_col && end_page)
+ end_page--;
+
+ if (page + 1 > end_page)
+ return;
+
+ chip->cont_read.first_page = page;
+ chip->cont_read.last_page = end_page;
+ chip->cont_read.ongoing = true;
+
rawnand_cap_cont_reads(chip);
}
--
2.34.1
On Wed, Feb 21, 2024 at 1:33 PM Jason A. Donenfeld <Jason(a)zx2c4.com> wrote:
>
> There are few uses of CoCo that don't rely on working cryptography and
> hence a working RNG. Unfortunately, the CoCo threat model means that the
> VM host cannot be trusted and may actively work against guests to
> extract secrets or manipulate computation. Since a malicious host can
> modify or observe nearly all inputs to guests, the only remaining source
> of entropy for CoCo guests is RDRAND.
>
> If RDRAND is broken -- due to CPU hardware fault -- the RNG as a whole
> is meant to gracefully continue on gathering entropy from other sources,
> but since there aren't other sources on CoCo, this is catastrophic.
> This is mostly a concern at boot time when initially seeding the RNG, as
> after that the consequences of a broken RDRAND are much more
> theoretical.
>
> So, try at boot to seed the RNG using 256 bits of RDRAND output. If this
> fails, panic(). This will also trigger if the system is booted without
> RDRAND, as RDRAND is essential for a safe CoCo boot.
>
> This patch is deliberately written to be "just a CoCo x86 driver
> feature" and not part of the RNG itself. Many device drivers and
> platforms have some desire to contribute something to the RNG, and
> add_device_randomness() is specifically meant for this purpose. Any
> driver can call this with seed data of any quality, or even garbage
> quality, and it can only possibly make the quality of the RNG better or
> have no effect, but can never make it worse. Rather than trying to
> build something into the core of the RNG, this patch interprets the
> particular CoCo issue as just a CoCo issue, and therefore separates this
> all out into driver (well, arch/platform) code.
>
> Cc: Borislav Petkov <bp(a)alien8.de>
> Cc: Daniel P. Berrangé <berrange(a)redhat.com>
> Cc: Dave Hansen <dave.hansen(a)linux.intel.com>
> Cc: Elena Reshetova <elena.reshetova(a)intel.com>
> Cc: H. Peter Anvin <hpa(a)zytor.com>
> Cc: Ingo Molnar <mingo(a)redhat.com>
> Cc: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
> Cc: Theodore Ts'o <tytso(a)mit.edu>
> Cc: Thomas Gleixner <tglx(a)linutronix.de>
> Signed-off-by: Jason A. Donenfeld <Jason(a)zx2c4.com>
Also,
Cc: stable(a)vger.kernel.org
At least, I think that's probably what we want, though I don't know
what version range is relevant for CoCo.
Syzkaller reports a potential circular dependency leading to deadlock
in 5.10 and 5.15 stable releases since the commit
92d4abd66f70 ("Bluetooth: vhci: Fix race when opening vhci device")
that caused this crash was backported to these branches.
The problem has been fixed by the following upstream patch that was
adapted to 5.10 and 5.15. All of the changes made to the patch
in order to adapt it are described at the end of commit message.
This patch has already been backported to the following stable branches:
v6.6 - https://lore.kernel.org/stable/20231230115814.038261305@linuxfoundation.org/
v6.1 - https://lore.kernel.org/stable/20231230115807.749489379@linuxfoundation.org/
Found by Linux Verification Center (linuxtesting.org) with Syzkaller.
From: Max Krummenacher <max.krummenacher(a)toradex.com>
Hello
With the backported commit e09ff743e30b ("mtd: rawnand: gpmi: Set
WAIT_FOR_READY timeout based on program/erase times") in kernel 5.4.y
I see corruption of the NAND content during kernel boot.
Reverting said commit on top of current 5.4.y fixes the issue.
It seems that the commit relies on commit 71c76f56b97c ("mtd:
rawnand: gpmi: Fix setting busy timeout setting"), but its
backport got reverted.
One should either backport both commits or none, having only one
results in potential bugs.
I've seen it in 5.4.y, however in 5.10.y and 5.15.y there one of
the two backports is also reverted and likely the same regression
exists.
Any comments?
Max
Max Krummenacher (1):
Revert "Revert "mtd: rawnand: gpmi: Fix setting busy timeout setting""
drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--
2.42.0
From: Dan Carpenter <dan.carpenter(a)linaro.org>
commit c301f0981fdd3fd1ffac6836b423c4d7a8e0eb63 upstream.
The problem is in nft_byteorder_eval() where we are iterating through a
loop and writing to dst[0], dst[1], dst[2] and so on... On each
iteration we are writing 8 bytes. But dst[] is an array of u32 so each
element only has space for 4 bytes. That means that every iteration
overwrites part of the previous element.
I spotted this bug while reviewing commit caf3ef7468f7 ("netfilter:
nf_tables: prevent OOB access in nft_byteorder_eval") which is a related
issue. I think that the reason we have not detected this bug in testing
is that most of time we only write one element.
Fixes: ce1e7989d989 ("netfilter: nft_byteorder: provide 64bit le/be conversion")
Signed-off-by: Dan Carpenter <dan.carpenter(a)linaro.org>
Signed-off-by: Pablo Neira Ayuso <pablo(a)netfilter.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
[Ajay: Modified to apply on v4.19.y]
Signed-off-by: Ajay Kaher <ajay.kaher(a)broadcom.com>
---
net/netfilter/nft_byteorder.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c
index dba1612..8c4ee49 100644
--- a/net/netfilter/nft_byteorder.c
+++ b/net/netfilter/nft_byteorder.c
@@ -41,19 +41,20 @@ static void nft_byteorder_eval(const struct nft_expr *expr,
switch (priv->size) {
case 8: {
+ u64 *dst64 = (void *)dst;
u64 src64;
switch (priv->op) {
case NFT_BYTEORDER_NTOH:
for (i = 0; i < priv->len / 8; i++) {
src64 = get_unaligned((u64 *)&src[i]);
- put_unaligned_be64(src64, &dst[i]);
+ put_unaligned_be64(src64, &dst64[i]);
}
break;
case NFT_BYTEORDER_HTON:
for (i = 0; i < priv->len / 8; i++) {
src64 = get_unaligned_be64(&src[i]);
- put_unaligned(src64, (u64 *)&dst[i]);
+ put_unaligned(src64, &dst64[i]);
}
break;
}
--
2.7.4
From: Alfred Piccioni <alpic(a)google.com>
commit f1bb47a31dff6d4b34fb14e99850860ee74bb003 upstream.
[Please apply to 5.4-stable and 4.19-stable. The upstream commit failed
to apply to these kernels. This patch resolves the conflicts.]
Some ioctl commands do not require ioctl permission, but are routed to
other permissions such as FILE_GETATTR or FILE_SETATTR. This routing is
done by comparing the ioctl cmd to a set of 64-bit flags (FS_IOC_*).
However, if a 32-bit process is running on a 64-bit kernel, it emits
32-bit flags (FS_IOC32_*) for certain ioctl operations. These flags are
being checked erroneously, which leads to these ioctl operations being
routed to the ioctl permission, rather than the correct file
permissions.
This was also noted in a RED-PEN finding from a while back -
"/* RED-PEN how should LSM module know it's handling 32bit? */".
This patch introduces a new hook, security_file_ioctl_compat(), that is
called from the compat ioctl syscall. All current LSMs have been changed
to support this hook.
Reviewing the three places where we are currently using
security_file_ioctl(), it appears that only SELinux needs a dedicated
compat change; TOMOYO and SMACK appear to be functional without any
change.
Cc: stable(a)vger.kernel.org
Fixes: 0b24dcb7f2f7 ("Revert "selinux: simplify ioctl checking"")
Signed-off-by: Alfred Piccioni <alpic(a)google.com>
Reviewed-by: Stephen Smalley <stephen.smalley.work(a)gmail.com>
[PM: subject tweak, line length fixes, and alignment corrections]
Signed-off-by: Paul Moore <paul(a)paul-moore.com>
Signed-off-by: Eric Biggers <ebiggers(a)google.com>
---
fs/compat_ioctl.c | 3 +--
include/linux/lsm_hooks.h | 9 +++++++++
include/linux/security.h | 9 +++++++++
security/security.c | 17 +++++++++++++++++
security/selinux/hooks.c | 28 ++++++++++++++++++++++++++++
security/smack/smack_lsm.c | 1 +
security/tomoyo/tomoyo.c | 1 +
7 files changed, 66 insertions(+), 2 deletions(-)
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 8fcc53d83af2d..22f7dc6688dee 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -994,8 +994,7 @@ COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd,
if (!f.file)
goto out;
- /* RED-PEN how should LSM module know it's handling 32bit? */
- error = security_file_ioctl(f.file, cmd, arg);
+ error = security_file_ioctl_compat(f.file, cmd, arg);
if (error)
goto out_fput;
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index a21dc5413653e..0f4897e97c70f 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -498,6 +498,12 @@
* simple integer value. When @arg represents a user space pointer, it
* should never be used by the security module.
* Return 0 if permission is granted.
+ * @file_ioctl_compat:
+ * @file contains the file structure.
+ * @cmd contains the operation to perform.
+ * @arg contains the operational arguments.
+ * Check permission for a compat ioctl operation on @file.
+ * Return 0 if permission is granted.
* @mmap_addr :
* Check permissions for a mmap operation at @addr.
* @addr contains virtual address that will be used for the operation.
@@ -1602,6 +1608,8 @@ union security_list_options {
void (*file_free_security)(struct file *file);
int (*file_ioctl)(struct file *file, unsigned int cmd,
unsigned long arg);
+ int (*file_ioctl_compat)(struct file *file, unsigned int cmd,
+ unsigned long arg);
int (*mmap_addr)(unsigned long addr);
int (*mmap_file)(struct file *file, unsigned long reqprot,
unsigned long prot, unsigned long flags);
@@ -1907,6 +1915,7 @@ struct security_hook_heads {
struct hlist_head file_alloc_security;
struct hlist_head file_free_security;
struct hlist_head file_ioctl;
+ struct hlist_head file_ioctl_compat;
struct hlist_head mmap_addr;
struct hlist_head mmap_file;
struct hlist_head file_mprotect;
diff --git a/include/linux/security.h b/include/linux/security.h
index aa5c7141c8d17..1a99958b850b5 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -362,6 +362,8 @@ int security_file_permission(struct file *file, int mask);
int security_file_alloc(struct file *file);
void security_file_free(struct file *file);
int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+int security_file_ioctl_compat(struct file *file, unsigned int cmd,
+ unsigned long arg);
int security_mmap_file(struct file *file, unsigned long prot,
unsigned long flags);
int security_mmap_addr(unsigned long addr);
@@ -907,6 +909,13 @@ static inline int security_file_ioctl(struct file *file, unsigned int cmd,
return 0;
}
+static inline int security_file_ioctl_compat(struct file *file,
+ unsigned int cmd,
+ unsigned long arg)
+{
+ return 0;
+}
+
static inline int security_mmap_file(struct file *file, unsigned long prot,
unsigned long flags)
{
diff --git a/security/security.c b/security/security.c
index 460c3826f6401..6c06296548c21 100644
--- a/security/security.c
+++ b/security/security.c
@@ -1422,6 +1422,23 @@ int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
return call_int_hook(file_ioctl, 0, file, cmd, arg);
}
+/**
+ * security_file_ioctl_compat() - Check if an ioctl is allowed in compat mode
+ * @file: associated file
+ * @cmd: ioctl cmd
+ * @arg: ioctl arguments
+ *
+ * Compat version of security_file_ioctl() that correctly handles 32-bit
+ * processes running on 64-bit kernels.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
+int security_file_ioctl_compat(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ return call_int_hook(file_ioctl_compat, 0, file, cmd, arg);
+}
+
static inline unsigned long mmap_prot(struct file *file, unsigned long prot)
{
/*
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index c1bf319b459a9..6fec9fba41a84 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -3668,6 +3668,33 @@ static int selinux_file_ioctl(struct file *file, unsigned int cmd,
return error;
}
+static int selinux_file_ioctl_compat(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ /*
+ * If we are in a 64-bit kernel running 32-bit userspace, we need to
+ * make sure we don't compare 32-bit flags to 64-bit flags.
+ */
+ switch (cmd) {
+ case FS_IOC32_GETFLAGS:
+ cmd = FS_IOC_GETFLAGS;
+ break;
+ case FS_IOC32_SETFLAGS:
+ cmd = FS_IOC_SETFLAGS;
+ break;
+ case FS_IOC32_GETVERSION:
+ cmd = FS_IOC_GETVERSION;
+ break;
+ case FS_IOC32_SETVERSION:
+ cmd = FS_IOC_SETVERSION;
+ break;
+ default:
+ break;
+ }
+
+ return selinux_file_ioctl(file, cmd, arg);
+}
+
static int default_noexec;
static int file_map_prot_check(struct file *file, unsigned long prot, int shared)
@@ -6933,6 +6960,7 @@ static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = {
LSM_HOOK_INIT(file_permission, selinux_file_permission),
LSM_HOOK_INIT(file_alloc_security, selinux_file_alloc_security),
LSM_HOOK_INIT(file_ioctl, selinux_file_ioctl),
+ LSM_HOOK_INIT(file_ioctl_compat, selinux_file_ioctl_compat),
LSM_HOOK_INIT(mmap_file, selinux_mmap_file),
LSM_HOOK_INIT(mmap_addr, selinux_mmap_addr),
LSM_HOOK_INIT(file_mprotect, selinux_file_mprotect),
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 9e48c8b36b678..6f2613f874fa9 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -4648,6 +4648,7 @@ static struct security_hook_list smack_hooks[] __lsm_ro_after_init = {
LSM_HOOK_INIT(file_alloc_security, smack_file_alloc_security),
LSM_HOOK_INIT(file_ioctl, smack_file_ioctl),
+ LSM_HOOK_INIT(file_ioctl_compat, smack_file_ioctl),
LSM_HOOK_INIT(file_lock, smack_file_lock),
LSM_HOOK_INIT(file_fcntl, smack_file_fcntl),
LSM_HOOK_INIT(mmap_file, smack_mmap_file),
diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c
index 716c92ec941ad..0176612bac967 100644
--- a/security/tomoyo/tomoyo.c
+++ b/security/tomoyo/tomoyo.c
@@ -554,6 +554,7 @@ static struct security_hook_list tomoyo_hooks[] __lsm_ro_after_init = {
LSM_HOOK_INIT(path_rename, tomoyo_path_rename),
LSM_HOOK_INIT(inode_getattr, tomoyo_inode_getattr),
LSM_HOOK_INIT(file_ioctl, tomoyo_file_ioctl),
+ LSM_HOOK_INIT(file_ioctl_compat, tomoyo_file_ioctl),
LSM_HOOK_INIT(path_chmod, tomoyo_path_chmod),
LSM_HOOK_INIT(path_chown, tomoyo_path_chown),
LSM_HOOK_INIT(path_chroot, tomoyo_path_chroot),
base-commit: f0602893f43a54097fcf22bd8c2f7b8e75ca643e
--
2.43.0
From: "Borislav Petkov (AMD)" <bp(a)alien8.de>
commit 04c3024560d3a14acd18d0a51a1d0a89d29b7eb5 upstream.
AMD does not have the requirement for a synchronization barrier when
acccessing a certain group of MSRs. Do not incur that unnecessary
penalty there.
There will be a CPUID bit which explicitly states that a MFENCE is not
needed. Once that bit is added to the APM, this will be extended with
it.
While at it, move to processor.h to avoid include hell. Untangling that
file properly is a matter for another day.
Some notes on the performance aspect of why this is relevant, courtesy
of Kishon VijayAbraham <Kishon.VijayAbraham(a)amd.com>:
On a AMD Zen4 system with 96 cores, a modified ipi-bench[1] on a VM
shows x2AVIC IPI rate is 3% to 4% lower than AVIC IPI rate. The
ipi-bench is modified so that the IPIs are sent between two vCPUs in the
same CCX. This also requires to pin the vCPU to a physical core to
prevent any latencies. This simulates the use case of pinning vCPUs to
the thread of a single CCX to avoid interrupt IPI latency.
In order to avoid run-to-run variance (for both x2AVIC and AVIC), the
below configurations are done:
1) Disable Power States in BIOS (to prevent the system from going to
lower power state)
2) Run the system at fixed frequency 2500MHz (to prevent the system
from increasing the frequency when the load is more)
With the above configuration:
*) Performance measured using ipi-bench for AVIC:
Average Latency: 1124.98ns [Time to send IPI from one vCPU to another vCPU]
Cumulative throughput: 42.6759M/s [Total number of IPIs sent in a second from
48 vCPUs simultaneously]
*) Performance measured using ipi-bench for x2AVIC:
Average Latency: 1172.42ns [Time to send IPI from one vCPU to another vCPU]
Cumulative throughput: 40.9432M/s [Total number of IPIs sent in a second from
48 vCPUs simultaneously]
From above, x2AVIC latency is ~4% more than AVIC. However, the expectation is
x2AVIC performance to be better or equivalent to AVIC. Upon analyzing
the perf captures, it is observed significant time is spent in
weak_wrmsr_fence() invoked by x2apic_send_IPI().
With the fix to skip weak_wrmsr_fence()
*) Performance measured using ipi-bench for x2AVIC:
Average Latency: 1117.44ns [Time to send IPI from one vCPU to another vCPU]
Cumulative throughput: 42.9608M/s [Total number of IPIs sent in a second from
48 vCPUs simultaneously]
Comparing the performance of x2AVIC with and without the fix, it can be seen
the performance improves by ~4%.
Performance captured using an unmodified ipi-bench using the 'mesh-ipi' option
with and without weak_wrmsr_fence() on a Zen4 system also showed significant
performance improvement without weak_wrmsr_fence(). The 'mesh-ipi' option ignores
CCX or CCD and just picks random vCPU.
Average throughput (10 iterations) with weak_wrmsr_fence(),
Cumulative throughput: 4933374 IPI/s
Average throughput (10 iterations) without weak_wrmsr_fence(),
Cumulative throughput: 6355156 IPI/s
[1] https://github.com/bytedance/kvm-utils/tree/master/microbenchmark/ipi-bench
Cc: stable(a)vger.kernel.org # 6.6+
Signed-off-by: Borislav Petkov (AMD) <bp(a)alien8.de>
Link: https://lore.kernel.org/r/20230622095212.20940-1-bp@alien8.de
Signed-off-by: Kishon Vijay Abraham I <kvijayab(a)amd.com>
---
Kindly merge this patch to stable releases (v6.6+) as it's a perf optimization.
[It does not apply as is on earlier releases and have to be reworked]
arch/x86/include/asm/barrier.h | 18 ------------------
arch/x86/include/asm/cpufeatures.h | 2 +-
arch/x86/include/asm/processor.h | 18 ++++++++++++++++++
arch/x86/kernel/cpu/amd.c | 3 +++
arch/x86/kernel/cpu/common.c | 7 +++++++
arch/x86/kernel/cpu/hygon.c | 3 +++
6 files changed, 32 insertions(+), 19 deletions(-)
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index 35389b2af88e..0216f63a366b 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -81,22 +81,4 @@ do { \
#include <asm-generic/barrier.h>
-/*
- * Make previous memory operations globally visible before
- * a WRMSR.
- *
- * MFENCE makes writes visible, but only affects load/store
- * instructions. WRMSR is unfortunately not a load/store
- * instruction and is unaffected by MFENCE. The LFENCE ensures
- * that the WRMSR is not reordered.
- *
- * Most WRMSRs are full serializing instructions themselves and
- * do not require this barrier. This is only required for the
- * IA32_TSC_DEADLINE and X2APIC MSRs.
- */
-static inline void weak_wrmsr_fence(void)
-{
- asm volatile("mfence; lfence" : : : "memory");
-}
-
#endif /* _ASM_X86_BARRIER_H */
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 58cb9495e40f..0091f1008314 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -308,10 +308,10 @@
#define X86_FEATURE_SMBA (11*32+21) /* "" Slow Memory Bandwidth Allocation */
#define X86_FEATURE_BMEC (11*32+22) /* "" Bandwidth Monitoring Event Configuration */
#define X86_FEATURE_USER_SHSTK (11*32+23) /* Shadow stack support for user mode applications */
-
#define X86_FEATURE_SRSO (11*32+24) /* "" AMD BTB untrain RETs */
#define X86_FEATURE_SRSO_ALIAS (11*32+25) /* "" AMD BTB untrain RETs through aliasing */
#define X86_FEATURE_IBPB_ON_VMEXIT (11*32+26) /* "" Issue an IBPB only on VMEXIT */
+#define X86_FEATURE_APIC_MSRS_FENCE (11*32+27) /* "" IA32_TSC_DEADLINE and X2APIC MSRs need fencing */
/* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
#define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index a3669a7774ed..191f1d8f0506 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -734,4 +734,22 @@ bool arch_is_platform_page(u64 paddr);
extern bool gds_ucode_mitigated(void);
+/*
+ * Make previous memory operations globally visible before
+ * a WRMSR.
+ *
+ * MFENCE makes writes visible, but only affects load/store
+ * instructions. WRMSR is unfortunately not a load/store
+ * instruction and is unaffected by MFENCE. The LFENCE ensures
+ * that the WRMSR is not reordered.
+ *
+ * Most WRMSRs are full serializing instructions themselves and
+ * do not require this barrier. This is only required for the
+ * IA32_TSC_DEADLINE and X2APIC MSRs.
+ */
+static inline void weak_wrmsr_fence(void)
+{
+ alternative("mfence; lfence", "", ALT_NOT(X86_FEATURE_APIC_MSRS_FENCE));
+}
+
#endif /* _ASM_X86_PROCESSOR_H */
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 6e4f23f314ac..bb3efc825bf4 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -1157,6 +1157,9 @@ static void init_amd(struct cpuinfo_x86 *c)
if (!cpu_has(c, X86_FEATURE_HYPERVISOR) &&
cpu_has_amd_erratum(c, amd_erratum_1485))
msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT);
+
+ /* AMD CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */
+ clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 4e5ffc8b0e46..d98d023ae497 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1858,6 +1858,13 @@ static void identify_cpu(struct cpuinfo_x86 *c)
c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
#endif
+
+ /*
+ * Set default APIC and TSC_DEADLINE MSR fencing flag. AMD and
+ * Hygon will clear it in ->c_init() below.
+ */
+ set_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
+
/*
* Vendor-specific initialization. In this section we
* canonicalize the feature flags, meaning if there are
diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c
index a7b3ef4c4de9..6e738759779e 100644
--- a/arch/x86/kernel/cpu/hygon.c
+++ b/arch/x86/kernel/cpu/hygon.c
@@ -348,6 +348,9 @@ static void init_hygon(struct cpuinfo_x86 *c)
set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
check_null_seg_clears_base(c);
+
+ /* Hygon CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */
+ clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
}
static void cpu_detect_tlb_hygon(struct cpuinfo_x86 *c)
--
2.34.1
Change WARN to pr_warn in check_map_prog_compatibility,
because this functionality was added in kernels 6.1 and
because fuzzing kernels with syzkaller while
kernel was started with parameter panic_on_warn
produces false positive crashes.
Signed-off-by: Alexander Ofitserov <oficerovas(a)altlinux.org>
Cc: stable(a)vger.kernel.org
---
kernel/bpf/verifier.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 45c50ee9b0370..7a7a6e3087ba2 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -10478,7 +10478,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
verbose(env, "trace type programs can only use preallocated hash map\n");
return -EINVAL;
}
- WARN_ONCE(1, "trace type BPF program uses run-time allocation\n");
+ pr_warn_once("trace type BPF program uses run-time allocation\n");
verbose(env, "trace type programs with run-time allocated hash maps are unsafe. Switch to preallocated hash maps.\n");
}
--
2.42.1
It is possible that an LPI mapped in a different ITS gets unmapped while
handling the MOVALL command. If that is the case, there is no state that
can be migrated to the destination. Silently ignore it and continue
migrating other LPIs.
Cc: stable(a)vger.kernel.org
Fixes: ff9c114394aa ("KVM: arm/arm64: GICv4: Handle MOVALL applied to a vPE")
Signed-off-by: Oliver Upton <oliver.upton(a)linux.dev>
---
arch/arm64/kvm/vgic/vgic-its.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
index 082448de27ed..28a93074eca1 100644
--- a/arch/arm64/kvm/vgic/vgic-its.c
+++ b/arch/arm64/kvm/vgic/vgic-its.c
@@ -1435,6 +1435,8 @@ static int vgic_its_cmd_handle_movall(struct kvm *kvm, struct vgic_its *its,
for (i = 0; i < irq_count; i++) {
irq = vgic_get_irq(kvm, NULL, intids[i]);
+ if (!irq)
+ continue;
update_affinity(irq, vcpu2);
--
2.44.0.rc0.258.g7320e95886-goog
vgic_get_irq() may not return a valid descriptor if there is no ITS that
holds a valid translation for the specified INTID. If that is the case,
it is safe to silently ignore it and continue processing the LPI pending
table.
Cc: stable(a)vger.kernel.org
Fixes: 33d3bc9556a7 ("KVM: arm64: vgic-its: Read initial LPI pending table")
Signed-off-by: Oliver Upton <oliver.upton(a)linux.dev>
---
arch/arm64/kvm/vgic/vgic-its.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
index e2764d0ffa9f..082448de27ed 100644
--- a/arch/arm64/kvm/vgic/vgic-its.c
+++ b/arch/arm64/kvm/vgic/vgic-its.c
@@ -468,6 +468,9 @@ static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu)
}
irq = vgic_get_irq(vcpu->kvm, NULL, intids[i]);
+ if (!irq)
+ continue;
+
raw_spin_lock_irqsave(&irq->irq_lock, flags);
irq->pending_latch = pendmask & (1U << bit_nr);
vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
--
2.44.0.rc0.258.g7320e95886-goog
Dear Nini,
Unfortunately I forgot to add a 'Fixes' tag to the patch, if I had, then it would
have happened automatically.
Please remind me of this once kernel 6.9-rc1 is released since that will contain
the fix. Then I can post the same patch to the stable mailinglist for inclusion in
older kernels.
It has to wait until 6.9-rc1 is release though, patches need to be in mainline first
before they can be backported.
Regards,
Hans
On 21/02/2024 07:30, Nini Song (宋宛妮) wrote:
> Dear Hans,
>
> Thank your reply.
> Could you also help to marge solution into v5.15? Our customer used v5.15 for MP production, which requires this solution.
>
>
> BR,
> Nini Song
> On Mon, 2024-02-05 at 13:00 +0100, Hans Verkuil wrote:
>>
>>
>> External email : Please do not click links or open attachments until you have verified the sender or the content.
>>
>> On 25/01/2024 14:28, nini.song(a)mediatek.com wrote:
>> > From: "nini.song" <nini.song(a)mediatek.com>
>> >
>> > The valid_la is used to check the length requirements,
>> > including special cases of Timer Status. If the length is
>> > shorter than 5, that means no Duration Available is returned,
>> > the message will be forced to be invalid.
>> >
>> > However, the description of Duration Available in the spec
>> > is that this parameter may be returned when these cases, or
>> > that it can be optionally return when these cases. The key
>> > words in the spec description are flexible choices.
>>
>> Good catch, the spec indeed says 'may', so dropping the check
>> in this patch is the correct thing to do.
>>
>> It's merged in our staging tree and it will appear in v6.9.
>>
>> Regards,
>>
>> Hans
>>
>> >
>> > Remove the special length check of Timer Status to fit the
>> > spec which is not compulsory about that.
>> >
>> > Signed-off-by: Nini Song <nini.song(a)mediatek.com>
>> > ---
>> > drivers/media/cec/core/cec-adap.c | 14 --------------
>> > 1 file changed, 14 deletions(-)
>> >
>> > diff --git a/drivers/media/cec/core/cec-adap.c b/drivers/media/cec/core/cec-adap.c
>> > index 5741adf09a2e..559a172ebc6c 100644
>> > --- a/drivers/media/cec/core/cec-adap.c
>> > +++ b/drivers/media/cec/core/cec-adap.c
>> > @@ -1151,20 +1151,6 @@ void cec_received_msg_ts(struct cec_adapter *adap,
>> > if (valid_la && min_len) {
>> > /* These messages have special length requirements */
>> > switch (cmd) {
>> > -case CEC_MSG_TIMER_STATUS:
>> > -if (msg->msg[2] & 0x10) {
>> > -switch (msg->msg[2] & 0xf) {
>> > -case CEC_OP_PROG_INFO_NOT_ENOUGH_SPACE:
>> > -case CEC_OP_PROG_INFO_MIGHT_NOT_BE_ENOUGH_SPACE:
>> > -if (msg->len < 5)
>> > -valid_la = false;
>> > -break;
>> > -}
>> > -} else if ((msg->msg[2] & 0xf) == CEC_OP_PROG_ERROR_DUPLICATE) {
>> > -if (msg->len < 5)
>> > -valid_la = false;
>> > -}
>> > -break;
>> > case CEC_MSG_RECORD_ON:
>> > switch (msg->msg[2]) {
>> > case CEC_OP_RECORD_SRC_OWN:
>>
>>
This is the fix of CVE-2024-23851 for kernel v6.1.
Upstream commit: https://github.com/torvalds/linux/commit/bd504bcfec41a503b32054da5472904b40…
Changed argument name "blk_mode_t" back to "fmode_t" for the old version. The argument
is not affected by the patch.
He Gao (1):
dm: limit the number of targets and parameter size area
drivers/md/dm-core.h | 2 ++
drivers/md/dm-ioctl.c | 3 ++-
drivers/md/dm-table.c | 9 +++++++--
3 files changed, 11 insertions(+), 3 deletions(-)
--
2.44.0.rc0.258.g7320e95886-goog
commit 5bc09b397cbf1221f8a8aacb1152650c9195b02b upstream.
According to a syzbot report, end_buffer_async_write(), which handles the
completion of block device writes, may detect abnormal condition of the
buffer async_write flag and cause a BUG_ON failure when using nilfs2.
Nilfs2 itself does not use end_buffer_async_write(). But, the async_write
flag is now used as a marker by commit 7f42ec394156 ("nilfs2: fix issue
with race condition of competition between segments for dirty blocks") as
a means of resolving double list insertion of dirty blocks in
nilfs_lookup_dirty_data_buffers() and nilfs_lookup_node_buffers() and the
resulting crash.
This modification is safe as long as it is used for file data and b-tree
node blocks where the page caches are independent. However, it was
irrelevant and redundant to also introduce async_write for segment summary
and super root blocks that share buffers with the backing device. This
led to the possibility that the BUG_ON check in end_buffer_async_write
would fail as described above, if independent writebacks of the backing
device occurred in parallel.
The use of async_write for segment summary buffers has already been
removed in a previous change.
Fix this issue by removing the manipulation of the async_write flag for
the remaining super root block buffer.
Link: https://lkml.kernel.org/r/20240203161645.4992-1-konishi.ryusuke@gmail.com
Fixes: 7f42ec394156 ("nilfs2: fix issue with race condition of competition between segments for dirty blocks")
Signed-off-by: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Reported-by: syzbot+5c04210f7c7f897c1e7f(a)syzkaller.appspotmail.com
Closes: https://lkml.kernel.org/r/00000000000019a97c05fd42f8c8@google.com
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
Please queue this patch to these stable trees instead of the patch
that failed to apply to them.
This patch is tailored to account for page/folio conversion and can
be applied from v4.8 to v6.7.
Also, all the builds and tests I did on each stable tree passed.
Thanks,
Ryusuke Konishi
fs/nilfs2/segment.c | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 55e31cc903d1..0f21dbcd0bfb 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1703,7 +1703,6 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci)
list_for_each_entry(bh, &segbuf->sb_payload_buffers,
b_assoc_buffers) {
- set_buffer_async_write(bh);
if (bh == segbuf->sb_super_root) {
if (bh->b_page != bd_page) {
lock_page(bd_page);
@@ -1714,6 +1713,7 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci)
}
break;
}
+ set_buffer_async_write(bh);
if (bh->b_page != fs_page) {
nilfs_begin_page_io(fs_page);
fs_page = bh->b_page;
@@ -1799,7 +1799,6 @@ static void nilfs_abort_logs(struct list_head *logs, int err)
list_for_each_entry(bh, &segbuf->sb_payload_buffers,
b_assoc_buffers) {
- clear_buffer_async_write(bh);
if (bh == segbuf->sb_super_root) {
clear_buffer_uptodate(bh);
if (bh->b_page != bd_page) {
@@ -1808,6 +1807,7 @@ static void nilfs_abort_logs(struct list_head *logs, int err)
}
break;
}
+ clear_buffer_async_write(bh);
if (bh->b_page != fs_page) {
nilfs_end_page_io(fs_page, err);
fs_page = bh->b_page;
@@ -1895,8 +1895,9 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
BIT(BH_Delay) | BIT(BH_NILFS_Volatile) |
BIT(BH_NILFS_Redirected));
- set_mask_bits(&bh->b_state, clear_bits, set_bits);
if (bh == segbuf->sb_super_root) {
+ set_buffer_uptodate(bh);
+ clear_buffer_dirty(bh);
if (bh->b_page != bd_page) {
end_page_writeback(bd_page);
bd_page = bh->b_page;
@@ -1904,6 +1905,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
update_sr = true;
break;
}
+ set_mask_bits(&bh->b_state, clear_bits, set_bits);
if (bh->b_page != fs_page) {
nilfs_end_page_io(fs_page, 0);
fs_page = bh->b_page;
--
2.39.3
On Tue, Feb 6, 2024 at 5:18 PM Saravana Kannan <saravanak(a)google.com> wrote:
>
> Similar to the existing "ports" node name, coresight device tree bindings
> have added "in-ports" and "out-ports" as standard node names for a
> collection of ports.
>
> Add support for these name to of_graph_get_port_parent() so that
> remote-endpoint parsing can find the correct parent node for these
> coresight ports too.
>
> Signed-off-by: Saravana Kannan <saravanak(a)google.com>
Greg,
I saw that you pulled the previous 2 patches in this series to 6.1,
6.6 and 6.7 kernel branches. I really should have added both of those
Fixes tag to this patch too.
Can you please pull in the patch to those stable branches too?
Thanks,
Saravana
> ---
> drivers/of/property.c | 4 +++-
> 1 file changed, 3 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/of/property.c b/drivers/of/property.c
> index 7bb2d8e290de..39a3ee1dfb58 100644
> --- a/drivers/of/property.c
> +++ b/drivers/of/property.c
> @@ -763,7 +763,9 @@ struct device_node *of_graph_get_port_parent(struct device_node *node)
> /* Walk 3 levels up only if there is 'ports' node. */
> for (depth = 3; depth && node; depth--) {
> node = of_get_next_parent(node);
> - if (depth == 2 && !of_node_name_eq(node, "ports"))
> + if (depth == 2 && !of_node_name_eq(node, "ports") &&
> + !of_node_name_eq(node, "in-ports") &&
> + !of_node_name_eq(node, "out-ports"))
> break;
> }
> return node;
> --
> 2.43.0.594.gd9cf4e227d-goog
>
In erofs_find_target_block() when erofs_dirnamecmp() returns 0,
we do not assign the target metabuf. This causes the caller
erofs_namei()'s erofs_put_metabuf() at the end to be not effective
leaving the refcount on the page.
As the page from metabuf (buf->page) is never put, such page cannot be
migrated or reclaimed. Fix it now by putting the metabuf from
previous loop and assigning the current metabuf to target before
returning so caller erofs_namei() can do the final put as it was
intended.
Fixes: 500edd095648 ("erofs: use meta buffers for inode lookup")
Cc: stable(a)vger.kernel.org
Signed-off-by: Sandeep Dhavale <dhavale(a)google.com>
---
fs/erofs/namei.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c
index d4f631d39f0f..bfe1c926436b 100644
--- a/fs/erofs/namei.c
+++ b/fs/erofs/namei.c
@@ -132,7 +132,10 @@ static void *erofs_find_target_block(struct erofs_buf *target,
if (!diff) {
*_ndirents = 0;
- goto out;
+ if (!IS_ERR(candidate))
+ erofs_put_metabuf(target);
+ *target = buf;
+ return de;
} else if (diff > 0) {
head = mid + 1;
startprfx = matched;
--
2.44.0.rc0.258.g7320e95886-goog
While mq_perf_tests runs with the default kselftest timeout limit, which
is 45 seconds, the test takes about 60 seconds to complete on i3.metal
AWS instances. Hence, the test always times out. Increase the timeout
to 180 seconds.
Fixes: 852c8cbf34d3 ("selftests/kselftest/runner.sh: Add 45 second timeout per test")
Cc: <stable(a)vger.kernel.org> # 5.4.x
Signed-off-by: SeongJae Park <sj(a)kernel.org>
Reviewed-by: Kees Cook <keescook(a)chromium.org>
---
Changes from v2
(https://lore.kernel.org/r/20240220000243.162285-1-sj@kernel.org)
- Update commit message about the new timeout limit to 180 seconds
- Remove wrong Link: line
Changes from v1
(https://lore.kernel.org/r/20240208212925.68286-1-sj@kernel.org)
- Use 180 seconds timeout instead of 100 seconds
tools/testing/selftests/mqueue/setting | 1 +
1 file changed, 1 insertion(+)
create mode 100644 tools/testing/selftests/mqueue/setting
diff --git a/tools/testing/selftests/mqueue/setting b/tools/testing/selftests/mqueue/setting
new file mode 100644
index 000000000000..a953c96aa16e
--- /dev/null
+++ b/tools/testing/selftests/mqueue/setting
@@ -0,0 +1 @@
+timeout=180
--
2.39.2
This is the fix of CVE-2024-23851 for kernel v5.10.
Upstream commit: https://github.com/torvalds/linux/commit/bd504bcfec41a503b32054da5472904b40…
Changed code not affected by the patch for the old version.
He Gao (1):
dm: limit the number of targets and parameter size area
drivers/md/dm-core.h | 2 ++
drivers/md/dm-ioctl.c | 3 ++-
drivers/md/dm-table.c | 9 +++++++--
3 files changed, 11 insertions(+), 3 deletions(-)
--
2.44.0.rc0.258.g7320e95886-goog
This is the fix of CVE-2024-23851 for kernel v5.15.
Upstream commit: https://github.com/torvalds/linux/commit/bd504bcfec41a503b32054da5472904b40…
Changed code not affected by the patch for the old version.
He Gao (1):
dm: limit the number of targets and parameter size area
drivers/md/dm-core.h | 2 ++
drivers/md/dm-ioctl.c | 3 ++-
drivers/md/dm-table.c | 9 +++++++--
3 files changed, 11 insertions(+), 3 deletions(-)
--
2.44.0.rc0.258.g7320e95886-goog
The quilt patch titled
Subject: mm/damon/lru_sort: fix quota status loss due to online tunings
has been removed from the -mm tree. Its filename was
mm-damon-lru_sort-fix-quota-status-loss-due-to-online-tunings.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: SeongJae Park <sj(a)kernel.org>
Subject: mm/damon/lru_sort: fix quota status loss due to online tunings
Date: Fri, 16 Feb 2024 11:40:25 -0800
For online parameters change, DAMON_LRU_SORT creates new schemes based on
latest values of the parameters and replaces the old schemes with the new
one. When creating it, the internal status of the quotas of the old
schemes is not preserved. As a result, charging of the quota starts from
zero after the online tuning. The data that collected to estimate the
throughput of the scheme's action is also reset, and therefore the
estimation should start from the scratch again. Because the throughput
estimation is being used to convert the time quota to the effective size
quota, this could result in temporal time quota inaccuracy. It would be
recovered over time, though. In short, the quota accuracy could be
temporarily degraded after online parameters update.
Fix the problem by checking the case and copying the internal fields for
the status.
Link: https://lkml.kernel.org/r/20240216194025.9207-3-sj@kernel.org
Fixes: 40e983cca927 ("mm/damon: introduce DAMON-based LRU-lists Sorting")
Signed-off-by: SeongJae Park <sj(a)kernel.org>
Cc: <stable(a)vger.kernel.org> [6.0+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/damon/lru_sort.c | 43 +++++++++++++++++++++++++++++++++++-------
1 file changed, 36 insertions(+), 7 deletions(-)
--- a/mm/damon/lru_sort.c~mm-damon-lru_sort-fix-quota-status-loss-due-to-online-tunings
+++ a/mm/damon/lru_sort.c
@@ -185,9 +185,21 @@ static struct damos *damon_lru_sort_new_
return damon_lru_sort_new_scheme(&pattern, DAMOS_LRU_DEPRIO);
}
+static void damon_lru_sort_copy_quota_status(struct damos_quota *dst,
+ struct damos_quota *src)
+{
+ dst->total_charged_sz = src->total_charged_sz;
+ dst->total_charged_ns = src->total_charged_ns;
+ dst->charged_sz = src->charged_sz;
+ dst->charged_from = src->charged_from;
+ dst->charge_target_from = src->charge_target_from;
+ dst->charge_addr_from = src->charge_addr_from;
+}
+
static int damon_lru_sort_apply_parameters(void)
{
- struct damos *scheme;
+ struct damos *scheme, *hot_scheme, *cold_scheme;
+ struct damos *old_hot_scheme = NULL, *old_cold_scheme = NULL;
unsigned int hot_thres, cold_thres;
int err = 0;
@@ -195,18 +207,35 @@ static int damon_lru_sort_apply_paramete
if (err)
return err;
+ damon_for_each_scheme(scheme, ctx) {
+ if (!old_hot_scheme) {
+ old_hot_scheme = scheme;
+ continue;
+ }
+ old_cold_scheme = scheme;
+ }
+
hot_thres = damon_max_nr_accesses(&damon_lru_sort_mon_attrs) *
hot_thres_access_freq / 1000;
- scheme = damon_lru_sort_new_hot_scheme(hot_thres);
- if (!scheme)
+ hot_scheme = damon_lru_sort_new_hot_scheme(hot_thres);
+ if (!hot_scheme)
return -ENOMEM;
- damon_set_schemes(ctx, &scheme, 1);
+ if (old_hot_scheme)
+ damon_lru_sort_copy_quota_status(&hot_scheme->quota,
+ &old_hot_scheme->quota);
cold_thres = cold_min_age / damon_lru_sort_mon_attrs.aggr_interval;
- scheme = damon_lru_sort_new_cold_scheme(cold_thres);
- if (!scheme)
+ cold_scheme = damon_lru_sort_new_cold_scheme(cold_thres);
+ if (!cold_scheme) {
+ damon_destroy_scheme(hot_scheme);
return -ENOMEM;
- damon_add_scheme(ctx, scheme);
+ }
+ if (old_cold_scheme)
+ damon_lru_sort_copy_quota_status(&cold_scheme->quota,
+ &old_cold_scheme->quota);
+
+ damon_set_schemes(ctx, &hot_scheme, 1);
+ damon_add_scheme(ctx, cold_scheme);
return damon_set_region_biggest_system_ram_default(target,
&monitor_region_start,
_
Patches currently in -mm which might be from sj(a)kernel.org are
docs-admin-guide-mm-damon-usage-use-sysfs-interface-for-tracepoints-example.patch
mm-damon-rename-config_damon_dbgfs-to-damon_dbgfs_deprecated.patch
mm-damon-dbgfs-implement-deprecation-notice-file.patch
mm-damon-dbgfs-make-debugfs-interface-deprecation-message-a-macro.patch
docs-admin-guide-mm-damon-usage-document-deprecated-file-of-damon-debugfs-interface.patch
selftets-damon-prepare-for-monitor_on-file-renaming.patch
mm-damon-dbgfs-rename-monitor_on-file-to-monitor_on_deprecated.patch
docs-admin-guide-mm-damon-usage-update-for-monitor_on-renaming.patch
docs-translations-damon-usage-update-for-monitor_on-renaming.patch
mm-damon-sysfs-handle-state-file-inputs-for-every-sampling-interval-if-possible.patch
selftests-damon-_damon_sysfs-support-damos-quota.patch
selftests-damon-_damon_sysfs-support-damos-stats.patch
selftests-damon-_damon_sysfs-support-damos-apply-interval.patch
selftests-damon-add-a-test-for-damos-quota.patch
selftests-damon-add-a-test-for-damos-apply-intervals.patch
selftests-damon-add-a-test-for-a-race-between-target_ids_read-and-dbgfs_before_terminate.patch
selftests-damon-add-a-test-for-the-pid-leak-of-dbgfs_target_ids_write.patch
selftests-damon-_chk_dependency-get-debugfs-mount-point-from-proc-mounts.patch
docs-mm-damon-maintainer-profile-fix-reference-links-for-mm-stable-tree.patch
docs-mm-damon-move-the-list-of-damos-actions-to-design-doc.patch
docs-mm-damon-move-damon-operation-sets-list-from-the-usage-to-the-design-document.patch
docs-mm-damon-move-monitoring-target-regions-setup-detail-from-the-usage-to-the-design-document.patch
docs-admin-guide-mm-damon-usage-fix-wrong-quotas-diabling-condition.patch
mm-damon-core-set-damos_quota-esz-as-public-field-and-document.patch
mm-damon-sysfs-schemes-implement-quota-effective_bytes-file.patch
mm-damon-sysfs-implement-a-kdamond-command-for-updating-schemes-effective-quotas.patch
docs-abi-damon-document-effective_bytes-sysfs-file.patch
docs-admin-guide-mm-damon-usage-document-effective_bytes-file.patch
mm-damon-move-comments-and-fields-for-damos-quota-prioritization-to-the-end.patch
mm-damon-core-split-out-quota-goal-related-fields-to-a-struct.patch
mm-damon-core-add-multiple-goals-per-damos_quota-and-helpers-for-those.patch
mm-damon-sysfs-use-only-quota-goals.patch
mm-damon-core-remove-goal-field-of-damos_quota.patch
mm-damon-core-let-goal-specified-with-only-target-and-current-values.patch
mm-damon-core-support-multiple-metrics-for-quota-goal.patch
mm-damon-core-implement-psi-metric-damos-quota-goal.patch
mm-damon-sysfs-schemes-support-psi-based-quota-auto-tune.patch
docs-mm-damon-design-document-quota-goal-self-tuning.patch
docs-abi-damon-document-quota-goal-metric-file.patch
docs-admin-guide-mm-damon-usage-document-quota-goal-metric-file.patch
mm-damon-reclaim-implement-user-feedback-driven-quota-auto-tuning.patch
mm-damon-reclaim-implement-memory-psi-driven-quota-self-tuning.patch
docs-admin-guide-mm-damon-reclaim-document-auto-tuning-parameters.patch
The quilt patch titled
Subject: mm/damon/reclaim: fix quota stauts loss due to online tunings
has been removed from the -mm tree. Its filename was
mm-damon-reclaim-fix-quota-stauts-loss-due-to-online-tunings.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: SeongJae Park <sj(a)kernel.org>
Subject: mm/damon/reclaim: fix quota stauts loss due to online tunings
Date: Fri, 16 Feb 2024 11:40:24 -0800
Patch series "mm/damon: fix quota status loss due to online tunings".
DAMON_RECLAIM and DAMON_LRU_SORT is not preserving internal quota status
when applying new user parameters, and hence could cause temporal quota
accuracy degradation. Fix it by preserving the status.
This patch (of 2):
For online parameters change, DAMON_RECLAIM creates new scheme based on
latest values of the parameters and replaces the old scheme with the new
one. When creating it, the internal status of the quota of the old
scheme is not preserved. As a result, charging of the quota starts from
zero after the online tuning. The data that collected to estimate the
throughput of the scheme's action is also reset, and therefore the
estimation should start from the scratch again. Because the throughput
estimation is being used to convert the time quota to the effective size
quota, this could result in temporal time quota inaccuracy. It would be
recovered over time, though. In short, the quota accuracy could be
temporarily degraded after online parameters update.
Fix the problem by checking the case and copying the internal fields for
the status.
Link: https://lkml.kernel.org/r/20240216194025.9207-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20240216194025.9207-2-sj@kernel.org
Fixes: e035c280f6df ("mm/damon/reclaim: support online inputs update")
Signed-off-by: SeongJae Park <sj(a)kernel.org>
Cc: <stable(a)vger.kernel.org> [5.19+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/damon/reclaim.c | 18 +++++++++++++++++-
1 file changed, 17 insertions(+), 1 deletion(-)
--- a/mm/damon/reclaim.c~mm-damon-reclaim-fix-quota-stauts-loss-due-to-online-tunings
+++ a/mm/damon/reclaim.c
@@ -150,9 +150,20 @@ static struct damos *damon_reclaim_new_s
&damon_reclaim_wmarks);
}
+static void damon_reclaim_copy_quota_status(struct damos_quota *dst,
+ struct damos_quota *src)
+{
+ dst->total_charged_sz = src->total_charged_sz;
+ dst->total_charged_ns = src->total_charged_ns;
+ dst->charged_sz = src->charged_sz;
+ dst->charged_from = src->charged_from;
+ dst->charge_target_from = src->charge_target_from;
+ dst->charge_addr_from = src->charge_addr_from;
+}
+
static int damon_reclaim_apply_parameters(void)
{
- struct damos *scheme;
+ struct damos *scheme, *old_scheme;
struct damos_filter *filter;
int err = 0;
@@ -164,6 +175,11 @@ static int damon_reclaim_apply_parameter
scheme = damon_reclaim_new_scheme();
if (!scheme)
return -ENOMEM;
+ if (!list_empty(&ctx->schemes)) {
+ damon_for_each_scheme(old_scheme, ctx)
+ damon_reclaim_copy_quota_status(&scheme->quota,
+ &old_scheme->quota);
+ }
if (skip_anon) {
filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true);
if (!filter) {
_
Patches currently in -mm which might be from sj(a)kernel.org are
docs-admin-guide-mm-damon-usage-use-sysfs-interface-for-tracepoints-example.patch
mm-damon-rename-config_damon_dbgfs-to-damon_dbgfs_deprecated.patch
mm-damon-dbgfs-implement-deprecation-notice-file.patch
mm-damon-dbgfs-make-debugfs-interface-deprecation-message-a-macro.patch
docs-admin-guide-mm-damon-usage-document-deprecated-file-of-damon-debugfs-interface.patch
selftets-damon-prepare-for-monitor_on-file-renaming.patch
mm-damon-dbgfs-rename-monitor_on-file-to-monitor_on_deprecated.patch
docs-admin-guide-mm-damon-usage-update-for-monitor_on-renaming.patch
docs-translations-damon-usage-update-for-monitor_on-renaming.patch
mm-damon-sysfs-handle-state-file-inputs-for-every-sampling-interval-if-possible.patch
selftests-damon-_damon_sysfs-support-damos-quota.patch
selftests-damon-_damon_sysfs-support-damos-stats.patch
selftests-damon-_damon_sysfs-support-damos-apply-interval.patch
selftests-damon-add-a-test-for-damos-quota.patch
selftests-damon-add-a-test-for-damos-apply-intervals.patch
selftests-damon-add-a-test-for-a-race-between-target_ids_read-and-dbgfs_before_terminate.patch
selftests-damon-add-a-test-for-the-pid-leak-of-dbgfs_target_ids_write.patch
selftests-damon-_chk_dependency-get-debugfs-mount-point-from-proc-mounts.patch
docs-mm-damon-maintainer-profile-fix-reference-links-for-mm-stable-tree.patch
docs-mm-damon-move-the-list-of-damos-actions-to-design-doc.patch
docs-mm-damon-move-damon-operation-sets-list-from-the-usage-to-the-design-document.patch
docs-mm-damon-move-monitoring-target-regions-setup-detail-from-the-usage-to-the-design-document.patch
docs-admin-guide-mm-damon-usage-fix-wrong-quotas-diabling-condition.patch
mm-damon-core-set-damos_quota-esz-as-public-field-and-document.patch
mm-damon-sysfs-schemes-implement-quota-effective_bytes-file.patch
mm-damon-sysfs-implement-a-kdamond-command-for-updating-schemes-effective-quotas.patch
docs-abi-damon-document-effective_bytes-sysfs-file.patch
docs-admin-guide-mm-damon-usage-document-effective_bytes-file.patch
mm-damon-move-comments-and-fields-for-damos-quota-prioritization-to-the-end.patch
mm-damon-core-split-out-quota-goal-related-fields-to-a-struct.patch
mm-damon-core-add-multiple-goals-per-damos_quota-and-helpers-for-those.patch
mm-damon-sysfs-use-only-quota-goals.patch
mm-damon-core-remove-goal-field-of-damos_quota.patch
mm-damon-core-let-goal-specified-with-only-target-and-current-values.patch
mm-damon-core-support-multiple-metrics-for-quota-goal.patch
mm-damon-core-implement-psi-metric-damos-quota-goal.patch
mm-damon-sysfs-schemes-support-psi-based-quota-auto-tune.patch
docs-mm-damon-design-document-quota-goal-self-tuning.patch
docs-abi-damon-document-quota-goal-metric-file.patch
docs-admin-guide-mm-damon-usage-document-quota-goal-metric-file.patch
mm-damon-reclaim-implement-user-feedback-driven-quota-auto-tuning.patch
mm-damon-reclaim-implement-memory-psi-driven-quota-self-tuning.patch
docs-admin-guide-mm-damon-reclaim-document-auto-tuning-parameters.patch
The quilt patch titled
Subject: mm: memcontrol: clarify swapaccount=0 deprecation warning
has been removed from the -mm tree. Its filename was
mm-memcontrol-clarify-swapaccount=0-deprecation-warning.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Johannes Weiner <hannes(a)cmpxchg.org>
Subject: mm: memcontrol: clarify swapaccount=0 deprecation warning
Date: Tue, 13 Feb 2024 03:16:34 -0500
The swapaccount deprecation warning is throwing false positives. Since we
deprecated the knob and defaulted to enabling, the only reports we've been
getting are from folks that set swapaccount=1. While this is a nice
affirmation that always-enabling was the right choice, we certainly don't
want to warn when users request the supported mode.
Only warn when disabling is requested, and clarify the warning.
[colin.i.king(a)gmail.com: spelling: "commdandline" -> "commandline"]
Link: https://lkml.kernel.org/r/20240215090544.1649201-1-colin.i.king@gmail.com
Link: https://lkml.kernel.org/r/20240213081634.3652326-1-hannes@cmpxchg.org
Fixes: b25806dcd3d5 ("mm: memcontrol: deprecate swapaccounting=0 mode")
Signed-off-by: Colin Ian King <colin.i.king(a)gmail.com>
Reported-by: "Jonas Sch��fer" <jonas(a)wielicki.name>
Reported-by: Narcis Garcia <debianlists(a)actiu.net>
Suggested-by: Yosry Ahmed <yosryahmed(a)google.com>
Signed-off-by: Johannes Weiner <hannes(a)cmpxchg.org>
Reviewed-by: Yosry Ahmed <yosryahmed(a)google.com>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Acked-by: Shakeel Butt <shakeelb(a)google.com>
Cc: Roman Gushchin <roman.gushchin(a)linux.dev>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/memcontrol.c | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
--- a/mm/memcontrol.c~mm-memcontrol-clarify-swapaccount=0-deprecation-warning
+++ a/mm/memcontrol.c
@@ -7971,9 +7971,13 @@ bool mem_cgroup_swap_full(struct folio *
static int __init setup_swap_account(char *s)
{
- pr_warn_once("The swapaccount= commandline option is deprecated. "
- "Please report your usecase to linux-mm(a)kvack.org if you "
- "depend on this functionality.\n");
+ bool res;
+
+ if (!kstrtobool(s, &res) && !res)
+ pr_warn_once("The swapaccount=0 commandline option is deprecated "
+ "in favor of configuring swap control via cgroupfs. "
+ "Please report your usecase to linux-mm(a)kvack.org if you "
+ "depend on this functionality.\n");
return 1;
}
__setup("swapaccount=", setup_swap_account);
_
Patches currently in -mm which might be from hannes(a)cmpxchg.org are
mm-zswap-rename-zswap_free_entry-to-zswap_entry_free.patch
mm-zswap-inline-and-remove-zswap_entry_find_get.patch
mm-zswap-move-zswap_invalidate_entry-to-related-functions.patch
mm-zswap-warn-when-referencing-a-dead-entry.patch
mm-zswap-clean-up-zswap_entry_put.patch
mm-zswap-rename-__zswap_load-to-zswap_decompress.patch
mm-zswap-break-out-zwap_compress.patch
mm-zswap-further-cleanup-zswap_store.patch
mm-zswap-simplify-zswap_invalidate.patch
mm-zswap-function-ordering-pool-alloc-free.patch
mm-zswap-function-ordering-pool-refcounting.patch
mm-zswap-function-ordering-zswap_pools.patch
mm-zswap-function-ordering-pool-params.patch
mm-zswap-function-ordering-public-lru-api.patch
mm-zswap-function-ordering-move-entry-sections-out-of-lru-section.patch
mm-zswap-function-ordering-move-entry-section-out-of-tree-section.patch
mm-zswap-function-ordering-compress-decompress-functions.patch
mm-zswap-function-ordering-per-cpu-compression-infra.patch
mm-zswap-function-ordering-writeback.patch
mm-zswap-function-ordering-shrink_memcg_cb.patch
The quilt patch titled
Subject: mm/zswap: invalidate duplicate entry when !zswap_enabled
has been removed from the -mm tree. Its filename was
mm-zswap-invalidate-duplicate-entry-when-zswap_enabled.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Chengming Zhou <zhouchengming(a)bytedance.com>
Subject: mm/zswap: invalidate duplicate entry when !zswap_enabled
Date: Thu, 8 Feb 2024 02:32:54 +0000
We have to invalidate any duplicate entry even when !zswap_enabled since
zswap can be disabled anytime. If the folio store success before, then
got dirtied again but zswap disabled, we won't invalidate the old
duplicate entry in the zswap_store(). So later lru writeback may
overwrite the new data in swapfile.
Link: https://lkml.kernel.org/r/20240208023254.3873823-1-chengming.zhou@linux.dev
Fixes: 42c06a0e8ebe ("mm: kill frontswap")
Signed-off-by: Chengming Zhou <zhouchengming(a)bytedance.com>
Acked-by: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Nhat Pham <nphamcs(a)gmail.com>
Cc: Yosry Ahmed <yosryahmed(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/zswap.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
--- a/mm/zswap.c~mm-zswap-invalidate-duplicate-entry-when-zswap_enabled
+++ a/mm/zswap.c
@@ -1518,7 +1518,7 @@ bool zswap_store(struct folio *folio)
if (folio_test_large(folio))
return false;
- if (!zswap_enabled || !tree)
+ if (!tree)
return false;
/*
@@ -1533,6 +1533,10 @@ bool zswap_store(struct folio *folio)
zswap_invalidate_entry(tree, dupentry);
}
spin_unlock(&tree->lock);
+
+ if (!zswap_enabled)
+ return false;
+
objcg = get_obj_cgroup_from_folio(folio);
if (objcg && !obj_cgroup_may_zswap(objcg)) {
memcg = get_mem_cgroup_from_objcg(objcg);
_
Patches currently in -mm which might be from zhouchengming(a)bytedance.com are
mm-zswap-make-sure-each-swapfile-always-have-zswap-rb-tree.patch
mm-zswap-split-zswap-rb-tree.patch
mm-zswap-fix-race-between-lru-writeback-and-swapoff.patch
mm-list_lru-remove-list_lru_putback.patch
mm-zswap-add-more-comments-in-shrink_memcg_cb.patch
mm-zswap-invalidate-zswap-entry-when-swap-entry-free.patch
mm-zswap-stop-lru-list-shrinking-when-encounter-warm-region.patch
mm-zswap-remove-duplicate_entry-debug-value.patch
mm-zswap-only-support-zswap_exclusive_loads_enabled.patch
mm-zswap-zswap-entry-doesnt-need-refcount-anymore.patch
mm-zswap-optimize-and-cleanup-the-invalidation-of-duplicate-entry.patch
mm-zsmalloc-fix-migrate_write_lock-when-config_compaction.patch
mm-zsmalloc-remove-migrate_write_lock_nested.patch
mm-zsmalloc-remove-unused-zspage-isolated.patch
mm-zswap-global-lru-and-shrinker-shared-by-all-zswap_pools.patch
mm-zswap-change-zswap_pool-kref-to-percpu_ref.patch
mm-zsmalloc-remove-set_zspage_mapping.patch
mm-zsmalloc-remove_zspage-dont-need-fullness-parameter.patch
mm-zsmalloc-remove-get_zspage_mapping.patch
maintainers-add-chengming-zhou-as-a-zswap-reviewer.patch
The quilt patch titled
Subject: lib/Kconfig.debug: TEST_IOV_ITER depends on MMU
has been removed from the -mm tree. Its filename was
lib-kconfigdebug-test_iov_iter-depends-on-mmu.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Guenter Roeck <linux(a)roeck-us.net>
Subject: lib/Kconfig.debug: TEST_IOV_ITER depends on MMU
Date: Thu, 8 Feb 2024 07:30:10 -0800
Trying to run the iov_iter unit test on a nommu system such as the qemu
kc705-nommu emulation results in a crash.
KTAP version 1
# Subtest: iov_iter
# module: kunit_iov_iter
1..9
BUG: failure at mm/nommu.c:318/vmap()!
Kernel panic - not syncing: BUG!
The test calls vmap() directly, but vmap() is not supported on nommu
systems, causing the crash. TEST_IOV_ITER therefore needs to depend on
MMU.
Link: https://lkml.kernel.org/r/20240208153010.1439753-1-linux@roeck-us.net
Fixes: 2d71340ff1d4 ("iov_iter: Kunit tests for copying to/from an iterator")
Signed-off-by: Guenter Roeck <linux(a)roeck-us.net>
Cc: David Howells <dhowells(a)redhat.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
lib/Kconfig.debug | 1 +
1 file changed, 1 insertion(+)
--- a/lib/Kconfig.debug~lib-kconfigdebug-test_iov_iter-depends-on-mmu
+++ a/lib/Kconfig.debug
@@ -2235,6 +2235,7 @@ config TEST_DIV64
config TEST_IOV_ITER
tristate "Test iov_iter operation" if !KUNIT_ALL_TESTS
depends on KUNIT
+ depends on MMU
default KUNIT_ALL_TESTS
help
Enable this to turn on testing of the operation of the I/O iterator
_
Patches currently in -mm which might be from linux(a)roeck-us.net are
The quilt patch titled
Subject: mm/swap: fix race when skipping swapcache
has been removed from the -mm tree. Its filename was
mm-swap-fix-race-when-skipping-swapcache.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Kairui Song <kasong(a)tencent.com>
Subject: mm/swap: fix race when skipping swapcache
Date: Wed, 7 Feb 2024 02:25:59 +0800
When skipping swapcache for SWP_SYNCHRONOUS_IO, if two or more threads
swapin the same entry at the same time, they get different pages (A, B).
Before one thread (T0) finishes the swapin and installs page (A) to the
PTE, another thread (T1) could finish swapin of page (B), swap_free the
entry, then swap out the possibly modified page reusing the same entry.
It breaks the pte_same check in (T0) because PTE value is unchanged,
causing ABA problem. Thread (T0) will install a stalled page (A) into the
PTE and cause data corruption.
One possible callstack is like this:
CPU0 CPU1
---- ----
do_swap_page() do_swap_page() with same entry
<direct swapin path> <direct swapin path>
<alloc page A> <alloc page B>
swap_read_folio() <- read to page A swap_read_folio() <- read to page B
<slow on later locks or interrupt> <finished swapin first>
... set_pte_at()
swap_free() <- entry is free
<write to page B, now page A stalled>
<swap out page B to same swap entry>
pte_same() <- Check pass, PTE seems
unchanged, but page A
is stalled!
swap_free() <- page B content lost!
set_pte_at() <- staled page A installed!
And besides, for ZRAM, swap_free() allows the swap device to discard the
entry content, so even if page (B) is not modified, if swap_read_folio()
on CPU0 happens later than swap_free() on CPU1, it may also cause data
loss.
To fix this, reuse swapcache_prepare which will pin the swap entry using
the cache flag, and allow only one thread to swap it in, also prevent any
parallel code from putting the entry in the cache. Release the pin after
PT unlocked.
Racers just loop and wait since it's a rare and very short event. A
schedule_timeout_uninterruptible(1) call is added to avoid repeated page
faults wasting too much CPU, causing livelock or adding too much noise to
perf statistics. A similar livelock issue was described in commit
029c4628b2eb ("mm: swap: get rid of livelock in swapin readahead")
Reproducer:
This race issue can be triggered easily using a well constructed
reproducer and patched brd (with a delay in read path) [1]:
With latest 6.8 mainline, race caused data loss can be observed easily:
$ gcc -g -lpthread test-thread-swap-race.c && ./a.out
Polulating 32MB of memory region...
Keep swapping out...
Starting round 0...
Spawning 65536 workers...
32746 workers spawned, wait for done...
Round 0: Error on 0x5aa00, expected 32746, got 32743, 3 data loss!
Round 0: Error on 0x395200, expected 32746, got 32743, 3 data loss!
Round 0: Error on 0x3fd000, expected 32746, got 32737, 9 data loss!
Round 0 Failed, 15 data loss!
This reproducer spawns multiple threads sharing the same memory region
using a small swap device. Every two threads updates mapped pages one by
one in opposite direction trying to create a race, with one dedicated
thread keep swapping out the data out using madvise.
The reproducer created a reproduce rate of about once every 5 minutes, so
the race should be totally possible in production.
After this patch, I ran the reproducer for over a few hundred rounds and
no data loss observed.
Performance overhead is minimal, microbenchmark swapin 10G from 32G
zram:
Before: 10934698 us
After: 11157121 us
Cached: 13155355 us (Dropping SWP_SYNCHRONOUS_IO flag)
[kasong(a)tencent.com: v4]
Link: https://lkml.kernel.org/r/20240219082040.7495-1-ryncsn@gmail.com
Link: https://lkml.kernel.org/r/20240206182559.32264-1-ryncsn@gmail.com
Fixes: 0bcac06f27d7 ("mm, swap: skip swapcache for swapin of synchronous device")
Reported-by: "Huang, Ying" <ying.huang(a)intel.com>
Closes: https://lore.kernel.org/lkml/87bk92gqpx.fsf_-_@yhuang6-desk2.ccr.corp.intel…
Link: https://github.com/ryncsn/emm-test-project/tree/master/swap-stress-race [1]
Signed-off-by: Kairui Song <kasong(a)tencent.com>
Reviewed-by: "Huang, Ying" <ying.huang(a)intel.com>
Acked-by: Yu Zhao <yuzhao(a)google.com>
Acked-by: David Hildenbrand <david(a)redhat.com>
Acked-by: Chris Li <chrisl(a)kernel.org>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: Minchan Kim <minchan(a)kernel.org>
Cc: Yosry Ahmed <yosryahmed(a)google.com>
Cc: Yu Zhao <yuzhao(a)google.com>
Cc: Barry Song <21cnbao(a)gmail.com>
Cc: SeongJae Park <sj(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/swap.h | 5 +++++
mm/memory.c | 20 ++++++++++++++++++++
mm/swap.h | 5 +++++
mm/swapfile.c | 13 +++++++++++++
4 files changed, 43 insertions(+)
--- a/include/linux/swap.h~mm-swap-fix-race-when-skipping-swapcache
+++ a/include/linux/swap.h
@@ -549,6 +549,11 @@ static inline int swap_duplicate(swp_ent
return 0;
}
+static inline int swapcache_prepare(swp_entry_t swp)
+{
+ return 0;
+}
+
static inline void swap_free(swp_entry_t swp)
{
}
--- a/mm/memory.c~mm-swap-fix-race-when-skipping-swapcache
+++ a/mm/memory.c
@@ -3799,6 +3799,7 @@ vm_fault_t do_swap_page(struct vm_fault
struct page *page;
struct swap_info_struct *si = NULL;
rmap_t rmap_flags = RMAP_NONE;
+ bool need_clear_cache = false;
bool exclusive = false;
swp_entry_t entry;
pte_t pte;
@@ -3867,6 +3868,20 @@ vm_fault_t do_swap_page(struct vm_fault
if (!folio) {
if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
__swap_count(entry) == 1) {
+ /*
+ * Prevent parallel swapin from proceeding with
+ * the cache flag. Otherwise, another thread may
+ * finish swapin first, free the entry, and swapout
+ * reusing the same entry. It's undetectable as
+ * pte_same() returns true due to entry reuse.
+ */
+ if (swapcache_prepare(entry)) {
+ /* Relax a bit to prevent rapid repeated page faults */
+ schedule_timeout_uninterruptible(1);
+ goto out;
+ }
+ need_clear_cache = true;
+
/* skip swapcache */
folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0,
vma, vmf->address, false);
@@ -4117,6 +4132,9 @@ unlock:
if (vmf->pte)
pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
+ /* Clear the swap cache pin for direct swapin after PTL unlock */
+ if (need_clear_cache)
+ swapcache_clear(si, entry);
if (si)
put_swap_device(si);
return ret;
@@ -4131,6 +4149,8 @@ out_release:
folio_unlock(swapcache);
folio_put(swapcache);
}
+ if (need_clear_cache)
+ swapcache_clear(si, entry);
if (si)
put_swap_device(si);
return ret;
--- a/mm/swapfile.c~mm-swap-fix-race-when-skipping-swapcache
+++ a/mm/swapfile.c
@@ -3365,6 +3365,19 @@ int swapcache_prepare(swp_entry_t entry)
return __swap_duplicate(entry, SWAP_HAS_CACHE);
}
+void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry)
+{
+ struct swap_cluster_info *ci;
+ unsigned long offset = swp_offset(entry);
+ unsigned char usage;
+
+ ci = lock_cluster_or_swap_info(si, offset);
+ usage = __swap_entry_free_locked(si, offset, SWAP_HAS_CACHE);
+ unlock_cluster_or_swap_info(si, ci);
+ if (!usage)
+ free_swap_slot(entry);
+}
+
struct swap_info_struct *swp_swap_info(swp_entry_t entry)
{
return swap_type_to_swap_info(swp_type(entry));
--- a/mm/swap.h~mm-swap-fix-race-when-skipping-swapcache
+++ a/mm/swap.h
@@ -41,6 +41,7 @@ void __delete_from_swap_cache(struct fol
void delete_from_swap_cache(struct folio *folio);
void clear_shadow_from_swap_cache(int type, unsigned long begin,
unsigned long end);
+void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry);
struct folio *swap_cache_get_folio(swp_entry_t entry,
struct vm_area_struct *vma, unsigned long addr);
struct folio *filemap_get_incore_folio(struct address_space *mapping,
@@ -97,6 +98,10 @@ static inline int swap_writepage(struct
return 0;
}
+static inline void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry)
+{
+}
+
static inline struct folio *swap_cache_get_folio(swp_entry_t entry,
struct vm_area_struct *vma, unsigned long addr)
{
_
Patches currently in -mm which might be from kasong(a)tencent.com are
The quilt patch titled
Subject: selftests/mm: uffd-unit-test check if huge page size is 0
has been removed from the -mm tree. Its filename was
selftests-mm-uffd-unit-test-check-if-huge-page-size-is-0.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Terry Tritton <terry.tritton(a)linaro.org>
Subject: selftests/mm: uffd-unit-test check if huge page size is 0
Date: Mon, 5 Feb 2024 14:50:56 +0000
If HUGETLBFS is not enabled then the default_huge_page_size function will
return 0 and cause a divide by 0 error. Add a check to see if the huge page
size is 0 and skip the hugetlb tests if it is.
Link: https://lkml.kernel.org/r/20240205145055.3545806-2-terry.tritton@linaro.org
Fixes: 16a45b57cbf2 ("selftests/mm: add framework for uffd-unit-test")
Signed-off-by: Terry Tritton <terry.tritton(a)linaro.org>
Cc: Peter Griffin <peter.griffin(a)linaro.org>
Cc: Shuah Khan <shuah(a)kernel.org>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
tools/testing/selftests/mm/uffd-unit-tests.c | 6 ++++++
1 file changed, 6 insertions(+)
--- a/tools/testing/selftests/mm/uffd-unit-tests.c~selftests-mm-uffd-unit-test-check-if-huge-page-size-is-0
+++ a/tools/testing/selftests/mm/uffd-unit-tests.c
@@ -1517,6 +1517,12 @@ int main(int argc, char *argv[])
continue;
uffd_test_start("%s on %s", test->name, mem_type->name);
+ if ((mem_type->mem_flag == MEM_HUGETLB ||
+ mem_type->mem_flag == MEM_HUGETLB_PRIVATE) &&
+ (default_huge_page_size() == 0)) {
+ uffd_test_skip("huge page size is 0, feature missing?");
+ continue;
+ }
if (!uffd_feature_supported(test)) {
uffd_test_skip("feature missing");
continue;
_
Patches currently in -mm which might be from terry.tritton(a)linaro.org are
The quilt patch titled
Subject: mm: zswap: fix missing folio cleanup in writeback race path
has been removed from the -mm tree. Its filename was
mm-zswap-fix-missing-folio-cleanup-in-writeback-race-path.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Yosry Ahmed <yosryahmed(a)google.com>
Subject: mm: zswap: fix missing folio cleanup in writeback race path
Date: Thu, 25 Jan 2024 08:51:27 +0000
In zswap_writeback_entry(), after we get a folio from
__read_swap_cache_async(), we grab the tree lock again to check that the
swap entry was not invalidated and recycled. If it was, we delete the
folio we just added to the swap cache and exit.
However, __read_swap_cache_async() returns the folio locked when it is
newly allocated, which is always true for this path, and the folio is
ref'd. Make sure to unlock and put the folio before returning.
This was discovered by code inspection, probably because this path handles
a race condition that should not happen often, and the bug would not crash
the system, it will only strand the folio indefinitely.
Link: https://lkml.kernel.org/r/20240125085127.1327013-1-yosryahmed@google.com
Fixes: 04fc7816089c ("mm: fix zswap writeback race condition")
Signed-off-by: Yosry Ahmed <yosryahmed(a)google.com>
Reviewed-by: Chengming Zhou <zhouchengming(a)bytedance.com>
Acked-by: Johannes Weiner <hannes(a)cmpxchg.org>
Reviewed-by: Nhat Pham <nphamcs(a)gmail.com>
Cc: Domenico Cerasuolo <cerasuolodomenico(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/zswap.c | 2 ++
1 file changed, 2 insertions(+)
--- a/mm/zswap.c~mm-zswap-fix-missing-folio-cleanup-in-writeback-race-path
+++ a/mm/zswap.c
@@ -1440,6 +1440,8 @@ static int zswap_writeback_entry(struct
if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) {
spin_unlock(&tree->lock);
delete_from_swap_cache(folio);
+ folio_unlock(folio);
+ folio_put(folio);
return -ENOMEM;
}
spin_unlock(&tree->lock);
_
Patches currently in -mm which might be from yosryahmed(a)google.com are
mm-swap-enforce-updating-inuse_pages-at-the-end-of-swap_range_free.patch
mm-zswap-remove-unnecessary-trees-cleanups-in-zswap_swapoff.patch
mm-zswap-remove-unused-tree-argument-in-zswap_entry_put.patch
x86-mm-delete-unused-cpu-argument-to-leave_mm.patch
x86-mm-clarify-prev-usage-in-switch_mm_irqs_off.patch
The patch below does not apply to the 6.7-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.7.y
git checkout FETCH_HEAD
git cherry-pick -x e870920bbe68e52335a4c31a059e6af6a9a59dbb
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021921-bleak-sputter-5ecf@gregkh' --subject-prefix 'PATCH 6.7.y' HEAD^..
Possible dependencies:
e870920bbe68 ("arch/arm/mm: fix major fault accounting when retrying under per-VMA lock")
c16af1212479 ("ARM: 9328/1: mm: try VMA lock-based page fault handling first")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From e870920bbe68e52335a4c31a059e6af6a9a59dbb Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb(a)google.com>
Date: Mon, 22 Jan 2024 22:43:05 -0800
Subject: [PATCH] arch/arm/mm: fix major fault accounting when retrying under
per-VMA lock
The change [1] missed ARM architecture when fixing major fault accounting
for page fault retry under per-VMA lock.
The user-visible effects is that it restores correct major fault
accounting that was broken after [2] was merged in 6.7 kernel. The
more detailed description is in [3] and this patch simply adds the
same fix to ARM architecture which I missed in [3].
Add missing code to fix ARM architecture fault accounting.
[1] 46e714c729c8 ("arch/mm/fault: fix major fault accounting when retrying under per-VMA lock")
[2] https://lore.kernel.org/all/20231006195318.4087158-6-willy@infradead.org/
[3] https://lore.kernel.org/all/20231226214610.109282-1-surenb@google.com/
Link: https://lkml.kernel.org/r/20240123064305.2829244-1-surenb@google.com
Fixes: 12214eba1992 ("mm: handle read faults under the VMA lock")
Reported-by: Russell King (Oracle) <rmk+kernel(a)armlinux.org.uk>
Signed-off-by: Suren Baghdasaryan <surenb(a)google.com>
Cc: Alexander Gordeev <agordeev(a)linux.ibm.com>
Cc: Andy Lutomirski <luto(a)kernel.org>
Cc: Catalin Marinas <catalin.marinas(a)arm.com>
Cc: Christophe Leroy <christophe.leroy(a)csgroup.eu>
Cc: Dave Hansen <dave.hansen(a)linux.intel.com>
Cc: Gerald Schaefer <gerald.schaefer(a)linux.ibm.com>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Michael Ellerman <mpe(a)ellerman.id.au>
Cc: Palmer Dabbelt <palmer(a)dabbelt.com>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Will Deacon <will(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index e96fb40b9cc3..07565b593ed6 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -298,6 +298,8 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
goto done;
}
count_vm_vma_lock_event(VMA_LOCK_RETRY);
+ if (fault & VM_FAULT_MAJOR)
+ flags |= FAULT_FLAG_TRIED;
/* Quick path to respond to signals */
if (fault_signal_pending(fault, regs)) {
Hello,
I am sending this patch for inclusion in the stable tree, as it fixes
a critical stack-out-of-bounds bug in the cifs module related to the
`smb2_set_next_command()` function.
Problem Summary:
A problem was observed in the `statfs` system call for cifs, where it
failed with a "Resource temporarily unavailable" message. Further
investigation with KASAN revealed a stack-out-of-bounds error. The
root cause was a miscalculation of the size of the `smb2_query_info_req`
structure in the `SMB2_query_info_init()` function.
This situation arose due to a dependency on a prior commit
(`eb3e28c1e89b`) that replaced a 1-element array with a flexible
array member in the `smb2_query_info_req` structure. This commit was
not backported to the 5.10.y and 5.15.y stable branch, leading to an
incorrect size calculation after the backport of commit `33eae65c6f49`.
Fix Details:
The patch corrects the size calculation to ensure the correct length
is used when initializing the `smb2_query_info_req` structure. It has
been tested and confirmed to resolve the issue without introducing
any regressions.
Maybe the prior commit eb3e28c1e89b ("smb3: Replace smb2pdu 1-element
arrays with flex-arrays") should be backported to solve this problem
directly. The patch does not seem to conflict.
Best regards,
ZhaoLong Wang
ZhaoLong Wang (1):
cifs: Fix stack-out-of-bounds in smb2_set_next_command()
fs/cifs/smb2pdu.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--
2.39.2
The patch titled
Subject: mm/debug_vm_pgtable: fix BUG_ON with pud advanced test
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-debug_vm_pgtable-fix-bug_on-with-pud-advanced-test.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: "Aneesh Kumar K.V (IBM)" <aneesh.kumar(a)kernel.org>
Subject: mm/debug_vm_pgtable: fix BUG_ON with pud advanced test
Date: Mon, 29 Jan 2024 11:30:22 +0530
Architectures like powerpc add debug checks to ensure we find only devmap
PUD pte entries. These debug checks are only done with CONFIG_DEBUG_VM.
This patch marks the ptes used for PUD advanced test devmap pte entries so
that we don't hit on debug checks on architecture like ppc64 as below.
WARNING: CPU: 2 PID: 1 at arch/powerpc/mm/book3s64/radix_pgtable.c:1382 radix__pud_hugepage_update+0x38/0x138
....
NIP [c0000000000a7004] radix__pud_hugepage_update+0x38/0x138
LR [c0000000000a77a8] radix__pudp_huge_get_and_clear+0x28/0x60
Call Trace:
[c000000004a2f950] [c000000004a2f9a0] 0xc000000004a2f9a0 (unreliable)
[c000000004a2f980] [000d34c100000000] 0xd34c100000000
[c000000004a2f9a0] [c00000000206ba98] pud_advanced_tests+0x118/0x334
[c000000004a2fa40] [c00000000206db34] debug_vm_pgtable+0xcbc/0x1c48
[c000000004a2fc10] [c00000000000fd28] do_one_initcall+0x60/0x388
Also
kernel BUG at arch/powerpc/mm/book3s64/pgtable.c:202!
....
NIP [c000000000096510] pudp_huge_get_and_clear_full+0x98/0x174
LR [c00000000206bb34] pud_advanced_tests+0x1b4/0x334
Call Trace:
[c000000004a2f950] [000d34c100000000] 0xd34c100000000 (unreliable)
[c000000004a2f9a0] [c00000000206bb34] pud_advanced_tests+0x1b4/0x334
[c000000004a2fa40] [c00000000206db34] debug_vm_pgtable+0xcbc/0x1c48
[c000000004a2fc10] [c00000000000fd28] do_one_initcall+0x60/0x388
Link: https://lkml.kernel.org/r/20240129060022.68044-1-aneesh.kumar@kernel.org
Fixes: 27af67f35631 ("powerpc/book3s64/mm: enable transparent pud hugepage")
Signed-off-by: Aneesh Kumar K.V (IBM) <aneesh.kumar(a)kernel.org>
Cc: Anshuman Khandual <anshuman.khandual(a)arm.com>
Cc: Michael Ellerman <mpe(a)ellerman.id.au>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/debug_vm_pgtable.c | 8 ++++++++
1 file changed, 8 insertions(+)
--- a/mm/debug_vm_pgtable.c~mm-debug_vm_pgtable-fix-bug_on-with-pud-advanced-test
+++ a/mm/debug_vm_pgtable.c
@@ -362,6 +362,12 @@ static void __init pud_advanced_tests(st
vaddr &= HPAGE_PUD_MASK;
pud = pfn_pud(args->pud_pfn, args->page_prot);
+ /*
+ * Some architectures have debug checks to make sure
+ * huge pud mapping are only found with devmap entries
+ * For now test with only devmap entries.
+ */
+ pud = pud_mkdevmap(pud);
set_pud_at(args->mm, vaddr, args->pudp, pud);
flush_dcache_page(page);
pudp_set_wrprotect(args->mm, vaddr, args->pudp);
@@ -374,6 +380,7 @@ static void __init pud_advanced_tests(st
WARN_ON(!pud_none(pud));
#endif /* __PAGETABLE_PMD_FOLDED */
pud = pfn_pud(args->pud_pfn, args->page_prot);
+ pud = pud_mkdevmap(pud);
pud = pud_wrprotect(pud);
pud = pud_mkclean(pud);
set_pud_at(args->mm, vaddr, args->pudp, pud);
@@ -391,6 +398,7 @@ static void __init pud_advanced_tests(st
#endif /* __PAGETABLE_PMD_FOLDED */
pud = pfn_pud(args->pud_pfn, args->page_prot);
+ pud = pud_mkdevmap(pud);
pud = pud_mkyoung(pud);
set_pud_at(args->mm, vaddr, args->pudp, pud);
flush_dcache_page(page);
_
Patches currently in -mm which might be from aneesh.kumar(a)kernel.org are
mm-debug_vm_pgtable-fix-bug_on-with-pud-advanced-test.patch
From: Alexander Usyskin <alexander.usyskin(a)intel.com>
On Arrow Lake S systems, MEI is no longer strictly connected to bus 0,
while graphics remain exclusively on bus 0. Adapt the component
matching logic to accommodate this change:
Original behavior: Required both MEI and graphics to be on the same
bus 0.
New behavior: Only enforces graphics to be on bus 0 (integrated),
allowing MEI to reside on any bus.
This ensures compatibility with Arrow Lake S and maintains functionality
for the legacy systems.
Fixes: 1dd924f6885b ("mei: gsc_proxy: add gsc proxy driver")
Cc: <stable(a)vger.kernel.org> # v6.3+
Signed-off-by: Alexander Usyskin <alexander.usyskin(a)intel.com>
Signed-off-by: Tomas Winkler <tomas.winkler(a)intel.com>
---
V2: Add reference to fixed commit
Requires 'mei: me: add arrow lake point S DID'
https://lore.kernel.org/lkml/20240211103912.117105-1-tomas.winkler@intel.co…
drivers/misc/mei/gsc_proxy/mei_gsc_proxy.c | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/drivers/misc/mei/gsc_proxy/mei_gsc_proxy.c b/drivers/misc/mei/gsc_proxy/mei_gsc_proxy.c
index be52b113aea937c7c658e06c..89364bdbb1290f5726a34945 100644
--- a/drivers/misc/mei/gsc_proxy/mei_gsc_proxy.c
+++ b/drivers/misc/mei/gsc_proxy/mei_gsc_proxy.c
@@ -96,7 +96,8 @@ static const struct component_master_ops mei_component_master_ops = {
*
* The function checks if the device is pci device and
* Intel VGA adapter, the subcomponent is SW Proxy
- * and the parent of MEI PCI and the parent of VGA are the same PCH device.
+ * and the VGA is on the bus 0 reserved for built-in devices
+ * to reject discrete GFX.
*
* @dev: master device
* @subcomponent: subcomponent to match (I915_COMPONENT_SWPROXY)
@@ -123,7 +124,8 @@ static int mei_gsc_proxy_component_match(struct device *dev, int subcomponent,
if (subcomponent != I915_COMPONENT_GSC_PROXY)
return 0;
- return component_compare_dev(dev->parent, ((struct device *)data)->parent);
+ /* Only built-in GFX */
+ return (pdev->bus->number == 0);
}
static int mei_gsc_proxy_probe(struct mei_cl_device *cldev,
@@ -146,7 +148,7 @@ static int mei_gsc_proxy_probe(struct mei_cl_device *cldev,
}
component_match_add_typed(&cldev->dev, &master_match,
- mei_gsc_proxy_component_match, cldev->dev.parent);
+ mei_gsc_proxy_component_match, NULL);
if (IS_ERR_OR_NULL(master_match)) {
ret = -ENOMEM;
goto err_exit;
--
2.43.0
From: Alexander Usyskin <alexander.usyskin(a)intel.com>
On Arrow Lake S systems, MEI is no longer strictly connected to bus 0,
while graphics remain exclusively on bus 0. Adapt the component
matching logic to accommodate this change:
Original behavior: Required both MEI and graphics to be on the same
bus 0.
New behavior: Only enforces graphics to be on bus 0 (integrated),
allowing MEI to reside on any bus.
This ensures compatibility with Arrow Lake S and maintains functionality
for the legacy systems.
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Alexander Usyskin <alexander.usyskin(a)intel.com>
Signed-off-by: Tomas Winkler <tomas.winkler(a)intel.com>
---
Requires 'mei: me: add arrow lake point S DID'
https://lore.kernel.org/lkml/20240211103912.117105-1-tomas.winkler@intel.co…
drivers/misc/mei/gsc_proxy/mei_gsc_proxy.c | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/drivers/misc/mei/gsc_proxy/mei_gsc_proxy.c b/drivers/misc/mei/gsc_proxy/mei_gsc_proxy.c
index be52b113aea937c7c658e06c..89364bdbb1290f5726a34945 100644
--- a/drivers/misc/mei/gsc_proxy/mei_gsc_proxy.c
+++ b/drivers/misc/mei/gsc_proxy/mei_gsc_proxy.c
@@ -96,7 +96,8 @@ static const struct component_master_ops mei_component_master_ops = {
*
* The function checks if the device is pci device and
* Intel VGA adapter, the subcomponent is SW Proxy
- * and the parent of MEI PCI and the parent of VGA are the same PCH device.
+ * and the VGA is on the bus 0 reserved for built-in devices
+ * to reject discrete GFX.
*
* @dev: master device
* @subcomponent: subcomponent to match (I915_COMPONENT_SWPROXY)
@@ -123,7 +124,8 @@ static int mei_gsc_proxy_component_match(struct device *dev, int subcomponent,
if (subcomponent != I915_COMPONENT_GSC_PROXY)
return 0;
- return component_compare_dev(dev->parent, ((struct device *)data)->parent);
+ /* Only built-in GFX */
+ return (pdev->bus->number == 0);
}
static int mei_gsc_proxy_probe(struct mei_cl_device *cldev,
@@ -146,7 +148,7 @@ static int mei_gsc_proxy_probe(struct mei_cl_device *cldev,
}
component_match_add_typed(&cldev->dev, &master_match,
- mei_gsc_proxy_component_match, cldev->dev.parent);
+ mei_gsc_proxy_component_match, NULL);
if (IS_ERR_OR_NULL(master_match)) {
ret = -ENOMEM;
goto err_exit;
--
2.43.0
The patch titled
Subject: mm: cachestat: fix folio read-after-free in cache walk
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-cachestat-fix-folio-read-after-free-in-cache-walk.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Nhat Pham <nphamcs(a)gmail.com>
Subject: mm: cachestat: fix folio read-after-free in cache walk
Date: Mon, 19 Feb 2024 19:01:21 -0800
In cachestat, we access the folio from the page cache's xarray to compute
its page offset, and check for its dirty and writeback flags. However, we
do not hold a reference to the folio before performing these actions,
which means the folio can concurrently be released and reused as another
folio/page/slab.
Get around this altogether by just using xarray's existing machinery for
the folio page offsets and dirty/writeback states.
This changes behavior for tmpfs files to now always report zeroes in their
dirty and writeback counters. This is okay as tmpfs doesn't follow
conventional writeback cache behavior: its pages get "cleaned" during
swapout, after which they're no longer resident etc.
Link: https://lkml.kernel.org/r/20240220153409.GA216065@cmpxchg.org
Fixes: cf264e1329fb ("cachestat: implement cachestat syscall")
Reported-by: Jann Horn <jannh(a)google.com>
Suggested-by: Matthew Wilcox <willy(a)infradead.org>
Signed-off-by: Nhat Pham <nphamcs(a)gmail.com>
Signed-off-by: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: <stable(a)vger.kernel.org> [6.4+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/filemap.c | 51 ++++++++++++++++++++++++-------------------------
1 file changed, 26 insertions(+), 25 deletions(-)
--- a/mm/filemap.c~mm-cachestat-fix-folio-read-after-free-in-cache-walk
+++ a/mm/filemap.c
@@ -4111,28 +4111,40 @@ static void filemap_cachestat(struct add
rcu_read_lock();
xas_for_each(&xas, folio, last_index) {
+ int order;
unsigned long nr_pages;
pgoff_t folio_first_index, folio_last_index;
+ /*
+ * Don't deref the folio. It is not pinned, and might
+ * get freed (and reused) underneath us.
+ *
+ * We *could* pin it, but that would be expensive for
+ * what should be a fast and lightweight syscall.
+ *
+ * Instead, derive all information of interest from
+ * the rcu-protected xarray.
+ */
+
if (xas_retry(&xas, folio))
continue;
+ order = xa_get_order(xas.xa, xas.xa_index);
+ nr_pages = 1 << order;
+ folio_first_index = round_down(xas.xa_index, 1 << order);
+ folio_last_index = folio_first_index + nr_pages - 1;
+
+ /* Folios might straddle the range boundaries, only count covered pages */
+ if (folio_first_index < first_index)
+ nr_pages -= first_index - folio_first_index;
+
+ if (folio_last_index > last_index)
+ nr_pages -= folio_last_index - last_index;
+
if (xa_is_value(folio)) {
/* page is evicted */
void *shadow = (void *)folio;
bool workingset; /* not used */
- int order = xa_get_order(xas.xa, xas.xa_index);
-
- nr_pages = 1 << order;
- folio_first_index = round_down(xas.xa_index, 1 << order);
- folio_last_index = folio_first_index + nr_pages - 1;
-
- /* Folios might straddle the range boundaries, only count covered pages */
- if (folio_first_index < first_index)
- nr_pages -= first_index - folio_first_index;
-
- if (folio_last_index > last_index)
- nr_pages -= folio_last_index - last_index;
cs->nr_evicted += nr_pages;
@@ -4150,24 +4162,13 @@ static void filemap_cachestat(struct add
goto resched;
}
- nr_pages = folio_nr_pages(folio);
- folio_first_index = folio_pgoff(folio);
- folio_last_index = folio_first_index + nr_pages - 1;
-
- /* Folios might straddle the range boundaries, only count covered pages */
- if (folio_first_index < first_index)
- nr_pages -= first_index - folio_first_index;
-
- if (folio_last_index > last_index)
- nr_pages -= folio_last_index - last_index;
-
/* page is in cache */
cs->nr_cache += nr_pages;
- if (folio_test_dirty(folio))
+ if (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY))
cs->nr_dirty += nr_pages;
- if (folio_test_writeback(folio))
+ if (xas_get_mark(&xas, PAGECACHE_TAG_WRITEBACK))
cs->nr_writeback += nr_pages;
resched:
_
Patches currently in -mm which might be from nphamcs(a)gmail.com are
mm-swap_state-update-zswap-lrus-protection-range-with-the-folio-locked.patch
mm-swap_state-update-zswap-lrus-protection-range-with-the-folio-locked-v2.patch
mm-swap_state-update-zswap-lrus-protection-range-with-the-folio-locked-fix.patch
mm-cachestat-fix-folio-read-after-free-in-cache-walk.patch
selftests-zswap-add-zswap-selftest-file-to-zswap-maintainer-entry.patch
selftests-fix-the-zswap-invasive-shrink-test.patch
selftests-add-zswapin-and-no-zswap-tests.patch
The patch titled
Subject: mm/vmscan: fix a bug calling wakeup_kswapd() with a wrong zone index
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-vmscan-fix-a-bug-calling-wakeup_kswapd-with-a-wrong-zone-index.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Byungchul Park <byungchul(a)sk.com>
Subject: mm/vmscan: fix a bug calling wakeup_kswapd() with a wrong zone index
Date: Fri, 16 Feb 2024 20:15:02 +0900
With numa balancing on, when a numa system is running where a numa node
doesn't have its local memory so it has no managed zones, the following
oops has been observed. It's because wakeup_kswapd() is called with a
wrong zone index, -1. Fixed it by checking the index before calling
wakeup_kswapd().
> BUG: unable to handle page fault for address: 00000000000033f3
> #PF: supervisor read access in kernel mode
> #PF: error_code(0x0000) - not-present page
> PGD 0 P4D 0
> Oops: 0000 [#1] PREEMPT SMP NOPTI
> CPU: 2 PID: 895 Comm: masim Not tainted 6.6.0-dirty #255
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
> rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
> RIP: 0010:wakeup_kswapd (./linux/mm/vmscan.c:7812)
> Code: (omitted)
> RSP: 0000:ffffc90004257d58 EFLAGS: 00010286
> RAX: ffffffffffffffff RBX: ffff88883fff0480 RCX: 0000000000000003
> RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff88883fff0480
> RBP: ffffffffffffffff R08: ff0003ffffffffff R09: ffffffffffffffff
> R10: ffff888106c95540 R11: 0000000055555554 R12: 0000000000000003
> R13: 0000000000000000 R14: 0000000000000000 R15: ffff88883fff0940
> FS: 00007fc4b8124740(0000) GS:ffff888827c00000(0000) knlGS:0000000000000000
> CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 00000000000033f3 CR3: 000000026cc08004 CR4: 0000000000770ee0
> DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> PKRU: 55555554
> Call Trace:
> <TASK>
> ? __die
> ? page_fault_oops
> ? __pte_offset_map_lock
> ? exc_page_fault
> ? asm_exc_page_fault
> ? wakeup_kswapd
> migrate_misplaced_page
> __handle_mm_fault
> handle_mm_fault
> do_user_addr_fault
> exc_page_fault
> asm_exc_page_fault
> RIP: 0033:0x55b897ba0808
> Code: (omitted)
> RSP: 002b:00007ffeefa821a0 EFLAGS: 00010287
> RAX: 000055b89983acd0 RBX: 00007ffeefa823f8 RCX: 000055b89983acd0
> RDX: 00007fc2f8122010 RSI: 0000000000020000 RDI: 000055b89983acd0
> RBP: 00007ffeefa821a0 R08: 0000000000000037 R09: 0000000000000075
> R10: 0000000000000000 R11: 0000000000000202 R12: 0000000000000000
> R13: 00007ffeefa82410 R14: 000055b897ba5dd8 R15: 00007fc4b8340000
> </TASK>
Link: https://lkml.kernel.org/r/20240216111502.79759-1-byungchul@sk.com
Signed-off-by: Byungchul Park <byungchul(a)sk.com>
Reported-by: Hyeongtak Ji <hyeongtak.ji(a)sk.com>
Fixes: c574bbe917036 ("NUMA balancing: optimize page placement for memory tiering system")
Cc: Baolin Wang <baolin.wang(a)linux.alibaba.com>
Cc: "Huang, Ying" <ying.huang(a)intel.com>
Cc: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Oscar Salvador <osalvador(a)suse.de>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/migrate.c | 8 ++++++++
1 file changed, 8 insertions(+)
--- a/mm/migrate.c~mm-vmscan-fix-a-bug-calling-wakeup_kswapd-with-a-wrong-zone-index
+++ a/mm/migrate.c
@@ -2519,6 +2519,14 @@ static int numamigrate_isolate_folio(pg_
if (managed_zone(pgdat->node_zones + z))
break;
}
+
+ /*
+ * If there are no managed zones, it should not proceed
+ * further.
+ */
+ if (z < 0)
+ return 0;
+
wakeup_kswapd(pgdat->node_zones + z, 0,
folio_order(folio), ZONE_MOVABLE);
return 0;
_
Patches currently in -mm which might be from byungchul(a)sk.com are
mm-vmscan-fix-a-bug-calling-wakeup_kswapd-with-a-wrong-zone-index.patch
mm-vmscan-dont-turn-on-cache_trim_mode-at-the-highest-scan-priority.patch
sched-numa-mm-do-not-try-to-migrate-memory-to-memoryless-nodes.patch
This series includes 4 types of fixes:
Patches 1 and 2 force the path-managers not to allocate a new address
entry when dealing with the "special" ID 0, reserved to the address of
the initial subflow. These patches can be backported up to v5.19 and
v5.12 respectively.
Patch 3 to 6 fix the in-kernel path-manager not to create duplicated
subflows. Patch 6 is the main fix, but patches 3 to 5 are some kind of
pre-requisities: they fix some data races that could also lead to the
creation of unexpected subflows. These patches can be backported up to
v5.7, v5.10, v6.0, and v5.15 respectively.
Note that patch 3 modifies the existing ULP API. No better solutions
have been found for -net, and there is some similar prior art, see
commit 0df48c26d841 ("tcp: add tcpi_bytes_acked to tcp_info"). Please
also note that TLS ULP Diag has likely the same issue.
Patches 7 to 9 fix issues in the selftests, when executing them on older
kernels, e.g. when testing the last version of these kselftests on the
v5.15.148 kernel as it is done by LKFT when validating stable kernels.
These patches only avoid printing expected errors the console and
marking some tests as "OK" while they have been skipped. Patches 7 and 8
can be backported up to v6.6.
Patches 10 to 13 make sure all MPTCP selftests subtests have a unique
name. It is important to have a unique (sub)test name in TAP, because
that's the test identifier. Some CI environments might drop tests with
duplicated names. Patches 10 to 12 can be backported up to v6.6.
Signed-off-by: Matthieu Baerts (NGI0) <matttbe(a)kernel.org>
---
Geliang Tang (2):
mptcp: add needs_id for userspace appending addr
mptcp: add needs_id for netlink appending addr
Matthieu Baerts (NGI0) (7):
selftests: mptcp: pm nl: also list skipped tests
selftests: mptcp: pm nl: avoid error msg on older kernels
selftests: mptcp: diag: fix bash warnings on older kernels
selftests: mptcp: simult flows: fix some subtest names
selftests: mptcp: userspace_pm: unique subtest names
selftests: mptcp: diag: unique 'in use' subtest names
selftests: mptcp: diag: unique 'cestab' subtest names
Paolo Abeni (4):
mptcp: fix lockless access in subflow ULP diag
mptcp: fix data races on local_id
mptcp: fix data races on remote_id
mptcp: fix duplicate subflow creation
include/net/tcp.h | 2 +-
net/mptcp/diag.c | 8 ++-
net/mptcp/pm_netlink.c | 69 ++++++++++++++---------
net/mptcp/pm_userspace.c | 15 ++---
net/mptcp/protocol.c | 2 +-
net/mptcp/protocol.h | 15 ++++-
net/mptcp/subflow.c | 15 ++---
net/tls/tls_main.c | 2 +-
tools/testing/selftests/net/mptcp/diag.sh | 41 ++++++++------
tools/testing/selftests/net/mptcp/pm_netlink.sh | 8 ++-
tools/testing/selftests/net/mptcp/simult_flows.sh | 3 +-
tools/testing/selftests/net/mptcp/userspace_pm.sh | 4 +-
12 files changed, 116 insertions(+), 68 deletions(-)
---
base-commit: c40c0d3a768c78a023a72fb2ceea00743e3a695d
change-id: 20240215-upstream-net-20240215-misc-fixes-03815ec14dc6
Best regards,
--
Matthieu Baerts (NGI0) <matttbe(a)kernel.org>
Hi Greg, few stable updates for you -
Cheers,
Kent
The following changes since commit 0dd3ee31125508cd67f7e7172247f05b7fd1753a:
Linux 6.7 (2024-01-07 12:18:38 -0800)
are available in the Git repository at:
https://evilpiepirate.org/git/bcachefs.git tags/bcachefs-for-v6.7-stable-20240208
for you to fetch changes up to f1582f4774ac7c30c5460a8c7a6e5a82b9ce5a6a:
bcachefs: time_stats: Check for last_event == 0 when updating freq stats (2024-02-08 15:33:11 -0500)
----------------------------------------------------------------
bcachefs updates for v6.7 stable:
locking fixes in subvolume create, destroy paths - Al, Su Yue, Guoyu Ou
fix race in thread_with_file - Mathias Krause
small rebalance fixes - Daniel, myself
workaround for building with old clang (can't take a pointer to memcmp)
build fix on parisc
minor time_stats fix
----------------------------------------------------------------
Al Viro (2):
new helper: user_path_locked_at()
bch2_ioctl_subvolume_destroy(): fix locking
Christoph Hellwig (1):
bcachefs: fix incorrect usage of REQ_OP_FLUSH
Daniel Hill (1):
bcachefs: rebalance should wakeup on shutdown if disabled
Guoyu Ou (1):
bcachefs: unlock parent dir if entry is not found in subvolume deletion
Helge Deller (1):
bcachefs: Fix build on parisc by avoiding __multi3()
Kent Overstreet (4):
bcachefs: Don't pass memcmp() as a pointer
bcachefs: Add missing bch2_moving_ctxt_flush_all()
bcachefs: bch2_kthread_io_clock_wait() no longer sleeps until full amount
bcachefs: time_stats: Check for last_event == 0 when updating freq stats
Mathias Krause (1):
bcachefs: install fd later to avoid race with close
Su Yue (2):
bcachefs: kvfree bch_fs::snapshots in bch2_fs_snapshots_exit
bcachefs: grab s_umount only if snapshotting
fs/bcachefs/chardev.c | 3 +--
fs/bcachefs/clock.c | 4 ++--
fs/bcachefs/fs-io.c | 2 +-
fs/bcachefs/fs-ioctl.c | 42 +++++++++++++++++++++--------------------
fs/bcachefs/journal_io.c | 3 ++-
fs/bcachefs/mean_and_variance.h | 2 +-
fs/bcachefs/move.c | 2 +-
fs/bcachefs/move.h | 1 +
fs/bcachefs/rebalance.c | 13 +++++++++++--
fs/bcachefs/replicas.c | 10 ++++++++--
fs/bcachefs/snapshot.c | 2 +-
fs/bcachefs/util.c | 5 +++--
fs/namei.c | 16 +++++++++++++---
include/linux/namei.h | 1 +
14 files changed, 68 insertions(+), 38 deletions(-)
Dear ,
Please find the attached copy of our contract for your reference.
Confirm the details, sign and return as soon as possible. The shipment cost remains the same.
See No 4 highlighted in RED confirm if we can increase as before.
Let us know if you need further clarification.
Best Regards,
Connor Gilchrist
Sales Shadow's Ridge inc.
The Shipyard, Bath Road, Lymington,
Hampshire, SO41 3YL, England
Ph. +44 (0)1590 647406.
From: Andrzej Kacprowski <Andrzej.Kacprowski(a)intel.com>
There is no point in requesting 1 tile on VPU40xx as the FW will
probably need more tiles to run workloads, so it will have to
reconfigure PLL anyway. Don't enable any tiles and allow the FW to
perform initial tile configuration.
This improves NPU boot stability as the tiles are always enabled only
by the FW from the same initial state.
Fixes: 79cdc56c4a54 ("accel/ivpu: Add initial support for VPU 4")
Signed-off-by: Andrzej Kacprowski <Andrzej.Kacprowski(a)intel.com>
Signed-off-by: Jacek Lawrynowicz <jacek.lawrynowicz(a)linux.intel.com>
---
drivers/accel/ivpu/ivpu_hw_40xx.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/accel/ivpu/ivpu_hw_40xx.c b/drivers/accel/ivpu/ivpu_hw_40xx.c
index 1c995307c113..a1523d0b1ef3 100644
--- a/drivers/accel/ivpu/ivpu_hw_40xx.c
+++ b/drivers/accel/ivpu/ivpu_hw_40xx.c
@@ -24,7 +24,7 @@
#define SKU_HW_ID_SHIFT 16u
#define SKU_HW_ID_MASK 0xffff0000u
-#define PLL_CONFIG_DEFAULT 0x1
+#define PLL_CONFIG_DEFAULT 0x0
#define PLL_CDYN_DEFAULT 0x80
#define PLL_EPP_DEFAULT 0x80
#define PLL_REF_CLK_FREQ (50 * 1000000)
--
2.43.0
From: Kairui Song <kasong(a)tencent.com>
When skipping swapcache for SWP_SYNCHRONOUS_IO, if two or more threads
swapin the same entry at the same time, they get different pages (A, B).
Before one thread (T0) finishes the swapin and installs page (A)
to the PTE, another thread (T1) could finish swapin of page (B),
swap_free the entry, then swap out the possibly modified page
reusing the same entry. It breaks the pte_same check in (T0) because
PTE value is unchanged, causing ABA problem. Thread (T0) will
install a stalled page (A) into the PTE and cause data corruption.
One possible callstack is like this:
CPU0 CPU1
---- ----
do_swap_page() do_swap_page() with same entry
<direct swapin path> <direct swapin path>
<alloc page A> <alloc page B>
swap_read_folio() <- read to page A swap_read_folio() <- read to page B
<slow on later locks or interrupt> <finished swapin first>
... set_pte_at()
swap_free() <- entry is free
<write to page B, now page A stalled>
<swap out page B to same swap entry>
pte_same() <- Check pass, PTE seems
unchanged, but page A
is stalled!
swap_free() <- page B content lost!
set_pte_at() <- staled page A installed!
And besides, for ZRAM, swap_free() allows the swap device to discard
the entry content, so even if page (B) is not modified, if
swap_read_folio() on CPU0 happens later than swap_free() on CPU1,
it may also cause data loss.
To fix this, reuse swapcache_prepare which will pin the swap entry using
the cache flag, and allow only one thread to swap it in, also prevent
any parallel code from putting the entry in the cache. Release the pin
after PT unlocked.
Racers just loop and wait since it's a rare and very short event.
A schedule_timeout_uninterruptible(1) call is added to avoid repeated
page faults wasting too much CPU, causing livelock or adding too much
noise to perf statistics. A similar livelock issue was described in
commit 029c4628b2eb ("mm: swap: get rid of livelock in swapin readahead")
Reproducer:
This race issue can be triggered easily using a well constructed
reproducer and patched brd (with a delay in read path) [1]:
With latest 6.8 mainline, race caused data loss can be observed easily:
$ gcc -g -lpthread test-thread-swap-race.c && ./a.out
Polulating 32MB of memory region...
Keep swapping out...
Starting round 0...
Spawning 65536 workers...
32746 workers spawned, wait for done...
Round 0: Error on 0x5aa00, expected 32746, got 32743, 3 data loss!
Round 0: Error on 0x395200, expected 32746, got 32743, 3 data loss!
Round 0: Error on 0x3fd000, expected 32746, got 32737, 9 data loss!
Round 0 Failed, 15 data loss!
This reproducer spawns multiple threads sharing the same memory region
using a small swap device. Every two threads updates mapped pages one by
one in opposite direction trying to create a race, with one dedicated
thread keep swapping out the data out using madvise.
The reproducer created a reproduce rate of about once every 5 minutes,
so the race should be totally possible in production.
After this patch, I ran the reproducer for over a few hundred rounds
and no data loss observed.
Performance overhead is minimal, microbenchmark swapin 10G from 32G
zram:
Before: 10934698 us
After: 11157121 us
Cached: 13155355 us (Dropping SWP_SYNCHRONOUS_IO flag)
Fixes: 0bcac06f27d7 ("mm, swap: skip swapcache for swapin of synchronous device")
Link: https://github.com/ryncsn/emm-test-project/tree/master/swap-stress-race [1]
Reported-by: "Huang, Ying" <ying.huang(a)intel.com>
Closes: https://lore.kernel.org/lkml/87bk92gqpx.fsf_-_@yhuang6-desk2.ccr.corp.intel…
Signed-off-by: Kairui Song <kasong(a)tencent.com>
Cc: stable(a)vger.kernel.org
---
V3: https://lore.kernel.org/all/20240216095105.14502-1-ryncsn@gmail.com/
Update from V3:
- Use schedule_timeout_uninterruptible(1) for now instead of schedule() to
prevent the busy faulting task holds CPU and livelocks [Huang, Ying]
V2: https://lore.kernel.org/all/20240206182559.32264-1-ryncsn@gmail.com/
Update from V2:
- Add a schedule() if raced to prevent repeated page faults wasting CPU
and add noise to perf statistics.
- Use a bool to state the special case instead of reusing existing
variables fixing error handling [Minchan Kim].
V1: https://lore.kernel.org/all/20240205110959.4021-1-ryncsn@gmail.com/
Update from V1:
- Add some words on ZRAM case, it will discard swap content on swap_free
so the race window is a bit different but cure is the same. [Barry Song]
- Update comments make it cleaner [Huang, Ying]
- Add a function place holder to fix CONFIG_SWAP=n built [SeongJae Park]
- Update the commit message and summary, refer to SWP_SYNCHRONOUS_IO
instead of "direct swapin path" [Yu Zhao]
- Update commit message.
- Collect Review and Acks.
include/linux/swap.h | 5 +++++
mm/memory.c | 20 ++++++++++++++++++++
mm/swap.h | 5 +++++
mm/swapfile.c | 13 +++++++++++++
4 files changed, 43 insertions(+)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 4db00ddad261..8d28f6091a32 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -549,6 +549,11 @@ static inline int swap_duplicate(swp_entry_t swp)
return 0;
}
+static inline int swapcache_prepare(swp_entry_t swp)
+{
+ return 0;
+}
+
static inline void swap_free(swp_entry_t swp)
{
}
diff --git a/mm/memory.c b/mm/memory.c
index 7e1f4849463a..a99f5e7be9a5 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3799,6 +3799,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
struct page *page;
struct swap_info_struct *si = NULL;
rmap_t rmap_flags = RMAP_NONE;
+ bool need_clear_cache = false;
bool exclusive = false;
swp_entry_t entry;
pte_t pte;
@@ -3867,6 +3868,20 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
if (!folio) {
if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
__swap_count(entry) == 1) {
+ /*
+ * Prevent parallel swapin from proceeding with
+ * the cache flag. Otherwise, another thread may
+ * finish swapin first, free the entry, and swapout
+ * reusing the same entry. It's undetectable as
+ * pte_same() returns true due to entry reuse.
+ */
+ if (swapcache_prepare(entry)) {
+ /* Relax a bit to prevent rapid repeated page faults */
+ schedule_timeout_uninterruptible(1);
+ goto out;
+ }
+ need_clear_cache = true;
+
/* skip swapcache */
folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0,
vma, vmf->address, false);
@@ -4117,6 +4132,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
if (vmf->pte)
pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
+ /* Clear the swap cache pin for direct swapin after PTL unlock */
+ if (need_clear_cache)
+ swapcache_clear(si, entry);
if (si)
put_swap_device(si);
return ret;
@@ -4131,6 +4149,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
folio_unlock(swapcache);
folio_put(swapcache);
}
+ if (need_clear_cache)
+ swapcache_clear(si, entry);
if (si)
put_swap_device(si);
return ret;
diff --git a/mm/swap.h b/mm/swap.h
index 758c46ca671e..fc2f6ade7f80 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -41,6 +41,7 @@ void __delete_from_swap_cache(struct folio *folio,
void delete_from_swap_cache(struct folio *folio);
void clear_shadow_from_swap_cache(int type, unsigned long begin,
unsigned long end);
+void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry);
struct folio *swap_cache_get_folio(swp_entry_t entry,
struct vm_area_struct *vma, unsigned long addr);
struct folio *filemap_get_incore_folio(struct address_space *mapping,
@@ -97,6 +98,10 @@ static inline int swap_writepage(struct page *p, struct writeback_control *wbc)
return 0;
}
+static inline void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry)
+{
+}
+
static inline struct folio *swap_cache_get_folio(swp_entry_t entry,
struct vm_area_struct *vma, unsigned long addr)
{
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 556ff7347d5f..746aa9da5302 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -3365,6 +3365,19 @@ int swapcache_prepare(swp_entry_t entry)
return __swap_duplicate(entry, SWAP_HAS_CACHE);
}
+void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry)
+{
+ struct swap_cluster_info *ci;
+ unsigned long offset = swp_offset(entry);
+ unsigned char usage;
+
+ ci = lock_cluster_or_swap_info(si, offset);
+ usage = __swap_entry_free_locked(si, offset, SWAP_HAS_CACHE);
+ unlock_cluster_or_swap_info(si, ci);
+ if (!usage)
+ free_swap_slot(entry);
+}
+
struct swap_info_struct *swp_swap_info(swp_entry_t entry)
{
return swap_type_to_swap_info(swp_type(entry));
--
2.43.0
Currently ACPI_MEMORY_NVS is omitted from the linear map, which causes
a trouble with the following firmware memory region setup:
[..] efi: 0x0000dfd62000-0x0000dfd83fff [ACPI Reclaim|...]
[..] efi: 0x0000dfd84000-0x0000dfd87fff [ACPI Mem NVS|...]
, on ARM64 with 64k page size, the whole 0x0000dfd80000-0x0000dfd8ffff
range will be omitted from the the linear map due to 64k round-up. And
a page fault happens when trying to access the ACPI_RECLAIM_MEMORY:
[...] Unable to handle kernel paging request at virtual address ffff0000dfd80000
To fix this, add ACPI_MEMORY_NVS into the linear map.
Signed-off-by: Boqun Feng <boqun.feng(a)gmail.com>
Cc: stable(a)vger.kernel.org # 5.15+
---
We hit this in an ARM64 Hyper-V VM when using 64k page size, although
this issue may also be fixed if the efi memory regions are all 64k
aligned, but I don't find this memory region setup is invalid per UEFI
spec, also I don't find that spec disallows ACPI_MEMORY_NVS to be mapped
in the OS linear map, but if there is any better way or I'm reading the
spec incorrectly, please let me know.
It's Cced stable since 5.15 because that's when Hyper-V ARM64 support is
added, and Hyper-V is the only one that hits the problem so far.
drivers/firmware/efi/efi-init.c | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/drivers/firmware/efi/efi-init.c b/drivers/firmware/efi/efi-init.c
index a00e07b853f2..9a1b9bc66d50 100644
--- a/drivers/firmware/efi/efi-init.c
+++ b/drivers/firmware/efi/efi-init.c
@@ -139,6 +139,7 @@ static __init int is_usable_memory(efi_memory_desc_t *md)
case EFI_LOADER_CODE:
case EFI_LOADER_DATA:
case EFI_ACPI_RECLAIM_MEMORY:
+ case EFI_ACPI_MEMORY_NVS:
case EFI_BOOT_SERVICES_CODE:
case EFI_BOOT_SERVICES_DATA:
case EFI_CONVENTIONAL_MEMORY:
@@ -202,8 +203,12 @@ static __init void reserve_regions(void)
if (!is_usable_memory(md))
memblock_mark_nomap(paddr, size);
- /* keep ACPI reclaim memory intact for kexec etc. */
- if (md->type == EFI_ACPI_RECLAIM_MEMORY)
+ /*
+ * keep ACPI reclaim and NVS memory and intact for kexec
+ * etc.
+ */
+ if (md->type == EFI_ACPI_RECLAIM_MEMORY ||
+ md->type == EFI_ACPI_MEMORY_NVS)
memblock_reserve(paddr, size);
}
}
--
2.43.0
From: Mike Marciniszyn <mike.marciniszyn(a)intel.com>
[ Upstream commit 0a5ec366de7e94192669ba08de6ed336607fd282 ]
The SQ is shared for between kernel and used by storing the kernel page
pointer and passing that to a kmap_atomic().
This then requires that the alignment is PAGE_SIZE aligned.
Fix by adding an iWarp specific alignment check.
The patch needed to be reworked because the separate routines
present upstream are not there in older irdma drivers.
Fixes: e965ef0e7b2c ("RDMA/irdma: Split QP handler into irdma_reg_user_mr_type_qp")
Link: https://lore.kernel.org/r/20231129202143.1434-3-shiraz.saleem@intel.com
Signed-off-by: Mike Marciniszyn <mike.marciniszyn(a)intel.com>
Signed-off-by: Shiraz Saleem <shiraz.saleem(a)intel.com>
Signed-off-by: Jason Gunthorpe <jgg(a)nvidia.com>
---
drivers/infiniband/hw/irdma/verbs.c | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c
index 447e1bcc82a3..3c437c8070b6 100644
--- a/drivers/infiniband/hw/irdma/verbs.c
+++ b/drivers/infiniband/hw/irdma/verbs.c
@@ -2845,6 +2845,13 @@ static struct ib_mr *irdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 len,
switch (req.reg_type) {
case IRDMA_MEMREG_TYPE_QP:
+ /* iWarp: Catch page not starting on OS page boundary */
+ if (!rdma_protocol_roce(&iwdev->ibdev, 1) &&
+ ib_umem_offset(iwmr->region)) {
+ err = -EINVAL;
+ goto error;
+ }
+
total = req.sq_pages + req.rq_pages + shadow_pgcnt;
if (total > iwmr->page_cnt) {
err = -EINVAL;
--
1.8.3.1
This reverts commit 3f225f29c69c13ce1cbdb1d607a42efeef080056.
The shadow call stack for irq now is stored in current task's thread info
in irq_stack_entry. There is a possibility that we have some soft irqs
pending at the end of hard irq, and when we process softirq with the irq
enabled, irq_stack_entry will enter again and overwrite the shadow call
stack whitch stored in current task's thread info, leading to the
incorrect shadow call stack restoration for the first entry of the hard
IRQ, then the system end up with a panic.
task A | task A
-------------------------------------+------------------------------------
el1_irq //irq1 enter |
irq_handler //save scs_sp1 |
gic_handle_irq |
irq_exit |
__do_softirq |
| el1_irq //irq2 enter
| irq_handler //save scs_sp2
| //overwrite scs_sp1
| ...
| irq_stack_exit //restore scs_sp2
irq_stack_exit //restore wrong |
//scs_sp2 |
So revert this commit to fix it.
Fixes: 3f225f29c69c ("arm64: Stash shadow stack pointer in the task struct on interrupt")
Signed-off-by: Xiang Yang <xiangyang3(a)huawei.com>
---
arch/arm64/kernel/entry.S | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index a94acea770c7..020a455824be 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -431,7 +431,9 @@ SYM_CODE_END(__swpan_exit_el0)
.macro irq_stack_entry
mov x19, sp // preserve the original sp
- scs_save tsk // preserve the original shadow stack
+#ifdef CONFIG_SHADOW_CALL_STACK
+ mov x24, scs_sp // preserve the original shadow stack
+#endif
/*
* Compare sp with the base of the task stack.
@@ -465,7 +467,9 @@ SYM_CODE_END(__swpan_exit_el0)
*/
.macro irq_stack_exit
mov sp, x19
- scs_load_current
+#ifdef CONFIG_SHADOW_CALL_STACK
+ mov scs_sp, x24
+#endif
.endm
/* GPRs used by entry code */
--
2.34.1
Changes since v2:
- added signed-off
Changes since v1:
- added upstream commit id to the commit message
This suggests a fix from 6.3 for stable that fixes a nasty bug in the
timing behavior of periodic RT tasks w.r.t timerslack_ns. While the
documentation clearly states that the slack time is ignored for RT tasks,
this is not the case for the hrtimer code. This patch fixes the issue and
applies to all stable kernels.
Best regards,
Felix Moessbauer
Siemens AG
Davidlohr Bueso (1):
hrtimer: Ignore slack time for RT tasks in schedule_hrtimeout_range()
kernel/time/hrtimer.c | 14 +++++++++++---
1 file changed, 11 insertions(+), 3 deletions(-)
--
2.39.2
(cc stakeholders from various distros - apologies if I missed anyone)
Please consider the patches below for backporting to the linux-6.6.y
stable tree.
These are prerequisites for building a signed x86 efistub kernel image
that complies with the tightened UEFI boot requirements imposed by
MicroSoft, and this is the condition under which it is willing to sign
future Linux secure boot shim builds with its 3rd party CA
certificate. (Such builds must enforce a strict separation between
executable and writable code, among other things)
The patches apply cleanly onto 6.6.17 (-rc2), resulting in a defconfig
build that boots as expected under OVMF/KVM.
5f51c5d0e905 x86/efi: Drop EFI stub .bss from .data section
7e50262229fa x86/efi: Disregard setup header of loaded image
bfab35f552ab x86/efi: Drop alignment flags from PE section headers
768171d7ebbc x86/boot: Remove the 'bugger off' message
8eace5b35556 x86/boot: Omit compression buffer from PE/COFF image
memory footprint
7448e8e5d15a x86/boot: Drop redundant code setting the root device
b618d31f112b x86/boot: Drop references to startup_64
2e765c02dcbf x86/boot: Grab kernel_info offset from zoffset header directly
eac956345f99 x86/boot: Set EFI handover offset directly in header asm
093ab258e3fb x86/boot: Define setup size in linker script
aeb92067f6ae x86/boot: Derive file size from _edata symbol
efa089e63b56 x86/boot: Construct PE/COFF .text section from assembler
fa5750521e0a x86/boot: Drop PE/COFF .reloc section
34951f3c28bd x86/boot: Split off PE/COFF .data section
3e3eabe26dc8 x86/boot: Increase section and file alignment to 4k/512
1ad55cecf22f x86/efistub: Use 1:1 file:memory mapping for PE/COFF
.compat section
arch/x86/boot/Makefile | 2 +-
arch/x86/boot/compressed/vmlinux.lds.S | 6 +-
arch/x86/boot/header.S | 211 ++++++++++--------------
arch/x86/boot/setup.ld | 14 +-
arch/x86/boot/tools/build.c | 273 ++------------------------------
drivers/firmware/efi/libstub/Makefile | 7 -
drivers/firmware/efi/libstub/x86-stub.c | 46 +-----
7 files changed, 112 insertions(+), 447 deletions(-)
The following changes since commit 8b4118fabd6eb75fed19483b04dab3a036886489:
Linux 6.1.78 (2024-02-16 19:06:32 +0100)
are available in the Git repository at:
https://git.kernel.org/pub/scm/linux/kernel/git/cel/linux.git nfsd-6.1.y
for you to fetch changes up to d432d1006b60bd6b5c38974727bdce78f449eeea:
nfsd: don't take fi_lock in nfsd_break_deleg_cb() (2024-02-16 13:58:29 -0500)
----------------------------------------------------------------
NeilBrown (2):
nfsd: fix RELEASE_LOCKOWNER
nfsd: don't take fi_lock in nfsd_break_deleg_cb()
fs/nfsd/nfs4state.c | 37 ++++++++++++++++++++-----------------
1 file changed, 20 insertions(+), 17 deletions(-)
--
Chuck Lever
This is a backport of all the work that lead up to the work that Linus made
on eventfs. I trust Linus's version more so than the versions in 6.6 and
6.7. There may be plenty of hidden issues due to the design.
This is the update for 6.6. It includes Linus's updates as well as all the
patches leading up to them. As the eventfs work went in in two parts, half
went in in 6.6 and the other in 6.7, there were 6 backports that were done
custom to 6.6 as the bugs found in 6.7 were in 6.6 but implemented
differently. This series starts with reverting those 6 backports and then
applying the updated patches to get to Linus's simplification.
I ran these through my full test suite that I use before sending anything to
Linus, although I did not run my "bisect" test that walks through the
patches. The tests were just run on the end result.
This was created with the following command against v6.6.15, after reverting
the 6 patches:
git log --reverse --no-merges --pretty=oneline v6.6..origin/master fs/tracefs/ | cut -d' ' -f1 |
while read a; do if ! git cherry-pick -x $a; then break; fi ; done
Which adds -x to the cherry pick to add the upstream commit SHAs.
There was one patch in tracefs that didn't need to be backported and I removed
that one.
Beau Belgrave (1):
eventfs: Fix events beyond NAME_MAX blocking tasks
Erick Archer (1):
eventfs: Use kcalloc() instead of kzalloc()
Jiapeng Chong (1):
tracefs/eventfs: Modify mismatched function name
Linus Torvalds (7):
tracefs: remove stale 'update_gid' code
eventfs: Initialize the tracefs inode properly
tracefs: Avoid using the ei->dentry pointer unnecessarily
tracefs: dentry lookup crapectomy
eventfs: Remove unused d_parent pointer field
eventfs: Clean up dentry ops and add revalidate function
eventfs: Get rid of dentry pointers without refcounts
Nathan Chancellor (1):
eventfs: Use ERR_CAST() in eventfs_create_events_dir()
Steven Rostedt (Google) (46):
Revert "eventfs: Do not allow NULL parent to eventfs_start_creating()"
Revert "eventfs: Check for NULL ef in eventfs_set_attr()"
Revert "eventfs: Use simple_recursive_removal() to clean up dentries"
Revert "eventfs: Delete eventfs_inode when the last dentry is freed"
Revert "eventfs: Save ownership and mode"
Revert "eventfs: Remove "is_freed" union with rcu head"
eventfs: Remove eventfs_file and just use eventfs_inode
eventfs: Use eventfs_remove_events_dir()
eventfs: Fix failure path in eventfs_create_events_dir()
eventfs: Fix WARN_ON() in create_file_dentry()
eventfs: Fix typo in eventfs_inode union comment
eventfs: Remove extra dget() in eventfs_create_events_dir()
eventfs: Fix kerneldoc of eventfs_remove_rec()
eventfs: Remove "is_freed" union with rcu head
eventfs: Have a free_ei() that just frees the eventfs_inode
eventfs: Test for ei->is_freed when accessing ei->dentry
eventfs: Save ownership and mode
eventfs: Hold eventfs_mutex when calling callback functions
eventfs: Delete eventfs_inode when the last dentry is freed
eventfs: Remove special processing of dput() of events directory
eventfs: Use simple_recursive_removal() to clean up dentries
eventfs: Remove expectation that ei->is_freed means ei->dentry == NULL
eventfs: Do not invalidate dentry in create_file/dir_dentry()
eventfs: Use GFP_NOFS for allocation when eventfs_mutex is held
eventfs: Move taking of inode_lock into dcache_dir_open_wrapper()
eventfs: Do not allow NULL parent to eventfs_start_creating()
eventfs: Make sure that parent->d_inode is locked in creating files/dirs
eventfs: Have event files and directories default to parent uid and gid
eventfs: Fix file and directory uid and gid ownership
tracefs: Check for dentry->d_inode exists in set_gid()
eventfs: Fix bitwise fields for "is_events"
eventfs: Remove "lookup" parameter from create_dir/file_dentry()
eventfs: Stop using dcache_readdir() for getdents()
tracefs/eventfs: Use root and instance inodes as default ownership
eventfs: Have eventfs_iterate() stop immediately if ei->is_freed is set
eventfs: Do ctx->pos update for all iterations in eventfs_iterate()
eventfs: Read ei->entries before ei->children in eventfs_iterate()
eventfs: Shortcut eventfs_iterate() by skipping entries already read
eventfs: Have the inodes all for files and directories all be the same
eventfs: Do not create dentries nor inodes in iterate_shared
eventfs: Save directory inodes in the eventfs_inode structure
tracefs: Zero out the tracefs_inode when allocating it
eventfs: Warn if an eventfs_inode is freed without is_freed being set
eventfs: Restructure eventfs_inode structure to be more condensed
eventfs: Remove fsnotify*() functions from lookup()
eventfs: Keep all directory links at 1
----
fs/tracefs/event_inode.c | 1250 +++++++++++++++++++-----------------------
fs/tracefs/inode.c | 276 +++++-----
fs/tracefs/internal.h | 60 +-
include/linux/trace_events.h | 2 +-
include/linux/tracefs.h | 73 ++-
kernel/trace/trace.c | 7 +-
kernel/trace/trace.h | 4 +-
kernel/trace/trace_events.c | 311 +++++++----
8 files changed, 1029 insertions(+), 954 deletions(-)
Changes since v1:
- added upstream commit id to the commit message
This suggests a fix from 6.3 for stable that fixes a nasty bug in the
timing behavior of periodic RT tasks w.r.t timerslack_ns. While the
documentation clearly states that the slack time is ignored for RT tasks,
this is not the case for the hrtimer code. This patch fixes the issue and
applies to all stable kernels.
Best regards,
Felix Moessbauer
Siemens AG
Davidlohr Bueso (1):
hrtimer: Ignore slack time for RT tasks in schedule_hrtimeout_range()
kernel/time/hrtimer.c | 14 +++++++++++---
1 file changed, 11 insertions(+), 3 deletions(-)
--
2.39.2
l2tp_ip6_sendmsg needs to avoid accounting for the transport header
twice when splicing more data into an already partially-occupied skbuff.
To manage this, we check whether the skbuff contains data using
skb_queue_empty when deciding how much data to append using
ip6_append_data.
However, the code which performed the calculation was incorrect:
ulen = len + skb_queue_empty(&sk->sk_write_queue) ? transhdrlen : 0;
...due to C operator precedence, this ends up setting ulen to
transhdrlen for messages with a non-zero length, which results in
corrupted packets on the wire.
Add parentheses to correct the calculation in line with the original
intent.
Fixes: 9d4c75800f61 ("ipv4, ipv6: Fix handling of transhdrlen in __ip{,6}_append_data()")
Cc: David Howells <dhowells(a)redhat.com>
Cc: stable(a)vger.kernel.org
Signed-off-by: Tom Parkin <tparkin(a)katalix.com>
---
This issue was uncovered by Debian build-testing for the
golang-github-katalix-go-l2tp package[1].
It seems 9d4c75800f61 has been backported to the linux-6.1.y stable
kernel (and possibly others), so I think this fix will also need
backporting.
The bug is currently seen on at least Debian Bookworm, Ubuntu Jammy, and
Debian testing/unstable.
Unfortunately tests using "ip l2tp" and which focus on dataplane
transport will not uncover this bug: it's necessary to send a packet
using an L2TPIP6 socket opened by userspace, and to verify the packet on
the wire. The l2tp-ktest[2] test suite has been extended to cover this.
[1]. https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1063746
[2]. https://github.com/katalix/l2tp-ktest
---
net/l2tp/l2tp_ip6.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index dd3153966173..7bf14cf9ffaa 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -627,7 +627,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
back_from_confirm:
lock_sock(sk);
- ulen = len + skb_queue_empty(&sk->sk_write_queue) ? transhdrlen : 0;
+ ulen = len + (skb_queue_empty(&sk->sk_write_queue) ? transhdrlen : 0);
err = ip6_append_data(sk, ip_generic_getfrag, msg,
ulen, transhdrlen, &ipc6,
&fl6, (struct rt6_info *)dst,
--
2.34.1
Hi,
this series does basically two things:
1. Disables automatic load balancing as adviced by the hardware
workaround.
2. Forces the sharing of the load submitted to CCS among all the
CCS available (as of now only DG2 has more than one CCS). This
way the user, when sending a query, will see only one CCS
available.
Andi
Andi Shyti (2):
drm/i915/gt: Disable HW load balancing for CCS
drm/i915/gt: Set default CCS mode '1'
drivers/gpu/drm/i915/gt/intel_gt.c | 11 +++++++++++
drivers/gpu/drm/i915/gt/intel_gt_regs.h | 3 +++
drivers/gpu/drm/i915/gt/intel_workarounds.c | 6 ++++++
drivers/gpu/drm/i915/i915_drv.h | 17 +++++++++++++++++
drivers/gpu/drm/i915/i915_query.c | 5 +++--
5 files changed, 40 insertions(+), 2 deletions(-)
--
2.43.0
Commit fb24ea52f78e0d595852e ("drivers: Remove explicit invocations of
mmiowb()") remove all mmiowb() in drivers, but it says:
"NOTE: mmiowb() has only ever guaranteed ordering in conjunction with
spin_unlock(). However, pairing each mmiowb() removal in this patch with
the corresponding call to spin_unlock() is not at all trivial, so there
is a small chance that this change may regress any drivers incorrectly
relying on mmiowb() to order MMIO writes between CPUs using lock-free
synchronisation."
The mmio in radeon_ring_commit() is protected by a mutex rather than a
spinlock, but in the mutex fastpath it behaves similar to spinlock and
need a mmiowb() to make sure the wptr is up-to-date for hardware.
Without this, we get such an error when run 'glxgears' on weak ordering
architectures such as LoongArch:
radeon 0000:04:00.0: ring 0 stalled for more than 10324msec
radeon 0000:04:00.0: ring 3 stalled for more than 10240msec
radeon 0000:04:00.0: GPU lockup (current fence id 0x000000000001f412 last fence id 0x000000000001f414 on ring 3)
radeon 0000:04:00.0: GPU lockup (current fence id 0x000000000000f940 last fence id 0x000000000000f941 on ring 0)
radeon 0000:04:00.0: scheduling IB failed (-35).
[drm:radeon_gem_va_ioctl [radeon]] *ERROR* Couldn't update BO_VA (-35)
radeon 0000:04:00.0: scheduling IB failed (-35).
[drm:radeon_gem_va_ioctl [radeon]] *ERROR* Couldn't update BO_VA (-35)
radeon 0000:04:00.0: scheduling IB failed (-35).
[drm:radeon_gem_va_ioctl [radeon]] *ERROR* Couldn't update BO_VA (-35)
radeon 0000:04:00.0: scheduling IB failed (-35).
[drm:radeon_gem_va_ioctl [radeon]] *ERROR* Couldn't update BO_VA (-35)
radeon 0000:04:00.0: scheduling IB failed (-35).
[drm:radeon_gem_va_ioctl [radeon]] *ERROR* Couldn't update BO_VA (-35)
radeon 0000:04:00.0: scheduling IB failed (-35).
[drm:radeon_gem_va_ioctl [radeon]] *ERROR* Couldn't update BO_VA (-35)
radeon 0000:04:00.0: scheduling IB failed (-35).
[drm:radeon_gem_va_ioctl [radeon]] *ERROR* Couldn't update BO_VA (-35)
Cc: stable(a)vger.kernel.org
Signed-off-by: Tianyang Zhang <zhangtianyang(a)loongson.cn>
Signed-off-by: Huacai Chen <chenhuacai(a)loongson.cn>
---
drivers/gpu/drm/radeon/radeon_ring.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/gpu/drm/radeon/radeon_ring.c b/drivers/gpu/drm/radeon/radeon_ring.c
index 38048593bb4a..d461dc85d820 100644
--- a/drivers/gpu/drm/radeon/radeon_ring.c
+++ b/drivers/gpu/drm/radeon/radeon_ring.c
@@ -183,6 +183,7 @@ void radeon_ring_commit(struct radeon_device *rdev, struct radeon_ring *ring,
if (hdp_flush && rdev->asic->mmio_hdp_flush)
rdev->asic->mmio_hdp_flush(rdev);
radeon_ring_set_wptr(rdev, ring);
+ mmiowb(); /* Make sure wptr is up-to-date for hw */
}
/**
--
2.43.0
On 2024/2/20 13:32, Kairui Song wrote:
> On Tue, Feb 20, 2024 at 12:49 PM Chengming Zhou <zhouchengming(a)bytedance.com>
> wrote:
>>
>> On 2024/2/20 06:10, Barry Song wrote:
>>> On Mon, Feb 19, 2024 at 9:21 PM Kairui Song <ryncsn(a)gmail.com> wrote:
>>>>
>>>> From: Kairui Song <kasong(a)tencent.com>
>>>>
>>>> When skipping swapcache for SWP_SYNCHRONOUS_IO, if two or more threads
>>>> swapin the same entry at the same time, they get different pages (A,
> B).
>>>> Before one thread (T0) finishes the swapin and installs page (A)
>>>> to the PTE, another thread (T1) could finish swapin of page (B),
>>>> swap_free the entry, then swap out the possibly modified page
>>>> reusing the same entry. It breaks the pte_same check in (T0) because
>>>> PTE value is unchanged, causing ABA problem. Thread (T0) will
>>>> install a stalled page (A) into the PTE and cause data corruption.
>>>>
>>>> One possible callstack is like this:
>>>>
>>>> CPU0 CPU1
>>>> ---- ----
>>>> do_swap_page() do_swap_page() with same entry
>>>> <direct swapin path> <direct swapin path>
>>>> <alloc page A> <alloc page B>
>>>> swap_read_folio() <- read to page A swap_read_folio() <- read to page
> B
>>>> <slow on later locks or interrupt> <finished swapin first>
>>>> .. set_pte_at()
>>>> swap_free() <- entry is free
>>>> <write to page B, now page A
> stalled>
>>>> <swap out page B to same swap
> entry>
>>>> pte_same() <- Check pass, PTE seems
>>>> unchanged, but page A
>>>> is stalled!
>>>> swap_free() <- page B content lost!
>>>> set_pte_at() <- staled page A installed!
>>>>
>>>> And besides, for ZRAM, swap_free() allows the swap device to discard
>>>> the entry content, so even if page (B) is not modified, if
>>>> swap_read_folio() on CPU0 happens later than swap_free() on CPU1,
>>>> it may also cause data loss.
>>>>
>>>> To fix this, reuse swapcache_prepare which will pin the swap entry
> using
>>>> the cache flag, and allow only one thread to swap it in, also prevent
>>>> any parallel code from putting the entry in the cache. Release the pin
>>>> after PT unlocked.
>>>>
>>>> Racers just loop and wait since it's a rare and very short event.
>>>> A schedule_timeout_uninterruptible(1) call is added to avoid repeated
>>>> page faults wasting too much CPU, causing livelock or adding too much
>>>> noise to perf statistics. A similar livelock issue was described in
>>>> commit 029c4628b2eb ("mm: swap: get rid of livelock in swapin
> readahead")
>>>>
>>>> Reproducer:
>>>>
>>>> This race issue can be triggered easily using a well constructed
>>>> reproducer and patched brd (with a delay in read path) [1]:
>>>>
>>>> With latest 6.8 mainline, race caused data loss can be observed easily:
>>>> $ gcc -g -lpthread test-thread-swap-race.c && ./a.out
>>>> Polulating 32MB of memory region...
>>>> Keep swapping out...
>>>> Starting round 0...
>>>> Spawning 65536 workers...
>>>> 32746 workers spawned, wait for done...
>>>> Round 0: Error on 0x5aa00, expected 32746, got 32743, 3 data loss!
>>>> Round 0: Error on 0x395200, expected 32746, got 32743, 3 data loss!
>>>> Round 0: Error on 0x3fd000, expected 32746, got 32737, 9 data loss!
>>>> Round 0 Failed, 15 data loss!
>>>>
>>>> This reproducer spawns multiple threads sharing the same memory region
>>>> using a small swap device. Every two threads updates mapped pages one
> by
>>>> one in opposite direction trying to create a race, with one dedicated
>>>> thread keep swapping out the data out using madvise.
>>>>
>>>> The reproducer created a reproduce rate of about once every 5 minutes,
>>>> so the race should be totally possible in production.
>>>>
>>>> After this patch, I ran the reproducer for over a few hundred rounds
>>>> and no data loss observed.
>>>>
>>>> Performance overhead is minimal, microbenchmark swapin 10G from 32G
>>>> zram:
>>>>
>>>> Before: 10934698 us
>>>> After: 11157121 us
>>>> Cached: 13155355 us (Dropping SWP_SYNCHRONOUS_IO flag)
>>>>
>>>> Fixes: 0bcac06f27d7 ("mm, swap: skip swapcache for swapin of
> synchronous device")
>>>> Link:
> https://github.com/ryncsn/emm-test-project/tree/master/swap-stress-race [1]
>>>> Reported-by: "Huang, Ying" <ying.huang(a)intel.com>
>>>> Closes:
> https://lore.kernel.org/lkml/87bk92gqpx.fsf_-_@yhuang6-desk2.ccr.corp.intel…
>>>> Signed-off-by: Kairui Song <kasong(a)tencent.com>
>>>> Cc: stable(a)vger.kernel.org
>>>>
>>>> ---
>>>> V3:
> https://lore.kernel.org/all/20240216095105.14502-1-ryncsn@gmail.com/
>>>> Update from V3:
>>>> - Use schedule_timeout_uninterruptible(1) for now instead of
> schedule() to
>>>> prevent the busy faulting task holds CPU and livelocks [Huang, Ying]
>>>>
>>>> V2:
> https://lore.kernel.org/all/20240206182559.32264-1-ryncsn@gmail.com/
>>>> Update from V2:
>>>> - Add a schedule() if raced to prevent repeated page faults wasting CPU
>>>> and add noise to perf statistics.
>>>> - Use a bool to state the special case instead of reusing existing
>>>> variables fixing error handling [Minchan Kim].
>>>>
>>>> V1: https://lore.kernel.org/all/20240205110959.4021-1-ryncsn@gmail.com/
>>>> Update from V1:
>>>> - Add some words on ZRAM case, it will discard swap content on
> swap_free
>>>> so the race window is a bit different but cure is the same. [Barry
> Song]
>>>> - Update comments make it cleaner [Huang, Ying]
>>>> - Add a function place holder to fix CONFIG_SWAP=n built [SeongJae
> Park]
>>>> - Update the commit message and summary, refer to SWP_SYNCHRONOUS_IO
>>>> instead of "direct swapin path" [Yu Zhao]
>>>> - Update commit message.
>>>> - Collect Review and Acks.
>>>>
>>>> include/linux/swap.h | 5 +++++
>>>> mm/memory.c | 20 ++++++++++++++++++++
>>>> mm/swap.h | 5 +++++
>>>> mm/swapfile.c | 13 +++++++++++++
>>>> 4 files changed, 43 insertions(+)
>>>>
>>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>>>> index 4db00ddad261..8d28f6091a32 100644
>>>> --- a/include/linux/swap.h
>>>> +++ b/include/linux/swap.h
>>>> @@ -549,6 +549,11 @@ static inline int swap_duplicate(swp_entry_t swp)
>>>> return 0;
>>>> }
>>>>
>>>> +static inline int swapcache_prepare(swp_entry_t swp)
>>>> +{
>>>> + return 0;
>>>> +}
>>>> +
>>>> static inline void swap_free(swp_entry_t swp)
>>>> {
>>>> }
>>>> diff --git a/mm/memory.c b/mm/memory.c
>>>> index 7e1f4849463a..a99f5e7be9a5 100644
>>>> --- a/mm/memory.c
>>>> +++ b/mm/memory.c
>>>> @@ -3799,6 +3799,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>>> struct page *page;
>>>> struct swap_info_struct *si = NULL;
>>>> rmap_t rmap_flags = RMAP_NONE;
>>>> + bool need_clear_cache = false;
>>>> bool exclusive = false;
>>>> swp_entry_t entry;
>>>> pte_t pte;
>>>> @@ -3867,6 +3868,20 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>>> if (!folio) {
>>>> if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
>>>> __swap_count(entry) == 1) {
>>>> + /*
>>>> + * Prevent parallel swapin from proceeding with
>>>> + * the cache flag. Otherwise, another thread
> may
>>>> + * finish swapin first, free the entry, and
> swapout
>>>> + * reusing the same entry. It's undetectable as
>>>> + * pte_same() returns true due to entry reuse.
>>>> + */
>>>> + if (swapcache_prepare(entry)) {
>>>> + /* Relax a bit to prevent rapid
> repeated page faults */
>>>> + schedule_timeout_uninterruptible(1);
>>>
>>> Not a ideal model, imaging two tasks,
>>>
>>> task A - low priority running on a LITTLE core
>>> task B - high priority and have real-time requirements such as audio,
>>> graphics running on a big core.
>>>
>>> The original code will make B win even if it is a bit later than A as
> its CPU is
>>> much faster to finish swap_read_folio for example from zRAM. task B's
>>> swap-in can finish very soon.
>>>
>>> With the patch, B will wait a tick and its real-time performance will be
>>> negatively affected from time to time once low priority and high
> priority
>>> tasks fault in the same PTE and high priority tasks are a bit later than
>>> low priority tasks. This is a kind of priority inversion.
>>>
>>> When we support large folio swap-in, things can get worse. For example,
>>> to swap-in 16 or even more pages in one do_swap_page, the chance for
>>> task A and task B located in the same range of 16 PTEs will increase
>>> though they are not located in the same PTE.
>>>
>>> Please consider this is not a blocker for this patch. But I will put
> the problem
>>> in my list and run some real tests on Android phones later.
>>
>> Good point. Late for the discussion, I'm wondering why not get an extra
> reference
>> on the swap entry, instead of swapcache_prepare()? Then the faster thread
> will
>> succeed, but can't free the swap entry. Later, the slower thread will
> find the
>> changed pte value and fail, and free the swap entry. Maybe I missed
> something?
>
> Hi, Chengming
>
> That was my initial purpose. Then found a lot of problems with it. Increase
> swap count here, it may race with another swap free and end up increasing
> the swap count of a freed entry.
>
> That can be fixed with audits and new helpers, but there are many other
> potential issues too. One major problem is that after count bump, raced
> swap threads will fallback to cached swap in. Pages in swapcache can be
> swaped out without allocating an entry, making the problem we were trying
> to resolve more serious.
Thanks for your clarification! Right, there are many issues I just ignored...
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 14db5f64a971fce3d8ea35de4dfc7f443a3efb92
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021944-kettle-upturned-4371@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
14db5f64a971 ("zonefs: Improve error handling")
77af13ba3c7f ("zonefs: Do not propagate iomap_dio_rw() ENOTBLK error to user space")
aa7f243f32e1 ("zonefs: Separate zone information from inode information")
34422914dc00 ("zonefs: Reduce struct zonefs_inode_info size")
46a9c526eef7 ("zonefs: Simplify IO error handling")
4008e2a0b01a ("zonefs: Reorganize code")
a608da3bd730 ("zonefs: Detect append writes at invalid locations")
db58653ce0c7 ("zonefs: Fix active zone accounting")
7dd12d65ac64 ("zonefs: fix zone report size in __zonefs_io_error()")
8745889a7fd0 ("Merge tag 'iomap-6.0-merge-2' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 14db5f64a971fce3d8ea35de4dfc7f443a3efb92 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal(a)kernel.org>
Date: Thu, 8 Feb 2024 17:26:59 +0900
Subject: [PATCH] zonefs: Improve error handling
Write error handling is racy and can sometime lead to the error recovery
path wrongly changing the inode size of a sequential zone file to an
incorrect value which results in garbage data being readable at the end
of a file. There are 2 problems:
1) zonefs_file_dio_write() updates a zone file write pointer offset
after issuing a direct IO with iomap_dio_rw(). This update is done
only if the IO succeed for synchronous direct writes. However, for
asynchronous direct writes, the update is done without waiting for
the IO completion so that the next asynchronous IO can be
immediately issued. However, if an asynchronous IO completes with a
failure right before the i_truncate_mutex lock protecting the update,
the update may change the value of the inode write pointer offset
that was corrected by the error path (zonefs_io_error() function).
2) zonefs_io_error() is called when a read or write error occurs. This
function executes a report zone operation using the callback function
zonefs_io_error_cb(), which does all the error recovery handling
based on the current zone condition, write pointer position and
according to the mount options being used. However, depending on the
zoned device being used, a report zone callback may be executed in a
context that is different from the context of __zonefs_io_error(). As
a result, zonefs_io_error_cb() may be executed without the inode
truncate mutex lock held, which can lead to invalid error processing.
Fix both problems as follows:
- Problem 1: Perform the inode write pointer offset update before a
direct write is issued with iomap_dio_rw(). This is safe to do as
partial direct writes are not supported (IOMAP_DIO_PARTIAL is not
set) and any failed IO will trigger the execution of zonefs_io_error()
which will correct the inode write pointer offset to reflect the
current state of the one on the device.
- Problem 2: Change zonefs_io_error_cb() into zonefs_handle_io_error()
and call this function directly from __zonefs_io_error() after
obtaining the zone information using blkdev_report_zones() with a
simple callback function that copies to a local stack variable the
struct blk_zone obtained from the device. This ensures that error
handling is performed holding the inode truncate mutex.
This change also simplifies error handling for conventional zone files
by bypassing the execution of report zones entirely. This is safe to
do because the condition of conventional zones cannot be read-only or
offline and conventional zone files are always fully mapped with a
constant file size.
Reported-by: Shin'ichiro Kawasaki <shinichiro.kawasaki(a)wdc.com>
Fixes: 8dcc1a9d90c1 ("fs: New zonefs file system")
Cc: stable(a)vger.kernel.org
Signed-off-by: Damien Le Moal <dlemoal(a)kernel.org>
Tested-by: Shin'ichiro Kawasaki <shinichiro.kawasaki(a)wdc.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn(a)wdc.com>
Reviewed-by: Himanshu Madhani <himanshu.madhani(a)oracle.com>
diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
index 6ab2318a9c8e..dba5dcb62bef 100644
--- a/fs/zonefs/file.c
+++ b/fs/zonefs/file.c
@@ -348,7 +348,12 @@ static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size,
struct zonefs_inode_info *zi = ZONEFS_I(inode);
if (error) {
- zonefs_io_error(inode, true);
+ /*
+ * For Sync IOs, error recovery is called from
+ * zonefs_file_dio_write().
+ */
+ if (!is_sync_kiocb(iocb))
+ zonefs_io_error(inode, true);
return error;
}
@@ -491,6 +496,14 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
ret = -EINVAL;
goto inode_unlock;
}
+ /*
+ * Advance the zone write pointer offset. This assumes that the
+ * IO will succeed, which is OK to do because we do not allow
+ * partial writes (IOMAP_DIO_PARTIAL is not set) and if the IO
+ * fails, the error path will correct the write pointer offset.
+ */
+ z->z_wpoffset += count;
+ zonefs_inode_account_active(inode);
mutex_unlock(&zi->i_truncate_mutex);
}
@@ -504,20 +517,19 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
if (ret == -ENOTBLK)
ret = -EBUSY;
- if (zonefs_zone_is_seq(z) &&
- (ret > 0 || ret == -EIOCBQUEUED)) {
- if (ret > 0)
- count = ret;
-
- /*
- * Update the zone write pointer offset assuming the write
- * operation succeeded. If it did not, the error recovery path
- * will correct it. Also do active seq file accounting.
- */
- mutex_lock(&zi->i_truncate_mutex);
- z->z_wpoffset += count;
- zonefs_inode_account_active(inode);
- mutex_unlock(&zi->i_truncate_mutex);
+ /*
+ * For a failed IO or partial completion, trigger error recovery
+ * to update the zone write pointer offset to a correct value.
+ * For asynchronous IOs, zonefs_file_write_dio_end_io() may already
+ * have executed error recovery if the IO already completed when we
+ * reach here. However, we cannot know that and execute error recovery
+ * again (that will not change anything).
+ */
+ if (zonefs_zone_is_seq(z)) {
+ if (ret > 0 && ret != count)
+ ret = -EIO;
+ if (ret < 0 && ret != -EIOCBQUEUED)
+ zonefs_io_error(inode, true);
}
inode_unlock:
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 93971742613a..b6e8e7c96251 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -246,16 +246,18 @@ static void zonefs_inode_update_mode(struct inode *inode)
z->z_mode = inode->i_mode;
}
-struct zonefs_ioerr_data {
- struct inode *inode;
- bool write;
-};
-
static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
void *data)
{
- struct zonefs_ioerr_data *err = data;
- struct inode *inode = err->inode;
+ struct blk_zone *z = data;
+
+ *z = *zone;
+ return 0;
+}
+
+static void zonefs_handle_io_error(struct inode *inode, struct blk_zone *zone,
+ bool write)
+{
struct zonefs_zone *z = zonefs_inode_zone(inode);
struct super_block *sb = inode->i_sb;
struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
@@ -270,8 +272,8 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
data_size = zonefs_check_zone_condition(sb, z, zone);
isize = i_size_read(inode);
if (!(z->z_flags & (ZONEFS_ZONE_READONLY | ZONEFS_ZONE_OFFLINE)) &&
- !err->write && isize == data_size)
- return 0;
+ !write && isize == data_size)
+ return;
/*
* At this point, we detected either a bad zone or an inconsistency
@@ -292,7 +294,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
* In all cases, warn about inode size inconsistency and handle the
* IO error according to the zone condition and to the mount options.
*/
- if (zonefs_zone_is_seq(z) && isize != data_size)
+ if (isize != data_size)
zonefs_warn(sb,
"inode %lu: invalid size %lld (should be %lld)\n",
inode->i_ino, isize, data_size);
@@ -352,8 +354,6 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
zonefs_i_size_write(inode, data_size);
z->z_wpoffset = data_size;
zonefs_inode_account_active(inode);
-
- return 0;
}
/*
@@ -367,23 +367,25 @@ void __zonefs_io_error(struct inode *inode, bool write)
{
struct zonefs_zone *z = zonefs_inode_zone(inode);
struct super_block *sb = inode->i_sb;
- struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
unsigned int noio_flag;
- unsigned int nr_zones = 1;
- struct zonefs_ioerr_data err = {
- .inode = inode,
- .write = write,
- };
+ struct blk_zone zone;
int ret;
/*
- * The only files that have more than one zone are conventional zone
- * files with aggregated conventional zones, for which the inode zone
- * size is always larger than the device zone size.
+ * Conventional zone have no write pointer and cannot become read-only
+ * or offline. So simply fake a report for a single or aggregated zone
+ * and let zonefs_handle_io_error() correct the zone inode information
+ * according to the mount options.
*/
- if (z->z_size > bdev_zone_sectors(sb->s_bdev))
- nr_zones = z->z_size >>
- (sbi->s_zone_sectors_shift + SECTOR_SHIFT);
+ if (!zonefs_zone_is_seq(z)) {
+ zone.start = z->z_sector;
+ zone.len = z->z_size >> SECTOR_SHIFT;
+ zone.wp = zone.start + zone.len;
+ zone.type = BLK_ZONE_TYPE_CONVENTIONAL;
+ zone.cond = BLK_ZONE_COND_NOT_WP;
+ zone.capacity = zone.len;
+ goto handle_io_error;
+ }
/*
* Memory allocations in blkdev_report_zones() can trigger a memory
@@ -394,12 +396,20 @@ void __zonefs_io_error(struct inode *inode, bool write)
* the GFP_NOIO context avoids both problems.
*/
noio_flag = memalloc_noio_save();
- ret = blkdev_report_zones(sb->s_bdev, z->z_sector, nr_zones,
- zonefs_io_error_cb, &err);
- if (ret != nr_zones)
+ ret = blkdev_report_zones(sb->s_bdev, z->z_sector, 1,
+ zonefs_io_error_cb, &zone);
+ memalloc_noio_restore(noio_flag);
+
+ if (ret != 1) {
zonefs_err(sb, "Get inode %lu zone information failed %d\n",
inode->i_ino, ret);
- memalloc_noio_restore(noio_flag);
+ zonefs_warn(sb, "remounting filesystem read-only\n");
+ sb->s_flags |= SB_RDONLY;
+ return;
+ }
+
+handle_io_error:
+ zonefs_handle_io_error(inode, &zone, write);
}
static struct kmem_cache *zonefs_inode_cachep;
Hi Jaegeuk Kim, Chao Yu,
In Debian the following regression was reported after a Dhya updated
to 6.1.76:
On Wed, Feb 07, 2024 at 10:43:47PM -0500, Dhya wrote:
> Package: src:linux
> Version: 6.1.76-1
> Severity: critical
> Justification: breaks the whole system
>
> Dear Maintainer,
>
> After upgrade to linux-image-6.1.0-18-amd64 6.1.76-1 F2FS filesystem
> fails to mount rw. Message in the boot journal:
>
> kernel: F2FS-fs (nvme0n1p6): invalid zstd compress level: 6
>
> There was recently an f2fs patch to the 6.1 kernel tree which might be
> related: https://www.spinics.net/lists/stable-commits/msg329957.html
>
> Was able to recover the system by doing:
>
> sudo mount -o remount,rw,relatime,lazytime,background_gc=on,discard,no_heap,user_xattr,inline_xattr,acl,inline_data,inline_dentry,extent_cache,mode=adaptive,active_logs=6,alloc_mode=default,checkpoint_merge,fsync_mode=posix,compress_algorithm=lz4,compress_log_size=2,compress_mode=fs,atgc,discard_unit=block,memory=normal /dev/nvme0n1p6 /
>
> under the running bad 6.1.0-18-amd64 kernel, then editing
> /etc/default/grub:
>
> GRUB_DEFAULT="Advanced options for Debian GNU/Linux>Debian GNU/Linux, with Linux 6.1.0-17-amd64"
>
> and running 'update-grub' and rebooting to boot the 6.1.0-17-amd64
> kernel.
The issue is easily reproducible by:
# dd if=/dev/zero of=test.img count=100 bs=1M
# mkfs.f2fs -f -O compression,extra_attr ./test.img
# mount -t f2fs -o compress_algorithm=zstd:6,compress_chksum,atgc,gc_merge,lazytime ./test.img /mnt
resulting in
[ 60.789982] F2FS-fs (loop0): invalid zstd compress level: 6
A bugzilla report has been submitted in
https://bugzilla.kernel.org/show_bug.cgi?id=218471
#regzbot introduced: v6.1.69..v6.1.76
#regzbot link: https://bugs.debian.org/1063422
#regzbot link: https://bugzilla.kernel.org/show_bug.cgi?id=218471
Regards,
Salvatore
The patch titled
Subject: sched/numa, mm: do not try to migrate memory to memoryless nodes
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
sched-numa-mm-do-not-try-to-migrate-memory-to-memoryless-nodes.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Byungchul Park <byungchul(a)sk.com>
Subject: sched/numa, mm: do not try to migrate memory to memoryless nodes
Date: Mon, 19 Feb 2024 13:10:47 +0900
With numa balancing on, when a numa system is running where a numa node
doesn't have its local memory so it has no managed zones, the following
oops has been observed. It's because wakeup_kswapd() is called with a
wrong zone index, -1. Fixed it by checking the index before calling
wakeup_kswapd().
> BUG: unable to handle page fault for address: 00000000000033f3
> #PF: supervisor read access in kernel mode
> #PF: error_code(0x0000) - not-present page
> PGD 0 P4D 0
> Oops: 0000 [#1] PREEMPT SMP NOPTI
> CPU: 2 PID: 895 Comm: masim Not tainted 6.6.0-dirty #255
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
> rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
> RIP: 0010:wakeup_kswapd (./linux/mm/vmscan.c:7812)
> Code: (omitted)
> RSP: 0000:ffffc90004257d58 EFLAGS: 00010286
> RAX: ffffffffffffffff RBX: ffff88883fff0480 RCX: 0000000000000003
> RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff88883fff0480
> RBP: ffffffffffffffff R08: ff0003ffffffffff R09: ffffffffffffffff
> R10: ffff888106c95540 R11: 0000000055555554 R12: 0000000000000003
> R13: 0000000000000000 R14: 0000000000000000 R15: ffff88883fff0940
> FS: 00007fc4b8124740(0000) GS:ffff888827c00000(0000) knlGS:0000000000000000
> CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 00000000000033f3 CR3: 000000026cc08004 CR4: 0000000000770ee0
> DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> PKRU: 55555554
> Call Trace:
> <TASK>
> ? __die
> ? page_fault_oops
> ? __pte_offset_map_lock
> ? exc_page_fault
> ? asm_exc_page_fault
> ? wakeup_kswapd
> migrate_misplaced_page
> __handle_mm_fault
> handle_mm_fault
> do_user_addr_fault
> exc_page_fault
> asm_exc_page_fault
> RIP: 0033:0x55b897ba0808
> Code: (omitted)
> RSP: 002b:00007ffeefa821a0 EFLAGS: 00010287
> RAX: 000055b89983acd0 RBX: 00007ffeefa823f8 RCX: 000055b89983acd0
> RDX: 00007fc2f8122010 RSI: 0000000000020000 RDI: 000055b89983acd0
> RBP: 00007ffeefa821a0 R08: 0000000000000037 R09: 0000000000000075
> R10: 0000000000000000 R11: 0000000000000202 R12: 0000000000000000
> R13: 00007ffeefa82410 R14: 000055b897ba5dd8 R15: 00007fc4b8340000
> </TASK>
Fix this by avoiding any attempt to migrate memory to memoryless nodes.
Link: https://lkml.kernel.org/r/20240219041920.1183-1-byungchul@sk.com
Link: https://lkml.kernel.org/r/20240216111502.79759-1-byungchul@sk.com
Fixes: c574bbe917036 ("NUMA balancing: optimize page placement for memory tiering system")
Signed-off-by: Byungchul Park <byungchul(a)sk.com>
Reviewed-by: Oscar Salvador <osalvador(a)suse.de>
Reviewed-by: "Huang, Ying" <ying.huang(a)intel.com>
Reviewed-by: Phil Auld <pauld(a)redhat.com>
Cc: Benjamin Segall <bsegall(a)google.com>
Cc: Daniel Bristot de Oliveira <bristot(a)redhat.com>
Cc: Dietmar Eggemann <dietmar.eggemann(a)arm.com>
Cc: Ingo Molnar <mingo(a)redhat.com>
Cc: Juri Lelli <juri.lelli(a)redhat.com>
Cc: Mel Gorman <mgorman(a)suse.de>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Steven Rostedt <rostedt(a)goodmis.org>
Cc: Valentin Schneider <vschneid(a)redhat.com>
Cc: Vincent Guittot <vincent.guittot(a)linaro.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
kernel/sched/fair.c | 6 ++++++
1 file changed, 6 insertions(+)
--- a/kernel/sched/fair.c~sched-numa-mm-do-not-try-to-migrate-memory-to-memoryless-nodes
+++ a/kernel/sched/fair.c
@@ -1831,6 +1831,12 @@ bool should_numa_migrate_memory(struct t
int last_cpupid, this_cpupid;
/*
+ * Cannot migrate to memoryless nodes.
+ */
+ if (!node_state(dst_nid, N_MEMORY))
+ return false;
+
+ /*
* The pages in slow memory node should be migrated according
* to hot/cold instead of private/shared.
*/
_
Patches currently in -mm which might be from byungchul(a)sk.com are
sched-numa-mm-do-not-try-to-migrate-memory-to-memoryless-nodes.patch
mm-vmscan-dont-turn-on-cache_trim_mode-at-the-highest-scan-priority.patch
While mq_perf_tests runs with the default kselftest timeout limit, which
is 45 seconds, the test takes about 60 seconds to complete on i3.metal
AWS instances. Hence, the test always times out. Increase the timeout
to 100 seconds.
Link: https://lore.kernel.org/r/20240208212925.68286-1-sj@kernel.org
Fixes: 852c8cbf34d3 ("selftests/kselftest/runner.sh: Add 45 second timeout per test")
Cc: <stable(a)vger.kernel.org> # 5.4.x
Signed-off-by: SeongJae Park <sj(a)kernel.org>
Reviewed-by: Kees Cook <keescook(a)chromium.org>
---
Changes from v1
(https://lore.kernel.org/r/20240208212925.68286-1-sj@kernel.org)
- Use 180 seconds timeout instead of 100 seconds
tools/testing/selftests/mqueue/setting | 1 +
1 file changed, 1 insertion(+)
create mode 100644 tools/testing/selftests/mqueue/setting
diff --git a/tools/testing/selftests/mqueue/setting b/tools/testing/selftests/mqueue/setting
new file mode 100644
index 000000000000..a953c96aa16e
--- /dev/null
+++ b/tools/testing/selftests/mqueue/setting
@@ -0,0 +1 @@
+timeout=180
--
2.39.2
The patch titled
Subject: mm/damon/lru_sort: fix quota status loss due to online tunings
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-damon-lru_sort-fix-quota-status-loss-due-to-online-tunings.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: SeongJae Park <sj(a)kernel.org>
Subject: mm/damon/lru_sort: fix quota status loss due to online tunings
Date: Fri, 16 Feb 2024 11:40:25 -0800
For online parameters change, DAMON_LRU_SORT creates new schemes based on
latest values of the parameters and replaces the old schemes with the new
one. When creating it, the internal status of the quotas of the old
schemes is not preserved. As a result, charging of the quota starts from
zero after the online tuning. The data that collected to estimate the
throughput of the scheme's action is also reset, and therefore the
estimation should start from the scratch again. Because the throughput
estimation is being used to convert the time quota to the effective size
quota, this could result in temporal time quota inaccuracy. It would be
recovered over time, though. In short, the quota accuracy could be
temporarily degraded after online parameters update.
Fix the problem by checking the case and copying the internal fields for
the status.
Link: https://lkml.kernel.org/r/20240216194025.9207-3-sj@kernel.org
Fixes: 40e983cca927 ("mm/damon: introduce DAMON-based LRU-lists Sorting")
Signed-off-by: SeongJae Park <sj(a)kernel.org>
Cc: <stable(a)vger.kernel.org> [6.0+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/damon/lru_sort.c | 43 +++++++++++++++++++++++++++++++++++-------
1 file changed, 36 insertions(+), 7 deletions(-)
--- a/mm/damon/lru_sort.c~mm-damon-lru_sort-fix-quota-status-loss-due-to-online-tunings
+++ a/mm/damon/lru_sort.c
@@ -185,9 +185,21 @@ static struct damos *damon_lru_sort_new_
return damon_lru_sort_new_scheme(&pattern, DAMOS_LRU_DEPRIO);
}
+static void damon_lru_sort_copy_quota_status(struct damos_quota *dst,
+ struct damos_quota *src)
+{
+ dst->total_charged_sz = src->total_charged_sz;
+ dst->total_charged_ns = src->total_charged_ns;
+ dst->charged_sz = src->charged_sz;
+ dst->charged_from = src->charged_from;
+ dst->charge_target_from = src->charge_target_from;
+ dst->charge_addr_from = src->charge_addr_from;
+}
+
static int damon_lru_sort_apply_parameters(void)
{
- struct damos *scheme;
+ struct damos *scheme, *hot_scheme, *cold_scheme;
+ struct damos *old_hot_scheme = NULL, *old_cold_scheme = NULL;
unsigned int hot_thres, cold_thres;
int err = 0;
@@ -195,18 +207,35 @@ static int damon_lru_sort_apply_paramete
if (err)
return err;
+ damon_for_each_scheme(scheme, ctx) {
+ if (!old_hot_scheme) {
+ old_hot_scheme = scheme;
+ continue;
+ }
+ old_cold_scheme = scheme;
+ }
+
hot_thres = damon_max_nr_accesses(&damon_lru_sort_mon_attrs) *
hot_thres_access_freq / 1000;
- scheme = damon_lru_sort_new_hot_scheme(hot_thres);
- if (!scheme)
+ hot_scheme = damon_lru_sort_new_hot_scheme(hot_thres);
+ if (!hot_scheme)
return -ENOMEM;
- damon_set_schemes(ctx, &scheme, 1);
+ if (old_hot_scheme)
+ damon_lru_sort_copy_quota_status(&hot_scheme->quota,
+ &old_hot_scheme->quota);
cold_thres = cold_min_age / damon_lru_sort_mon_attrs.aggr_interval;
- scheme = damon_lru_sort_new_cold_scheme(cold_thres);
- if (!scheme)
+ cold_scheme = damon_lru_sort_new_cold_scheme(cold_thres);
+ if (!cold_scheme) {
+ damon_destroy_scheme(hot_scheme);
return -ENOMEM;
- damon_add_scheme(ctx, scheme);
+ }
+ if (old_cold_scheme)
+ damon_lru_sort_copy_quota_status(&cold_scheme->quota,
+ &old_cold_scheme->quota);
+
+ damon_set_schemes(ctx, &hot_scheme, 1);
+ damon_add_scheme(ctx, cold_scheme);
return damon_set_region_biggest_system_ram_default(target,
&monitor_region_start,
_
Patches currently in -mm which might be from sj(a)kernel.org are
mm-damon-core-check-apply-interval-in-damon_do_apply_schemes.patch
mm-damon-sysfs-schemes-handle-schemes-sysfs-dir-removal-before-commit_schemes_quota_goals.patch
mm-damon-reclaim-fix-quota-stauts-loss-due-to-online-tunings.patch
mm-damon-lru_sort-fix-quota-status-loss-due-to-online-tunings.patch
docs-admin-guide-mm-damon-usage-use-sysfs-interface-for-tracepoints-example.patch
mm-damon-rename-config_damon_dbgfs-to-damon_dbgfs_deprecated.patch
mm-damon-dbgfs-implement-deprecation-notice-file.patch
mm-damon-dbgfs-make-debugfs-interface-deprecation-message-a-macro.patch
docs-admin-guide-mm-damon-usage-document-deprecated-file-of-damon-debugfs-interface.patch
selftets-damon-prepare-for-monitor_on-file-renaming.patch
mm-damon-dbgfs-rename-monitor_on-file-to-monitor_on_deprecated.patch
docs-admin-guide-mm-damon-usage-update-for-monitor_on-renaming.patch
docs-translations-damon-usage-update-for-monitor_on-renaming.patch
mm-damon-sysfs-handle-state-file-inputs-for-every-sampling-interval-if-possible.patch
selftests-damon-_damon_sysfs-support-damos-quota.patch
selftests-damon-_damon_sysfs-support-damos-stats.patch
selftests-damon-_damon_sysfs-support-damos-apply-interval.patch
selftests-damon-add-a-test-for-damos-quota.patch
selftests-damon-add-a-test-for-damos-apply-intervals.patch
selftests-damon-add-a-test-for-a-race-between-target_ids_read-and-dbgfs_before_terminate.patch
selftests-damon-add-a-test-for-the-pid-leak-of-dbgfs_target_ids_write.patch
selftests-damon-_chk_dependency-get-debugfs-mount-point-from-proc-mounts.patch
docs-mm-damon-maintainer-profile-fix-reference-links-for-mm-stable-tree.patch
docs-mm-damon-move-the-list-of-damos-actions-to-design-doc.patch
docs-mm-damon-move-damon-operation-sets-list-from-the-usage-to-the-design-document.patch
docs-mm-damon-move-monitoring-target-regions-setup-detail-from-the-usage-to-the-design-document.patch
docs-admin-guide-mm-damon-usage-fix-wrong-quotas-diabling-condition.patch
mm-damon-core-set-damos_quota-esz-as-public-field-and-document.patch
mm-damon-sysfs-schemes-implement-quota-effective_bytes-file.patch
mm-damon-sysfs-implement-a-kdamond-command-for-updating-schemes-effective-quotas.patch
docs-abi-damon-document-effective_bytes-sysfs-file.patch
docs-admin-guide-mm-damon-usage-document-effective_bytes-file.patch
mm-damon-move-comments-and-fields-for-damos-quota-prioritization-to-the-end.patch
mm-damon-core-split-out-quota-goal-related-fields-to-a-struct.patch
mm-damon-core-add-multiple-goals-per-damos_quota-and-helpers-for-those.patch
mm-damon-sysfs-use-only-quota-goals.patch
mm-damon-core-remove-goal-field-of-damos_quota.patch
mm-damon-core-let-goal-specified-with-only-target-and-current-values.patch
mm-damon-core-support-multiple-metrics-for-quota-goal.patch
mm-damon-core-implement-psi-metric-damos-quota-goal.patch
mm-damon-sysfs-schemes-support-psi-based-quota-auto-tune.patch
docs-mm-damon-design-document-quota-goal-self-tuning.patch
docs-abi-damon-document-quota-goal-metric-file.patch
docs-admin-guide-mm-damon-usage-document-quota-goal-metric-file.patch
mm-damon-reclaim-implement-user-feedback-driven-quota-auto-tuning.patch
mm-damon-reclaim-implement-memory-psi-driven-quota-self-tuning.patch
docs-admin-guide-mm-damon-reclaim-document-auto-tuning-parameters.patch
The patch titled
Subject: mm/damon/reclaim: fix quota stauts loss due to online tunings
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-damon-reclaim-fix-quota-stauts-loss-due-to-online-tunings.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: SeongJae Park <sj(a)kernel.org>
Subject: mm/damon/reclaim: fix quota stauts loss due to online tunings
Date: Fri, 16 Feb 2024 11:40:24 -0800
Patch series "mm/damon: fix quota status loss due to online tunings".
DAMON_RECLAIM and DAMON_LRU_SORT is not preserving internal quota status
when applying new user parameters, and hence could cause temporal quota
accuracy degradation. Fix it by preserving the status.
This patch (of 2):
For online parameters change, DAMON_RECLAIM creates new scheme based on
latest values of the parameters and replaces the old scheme with the new
one. When creating it, the internal status of the quota of the old
scheme is not preserved. As a result, charging of the quota starts from
zero after the online tuning. The data that collected to estimate the
throughput of the scheme's action is also reset, and therefore the
estimation should start from the scratch again. Because the throughput
estimation is being used to convert the time quota to the effective size
quota, this could result in temporal time quota inaccuracy. It would be
recovered over time, though. In short, the quota accuracy could be
temporarily degraded after online parameters update.
Fix the problem by checking the case and copying the internal fields for
the status.
Link: https://lkml.kernel.org/r/20240216194025.9207-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20240216194025.9207-2-sj@kernel.org
Fixes: e035c280f6df ("mm/damon/reclaim: support online inputs update")
Signed-off-by: SeongJae Park <sj(a)kernel.org>
Cc: <stable(a)vger.kernel.org> [5.19+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/damon/reclaim.c | 18 +++++++++++++++++-
1 file changed, 17 insertions(+), 1 deletion(-)
--- a/mm/damon/reclaim.c~mm-damon-reclaim-fix-quota-stauts-loss-due-to-online-tunings
+++ a/mm/damon/reclaim.c
@@ -150,9 +150,20 @@ static struct damos *damon_reclaim_new_s
&damon_reclaim_wmarks);
}
+static void damon_reclaim_copy_quota_status(struct damos_quota *dst,
+ struct damos_quota *src)
+{
+ dst->total_charged_sz = src->total_charged_sz;
+ dst->total_charged_ns = src->total_charged_ns;
+ dst->charged_sz = src->charged_sz;
+ dst->charged_from = src->charged_from;
+ dst->charge_target_from = src->charge_target_from;
+ dst->charge_addr_from = src->charge_addr_from;
+}
+
static int damon_reclaim_apply_parameters(void)
{
- struct damos *scheme;
+ struct damos *scheme, *old_scheme;
struct damos_filter *filter;
int err = 0;
@@ -164,6 +175,11 @@ static int damon_reclaim_apply_parameter
scheme = damon_reclaim_new_scheme();
if (!scheme)
return -ENOMEM;
+ if (!list_empty(&ctx->schemes)) {
+ damon_for_each_scheme(old_scheme, ctx)
+ damon_reclaim_copy_quota_status(&scheme->quota,
+ &old_scheme->quota);
+ }
if (skip_anon) {
filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true);
if (!filter) {
_
Patches currently in -mm which might be from sj(a)kernel.org are
mm-damon-core-check-apply-interval-in-damon_do_apply_schemes.patch
mm-damon-sysfs-schemes-handle-schemes-sysfs-dir-removal-before-commit_schemes_quota_goals.patch
mm-damon-reclaim-fix-quota-stauts-loss-due-to-online-tunings.patch
mm-damon-lru_sort-fix-quota-status-loss-due-to-online-tunings.patch
docs-admin-guide-mm-damon-usage-use-sysfs-interface-for-tracepoints-example.patch
mm-damon-rename-config_damon_dbgfs-to-damon_dbgfs_deprecated.patch
mm-damon-dbgfs-implement-deprecation-notice-file.patch
mm-damon-dbgfs-make-debugfs-interface-deprecation-message-a-macro.patch
docs-admin-guide-mm-damon-usage-document-deprecated-file-of-damon-debugfs-interface.patch
selftets-damon-prepare-for-monitor_on-file-renaming.patch
mm-damon-dbgfs-rename-monitor_on-file-to-monitor_on_deprecated.patch
docs-admin-guide-mm-damon-usage-update-for-monitor_on-renaming.patch
docs-translations-damon-usage-update-for-monitor_on-renaming.patch
mm-damon-sysfs-handle-state-file-inputs-for-every-sampling-interval-if-possible.patch
selftests-damon-_damon_sysfs-support-damos-quota.patch
selftests-damon-_damon_sysfs-support-damos-stats.patch
selftests-damon-_damon_sysfs-support-damos-apply-interval.patch
selftests-damon-add-a-test-for-damos-quota.patch
selftests-damon-add-a-test-for-damos-apply-intervals.patch
selftests-damon-add-a-test-for-a-race-between-target_ids_read-and-dbgfs_before_terminate.patch
selftests-damon-add-a-test-for-the-pid-leak-of-dbgfs_target_ids_write.patch
selftests-damon-_chk_dependency-get-debugfs-mount-point-from-proc-mounts.patch
docs-mm-damon-maintainer-profile-fix-reference-links-for-mm-stable-tree.patch
docs-mm-damon-move-the-list-of-damos-actions-to-design-doc.patch
docs-mm-damon-move-damon-operation-sets-list-from-the-usage-to-the-design-document.patch
docs-mm-damon-move-monitoring-target-regions-setup-detail-from-the-usage-to-the-design-document.patch
docs-admin-guide-mm-damon-usage-fix-wrong-quotas-diabling-condition.patch
mm-damon-core-set-damos_quota-esz-as-public-field-and-document.patch
mm-damon-sysfs-schemes-implement-quota-effective_bytes-file.patch
mm-damon-sysfs-implement-a-kdamond-command-for-updating-schemes-effective-quotas.patch
docs-abi-damon-document-effective_bytes-sysfs-file.patch
docs-admin-guide-mm-damon-usage-document-effective_bytes-file.patch
mm-damon-move-comments-and-fields-for-damos-quota-prioritization-to-the-end.patch
mm-damon-core-split-out-quota-goal-related-fields-to-a-struct.patch
mm-damon-core-add-multiple-goals-per-damos_quota-and-helpers-for-those.patch
mm-damon-sysfs-use-only-quota-goals.patch
mm-damon-core-remove-goal-field-of-damos_quota.patch
mm-damon-core-let-goal-specified-with-only-target-and-current-values.patch
mm-damon-core-support-multiple-metrics-for-quota-goal.patch
mm-damon-core-implement-psi-metric-damos-quota-goal.patch
mm-damon-sysfs-schemes-support-psi-based-quota-auto-tune.patch
docs-mm-damon-design-document-quota-goal-self-tuning.patch
docs-abi-damon-document-quota-goal-metric-file.patch
docs-admin-guide-mm-damon-usage-document-quota-goal-metric-file.patch
mm-damon-reclaim-implement-user-feedback-driven-quota-auto-tuning.patch
mm-damon-reclaim-implement-memory-psi-driven-quota-self-tuning.patch
docs-admin-guide-mm-damon-reclaim-document-auto-tuning-parameters.patch
While mq_perf_tests runs with the default kselftest timeout limit, which
is 45 seconds, the test takes about 60 seconds to complete on i3.metal
AWS instances. Hence, the test always times out. Increase the timeout
to 100 seconds.
Fixes: 852c8cbf34d3 ("selftests/kselftest/runner.sh: Add 45 second timeout per test")
Cc: <stable(a)vger.kernel.org> # 5.4.x
Signed-off-by: SeongJae Park <sj(a)kernel.org>
---
tools/testing/selftests/mqueue/setting | 1 +
1 file changed, 1 insertion(+)
create mode 100644 tools/testing/selftests/mqueue/setting
diff --git a/tools/testing/selftests/mqueue/setting b/tools/testing/selftests/mqueue/setting
new file mode 100644
index 000000000000..54dc12287839
--- /dev/null
+++ b/tools/testing/selftests/mqueue/setting
@@ -0,0 +1 @@
+timeout=100
--
2.39.2
When linking or renaming a file, if only one of the source or
destination directory is backed by an S_PRIVATE inode, then the related
set of layer masks would be used as uninitialized by
is_access_to_paths_allowed(). This would result to indeterministic
access for one side instead of always being allowed.
This bug could only be triggered with a mounted filesystem containing
both S_PRIVATE and !S_PRIVATE inodes, which doesn't seem possible.
The collect_domain_accesses() calls return early if
is_nouser_or_private() returns false, which means that the directory's
superblock has SB_NOUSER or its inode has S_PRIVATE. Because rename or
link actions are only allowed on the same mounted filesystem, the
superblock is always the same for both source and destination
directories. However, it might be possible in theory to have an
S_PRIVATE parent source inode with an !S_PRIVATE parent destination
inode, or vice versa.
To make sure this case is not an issue, explicitly initialized both set
of layer masks to 0, which means to allow all actions on the related
side. If at least on side has !S_PRIVATE, then
collect_domain_accesses() and is_access_to_paths_allowed() check for the
required access rights.
Cc: Arnd Bergmann <arnd(a)arndb.de>
Cc: Christian Brauner <brauner(a)kernel.org>
Cc: Günther Noack <gnoack(a)google.com>
Cc: Jann Horn <jannh(a)google.com>
Cc: Shervin Oloumi <enlightened(a)chromium.org>
Cc: stable(a)vger.kernel.org
Fixes: b91c3e4ea756 ("landlock: Add support for file reparenting with LANDLOCK_ACCESS_FS_REFER")
Signed-off-by: Mickaël Salaün <mic(a)digikod.net>
---
security/landlock/fs.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/security/landlock/fs.c b/security/landlock/fs.c
index 90f7f6db1e87..f243c6a392ee 100644
--- a/security/landlock/fs.c
+++ b/security/landlock/fs.c
@@ -1093,8 +1093,8 @@ static int current_check_refer_path(struct dentry *const old_dentry,
bool allow_parent1, allow_parent2;
access_mask_t access_request_parent1, access_request_parent2;
struct path mnt_dir;
- layer_mask_t layer_masks_parent1[LANDLOCK_NUM_ACCESS_FS],
- layer_masks_parent2[LANDLOCK_NUM_ACCESS_FS];
+ layer_mask_t layer_masks_parent1[LANDLOCK_NUM_ACCESS_FS] = {},
+ layer_masks_parent2[LANDLOCK_NUM_ACCESS_FS] = {};
if (!dom)
return 0;
--
2.43.0
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x 4e440abc894585a34c2904a32cd54af1742311b3
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021941-jelly-tubular-7919@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
4e440abc8945 ("hwmon: (coretemp) Fix out-of-bounds memory access")
7108b80a542b ("hwmon/coretemp: Handle large core ID value")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 4e440abc894585a34c2904a32cd54af1742311b3 Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang(a)intel.com>
Date: Fri, 2 Feb 2024 17:21:34 +0800
Subject: [PATCH] hwmon: (coretemp) Fix out-of-bounds memory access
Fix a bug that pdata->cpu_map[] is set before out-of-bounds check.
The problem might be triggered on systems with more than 128 cores per
package.
Fixes: 7108b80a542b ("hwmon/coretemp: Handle large core ID value")
Signed-off-by: Zhang Rui <rui.zhang(a)intel.com>
Cc: <stable(a)vger.kernel.org>
Link: https://lore.kernel.org/r/20240202092144.71180-2-rui.zhang@intel.com
Signed-off-by: Guenter Roeck <linux(a)roeck-us.net>
diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c
index ba82d1e79c13..e78c76919111 100644
--- a/drivers/hwmon/coretemp.c
+++ b/drivers/hwmon/coretemp.c
@@ -509,18 +509,14 @@ static int create_core_data(struct platform_device *pdev, unsigned int cpu,
if (pkg_flag) {
attr_no = PKG_SYSFS_ATTR_NO;
} else {
- index = ida_alloc(&pdata->ida, GFP_KERNEL);
+ index = ida_alloc_max(&pdata->ida, NUM_REAL_CORES - 1, GFP_KERNEL);
if (index < 0)
return index;
+
pdata->cpu_map[index] = topology_core_id(cpu);
attr_no = index + BASE_SYSFS_ATTR_NO;
}
- if (attr_no > MAX_CORE_DATA - 1) {
- err = -ERANGE;
- goto ida_free;
- }
-
tdata = init_temp_data(cpu, pkg_flag);
if (!tdata) {
err = -ENOMEM;
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 4e440abc894585a34c2904a32cd54af1742311b3
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021939-concierge-eclipse-4ffd@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
4e440abc8945 ("hwmon: (coretemp) Fix out-of-bounds memory access")
7108b80a542b ("hwmon/coretemp: Handle large core ID value")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 4e440abc894585a34c2904a32cd54af1742311b3 Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang(a)intel.com>
Date: Fri, 2 Feb 2024 17:21:34 +0800
Subject: [PATCH] hwmon: (coretemp) Fix out-of-bounds memory access
Fix a bug that pdata->cpu_map[] is set before out-of-bounds check.
The problem might be triggered on systems with more than 128 cores per
package.
Fixes: 7108b80a542b ("hwmon/coretemp: Handle large core ID value")
Signed-off-by: Zhang Rui <rui.zhang(a)intel.com>
Cc: <stable(a)vger.kernel.org>
Link: https://lore.kernel.org/r/20240202092144.71180-2-rui.zhang@intel.com
Signed-off-by: Guenter Roeck <linux(a)roeck-us.net>
diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c
index ba82d1e79c13..e78c76919111 100644
--- a/drivers/hwmon/coretemp.c
+++ b/drivers/hwmon/coretemp.c
@@ -509,18 +509,14 @@ static int create_core_data(struct platform_device *pdev, unsigned int cpu,
if (pkg_flag) {
attr_no = PKG_SYSFS_ATTR_NO;
} else {
- index = ida_alloc(&pdata->ida, GFP_KERNEL);
+ index = ida_alloc_max(&pdata->ida, NUM_REAL_CORES - 1, GFP_KERNEL);
if (index < 0)
return index;
+
pdata->cpu_map[index] = topology_core_id(cpu);
attr_no = index + BASE_SYSFS_ATTR_NO;
}
- if (attr_no > MAX_CORE_DATA - 1) {
- err = -ERANGE;
- goto ida_free;
- }
-
tdata = init_temp_data(cpu, pkg_flag);
if (!tdata) {
err = -ENOMEM;
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 4e440abc894585a34c2904a32cd54af1742311b3
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021940-monsieur-unshipped-a926@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
4e440abc8945 ("hwmon: (coretemp) Fix out-of-bounds memory access")
7108b80a542b ("hwmon/coretemp: Handle large core ID value")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 4e440abc894585a34c2904a32cd54af1742311b3 Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang(a)intel.com>
Date: Fri, 2 Feb 2024 17:21:34 +0800
Subject: [PATCH] hwmon: (coretemp) Fix out-of-bounds memory access
Fix a bug that pdata->cpu_map[] is set before out-of-bounds check.
The problem might be triggered on systems with more than 128 cores per
package.
Fixes: 7108b80a542b ("hwmon/coretemp: Handle large core ID value")
Signed-off-by: Zhang Rui <rui.zhang(a)intel.com>
Cc: <stable(a)vger.kernel.org>
Link: https://lore.kernel.org/r/20240202092144.71180-2-rui.zhang@intel.com
Signed-off-by: Guenter Roeck <linux(a)roeck-us.net>
diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c
index ba82d1e79c13..e78c76919111 100644
--- a/drivers/hwmon/coretemp.c
+++ b/drivers/hwmon/coretemp.c
@@ -509,18 +509,14 @@ static int create_core_data(struct platform_device *pdev, unsigned int cpu,
if (pkg_flag) {
attr_no = PKG_SYSFS_ATTR_NO;
} else {
- index = ida_alloc(&pdata->ida, GFP_KERNEL);
+ index = ida_alloc_max(&pdata->ida, NUM_REAL_CORES - 1, GFP_KERNEL);
if (index < 0)
return index;
+
pdata->cpu_map[index] = topology_core_id(cpu);
attr_no = index + BASE_SYSFS_ATTR_NO;
}
- if (attr_no > MAX_CORE_DATA - 1) {
- err = -ERANGE;
- goto ida_free;
- }
-
tdata = init_temp_data(cpu, pkg_flag);
if (!tdata) {
err = -ENOMEM;
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 4e440abc894585a34c2904a32cd54af1742311b3
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021939-blunt-uncouple-5a31@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
4e440abc8945 ("hwmon: (coretemp) Fix out-of-bounds memory access")
7108b80a542b ("hwmon/coretemp: Handle large core ID value")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 4e440abc894585a34c2904a32cd54af1742311b3 Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang(a)intel.com>
Date: Fri, 2 Feb 2024 17:21:34 +0800
Subject: [PATCH] hwmon: (coretemp) Fix out-of-bounds memory access
Fix a bug that pdata->cpu_map[] is set before out-of-bounds check.
The problem might be triggered on systems with more than 128 cores per
package.
Fixes: 7108b80a542b ("hwmon/coretemp: Handle large core ID value")
Signed-off-by: Zhang Rui <rui.zhang(a)intel.com>
Cc: <stable(a)vger.kernel.org>
Link: https://lore.kernel.org/r/20240202092144.71180-2-rui.zhang@intel.com
Signed-off-by: Guenter Roeck <linux(a)roeck-us.net>
diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c
index ba82d1e79c13..e78c76919111 100644
--- a/drivers/hwmon/coretemp.c
+++ b/drivers/hwmon/coretemp.c
@@ -509,18 +509,14 @@ static int create_core_data(struct platform_device *pdev, unsigned int cpu,
if (pkg_flag) {
attr_no = PKG_SYSFS_ATTR_NO;
} else {
- index = ida_alloc(&pdata->ida, GFP_KERNEL);
+ index = ida_alloc_max(&pdata->ida, NUM_REAL_CORES - 1, GFP_KERNEL);
if (index < 0)
return index;
+
pdata->cpu_map[index] = topology_core_id(cpu);
attr_no = index + BASE_SYSFS_ATTR_NO;
}
- if (attr_no > MAX_CORE_DATA - 1) {
- err = -ERANGE;
- goto ida_free;
- }
-
tdata = init_temp_data(cpu, pkg_flag);
if (!tdata) {
err = -ENOMEM;
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 4e440abc894585a34c2904a32cd54af1742311b3
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021938-relieving-boneless-4cca@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
4e440abc8945 ("hwmon: (coretemp) Fix out-of-bounds memory access")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 4e440abc894585a34c2904a32cd54af1742311b3 Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang(a)intel.com>
Date: Fri, 2 Feb 2024 17:21:34 +0800
Subject: [PATCH] hwmon: (coretemp) Fix out-of-bounds memory access
Fix a bug that pdata->cpu_map[] is set before out-of-bounds check.
The problem might be triggered on systems with more than 128 cores per
package.
Fixes: 7108b80a542b ("hwmon/coretemp: Handle large core ID value")
Signed-off-by: Zhang Rui <rui.zhang(a)intel.com>
Cc: <stable(a)vger.kernel.org>
Link: https://lore.kernel.org/r/20240202092144.71180-2-rui.zhang@intel.com
Signed-off-by: Guenter Roeck <linux(a)roeck-us.net>
diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c
index ba82d1e79c13..e78c76919111 100644
--- a/drivers/hwmon/coretemp.c
+++ b/drivers/hwmon/coretemp.c
@@ -509,18 +509,14 @@ static int create_core_data(struct platform_device *pdev, unsigned int cpu,
if (pkg_flag) {
attr_no = PKG_SYSFS_ATTR_NO;
} else {
- index = ida_alloc(&pdata->ida, GFP_KERNEL);
+ index = ida_alloc_max(&pdata->ida, NUM_REAL_CORES - 1, GFP_KERNEL);
if (index < 0)
return index;
+
pdata->cpu_map[index] = topology_core_id(cpu);
attr_no = index + BASE_SYSFS_ATTR_NO;
}
- if (attr_no > MAX_CORE_DATA - 1) {
- err = -ERANGE;
- goto ida_free;
- }
-
tdata = init_temp_data(cpu, pkg_flag);
if (!tdata) {
err = -ENOMEM;
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 4e440abc894585a34c2904a32cd54af1742311b3
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021937-revered-agreement-261e@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
4e440abc8945 ("hwmon: (coretemp) Fix out-of-bounds memory access")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 4e440abc894585a34c2904a32cd54af1742311b3 Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang(a)intel.com>
Date: Fri, 2 Feb 2024 17:21:34 +0800
Subject: [PATCH] hwmon: (coretemp) Fix out-of-bounds memory access
Fix a bug that pdata->cpu_map[] is set before out-of-bounds check.
The problem might be triggered on systems with more than 128 cores per
package.
Fixes: 7108b80a542b ("hwmon/coretemp: Handle large core ID value")
Signed-off-by: Zhang Rui <rui.zhang(a)intel.com>
Cc: <stable(a)vger.kernel.org>
Link: https://lore.kernel.org/r/20240202092144.71180-2-rui.zhang@intel.com
Signed-off-by: Guenter Roeck <linux(a)roeck-us.net>
diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c
index ba82d1e79c13..e78c76919111 100644
--- a/drivers/hwmon/coretemp.c
+++ b/drivers/hwmon/coretemp.c
@@ -509,18 +509,14 @@ static int create_core_data(struct platform_device *pdev, unsigned int cpu,
if (pkg_flag) {
attr_no = PKG_SYSFS_ATTR_NO;
} else {
- index = ida_alloc(&pdata->ida, GFP_KERNEL);
+ index = ida_alloc_max(&pdata->ida, NUM_REAL_CORES - 1, GFP_KERNEL);
if (index < 0)
return index;
+
pdata->cpu_map[index] = topology_core_id(cpu);
attr_no = index + BASE_SYSFS_ATTR_NO;
}
- if (attr_no > MAX_CORE_DATA - 1) {
- err = -ERANGE;
- goto ida_free;
- }
-
tdata = init_temp_data(cpu, pkg_flag);
if (!tdata) {
err = -ENOMEM;
The patch below does not apply to the 6.7-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.7.y
git checkout FETCH_HEAD
git cherry-pick -x 4e440abc894585a34c2904a32cd54af1742311b3
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021936-mulch-prone-c0a6@gregkh' --subject-prefix 'PATCH 6.7.y' HEAD^..
Possible dependencies:
4e440abc8945 ("hwmon: (coretemp) Fix out-of-bounds memory access")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 4e440abc894585a34c2904a32cd54af1742311b3 Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang(a)intel.com>
Date: Fri, 2 Feb 2024 17:21:34 +0800
Subject: [PATCH] hwmon: (coretemp) Fix out-of-bounds memory access
Fix a bug that pdata->cpu_map[] is set before out-of-bounds check.
The problem might be triggered on systems with more than 128 cores per
package.
Fixes: 7108b80a542b ("hwmon/coretemp: Handle large core ID value")
Signed-off-by: Zhang Rui <rui.zhang(a)intel.com>
Cc: <stable(a)vger.kernel.org>
Link: https://lore.kernel.org/r/20240202092144.71180-2-rui.zhang@intel.com
Signed-off-by: Guenter Roeck <linux(a)roeck-us.net>
diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c
index ba82d1e79c13..e78c76919111 100644
--- a/drivers/hwmon/coretemp.c
+++ b/drivers/hwmon/coretemp.c
@@ -509,18 +509,14 @@ static int create_core_data(struct platform_device *pdev, unsigned int cpu,
if (pkg_flag) {
attr_no = PKG_SYSFS_ATTR_NO;
} else {
- index = ida_alloc(&pdata->ida, GFP_KERNEL);
+ index = ida_alloc_max(&pdata->ida, NUM_REAL_CORES - 1, GFP_KERNEL);
if (index < 0)
return index;
+
pdata->cpu_map[index] = topology_core_id(cpu);
attr_no = index + BASE_SYSFS_ATTR_NO;
}
- if (attr_no > MAX_CORE_DATA - 1) {
- err = -ERANGE;
- goto ida_free;
- }
-
tdata = init_temp_data(cpu, pkg_flag);
if (!tdata) {
err = -ENOMEM;
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x fa765c4b4aed2d64266b694520ecb025c862c5a9
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021921-why-roamer-871c@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
fa765c4b4aed ("xen/events: close evtchn after mapping cleanup")
3fcdaf3d7634 ("xen/events: modify internal [un]bind interfaces")
5dd9ad32d775 ("xen/events: drop xen_allocate_irqs_dynamic()")
3bdb0ac350fe ("xen/events: remove some simple helpers from events_base.c")
686464514fbe ("xen/events: reduce externally visible helper functions")
e64e7c74b99e ("xen/events: avoid using info_for_irq() in xen_send_IPI_one()")
9e90e58c11b7 ("xen: evtchn: Allow shared registration of IRQ handers")
87797fad6cce ("xen/events: replace evtchn_rwlock with RCU")
58f6259b7a08 ("xen/evtchn: Introduce new IOCTL to bind static evtchn")
04d684875b30 ("xen: xen_debug_interrupt prototype to global header")
073352e951f6 ("genirq: Add and use an irq_data_update_affinity helper")
961343d78226 ("genirq: Refactor accessors to use irq_data_get_affinity_mask")
83f877a09516 ("xen/events: remove redundant initialization of variable irq")
3de218ff39b9 ("xen/events: reset active flag for lateeoi events later")
d120198bd5ff ("xen/evtchn: Change irq_info lock to raw_spinlock_t")
b6622798bc50 ("xen/events: avoid handling the same event on two cpus at the same time")
25da4618af24 ("xen/events: don't unmask an event channel when an eoi is pending")
06f45fe96fcd ("xen/events: add per-xenbus device event statistics and settings")
f2fa0e5e9f31 ("xen/events: link interdomain events to associated xenbus device")
88f0a9d06644 ("xen/events: Implement irq distribution")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From fa765c4b4aed2d64266b694520ecb025c862c5a9 Mon Sep 17 00:00:00 2001
From: Maximilian Heyne <mheyne(a)amazon.de>
Date: Wed, 24 Jan 2024 16:31:28 +0000
Subject: [PATCH] xen/events: close evtchn after mapping cleanup
shutdown_pirq and startup_pirq are not taking the
irq_mapping_update_lock because they can't due to lock inversion. Both
are called with the irq_desc->lock being taking. The lock order,
however, is first irq_mapping_update_lock and then irq_desc->lock.
This opens multiple races:
- shutdown_pirq can be interrupted by a function that allocates an event
channel:
CPU0 CPU1
shutdown_pirq {
xen_evtchn_close(e)
__startup_pirq {
EVTCHNOP_bind_pirq
-> returns just freed evtchn e
set_evtchn_to_irq(e, irq)
}
xen_irq_info_cleanup() {
set_evtchn_to_irq(e, -1)
}
}
Assume here event channel e refers here to the same event channel
number.
After this race the evtchn_to_irq mapping for e is invalid (-1).
- __startup_pirq races with __unbind_from_irq in a similar way. Because
__startup_pirq doesn't take irq_mapping_update_lock it can grab the
evtchn that __unbind_from_irq is currently freeing and cleaning up. In
this case even though the event channel is allocated, its mapping can
be unset in evtchn_to_irq.
The fix is to first cleanup the mappings and then close the event
channel. In this way, when an event channel gets allocated it's
potential previous evtchn_to_irq mappings are guaranteed to be unset already.
This is also the reverse order of the allocation where first the event
channel is allocated and then the mappings are setup.
On a 5.10 kernel prior to commit 3fcdaf3d7634 ("xen/events: modify internal
[un]bind interfaces"), we hit a BUG like the following during probing of NVMe
devices. The issue is that during nvme_setup_io_queues, pci_free_irq
is called for every device which results in a call to shutdown_pirq.
With many nvme devices it's therefore likely to hit this race during
boot because there will be multiple calls to shutdown_pirq and
startup_pirq are running potentially in parallel.
------------[ cut here ]------------
blkfront: xvda: barrier or flush: disabled; persistent grants: enabled; indirect descriptors: enabled; bounce buffer: enabled
kernel BUG at drivers/xen/events/events_base.c:499!
invalid opcode: 0000 [#1] SMP PTI
CPU: 44 PID: 375 Comm: kworker/u257:23 Not tainted 5.10.201-191.748.amzn2.x86_64 #1
Hardware name: Xen HVM domU, BIOS 4.11.amazon 08/24/2006
Workqueue: nvme-reset-wq nvme_reset_work
RIP: 0010:bind_evtchn_to_cpu+0xdf/0xf0
Code: 5d 41 5e c3 cc cc cc cc 44 89 f7 e8 2b 55 ad ff 49 89 c5 48 85 c0 0f 84 64 ff ff ff 4c 8b 68 30 41 83 fe ff 0f 85 60 ff ff ff <0f> 0b 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 0f 1f 44 00 00
RSP: 0000:ffffc9000d533b08 EFLAGS: 00010046
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000006
RDX: 0000000000000028 RSI: 00000000ffffffff RDI: 00000000ffffffff
RBP: ffff888107419680 R08: 0000000000000000 R09: ffffffff82d72b00
R10: 0000000000000000 R11: 0000000000000000 R12: 00000000000001ed
R13: 0000000000000000 R14: 00000000ffffffff R15: 0000000000000002
FS: 0000000000000000(0000) GS:ffff88bc8b500000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000000 CR3: 0000000002610001 CR4: 00000000001706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
? show_trace_log_lvl+0x1c1/0x2d9
? show_trace_log_lvl+0x1c1/0x2d9
? set_affinity_irq+0xdc/0x1c0
? __die_body.cold+0x8/0xd
? die+0x2b/0x50
? do_trap+0x90/0x110
? bind_evtchn_to_cpu+0xdf/0xf0
? do_error_trap+0x65/0x80
? bind_evtchn_to_cpu+0xdf/0xf0
? exc_invalid_op+0x4e/0x70
? bind_evtchn_to_cpu+0xdf/0xf0
? asm_exc_invalid_op+0x12/0x20
? bind_evtchn_to_cpu+0xdf/0xf0
? bind_evtchn_to_cpu+0xc5/0xf0
set_affinity_irq+0xdc/0x1c0
irq_do_set_affinity+0x1d7/0x1f0
irq_setup_affinity+0xd6/0x1a0
irq_startup+0x8a/0xf0
__setup_irq+0x639/0x6d0
? nvme_suspend+0x150/0x150
request_threaded_irq+0x10c/0x180
? nvme_suspend+0x150/0x150
pci_request_irq+0xa8/0xf0
? __blk_mq_free_request+0x74/0xa0
queue_request_irq+0x6f/0x80
nvme_create_queue+0x1af/0x200
nvme_create_io_queues+0xbd/0xf0
nvme_setup_io_queues+0x246/0x320
? nvme_irq_check+0x30/0x30
nvme_reset_work+0x1c8/0x400
process_one_work+0x1b0/0x350
worker_thread+0x49/0x310
? process_one_work+0x350/0x350
kthread+0x11b/0x140
? __kthread_bind_mask+0x60/0x60
ret_from_fork+0x22/0x30
Modules linked in:
---[ end trace a11715de1eee1873 ]---
Fixes: d46a78b05c0e ("xen: implement pirq type event channels")
Cc: stable(a)vger.kernel.org
Co-debugged-by: Andrew Panyakin <apanyaki(a)amazon.com>
Signed-off-by: Maximilian Heyne <mheyne(a)amazon.de>
Reviewed-by: Juergen Gross <jgross(a)suse.com>
Link: https://lore.kernel.org/r/20240124163130.31324-1-mheyne@amazon.de
Signed-off-by: Juergen Gross <jgross(a)suse.com>
diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c
index b8cfea7812d6..3b9f080109d7 100644
--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -923,8 +923,8 @@ static void shutdown_pirq(struct irq_data *data)
return;
do_mask(info, EVT_MASK_REASON_EXPLICIT);
- xen_evtchn_close(evtchn);
xen_irq_info_cleanup(info);
+ xen_evtchn_close(evtchn);
}
static void enable_pirq(struct irq_data *data)
@@ -956,6 +956,7 @@ EXPORT_SYMBOL_GPL(xen_irq_from_gsi);
static void __unbind_from_irq(struct irq_info *info, unsigned int irq)
{
evtchn_port_t evtchn;
+ bool close_evtchn = false;
if (!info) {
xen_irq_free_desc(irq);
@@ -975,7 +976,7 @@ static void __unbind_from_irq(struct irq_info *info, unsigned int irq)
struct xenbus_device *dev;
if (!info->is_static)
- xen_evtchn_close(evtchn);
+ close_evtchn = true;
switch (info->type) {
case IRQT_VIRQ:
@@ -995,6 +996,9 @@ static void __unbind_from_irq(struct irq_info *info, unsigned int irq)
}
xen_irq_info_cleanup(info);
+
+ if (close_evtchn)
+ xen_evtchn_close(evtchn);
}
xen_free_irq(info);
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x fa765c4b4aed2d64266b694520ecb025c862c5a9
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021917-krypton-upcountry-d467@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
fa765c4b4aed ("xen/events: close evtchn after mapping cleanup")
3fcdaf3d7634 ("xen/events: modify internal [un]bind interfaces")
5dd9ad32d775 ("xen/events: drop xen_allocate_irqs_dynamic()")
3bdb0ac350fe ("xen/events: remove some simple helpers from events_base.c")
686464514fbe ("xen/events: reduce externally visible helper functions")
e64e7c74b99e ("xen/events: avoid using info_for_irq() in xen_send_IPI_one()")
9e90e58c11b7 ("xen: evtchn: Allow shared registration of IRQ handers")
87797fad6cce ("xen/events: replace evtchn_rwlock with RCU")
58f6259b7a08 ("xen/evtchn: Introduce new IOCTL to bind static evtchn")
04d684875b30 ("xen: xen_debug_interrupt prototype to global header")
073352e951f6 ("genirq: Add and use an irq_data_update_affinity helper")
961343d78226 ("genirq: Refactor accessors to use irq_data_get_affinity_mask")
83f877a09516 ("xen/events: remove redundant initialization of variable irq")
3de218ff39b9 ("xen/events: reset active flag for lateeoi events later")
d120198bd5ff ("xen/evtchn: Change irq_info lock to raw_spinlock_t")
b6622798bc50 ("xen/events: avoid handling the same event on two cpus at the same time")
25da4618af24 ("xen/events: don't unmask an event channel when an eoi is pending")
06f45fe96fcd ("xen/events: add per-xenbus device event statistics and settings")
f2fa0e5e9f31 ("xen/events: link interdomain events to associated xenbus device")
88f0a9d06644 ("xen/events: Implement irq distribution")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From fa765c4b4aed2d64266b694520ecb025c862c5a9 Mon Sep 17 00:00:00 2001
From: Maximilian Heyne <mheyne(a)amazon.de>
Date: Wed, 24 Jan 2024 16:31:28 +0000
Subject: [PATCH] xen/events: close evtchn after mapping cleanup
shutdown_pirq and startup_pirq are not taking the
irq_mapping_update_lock because they can't due to lock inversion. Both
are called with the irq_desc->lock being taking. The lock order,
however, is first irq_mapping_update_lock and then irq_desc->lock.
This opens multiple races:
- shutdown_pirq can be interrupted by a function that allocates an event
channel:
CPU0 CPU1
shutdown_pirq {
xen_evtchn_close(e)
__startup_pirq {
EVTCHNOP_bind_pirq
-> returns just freed evtchn e
set_evtchn_to_irq(e, irq)
}
xen_irq_info_cleanup() {
set_evtchn_to_irq(e, -1)
}
}
Assume here event channel e refers here to the same event channel
number.
After this race the evtchn_to_irq mapping for e is invalid (-1).
- __startup_pirq races with __unbind_from_irq in a similar way. Because
__startup_pirq doesn't take irq_mapping_update_lock it can grab the
evtchn that __unbind_from_irq is currently freeing and cleaning up. In
this case even though the event channel is allocated, its mapping can
be unset in evtchn_to_irq.
The fix is to first cleanup the mappings and then close the event
channel. In this way, when an event channel gets allocated it's
potential previous evtchn_to_irq mappings are guaranteed to be unset already.
This is also the reverse order of the allocation where first the event
channel is allocated and then the mappings are setup.
On a 5.10 kernel prior to commit 3fcdaf3d7634 ("xen/events: modify internal
[un]bind interfaces"), we hit a BUG like the following during probing of NVMe
devices. The issue is that during nvme_setup_io_queues, pci_free_irq
is called for every device which results in a call to shutdown_pirq.
With many nvme devices it's therefore likely to hit this race during
boot because there will be multiple calls to shutdown_pirq and
startup_pirq are running potentially in parallel.
------------[ cut here ]------------
blkfront: xvda: barrier or flush: disabled; persistent grants: enabled; indirect descriptors: enabled; bounce buffer: enabled
kernel BUG at drivers/xen/events/events_base.c:499!
invalid opcode: 0000 [#1] SMP PTI
CPU: 44 PID: 375 Comm: kworker/u257:23 Not tainted 5.10.201-191.748.amzn2.x86_64 #1
Hardware name: Xen HVM domU, BIOS 4.11.amazon 08/24/2006
Workqueue: nvme-reset-wq nvme_reset_work
RIP: 0010:bind_evtchn_to_cpu+0xdf/0xf0
Code: 5d 41 5e c3 cc cc cc cc 44 89 f7 e8 2b 55 ad ff 49 89 c5 48 85 c0 0f 84 64 ff ff ff 4c 8b 68 30 41 83 fe ff 0f 85 60 ff ff ff <0f> 0b 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 0f 1f 44 00 00
RSP: 0000:ffffc9000d533b08 EFLAGS: 00010046
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000006
RDX: 0000000000000028 RSI: 00000000ffffffff RDI: 00000000ffffffff
RBP: ffff888107419680 R08: 0000000000000000 R09: ffffffff82d72b00
R10: 0000000000000000 R11: 0000000000000000 R12: 00000000000001ed
R13: 0000000000000000 R14: 00000000ffffffff R15: 0000000000000002
FS: 0000000000000000(0000) GS:ffff88bc8b500000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000000 CR3: 0000000002610001 CR4: 00000000001706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
? show_trace_log_lvl+0x1c1/0x2d9
? show_trace_log_lvl+0x1c1/0x2d9
? set_affinity_irq+0xdc/0x1c0
? __die_body.cold+0x8/0xd
? die+0x2b/0x50
? do_trap+0x90/0x110
? bind_evtchn_to_cpu+0xdf/0xf0
? do_error_trap+0x65/0x80
? bind_evtchn_to_cpu+0xdf/0xf0
? exc_invalid_op+0x4e/0x70
? bind_evtchn_to_cpu+0xdf/0xf0
? asm_exc_invalid_op+0x12/0x20
? bind_evtchn_to_cpu+0xdf/0xf0
? bind_evtchn_to_cpu+0xc5/0xf0
set_affinity_irq+0xdc/0x1c0
irq_do_set_affinity+0x1d7/0x1f0
irq_setup_affinity+0xd6/0x1a0
irq_startup+0x8a/0xf0
__setup_irq+0x639/0x6d0
? nvme_suspend+0x150/0x150
request_threaded_irq+0x10c/0x180
? nvme_suspend+0x150/0x150
pci_request_irq+0xa8/0xf0
? __blk_mq_free_request+0x74/0xa0
queue_request_irq+0x6f/0x80
nvme_create_queue+0x1af/0x200
nvme_create_io_queues+0xbd/0xf0
nvme_setup_io_queues+0x246/0x320
? nvme_irq_check+0x30/0x30
nvme_reset_work+0x1c8/0x400
process_one_work+0x1b0/0x350
worker_thread+0x49/0x310
? process_one_work+0x350/0x350
kthread+0x11b/0x140
? __kthread_bind_mask+0x60/0x60
ret_from_fork+0x22/0x30
Modules linked in:
---[ end trace a11715de1eee1873 ]---
Fixes: d46a78b05c0e ("xen: implement pirq type event channels")
Cc: stable(a)vger.kernel.org
Co-debugged-by: Andrew Panyakin <apanyaki(a)amazon.com>
Signed-off-by: Maximilian Heyne <mheyne(a)amazon.de>
Reviewed-by: Juergen Gross <jgross(a)suse.com>
Link: https://lore.kernel.org/r/20240124163130.31324-1-mheyne@amazon.de
Signed-off-by: Juergen Gross <jgross(a)suse.com>
diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c
index b8cfea7812d6..3b9f080109d7 100644
--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -923,8 +923,8 @@ static void shutdown_pirq(struct irq_data *data)
return;
do_mask(info, EVT_MASK_REASON_EXPLICIT);
- xen_evtchn_close(evtchn);
xen_irq_info_cleanup(info);
+ xen_evtchn_close(evtchn);
}
static void enable_pirq(struct irq_data *data)
@@ -956,6 +956,7 @@ EXPORT_SYMBOL_GPL(xen_irq_from_gsi);
static void __unbind_from_irq(struct irq_info *info, unsigned int irq)
{
evtchn_port_t evtchn;
+ bool close_evtchn = false;
if (!info) {
xen_irq_free_desc(irq);
@@ -975,7 +976,7 @@ static void __unbind_from_irq(struct irq_info *info, unsigned int irq)
struct xenbus_device *dev;
if (!info->is_static)
- xen_evtchn_close(evtchn);
+ close_evtchn = true;
switch (info->type) {
case IRQT_VIRQ:
@@ -995,6 +996,9 @@ static void __unbind_from_irq(struct irq_info *info, unsigned int irq)
}
xen_irq_info_cleanup(info);
+
+ if (close_evtchn)
+ xen_evtchn_close(evtchn);
}
xen_free_irq(info);
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x fa765c4b4aed2d64266b694520ecb025c862c5a9
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021914-wackiness-diagnoses-52e2@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
fa765c4b4aed ("xen/events: close evtchn after mapping cleanup")
3fcdaf3d7634 ("xen/events: modify internal [un]bind interfaces")
5dd9ad32d775 ("xen/events: drop xen_allocate_irqs_dynamic()")
3bdb0ac350fe ("xen/events: remove some simple helpers from events_base.c")
686464514fbe ("xen/events: reduce externally visible helper functions")
e64e7c74b99e ("xen/events: avoid using info_for_irq() in xen_send_IPI_one()")
9e90e58c11b7 ("xen: evtchn: Allow shared registration of IRQ handers")
87797fad6cce ("xen/events: replace evtchn_rwlock with RCU")
58f6259b7a08 ("xen/evtchn: Introduce new IOCTL to bind static evtchn")
04d684875b30 ("xen: xen_debug_interrupt prototype to global header")
073352e951f6 ("genirq: Add and use an irq_data_update_affinity helper")
961343d78226 ("genirq: Refactor accessors to use irq_data_get_affinity_mask")
83f877a09516 ("xen/events: remove redundant initialization of variable irq")
3de218ff39b9 ("xen/events: reset active flag for lateeoi events later")
d120198bd5ff ("xen/evtchn: Change irq_info lock to raw_spinlock_t")
b6622798bc50 ("xen/events: avoid handling the same event on two cpus at the same time")
25da4618af24 ("xen/events: don't unmask an event channel when an eoi is pending")
06f45fe96fcd ("xen/events: add per-xenbus device event statistics and settings")
f2fa0e5e9f31 ("xen/events: link interdomain events to associated xenbus device")
88f0a9d06644 ("xen/events: Implement irq distribution")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From fa765c4b4aed2d64266b694520ecb025c862c5a9 Mon Sep 17 00:00:00 2001
From: Maximilian Heyne <mheyne(a)amazon.de>
Date: Wed, 24 Jan 2024 16:31:28 +0000
Subject: [PATCH] xen/events: close evtchn after mapping cleanup
shutdown_pirq and startup_pirq are not taking the
irq_mapping_update_lock because they can't due to lock inversion. Both
are called with the irq_desc->lock being taking. The lock order,
however, is first irq_mapping_update_lock and then irq_desc->lock.
This opens multiple races:
- shutdown_pirq can be interrupted by a function that allocates an event
channel:
CPU0 CPU1
shutdown_pirq {
xen_evtchn_close(e)
__startup_pirq {
EVTCHNOP_bind_pirq
-> returns just freed evtchn e
set_evtchn_to_irq(e, irq)
}
xen_irq_info_cleanup() {
set_evtchn_to_irq(e, -1)
}
}
Assume here event channel e refers here to the same event channel
number.
After this race the evtchn_to_irq mapping for e is invalid (-1).
- __startup_pirq races with __unbind_from_irq in a similar way. Because
__startup_pirq doesn't take irq_mapping_update_lock it can grab the
evtchn that __unbind_from_irq is currently freeing and cleaning up. In
this case even though the event channel is allocated, its mapping can
be unset in evtchn_to_irq.
The fix is to first cleanup the mappings and then close the event
channel. In this way, when an event channel gets allocated it's
potential previous evtchn_to_irq mappings are guaranteed to be unset already.
This is also the reverse order of the allocation where first the event
channel is allocated and then the mappings are setup.
On a 5.10 kernel prior to commit 3fcdaf3d7634 ("xen/events: modify internal
[un]bind interfaces"), we hit a BUG like the following during probing of NVMe
devices. The issue is that during nvme_setup_io_queues, pci_free_irq
is called for every device which results in a call to shutdown_pirq.
With many nvme devices it's therefore likely to hit this race during
boot because there will be multiple calls to shutdown_pirq and
startup_pirq are running potentially in parallel.
------------[ cut here ]------------
blkfront: xvda: barrier or flush: disabled; persistent grants: enabled; indirect descriptors: enabled; bounce buffer: enabled
kernel BUG at drivers/xen/events/events_base.c:499!
invalid opcode: 0000 [#1] SMP PTI
CPU: 44 PID: 375 Comm: kworker/u257:23 Not tainted 5.10.201-191.748.amzn2.x86_64 #1
Hardware name: Xen HVM domU, BIOS 4.11.amazon 08/24/2006
Workqueue: nvme-reset-wq nvme_reset_work
RIP: 0010:bind_evtchn_to_cpu+0xdf/0xf0
Code: 5d 41 5e c3 cc cc cc cc 44 89 f7 e8 2b 55 ad ff 49 89 c5 48 85 c0 0f 84 64 ff ff ff 4c 8b 68 30 41 83 fe ff 0f 85 60 ff ff ff <0f> 0b 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 0f 1f 44 00 00
RSP: 0000:ffffc9000d533b08 EFLAGS: 00010046
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000006
RDX: 0000000000000028 RSI: 00000000ffffffff RDI: 00000000ffffffff
RBP: ffff888107419680 R08: 0000000000000000 R09: ffffffff82d72b00
R10: 0000000000000000 R11: 0000000000000000 R12: 00000000000001ed
R13: 0000000000000000 R14: 00000000ffffffff R15: 0000000000000002
FS: 0000000000000000(0000) GS:ffff88bc8b500000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000000 CR3: 0000000002610001 CR4: 00000000001706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
? show_trace_log_lvl+0x1c1/0x2d9
? show_trace_log_lvl+0x1c1/0x2d9
? set_affinity_irq+0xdc/0x1c0
? __die_body.cold+0x8/0xd
? die+0x2b/0x50
? do_trap+0x90/0x110
? bind_evtchn_to_cpu+0xdf/0xf0
? do_error_trap+0x65/0x80
? bind_evtchn_to_cpu+0xdf/0xf0
? exc_invalid_op+0x4e/0x70
? bind_evtchn_to_cpu+0xdf/0xf0
? asm_exc_invalid_op+0x12/0x20
? bind_evtchn_to_cpu+0xdf/0xf0
? bind_evtchn_to_cpu+0xc5/0xf0
set_affinity_irq+0xdc/0x1c0
irq_do_set_affinity+0x1d7/0x1f0
irq_setup_affinity+0xd6/0x1a0
irq_startup+0x8a/0xf0
__setup_irq+0x639/0x6d0
? nvme_suspend+0x150/0x150
request_threaded_irq+0x10c/0x180
? nvme_suspend+0x150/0x150
pci_request_irq+0xa8/0xf0
? __blk_mq_free_request+0x74/0xa0
queue_request_irq+0x6f/0x80
nvme_create_queue+0x1af/0x200
nvme_create_io_queues+0xbd/0xf0
nvme_setup_io_queues+0x246/0x320
? nvme_irq_check+0x30/0x30
nvme_reset_work+0x1c8/0x400
process_one_work+0x1b0/0x350
worker_thread+0x49/0x310
? process_one_work+0x350/0x350
kthread+0x11b/0x140
? __kthread_bind_mask+0x60/0x60
ret_from_fork+0x22/0x30
Modules linked in:
---[ end trace a11715de1eee1873 ]---
Fixes: d46a78b05c0e ("xen: implement pirq type event channels")
Cc: stable(a)vger.kernel.org
Co-debugged-by: Andrew Panyakin <apanyaki(a)amazon.com>
Signed-off-by: Maximilian Heyne <mheyne(a)amazon.de>
Reviewed-by: Juergen Gross <jgross(a)suse.com>
Link: https://lore.kernel.org/r/20240124163130.31324-1-mheyne@amazon.de
Signed-off-by: Juergen Gross <jgross(a)suse.com>
diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c
index b8cfea7812d6..3b9f080109d7 100644
--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -923,8 +923,8 @@ static void shutdown_pirq(struct irq_data *data)
return;
do_mask(info, EVT_MASK_REASON_EXPLICIT);
- xen_evtchn_close(evtchn);
xen_irq_info_cleanup(info);
+ xen_evtchn_close(evtchn);
}
static void enable_pirq(struct irq_data *data)
@@ -956,6 +956,7 @@ EXPORT_SYMBOL_GPL(xen_irq_from_gsi);
static void __unbind_from_irq(struct irq_info *info, unsigned int irq)
{
evtchn_port_t evtchn;
+ bool close_evtchn = false;
if (!info) {
xen_irq_free_desc(irq);
@@ -975,7 +976,7 @@ static void __unbind_from_irq(struct irq_info *info, unsigned int irq)
struct xenbus_device *dev;
if (!info->is_static)
- xen_evtchn_close(evtchn);
+ close_evtchn = true;
switch (info->type) {
case IRQT_VIRQ:
@@ -995,6 +996,9 @@ static void __unbind_from_irq(struct irq_info *info, unsigned int irq)
}
xen_irq_info_cleanup(info);
+
+ if (close_evtchn)
+ xen_evtchn_close(evtchn);
}
xen_free_irq(info);
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x fa765c4b4aed2d64266b694520ecb025c862c5a9
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021911-king-sugar-1024@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
fa765c4b4aed ("xen/events: close evtchn after mapping cleanup")
3fcdaf3d7634 ("xen/events: modify internal [un]bind interfaces")
5dd9ad32d775 ("xen/events: drop xen_allocate_irqs_dynamic()")
3bdb0ac350fe ("xen/events: remove some simple helpers from events_base.c")
686464514fbe ("xen/events: reduce externally visible helper functions")
e64e7c74b99e ("xen/events: avoid using info_for_irq() in xen_send_IPI_one()")
9e90e58c11b7 ("xen: evtchn: Allow shared registration of IRQ handers")
87797fad6cce ("xen/events: replace evtchn_rwlock with RCU")
58f6259b7a08 ("xen/evtchn: Introduce new IOCTL to bind static evtchn")
04d684875b30 ("xen: xen_debug_interrupt prototype to global header")
073352e951f6 ("genirq: Add and use an irq_data_update_affinity helper")
961343d78226 ("genirq: Refactor accessors to use irq_data_get_affinity_mask")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From fa765c4b4aed2d64266b694520ecb025c862c5a9 Mon Sep 17 00:00:00 2001
From: Maximilian Heyne <mheyne(a)amazon.de>
Date: Wed, 24 Jan 2024 16:31:28 +0000
Subject: [PATCH] xen/events: close evtchn after mapping cleanup
shutdown_pirq and startup_pirq are not taking the
irq_mapping_update_lock because they can't due to lock inversion. Both
are called with the irq_desc->lock being taking. The lock order,
however, is first irq_mapping_update_lock and then irq_desc->lock.
This opens multiple races:
- shutdown_pirq can be interrupted by a function that allocates an event
channel:
CPU0 CPU1
shutdown_pirq {
xen_evtchn_close(e)
__startup_pirq {
EVTCHNOP_bind_pirq
-> returns just freed evtchn e
set_evtchn_to_irq(e, irq)
}
xen_irq_info_cleanup() {
set_evtchn_to_irq(e, -1)
}
}
Assume here event channel e refers here to the same event channel
number.
After this race the evtchn_to_irq mapping for e is invalid (-1).
- __startup_pirq races with __unbind_from_irq in a similar way. Because
__startup_pirq doesn't take irq_mapping_update_lock it can grab the
evtchn that __unbind_from_irq is currently freeing and cleaning up. In
this case even though the event channel is allocated, its mapping can
be unset in evtchn_to_irq.
The fix is to first cleanup the mappings and then close the event
channel. In this way, when an event channel gets allocated it's
potential previous evtchn_to_irq mappings are guaranteed to be unset already.
This is also the reverse order of the allocation where first the event
channel is allocated and then the mappings are setup.
On a 5.10 kernel prior to commit 3fcdaf3d7634 ("xen/events: modify internal
[un]bind interfaces"), we hit a BUG like the following during probing of NVMe
devices. The issue is that during nvme_setup_io_queues, pci_free_irq
is called for every device which results in a call to shutdown_pirq.
With many nvme devices it's therefore likely to hit this race during
boot because there will be multiple calls to shutdown_pirq and
startup_pirq are running potentially in parallel.
------------[ cut here ]------------
blkfront: xvda: barrier or flush: disabled; persistent grants: enabled; indirect descriptors: enabled; bounce buffer: enabled
kernel BUG at drivers/xen/events/events_base.c:499!
invalid opcode: 0000 [#1] SMP PTI
CPU: 44 PID: 375 Comm: kworker/u257:23 Not tainted 5.10.201-191.748.amzn2.x86_64 #1
Hardware name: Xen HVM domU, BIOS 4.11.amazon 08/24/2006
Workqueue: nvme-reset-wq nvme_reset_work
RIP: 0010:bind_evtchn_to_cpu+0xdf/0xf0
Code: 5d 41 5e c3 cc cc cc cc 44 89 f7 e8 2b 55 ad ff 49 89 c5 48 85 c0 0f 84 64 ff ff ff 4c 8b 68 30 41 83 fe ff 0f 85 60 ff ff ff <0f> 0b 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 0f 1f 44 00 00
RSP: 0000:ffffc9000d533b08 EFLAGS: 00010046
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000006
RDX: 0000000000000028 RSI: 00000000ffffffff RDI: 00000000ffffffff
RBP: ffff888107419680 R08: 0000000000000000 R09: ffffffff82d72b00
R10: 0000000000000000 R11: 0000000000000000 R12: 00000000000001ed
R13: 0000000000000000 R14: 00000000ffffffff R15: 0000000000000002
FS: 0000000000000000(0000) GS:ffff88bc8b500000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000000 CR3: 0000000002610001 CR4: 00000000001706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
? show_trace_log_lvl+0x1c1/0x2d9
? show_trace_log_lvl+0x1c1/0x2d9
? set_affinity_irq+0xdc/0x1c0
? __die_body.cold+0x8/0xd
? die+0x2b/0x50
? do_trap+0x90/0x110
? bind_evtchn_to_cpu+0xdf/0xf0
? do_error_trap+0x65/0x80
? bind_evtchn_to_cpu+0xdf/0xf0
? exc_invalid_op+0x4e/0x70
? bind_evtchn_to_cpu+0xdf/0xf0
? asm_exc_invalid_op+0x12/0x20
? bind_evtchn_to_cpu+0xdf/0xf0
? bind_evtchn_to_cpu+0xc5/0xf0
set_affinity_irq+0xdc/0x1c0
irq_do_set_affinity+0x1d7/0x1f0
irq_setup_affinity+0xd6/0x1a0
irq_startup+0x8a/0xf0
__setup_irq+0x639/0x6d0
? nvme_suspend+0x150/0x150
request_threaded_irq+0x10c/0x180
? nvme_suspend+0x150/0x150
pci_request_irq+0xa8/0xf0
? __blk_mq_free_request+0x74/0xa0
queue_request_irq+0x6f/0x80
nvme_create_queue+0x1af/0x200
nvme_create_io_queues+0xbd/0xf0
nvme_setup_io_queues+0x246/0x320
? nvme_irq_check+0x30/0x30
nvme_reset_work+0x1c8/0x400
process_one_work+0x1b0/0x350
worker_thread+0x49/0x310
? process_one_work+0x350/0x350
kthread+0x11b/0x140
? __kthread_bind_mask+0x60/0x60
ret_from_fork+0x22/0x30
Modules linked in:
---[ end trace a11715de1eee1873 ]---
Fixes: d46a78b05c0e ("xen: implement pirq type event channels")
Cc: stable(a)vger.kernel.org
Co-debugged-by: Andrew Panyakin <apanyaki(a)amazon.com>
Signed-off-by: Maximilian Heyne <mheyne(a)amazon.de>
Reviewed-by: Juergen Gross <jgross(a)suse.com>
Link: https://lore.kernel.org/r/20240124163130.31324-1-mheyne@amazon.de
Signed-off-by: Juergen Gross <jgross(a)suse.com>
diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c
index b8cfea7812d6..3b9f080109d7 100644
--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -923,8 +923,8 @@ static void shutdown_pirq(struct irq_data *data)
return;
do_mask(info, EVT_MASK_REASON_EXPLICIT);
- xen_evtchn_close(evtchn);
xen_irq_info_cleanup(info);
+ xen_evtchn_close(evtchn);
}
static void enable_pirq(struct irq_data *data)
@@ -956,6 +956,7 @@ EXPORT_SYMBOL_GPL(xen_irq_from_gsi);
static void __unbind_from_irq(struct irq_info *info, unsigned int irq)
{
evtchn_port_t evtchn;
+ bool close_evtchn = false;
if (!info) {
xen_irq_free_desc(irq);
@@ -975,7 +976,7 @@ static void __unbind_from_irq(struct irq_info *info, unsigned int irq)
struct xenbus_device *dev;
if (!info->is_static)
- xen_evtchn_close(evtchn);
+ close_evtchn = true;
switch (info->type) {
case IRQT_VIRQ:
@@ -995,6 +996,9 @@ static void __unbind_from_irq(struct irq_info *info, unsigned int irq)
}
xen_irq_info_cleanup(info);
+
+ if (close_evtchn)
+ xen_evtchn_close(evtchn);
}
xen_free_irq(info);
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x fa765c4b4aed2d64266b694520ecb025c862c5a9
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021908-polymer-backyard-ce35@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
fa765c4b4aed ("xen/events: close evtchn after mapping cleanup")
3fcdaf3d7634 ("xen/events: modify internal [un]bind interfaces")
5dd9ad32d775 ("xen/events: drop xen_allocate_irqs_dynamic()")
3bdb0ac350fe ("xen/events: remove some simple helpers from events_base.c")
686464514fbe ("xen/events: reduce externally visible helper functions")
e64e7c74b99e ("xen/events: avoid using info_for_irq() in xen_send_IPI_one()")
9e90e58c11b7 ("xen: evtchn: Allow shared registration of IRQ handers")
87797fad6cce ("xen/events: replace evtchn_rwlock with RCU")
58f6259b7a08 ("xen/evtchn: Introduce new IOCTL to bind static evtchn")
04d684875b30 ("xen: xen_debug_interrupt prototype to global header")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From fa765c4b4aed2d64266b694520ecb025c862c5a9 Mon Sep 17 00:00:00 2001
From: Maximilian Heyne <mheyne(a)amazon.de>
Date: Wed, 24 Jan 2024 16:31:28 +0000
Subject: [PATCH] xen/events: close evtchn after mapping cleanup
shutdown_pirq and startup_pirq are not taking the
irq_mapping_update_lock because they can't due to lock inversion. Both
are called with the irq_desc->lock being taking. The lock order,
however, is first irq_mapping_update_lock and then irq_desc->lock.
This opens multiple races:
- shutdown_pirq can be interrupted by a function that allocates an event
channel:
CPU0 CPU1
shutdown_pirq {
xen_evtchn_close(e)
__startup_pirq {
EVTCHNOP_bind_pirq
-> returns just freed evtchn e
set_evtchn_to_irq(e, irq)
}
xen_irq_info_cleanup() {
set_evtchn_to_irq(e, -1)
}
}
Assume here event channel e refers here to the same event channel
number.
After this race the evtchn_to_irq mapping for e is invalid (-1).
- __startup_pirq races with __unbind_from_irq in a similar way. Because
__startup_pirq doesn't take irq_mapping_update_lock it can grab the
evtchn that __unbind_from_irq is currently freeing and cleaning up. In
this case even though the event channel is allocated, its mapping can
be unset in evtchn_to_irq.
The fix is to first cleanup the mappings and then close the event
channel. In this way, when an event channel gets allocated it's
potential previous evtchn_to_irq mappings are guaranteed to be unset already.
This is also the reverse order of the allocation where first the event
channel is allocated and then the mappings are setup.
On a 5.10 kernel prior to commit 3fcdaf3d7634 ("xen/events: modify internal
[un]bind interfaces"), we hit a BUG like the following during probing of NVMe
devices. The issue is that during nvme_setup_io_queues, pci_free_irq
is called for every device which results in a call to shutdown_pirq.
With many nvme devices it's therefore likely to hit this race during
boot because there will be multiple calls to shutdown_pirq and
startup_pirq are running potentially in parallel.
------------[ cut here ]------------
blkfront: xvda: barrier or flush: disabled; persistent grants: enabled; indirect descriptors: enabled; bounce buffer: enabled
kernel BUG at drivers/xen/events/events_base.c:499!
invalid opcode: 0000 [#1] SMP PTI
CPU: 44 PID: 375 Comm: kworker/u257:23 Not tainted 5.10.201-191.748.amzn2.x86_64 #1
Hardware name: Xen HVM domU, BIOS 4.11.amazon 08/24/2006
Workqueue: nvme-reset-wq nvme_reset_work
RIP: 0010:bind_evtchn_to_cpu+0xdf/0xf0
Code: 5d 41 5e c3 cc cc cc cc 44 89 f7 e8 2b 55 ad ff 49 89 c5 48 85 c0 0f 84 64 ff ff ff 4c 8b 68 30 41 83 fe ff 0f 85 60 ff ff ff <0f> 0b 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 0f 1f 44 00 00
RSP: 0000:ffffc9000d533b08 EFLAGS: 00010046
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000006
RDX: 0000000000000028 RSI: 00000000ffffffff RDI: 00000000ffffffff
RBP: ffff888107419680 R08: 0000000000000000 R09: ffffffff82d72b00
R10: 0000000000000000 R11: 0000000000000000 R12: 00000000000001ed
R13: 0000000000000000 R14: 00000000ffffffff R15: 0000000000000002
FS: 0000000000000000(0000) GS:ffff88bc8b500000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000000 CR3: 0000000002610001 CR4: 00000000001706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
? show_trace_log_lvl+0x1c1/0x2d9
? show_trace_log_lvl+0x1c1/0x2d9
? set_affinity_irq+0xdc/0x1c0
? __die_body.cold+0x8/0xd
? die+0x2b/0x50
? do_trap+0x90/0x110
? bind_evtchn_to_cpu+0xdf/0xf0
? do_error_trap+0x65/0x80
? bind_evtchn_to_cpu+0xdf/0xf0
? exc_invalid_op+0x4e/0x70
? bind_evtchn_to_cpu+0xdf/0xf0
? asm_exc_invalid_op+0x12/0x20
? bind_evtchn_to_cpu+0xdf/0xf0
? bind_evtchn_to_cpu+0xc5/0xf0
set_affinity_irq+0xdc/0x1c0
irq_do_set_affinity+0x1d7/0x1f0
irq_setup_affinity+0xd6/0x1a0
irq_startup+0x8a/0xf0
__setup_irq+0x639/0x6d0
? nvme_suspend+0x150/0x150
request_threaded_irq+0x10c/0x180
? nvme_suspend+0x150/0x150
pci_request_irq+0xa8/0xf0
? __blk_mq_free_request+0x74/0xa0
queue_request_irq+0x6f/0x80
nvme_create_queue+0x1af/0x200
nvme_create_io_queues+0xbd/0xf0
nvme_setup_io_queues+0x246/0x320
? nvme_irq_check+0x30/0x30
nvme_reset_work+0x1c8/0x400
process_one_work+0x1b0/0x350
worker_thread+0x49/0x310
? process_one_work+0x350/0x350
kthread+0x11b/0x140
? __kthread_bind_mask+0x60/0x60
ret_from_fork+0x22/0x30
Modules linked in:
---[ end trace a11715de1eee1873 ]---
Fixes: d46a78b05c0e ("xen: implement pirq type event channels")
Cc: stable(a)vger.kernel.org
Co-debugged-by: Andrew Panyakin <apanyaki(a)amazon.com>
Signed-off-by: Maximilian Heyne <mheyne(a)amazon.de>
Reviewed-by: Juergen Gross <jgross(a)suse.com>
Link: https://lore.kernel.org/r/20240124163130.31324-1-mheyne@amazon.de
Signed-off-by: Juergen Gross <jgross(a)suse.com>
diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c
index b8cfea7812d6..3b9f080109d7 100644
--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -923,8 +923,8 @@ static void shutdown_pirq(struct irq_data *data)
return;
do_mask(info, EVT_MASK_REASON_EXPLICIT);
- xen_evtchn_close(evtchn);
xen_irq_info_cleanup(info);
+ xen_evtchn_close(evtchn);
}
static void enable_pirq(struct irq_data *data)
@@ -956,6 +956,7 @@ EXPORT_SYMBOL_GPL(xen_irq_from_gsi);
static void __unbind_from_irq(struct irq_info *info, unsigned int irq)
{
evtchn_port_t evtchn;
+ bool close_evtchn = false;
if (!info) {
xen_irq_free_desc(irq);
@@ -975,7 +976,7 @@ static void __unbind_from_irq(struct irq_info *info, unsigned int irq)
struct xenbus_device *dev;
if (!info->is_static)
- xen_evtchn_close(evtchn);
+ close_evtchn = true;
switch (info->type) {
case IRQT_VIRQ:
@@ -995,6 +996,9 @@ static void __unbind_from_irq(struct irq_info *info, unsigned int irq)
}
xen_irq_info_cleanup(info);
+
+ if (close_evtchn)
+ xen_evtchn_close(evtchn);
}
xen_free_irq(info);
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x fa765c4b4aed2d64266b694520ecb025c862c5a9
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021905-anger-exclusion-a2ae@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
fa765c4b4aed ("xen/events: close evtchn after mapping cleanup")
3fcdaf3d7634 ("xen/events: modify internal [un]bind interfaces")
5dd9ad32d775 ("xen/events: drop xen_allocate_irqs_dynamic()")
3bdb0ac350fe ("xen/events: remove some simple helpers from events_base.c")
686464514fbe ("xen/events: reduce externally visible helper functions")
e64e7c74b99e ("xen/events: avoid using info_for_irq() in xen_send_IPI_one()")
9e90e58c11b7 ("xen: evtchn: Allow shared registration of IRQ handers")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From fa765c4b4aed2d64266b694520ecb025c862c5a9 Mon Sep 17 00:00:00 2001
From: Maximilian Heyne <mheyne(a)amazon.de>
Date: Wed, 24 Jan 2024 16:31:28 +0000
Subject: [PATCH] xen/events: close evtchn after mapping cleanup
shutdown_pirq and startup_pirq are not taking the
irq_mapping_update_lock because they can't due to lock inversion. Both
are called with the irq_desc->lock being taking. The lock order,
however, is first irq_mapping_update_lock and then irq_desc->lock.
This opens multiple races:
- shutdown_pirq can be interrupted by a function that allocates an event
channel:
CPU0 CPU1
shutdown_pirq {
xen_evtchn_close(e)
__startup_pirq {
EVTCHNOP_bind_pirq
-> returns just freed evtchn e
set_evtchn_to_irq(e, irq)
}
xen_irq_info_cleanup() {
set_evtchn_to_irq(e, -1)
}
}
Assume here event channel e refers here to the same event channel
number.
After this race the evtchn_to_irq mapping for e is invalid (-1).
- __startup_pirq races with __unbind_from_irq in a similar way. Because
__startup_pirq doesn't take irq_mapping_update_lock it can grab the
evtchn that __unbind_from_irq is currently freeing and cleaning up. In
this case even though the event channel is allocated, its mapping can
be unset in evtchn_to_irq.
The fix is to first cleanup the mappings and then close the event
channel. In this way, when an event channel gets allocated it's
potential previous evtchn_to_irq mappings are guaranteed to be unset already.
This is also the reverse order of the allocation where first the event
channel is allocated and then the mappings are setup.
On a 5.10 kernel prior to commit 3fcdaf3d7634 ("xen/events: modify internal
[un]bind interfaces"), we hit a BUG like the following during probing of NVMe
devices. The issue is that during nvme_setup_io_queues, pci_free_irq
is called for every device which results in a call to shutdown_pirq.
With many nvme devices it's therefore likely to hit this race during
boot because there will be multiple calls to shutdown_pirq and
startup_pirq are running potentially in parallel.
------------[ cut here ]------------
blkfront: xvda: barrier or flush: disabled; persistent grants: enabled; indirect descriptors: enabled; bounce buffer: enabled
kernel BUG at drivers/xen/events/events_base.c:499!
invalid opcode: 0000 [#1] SMP PTI
CPU: 44 PID: 375 Comm: kworker/u257:23 Not tainted 5.10.201-191.748.amzn2.x86_64 #1
Hardware name: Xen HVM domU, BIOS 4.11.amazon 08/24/2006
Workqueue: nvme-reset-wq nvme_reset_work
RIP: 0010:bind_evtchn_to_cpu+0xdf/0xf0
Code: 5d 41 5e c3 cc cc cc cc 44 89 f7 e8 2b 55 ad ff 49 89 c5 48 85 c0 0f 84 64 ff ff ff 4c 8b 68 30 41 83 fe ff 0f 85 60 ff ff ff <0f> 0b 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 0f 1f 44 00 00
RSP: 0000:ffffc9000d533b08 EFLAGS: 00010046
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000006
RDX: 0000000000000028 RSI: 00000000ffffffff RDI: 00000000ffffffff
RBP: ffff888107419680 R08: 0000000000000000 R09: ffffffff82d72b00
R10: 0000000000000000 R11: 0000000000000000 R12: 00000000000001ed
R13: 0000000000000000 R14: 00000000ffffffff R15: 0000000000000002
FS: 0000000000000000(0000) GS:ffff88bc8b500000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000000 CR3: 0000000002610001 CR4: 00000000001706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
? show_trace_log_lvl+0x1c1/0x2d9
? show_trace_log_lvl+0x1c1/0x2d9
? set_affinity_irq+0xdc/0x1c0
? __die_body.cold+0x8/0xd
? die+0x2b/0x50
? do_trap+0x90/0x110
? bind_evtchn_to_cpu+0xdf/0xf0
? do_error_trap+0x65/0x80
? bind_evtchn_to_cpu+0xdf/0xf0
? exc_invalid_op+0x4e/0x70
? bind_evtchn_to_cpu+0xdf/0xf0
? asm_exc_invalid_op+0x12/0x20
? bind_evtchn_to_cpu+0xdf/0xf0
? bind_evtchn_to_cpu+0xc5/0xf0
set_affinity_irq+0xdc/0x1c0
irq_do_set_affinity+0x1d7/0x1f0
irq_setup_affinity+0xd6/0x1a0
irq_startup+0x8a/0xf0
__setup_irq+0x639/0x6d0
? nvme_suspend+0x150/0x150
request_threaded_irq+0x10c/0x180
? nvme_suspend+0x150/0x150
pci_request_irq+0xa8/0xf0
? __blk_mq_free_request+0x74/0xa0
queue_request_irq+0x6f/0x80
nvme_create_queue+0x1af/0x200
nvme_create_io_queues+0xbd/0xf0
nvme_setup_io_queues+0x246/0x320
? nvme_irq_check+0x30/0x30
nvme_reset_work+0x1c8/0x400
process_one_work+0x1b0/0x350
worker_thread+0x49/0x310
? process_one_work+0x350/0x350
kthread+0x11b/0x140
? __kthread_bind_mask+0x60/0x60
ret_from_fork+0x22/0x30
Modules linked in:
---[ end trace a11715de1eee1873 ]---
Fixes: d46a78b05c0e ("xen: implement pirq type event channels")
Cc: stable(a)vger.kernel.org
Co-debugged-by: Andrew Panyakin <apanyaki(a)amazon.com>
Signed-off-by: Maximilian Heyne <mheyne(a)amazon.de>
Reviewed-by: Juergen Gross <jgross(a)suse.com>
Link: https://lore.kernel.org/r/20240124163130.31324-1-mheyne@amazon.de
Signed-off-by: Juergen Gross <jgross(a)suse.com>
diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c
index b8cfea7812d6..3b9f080109d7 100644
--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -923,8 +923,8 @@ static void shutdown_pirq(struct irq_data *data)
return;
do_mask(info, EVT_MASK_REASON_EXPLICIT);
- xen_evtchn_close(evtchn);
xen_irq_info_cleanup(info);
+ xen_evtchn_close(evtchn);
}
static void enable_pirq(struct irq_data *data)
@@ -956,6 +956,7 @@ EXPORT_SYMBOL_GPL(xen_irq_from_gsi);
static void __unbind_from_irq(struct irq_info *info, unsigned int irq)
{
evtchn_port_t evtchn;
+ bool close_evtchn = false;
if (!info) {
xen_irq_free_desc(irq);
@@ -975,7 +976,7 @@ static void __unbind_from_irq(struct irq_info *info, unsigned int irq)
struct xenbus_device *dev;
if (!info->is_static)
- xen_evtchn_close(evtchn);
+ close_evtchn = true;
switch (info->type) {
case IRQT_VIRQ:
@@ -995,6 +996,9 @@ static void __unbind_from_irq(struct irq_info *info, unsigned int irq)
}
xen_irq_info_cleanup(info);
+
+ if (close_evtchn)
+ xen_evtchn_close(evtchn);
}
xen_free_irq(info);
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x f814bdda774c183b0cc15ec8f3b6e7c6f4527ba5
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021925-onscreen-cancel-9be1@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
f814bdda774c ("blk-wbt: Fix detection of dirty-throttled tasks")
ba91c849fa50 ("blk-rq-qos: store a gendisk instead of request_queue in struct rq_qos")
3963d84df797 ("blk-rq-qos: constify rq_qos_ops")
ce57b558604e ("blk-rq-qos: make rq_qos_add and rq_qos_del more useful")
b494f9c566ba ("blk-rq-qos: move rq_qos_add and rq_qos_del out of line")
de185b56e8a6 ("blk-cgroup: pass a gendisk to blkcg_schedule_throttle")
9df3e65139b9 ("blk-iocost: simplify ioc_name")
14a6e2eb7df5 ("block: don't allow the same type rq_qos add more than once")
5cf9c91ba927 ("block: serialize all debugfs operations using q->debugfs_mutex")
8a177a36da6c ("blk-iolatency: Fix inflight count imbalances and IO hangs on offline")
c97ab271576d ("blk-cgroup: remove unneeded includes from <linux/blk-cgroup.h>")
7f20ba7c42fd ("blk-cgroup: remove pointless CONFIG_BLOCK ifdefs")
bbb1ebe7a909 ("blk-cgroup: replace bio_blkcg with bio_blkcg_css")
dec223c92a46 ("blk-cgroup: move struct blkcg to block/blk-cgroup.h")
397c9f46ee4d ("blk-cgroup: move blkcg_{pin,unpin}_online out of line")
216889aad362 ("blk-cgroup: move blk_cgroup_congested out line")
d589ae0d4460 ("Merge tag 'for-5.18/block-2022-04-01' of git://git.kernel.dk/linux-block")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f814bdda774c183b0cc15ec8f3b6e7c6f4527ba5 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack(a)suse.cz>
Date: Tue, 23 Jan 2024 18:58:26 +0100
Subject: [PATCH] blk-wbt: Fix detection of dirty-throttled tasks
The detection of dirty-throttled tasks in blk-wbt has been subtly broken
since its beginning in 2016. Namely if we are doing cgroup writeback and
the throttled task is not in the root cgroup, balance_dirty_pages() will
set dirty_sleep for the non-root bdi_writeback structure. However
blk-wbt checks dirty_sleep only in the root cgroup bdi_writeback
structure. Thus detection of recently throttled tasks is not working in
this case (we noticed this when we switched to cgroup v2 and suddently
writeback was slow).
Since blk-wbt has no easy way to get to proper bdi_writeback and
furthermore its intention has always been to work on the whole device
rather than on individual cgroups, just move the dirty_sleep timestamp
from bdi_writeback to backing_dev_info. That fixes the checking for
recently throttled task and saves memory for everybody as a bonus.
CC: stable(a)vger.kernel.org
Fixes: b57d74aff9ab ("writeback: track if we're sleeping on progress in balance_dirty_pages()")
Signed-off-by: Jan Kara <jack(a)suse.cz>
Link: https://lore.kernel.org/r/20240123175826.21452-1-jack@suse.cz
[axboe: fixup indentation errors]
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 5ba3cd574eac..0c0e270a8265 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -163,9 +163,9 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
*/
static bool wb_recent_wait(struct rq_wb *rwb)
{
- struct bdi_writeback *wb = &rwb->rqos.disk->bdi->wb;
+ struct backing_dev_info *bdi = rwb->rqos.disk->bdi;
- return time_before(jiffies, wb->dirty_sleep + HZ);
+ return time_before(jiffies, bdi->last_bdp_sleep + HZ);
}
static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb,
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index ae12696ec492..2ad261082bba 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -141,8 +141,6 @@ struct bdi_writeback {
struct delayed_work dwork; /* work item used for writeback */
struct delayed_work bw_dwork; /* work item used for bandwidth estimate */
- unsigned long dirty_sleep; /* last wait */
-
struct list_head bdi_node; /* anchored at bdi->wb_list */
#ifdef CONFIG_CGROUP_WRITEBACK
@@ -179,6 +177,11 @@ struct backing_dev_info {
* any dirty wbs, which is depended upon by bdi_has_dirty().
*/
atomic_long_t tot_write_bandwidth;
+ /*
+ * Jiffies when last process was dirty throttled on this bdi. Used by
+ * blk-wbt.
+ */
+ unsigned long last_bdp_sleep;
struct bdi_writeback wb; /* the root writeback info for this bdi */
struct list_head wb_list; /* list of all wbs */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 1e3447bccdb1..e039d05304dd 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -436,7 +436,6 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
INIT_LIST_HEAD(&wb->work_list);
INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn);
- wb->dirty_sleep = jiffies;
err = fprop_local_init_percpu(&wb->completions, gfp);
if (err)
@@ -921,6 +920,7 @@ int bdi_init(struct backing_dev_info *bdi)
INIT_LIST_HEAD(&bdi->bdi_list);
INIT_LIST_HEAD(&bdi->wb_list);
init_waitqueue_head(&bdi->wb_waitq);
+ bdi->last_bdp_sleep = jiffies;
return cgwb_bdi_init(bdi);
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index cd4e4ae77c40..cc37fa7f3364 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1921,7 +1921,7 @@ static int balance_dirty_pages(struct bdi_writeback *wb,
break;
}
__set_current_state(TASK_KILLABLE);
- wb->dirty_sleep = now;
+ bdi->last_bdp_sleep = jiffies;
io_schedule_timeout(pause);
current->dirty_paused_when = now + pause;
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x f814bdda774c183b0cc15ec8f3b6e7c6f4527ba5
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021924-handcart-displease-1607@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
f814bdda774c ("blk-wbt: Fix detection of dirty-throttled tasks")
ba91c849fa50 ("blk-rq-qos: store a gendisk instead of request_queue in struct rq_qos")
3963d84df797 ("blk-rq-qos: constify rq_qos_ops")
ce57b558604e ("blk-rq-qos: make rq_qos_add and rq_qos_del more useful")
b494f9c566ba ("blk-rq-qos: move rq_qos_add and rq_qos_del out of line")
de185b56e8a6 ("blk-cgroup: pass a gendisk to blkcg_schedule_throttle")
9df3e65139b9 ("blk-iocost: simplify ioc_name")
14a6e2eb7df5 ("block: don't allow the same type rq_qos add more than once")
5cf9c91ba927 ("block: serialize all debugfs operations using q->debugfs_mutex")
8a177a36da6c ("blk-iolatency: Fix inflight count imbalances and IO hangs on offline")
c97ab271576d ("blk-cgroup: remove unneeded includes from <linux/blk-cgroup.h>")
7f20ba7c42fd ("blk-cgroup: remove pointless CONFIG_BLOCK ifdefs")
bbb1ebe7a909 ("blk-cgroup: replace bio_blkcg with bio_blkcg_css")
dec223c92a46 ("blk-cgroup: move struct blkcg to block/blk-cgroup.h")
397c9f46ee4d ("blk-cgroup: move blkcg_{pin,unpin}_online out of line")
216889aad362 ("blk-cgroup: move blk_cgroup_congested out line")
d589ae0d4460 ("Merge tag 'for-5.18/block-2022-04-01' of git://git.kernel.dk/linux-block")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f814bdda774c183b0cc15ec8f3b6e7c6f4527ba5 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack(a)suse.cz>
Date: Tue, 23 Jan 2024 18:58:26 +0100
Subject: [PATCH] blk-wbt: Fix detection of dirty-throttled tasks
The detection of dirty-throttled tasks in blk-wbt has been subtly broken
since its beginning in 2016. Namely if we are doing cgroup writeback and
the throttled task is not in the root cgroup, balance_dirty_pages() will
set dirty_sleep for the non-root bdi_writeback structure. However
blk-wbt checks dirty_sleep only in the root cgroup bdi_writeback
structure. Thus detection of recently throttled tasks is not working in
this case (we noticed this when we switched to cgroup v2 and suddently
writeback was slow).
Since blk-wbt has no easy way to get to proper bdi_writeback and
furthermore its intention has always been to work on the whole device
rather than on individual cgroups, just move the dirty_sleep timestamp
from bdi_writeback to backing_dev_info. That fixes the checking for
recently throttled task and saves memory for everybody as a bonus.
CC: stable(a)vger.kernel.org
Fixes: b57d74aff9ab ("writeback: track if we're sleeping on progress in balance_dirty_pages()")
Signed-off-by: Jan Kara <jack(a)suse.cz>
Link: https://lore.kernel.org/r/20240123175826.21452-1-jack@suse.cz
[axboe: fixup indentation errors]
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 5ba3cd574eac..0c0e270a8265 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -163,9 +163,9 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
*/
static bool wb_recent_wait(struct rq_wb *rwb)
{
- struct bdi_writeback *wb = &rwb->rqos.disk->bdi->wb;
+ struct backing_dev_info *bdi = rwb->rqos.disk->bdi;
- return time_before(jiffies, wb->dirty_sleep + HZ);
+ return time_before(jiffies, bdi->last_bdp_sleep + HZ);
}
static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb,
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index ae12696ec492..2ad261082bba 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -141,8 +141,6 @@ struct bdi_writeback {
struct delayed_work dwork; /* work item used for writeback */
struct delayed_work bw_dwork; /* work item used for bandwidth estimate */
- unsigned long dirty_sleep; /* last wait */
-
struct list_head bdi_node; /* anchored at bdi->wb_list */
#ifdef CONFIG_CGROUP_WRITEBACK
@@ -179,6 +177,11 @@ struct backing_dev_info {
* any dirty wbs, which is depended upon by bdi_has_dirty().
*/
atomic_long_t tot_write_bandwidth;
+ /*
+ * Jiffies when last process was dirty throttled on this bdi. Used by
+ * blk-wbt.
+ */
+ unsigned long last_bdp_sleep;
struct bdi_writeback wb; /* the root writeback info for this bdi */
struct list_head wb_list; /* list of all wbs */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 1e3447bccdb1..e039d05304dd 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -436,7 +436,6 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
INIT_LIST_HEAD(&wb->work_list);
INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn);
- wb->dirty_sleep = jiffies;
err = fprop_local_init_percpu(&wb->completions, gfp);
if (err)
@@ -921,6 +920,7 @@ int bdi_init(struct backing_dev_info *bdi)
INIT_LIST_HEAD(&bdi->bdi_list);
INIT_LIST_HEAD(&bdi->wb_list);
init_waitqueue_head(&bdi->wb_waitq);
+ bdi->last_bdp_sleep = jiffies;
return cgwb_bdi_init(bdi);
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index cd4e4ae77c40..cc37fa7f3364 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1921,7 +1921,7 @@ static int balance_dirty_pages(struct bdi_writeback *wb,
break;
}
__set_current_state(TASK_KILLABLE);
- wb->dirty_sleep = now;
+ bdi->last_bdp_sleep = jiffies;
io_schedule_timeout(pause);
current->dirty_paused_when = now + pause;
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x f814bdda774c183b0cc15ec8f3b6e7c6f4527ba5
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021923-onboard-pork-7d99@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
f814bdda774c ("blk-wbt: Fix detection of dirty-throttled tasks")
ba91c849fa50 ("blk-rq-qos: store a gendisk instead of request_queue in struct rq_qos")
3963d84df797 ("blk-rq-qos: constify rq_qos_ops")
ce57b558604e ("blk-rq-qos: make rq_qos_add and rq_qos_del more useful")
b494f9c566ba ("blk-rq-qos: move rq_qos_add and rq_qos_del out of line")
de185b56e8a6 ("blk-cgroup: pass a gendisk to blkcg_schedule_throttle")
9df3e65139b9 ("blk-iocost: simplify ioc_name")
14a6e2eb7df5 ("block: don't allow the same type rq_qos add more than once")
5cf9c91ba927 ("block: serialize all debugfs operations using q->debugfs_mutex")
8a177a36da6c ("blk-iolatency: Fix inflight count imbalances and IO hangs on offline")
c97ab271576d ("blk-cgroup: remove unneeded includes from <linux/blk-cgroup.h>")
7f20ba7c42fd ("blk-cgroup: remove pointless CONFIG_BLOCK ifdefs")
bbb1ebe7a909 ("blk-cgroup: replace bio_blkcg with bio_blkcg_css")
dec223c92a46 ("blk-cgroup: move struct blkcg to block/blk-cgroup.h")
397c9f46ee4d ("blk-cgroup: move blkcg_{pin,unpin}_online out of line")
216889aad362 ("blk-cgroup: move blk_cgroup_congested out line")
d589ae0d4460 ("Merge tag 'for-5.18/block-2022-04-01' of git://git.kernel.dk/linux-block")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f814bdda774c183b0cc15ec8f3b6e7c6f4527ba5 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack(a)suse.cz>
Date: Tue, 23 Jan 2024 18:58:26 +0100
Subject: [PATCH] blk-wbt: Fix detection of dirty-throttled tasks
The detection of dirty-throttled tasks in blk-wbt has been subtly broken
since its beginning in 2016. Namely if we are doing cgroup writeback and
the throttled task is not in the root cgroup, balance_dirty_pages() will
set dirty_sleep for the non-root bdi_writeback structure. However
blk-wbt checks dirty_sleep only in the root cgroup bdi_writeback
structure. Thus detection of recently throttled tasks is not working in
this case (we noticed this when we switched to cgroup v2 and suddently
writeback was slow).
Since blk-wbt has no easy way to get to proper bdi_writeback and
furthermore its intention has always been to work on the whole device
rather than on individual cgroups, just move the dirty_sleep timestamp
from bdi_writeback to backing_dev_info. That fixes the checking for
recently throttled task and saves memory for everybody as a bonus.
CC: stable(a)vger.kernel.org
Fixes: b57d74aff9ab ("writeback: track if we're sleeping on progress in balance_dirty_pages()")
Signed-off-by: Jan Kara <jack(a)suse.cz>
Link: https://lore.kernel.org/r/20240123175826.21452-1-jack@suse.cz
[axboe: fixup indentation errors]
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 5ba3cd574eac..0c0e270a8265 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -163,9 +163,9 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
*/
static bool wb_recent_wait(struct rq_wb *rwb)
{
- struct bdi_writeback *wb = &rwb->rqos.disk->bdi->wb;
+ struct backing_dev_info *bdi = rwb->rqos.disk->bdi;
- return time_before(jiffies, wb->dirty_sleep + HZ);
+ return time_before(jiffies, bdi->last_bdp_sleep + HZ);
}
static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb,
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index ae12696ec492..2ad261082bba 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -141,8 +141,6 @@ struct bdi_writeback {
struct delayed_work dwork; /* work item used for writeback */
struct delayed_work bw_dwork; /* work item used for bandwidth estimate */
- unsigned long dirty_sleep; /* last wait */
-
struct list_head bdi_node; /* anchored at bdi->wb_list */
#ifdef CONFIG_CGROUP_WRITEBACK
@@ -179,6 +177,11 @@ struct backing_dev_info {
* any dirty wbs, which is depended upon by bdi_has_dirty().
*/
atomic_long_t tot_write_bandwidth;
+ /*
+ * Jiffies when last process was dirty throttled on this bdi. Used by
+ * blk-wbt.
+ */
+ unsigned long last_bdp_sleep;
struct bdi_writeback wb; /* the root writeback info for this bdi */
struct list_head wb_list; /* list of all wbs */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 1e3447bccdb1..e039d05304dd 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -436,7 +436,6 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
INIT_LIST_HEAD(&wb->work_list);
INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn);
- wb->dirty_sleep = jiffies;
err = fprop_local_init_percpu(&wb->completions, gfp);
if (err)
@@ -921,6 +920,7 @@ int bdi_init(struct backing_dev_info *bdi)
INIT_LIST_HEAD(&bdi->bdi_list);
INIT_LIST_HEAD(&bdi->wb_list);
init_waitqueue_head(&bdi->wb_waitq);
+ bdi->last_bdp_sleep = jiffies;
return cgwb_bdi_init(bdi);
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index cd4e4ae77c40..cc37fa7f3364 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1921,7 +1921,7 @@ static int balance_dirty_pages(struct bdi_writeback *wb,
break;
}
__set_current_state(TASK_KILLABLE);
- wb->dirty_sleep = now;
+ bdi->last_bdp_sleep = jiffies;
io_schedule_timeout(pause);
current->dirty_paused_when = now + pause;
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x f814bdda774c183b0cc15ec8f3b6e7c6f4527ba5
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021921-unspoiled-despite-59cc@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
f814bdda774c ("blk-wbt: Fix detection of dirty-throttled tasks")
ba91c849fa50 ("blk-rq-qos: store a gendisk instead of request_queue in struct rq_qos")
3963d84df797 ("blk-rq-qos: constify rq_qos_ops")
ce57b558604e ("blk-rq-qos: make rq_qos_add and rq_qos_del more useful")
b494f9c566ba ("blk-rq-qos: move rq_qos_add and rq_qos_del out of line")
de185b56e8a6 ("blk-cgroup: pass a gendisk to blkcg_schedule_throttle")
9df3e65139b9 ("blk-iocost: simplify ioc_name")
14a6e2eb7df5 ("block: don't allow the same type rq_qos add more than once")
5cf9c91ba927 ("block: serialize all debugfs operations using q->debugfs_mutex")
8a177a36da6c ("blk-iolatency: Fix inflight count imbalances and IO hangs on offline")
c97ab271576d ("blk-cgroup: remove unneeded includes from <linux/blk-cgroup.h>")
7f20ba7c42fd ("blk-cgroup: remove pointless CONFIG_BLOCK ifdefs")
bbb1ebe7a909 ("blk-cgroup: replace bio_blkcg with bio_blkcg_css")
dec223c92a46 ("blk-cgroup: move struct blkcg to block/blk-cgroup.h")
397c9f46ee4d ("blk-cgroup: move blkcg_{pin,unpin}_online out of line")
216889aad362 ("blk-cgroup: move blk_cgroup_congested out line")
d589ae0d4460 ("Merge tag 'for-5.18/block-2022-04-01' of git://git.kernel.dk/linux-block")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f814bdda774c183b0cc15ec8f3b6e7c6f4527ba5 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack(a)suse.cz>
Date: Tue, 23 Jan 2024 18:58:26 +0100
Subject: [PATCH] blk-wbt: Fix detection of dirty-throttled tasks
The detection of dirty-throttled tasks in blk-wbt has been subtly broken
since its beginning in 2016. Namely if we are doing cgroup writeback and
the throttled task is not in the root cgroup, balance_dirty_pages() will
set dirty_sleep for the non-root bdi_writeback structure. However
blk-wbt checks dirty_sleep only in the root cgroup bdi_writeback
structure. Thus detection of recently throttled tasks is not working in
this case (we noticed this when we switched to cgroup v2 and suddently
writeback was slow).
Since blk-wbt has no easy way to get to proper bdi_writeback and
furthermore its intention has always been to work on the whole device
rather than on individual cgroups, just move the dirty_sleep timestamp
from bdi_writeback to backing_dev_info. That fixes the checking for
recently throttled task and saves memory for everybody as a bonus.
CC: stable(a)vger.kernel.org
Fixes: b57d74aff9ab ("writeback: track if we're sleeping on progress in balance_dirty_pages()")
Signed-off-by: Jan Kara <jack(a)suse.cz>
Link: https://lore.kernel.org/r/20240123175826.21452-1-jack@suse.cz
[axboe: fixup indentation errors]
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 5ba3cd574eac..0c0e270a8265 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -163,9 +163,9 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
*/
static bool wb_recent_wait(struct rq_wb *rwb)
{
- struct bdi_writeback *wb = &rwb->rqos.disk->bdi->wb;
+ struct backing_dev_info *bdi = rwb->rqos.disk->bdi;
- return time_before(jiffies, wb->dirty_sleep + HZ);
+ return time_before(jiffies, bdi->last_bdp_sleep + HZ);
}
static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb,
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index ae12696ec492..2ad261082bba 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -141,8 +141,6 @@ struct bdi_writeback {
struct delayed_work dwork; /* work item used for writeback */
struct delayed_work bw_dwork; /* work item used for bandwidth estimate */
- unsigned long dirty_sleep; /* last wait */
-
struct list_head bdi_node; /* anchored at bdi->wb_list */
#ifdef CONFIG_CGROUP_WRITEBACK
@@ -179,6 +177,11 @@ struct backing_dev_info {
* any dirty wbs, which is depended upon by bdi_has_dirty().
*/
atomic_long_t tot_write_bandwidth;
+ /*
+ * Jiffies when last process was dirty throttled on this bdi. Used by
+ * blk-wbt.
+ */
+ unsigned long last_bdp_sleep;
struct bdi_writeback wb; /* the root writeback info for this bdi */
struct list_head wb_list; /* list of all wbs */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 1e3447bccdb1..e039d05304dd 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -436,7 +436,6 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
INIT_LIST_HEAD(&wb->work_list);
INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn);
- wb->dirty_sleep = jiffies;
err = fprop_local_init_percpu(&wb->completions, gfp);
if (err)
@@ -921,6 +920,7 @@ int bdi_init(struct backing_dev_info *bdi)
INIT_LIST_HEAD(&bdi->bdi_list);
INIT_LIST_HEAD(&bdi->wb_list);
init_waitqueue_head(&bdi->wb_waitq);
+ bdi->last_bdp_sleep = jiffies;
return cgwb_bdi_init(bdi);
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index cd4e4ae77c40..cc37fa7f3364 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1921,7 +1921,7 @@ static int balance_dirty_pages(struct bdi_writeback *wb,
break;
}
__set_current_state(TASK_KILLABLE);
- wb->dirty_sleep = now;
+ bdi->last_bdp_sleep = jiffies;
io_schedule_timeout(pause);
current->dirty_paused_when = now + pause;
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x f814bdda774c183b0cc15ec8f3b6e7c6f4527ba5
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021920-latitude-quilt-dfa8@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
f814bdda774c ("blk-wbt: Fix detection of dirty-throttled tasks")
ba91c849fa50 ("blk-rq-qos: store a gendisk instead of request_queue in struct rq_qos")
3963d84df797 ("blk-rq-qos: constify rq_qos_ops")
ce57b558604e ("blk-rq-qos: make rq_qos_add and rq_qos_del more useful")
b494f9c566ba ("blk-rq-qos: move rq_qos_add and rq_qos_del out of line")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f814bdda774c183b0cc15ec8f3b6e7c6f4527ba5 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack(a)suse.cz>
Date: Tue, 23 Jan 2024 18:58:26 +0100
Subject: [PATCH] blk-wbt: Fix detection of dirty-throttled tasks
The detection of dirty-throttled tasks in blk-wbt has been subtly broken
since its beginning in 2016. Namely if we are doing cgroup writeback and
the throttled task is not in the root cgroup, balance_dirty_pages() will
set dirty_sleep for the non-root bdi_writeback structure. However
blk-wbt checks dirty_sleep only in the root cgroup bdi_writeback
structure. Thus detection of recently throttled tasks is not working in
this case (we noticed this when we switched to cgroup v2 and suddently
writeback was slow).
Since blk-wbt has no easy way to get to proper bdi_writeback and
furthermore its intention has always been to work on the whole device
rather than on individual cgroups, just move the dirty_sleep timestamp
from bdi_writeback to backing_dev_info. That fixes the checking for
recently throttled task and saves memory for everybody as a bonus.
CC: stable(a)vger.kernel.org
Fixes: b57d74aff9ab ("writeback: track if we're sleeping on progress in balance_dirty_pages()")
Signed-off-by: Jan Kara <jack(a)suse.cz>
Link: https://lore.kernel.org/r/20240123175826.21452-1-jack@suse.cz
[axboe: fixup indentation errors]
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 5ba3cd574eac..0c0e270a8265 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -163,9 +163,9 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
*/
static bool wb_recent_wait(struct rq_wb *rwb)
{
- struct bdi_writeback *wb = &rwb->rqos.disk->bdi->wb;
+ struct backing_dev_info *bdi = rwb->rqos.disk->bdi;
- return time_before(jiffies, wb->dirty_sleep + HZ);
+ return time_before(jiffies, bdi->last_bdp_sleep + HZ);
}
static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb,
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index ae12696ec492..2ad261082bba 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -141,8 +141,6 @@ struct bdi_writeback {
struct delayed_work dwork; /* work item used for writeback */
struct delayed_work bw_dwork; /* work item used for bandwidth estimate */
- unsigned long dirty_sleep; /* last wait */
-
struct list_head bdi_node; /* anchored at bdi->wb_list */
#ifdef CONFIG_CGROUP_WRITEBACK
@@ -179,6 +177,11 @@ struct backing_dev_info {
* any dirty wbs, which is depended upon by bdi_has_dirty().
*/
atomic_long_t tot_write_bandwidth;
+ /*
+ * Jiffies when last process was dirty throttled on this bdi. Used by
+ * blk-wbt.
+ */
+ unsigned long last_bdp_sleep;
struct bdi_writeback wb; /* the root writeback info for this bdi */
struct list_head wb_list; /* list of all wbs */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 1e3447bccdb1..e039d05304dd 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -436,7 +436,6 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
INIT_LIST_HEAD(&wb->work_list);
INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn);
- wb->dirty_sleep = jiffies;
err = fprop_local_init_percpu(&wb->completions, gfp);
if (err)
@@ -921,6 +920,7 @@ int bdi_init(struct backing_dev_info *bdi)
INIT_LIST_HEAD(&bdi->bdi_list);
INIT_LIST_HEAD(&bdi->wb_list);
init_waitqueue_head(&bdi->wb_waitq);
+ bdi->last_bdp_sleep = jiffies;
return cgwb_bdi_init(bdi);
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index cd4e4ae77c40..cc37fa7f3364 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1921,7 +1921,7 @@ static int balance_dirty_pages(struct bdi_writeback *wb,
break;
}
__set_current_state(TASK_KILLABLE);
- wb->dirty_sleep = now;
+ bdi->last_bdp_sleep = jiffies;
io_schedule_timeout(pause);
current->dirty_paused_when = now + pause;