Hi,
maybe you will not like introducing 'static int int_max = INT_MAX;' for
this old kernel, which reaches EOL in 10 months.
Cyril Hrubis (3):
sched/rt: Fix sysctl_sched_rr_timeslice intial value
sched/rt: sysctl_sched_rr_timeslice show default timeslice after reset
sched/rt: Disallow writing invalid values to sched_rt_period_us
kernel/sched/rt.c | 10 +++++-----
kernel/sysctl.c | 5 +++++
2 files changed, 10 insertions(+), 5 deletions(-)
--
2.35.3
From: Cyril Hrubis <chrubis(a)suse.cz>
[ Upstream commit 079be8fc630943d9fc70a97807feb73d169ee3fc ]
The validation of the value written to sched_rt_period_us was broken
because:
- the sysctl_sched_rt_period is declared as unsigned int
- parsed by proc_do_intvec()
- the range is asserted after the value parsed by proc_do_intvec()
Because of this, negative values written to the file were stored into an
unsigned integer and were later interpreted as large positive
integers which passed the check:
if (sysctl_sched_rt_period <= 0)
return EINVAL;
This commit fixes the parsing by setting explicit range for both
period_us and runtime_us into the sched_rt_sysctls table and processes
the values with proc_dointvec_minmax() instead.
Alternatively if we wanted to use full range of unsigned int for the
period value we would have to split the proc_handler and use
proc_douintvec() for it however even the
Documentation/scheduler/sched-rt-group.rst describes the range as 1 to
INT_MAX.
As far as I can tell the only problem this causes is that the sysctl
file allows writing negative values which when read back may confuse
userspace.
There is also a LTP test being submitted for these sysctl files at:
http://patchwork.ozlabs.org/project/ltp/patch/20230901144433.2526-1-chrubis…
Signed-off-by: Cyril Hrubis <chrubis(a)suse.cz>
Signed-off-by: Ingo Molnar <mingo(a)kernel.org>
Link: https://lore.kernel.org/r/20231002115553.3007-2-chrubis@suse.cz
Signed-off-by: Petr Vorel <pvorel(a)suse.cz>
---
kernel/sched/rt.c | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 904dd8534597..4ac36eb4cdee 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -37,6 +37,8 @@ static struct ctl_table sched_rt_sysctls[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sched_rt_handler,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_INT_MAX,
},
{
.procname = "sched_rt_runtime_us",
@@ -44,6 +46,8 @@ static struct ctl_table sched_rt_sysctls[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = sched_rt_handler,
+ .extra1 = SYSCTL_NEG_ONE,
+ .extra2 = SYSCTL_INT_MAX,
},
{
.procname = "sched_rr_timeslice_ms",
@@ -2989,9 +2993,6 @@ static int sched_rt_global_constraints(void)
#ifdef CONFIG_SYSCTL
static int sched_rt_global_validate(void)
{
- if (sysctl_sched_rt_period <= 0)
- return -EINVAL;
-
if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
((sysctl_sched_rt_runtime > sysctl_sched_rt_period) ||
((u64)sysctl_sched_rt_runtime *
@@ -3022,7 +3023,7 @@ static int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
old_period = sysctl_sched_rt_period;
old_runtime = sysctl_sched_rt_runtime;
- ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (!ret && write) {
ret = sched_rt_global_validate();
--
2.35.3
From: Cyril Hrubis <chrubis(a)suse.cz>
[ Upstream commit c1fc6484e1fb7cc2481d169bfef129a1b0676abe ]
The sched_rr_timeslice can be reset to default by writing value that is
<= 0. However after reading from this file we always got the last value
written, which is not useful at all.
$ echo -1 > /proc/sys/kernel/sched_rr_timeslice_ms
$ cat /proc/sys/kernel/sched_rr_timeslice_ms
-1
Fix this by setting the variable that holds the sysctl file value to the
jiffies_to_msecs(RR_TIMESLICE) in case that <= 0 value was written.
Signed-off-by: Cyril Hrubis <chrubis(a)suse.cz>
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Reviewed-by: Petr Vorel <pvorel(a)suse.cz>
Acked-by: Mel Gorman <mgorman(a)suse.de>
Tested-by: Petr Vorel <pvorel(a)suse.cz>
Link: https://lore.kernel.org/r/20230802151906.25258-3-chrubis@suse.cz
Signed-off-by: Petr Vorel <pvorel(a)suse.cz>
---
kernel/sched/rt.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 76bafa8d331a..f79a6f36777a 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -3047,6 +3047,9 @@ static int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
sched_rr_timeslice =
sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
msecs_to_jiffies(sysctl_sched_rr_timeslice);
+
+ if (sysctl_sched_rr_timeslice <= 0)
+ sysctl_sched_rr_timeslice = jiffies_to_msecs(RR_TIMESLICE);
}
mutex_unlock(&mutex);
--
2.35.3
From: Cyril Hrubis <chrubis(a)suse.cz>
[ Upstream commit c7fcb99877f9f542c918509b2801065adcaf46fa ]
There is a 10% rounding error in the initial value of the
sysctl_sched_rr_timeslice with CONFIG_HZ_300=y.
This was found with LTP test sched_rr_get_interval01:
sched_rr_get_interval01.c:57: TPASS: sched_rr_get_interval() passed
sched_rr_get_interval01.c:64: TPASS: Time quantum 0s 99999990ns
sched_rr_get_interval01.c:72: TFAIL: /proc/sys/kernel/sched_rr_timeslice_ms != 100 got 90
sched_rr_get_interval01.c:57: TPASS: sched_rr_get_interval() passed
sched_rr_get_interval01.c:64: TPASS: Time quantum 0s 99999990ns
sched_rr_get_interval01.c:72: TFAIL: /proc/sys/kernel/sched_rr_timeslice_ms != 100 got 90
What this test does is to compare the return value from the
sched_rr_get_interval() and the sched_rr_timeslice_ms sysctl file and
fails if they do not match.
The problem it found is the initial sysctl file value which was computed as:
static int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
which works fine as long as MSEC_PER_SEC is multiple of HZ, however it
introduces 10% rounding error for CONFIG_HZ_300:
(MSEC_PER_SEC / HZ) * (100 * HZ / 1000)
(1000 / 300) * (100 * 300 / 1000)
3 * 30 = 90
This can be easily fixed by reversing the order of the multiplication
and division. After this fix we get:
(MSEC_PER_SEC * (100 * HZ / 1000)) / HZ
(1000 * (100 * 300 / 1000)) / 300
(1000 * 30) / 300 = 100
Fixes: 975e155ed873 ("sched/rt: Show the 'sched_rr_timeslice' SCHED_RR timeslice tuning knob in milliseconds")
Signed-off-by: Cyril Hrubis <chrubis(a)suse.cz>
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Reviewed-by: Petr Vorel <pvorel(a)suse.cz>
Acked-by: Mel Gorman <mgorman(a)suse.de>
Tested-by: Petr Vorel <pvorel(a)suse.cz>
Link: https://lore.kernel.org/r/20230802151906.25258-2-chrubis@suse.cz
[ pvorel: rebased for 5.4 ]
Signed-off-by: Petr Vorel <pvorel(a)suse.cz>
---
kernel/sched/rt.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 186e7d78ded5..e40f32e3ab06 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -8,7 +8,7 @@
#include "pelt.h"
int sched_rr_timeslice = RR_TIMESLICE;
-int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
+int sysctl_sched_rr_timeslice = (MSEC_PER_SEC * RR_TIMESLICE) / HZ;
/* More than 4 hours if BW_SHIFT equals 20. */
static const u64 max_rt_runtime = MAX_BW;
--
2.35.3
From: Cyril Hrubis <chrubis(a)suse.cz>
[ Upstream commit c7fcb99877f9f542c918509b2801065adcaf46fa ]
There is a 10% rounding error in the initial value of the
sysctl_sched_rr_timeslice with CONFIG_HZ_300=y.
This was found with LTP test sched_rr_get_interval01:
sched_rr_get_interval01.c:57: TPASS: sched_rr_get_interval() passed
sched_rr_get_interval01.c:64: TPASS: Time quantum 0s 99999990ns
sched_rr_get_interval01.c:72: TFAIL: /proc/sys/kernel/sched_rr_timeslice_ms != 100 got 90
sched_rr_get_interval01.c:57: TPASS: sched_rr_get_interval() passed
sched_rr_get_interval01.c:64: TPASS: Time quantum 0s 99999990ns
sched_rr_get_interval01.c:72: TFAIL: /proc/sys/kernel/sched_rr_timeslice_ms != 100 got 90
What this test does is to compare the return value from the
sched_rr_get_interval() and the sched_rr_timeslice_ms sysctl file and
fails if they do not match.
The problem it found is the initial sysctl file value which was computed as:
static int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
which works fine as long as MSEC_PER_SEC is multiple of HZ, however it
introduces 10% rounding error for CONFIG_HZ_300:
(MSEC_PER_SEC / HZ) * (100 * HZ / 1000)
(1000 / 300) * (100 * 300 / 1000)
3 * 30 = 90
This can be easily fixed by reversing the order of the multiplication
and division. After this fix we get:
(MSEC_PER_SEC * (100 * HZ / 1000)) / HZ
(1000 * (100 * 300 / 1000)) / 300
(1000 * 30) / 300 = 100
Fixes: 975e155ed873 ("sched/rt: Show the 'sched_rr_timeslice' SCHED_RR timeslice tuning knob in milliseconds")
Signed-off-by: Cyril Hrubis <chrubis(a)suse.cz>
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Reviewed-by: Petr Vorel <pvorel(a)suse.cz>
Acked-by: Mel Gorman <mgorman(a)suse.de>
Tested-by: Petr Vorel <pvorel(a)suse.cz>
Link: https://lore.kernel.org/r/20230802151906.25258-2-chrubis@suse.cz
[ pvorel: rebased for 5.15, 5.10 ]
Signed-off-by: Petr Vorel <pvorel(a)suse.cz>
---
kernel/sched/rt.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 7045595aacac..3394b7f923a0 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -8,7 +8,7 @@
#include "pelt.h"
int sched_rr_timeslice = RR_TIMESLICE;
-int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
+int sysctl_sched_rr_timeslice = (MSEC_PER_SEC * RR_TIMESLICE) / HZ;
/* More than 4 hours if BW_SHIFT equals 20. */
static const u64 max_rt_runtime = MAX_BW;
--
2.35.3
We observed a corruption during on-line resize of a file system that is
larger than 16 TiB with 4k block size. With more than 2^32 blocks,
resize_inode is turned off by default by mke2fs. The issue can be
reproduced on a smaller file system for convenience by explicitly
turning off resize_inode. An on-line resize across an 8 GiB boundary (the
size of a meta block group in this setup) then leads to a corruption:
dev=/dev/<some_dev> # should be >= 16 GiB
mkdir -p /corruption
/sbin/mke2fs -t ext4 -b 4096 -O ^resize_inode $dev $((2 * 2**21 - 2**15))
mount -t ext4 $dev /corruption
dd if=/dev/zero bs=4096 of=/corruption/test count=$((2*2**21 - 4*2**15))
sha1sum /corruption/test
# 79d2658b39dcfd77274e435b0934028adafaab11 /corruption/test
/sbin/resize2fs $dev $((2*2**21))
# drop page cache to force reload the block from disk
echo 1 > /proc/sys/vm/drop_caches
sha1sum /corruption/test
# 3c2abc63cbf1a94c9e6977e0fbd72cd832c4d5c3 /corruption/test
2^21 = 2^15*2^6 equals 8 GiB whereof 2^15 is the number of blocks per
block group and 2^6 are the number of block groups that make a meta
block group.
The last checksum might be different depending on how the file is laid
out across the physical blocks. The actual corruption occurs at physical
block 63*2^15 = 2064384 which would be the location of the backup of the
meta block group's block descriptor. During the on-line resize the file
system will be converted to meta_bg starting at s_first_meta_bg which is
2 in the example - meaning all block groups after 16 GiB. However, in
ext4_flex_group_add we might add block groups that are not part of the
first meta block group yet. In the reproducer we achieved this by
subtracting the size of a whole block group from the point where the
meta block group would start. This must be considered when updating the
backup block group descriptors to follow the non-meta_bg layout. The fix
is to add a test whether the group to add is already part of the meta
block group or not.
Fixes: 01f795f9e0d67 ("ext4: add online resizing support for meta_bg and 64-bit file systems")
Cc: stable(a)vger.kernel.org
Signed-off-by: Maximilian Heyne <mheyne(a)amazon.de>
---
fs/ext4/resize.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 4d4a5a32e310..3c0d12382e06 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1602,7 +1602,8 @@ static int ext4_flex_group_add(struct super_block *sb,
int gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
int gdb_num_end = ((group + flex_gd->count - 1) /
EXT4_DESC_PER_BLOCK(sb));
- int meta_bg = ext4_has_feature_meta_bg(sb);
+ int meta_bg = ext4_has_feature_meta_bg(sb) &&
+ gdb_num >= le32_to_cpu(es->s_first_meta_bg);
sector_t padding_blocks = meta_bg ? 0 : sbi->s_sbh->b_blocknr -
ext4_group_first_block_no(sb, 0);
--
2.40.1
Amazon Development Center Germany GmbH
Krausenstr. 38
10117 Berlin
Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss
Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B
Sitz: Berlin
Ust-ID: DE 289 237 879
When the PCI device is surprise removed, requests won't complete from
the device. These IOs are never completed and disk deletion hangs
indefinitely.
Fix it by aborting the IOs which the device will never complete
when the VQ is broken.
With this fix now fio completes swiftly.
An alternative of IO timeout has been considered, however
when the driver knows about unresponsive block device, swiftly clearing
them enables users and upper layers to react quickly.
Verified with multiple device unplug cycles with pending IOs in virtio
used ring and some pending with device.
In future instead of VQ broken, a more elegant method can be used. At the
moment the patch is kept to its minimal changes given its urgency to fix
broken kernels.
Fixes: 43bb40c5b926 ("virtio_pci: Support surprise removal of virtio pci device")
Cc: stable(a)vger.kernel.org
Reported-by: lirongqing(a)baidu.com
Closes: https://lore.kernel.org/virtualization/c45dd68698cd47238c55fb73ca9b4741@bai…
Co-developed-by: Chaitanya Kulkarni <kch(a)nvidia.com>
Signed-off-by: Chaitanya Kulkarni <kch(a)nvidia.com>
Signed-off-by: Parav Pandit <parav(a)nvidia.com>
---
drivers/block/virtio_blk.c | 54 ++++++++++++++++++++++++++++++++++++++
1 file changed, 54 insertions(+)
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 2bf14a0e2815..59b49899b229 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -1562,10 +1562,64 @@ static int virtblk_probe(struct virtio_device *vdev)
return err;
}
+static bool virtblk_cancel_request(struct request *rq, void *data)
+{
+ struct virtblk_req *vbr = blk_mq_rq_to_pdu(rq);
+
+ vbr->in_hdr.status = VIRTIO_BLK_S_IOERR;
+ if (blk_mq_request_started(rq) && !blk_mq_request_completed(rq))
+ blk_mq_complete_request(rq);
+
+ return true;
+}
+
+static void virtblk_cleanup_reqs(struct virtio_blk *vblk)
+{
+ struct virtio_blk_vq *blk_vq;
+ struct request_queue *q;
+ struct virtqueue *vq;
+ unsigned long flags;
+ int i;
+
+ vq = vblk->vqs[0].vq;
+ if (!virtqueue_is_broken(vq))
+ return;
+
+ q = vblk->disk->queue;
+ /* Block upper layer to not get any new requests */
+ blk_mq_quiesce_queue(q);
+
+ for (i = 0; i < vblk->num_vqs; i++) {
+ blk_vq = &vblk->vqs[i];
+
+ /* Synchronize with any ongoing virtblk_poll() which may be
+ * completing the requests to uppper layer which has already
+ * crossed the broken vq check.
+ */
+ spin_lock_irqsave(&blk_vq->lock, flags);
+ spin_unlock_irqrestore(&blk_vq->lock, flags);
+ }
+
+ blk_sync_queue(q);
+
+ /* Complete remaining pending requests with error */
+ blk_mq_tagset_busy_iter(&vblk->tag_set, virtblk_cancel_request, vblk);
+ blk_mq_tagset_wait_completed_request(&vblk->tag_set);
+
+ /*
+ * Unblock any pending dispatch I/Os before we destroy device. From
+ * del_gendisk() -> __blk_mark_disk_dead(disk) will set GD_DEAD flag,
+ * that will make sure any new I/O from bio_queue_enter() to fail.
+ */
+ blk_mq_unquiesce_queue(q);
+}
+
static void virtblk_remove(struct virtio_device *vdev)
{
struct virtio_blk *vblk = vdev->priv;
+ virtblk_cleanup_reqs(vblk);
+
/* Make sure no work handler is accessing the device. */
flush_work(&vblk->config_work);
--
2.34.1
From: Michal Kazior <michal(a)plume.com>
[ Upstream commit a6e4f85d3820d00694ed10f581f4c650445dbcda ]
The nl80211_dump_interface() supports resumption
in case nl80211_send_iface() doesn't have the
resources to complete its work.
The logic would store the progress as iteration
offsets for rdev and wdev loops.
However the logic did not properly handle
resumption for non-last rdev. Assuming a system
with 2 rdevs, with 2 wdevs each, this could
happen:
dump(cb=[0, 0]):
if_start=cb[1] (=0)
send rdev0.wdev0 -> ok
send rdev0.wdev1 -> yield
cb[1] = 1
dump(cb=[0, 1]):
if_start=cb[1] (=1)
send rdev0.wdev1 -> ok
// since if_start=1 the rdev0.wdev0 got skipped
// through if_idx < if_start
send rdev1.wdev1 -> ok
The if_start needs to be reset back to 0 upon wdev
loop end.
The problem is actually hard to hit on a desktop,
and even on most routers. The prerequisites for
this manifesting were:
- more than 1 wiphy
- a few handful of interfaces
- dump without rdev or wdev filter
I was seeing this with 4 wiphys 9 interfaces each.
It'd miss 6 interfaces from the last wiphy
reported to userspace.
Signed-off-by: Michal Kazior <michal(a)plume.com>
Link: https://msgid.link/20240116142340.89678-1-kazikcz@gmail.com
Signed-off-by: Johannes Berg <johannes.berg(a)intel.com>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
net/wireless/nl80211.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 1cbbb11ea503..fbf95b7ff6b4 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -4008,6 +4008,7 @@ static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback *
}
wiphy_unlock(&rdev->wiphy);
+ if_start = 0;
wp_idx++;
}
out:
--
2.43.0
From: "Guilherme G. Piccoli" <gpiccoli(a)igalia.com>
[ Upstream commit e585a37e5061f6d5060517aed1ca4ccb2e56a34c ]
By running a Van Gogh device (Steam Deck), the following message
was noticed in the kernel log:
pci 0000:04:00.3: PCI class overridden (0x0c03fe -> 0x0c03fe) so dwc3 driver can claim this instead of xhci
Effectively this means the quirk executed but changed nothing, since the
class of this device was already the proper one (likely adjusted by newer
firmware versions).
Check and perform the override only if necessary.
Link: https://lore.kernel.org/r/20231120160531.361552-1-gpiccoli@igalia.com
Signed-off-by: Guilherme G. Piccoli <gpiccoli(a)igalia.com>
Signed-off-by: Bjorn Helgaas <bhelgaas(a)google.com>
Cc: Huang Rui <ray.huang(a)amd.com>
Cc: Vicki Pfau <vi(a)endrift.com>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/pci/quirks.c | 11 +++++++----
1 file changed, 7 insertions(+), 4 deletions(-)
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index 8765544bac35..75b297c15cf5 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -607,10 +607,13 @@ static void quirk_amd_dwc_class(struct pci_dev *pdev)
{
u32 class = pdev->class;
- /* Use "USB Device (not host controller)" class */
- pdev->class = PCI_CLASS_SERIAL_USB_DEVICE;
- pci_info(pdev, "PCI class overridden (%#08x -> %#08x) so dwc3 driver can claim this instead of xhci\n",
- class, pdev->class);
+ if (class != PCI_CLASS_SERIAL_USB_DEVICE) {
+ /* Use "USB Device (not host controller)" class */
+ pdev->class = PCI_CLASS_SERIAL_USB_DEVICE;
+ pci_info(pdev,
+ "PCI class overridden (%#08x -> %#08x) so dwc3 driver can claim this instead of xhci\n",
+ class, pdev->class);
+ }
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_NL_USB,
quirk_amd_dwc_class);
--
2.43.0
From: Michal Kazior <michal(a)plume.com>
[ Upstream commit a6e4f85d3820d00694ed10f581f4c650445dbcda ]
The nl80211_dump_interface() supports resumption
in case nl80211_send_iface() doesn't have the
resources to complete its work.
The logic would store the progress as iteration
offsets for rdev and wdev loops.
However the logic did not properly handle
resumption for non-last rdev. Assuming a system
with 2 rdevs, with 2 wdevs each, this could
happen:
dump(cb=[0, 0]):
if_start=cb[1] (=0)
send rdev0.wdev0 -> ok
send rdev0.wdev1 -> yield
cb[1] = 1
dump(cb=[0, 1]):
if_start=cb[1] (=1)
send rdev0.wdev1 -> ok
// since if_start=1 the rdev0.wdev0 got skipped
// through if_idx < if_start
send rdev1.wdev1 -> ok
The if_start needs to be reset back to 0 upon wdev
loop end.
The problem is actually hard to hit on a desktop,
and even on most routers. The prerequisites for
this manifesting were:
- more than 1 wiphy
- a few handful of interfaces
- dump without rdev or wdev filter
I was seeing this with 4 wiphys 9 interfaces each.
It'd miss 6 interfaces from the last wiphy
reported to userspace.
Signed-off-by: Michal Kazior <michal(a)plume.com>
Link: https://msgid.link/20240116142340.89678-1-kazikcz@gmail.com
Signed-off-by: Johannes Berg <johannes.berg(a)intel.com>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
net/wireless/nl80211.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 0ac829c8f188..279f4977e2ee 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -3595,6 +3595,7 @@ static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback *
if_idx++;
}
+ if_start = 0;
wp_idx++;
}
out:
--
2.43.0
From: Michal Kazior <michal(a)plume.com>
[ Upstream commit a6e4f85d3820d00694ed10f581f4c650445dbcda ]
The nl80211_dump_interface() supports resumption
in case nl80211_send_iface() doesn't have the
resources to complete its work.
The logic would store the progress as iteration
offsets for rdev and wdev loops.
However the logic did not properly handle
resumption for non-last rdev. Assuming a system
with 2 rdevs, with 2 wdevs each, this could
happen:
dump(cb=[0, 0]):
if_start=cb[1] (=0)
send rdev0.wdev0 -> ok
send rdev0.wdev1 -> yield
cb[1] = 1
dump(cb=[0, 1]):
if_start=cb[1] (=1)
send rdev0.wdev1 -> ok
// since if_start=1 the rdev0.wdev0 got skipped
// through if_idx < if_start
send rdev1.wdev1 -> ok
The if_start needs to be reset back to 0 upon wdev
loop end.
The problem is actually hard to hit on a desktop,
and even on most routers. The prerequisites for
this manifesting were:
- more than 1 wiphy
- a few handful of interfaces
- dump without rdev or wdev filter
I was seeing this with 4 wiphys 9 interfaces each.
It'd miss 6 interfaces from the last wiphy
reported to userspace.
Signed-off-by: Michal Kazior <michal(a)plume.com>
Link: https://msgid.link/20240116142340.89678-1-kazikcz@gmail.com
Signed-off-by: Johannes Berg <johannes.berg(a)intel.com>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
net/wireless/nl80211.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 70fb14b8bab0..c259d3227a9e 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -3960,6 +3960,7 @@ static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback *
if_idx++;
}
+ if_start = 0;
wp_idx++;
}
out:
--
2.43.0
From: Baokun Li <libaokun1(a)huawei.com>
[ Upstream commit 4530b3660d396a646aad91a787b6ab37cf604b53 ]
Determine if the group block bitmap is corrupted before using ac_b_ex in
ext4_mb_try_best_found() to avoid allocating blocks from a group with a
corrupted block bitmap in the following concurrency and making the
situation worse.
ext4_mb_regular_allocator
ext4_lock_group(sb, group)
ext4_mb_good_group
// check if the group bbitmap is corrupted
ext4_mb_complex_scan_group
// Scan group gets ac_b_ex but doesn't use it
ext4_unlock_group(sb, group)
ext4_mark_group_bitmap_corrupted(group)
// The block bitmap was corrupted during
// the group unlock gap.
ext4_mb_try_best_found
ext4_lock_group(ac->ac_sb, group)
ext4_mb_use_best_found
mb_mark_used
// Allocating blocks in block bitmap corrupted group
Signed-off-by: Baokun Li <libaokun1(a)huawei.com>
Reviewed-by: Jan Kara <jack(a)suse.cz>
Link: https://lore.kernel.org/r/20240104142040.2835097-7-libaokun1@huawei.com
Signed-off-by: Theodore Ts'o <tytso(a)mit.edu>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
fs/ext4/mballoc.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 3babc07ae613..d7724601f42b 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1854,6 +1854,9 @@ int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
return err;
ext4_lock_group(ac->ac_sb, group);
+ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
+ goto out;
+
max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex);
if (max > 0) {
@@ -1861,6 +1864,7 @@ int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
ext4_mb_use_best_found(ac, e4b);
}
+out:
ext4_unlock_group(ac->ac_sb, group);
ext4_mb_unload_buddy(e4b);
--
2.43.0
l2tp_ip6_sendmsg needs to avoid accounting for the transport header
twice when splicing more data into an already partially-occupied skbuff.
To manage this, we check whether the skbuff contains data using
skb_queue_empty when deciding how much data to append using
ip6_append_data.
However, the code which performed the calculation was incorrect:
ulen = len + skb_queue_empty(&sk->sk_write_queue) ? transhdrlen : 0;
...due to C operator precedence, this ends up setting ulen to
transhdrlen for messages with a non-zero length, which results in
corrupted packets on the wire.
Add parentheses to correct the calculation in line with the original
intent.
Fixes: 9d4c75800f61 ("ipv4, ipv6: Fix handling of transhdrlen in __ip{,6}_append_data()")
Cc: David Howells <dhowells(a)redhat.com>
Cc: stable(a)vger.kernel.org
Signed-off-by: Tom Parkin <tparkin(a)katalix.com>
---
This issue was uncovered by Debian build-testing for the
golang-github-katalix-go-l2tp package[1].
It seems 9d4c75800f61 has been backported to the linux-6.1.y stable
kernel (and possibly others), so I think this fix will also need
backporting.
The bug is currently seen on at least Debian Bookworm, Ubuntu Jammy, and
Debian testing/unstable.
Unfortunately tests using "ip l2tp" and which focus on dataplane
transport will not uncover this bug: it's necessary to send a packet
using an L2TPIP6 socket opened by userspace, and to verify the packet on
the wire. The l2tp-ktest[2] test suite has been extended to cover this.
[1]. https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1063746
[2]. https://github.com/katalix/l2tp-ktest
---
net/l2tp/l2tp_ip6.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index dd3153966173..7bf14cf9ffaa 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -627,7 +627,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
back_from_confirm:
lock_sock(sk);
- ulen = len + skb_queue_empty(&sk->sk_write_queue) ? transhdrlen : 0;
+ ulen = len + (skb_queue_empty(&sk->sk_write_queue) ? transhdrlen : 0);
err = ip6_append_data(sk, ip_generic_getfrag, msg,
ulen, transhdrlen, &ipc6,
&fl6, (struct rt6_info *)dst,
--
2.34.1
If caching mode change fails due to, for example, OOM we
free the allocated pages in a two-step process. First the pages
for which the caching change has already succeeded. Secondly
the pages for which a caching change did not succeed.
However the second step was incorrectly freeing the pages already
freed in the first step.
Fix.
Signed-off-by: Thomas Hellström <thomas.hellstrom(a)linux.intel.com>
Fixes: 379989e7cbdc ("drm/ttm/pool: Fix ttm_pool_alloc error path")
Cc: Christian König <christian.koenig(a)amd.com>
Cc: Dave Airlie <airlied(a)redhat.com>
Cc: Christian Koenig <christian.koenig(a)amd.com>
Cc: Huang Rui <ray.huang(a)amd.com>
Cc: dri-devel(a)lists.freedesktop.org
Cc: <stable(a)vger.kernel.org> # v6.4+
---
drivers/gpu/drm/ttm/ttm_pool.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c
index b62f420a9f96..112438d965ff 100644
--- a/drivers/gpu/drm/ttm/ttm_pool.c
+++ b/drivers/gpu/drm/ttm/ttm_pool.c
@@ -387,7 +387,7 @@ static void ttm_pool_free_range(struct ttm_pool *pool, struct ttm_tt *tt,
enum ttm_caching caching,
pgoff_t start_page, pgoff_t end_page)
{
- struct page **pages = tt->pages;
+ struct page **pages = &tt->pages[start_page];
unsigned int order;
pgoff_t i, nr;
--
2.43.0
From: Ard Biesheuvel <ardb(a)kernel.org>
The bit-sliced implementation of AES-CTR operates on blocks of 128
bytes, and will fall back to the plain NEON version for tail blocks or
inputs that are shorter than 128 bytes to begin with.
It will call straight into the plain NEON asm helper, which performs all
memory accesses in granules of 16 bytes (the size of a NEON register).
For this reason, the associated plain NEON glue code will copy inputs
shorter than 16 bytes into a temporary buffer, given that this is a rare
occurrence and it is not worth the effort to work around this in the asm
code.
The fallback from the bit-sliced NEON version fails to take this into
account, potentially resulting in out-of-bounds accesses. So clone the
same workaround, and use a temp buffer for short in/outputs.
Cc: <stable(a)vger.kernel.org>
Reported-by: syzbot+f1ceaa1a09ab891e1934(a)syzkaller.appspotmail.com
Tested-by: syzbot+f1ceaa1a09ab891e1934(a)syzkaller.appspotmail.com
Signed-off-by: Ard Biesheuvel <ardb(a)kernel.org>
---
arch/arm64/crypto/aes-neonbs-glue.c | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/arch/arm64/crypto/aes-neonbs-glue.c b/arch/arm64/crypto/aes-neonbs-glue.c
index bac4cabef607..849dc41320db 100644
--- a/arch/arm64/crypto/aes-neonbs-glue.c
+++ b/arch/arm64/crypto/aes-neonbs-glue.c
@@ -227,8 +227,19 @@ static int ctr_encrypt(struct skcipher_request *req)
src += blocks * AES_BLOCK_SIZE;
}
if (nbytes && walk.nbytes == walk.total) {
+ u8 buf[AES_BLOCK_SIZE];
+ u8 *d = dst;
+
+ if (unlikely(nbytes < AES_BLOCK_SIZE))
+ src = dst = memcpy(buf + sizeof(buf) - nbytes,
+ src, nbytes);
+
neon_aes_ctr_encrypt(dst, src, ctx->enc, ctx->key.rounds,
nbytes, walk.iv);
+
+ if (unlikely(nbytes < AES_BLOCK_SIZE))
+ memcpy(d, buf + sizeof(buf) - nbytes, nbytes);
+
nbytes = 0;
}
kernel_neon_end();
--
2.44.0.rc0.258.g7320e95886-goog
The is_psr_su parameter is a boolean flag indicating whether the Panel
Self Refresh Selective Update (PSR SU) feature is enabled which is a
power-saving feature that allows only the updated regions of the screen
to be refreshed, reducing the amount of data that needs to be sent to
the display.
Fixes the below with gcc W=1:
drivers/gpu/drm/amd/amdgpu/../display/amdgpu_dm/amdgpu_dm.c:5257: warning: Function parameter or member 'is_psr_su' not described in 'fill_dc_dirty_rects'
Fixes: 13d6b0812e58 ("drm/amdgpu: make damage clips support configurable")
Cc: stable(a)vger.kernel.org
Cc: Hamza Mahfooz <hamza.mahfooz(a)amd.com>
Cc: Mario Limonciello <mario.limonciello(a)amd.com>
Cc: Rodrigo Siqueira <Rodrigo.Siqueira(a)amd.com>
Cc: Aurabindo Pillai <aurabindo.pillai(a)amd.com>
Signed-off-by: Srinivasan Shanmugam <srinivasan.shanmugam(a)amd.com>
Reviewed-by: Rodrigo Siqueira <Rodrigo.Siqueira(a)amd.com>
---
drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index ed4873060da7..379836383ea9 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -5234,6 +5234,10 @@ static inline void fill_dc_dirty_rect(struct drm_plane *plane,
* @new_plane_state: New state of @plane
* @crtc_state: New state of CRTC connected to the @plane
* @flip_addrs: DC flip tracking struct, which also tracts dirty rects
+ * @is_psr_su: Flag indicating whether Panel Self Refresh Selective Update (PSR SU) is enabled.
+ * If PSR SU is enabled and damage clips are available, only the regions of the screen
+ * that have changed will be updated. If PSR SU is not enabled,
+ * or if damage clips are not available, the entire screen will be updated.
* @dirty_regions_changed: dirty regions changed
*
* For PSR SU, DC informs the DMUB uController of dirty rectangle regions
--
2.34.1
In erofs_find_target_block(), when erofs_dirnamecmp() returns 0,
we do not assign the target metabuf. As a result, the
erofs_put_metabuf() call at the end of the caller erofs_namei() has
no effect, leaving the refcount held on the page.
As the page from metabuf (buf->page) is never put, such page cannot be
migrated or reclaimed. Fix it now by putting the metabuf from
previous loop and assigning the current metabuf to target before
returning so caller erofs_namei() can do the final put as it was
intended.
Fixes: 500edd095648 ("erofs: use meta buffers for inode lookup")
Cc: stable(a)vger.kernel.org
Signed-off-by: Sandeep Dhavale <dhavale(a)google.com>
---
Changes since v1
- Rearrange the cases as suggested by Gao so there is less duplication
of the code and it is more readable
fs/erofs/namei.c | 28 ++++++++++++++--------------
1 file changed, 14 insertions(+), 14 deletions(-)
diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c
index d4f631d39f0f..f0110a78acb2 100644
--- a/fs/erofs/namei.c
+++ b/fs/erofs/namei.c
@@ -130,24 +130,24 @@ static void *erofs_find_target_block(struct erofs_buf *target,
/* string comparison without already matched prefix */
diff = erofs_dirnamecmp(name, &dname, &matched);
- if (!diff) {
- *_ndirents = 0;
- goto out;
- } else if (diff > 0) {
- head = mid + 1;
- startprfx = matched;
-
- if (!IS_ERR(candidate))
- erofs_put_metabuf(target);
- *target = buf;
- candidate = de;
- *_ndirents = ndirents;
- } else {
+ if (diff < 0) {
erofs_put_metabuf(&buf);
-
back = mid - 1;
endprfx = matched;
+ continue;
+ }
+
+ if (!IS_ERR(candidate))
+ erofs_put_metabuf(target);
+ *target = buf;
+ if (!diff) {
+ *_ndirents = 0;
+ return de;
}
+ head = mid + 1;
+ startprfx = matched;
+ candidate = de;
+ *_ndirents = ndirents;
continue;
}
out: /* free if the candidate is valid */
--
2.44.0.rc0.258.g7320e95886-goog
On Wed, Feb 21, 2024 at 10:52:41AM -0800, He Gao wrote:
> I used "git apply" and it required the change. But "patch" can work
> directly so yes the original patch works fine.
>
> In that case, I believe the original patch will also work for 6.6 and 6.7.
Great, all done, thanks!
greg k-h
commit 1a3e1f40962c445b997151a542314f3c6097f8c3 upstream.
NOTE: This is a partial backport since we only need the refcnt between
memcg and stock to fix the problem stated below, and in this way
multiple versions use the same code and align with each other.
---
A kernel panic happened in an in-house environment running
3.10, and the same problem was reproduced on 4.19:
general protection fault: 0000 [#1] SMP PTI
CPU: 1 PID: 2085 Comm: bash Kdump: loaded Tainted: G L 4.19.90+ #7
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.15.0-0-g2dd4b9b3f840-prebuilt.qemu.org 04/01/2014
RIP: 0010 drain_all_stock+0xad/0x140
Code: 00 00 4d 85 ff 74 2c 45 85 c9 74 27 4d 39 fc 74 42 41 80 bc 24 28 04 00 00 00 74 17 49 8b 04 24 49 8b 17 48 8b 88 90 02 00 00 <48> 39 8a 90 02 00 00 74 02 eb 86 48 63 88 3c 01 00 00 39 8a 3c 01
RSP: 0018:ffffa7efc5813d70 EFLAGS: 00010202
RAX: ffff8cb185548800 RBX: ffff8cb89f420160 RCX: ffff8cb1867b6000
RDX: babababababababa RSI: 0000000000000001 RDI: 0000000000231876
RBP: 0000000000000000 R08: 0000000000000415 R09: 0000000000000002
R10: 0000000000000000 R11: 0000000000000001 R12: ffff8cb186f89040
R13: 0000000000020160 R14: 0000000000000001 R15: ffff8cb186b27040
FS: 00007f4a308d3740(0000) GS:ffff8cb89f440000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007ffe4d634a68 CR3: 000000010b022000 CR4: 00000000000006e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
mem_cgroup_force_empty_write+0x31/0xb0
cgroup_file_write+0x60/0x140
? __check_object_size+0x136/0x147
kernfs_fop_write+0x10e/0x190
__vfs_write+0x37/0x1b0
? selinux_file_permission+0xe8/0x130
? security_file_permission+0x2e/0xb0
vfs_write+0xb6/0x1a0
ksys_write+0x57/0xd0
do_syscall_64+0x63/0x250
? async_page_fault+0x8/0x30
entry_SYSCALL_64_after_hwframe+0x5c/0xc1
Modules linked in: ...
It is found that in case of stock->nr_pages == 0, the memcg on
stock->cached could be freed due to its refcnt decreased to 0, which
made stock->cached become a dangling pointer. It could cause a UAF
problem in drain_all_stock() in the following concurrent scenario. Note
that drain_all_stock() doesn't disable irq but only preemption.
CPU1 CPU2
==============================================================================
stock->cached = memcgA (freed)
drain_all_stock(memcgB)
rcu_read_lock()
memcg = CPU1's stock->cached (memcgA)
(interrupted)
refill_stock(memcgC)
drain_stock(memcgA)
stock->cached = memcgC
stock->nr_pages += xxx (> 0)
stock->nr_pages > 0
mem_cgroup_is_descendant(memcgA, memcgB) [UAF]
rcu_read_unlock()
This problem is, unintentionally, fixed at 5.9, where commit
1a3e1f40962c ("mm: memcontrol: decouple reference counting from page
accounting") adds memcg refcnt for stock. Therefore affected LTS
versions include 4.19 and 5.4.
For 4.19, memcg's css offline process doesn't call drain_all_stock(), so
it's easier for the released memcg to be left on the stock. For 5.4,
although mem_cgroup_css_offline() does call drain_all_stock(), the
flushing could be skipped when stock->nr_pages happens to be 0, and
besides, the async draining could be delayed and take place after the UAF
problem has happened.
Fix this problem by adding (and decreasing) memcg's refcnt when memcg is
put onto (and removed from) stock, just like how commit 1a3e1f40962c
("mm: memcontrol: decouple reference counting from page accounting")
does. After all, "being on the stock" is a kind of reference with
regards to memcg. As such, it's guaranteed that a css on stock would not
be freed.
It's worth mentioning that refill_stock() is executed in an irq-disabled
context, so the drain_stock() patched with css_put() would not actually
free memcgA until the end of refill_stock(), since the free triggered by
css_put() is RCU-deferred and the grace period has not ended yet. For CPU2,
the access to CPU1's stock->cached is protected by rcu_read_lock(), so in
this case it gets either NULL from stock->cached or a memcgA that is still
good.
Cc: stable(a)vger.kernel.org # 4.19 5.4
Fixes: cdec2e4265df ("memcg: coalesce charging via percpu storage")
Signed-off-by: GONG, Ruiqi <gongruiqi1(a)huawei.com>
Acked-by: Michal Hocko <mhocko(a)suse.com>
---
v2:
- Add a statement of this patch being a partial backport
- Add a paragraph to mention the grace period in refill_stock()
mm/memcontrol.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5a366cf79821..8c04296df1c7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2015,6 +2015,9 @@ static void drain_stock(struct memcg_stock_pcp *stock)
{
struct mem_cgroup *old = stock->cached;
+ if (!old)
+ return;
+
if (stock->nr_pages) {
page_counter_uncharge(&old->memory, stock->nr_pages);
if (do_memsw_account())
@@ -2022,6 +2025,8 @@ static void drain_stock(struct memcg_stock_pcp *stock)
css_put_many(&old->css, stock->nr_pages);
stock->nr_pages = 0;
}
+
+ css_put(&old->css);
stock->cached = NULL;
}
@@ -2057,6 +2062,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
stock = this_cpu_ptr(&memcg_stock);
if (stock->cached != memcg) { /* reset if necessary */
drain_stock(stock);
+ css_get(&memcg->css);
stock->cached = memcg;
}
stock->nr_pages += nr_pages;
--
2.25.1
commit 1a3e1f40962c445b997151a542314f3c6097f8c3 upstream.
A kernel panic happened in an in-house environment running
3.10, and the same problem was reproduced on 4.19:
general protection fault: 0000 [#1] SMP PTI
CPU: 1 PID: 2085 Comm: bash Kdump: loaded Tainted: G L 4.19.90+ #7
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.15.0-0-g2dd4b9b3f840-prebuilt.qemu.org 04/01/2014
RIP: 0010 drain_all_stock+0xad/0x140
Code: 00 00 4d 85 ff 74 2c 45 85 c9 74 27 4d 39 fc 74 42 41 80 bc 24 28 04 00 00 00 74 17 49 8b 04 24 49 8b 17 48 8b 88 90 02 00 00 <48> 39 8a 90 02 00 00 74 02 eb 86 48 63 88 3c 01 00 00 39 8a 3c 01
RSP: 0018:ffffa7efc5813d70 EFLAGS: 00010202
RAX: ffff8cb185548800 RBX: ffff8cb89f420160 RCX: ffff8cb1867b6000
RDX: babababababababa RSI: 0000000000000001 RDI: 0000000000231876
RBP: 0000000000000000 R08: 0000000000000415 R09: 0000000000000002
R10: 0000000000000000 R11: 0000000000000001 R12: ffff8cb186f89040
R13: 0000000000020160 R14: 0000000000000001 R15: ffff8cb186b27040
FS: 00007f4a308d3740(0000) GS:ffff8cb89f440000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007ffe4d634a68 CR3: 000000010b022000 CR4: 00000000000006e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
mem_cgroup_force_empty_write+0x31/0xb0
cgroup_file_write+0x60/0x140
? __check_object_size+0x136/0x147
kernfs_fop_write+0x10e/0x190
__vfs_write+0x37/0x1b0
? selinux_file_permission+0xe8/0x130
? security_file_permission+0x2e/0xb0
vfs_write+0xb6/0x1a0
ksys_write+0x57/0xd0
do_syscall_64+0x63/0x250
? async_page_fault+0x8/0x30
entry_SYSCALL_64_after_hwframe+0x5c/0xc1
Modules linked in: ...
It is found that in case of stock->nr_pages == 0, the memcg on
stock->cached could be freed due to its refcnt decreased to 0, which
made stock->cached become a dangling pointer. It could cause a UAF
problem in drain_all_stock() in the following concurrent scenario. Note
that drain_all_stock() doesn't disable irq but only preemption.
CPU1 CPU2
==============================================================================
stock->cached = memcgA (freed)
drain_all_stock(memcgB)
rcu_read_lock()
memcg = CPU1's stock->cached (memcgA)
(interrupted)
refill_stock(memcgC)
drain_stock(memcgA)
stock->cached = memcgC
stock->nr_pages += xxx (> 0)
stock->nr_pages > 0
mem_cgroup_is_descendant(memcgA, memcgB) [UAF]
rcu_read_unlock()
This problem is, unintentionally, fixed in 5.9, where commit 1a3e1f40962c
("mm: memcontrol: decouple reference counting from page accounting")
adds memcg refcnt for stock. Therefore affected LTS versions include
4.19 and 5.4.
For 4.19, memcg's css offline process doesn't call drain_all_stock(), so
it's easier for the released memcg to be left on the stock. For 5.4,
although mem_cgroup_css_offline() does call drain_all_stock(), the
flushing could be skipped when stock->nr_pages happens to be 0, and
besides, the async draining could be delayed and take place after the UAF
problem has happened.
Fix this problem by adding (and decreasing) memcg's refcnt when memcg is
put onto (and removed from) stock, just like how commit 1a3e1f40962c
("mm: memcontrol: decouple reference counting from page accounting")
does. After all, "being on the stock" is a kind of reference with
regards to memcg. As such, it's guaranteed that a css on stock would not
be freed.
Cc: stable(a)vger.kernel.org # 4.19 5.4
Fixes: cdec2e4265df ("memcg: coalesce charging via percpu storage")
Signed-off-by: GONG, Ruiqi <gongruiqi1(a)huawei.com>
---
mm/memcontrol.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5a366cf79821..8c04296df1c7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2015,6 +2015,9 @@ static void drain_stock(struct memcg_stock_pcp *stock)
{
struct mem_cgroup *old = stock->cached;
+ if (!old)
+ return;
+
if (stock->nr_pages) {
page_counter_uncharge(&old->memory, stock->nr_pages);
if (do_memsw_account())
@@ -2022,6 +2025,8 @@ static void drain_stock(struct memcg_stock_pcp *stock)
css_put_many(&old->css, stock->nr_pages);
stock->nr_pages = 0;
}
+
+ css_put(&old->css);
stock->cached = NULL;
}
@@ -2057,6 +2062,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
stock = this_cpu_ptr(&memcg_stock);
if (stock->cached != memcg) { /* reset if necessary */
drain_stock(stock);
+ css_get(&memcg->css);
stock->cached = memcg;
}
stock->nr_pages += nr_pages;
--
2.25.1
The quilt patch titled
Subject: kasan/test: avoid gcc warning for intentional overflow
has been removed from the -mm tree. Its filename was
kasan-test-avoid-gcc-warning-for-intentional-overflow.patch
This patch was dropped because it was merged into the mm-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Arnd Bergmann <arnd(a)arndb.de>
Subject: kasan/test: avoid gcc warning for intentional overflow
Date: Mon, 12 Feb 2024 12:15:52 +0100
The out-of-bounds test allocates an object that is three bytes too short
in order to validate the bounds checking. Starting with gcc-14, this
causes a compile-time warning as gcc has grown smart enough to understand
the sizeof() logic:
mm/kasan/kasan_test.c: In function 'kmalloc_oob_16':
mm/kasan/kasan_test.c:443:14: error: allocation of insufficient size '13' for type 'struct <anonymous>' with size '16' [-Werror=alloc-size]
443 | ptr1 = kmalloc(sizeof(*ptr1) - 3, GFP_KERNEL);
| ^
Hide the actual computation behind a RELOC_HIDE() that ensures
the compiler misses the intentional bug.
Link: https://lkml.kernel.org/r/20240212111609.869266-1-arnd@kernel.org
Fixes: 3f15801cdc23 ("lib: add kasan test module")
Signed-off-by: Arnd Bergmann <arnd(a)arndb.de>
Reviewed-by: Andrey Konovalov <andreyknvl(a)gmail.com>
Cc: Alexander Potapenko <glider(a)google.com>
Cc: Andrey Ryabinin <ryabinin.a.a(a)gmail.com>
Cc: Arnd Bergmann <arnd(a)arndb.de>
Cc: Dmitry Vyukov <dvyukov(a)google.com>
Cc: Marco Elver <elver(a)google.com>
Cc: Vincenzo Frascino <vincenzo.frascino(a)arm.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/kasan/kasan_test.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
--- a/mm/kasan/kasan_test.c~kasan-test-avoid-gcc-warning-for-intentional-overflow
+++ a/mm/kasan/kasan_test.c
@@ -440,7 +440,8 @@ static void kmalloc_oob_16(struct kunit
/* This test is specifically crafted for the generic mode. */
KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC);
- ptr1 = kmalloc(sizeof(*ptr1) - 3, GFP_KERNEL);
+ /* RELOC_HIDE to prevent gcc from warning about short alloc */
+ ptr1 = RELOC_HIDE(kmalloc(sizeof(*ptr1) - 3, GFP_KERNEL), 0);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1);
ptr2 = kmalloc(sizeof(*ptr2), GFP_KERNEL);
_
Patches currently in -mm which might be from arnd(a)arndb.de are
mm-mmu_gather-add-tlb_remove_tlb_entries-fix.patch
From: Jan Kiszka <jan.kiszka(a)siemens.com>
commit afb2a4fb84555ef9e61061f6ea63ed7087b295d5 upstream.
The cflags for the RISC-V efistub were missing -mno-relax, so there was
a risk that the compiler could use GP-relative addressing. That
happened for _edata with binutils-2.41 and kernel 6.1, causing the
relocation to fail due to an invalid kernel_size in handle_kernel_image.
It was not yet observed with newer versions, but that may just be luck.
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Jan Kiszka <jan.kiszka(a)siemens.com>
Signed-off-by: Ard Biesheuvel <ardb(a)kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/firmware/efi/libstub/Makefile | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile
index a1157c2a7170..f54715672d52 100644
--- a/drivers/firmware/efi/libstub/Makefile
+++ b/drivers/firmware/efi/libstub/Makefile
@@ -28,7 +28,7 @@ cflags-$(CONFIG_ARM) += -DEFI_HAVE_STRLEN -DEFI_HAVE_STRNLEN \
-DEFI_HAVE_MEMCHR -DEFI_HAVE_STRRCHR \
-DEFI_HAVE_STRCMP -fno-builtin -fpic \
$(call cc-option,-mno-single-pic-base)
-cflags-$(CONFIG_RISCV) += -fpic
+cflags-$(CONFIG_RISCV) += -fpic -mno-relax
cflags-$(CONFIG_LOONGARCH) += -fpie
cflags-$(CONFIG_EFI_PARAMS_FROM_FDT) += -I$(srctree)/scripts/dtc/libfdt
--
2.35.3
--
Siemens AG, Technology
Linux Expert Center
The patch titled
Subject: mm, vmscan: prevent infinite loop for costly GFP_NOIO | __GFP_RETRY_MAYFAIL allocations
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-vmscan-prevent-infinite-loop-for-costly-gfp_noio-__gfp_retry_mayfail-allocations.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Vlastimil Babka <vbabka(a)suse.cz>
Subject: mm, vmscan: prevent infinite loop for costly GFP_NOIO | __GFP_RETRY_MAYFAIL allocations
Date: Wed, 21 Feb 2024 12:43:58 +0100
Sven reports an infinite loop in __alloc_pages_slowpath() for costly order
__GFP_RETRY_MAYFAIL allocations that are also GFP_NOIO. Such combination
can happen in a suspend/resume context where a GFP_KERNEL allocation can
have __GFP_IO masked out via gfp_allowed_mask.
Quoting Sven:
1. try to do a "costly" allocation (order > PAGE_ALLOC_COSTLY_ORDER)
with __GFP_RETRY_MAYFAIL set.
2. page alloc's __alloc_pages_slowpath tries to get a page from the
freelist. This fails because there is nothing free of that costly
order.
3. page alloc tries to reclaim by calling __alloc_pages_direct_reclaim,
which bails out because a zone is ready to be compacted; it pretends
to have made a single page of progress.
4. page alloc tries to compact, but this always bails out early because
__GFP_IO is not set (it's not passed by the snd allocator, and even
if it were, we are suspending so the __GFP_IO flag would be cleared
anyway).
5. page alloc believes reclaim progress was made (because of the
pretense in item 3) and so it checks whether it should retry
compaction. The compaction retry logic thinks it should try again,
because:
a) reclaim is needed because of the early bail-out in item 4
b) a zonelist is suitable for compaction
6. goto 2. indefinite stall.
(end quote)
The immediate root cause is that the COMPACT_SKIPPED returned from
__alloc_pages_direct_compact() (step 4) due to lack of __GFP_IO is
mistaken for an indication of a lack of order-0 pages, and in step 5
should_compact_retry() evaluates that as a reason to retry, before
incrementing and limiting the number of retries. There are, however, other
places that wrongly assume that compaction can happen while we lack __GFP_IO.
To fix this, introduce gfp_compaction_allowed() to abstract the __GFP_IO
evaluation and switch the open-coded test in try_to_compact_pages() to use
it.
Also use the new helper in:
- compaction_ready(), which will make reclaim not bail out in step 3, so
there's at least one attempt to actually reclaim, even if chances are
small for a costly order
- in_reclaim_compaction() which will make should_continue_reclaim()
return false and we don't over-reclaim unnecessarily
- in __alloc_pages_slowpath() to set a local variable can_compact,
which is then used to avoid retrying reclaim/compaction for costly
allocations (step 5) if we can't compact and also to skip the early
compaction attempt that we do in some cases
Link: https://lkml.kernel.org/r/20240221114357.13655-2-vbabka@suse.cz
Fixes: 3250845d0526 ("Revert "mm, oom: prevent premature OOM killer invocation for high order request"")
Signed-off-by: Vlastimil Babka <vbabka(a)suse.cz>
Reported-by: Sven van Ashbrook <svenva(a)chromium.org>
Closes: https://lore.kernel.org/all/CAG-rBihs_xMKb3wrMO1%2B-%2Bp4fowP9oy1pa_OTkfxBz…
Cc: Brian Geffon <bgeffon(a)google.com>
Cc: Curtis Malainey <cujomalainey(a)chromium.org>
Cc: Jaroslav Kysela <perex(a)perex.cz>
Cc: Mel Gorman <mgorman(a)techsingularity.net>
Cc: Michal Hocko <mhocko(a)kernel.org>
Cc: Takashi Iwai <tiwai(a)suse.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/gfp.h | 9 +++++++++
mm/compaction.c | 7 +------
mm/page_alloc.c | 10 ++++++----
mm/vmscan.c | 5 ++++-
4 files changed, 20 insertions(+), 11 deletions(-)
--- a/include/linux/gfp.h~mm-vmscan-prevent-infinite-loop-for-costly-gfp_noio-__gfp_retry_mayfail-allocations
+++ a/include/linux/gfp.h
@@ -353,6 +353,15 @@ static inline bool gfp_has_io_fs(gfp_t g
return (gfp & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS);
}
+/*
+ * Check if the gfp flags allow compaction - GFP_NOIO is a really
+ * tricky context because the migration might require IO.
+ */
+static inline bool gfp_compaction_allowed(gfp_t gfp_mask)
+{
+ return IS_ENABLED(CONFIG_COMPACTION) && (gfp_mask & __GFP_IO);
+}
+
extern gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma);
#ifdef CONFIG_CONTIG_ALLOC
--- a/mm/compaction.c~mm-vmscan-prevent-infinite-loop-for-costly-gfp_noio-__gfp_retry_mayfail-allocations
+++ a/mm/compaction.c
@@ -2723,16 +2723,11 @@ enum compact_result try_to_compact_pages
unsigned int alloc_flags, const struct alloc_context *ac,
enum compact_priority prio, struct page **capture)
{
- int may_perform_io = (__force int)(gfp_mask & __GFP_IO);
struct zoneref *z;
struct zone *zone;
enum compact_result rc = COMPACT_SKIPPED;
- /*
- * Check if the GFP flags allow compaction - GFP_NOIO is really
- * tricky context because the migration might require IO
- */
- if (!may_perform_io)
+ if (!gfp_compaction_allowed(gfp_mask))
return COMPACT_SKIPPED;
trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio);
--- a/mm/page_alloc.c~mm-vmscan-prevent-infinite-loop-for-costly-gfp_noio-__gfp_retry_mayfail-allocations
+++ a/mm/page_alloc.c
@@ -4041,6 +4041,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, u
struct alloc_context *ac)
{
bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
+ bool can_compact = gfp_compaction_allowed(gfp_mask);
const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
struct page *page = NULL;
unsigned int alloc_flags;
@@ -4111,7 +4112,7 @@ restart:
* Don't try this for allocations that are allowed to ignore
* watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen.
*/
- if (can_direct_reclaim &&
+ if (can_direct_reclaim && can_compact &&
(costly_order ||
(order > 0 && ac->migratetype != MIGRATE_MOVABLE))
&& !gfp_pfmemalloc_allowed(gfp_mask)) {
@@ -4209,9 +4210,10 @@ retry:
/*
* Do not retry costly high order allocations unless they are
- * __GFP_RETRY_MAYFAIL
+ * __GFP_RETRY_MAYFAIL and we can compact
*/
- if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
+ if (costly_order && (!can_compact ||
+ !(gfp_mask & __GFP_RETRY_MAYFAIL)))
goto nopage;
if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
@@ -4224,7 +4226,7 @@ retry:
* implementation of the compaction depends on the sufficient amount
* of free memory (see __compaction_suitable)
*/
- if (did_some_progress > 0 &&
+ if (did_some_progress > 0 && can_compact &&
should_compact_retry(ac, order, alloc_flags,
compact_result, &compact_priority,
&compaction_retries))
--- a/mm/vmscan.c~mm-vmscan-prevent-infinite-loop-for-costly-gfp_noio-__gfp_retry_mayfail-allocations
+++ a/mm/vmscan.c
@@ -5753,7 +5753,7 @@ static void shrink_lruvec(struct lruvec
/* Use reclaim/compaction for costly allocs or under memory pressure */
static bool in_reclaim_compaction(struct scan_control *sc)
{
- if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
+ if (gfp_compaction_allowed(sc->gfp_mask) && sc->order &&
(sc->order > PAGE_ALLOC_COSTLY_ORDER ||
sc->priority < DEF_PRIORITY - 2))
return true;
@@ -5998,6 +5998,9 @@ static inline bool compaction_ready(stru
{
unsigned long watermark;
+ if (!gfp_compaction_allowed(sc->gfp_mask))
+ return false;
+
/* Allocation can already succeed, nothing to do */
if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone),
sc->reclaim_idx, 0))
_
Patches currently in -mm which might be from vbabka(a)suse.cz are
mm-vmscan-prevent-infinite-loop-for-costly-gfp_noio-__gfp_retry_mayfail-allocations.patch
mm-document-memalloc_noreclaim_save-and-memalloc_pin_save.patch
mm-document-memalloc_noreclaim_save-and-memalloc_pin_save-v2.patch
We go through quite a bit of trouble to make sure we pick up any
preallocated LPI tables on the redistributors, as enabling LPIs is a
one-way switch. There is no such restriction for vLPIs, and for GICv4.1
we expect to allocate a new vPE table at boot.
This works as intended when initializing an ITS, however when setting up
a redistributor in its_cpu_init_lpis() the early return for preallocated
RD tables skips straight past GICv4 setup. This all comes to a head when
trying to kexec into a new kernel, as the new kernel silently fails to
set up GICv4, leading to a complete loss of SGIs and LPIs for KVM VMs
(ouch).
Slap a band-aid on the problem by ensuring its_cpu_init_lpis() always
initializes GICv4 on the way out, even if the other RD tables were
preallocated.
Cc: stable(a)vger.kernel.org
Fixes: 6479450f72c1 ("irqchip/gic-v4: Fix occasional VLPI drop")
Reported-by: George Cherian <gcherian(a)marvell.com>
Co-developed-by: Marc Zyngier <maz(a)kernel.org>
Signed-off-by: Marc Zyngier <maz(a)kernel.org>
Signed-off-by: Oliver Upton <oliver.upton(a)linux.dev>
---
I debated a bit on the fixes tag between the blamed commit and commit
5e5168461c22 ("irqchip/gic-v4.1: VPE table (aka GICR_VPROPBASER)
allocation"), although it would appear GICv4 could be left in an unknown
state after kexec too.
drivers/irqchip/irq-gic-v3-its.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index d097001c1e3e..0022852ce494 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -3172,6 +3172,7 @@ static void its_cpu_init_lpis(void)
val |= GICR_CTLR_ENABLE_LPIS;
writel_relaxed(val, rbase + GICR_CTLR);
+out:
if (gic_rdists->has_vlpis && !gic_rdists->has_rvpeid) {
void __iomem *vlpi_base = gic_data_rdist_vlpi_base();
@@ -3207,7 +3208,6 @@ static void its_cpu_init_lpis(void)
/* Make sure the GIC has seen the above */
dsb(sy);
-out:
gic_data_rdist()->flags |= RD_LOCAL_LPI_ENABLED;
pr_info("GICv3: CPU%d: using %s LPI pending table @%pa\n",
smp_processor_id(),
--
2.44.0.rc0.258.g7320e95886-goog
Hi all,
Eric reported that builds of LLVM with [1] (close to tip of tree) have
CONFIG_AS_HAS_OPTION_ARCH=n because the test for expected failure on
invalid input has started succeeding.
This Kconfig test was added because '.option arch' only causes an
assembler warning when it is unsupported, rather than a hard error,
which is what users of as-instr expect when something is unsupported.
This can be resolved by turning assembler warnings into errors with
'-Wa,--fatal-warnings' like we do with the compiler with '-Werror',
which is what the first patch does. The second patch removes the invalid
test, as the valid test is good enough with fatal warnings.
I have diffed several configurations for the different architectures
that use as-instr and I have found no issues.
I think this could go in through either the kbuild or RISC-V tree with
sufficient acks but I will let them fight over who takes it :)
[1]: https://github.com/llvm/llvm-project/commit/3ac9fe69f70a2b3541266daedbaaa7d…
---
Nathan Chancellor (2):
kbuild: Add -Wa,--fatal-warnings to as-instr invocation
RISC-V: Drop invalid test from CONFIG_AS_HAS_OPTION_ARCH
arch/riscv/Kconfig | 1 -
scripts/Kconfig.include | 2 +-
scripts/Makefile.compiler | 2 +-
3 files changed, 2 insertions(+), 3 deletions(-)
---
base-commit: 6613476e225e090cc9aad49be7fa504e290dd33d
change-id: 20240124-fix-riscv-option-arch-llvm-18-3cbe7b09a216
Best regards,
--
Nathan Chancellor <nathan(a)kernel.org>
From: Konstantin Komarov <almaz.alexandrovich(a)paragon-software.com>
[ Upstream commit 22457c047ed971f2f2e33be593ddfabd9639a409 ]
Unfortunately, the reparse attribute is used for many purposes (several dozen).
It is not possible to know here whether this name is a symlink or not.
To get the exact type of the name we have to open the inode (read the MFT).
getattr for an opened file (fstat) correctly returns symlink.
Signed-off-by: Konstantin Komarov <almaz.alexandrovich(a)paragon-software.com>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
fs/ntfs3/dir.c | 30 +++++++++++++++++++++++++-----
1 file changed, 25 insertions(+), 5 deletions(-)
diff --git a/fs/ntfs3/dir.c b/fs/ntfs3/dir.c
index d4d9f4ffb6d9..c2fb76bb28f4 100644
--- a/fs/ntfs3/dir.c
+++ b/fs/ntfs3/dir.c
@@ -309,11 +309,31 @@ static inline int ntfs_filldir(struct ntfs_sb_info *sbi, struct ntfs_inode *ni,
return 0;
}
- /* NTFS: symlinks are "dir + reparse" or "file + reparse" */
- if (fname->dup.fa & FILE_ATTRIBUTE_REPARSE_POINT)
- dt_type = DT_LNK;
- else
- dt_type = (fname->dup.fa & FILE_ATTRIBUTE_DIRECTORY) ? DT_DIR : DT_REG;
+ /*
+ * NTFS: symlinks are "dir + reparse" or "file + reparse"
+ * Unfortunately reparse attribute is used for many purposes (several dozens).
+ * It is not possible here to know is this name symlink or not.
+ * To get exactly the type of name we should to open inode (read mft).
+ * getattr for opened file (fstat) correctly returns symlink.
+ */
+ dt_type = (fname->dup.fa & FILE_ATTRIBUTE_DIRECTORY) ? DT_DIR : DT_REG;
+
+ /*
+ * It is not reliable to detect the type of name using duplicated information
+ * stored in parent directory.
+ * The only correct way to get the type of name - read MFT record and find ATTR_STD.
+ * The code below is not good idea.
+ * It does additional locks/reads just to get the type of name.
+ * Should we use additional mount option to enable branch below?
+ */
+ if ((fname->dup.fa & FILE_ATTRIBUTE_REPARSE_POINT) &&
+ ino != ni->mi.rno) {
+ struct inode *inode = ntfs_iget5(sbi->sb, &e->ref, NULL);
+ if (!IS_ERR_OR_NULL(inode)) {
+ dt_type = fs_umode_to_dtype(inode->i_mode);
+ iput(inode);
+ }
+ }
return !dir_emit(ctx, (s8 *)name, name_len, ino, dt_type);
}
--
2.43.0
From: Damien Le Moal <dlemoal(a)kernel.org>
For regular system shutdown, ata_dev_power_set_standby() will be
executed twice: once when the SCSI device is removed and again when
ata_pci_shutdown_one() executes and EH completes unloading the devices.
Make the second call to ata_dev_power_set_standby() do nothing by using
ata_dev_power_is_active() and return if the device is already in
standby.
Fixes: 2da4c5e24e86 ("ata: libata-core: Improve ata_dev_power_set_active()")
Cc: stable(a)vger.kernel.org
Signed-off-by: Damien Le Moal <dlemoal(a)kernel.org>
Signed-off-by: Niklas Cassel <cassel(a)kernel.org>
---
Changes since V1: Move the function instead of using a forward declaration.
drivers/ata/libata-core.c | 59 ++++++++++++++++++++-------------------
1 file changed, 30 insertions(+), 29 deletions(-)
diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index d9f80f4f70f5..be3412cdb22e 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -2001,6 +2001,33 @@ bool ata_dev_power_init_tf(struct ata_device *dev, struct ata_taskfile *tf,
return true;
}
+static bool ata_dev_power_is_active(struct ata_device *dev)
+{
+ struct ata_taskfile tf;
+ unsigned int err_mask;
+
+ ata_tf_init(dev, &tf);
+ tf.flags |= ATA_TFLAG_DEVICE | ATA_TFLAG_ISADDR;
+ tf.protocol = ATA_PROT_NODATA;
+ tf.command = ATA_CMD_CHK_POWER;
+
+ err_mask = ata_exec_internal(dev, &tf, NULL, DMA_NONE, NULL, 0, 0);
+ if (err_mask) {
+ ata_dev_err(dev, "Check power mode failed (err_mask=0x%x)\n",
+ err_mask);
+ /*
+ * Assume we are in standby mode so that we always force a
+ * spinup in ata_dev_power_set_active().
+ */
+ return false;
+ }
+
+ ata_dev_dbg(dev, "Power mode: 0x%02x\n", tf.nsect);
+
+ /* Active or idle */
+ return tf.nsect == 0xff;
+}
+
/**
* ata_dev_power_set_standby - Set a device power mode to standby
* @dev: target device
@@ -2017,8 +2044,9 @@ void ata_dev_power_set_standby(struct ata_device *dev)
struct ata_taskfile tf;
unsigned int err_mask;
- /* If the device is already sleeping, do nothing. */
- if (dev->flags & ATA_DFLAG_SLEEPING)
+ /* If the device is already sleeping or in standby, do nothing. */
+ if ((dev->flags & ATA_DFLAG_SLEEPING) ||
+ !ata_dev_power_is_active(dev))
return;
/*
@@ -2046,33 +2074,6 @@ void ata_dev_power_set_standby(struct ata_device *dev)
err_mask);
}
-static bool ata_dev_power_is_active(struct ata_device *dev)
-{
- struct ata_taskfile tf;
- unsigned int err_mask;
-
- ata_tf_init(dev, &tf);
- tf.flags |= ATA_TFLAG_DEVICE | ATA_TFLAG_ISADDR;
- tf.protocol = ATA_PROT_NODATA;
- tf.command = ATA_CMD_CHK_POWER;
-
- err_mask = ata_exec_internal(dev, &tf, NULL, DMA_NONE, NULL, 0, 0);
- if (err_mask) {
- ata_dev_err(dev, "Check power mode failed (err_mask=0x%x)\n",
- err_mask);
- /*
- * Assume we are in standby mode so that we always force a
- * spinup in ata_dev_power_set_active().
- */
- return false;
- }
-
- ata_dev_dbg(dev, "Power mode: 0x%02x\n", tf.nsect);
-
- /* Active or idle */
- return tf.nsect == 0xff;
-}
-
/**
* ata_dev_power_set_active - Set a device power mode to active
* @dev: target device
--
2.43.2
The following commit has been merged into the irq/urgent branch of tip:
Commit-ID: fb33a46cd75e18773dd5a414744507d84ae90870
Gitweb: https://git.kernel.org/tip/fb33a46cd75e18773dd5a414744507d84ae90870
Author: Chen Jun <chenjun102(a)huawei.com>
AuthorDate: Tue, 20 Feb 2024 19:14:29 +08:00
Committer: Thomas Gleixner <tglx(a)linutronix.de>
CommitterDate: Wed, 21 Feb 2024 18:40:00 +01:00
irqchip/mbigen: Don't use bus_get_dev_root() to find the parent
bus_get_dev_root() returns sp->dev_root which is set in subsys_register(),
but subsys_register() is not called by platform_bus_init().
Therefore, for the platform_bus_type, bus_get_dev_root() always returns NULL.
This makes mbigen_of_create_domain() always return -ENODEV.
Don't try to retrieve the parent via bus_get_dev_root() and
unconditionally hand a NULL pointer to of_platform_device_create() to
fix this.
Fixes: fea087fc291b ("irqchip/mbigen: move to use bus_get_dev_root()")
Signed-off-by: Chen Jun <chenjun102(a)huawei.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20240220111429.110666-1-chenjun102@huawei.com
---
drivers/irqchip/irq-mbigen.c | 8 +-------
1 file changed, 1 insertion(+), 7 deletions(-)
diff --git a/drivers/irqchip/irq-mbigen.c b/drivers/irqchip/irq-mbigen.c
index 5101a3f..58881d3 100644
--- a/drivers/irqchip/irq-mbigen.c
+++ b/drivers/irqchip/irq-mbigen.c
@@ -235,22 +235,17 @@ static const struct irq_domain_ops mbigen_domain_ops = {
static int mbigen_of_create_domain(struct platform_device *pdev,
struct mbigen_device *mgn_chip)
{
- struct device *parent;
struct platform_device *child;
struct irq_domain *domain;
struct device_node *np;
u32 num_pins;
int ret = 0;
- parent = bus_get_dev_root(&platform_bus_type);
- if (!parent)
- return -ENODEV;
-
for_each_child_of_node(pdev->dev.of_node, np) {
if (!of_property_read_bool(np, "interrupt-controller"))
continue;
- child = of_platform_device_create(np, NULL, parent);
+ child = of_platform_device_create(np, NULL, NULL);
if (!child) {
ret = -ENOMEM;
break;
@@ -273,7 +268,6 @@ static int mbigen_of_create_domain(struct platform_device *pdev,
}
}
- put_device(parent);
if (ret)
of_node_put(np);
The is_psr_su parameter is a boolean flag indicating whether the Panel
Self Refresh Selective Update (PSR SU) feature is enabled which is a
power-saving feature that allows only the updated regions of the screen
to be refreshed, reducing the amount of data that needs to be sent to
the display.
Fixes the below with gcc W=1:
drivers/gpu/drm/amd/amdgpu/../display/amdgpu_dm/amdgpu_dm.c:5257: warning: Function parameter or member 'is_psr_su' not described in 'fill_dc_dirty_rects'
Fixes: 13d6b0812e58 ("drm/amdgpu: make damage clips support configurable")
Cc: stable(a)vger.kernel.org
Cc: Hamza Mahfooz <hamza.mahfooz(a)amd.com>
Cc: Mario Limonciello <mario.limonciello(a)amd.com>
Cc: Rodrigo Siqueira <Rodrigo.Siqueira(a)amd.com>
Cc: Aurabindo Pillai <aurabindo.pillai(a)amd.com>
Signed-off-by: Srinivasan Shanmugam <srinivasan.shanmugam(a)amd.com>
---
drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index b9ac3d2f8029..1b51f7fb48ea 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -5234,6 +5234,10 @@ static inline void fill_dc_dirty_rect(struct drm_plane *plane,
* @new_plane_state: New state of @plane
* @crtc_state: New state of CRTC connected to the @plane
* @flip_addrs: DC flip tracking struct, which also tracts dirty rects
+ * @is_psr_su: Flag indicating whether Panel Self Refresh Selective Update (PSR SU) is enabled.
+ * If PSR SU is enabled and damage clips are available, only the regions of the screen
+ * that have changed will be updated. If PSR SU is not enabled,
+ * or if damage clips are not available, the entire screen will be updated.
* @dirty_regions_changed: dirty regions changed
*
* For PSR SU, DC informs the DMUB uController of dirty rectangle regions
--
2.34.1
Hi Greg,
I believe these patches are likely already on your radar. I just wanted to
inform you that it would be highly appreciated if we could see their inclusion
in the upcoming release.
e0526ec5360a 2024-01-30 hv_netvsc: Fix race condition between
netvsc_probe and netvsc_remove [Jakub Kicinski]
We would also like to get:
9cae43da9867 hv_netvsc: Register VF in netvsc_probe if
NET_DEVICE_REGISTER missed
included, but the patch is still in netdev and has not made it into
Linus's tree. If it does come in,
could you please consider including it too?
Thanks,
- Allen
The current logic is probably fine but is a bit convoluted. Plus, we
don't want partial pages to be part of the sequential operation just in
case the core would optimize the page read with a subpage read (which
would break the sequence). This may happen on the first and last page
only, so if the start offset or the end offset is not aligned with a
page boundary, better avoid them to prevent any risk.
Cc: stable(a)vger.kernel.org
Fixes: 003fe4b9545b ("mtd: rawnand: Support for sequential cache reads")
Signed-off-by: Miquel Raynal <miquel.raynal(a)bootlin.com>
---
drivers/mtd/nand/raw/nand_base.c | 26 +++++++++++++++++---------
1 file changed, 17 insertions(+), 9 deletions(-)
diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c
index 139fdf3e58c0..bbdcfbe643f3 100644
--- a/drivers/mtd/nand/raw/nand_base.c
+++ b/drivers/mtd/nand/raw/nand_base.c
@@ -3460,21 +3460,29 @@ static void rawnand_enable_cont_reads(struct nand_chip *chip, unsigned int page,
u32 readlen, int col)
{
struct mtd_info *mtd = nand_to_mtd(chip);
+ unsigned int end_page, end_col;
+
+ chip->cont_read.ongoing = false;
if (!chip->controller->supported_op.cont_read)
return;
- if ((col && col + readlen < (3 * mtd->writesize)) ||
- (!col && readlen < (2 * mtd->writesize))) {
- chip->cont_read.ongoing = false;
- return;
- }
+ end_page = DIV_ROUND_UP(col + readlen, mtd->writesize);
+ end_col = (col + readlen) % mtd->writesize;
- chip->cont_read.ongoing = true;
- chip->cont_read.first_page = page;
if (col)
- chip->cont_read.first_page++;
- chip->cont_read.last_page = page + ((readlen >> chip->page_shift) & chip->pagemask);
+ page++;
+
+ if (end_col && end_page)
+ end_page--;
+
+ if (page + 1 > end_page)
+ return;
+
+ chip->cont_read.first_page = page;
+ chip->cont_read.last_page = end_page;
+ chip->cont_read.ongoing = true;
+
rawnand_cap_cont_reads(chip);
}
--
2.34.1
On Wed, Feb 21, 2024 at 1:33 PM Jason A. Donenfeld <Jason(a)zx2c4.com> wrote:
>
> There are few uses of CoCo that don't rely on working cryptography and
> hence a working RNG. Unfortunately, the CoCo threat model means that the
> VM host cannot be trusted and may actively work against guests to
> extract secrets or manipulate computation. Since a malicious host can
> modify or observe nearly all inputs to guests, the only remaining source
> of entropy for CoCo guests is RDRAND.
>
> If RDRAND is broken -- due to CPU hardware fault -- the RNG as a whole
> is meant to gracefully continue on gathering entropy from other sources,
> but since there aren't other sources on CoCo, this is catastrophic.
> This is mostly a concern at boot time when initially seeding the RNG, as
> after that the consequences of a broken RDRAND are much more
> theoretical.
>
> So, try at boot to seed the RNG using 256 bits of RDRAND output. If this
> fails, panic(). This will also trigger if the system is booted without
> RDRAND, as RDRAND is essential for a safe CoCo boot.
>
> This patch is deliberately written to be "just a CoCo x86 driver
> feature" and not part of the RNG itself. Many device drivers and
> platforms have some desire to contribute something to the RNG, and
> add_device_randomness() is specifically meant for this purpose. Any
> driver can call this with seed data of any quality, or even garbage
> quality, and it can only possibly make the quality of the RNG better or
> have no effect, but can never make it worse. Rather than trying to
> build something into the core of the RNG, this patch interprets the
> particular CoCo issue as just a CoCo issue, and therefore separates this
> all out into driver (well, arch/platform) code.
>
> Cc: Borislav Petkov <bp(a)alien8.de>
> Cc: Daniel P. Berrangé <berrange(a)redhat.com>
> Cc: Dave Hansen <dave.hansen(a)linux.intel.com>
> Cc: Elena Reshetova <elena.reshetova(a)intel.com>
> Cc: H. Peter Anvin <hpa(a)zytor.com>
> Cc: Ingo Molnar <mingo(a)redhat.com>
> Cc: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
> Cc: Theodore Ts'o <tytso(a)mit.edu>
> Cc: Thomas Gleixner <tglx(a)linutronix.de>
> Signed-off-by: Jason A. Donenfeld <Jason(a)zx2c4.com>
Also,
Cc: stable(a)vger.kernel.org
At least, I think that's probably what we want, though I don't know
what version range is relevant for CoCo.
Syzkaller reports a potential circular dependency leading to deadlock
in 5.10 and 5.15 stable releases since the commit
92d4abd66f70 ("Bluetooth: vhci: Fix race when opening vhci device")
that caused this crash was backported to these branches.
The problem has been fixed by the following upstream patch that was
adapted to 5.10 and 5.15. All of the changes made to the patch
in order to adapt it are described at the end of commit message.
This patch has already been backported to the following stable branches:
v6.6 - https://lore.kernel.org/stable/20231230115814.038261305@linuxfoundation.org/
v6.1 - https://lore.kernel.org/stable/20231230115807.749489379@linuxfoundation.org/
Found by Linux Verification Center (linuxtesting.org) with Syzkaller.
From: Max Krummenacher <max.krummenacher(a)toradex.com>
Hello
With the backported commit e09ff743e30b ("mtd: rawnand: gpmi: Set
WAIT_FOR_READY timeout based on program/erase times") in kernel 5.4.y
I see corruption of the NAND content during kernel boot.
Reverting said commit on top of current 5.4.y fixes the issue.
It seems that the commit relies on commit 71c76f56b97c ("mtd:
rawnand: gpmi: Fix setting busy timeout setting"), but its
backport got reverted.
One should either backport both commits or none, having only one
results in potential bugs.
I've seen it in 5.4.y; however, in 5.10.y and 5.15.y one of
the two backports is also reverted, so the same regression likely
exists there.
Any comments?
Max
Max Krummenacher (1):
Revert "Revert "mtd: rawnand: gpmi: Fix setting busy timeout setting""
drivers/mtd/nand/raw/gpmi-nand/gpmi-nand.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--
2.42.0
From: Dan Carpenter <dan.carpenter(a)linaro.org>
commit c301f0981fdd3fd1ffac6836b423c4d7a8e0eb63 upstream.
The problem is in nft_byteorder_eval() where we are iterating through a
loop and writing to dst[0], dst[1], dst[2] and so on... On each
iteration we are writing 8 bytes. But dst[] is an array of u32 so each
element only has space for 4 bytes. That means that every iteration
overwrites part of the previous element.
I spotted this bug while reviewing commit caf3ef7468f7 ("netfilter:
nf_tables: prevent OOB access in nft_byteorder_eval") which is a related
issue. I think that the reason we have not detected this bug in testing
is that most of the time we only write one element.
Fixes: ce1e7989d989 ("netfilter: nft_byteorder: provide 64bit le/be conversion")
Signed-off-by: Dan Carpenter <dan.carpenter(a)linaro.org>
Signed-off-by: Pablo Neira Ayuso <pablo(a)netfilter.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
[Ajay: Modified to apply on v4.19.y]
Signed-off-by: Ajay Kaher <ajay.kaher(a)broadcom.com>
---
net/netfilter/nft_byteorder.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c
index dba1612..8c4ee49 100644
--- a/net/netfilter/nft_byteorder.c
+++ b/net/netfilter/nft_byteorder.c
@@ -41,19 +41,20 @@ static void nft_byteorder_eval(const struct nft_expr *expr,
switch (priv->size) {
case 8: {
+ u64 *dst64 = (void *)dst;
u64 src64;
switch (priv->op) {
case NFT_BYTEORDER_NTOH:
for (i = 0; i < priv->len / 8; i++) {
src64 = get_unaligned((u64 *)&src[i]);
- put_unaligned_be64(src64, &dst[i]);
+ put_unaligned_be64(src64, &dst64[i]);
}
break;
case NFT_BYTEORDER_HTON:
for (i = 0; i < priv->len / 8; i++) {
src64 = get_unaligned_be64(&src[i]);
- put_unaligned(src64, (u64 *)&dst[i]);
+ put_unaligned(src64, &dst64[i]);
}
break;
}
--
2.7.4
From: Alfred Piccioni <alpic(a)google.com>
commit f1bb47a31dff6d4b34fb14e99850860ee74bb003 upstream.
[Please apply to 5.4-stable and 4.19-stable. The upstream commit failed
to apply to these kernels. This patch resolves the conflicts.]
Some ioctl commands do not require ioctl permission, but are routed to
other permissions such as FILE_GETATTR or FILE_SETATTR. This routing is
done by comparing the ioctl cmd to a set of 64-bit flags (FS_IOC_*).
However, if a 32-bit process is running on a 64-bit kernel, it emits
32-bit flags (FS_IOC32_*) for certain ioctl operations. These flags are
being checked erroneously, which leads to these ioctl operations being
routed to the ioctl permission, rather than the correct file
permissions.
This was also noted in a RED-PEN finding from a while back -
"/* RED-PEN how should LSM module know it's handling 32bit? */".
This patch introduces a new hook, security_file_ioctl_compat(), that is
called from the compat ioctl syscall. All current LSMs have been changed
to support this hook.
Reviewing the three places where we are currently using
security_file_ioctl(), it appears that only SELinux needs a dedicated
compat change; TOMOYO and SMACK appear to be functional without any
change.
Cc: stable(a)vger.kernel.org
Fixes: 0b24dcb7f2f7 ("Revert "selinux: simplify ioctl checking"")
Signed-off-by: Alfred Piccioni <alpic(a)google.com>
Reviewed-by: Stephen Smalley <stephen.smalley.work(a)gmail.com>
[PM: subject tweak, line length fixes, and alignment corrections]
Signed-off-by: Paul Moore <paul(a)paul-moore.com>
Signed-off-by: Eric Biggers <ebiggers(a)google.com>
---
fs/compat_ioctl.c | 3 +--
include/linux/lsm_hooks.h | 9 +++++++++
include/linux/security.h | 9 +++++++++
security/security.c | 17 +++++++++++++++++
security/selinux/hooks.c | 28 ++++++++++++++++++++++++++++
security/smack/smack_lsm.c | 1 +
security/tomoyo/tomoyo.c | 1 +
7 files changed, 66 insertions(+), 2 deletions(-)
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 8fcc53d83af2d..22f7dc6688dee 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -994,8 +994,7 @@ COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd,
if (!f.file)
goto out;
- /* RED-PEN how should LSM module know it's handling 32bit? */
- error = security_file_ioctl(f.file, cmd, arg);
+ error = security_file_ioctl_compat(f.file, cmd, arg);
if (error)
goto out_fput;
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index a21dc5413653e..0f4897e97c70f 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -498,6 +498,12 @@
* simple integer value. When @arg represents a user space pointer, it
* should never be used by the security module.
* Return 0 if permission is granted.
+ * @file_ioctl_compat:
+ * @file contains the file structure.
+ * @cmd contains the operation to perform.
+ * @arg contains the operational arguments.
+ * Check permission for a compat ioctl operation on @file.
+ * Return 0 if permission is granted.
* @mmap_addr :
* Check permissions for a mmap operation at @addr.
* @addr contains virtual address that will be used for the operation.
@@ -1602,6 +1608,8 @@ union security_list_options {
void (*file_free_security)(struct file *file);
int (*file_ioctl)(struct file *file, unsigned int cmd,
unsigned long arg);
+ int (*file_ioctl_compat)(struct file *file, unsigned int cmd,
+ unsigned long arg);
int (*mmap_addr)(unsigned long addr);
int (*mmap_file)(struct file *file, unsigned long reqprot,
unsigned long prot, unsigned long flags);
@@ -1907,6 +1915,7 @@ struct security_hook_heads {
struct hlist_head file_alloc_security;
struct hlist_head file_free_security;
struct hlist_head file_ioctl;
+ struct hlist_head file_ioctl_compat;
struct hlist_head mmap_addr;
struct hlist_head mmap_file;
struct hlist_head file_mprotect;
diff --git a/include/linux/security.h b/include/linux/security.h
index aa5c7141c8d17..1a99958b850b5 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -362,6 +362,8 @@ int security_file_permission(struct file *file, int mask);
int security_file_alloc(struct file *file);
void security_file_free(struct file *file);
int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+int security_file_ioctl_compat(struct file *file, unsigned int cmd,
+ unsigned long arg);
int security_mmap_file(struct file *file, unsigned long prot,
unsigned long flags);
int security_mmap_addr(unsigned long addr);
@@ -907,6 +909,13 @@ static inline int security_file_ioctl(struct file *file, unsigned int cmd,
return 0;
}
+static inline int security_file_ioctl_compat(struct file *file,
+ unsigned int cmd,
+ unsigned long arg)
+{
+ return 0;
+}
+
static inline int security_mmap_file(struct file *file, unsigned long prot,
unsigned long flags)
{
diff --git a/security/security.c b/security/security.c
index 460c3826f6401..6c06296548c21 100644
--- a/security/security.c
+++ b/security/security.c
@@ -1422,6 +1422,23 @@ int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
return call_int_hook(file_ioctl, 0, file, cmd, arg);
}
+/**
+ * security_file_ioctl_compat() - Check if an ioctl is allowed in compat mode
+ * @file: associated file
+ * @cmd: ioctl cmd
+ * @arg: ioctl arguments
+ *
+ * Compat version of security_file_ioctl() that correctly handles 32-bit
+ * processes running on 64-bit kernels.
+ *
+ * Return: Returns 0 if permission is granted.
+ */
+int security_file_ioctl_compat(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ return call_int_hook(file_ioctl_compat, 0, file, cmd, arg);
+}
+
static inline unsigned long mmap_prot(struct file *file, unsigned long prot)
{
/*
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index c1bf319b459a9..6fec9fba41a84 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -3668,6 +3668,33 @@ static int selinux_file_ioctl(struct file *file, unsigned int cmd,
return error;
}
+static int selinux_file_ioctl_compat(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ /*
+ * If we are in a 64-bit kernel running 32-bit userspace, we need to
+ * make sure we don't compare 32-bit flags to 64-bit flags.
+ */
+ switch (cmd) {
+ case FS_IOC32_GETFLAGS:
+ cmd = FS_IOC_GETFLAGS;
+ break;
+ case FS_IOC32_SETFLAGS:
+ cmd = FS_IOC_SETFLAGS;
+ break;
+ case FS_IOC32_GETVERSION:
+ cmd = FS_IOC_GETVERSION;
+ break;
+ case FS_IOC32_SETVERSION:
+ cmd = FS_IOC_SETVERSION;
+ break;
+ default:
+ break;
+ }
+
+ return selinux_file_ioctl(file, cmd, arg);
+}
+
static int default_noexec;
static int file_map_prot_check(struct file *file, unsigned long prot, int shared)
@@ -6933,6 +6960,7 @@ static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = {
LSM_HOOK_INIT(file_permission, selinux_file_permission),
LSM_HOOK_INIT(file_alloc_security, selinux_file_alloc_security),
LSM_HOOK_INIT(file_ioctl, selinux_file_ioctl),
+ LSM_HOOK_INIT(file_ioctl_compat, selinux_file_ioctl_compat),
LSM_HOOK_INIT(mmap_file, selinux_mmap_file),
LSM_HOOK_INIT(mmap_addr, selinux_mmap_addr),
LSM_HOOK_INIT(file_mprotect, selinux_file_mprotect),
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 9e48c8b36b678..6f2613f874fa9 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -4648,6 +4648,7 @@ static struct security_hook_list smack_hooks[] __lsm_ro_after_init = {
LSM_HOOK_INIT(file_alloc_security, smack_file_alloc_security),
LSM_HOOK_INIT(file_ioctl, smack_file_ioctl),
+ LSM_HOOK_INIT(file_ioctl_compat, smack_file_ioctl),
LSM_HOOK_INIT(file_lock, smack_file_lock),
LSM_HOOK_INIT(file_fcntl, smack_file_fcntl),
LSM_HOOK_INIT(mmap_file, smack_mmap_file),
diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c
index 716c92ec941ad..0176612bac967 100644
--- a/security/tomoyo/tomoyo.c
+++ b/security/tomoyo/tomoyo.c
@@ -554,6 +554,7 @@ static struct security_hook_list tomoyo_hooks[] __lsm_ro_after_init = {
LSM_HOOK_INIT(path_rename, tomoyo_path_rename),
LSM_HOOK_INIT(inode_getattr, tomoyo_inode_getattr),
LSM_HOOK_INIT(file_ioctl, tomoyo_file_ioctl),
+ LSM_HOOK_INIT(file_ioctl_compat, tomoyo_file_ioctl),
LSM_HOOK_INIT(path_chmod, tomoyo_path_chmod),
LSM_HOOK_INIT(path_chown, tomoyo_path_chown),
LSM_HOOK_INIT(path_chroot, tomoyo_path_chroot),
base-commit: f0602893f43a54097fcf22bd8c2f7b8e75ca643e
--
2.43.0
From: "Borislav Petkov (AMD)" <bp(a)alien8.de>
commit 04c3024560d3a14acd18d0a51a1d0a89d29b7eb5 upstream.
AMD does not have the requirement for a synchronization barrier when
accessing a certain group of MSRs. Do not incur that unnecessary
penalty there.
There will be a CPUID bit which explicitly states that a MFENCE is not
needed. Once that bit is added to the APM, this will be extended with
it.
While at it, move to processor.h to avoid include hell. Untangling that
file properly is a matter for another day.
Some notes on the performance aspect of why this is relevant, courtesy
of Kishon VijayAbraham <Kishon.VijayAbraham(a)amd.com>:
On an AMD Zen4 system with 96 cores, a modified ipi-bench[1] on a VM
shows x2AVIC IPI rate is 3% to 4% lower than AVIC IPI rate. The
ipi-bench is modified so that the IPIs are sent between two vCPUs in the
same CCX. This also requires pinning the vCPU to a physical core to
prevent any latencies. This simulates the use case of pinning vCPUs to
the thread of a single CCX to avoid interrupt IPI latency.
In order to avoid run-to-run variance (for both x2AVIC and AVIC), the
below configurations are done:
1) Disable Power States in BIOS (to prevent the system from going to
lower power state)
2) Run the system at fixed frequency 2500MHz (to prevent the system
from increasing the frequency when the load is more)
With the above configuration:
*) Performance measured using ipi-bench for AVIC:
Average Latency: 1124.98ns [Time to send IPI from one vCPU to another vCPU]
Cumulative throughput: 42.6759M/s [Total number of IPIs sent in a second from
48 vCPUs simultaneously]
*) Performance measured using ipi-bench for x2AVIC:
Average Latency: 1172.42ns [Time to send IPI from one vCPU to another vCPU]
Cumulative throughput: 40.9432M/s [Total number of IPIs sent in a second from
48 vCPUs simultaneously]
From above, x2AVIC latency is ~4% more than AVIC. However, the expectation is
for x2AVIC performance to be better than or equivalent to AVIC. Upon analyzing
the perf captures, it is observed that significant time is spent in
weak_wrmsr_fence() invoked by x2apic_send_IPI().
With the fix to skip weak_wrmsr_fence()
*) Performance measured using ipi-bench for x2AVIC:
Average Latency: 1117.44ns [Time to send IPI from one vCPU to another vCPU]
Cumulative throughput: 42.9608M/s [Total number of IPIs sent in a second from
48 vCPUs simultaneously]
Comparing the performance of x2AVIC with and without the fix, it can be seen
the performance improves by ~4%.
Performance captured using an unmodified ipi-bench using the 'mesh-ipi' option
with and without weak_wrmsr_fence() on a Zen4 system also showed significant
performance improvement without weak_wrmsr_fence(). The 'mesh-ipi' option ignores
CCX or CCD and just picks a random vCPU.
Average throughput (10 iterations) with weak_wrmsr_fence(),
Cumulative throughput: 4933374 IPI/s
Average throughput (10 iterations) without weak_wrmsr_fence(),
Cumulative throughput: 6355156 IPI/s
[1] https://github.com/bytedance/kvm-utils/tree/master/microbenchmark/ipi-bench
Cc: stable(a)vger.kernel.org # 6.6+
Signed-off-by: Borislav Petkov (AMD) <bp(a)alien8.de>
Link: https://lore.kernel.org/r/20230622095212.20940-1-bp@alien8.de
Signed-off-by: Kishon Vijay Abraham I <kvijayab(a)amd.com>
---
Kindly merge this patch to stable releases (v6.6+) as it's a perf optimization.
[It does not apply as is on earlier releases and has to be reworked]
arch/x86/include/asm/barrier.h | 18 ------------------
arch/x86/include/asm/cpufeatures.h | 2 +-
arch/x86/include/asm/processor.h | 18 ++++++++++++++++++
arch/x86/kernel/cpu/amd.c | 3 +++
arch/x86/kernel/cpu/common.c | 7 +++++++
arch/x86/kernel/cpu/hygon.c | 3 +++
6 files changed, 32 insertions(+), 19 deletions(-)
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index 35389b2af88e..0216f63a366b 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -81,22 +81,4 @@ do { \
#include <asm-generic/barrier.h>
-/*
- * Make previous memory operations globally visible before
- * a WRMSR.
- *
- * MFENCE makes writes visible, but only affects load/store
- * instructions. WRMSR is unfortunately not a load/store
- * instruction and is unaffected by MFENCE. The LFENCE ensures
- * that the WRMSR is not reordered.
- *
- * Most WRMSRs are full serializing instructions themselves and
- * do not require this barrier. This is only required for the
- * IA32_TSC_DEADLINE and X2APIC MSRs.
- */
-static inline void weak_wrmsr_fence(void)
-{
- asm volatile("mfence; lfence" : : : "memory");
-}
-
#endif /* _ASM_X86_BARRIER_H */
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 58cb9495e40f..0091f1008314 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -308,10 +308,10 @@
#define X86_FEATURE_SMBA (11*32+21) /* "" Slow Memory Bandwidth Allocation */
#define X86_FEATURE_BMEC (11*32+22) /* "" Bandwidth Monitoring Event Configuration */
#define X86_FEATURE_USER_SHSTK (11*32+23) /* Shadow stack support for user mode applications */
-
#define X86_FEATURE_SRSO (11*32+24) /* "" AMD BTB untrain RETs */
#define X86_FEATURE_SRSO_ALIAS (11*32+25) /* "" AMD BTB untrain RETs through aliasing */
#define X86_FEATURE_IBPB_ON_VMEXIT (11*32+26) /* "" Issue an IBPB only on VMEXIT */
+#define X86_FEATURE_APIC_MSRS_FENCE (11*32+27) /* "" IA32_TSC_DEADLINE and X2APIC MSRs need fencing */
/* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
#define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index a3669a7774ed..191f1d8f0506 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -734,4 +734,22 @@ bool arch_is_platform_page(u64 paddr);
extern bool gds_ucode_mitigated(void);
+/*
+ * Make previous memory operations globally visible before
+ * a WRMSR.
+ *
+ * MFENCE makes writes visible, but only affects load/store
+ * instructions. WRMSR is unfortunately not a load/store
+ * instruction and is unaffected by MFENCE. The LFENCE ensures
+ * that the WRMSR is not reordered.
+ *
+ * Most WRMSRs are full serializing instructions themselves and
+ * do not require this barrier. This is only required for the
+ * IA32_TSC_DEADLINE and X2APIC MSRs.
+ */
+static inline void weak_wrmsr_fence(void)
+{
+ alternative("mfence; lfence", "", ALT_NOT(X86_FEATURE_APIC_MSRS_FENCE));
+}
+
#endif /* _ASM_X86_PROCESSOR_H */
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 6e4f23f314ac..bb3efc825bf4 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -1157,6 +1157,9 @@ static void init_amd(struct cpuinfo_x86 *c)
if (!cpu_has(c, X86_FEATURE_HYPERVISOR) &&
cpu_has_amd_erratum(c, amd_erratum_1485))
msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT);
+
+ /* AMD CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */
+ clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 4e5ffc8b0e46..d98d023ae497 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1858,6 +1858,13 @@ static void identify_cpu(struct cpuinfo_x86 *c)
c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
#endif
+
+ /*
+ * Set default APIC and TSC_DEADLINE MSR fencing flag. AMD and
+ * Hygon will clear it in ->c_init() below.
+ */
+ set_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
+
/*
* Vendor-specific initialization. In this section we
* canonicalize the feature flags, meaning if there are
diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c
index a7b3ef4c4de9..6e738759779e 100644
--- a/arch/x86/kernel/cpu/hygon.c
+++ b/arch/x86/kernel/cpu/hygon.c
@@ -348,6 +348,9 @@ static void init_hygon(struct cpuinfo_x86 *c)
set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
check_null_seg_clears_base(c);
+
+ /* Hygon CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */
+ clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
}
static void cpu_detect_tlb_hygon(struct cpuinfo_x86 *c)
--
2.34.1
Change WARN_ONCE to pr_warn_once in check_map_prog_compatibility(),
because this functionality was added in 6.1 kernels and
because fuzzing with syzkaller on a kernel that was
booted with the panic_on_warn parameter
produces false-positive crashes.
Signed-off-by: Alexander Ofitserov <oficerovas(a)altlinux.org>
Cc: stable(a)vger.kernel.org
---
kernel/bpf/verifier.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 45c50ee9b0370..7a7a6e3087ba2 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -10478,7 +10478,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
verbose(env, "trace type programs can only use preallocated hash map\n");
return -EINVAL;
}
- WARN_ONCE(1, "trace type BPF program uses run-time allocation\n");
+ pr_warn_once("trace type BPF program uses run-time allocation\n");
verbose(env, "trace type programs with run-time allocated hash maps are unsafe. Switch to preallocated hash maps.\n");
}
--
2.42.1
It is possible that an LPI mapped in a different ITS gets unmapped while
handling the MOVALL command. If that is the case, there is no state that
can be migrated to the destination. Silently ignore it and continue
migrating other LPIs.
Cc: stable(a)vger.kernel.org
Fixes: ff9c114394aa ("KVM: arm/arm64: GICv4: Handle MOVALL applied to a vPE")
Signed-off-by: Oliver Upton <oliver.upton(a)linux.dev>
---
arch/arm64/kvm/vgic/vgic-its.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
index 082448de27ed..28a93074eca1 100644
--- a/arch/arm64/kvm/vgic/vgic-its.c
+++ b/arch/arm64/kvm/vgic/vgic-its.c
@@ -1435,6 +1435,8 @@ static int vgic_its_cmd_handle_movall(struct kvm *kvm, struct vgic_its *its,
for (i = 0; i < irq_count; i++) {
irq = vgic_get_irq(kvm, NULL, intids[i]);
+ if (!irq)
+ continue;
update_affinity(irq, vcpu2);
--
2.44.0.rc0.258.g7320e95886-goog
vgic_get_irq() may not return a valid descriptor if there is no ITS that
holds a valid translation for the specified INTID. If that is the case,
it is safe to silently ignore it and continue processing the LPI pending
table.
Cc: stable(a)vger.kernel.org
Fixes: 33d3bc9556a7 ("KVM: arm64: vgic-its: Read initial LPI pending table")
Signed-off-by: Oliver Upton <oliver.upton(a)linux.dev>
---
arch/arm64/kvm/vgic/vgic-its.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
index e2764d0ffa9f..082448de27ed 100644
--- a/arch/arm64/kvm/vgic/vgic-its.c
+++ b/arch/arm64/kvm/vgic/vgic-its.c
@@ -468,6 +468,9 @@ static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu)
}
irq = vgic_get_irq(vcpu->kvm, NULL, intids[i]);
+ if (!irq)
+ continue;
+
raw_spin_lock_irqsave(&irq->irq_lock, flags);
irq->pending_latch = pendmask & (1U << bit_nr);
vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
--
2.44.0.rc0.258.g7320e95886-goog
Dear Nini,
Unfortunately I forgot to add a 'Fixes' tag to the patch, if I had, then it would
have happened automatically.
Please remind me of this once kernel 6.9-rc1 is released since that will contain
the fix. Then I can post the same patch to the stable mailinglist for inclusion in
older kernels.
It has to wait until 6.9-rc1 is released though; patches need to be in mainline first
before they can be backported.
Regards,
Hans
On 21/02/2024 07:30, Nini Song (宋宛妮) wrote:
> Dear Hans,
>
> Thank you for your reply.
> Could you also help to merge the solution into v5.15? Our customer used v5.15 for MP production, which requires this solution.
>
>
> BR,
> Nini Song
> On Mon, 2024-02-05 at 13:00 +0100, Hans Verkuil wrote:
>>
>>
>> External email : Please do not click links or open attachments until you have verified the sender or the content.
>>
>> On 25/01/2024 14:28, nini.song(a)mediatek.com wrote:
>> > From: "nini.song" <nini.song(a)mediatek.com>
>> >
>> > The valid_la is used to check the length requirements,
>> > including special cases of Timer Status. If the length is
>> > shorter than 5, that means no Duration Available is returned,
>> > the message will be forced to be invalid.
>> >
>> > However, the description of Duration Available in the spec
>> > is that this parameter may be returned when these cases, or
>> > that it can be optionally return when these cases. The key
>> > words in the spec description are flexible choices.
>>
>> Good catch, the spec indeed says 'may', so dropping the check
>> in this patch is the correct thing to do.
>>
>> It's merged in our staging tree and it will appear in v6.9.
>>
>> Regards,
>>
>> Hans
>>
>> >
>> > Remove the special length check of Timer Status to fit the
>> > spec which is not compulsory about that.
>> >
>> > Signed-off-by: Nini Song <nini.song(a)mediatek.com>
>> > ---
>> > drivers/media/cec/core/cec-adap.c | 14 --------------
>> > 1 file changed, 14 deletions(-)
>> >
>> > diff --git a/drivers/media/cec/core/cec-adap.c b/drivers/media/cec/core/cec-adap.c
>> > index 5741adf09a2e..559a172ebc6c 100644
>> > --- a/drivers/media/cec/core/cec-adap.c
>> > +++ b/drivers/media/cec/core/cec-adap.c
>> > @@ -1151,20 +1151,6 @@ void cec_received_msg_ts(struct cec_adapter *adap,
>> > if (valid_la && min_len) {
>> > /* These messages have special length requirements */
>> > switch (cmd) {
>> > -case CEC_MSG_TIMER_STATUS:
>> > -if (msg->msg[2] & 0x10) {
>> > -switch (msg->msg[2] & 0xf) {
>> > -case CEC_OP_PROG_INFO_NOT_ENOUGH_SPACE:
>> > -case CEC_OP_PROG_INFO_MIGHT_NOT_BE_ENOUGH_SPACE:
>> > -if (msg->len < 5)
>> > -valid_la = false;
>> > -break;
>> > -}
>> > -} else if ((msg->msg[2] & 0xf) == CEC_OP_PROG_ERROR_DUPLICATE) {
>> > -if (msg->len < 5)
>> > -valid_la = false;
>> > -}
>> > -break;
>> > case CEC_MSG_RECORD_ON:
>> > switch (msg->msg[2]) {
>> > case CEC_OP_RECORD_SRC_OWN:
>>
>>
This is the fix of CVE-2024-23851 for kernel v6.1.
Upstream commit: https://github.com/torvalds/linux/commit/bd504bcfec41a503b32054da5472904b40…
Changed argument name "blk_mode_t" back to "fmode_t" for the old version. The argument
is not affected by the patch.
He Gao (1):
dm: limit the number of targets and parameter size area
drivers/md/dm-core.h | 2 ++
drivers/md/dm-ioctl.c | 3 ++-
drivers/md/dm-table.c | 9 +++++++--
3 files changed, 11 insertions(+), 3 deletions(-)
--
2.44.0.rc0.258.g7320e95886-goog
commit 5bc09b397cbf1221f8a8aacb1152650c9195b02b upstream.
According to a syzbot report, end_buffer_async_write(), which handles the
completion of block device writes, may detect abnormal condition of the
buffer async_write flag and cause a BUG_ON failure when using nilfs2.
Nilfs2 itself does not use end_buffer_async_write(). But, the async_write
flag is now used as a marker by commit 7f42ec394156 ("nilfs2: fix issue
with race condition of competition between segments for dirty blocks") as
a means of resolving double list insertion of dirty blocks in
nilfs_lookup_dirty_data_buffers() and nilfs_lookup_node_buffers() and the
resulting crash.
This modification is safe as long as it is used for file data and b-tree
node blocks where the page caches are independent. However, it was
irrelevant and redundant to also introduce async_write for segment summary
and super root blocks that share buffers with the backing device. This
led to the possibility that the BUG_ON check in end_buffer_async_write
would fail as described above, if independent writebacks of the backing
device occurred in parallel.
The use of async_write for segment summary buffers has already been
removed in a previous change.
Fix this issue by removing the manipulation of the async_write flag for
the remaining super root block buffer.
Link: https://lkml.kernel.org/r/20240203161645.4992-1-konishi.ryusuke@gmail.com
Fixes: 7f42ec394156 ("nilfs2: fix issue with race condition of competition between segments for dirty blocks")
Signed-off-by: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Reported-by: syzbot+5c04210f7c7f897c1e7f(a)syzkaller.appspotmail.com
Closes: https://lkml.kernel.org/r/00000000000019a97c05fd42f8c8@google.com
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
Please queue this patch to these stable trees instead of the patch
that failed to apply to them.
This patch is tailored to account for page/folio conversion and can
be applied from v4.8 to v6.7.
Also, all the builds and tests I did on each stable tree passed.
Thanks,
Ryusuke Konishi
fs/nilfs2/segment.c | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 55e31cc903d1..0f21dbcd0bfb 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1703,7 +1703,6 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci)
list_for_each_entry(bh, &segbuf->sb_payload_buffers,
b_assoc_buffers) {
- set_buffer_async_write(bh);
if (bh == segbuf->sb_super_root) {
if (bh->b_page != bd_page) {
lock_page(bd_page);
@@ -1714,6 +1713,7 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci)
}
break;
}
+ set_buffer_async_write(bh);
if (bh->b_page != fs_page) {
nilfs_begin_page_io(fs_page);
fs_page = bh->b_page;
@@ -1799,7 +1799,6 @@ static void nilfs_abort_logs(struct list_head *logs, int err)
list_for_each_entry(bh, &segbuf->sb_payload_buffers,
b_assoc_buffers) {
- clear_buffer_async_write(bh);
if (bh == segbuf->sb_super_root) {
clear_buffer_uptodate(bh);
if (bh->b_page != bd_page) {
@@ -1808,6 +1807,7 @@ static void nilfs_abort_logs(struct list_head *logs, int err)
}
break;
}
+ clear_buffer_async_write(bh);
if (bh->b_page != fs_page) {
nilfs_end_page_io(fs_page, err);
fs_page = bh->b_page;
@@ -1895,8 +1895,9 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
BIT(BH_Delay) | BIT(BH_NILFS_Volatile) |
BIT(BH_NILFS_Redirected));
- set_mask_bits(&bh->b_state, clear_bits, set_bits);
if (bh == segbuf->sb_super_root) {
+ set_buffer_uptodate(bh);
+ clear_buffer_dirty(bh);
if (bh->b_page != bd_page) {
end_page_writeback(bd_page);
bd_page = bh->b_page;
@@ -1904,6 +1905,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
update_sr = true;
break;
}
+ set_mask_bits(&bh->b_state, clear_bits, set_bits);
if (bh->b_page != fs_page) {
nilfs_end_page_io(fs_page, 0);
fs_page = bh->b_page;
--
2.39.3
On Tue, Feb 6, 2024 at 5:18 PM Saravana Kannan <saravanak(a)google.com> wrote:
>
> Similar to the existing "ports" node name, coresight device tree bindings
> have added "in-ports" and "out-ports" as standard node names for a
> collection of ports.
>
> Add support for these names to of_graph_get_port_parent() so that
> remote-endpoint parsing can find the correct parent node for these
> coresight ports too.
>
> Signed-off-by: Saravana Kannan <saravanak(a)google.com>
Greg,
I saw that you pulled the previous 2 patches in this series to 6.1,
6.6 and 6.7 kernel branches. I really should have added both of those
Fixes tag to this patch too.
Can you please pull in the patch to those stable branches too?
Thanks,
Saravana
> ---
> drivers/of/property.c | 4 +++-
> 1 file changed, 3 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/of/property.c b/drivers/of/property.c
> index 7bb2d8e290de..39a3ee1dfb58 100644
> --- a/drivers/of/property.c
> +++ b/drivers/of/property.c
> @@ -763,7 +763,9 @@ struct device_node *of_graph_get_port_parent(struct device_node *node)
> /* Walk 3 levels up only if there is 'ports' node. */
> for (depth = 3; depth && node; depth--) {
> node = of_get_next_parent(node);
> - if (depth == 2 && !of_node_name_eq(node, "ports"))
> + if (depth == 2 && !of_node_name_eq(node, "ports") &&
> + !of_node_name_eq(node, "in-ports") &&
> + !of_node_name_eq(node, "out-ports"))
> break;
> }
> return node;
> --
> 2.43.0.594.gd9cf4e227d-goog
>
In erofs_find_target_block(), when erofs_dirnamecmp() returns 0,
we do not assign the target metabuf. This makes the
erofs_put_metabuf() call at the end of the caller erofs_namei() ineffective,
leaving the refcount on the page.
As the page from metabuf (buf->page) is never put, such page cannot be
migrated or reclaimed. Fix it now by putting the metabuf from
previous loop and assigning the current metabuf to target before
returning so caller erofs_namei() can do the final put as it was
intended.
Fixes: 500edd095648 ("erofs: use meta buffers for inode lookup")
Cc: stable(a)vger.kernel.org
Signed-off-by: Sandeep Dhavale <dhavale(a)google.com>
---
fs/erofs/namei.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c
index d4f631d39f0f..bfe1c926436b 100644
--- a/fs/erofs/namei.c
+++ b/fs/erofs/namei.c
@@ -132,7 +132,10 @@ static void *erofs_find_target_block(struct erofs_buf *target,
if (!diff) {
*_ndirents = 0;
- goto out;
+ if (!IS_ERR(candidate))
+ erofs_put_metabuf(target);
+ *target = buf;
+ return de;
} else if (diff > 0) {
head = mid + 1;
startprfx = matched;
--
2.44.0.rc0.258.g7320e95886-goog
While mq_perf_tests runs with the default kselftest timeout limit, which
is 45 seconds, the test takes about 60 seconds to complete on i3.metal
AWS instances. Hence, the test always times out. Increase the timeout
to 180 seconds.
Fixes: 852c8cbf34d3 ("selftests/kselftest/runner.sh: Add 45 second timeout per test")
Cc: <stable(a)vger.kernel.org> # 5.4.x
Signed-off-by: SeongJae Park <sj(a)kernel.org>
Reviewed-by: Kees Cook <keescook(a)chromium.org>
---
Changes from v2
(https://lore.kernel.org/r/20240220000243.162285-1-sj@kernel.org)
- Update commit message about the new timeout limit to 180 seconds
- Remove wrong Link: line
Changes from v1
(https://lore.kernel.org/r/20240208212925.68286-1-sj@kernel.org)
- Use 180 seconds timeout instead of 100 seconds
tools/testing/selftests/mqueue/setting | 1 +
1 file changed, 1 insertion(+)
create mode 100644 tools/testing/selftests/mqueue/setting
diff --git a/tools/testing/selftests/mqueue/setting b/tools/testing/selftests/mqueue/setting
new file mode 100644
index 000000000000..a953c96aa16e
--- /dev/null
+++ b/tools/testing/selftests/mqueue/setting
@@ -0,0 +1 @@
+timeout=180
--
2.39.2
This is the fix of CVE-2024-23851 for kernel v5.10.
Upstream commit: https://github.com/torvalds/linux/commit/bd504bcfec41a503b32054da5472904b40…
Changed code not affected by the patch for the old version.
He Gao (1):
dm: limit the number of targets and parameter size area
drivers/md/dm-core.h | 2 ++
drivers/md/dm-ioctl.c | 3 ++-
drivers/md/dm-table.c | 9 +++++++--
3 files changed, 11 insertions(+), 3 deletions(-)
--
2.44.0.rc0.258.g7320e95886-goog
This is the fix of CVE-2024-23851 for kernel v5.15.
Upstream commit: https://github.com/torvalds/linux/commit/bd504bcfec41a503b32054da5472904b40…
Changed code not affected by the patch for the old version.
He Gao (1):
dm: limit the number of targets and parameter size area
drivers/md/dm-core.h | 2 ++
drivers/md/dm-ioctl.c | 3 ++-
drivers/md/dm-table.c | 9 +++++++--
3 files changed, 11 insertions(+), 3 deletions(-)
--
2.44.0.rc0.258.g7320e95886-goog
The quilt patch titled
Subject: mm/damon/lru_sort: fix quota status loss due to online tunings
has been removed from the -mm tree. Its filename was
mm-damon-lru_sort-fix-quota-status-loss-due-to-online-tunings.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: SeongJae Park <sj(a)kernel.org>
Subject: mm/damon/lru_sort: fix quota status loss due to online tunings
Date: Fri, 16 Feb 2024 11:40:25 -0800
For online parameters change, DAMON_LRU_SORT creates new schemes based on
latest values of the parameters and replaces the old schemes with the new
one. When creating it, the internal status of the quotas of the old
schemes is not preserved. As a result, charging of the quota starts from
zero after the online tuning. The data that was collected to estimate the
throughput of the scheme's action is also reset, and therefore the
estimation has to start from scratch again. Because the throughput
estimation is being used to convert the time quota to the effective size
quota, this could result in temporal time quota inaccuracy. It would be
recovered over time, though. In short, the quota accuracy could be
temporarily degraded after online parameters update.
Fix the problem by checking the case and copying the internal fields for
the status.
Link: https://lkml.kernel.org/r/20240216194025.9207-3-sj@kernel.org
Fixes: 40e983cca927 ("mm/damon: introduce DAMON-based LRU-lists Sorting")
Signed-off-by: SeongJae Park <sj(a)kernel.org>
Cc: <stable(a)vger.kernel.org> [6.0+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/damon/lru_sort.c | 43 +++++++++++++++++++++++++++++++++++-------
1 file changed, 36 insertions(+), 7 deletions(-)
--- a/mm/damon/lru_sort.c~mm-damon-lru_sort-fix-quota-status-loss-due-to-online-tunings
+++ a/mm/damon/lru_sort.c
@@ -185,9 +185,21 @@ static struct damos *damon_lru_sort_new_
return damon_lru_sort_new_scheme(&pattern, DAMOS_LRU_DEPRIO);
}
+static void damon_lru_sort_copy_quota_status(struct damos_quota *dst,
+ struct damos_quota *src)
+{
+ dst->total_charged_sz = src->total_charged_sz;
+ dst->total_charged_ns = src->total_charged_ns;
+ dst->charged_sz = src->charged_sz;
+ dst->charged_from = src->charged_from;
+ dst->charge_target_from = src->charge_target_from;
+ dst->charge_addr_from = src->charge_addr_from;
+}
+
static int damon_lru_sort_apply_parameters(void)
{
- struct damos *scheme;
+ struct damos *scheme, *hot_scheme, *cold_scheme;
+ struct damos *old_hot_scheme = NULL, *old_cold_scheme = NULL;
unsigned int hot_thres, cold_thres;
int err = 0;
@@ -195,18 +207,35 @@ static int damon_lru_sort_apply_paramete
if (err)
return err;
+ damon_for_each_scheme(scheme, ctx) {
+ if (!old_hot_scheme) {
+ old_hot_scheme = scheme;
+ continue;
+ }
+ old_cold_scheme = scheme;
+ }
+
hot_thres = damon_max_nr_accesses(&damon_lru_sort_mon_attrs) *
hot_thres_access_freq / 1000;
- scheme = damon_lru_sort_new_hot_scheme(hot_thres);
- if (!scheme)
+ hot_scheme = damon_lru_sort_new_hot_scheme(hot_thres);
+ if (!hot_scheme)
return -ENOMEM;
- damon_set_schemes(ctx, &scheme, 1);
+ if (old_hot_scheme)
+ damon_lru_sort_copy_quota_status(&hot_scheme->quota,
+ &old_hot_scheme->quota);
cold_thres = cold_min_age / damon_lru_sort_mon_attrs.aggr_interval;
- scheme = damon_lru_sort_new_cold_scheme(cold_thres);
- if (!scheme)
+ cold_scheme = damon_lru_sort_new_cold_scheme(cold_thres);
+ if (!cold_scheme) {
+ damon_destroy_scheme(hot_scheme);
return -ENOMEM;
- damon_add_scheme(ctx, scheme);
+ }
+ if (old_cold_scheme)
+ damon_lru_sort_copy_quota_status(&cold_scheme->quota,
+ &old_cold_scheme->quota);
+
+ damon_set_schemes(ctx, &hot_scheme, 1);
+ damon_add_scheme(ctx, cold_scheme);
return damon_set_region_biggest_system_ram_default(target,
&monitor_region_start,
_
Patches currently in -mm which might be from sj(a)kernel.org are
docs-admin-guide-mm-damon-usage-use-sysfs-interface-for-tracepoints-example.patch
mm-damon-rename-config_damon_dbgfs-to-damon_dbgfs_deprecated.patch
mm-damon-dbgfs-implement-deprecation-notice-file.patch
mm-damon-dbgfs-make-debugfs-interface-deprecation-message-a-macro.patch
docs-admin-guide-mm-damon-usage-document-deprecated-file-of-damon-debugfs-interface.patch
selftets-damon-prepare-for-monitor_on-file-renaming.patch
mm-damon-dbgfs-rename-monitor_on-file-to-monitor_on_deprecated.patch
docs-admin-guide-mm-damon-usage-update-for-monitor_on-renaming.patch
docs-translations-damon-usage-update-for-monitor_on-renaming.patch
mm-damon-sysfs-handle-state-file-inputs-for-every-sampling-interval-if-possible.patch
selftests-damon-_damon_sysfs-support-damos-quota.patch
selftests-damon-_damon_sysfs-support-damos-stats.patch
selftests-damon-_damon_sysfs-support-damos-apply-interval.patch
selftests-damon-add-a-test-for-damos-quota.patch
selftests-damon-add-a-test-for-damos-apply-intervals.patch
selftests-damon-add-a-test-for-a-race-between-target_ids_read-and-dbgfs_before_terminate.patch
selftests-damon-add-a-test-for-the-pid-leak-of-dbgfs_target_ids_write.patch
selftests-damon-_chk_dependency-get-debugfs-mount-point-from-proc-mounts.patch
docs-mm-damon-maintainer-profile-fix-reference-links-for-mm-stable-tree.patch
docs-mm-damon-move-the-list-of-damos-actions-to-design-doc.patch
docs-mm-damon-move-damon-operation-sets-list-from-the-usage-to-the-design-document.patch
docs-mm-damon-move-monitoring-target-regions-setup-detail-from-the-usage-to-the-design-document.patch
docs-admin-guide-mm-damon-usage-fix-wrong-quotas-diabling-condition.patch
mm-damon-core-set-damos_quota-esz-as-public-field-and-document.patch
mm-damon-sysfs-schemes-implement-quota-effective_bytes-file.patch
mm-damon-sysfs-implement-a-kdamond-command-for-updating-schemes-effective-quotas.patch
docs-abi-damon-document-effective_bytes-sysfs-file.patch
docs-admin-guide-mm-damon-usage-document-effective_bytes-file.patch
mm-damon-move-comments-and-fields-for-damos-quota-prioritization-to-the-end.patch
mm-damon-core-split-out-quota-goal-related-fields-to-a-struct.patch
mm-damon-core-add-multiple-goals-per-damos_quota-and-helpers-for-those.patch
mm-damon-sysfs-use-only-quota-goals.patch
mm-damon-core-remove-goal-field-of-damos_quota.patch
mm-damon-core-let-goal-specified-with-only-target-and-current-values.patch
mm-damon-core-support-multiple-metrics-for-quota-goal.patch
mm-damon-core-implement-psi-metric-damos-quota-goal.patch
mm-damon-sysfs-schemes-support-psi-based-quota-auto-tune.patch
docs-mm-damon-design-document-quota-goal-self-tuning.patch
docs-abi-damon-document-quota-goal-metric-file.patch
docs-admin-guide-mm-damon-usage-document-quota-goal-metric-file.patch
mm-damon-reclaim-implement-user-feedback-driven-quota-auto-tuning.patch
mm-damon-reclaim-implement-memory-psi-driven-quota-self-tuning.patch
docs-admin-guide-mm-damon-reclaim-document-auto-tuning-parameters.patch
The quilt patch titled
Subject: mm/damon/reclaim: fix quota stauts loss due to online tunings
has been removed from the -mm tree. Its filename was
mm-damon-reclaim-fix-quota-stauts-loss-due-to-online-tunings.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: SeongJae Park <sj(a)kernel.org>
Subject: mm/damon/reclaim: fix quota stauts loss due to online tunings
Date: Fri, 16 Feb 2024 11:40:24 -0800
Patch series "mm/damon: fix quota status loss due to online tunings".
DAMON_RECLAIM and DAMON_LRU_SORT are not preserving internal quota status
when applying new user parameters, and hence could cause temporary quota
accuracy degradation. Fix it by preserving the status.
This patch (of 2):
For online parameters change, DAMON_RECLAIM creates new scheme based on
latest values of the parameters and replaces the old scheme with the new
one. When creating it, the internal status of the quota of the old
scheme is not preserved. As a result, charging of the quota starts from
zero after the online tuning. The data that was collected to estimate the
throughput of the scheme's action is also reset, and therefore the
estimation has to start from scratch again. Because the throughput
estimation is being used to convert the time quota to the effective size
quota, this could result in temporal time quota inaccuracy. It would be
recovered over time, though. In short, the quota accuracy could be
temporarily degraded after online parameters update.
Fix the problem by checking the case and copying the internal fields for
the status.
Link: https://lkml.kernel.org/r/20240216194025.9207-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20240216194025.9207-2-sj@kernel.org
Fixes: e035c280f6df ("mm/damon/reclaim: support online inputs update")
Signed-off-by: SeongJae Park <sj(a)kernel.org>
Cc: <stable(a)vger.kernel.org> [5.19+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/damon/reclaim.c | 18 +++++++++++++++++-
1 file changed, 17 insertions(+), 1 deletion(-)
--- a/mm/damon/reclaim.c~mm-damon-reclaim-fix-quota-stauts-loss-due-to-online-tunings
+++ a/mm/damon/reclaim.c
@@ -150,9 +150,20 @@ static struct damos *damon_reclaim_new_s
&damon_reclaim_wmarks);
}
+static void damon_reclaim_copy_quota_status(struct damos_quota *dst,
+ struct damos_quota *src)
+{
+ dst->total_charged_sz = src->total_charged_sz;
+ dst->total_charged_ns = src->total_charged_ns;
+ dst->charged_sz = src->charged_sz;
+ dst->charged_from = src->charged_from;
+ dst->charge_target_from = src->charge_target_from;
+ dst->charge_addr_from = src->charge_addr_from;
+}
+
static int damon_reclaim_apply_parameters(void)
{
- struct damos *scheme;
+ struct damos *scheme, *old_scheme;
struct damos_filter *filter;
int err = 0;
@@ -164,6 +175,11 @@ static int damon_reclaim_apply_parameter
scheme = damon_reclaim_new_scheme();
if (!scheme)
return -ENOMEM;
+ if (!list_empty(&ctx->schemes)) {
+ damon_for_each_scheme(old_scheme, ctx)
+ damon_reclaim_copy_quota_status(&scheme->quota,
+ &old_scheme->quota);
+ }
if (skip_anon) {
filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true);
if (!filter) {
_
Patches currently in -mm which might be from sj(a)kernel.org are
docs-admin-guide-mm-damon-usage-use-sysfs-interface-for-tracepoints-example.patch
mm-damon-rename-config_damon_dbgfs-to-damon_dbgfs_deprecated.patch
mm-damon-dbgfs-implement-deprecation-notice-file.patch
mm-damon-dbgfs-make-debugfs-interface-deprecation-message-a-macro.patch
docs-admin-guide-mm-damon-usage-document-deprecated-file-of-damon-debugfs-interface.patch
selftets-damon-prepare-for-monitor_on-file-renaming.patch
mm-damon-dbgfs-rename-monitor_on-file-to-monitor_on_deprecated.patch
docs-admin-guide-mm-damon-usage-update-for-monitor_on-renaming.patch
docs-translations-damon-usage-update-for-monitor_on-renaming.patch
mm-damon-sysfs-handle-state-file-inputs-for-every-sampling-interval-if-possible.patch
selftests-damon-_damon_sysfs-support-damos-quota.patch
selftests-damon-_damon_sysfs-support-damos-stats.patch
selftests-damon-_damon_sysfs-support-damos-apply-interval.patch
selftests-damon-add-a-test-for-damos-quota.patch
selftests-damon-add-a-test-for-damos-apply-intervals.patch
selftests-damon-add-a-test-for-a-race-between-target_ids_read-and-dbgfs_before_terminate.patch
selftests-damon-add-a-test-for-the-pid-leak-of-dbgfs_target_ids_write.patch
selftests-damon-_chk_dependency-get-debugfs-mount-point-from-proc-mounts.patch
docs-mm-damon-maintainer-profile-fix-reference-links-for-mm-stable-tree.patch
docs-mm-damon-move-the-list-of-damos-actions-to-design-doc.patch
docs-mm-damon-move-damon-operation-sets-list-from-the-usage-to-the-design-document.patch
docs-mm-damon-move-monitoring-target-regions-setup-detail-from-the-usage-to-the-design-document.patch
docs-admin-guide-mm-damon-usage-fix-wrong-quotas-diabling-condition.patch
mm-damon-core-set-damos_quota-esz-as-public-field-and-document.patch
mm-damon-sysfs-schemes-implement-quota-effective_bytes-file.patch
mm-damon-sysfs-implement-a-kdamond-command-for-updating-schemes-effective-quotas.patch
docs-abi-damon-document-effective_bytes-sysfs-file.patch
docs-admin-guide-mm-damon-usage-document-effective_bytes-file.patch
mm-damon-move-comments-and-fields-for-damos-quota-prioritization-to-the-end.patch
mm-damon-core-split-out-quota-goal-related-fields-to-a-struct.patch
mm-damon-core-add-multiple-goals-per-damos_quota-and-helpers-for-those.patch
mm-damon-sysfs-use-only-quota-goals.patch
mm-damon-core-remove-goal-field-of-damos_quota.patch
mm-damon-core-let-goal-specified-with-only-target-and-current-values.patch
mm-damon-core-support-multiple-metrics-for-quota-goal.patch
mm-damon-core-implement-psi-metric-damos-quota-goal.patch
mm-damon-sysfs-schemes-support-psi-based-quota-auto-tune.patch
docs-mm-damon-design-document-quota-goal-self-tuning.patch
docs-abi-damon-document-quota-goal-metric-file.patch
docs-admin-guide-mm-damon-usage-document-quota-goal-metric-file.patch
mm-damon-reclaim-implement-user-feedback-driven-quota-auto-tuning.patch
mm-damon-reclaim-implement-memory-psi-driven-quota-self-tuning.patch
docs-admin-guide-mm-damon-reclaim-document-auto-tuning-parameters.patch
The quilt patch titled
Subject: mm: memcontrol: clarify swapaccount=0 deprecation warning
has been removed from the -mm tree. Its filename was
mm-memcontrol-clarify-swapaccount=0-deprecation-warning.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Johannes Weiner <hannes(a)cmpxchg.org>
Subject: mm: memcontrol: clarify swapaccount=0 deprecation warning
Date: Tue, 13 Feb 2024 03:16:34 -0500
The swapaccount deprecation warning is throwing false positives. Since we
deprecated the knob and defaulted to enabling, the only reports we've been
getting are from folks that set swapaccount=1. While this is a nice
affirmation that always-enabling was the right choice, we certainly don't
want to warn when users request the supported mode.
Only warn when disabling is requested, and clarify the warning.
[colin.i.king(a)gmail.com: spelling: "commdandline" -> "commandline"]
Link: https://lkml.kernel.org/r/20240215090544.1649201-1-colin.i.king@gmail.com
Link: https://lkml.kernel.org/r/20240213081634.3652326-1-hannes@cmpxchg.org
Fixes: b25806dcd3d5 ("mm: memcontrol: deprecate swapaccounting=0 mode")
Signed-off-by: Colin Ian King <colin.i.king(a)gmail.com>
Reported-by: "Jonas Schäfer" <jonas(a)wielicki.name>
Reported-by: Narcis Garcia <debianlists(a)actiu.net>
Suggested-by: Yosry Ahmed <yosryahmed(a)google.com>
Signed-off-by: Johannes Weiner <hannes(a)cmpxchg.org>
Reviewed-by: Yosry Ahmed <yosryahmed(a)google.com>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Acked-by: Shakeel Butt <shakeelb(a)google.com>
Cc: Roman Gushchin <roman.gushchin(a)linux.dev>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/memcontrol.c | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
--- a/mm/memcontrol.c~mm-memcontrol-clarify-swapaccount=0-deprecation-warning
+++ a/mm/memcontrol.c
@@ -7971,9 +7971,13 @@ bool mem_cgroup_swap_full(struct folio *
static int __init setup_swap_account(char *s)
{
- pr_warn_once("The swapaccount= commandline option is deprecated. "
- "Please report your usecase to linux-mm(a)kvack.org if you "
- "depend on this functionality.\n");
+ bool res;
+
+ if (!kstrtobool(s, &res) && !res)
+ pr_warn_once("The swapaccount=0 commandline option is deprecated "
+ "in favor of configuring swap control via cgroupfs. "
+ "Please report your usecase to linux-mm(a)kvack.org if you "
+ "depend on this functionality.\n");
return 1;
}
__setup("swapaccount=", setup_swap_account);
_
Patches currently in -mm which might be from hannes(a)cmpxchg.org are
mm-zswap-rename-zswap_free_entry-to-zswap_entry_free.patch
mm-zswap-inline-and-remove-zswap_entry_find_get.patch
mm-zswap-move-zswap_invalidate_entry-to-related-functions.patch
mm-zswap-warn-when-referencing-a-dead-entry.patch
mm-zswap-clean-up-zswap_entry_put.patch
mm-zswap-rename-__zswap_load-to-zswap_decompress.patch
mm-zswap-break-out-zwap_compress.patch
mm-zswap-further-cleanup-zswap_store.patch
mm-zswap-simplify-zswap_invalidate.patch
mm-zswap-function-ordering-pool-alloc-free.patch
mm-zswap-function-ordering-pool-refcounting.patch
mm-zswap-function-ordering-zswap_pools.patch
mm-zswap-function-ordering-pool-params.patch
mm-zswap-function-ordering-public-lru-api.patch
mm-zswap-function-ordering-move-entry-sections-out-of-lru-section.patch
mm-zswap-function-ordering-move-entry-section-out-of-tree-section.patch
mm-zswap-function-ordering-compress-decompress-functions.patch
mm-zswap-function-ordering-per-cpu-compression-infra.patch
mm-zswap-function-ordering-writeback.patch
mm-zswap-function-ordering-shrink_memcg_cb.patch
The quilt patch titled
Subject: mm/zswap: invalidate duplicate entry when !zswap_enabled
has been removed from the -mm tree. Its filename was
mm-zswap-invalidate-duplicate-entry-when-zswap_enabled.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Chengming Zhou <zhouchengming(a)bytedance.com>
Subject: mm/zswap: invalidate duplicate entry when !zswap_enabled
Date: Thu, 8 Feb 2024 02:32:54 +0000
We have to invalidate any duplicate entry even when !zswap_enabled, since
zswap can be disabled at any time. If the folio was stored successfully before,
then got dirtied again while zswap was disabled, we won't invalidate the old
duplicate entry in zswap_store(). So a later LRU writeback may
overwrite the new data in the swapfile.
Link: https://lkml.kernel.org/r/20240208023254.3873823-1-chengming.zhou@linux.dev
Fixes: 42c06a0e8ebe ("mm: kill frontswap")
Signed-off-by: Chengming Zhou <zhouchengming(a)bytedance.com>
Acked-by: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Nhat Pham <nphamcs(a)gmail.com>
Cc: Yosry Ahmed <yosryahmed(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/zswap.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
--- a/mm/zswap.c~mm-zswap-invalidate-duplicate-entry-when-zswap_enabled
+++ a/mm/zswap.c
@@ -1518,7 +1518,7 @@ bool zswap_store(struct folio *folio)
if (folio_test_large(folio))
return false;
- if (!zswap_enabled || !tree)
+ if (!tree)
return false;
/*
@@ -1533,6 +1533,10 @@ bool zswap_store(struct folio *folio)
zswap_invalidate_entry(tree, dupentry);
}
spin_unlock(&tree->lock);
+
+ if (!zswap_enabled)
+ return false;
+
objcg = get_obj_cgroup_from_folio(folio);
if (objcg && !obj_cgroup_may_zswap(objcg)) {
memcg = get_mem_cgroup_from_objcg(objcg);
_
Patches currently in -mm which might be from zhouchengming(a)bytedance.com are
mm-zswap-make-sure-each-swapfile-always-have-zswap-rb-tree.patch
mm-zswap-split-zswap-rb-tree.patch
mm-zswap-fix-race-between-lru-writeback-and-swapoff.patch
mm-list_lru-remove-list_lru_putback.patch
mm-zswap-add-more-comments-in-shrink_memcg_cb.patch
mm-zswap-invalidate-zswap-entry-when-swap-entry-free.patch
mm-zswap-stop-lru-list-shrinking-when-encounter-warm-region.patch
mm-zswap-remove-duplicate_entry-debug-value.patch
mm-zswap-only-support-zswap_exclusive_loads_enabled.patch
mm-zswap-zswap-entry-doesnt-need-refcount-anymore.patch
mm-zswap-optimize-and-cleanup-the-invalidation-of-duplicate-entry.patch
mm-zsmalloc-fix-migrate_write_lock-when-config_compaction.patch
mm-zsmalloc-remove-migrate_write_lock_nested.patch
mm-zsmalloc-remove-unused-zspage-isolated.patch
mm-zswap-global-lru-and-shrinker-shared-by-all-zswap_pools.patch
mm-zswap-change-zswap_pool-kref-to-percpu_ref.patch
mm-zsmalloc-remove-set_zspage_mapping.patch
mm-zsmalloc-remove_zspage-dont-need-fullness-parameter.patch
mm-zsmalloc-remove-get_zspage_mapping.patch
maintainers-add-chengming-zhou-as-a-zswap-reviewer.patch
The quilt patch titled
Subject: lib/Kconfig.debug: TEST_IOV_ITER depends on MMU
has been removed from the -mm tree. Its filename was
lib-kconfigdebug-test_iov_iter-depends-on-mmu.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Guenter Roeck <linux(a)roeck-us.net>
Subject: lib/Kconfig.debug: TEST_IOV_ITER depends on MMU
Date: Thu, 8 Feb 2024 07:30:10 -0800
Trying to run the iov_iter unit test on a nommu system such as the qemu
kc705-nommu emulation results in a crash.
KTAP version 1
# Subtest: iov_iter
# module: kunit_iov_iter
1..9
BUG: failure at mm/nommu.c:318/vmap()!
Kernel panic - not syncing: BUG!
The test calls vmap() directly, but vmap() is not supported on nommu
systems, causing the crash. TEST_IOV_ITER therefore needs to depend on
MMU.
Link: https://lkml.kernel.org/r/20240208153010.1439753-1-linux@roeck-us.net
Fixes: 2d71340ff1d4 ("iov_iter: Kunit tests for copying to/from an iterator")
Signed-off-by: Guenter Roeck <linux(a)roeck-us.net>
Cc: David Howells <dhowells(a)redhat.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
lib/Kconfig.debug | 1 +
1 file changed, 1 insertion(+)
--- a/lib/Kconfig.debug~lib-kconfigdebug-test_iov_iter-depends-on-mmu
+++ a/lib/Kconfig.debug
@@ -2235,6 +2235,7 @@ config TEST_DIV64
config TEST_IOV_ITER
tristate "Test iov_iter operation" if !KUNIT_ALL_TESTS
depends on KUNIT
+ depends on MMU
default KUNIT_ALL_TESTS
help
Enable this to turn on testing of the operation of the I/O iterator
_
Patches currently in -mm which might be from linux(a)roeck-us.net are
The quilt patch titled
Subject: mm/swap: fix race when skipping swapcache
has been removed from the -mm tree. Its filename was
mm-swap-fix-race-when-skipping-swapcache.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Kairui Song <kasong(a)tencent.com>
Subject: mm/swap: fix race when skipping swapcache
Date: Wed, 7 Feb 2024 02:25:59 +0800
When skipping swapcache for SWP_SYNCHRONOUS_IO, if two or more threads
swapin the same entry at the same time, they get different pages (A, B).
Before one thread (T0) finishes the swapin and installs page (A) to the
PTE, another thread (T1) could finish swapin of page (B), swap_free the
entry, then swap out the possibly modified page reusing the same entry.
It breaks the pte_same check in (T0) because the PTE value is unchanged,
causing an ABA problem. Thread (T0) will install a stale page (A) into the
PTE and cause data corruption.
One possible callstack is like this:
CPU0 CPU1
---- ----
do_swap_page() do_swap_page() with same entry
<direct swapin path> <direct swapin path>
<alloc page A> <alloc page B>
swap_read_folio() <- read to page A swap_read_folio() <- read to page B
<slow on later locks or interrupt> <finished swapin first>
... set_pte_at()
swap_free() <- entry is free
<write to page B, now page A stale>
<swap out page B to same swap entry>
pte_same() <- Check pass, PTE seems
unchanged, but page A
is stale!
swap_free() <- page B content lost!
set_pte_at() <- stale page A installed!
And besides, for ZRAM, swap_free() allows the swap device to discard the
entry content, so even if page (B) is not modified, if swap_read_folio()
on CPU0 happens later than swap_free() on CPU1, it may also cause data
loss.
To fix this, reuse swapcache_prepare which will pin the swap entry using
the cache flag, and allow only one thread to swap it in, also prevent any
parallel code from putting the entry in the cache. Release the pin after
PT unlocked.
Racers just loop and wait since it's a rare and very short event. A
schedule_timeout_uninterruptible(1) call is added to avoid repeated page
faults wasting too much CPU, causing livelock or adding too much noise to
perf statistics. A similar livelock issue was described in commit
029c4628b2eb ("mm: swap: get rid of livelock in swapin readahead")
Reproducer:
This race issue can be triggered easily using a well constructed
reproducer and patched brd (with a delay in read path) [1]:
With latest 6.8 mainline, race caused data loss can be observed easily:
$ gcc -g -lpthread test-thread-swap-race.c && ./a.out
Polulating 32MB of memory region...
Keep swapping out...
Starting round 0...
Spawning 65536 workers...
32746 workers spawned, wait for done...
Round 0: Error on 0x5aa00, expected 32746, got 32743, 3 data loss!
Round 0: Error on 0x395200, expected 32746, got 32743, 3 data loss!
Round 0: Error on 0x3fd000, expected 32746, got 32737, 9 data loss!
Round 0 Failed, 15 data loss!
This reproducer spawns multiple threads sharing the same memory region
using a small swap device. Each pair of threads updates mapped pages one
by one in opposite directions trying to create a race, while one dedicated
thread keeps swapping the data out using madvise.
The reproducer triggers the race about once every 5 minutes, so
the race should be entirely possible in production.
After this patch, I ran the reproducer for over a few hundred rounds and
no data loss observed.
Performance overhead is minimal, microbenchmark swapin 10G from 32G
zram:
Before: 10934698 us
After: 11157121 us
Cached: 13155355 us (Dropping SWP_SYNCHRONOUS_IO flag)
[kasong(a)tencent.com: v4]
Link: https://lkml.kernel.org/r/20240219082040.7495-1-ryncsn@gmail.com
Link: https://lkml.kernel.org/r/20240206182559.32264-1-ryncsn@gmail.com
Fixes: 0bcac06f27d7 ("mm, swap: skip swapcache for swapin of synchronous device")
Reported-by: "Huang, Ying" <ying.huang(a)intel.com>
Closes: https://lore.kernel.org/lkml/87bk92gqpx.fsf_-_@yhuang6-desk2.ccr.corp.intel…
Link: https://github.com/ryncsn/emm-test-project/tree/master/swap-stress-race [1]
Signed-off-by: Kairui Song <kasong(a)tencent.com>
Reviewed-by: "Huang, Ying" <ying.huang(a)intel.com>
Acked-by: Yu Zhao <yuzhao(a)google.com>
Acked-by: David Hildenbrand <david(a)redhat.com>
Acked-by: Chris Li <chrisl(a)kernel.org>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: Minchan Kim <minchan(a)kernel.org>
Cc: Yosry Ahmed <yosryahmed(a)google.com>
Cc: Yu Zhao <yuzhao(a)google.com>
Cc: Barry Song <21cnbao(a)gmail.com>
Cc: SeongJae Park <sj(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/swap.h | 5 +++++
mm/memory.c | 20 ++++++++++++++++++++
mm/swap.h | 5 +++++
mm/swapfile.c | 13 +++++++++++++
4 files changed, 43 insertions(+)
--- a/include/linux/swap.h~mm-swap-fix-race-when-skipping-swapcache
+++ a/include/linux/swap.h
@@ -549,6 +549,11 @@ static inline int swap_duplicate(swp_ent
return 0;
}
+static inline int swapcache_prepare(swp_entry_t swp)
+{
+ return 0;
+}
+
static inline void swap_free(swp_entry_t swp)
{
}
--- a/mm/memory.c~mm-swap-fix-race-when-skipping-swapcache
+++ a/mm/memory.c
@@ -3799,6 +3799,7 @@ vm_fault_t do_swap_page(struct vm_fault
struct page *page;
struct swap_info_struct *si = NULL;
rmap_t rmap_flags = RMAP_NONE;
+ bool need_clear_cache = false;
bool exclusive = false;
swp_entry_t entry;
pte_t pte;
@@ -3867,6 +3868,20 @@ vm_fault_t do_swap_page(struct vm_fault
if (!folio) {
if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
__swap_count(entry) == 1) {
+ /*
+ * Prevent parallel swapin from proceeding with
+ * the cache flag. Otherwise, another thread may
+ * finish swapin first, free the entry, and swapout
+ * reusing the same entry. It's undetectable as
+ * pte_same() returns true due to entry reuse.
+ */
+ if (swapcache_prepare(entry)) {
+ /* Relax a bit to prevent rapid repeated page faults */
+ schedule_timeout_uninterruptible(1);
+ goto out;
+ }
+ need_clear_cache = true;
+
/* skip swapcache */
folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0,
vma, vmf->address, false);
@@ -4117,6 +4132,9 @@ unlock:
if (vmf->pte)
pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
+ /* Clear the swap cache pin for direct swapin after PTL unlock */
+ if (need_clear_cache)
+ swapcache_clear(si, entry);
if (si)
put_swap_device(si);
return ret;
@@ -4131,6 +4149,8 @@ out_release:
folio_unlock(swapcache);
folio_put(swapcache);
}
+ if (need_clear_cache)
+ swapcache_clear(si, entry);
if (si)
put_swap_device(si);
return ret;
--- a/mm/swapfile.c~mm-swap-fix-race-when-skipping-swapcache
+++ a/mm/swapfile.c
@@ -3365,6 +3365,19 @@ int swapcache_prepare(swp_entry_t entry)
return __swap_duplicate(entry, SWAP_HAS_CACHE);
}
+void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry)
+{
+ struct swap_cluster_info *ci;
+ unsigned long offset = swp_offset(entry);
+ unsigned char usage;
+
+ ci = lock_cluster_or_swap_info(si, offset);
+ usage = __swap_entry_free_locked(si, offset, SWAP_HAS_CACHE);
+ unlock_cluster_or_swap_info(si, ci);
+ if (!usage)
+ free_swap_slot(entry);
+}
+
struct swap_info_struct *swp_swap_info(swp_entry_t entry)
{
return swap_type_to_swap_info(swp_type(entry));
--- a/mm/swap.h~mm-swap-fix-race-when-skipping-swapcache
+++ a/mm/swap.h
@@ -41,6 +41,7 @@ void __delete_from_swap_cache(struct fol
void delete_from_swap_cache(struct folio *folio);
void clear_shadow_from_swap_cache(int type, unsigned long begin,
unsigned long end);
+void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry);
struct folio *swap_cache_get_folio(swp_entry_t entry,
struct vm_area_struct *vma, unsigned long addr);
struct folio *filemap_get_incore_folio(struct address_space *mapping,
@@ -97,6 +98,10 @@ static inline int swap_writepage(struct
return 0;
}
+static inline void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry)
+{
+}
+
static inline struct folio *swap_cache_get_folio(swp_entry_t entry,
struct vm_area_struct *vma, unsigned long addr)
{
_
Patches currently in -mm which might be from kasong(a)tencent.com are
The quilt patch titled
Subject: selftests/mm: uffd-unit-test check if huge page size is 0
has been removed from the -mm tree. Its filename was
selftests-mm-uffd-unit-test-check-if-huge-page-size-is-0.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Terry Tritton <terry.tritton(a)linaro.org>
Subject: selftests/mm: uffd-unit-test check if huge page size is 0
Date: Mon, 5 Feb 2024 14:50:56 +0000
If HUGETLBFS is not enabled then the default_huge_page_size function will
return 0 and cause a divide by 0 error. Add a check to see if the huge page
size is 0 and skip the hugetlb tests if it is.
Link: https://lkml.kernel.org/r/20240205145055.3545806-2-terry.tritton@linaro.org
Fixes: 16a45b57cbf2 ("selftests/mm: add framework for uffd-unit-test")
Signed-off-by: Terry Tritton <terry.tritton(a)linaro.org>
Cc: Peter Griffin <peter.griffin(a)linaro.org>
Cc: Shuah Khan <shuah(a)kernel.org>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
tools/testing/selftests/mm/uffd-unit-tests.c | 6 ++++++
1 file changed, 6 insertions(+)
--- a/tools/testing/selftests/mm/uffd-unit-tests.c~selftests-mm-uffd-unit-test-check-if-huge-page-size-is-0
+++ a/tools/testing/selftests/mm/uffd-unit-tests.c
@@ -1517,6 +1517,12 @@ int main(int argc, char *argv[])
continue;
uffd_test_start("%s on %s", test->name, mem_type->name);
+ if ((mem_type->mem_flag == MEM_HUGETLB ||
+ mem_type->mem_flag == MEM_HUGETLB_PRIVATE) &&
+ (default_huge_page_size() == 0)) {
+ uffd_test_skip("huge page size is 0, feature missing?");
+ continue;
+ }
if (!uffd_feature_supported(test)) {
uffd_test_skip("feature missing");
continue;
_
Patches currently in -mm which might be from terry.tritton(a)linaro.org are
The quilt patch titled
Subject: mm: zswap: fix missing folio cleanup in writeback race path
has been removed from the -mm tree. Its filename was
mm-zswap-fix-missing-folio-cleanup-in-writeback-race-path.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Yosry Ahmed <yosryahmed(a)google.com>
Subject: mm: zswap: fix missing folio cleanup in writeback race path
Date: Thu, 25 Jan 2024 08:51:27 +0000
In zswap_writeback_entry(), after we get a folio from
__read_swap_cache_async(), we grab the tree lock again to check that the
swap entry was not invalidated and recycled. If it was, we delete the
folio we just added to the swap cache and exit.
However, __read_swap_cache_async() returns the folio locked when it is
newly allocated, which is always true for this path, and the folio is
ref'd. Make sure to unlock and put the folio before returning.
This was discovered by code inspection, probably because this path handles
a race condition that should not happen often and the bug would not crash
the system; it would only strand the folio indefinitely.
Link: https://lkml.kernel.org/r/20240125085127.1327013-1-yosryahmed@google.com
Fixes: 04fc7816089c ("mm: fix zswap writeback race condition")
Signed-off-by: Yosry Ahmed <yosryahmed(a)google.com>
Reviewed-by: Chengming Zhou <zhouchengming(a)bytedance.com>
Acked-by: Johannes Weiner <hannes(a)cmpxchg.org>
Reviewed-by: Nhat Pham <nphamcs(a)gmail.com>
Cc: Domenico Cerasuolo <cerasuolodomenico(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/zswap.c | 2 ++
1 file changed, 2 insertions(+)
--- a/mm/zswap.c~mm-zswap-fix-missing-folio-cleanup-in-writeback-race-path
+++ a/mm/zswap.c
@@ -1440,6 +1440,8 @@ static int zswap_writeback_entry(struct
if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) {
spin_unlock(&tree->lock);
delete_from_swap_cache(folio);
+ folio_unlock(folio);
+ folio_put(folio);
return -ENOMEM;
}
spin_unlock(&tree->lock);
_
Patches currently in -mm which might be from yosryahmed(a)google.com are
mm-swap-enforce-updating-inuse_pages-at-the-end-of-swap_range_free.patch
mm-zswap-remove-unnecessary-trees-cleanups-in-zswap_swapoff.patch
mm-zswap-remove-unused-tree-argument-in-zswap_entry_put.patch
x86-mm-delete-unused-cpu-argument-to-leave_mm.patch
x86-mm-clarify-prev-usage-in-switch_mm_irqs_off.patch
The patch below does not apply to the 6.7-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.7.y
git checkout FETCH_HEAD
git cherry-pick -x e870920bbe68e52335a4c31a059e6af6a9a59dbb
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021921-bleak-sputter-5ecf@gregkh' --subject-prefix 'PATCH 6.7.y' HEAD^..
Possible dependencies:
e870920bbe68 ("arch/arm/mm: fix major fault accounting when retrying under per-VMA lock")
c16af1212479 ("ARM: 9328/1: mm: try VMA lock-based page fault handling first")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From e870920bbe68e52335a4c31a059e6af6a9a59dbb Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb(a)google.com>
Date: Mon, 22 Jan 2024 22:43:05 -0800
Subject: [PATCH] arch/arm/mm: fix major fault accounting when retrying under
per-VMA lock
The change [1] missed ARM architecture when fixing major fault accounting
for page fault retry under per-VMA lock.
The user-visible effect is that it restores correct major fault
accounting that was broken after [2] was merged in 6.7 kernel. The
more detailed description is in [3] and this patch simply adds the
same fix to ARM architecture which I missed in [3].
Add missing code to fix ARM architecture fault accounting.
[1] 46e714c729c8 ("arch/mm/fault: fix major fault accounting when retrying under per-VMA lock")
[2] https://lore.kernel.org/all/20231006195318.4087158-6-willy@infradead.org/
[3] https://lore.kernel.org/all/20231226214610.109282-1-surenb@google.com/
Link: https://lkml.kernel.org/r/20240123064305.2829244-1-surenb@google.com
Fixes: 12214eba1992 ("mm: handle read faults under the VMA lock")
Reported-by: Russell King (Oracle) <rmk+kernel(a)armlinux.org.uk>
Signed-off-by: Suren Baghdasaryan <surenb(a)google.com>
Cc: Alexander Gordeev <agordeev(a)linux.ibm.com>
Cc: Andy Lutomirski <luto(a)kernel.org>
Cc: Catalin Marinas <catalin.marinas(a)arm.com>
Cc: Christophe Leroy <christophe.leroy(a)csgroup.eu>
Cc: Dave Hansen <dave.hansen(a)linux.intel.com>
Cc: Gerald Schaefer <gerald.schaefer(a)linux.ibm.com>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Michael Ellerman <mpe(a)ellerman.id.au>
Cc: Palmer Dabbelt <palmer(a)dabbelt.com>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Will Deacon <will(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index e96fb40b9cc3..07565b593ed6 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -298,6 +298,8 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
goto done;
}
count_vm_vma_lock_event(VMA_LOCK_RETRY);
+ if (fault & VM_FAULT_MAJOR)
+ flags |= FAULT_FLAG_TRIED;
/* Quick path to respond to signals */
if (fault_signal_pending(fault, regs)) {
Hello,
I am sending this patch for inclusion in the stable tree, as it fixes
a critical stack-out-of-bounds bug in the cifs module related to the
`smb2_set_next_command()` function.
Problem Summary:
A problem was observed in the `statfs` system call for cifs, where it
failed with a "Resource temporarily unavailable" message. Further
investigation with KASAN revealed a stack-out-of-bounds error. The
root cause was a miscalculation of the size of the `smb2_query_info_req`
structure in the `SMB2_query_info_init()` function.
This situation arose due to a dependency on a prior commit
(`eb3e28c1e89b`) that replaced a 1-element array with a flexible
array member in the `smb2_query_info_req` structure. This commit was
not backported to the 5.10.y and 5.15.y stable branches, leading to an
incorrect size calculation after the backport of commit `33eae65c6f49`.
Fix Details:
The patch corrects the size calculation to ensure the correct length
is used when initializing the `smb2_query_info_req` structure. It has
been tested and confirmed to resolve the issue without introducing
any regressions.
Maybe the prior commit eb3e28c1e89b ("smb3: Replace smb2pdu 1-element
arrays with flex-arrays") should be backported to solve this problem
directly. The patch does not seem to conflict.
Best regards,
ZhaoLong Wang
ZhaoLong Wang (1):
cifs: Fix stack-out-of-bounds in smb2_set_next_command()
fs/cifs/smb2pdu.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--
2.39.2
The patch titled
Subject: mm/debug_vm_pgtable: fix BUG_ON with pud advanced test
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-debug_vm_pgtable-fix-bug_on-with-pud-advanced-test.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: "Aneesh Kumar K.V (IBM)" <aneesh.kumar(a)kernel.org>
Subject: mm/debug_vm_pgtable: fix BUG_ON with pud advanced test
Date: Mon, 29 Jan 2024 11:30:22 +0530
Architectures like powerpc add debug checks to ensure we find only devmap
PUD pte entries. These debug checks are only done with CONFIG_DEBUG_VM.
This patch marks the ptes used for the PUD advanced test as devmap pte
entries so that we don't hit debug checks on architectures like ppc64, as
shown below.
WARNING: CPU: 2 PID: 1 at arch/powerpc/mm/book3s64/radix_pgtable.c:1382 radix__pud_hugepage_update+0x38/0x138
....
NIP [c0000000000a7004] radix__pud_hugepage_update+0x38/0x138
LR [c0000000000a77a8] radix__pudp_huge_get_and_clear+0x28/0x60
Call Trace:
[c000000004a2f950] [c000000004a2f9a0] 0xc000000004a2f9a0 (unreliable)
[c000000004a2f980] [000d34c100000000] 0xd34c100000000
[c000000004a2f9a0] [c00000000206ba98] pud_advanced_tests+0x118/0x334
[c000000004a2fa40] [c00000000206db34] debug_vm_pgtable+0xcbc/0x1c48
[c000000004a2fc10] [c00000000000fd28] do_one_initcall+0x60/0x388
Also
kernel BUG at arch/powerpc/mm/book3s64/pgtable.c:202!
....
NIP [c000000000096510] pudp_huge_get_and_clear_full+0x98/0x174
LR [c00000000206bb34] pud_advanced_tests+0x1b4/0x334
Call Trace:
[c000000004a2f950] [000d34c100000000] 0xd34c100000000 (unreliable)
[c000000004a2f9a0] [c00000000206bb34] pud_advanced_tests+0x1b4/0x334
[c000000004a2fa40] [c00000000206db34] debug_vm_pgtable+0xcbc/0x1c48
[c000000004a2fc10] [c00000000000fd28] do_one_initcall+0x60/0x388
Link: https://lkml.kernel.org/r/20240129060022.68044-1-aneesh.kumar@kernel.org
Fixes: 27af67f35631 ("powerpc/book3s64/mm: enable transparent pud hugepage")
Signed-off-by: Aneesh Kumar K.V (IBM) <aneesh.kumar(a)kernel.org>
Cc: Anshuman Khandual <anshuman.khandual(a)arm.com>
Cc: Michael Ellerman <mpe(a)ellerman.id.au>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/debug_vm_pgtable.c | 8 ++++++++
1 file changed, 8 insertions(+)
--- a/mm/debug_vm_pgtable.c~mm-debug_vm_pgtable-fix-bug_on-with-pud-advanced-test
+++ a/mm/debug_vm_pgtable.c
@@ -362,6 +362,12 @@ static void __init pud_advanced_tests(st
vaddr &= HPAGE_PUD_MASK;
pud = pfn_pud(args->pud_pfn, args->page_prot);
+ /*
+ * Some architectures have debug checks to make sure
+ * huge pud mapping are only found with devmap entries
+ * For now test with only devmap entries.
+ */
+ pud = pud_mkdevmap(pud);
set_pud_at(args->mm, vaddr, args->pudp, pud);
flush_dcache_page(page);
pudp_set_wrprotect(args->mm, vaddr, args->pudp);
@@ -374,6 +380,7 @@ static void __init pud_advanced_tests(st
WARN_ON(!pud_none(pud));
#endif /* __PAGETABLE_PMD_FOLDED */
pud = pfn_pud(args->pud_pfn, args->page_prot);
+ pud = pud_mkdevmap(pud);
pud = pud_wrprotect(pud);
pud = pud_mkclean(pud);
set_pud_at(args->mm, vaddr, args->pudp, pud);
@@ -391,6 +398,7 @@ static void __init pud_advanced_tests(st
#endif /* __PAGETABLE_PMD_FOLDED */
pud = pfn_pud(args->pud_pfn, args->page_prot);
+ pud = pud_mkdevmap(pud);
pud = pud_mkyoung(pud);
set_pud_at(args->mm, vaddr, args->pudp, pud);
flush_dcache_page(page);
_
Patches currently in -mm which might be from aneesh.kumar(a)kernel.org are
mm-debug_vm_pgtable-fix-bug_on-with-pud-advanced-test.patch
From: Alexander Usyskin <alexander.usyskin(a)intel.com>
On Arrow Lake S systems, MEI is no longer strictly connected to bus 0,
while graphics remain exclusively on bus 0. Adapt the component
matching logic to accommodate this change:
Original behavior: Required both MEI and graphics to be on the same
bus 0.
New behavior: Only enforces graphics to be on bus 0 (integrated),
allowing MEI to reside on any bus.
This ensures compatibility with Arrow Lake S and maintains functionality
for the legacy systems.
Fixes: 1dd924f6885b ("mei: gsc_proxy: add gsc proxy driver")
Cc: <stable(a)vger.kernel.org> # v6.3+
Signed-off-by: Alexander Usyskin <alexander.usyskin(a)intel.com>
Signed-off-by: Tomas Winkler <tomas.winkler(a)intel.com>
---
V2: Add reference to fixed commit
Requires 'mei: me: add arrow lake point S DID'
https://lore.kernel.org/lkml/20240211103912.117105-1-tomas.winkler@intel.co…
drivers/misc/mei/gsc_proxy/mei_gsc_proxy.c | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/drivers/misc/mei/gsc_proxy/mei_gsc_proxy.c b/drivers/misc/mei/gsc_proxy/mei_gsc_proxy.c
index be52b113aea937c7c658e06c..89364bdbb1290f5726a34945 100644
--- a/drivers/misc/mei/gsc_proxy/mei_gsc_proxy.c
+++ b/drivers/misc/mei/gsc_proxy/mei_gsc_proxy.c
@@ -96,7 +96,8 @@ static const struct component_master_ops mei_component_master_ops = {
*
* The function checks if the device is pci device and
* Intel VGA adapter, the subcomponent is SW Proxy
- * and the parent of MEI PCI and the parent of VGA are the same PCH device.
+ * and the VGA is on the bus 0 reserved for built-in devices
+ * to reject discrete GFX.
*
* @dev: master device
* @subcomponent: subcomponent to match (I915_COMPONENT_SWPROXY)
@@ -123,7 +124,8 @@ static int mei_gsc_proxy_component_match(struct device *dev, int subcomponent,
if (subcomponent != I915_COMPONENT_GSC_PROXY)
return 0;
- return component_compare_dev(dev->parent, ((struct device *)data)->parent);
+ /* Only built-in GFX */
+ return (pdev->bus->number == 0);
}
static int mei_gsc_proxy_probe(struct mei_cl_device *cldev,
@@ -146,7 +148,7 @@ static int mei_gsc_proxy_probe(struct mei_cl_device *cldev,
}
component_match_add_typed(&cldev->dev, &master_match,
- mei_gsc_proxy_component_match, cldev->dev.parent);
+ mei_gsc_proxy_component_match, NULL);
if (IS_ERR_OR_NULL(master_match)) {
ret = -ENOMEM;
goto err_exit;
--
2.43.0
From: Alexander Usyskin <alexander.usyskin(a)intel.com>
On Arrow Lake S systems, MEI is no longer strictly connected to bus 0,
while graphics remain exclusively on bus 0. Adapt the component
matching logic to accommodate this change:
Original behavior: Required both MEI and graphics to be on the same
bus 0.
New behavior: Only enforces graphics to be on bus 0 (integrated),
allowing MEI to reside on any bus.
This ensures compatibility with Arrow Lake S and maintains functionality
for the legacy systems.
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Alexander Usyskin <alexander.usyskin(a)intel.com>
Signed-off-by: Tomas Winkler <tomas.winkler(a)intel.com>
---
Requires 'mei: me: add arrow lake point S DID'
https://lore.kernel.org/lkml/20240211103912.117105-1-tomas.winkler@intel.co…
drivers/misc/mei/gsc_proxy/mei_gsc_proxy.c | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/drivers/misc/mei/gsc_proxy/mei_gsc_proxy.c b/drivers/misc/mei/gsc_proxy/mei_gsc_proxy.c
index be52b113aea937c7c658e06c..89364bdbb1290f5726a34945 100644
--- a/drivers/misc/mei/gsc_proxy/mei_gsc_proxy.c
+++ b/drivers/misc/mei/gsc_proxy/mei_gsc_proxy.c
@@ -96,7 +96,8 @@ static const struct component_master_ops mei_component_master_ops = {
*
* The function checks if the device is pci device and
* Intel VGA adapter, the subcomponent is SW Proxy
- * and the parent of MEI PCI and the parent of VGA are the same PCH device.
+ * and the VGA is on the bus 0 reserved for built-in devices
+ * to reject discrete GFX.
*
* @dev: master device
* @subcomponent: subcomponent to match (I915_COMPONENT_SWPROXY)
@@ -123,7 +124,8 @@ static int mei_gsc_proxy_component_match(struct device *dev, int subcomponent,
if (subcomponent != I915_COMPONENT_GSC_PROXY)
return 0;
- return component_compare_dev(dev->parent, ((struct device *)data)->parent);
+ /* Only built-in GFX */
+ return (pdev->bus->number == 0);
}
static int mei_gsc_proxy_probe(struct mei_cl_device *cldev,
@@ -146,7 +148,7 @@ static int mei_gsc_proxy_probe(struct mei_cl_device *cldev,
}
component_match_add_typed(&cldev->dev, &master_match,
- mei_gsc_proxy_component_match, cldev->dev.parent);
+ mei_gsc_proxy_component_match, NULL);
if (IS_ERR_OR_NULL(master_match)) {
ret = -ENOMEM;
goto err_exit;
--
2.43.0
The patch titled
Subject: mm: cachestat: fix folio read-after-free in cache walk
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-cachestat-fix-folio-read-after-free-in-cache-walk.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Nhat Pham <nphamcs(a)gmail.com>
Subject: mm: cachestat: fix folio read-after-free in cache walk
Date: Mon, 19 Feb 2024 19:01:21 -0800
In cachestat, we access the folio from the page cache's xarray to compute
its page offset, and check for its dirty and writeback flags. However, we
do not hold a reference to the folio before performing these actions,
which means the folio can concurrently be released and reused as another
folio/page/slab.
Get around this altogether by just using xarray's existing machinery for
the folio page offsets and dirty/writeback states.
This changes behavior for tmpfs files to now always report zeroes in their
dirty and writeback counters. This is okay as tmpfs doesn't follow
conventional writeback cache behavior: its pages get "cleaned" during
swapout, after which they're no longer resident etc.
Link: https://lkml.kernel.org/r/20240220153409.GA216065@cmpxchg.org
Fixes: cf264e1329fb ("cachestat: implement cachestat syscall")
Reported-by: Jann Horn <jannh(a)google.com>
Suggested-by: Matthew Wilcox <willy(a)infradead.org>
Signed-off-by: Nhat Pham <nphamcs(a)gmail.com>
Signed-off-by: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: <stable(a)vger.kernel.org> [6.4+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/filemap.c | 51 ++++++++++++++++++++++++-------------------------
1 file changed, 26 insertions(+), 25 deletions(-)
--- a/mm/filemap.c~mm-cachestat-fix-folio-read-after-free-in-cache-walk
+++ a/mm/filemap.c
@@ -4111,28 +4111,40 @@ static void filemap_cachestat(struct add
rcu_read_lock();
xas_for_each(&xas, folio, last_index) {
+ int order;
unsigned long nr_pages;
pgoff_t folio_first_index, folio_last_index;
+ /*
+ * Don't deref the folio. It is not pinned, and might
+ * get freed (and reused) underneath us.
+ *
+ * We *could* pin it, but that would be expensive for
+ * what should be a fast and lightweight syscall.
+ *
+ * Instead, derive all information of interest from
+ * the rcu-protected xarray.
+ */
+
if (xas_retry(&xas, folio))
continue;
+ order = xa_get_order(xas.xa, xas.xa_index);
+ nr_pages = 1 << order;
+ folio_first_index = round_down(xas.xa_index, 1 << order);
+ folio_last_index = folio_first_index + nr_pages - 1;
+
+ /* Folios might straddle the range boundaries, only count covered pages */
+ if (folio_first_index < first_index)
+ nr_pages -= first_index - folio_first_index;
+
+ if (folio_last_index > last_index)
+ nr_pages -= folio_last_index - last_index;
+
if (xa_is_value(folio)) {
/* page is evicted */
void *shadow = (void *)folio;
bool workingset; /* not used */
- int order = xa_get_order(xas.xa, xas.xa_index);
-
- nr_pages = 1 << order;
- folio_first_index = round_down(xas.xa_index, 1 << order);
- folio_last_index = folio_first_index + nr_pages - 1;
-
- /* Folios might straddle the range boundaries, only count covered pages */
- if (folio_first_index < first_index)
- nr_pages -= first_index - folio_first_index;
-
- if (folio_last_index > last_index)
- nr_pages -= folio_last_index - last_index;
cs->nr_evicted += nr_pages;
@@ -4150,24 +4162,13 @@ static void filemap_cachestat(struct add
goto resched;
}
- nr_pages = folio_nr_pages(folio);
- folio_first_index = folio_pgoff(folio);
- folio_last_index = folio_first_index + nr_pages - 1;
-
- /* Folios might straddle the range boundaries, only count covered pages */
- if (folio_first_index < first_index)
- nr_pages -= first_index - folio_first_index;
-
- if (folio_last_index > last_index)
- nr_pages -= folio_last_index - last_index;
-
/* page is in cache */
cs->nr_cache += nr_pages;
- if (folio_test_dirty(folio))
+ if (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY))
cs->nr_dirty += nr_pages;
- if (folio_test_writeback(folio))
+ if (xas_get_mark(&xas, PAGECACHE_TAG_WRITEBACK))
cs->nr_writeback += nr_pages;
resched:
_
Patches currently in -mm which might be from nphamcs(a)gmail.com are
mm-swap_state-update-zswap-lrus-protection-range-with-the-folio-locked.patch
mm-swap_state-update-zswap-lrus-protection-range-with-the-folio-locked-v2.patch
mm-swap_state-update-zswap-lrus-protection-range-with-the-folio-locked-fix.patch
mm-cachestat-fix-folio-read-after-free-in-cache-walk.patch
selftests-zswap-add-zswap-selftest-file-to-zswap-maintainer-entry.patch
selftests-fix-the-zswap-invasive-shrink-test.patch
selftests-add-zswapin-and-no-zswap-tests.patch
The patch titled
Subject: mm/vmscan: fix a bug calling wakeup_kswapd() with a wrong zone index
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-vmscan-fix-a-bug-calling-wakeup_kswapd-with-a-wrong-zone-index.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Byungchul Park <byungchul(a)sk.com>
Subject: mm/vmscan: fix a bug calling wakeup_kswapd() with a wrong zone index
Date: Fri, 16 Feb 2024 20:15:02 +0900
With numa balancing on, when a numa system is running where a numa node
doesn't have its local memory so it has no managed zones, the following
oops has been observed. It's because wakeup_kswapd() is called with a
wrong zone index, -1. Fixed it by checking the index before calling
wakeup_kswapd().
> BUG: unable to handle page fault for address: 00000000000033f3
> #PF: supervisor read access in kernel mode
> #PF: error_code(0x0000) - not-present page
> PGD 0 P4D 0
> Oops: 0000 [#1] PREEMPT SMP NOPTI
> CPU: 2 PID: 895 Comm: masim Not tainted 6.6.0-dirty #255
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
> rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
> RIP: 0010:wakeup_kswapd (./linux/mm/vmscan.c:7812)
> Code: (omitted)
> RSP: 0000:ffffc90004257d58 EFLAGS: 00010286
> RAX: ffffffffffffffff RBX: ffff88883fff0480 RCX: 0000000000000003
> RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff88883fff0480
> RBP: ffffffffffffffff R08: ff0003ffffffffff R09: ffffffffffffffff
> R10: ffff888106c95540 R11: 0000000055555554 R12: 0000000000000003
> R13: 0000000000000000 R14: 0000000000000000 R15: ffff88883fff0940
> FS: 00007fc4b8124740(0000) GS:ffff888827c00000(0000) knlGS:0000000000000000
> CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 00000000000033f3 CR3: 000000026cc08004 CR4: 0000000000770ee0
> DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> PKRU: 55555554
> Call Trace:
> <TASK>
> ? __die
> ? page_fault_oops
> ? __pte_offset_map_lock
> ? exc_page_fault
> ? asm_exc_page_fault
> ? wakeup_kswapd
> migrate_misplaced_page
> __handle_mm_fault
> handle_mm_fault
> do_user_addr_fault
> exc_page_fault
> asm_exc_page_fault
> RIP: 0033:0x55b897ba0808
> Code: (omitted)
> RSP: 002b:00007ffeefa821a0 EFLAGS: 00010287
> RAX: 000055b89983acd0 RBX: 00007ffeefa823f8 RCX: 000055b89983acd0
> RDX: 00007fc2f8122010 RSI: 0000000000020000 RDI: 000055b89983acd0
> RBP: 00007ffeefa821a0 R08: 0000000000000037 R09: 0000000000000075
> R10: 0000000000000000 R11: 0000000000000202 R12: 0000000000000000
> R13: 00007ffeefa82410 R14: 000055b897ba5dd8 R15: 00007fc4b8340000
> </TASK>
Link: https://lkml.kernel.org/r/20240216111502.79759-1-byungchul@sk.com
Signed-off-by: Byungchul Park <byungchul(a)sk.com>
Reported-by: Hyeongtak Ji <hyeongtak.ji(a)sk.com>
Fixes: c574bbe917036 ("NUMA balancing: optimize page placement for memory tiering system")
Cc: Baolin Wang <baolin.wang(a)linux.alibaba.com>
Cc: "Huang, Ying" <ying.huang(a)intel.com>
Cc: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Oscar Salvador <osalvador(a)suse.de>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/migrate.c | 8 ++++++++
1 file changed, 8 insertions(+)
--- a/mm/migrate.c~mm-vmscan-fix-a-bug-calling-wakeup_kswapd-with-a-wrong-zone-index
+++ a/mm/migrate.c
@@ -2519,6 +2519,14 @@ static int numamigrate_isolate_folio(pg_
if (managed_zone(pgdat->node_zones + z))
break;
}
+
+ /*
+ * If there are no managed zones, it should not proceed
+ * further.
+ */
+ if (z < 0)
+ return 0;
+
wakeup_kswapd(pgdat->node_zones + z, 0,
folio_order(folio), ZONE_MOVABLE);
return 0;
_
Patches currently in -mm which might be from byungchul(a)sk.com are
mm-vmscan-fix-a-bug-calling-wakeup_kswapd-with-a-wrong-zone-index.patch
mm-vmscan-dont-turn-on-cache_trim_mode-at-the-highest-scan-priority.patch
sched-numa-mm-do-not-try-to-migrate-memory-to-memoryless-nodes.patch
This series includes 4 types of fixes:
Patches 1 and 2 force the path-managers not to allocate a new address
entry when dealing with the "special" ID 0, reserved to the address of
the initial subflow. These patches can be backported up to v5.19 and
v5.12 respectively.
Patch 3 to 6 fix the in-kernel path-manager not to create duplicated
subflows. Patch 6 is the main fix, but patches 3 to 5 are some kind of
prerequisites: they fix some data races that could also lead to the
creation of unexpected subflows. These patches can be backported up to
v5.7, v5.10, v6.0, and v5.15 respectively.
Note that patch 3 modifies the existing ULP API. No better solutions
have been found for -net, and there is some similar prior art, see
commit 0df48c26d841 ("tcp: add tcpi_bytes_acked to tcp_info"). Please
also note that TLS ULP Diag has likely the same issue.
Patches 7 to 9 fix issues in the selftests, when executing them on older
kernels, e.g. when testing the last version of these kselftests on the
v5.15.148 kernel as it is done by LKFT when validating stable kernels.
These patches only avoid printing expected errors to the console and
marking some tests as "OK" while they have been skipped. Patches 7 and 8
can be backported up to v6.6.
Patches 10 to 13 make sure all MPTCP selftests subtests have a unique
name. It is important to have a unique (sub)test name in TAP, because
that's the test identifier. Some CI environments might drop tests with
duplicated names. Patches 10 to 12 can be backported up to v6.6.
Signed-off-by: Matthieu Baerts (NGI0) <matttbe(a)kernel.org>
---
Geliang Tang (2):
mptcp: add needs_id for userspace appending addr
mptcp: add needs_id for netlink appending addr
Matthieu Baerts (NGI0) (7):
selftests: mptcp: pm nl: also list skipped tests
selftests: mptcp: pm nl: avoid error msg on older kernels
selftests: mptcp: diag: fix bash warnings on older kernels
selftests: mptcp: simult flows: fix some subtest names
selftests: mptcp: userspace_pm: unique subtest names
selftests: mptcp: diag: unique 'in use' subtest names
selftests: mptcp: diag: unique 'cestab' subtest names
Paolo Abeni (4):
mptcp: fix lockless access in subflow ULP diag
mptcp: fix data races on local_id
mptcp: fix data races on remote_id
mptcp: fix duplicate subflow creation
include/net/tcp.h | 2 +-
net/mptcp/diag.c | 8 ++-
net/mptcp/pm_netlink.c | 69 ++++++++++++++---------
net/mptcp/pm_userspace.c | 15 ++---
net/mptcp/protocol.c | 2 +-
net/mptcp/protocol.h | 15 ++++-
net/mptcp/subflow.c | 15 ++---
net/tls/tls_main.c | 2 +-
tools/testing/selftests/net/mptcp/diag.sh | 41 ++++++++------
tools/testing/selftests/net/mptcp/pm_netlink.sh | 8 ++-
tools/testing/selftests/net/mptcp/simult_flows.sh | 3 +-
tools/testing/selftests/net/mptcp/userspace_pm.sh | 4 +-
12 files changed, 116 insertions(+), 68 deletions(-)
---
base-commit: c40c0d3a768c78a023a72fb2ceea00743e3a695d
change-id: 20240215-upstream-net-20240215-misc-fixes-03815ec14dc6
Best regards,
--
Matthieu Baerts (NGI0) <matttbe(a)kernel.org>
Hi Greg, few stable updates for you -
Cheers,
Kent
The following changes since commit 0dd3ee31125508cd67f7e7172247f05b7fd1753a:
Linux 6.7 (2024-01-07 12:18:38 -0800)
are available in the Git repository at:
https://evilpiepirate.org/git/bcachefs.git tags/bcachefs-for-v6.7-stable-20240208
for you to fetch changes up to f1582f4774ac7c30c5460a8c7a6e5a82b9ce5a6a:
bcachefs: time_stats: Check for last_event == 0 when updating freq stats (2024-02-08 15:33:11 -0500)
----------------------------------------------------------------
bcachefs updates for v6.7 stable:
locking fixes in subvolume create, destroy paths - Al, Su Yue, Guoyu Ou
fix race in thread_with_file - Mathias Krause
small rebalance fixes - Daniel, myself
workaround for building with old clang (can't take a pointer to memcmp)
build fix on parisc
minor time_stats fix
----------------------------------------------------------------
Al Viro (2):
new helper: user_path_locked_at()
bch2_ioctl_subvolume_destroy(): fix locking
Christoph Hellwig (1):
bcachefs: fix incorrect usage of REQ_OP_FLUSH
Daniel Hill (1):
bcachefs: rebalance should wakeup on shutdown if disabled
Guoyu Ou (1):
bcachefs: unlock parent dir if entry is not found in subvolume deletion
Helge Deller (1):
bcachefs: Fix build on parisc by avoiding __multi3()
Kent Overstreet (4):
bcachefs: Don't pass memcmp() as a pointer
bcachefs: Add missing bch2_moving_ctxt_flush_all()
bcachefs: bch2_kthread_io_clock_wait() no longer sleeps until full amount
bcachefs: time_stats: Check for last_event == 0 when updating freq stats
Mathias Krause (1):
bcachefs: install fd later to avoid race with close
Su Yue (2):
bcachefs: kvfree bch_fs::snapshots in bch2_fs_snapshots_exit
bcachefs: grab s_umount only if snapshotting
fs/bcachefs/chardev.c | 3 +--
fs/bcachefs/clock.c | 4 ++--
fs/bcachefs/fs-io.c | 2 +-
fs/bcachefs/fs-ioctl.c | 42 +++++++++++++++++++++--------------------
fs/bcachefs/journal_io.c | 3 ++-
fs/bcachefs/mean_and_variance.h | 2 +-
fs/bcachefs/move.c | 2 +-
fs/bcachefs/move.h | 1 +
fs/bcachefs/rebalance.c | 13 +++++++++++--
fs/bcachefs/replicas.c | 10 ++++++++--
fs/bcachefs/snapshot.c | 2 +-
fs/bcachefs/util.c | 5 +++--
fs/namei.c | 16 +++++++++++++---
include/linux/namei.h | 1 +
14 files changed, 68 insertions(+), 38 deletions(-)
Dear ,
Please find the attached copy of our contract for your reference.
Confirm the details, sign and return as soon as possible. The shipment cost remains the same.
See No 4 highlighted in RED confirm if we can increase as before.
Let us know if you need further clarification.
Best Regards,
Connor Gilchrist
Sales Shadow's Ridge inc.
The Shipyard, Bath Road, Lymington,
Hampshire, SO41 3YL, England
Ph. +44 (0)1590 647406.
From: Andrzej Kacprowski <Andrzej.Kacprowski(a)intel.com>
There is no point in requesting 1 tile on VPU40xx as the FW will
probably need more tiles to run workloads, so it will have to
reconfigure PLL anyway. Don't enable any tiles and allow the FW to
perform initial tile configuration.
This improves NPU boot stability as the tiles are always enabled only
by the FW from the same initial state.
Fixes: 79cdc56c4a54 ("accel/ivpu: Add initial support for VPU 4")
Signed-off-by: Andrzej Kacprowski <Andrzej.Kacprowski(a)intel.com>
Signed-off-by: Jacek Lawrynowicz <jacek.lawrynowicz(a)linux.intel.com>
---
drivers/accel/ivpu/ivpu_hw_40xx.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/accel/ivpu/ivpu_hw_40xx.c b/drivers/accel/ivpu/ivpu_hw_40xx.c
index 1c995307c113..a1523d0b1ef3 100644
--- a/drivers/accel/ivpu/ivpu_hw_40xx.c
+++ b/drivers/accel/ivpu/ivpu_hw_40xx.c
@@ -24,7 +24,7 @@
#define SKU_HW_ID_SHIFT 16u
#define SKU_HW_ID_MASK 0xffff0000u
-#define PLL_CONFIG_DEFAULT 0x1
+#define PLL_CONFIG_DEFAULT 0x0
#define PLL_CDYN_DEFAULT 0x80
#define PLL_EPP_DEFAULT 0x80
#define PLL_REF_CLK_FREQ (50 * 1000000)
--
2.43.0
From: Kairui Song <kasong(a)tencent.com>
When skipping swapcache for SWP_SYNCHRONOUS_IO, if two or more threads
swapin the same entry at the same time, they get different pages (A, B).
Before one thread (T0) finishes the swapin and installs page (A)
to the PTE, another thread (T1) could finish swapin of page (B),
swap_free the entry, then swap out the possibly modified page
reusing the same entry. It breaks the pte_same check in (T0) because
the PTE value is unchanged, causing an ABA problem. Thread (T0) will
install a stale page (A) into the PTE and cause data corruption.
One possible callstack is like this:
CPU0 CPU1
---- ----
do_swap_page() do_swap_page() with same entry
<direct swapin path> <direct swapin path>
<alloc page A> <alloc page B>
swap_read_folio() <- read to page A swap_read_folio() <- read to page B
<slow on later locks or interrupt> <finished swapin first>
... set_pte_at()
swap_free() <- entry is free
<write to page B, now page A is stale>
<swap out page B to same swap entry>
pte_same() <- Check pass, PTE seems
unchanged, but page A
is stale!
swap_free() <- page B content lost!
set_pte_at() <- stale page A installed!
And besides, for ZRAM, swap_free() allows the swap device to discard
the entry content, so even if page (B) is not modified, if
swap_read_folio() on CPU0 happens later than swap_free() on CPU1,
it may also cause data loss.
To fix this, reuse swapcache_prepare which will pin the swap entry using
the cache flag, and allow only one thread to swap it in, also prevent
any parallel code from putting the entry in the cache. Release the pin
after PT unlocked.
Racers just loop and wait since it's a rare and very short event.
A schedule_timeout_uninterruptible(1) call is added to avoid repeated
page faults wasting too much CPU, causing livelock or adding too much
noise to perf statistics. A similar livelock issue was described in
commit 029c4628b2eb ("mm: swap: get rid of livelock in swapin readahead")
Reproducer:
This race issue can be triggered easily using a well constructed
reproducer and patched brd (with a delay in read path) [1]:
With latest 6.8 mainline, race caused data loss can be observed easily:
$ gcc -g -lpthread test-thread-swap-race.c && ./a.out
Polulating 32MB of memory region...
Keep swapping out...
Starting round 0...
Spawning 65536 workers...
32746 workers spawned, wait for done...
Round 0: Error on 0x5aa00, expected 32746, got 32743, 3 data loss!
Round 0: Error on 0x395200, expected 32746, got 32743, 3 data loss!
Round 0: Error on 0x3fd000, expected 32746, got 32737, 9 data loss!
Round 0 Failed, 15 data loss!
This reproducer spawns multiple threads sharing the same memory region
using a small swap device. Each pair of threads updates mapped pages one by
one in opposite directions, trying to create a race, with one dedicated
thread that keeps swapping the data out using madvise.
The reproducer created a reproduce rate of about once every 5 minutes,
so the race should be totally possible in production.
After this patch, I ran the reproducer for over a few hundred rounds
and no data loss was observed.
Performance overhead is minimal, microbenchmark swapin 10G from 32G
zram:
Before: 10934698 us
After: 11157121 us
Cached: 13155355 us (Dropping SWP_SYNCHRONOUS_IO flag)
Fixes: 0bcac06f27d7 ("mm, swap: skip swapcache for swapin of synchronous device")
Link: https://github.com/ryncsn/emm-test-project/tree/master/swap-stress-race [1]
Reported-by: "Huang, Ying" <ying.huang(a)intel.com>
Closes: https://lore.kernel.org/lkml/87bk92gqpx.fsf_-_@yhuang6-desk2.ccr.corp.intel…
Signed-off-by: Kairui Song <kasong(a)tencent.com>
Cc: stable(a)vger.kernel.org
---
V3: https://lore.kernel.org/all/20240216095105.14502-1-ryncsn@gmail.com/
Update from V3:
- Use schedule_timeout_uninterruptible(1) for now instead of schedule() to
prevent the busy-faulting task from holding the CPU and livelocking [Huang, Ying]
V2: https://lore.kernel.org/all/20240206182559.32264-1-ryncsn@gmail.com/
Update from V2:
- Add a schedule() if raced to prevent repeated page faults wasting CPU
and add noise to perf statistics.
- Use a bool to state the special case instead of reusing existing
variables fixing error handling [Minchan Kim].
V1: https://lore.kernel.org/all/20240205110959.4021-1-ryncsn@gmail.com/
Update from V1:
- Add some words on ZRAM case, it will discard swap content on swap_free
so the race window is a bit different but cure is the same. [Barry Song]
- Update comments make it cleaner [Huang, Ying]
- Add a function place holder to fix CONFIG_SWAP=n built [SeongJae Park]
- Update the commit message and summary, refer to SWP_SYNCHRONOUS_IO
instead of "direct swapin path" [Yu Zhao]
- Update commit message.
- Collect Review and Acks.
include/linux/swap.h | 5 +++++
mm/memory.c | 20 ++++++++++++++++++++
mm/swap.h | 5 +++++
mm/swapfile.c | 13 +++++++++++++
4 files changed, 43 insertions(+)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 4db00ddad261..8d28f6091a32 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -549,6 +549,11 @@ static inline int swap_duplicate(swp_entry_t swp)
return 0;
}
+static inline int swapcache_prepare(swp_entry_t swp)
+{
+ return 0;
+}
+
static inline void swap_free(swp_entry_t swp)
{
}
diff --git a/mm/memory.c b/mm/memory.c
index 7e1f4849463a..a99f5e7be9a5 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3799,6 +3799,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
struct page *page;
struct swap_info_struct *si = NULL;
rmap_t rmap_flags = RMAP_NONE;
+ bool need_clear_cache = false;
bool exclusive = false;
swp_entry_t entry;
pte_t pte;
@@ -3867,6 +3868,20 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
if (!folio) {
if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
__swap_count(entry) == 1) {
+ /*
+ * Prevent parallel swapin from proceeding with
+ * the cache flag. Otherwise, another thread may
+ * finish swapin first, free the entry, and swapout
+ * reusing the same entry. It's undetectable as
+ * pte_same() returns true due to entry reuse.
+ */
+ if (swapcache_prepare(entry)) {
+ /* Relax a bit to prevent rapid repeated page faults */
+ schedule_timeout_uninterruptible(1);
+ goto out;
+ }
+ need_clear_cache = true;
+
/* skip swapcache */
folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0,
vma, vmf->address, false);
@@ -4117,6 +4132,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
if (vmf->pte)
pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
+ /* Clear the swap cache pin for direct swapin after PTL unlock */
+ if (need_clear_cache)
+ swapcache_clear(si, entry);
if (si)
put_swap_device(si);
return ret;
@@ -4131,6 +4149,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
folio_unlock(swapcache);
folio_put(swapcache);
}
+ if (need_clear_cache)
+ swapcache_clear(si, entry);
if (si)
put_swap_device(si);
return ret;
diff --git a/mm/swap.h b/mm/swap.h
index 758c46ca671e..fc2f6ade7f80 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -41,6 +41,7 @@ void __delete_from_swap_cache(struct folio *folio,
void delete_from_swap_cache(struct folio *folio);
void clear_shadow_from_swap_cache(int type, unsigned long begin,
unsigned long end);
+void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry);
struct folio *swap_cache_get_folio(swp_entry_t entry,
struct vm_area_struct *vma, unsigned long addr);
struct folio *filemap_get_incore_folio(struct address_space *mapping,
@@ -97,6 +98,10 @@ static inline int swap_writepage(struct page *p, struct writeback_control *wbc)
return 0;
}
+static inline void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry)
+{
+}
+
static inline struct folio *swap_cache_get_folio(swp_entry_t entry,
struct vm_area_struct *vma, unsigned long addr)
{
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 556ff7347d5f..746aa9da5302 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -3365,6 +3365,19 @@ int swapcache_prepare(swp_entry_t entry)
return __swap_duplicate(entry, SWAP_HAS_CACHE);
}
+void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry)
+{
+ struct swap_cluster_info *ci;
+ unsigned long offset = swp_offset(entry);
+ unsigned char usage;
+
+ ci = lock_cluster_or_swap_info(si, offset);
+ usage = __swap_entry_free_locked(si, offset, SWAP_HAS_CACHE);
+ unlock_cluster_or_swap_info(si, ci);
+ if (!usage)
+ free_swap_slot(entry);
+}
+
struct swap_info_struct *swp_swap_info(swp_entry_t entry)
{
return swap_type_to_swap_info(swp_type(entry));
--
2.43.0
Currently ACPI_MEMORY_NVS is omitted from the linear map, which causes
trouble with the following firmware memory region setup:
[..] efi: 0x0000dfd62000-0x0000dfd83fff [ACPI Reclaim|...]
[..] efi: 0x0000dfd84000-0x0000dfd87fff [ACPI Mem NVS|...]
On ARM64 with a 64k page size, the whole 0x0000dfd80000-0x0000dfd8ffff
range will be omitted from the linear map due to 64k round-up, and
a page fault happens when trying to access the ACPI_RECLAIM_MEMORY:
[...] Unable to handle kernel paging request at virtual address ffff0000dfd80000
To fix this, add ACPI_MEMORY_NVS into the linear map.
Signed-off-by: Boqun Feng <boqun.feng(a)gmail.com>
Cc: stable(a)vger.kernel.org # 5.15+
---
We hit this in an ARM64 Hyper-V VM when using 64k page size, although
this issue may also be fixed if the efi memory regions are all 64k
aligned, but I don't find this memory region setup is invalid per UEFI
spec, also I don't find that spec disallows ACPI_MEMORY_NVS to be mapped
in the OS linear map, but if there is any better way or I'm reading the
spec incorrectly, please let me know.
It's Cced stable since 5.15 because that's when Hyper-V ARM64 support is
added, and Hyper-V is the only one that hits the problem so far.
drivers/firmware/efi/efi-init.c | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/drivers/firmware/efi/efi-init.c b/drivers/firmware/efi/efi-init.c
index a00e07b853f2..9a1b9bc66d50 100644
--- a/drivers/firmware/efi/efi-init.c
+++ b/drivers/firmware/efi/efi-init.c
@@ -139,6 +139,7 @@ static __init int is_usable_memory(efi_memory_desc_t *md)
case EFI_LOADER_CODE:
case EFI_LOADER_DATA:
case EFI_ACPI_RECLAIM_MEMORY:
+ case EFI_ACPI_MEMORY_NVS:
case EFI_BOOT_SERVICES_CODE:
case EFI_BOOT_SERVICES_DATA:
case EFI_CONVENTIONAL_MEMORY:
@@ -202,8 +203,12 @@ static __init void reserve_regions(void)
if (!is_usable_memory(md))
memblock_mark_nomap(paddr, size);
- /* keep ACPI reclaim memory intact for kexec etc. */
- if (md->type == EFI_ACPI_RECLAIM_MEMORY)
+ /*
+ * keep ACPI reclaim and NVS memory and intact for kexec
+ * etc.
+ */
+ if (md->type == EFI_ACPI_RECLAIM_MEMORY ||
+ md->type == EFI_ACPI_MEMORY_NVS)
memblock_reserve(paddr, size);
}
}
--
2.43.0
From: Mike Marciniszyn <mike.marciniszyn(a)intel.com>
[ Upstream commit 0a5ec366de7e94192669ba08de6ed336607fd282 ]
The SQ is shared between the kernel and user space by storing the kernel
page pointer and passing that to kmap_atomic().
This then requires that the SQ buffer is PAGE_SIZE aligned.
Fix by adding an iWarp specific alignment check.
The patch needed to be reworked because the separate routines
present upstream are not there in older irdma drivers.
Fixes: e965ef0e7b2c ("RDMA/irdma: Split QP handler into irdma_reg_user_mr_type_qp")
Link: https://lore.kernel.org/r/20231129202143.1434-3-shiraz.saleem@intel.com
Signed-off-by: Mike Marciniszyn <mike.marciniszyn(a)intel.com>
Signed-off-by: Shiraz Saleem <shiraz.saleem(a)intel.com>
Signed-off-by: Jason Gunthorpe <jgg(a)nvidia.com>
---
drivers/infiniband/hw/irdma/verbs.c | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c
index 447e1bcc82a3..3c437c8070b6 100644
--- a/drivers/infiniband/hw/irdma/verbs.c
+++ b/drivers/infiniband/hw/irdma/verbs.c
@@ -2845,6 +2845,13 @@ static struct ib_mr *irdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 len,
switch (req.reg_type) {
case IRDMA_MEMREG_TYPE_QP:
+ /* iWarp: Catch page not starting on OS page boundary */
+ if (!rdma_protocol_roce(&iwdev->ibdev, 1) &&
+ ib_umem_offset(iwmr->region)) {
+ err = -EINVAL;
+ goto error;
+ }
+
total = req.sq_pages + req.rq_pages + shadow_pgcnt;
if (total > iwmr->page_cnt) {
err = -EINVAL;
--
1.8.3.1
This reverts commit 3f225f29c69c13ce1cbdb1d607a42efeef080056.
The shadow call stack for irq now is stored in current task's thread info
in irq_stack_entry. There is a possibility that we have some soft irqs
pending at the end of hard irq, and when we process softirq with the irq
enabled, irq_stack_entry will enter again and overwrite the shadow call
stack which is stored in current task's thread info, leading to
incorrect shadow call stack restoration for the first entry of the hard
IRQ, and the system then ends up with a panic.
task A | task A
-------------------------------------+------------------------------------
el1_irq //irq1 enter |
irq_handler //save scs_sp1 |
gic_handle_irq |
irq_exit |
__do_softirq |
| el1_irq //irq2 enter
| irq_handler //save scs_sp2
| //overwrite scs_sp1
| ...
| irq_stack_exit //restore scs_sp2
irq_stack_exit //restore wrong |
//scs_sp2 |
So revert this commit to fix it.
Fixes: 3f225f29c69c ("arm64: Stash shadow stack pointer in the task struct on interrupt")
Signed-off-by: Xiang Yang <xiangyang3(a)huawei.com>
---
arch/arm64/kernel/entry.S | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index a94acea770c7..020a455824be 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -431,7 +431,9 @@ SYM_CODE_END(__swpan_exit_el0)
.macro irq_stack_entry
mov x19, sp // preserve the original sp
- scs_save tsk // preserve the original shadow stack
+#ifdef CONFIG_SHADOW_CALL_STACK
+ mov x24, scs_sp // preserve the original shadow stack
+#endif
/*
* Compare sp with the base of the task stack.
@@ -465,7 +467,9 @@ SYM_CODE_END(__swpan_exit_el0)
*/
.macro irq_stack_exit
mov sp, x19
- scs_load_current
+#ifdef CONFIG_SHADOW_CALL_STACK
+ mov scs_sp, x24
+#endif
.endm
/* GPRs used by entry code */
--
2.34.1
Changes since v2:
- added signed-off
Changes since v1:
- added upstream commit id to the commit message
This suggests a fix from 6.3 for stable that fixes a nasty bug in the
timing behavior of periodic RT tasks w.r.t timerslack_ns. While the
documentation clearly states that the slack time is ignored for RT tasks,
this is not the case for the hrtimer code. This patch fixes the issue and
applies to all stable kernels.
Best regards,
Felix Moessbauer
Siemens AG
Davidlohr Bueso (1):
hrtimer: Ignore slack time for RT tasks in schedule_hrtimeout_range()
kernel/time/hrtimer.c | 14 +++++++++++---
1 file changed, 11 insertions(+), 3 deletions(-)
--
2.39.2
(cc stakeholders from various distros - apologies if I missed anyone)
Please consider the patches below for backporting to the linux-6.6.y
stable tree.
These are prerequisites for building a signed x86 efistub kernel image
that complies with the tightened UEFI boot requirements imposed by
Microsoft, and this is the condition under which it is willing to sign
future Linux secure boot shim builds with its 3rd party CA
certificate. (Such builds must enforce a strict separation between
executable and writable code, among other things)
The patches apply cleanly onto 6.6.17 (-rc2), resulting in a defconfig
build that boots as expected under OVMF/KVM.
5f51c5d0e905 x86/efi: Drop EFI stub .bss from .data section
7e50262229fa x86/efi: Disregard setup header of loaded image
bfab35f552ab x86/efi: Drop alignment flags from PE section headers
768171d7ebbc x86/boot: Remove the 'bugger off' message
8eace5b35556 x86/boot: Omit compression buffer from PE/COFF image
memory footprint
7448e8e5d15a x86/boot: Drop redundant code setting the root device
b618d31f112b x86/boot: Drop references to startup_64
2e765c02dcbf x86/boot: Grab kernel_info offset from zoffset header directly
eac956345f99 x86/boot: Set EFI handover offset directly in header asm
093ab258e3fb x86/boot: Define setup size in linker script
aeb92067f6ae x86/boot: Derive file size from _edata symbol
efa089e63b56 x86/boot: Construct PE/COFF .text section from assembler
fa5750521e0a x86/boot: Drop PE/COFF .reloc section
34951f3c28bd x86/boot: Split off PE/COFF .data section
3e3eabe26dc8 x86/boot: Increase section and file alignment to 4k/512
1ad55cecf22f x86/efistub: Use 1:1 file:memory mapping for PE/COFF
.compat section
arch/x86/boot/Makefile | 2 +-
arch/x86/boot/compressed/vmlinux.lds.S | 6 +-
arch/x86/boot/header.S | 211 ++++++++++--------------
arch/x86/boot/setup.ld | 14 +-
arch/x86/boot/tools/build.c | 273 ++------------------------------
drivers/firmware/efi/libstub/Makefile | 7 -
drivers/firmware/efi/libstub/x86-stub.c | 46 +-----
7 files changed, 112 insertions(+), 447 deletions(-)
The following changes since commit 8b4118fabd6eb75fed19483b04dab3a036886489:
Linux 6.1.78 (2024-02-16 19:06:32 +0100)
are available in the Git repository at:
https://git.kernel.org/pub/scm/linux/kernel/git/cel/linux.git nfsd-6.1.y
for you to fetch changes up to d432d1006b60bd6b5c38974727bdce78f449eeea:
nfsd: don't take fi_lock in nfsd_break_deleg_cb() (2024-02-16 13:58:29 -0500)
----------------------------------------------------------------
NeilBrown (2):
nfsd: fix RELEASE_LOCKOWNER
nfsd: don't take fi_lock in nfsd_break_deleg_cb()
fs/nfsd/nfs4state.c | 37 ++++++++++++++++++++-----------------
1 file changed, 20 insertions(+), 17 deletions(-)
--
Chuck Lever
This is a backport of all the work that led up to the work that Linus made
on eventfs. I trust Linus's version more so than the versions in 6.6 and
6.7. There may be plenty of hidden issues due to the design.
This is the update for 6.6. It includes Linus's updates as well as all the
patches leading up to them. As the eventfs work went in in two parts, half
went in in 6.6 and the other in 6.7, there were 6 backports that were done
custom to 6.6 as the bugs found in 6.7 were in 6.6 but implemented
differently. This series starts with reverting those 6 backports and then
applying the updated patches to get to Linus's simplification.
I ran these through my full test suite that I use before sending anything to
Linus, although I did not run my "bisect" test that walks through the
patches. The tests were just run on the end result.
This was created with the following command against v6.6.15, after reverting
the 6 patches:
git log --reverse --no-merges --pretty=oneline v6.6..origin/master fs/tracefs/ | cut -d' ' -f1 |
while read a; do if ! git cherry-pick -x $a; then break; fi ; done
Which adds -x to the cherry pick to add the upstream commit SHAs.
There was one patch in tracefs that didn't need to be backported and I removed
that one.
Beau Belgrave (1):
eventfs: Fix events beyond NAME_MAX blocking tasks
Erick Archer (1):
eventfs: Use kcalloc() instead of kzalloc()
Jiapeng Chong (1):
tracefs/eventfs: Modify mismatched function name
Linus Torvalds (7):
tracefs: remove stale 'update_gid' code
eventfs: Initialize the tracefs inode properly
tracefs: Avoid using the ei->dentry pointer unnecessarily
tracefs: dentry lookup crapectomy
eventfs: Remove unused d_parent pointer field
eventfs: Clean up dentry ops and add revalidate function
eventfs: Get rid of dentry pointers without refcounts
Nathan Chancellor (1):
eventfs: Use ERR_CAST() in eventfs_create_events_dir()
Steven Rostedt (Google) (46):
Revert "eventfs: Do not allow NULL parent to eventfs_start_creating()"
Revert "eventfs: Check for NULL ef in eventfs_set_attr()"
Revert "eventfs: Use simple_recursive_removal() to clean up dentries"
Revert "eventfs: Delete eventfs_inode when the last dentry is freed"
Revert "eventfs: Save ownership and mode"
Revert "eventfs: Remove "is_freed" union with rcu head"
eventfs: Remove eventfs_file and just use eventfs_inode
eventfs: Use eventfs_remove_events_dir()
eventfs: Fix failure path in eventfs_create_events_dir()
eventfs: Fix WARN_ON() in create_file_dentry()
eventfs: Fix typo in eventfs_inode union comment
eventfs: Remove extra dget() in eventfs_create_events_dir()
eventfs: Fix kerneldoc of eventfs_remove_rec()
eventfs: Remove "is_freed" union with rcu head
eventfs: Have a free_ei() that just frees the eventfs_inode
eventfs: Test for ei->is_freed when accessing ei->dentry
eventfs: Save ownership and mode
eventfs: Hold eventfs_mutex when calling callback functions
eventfs: Delete eventfs_inode when the last dentry is freed
eventfs: Remove special processing of dput() of events directory
eventfs: Use simple_recursive_removal() to clean up dentries
eventfs: Remove expectation that ei->is_freed means ei->dentry == NULL
eventfs: Do not invalidate dentry in create_file/dir_dentry()
eventfs: Use GFP_NOFS for allocation when eventfs_mutex is held
eventfs: Move taking of inode_lock into dcache_dir_open_wrapper()
eventfs: Do not allow NULL parent to eventfs_start_creating()
eventfs: Make sure that parent->d_inode is locked in creating files/dirs
eventfs: Have event files and directories default to parent uid and gid
eventfs: Fix file and directory uid and gid ownership
tracefs: Check for dentry->d_inode exists in set_gid()
eventfs: Fix bitwise fields for "is_events"
eventfs: Remove "lookup" parameter from create_dir/file_dentry()
eventfs: Stop using dcache_readdir() for getdents()
tracefs/eventfs: Use root and instance inodes as default ownership
eventfs: Have eventfs_iterate() stop immediately if ei->is_freed is set
eventfs: Do ctx->pos update for all iterations in eventfs_iterate()
eventfs: Read ei->entries before ei->children in eventfs_iterate()
eventfs: Shortcut eventfs_iterate() by skipping entries already read
eventfs: Have the inodes all for files and directories all be the same
eventfs: Do not create dentries nor inodes in iterate_shared
eventfs: Save directory inodes in the eventfs_inode structure
tracefs: Zero out the tracefs_inode when allocating it
eventfs: Warn if an eventfs_inode is freed without is_freed being set
eventfs: Restructure eventfs_inode structure to be more condensed
eventfs: Remove fsnotify*() functions from lookup()
eventfs: Keep all directory links at 1
----
fs/tracefs/event_inode.c | 1250 +++++++++++++++++++-----------------------
fs/tracefs/inode.c | 276 +++++-----
fs/tracefs/internal.h | 60 +-
include/linux/trace_events.h | 2 +-
include/linux/tracefs.h | 73 ++-
kernel/trace/trace.c | 7 +-
kernel/trace/trace.h | 4 +-
kernel/trace/trace_events.c | 311 +++++++----
8 files changed, 1029 insertions(+), 954 deletions(-)
Changes since v1:
- added upstream commit id to the commit message
This suggests a fix from 6.3 for stable that fixes a nasty bug in the
timing behavior of periodic RT tasks w.r.t timerslack_ns. While the
documentation clearly states that the slack time is ignored for RT tasks,
this is not the case for the hrtimer code. This patch fixes the issue and
applies to all stable kernels.
Best regards,
Felix Moessbauer
Siemens AG
Davidlohr Bueso (1):
hrtimer: Ignore slack time for RT tasks in schedule_hrtimeout_range()
kernel/time/hrtimer.c | 14 +++++++++++---
1 file changed, 11 insertions(+), 3 deletions(-)
--
2.39.2
l2tp_ip6_sendmsg needs to avoid accounting for the transport header
twice when splicing more data into an already partially-occupied skbuff.
To manage this, we check whether the skbuff contains data using
skb_queue_empty when deciding how much data to append using
ip6_append_data.
However, the code which performed the calculation was incorrect:
ulen = len + skb_queue_empty(&sk->sk_write_queue) ? transhdrlen : 0;
...due to C operator precedence, this ends up setting ulen to
transhdrlen for messages with a non-zero length, which results in
corrupted packets on the wire.
Add parentheses to correct the calculation in line with the original
intent.
Fixes: 9d4c75800f61 ("ipv4, ipv6: Fix handling of transhdrlen in __ip{,6}_append_data()")
Cc: David Howells <dhowells(a)redhat.com>
Cc: stable(a)vger.kernel.org
Signed-off-by: Tom Parkin <tparkin(a)katalix.com>
---
This issue was uncovered by Debian build-testing for the
golang-github-katalix-go-l2tp package[1].
It seems 9d4c75800f61 has been backported to the linux-6.1.y stable
kernel (and possibly others), so I think this fix will also need
backporting.
The bug is currently seen on at least Debian Bookworm, Ubuntu Jammy, and
Debian testing/unstable.
Unfortunately tests using "ip l2tp" and which focus on dataplane
transport will not uncover this bug: it's necessary to send a packet
using an L2TPIP6 socket opened by userspace, and to verify the packet on
the wire. The l2tp-ktest[2] test suite has been extended to cover this.
[1]. https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1063746
[2]. https://github.com/katalix/l2tp-ktest
---
net/l2tp/l2tp_ip6.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index dd3153966173..7bf14cf9ffaa 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -627,7 +627,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
back_from_confirm:
lock_sock(sk);
- ulen = len + skb_queue_empty(&sk->sk_write_queue) ? transhdrlen : 0;
+ ulen = len + (skb_queue_empty(&sk->sk_write_queue) ? transhdrlen : 0);
err = ip6_append_data(sk, ip_generic_getfrag, msg,
ulen, transhdrlen, &ipc6,
&fl6, (struct rt6_info *)dst,
--
2.34.1
Hi,
this series does basically two things:
1. Disables automatic load balancing as advised by the hardware
workaround.
2. Forces the sharing of the load submitted to CCS among all the
CCS available (as of now only DG2 has more than one CCS). This
way the user, when sending a query, will see only one CCS
available.
Andi
Andi Shyti (2):
drm/i915/gt: Disable HW load balancing for CCS
drm/i915/gt: Set default CCS mode '1'
drivers/gpu/drm/i915/gt/intel_gt.c | 11 +++++++++++
drivers/gpu/drm/i915/gt/intel_gt_regs.h | 3 +++
drivers/gpu/drm/i915/gt/intel_workarounds.c | 6 ++++++
drivers/gpu/drm/i915/i915_drv.h | 17 +++++++++++++++++
drivers/gpu/drm/i915/i915_query.c | 5 +++--
5 files changed, 40 insertions(+), 2 deletions(-)
--
2.43.0
Commit fb24ea52f78e0d595852e ("drivers: Remove explicit invocations of
mmiowb()") removed all mmiowb() calls in drivers, but it says:
"NOTE: mmiowb() has only ever guaranteed ordering in conjunction with
spin_unlock(). However, pairing each mmiowb() removal in this patch with
the corresponding call to spin_unlock() is not at all trivial, so there
is a small chance that this change may regress any drivers incorrectly
relying on mmiowb() to order MMIO writes between CPUs using lock-free
synchronisation."
The mmio in radeon_ring_commit() is protected by a mutex rather than a
spinlock, but in the mutex fastpath it behaves similarly to a spinlock and
needs a mmiowb() to make sure the wptr is up-to-date for hardware.
Without this, we get such an error when running 'glxgears' on weakly ordered
architectures such as LoongArch:
radeon 0000:04:00.0: ring 0 stalled for more than 10324msec
radeon 0000:04:00.0: ring 3 stalled for more than 10240msec
radeon 0000:04:00.0: GPU lockup (current fence id 0x000000000001f412 last fence id 0x000000000001f414 on ring 3)
radeon 0000:04:00.0: GPU lockup (current fence id 0x000000000000f940 last fence id 0x000000000000f941 on ring 0)
radeon 0000:04:00.0: scheduling IB failed (-35).
[drm:radeon_gem_va_ioctl [radeon]] *ERROR* Couldn't update BO_VA (-35)
radeon 0000:04:00.0: scheduling IB failed (-35).
[drm:radeon_gem_va_ioctl [radeon]] *ERROR* Couldn't update BO_VA (-35)
radeon 0000:04:00.0: scheduling IB failed (-35).
[drm:radeon_gem_va_ioctl [radeon]] *ERROR* Couldn't update BO_VA (-35)
radeon 0000:04:00.0: scheduling IB failed (-35).
[drm:radeon_gem_va_ioctl [radeon]] *ERROR* Couldn't update BO_VA (-35)
radeon 0000:04:00.0: scheduling IB failed (-35).
[drm:radeon_gem_va_ioctl [radeon]] *ERROR* Couldn't update BO_VA (-35)
radeon 0000:04:00.0: scheduling IB failed (-35).
[drm:radeon_gem_va_ioctl [radeon]] *ERROR* Couldn't update BO_VA (-35)
radeon 0000:04:00.0: scheduling IB failed (-35).
[drm:radeon_gem_va_ioctl [radeon]] *ERROR* Couldn't update BO_VA (-35)
Cc: stable(a)vger.kernel.org
Signed-off-by: Tianyang Zhang <zhangtianyang(a)loongson.cn>
Signed-off-by: Huacai Chen <chenhuacai(a)loongson.cn>
---
drivers/gpu/drm/radeon/radeon_ring.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/gpu/drm/radeon/radeon_ring.c b/drivers/gpu/drm/radeon/radeon_ring.c
index 38048593bb4a..d461dc85d820 100644
--- a/drivers/gpu/drm/radeon/radeon_ring.c
+++ b/drivers/gpu/drm/radeon/radeon_ring.c
@@ -183,6 +183,7 @@ void radeon_ring_commit(struct radeon_device *rdev, struct radeon_ring *ring,
if (hdp_flush && rdev->asic->mmio_hdp_flush)
rdev->asic->mmio_hdp_flush(rdev);
radeon_ring_set_wptr(rdev, ring);
+ mmiowb(); /* Make sure wptr is up-to-date for hw */
}
/**
--
2.43.0
On 2024/2/20 13:32, Kairui Song wrote:
> On Tue, Feb 20, 2024 at 12:49 PM Chengming Zhou <zhouchengming(a)bytedance.com>
> wrote:
>>
>> On 2024/2/20 06:10, Barry Song wrote:
>>> On Mon, Feb 19, 2024 at 9:21 PM Kairui Song <ryncsn(a)gmail.com> wrote:
>>>>
>>>> From: Kairui Song <kasong(a)tencent.com>
>>>>
>>>> When skipping swapcache for SWP_SYNCHRONOUS_IO, if two or more threads
>>>> swapin the same entry at the same time, they get different pages (A,
> B).
>>>> Before one thread (T0) finishes the swapin and installs page (A)
>>>> to the PTE, another thread (T1) could finish swapin of page (B),
>>>> swap_free the entry, then swap out the possibly modified page
>>>> reusing the same entry. It breaks the pte_same check in (T0) because
>>>> PTE value is unchanged, causing ABA problem. Thread (T0) will
>>>> install a stalled page (A) into the PTE and cause data corruption.
>>>>
>>>> One possible callstack is like this:
>>>>
>>>> CPU0 CPU1
>>>> ---- ----
>>>> do_swap_page() do_swap_page() with same entry
>>>> <direct swapin path> <direct swapin path>
>>>> <alloc page A> <alloc page B>
>>>> swap_read_folio() <- read to page A swap_read_folio() <- read to page
> B
>>>> <slow on later locks or interrupt> <finished swapin first>
>>>> .. set_pte_at()
>>>> swap_free() <- entry is free
>>>> <write to page B, now page A
> stalled>
>>>> <swap out page B to same swap
> entry>
>>>> pte_same() <- Check pass, PTE seems
>>>> unchanged, but page A
>>>> is stalled!
>>>> swap_free() <- page B content lost!
>>>> set_pte_at() <- staled page A installed!
>>>>
>>>> And besides, for ZRAM, swap_free() allows the swap device to discard
>>>> the entry content, so even if page (B) is not modified, if
>>>> swap_read_folio() on CPU0 happens later than swap_free() on CPU1,
>>>> it may also cause data loss.
>>>>
>>>> To fix this, reuse swapcache_prepare which will pin the swap entry
> using
>>>> the cache flag, and allow only one thread to swap it in, also prevent
>>>> any parallel code from putting the entry in the cache. Release the pin
>>>> after PT unlocked.
>>>>
>>>> Racers just loop and wait since it's a rare and very short event.
>>>> A schedule_timeout_uninterruptible(1) call is added to avoid repeated
>>>> page faults wasting too much CPU, causing livelock or adding too much
>>>> noise to perf statistics. A similar livelock issue was described in
>>>> commit 029c4628b2eb ("mm: swap: get rid of livelock in swapin
> readahead")
>>>>
>>>> Reproducer:
>>>>
>>>> This race issue can be triggered easily using a well constructed
>>>> reproducer and patched brd (with a delay in read path) [1]:
>>>>
>>>> With latest 6.8 mainline, race caused data loss can be observed easily:
>>>> $ gcc -g -lpthread test-thread-swap-race.c && ./a.out
>>>> Polulating 32MB of memory region...
>>>> Keep swapping out...
>>>> Starting round 0...
>>>> Spawning 65536 workers...
>>>> 32746 workers spawned, wait for done...
>>>> Round 0: Error on 0x5aa00, expected 32746, got 32743, 3 data loss!
>>>> Round 0: Error on 0x395200, expected 32746, got 32743, 3 data loss!
>>>> Round 0: Error on 0x3fd000, expected 32746, got 32737, 9 data loss!
>>>> Round 0 Failed, 15 data loss!
>>>>
>>>> This reproducer spawns multiple threads sharing the same memory region
>>>> using a small swap device. Every two threads updates mapped pages one
> by
>>>> one in opposite direction trying to create a race, with one dedicated
>>>> thread keep swapping out the data out using madvise.
>>>>
>>>> The reproducer created a reproduce rate of about once every 5 minutes,
>>>> so the race should be totally possible in production.
>>>>
>>>> After this patch, I ran the reproducer for over a few hundred rounds
>>>> and no data loss observed.
>>>>
>>>> Performance overhead is minimal, microbenchmark swapin 10G from 32G
>>>> zram:
>>>>
>>>> Before: 10934698 us
>>>> After: 11157121 us
>>>> Cached: 13155355 us (Dropping SWP_SYNCHRONOUS_IO flag)
>>>>
>>>> Fixes: 0bcac06f27d7 ("mm, swap: skip swapcache for swapin of
> synchronous device")
>>>> Link:
> https://github.com/ryncsn/emm-test-project/tree/master/swap-stress-race [1]
>>>> Reported-by: "Huang, Ying" <ying.huang(a)intel.com>
>>>> Closes:
> https://lore.kernel.org/lkml/87bk92gqpx.fsf_-_@yhuang6-desk2.ccr.corp.intel…
>>>> Signed-off-by: Kairui Song <kasong(a)tencent.com>
>>>> Cc: stable(a)vger.kernel.org
>>>>
>>>> ---
>>>> V3:
> https://lore.kernel.org/all/20240216095105.14502-1-ryncsn@gmail.com/
>>>> Update from V3:
>>>> - Use schedule_timeout_uninterruptible(1) for now instead of
> schedule() to
>>>> prevent the busy faulting task holds CPU and livelocks [Huang, Ying]
>>>>
>>>> V2:
> https://lore.kernel.org/all/20240206182559.32264-1-ryncsn@gmail.com/
>>>> Update from V2:
>>>> - Add a schedule() if raced to prevent repeated page faults wasting CPU
>>>> and add noise to perf statistics.
>>>> - Use a bool to state the special case instead of reusing existing
>>>> variables fixing error handling [Minchan Kim].
>>>>
>>>> V1: https://lore.kernel.org/all/20240205110959.4021-1-ryncsn@gmail.com/
>>>> Update from V1:
>>>> - Add some words on ZRAM case, it will discard swap content on
> swap_free
>>>> so the race window is a bit different but cure is the same. [Barry
> Song]
>>>> - Update comments make it cleaner [Huang, Ying]
>>>> - Add a function place holder to fix CONFIG_SWAP=n built [SeongJae
> Park]
>>>> - Update the commit message and summary, refer to SWP_SYNCHRONOUS_IO
>>>> instead of "direct swapin path" [Yu Zhao]
>>>> - Update commit message.
>>>> - Collect Review and Acks.
>>>>
>>>> include/linux/swap.h | 5 +++++
>>>> mm/memory.c | 20 ++++++++++++++++++++
>>>> mm/swap.h | 5 +++++
>>>> mm/swapfile.c | 13 +++++++++++++
>>>> 4 files changed, 43 insertions(+)
>>>>
>>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>>>> index 4db00ddad261..8d28f6091a32 100644
>>>> --- a/include/linux/swap.h
>>>> +++ b/include/linux/swap.h
>>>> @@ -549,6 +549,11 @@ static inline int swap_duplicate(swp_entry_t swp)
>>>> return 0;
>>>> }
>>>>
>>>> +static inline int swapcache_prepare(swp_entry_t swp)
>>>> +{
>>>> + return 0;
>>>> +}
>>>> +
>>>> static inline void swap_free(swp_entry_t swp)
>>>> {
>>>> }
>>>> diff --git a/mm/memory.c b/mm/memory.c
>>>> index 7e1f4849463a..a99f5e7be9a5 100644
>>>> --- a/mm/memory.c
>>>> +++ b/mm/memory.c
>>>> @@ -3799,6 +3799,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>>> struct page *page;
>>>> struct swap_info_struct *si = NULL;
>>>> rmap_t rmap_flags = RMAP_NONE;
>>>> + bool need_clear_cache = false;
>>>> bool exclusive = false;
>>>> swp_entry_t entry;
>>>> pte_t pte;
>>>> @@ -3867,6 +3868,20 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>>> if (!folio) {
>>>> if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
>>>> __swap_count(entry) == 1) {
>>>> + /*
>>>> + * Prevent parallel swapin from proceeding with
>>>> + * the cache flag. Otherwise, another thread
> may
>>>> + * finish swapin first, free the entry, and
> swapout
>>>> + * reusing the same entry. It's undetectable as
>>>> + * pte_same() returns true due to entry reuse.
>>>> + */
>>>> + if (swapcache_prepare(entry)) {
>>>> + /* Relax a bit to prevent rapid
> repeated page faults */
>>>> + schedule_timeout_uninterruptible(1);
>>>
>>> Not an ideal model; imagine two tasks,
>>>
>>> task A - low priority running on a LITTLE core
>>> task B - high priority and have real-time requirements such as audio,
>>> graphics running on a big core.
>>>
>>> The original code will make B win even if it is a bit later than A as
> its CPU is
>>> much faster to finish swap_read_folio for example from zRAM. task B's
>>> swap-in can finish very soon.
>>>
>>> With the patch, B will wait a tick and its real-time performance will be
>>> negatively affected from time to time once low priority and high
> priority
>>> tasks fault in the same PTE and high priority tasks are a bit later than
>>> low priority tasks. This is a kind of priority inversion.
>>>
>>> When we support large folio swap-in, things can get worse. For example,
>>> to swap-in 16 or even more pages in one do_swap_page, the chance for
>>> task A and task B located in the same range of 16 PTEs will increase
>>> though they are not located in the same PTE.
>>>
>>> Please consider this is not a blocker for this patch. But I will put
> the problem
>>> in my list and run some real tests on Android phones later.
>>
>> Good point. Late for the discussion, I'm wondering why not get an extra
> reference
>> on the swap entry, instead of swapcache_prepare()? Then the faster thread
> will
>> succeed, but can't free the swap entry. Later, the slower thread will
> find the
>> changed pte value and fail, and free the swap entry. Maybe I missed
> something?
>
> Hi, Chengming
>
> That was my initial purpose. Then found a lot of problems with it. Increase
> swap count here, it may race with another swap free and end up increasing
> the swap count of a freed entry.
>
> That can be fixed with audits and new helpers, but there are many other
> potential issues too. One major problem is that after count bump, raced
> swap threads will fallback to cached swap in. Pages in swapcache can be
> swaped out without allocating an entry, making the problem we were trying
> to resolve more serious.
Thanks for your clarification! Right, there are many issues I just ignored...
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 14db5f64a971fce3d8ea35de4dfc7f443a3efb92
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021944-kettle-upturned-4371@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
14db5f64a971 ("zonefs: Improve error handling")
77af13ba3c7f ("zonefs: Do not propagate iomap_dio_rw() ENOTBLK error to user space")
aa7f243f32e1 ("zonefs: Separate zone information from inode information")
34422914dc00 ("zonefs: Reduce struct zonefs_inode_info size")
46a9c526eef7 ("zonefs: Simplify IO error handling")
4008e2a0b01a ("zonefs: Reorganize code")
a608da3bd730 ("zonefs: Detect append writes at invalid locations")
db58653ce0c7 ("zonefs: Fix active zone accounting")
7dd12d65ac64 ("zonefs: fix zone report size in __zonefs_io_error()")
8745889a7fd0 ("Merge tag 'iomap-6.0-merge-2' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 14db5f64a971fce3d8ea35de4dfc7f443a3efb92 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal(a)kernel.org>
Date: Thu, 8 Feb 2024 17:26:59 +0900
Subject: [PATCH] zonefs: Improve error handling
Write error handling is racy and can sometimes lead to the error recovery
path wrongly changing the inode size of a sequential zone file to an
incorrect value which results in garbage data being readable at the end
of a file. There are 2 problems:
1) zonefs_file_dio_write() updates a zone file write pointer offset
after issuing a direct IO with iomap_dio_rw(). This update is done
only if the IO succeeds for synchronous direct writes. However, for
asynchronous direct writes, the update is done without waiting for
the IO completion so that the next asynchronous IO can be
immediately issued. However, if an asynchronous IO completes with a
failure right before the i_truncate_mutex lock protecting the update,
the update may change the value of the inode write pointer offset
that was corrected by the error path (zonefs_io_error() function).
2) zonefs_io_error() is called when a read or write error occurs. This
function executes a report zone operation using the callback function
zonefs_io_error_cb(), which does all the error recovery handling
based on the current zone condition, write pointer position and
according to the mount options being used. However, depending on the
zoned device being used, a report zone callback may be executed in a
context that is different from the context of __zonefs_io_error(). As
a result, zonefs_io_error_cb() may be executed without the inode
truncate mutex lock held, which can lead to invalid error processing.
Fix both problems as follows:
- Problem 1: Perform the inode write pointer offset update before a
direct write is issued with iomap_dio_rw(). This is safe to do as
partial direct writes are not supported (IOMAP_DIO_PARTIAL is not
set) and any failed IO will trigger the execution of zonefs_io_error()
which will correct the inode write pointer offset to reflect the
current state of the one on the device.
- Problem 2: Change zonefs_io_error_cb() into zonefs_handle_io_error()
and call this function directly from __zonefs_io_error() after
obtaining the zone information using blkdev_report_zones() with a
simple callback function that copies to a local stack variable the
struct blk_zone obtained from the device. This ensures that error
handling is performed holding the inode truncate mutex.
This change also simplifies error handling for conventional zone files
by bypassing the execution of report zones entirely. This is safe to
do because the condition of conventional zones cannot be read-only or
offline and conventional zone files are always fully mapped with a
constant file size.
Reported-by: Shin'ichiro Kawasaki <shinichiro.kawasaki(a)wdc.com>
Fixes: 8dcc1a9d90c1 ("fs: New zonefs file system")
Cc: stable(a)vger.kernel.org
Signed-off-by: Damien Le Moal <dlemoal(a)kernel.org>
Tested-by: Shin'ichiro Kawasaki <shinichiro.kawasaki(a)wdc.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn(a)wdc.com>
Reviewed-by: Himanshu Madhani <himanshu.madhani(a)oracle.com>
diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
index 6ab2318a9c8e..dba5dcb62bef 100644
--- a/fs/zonefs/file.c
+++ b/fs/zonefs/file.c
@@ -348,7 +348,12 @@ static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size,
struct zonefs_inode_info *zi = ZONEFS_I(inode);
if (error) {
- zonefs_io_error(inode, true);
+ /*
+ * For Sync IOs, error recovery is called from
+ * zonefs_file_dio_write().
+ */
+ if (!is_sync_kiocb(iocb))
+ zonefs_io_error(inode, true);
return error;
}
@@ -491,6 +496,14 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
ret = -EINVAL;
goto inode_unlock;
}
+ /*
+ * Advance the zone write pointer offset. This assumes that the
+ * IO will succeed, which is OK to do because we do not allow
+ * partial writes (IOMAP_DIO_PARTIAL is not set) and if the IO
+ * fails, the error path will correct the write pointer offset.
+ */
+ z->z_wpoffset += count;
+ zonefs_inode_account_active(inode);
mutex_unlock(&zi->i_truncate_mutex);
}
@@ -504,20 +517,19 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
if (ret == -ENOTBLK)
ret = -EBUSY;
- if (zonefs_zone_is_seq(z) &&
- (ret > 0 || ret == -EIOCBQUEUED)) {
- if (ret > 0)
- count = ret;
-
- /*
- * Update the zone write pointer offset assuming the write
- * operation succeeded. If it did not, the error recovery path
- * will correct it. Also do active seq file accounting.
- */
- mutex_lock(&zi->i_truncate_mutex);
- z->z_wpoffset += count;
- zonefs_inode_account_active(inode);
- mutex_unlock(&zi->i_truncate_mutex);
+ /*
+ * For a failed IO or partial completion, trigger error recovery
+ * to update the zone write pointer offset to a correct value.
+ * For asynchronous IOs, zonefs_file_write_dio_end_io() may already
+ * have executed error recovery if the IO already completed when we
+ * reach here. However, we cannot know that and execute error recovery
+ * again (that will not change anything).
+ */
+ if (zonefs_zone_is_seq(z)) {
+ if (ret > 0 && ret != count)
+ ret = -EIO;
+ if (ret < 0 && ret != -EIOCBQUEUED)
+ zonefs_io_error(inode, true);
}
inode_unlock:
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 93971742613a..b6e8e7c96251 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -246,16 +246,18 @@ static void zonefs_inode_update_mode(struct inode *inode)
z->z_mode = inode->i_mode;
}
-struct zonefs_ioerr_data {
- struct inode *inode;
- bool write;
-};
-
static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
void *data)
{
- struct zonefs_ioerr_data *err = data;
- struct inode *inode = err->inode;
+ struct blk_zone *z = data;
+
+ *z = *zone;
+ return 0;
+}
+
+static void zonefs_handle_io_error(struct inode *inode, struct blk_zone *zone,
+ bool write)
+{
struct zonefs_zone *z = zonefs_inode_zone(inode);
struct super_block *sb = inode->i_sb;
struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
@@ -270,8 +272,8 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
data_size = zonefs_check_zone_condition(sb, z, zone);
isize = i_size_read(inode);
if (!(z->z_flags & (ZONEFS_ZONE_READONLY | ZONEFS_ZONE_OFFLINE)) &&
- !err->write && isize == data_size)
- return 0;
+ !write && isize == data_size)
+ return;
/*
* At this point, we detected either a bad zone or an inconsistency
@@ -292,7 +294,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
* In all cases, warn about inode size inconsistency and handle the
* IO error according to the zone condition and to the mount options.
*/
- if (zonefs_zone_is_seq(z) && isize != data_size)
+ if (isize != data_size)
zonefs_warn(sb,
"inode %lu: invalid size %lld (should be %lld)\n",
inode->i_ino, isize, data_size);
@@ -352,8 +354,6 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
zonefs_i_size_write(inode, data_size);
z->z_wpoffset = data_size;
zonefs_inode_account_active(inode);
-
- return 0;
}
/*
@@ -367,23 +367,25 @@ void __zonefs_io_error(struct inode *inode, bool write)
{
struct zonefs_zone *z = zonefs_inode_zone(inode);
struct super_block *sb = inode->i_sb;
- struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
unsigned int noio_flag;
- unsigned int nr_zones = 1;
- struct zonefs_ioerr_data err = {
- .inode = inode,
- .write = write,
- };
+ struct blk_zone zone;
int ret;
/*
- * The only files that have more than one zone are conventional zone
- * files with aggregated conventional zones, for which the inode zone
- * size is always larger than the device zone size.
+ * Conventional zone have no write pointer and cannot become read-only
+ * or offline. So simply fake a report for a single or aggregated zone
+ * and let zonefs_handle_io_error() correct the zone inode information
+ * according to the mount options.
*/
- if (z->z_size > bdev_zone_sectors(sb->s_bdev))
- nr_zones = z->z_size >>
- (sbi->s_zone_sectors_shift + SECTOR_SHIFT);
+ if (!zonefs_zone_is_seq(z)) {
+ zone.start = z->z_sector;
+ zone.len = z->z_size >> SECTOR_SHIFT;
+ zone.wp = zone.start + zone.len;
+ zone.type = BLK_ZONE_TYPE_CONVENTIONAL;
+ zone.cond = BLK_ZONE_COND_NOT_WP;
+ zone.capacity = zone.len;
+ goto handle_io_error;
+ }
/*
* Memory allocations in blkdev_report_zones() can trigger a memory
@@ -394,12 +396,20 @@ void __zonefs_io_error(struct inode *inode, bool write)
* the GFP_NOIO context avoids both problems.
*/
noio_flag = memalloc_noio_save();
- ret = blkdev_report_zones(sb->s_bdev, z->z_sector, nr_zones,
- zonefs_io_error_cb, &err);
- if (ret != nr_zones)
+ ret = blkdev_report_zones(sb->s_bdev, z->z_sector, 1,
+ zonefs_io_error_cb, &zone);
+ memalloc_noio_restore(noio_flag);
+
+ if (ret != 1) {
zonefs_err(sb, "Get inode %lu zone information failed %d\n",
inode->i_ino, ret);
- memalloc_noio_restore(noio_flag);
+ zonefs_warn(sb, "remounting filesystem read-only\n");
+ sb->s_flags |= SB_RDONLY;
+ return;
+ }
+
+handle_io_error:
+ zonefs_handle_io_error(inode, &zone, write);
}
static struct kmem_cache *zonefs_inode_cachep;
Hi Jaegeuk Kim, Chao Yu,
In Debian the following regression was reported after Dhya updated
to 6.1.76:
On Wed, Feb 07, 2024 at 10:43:47PM -0500, Dhya wrote:
> Package: src:linux
> Version: 6.1.76-1
> Severity: critical
> Justification: breaks the whole system
>
> Dear Maintainer,
>
> After upgrade to linux-image-6.1.0-18-amd64 6.1.76-1 F2FS filesystem
> fails to mount rw. Message in the boot journal:
>
> kernel: F2FS-fs (nvme0n1p6): invalid zstd compress level: 6
>
> There was recently an f2fs patch to the 6.1 kernel tree which might be
> related: https://www.spinics.net/lists/stable-commits/msg329957.html
>
> Was able to recover the system by doing:
>
> sudo mount -o remount,rw,relatime,lazytime,background_gc=on,discard,no_heap,user_xattr,inline_xattr,acl,inline_data,inline_dentry,extent_cache,mode=adaptive,active_logs=6,alloc_mode=default,checkpoint_merge,fsync_mode=posix,compress_algorithm=lz4,compress_log_size=2,compress_mode=fs,atgc,discard_unit=block,memory=normal /dev/nvme0n1p6 /
>
> under the running bad 6.1.0-18-amd64 kernel, then editing
> /etc/default/grub:
>
> GRUB_DEFAULT="Advanced options for Debian GNU/Linux>Debian GNU/Linux, with Linux 6.1.0-17-amd64"
>
> and running 'update-grub' and rebooting to boot the 6.1.0-17-amd64
> kernel.
The issue is easily reproducible by:
# dd if=/dev/zero of=test.img count=100 bs=1M
# mkfs.f2fs -f -O compression,extra_attr ./test.img
# mount -t f2fs -o compress_algorithm=zstd:6,compress_chksum,atgc,gc_merge,lazytime ./test.img /mnt
resulting in
[ 60.789982] F2FS-fs (loop0): invalid zstd compress level: 6
A bugzilla report has been submitted in
https://bugzilla.kernel.org/show_bug.cgi?id=218471
#regzbot introduced: v6.1.69..v6.1.76
#regzbot link: https://bugs.debian.org/1063422
#regzbot link: https://bugzilla.kernel.org/show_bug.cgi?id=218471
Regards,
Salvatore
The patch titled
Subject: sched/numa, mm: do not try to migrate memory to memoryless nodes
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
sched-numa-mm-do-not-try-to-migrate-memory-to-memoryless-nodes.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Byungchul Park <byungchul(a)sk.com>
Subject: sched/numa, mm: do not try to migrate memory to memoryless nodes
Date: Mon, 19 Feb 2024 13:10:47 +0900
With numa balancing on, when a numa system is running where a numa node
doesn't have its local memory so it has no managed zones, the following
oops has been observed. It's because wakeup_kswapd() is called with a
wrong zone index, -1. Fixed it by checking the index before calling
wakeup_kswapd().
> BUG: unable to handle page fault for address: 00000000000033f3
> #PF: supervisor read access in kernel mode
> #PF: error_code(0x0000) - not-present page
> PGD 0 P4D 0
> Oops: 0000 [#1] PREEMPT SMP NOPTI
> CPU: 2 PID: 895 Comm: masim Not tainted 6.6.0-dirty #255
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
> rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
> RIP: 0010:wakeup_kswapd (./linux/mm/vmscan.c:7812)
> Code: (omitted)
> RSP: 0000:ffffc90004257d58 EFLAGS: 00010286
> RAX: ffffffffffffffff RBX: ffff88883fff0480 RCX: 0000000000000003
> RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff88883fff0480
> RBP: ffffffffffffffff R08: ff0003ffffffffff R09: ffffffffffffffff
> R10: ffff888106c95540 R11: 0000000055555554 R12: 0000000000000003
> R13: 0000000000000000 R14: 0000000000000000 R15: ffff88883fff0940
> FS: 00007fc4b8124740(0000) GS:ffff888827c00000(0000) knlGS:0000000000000000
> CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 00000000000033f3 CR3: 000000026cc08004 CR4: 0000000000770ee0
> DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> PKRU: 55555554
> Call Trace:
> <TASK>
> ? __die
> ? page_fault_oops
> ? __pte_offset_map_lock
> ? exc_page_fault
> ? asm_exc_page_fault
> ? wakeup_kswapd
> migrate_misplaced_page
> __handle_mm_fault
> handle_mm_fault
> do_user_addr_fault
> exc_page_fault
> asm_exc_page_fault
> RIP: 0033:0x55b897ba0808
> Code: (omitted)
> RSP: 002b:00007ffeefa821a0 EFLAGS: 00010287
> RAX: 000055b89983acd0 RBX: 00007ffeefa823f8 RCX: 000055b89983acd0
> RDX: 00007fc2f8122010 RSI: 0000000000020000 RDI: 000055b89983acd0
> RBP: 00007ffeefa821a0 R08: 0000000000000037 R09: 0000000000000075
> R10: 0000000000000000 R11: 0000000000000202 R12: 0000000000000000
> R13: 00007ffeefa82410 R14: 000055b897ba5dd8 R15: 00007fc4b8340000
> </TASK>
Fix this by avoiding any attempt to migrate memory to memoryless nodes.
Link: https://lkml.kernel.org/r/20240219041920.1183-1-byungchul@sk.com
Link: https://lkml.kernel.org/r/20240216111502.79759-1-byungchul@sk.com
Fixes: c574bbe917036 ("NUMA balancing: optimize page placement for memory tiering system")
Signed-off-by: Byungchul Park <byungchul(a)sk.com>
Reviewed-by: Oscar Salvador <osalvador(a)suse.de>
Reviewed-by: "Huang, Ying" <ying.huang(a)intel.com>
Reviewed-by: Phil Auld <pauld(a)redhat.com>
Cc: Benjamin Segall <bsegall(a)google.com>
Cc: Daniel Bristot de Oliveira <bristot(a)redhat.com>
Cc: Dietmar Eggemann <dietmar.eggemann(a)arm.com>
Cc: Ingo Molnar <mingo(a)redhat.com>
Cc: Juri Lelli <juri.lelli(a)redhat.com>
Cc: Mel Gorman <mgorman(a)suse.de>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Steven Rostedt <rostedt(a)goodmis.org>
Cc: Valentin Schneider <vschneid(a)redhat.com>
Cc: Vincent Guittot <vincent.guittot(a)linaro.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
kernel/sched/fair.c | 6 ++++++
1 file changed, 6 insertions(+)
--- a/kernel/sched/fair.c~sched-numa-mm-do-not-try-to-migrate-memory-to-memoryless-nodes
+++ a/kernel/sched/fair.c
@@ -1831,6 +1831,12 @@ bool should_numa_migrate_memory(struct t
int last_cpupid, this_cpupid;
/*
+ * Cannot migrate to memoryless nodes.
+ */
+ if (!node_state(dst_nid, N_MEMORY))
+ return false;
+
+ /*
* The pages in slow memory node should be migrated according
* to hot/cold instead of private/shared.
*/
_
Patches currently in -mm which might be from byungchul(a)sk.com are
sched-numa-mm-do-not-try-to-migrate-memory-to-memoryless-nodes.patch
mm-vmscan-dont-turn-on-cache_trim_mode-at-the-highest-scan-priority.patch
While mq_perf_tests runs with the default kselftest timeout limit, which
is 45 seconds, the test takes about 60 seconds to complete on i3.metal
AWS instances. Hence, the test always times out. Increase the timeout
to 180 seconds.
Link: https://lore.kernel.org/r/20240208212925.68286-1-sj@kernel.org
Fixes: 852c8cbf34d3 ("selftests/kselftest/runner.sh: Add 45 second timeout per test")
Cc: <stable(a)vger.kernel.org> # 5.4.x
Signed-off-by: SeongJae Park <sj(a)kernel.org>
Reviewed-by: Kees Cook <keescook(a)chromium.org>
---
Changes from v1
(https://lore.kernel.org/r/20240208212925.68286-1-sj@kernel.org)
- Use 180 seconds timeout instead of 100 seconds
tools/testing/selftests/mqueue/setting | 1 +
1 file changed, 1 insertion(+)
create mode 100644 tools/testing/selftests/mqueue/setting
diff --git a/tools/testing/selftests/mqueue/setting b/tools/testing/selftests/mqueue/setting
new file mode 100644
index 000000000000..a953c96aa16e
--- /dev/null
+++ b/tools/testing/selftests/mqueue/setting
@@ -0,0 +1 @@
+timeout=180
--
2.39.2
The patch titled
Subject: mm/damon/lru_sort: fix quota status loss due to online tunings
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-damon-lru_sort-fix-quota-status-loss-due-to-online-tunings.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: SeongJae Park <sj(a)kernel.org>
Subject: mm/damon/lru_sort: fix quota status loss due to online tunings
Date: Fri, 16 Feb 2024 11:40:25 -0800
For online parameters change, DAMON_LRU_SORT creates new schemes based on
latest values of the parameters and replaces the old schemes with the new
one. When creating it, the internal status of the quotas of the old
schemes is not preserved. As a result, charging of the quota starts from
zero after the online tuning. The data that was collected to estimate the
throughput of the scheme's action is also reset, and therefore the
estimation has to start from scratch again. Because the throughput
estimation is being used to convert the time quota to the effective size
quota, this could result in temporary time quota inaccuracy. It would be
recovered over time, though. In short, the quota accuracy could be
temporarily degraded after online parameters update.
Fix the problem by checking the case and copying the internal fields for
the status.
Link: https://lkml.kernel.org/r/20240216194025.9207-3-sj@kernel.org
Fixes: 40e983cca927 ("mm/damon: introduce DAMON-based LRU-lists Sorting")
Signed-off-by: SeongJae Park <sj(a)kernel.org>
Cc: <stable(a)vger.kernel.org> [6.0+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/damon/lru_sort.c | 43 +++++++++++++++++++++++++++++++++++-------
1 file changed, 36 insertions(+), 7 deletions(-)
--- a/mm/damon/lru_sort.c~mm-damon-lru_sort-fix-quota-status-loss-due-to-online-tunings
+++ a/mm/damon/lru_sort.c
@@ -185,9 +185,21 @@ static struct damos *damon_lru_sort_new_
return damon_lru_sort_new_scheme(&pattern, DAMOS_LRU_DEPRIO);
}
+static void damon_lru_sort_copy_quota_status(struct damos_quota *dst,
+ struct damos_quota *src)
+{
+ dst->total_charged_sz = src->total_charged_sz;
+ dst->total_charged_ns = src->total_charged_ns;
+ dst->charged_sz = src->charged_sz;
+ dst->charged_from = src->charged_from;
+ dst->charge_target_from = src->charge_target_from;
+ dst->charge_addr_from = src->charge_addr_from;
+}
+
static int damon_lru_sort_apply_parameters(void)
{
- struct damos *scheme;
+ struct damos *scheme, *hot_scheme, *cold_scheme;
+ struct damos *old_hot_scheme = NULL, *old_cold_scheme = NULL;
unsigned int hot_thres, cold_thres;
int err = 0;
@@ -195,18 +207,35 @@ static int damon_lru_sort_apply_paramete
if (err)
return err;
+ damon_for_each_scheme(scheme, ctx) {
+ if (!old_hot_scheme) {
+ old_hot_scheme = scheme;
+ continue;
+ }
+ old_cold_scheme = scheme;
+ }
+
hot_thres = damon_max_nr_accesses(&damon_lru_sort_mon_attrs) *
hot_thres_access_freq / 1000;
- scheme = damon_lru_sort_new_hot_scheme(hot_thres);
- if (!scheme)
+ hot_scheme = damon_lru_sort_new_hot_scheme(hot_thres);
+ if (!hot_scheme)
return -ENOMEM;
- damon_set_schemes(ctx, &scheme, 1);
+ if (old_hot_scheme)
+ damon_lru_sort_copy_quota_status(&hot_scheme->quota,
+ &old_hot_scheme->quota);
cold_thres = cold_min_age / damon_lru_sort_mon_attrs.aggr_interval;
- scheme = damon_lru_sort_new_cold_scheme(cold_thres);
- if (!scheme)
+ cold_scheme = damon_lru_sort_new_cold_scheme(cold_thres);
+ if (!cold_scheme) {
+ damon_destroy_scheme(hot_scheme);
return -ENOMEM;
- damon_add_scheme(ctx, scheme);
+ }
+ if (old_cold_scheme)
+ damon_lru_sort_copy_quota_status(&cold_scheme->quota,
+ &old_cold_scheme->quota);
+
+ damon_set_schemes(ctx, &hot_scheme, 1);
+ damon_add_scheme(ctx, cold_scheme);
return damon_set_region_biggest_system_ram_default(target,
&monitor_region_start,
_
Patches currently in -mm which might be from sj(a)kernel.org are
mm-damon-core-check-apply-interval-in-damon_do_apply_schemes.patch
mm-damon-sysfs-schemes-handle-schemes-sysfs-dir-removal-before-commit_schemes_quota_goals.patch
mm-damon-reclaim-fix-quota-stauts-loss-due-to-online-tunings.patch
mm-damon-lru_sort-fix-quota-status-loss-due-to-online-tunings.patch
docs-admin-guide-mm-damon-usage-use-sysfs-interface-for-tracepoints-example.patch
mm-damon-rename-config_damon_dbgfs-to-damon_dbgfs_deprecated.patch
mm-damon-dbgfs-implement-deprecation-notice-file.patch
mm-damon-dbgfs-make-debugfs-interface-deprecation-message-a-macro.patch
docs-admin-guide-mm-damon-usage-document-deprecated-file-of-damon-debugfs-interface.patch
selftets-damon-prepare-for-monitor_on-file-renaming.patch
mm-damon-dbgfs-rename-monitor_on-file-to-monitor_on_deprecated.patch
docs-admin-guide-mm-damon-usage-update-for-monitor_on-renaming.patch
docs-translations-damon-usage-update-for-monitor_on-renaming.patch
mm-damon-sysfs-handle-state-file-inputs-for-every-sampling-interval-if-possible.patch
selftests-damon-_damon_sysfs-support-damos-quota.patch
selftests-damon-_damon_sysfs-support-damos-stats.patch
selftests-damon-_damon_sysfs-support-damos-apply-interval.patch
selftests-damon-add-a-test-for-damos-quota.patch
selftests-damon-add-a-test-for-damos-apply-intervals.patch
selftests-damon-add-a-test-for-a-race-between-target_ids_read-and-dbgfs_before_terminate.patch
selftests-damon-add-a-test-for-the-pid-leak-of-dbgfs_target_ids_write.patch
selftests-damon-_chk_dependency-get-debugfs-mount-point-from-proc-mounts.patch
docs-mm-damon-maintainer-profile-fix-reference-links-for-mm-stable-tree.patch
docs-mm-damon-move-the-list-of-damos-actions-to-design-doc.patch
docs-mm-damon-move-damon-operation-sets-list-from-the-usage-to-the-design-document.patch
docs-mm-damon-move-monitoring-target-regions-setup-detail-from-the-usage-to-the-design-document.patch
docs-admin-guide-mm-damon-usage-fix-wrong-quotas-diabling-condition.patch
mm-damon-core-set-damos_quota-esz-as-public-field-and-document.patch
mm-damon-sysfs-schemes-implement-quota-effective_bytes-file.patch
mm-damon-sysfs-implement-a-kdamond-command-for-updating-schemes-effective-quotas.patch
docs-abi-damon-document-effective_bytes-sysfs-file.patch
docs-admin-guide-mm-damon-usage-document-effective_bytes-file.patch
mm-damon-move-comments-and-fields-for-damos-quota-prioritization-to-the-end.patch
mm-damon-core-split-out-quota-goal-related-fields-to-a-struct.patch
mm-damon-core-add-multiple-goals-per-damos_quota-and-helpers-for-those.patch
mm-damon-sysfs-use-only-quota-goals.patch
mm-damon-core-remove-goal-field-of-damos_quota.patch
mm-damon-core-let-goal-specified-with-only-target-and-current-values.patch
mm-damon-core-support-multiple-metrics-for-quota-goal.patch
mm-damon-core-implement-psi-metric-damos-quota-goal.patch
mm-damon-sysfs-schemes-support-psi-based-quota-auto-tune.patch
docs-mm-damon-design-document-quota-goal-self-tuning.patch
docs-abi-damon-document-quota-goal-metric-file.patch
docs-admin-guide-mm-damon-usage-document-quota-goal-metric-file.patch
mm-damon-reclaim-implement-user-feedback-driven-quota-auto-tuning.patch
mm-damon-reclaim-implement-memory-psi-driven-quota-self-tuning.patch
docs-admin-guide-mm-damon-reclaim-document-auto-tuning-parameters.patch
The patch titled
Subject: mm/damon/reclaim: fix quota stauts loss due to online tunings
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-damon-reclaim-fix-quota-stauts-loss-due-to-online-tunings.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: SeongJae Park <sj(a)kernel.org>
Subject: mm/damon/reclaim: fix quota stauts loss due to online tunings
Date: Fri, 16 Feb 2024 11:40:24 -0800
Patch series "mm/damon: fix quota status loss due to online tunings".
DAMON_RECLAIM and DAMON_LRU_SORT are not preserving internal quota status
when applying new user parameters, and hence could cause temporary quota
accuracy degradation. Fix it by preserving the status.
This patch (of 2):
For online parameters change, DAMON_RECLAIM creates new scheme based on
latest values of the parameters and replaces the old scheme with the new
one. When creating it, the internal status of the quota of the old
scheme is not preserved. As a result, charging of the quota starts from
zero after the online tuning. The data that was collected to estimate the
throughput of the scheme's action is also reset, and therefore the
estimation has to start from scratch again. Because the throughput
estimation is being used to convert the time quota to the effective size
quota, this could result in temporary time quota inaccuracy. It would be
recovered over time, though. In short, the quota accuracy could be
temporarily degraded after online parameters update.
Fix the problem by checking the case and copying the internal fields for
the status.
Link: https://lkml.kernel.org/r/20240216194025.9207-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20240216194025.9207-2-sj@kernel.org
Fixes: e035c280f6df ("mm/damon/reclaim: support online inputs update")
Signed-off-by: SeongJae Park <sj(a)kernel.org>
Cc: <stable(a)vger.kernel.org> [5.19+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/damon/reclaim.c | 18 +++++++++++++++++-
1 file changed, 17 insertions(+), 1 deletion(-)
--- a/mm/damon/reclaim.c~mm-damon-reclaim-fix-quota-stauts-loss-due-to-online-tunings
+++ a/mm/damon/reclaim.c
@@ -150,9 +150,20 @@ static struct damos *damon_reclaim_new_s
&damon_reclaim_wmarks);
}
+static void damon_reclaim_copy_quota_status(struct damos_quota *dst,
+ struct damos_quota *src)
+{
+ dst->total_charged_sz = src->total_charged_sz;
+ dst->total_charged_ns = src->total_charged_ns;
+ dst->charged_sz = src->charged_sz;
+ dst->charged_from = src->charged_from;
+ dst->charge_target_from = src->charge_target_from;
+ dst->charge_addr_from = src->charge_addr_from;
+}
+
static int damon_reclaim_apply_parameters(void)
{
- struct damos *scheme;
+ struct damos *scheme, *old_scheme;
struct damos_filter *filter;
int err = 0;
@@ -164,6 +175,11 @@ static int damon_reclaim_apply_parameter
scheme = damon_reclaim_new_scheme();
if (!scheme)
return -ENOMEM;
+ if (!list_empty(&ctx->schemes)) {
+ damon_for_each_scheme(old_scheme, ctx)
+ damon_reclaim_copy_quota_status(&scheme->quota,
+ &old_scheme->quota);
+ }
if (skip_anon) {
filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true);
if (!filter) {
_
Patches currently in -mm which might be from sj(a)kernel.org are
mm-damon-core-check-apply-interval-in-damon_do_apply_schemes.patch
mm-damon-sysfs-schemes-handle-schemes-sysfs-dir-removal-before-commit_schemes_quota_goals.patch
mm-damon-reclaim-fix-quota-stauts-loss-due-to-online-tunings.patch
mm-damon-lru_sort-fix-quota-status-loss-due-to-online-tunings.patch
docs-admin-guide-mm-damon-usage-use-sysfs-interface-for-tracepoints-example.patch
mm-damon-rename-config_damon_dbgfs-to-damon_dbgfs_deprecated.patch
mm-damon-dbgfs-implement-deprecation-notice-file.patch
mm-damon-dbgfs-make-debugfs-interface-deprecation-message-a-macro.patch
docs-admin-guide-mm-damon-usage-document-deprecated-file-of-damon-debugfs-interface.patch
selftets-damon-prepare-for-monitor_on-file-renaming.patch
mm-damon-dbgfs-rename-monitor_on-file-to-monitor_on_deprecated.patch
docs-admin-guide-mm-damon-usage-update-for-monitor_on-renaming.patch
docs-translations-damon-usage-update-for-monitor_on-renaming.patch
mm-damon-sysfs-handle-state-file-inputs-for-every-sampling-interval-if-possible.patch
selftests-damon-_damon_sysfs-support-damos-quota.patch
selftests-damon-_damon_sysfs-support-damos-stats.patch
selftests-damon-_damon_sysfs-support-damos-apply-interval.patch
selftests-damon-add-a-test-for-damos-quota.patch
selftests-damon-add-a-test-for-damos-apply-intervals.patch
selftests-damon-add-a-test-for-a-race-between-target_ids_read-and-dbgfs_before_terminate.patch
selftests-damon-add-a-test-for-the-pid-leak-of-dbgfs_target_ids_write.patch
selftests-damon-_chk_dependency-get-debugfs-mount-point-from-proc-mounts.patch
docs-mm-damon-maintainer-profile-fix-reference-links-for-mm-stable-tree.patch
docs-mm-damon-move-the-list-of-damos-actions-to-design-doc.patch
docs-mm-damon-move-damon-operation-sets-list-from-the-usage-to-the-design-document.patch
docs-mm-damon-move-monitoring-target-regions-setup-detail-from-the-usage-to-the-design-document.patch
docs-admin-guide-mm-damon-usage-fix-wrong-quotas-diabling-condition.patch
mm-damon-core-set-damos_quota-esz-as-public-field-and-document.patch
mm-damon-sysfs-schemes-implement-quota-effective_bytes-file.patch
mm-damon-sysfs-implement-a-kdamond-command-for-updating-schemes-effective-quotas.patch
docs-abi-damon-document-effective_bytes-sysfs-file.patch
docs-admin-guide-mm-damon-usage-document-effective_bytes-file.patch
mm-damon-move-comments-and-fields-for-damos-quota-prioritization-to-the-end.patch
mm-damon-core-split-out-quota-goal-related-fields-to-a-struct.patch
mm-damon-core-add-multiple-goals-per-damos_quota-and-helpers-for-those.patch
mm-damon-sysfs-use-only-quota-goals.patch
mm-damon-core-remove-goal-field-of-damos_quota.patch
mm-damon-core-let-goal-specified-with-only-target-and-current-values.patch
mm-damon-core-support-multiple-metrics-for-quota-goal.patch
mm-damon-core-implement-psi-metric-damos-quota-goal.patch
mm-damon-sysfs-schemes-support-psi-based-quota-auto-tune.patch
docs-mm-damon-design-document-quota-goal-self-tuning.patch
docs-abi-damon-document-quota-goal-metric-file.patch
docs-admin-guide-mm-damon-usage-document-quota-goal-metric-file.patch
mm-damon-reclaim-implement-user-feedback-driven-quota-auto-tuning.patch
mm-damon-reclaim-implement-memory-psi-driven-quota-self-tuning.patch
docs-admin-guide-mm-damon-reclaim-document-auto-tuning-parameters.patch
While mq_perf_tests runs with the default kselftest timeout limit, which
is 45 seconds, the test takes about 60 seconds to complete on i3.metal
AWS instances. Hence, the test always times out. Increase the timeout
to 100 seconds.
Fixes: 852c8cbf34d3 ("selftests/kselftest/runner.sh: Add 45 second timeout per test")
Cc: <stable(a)vger.kernel.org> # 5.4.x
Signed-off-by: SeongJae Park <sj(a)kernel.org>
---
tools/testing/selftests/mqueue/setting | 1 +
1 file changed, 1 insertion(+)
create mode 100644 tools/testing/selftests/mqueue/setting
diff --git a/tools/testing/selftests/mqueue/setting b/tools/testing/selftests/mqueue/setting
new file mode 100644
index 000000000000..54dc12287839
--- /dev/null
+++ b/tools/testing/selftests/mqueue/setting
@@ -0,0 +1 @@
+timeout=100
--
2.39.2
When linking or renaming a file, if only one of the source or
destination directory is backed by an S_PRIVATE inode, then the related
set of layer masks would be used as uninitialized by
is_access_to_paths_allowed(). This would result in nondeterministic
access for one side instead of always being allowed.
This bug could only be triggered with a mounted filesystem containing
both S_PRIVATE and !S_PRIVATE inodes, which doesn't seem possible.
The collect_domain_accesses() calls return early if
is_nouser_or_private() returns true, which means that the directory's
superblock has SB_NOUSER or its inode has S_PRIVATE. Because rename or
link actions are only allowed on the same mounted filesystem, the
superblock is always the same for both source and destination
directories. However, it might be possible in theory to have an
S_PRIVATE parent source inode with an !S_PRIVATE parent destination
inode, or vice versa.
To make sure this case is not an issue, explicitly initialize both sets
of layer masks to 0, which means to allow all actions on the related
side. If at least one side has !S_PRIVATE, then
collect_domain_accesses() and is_access_to_paths_allowed() check for the
required access rights.
Cc: Arnd Bergmann <arnd(a)arndb.de>
Cc: Christian Brauner <brauner(a)kernel.org>
Cc: Günther Noack <gnoack(a)google.com>
Cc: Jann Horn <jannh(a)google.com>
Cc: Shervin Oloumi <enlightened(a)chromium.org>
Cc: stable(a)vger.kernel.org
Fixes: b91c3e4ea756 ("landlock: Add support for file reparenting with LANDLOCK_ACCESS_FS_REFER")
Signed-off-by: Mickaël Salaün <mic(a)digikod.net>
---
security/landlock/fs.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/security/landlock/fs.c b/security/landlock/fs.c
index 90f7f6db1e87..f243c6a392ee 100644
--- a/security/landlock/fs.c
+++ b/security/landlock/fs.c
@@ -1093,8 +1093,8 @@ static int current_check_refer_path(struct dentry *const old_dentry,
bool allow_parent1, allow_parent2;
access_mask_t access_request_parent1, access_request_parent2;
struct path mnt_dir;
- layer_mask_t layer_masks_parent1[LANDLOCK_NUM_ACCESS_FS],
- layer_masks_parent2[LANDLOCK_NUM_ACCESS_FS];
+ layer_mask_t layer_masks_parent1[LANDLOCK_NUM_ACCESS_FS] = {},
+ layer_masks_parent2[LANDLOCK_NUM_ACCESS_FS] = {};
if (!dom)
return 0;
--
2.43.0
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x 4e440abc894585a34c2904a32cd54af1742311b3
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021941-jelly-tubular-7919@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
4e440abc8945 ("hwmon: (coretemp) Fix out-of-bounds memory access")
7108b80a542b ("hwmon/coretemp: Handle large core ID value")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 4e440abc894585a34c2904a32cd54af1742311b3 Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang(a)intel.com>
Date: Fri, 2 Feb 2024 17:21:34 +0800
Subject: [PATCH] hwmon: (coretemp) Fix out-of-bounds memory access
Fix a bug that pdata->cpu_map[] is set before out-of-bounds check.
The problem might be triggered on systems with more than 128 cores per
package.
Fixes: 7108b80a542b ("hwmon/coretemp: Handle large core ID value")
Signed-off-by: Zhang Rui <rui.zhang(a)intel.com>
Cc: <stable(a)vger.kernel.org>
Link: https://lore.kernel.org/r/20240202092144.71180-2-rui.zhang@intel.com
Signed-off-by: Guenter Roeck <linux(a)roeck-us.net>
diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c
index ba82d1e79c13..e78c76919111 100644
--- a/drivers/hwmon/coretemp.c
+++ b/drivers/hwmon/coretemp.c
@@ -509,18 +509,14 @@ static int create_core_data(struct platform_device *pdev, unsigned int cpu,
if (pkg_flag) {
attr_no = PKG_SYSFS_ATTR_NO;
} else {
- index = ida_alloc(&pdata->ida, GFP_KERNEL);
+ index = ida_alloc_max(&pdata->ida, NUM_REAL_CORES - 1, GFP_KERNEL);
if (index < 0)
return index;
+
pdata->cpu_map[index] = topology_core_id(cpu);
attr_no = index + BASE_SYSFS_ATTR_NO;
}
- if (attr_no > MAX_CORE_DATA - 1) {
- err = -ERANGE;
- goto ida_free;
- }
-
tdata = init_temp_data(cpu, pkg_flag);
if (!tdata) {
err = -ENOMEM;
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 4e440abc894585a34c2904a32cd54af1742311b3
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021940-monsieur-unshipped-a926@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
4e440abc8945 ("hwmon: (coretemp) Fix out-of-bounds memory access")
7108b80a542b ("hwmon/coretemp: Handle large core ID value")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 4e440abc894585a34c2904a32cd54af1742311b3 Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang(a)intel.com>
Date: Fri, 2 Feb 2024 17:21:34 +0800
Subject: [PATCH] hwmon: (coretemp) Fix out-of-bounds memory access
Fix a bug that pdata->cpu_map[] is set before out-of-bounds check.
The problem might be triggered on systems with more than 128 cores per
package.
Fixes: 7108b80a542b ("hwmon/coretemp: Handle large core ID value")
Signed-off-by: Zhang Rui <rui.zhang(a)intel.com>
Cc: <stable(a)vger.kernel.org>
Link: https://lore.kernel.org/r/20240202092144.71180-2-rui.zhang@intel.com
Signed-off-by: Guenter Roeck <linux(a)roeck-us.net>
diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c
index ba82d1e79c13..e78c76919111 100644
--- a/drivers/hwmon/coretemp.c
+++ b/drivers/hwmon/coretemp.c
@@ -509,18 +509,14 @@ static int create_core_data(struct platform_device *pdev, unsigned int cpu,
if (pkg_flag) {
attr_no = PKG_SYSFS_ATTR_NO;
} else {
- index = ida_alloc(&pdata->ida, GFP_KERNEL);
+ index = ida_alloc_max(&pdata->ida, NUM_REAL_CORES - 1, GFP_KERNEL);
if (index < 0)
return index;
+
pdata->cpu_map[index] = topology_core_id(cpu);
attr_no = index + BASE_SYSFS_ATTR_NO;
}
- if (attr_no > MAX_CORE_DATA - 1) {
- err = -ERANGE;
- goto ida_free;
- }
-
tdata = init_temp_data(cpu, pkg_flag);
if (!tdata) {
err = -ENOMEM;
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 4e440abc894585a34c2904a32cd54af1742311b3
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021939-concierge-eclipse-4ffd@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
4e440abc8945 ("hwmon: (coretemp) Fix out-of-bounds memory access")
7108b80a542b ("hwmon/coretemp: Handle large core ID value")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 4e440abc894585a34c2904a32cd54af1742311b3 Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang(a)intel.com>
Date: Fri, 2 Feb 2024 17:21:34 +0800
Subject: [PATCH] hwmon: (coretemp) Fix out-of-bounds memory access
Fix a bug where pdata->cpu_map[] is set before the out-of-bounds check.
The problem might be triggered on systems with more than 128 cores per
package.
Fixes: 7108b80a542b ("hwmon/coretemp: Handle large core ID value")
Signed-off-by: Zhang Rui <rui.zhang(a)intel.com>
Cc: <stable(a)vger.kernel.org>
Link: https://lore.kernel.org/r/20240202092144.71180-2-rui.zhang@intel.com
Signed-off-by: Guenter Roeck <linux(a)roeck-us.net>
diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c
index ba82d1e79c13..e78c76919111 100644
--- a/drivers/hwmon/coretemp.c
+++ b/drivers/hwmon/coretemp.c
@@ -509,18 +509,14 @@ static int create_core_data(struct platform_device *pdev, unsigned int cpu,
if (pkg_flag) {
attr_no = PKG_SYSFS_ATTR_NO;
} else {
- index = ida_alloc(&pdata->ida, GFP_KERNEL);
+ index = ida_alloc_max(&pdata->ida, NUM_REAL_CORES - 1, GFP_KERNEL);
if (index < 0)
return index;
+
pdata->cpu_map[index] = topology_core_id(cpu);
attr_no = index + BASE_SYSFS_ATTR_NO;
}
- if (attr_no > MAX_CORE_DATA - 1) {
- err = -ERANGE;
- goto ida_free;
- }
-
tdata = init_temp_data(cpu, pkg_flag);
if (!tdata) {
err = -ENOMEM;
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 4e440abc894585a34c2904a32cd54af1742311b3
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021939-blunt-uncouple-5a31@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
4e440abc8945 ("hwmon: (coretemp) Fix out-of-bounds memory access")
7108b80a542b ("hwmon/coretemp: Handle large core ID value")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 4e440abc894585a34c2904a32cd54af1742311b3 Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang(a)intel.com>
Date: Fri, 2 Feb 2024 17:21:34 +0800
Subject: [PATCH] hwmon: (coretemp) Fix out-of-bounds memory access
Fix a bug where pdata->cpu_map[] is set before the out-of-bounds check.
The problem might be triggered on systems with more than 128 cores per
package.
Fixes: 7108b80a542b ("hwmon/coretemp: Handle large core ID value")
Signed-off-by: Zhang Rui <rui.zhang(a)intel.com>
Cc: <stable(a)vger.kernel.org>
Link: https://lore.kernel.org/r/20240202092144.71180-2-rui.zhang@intel.com
Signed-off-by: Guenter Roeck <linux(a)roeck-us.net>
diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c
index ba82d1e79c13..e78c76919111 100644
--- a/drivers/hwmon/coretemp.c
+++ b/drivers/hwmon/coretemp.c
@@ -509,18 +509,14 @@ static int create_core_data(struct platform_device *pdev, unsigned int cpu,
if (pkg_flag) {
attr_no = PKG_SYSFS_ATTR_NO;
} else {
- index = ida_alloc(&pdata->ida, GFP_KERNEL);
+ index = ida_alloc_max(&pdata->ida, NUM_REAL_CORES - 1, GFP_KERNEL);
if (index < 0)
return index;
+
pdata->cpu_map[index] = topology_core_id(cpu);
attr_no = index + BASE_SYSFS_ATTR_NO;
}
- if (attr_no > MAX_CORE_DATA - 1) {
- err = -ERANGE;
- goto ida_free;
- }
-
tdata = init_temp_data(cpu, pkg_flag);
if (!tdata) {
err = -ENOMEM;
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 4e440abc894585a34c2904a32cd54af1742311b3
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021938-relieving-boneless-4cca@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
4e440abc8945 ("hwmon: (coretemp) Fix out-of-bounds memory access")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 4e440abc894585a34c2904a32cd54af1742311b3 Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang(a)intel.com>
Date: Fri, 2 Feb 2024 17:21:34 +0800
Subject: [PATCH] hwmon: (coretemp) Fix out-of-bounds memory access
Fix a bug where pdata->cpu_map[] is set before the out-of-bounds check.
The problem might be triggered on systems with more than 128 cores per
package.
Fixes: 7108b80a542b ("hwmon/coretemp: Handle large core ID value")
Signed-off-by: Zhang Rui <rui.zhang(a)intel.com>
Cc: <stable(a)vger.kernel.org>
Link: https://lore.kernel.org/r/20240202092144.71180-2-rui.zhang@intel.com
Signed-off-by: Guenter Roeck <linux(a)roeck-us.net>
diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c
index ba82d1e79c13..e78c76919111 100644
--- a/drivers/hwmon/coretemp.c
+++ b/drivers/hwmon/coretemp.c
@@ -509,18 +509,14 @@ static int create_core_data(struct platform_device *pdev, unsigned int cpu,
if (pkg_flag) {
attr_no = PKG_SYSFS_ATTR_NO;
} else {
- index = ida_alloc(&pdata->ida, GFP_KERNEL);
+ index = ida_alloc_max(&pdata->ida, NUM_REAL_CORES - 1, GFP_KERNEL);
if (index < 0)
return index;
+
pdata->cpu_map[index] = topology_core_id(cpu);
attr_no = index + BASE_SYSFS_ATTR_NO;
}
- if (attr_no > MAX_CORE_DATA - 1) {
- err = -ERANGE;
- goto ida_free;
- }
-
tdata = init_temp_data(cpu, pkg_flag);
if (!tdata) {
err = -ENOMEM;
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 4e440abc894585a34c2904a32cd54af1742311b3
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021937-revered-agreement-261e@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
4e440abc8945 ("hwmon: (coretemp) Fix out-of-bounds memory access")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 4e440abc894585a34c2904a32cd54af1742311b3 Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang(a)intel.com>
Date: Fri, 2 Feb 2024 17:21:34 +0800
Subject: [PATCH] hwmon: (coretemp) Fix out-of-bounds memory access
Fix a bug where pdata->cpu_map[] is set before the out-of-bounds check.
The problem might be triggered on systems with more than 128 cores per
package.
Fixes: 7108b80a542b ("hwmon/coretemp: Handle large core ID value")
Signed-off-by: Zhang Rui <rui.zhang(a)intel.com>
Cc: <stable(a)vger.kernel.org>
Link: https://lore.kernel.org/r/20240202092144.71180-2-rui.zhang@intel.com
Signed-off-by: Guenter Roeck <linux(a)roeck-us.net>
diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c
index ba82d1e79c13..e78c76919111 100644
--- a/drivers/hwmon/coretemp.c
+++ b/drivers/hwmon/coretemp.c
@@ -509,18 +509,14 @@ static int create_core_data(struct platform_device *pdev, unsigned int cpu,
if (pkg_flag) {
attr_no = PKG_SYSFS_ATTR_NO;
} else {
- index = ida_alloc(&pdata->ida, GFP_KERNEL);
+ index = ida_alloc_max(&pdata->ida, NUM_REAL_CORES - 1, GFP_KERNEL);
if (index < 0)
return index;
+
pdata->cpu_map[index] = topology_core_id(cpu);
attr_no = index + BASE_SYSFS_ATTR_NO;
}
- if (attr_no > MAX_CORE_DATA - 1) {
- err = -ERANGE;
- goto ida_free;
- }
-
tdata = init_temp_data(cpu, pkg_flag);
if (!tdata) {
err = -ENOMEM;
The patch below does not apply to the 6.7-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.7.y
git checkout FETCH_HEAD
git cherry-pick -x 4e440abc894585a34c2904a32cd54af1742311b3
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021936-mulch-prone-c0a6@gregkh' --subject-prefix 'PATCH 6.7.y' HEAD^..
Possible dependencies:
4e440abc8945 ("hwmon: (coretemp) Fix out-of-bounds memory access")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 4e440abc894585a34c2904a32cd54af1742311b3 Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang(a)intel.com>
Date: Fri, 2 Feb 2024 17:21:34 +0800
Subject: [PATCH] hwmon: (coretemp) Fix out-of-bounds memory access
Fix a bug where pdata->cpu_map[] is set before the out-of-bounds check.
The problem might be triggered on systems with more than 128 cores per
package.
Fixes: 7108b80a542b ("hwmon/coretemp: Handle large core ID value")
Signed-off-by: Zhang Rui <rui.zhang(a)intel.com>
Cc: <stable(a)vger.kernel.org>
Link: https://lore.kernel.org/r/20240202092144.71180-2-rui.zhang@intel.com
Signed-off-by: Guenter Roeck <linux(a)roeck-us.net>
diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c
index ba82d1e79c13..e78c76919111 100644
--- a/drivers/hwmon/coretemp.c
+++ b/drivers/hwmon/coretemp.c
@@ -509,18 +509,14 @@ static int create_core_data(struct platform_device *pdev, unsigned int cpu,
if (pkg_flag) {
attr_no = PKG_SYSFS_ATTR_NO;
} else {
- index = ida_alloc(&pdata->ida, GFP_KERNEL);
+ index = ida_alloc_max(&pdata->ida, NUM_REAL_CORES - 1, GFP_KERNEL);
if (index < 0)
return index;
+
pdata->cpu_map[index] = topology_core_id(cpu);
attr_no = index + BASE_SYSFS_ATTR_NO;
}
- if (attr_no > MAX_CORE_DATA - 1) {
- err = -ERANGE;
- goto ida_free;
- }
-
tdata = init_temp_data(cpu, pkg_flag);
if (!tdata) {
err = -ENOMEM;
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x fa765c4b4aed2d64266b694520ecb025c862c5a9
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021921-why-roamer-871c@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
fa765c4b4aed ("xen/events: close evtchn after mapping cleanup")
3fcdaf3d7634 ("xen/events: modify internal [un]bind interfaces")
5dd9ad32d775 ("xen/events: drop xen_allocate_irqs_dynamic()")
3bdb0ac350fe ("xen/events: remove some simple helpers from events_base.c")
686464514fbe ("xen/events: reduce externally visible helper functions")
e64e7c74b99e ("xen/events: avoid using info_for_irq() in xen_send_IPI_one()")
9e90e58c11b7 ("xen: evtchn: Allow shared registration of IRQ handers")
87797fad6cce ("xen/events: replace evtchn_rwlock with RCU")
58f6259b7a08 ("xen/evtchn: Introduce new IOCTL to bind static evtchn")
04d684875b30 ("xen: xen_debug_interrupt prototype to global header")
073352e951f6 ("genirq: Add and use an irq_data_update_affinity helper")
961343d78226 ("genirq: Refactor accessors to use irq_data_get_affinity_mask")
83f877a09516 ("xen/events: remove redundant initialization of variable irq")
3de218ff39b9 ("xen/events: reset active flag for lateeoi events later")
d120198bd5ff ("xen/evtchn: Change irq_info lock to raw_spinlock_t")
b6622798bc50 ("xen/events: avoid handling the same event on two cpus at the same time")
25da4618af24 ("xen/events: don't unmask an event channel when an eoi is pending")
06f45fe96fcd ("xen/events: add per-xenbus device event statistics and settings")
f2fa0e5e9f31 ("xen/events: link interdomain events to associated xenbus device")
88f0a9d06644 ("xen/events: Implement irq distribution")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From fa765c4b4aed2d64266b694520ecb025c862c5a9 Mon Sep 17 00:00:00 2001
From: Maximilian Heyne <mheyne(a)amazon.de>
Date: Wed, 24 Jan 2024 16:31:28 +0000
Subject: [PATCH] xen/events: close evtchn after mapping cleanup
shutdown_pirq and startup_pirq are not taking the
irq_mapping_update_lock because they can't due to lock inversion. Both
are called with the irq_desc->lock being taken. The lock order,
however, is first irq_mapping_update_lock and then irq_desc->lock.
This opens multiple races:
- shutdown_pirq can be interrupted by a function that allocates an event
channel:
CPU0 CPU1
shutdown_pirq {
xen_evtchn_close(e)
__startup_pirq {
EVTCHNOP_bind_pirq
-> returns just freed evtchn e
set_evtchn_to_irq(e, irq)
}
xen_irq_info_cleanup() {
set_evtchn_to_irq(e, -1)
}
}
Assume here that event channel e refers to the same event channel
number.
After this race the evtchn_to_irq mapping for e is invalid (-1).
- __startup_pirq races with __unbind_from_irq in a similar way. Because
__startup_pirq doesn't take irq_mapping_update_lock it can grab the
evtchn that __unbind_from_irq is currently freeing and cleaning up. In
this case even though the event channel is allocated, its mapping can
be unset in evtchn_to_irq.
The fix is to first cleanup the mappings and then close the event
channel. In this way, when an event channel gets allocated its
potential previous evtchn_to_irq mappings are guaranteed to be unset already.
This is also the reverse order of the allocation where first the event
channel is allocated and then the mappings are setup.
On a 5.10 kernel prior to commit 3fcdaf3d7634 ("xen/events: modify internal
[un]bind interfaces"), we hit a BUG like the following during probing of NVMe
devices. The issue is that during nvme_setup_io_queues, pci_free_irq
is called for every device which results in a call to shutdown_pirq.
With many nvme devices it's therefore likely to hit this race during
boot because multiple calls to shutdown_pirq and
startup_pirq are potentially running in parallel.
------------[ cut here ]------------
blkfront: xvda: barrier or flush: disabled; persistent grants: enabled; indirect descriptors: enabled; bounce buffer: enabled
kernel BUG at drivers/xen/events/events_base.c:499!
invalid opcode: 0000 [#1] SMP PTI
CPU: 44 PID: 375 Comm: kworker/u257:23 Not tainted 5.10.201-191.748.amzn2.x86_64 #1
Hardware name: Xen HVM domU, BIOS 4.11.amazon 08/24/2006
Workqueue: nvme-reset-wq nvme_reset_work
RIP: 0010:bind_evtchn_to_cpu+0xdf/0xf0
Code: 5d 41 5e c3 cc cc cc cc 44 89 f7 e8 2b 55 ad ff 49 89 c5 48 85 c0 0f 84 64 ff ff ff 4c 8b 68 30 41 83 fe ff 0f 85 60 ff ff ff <0f> 0b 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 0f 1f 44 00 00
RSP: 0000:ffffc9000d533b08 EFLAGS: 00010046
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000006
RDX: 0000000000000028 RSI: 00000000ffffffff RDI: 00000000ffffffff
RBP: ffff888107419680 R08: 0000000000000000 R09: ffffffff82d72b00
R10: 0000000000000000 R11: 0000000000000000 R12: 00000000000001ed
R13: 0000000000000000 R14: 00000000ffffffff R15: 0000000000000002
FS: 0000000000000000(0000) GS:ffff88bc8b500000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000000 CR3: 0000000002610001 CR4: 00000000001706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
? show_trace_log_lvl+0x1c1/0x2d9
? show_trace_log_lvl+0x1c1/0x2d9
? set_affinity_irq+0xdc/0x1c0
? __die_body.cold+0x8/0xd
? die+0x2b/0x50
? do_trap+0x90/0x110
? bind_evtchn_to_cpu+0xdf/0xf0
? do_error_trap+0x65/0x80
? bind_evtchn_to_cpu+0xdf/0xf0
? exc_invalid_op+0x4e/0x70
? bind_evtchn_to_cpu+0xdf/0xf0
? asm_exc_invalid_op+0x12/0x20
? bind_evtchn_to_cpu+0xdf/0xf0
? bind_evtchn_to_cpu+0xc5/0xf0
set_affinity_irq+0xdc/0x1c0
irq_do_set_affinity+0x1d7/0x1f0
irq_setup_affinity+0xd6/0x1a0
irq_startup+0x8a/0xf0
__setup_irq+0x639/0x6d0
? nvme_suspend+0x150/0x150
request_threaded_irq+0x10c/0x180
? nvme_suspend+0x150/0x150
pci_request_irq+0xa8/0xf0
? __blk_mq_free_request+0x74/0xa0
queue_request_irq+0x6f/0x80
nvme_create_queue+0x1af/0x200
nvme_create_io_queues+0xbd/0xf0
nvme_setup_io_queues+0x246/0x320
? nvme_irq_check+0x30/0x30
nvme_reset_work+0x1c8/0x400
process_one_work+0x1b0/0x350
worker_thread+0x49/0x310
? process_one_work+0x350/0x350
kthread+0x11b/0x140
? __kthread_bind_mask+0x60/0x60
ret_from_fork+0x22/0x30
Modules linked in:
---[ end trace a11715de1eee1873 ]---
Fixes: d46a78b05c0e ("xen: implement pirq type event channels")
Cc: stable(a)vger.kernel.org
Co-debugged-by: Andrew Panyakin <apanyaki(a)amazon.com>
Signed-off-by: Maximilian Heyne <mheyne(a)amazon.de>
Reviewed-by: Juergen Gross <jgross(a)suse.com>
Link: https://lore.kernel.org/r/20240124163130.31324-1-mheyne@amazon.de
Signed-off-by: Juergen Gross <jgross(a)suse.com>
diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c
index b8cfea7812d6..3b9f080109d7 100644
--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -923,8 +923,8 @@ static void shutdown_pirq(struct irq_data *data)
return;
do_mask(info, EVT_MASK_REASON_EXPLICIT);
- xen_evtchn_close(evtchn);
xen_irq_info_cleanup(info);
+ xen_evtchn_close(evtchn);
}
static void enable_pirq(struct irq_data *data)
@@ -956,6 +956,7 @@ EXPORT_SYMBOL_GPL(xen_irq_from_gsi);
static void __unbind_from_irq(struct irq_info *info, unsigned int irq)
{
evtchn_port_t evtchn;
+ bool close_evtchn = false;
if (!info) {
xen_irq_free_desc(irq);
@@ -975,7 +976,7 @@ static void __unbind_from_irq(struct irq_info *info, unsigned int irq)
struct xenbus_device *dev;
if (!info->is_static)
- xen_evtchn_close(evtchn);
+ close_evtchn = true;
switch (info->type) {
case IRQT_VIRQ:
@@ -995,6 +996,9 @@ static void __unbind_from_irq(struct irq_info *info, unsigned int irq)
}
xen_irq_info_cleanup(info);
+
+ if (close_evtchn)
+ xen_evtchn_close(evtchn);
}
xen_free_irq(info);
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x fa765c4b4aed2d64266b694520ecb025c862c5a9
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021917-krypton-upcountry-d467@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
fa765c4b4aed ("xen/events: close evtchn after mapping cleanup")
3fcdaf3d7634 ("xen/events: modify internal [un]bind interfaces")
5dd9ad32d775 ("xen/events: drop xen_allocate_irqs_dynamic()")
3bdb0ac350fe ("xen/events: remove some simple helpers from events_base.c")
686464514fbe ("xen/events: reduce externally visible helper functions")
e64e7c74b99e ("xen/events: avoid using info_for_irq() in xen_send_IPI_one()")
9e90e58c11b7 ("xen: evtchn: Allow shared registration of IRQ handers")
87797fad6cce ("xen/events: replace evtchn_rwlock with RCU")
58f6259b7a08 ("xen/evtchn: Introduce new IOCTL to bind static evtchn")
04d684875b30 ("xen: xen_debug_interrupt prototype to global header")
073352e951f6 ("genirq: Add and use an irq_data_update_affinity helper")
961343d78226 ("genirq: Refactor accessors to use irq_data_get_affinity_mask")
83f877a09516 ("xen/events: remove redundant initialization of variable irq")
3de218ff39b9 ("xen/events: reset active flag for lateeoi events later")
d120198bd5ff ("xen/evtchn: Change irq_info lock to raw_spinlock_t")
b6622798bc50 ("xen/events: avoid handling the same event on two cpus at the same time")
25da4618af24 ("xen/events: don't unmask an event channel when an eoi is pending")
06f45fe96fcd ("xen/events: add per-xenbus device event statistics and settings")
f2fa0e5e9f31 ("xen/events: link interdomain events to associated xenbus device")
88f0a9d06644 ("xen/events: Implement irq distribution")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From fa765c4b4aed2d64266b694520ecb025c862c5a9 Mon Sep 17 00:00:00 2001
From: Maximilian Heyne <mheyne(a)amazon.de>
Date: Wed, 24 Jan 2024 16:31:28 +0000
Subject: [PATCH] xen/events: close evtchn after mapping cleanup
shutdown_pirq and startup_pirq are not taking the
irq_mapping_update_lock because they can't due to lock inversion. Both
are called with the irq_desc->lock being taken. The lock order,
however, is first irq_mapping_update_lock and then irq_desc->lock.
This opens multiple races:
- shutdown_pirq can be interrupted by a function that allocates an event
channel:
CPU0 CPU1
shutdown_pirq {
xen_evtchn_close(e)
__startup_pirq {
EVTCHNOP_bind_pirq
-> returns just freed evtchn e
set_evtchn_to_irq(e, irq)
}
xen_irq_info_cleanup() {
set_evtchn_to_irq(e, -1)
}
}
Assume here that event channel e refers to the same event channel
number.
After this race the evtchn_to_irq mapping for e is invalid (-1).
- __startup_pirq races with __unbind_from_irq in a similar way. Because
__startup_pirq doesn't take irq_mapping_update_lock it can grab the
evtchn that __unbind_from_irq is currently freeing and cleaning up. In
this case even though the event channel is allocated, its mapping can
be unset in evtchn_to_irq.
The fix is to first cleanup the mappings and then close the event
channel. In this way, when an event channel gets allocated its
potential previous evtchn_to_irq mappings are guaranteed to be unset already.
This is also the reverse order of the allocation where first the event
channel is allocated and then the mappings are setup.
On a 5.10 kernel prior to commit 3fcdaf3d7634 ("xen/events: modify internal
[un]bind interfaces"), we hit a BUG like the following during probing of NVMe
devices. The issue is that during nvme_setup_io_queues, pci_free_irq
is called for every device which results in a call to shutdown_pirq.
With many nvme devices it's therefore likely to hit this race during
boot because multiple calls to shutdown_pirq and
startup_pirq are potentially running in parallel.
------------[ cut here ]------------
blkfront: xvda: barrier or flush: disabled; persistent grants: enabled; indirect descriptors: enabled; bounce buffer: enabled
kernel BUG at drivers/xen/events/events_base.c:499!
invalid opcode: 0000 [#1] SMP PTI
CPU: 44 PID: 375 Comm: kworker/u257:23 Not tainted 5.10.201-191.748.amzn2.x86_64 #1
Hardware name: Xen HVM domU, BIOS 4.11.amazon 08/24/2006
Workqueue: nvme-reset-wq nvme_reset_work
RIP: 0010:bind_evtchn_to_cpu+0xdf/0xf0
Code: 5d 41 5e c3 cc cc cc cc 44 89 f7 e8 2b 55 ad ff 49 89 c5 48 85 c0 0f 84 64 ff ff ff 4c 8b 68 30 41 83 fe ff 0f 85 60 ff ff ff <0f> 0b 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 0f 1f 44 00 00
RSP: 0000:ffffc9000d533b08 EFLAGS: 00010046
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000006
RDX: 0000000000000028 RSI: 00000000ffffffff RDI: 00000000ffffffff
RBP: ffff888107419680 R08: 0000000000000000 R09: ffffffff82d72b00
R10: 0000000000000000 R11: 0000000000000000 R12: 00000000000001ed
R13: 0000000000000000 R14: 00000000ffffffff R15: 0000000000000002
FS: 0000000000000000(0000) GS:ffff88bc8b500000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000000 CR3: 0000000002610001 CR4: 00000000001706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
? show_trace_log_lvl+0x1c1/0x2d9
? show_trace_log_lvl+0x1c1/0x2d9
? set_affinity_irq+0xdc/0x1c0
? __die_body.cold+0x8/0xd
? die+0x2b/0x50
? do_trap+0x90/0x110
? bind_evtchn_to_cpu+0xdf/0xf0
? do_error_trap+0x65/0x80
? bind_evtchn_to_cpu+0xdf/0xf0
? exc_invalid_op+0x4e/0x70
? bind_evtchn_to_cpu+0xdf/0xf0
? asm_exc_invalid_op+0x12/0x20
? bind_evtchn_to_cpu+0xdf/0xf0
? bind_evtchn_to_cpu+0xc5/0xf0
set_affinity_irq+0xdc/0x1c0
irq_do_set_affinity+0x1d7/0x1f0
irq_setup_affinity+0xd6/0x1a0
irq_startup+0x8a/0xf0
__setup_irq+0x639/0x6d0
? nvme_suspend+0x150/0x150
request_threaded_irq+0x10c/0x180
? nvme_suspend+0x150/0x150
pci_request_irq+0xa8/0xf0
? __blk_mq_free_request+0x74/0xa0
queue_request_irq+0x6f/0x80
nvme_create_queue+0x1af/0x200
nvme_create_io_queues+0xbd/0xf0
nvme_setup_io_queues+0x246/0x320
? nvme_irq_check+0x30/0x30
nvme_reset_work+0x1c8/0x400
process_one_work+0x1b0/0x350
worker_thread+0x49/0x310
? process_one_work+0x350/0x350
kthread+0x11b/0x140
? __kthread_bind_mask+0x60/0x60
ret_from_fork+0x22/0x30
Modules linked in:
---[ end trace a11715de1eee1873 ]---
Fixes: d46a78b05c0e ("xen: implement pirq type event channels")
Cc: stable(a)vger.kernel.org
Co-debugged-by: Andrew Panyakin <apanyaki(a)amazon.com>
Signed-off-by: Maximilian Heyne <mheyne(a)amazon.de>
Reviewed-by: Juergen Gross <jgross(a)suse.com>
Link: https://lore.kernel.org/r/20240124163130.31324-1-mheyne@amazon.de
Signed-off-by: Juergen Gross <jgross(a)suse.com>
diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c
index b8cfea7812d6..3b9f080109d7 100644
--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -923,8 +923,8 @@ static void shutdown_pirq(struct irq_data *data)
return;
do_mask(info, EVT_MASK_REASON_EXPLICIT);
- xen_evtchn_close(evtchn);
xen_irq_info_cleanup(info);
+ xen_evtchn_close(evtchn);
}
static void enable_pirq(struct irq_data *data)
@@ -956,6 +956,7 @@ EXPORT_SYMBOL_GPL(xen_irq_from_gsi);
static void __unbind_from_irq(struct irq_info *info, unsigned int irq)
{
evtchn_port_t evtchn;
+ bool close_evtchn = false;
if (!info) {
xen_irq_free_desc(irq);
@@ -975,7 +976,7 @@ static void __unbind_from_irq(struct irq_info *info, unsigned int irq)
struct xenbus_device *dev;
if (!info->is_static)
- xen_evtchn_close(evtchn);
+ close_evtchn = true;
switch (info->type) {
case IRQT_VIRQ:
@@ -995,6 +996,9 @@ static void __unbind_from_irq(struct irq_info *info, unsigned int irq)
}
xen_irq_info_cleanup(info);
+
+ if (close_evtchn)
+ xen_evtchn_close(evtchn);
}
xen_free_irq(info);
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x fa765c4b4aed2d64266b694520ecb025c862c5a9
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021914-wackiness-diagnoses-52e2@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
fa765c4b4aed ("xen/events: close evtchn after mapping cleanup")
3fcdaf3d7634 ("xen/events: modify internal [un]bind interfaces")
5dd9ad32d775 ("xen/events: drop xen_allocate_irqs_dynamic()")
3bdb0ac350fe ("xen/events: remove some simple helpers from events_base.c")
686464514fbe ("xen/events: reduce externally visible helper functions")
e64e7c74b99e ("xen/events: avoid using info_for_irq() in xen_send_IPI_one()")
9e90e58c11b7 ("xen: evtchn: Allow shared registration of IRQ handers")
87797fad6cce ("xen/events: replace evtchn_rwlock with RCU")
58f6259b7a08 ("xen/evtchn: Introduce new IOCTL to bind static evtchn")
04d684875b30 ("xen: xen_debug_interrupt prototype to global header")
073352e951f6 ("genirq: Add and use an irq_data_update_affinity helper")
961343d78226 ("genirq: Refactor accessors to use irq_data_get_affinity_mask")
83f877a09516 ("xen/events: remove redundant initialization of variable irq")
3de218ff39b9 ("xen/events: reset active flag for lateeoi events later")
d120198bd5ff ("xen/evtchn: Change irq_info lock to raw_spinlock_t")
b6622798bc50 ("xen/events: avoid handling the same event on two cpus at the same time")
25da4618af24 ("xen/events: don't unmask an event channel when an eoi is pending")
06f45fe96fcd ("xen/events: add per-xenbus device event statistics and settings")
f2fa0e5e9f31 ("xen/events: link interdomain events to associated xenbus device")
88f0a9d06644 ("xen/events: Implement irq distribution")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From fa765c4b4aed2d64266b694520ecb025c862c5a9 Mon Sep 17 00:00:00 2001
From: Maximilian Heyne <mheyne(a)amazon.de>
Date: Wed, 24 Jan 2024 16:31:28 +0000
Subject: [PATCH] xen/events: close evtchn after mapping cleanup
shutdown_pirq and startup_pirq are not taking the
irq_mapping_update_lock because they can't due to lock inversion. Both
are called with the irq_desc->lock being taken. The lock order,
however, is first irq_mapping_update_lock and then irq_desc->lock.
This opens multiple races:
- shutdown_pirq can be interrupted by a function that allocates an event
channel:
CPU0 CPU1
shutdown_pirq {
xen_evtchn_close(e)
__startup_pirq {
EVTCHNOP_bind_pirq
-> returns just freed evtchn e
set_evtchn_to_irq(e, irq)
}
xen_irq_info_cleanup() {
set_evtchn_to_irq(e, -1)
}
}
Assume here that event channel e refers to the same event channel
number.
After this race the evtchn_to_irq mapping for e is invalid (-1).
- __startup_pirq races with __unbind_from_irq in a similar way. Because
__startup_pirq doesn't take irq_mapping_update_lock it can grab the
evtchn that __unbind_from_irq is currently freeing and cleaning up. In
this case even though the event channel is allocated, its mapping can
be unset in evtchn_to_irq.
The fix is to first cleanup the mappings and then close the event
channel. In this way, when an event channel gets allocated its
potential previous evtchn_to_irq mappings are guaranteed to be unset already.
This is also the reverse order of the allocation where first the event
channel is allocated and then the mappings are setup.
On a 5.10 kernel prior to commit 3fcdaf3d7634 ("xen/events: modify internal
[un]bind interfaces"), we hit a BUG like the following during probing of NVMe
devices. The issue is that during nvme_setup_io_queues, pci_free_irq
is called for every device which results in a call to shutdown_pirq.
With many nvme devices it's therefore likely to hit this race during
boot because multiple calls to shutdown_pirq and
startup_pirq are potentially running in parallel.
------------[ cut here ]------------
blkfront: xvda: barrier or flush: disabled; persistent grants: enabled; indirect descriptors: enabled; bounce buffer: enabled
kernel BUG at drivers/xen/events/events_base.c:499!
invalid opcode: 0000 [#1] SMP PTI
CPU: 44 PID: 375 Comm: kworker/u257:23 Not tainted 5.10.201-191.748.amzn2.x86_64 #1
Hardware name: Xen HVM domU, BIOS 4.11.amazon 08/24/2006
Workqueue: nvme-reset-wq nvme_reset_work
RIP: 0010:bind_evtchn_to_cpu+0xdf/0xf0
Code: 5d 41 5e c3 cc cc cc cc 44 89 f7 e8 2b 55 ad ff 49 89 c5 48 85 c0 0f 84 64 ff ff ff 4c 8b 68 30 41 83 fe ff 0f 85 60 ff ff ff <0f> 0b 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 0f 1f 44 00 00
RSP: 0000:ffffc9000d533b08 EFLAGS: 00010046
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000006
RDX: 0000000000000028 RSI: 00000000ffffffff RDI: 00000000ffffffff
RBP: ffff888107419680 R08: 0000000000000000 R09: ffffffff82d72b00
R10: 0000000000000000 R11: 0000000000000000 R12: 00000000000001ed
R13: 0000000000000000 R14: 00000000ffffffff R15: 0000000000000002
FS: 0000000000000000(0000) GS:ffff88bc8b500000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000000 CR3: 0000000002610001 CR4: 00000000001706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
? show_trace_log_lvl+0x1c1/0x2d9
? show_trace_log_lvl+0x1c1/0x2d9
? set_affinity_irq+0xdc/0x1c0
? __die_body.cold+0x8/0xd
? die+0x2b/0x50
? do_trap+0x90/0x110
? bind_evtchn_to_cpu+0xdf/0xf0
? do_error_trap+0x65/0x80
? bind_evtchn_to_cpu+0xdf/0xf0
? exc_invalid_op+0x4e/0x70
? bind_evtchn_to_cpu+0xdf/0xf0
? asm_exc_invalid_op+0x12/0x20
? bind_evtchn_to_cpu+0xdf/0xf0
? bind_evtchn_to_cpu+0xc5/0xf0
set_affinity_irq+0xdc/0x1c0
irq_do_set_affinity+0x1d7/0x1f0
irq_setup_affinity+0xd6/0x1a0
irq_startup+0x8a/0xf0
__setup_irq+0x639/0x6d0
? nvme_suspend+0x150/0x150
request_threaded_irq+0x10c/0x180
? nvme_suspend+0x150/0x150
pci_request_irq+0xa8/0xf0
? __blk_mq_free_request+0x74/0xa0
queue_request_irq+0x6f/0x80
nvme_create_queue+0x1af/0x200
nvme_create_io_queues+0xbd/0xf0
nvme_setup_io_queues+0x246/0x320
? nvme_irq_check+0x30/0x30
nvme_reset_work+0x1c8/0x400
process_one_work+0x1b0/0x350
worker_thread+0x49/0x310
? process_one_work+0x350/0x350
kthread+0x11b/0x140
? __kthread_bind_mask+0x60/0x60
ret_from_fork+0x22/0x30
Modules linked in:
---[ end trace a11715de1eee1873 ]---
Fixes: d46a78b05c0e ("xen: implement pirq type event channels")
Cc: stable(a)vger.kernel.org
Co-debugged-by: Andrew Panyakin <apanyaki(a)amazon.com>
Signed-off-by: Maximilian Heyne <mheyne(a)amazon.de>
Reviewed-by: Juergen Gross <jgross(a)suse.com>
Link: https://lore.kernel.org/r/20240124163130.31324-1-mheyne@amazon.de
Signed-off-by: Juergen Gross <jgross(a)suse.com>
diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c
index b8cfea7812d6..3b9f080109d7 100644
--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -923,8 +923,8 @@ static void shutdown_pirq(struct irq_data *data)
return;
do_mask(info, EVT_MASK_REASON_EXPLICIT);
- xen_evtchn_close(evtchn);
xen_irq_info_cleanup(info);
+ xen_evtchn_close(evtchn);
}
static void enable_pirq(struct irq_data *data)
@@ -956,6 +956,7 @@ EXPORT_SYMBOL_GPL(xen_irq_from_gsi);
static void __unbind_from_irq(struct irq_info *info, unsigned int irq)
{
evtchn_port_t evtchn;
+ bool close_evtchn = false;
if (!info) {
xen_irq_free_desc(irq);
@@ -975,7 +976,7 @@ static void __unbind_from_irq(struct irq_info *info, unsigned int irq)
struct xenbus_device *dev;
if (!info->is_static)
- xen_evtchn_close(evtchn);
+ close_evtchn = true;
switch (info->type) {
case IRQT_VIRQ:
@@ -995,6 +996,9 @@ static void __unbind_from_irq(struct irq_info *info, unsigned int irq)
}
xen_irq_info_cleanup(info);
+
+ if (close_evtchn)
+ xen_evtchn_close(evtchn);
}
xen_free_irq(info);
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x fa765c4b4aed2d64266b694520ecb025c862c5a9
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021911-king-sugar-1024@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
fa765c4b4aed ("xen/events: close evtchn after mapping cleanup")
3fcdaf3d7634 ("xen/events: modify internal [un]bind interfaces")
5dd9ad32d775 ("xen/events: drop xen_allocate_irqs_dynamic()")
3bdb0ac350fe ("xen/events: remove some simple helpers from events_base.c")
686464514fbe ("xen/events: reduce externally visible helper functions")
e64e7c74b99e ("xen/events: avoid using info_for_irq() in xen_send_IPI_one()")
9e90e58c11b7 ("xen: evtchn: Allow shared registration of IRQ handers")
87797fad6cce ("xen/events: replace evtchn_rwlock with RCU")
58f6259b7a08 ("xen/evtchn: Introduce new IOCTL to bind static evtchn")
04d684875b30 ("xen: xen_debug_interrupt prototype to global header")
073352e951f6 ("genirq: Add and use an irq_data_update_affinity helper")
961343d78226 ("genirq: Refactor accessors to use irq_data_get_affinity_mask")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From fa765c4b4aed2d64266b694520ecb025c862c5a9 Mon Sep 17 00:00:00 2001
From: Maximilian Heyne <mheyne(a)amazon.de>
Date: Wed, 24 Jan 2024 16:31:28 +0000
Subject: [PATCH] xen/events: close evtchn after mapping cleanup
shutdown_pirq and startup_pirq are not taking the
irq_mapping_update_lock because they can't due to lock inversion. Both
are called with the irq_desc->lock being taken. The lock order,
however, is first irq_mapping_update_lock and then irq_desc->lock.
This opens multiple races:
- shutdown_pirq can be interrupted by a function that allocates an event
channel:
CPU0 CPU1
shutdown_pirq {
xen_evtchn_close(e)
__startup_pirq {
EVTCHNOP_bind_pirq
-> returns just freed evtchn e
set_evtchn_to_irq(e, irq)
}
xen_irq_info_cleanup() {
set_evtchn_to_irq(e, -1)
}
}
Assume here that event channel e refers to the same event channel
number.
After this race the evtchn_to_irq mapping for e is invalid (-1).
- __startup_pirq races with __unbind_from_irq in a similar way. Because
__startup_pirq doesn't take irq_mapping_update_lock it can grab the
evtchn that __unbind_from_irq is currently freeing and cleaning up. In
this case even though the event channel is allocated, its mapping can
be unset in evtchn_to_irq.
The fix is to first cleanup the mappings and then close the event
channel. In this way, when an event channel gets allocated its
potential previous evtchn_to_irq mappings are guaranteed to be unset already.
This is also the reverse order of the allocation where first the event
channel is allocated and then the mappings are setup.
On a 5.10 kernel prior to commit 3fcdaf3d7634 ("xen/events: modify internal
[un]bind interfaces"), we hit a BUG like the following during probing of NVMe
devices. The issue is that during nvme_setup_io_queues, pci_free_irq
is called for every device which results in a call to shutdown_pirq.
With many nvme devices it's therefore likely to hit this race during
boot because there will be multiple calls to shutdown_pirq and
startup_pirq potentially running in parallel.
------------[ cut here ]------------
blkfront: xvda: barrier or flush: disabled; persistent grants: enabled; indirect descriptors: enabled; bounce buffer: enabled
kernel BUG at drivers/xen/events/events_base.c:499!
invalid opcode: 0000 [#1] SMP PTI
CPU: 44 PID: 375 Comm: kworker/u257:23 Not tainted 5.10.201-191.748.amzn2.x86_64 #1
Hardware name: Xen HVM domU, BIOS 4.11.amazon 08/24/2006
Workqueue: nvme-reset-wq nvme_reset_work
RIP: 0010:bind_evtchn_to_cpu+0xdf/0xf0
Code: 5d 41 5e c3 cc cc cc cc 44 89 f7 e8 2b 55 ad ff 49 89 c5 48 85 c0 0f 84 64 ff ff ff 4c 8b 68 30 41 83 fe ff 0f 85 60 ff ff ff <0f> 0b 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 0f 1f 44 00 00
RSP: 0000:ffffc9000d533b08 EFLAGS: 00010046
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000006
RDX: 0000000000000028 RSI: 00000000ffffffff RDI: 00000000ffffffff
RBP: ffff888107419680 R08: 0000000000000000 R09: ffffffff82d72b00
R10: 0000000000000000 R11: 0000000000000000 R12: 00000000000001ed
R13: 0000000000000000 R14: 00000000ffffffff R15: 0000000000000002
FS: 0000000000000000(0000) GS:ffff88bc8b500000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000000 CR3: 0000000002610001 CR4: 00000000001706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
? show_trace_log_lvl+0x1c1/0x2d9
? show_trace_log_lvl+0x1c1/0x2d9
? set_affinity_irq+0xdc/0x1c0
? __die_body.cold+0x8/0xd
? die+0x2b/0x50
? do_trap+0x90/0x110
? bind_evtchn_to_cpu+0xdf/0xf0
? do_error_trap+0x65/0x80
? bind_evtchn_to_cpu+0xdf/0xf0
? exc_invalid_op+0x4e/0x70
? bind_evtchn_to_cpu+0xdf/0xf0
? asm_exc_invalid_op+0x12/0x20
? bind_evtchn_to_cpu+0xdf/0xf0
? bind_evtchn_to_cpu+0xc5/0xf0
set_affinity_irq+0xdc/0x1c0
irq_do_set_affinity+0x1d7/0x1f0
irq_setup_affinity+0xd6/0x1a0
irq_startup+0x8a/0xf0
__setup_irq+0x639/0x6d0
? nvme_suspend+0x150/0x150
request_threaded_irq+0x10c/0x180
? nvme_suspend+0x150/0x150
pci_request_irq+0xa8/0xf0
? __blk_mq_free_request+0x74/0xa0
queue_request_irq+0x6f/0x80
nvme_create_queue+0x1af/0x200
nvme_create_io_queues+0xbd/0xf0
nvme_setup_io_queues+0x246/0x320
? nvme_irq_check+0x30/0x30
nvme_reset_work+0x1c8/0x400
process_one_work+0x1b0/0x350
worker_thread+0x49/0x310
? process_one_work+0x350/0x350
kthread+0x11b/0x140
? __kthread_bind_mask+0x60/0x60
ret_from_fork+0x22/0x30
Modules linked in:
---[ end trace a11715de1eee1873 ]---
Fixes: d46a78b05c0e ("xen: implement pirq type event channels")
Cc: stable(a)vger.kernel.org
Co-debugged-by: Andrew Panyakin <apanyaki(a)amazon.com>
Signed-off-by: Maximilian Heyne <mheyne(a)amazon.de>
Reviewed-by: Juergen Gross <jgross(a)suse.com>
Link: https://lore.kernel.org/r/20240124163130.31324-1-mheyne@amazon.de
Signed-off-by: Juergen Gross <jgross(a)suse.com>
diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c
index b8cfea7812d6..3b9f080109d7 100644
--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -923,8 +923,8 @@ static void shutdown_pirq(struct irq_data *data)
return;
do_mask(info, EVT_MASK_REASON_EXPLICIT);
- xen_evtchn_close(evtchn);
xen_irq_info_cleanup(info);
+ xen_evtchn_close(evtchn);
}
static void enable_pirq(struct irq_data *data)
@@ -956,6 +956,7 @@ EXPORT_SYMBOL_GPL(xen_irq_from_gsi);
static void __unbind_from_irq(struct irq_info *info, unsigned int irq)
{
evtchn_port_t evtchn;
+ bool close_evtchn = false;
if (!info) {
xen_irq_free_desc(irq);
@@ -975,7 +976,7 @@ static void __unbind_from_irq(struct irq_info *info, unsigned int irq)
struct xenbus_device *dev;
if (!info->is_static)
- xen_evtchn_close(evtchn);
+ close_evtchn = true;
switch (info->type) {
case IRQT_VIRQ:
@@ -995,6 +996,9 @@ static void __unbind_from_irq(struct irq_info *info, unsigned int irq)
}
xen_irq_info_cleanup(info);
+
+ if (close_evtchn)
+ xen_evtchn_close(evtchn);
}
xen_free_irq(info);
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x fa765c4b4aed2d64266b694520ecb025c862c5a9
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021908-polymer-backyard-ce35@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
fa765c4b4aed ("xen/events: close evtchn after mapping cleanup")
3fcdaf3d7634 ("xen/events: modify internal [un]bind interfaces")
5dd9ad32d775 ("xen/events: drop xen_allocate_irqs_dynamic()")
3bdb0ac350fe ("xen/events: remove some simple helpers from events_base.c")
686464514fbe ("xen/events: reduce externally visible helper functions")
e64e7c74b99e ("xen/events: avoid using info_for_irq() in xen_send_IPI_one()")
9e90e58c11b7 ("xen: evtchn: Allow shared registration of IRQ handers")
87797fad6cce ("xen/events: replace evtchn_rwlock with RCU")
58f6259b7a08 ("xen/evtchn: Introduce new IOCTL to bind static evtchn")
04d684875b30 ("xen: xen_debug_interrupt prototype to global header")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From fa765c4b4aed2d64266b694520ecb025c862c5a9 Mon Sep 17 00:00:00 2001
From: Maximilian Heyne <mheyne(a)amazon.de>
Date: Wed, 24 Jan 2024 16:31:28 +0000
Subject: [PATCH] xen/events: close evtchn after mapping cleanup
shutdown_pirq and startup_pirq are not taking the
irq_mapping_update_lock because they can't due to lock inversion. Both
are called with the irq_desc->lock being taken. The lock order,
however, is first irq_mapping_update_lock and then irq_desc->lock.
This opens multiple races:
- shutdown_pirq can be interrupted by a function that allocates an event
channel:
CPU0 CPU1
shutdown_pirq {
xen_evtchn_close(e)
__startup_pirq {
EVTCHNOP_bind_pirq
-> returns just freed evtchn e
set_evtchn_to_irq(e, irq)
}
xen_irq_info_cleanup() {
set_evtchn_to_irq(e, -1)
}
}
Assume here that event channel e refers to the same event channel
number.
After this race the evtchn_to_irq mapping for e is invalid (-1).
- __startup_pirq races with __unbind_from_irq in a similar way. Because
__startup_pirq doesn't take irq_mapping_update_lock it can grab the
evtchn that __unbind_from_irq is currently freeing and cleaning up. In
this case even though the event channel is allocated, its mapping can
be unset in evtchn_to_irq.
The fix is to first cleanup the mappings and then close the event
channel. In this way, when an event channel gets allocated its
potential previous evtchn_to_irq mappings are guaranteed to be unset already.
This is also the reverse order of the allocation where first the event
channel is allocated and then the mappings are setup.
On a 5.10 kernel prior to commit 3fcdaf3d7634 ("xen/events: modify internal
[un]bind interfaces"), we hit a BUG like the following during probing of NVMe
devices. The issue is that during nvme_setup_io_queues, pci_free_irq
is called for every device which results in a call to shutdown_pirq.
With many nvme devices it's therefore likely to hit this race during
boot because there will be multiple calls to shutdown_pirq and
startup_pirq potentially running in parallel.
------------[ cut here ]------------
blkfront: xvda: barrier or flush: disabled; persistent grants: enabled; indirect descriptors: enabled; bounce buffer: enabled
kernel BUG at drivers/xen/events/events_base.c:499!
invalid opcode: 0000 [#1] SMP PTI
CPU: 44 PID: 375 Comm: kworker/u257:23 Not tainted 5.10.201-191.748.amzn2.x86_64 #1
Hardware name: Xen HVM domU, BIOS 4.11.amazon 08/24/2006
Workqueue: nvme-reset-wq nvme_reset_work
RIP: 0010:bind_evtchn_to_cpu+0xdf/0xf0
Code: 5d 41 5e c3 cc cc cc cc 44 89 f7 e8 2b 55 ad ff 49 89 c5 48 85 c0 0f 84 64 ff ff ff 4c 8b 68 30 41 83 fe ff 0f 85 60 ff ff ff <0f> 0b 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 0f 1f 44 00 00
RSP: 0000:ffffc9000d533b08 EFLAGS: 00010046
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000006
RDX: 0000000000000028 RSI: 00000000ffffffff RDI: 00000000ffffffff
RBP: ffff888107419680 R08: 0000000000000000 R09: ffffffff82d72b00
R10: 0000000000000000 R11: 0000000000000000 R12: 00000000000001ed
R13: 0000000000000000 R14: 00000000ffffffff R15: 0000000000000002
FS: 0000000000000000(0000) GS:ffff88bc8b500000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000000 CR3: 0000000002610001 CR4: 00000000001706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
? show_trace_log_lvl+0x1c1/0x2d9
? show_trace_log_lvl+0x1c1/0x2d9
? set_affinity_irq+0xdc/0x1c0
? __die_body.cold+0x8/0xd
? die+0x2b/0x50
? do_trap+0x90/0x110
? bind_evtchn_to_cpu+0xdf/0xf0
? do_error_trap+0x65/0x80
? bind_evtchn_to_cpu+0xdf/0xf0
? exc_invalid_op+0x4e/0x70
? bind_evtchn_to_cpu+0xdf/0xf0
? asm_exc_invalid_op+0x12/0x20
? bind_evtchn_to_cpu+0xdf/0xf0
? bind_evtchn_to_cpu+0xc5/0xf0
set_affinity_irq+0xdc/0x1c0
irq_do_set_affinity+0x1d7/0x1f0
irq_setup_affinity+0xd6/0x1a0
irq_startup+0x8a/0xf0
__setup_irq+0x639/0x6d0
? nvme_suspend+0x150/0x150
request_threaded_irq+0x10c/0x180
? nvme_suspend+0x150/0x150
pci_request_irq+0xa8/0xf0
? __blk_mq_free_request+0x74/0xa0
queue_request_irq+0x6f/0x80
nvme_create_queue+0x1af/0x200
nvme_create_io_queues+0xbd/0xf0
nvme_setup_io_queues+0x246/0x320
? nvme_irq_check+0x30/0x30
nvme_reset_work+0x1c8/0x400
process_one_work+0x1b0/0x350
worker_thread+0x49/0x310
? process_one_work+0x350/0x350
kthread+0x11b/0x140
? __kthread_bind_mask+0x60/0x60
ret_from_fork+0x22/0x30
Modules linked in:
---[ end trace a11715de1eee1873 ]---
Fixes: d46a78b05c0e ("xen: implement pirq type event channels")
Cc: stable(a)vger.kernel.org
Co-debugged-by: Andrew Panyakin <apanyaki(a)amazon.com>
Signed-off-by: Maximilian Heyne <mheyne(a)amazon.de>
Reviewed-by: Juergen Gross <jgross(a)suse.com>
Link: https://lore.kernel.org/r/20240124163130.31324-1-mheyne@amazon.de
Signed-off-by: Juergen Gross <jgross(a)suse.com>
diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c
index b8cfea7812d6..3b9f080109d7 100644
--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -923,8 +923,8 @@ static void shutdown_pirq(struct irq_data *data)
return;
do_mask(info, EVT_MASK_REASON_EXPLICIT);
- xen_evtchn_close(evtchn);
xen_irq_info_cleanup(info);
+ xen_evtchn_close(evtchn);
}
static void enable_pirq(struct irq_data *data)
@@ -956,6 +956,7 @@ EXPORT_SYMBOL_GPL(xen_irq_from_gsi);
static void __unbind_from_irq(struct irq_info *info, unsigned int irq)
{
evtchn_port_t evtchn;
+ bool close_evtchn = false;
if (!info) {
xen_irq_free_desc(irq);
@@ -975,7 +976,7 @@ static void __unbind_from_irq(struct irq_info *info, unsigned int irq)
struct xenbus_device *dev;
if (!info->is_static)
- xen_evtchn_close(evtchn);
+ close_evtchn = true;
switch (info->type) {
case IRQT_VIRQ:
@@ -995,6 +996,9 @@ static void __unbind_from_irq(struct irq_info *info, unsigned int irq)
}
xen_irq_info_cleanup(info);
+
+ if (close_evtchn)
+ xen_evtchn_close(evtchn);
}
xen_free_irq(info);
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x fa765c4b4aed2d64266b694520ecb025c862c5a9
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021905-anger-exclusion-a2ae@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
fa765c4b4aed ("xen/events: close evtchn after mapping cleanup")
3fcdaf3d7634 ("xen/events: modify internal [un]bind interfaces")
5dd9ad32d775 ("xen/events: drop xen_allocate_irqs_dynamic()")
3bdb0ac350fe ("xen/events: remove some simple helpers from events_base.c")
686464514fbe ("xen/events: reduce externally visible helper functions")
e64e7c74b99e ("xen/events: avoid using info_for_irq() in xen_send_IPI_one()")
9e90e58c11b7 ("xen: evtchn: Allow shared registration of IRQ handers")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From fa765c4b4aed2d64266b694520ecb025c862c5a9 Mon Sep 17 00:00:00 2001
From: Maximilian Heyne <mheyne(a)amazon.de>
Date: Wed, 24 Jan 2024 16:31:28 +0000
Subject: [PATCH] xen/events: close evtchn after mapping cleanup
shutdown_pirq and startup_pirq are not taking the
irq_mapping_update_lock because they can't due to lock inversion. Both
are called with the irq_desc->lock being taken. The lock order,
however, is first irq_mapping_update_lock and then irq_desc->lock.
This opens multiple races:
- shutdown_pirq can be interrupted by a function that allocates an event
channel:
CPU0 CPU1
shutdown_pirq {
xen_evtchn_close(e)
__startup_pirq {
EVTCHNOP_bind_pirq
-> returns just freed evtchn e
set_evtchn_to_irq(e, irq)
}
xen_irq_info_cleanup() {
set_evtchn_to_irq(e, -1)
}
}
Assume here that event channel e refers to the same event channel
number.
After this race the evtchn_to_irq mapping for e is invalid (-1).
- __startup_pirq races with __unbind_from_irq in a similar way. Because
__startup_pirq doesn't take irq_mapping_update_lock it can grab the
evtchn that __unbind_from_irq is currently freeing and cleaning up. In
this case even though the event channel is allocated, its mapping can
be unset in evtchn_to_irq.
The fix is to first cleanup the mappings and then close the event
channel. In this way, when an event channel gets allocated its
potential previous evtchn_to_irq mappings are guaranteed to be unset already.
This is also the reverse order of the allocation where first the event
channel is allocated and then the mappings are setup.
On a 5.10 kernel prior to commit 3fcdaf3d7634 ("xen/events: modify internal
[un]bind interfaces"), we hit a BUG like the following during probing of NVMe
devices. The issue is that during nvme_setup_io_queues, pci_free_irq
is called for every device which results in a call to shutdown_pirq.
With many nvme devices it's therefore likely to hit this race during
boot because there will be multiple calls to shutdown_pirq and
startup_pirq potentially running in parallel.
------------[ cut here ]------------
blkfront: xvda: barrier or flush: disabled; persistent grants: enabled; indirect descriptors: enabled; bounce buffer: enabled
kernel BUG at drivers/xen/events/events_base.c:499!
invalid opcode: 0000 [#1] SMP PTI
CPU: 44 PID: 375 Comm: kworker/u257:23 Not tainted 5.10.201-191.748.amzn2.x86_64 #1
Hardware name: Xen HVM domU, BIOS 4.11.amazon 08/24/2006
Workqueue: nvme-reset-wq nvme_reset_work
RIP: 0010:bind_evtchn_to_cpu+0xdf/0xf0
Code: 5d 41 5e c3 cc cc cc cc 44 89 f7 e8 2b 55 ad ff 49 89 c5 48 85 c0 0f 84 64 ff ff ff 4c 8b 68 30 41 83 fe ff 0f 85 60 ff ff ff <0f> 0b 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 0f 1f 44 00 00
RSP: 0000:ffffc9000d533b08 EFLAGS: 00010046
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000006
RDX: 0000000000000028 RSI: 00000000ffffffff RDI: 00000000ffffffff
RBP: ffff888107419680 R08: 0000000000000000 R09: ffffffff82d72b00
R10: 0000000000000000 R11: 0000000000000000 R12: 00000000000001ed
R13: 0000000000000000 R14: 00000000ffffffff R15: 0000000000000002
FS: 0000000000000000(0000) GS:ffff88bc8b500000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000000 CR3: 0000000002610001 CR4: 00000000001706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
? show_trace_log_lvl+0x1c1/0x2d9
? show_trace_log_lvl+0x1c1/0x2d9
? set_affinity_irq+0xdc/0x1c0
? __die_body.cold+0x8/0xd
? die+0x2b/0x50
? do_trap+0x90/0x110
? bind_evtchn_to_cpu+0xdf/0xf0
? do_error_trap+0x65/0x80
? bind_evtchn_to_cpu+0xdf/0xf0
? exc_invalid_op+0x4e/0x70
? bind_evtchn_to_cpu+0xdf/0xf0
? asm_exc_invalid_op+0x12/0x20
? bind_evtchn_to_cpu+0xdf/0xf0
? bind_evtchn_to_cpu+0xc5/0xf0
set_affinity_irq+0xdc/0x1c0
irq_do_set_affinity+0x1d7/0x1f0
irq_setup_affinity+0xd6/0x1a0
irq_startup+0x8a/0xf0
__setup_irq+0x639/0x6d0
? nvme_suspend+0x150/0x150
request_threaded_irq+0x10c/0x180
? nvme_suspend+0x150/0x150
pci_request_irq+0xa8/0xf0
? __blk_mq_free_request+0x74/0xa0
queue_request_irq+0x6f/0x80
nvme_create_queue+0x1af/0x200
nvme_create_io_queues+0xbd/0xf0
nvme_setup_io_queues+0x246/0x320
? nvme_irq_check+0x30/0x30
nvme_reset_work+0x1c8/0x400
process_one_work+0x1b0/0x350
worker_thread+0x49/0x310
? process_one_work+0x350/0x350
kthread+0x11b/0x140
? __kthread_bind_mask+0x60/0x60
ret_from_fork+0x22/0x30
Modules linked in:
---[ end trace a11715de1eee1873 ]---
Fixes: d46a78b05c0e ("xen: implement pirq type event channels")
Cc: stable(a)vger.kernel.org
Co-debugged-by: Andrew Panyakin <apanyaki(a)amazon.com>
Signed-off-by: Maximilian Heyne <mheyne(a)amazon.de>
Reviewed-by: Juergen Gross <jgross(a)suse.com>
Link: https://lore.kernel.org/r/20240124163130.31324-1-mheyne@amazon.de
Signed-off-by: Juergen Gross <jgross(a)suse.com>
diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c
index b8cfea7812d6..3b9f080109d7 100644
--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -923,8 +923,8 @@ static void shutdown_pirq(struct irq_data *data)
return;
do_mask(info, EVT_MASK_REASON_EXPLICIT);
- xen_evtchn_close(evtchn);
xen_irq_info_cleanup(info);
+ xen_evtchn_close(evtchn);
}
static void enable_pirq(struct irq_data *data)
@@ -956,6 +956,7 @@ EXPORT_SYMBOL_GPL(xen_irq_from_gsi);
static void __unbind_from_irq(struct irq_info *info, unsigned int irq)
{
evtchn_port_t evtchn;
+ bool close_evtchn = false;
if (!info) {
xen_irq_free_desc(irq);
@@ -975,7 +976,7 @@ static void __unbind_from_irq(struct irq_info *info, unsigned int irq)
struct xenbus_device *dev;
if (!info->is_static)
- xen_evtchn_close(evtchn);
+ close_evtchn = true;
switch (info->type) {
case IRQT_VIRQ:
@@ -995,6 +996,9 @@ static void __unbind_from_irq(struct irq_info *info, unsigned int irq)
}
xen_irq_info_cleanup(info);
+
+ if (close_evtchn)
+ xen_evtchn_close(evtchn);
}
xen_free_irq(info);
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x f814bdda774c183b0cc15ec8f3b6e7c6f4527ba5
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021925-onscreen-cancel-9be1@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
f814bdda774c ("blk-wbt: Fix detection of dirty-throttled tasks")
ba91c849fa50 ("blk-rq-qos: store a gendisk instead of request_queue in struct rq_qos")
3963d84df797 ("blk-rq-qos: constify rq_qos_ops")
ce57b558604e ("blk-rq-qos: make rq_qos_add and rq_qos_del more useful")
b494f9c566ba ("blk-rq-qos: move rq_qos_add and rq_qos_del out of line")
de185b56e8a6 ("blk-cgroup: pass a gendisk to blkcg_schedule_throttle")
9df3e65139b9 ("blk-iocost: simplify ioc_name")
14a6e2eb7df5 ("block: don't allow the same type rq_qos add more than once")
5cf9c91ba927 ("block: serialize all debugfs operations using q->debugfs_mutex")
8a177a36da6c ("blk-iolatency: Fix inflight count imbalances and IO hangs on offline")
c97ab271576d ("blk-cgroup: remove unneeded includes from <linux/blk-cgroup.h>")
7f20ba7c42fd ("blk-cgroup: remove pointless CONFIG_BLOCK ifdefs")
bbb1ebe7a909 ("blk-cgroup: replace bio_blkcg with bio_blkcg_css")
dec223c92a46 ("blk-cgroup: move struct blkcg to block/blk-cgroup.h")
397c9f46ee4d ("blk-cgroup: move blkcg_{pin,unpin}_online out of line")
216889aad362 ("blk-cgroup: move blk_cgroup_congested out line")
d589ae0d4460 ("Merge tag 'for-5.18/block-2022-04-01' of git://git.kernel.dk/linux-block")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f814bdda774c183b0cc15ec8f3b6e7c6f4527ba5 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack(a)suse.cz>
Date: Tue, 23 Jan 2024 18:58:26 +0100
Subject: [PATCH] blk-wbt: Fix detection of dirty-throttled tasks
The detection of dirty-throttled tasks in blk-wbt has been subtly broken
since its beginning in 2016. Namely if we are doing cgroup writeback and
the throttled task is not in the root cgroup, balance_dirty_pages() will
set dirty_sleep for the non-root bdi_writeback structure. However
blk-wbt checks dirty_sleep only in the root cgroup bdi_writeback
structure. Thus detection of recently throttled tasks is not working in
this case (we noticed this when we switched to cgroup v2 and suddenly
writeback was slow).
Since blk-wbt has no easy way to get to proper bdi_writeback and
furthermore its intention has always been to work on the whole device
rather than on individual cgroups, just move the dirty_sleep timestamp
from bdi_writeback to backing_dev_info. That fixes the checking for
recently throttled task and saves memory for everybody as a bonus.
CC: stable(a)vger.kernel.org
Fixes: b57d74aff9ab ("writeback: track if we're sleeping on progress in balance_dirty_pages()")
Signed-off-by: Jan Kara <jack(a)suse.cz>
Link: https://lore.kernel.org/r/20240123175826.21452-1-jack@suse.cz
[axboe: fixup indentation errors]
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 5ba3cd574eac..0c0e270a8265 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -163,9 +163,9 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
*/
static bool wb_recent_wait(struct rq_wb *rwb)
{
- struct bdi_writeback *wb = &rwb->rqos.disk->bdi->wb;
+ struct backing_dev_info *bdi = rwb->rqos.disk->bdi;
- return time_before(jiffies, wb->dirty_sleep + HZ);
+ return time_before(jiffies, bdi->last_bdp_sleep + HZ);
}
static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb,
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index ae12696ec492..2ad261082bba 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -141,8 +141,6 @@ struct bdi_writeback {
struct delayed_work dwork; /* work item used for writeback */
struct delayed_work bw_dwork; /* work item used for bandwidth estimate */
- unsigned long dirty_sleep; /* last wait */
-
struct list_head bdi_node; /* anchored at bdi->wb_list */
#ifdef CONFIG_CGROUP_WRITEBACK
@@ -179,6 +177,11 @@ struct backing_dev_info {
* any dirty wbs, which is depended upon by bdi_has_dirty().
*/
atomic_long_t tot_write_bandwidth;
+ /*
+ * Jiffies when last process was dirty throttled on this bdi. Used by
+ * blk-wbt.
+ */
+ unsigned long last_bdp_sleep;
struct bdi_writeback wb; /* the root writeback info for this bdi */
struct list_head wb_list; /* list of all wbs */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 1e3447bccdb1..e039d05304dd 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -436,7 +436,6 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
INIT_LIST_HEAD(&wb->work_list);
INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn);
- wb->dirty_sleep = jiffies;
err = fprop_local_init_percpu(&wb->completions, gfp);
if (err)
@@ -921,6 +920,7 @@ int bdi_init(struct backing_dev_info *bdi)
INIT_LIST_HEAD(&bdi->bdi_list);
INIT_LIST_HEAD(&bdi->wb_list);
init_waitqueue_head(&bdi->wb_waitq);
+ bdi->last_bdp_sleep = jiffies;
return cgwb_bdi_init(bdi);
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index cd4e4ae77c40..cc37fa7f3364 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1921,7 +1921,7 @@ static int balance_dirty_pages(struct bdi_writeback *wb,
break;
}
__set_current_state(TASK_KILLABLE);
- wb->dirty_sleep = now;
+ bdi->last_bdp_sleep = jiffies;
io_schedule_timeout(pause);
current->dirty_paused_when = now + pause;
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x f814bdda774c183b0cc15ec8f3b6e7c6f4527ba5
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021924-handcart-displease-1607@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
f814bdda774c ("blk-wbt: Fix detection of dirty-throttled tasks")
ba91c849fa50 ("blk-rq-qos: store a gendisk instead of request_queue in struct rq_qos")
3963d84df797 ("blk-rq-qos: constify rq_qos_ops")
ce57b558604e ("blk-rq-qos: make rq_qos_add and rq_qos_del more useful")
b494f9c566ba ("blk-rq-qos: move rq_qos_add and rq_qos_del out of line")
de185b56e8a6 ("blk-cgroup: pass a gendisk to blkcg_schedule_throttle")
9df3e65139b9 ("blk-iocost: simplify ioc_name")
14a6e2eb7df5 ("block: don't allow the same type rq_qos add more than once")
5cf9c91ba927 ("block: serialize all debugfs operations using q->debugfs_mutex")
8a177a36da6c ("blk-iolatency: Fix inflight count imbalances and IO hangs on offline")
c97ab271576d ("blk-cgroup: remove unneeded includes from <linux/blk-cgroup.h>")
7f20ba7c42fd ("blk-cgroup: remove pointless CONFIG_BLOCK ifdefs")
bbb1ebe7a909 ("blk-cgroup: replace bio_blkcg with bio_blkcg_css")
dec223c92a46 ("blk-cgroup: move struct blkcg to block/blk-cgroup.h")
397c9f46ee4d ("blk-cgroup: move blkcg_{pin,unpin}_online out of line")
216889aad362 ("blk-cgroup: move blk_cgroup_congested out line")
d589ae0d4460 ("Merge tag 'for-5.18/block-2022-04-01' of git://git.kernel.dk/linux-block")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f814bdda774c183b0cc15ec8f3b6e7c6f4527ba5 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack(a)suse.cz>
Date: Tue, 23 Jan 2024 18:58:26 +0100
Subject: [PATCH] blk-wbt: Fix detection of dirty-throttled tasks
The detection of dirty-throttled tasks in blk-wbt has been subtly broken
since its beginning in 2016. Namely if we are doing cgroup writeback and
the throttled task is not in the root cgroup, balance_dirty_pages() will
set dirty_sleep for the non-root bdi_writeback structure. However
blk-wbt checks dirty_sleep only in the root cgroup bdi_writeback
structure. Thus detection of recently throttled tasks is not working in
this case (we noticed this when we switched to cgroup v2 and suddenly
writeback was slow).
Since blk-wbt has no easy way to get to proper bdi_writeback and
furthermore its intention has always been to work on the whole device
rather than on individual cgroups, just move the dirty_sleep timestamp
from bdi_writeback to backing_dev_info. That fixes the checking for
recently throttled task and saves memory for everybody as a bonus.
CC: stable(a)vger.kernel.org
Fixes: b57d74aff9ab ("writeback: track if we're sleeping on progress in balance_dirty_pages()")
Signed-off-by: Jan Kara <jack(a)suse.cz>
Link: https://lore.kernel.org/r/20240123175826.21452-1-jack@suse.cz
[axboe: fixup indentation errors]
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 5ba3cd574eac..0c0e270a8265 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -163,9 +163,9 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
*/
static bool wb_recent_wait(struct rq_wb *rwb)
{
- struct bdi_writeback *wb = &rwb->rqos.disk->bdi->wb;
+ struct backing_dev_info *bdi = rwb->rqos.disk->bdi;
- return time_before(jiffies, wb->dirty_sleep + HZ);
+ return time_before(jiffies, bdi->last_bdp_sleep + HZ);
}
static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb,
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index ae12696ec492..2ad261082bba 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -141,8 +141,6 @@ struct bdi_writeback {
struct delayed_work dwork; /* work item used for writeback */
struct delayed_work bw_dwork; /* work item used for bandwidth estimate */
- unsigned long dirty_sleep; /* last wait */
-
struct list_head bdi_node; /* anchored at bdi->wb_list */
#ifdef CONFIG_CGROUP_WRITEBACK
@@ -179,6 +177,11 @@ struct backing_dev_info {
* any dirty wbs, which is depended upon by bdi_has_dirty().
*/
atomic_long_t tot_write_bandwidth;
+ /*
+ * Jiffies when last process was dirty throttled on this bdi. Used by
+ * blk-wbt.
+ */
+ unsigned long last_bdp_sleep;
struct bdi_writeback wb; /* the root writeback info for this bdi */
struct list_head wb_list; /* list of all wbs */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 1e3447bccdb1..e039d05304dd 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -436,7 +436,6 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
INIT_LIST_HEAD(&wb->work_list);
INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn);
- wb->dirty_sleep = jiffies;
err = fprop_local_init_percpu(&wb->completions, gfp);
if (err)
@@ -921,6 +920,7 @@ int bdi_init(struct backing_dev_info *bdi)
INIT_LIST_HEAD(&bdi->bdi_list);
INIT_LIST_HEAD(&bdi->wb_list);
init_waitqueue_head(&bdi->wb_waitq);
+ bdi->last_bdp_sleep = jiffies;
return cgwb_bdi_init(bdi);
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index cd4e4ae77c40..cc37fa7f3364 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1921,7 +1921,7 @@ static int balance_dirty_pages(struct bdi_writeback *wb,
break;
}
__set_current_state(TASK_KILLABLE);
- wb->dirty_sleep = now;
+ bdi->last_bdp_sleep = jiffies;
io_schedule_timeout(pause);
current->dirty_paused_when = now + pause;
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x f814bdda774c183b0cc15ec8f3b6e7c6f4527ba5
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021923-onboard-pork-7d99@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
f814bdda774c ("blk-wbt: Fix detection of dirty-throttled tasks")
ba91c849fa50 ("blk-rq-qos: store a gendisk instead of request_queue in struct rq_qos")
3963d84df797 ("blk-rq-qos: constify rq_qos_ops")
ce57b558604e ("blk-rq-qos: make rq_qos_add and rq_qos_del more useful")
b494f9c566ba ("blk-rq-qos: move rq_qos_add and rq_qos_del out of line")
de185b56e8a6 ("blk-cgroup: pass a gendisk to blkcg_schedule_throttle")
9df3e65139b9 ("blk-iocost: simplify ioc_name")
14a6e2eb7df5 ("block: don't allow the same type rq_qos add more than once")
5cf9c91ba927 ("block: serialize all debugfs operations using q->debugfs_mutex")
8a177a36da6c ("blk-iolatency: Fix inflight count imbalances and IO hangs on offline")
c97ab271576d ("blk-cgroup: remove unneeded includes from <linux/blk-cgroup.h>")
7f20ba7c42fd ("blk-cgroup: remove pointless CONFIG_BLOCK ifdefs")
bbb1ebe7a909 ("blk-cgroup: replace bio_blkcg with bio_blkcg_css")
dec223c92a46 ("blk-cgroup: move struct blkcg to block/blk-cgroup.h")
397c9f46ee4d ("blk-cgroup: move blkcg_{pin,unpin}_online out of line")
216889aad362 ("blk-cgroup: move blk_cgroup_congested out line")
d589ae0d4460 ("Merge tag 'for-5.18/block-2022-04-01' of git://git.kernel.dk/linux-block")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f814bdda774c183b0cc15ec8f3b6e7c6f4527ba5 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack(a)suse.cz>
Date: Tue, 23 Jan 2024 18:58:26 +0100
Subject: [PATCH] blk-wbt: Fix detection of dirty-throttled tasks
The detection of dirty-throttled tasks in blk-wbt has been subtly broken
since its beginning in 2016. Namely if we are doing cgroup writeback and
the throttled task is not in the root cgroup, balance_dirty_pages() will
set dirty_sleep for the non-root bdi_writeback structure. However
blk-wbt checks dirty_sleep only in the root cgroup bdi_writeback
structure. Thus detection of recently throttled tasks is not working in
this case (we noticed this when we switched to cgroup v2 and suddenly
writeback was slow).
Since blk-wbt has no easy way to get to proper bdi_writeback and
furthermore its intention has always been to work on the whole device
rather than on individual cgroups, just move the dirty_sleep timestamp
from bdi_writeback to backing_dev_info. That fixes the checking for
recently throttled task and saves memory for everybody as a bonus.
CC: stable(a)vger.kernel.org
Fixes: b57d74aff9ab ("writeback: track if we're sleeping on progress in balance_dirty_pages()")
Signed-off-by: Jan Kara <jack(a)suse.cz>
Link: https://lore.kernel.org/r/20240123175826.21452-1-jack@suse.cz
[axboe: fixup indentation errors]
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 5ba3cd574eac..0c0e270a8265 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -163,9 +163,9 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
*/
static bool wb_recent_wait(struct rq_wb *rwb)
{
- struct bdi_writeback *wb = &rwb->rqos.disk->bdi->wb;
+ struct backing_dev_info *bdi = rwb->rqos.disk->bdi;
- return time_before(jiffies, wb->dirty_sleep + HZ);
+ return time_before(jiffies, bdi->last_bdp_sleep + HZ);
}
static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb,
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index ae12696ec492..2ad261082bba 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -141,8 +141,6 @@ struct bdi_writeback {
struct delayed_work dwork; /* work item used for writeback */
struct delayed_work bw_dwork; /* work item used for bandwidth estimate */
- unsigned long dirty_sleep; /* last wait */
-
struct list_head bdi_node; /* anchored at bdi->wb_list */
#ifdef CONFIG_CGROUP_WRITEBACK
@@ -179,6 +177,11 @@ struct backing_dev_info {
* any dirty wbs, which is depended upon by bdi_has_dirty().
*/
atomic_long_t tot_write_bandwidth;
+ /*
+ * Jiffies when last process was dirty throttled on this bdi. Used by
+ * blk-wbt.
+ */
+ unsigned long last_bdp_sleep;
struct bdi_writeback wb; /* the root writeback info for this bdi */
struct list_head wb_list; /* list of all wbs */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 1e3447bccdb1..e039d05304dd 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -436,7 +436,6 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
INIT_LIST_HEAD(&wb->work_list);
INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn);
- wb->dirty_sleep = jiffies;
err = fprop_local_init_percpu(&wb->completions, gfp);
if (err)
@@ -921,6 +920,7 @@ int bdi_init(struct backing_dev_info *bdi)
INIT_LIST_HEAD(&bdi->bdi_list);
INIT_LIST_HEAD(&bdi->wb_list);
init_waitqueue_head(&bdi->wb_waitq);
+ bdi->last_bdp_sleep = jiffies;
return cgwb_bdi_init(bdi);
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index cd4e4ae77c40..cc37fa7f3364 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1921,7 +1921,7 @@ static int balance_dirty_pages(struct bdi_writeback *wb,
break;
}
__set_current_state(TASK_KILLABLE);
- wb->dirty_sleep = now;
+ bdi->last_bdp_sleep = jiffies;
io_schedule_timeout(pause);
current->dirty_paused_when = now + pause;
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x f814bdda774c183b0cc15ec8f3b6e7c6f4527ba5
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021921-unspoiled-despite-59cc@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
f814bdda774c ("blk-wbt: Fix detection of dirty-throttled tasks")
ba91c849fa50 ("blk-rq-qos: store a gendisk instead of request_queue in struct rq_qos")
3963d84df797 ("blk-rq-qos: constify rq_qos_ops")
ce57b558604e ("blk-rq-qos: make rq_qos_add and rq_qos_del more useful")
b494f9c566ba ("blk-rq-qos: move rq_qos_add and rq_qos_del out of line")
de185b56e8a6 ("blk-cgroup: pass a gendisk to blkcg_schedule_throttle")
9df3e65139b9 ("blk-iocost: simplify ioc_name")
14a6e2eb7df5 ("block: don't allow the same type rq_qos add more than once")
5cf9c91ba927 ("block: serialize all debugfs operations using q->debugfs_mutex")
8a177a36da6c ("blk-iolatency: Fix inflight count imbalances and IO hangs on offline")
c97ab271576d ("blk-cgroup: remove unneeded includes from <linux/blk-cgroup.h>")
7f20ba7c42fd ("blk-cgroup: remove pointless CONFIG_BLOCK ifdefs")
bbb1ebe7a909 ("blk-cgroup: replace bio_blkcg with bio_blkcg_css")
dec223c92a46 ("blk-cgroup: move struct blkcg to block/blk-cgroup.h")
397c9f46ee4d ("blk-cgroup: move blkcg_{pin,unpin}_online out of line")
216889aad362 ("blk-cgroup: move blk_cgroup_congested out line")
d589ae0d4460 ("Merge tag 'for-5.18/block-2022-04-01' of git://git.kernel.dk/linux-block")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f814bdda774c183b0cc15ec8f3b6e7c6f4527ba5 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack(a)suse.cz>
Date: Tue, 23 Jan 2024 18:58:26 +0100
Subject: [PATCH] blk-wbt: Fix detection of dirty-throttled tasks
The detection of dirty-throttled tasks in blk-wbt has been subtly broken
since its beginning in 2016. Namely if we are doing cgroup writeback and
the throttled task is not in the root cgroup, balance_dirty_pages() will
set dirty_sleep for the non-root bdi_writeback structure. However
blk-wbt checks dirty_sleep only in the root cgroup bdi_writeback
structure. Thus detection of recently throttled tasks is not working in
this case (we noticed this when we switched to cgroup v2 and suddenly
writeback was slow).
Since blk-wbt has no easy way to get to proper bdi_writeback and
furthermore its intention has always been to work on the whole device
rather than on individual cgroups, just move the dirty_sleep timestamp
from bdi_writeback to backing_dev_info. That fixes the checking for
recently throttled task and saves memory for everybody as a bonus.
CC: stable(a)vger.kernel.org
Fixes: b57d74aff9ab ("writeback: track if we're sleeping on progress in balance_dirty_pages()")
Signed-off-by: Jan Kara <jack(a)suse.cz>
Link: https://lore.kernel.org/r/20240123175826.21452-1-jack@suse.cz
[axboe: fixup indentation errors]
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 5ba3cd574eac..0c0e270a8265 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -163,9 +163,9 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
*/
static bool wb_recent_wait(struct rq_wb *rwb)
{
- struct bdi_writeback *wb = &rwb->rqos.disk->bdi->wb;
+ struct backing_dev_info *bdi = rwb->rqos.disk->bdi;
- return time_before(jiffies, wb->dirty_sleep + HZ);
+ return time_before(jiffies, bdi->last_bdp_sleep + HZ);
}
static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb,
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index ae12696ec492..2ad261082bba 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -141,8 +141,6 @@ struct bdi_writeback {
struct delayed_work dwork; /* work item used for writeback */
struct delayed_work bw_dwork; /* work item used for bandwidth estimate */
- unsigned long dirty_sleep; /* last wait */
-
struct list_head bdi_node; /* anchored at bdi->wb_list */
#ifdef CONFIG_CGROUP_WRITEBACK
@@ -179,6 +177,11 @@ struct backing_dev_info {
* any dirty wbs, which is depended upon by bdi_has_dirty().
*/
atomic_long_t tot_write_bandwidth;
+ /*
+ * Jiffies when last process was dirty throttled on this bdi. Used by
+ * blk-wbt.
+ */
+ unsigned long last_bdp_sleep;
struct bdi_writeback wb; /* the root writeback info for this bdi */
struct list_head wb_list; /* list of all wbs */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 1e3447bccdb1..e039d05304dd 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -436,7 +436,6 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
INIT_LIST_HEAD(&wb->work_list);
INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn);
- wb->dirty_sleep = jiffies;
err = fprop_local_init_percpu(&wb->completions, gfp);
if (err)
@@ -921,6 +920,7 @@ int bdi_init(struct backing_dev_info *bdi)
INIT_LIST_HEAD(&bdi->bdi_list);
INIT_LIST_HEAD(&bdi->wb_list);
init_waitqueue_head(&bdi->wb_waitq);
+ bdi->last_bdp_sleep = jiffies;
return cgwb_bdi_init(bdi);
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index cd4e4ae77c40..cc37fa7f3364 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1921,7 +1921,7 @@ static int balance_dirty_pages(struct bdi_writeback *wb,
break;
}
__set_current_state(TASK_KILLABLE);
- wb->dirty_sleep = now;
+ bdi->last_bdp_sleep = jiffies;
io_schedule_timeout(pause);
current->dirty_paused_when = now + pause;
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x f814bdda774c183b0cc15ec8f3b6e7c6f4527ba5
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021920-latitude-quilt-dfa8@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
f814bdda774c ("blk-wbt: Fix detection of dirty-throttled tasks")
ba91c849fa50 ("blk-rq-qos: store a gendisk instead of request_queue in struct rq_qos")
3963d84df797 ("blk-rq-qos: constify rq_qos_ops")
ce57b558604e ("blk-rq-qos: make rq_qos_add and rq_qos_del more useful")
b494f9c566ba ("blk-rq-qos: move rq_qos_add and rq_qos_del out of line")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f814bdda774c183b0cc15ec8f3b6e7c6f4527ba5 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack(a)suse.cz>
Date: Tue, 23 Jan 2024 18:58:26 +0100
Subject: [PATCH] blk-wbt: Fix detection of dirty-throttled tasks
The detection of dirty-throttled tasks in blk-wbt has been subtly broken
since its beginning in 2016. Namely if we are doing cgroup writeback and
the throttled task is not in the root cgroup, balance_dirty_pages() will
set dirty_sleep for the non-root bdi_writeback structure. However
blk-wbt checks dirty_sleep only in the root cgroup bdi_writeback
structure. Thus detection of recently throttled tasks is not working in
this case (we noticed this when we switched to cgroup v2 and suddenly
writeback was slow).
Since blk-wbt has no easy way to get to proper bdi_writeback and
furthermore its intention has always been to work on the whole device
rather than on individual cgroups, just move the dirty_sleep timestamp
from bdi_writeback to backing_dev_info. That fixes the checking for
recently throttled task and saves memory for everybody as a bonus.
CC: stable(a)vger.kernel.org
Fixes: b57d74aff9ab ("writeback: track if we're sleeping on progress in balance_dirty_pages()")
Signed-off-by: Jan Kara <jack(a)suse.cz>
Link: https://lore.kernel.org/r/20240123175826.21452-1-jack@suse.cz
[axboe: fixup indentation errors]
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 5ba3cd574eac..0c0e270a8265 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -163,9 +163,9 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
*/
static bool wb_recent_wait(struct rq_wb *rwb)
{
- struct bdi_writeback *wb = &rwb->rqos.disk->bdi->wb;
+ struct backing_dev_info *bdi = rwb->rqos.disk->bdi;
- return time_before(jiffies, wb->dirty_sleep + HZ);
+ return time_before(jiffies, bdi->last_bdp_sleep + HZ);
}
static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb,
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index ae12696ec492..2ad261082bba 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -141,8 +141,6 @@ struct bdi_writeback {
struct delayed_work dwork; /* work item used for writeback */
struct delayed_work bw_dwork; /* work item used for bandwidth estimate */
- unsigned long dirty_sleep; /* last wait */
-
struct list_head bdi_node; /* anchored at bdi->wb_list */
#ifdef CONFIG_CGROUP_WRITEBACK
@@ -179,6 +177,11 @@ struct backing_dev_info {
* any dirty wbs, which is depended upon by bdi_has_dirty().
*/
atomic_long_t tot_write_bandwidth;
+ /*
+ * Jiffies when last process was dirty throttled on this bdi. Used by
+ * blk-wbt.
+ */
+ unsigned long last_bdp_sleep;
struct bdi_writeback wb; /* the root writeback info for this bdi */
struct list_head wb_list; /* list of all wbs */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 1e3447bccdb1..e039d05304dd 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -436,7 +436,6 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
INIT_LIST_HEAD(&wb->work_list);
INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn);
- wb->dirty_sleep = jiffies;
err = fprop_local_init_percpu(&wb->completions, gfp);
if (err)
@@ -921,6 +920,7 @@ int bdi_init(struct backing_dev_info *bdi)
INIT_LIST_HEAD(&bdi->bdi_list);
INIT_LIST_HEAD(&bdi->wb_list);
init_waitqueue_head(&bdi->wb_waitq);
+ bdi->last_bdp_sleep = jiffies;
return cgwb_bdi_init(bdi);
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index cd4e4ae77c40..cc37fa7f3364 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1921,7 +1921,7 @@ static int balance_dirty_pages(struct bdi_writeback *wb,
break;
}
__set_current_state(TASK_KILLABLE);
- wb->dirty_sleep = now;
+ bdi->last_bdp_sleep = jiffies;
io_schedule_timeout(pause);
current->dirty_paused_when = now + pause;
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x 9cae43da9867412f8bd09aee5c8a8dc5e8dc3dc2
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021940-unviable-dispersal-2253@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
9cae43da9867 ("hv_netvsc: Register VF in netvsc_probe if NET_DEVICE_REGISTER missed")
c807d6cd089d ("hv_netvsc: Mark VF as slave before exposing it to user-mode")
365e1ececb29 ("hv_netvsc: Fix race between VF offering and VF association message from host")
c60882a4566a ("hv_netvsc: use netif_is_bond_master() instead of open code")
d0922bf79817 ("hv_netvsc: Add error handling while switching data path")
34b06a2eee44 ("hv_netvsc: Process NETDEV_GOING_DOWN on VF hot remove")
8b31f8c982b7 ("hv_netvsc: Wait for completion on request SWITCH_DATA_PATH")
4d18fcc95f50 ("hv_netvsc: Use vmbus_requestor to generate transaction IDs for VMBus hardening")
e8b7db38449a ("Drivers: hv: vmbus: Add vmbus_requestor data structure for VMBus hardening")
4907a43da831 ("Merge tag 'hyperv-next-signed' of git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 9cae43da9867412f8bd09aee5c8a8dc5e8dc3dc2 Mon Sep 17 00:00:00 2001
From: Shradha Gupta <shradhagupta(a)linux.microsoft.com>
Date: Thu, 1 Feb 2024 20:40:38 -0800
Subject: [PATCH] hv_netvsc: Register VF in netvsc_probe if NET_DEVICE_REGISTER
missed
If hv_netvsc driver is unloaded and reloaded, the NET_DEVICE_REGISTER
handler cannot perform VF register successfully as the register call
is received before netvsc_probe is finished. This is because we
call register_netdevice_notifier() very early (even before
vmbus_driver_register()).
To fix this, we try to register each such matching VF (if it is visible
as a netdevice) at the end of netvsc_probe.
Cc: stable(a)vger.kernel.org
Fixes: 85520856466e ("hv_netvsc: Fix race of register_netdevice_notifier and VF register")
Suggested-by: Dexuan Cui <decui(a)microsoft.com>
Signed-off-by: Shradha Gupta <shradhagupta(a)linux.microsoft.com>
Reviewed-by: Haiyang Zhang <haiyangz(a)microsoft.com>
Reviewed-by: Dexuan Cui <decui(a)microsoft.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 273bd8a20122..11831a1c9762 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -42,6 +42,10 @@
#define LINKCHANGE_INT (2 * HZ)
#define VF_TAKEOVER_INT (HZ / 10)
+/* Macros to define the context of vf registration */
+#define VF_REG_IN_PROBE 1
+#define VF_REG_IN_NOTIFIER 2
+
static unsigned int ring_size __ro_after_init = 128;
module_param(ring_size, uint, 0444);
MODULE_PARM_DESC(ring_size, "Ring buffer size (# of 4K pages)");
@@ -2185,7 +2189,7 @@ static rx_handler_result_t netvsc_vf_handle_frame(struct sk_buff **pskb)
}
static int netvsc_vf_join(struct net_device *vf_netdev,
- struct net_device *ndev)
+ struct net_device *ndev, int context)
{
struct net_device_context *ndev_ctx = netdev_priv(ndev);
int ret;
@@ -2208,7 +2212,11 @@ static int netvsc_vf_join(struct net_device *vf_netdev,
goto upper_link_failed;
}
- schedule_delayed_work(&ndev_ctx->vf_takeover, VF_TAKEOVER_INT);
+ /* If this registration is called from probe context vf_takeover
+ * is taken care of later in probe itself.
+ */
+ if (context == VF_REG_IN_NOTIFIER)
+ schedule_delayed_work(&ndev_ctx->vf_takeover, VF_TAKEOVER_INT);
call_netdevice_notifiers(NETDEV_JOIN, vf_netdev);
@@ -2346,7 +2354,7 @@ static int netvsc_prepare_bonding(struct net_device *vf_netdev)
return NOTIFY_DONE;
}
-static int netvsc_register_vf(struct net_device *vf_netdev)
+static int netvsc_register_vf(struct net_device *vf_netdev, int context)
{
struct net_device_context *net_device_ctx;
struct netvsc_device *netvsc_dev;
@@ -2386,7 +2394,7 @@ static int netvsc_register_vf(struct net_device *vf_netdev)
netdev_info(ndev, "VF registering: %s\n", vf_netdev->name);
- if (netvsc_vf_join(vf_netdev, ndev) != 0)
+ if (netvsc_vf_join(vf_netdev, ndev, context) != 0)
return NOTIFY_DONE;
dev_hold(vf_netdev);
@@ -2484,10 +2492,31 @@ static int netvsc_unregister_vf(struct net_device *vf_netdev)
return NOTIFY_OK;
}
+static int check_dev_is_matching_vf(struct net_device *event_ndev)
+{
+ /* Skip NetVSC interfaces */
+ if (event_ndev->netdev_ops == &device_ops)
+ return -ENODEV;
+
+ /* Avoid non-Ethernet type devices */
+ if (event_ndev->type != ARPHRD_ETHER)
+ return -ENODEV;
+
+ /* Avoid Vlan dev with same MAC registering as VF */
+ if (is_vlan_dev(event_ndev))
+ return -ENODEV;
+
+ /* Avoid Bonding master dev with same MAC registering as VF */
+ if (netif_is_bond_master(event_ndev))
+ return -ENODEV;
+
+ return 0;
+}
+
static int netvsc_probe(struct hv_device *dev,
const struct hv_vmbus_device_id *dev_id)
{
- struct net_device *net = NULL;
+ struct net_device *net = NULL, *vf_netdev;
struct net_device_context *net_device_ctx;
struct netvsc_device_info *device_info = NULL;
struct netvsc_device *nvdev;
@@ -2599,6 +2628,30 @@ static int netvsc_probe(struct hv_device *dev,
}
list_add(&net_device_ctx->list, &netvsc_dev_list);
+
+ /* When the hv_netvsc driver is unloaded and reloaded, the
+ * NET_DEVICE_REGISTER for the vf device is replayed before probe
+ * is complete. This is because register_netdevice_notifier() gets
+ * registered before vmbus_driver_register() so that callback func
+ * is set before probe and we don't miss events like NETDEV_POST_INIT
+ * So, in this section we try to register the matching vf device that
+ * is present as a netdevice, knowing that its register call is not
+ * processed in the netvsc_netdev_notifier(as probing is progress and
+ * get_netvsc_byslot fails).
+ */
+ for_each_netdev(dev_net(net), vf_netdev) {
+ ret = check_dev_is_matching_vf(vf_netdev);
+ if (ret != 0)
+ continue;
+
+ if (net != get_netvsc_byslot(vf_netdev))
+ continue;
+
+ netvsc_prepare_bonding(vf_netdev);
+ netvsc_register_vf(vf_netdev, VF_REG_IN_PROBE);
+ __netvsc_vf_setup(net, vf_netdev);
+ break;
+ }
rtnl_unlock();
netvsc_devinfo_put(device_info);
@@ -2754,28 +2807,17 @@ static int netvsc_netdev_event(struct notifier_block *this,
unsigned long event, void *ptr)
{
struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
+ int ret = 0;
- /* Skip our own events */
- if (event_dev->netdev_ops == &device_ops)
- return NOTIFY_DONE;
-
- /* Avoid non-Ethernet type devices */
- if (event_dev->type != ARPHRD_ETHER)
- return NOTIFY_DONE;
-
- /* Avoid Vlan dev with same MAC registering as VF */
- if (is_vlan_dev(event_dev))
- return NOTIFY_DONE;
-
- /* Avoid Bonding master dev with same MAC registering as VF */
- if (netif_is_bond_master(event_dev))
+ ret = check_dev_is_matching_vf(event_dev);
+ if (ret != 0)
return NOTIFY_DONE;
switch (event) {
case NETDEV_POST_INIT:
return netvsc_prepare_bonding(event_dev);
case NETDEV_REGISTER:
- return netvsc_register_vf(event_dev);
+ return netvsc_register_vf(event_dev, VF_REG_IN_NOTIFIER);
case NETDEV_UNREGISTER:
return netvsc_unregister_vf(event_dev);
case NETDEV_UP:
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 9cae43da9867412f8bd09aee5c8a8dc5e8dc3dc2
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021939-deceiving-skinny-32a6@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
9cae43da9867 ("hv_netvsc: Register VF in netvsc_probe if NET_DEVICE_REGISTER missed")
c807d6cd089d ("hv_netvsc: Mark VF as slave before exposing it to user-mode")
365e1ececb29 ("hv_netvsc: Fix race between VF offering and VF association message from host")
c60882a4566a ("hv_netvsc: use netif_is_bond_master() instead of open code")
d0922bf79817 ("hv_netvsc: Add error handling while switching data path")
34b06a2eee44 ("hv_netvsc: Process NETDEV_GOING_DOWN on VF hot remove")
8b31f8c982b7 ("hv_netvsc: Wait for completion on request SWITCH_DATA_PATH")
4d18fcc95f50 ("hv_netvsc: Use vmbus_requestor to generate transaction IDs for VMBus hardening")
e8b7db38449a ("Drivers: hv: vmbus: Add vmbus_requestor data structure for VMBus hardening")
4907a43da831 ("Merge tag 'hyperv-next-signed' of git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 9cae43da9867412f8bd09aee5c8a8dc5e8dc3dc2 Mon Sep 17 00:00:00 2001
From: Shradha Gupta <shradhagupta(a)linux.microsoft.com>
Date: Thu, 1 Feb 2024 20:40:38 -0800
Subject: [PATCH] hv_netvsc: Register VF in netvsc_probe if NET_DEVICE_REGISTER
missed
If hv_netvsc driver is unloaded and reloaded, the NET_DEVICE_REGISTER
handler cannot perform VF register successfully as the register call
is received before netvsc_probe is finished. This is because we
call register_netdevice_notifier() very early (even before
vmbus_driver_register()).
To fix this, we try to register each such matching VF (if it is visible
as a netdevice) at the end of netvsc_probe.
Cc: stable(a)vger.kernel.org
Fixes: 85520856466e ("hv_netvsc: Fix race of register_netdevice_notifier and VF register")
Suggested-by: Dexuan Cui <decui(a)microsoft.com>
Signed-off-by: Shradha Gupta <shradhagupta(a)linux.microsoft.com>
Reviewed-by: Haiyang Zhang <haiyangz(a)microsoft.com>
Reviewed-by: Dexuan Cui <decui(a)microsoft.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 273bd8a20122..11831a1c9762 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -42,6 +42,10 @@
#define LINKCHANGE_INT (2 * HZ)
#define VF_TAKEOVER_INT (HZ / 10)
+/* Macros to define the context of vf registration */
+#define VF_REG_IN_PROBE 1
+#define VF_REG_IN_NOTIFIER 2
+
static unsigned int ring_size __ro_after_init = 128;
module_param(ring_size, uint, 0444);
MODULE_PARM_DESC(ring_size, "Ring buffer size (# of 4K pages)");
@@ -2185,7 +2189,7 @@ static rx_handler_result_t netvsc_vf_handle_frame(struct sk_buff **pskb)
}
static int netvsc_vf_join(struct net_device *vf_netdev,
- struct net_device *ndev)
+ struct net_device *ndev, int context)
{
struct net_device_context *ndev_ctx = netdev_priv(ndev);
int ret;
@@ -2208,7 +2212,11 @@ static int netvsc_vf_join(struct net_device *vf_netdev,
goto upper_link_failed;
}
- schedule_delayed_work(&ndev_ctx->vf_takeover, VF_TAKEOVER_INT);
+ /* If this registration is called from probe context vf_takeover
+ * is taken care of later in probe itself.
+ */
+ if (context == VF_REG_IN_NOTIFIER)
+ schedule_delayed_work(&ndev_ctx->vf_takeover, VF_TAKEOVER_INT);
call_netdevice_notifiers(NETDEV_JOIN, vf_netdev);
@@ -2346,7 +2354,7 @@ static int netvsc_prepare_bonding(struct net_device *vf_netdev)
return NOTIFY_DONE;
}
-static int netvsc_register_vf(struct net_device *vf_netdev)
+static int netvsc_register_vf(struct net_device *vf_netdev, int context)
{
struct net_device_context *net_device_ctx;
struct netvsc_device *netvsc_dev;
@@ -2386,7 +2394,7 @@ static int netvsc_register_vf(struct net_device *vf_netdev)
netdev_info(ndev, "VF registering: %s\n", vf_netdev->name);
- if (netvsc_vf_join(vf_netdev, ndev) != 0)
+ if (netvsc_vf_join(vf_netdev, ndev, context) != 0)
return NOTIFY_DONE;
dev_hold(vf_netdev);
@@ -2484,10 +2492,31 @@ static int netvsc_unregister_vf(struct net_device *vf_netdev)
return NOTIFY_OK;
}
+static int check_dev_is_matching_vf(struct net_device *event_ndev)
+{
+ /* Skip NetVSC interfaces */
+ if (event_ndev->netdev_ops == &device_ops)
+ return -ENODEV;
+
+ /* Avoid non-Ethernet type devices */
+ if (event_ndev->type != ARPHRD_ETHER)
+ return -ENODEV;
+
+ /* Avoid Vlan dev with same MAC registering as VF */
+ if (is_vlan_dev(event_ndev))
+ return -ENODEV;
+
+ /* Avoid Bonding master dev with same MAC registering as VF */
+ if (netif_is_bond_master(event_ndev))
+ return -ENODEV;
+
+ return 0;
+}
+
static int netvsc_probe(struct hv_device *dev,
const struct hv_vmbus_device_id *dev_id)
{
- struct net_device *net = NULL;
+ struct net_device *net = NULL, *vf_netdev;
struct net_device_context *net_device_ctx;
struct netvsc_device_info *device_info = NULL;
struct netvsc_device *nvdev;
@@ -2599,6 +2628,30 @@ static int netvsc_probe(struct hv_device *dev,
}
list_add(&net_device_ctx->list, &netvsc_dev_list);
+
+ /* When the hv_netvsc driver is unloaded and reloaded, the
+ * NET_DEVICE_REGISTER for the vf device is replayed before probe
+ * is complete. This is because register_netdevice_notifier() gets
+ * registered before vmbus_driver_register() so that callback func
+ * is set before probe and we don't miss events like NETDEV_POST_INIT
+ * So, in this section we try to register the matching vf device that
+ * is present as a netdevice, knowing that its register call is not
+ * processed in the netvsc_netdev_notifier(as probing is progress and
+ * get_netvsc_byslot fails).
+ */
+ for_each_netdev(dev_net(net), vf_netdev) {
+ ret = check_dev_is_matching_vf(vf_netdev);
+ if (ret != 0)
+ continue;
+
+ if (net != get_netvsc_byslot(vf_netdev))
+ continue;
+
+ netvsc_prepare_bonding(vf_netdev);
+ netvsc_register_vf(vf_netdev, VF_REG_IN_PROBE);
+ __netvsc_vf_setup(net, vf_netdev);
+ break;
+ }
rtnl_unlock();
netvsc_devinfo_put(device_info);
@@ -2754,28 +2807,17 @@ static int netvsc_netdev_event(struct notifier_block *this,
unsigned long event, void *ptr)
{
struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
+ int ret = 0;
- /* Skip our own events */
- if (event_dev->netdev_ops == &device_ops)
- return NOTIFY_DONE;
-
- /* Avoid non-Ethernet type devices */
- if (event_dev->type != ARPHRD_ETHER)
- return NOTIFY_DONE;
-
- /* Avoid Vlan dev with same MAC registering as VF */
- if (is_vlan_dev(event_dev))
- return NOTIFY_DONE;
-
- /* Avoid Bonding master dev with same MAC registering as VF */
- if (netif_is_bond_master(event_dev))
+ ret = check_dev_is_matching_vf(event_dev);
+ if (ret != 0)
return NOTIFY_DONE;
switch (event) {
case NETDEV_POST_INIT:
return netvsc_prepare_bonding(event_dev);
case NETDEV_REGISTER:
- return netvsc_register_vf(event_dev);
+ return netvsc_register_vf(event_dev, VF_REG_IN_NOTIFIER);
case NETDEV_UNREGISTER:
return netvsc_unregister_vf(event_dev);
case NETDEV_UP:
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 9cae43da9867412f8bd09aee5c8a8dc5e8dc3dc2
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021938-legislate-polygon-4765@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
9cae43da9867 ("hv_netvsc: Register VF in netvsc_probe if NET_DEVICE_REGISTER missed")
c807d6cd089d ("hv_netvsc: Mark VF as slave before exposing it to user-mode")
365e1ececb29 ("hv_netvsc: Fix race between VF offering and VF association message from host")
c60882a4566a ("hv_netvsc: use netif_is_bond_master() instead of open code")
d0922bf79817 ("hv_netvsc: Add error handling while switching data path")
34b06a2eee44 ("hv_netvsc: Process NETDEV_GOING_DOWN on VF hot remove")
8b31f8c982b7 ("hv_netvsc: Wait for completion on request SWITCH_DATA_PATH")
4d18fcc95f50 ("hv_netvsc: Use vmbus_requestor to generate transaction IDs for VMBus hardening")
e8b7db38449a ("Drivers: hv: vmbus: Add vmbus_requestor data structure for VMBus hardening")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 9cae43da9867412f8bd09aee5c8a8dc5e8dc3dc2 Mon Sep 17 00:00:00 2001
From: Shradha Gupta <shradhagupta(a)linux.microsoft.com>
Date: Thu, 1 Feb 2024 20:40:38 -0800
Subject: [PATCH] hv_netvsc: Register VF in netvsc_probe if NET_DEVICE_REGISTER
missed
If hv_netvsc driver is unloaded and reloaded, the NET_DEVICE_REGISTER
handler cannot perform VF register successfully as the register call
is received before netvsc_probe is finished. This is because we
call register_netdevice_notifier() very early (even before
vmbus_driver_register()).
To fix this, we try to register each such matching VF (if it is visible
as a netdevice) at the end of netvsc_probe.
Cc: stable(a)vger.kernel.org
Fixes: 85520856466e ("hv_netvsc: Fix race of register_netdevice_notifier and VF register")
Suggested-by: Dexuan Cui <decui(a)microsoft.com>
Signed-off-by: Shradha Gupta <shradhagupta(a)linux.microsoft.com>
Reviewed-by: Haiyang Zhang <haiyangz(a)microsoft.com>
Reviewed-by: Dexuan Cui <decui(a)microsoft.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 273bd8a20122..11831a1c9762 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -42,6 +42,10 @@
#define LINKCHANGE_INT (2 * HZ)
#define VF_TAKEOVER_INT (HZ / 10)
+/* Macros to define the context of vf registration */
+#define VF_REG_IN_PROBE 1
+#define VF_REG_IN_NOTIFIER 2
+
static unsigned int ring_size __ro_after_init = 128;
module_param(ring_size, uint, 0444);
MODULE_PARM_DESC(ring_size, "Ring buffer size (# of 4K pages)");
@@ -2185,7 +2189,7 @@ static rx_handler_result_t netvsc_vf_handle_frame(struct sk_buff **pskb)
}
static int netvsc_vf_join(struct net_device *vf_netdev,
- struct net_device *ndev)
+ struct net_device *ndev, int context)
{
struct net_device_context *ndev_ctx = netdev_priv(ndev);
int ret;
@@ -2208,7 +2212,11 @@ static int netvsc_vf_join(struct net_device *vf_netdev,
goto upper_link_failed;
}
- schedule_delayed_work(&ndev_ctx->vf_takeover, VF_TAKEOVER_INT);
+ /* If this registration is called from probe context vf_takeover
+ * is taken care of later in probe itself.
+ */
+ if (context == VF_REG_IN_NOTIFIER)
+ schedule_delayed_work(&ndev_ctx->vf_takeover, VF_TAKEOVER_INT);
call_netdevice_notifiers(NETDEV_JOIN, vf_netdev);
@@ -2346,7 +2354,7 @@ static int netvsc_prepare_bonding(struct net_device *vf_netdev)
return NOTIFY_DONE;
}
-static int netvsc_register_vf(struct net_device *vf_netdev)
+static int netvsc_register_vf(struct net_device *vf_netdev, int context)
{
struct net_device_context *net_device_ctx;
struct netvsc_device *netvsc_dev;
@@ -2386,7 +2394,7 @@ static int netvsc_register_vf(struct net_device *vf_netdev)
netdev_info(ndev, "VF registering: %s\n", vf_netdev->name);
- if (netvsc_vf_join(vf_netdev, ndev) != 0)
+ if (netvsc_vf_join(vf_netdev, ndev, context) != 0)
return NOTIFY_DONE;
dev_hold(vf_netdev);
@@ -2484,10 +2492,31 @@ static int netvsc_unregister_vf(struct net_device *vf_netdev)
return NOTIFY_OK;
}
+static int check_dev_is_matching_vf(struct net_device *event_ndev)
+{
+ /* Skip NetVSC interfaces */
+ if (event_ndev->netdev_ops == &device_ops)
+ return -ENODEV;
+
+ /* Avoid non-Ethernet type devices */
+ if (event_ndev->type != ARPHRD_ETHER)
+ return -ENODEV;
+
+ /* Avoid Vlan dev with same MAC registering as VF */
+ if (is_vlan_dev(event_ndev))
+ return -ENODEV;
+
+ /* Avoid Bonding master dev with same MAC registering as VF */
+ if (netif_is_bond_master(event_ndev))
+ return -ENODEV;
+
+ return 0;
+}
+
static int netvsc_probe(struct hv_device *dev,
const struct hv_vmbus_device_id *dev_id)
{
- struct net_device *net = NULL;
+ struct net_device *net = NULL, *vf_netdev;
struct net_device_context *net_device_ctx;
struct netvsc_device_info *device_info = NULL;
struct netvsc_device *nvdev;
@@ -2599,6 +2628,30 @@ static int netvsc_probe(struct hv_device *dev,
}
list_add(&net_device_ctx->list, &netvsc_dev_list);
+
+ /* When the hv_netvsc driver is unloaded and reloaded, the
+ * NET_DEVICE_REGISTER for the vf device is replayed before probe
+ * is complete. This is because register_netdevice_notifier() gets
+ * registered before vmbus_driver_register() so that callback func
+ * is set before probe and we don't miss events like NETDEV_POST_INIT
+ * So, in this section we try to register the matching vf device that
+ * is present as a netdevice, knowing that its register call is not
+ * processed in the netvsc_netdev_notifier(as probing is progress and
+ * get_netvsc_byslot fails).
+ */
+ for_each_netdev(dev_net(net), vf_netdev) {
+ ret = check_dev_is_matching_vf(vf_netdev);
+ if (ret != 0)
+ continue;
+
+ if (net != get_netvsc_byslot(vf_netdev))
+ continue;
+
+ netvsc_prepare_bonding(vf_netdev);
+ netvsc_register_vf(vf_netdev, VF_REG_IN_PROBE);
+ __netvsc_vf_setup(net, vf_netdev);
+ break;
+ }
rtnl_unlock();
netvsc_devinfo_put(device_info);
@@ -2754,28 +2807,17 @@ static int netvsc_netdev_event(struct notifier_block *this,
unsigned long event, void *ptr)
{
struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
+ int ret = 0;
- /* Skip our own events */
- if (event_dev->netdev_ops == &device_ops)
- return NOTIFY_DONE;
-
- /* Avoid non-Ethernet type devices */
- if (event_dev->type != ARPHRD_ETHER)
- return NOTIFY_DONE;
-
- /* Avoid Vlan dev with same MAC registering as VF */
- if (is_vlan_dev(event_dev))
- return NOTIFY_DONE;
-
- /* Avoid Bonding master dev with same MAC registering as VF */
- if (netif_is_bond_master(event_dev))
+ ret = check_dev_is_matching_vf(event_dev);
+ if (ret != 0)
return NOTIFY_DONE;
switch (event) {
case NETDEV_POST_INIT:
return netvsc_prepare_bonding(event_dev);
case NETDEV_REGISTER:
- return netvsc_register_vf(event_dev);
+ return netvsc_register_vf(event_dev, VF_REG_IN_NOTIFIER);
case NETDEV_UNREGISTER:
return netvsc_unregister_vf(event_dev);
case NETDEV_UP:
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 9cae43da9867412f8bd09aee5c8a8dc5e8dc3dc2
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021937-trace-hardly-10d8@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
9cae43da9867 ("hv_netvsc: Register VF in netvsc_probe if NET_DEVICE_REGISTER missed")
c807d6cd089d ("hv_netvsc: Mark VF as slave before exposing it to user-mode")
365e1ececb29 ("hv_netvsc: Fix race between VF offering and VF association message from host")
c60882a4566a ("hv_netvsc: use netif_is_bond_master() instead of open code")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 9cae43da9867412f8bd09aee5c8a8dc5e8dc3dc2 Mon Sep 17 00:00:00 2001
From: Shradha Gupta <shradhagupta(a)linux.microsoft.com>
Date: Thu, 1 Feb 2024 20:40:38 -0800
Subject: [PATCH] hv_netvsc: Register VF in netvsc_probe if NET_DEVICE_REGISTER
missed
If hv_netvsc driver is unloaded and reloaded, the NET_DEVICE_REGISTER
handler cannot perform VF register successfully as the register call
is received before netvsc_probe is finished. This is because we
call register_netdevice_notifier() very early (even before
vmbus_driver_register()).
To fix this, we try to register each such matching VF (if it is visible
as a netdevice) at the end of netvsc_probe.
Cc: stable(a)vger.kernel.org
Fixes: 85520856466e ("hv_netvsc: Fix race of register_netdevice_notifier and VF register")
Suggested-by: Dexuan Cui <decui(a)microsoft.com>
Signed-off-by: Shradha Gupta <shradhagupta(a)linux.microsoft.com>
Reviewed-by: Haiyang Zhang <haiyangz(a)microsoft.com>
Reviewed-by: Dexuan Cui <decui(a)microsoft.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 273bd8a20122..11831a1c9762 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -42,6 +42,10 @@
#define LINKCHANGE_INT (2 * HZ)
#define VF_TAKEOVER_INT (HZ / 10)
+/* Macros to define the context of vf registration */
+#define VF_REG_IN_PROBE 1
+#define VF_REG_IN_NOTIFIER 2
+
static unsigned int ring_size __ro_after_init = 128;
module_param(ring_size, uint, 0444);
MODULE_PARM_DESC(ring_size, "Ring buffer size (# of 4K pages)");
@@ -2185,7 +2189,7 @@ static rx_handler_result_t netvsc_vf_handle_frame(struct sk_buff **pskb)
}
static int netvsc_vf_join(struct net_device *vf_netdev,
- struct net_device *ndev)
+ struct net_device *ndev, int context)
{
struct net_device_context *ndev_ctx = netdev_priv(ndev);
int ret;
@@ -2208,7 +2212,11 @@ static int netvsc_vf_join(struct net_device *vf_netdev,
goto upper_link_failed;
}
- schedule_delayed_work(&ndev_ctx->vf_takeover, VF_TAKEOVER_INT);
+ /* If this registration is called from probe context vf_takeover
+ * is taken care of later in probe itself.
+ */
+ if (context == VF_REG_IN_NOTIFIER)
+ schedule_delayed_work(&ndev_ctx->vf_takeover, VF_TAKEOVER_INT);
call_netdevice_notifiers(NETDEV_JOIN, vf_netdev);
@@ -2346,7 +2354,7 @@ static int netvsc_prepare_bonding(struct net_device *vf_netdev)
return NOTIFY_DONE;
}
-static int netvsc_register_vf(struct net_device *vf_netdev)
+static int netvsc_register_vf(struct net_device *vf_netdev, int context)
{
struct net_device_context *net_device_ctx;
struct netvsc_device *netvsc_dev;
@@ -2386,7 +2394,7 @@ static int netvsc_register_vf(struct net_device *vf_netdev)
netdev_info(ndev, "VF registering: %s\n", vf_netdev->name);
- if (netvsc_vf_join(vf_netdev, ndev) != 0)
+ if (netvsc_vf_join(vf_netdev, ndev, context) != 0)
return NOTIFY_DONE;
dev_hold(vf_netdev);
@@ -2484,10 +2492,31 @@ static int netvsc_unregister_vf(struct net_device *vf_netdev)
return NOTIFY_OK;
}
+static int check_dev_is_matching_vf(struct net_device *event_ndev)
+{
+ /* Skip NetVSC interfaces */
+ if (event_ndev->netdev_ops == &device_ops)
+ return -ENODEV;
+
+ /* Avoid non-Ethernet type devices */
+ if (event_ndev->type != ARPHRD_ETHER)
+ return -ENODEV;
+
+ /* Avoid Vlan dev with same MAC registering as VF */
+ if (is_vlan_dev(event_ndev))
+ return -ENODEV;
+
+ /* Avoid Bonding master dev with same MAC registering as VF */
+ if (netif_is_bond_master(event_ndev))
+ return -ENODEV;
+
+ return 0;
+}
+
static int netvsc_probe(struct hv_device *dev,
const struct hv_vmbus_device_id *dev_id)
{
- struct net_device *net = NULL;
+ struct net_device *net = NULL, *vf_netdev;
struct net_device_context *net_device_ctx;
struct netvsc_device_info *device_info = NULL;
struct netvsc_device *nvdev;
@@ -2599,6 +2628,30 @@ static int netvsc_probe(struct hv_device *dev,
}
list_add(&net_device_ctx->list, &netvsc_dev_list);
+
+ /* When the hv_netvsc driver is unloaded and reloaded, the
+ * NET_DEVICE_REGISTER for the vf device is replayed before probe
+ * is complete. This is because register_netdevice_notifier() gets
+ * registered before vmbus_driver_register() so that callback func
+ * is set before probe and we don't miss events like NETDEV_POST_INIT
+ * So, in this section we try to register the matching vf device that
+ * is present as a netdevice, knowing that its register call is not
+ * processed in the netvsc_netdev_notifier(as probing is progress and
+ * get_netvsc_byslot fails).
+ */
+ for_each_netdev(dev_net(net), vf_netdev) {
+ ret = check_dev_is_matching_vf(vf_netdev);
+ if (ret != 0)
+ continue;
+
+ if (net != get_netvsc_byslot(vf_netdev))
+ continue;
+
+ netvsc_prepare_bonding(vf_netdev);
+ netvsc_register_vf(vf_netdev, VF_REG_IN_PROBE);
+ __netvsc_vf_setup(net, vf_netdev);
+ break;
+ }
rtnl_unlock();
netvsc_devinfo_put(device_info);
@@ -2754,28 +2807,17 @@ static int netvsc_netdev_event(struct notifier_block *this,
unsigned long event, void *ptr)
{
struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
+ int ret = 0;
- /* Skip our own events */
- if (event_dev->netdev_ops == &device_ops)
- return NOTIFY_DONE;
-
- /* Avoid non-Ethernet type devices */
- if (event_dev->type != ARPHRD_ETHER)
- return NOTIFY_DONE;
-
- /* Avoid Vlan dev with same MAC registering as VF */
- if (is_vlan_dev(event_dev))
- return NOTIFY_DONE;
-
- /* Avoid Bonding master dev with same MAC registering as VF */
- if (netif_is_bond_master(event_dev))
+ ret = check_dev_is_matching_vf(event_dev);
+ if (ret != 0)
return NOTIFY_DONE;
switch (event) {
case NETDEV_POST_INIT:
return netvsc_prepare_bonding(event_dev);
case NETDEV_REGISTER:
- return netvsc_register_vf(event_dev);
+ return netvsc_register_vf(event_dev, VF_REG_IN_NOTIFIER);
case NETDEV_UNREGISTER:
return netvsc_unregister_vf(event_dev);
case NETDEV_UP:
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x f0e4a1356466ec1858ae8e5c70bea2ce5e55008b
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021948-carload-deniable-e02d@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
f0e4a1356466 ("pmdomain: renesas: r8a77980-sysc: CR7 must be always on")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f0e4a1356466ec1858ae8e5c70bea2ce5e55008b Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas(a)glider.be>
Date: Fri, 12 Jan 2024 17:33:55 +0100
Subject: [PATCH] pmdomain: renesas: r8a77980-sysc: CR7 must be always on
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The power domain containing the Cortex-R7 CPU core on the R-Car V3H SoC
must always be in power-on state, unlike on other SoCs in the R-Car Gen3
family. See Table 9.4 "Power domains" in the R-Car Series, 3rd
Generation Hardware User’s Manual Rev.1.00 and later.
Fix this by marking the domain as a CPU domain without control
registers, so the driver will not touch it.
Fixes: 41d6d8bd8ae9 ("soc: renesas: rcar-sysc: add R8A77980 support")
Signed-off-by: Geert Uytterhoeven <geert+renesas(a)glider.be>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/fdad9a86132d53ecddf72b734dac406915c4edc0.17050767…
Signed-off-by: Ulf Hansson <ulf.hansson(a)linaro.org>
diff --git a/drivers/pmdomain/renesas/r8a77980-sysc.c b/drivers/pmdomain/renesas/r8a77980-sysc.c
index 39ca84a67daa..621e411fc999 100644
--- a/drivers/pmdomain/renesas/r8a77980-sysc.c
+++ b/drivers/pmdomain/renesas/r8a77980-sysc.c
@@ -25,7 +25,8 @@ static const struct rcar_sysc_area r8a77980_areas[] __initconst = {
PD_CPU_NOCR },
{ "ca53-cpu3", 0x200, 3, R8A77980_PD_CA53_CPU3, R8A77980_PD_CA53_SCU,
PD_CPU_NOCR },
- { "cr7", 0x240, 0, R8A77980_PD_CR7, R8A77980_PD_ALWAYS_ON },
+ { "cr7", 0x240, 0, R8A77980_PD_CR7, R8A77980_PD_ALWAYS_ON,
+ PD_CPU_NOCR },
{ "a3ir", 0x180, 0, R8A77980_PD_A3IR, R8A77980_PD_ALWAYS_ON },
{ "a2ir0", 0x400, 0, R8A77980_PD_A2IR0, R8A77980_PD_A3IR },
{ "a2ir1", 0x400, 1, R8A77980_PD_A2IR1, R8A77980_PD_A3IR },
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x f0e4a1356466ec1858ae8e5c70bea2ce5e55008b
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021947-dividers-resize-9476@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
f0e4a1356466 ("pmdomain: renesas: r8a77980-sysc: CR7 must be always on")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f0e4a1356466ec1858ae8e5c70bea2ce5e55008b Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas(a)glider.be>
Date: Fri, 12 Jan 2024 17:33:55 +0100
Subject: [PATCH] pmdomain: renesas: r8a77980-sysc: CR7 must be always on
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The power domain containing the Cortex-R7 CPU core on the R-Car V3H SoC
must always be in power-on state, unlike on other SoCs in the R-Car Gen3
family. See Table 9.4 "Power domains" in the R-Car Series, 3rd
Generation Hardware User’s Manual Rev.1.00 and later.
Fix this by marking the domain as a CPU domain without control
registers, so the driver will not touch it.
Fixes: 41d6d8bd8ae9 ("soc: renesas: rcar-sysc: add R8A77980 support")
Signed-off-by: Geert Uytterhoeven <geert+renesas(a)glider.be>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/fdad9a86132d53ecddf72b734dac406915c4edc0.17050767…
Signed-off-by: Ulf Hansson <ulf.hansson(a)linaro.org>
diff --git a/drivers/pmdomain/renesas/r8a77980-sysc.c b/drivers/pmdomain/renesas/r8a77980-sysc.c
index 39ca84a67daa..621e411fc999 100644
--- a/drivers/pmdomain/renesas/r8a77980-sysc.c
+++ b/drivers/pmdomain/renesas/r8a77980-sysc.c
@@ -25,7 +25,8 @@ static const struct rcar_sysc_area r8a77980_areas[] __initconst = {
PD_CPU_NOCR },
{ "ca53-cpu3", 0x200, 3, R8A77980_PD_CA53_CPU3, R8A77980_PD_CA53_SCU,
PD_CPU_NOCR },
- { "cr7", 0x240, 0, R8A77980_PD_CR7, R8A77980_PD_ALWAYS_ON },
+ { "cr7", 0x240, 0, R8A77980_PD_CR7, R8A77980_PD_ALWAYS_ON,
+ PD_CPU_NOCR },
{ "a3ir", 0x180, 0, R8A77980_PD_A3IR, R8A77980_PD_ALWAYS_ON },
{ "a2ir0", 0x400, 0, R8A77980_PD_A2IR0, R8A77980_PD_A3IR },
{ "a2ir1", 0x400, 1, R8A77980_PD_A2IR1, R8A77980_PD_A3IR },
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x f0e4a1356466ec1858ae8e5c70bea2ce5e55008b
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021946-unfitting-schnapps-e6e2@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
f0e4a1356466 ("pmdomain: renesas: r8a77980-sysc: CR7 must be always on")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f0e4a1356466ec1858ae8e5c70bea2ce5e55008b Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas(a)glider.be>
Date: Fri, 12 Jan 2024 17:33:55 +0100
Subject: [PATCH] pmdomain: renesas: r8a77980-sysc: CR7 must be always on
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The power domain containing the Cortex-R7 CPU core on the R-Car V3H SoC
must always be in power-on state, unlike on other SoCs in the R-Car Gen3
family. See Table 9.4 "Power domains" in the R-Car Series, 3rd
Generation Hardware User’s Manual Rev.1.00 and later.
Fix this by marking the domain as a CPU domain without control
registers, so the driver will not touch it.
Fixes: 41d6d8bd8ae9 ("soc: renesas: rcar-sysc: add R8A77980 support")
Signed-off-by: Geert Uytterhoeven <geert+renesas(a)glider.be>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/fdad9a86132d53ecddf72b734dac406915c4edc0.17050767…
Signed-off-by: Ulf Hansson <ulf.hansson(a)linaro.org>
diff --git a/drivers/pmdomain/renesas/r8a77980-sysc.c b/drivers/pmdomain/renesas/r8a77980-sysc.c
index 39ca84a67daa..621e411fc999 100644
--- a/drivers/pmdomain/renesas/r8a77980-sysc.c
+++ b/drivers/pmdomain/renesas/r8a77980-sysc.c
@@ -25,7 +25,8 @@ static const struct rcar_sysc_area r8a77980_areas[] __initconst = {
PD_CPU_NOCR },
{ "ca53-cpu3", 0x200, 3, R8A77980_PD_CA53_CPU3, R8A77980_PD_CA53_SCU,
PD_CPU_NOCR },
- { "cr7", 0x240, 0, R8A77980_PD_CR7, R8A77980_PD_ALWAYS_ON },
+ { "cr7", 0x240, 0, R8A77980_PD_CR7, R8A77980_PD_ALWAYS_ON,
+ PD_CPU_NOCR },
{ "a3ir", 0x180, 0, R8A77980_PD_A3IR, R8A77980_PD_ALWAYS_ON },
{ "a2ir0", 0x400, 0, R8A77980_PD_A2IR0, R8A77980_PD_A3IR },
{ "a2ir1", 0x400, 1, R8A77980_PD_A2IR1, R8A77980_PD_A3IR },
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x f0e4a1356466ec1858ae8e5c70bea2ce5e55008b
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021945-deletion-semifinal-2050@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
f0e4a1356466 ("pmdomain: renesas: r8a77980-sysc: CR7 must be always on")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f0e4a1356466ec1858ae8e5c70bea2ce5e55008b Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas(a)glider.be>
Date: Fri, 12 Jan 2024 17:33:55 +0100
Subject: [PATCH] pmdomain: renesas: r8a77980-sysc: CR7 must be always on
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The power domain containing the Cortex-R7 CPU core on the R-Car V3H SoC
must always be in power-on state, unlike on other SoCs in the R-Car Gen3
family. See Table 9.4 "Power domains" in the R-Car Series, 3rd
Generation Hardware User’s Manual Rev.1.00 and later.
Fix this by marking the domain as a CPU domain without control
registers, so the driver will not touch it.
Fixes: 41d6d8bd8ae9 ("soc: renesas: rcar-sysc: add R8A77980 support")
Signed-off-by: Geert Uytterhoeven <geert+renesas(a)glider.be>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/fdad9a86132d53ecddf72b734dac406915c4edc0.17050767…
Signed-off-by: Ulf Hansson <ulf.hansson(a)linaro.org>
diff --git a/drivers/pmdomain/renesas/r8a77980-sysc.c b/drivers/pmdomain/renesas/r8a77980-sysc.c
index 39ca84a67daa..621e411fc999 100644
--- a/drivers/pmdomain/renesas/r8a77980-sysc.c
+++ b/drivers/pmdomain/renesas/r8a77980-sysc.c
@@ -25,7 +25,8 @@ static const struct rcar_sysc_area r8a77980_areas[] __initconst = {
PD_CPU_NOCR },
{ "ca53-cpu3", 0x200, 3, R8A77980_PD_CA53_CPU3, R8A77980_PD_CA53_SCU,
PD_CPU_NOCR },
- { "cr7", 0x240, 0, R8A77980_PD_CR7, R8A77980_PD_ALWAYS_ON },
+ { "cr7", 0x240, 0, R8A77980_PD_CR7, R8A77980_PD_ALWAYS_ON,
+ PD_CPU_NOCR },
{ "a3ir", 0x180, 0, R8A77980_PD_A3IR, R8A77980_PD_ALWAYS_ON },
{ "a2ir0", 0x400, 0, R8A77980_PD_A2IR0, R8A77980_PD_A3IR },
{ "a2ir1", 0x400, 1, R8A77980_PD_A2IR1, R8A77980_PD_A3IR },
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x f0e4a1356466ec1858ae8e5c70bea2ce5e55008b
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021944-left-dazzling-8e79@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
f0e4a1356466 ("pmdomain: renesas: r8a77980-sysc: CR7 must be always on")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f0e4a1356466ec1858ae8e5c70bea2ce5e55008b Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas(a)glider.be>
Date: Fri, 12 Jan 2024 17:33:55 +0100
Subject: [PATCH] pmdomain: renesas: r8a77980-sysc: CR7 must be always on
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The power domain containing the Cortex-R7 CPU core on the R-Car V3H SoC
must always be in power-on state, unlike on other SoCs in the R-Car Gen3
family. See Table 9.4 "Power domains" in the R-Car Series, 3rd
Generation Hardware User’s Manual Rev.1.00 and later.
Fix this by marking the domain as a CPU domain without control
registers, so the driver will not touch it.
Fixes: 41d6d8bd8ae9 ("soc: renesas: rcar-sysc: add R8A77980 support")
Signed-off-by: Geert Uytterhoeven <geert+renesas(a)glider.be>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/fdad9a86132d53ecddf72b734dac406915c4edc0.17050767…
Signed-off-by: Ulf Hansson <ulf.hansson(a)linaro.org>
diff --git a/drivers/pmdomain/renesas/r8a77980-sysc.c b/drivers/pmdomain/renesas/r8a77980-sysc.c
index 39ca84a67daa..621e411fc999 100644
--- a/drivers/pmdomain/renesas/r8a77980-sysc.c
+++ b/drivers/pmdomain/renesas/r8a77980-sysc.c
@@ -25,7 +25,8 @@ static const struct rcar_sysc_area r8a77980_areas[] __initconst = {
PD_CPU_NOCR },
{ "ca53-cpu3", 0x200, 3, R8A77980_PD_CA53_CPU3, R8A77980_PD_CA53_SCU,
PD_CPU_NOCR },
- { "cr7", 0x240, 0, R8A77980_PD_CR7, R8A77980_PD_ALWAYS_ON },
+ { "cr7", 0x240, 0, R8A77980_PD_CR7, R8A77980_PD_ALWAYS_ON,
+ PD_CPU_NOCR },
{ "a3ir", 0x180, 0, R8A77980_PD_A3IR, R8A77980_PD_ALWAYS_ON },
{ "a2ir0", 0x400, 0, R8A77980_PD_A2IR0, R8A77980_PD_A3IR },
{ "a2ir1", 0x400, 1, R8A77980_PD_A2IR1, R8A77980_PD_A3IR },
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x 2fe8a236436fe40d8d26a1af8d150fc80f04ee1a
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021907-craziness-snuggle-7e2b@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
2fe8a236436f ("s390/qeth: Fix potential loss of L3-IP@ in case of network issues")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 2fe8a236436fe40d8d26a1af8d150fc80f04ee1a Mon Sep 17 00:00:00 2001
From: Alexandra Winter <wintera(a)linux.ibm.com>
Date: Tue, 6 Feb 2024 09:58:49 +0100
Subject: [PATCH] s390/qeth: Fix potential loss of L3-IP@ in case of network
issues
Symptom:
In case of a bad cable connection (e.g. dirty optics) a fast sequence of
network DOWN-UP-DOWN-UP could happen. UP triggers recovery of the qeth
interface. In case of a second DOWN while recovery is still ongoing, it
can happen that the IP@ of a Layer3 qeth interface is lost and will not
be recovered by the second UP.
Problem:
When registration of IP addresses with Layer 3 qeth devices fails, (e.g.
because of bad address format) the respective IP address is deleted from
its hash-table in the driver. If registration fails because of an ENETDOWN
condition, the address should stay in the hashtable, so a subsequent
recovery can restore it.
3caa4af834df ("qeth: keep ip-address after LAN_OFFLINE failure")
fixes this for registration failures during normal operation, but not
during recovery.
Solution:
Keep L3-IP address in case of ENETDOWN in qeth_l3_recover_ip(). For
consistency with qeth_l3_add_ip() we also keep it in case of EADDRINUSE,
i.e. for some reason the card already/still has this address registered.
Fixes: 4a71df50047f ("qeth: new qeth device driver")
Cc: stable(a)vger.kernel.org
Signed-off-by: Alexandra Winter <wintera(a)linux.ibm.com>
Link: https://lore.kernel.org/r/20240206085849.2902775-1-wintera@linux.ibm.com
Signed-off-by: Paolo Abeni <pabeni(a)redhat.com>
diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c
index b92a32b4b114..04c64ce0a1ca 100644
--- a/drivers/s390/net/qeth_l3_main.c
+++ b/drivers/s390/net/qeth_l3_main.c
@@ -255,9 +255,10 @@ static void qeth_l3_clear_ip_htable(struct qeth_card *card, int recover)
if (!recover) {
hash_del(&addr->hnode);
kfree(addr);
- continue;
+ } else {
+ /* prepare for recovery */
+ addr->disp_flag = QETH_DISP_ADDR_ADD;
}
- addr->disp_flag = QETH_DISP_ADDR_ADD;
}
mutex_unlock(&card->ip_lock);
@@ -278,9 +279,11 @@ static void qeth_l3_recover_ip(struct qeth_card *card)
if (addr->disp_flag == QETH_DISP_ADDR_ADD) {
rc = qeth_l3_register_addr_entry(card, addr);
- if (!rc) {
+ if (!rc || rc == -EADDRINUSE || rc == -ENETDOWN) {
+ /* keep it in the records */
addr->disp_flag = QETH_DISP_ADDR_DO_NOTHING;
} else {
+ /* bad address */
hash_del(&addr->hnode);
kfree(addr);
}
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 2fe8a236436fe40d8d26a1af8d150fc80f04ee1a
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021906-aspirin-starless-d7df@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
2fe8a236436f ("s390/qeth: Fix potential loss of L3-IP@ in case of network issues")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 2fe8a236436fe40d8d26a1af8d150fc80f04ee1a Mon Sep 17 00:00:00 2001
From: Alexandra Winter <wintera(a)linux.ibm.com>
Date: Tue, 6 Feb 2024 09:58:49 +0100
Subject: [PATCH] s390/qeth: Fix potential loss of L3-IP@ in case of network
issues
Symptom:
In case of a bad cable connection (e.g. dirty optics) a fast sequence of
network DOWN-UP-DOWN-UP could happen. UP triggers recovery of the qeth
interface. In case of a second DOWN while recovery is still ongoing, it
can happen that the IP@ of a Layer3 qeth interface is lost and will not
be recovered by the second UP.
Problem:
When registration of IP addresses with Layer 3 qeth devices fails, (e.g.
because of bad address format) the respective IP address is deleted from
its hash-table in the driver. If registration fails because of an ENETDOWN
condition, the address should stay in the hashtable, so a subsequent
recovery can restore it.
3caa4af834df ("qeth: keep ip-address after LAN_OFFLINE failure")
fixes this for registration failures during normal operation, but not
during recovery.
Solution:
Keep L3-IP address in case of ENETDOWN in qeth_l3_recover_ip(). For
consistency with qeth_l3_add_ip() we also keep it in case of EADDRINUSE,
i.e. for some reason the card already/still has this address registered.
Fixes: 4a71df50047f ("qeth: new qeth device driver")
Cc: stable(a)vger.kernel.org
Signed-off-by: Alexandra Winter <wintera(a)linux.ibm.com>
Link: https://lore.kernel.org/r/20240206085849.2902775-1-wintera@linux.ibm.com
Signed-off-by: Paolo Abeni <pabeni(a)redhat.com>
diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c
index b92a32b4b114..04c64ce0a1ca 100644
--- a/drivers/s390/net/qeth_l3_main.c
+++ b/drivers/s390/net/qeth_l3_main.c
@@ -255,9 +255,10 @@ static void qeth_l3_clear_ip_htable(struct qeth_card *card, int recover)
if (!recover) {
hash_del(&addr->hnode);
kfree(addr);
- continue;
+ } else {
+ /* prepare for recovery */
+ addr->disp_flag = QETH_DISP_ADDR_ADD;
}
- addr->disp_flag = QETH_DISP_ADDR_ADD;
}
mutex_unlock(&card->ip_lock);
@@ -278,9 +279,11 @@ static void qeth_l3_recover_ip(struct qeth_card *card)
if (addr->disp_flag == QETH_DISP_ADDR_ADD) {
rc = qeth_l3_register_addr_entry(card, addr);
- if (!rc) {
+ if (!rc || rc == -EADDRINUSE || rc == -ENETDOWN) {
+ /* keep it in the records */
addr->disp_flag = QETH_DISP_ADDR_DO_NOTHING;
} else {
+ /* bad address */
hash_del(&addr->hnode);
kfree(addr);
}
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 46f5ab762d048dad224436978315cbc2fa79c630
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021922-repugnant-crafter-ce03@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
46f5ab762d04 ("fs: relax mount_setattr() permission checks")
87bb5b60019c ("fs: clean up mount_setattr control flow")
a26f788b6e7a ("fs: add mnt_allow_writers() and simplify mount_setattr_prepare()")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 46f5ab762d048dad224436978315cbc2fa79c630 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner(a)kernel.org>
Date: Tue, 6 Feb 2024 11:22:09 +0100
Subject: [PATCH] fs: relax mount_setattr() permission checks
When we added mount_setattr() I added additional checks compared to the
legacy do_reconfigure_mnt() and do_change_type() helpers used by regular
mount(2). If that mount had a parent then verify that the caller and the
mount namespace the mount is attached to match and if not make sure that
it's an anonymous mount.
The real rootfs falls into neither category. It is neither an anonymous
mount because it is obviously attached to the initial mount namespace
but it also obviously doesn't have a parent mount. So that means legacy
mount(2) allows changing mount properties on the real rootfs but
mount_setattr(2) blocks this. I never thought much about this but of
course someone on this planet of earth changes properties on the real
rootfs as can be seen in [1].
Since util-linux finally switched to the new mount api in 2.39 not so
long ago it also relies on mount_setattr() and that surfaced this issue
when Fedora 39 finally switched to it. Fix this.
Link: https://bugzilla.redhat.com/show_bug.cgi?id=2256843
Link: https://lore.kernel.org/r/20240206-vfs-mount-rootfs-v1-1-19b335eee133@kerne…
Reviewed-by: Jan Kara <jack(a)suse.cz>
Reported-by: Karel Zak <kzak(a)redhat.com>
Cc: stable(a)vger.kernel.org # v5.12+
Signed-off-by: Christian Brauner <brauner(a)kernel.org>
diff --git a/fs/namespace.c b/fs/namespace.c
index 437f60e96d40..5a51315c6678 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -4472,10 +4472,15 @@ static int do_mount_setattr(struct path *path, struct mount_kattr *kattr)
/*
* If this is an attached mount make sure it's located in the callers
* mount namespace. If it's not don't let the caller interact with it.
- * If this is a detached mount make sure it has an anonymous mount
- * namespace attached to it, i.e. we've created it via OPEN_TREE_CLONE.
+ *
+ * If this mount doesn't have a parent it's most often simply a
+ * detached mount with an anonymous mount namespace. IOW, something
+ * that's simply not attached yet. But there are apparently also users
+ * that do change mount properties on the rootfs itself. That obviously
+ * neither has a parent nor is it a detached mount so we cannot
+ * unconditionally check for detached mounts.
*/
- if (!(mnt_has_parent(mnt) ? check_mnt(mnt) : is_anon_ns(mnt->mnt_ns)))
+ if ((mnt_has_parent(mnt) || !is_anon_ns(mnt->mnt_ns)) && !check_mnt(mnt))
goto out;
/*
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x fcbe4873089c84da641df75cda9cac2e9addbb4b
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021912-plastic-bannister-f6dc@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
fcbe4873089c ("ASoC: SOF: IPC3: fix message bounds on ipc ops")
12c41c779fad ("ASoC: SOF: Refactor rx function for fuzzing")
989a3e447917 ("ASoC: SOF: ipc3: Check for upper size limit for the received message")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From fcbe4873089c84da641df75cda9cac2e9addbb4b Mon Sep 17 00:00:00 2001
From: Curtis Malainey <cujomalainey(a)chromium.org>
Date: Tue, 13 Feb 2024 14:38:34 +0200
Subject: [PATCH] ASoC: SOF: IPC3: fix message bounds on ipc ops
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
commit 74ad8ed65121 ("ASoC: SOF: ipc3: Implement rx_msg IPC ops")
introduced a new allocation before the upper bounds check in
do_rx_work. As a result a DSP can cause bad allocations if spewing
garbage.
Fixes: 74ad8ed65121 ("ASoC: SOF: ipc3: Implement rx_msg IPC ops")
Reported-by: Tim Van Patten <timvp(a)google.com>
Cc: stable(a)vger.kernel.org
Signed-off-by: Curtis Malainey <cujomalainey(a)chromium.org>
Reviewed-by: Péter Ujfalusi <peter.ujfalusi(a)linux.intel.com>
Reviewed-by: Daniel Baluta <daniel.baluta(a)nxp.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart(a)linux.intel.com>
Signed-off-by: Peter Ujfalusi <peter.ujfalusi(a)linux.intel.com>
Link: https://msgid.link/r/20240213123834.4827-1-peter.ujfalusi@linux.intel.com
Signed-off-by: Mark Brown <broonie(a)kernel.org>
diff --git a/sound/soc/sof/ipc3.c b/sound/soc/sof/ipc3.c
index fb40378ad084..c03dd513fbff 100644
--- a/sound/soc/sof/ipc3.c
+++ b/sound/soc/sof/ipc3.c
@@ -1067,7 +1067,7 @@ static void sof_ipc3_rx_msg(struct snd_sof_dev *sdev)
return;
}
- if (hdr.size < sizeof(hdr)) {
+ if (hdr.size < sizeof(hdr) || hdr.size > SOF_IPC_MSG_MAX_SIZE) {
dev_err(sdev->dev, "The received message size is invalid\n");
return;
}
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x fe752331d4b361d43cfd0b89534b4b2176057c32
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021908-spooky-conductor-a78c@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
fe752331d4b3 ("KVM: s390: vsie: fix race during shadow creation")
c3235e2dd695 ("KVM: s390: add stat counter for shadow gmap events")
0130337ec45b ("KVM: s390: Cleanup ipte lock access and SIIF facility checks")
73f91b004321 ("KVM: s390: pci: enable host forwarding of Adapter Event Notifications")
98b1d33dac5f ("KVM: s390: pci: do initial setup for AEN interpretation")
6438e30714ab ("KVM: s390: pci: add basic kvm_zdev structure")
062f002485d4 ("s390/pci: externalize the SIC operation controls and routine")
d2197485a188 ("s390/airq: pass more TPI info to airq handlers")
61380a7adfce ("KVM: s390: handle_tprot: Honor storage keys")
e613d83454d7 ("KVM: s390: Honor storage keys when accessing guest memory")
79e06c4c4950 ("Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From fe752331d4b361d43cfd0b89534b4b2176057c32 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger(a)linux.ibm.com>
Date: Wed, 20 Dec 2023 13:53:17 +0100
Subject: [PATCH] KVM: s390: vsie: fix race during shadow creation
Right now it is possible to see gmap->private being zero in
kvm_s390_vsie_gmap_notifier resulting in a crash. This is due to the
fact that we add gmap->private == kvm after creation:
static int acquire_gmap_shadow(struct kvm_vcpu *vcpu,
struct vsie_page *vsie_page)
{
[...]
gmap = gmap_shadow(vcpu->arch.gmap, asce, edat);
if (IS_ERR(gmap))
return PTR_ERR(gmap);
gmap->private = vcpu->kvm;
Let children inherit the private field of the parent.
Reported-by: Marc Hartmayer <mhartmay(a)linux.ibm.com>
Fixes: a3508fbe9dc6 ("KVM: s390: vsie: initial support for nested virtualization")
Cc: <stable(a)vger.kernel.org>
Cc: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Janosch Frank <frankja(a)linux.ibm.com>
Reviewed-by: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Claudio Imbrenda <imbrenda(a)linux.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger(a)linux.ibm.com>
Link: https://lore.kernel.org/r/20231220125317.4258-1-borntraeger@linux.ibm.com
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index 8207a892bbe2..db9a180de65f 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -1220,7 +1220,6 @@ static int acquire_gmap_shadow(struct kvm_vcpu *vcpu,
gmap = gmap_shadow(vcpu->arch.gmap, asce, edat);
if (IS_ERR(gmap))
return PTR_ERR(gmap);
- gmap->private = vcpu->kvm;
vcpu->kvm->stat.gmap_shadow_create++;
WRITE_ONCE(vsie_page->gmap, gmap);
return 0;
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 6f96b5a71c63..8da39deb56ca 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -1691,6 +1691,7 @@ struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
return ERR_PTR(-ENOMEM);
new->mm = parent->mm;
new->parent = gmap_get(parent);
+ new->private = parent->private;
new->orig_asce = asce;
new->edat_level = edat_level;
new->initialized = false;
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x fe752331d4b361d43cfd0b89534b4b2176057c32
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021907-crabgrass-chase-b674@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
fe752331d4b3 ("KVM: s390: vsie: fix race during shadow creation")
c3235e2dd695 ("KVM: s390: add stat counter for shadow gmap events")
0130337ec45b ("KVM: s390: Cleanup ipte lock access and SIIF facility checks")
73f91b004321 ("KVM: s390: pci: enable host forwarding of Adapter Event Notifications")
98b1d33dac5f ("KVM: s390: pci: do initial setup for AEN interpretation")
6438e30714ab ("KVM: s390: pci: add basic kvm_zdev structure")
062f002485d4 ("s390/pci: externalize the SIC operation controls and routine")
d2197485a188 ("s390/airq: pass more TPI info to airq handlers")
61380a7adfce ("KVM: s390: handle_tprot: Honor storage keys")
e613d83454d7 ("KVM: s390: Honor storage keys when accessing guest memory")
79e06c4c4950 ("Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From fe752331d4b361d43cfd0b89534b4b2176057c32 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger(a)linux.ibm.com>
Date: Wed, 20 Dec 2023 13:53:17 +0100
Subject: [PATCH] KVM: s390: vsie: fix race during shadow creation
Right now it is possible to see gmap->private being zero in
kvm_s390_vsie_gmap_notifier resulting in a crash. This is due to the
fact that we add gmap->private == kvm after creation:
static int acquire_gmap_shadow(struct kvm_vcpu *vcpu,
struct vsie_page *vsie_page)
{
[...]
gmap = gmap_shadow(vcpu->arch.gmap, asce, edat);
if (IS_ERR(gmap))
return PTR_ERR(gmap);
gmap->private = vcpu->kvm;
Let children inherit the private field of the parent.
Reported-by: Marc Hartmayer <mhartmay(a)linux.ibm.com>
Fixes: a3508fbe9dc6 ("KVM: s390: vsie: initial support for nested virtualization")
Cc: <stable(a)vger.kernel.org>
Cc: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Janosch Frank <frankja(a)linux.ibm.com>
Reviewed-by: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Claudio Imbrenda <imbrenda(a)linux.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger(a)linux.ibm.com>
Link: https://lore.kernel.org/r/20231220125317.4258-1-borntraeger@linux.ibm.com
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index 8207a892bbe2..db9a180de65f 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -1220,7 +1220,6 @@ static int acquire_gmap_shadow(struct kvm_vcpu *vcpu,
gmap = gmap_shadow(vcpu->arch.gmap, asce, edat);
if (IS_ERR(gmap))
return PTR_ERR(gmap);
- gmap->private = vcpu->kvm;
vcpu->kvm->stat.gmap_shadow_create++;
WRITE_ONCE(vsie_page->gmap, gmap);
return 0;
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 6f96b5a71c63..8da39deb56ca 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -1691,6 +1691,7 @@ struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
return ERR_PTR(-ENOMEM);
new->mm = parent->mm;
new->parent = gmap_get(parent);
+ new->private = parent->private;
new->orig_asce = asce;
new->edat_level = edat_level;
new->initialized = false;
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x fe752331d4b361d43cfd0b89534b4b2176057c32
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021906-gains-squeegee-612d@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
fe752331d4b3 ("KVM: s390: vsie: fix race during shadow creation")
c3235e2dd695 ("KVM: s390: add stat counter for shadow gmap events")
0130337ec45b ("KVM: s390: Cleanup ipte lock access and SIIF facility checks")
73f91b004321 ("KVM: s390: pci: enable host forwarding of Adapter Event Notifications")
98b1d33dac5f ("KVM: s390: pci: do initial setup for AEN interpretation")
6438e30714ab ("KVM: s390: pci: add basic kvm_zdev structure")
062f002485d4 ("s390/pci: externalize the SIC operation controls and routine")
d2197485a188 ("s390/airq: pass more TPI info to airq handlers")
61380a7adfce ("KVM: s390: handle_tprot: Honor storage keys")
e613d83454d7 ("KVM: s390: Honor storage keys when accessing guest memory")
79e06c4c4950 ("Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From fe752331d4b361d43cfd0b89534b4b2176057c32 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger(a)linux.ibm.com>
Date: Wed, 20 Dec 2023 13:53:17 +0100
Subject: [PATCH] KVM: s390: vsie: fix race during shadow creation
Right now it is possible to see gmap->private being zero in
kvm_s390_vsie_gmap_notifier resulting in a crash. This is due to the
fact that we add gmap->private == kvm after creation:
static int acquire_gmap_shadow(struct kvm_vcpu *vcpu,
struct vsie_page *vsie_page)
{
[...]
gmap = gmap_shadow(vcpu->arch.gmap, asce, edat);
if (IS_ERR(gmap))
return PTR_ERR(gmap);
gmap->private = vcpu->kvm;
Let children inherit the private field of the parent.
Reported-by: Marc Hartmayer <mhartmay(a)linux.ibm.com>
Fixes: a3508fbe9dc6 ("KVM: s390: vsie: initial support for nested virtualization")
Cc: <stable(a)vger.kernel.org>
Cc: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Janosch Frank <frankja(a)linux.ibm.com>
Reviewed-by: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Claudio Imbrenda <imbrenda(a)linux.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger(a)linux.ibm.com>
Link: https://lore.kernel.org/r/20231220125317.4258-1-borntraeger@linux.ibm.com
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index 8207a892bbe2..db9a180de65f 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -1220,7 +1220,6 @@ static int acquire_gmap_shadow(struct kvm_vcpu *vcpu,
gmap = gmap_shadow(vcpu->arch.gmap, asce, edat);
if (IS_ERR(gmap))
return PTR_ERR(gmap);
- gmap->private = vcpu->kvm;
vcpu->kvm->stat.gmap_shadow_create++;
WRITE_ONCE(vsie_page->gmap, gmap);
return 0;
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 6f96b5a71c63..8da39deb56ca 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -1691,6 +1691,7 @@ struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
return ERR_PTR(-ENOMEM);
new->mm = parent->mm;
new->parent = gmap_get(parent);
+ new->private = parent->private;
new->orig_asce = asce;
new->edat_level = edat_level;
new->initialized = false;
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x fe752331d4b361d43cfd0b89534b4b2176057c32
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021905-spyglass-handed-15bd@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
fe752331d4b3 ("KVM: s390: vsie: fix race during shadow creation")
c3235e2dd695 ("KVM: s390: add stat counter for shadow gmap events")
0130337ec45b ("KVM: s390: Cleanup ipte lock access and SIIF facility checks")
73f91b004321 ("KVM: s390: pci: enable host forwarding of Adapter Event Notifications")
98b1d33dac5f ("KVM: s390: pci: do initial setup for AEN interpretation")
6438e30714ab ("KVM: s390: pci: add basic kvm_zdev structure")
062f002485d4 ("s390/pci: externalize the SIC operation controls and routine")
d2197485a188 ("s390/airq: pass more TPI info to airq handlers")
61380a7adfce ("KVM: s390: handle_tprot: Honor storage keys")
e613d83454d7 ("KVM: s390: Honor storage keys when accessing guest memory")
79e06c4c4950 ("Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From fe752331d4b361d43cfd0b89534b4b2176057c32 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger(a)linux.ibm.com>
Date: Wed, 20 Dec 2023 13:53:17 +0100
Subject: [PATCH] KVM: s390: vsie: fix race during shadow creation
Right now it is possible to see gmap->private being zero in
kvm_s390_vsie_gmap_notifier resulting in a crash. This is due to the
fact that we add gmap->private == kvm after creation:
static int acquire_gmap_shadow(struct kvm_vcpu *vcpu,
struct vsie_page *vsie_page)
{
[...]
gmap = gmap_shadow(vcpu->arch.gmap, asce, edat);
if (IS_ERR(gmap))
return PTR_ERR(gmap);
gmap->private = vcpu->kvm;
Let children inherit the private field of the parent.
Reported-by: Marc Hartmayer <mhartmay(a)linux.ibm.com>
Fixes: a3508fbe9dc6 ("KVM: s390: vsie: initial support for nested virtualization")
Cc: <stable(a)vger.kernel.org>
Cc: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Janosch Frank <frankja(a)linux.ibm.com>
Reviewed-by: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Claudio Imbrenda <imbrenda(a)linux.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger(a)linux.ibm.com>
Link: https://lore.kernel.org/r/20231220125317.4258-1-borntraeger@linux.ibm.com
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index 8207a892bbe2..db9a180de65f 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -1220,7 +1220,6 @@ static int acquire_gmap_shadow(struct kvm_vcpu *vcpu,
gmap = gmap_shadow(vcpu->arch.gmap, asce, edat);
if (IS_ERR(gmap))
return PTR_ERR(gmap);
- gmap->private = vcpu->kvm;
vcpu->kvm->stat.gmap_shadow_create++;
WRITE_ONCE(vsie_page->gmap, gmap);
return 0;
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 6f96b5a71c63..8da39deb56ca 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -1691,6 +1691,7 @@ struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
return ERR_PTR(-ENOMEM);
new->mm = parent->mm;
new->parent = gmap_get(parent);
+ new->private = parent->private;
new->orig_asce = asce;
new->edat_level = edat_level;
new->initialized = false;
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x fe752331d4b361d43cfd0b89534b4b2176057c32
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021904-gestate-update-2b10@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
fe752331d4b3 ("KVM: s390: vsie: fix race during shadow creation")
c3235e2dd695 ("KVM: s390: add stat counter for shadow gmap events")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From fe752331d4b361d43cfd0b89534b4b2176057c32 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger(a)linux.ibm.com>
Date: Wed, 20 Dec 2023 13:53:17 +0100
Subject: [PATCH] KVM: s390: vsie: fix race during shadow creation
Right now it is possible to see gmap->private being zero in
kvm_s390_vsie_gmap_notifier resulting in a crash. This is due to the
fact that we add gmap->private == kvm after creation:
static int acquire_gmap_shadow(struct kvm_vcpu *vcpu,
struct vsie_page *vsie_page)
{
[...]
gmap = gmap_shadow(vcpu->arch.gmap, asce, edat);
if (IS_ERR(gmap))
return PTR_ERR(gmap);
gmap->private = vcpu->kvm;
Let children inherit the private field of the parent.
Reported-by: Marc Hartmayer <mhartmay(a)linux.ibm.com>
Fixes: a3508fbe9dc6 ("KVM: s390: vsie: initial support for nested virtualization")
Cc: <stable(a)vger.kernel.org>
Cc: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Janosch Frank <frankja(a)linux.ibm.com>
Reviewed-by: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Claudio Imbrenda <imbrenda(a)linux.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger(a)linux.ibm.com>
Link: https://lore.kernel.org/r/20231220125317.4258-1-borntraeger@linux.ibm.com
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index 8207a892bbe2..db9a180de65f 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -1220,7 +1220,6 @@ static int acquire_gmap_shadow(struct kvm_vcpu *vcpu,
gmap = gmap_shadow(vcpu->arch.gmap, asce, edat);
if (IS_ERR(gmap))
return PTR_ERR(gmap);
- gmap->private = vcpu->kvm;
vcpu->kvm->stat.gmap_shadow_create++;
WRITE_ONCE(vsie_page->gmap, gmap);
return 0;
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 6f96b5a71c63..8da39deb56ca 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -1691,6 +1691,7 @@ struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
return ERR_PTR(-ENOMEM);
new->mm = parent->mm;
new->parent = gmap_get(parent);
+ new->private = parent->private;
new->orig_asce = asce;
new->edat_level = edat_level;
new->initialized = false;
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x fe752331d4b361d43cfd0b89534b4b2176057c32
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021903-knapsack-change-c3a4@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
fe752331d4b3 ("KVM: s390: vsie: fix race during shadow creation")
c3235e2dd695 ("KVM: s390: add stat counter for shadow gmap events")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From fe752331d4b361d43cfd0b89534b4b2176057c32 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger(a)linux.ibm.com>
Date: Wed, 20 Dec 2023 13:53:17 +0100
Subject: [PATCH] KVM: s390: vsie: fix race during shadow creation
Right now it is possible to see gmap->private being zero in
kvm_s390_vsie_gmap_notifier resulting in a crash. This is due to the
fact that we add gmap->private == kvm after creation:
static int acquire_gmap_shadow(struct kvm_vcpu *vcpu,
struct vsie_page *vsie_page)
{
[...]
gmap = gmap_shadow(vcpu->arch.gmap, asce, edat);
if (IS_ERR(gmap))
return PTR_ERR(gmap);
gmap->private = vcpu->kvm;
Let children inherit the private field of the parent.
Reported-by: Marc Hartmayer <mhartmay(a)linux.ibm.com>
Fixes: a3508fbe9dc6 ("KVM: s390: vsie: initial support for nested virtualization")
Cc: <stable(a)vger.kernel.org>
Cc: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Janosch Frank <frankja(a)linux.ibm.com>
Reviewed-by: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Claudio Imbrenda <imbrenda(a)linux.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger(a)linux.ibm.com>
Link: https://lore.kernel.org/r/20231220125317.4258-1-borntraeger@linux.ibm.com
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index 8207a892bbe2..db9a180de65f 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -1220,7 +1220,6 @@ static int acquire_gmap_shadow(struct kvm_vcpu *vcpu,
gmap = gmap_shadow(vcpu->arch.gmap, asce, edat);
if (IS_ERR(gmap))
return PTR_ERR(gmap);
- gmap->private = vcpu->kvm;
vcpu->kvm->stat.gmap_shadow_create++;
WRITE_ONCE(vsie_page->gmap, gmap);
return 0;
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 6f96b5a71c63..8da39deb56ca 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -1691,6 +1691,7 @@ struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
return ERR_PTR(-ENOMEM);
new->mm = parent->mm;
new->parent = gmap_get(parent);
+ new->private = parent->private;
new->orig_asce = asce;
new->edat_level = edat_level;
new->initialized = false;
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 4508ec17357094e2075f334948393ddedbb75157
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021928-granite-partake-3387@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
4508ec173570 ("smb: client: set correct id, uid and cruid for multiuser automounts")
561f82a3a24c ("smb: client: rename cifs_dfs_ref.c to namespace.c")
38c8a9a52082 ("smb: move client and server files to common directory fs/smb")
b56bce502f55 ("cifs: set DFS root session in cifs_get_smb_ses()")
7ad54b98fc1f ("cifs: use origin fullpath for automounts")
a1c0d00572fc ("cifs: share dfs connections and supers")
a73a26d97eca ("cifs: split out ses and tcon retrieval from mount_get_conns()")
2301bc103ac4 ("cifs: remove unused smb3_fs_context::mount_options")
abdb1742a312 ("cifs: get rid of mount options string parsing")
9fd29a5bae6e ("cifs: use fs_context for automounts")
68e14569d7e5 ("smb3: add dynamic trace points for tree disconnect")
13609a8b3ac6 ("cifs: move from strlcpy with unused retval to strscpy")
5dd8ce24667a ("cifs: missing directory in MAINTAINERS file")
332019e23a51 ("Merge tag '5.20-rc-smb3-client-fixes-part2' of git://git.samba.org/sfrench/cifs-2.6")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 4508ec17357094e2075f334948393ddedbb75157 Mon Sep 17 00:00:00 2001
From: Paulo Alcantara <pc(a)manguebit.com>
Date: Sun, 11 Feb 2024 20:19:30 -0300
Subject: [PATCH] smb: client: set correct id, uid and cruid for multiuser
automounts
When uid, gid and cruid are not specified, we need to dynamically
set them into the filesystem context used for automounting otherwise
they'll end up reusing the values from the parent mount.
Fixes: 9fd29a5bae6e ("cifs: use fs_context for automounts")
Reported-by: Shane Nehring <snehring(a)iastate.edu>
Closes: https://bugzilla.redhat.com/show_bug.cgi?id=2259257
Cc: stable(a)vger.kernel.org # 6.2+
Signed-off-by: Paulo Alcantara (Red Hat) <pc(a)manguebit.com>
Signed-off-by: Steve French <stfrench(a)microsoft.com>
diff --git a/fs/smb/client/namespace.c b/fs/smb/client/namespace.c
index a6968573b775..4a517b280f2b 100644
--- a/fs/smb/client/namespace.c
+++ b/fs/smb/client/namespace.c
@@ -168,6 +168,21 @@ static char *automount_fullpath(struct dentry *dentry, void *page)
return s;
}
+static void fs_context_set_ids(struct smb3_fs_context *ctx)
+{
+ kuid_t uid = current_fsuid();
+ kgid_t gid = current_fsgid();
+
+ if (ctx->multiuser) {
+ if (!ctx->uid_specified)
+ ctx->linux_uid = uid;
+ if (!ctx->gid_specified)
+ ctx->linux_gid = gid;
+ }
+ if (!ctx->cruid_specified)
+ ctx->cred_uid = uid;
+}
+
/*
* Create a vfsmount that we can automount
*/
@@ -205,6 +220,7 @@ static struct vfsmount *cifs_do_automount(struct path *path)
tmp.leaf_fullpath = NULL;
tmp.UNC = tmp.prepath = NULL;
tmp.dfs_root_ses = NULL;
+ fs_context_set_ids(&tmp);
rc = smb3_fs_context_dup(ctx, &tmp);
if (rc) {
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 4508ec17357094e2075f334948393ddedbb75157
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021928-dragster-release-24a1@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
4508ec173570 ("smb: client: set correct id, uid and cruid for multiuser automounts")
561f82a3a24c ("smb: client: rename cifs_dfs_ref.c to namespace.c")
38c8a9a52082 ("smb: move client and server files to common directory fs/smb")
b56bce502f55 ("cifs: set DFS root session in cifs_get_smb_ses()")
7ad54b98fc1f ("cifs: use origin fullpath for automounts")
a1c0d00572fc ("cifs: share dfs connections and supers")
a73a26d97eca ("cifs: split out ses and tcon retrieval from mount_get_conns()")
2301bc103ac4 ("cifs: remove unused smb3_fs_context::mount_options")
abdb1742a312 ("cifs: get rid of mount options string parsing")
9fd29a5bae6e ("cifs: use fs_context for automounts")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 4508ec17357094e2075f334948393ddedbb75157 Mon Sep 17 00:00:00 2001
From: Paulo Alcantara <pc(a)manguebit.com>
Date: Sun, 11 Feb 2024 20:19:30 -0300
Subject: [PATCH] smb: client: set correct id, uid and cruid for multiuser
automounts
When uid, gid and cruid are not specified, we need to dynamically
set them into the filesystem context used for automounting; otherwise
they'll end up reusing the values from the parent mount.
Fixes: 9fd29a5bae6e ("cifs: use fs_context for automounts")
Reported-by: Shane Nehring <snehring(a)iastate.edu>
Closes: https://bugzilla.redhat.com/show_bug.cgi?id=2259257
Cc: stable(a)vger.kernel.org # 6.2+
Signed-off-by: Paulo Alcantara (Red Hat) <pc(a)manguebit.com>
Signed-off-by: Steve French <stfrench(a)microsoft.com>
diff --git a/fs/smb/client/namespace.c b/fs/smb/client/namespace.c
index a6968573b775..4a517b280f2b 100644
--- a/fs/smb/client/namespace.c
+++ b/fs/smb/client/namespace.c
@@ -168,6 +168,21 @@ static char *automount_fullpath(struct dentry *dentry, void *page)
return s;
}
+static void fs_context_set_ids(struct smb3_fs_context *ctx)
+{
+ kuid_t uid = current_fsuid();
+ kgid_t gid = current_fsgid();
+
+ if (ctx->multiuser) {
+ if (!ctx->uid_specified)
+ ctx->linux_uid = uid;
+ if (!ctx->gid_specified)
+ ctx->linux_gid = gid;
+ }
+ if (!ctx->cruid_specified)
+ ctx->cred_uid = uid;
+}
+
/*
* Create a vfsmount that we can automount
*/
@@ -205,6 +220,7 @@ static struct vfsmount *cifs_do_automount(struct path *path)
tmp.leaf_fullpath = NULL;
tmp.UNC = tmp.prepath = NULL;
tmp.dfs_root_ses = NULL;
+ fs_context_set_ids(&tmp);
rc = smb3_fs_context_dup(ctx, &tmp);
if (rc) {
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x ec4d82f855ce332de26fe080892483de98cc1a19
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021902-gecko-caloric-cd8b@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
ec4d82f855ce ("thunderbolt: Fix setting the CNS bit in ROUTER_CS_5")
d49b4f043d63 ("thunderbolt: Add support for enhanced uni-directional TMU mode")
bdc6660e553a ("thunderbolt: Do not call CLx functions from TMU code")
12a14f2fca32 ("thunderbolt: Move CLx support functions into clx.c")
ef34add89ee4 ("thunderbolt: Check valid TMU configuration in tb_switch_tmu_configure()")
4e7b4955cba1 ("thunderbolt: Move tb_enable_tmu() close to other TMU functions")
20c2fae9dbe3 ("thunderbolt: Move TMU configuration to tb_enable_tmu()")
7d283f4148f1 ("thunderbolt: Get rid of tb_switch_enable_tmu_1st_child()")
701e73a823bb ("thunderbolt: Rework Titan Ridge TMU objection disable function")
826f55d50de9 ("thunderbolt: Drop useless 'unidirectional' parameter from tb_switch_tmu_is_enabled()")
c437dcb18310 ("thunderbolt: Fix a couple of style issues in TMU code")
7ce542219b63 ("thunderbolt: Introduce tb_switch_downstream_port()")
3fe95742af29 ("thunderbolt: Do not touch CL state configuration during discovery")
d31137619776 ("thunderbolt: Use correct type in tb_port_is_clx_enabled() prototype")
d0f1e0c2a699 ("thunderbolt: Add support for receiver lane margining")
b12d2955e732 ("thunderbolt: Add helper to check if CL states are enabled on port")
3846d011403b ("thunderbolt: Pass CL state bitmask to tb_port_clx_supported()")
95f8f1cbc87b ("thunderbolt: Move port CL state functions into correct place in switch.c")
b60e31bf18a7 ("thunderbolt: Add DP OUT resource when DP tunnel is discovered")
9e2e5ea3b28f ("Merge tag 'usb-6.0-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From ec4d82f855ce332de26fe080892483de98cc1a19 Mon Sep 17 00:00:00 2001
From: Mohammad Rahimi <rahimi.mhmmd(a)gmail.com>
Date: Sat, 27 Jan 2024 11:26:28 +0800
Subject: [PATCH] thunderbolt: Fix setting the CNS bit in ROUTER_CS_5
The bit 23, CM TBT3 Not Supported (CNS), in ROUTER_CS_5 indicates
whether a USB4 Connection Manager is TBT3-Compatible and should be:
0b for TBT3-Compatible
1b for Not TBT3-Compatible
Fixes: b04079837b20 ("thunderbolt: Add initial support for USB4")
Cc: stable(a)vger.kernel.org
Signed-off-by: Mohammad Rahimi <rahimi.mhmmd(a)gmail.com>
Signed-off-by: Mika Westerberg <mika.westerberg(a)linux.intel.com>
diff --git a/drivers/thunderbolt/tb_regs.h b/drivers/thunderbolt/tb_regs.h
index 87e4795275fe..6f798f6a2b84 100644
--- a/drivers/thunderbolt/tb_regs.h
+++ b/drivers/thunderbolt/tb_regs.h
@@ -203,7 +203,7 @@ struct tb_regs_switch_header {
#define ROUTER_CS_5_WOP BIT(1)
#define ROUTER_CS_5_WOU BIT(2)
#define ROUTER_CS_5_WOD BIT(3)
-#define ROUTER_CS_5_C3S BIT(23)
+#define ROUTER_CS_5_CNS BIT(23)
#define ROUTER_CS_5_PTO BIT(24)
#define ROUTER_CS_5_UTO BIT(25)
#define ROUTER_CS_5_HCO BIT(26)
diff --git a/drivers/thunderbolt/usb4.c b/drivers/thunderbolt/usb4.c
index f8f0d24ff6e4..1515eff8cc3e 100644
--- a/drivers/thunderbolt/usb4.c
+++ b/drivers/thunderbolt/usb4.c
@@ -290,7 +290,7 @@ int usb4_switch_setup(struct tb_switch *sw)
}
/* TBT3 supported by the CM */
- val |= ROUTER_CS_5_C3S;
+ val &= ~ROUTER_CS_5_CNS;
return tb_sw_write(sw, &val, TB_CFG_SWITCH, ROUTER_CS_5, 1);
}
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x ec4d82f855ce332de26fe080892483de98cc1a19
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021900-paprika-revisit-a716@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
ec4d82f855ce ("thunderbolt: Fix setting the CNS bit in ROUTER_CS_5")
d49b4f043d63 ("thunderbolt: Add support for enhanced uni-directional TMU mode")
bdc6660e553a ("thunderbolt: Do not call CLx functions from TMU code")
12a14f2fca32 ("thunderbolt: Move CLx support functions into clx.c")
ef34add89ee4 ("thunderbolt: Check valid TMU configuration in tb_switch_tmu_configure()")
4e7b4955cba1 ("thunderbolt: Move tb_enable_tmu() close to other TMU functions")
20c2fae9dbe3 ("thunderbolt: Move TMU configuration to tb_enable_tmu()")
7d283f4148f1 ("thunderbolt: Get rid of tb_switch_enable_tmu_1st_child()")
701e73a823bb ("thunderbolt: Rework Titan Ridge TMU objection disable function")
826f55d50de9 ("thunderbolt: Drop useless 'unidirectional' parameter from tb_switch_tmu_is_enabled()")
c437dcb18310 ("thunderbolt: Fix a couple of style issues in TMU code")
7ce542219b63 ("thunderbolt: Introduce tb_switch_downstream_port()")
3fe95742af29 ("thunderbolt: Do not touch CL state configuration during discovery")
d31137619776 ("thunderbolt: Use correct type in tb_port_is_clx_enabled() prototype")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From ec4d82f855ce332de26fe080892483de98cc1a19 Mon Sep 17 00:00:00 2001
From: Mohammad Rahimi <rahimi.mhmmd(a)gmail.com>
Date: Sat, 27 Jan 2024 11:26:28 +0800
Subject: [PATCH] thunderbolt: Fix setting the CNS bit in ROUTER_CS_5
The bit 23, CM TBT3 Not Supported (CNS), in ROUTER_CS_5 indicates
whether a USB4 Connection Manager is TBT3-Compatible and should be:
0b for TBT3-Compatible
1b for Not TBT3-Compatible
Fixes: b04079837b20 ("thunderbolt: Add initial support for USB4")
Cc: stable(a)vger.kernel.org
Signed-off-by: Mohammad Rahimi <rahimi.mhmmd(a)gmail.com>
Signed-off-by: Mika Westerberg <mika.westerberg(a)linux.intel.com>
diff --git a/drivers/thunderbolt/tb_regs.h b/drivers/thunderbolt/tb_regs.h
index 87e4795275fe..6f798f6a2b84 100644
--- a/drivers/thunderbolt/tb_regs.h
+++ b/drivers/thunderbolt/tb_regs.h
@@ -203,7 +203,7 @@ struct tb_regs_switch_header {
#define ROUTER_CS_5_WOP BIT(1)
#define ROUTER_CS_5_WOU BIT(2)
#define ROUTER_CS_5_WOD BIT(3)
-#define ROUTER_CS_5_C3S BIT(23)
+#define ROUTER_CS_5_CNS BIT(23)
#define ROUTER_CS_5_PTO BIT(24)
#define ROUTER_CS_5_UTO BIT(25)
#define ROUTER_CS_5_HCO BIT(26)
diff --git a/drivers/thunderbolt/usb4.c b/drivers/thunderbolt/usb4.c
index f8f0d24ff6e4..1515eff8cc3e 100644
--- a/drivers/thunderbolt/usb4.c
+++ b/drivers/thunderbolt/usb4.c
@@ -290,7 +290,7 @@ int usb4_switch_setup(struct tb_switch *sw)
}
/* TBT3 supported by the CM */
- val |= ROUTER_CS_5_C3S;
+ val &= ~ROUTER_CS_5_CNS;
return tb_sw_write(sw, &val, TB_CFG_SWITCH, ROUTER_CS_5, 1);
}
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x ec4d82f855ce332de26fe080892483de98cc1a19
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021901-java-rejoice-9b51@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
ec4d82f855ce ("thunderbolt: Fix setting the CNS bit in ROUTER_CS_5")
d49b4f043d63 ("thunderbolt: Add support for enhanced uni-directional TMU mode")
bdc6660e553a ("thunderbolt: Do not call CLx functions from TMU code")
12a14f2fca32 ("thunderbolt: Move CLx support functions into clx.c")
ef34add89ee4 ("thunderbolt: Check valid TMU configuration in tb_switch_tmu_configure()")
4e7b4955cba1 ("thunderbolt: Move tb_enable_tmu() close to other TMU functions")
20c2fae9dbe3 ("thunderbolt: Move TMU configuration to tb_enable_tmu()")
7d283f4148f1 ("thunderbolt: Get rid of tb_switch_enable_tmu_1st_child()")
701e73a823bb ("thunderbolt: Rework Titan Ridge TMU objection disable function")
826f55d50de9 ("thunderbolt: Drop useless 'unidirectional' parameter from tb_switch_tmu_is_enabled()")
c437dcb18310 ("thunderbolt: Fix a couple of style issues in TMU code")
7ce542219b63 ("thunderbolt: Introduce tb_switch_downstream_port()")
3fe95742af29 ("thunderbolt: Do not touch CL state configuration during discovery")
d31137619776 ("thunderbolt: Use correct type in tb_port_is_clx_enabled() prototype")
d0f1e0c2a699 ("thunderbolt: Add support for receiver lane margining")
b12d2955e732 ("thunderbolt: Add helper to check if CL states are enabled on port")
3846d011403b ("thunderbolt: Pass CL state bitmask to tb_port_clx_supported()")
95f8f1cbc87b ("thunderbolt: Move port CL state functions into correct place in switch.c")
b60e31bf18a7 ("thunderbolt: Add DP OUT resource when DP tunnel is discovered")
9e2e5ea3b28f ("Merge tag 'usb-6.0-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From ec4d82f855ce332de26fe080892483de98cc1a19 Mon Sep 17 00:00:00 2001
From: Mohammad Rahimi <rahimi.mhmmd(a)gmail.com>
Date: Sat, 27 Jan 2024 11:26:28 +0800
Subject: [PATCH] thunderbolt: Fix setting the CNS bit in ROUTER_CS_5
The bit 23, CM TBT3 Not Supported (CNS), in ROUTER_CS_5 indicates
whether a USB4 Connection Manager is TBT3-Compatible and should be:
0b for TBT3-Compatible
1b for Not TBT3-Compatible
Fixes: b04079837b20 ("thunderbolt: Add initial support for USB4")
Cc: stable(a)vger.kernel.org
Signed-off-by: Mohammad Rahimi <rahimi.mhmmd(a)gmail.com>
Signed-off-by: Mika Westerberg <mika.westerberg(a)linux.intel.com>
diff --git a/drivers/thunderbolt/tb_regs.h b/drivers/thunderbolt/tb_regs.h
index 87e4795275fe..6f798f6a2b84 100644
--- a/drivers/thunderbolt/tb_regs.h
+++ b/drivers/thunderbolt/tb_regs.h
@@ -203,7 +203,7 @@ struct tb_regs_switch_header {
#define ROUTER_CS_5_WOP BIT(1)
#define ROUTER_CS_5_WOU BIT(2)
#define ROUTER_CS_5_WOD BIT(3)
-#define ROUTER_CS_5_C3S BIT(23)
+#define ROUTER_CS_5_CNS BIT(23)
#define ROUTER_CS_5_PTO BIT(24)
#define ROUTER_CS_5_UTO BIT(25)
#define ROUTER_CS_5_HCO BIT(26)
diff --git a/drivers/thunderbolt/usb4.c b/drivers/thunderbolt/usb4.c
index f8f0d24ff6e4..1515eff8cc3e 100644
--- a/drivers/thunderbolt/usb4.c
+++ b/drivers/thunderbolt/usb4.c
@@ -290,7 +290,7 @@ int usb4_switch_setup(struct tb_switch *sw)
}
/* TBT3 supported by the CM */
- val |= ROUTER_CS_5_C3S;
+ val &= ~ROUTER_CS_5_CNS;
return tb_sw_write(sw, &val, TB_CFG_SWITCH, ROUTER_CS_5, 1);
}
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x b5d1b4b46f856da1473c7ba9a5cdfcb55c9b2478
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021947-penholder-identify-b98c@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
b5d1b4b46f85 ("PCI: dwc: Fix a 64bit bug in dw_pcie_ep_raise_msix_irq()")
2217fffcd63f ("PCI: dwc: endpoint: Fix dw_pcie_ep_raise_msix_irq() alignment support")
92af77ca26f7 ("PCI: dwc: Use FIELD_GET/PREP()")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b5d1b4b46f856da1473c7ba9a5cdfcb55c9b2478 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter(a)linaro.org>
Date: Fri, 26 Jan 2024 11:40:37 +0300
Subject: [PATCH] PCI: dwc: Fix a 64bit bug in dw_pcie_ep_raise_msix_irq()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The "msg_addr" variable is u64. However, the "aligned_offset" is an
unsigned int. This means that when the code does:
msg_addr &= ~aligned_offset;
it will unintentionally zero out the high 32 bits. Use ALIGN_DOWN() to do
the alignment instead.
Fixes: 2217fffcd63f ("PCI: dwc: endpoint: Fix dw_pcie_ep_raise_msix_irq() alignment support")
Link: https://lore.kernel.org/r/af59c7ad-ab93-40f7-ad4a-7ac0b14d37f5@moroto.mount…
Signed-off-by: Dan Carpenter <dan.carpenter(a)linaro.org>
Signed-off-by: Bjorn Helgaas <bhelgaas(a)google.com>
Reviewed-by: Niklas Cassel <cassel(a)kernel.org>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen(a)linux.intel.com>
Reviewed-by: Manivannan Sadhasivam <manivannan.sadhasivam(a)linaro.org>
Cc: <stable(a)vger.kernel.org>
diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c b/drivers/pci/controller/dwc/pcie-designware-ep.c
index 5befed2dc02b..d6b66597101e 100644
--- a/drivers/pci/controller/dwc/pcie-designware-ep.c
+++ b/drivers/pci/controller/dwc/pcie-designware-ep.c
@@ -6,6 +6,7 @@
* Author: Kishon Vijay Abraham I <kishon(a)ti.com>
*/
+#include <linux/align.h>
#include <linux/bitfield.h>
#include <linux/of.h>
#include <linux/platform_device.h>
@@ -551,7 +552,7 @@ int dw_pcie_ep_raise_msix_irq(struct dw_pcie_ep *ep, u8 func_no,
}
aligned_offset = msg_addr & (epc->mem->window.page_size - 1);
- msg_addr &= ~aligned_offset;
+ msg_addr = ALIGN_DOWN(msg_addr, epc->mem->window.page_size);
ret = dw_pcie_ep_map_addr(epc, func_no, 0, ep->msi_mem_phys, msg_addr,
epc->mem->window.page_size);
if (ret)
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x b5d1b4b46f856da1473c7ba9a5cdfcb55c9b2478
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021946-sprain-turmoil-2ff2@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
b5d1b4b46f85 ("PCI: dwc: Fix a 64bit bug in dw_pcie_ep_raise_msix_irq()")
2217fffcd63f ("PCI: dwc: endpoint: Fix dw_pcie_ep_raise_msix_irq() alignment support")
92af77ca26f7 ("PCI: dwc: Use FIELD_GET/PREP()")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b5d1b4b46f856da1473c7ba9a5cdfcb55c9b2478 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter(a)linaro.org>
Date: Fri, 26 Jan 2024 11:40:37 +0300
Subject: [PATCH] PCI: dwc: Fix a 64bit bug in dw_pcie_ep_raise_msix_irq()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The "msg_addr" variable is u64. However, the "aligned_offset" is an
unsigned int. This means that when the code does:
msg_addr &= ~aligned_offset;
it will unintentionally zero out the high 32 bits. Use ALIGN_DOWN() to do
the alignment instead.
Fixes: 2217fffcd63f ("PCI: dwc: endpoint: Fix dw_pcie_ep_raise_msix_irq() alignment support")
Link: https://lore.kernel.org/r/af59c7ad-ab93-40f7-ad4a-7ac0b14d37f5@moroto.mount…
Signed-off-by: Dan Carpenter <dan.carpenter(a)linaro.org>
Signed-off-by: Bjorn Helgaas <bhelgaas(a)google.com>
Reviewed-by: Niklas Cassel <cassel(a)kernel.org>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen(a)linux.intel.com>
Reviewed-by: Manivannan Sadhasivam <manivannan.sadhasivam(a)linaro.org>
Cc: <stable(a)vger.kernel.org>
diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c b/drivers/pci/controller/dwc/pcie-designware-ep.c
index 5befed2dc02b..d6b66597101e 100644
--- a/drivers/pci/controller/dwc/pcie-designware-ep.c
+++ b/drivers/pci/controller/dwc/pcie-designware-ep.c
@@ -6,6 +6,7 @@
* Author: Kishon Vijay Abraham I <kishon(a)ti.com>
*/
+#include <linux/align.h>
#include <linux/bitfield.h>
#include <linux/of.h>
#include <linux/platform_device.h>
@@ -551,7 +552,7 @@ int dw_pcie_ep_raise_msix_irq(struct dw_pcie_ep *ep, u8 func_no,
}
aligned_offset = msg_addr & (epc->mem->window.page_size - 1);
- msg_addr &= ~aligned_offset;
+ msg_addr = ALIGN_DOWN(msg_addr, epc->mem->window.page_size);
ret = dw_pcie_ep_map_addr(epc, func_no, 0, ep->msi_mem_phys, msg_addr,
epc->mem->window.page_size);
if (ret)
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x b5d1b4b46f856da1473c7ba9a5cdfcb55c9b2478
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021945-droplet-overbid-1f08@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
b5d1b4b46f85 ("PCI: dwc: Fix a 64bit bug in dw_pcie_ep_raise_msix_irq()")
2217fffcd63f ("PCI: dwc: endpoint: Fix dw_pcie_ep_raise_msix_irq() alignment support")
92af77ca26f7 ("PCI: dwc: Use FIELD_GET/PREP()")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b5d1b4b46f856da1473c7ba9a5cdfcb55c9b2478 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter(a)linaro.org>
Date: Fri, 26 Jan 2024 11:40:37 +0300
Subject: [PATCH] PCI: dwc: Fix a 64bit bug in dw_pcie_ep_raise_msix_irq()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The "msg_addr" variable is u64. However, the "aligned_offset" is an
unsigned int. This means that when the code does:
msg_addr &= ~aligned_offset;
it will unintentionally zero out the high 32 bits. Use ALIGN_DOWN() to do
the alignment instead.
Fixes: 2217fffcd63f ("PCI: dwc: endpoint: Fix dw_pcie_ep_raise_msix_irq() alignment support")
Link: https://lore.kernel.org/r/af59c7ad-ab93-40f7-ad4a-7ac0b14d37f5@moroto.mount…
Signed-off-by: Dan Carpenter <dan.carpenter(a)linaro.org>
Signed-off-by: Bjorn Helgaas <bhelgaas(a)google.com>
Reviewed-by: Niklas Cassel <cassel(a)kernel.org>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen(a)linux.intel.com>
Reviewed-by: Manivannan Sadhasivam <manivannan.sadhasivam(a)linaro.org>
Cc: <stable(a)vger.kernel.org>
diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c b/drivers/pci/controller/dwc/pcie-designware-ep.c
index 5befed2dc02b..d6b66597101e 100644
--- a/drivers/pci/controller/dwc/pcie-designware-ep.c
+++ b/drivers/pci/controller/dwc/pcie-designware-ep.c
@@ -6,6 +6,7 @@
* Author: Kishon Vijay Abraham I <kishon(a)ti.com>
*/
+#include <linux/align.h>
#include <linux/bitfield.h>
#include <linux/of.h>
#include <linux/platform_device.h>
@@ -551,7 +552,7 @@ int dw_pcie_ep_raise_msix_irq(struct dw_pcie_ep *ep, u8 func_no,
}
aligned_offset = msg_addr & (epc->mem->window.page_size - 1);
- msg_addr &= ~aligned_offset;
+ msg_addr = ALIGN_DOWN(msg_addr, epc->mem->window.page_size);
ret = dw_pcie_ep_map_addr(epc, func_no, 0, ep->msi_mem_phys, msg_addr,
epc->mem->window.page_size);
if (ret)
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x c98d8836b817d11fdff4ca7749cbbe04ff7f0c64
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021948-regally-festival-39c8@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
c98d8836b817 ("wifi: mac80211: reload info pointer in ieee80211_tx_dequeue()")
23a5f0af6ff4 ("wifi: mac80211: remove cipher scheme support")
6d501764288c ("mac80211: introduce channel switch disconnect function")
71abf71e9e63 ("mac80211: Remove unused assignment statements")
77dfc2bc0bb4 ("mac80211: do not access the IV when it was stripped")
63214f02cff9 ("mac80211: save transmit power envelope element and power constraint")
405fca8a9461 ("ieee80211: add power type definition for 6 GHz")
5d24828d05f3 ("mac80211: always allocate struct ieee802_11_elems")
c6e37ed498f9 ("mac80211: move CRC into struct ieee802_11_elems")
a5b983c60731 ("mac80211: mesh: clean up rx_bcn_presp API")
65be6aa36ded ("mac80211: add HE 6 GHz capability only if supported")
15fae3410f1d ("mac80211: notify driver on mgd TX completion")
9bd6a83e53a7 ("mac80211: add vendor-specific capabilities to assoc request")
bac2fd3d7534 ("mac80211: remove use of ieee80211_get_he_sta_cap()")
c74025f47ac8 ("mac80211: rearrange struct txq_info for fewer holes")
d8b261548dcf ("mac80211: add to bss_conf if broadcast TWT is supported")
bbc6f03ff26e ("mac80211: reset profile_periodicity/ema_ap")
bf30ca922a0c ("mac80211: check defrag PN against current frame")
3a11ce08c45b ("mac80211: add fragment cache to sta_info")
270032a2a9c4 ("mac80211: drop A-MSDUs on old ciphers")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From c98d8836b817d11fdff4ca7749cbbe04ff7f0c64 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg(a)intel.com>
Date: Wed, 31 Jan 2024 16:49:10 +0100
Subject: [PATCH] wifi: mac80211: reload info pointer in ieee80211_tx_dequeue()
This pointer can change here since the SKB can change, so we
actually later open-coded IEEE80211_SKB_CB() again. Reload
the pointer where needed, so the monitor-mode case using it
gets fixed, and then use info-> later as well.
Cc: stable(a)vger.kernel.org
Fixes: 531682159092 ("mac80211: fix VLAN handling with TXQs")
Link: https://msgid.link/20240131164910.b54c28d583bc.I29450cec84ea6773cff5d9c16ff…
Signed-off-by: Johannes Berg <johannes.berg(a)intel.com>
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index e448ab338448..6fbb15b65902 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -5,7 +5,7 @@
* Copyright 2006-2007 Jiri Benc <jbenc(a)suse.cz>
* Copyright 2007 Johannes Berg <johannes(a)sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
- * Copyright (C) 2018-2022 Intel Corporation
+ * Copyright (C) 2018-2024 Intel Corporation
*
* Transmit and frame generation functions.
*/
@@ -3927,6 +3927,7 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
goto begin;
skb = __skb_dequeue(&tx.skbs);
+ info = IEEE80211_SKB_CB(skb);
if (!skb_queue_empty(&tx.skbs)) {
spin_lock_bh(&fq->lock);
@@ -3971,7 +3972,7 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
}
encap_out:
- IEEE80211_SKB_CB(skb)->control.vif = vif;
+ info->control.vif = vif;
if (tx.sta &&
wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AQL)) {
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x c98d8836b817d11fdff4ca7749cbbe04ff7f0c64
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021947-unshaven-machine-1e73@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
c98d8836b817 ("wifi: mac80211: reload info pointer in ieee80211_tx_dequeue()")
23a5f0af6ff4 ("wifi: mac80211: remove cipher scheme support")
6d501764288c ("mac80211: introduce channel switch disconnect function")
71abf71e9e63 ("mac80211: Remove unused assignment statements")
77dfc2bc0bb4 ("mac80211: do not access the IV when it was stripped")
63214f02cff9 ("mac80211: save transmit power envelope element and power constraint")
405fca8a9461 ("ieee80211: add power type definition for 6 GHz")
5d24828d05f3 ("mac80211: always allocate struct ieee802_11_elems")
c6e37ed498f9 ("mac80211: move CRC into struct ieee802_11_elems")
a5b983c60731 ("mac80211: mesh: clean up rx_bcn_presp API")
65be6aa36ded ("mac80211: add HE 6 GHz capability only if supported")
15fae3410f1d ("mac80211: notify driver on mgd TX completion")
9bd6a83e53a7 ("mac80211: add vendor-specific capabilities to assoc request")
bac2fd3d7534 ("mac80211: remove use of ieee80211_get_he_sta_cap()")
c74025f47ac8 ("mac80211: rearrange struct txq_info for fewer holes")
d8b261548dcf ("mac80211: add to bss_conf if broadcast TWT is supported")
bbc6f03ff26e ("mac80211: reset profile_periodicity/ema_ap")
bf30ca922a0c ("mac80211: check defrag PN against current frame")
3a11ce08c45b ("mac80211: add fragment cache to sta_info")
270032a2a9c4 ("mac80211: drop A-MSDUs on old ciphers")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From c98d8836b817d11fdff4ca7749cbbe04ff7f0c64 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg(a)intel.com>
Date: Wed, 31 Jan 2024 16:49:10 +0100
Subject: [PATCH] wifi: mac80211: reload info pointer in ieee80211_tx_dequeue()
This pointer can change here since the SKB can change, so we
actually later open-coded IEEE80211_SKB_CB() again. Reload
the pointer where needed, so the monitor-mode case using it
gets fixed, and then use info-> later as well.
Cc: stable(a)vger.kernel.org
Fixes: 531682159092 ("mac80211: fix VLAN handling with TXQs")
Link: https://msgid.link/20240131164910.b54c28d583bc.I29450cec84ea6773cff5d9c16ff…
Signed-off-by: Johannes Berg <johannes.berg(a)intel.com>
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index e448ab338448..6fbb15b65902 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -5,7 +5,7 @@
* Copyright 2006-2007 Jiri Benc <jbenc(a)suse.cz>
* Copyright 2007 Johannes Berg <johannes(a)sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
- * Copyright (C) 2018-2022 Intel Corporation
+ * Copyright (C) 2018-2024 Intel Corporation
*
* Transmit and frame generation functions.
*/
@@ -3927,6 +3927,7 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
goto begin;
skb = __skb_dequeue(&tx.skbs);
+ info = IEEE80211_SKB_CB(skb);
if (!skb_queue_empty(&tx.skbs)) {
spin_lock_bh(&fq->lock);
@@ -3971,7 +3972,7 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
}
encap_out:
- IEEE80211_SKB_CB(skb)->control.vif = vif;
+ info->control.vif = vif;
if (tx.sta &&
wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AQL)) {
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x 5bc09b397cbf1221f8a8aacb1152650c9195b02b
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021950-voltage-culprit-3dd5@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
5bc09b397cbf ("nilfs2: fix potential bug in end_buffer_async_write")
ff5710c3f3c2 ("nilfs2: convert nilfs_segctor_prepare_write to use folios")
3cd36212bf75 ("nilfs2: convert nilfs_segctor_complete_write to use folios")
50196f0081ca ("nilfs2: convert nilfs_abort_logs to use folios")
8f46eaf6fd84 ("nilfs2: add nilfs_end_folio_io()")
679bd7ebdd31 ("nilfs2: fix buffer corruption due to concurrent device reads")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 5bc09b397cbf1221f8a8aacb1152650c9195b02b Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Date: Sun, 4 Feb 2024 01:16:45 +0900
Subject: [PATCH] nilfs2: fix potential bug in end_buffer_async_write
According to a syzbot report, end_buffer_async_write(), which handles the
completion of block device writes, may detect an abnormal condition of the
buffer async_write flag and cause a BUG_ON failure when using nilfs2.
Nilfs2 itself does not use end_buffer_async_write(). But, the async_write
flag is now used as a marker by commit 7f42ec394156 ("nilfs2: fix issue
with race condition of competition between segments for dirty blocks") as
a means of resolving double list insertion of dirty blocks in
nilfs_lookup_dirty_data_buffers() and nilfs_lookup_node_buffers() and the
resulting crash.
This modification is safe as long as it is used for file data and b-tree
node blocks where the page caches are independent. However, it was
irrelevant and redundant to also introduce async_write for segment summary
and super root blocks that share buffers with the backing device. This
led to the possibility that the BUG_ON check in end_buffer_async_write
would fail as described above, if independent writebacks of the backing
device occurred in parallel.
The use of async_write for segment summary buffers has already been
removed in a previous change.
Fix this issue by removing the manipulation of the async_write flag for
the remaining super root block buffer.
Link: https://lkml.kernel.org/r/20240203161645.4992-1-konishi.ryusuke@gmail.com
Fixes: 7f42ec394156 ("nilfs2: fix issue with race condition of competition between segments for dirty blocks")
Signed-off-by: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Reported-by: syzbot+5c04210f7c7f897c1e7f(a)syzkaller.appspotmail.com
Closes: https://lkml.kernel.org/r/00000000000019a97c05fd42f8c8@google.com
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 2590a0860eab..2bfb08052d39 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1703,7 +1703,6 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci)
list_for_each_entry(bh, &segbuf->sb_payload_buffers,
b_assoc_buffers) {
- set_buffer_async_write(bh);
if (bh == segbuf->sb_super_root) {
if (bh->b_folio != bd_folio) {
folio_lock(bd_folio);
@@ -1714,6 +1713,7 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci)
}
break;
}
+ set_buffer_async_write(bh);
if (bh->b_folio != fs_folio) {
nilfs_begin_folio_io(fs_folio);
fs_folio = bh->b_folio;
@@ -1800,7 +1800,6 @@ static void nilfs_abort_logs(struct list_head *logs, int err)
list_for_each_entry(bh, &segbuf->sb_payload_buffers,
b_assoc_buffers) {
- clear_buffer_async_write(bh);
if (bh == segbuf->sb_super_root) {
clear_buffer_uptodate(bh);
if (bh->b_folio != bd_folio) {
@@ -1809,6 +1808,7 @@ static void nilfs_abort_logs(struct list_head *logs, int err)
}
break;
}
+ clear_buffer_async_write(bh);
if (bh->b_folio != fs_folio) {
nilfs_end_folio_io(fs_folio, err);
fs_folio = bh->b_folio;
@@ -1896,8 +1896,9 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
BIT(BH_Delay) | BIT(BH_NILFS_Volatile) |
BIT(BH_NILFS_Redirected));
- set_mask_bits(&bh->b_state, clear_bits, set_bits);
if (bh == segbuf->sb_super_root) {
+ set_buffer_uptodate(bh);
+ clear_buffer_dirty(bh);
if (bh->b_folio != bd_folio) {
folio_end_writeback(bd_folio);
bd_folio = bh->b_folio;
@@ -1905,6 +1906,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
update_sr = true;
break;
}
+ set_mask_bits(&bh->b_state, clear_bits, set_bits);
if (bh->b_folio != fs_folio) {
nilfs_end_folio_io(fs_folio, 0);
fs_folio = bh->b_folio;
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 5bc09b397cbf1221f8a8aacb1152650c9195b02b
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021948-sacrifice-superior-85e5@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
5bc09b397cbf ("nilfs2: fix potential bug in end_buffer_async_write")
ff5710c3f3c2 ("nilfs2: convert nilfs_segctor_prepare_write to use folios")
3cd36212bf75 ("nilfs2: convert nilfs_segctor_complete_write to use folios")
50196f0081ca ("nilfs2: convert nilfs_abort_logs to use folios")
8f46eaf6fd84 ("nilfs2: add nilfs_end_folio_io()")
679bd7ebdd31 ("nilfs2: fix buffer corruption due to concurrent device reads")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 5bc09b397cbf1221f8a8aacb1152650c9195b02b Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Date: Sun, 4 Feb 2024 01:16:45 +0900
Subject: [PATCH] nilfs2: fix potential bug in end_buffer_async_write
According to a syzbot report, end_buffer_async_write(), which handles the
completion of block device writes, may detect an abnormal condition of the
buffer async_write flag and cause a BUG_ON failure when using nilfs2.
Nilfs2 itself does not use end_buffer_async_write(). But, the async_write
flag is now used as a marker by commit 7f42ec394156 ("nilfs2: fix issue
with race condition of competition between segments for dirty blocks") as
a means of resolving double list insertion of dirty blocks in
nilfs_lookup_dirty_data_buffers() and nilfs_lookup_node_buffers() and the
resulting crash.
This modification is safe as long as it is used for file data and b-tree
node blocks where the page caches are independent. However, it was
irrelevant and redundant to also introduce async_write for segment summary
and super root blocks that share buffers with the backing device. This
led to the possibility that the BUG_ON check in end_buffer_async_write
would fail as described above, if independent writebacks of the backing
device occurred in parallel.
The use of async_write for segment summary buffers has already been
removed in a previous change.
Fix this issue by removing the manipulation of the async_write flag for
the remaining super root block buffer.
Link: https://lkml.kernel.org/r/20240203161645.4992-1-konishi.ryusuke@gmail.com
Fixes: 7f42ec394156 ("nilfs2: fix issue with race condition of competition between segments for dirty blocks")
Signed-off-by: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Reported-by: syzbot+5c04210f7c7f897c1e7f(a)syzkaller.appspotmail.com
Closes: https://lkml.kernel.org/r/00000000000019a97c05fd42f8c8@google.com
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 2590a0860eab..2bfb08052d39 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1703,7 +1703,6 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci)
list_for_each_entry(bh, &segbuf->sb_payload_buffers,
b_assoc_buffers) {
- set_buffer_async_write(bh);
if (bh == segbuf->sb_super_root) {
if (bh->b_folio != bd_folio) {
folio_lock(bd_folio);
@@ -1714,6 +1713,7 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci)
}
break;
}
+ set_buffer_async_write(bh);
if (bh->b_folio != fs_folio) {
nilfs_begin_folio_io(fs_folio);
fs_folio = bh->b_folio;
@@ -1800,7 +1800,6 @@ static void nilfs_abort_logs(struct list_head *logs, int err)
list_for_each_entry(bh, &segbuf->sb_payload_buffers,
b_assoc_buffers) {
- clear_buffer_async_write(bh);
if (bh == segbuf->sb_super_root) {
clear_buffer_uptodate(bh);
if (bh->b_folio != bd_folio) {
@@ -1809,6 +1808,7 @@ static void nilfs_abort_logs(struct list_head *logs, int err)
}
break;
}
+ clear_buffer_async_write(bh);
if (bh->b_folio != fs_folio) {
nilfs_end_folio_io(fs_folio, err);
fs_folio = bh->b_folio;
@@ -1896,8 +1896,9 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
BIT(BH_Delay) | BIT(BH_NILFS_Volatile) |
BIT(BH_NILFS_Redirected));
- set_mask_bits(&bh->b_state, clear_bits, set_bits);
if (bh == segbuf->sb_super_root) {
+ set_buffer_uptodate(bh);
+ clear_buffer_dirty(bh);
if (bh->b_folio != bd_folio) {
folio_end_writeback(bd_folio);
bd_folio = bh->b_folio;
@@ -1905,6 +1906,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
update_sr = true;
break;
}
+ set_mask_bits(&bh->b_state, clear_bits, set_bits);
if (bh->b_folio != fs_folio) {
nilfs_end_folio_io(fs_folio, 0);
fs_folio = bh->b_folio;
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 5bc09b397cbf1221f8a8aacb1152650c9195b02b
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021947-stays-preflight-796b@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
5bc09b397cbf ("nilfs2: fix potential bug in end_buffer_async_write")
ff5710c3f3c2 ("nilfs2: convert nilfs_segctor_prepare_write to use folios")
3cd36212bf75 ("nilfs2: convert nilfs_segctor_complete_write to use folios")
50196f0081ca ("nilfs2: convert nilfs_abort_logs to use folios")
8f46eaf6fd84 ("nilfs2: add nilfs_end_folio_io()")
679bd7ebdd31 ("nilfs2: fix buffer corruption due to concurrent device reads")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 5bc09b397cbf1221f8a8aacb1152650c9195b02b Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Date: Sun, 4 Feb 2024 01:16:45 +0900
Subject: [PATCH] nilfs2: fix potential bug in end_buffer_async_write
According to a syzbot report, end_buffer_async_write(), which handles the
completion of block device writes, may detect an abnormal condition of the
buffer async_write flag and cause a BUG_ON failure when using nilfs2.
Nilfs2 itself does not use end_buffer_async_write(). But, the async_write
flag is now used as a marker by commit 7f42ec394156 ("nilfs2: fix issue
with race condition of competition between segments for dirty blocks") as
a means of resolving double list insertion of dirty blocks in
nilfs_lookup_dirty_data_buffers() and nilfs_lookup_node_buffers() and the
resulting crash.
This modification is safe as long as it is used for file data and b-tree
node blocks where the page caches are independent. However, it was
irrelevant and redundant to also introduce async_write for segment summary
and super root blocks that share buffers with the backing device. This
led to the possibility that the BUG_ON check in end_buffer_async_write
would fail as described above, if independent writebacks of the backing
device occurred in parallel.
The use of async_write for segment summary buffers has already been
removed in a previous change.
Fix this issue by removing the manipulation of the async_write flag for
the remaining super root block buffer.
Link: https://lkml.kernel.org/r/20240203161645.4992-1-konishi.ryusuke@gmail.com
Fixes: 7f42ec394156 ("nilfs2: fix issue with race condition of competition between segments for dirty blocks")
Signed-off-by: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Reported-by: syzbot+5c04210f7c7f897c1e7f(a)syzkaller.appspotmail.com
Closes: https://lkml.kernel.org/r/00000000000019a97c05fd42f8c8@google.com
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 2590a0860eab..2bfb08052d39 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1703,7 +1703,6 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci)
list_for_each_entry(bh, &segbuf->sb_payload_buffers,
b_assoc_buffers) {
- set_buffer_async_write(bh);
if (bh == segbuf->sb_super_root) {
if (bh->b_folio != bd_folio) {
folio_lock(bd_folio);
@@ -1714,6 +1713,7 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci)
}
break;
}
+ set_buffer_async_write(bh);
if (bh->b_folio != fs_folio) {
nilfs_begin_folio_io(fs_folio);
fs_folio = bh->b_folio;
@@ -1800,7 +1800,6 @@ static void nilfs_abort_logs(struct list_head *logs, int err)
list_for_each_entry(bh, &segbuf->sb_payload_buffers,
b_assoc_buffers) {
- clear_buffer_async_write(bh);
if (bh == segbuf->sb_super_root) {
clear_buffer_uptodate(bh);
if (bh->b_folio != bd_folio) {
@@ -1809,6 +1808,7 @@ static void nilfs_abort_logs(struct list_head *logs, int err)
}
break;
}
+ clear_buffer_async_write(bh);
if (bh->b_folio != fs_folio) {
nilfs_end_folio_io(fs_folio, err);
fs_folio = bh->b_folio;
@@ -1896,8 +1896,9 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
BIT(BH_Delay) | BIT(BH_NILFS_Volatile) |
BIT(BH_NILFS_Redirected));
- set_mask_bits(&bh->b_state, clear_bits, set_bits);
if (bh == segbuf->sb_super_root) {
+ set_buffer_uptodate(bh);
+ clear_buffer_dirty(bh);
if (bh->b_folio != bd_folio) {
folio_end_writeback(bd_folio);
bd_folio = bh->b_folio;
@@ -1905,6 +1906,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
update_sr = true;
break;
}
+ set_mask_bits(&bh->b_state, clear_bits, set_bits);
if (bh->b_folio != fs_folio) {
nilfs_end_folio_io(fs_folio, 0);
fs_folio = bh->b_folio;
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 5bc09b397cbf1221f8a8aacb1152650c9195b02b
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021945-output-concave-3ad1@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
5bc09b397cbf ("nilfs2: fix potential bug in end_buffer_async_write")
ff5710c3f3c2 ("nilfs2: convert nilfs_segctor_prepare_write to use folios")
3cd36212bf75 ("nilfs2: convert nilfs_segctor_complete_write to use folios")
50196f0081ca ("nilfs2: convert nilfs_abort_logs to use folios")
8f46eaf6fd84 ("nilfs2: add nilfs_end_folio_io()")
679bd7ebdd31 ("nilfs2: fix buffer corruption due to concurrent device reads")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 5bc09b397cbf1221f8a8aacb1152650c9195b02b Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Date: Sun, 4 Feb 2024 01:16:45 +0900
Subject: [PATCH] nilfs2: fix potential bug in end_buffer_async_write
According to a syzbot report, end_buffer_async_write(), which handles the
completion of block device writes, may detect an abnormal condition of the
buffer async_write flag and cause a BUG_ON failure when using nilfs2.
Nilfs2 itself does not use end_buffer_async_write(). But, the async_write
flag is now used as a marker by commit 7f42ec394156 ("nilfs2: fix issue
with race condition of competition between segments for dirty blocks") as
a means of resolving double list insertion of dirty blocks in
nilfs_lookup_dirty_data_buffers() and nilfs_lookup_node_buffers() and the
resulting crash.
This modification is safe as long as it is used for file data and b-tree
node blocks where the page caches are independent. However, it was
irrelevant and redundant to also introduce async_write for segment summary
and super root blocks that share buffers with the backing device. This
led to the possibility that the BUG_ON check in end_buffer_async_write
would fail as described above, if independent writebacks of the backing
device occurred in parallel.
The use of async_write for segment summary buffers has already been
removed in a previous change.
Fix this issue by removing the manipulation of the async_write flag for
the remaining super root block buffer.
Link: https://lkml.kernel.org/r/20240203161645.4992-1-konishi.ryusuke@gmail.com
Fixes: 7f42ec394156 ("nilfs2: fix issue with race condition of competition between segments for dirty blocks")
Signed-off-by: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Reported-by: syzbot+5c04210f7c7f897c1e7f(a)syzkaller.appspotmail.com
Closes: https://lkml.kernel.org/r/00000000000019a97c05fd42f8c8@google.com
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 2590a0860eab..2bfb08052d39 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1703,7 +1703,6 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci)
list_for_each_entry(bh, &segbuf->sb_payload_buffers,
b_assoc_buffers) {
- set_buffer_async_write(bh);
if (bh == segbuf->sb_super_root) {
if (bh->b_folio != bd_folio) {
folio_lock(bd_folio);
@@ -1714,6 +1713,7 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci)
}
break;
}
+ set_buffer_async_write(bh);
if (bh->b_folio != fs_folio) {
nilfs_begin_folio_io(fs_folio);
fs_folio = bh->b_folio;
@@ -1800,7 +1800,6 @@ static void nilfs_abort_logs(struct list_head *logs, int err)
list_for_each_entry(bh, &segbuf->sb_payload_buffers,
b_assoc_buffers) {
- clear_buffer_async_write(bh);
if (bh == segbuf->sb_super_root) {
clear_buffer_uptodate(bh);
if (bh->b_folio != bd_folio) {
@@ -1809,6 +1808,7 @@ static void nilfs_abort_logs(struct list_head *logs, int err)
}
break;
}
+ clear_buffer_async_write(bh);
if (bh->b_folio != fs_folio) {
nilfs_end_folio_io(fs_folio, err);
fs_folio = bh->b_folio;
@@ -1896,8 +1896,9 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
BIT(BH_Delay) | BIT(BH_NILFS_Volatile) |
BIT(BH_NILFS_Redirected));
- set_mask_bits(&bh->b_state, clear_bits, set_bits);
if (bh == segbuf->sb_super_root) {
+ set_buffer_uptodate(bh);
+ clear_buffer_dirty(bh);
if (bh->b_folio != bd_folio) {
folio_end_writeback(bd_folio);
bd_folio = bh->b_folio;
@@ -1905,6 +1906,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
update_sr = true;
break;
}
+ set_mask_bits(&bh->b_state, clear_bits, set_bits);
if (bh->b_folio != fs_folio) {
nilfs_end_folio_io(fs_folio, 0);
fs_folio = bh->b_folio;
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 5bc09b397cbf1221f8a8aacb1152650c9195b02b
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021944-impolite-substance-4bc0@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
5bc09b397cbf ("nilfs2: fix potential bug in end_buffer_async_write")
ff5710c3f3c2 ("nilfs2: convert nilfs_segctor_prepare_write to use folios")
3cd36212bf75 ("nilfs2: convert nilfs_segctor_complete_write to use folios")
50196f0081ca ("nilfs2: convert nilfs_abort_logs to use folios")
8f46eaf6fd84 ("nilfs2: add nilfs_end_folio_io()")
679bd7ebdd31 ("nilfs2: fix buffer corruption due to concurrent device reads")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 5bc09b397cbf1221f8a8aacb1152650c9195b02b Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Date: Sun, 4 Feb 2024 01:16:45 +0900
Subject: [PATCH] nilfs2: fix potential bug in end_buffer_async_write
According to a syzbot report, end_buffer_async_write(), which handles the
completion of block device writes, may detect an abnormal condition of the
buffer async_write flag and cause a BUG_ON failure when using nilfs2.
Nilfs2 itself does not use end_buffer_async_write(). But, the async_write
flag is now used as a marker by commit 7f42ec394156 ("nilfs2: fix issue
with race condition of competition between segments for dirty blocks") as
a means of resolving double list insertion of dirty blocks in
nilfs_lookup_dirty_data_buffers() and nilfs_lookup_node_buffers() and the
resulting crash.
This modification is safe as long as it is used for file data and b-tree
node blocks where the page caches are independent. However, it was
irrelevant and redundant to also introduce async_write for segment summary
and super root blocks that share buffers with the backing device. This
led to the possibility that the BUG_ON check in end_buffer_async_write
would fail as described above, if independent writebacks of the backing
device occurred in parallel.
The use of async_write for segment summary buffers has already been
removed in a previous change.
Fix this issue by removing the manipulation of the async_write flag for
the remaining super root block buffer.
Link: https://lkml.kernel.org/r/20240203161645.4992-1-konishi.ryusuke@gmail.com
Fixes: 7f42ec394156 ("nilfs2: fix issue with race condition of competition between segments for dirty blocks")
Signed-off-by: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Reported-by: syzbot+5c04210f7c7f897c1e7f(a)syzkaller.appspotmail.com
Closes: https://lkml.kernel.org/r/00000000000019a97c05fd42f8c8@google.com
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 2590a0860eab..2bfb08052d39 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1703,7 +1703,6 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci)
list_for_each_entry(bh, &segbuf->sb_payload_buffers,
b_assoc_buffers) {
- set_buffer_async_write(bh);
if (bh == segbuf->sb_super_root) {
if (bh->b_folio != bd_folio) {
folio_lock(bd_folio);
@@ -1714,6 +1713,7 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci)
}
break;
}
+ set_buffer_async_write(bh);
if (bh->b_folio != fs_folio) {
nilfs_begin_folio_io(fs_folio);
fs_folio = bh->b_folio;
@@ -1800,7 +1800,6 @@ static void nilfs_abort_logs(struct list_head *logs, int err)
list_for_each_entry(bh, &segbuf->sb_payload_buffers,
b_assoc_buffers) {
- clear_buffer_async_write(bh);
if (bh == segbuf->sb_super_root) {
clear_buffer_uptodate(bh);
if (bh->b_folio != bd_folio) {
@@ -1809,6 +1808,7 @@ static void nilfs_abort_logs(struct list_head *logs, int err)
}
break;
}
+ clear_buffer_async_write(bh);
if (bh->b_folio != fs_folio) {
nilfs_end_folio_io(fs_folio, err);
fs_folio = bh->b_folio;
@@ -1896,8 +1896,9 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
BIT(BH_Delay) | BIT(BH_NILFS_Volatile) |
BIT(BH_NILFS_Redirected));
- set_mask_bits(&bh->b_state, clear_bits, set_bits);
if (bh == segbuf->sb_super_root) {
+ set_buffer_uptodate(bh);
+ clear_buffer_dirty(bh);
if (bh->b_folio != bd_folio) {
folio_end_writeback(bd_folio);
bd_folio = bh->b_folio;
@@ -1905,6 +1906,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
update_sr = true;
break;
}
+ set_mask_bits(&bh->b_state, clear_bits, set_bits);
if (bh->b_folio != fs_folio) {
nilfs_end_folio_io(fs_folio, 0);
fs_folio = bh->b_folio;
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 5bc09b397cbf1221f8a8aacb1152650c9195b02b
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021942-tabloid-clarinet-50d2@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
5bc09b397cbf ("nilfs2: fix potential bug in end_buffer_async_write")
ff5710c3f3c2 ("nilfs2: convert nilfs_segctor_prepare_write to use folios")
3cd36212bf75 ("nilfs2: convert nilfs_segctor_complete_write to use folios")
50196f0081ca ("nilfs2: convert nilfs_abort_logs to use folios")
8f46eaf6fd84 ("nilfs2: add nilfs_end_folio_io()")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 5bc09b397cbf1221f8a8aacb1152650c9195b02b Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Date: Sun, 4 Feb 2024 01:16:45 +0900
Subject: [PATCH] nilfs2: fix potential bug in end_buffer_async_write
According to a syzbot report, end_buffer_async_write(), which handles the
completion of block device writes, may detect an abnormal condition of the
buffer async_write flag and cause a BUG_ON failure when using nilfs2.
Nilfs2 itself does not use end_buffer_async_write(). But, the async_write
flag is now used as a marker by commit 7f42ec394156 ("nilfs2: fix issue
with race condition of competition between segments for dirty blocks") as
a means of resolving double list insertion of dirty blocks in
nilfs_lookup_dirty_data_buffers() and nilfs_lookup_node_buffers() and the
resulting crash.
This modification is safe as long as it is used for file data and b-tree
node blocks where the page caches are independent. However, it was
irrelevant and redundant to also introduce async_write for segment summary
and super root blocks that share buffers with the backing device. This
led to the possibility that the BUG_ON check in end_buffer_async_write
would fail as described above, if independent writebacks of the backing
device occurred in parallel.
The use of async_write for segment summary buffers has already been
removed in a previous change.
Fix this issue by removing the manipulation of the async_write flag for
the remaining super root block buffer.
Link: https://lkml.kernel.org/r/20240203161645.4992-1-konishi.ryusuke@gmail.com
Fixes: 7f42ec394156 ("nilfs2: fix issue with race condition of competition between segments for dirty blocks")
Signed-off-by: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Reported-by: syzbot+5c04210f7c7f897c1e7f(a)syzkaller.appspotmail.com
Closes: https://lkml.kernel.org/r/00000000000019a97c05fd42f8c8@google.com
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 2590a0860eab..2bfb08052d39 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1703,7 +1703,6 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci)
list_for_each_entry(bh, &segbuf->sb_payload_buffers,
b_assoc_buffers) {
- set_buffer_async_write(bh);
if (bh == segbuf->sb_super_root) {
if (bh->b_folio != bd_folio) {
folio_lock(bd_folio);
@@ -1714,6 +1713,7 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci)
}
break;
}
+ set_buffer_async_write(bh);
if (bh->b_folio != fs_folio) {
nilfs_begin_folio_io(fs_folio);
fs_folio = bh->b_folio;
@@ -1800,7 +1800,6 @@ static void nilfs_abort_logs(struct list_head *logs, int err)
list_for_each_entry(bh, &segbuf->sb_payload_buffers,
b_assoc_buffers) {
- clear_buffer_async_write(bh);
if (bh == segbuf->sb_super_root) {
clear_buffer_uptodate(bh);
if (bh->b_folio != bd_folio) {
@@ -1809,6 +1808,7 @@ static void nilfs_abort_logs(struct list_head *logs, int err)
}
break;
}
+ clear_buffer_async_write(bh);
if (bh->b_folio != fs_folio) {
nilfs_end_folio_io(fs_folio, err);
fs_folio = bh->b_folio;
@@ -1896,8 +1896,9 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
BIT(BH_Delay) | BIT(BH_NILFS_Volatile) |
BIT(BH_NILFS_Redirected));
- set_mask_bits(&bh->b_state, clear_bits, set_bits);
if (bh == segbuf->sb_super_root) {
+ set_buffer_uptodate(bh);
+ clear_buffer_dirty(bh);
if (bh->b_folio != bd_folio) {
folio_end_writeback(bd_folio);
bd_folio = bh->b_folio;
@@ -1905,6 +1906,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
update_sr = true;
break;
}
+ set_mask_bits(&bh->b_state, clear_bits, set_bits);
if (bh->b_folio != fs_folio) {
nilfs_end_folio_io(fs_folio, 0);
fs_folio = bh->b_folio;
The patch below does not apply to the 6.7-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.7.y
git checkout FETCH_HEAD
git cherry-pick -x 5bc09b397cbf1221f8a8aacb1152650c9195b02b
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021941-aerobics-disliking-5751@gregkh' --subject-prefix 'PATCH 6.7.y' HEAD^..
Possible dependencies:
5bc09b397cbf ("nilfs2: fix potential bug in end_buffer_async_write")
ff5710c3f3c2 ("nilfs2: convert nilfs_segctor_prepare_write to use folios")
3cd36212bf75 ("nilfs2: convert nilfs_segctor_complete_write to use folios")
50196f0081ca ("nilfs2: convert nilfs_abort_logs to use folios")
8f46eaf6fd84 ("nilfs2: add nilfs_end_folio_io()")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 5bc09b397cbf1221f8a8aacb1152650c9195b02b Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Date: Sun, 4 Feb 2024 01:16:45 +0900
Subject: [PATCH] nilfs2: fix potential bug in end_buffer_async_write
According to a syzbot report, end_buffer_async_write(), which handles the
completion of block device writes, may detect an abnormal condition of the
buffer async_write flag and cause a BUG_ON failure when using nilfs2.
Nilfs2 itself does not use end_buffer_async_write(). But, the async_write
flag is now used as a marker by commit 7f42ec394156 ("nilfs2: fix issue
with race condition of competition between segments for dirty blocks") as
a means of resolving double list insertion of dirty blocks in
nilfs_lookup_dirty_data_buffers() and nilfs_lookup_node_buffers() and the
resulting crash.
This modification is safe as long as it is used for file data and b-tree
node blocks where the page caches are independent. However, it was
irrelevant and redundant to also introduce async_write for segment summary
and super root blocks that share buffers with the backing device. This
led to the possibility that the BUG_ON check in end_buffer_async_write
would fail as described above, if independent writebacks of the backing
device occurred in parallel.
The use of async_write for segment summary buffers has already been
removed in a previous change.
Fix this issue by removing the manipulation of the async_write flag for
the remaining super root block buffer.
Link: https://lkml.kernel.org/r/20240203161645.4992-1-konishi.ryusuke@gmail.com
Fixes: 7f42ec394156 ("nilfs2: fix issue with race condition of competition between segments for dirty blocks")
Signed-off-by: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Reported-by: syzbot+5c04210f7c7f897c1e7f(a)syzkaller.appspotmail.com
Closes: https://lkml.kernel.org/r/00000000000019a97c05fd42f8c8@google.com
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 2590a0860eab..2bfb08052d39 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1703,7 +1703,6 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci)
list_for_each_entry(bh, &segbuf->sb_payload_buffers,
b_assoc_buffers) {
- set_buffer_async_write(bh);
if (bh == segbuf->sb_super_root) {
if (bh->b_folio != bd_folio) {
folio_lock(bd_folio);
@@ -1714,6 +1713,7 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci)
}
break;
}
+ set_buffer_async_write(bh);
if (bh->b_folio != fs_folio) {
nilfs_begin_folio_io(fs_folio);
fs_folio = bh->b_folio;
@@ -1800,7 +1800,6 @@ static void nilfs_abort_logs(struct list_head *logs, int err)
list_for_each_entry(bh, &segbuf->sb_payload_buffers,
b_assoc_buffers) {
- clear_buffer_async_write(bh);
if (bh == segbuf->sb_super_root) {
clear_buffer_uptodate(bh);
if (bh->b_folio != bd_folio) {
@@ -1809,6 +1808,7 @@ static void nilfs_abort_logs(struct list_head *logs, int err)
}
break;
}
+ clear_buffer_async_write(bh);
if (bh->b_folio != fs_folio) {
nilfs_end_folio_io(fs_folio, err);
fs_folio = bh->b_folio;
@@ -1896,8 +1896,9 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
BIT(BH_Delay) | BIT(BH_NILFS_Volatile) |
BIT(BH_NILFS_Redirected));
- set_mask_bits(&bh->b_state, clear_bits, set_bits);
if (bh == segbuf->sb_super_root) {
+ set_buffer_uptodate(bh);
+ clear_buffer_dirty(bh);
if (bh->b_folio != bd_folio) {
folio_end_writeback(bd_folio);
bd_folio = bh->b_folio;
@@ -1905,6 +1906,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
update_sr = true;
break;
}
+ set_mask_bits(&bh->b_state, clear_bits, set_bits);
if (bh->b_folio != fs_folio) {
nilfs_end_folio_io(fs_folio, 0);
fs_folio = bh->b_folio;
commit 38296afe3c6ee07319e01bb249aa4bb47c07b534 upstream.
Syzbot reported a hang issue in migrate_pages_batch() called by mbind()
and nilfs_lookup_dirty_data_buffers() called in the log writer of nilfs2.
While migrate_pages_batch() locks a folio and waits for the writeback to
complete, the log writer thread that should bring the writeback to
completion picks up the folio being written back in
nilfs_lookup_dirty_data_buffers() that it calls for subsequent log
creation and tries to lock the folio, thus causing a deadlock.
In the first place, it is unexpected that folios/pages in the middle of
writeback will be updated and become dirty. Nilfs2 adds a checksum to
verify the validity of the log being written and uses it for recovery at
mount, so data changes during writeback are suppressed. Since this is
broken, an unclean shutdown could potentially cause recovery to fail.
Investigation revealed that the root cause is that the wait for writeback
completion in nilfs_page_mkwrite() is conditional, and if the backing
device does not require stable writes, data may be modified without
waiting.
Fix these issues by making nilfs_page_mkwrite() wait for writeback to
finish regardless of the stable write requirement of the backing device.
Link: https://lkml.kernel.org/r/20240131145657.4209-1-konishi.ryusuke@gmail.com
Fixes: 1d1d1a767206 ("mm: only enforce stable page writes if the backing device requires it")
Signed-off-by: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Reported-by: syzbot+ee2ae68da3b22d04cd8d(a)syzkaller.appspotmail.com
Closes: https://lkml.kernel.org/r/00000000000047d819061004ad6c@google.com
Tested-by: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
Please apply this patch to the stable trees indicated by the subject line
prefix.
These versions do not yet have page-to-folio conversion applied to the
target function, so page-based "wait_on_page_writeback()" is used instead
of "folio_wait_writeback()" in this patch. This did not apply as-is to
v6.5 and earlier versions due to an fs-wide change. So I would like to
post a separate patch for earlier stable trees.
Thanks,
Ryusuke Konishi
fs/nilfs2/file.c | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 740ce26d1e76..0505feef79f4 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -105,7 +105,13 @@ static vm_fault_t nilfs_page_mkwrite(struct vm_fault *vmf)
nilfs_transaction_commit(inode->i_sb);
mapped:
- wait_for_stable_page(page);
+ /*
+ * Since checksumming including data blocks is performed to determine
+ * the validity of the log to be written and used for recovery, it is
+ * necessary to wait for writeback to finish here, regardless of the
+ * stable write requirement of the backing device.
+ */
+ wait_on_page_writeback(page);
out:
sb_end_pagefault(inode->i_sb);
return vmf_fs_error(ret);
--
2.39.3
The patch below does not apply to the 6.7-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.7.y
git checkout FETCH_HEAD
git cherry-pick -x ea73179e64131bcd29ba6defd33732abdf8ca14b
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021900-scorch-tripping-95c2@gregkh' --subject-prefix 'PATCH 6.7.y' HEAD^..
Possible dependencies:
ea73179e6413 ("powerpc/ftrace: Ignore ftrace locations in exit text sections")
2ec36570c358 ("powerpc/ftrace: Fix indentation in ftrace.h")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From ea73179e64131bcd29ba6defd33732abdf8ca14b Mon Sep 17 00:00:00 2001
From: Naveen N Rao <naveen(a)kernel.org>
Date: Tue, 13 Feb 2024 23:24:10 +0530
Subject: [PATCH] powerpc/ftrace: Ignore ftrace locations in exit text sections
Michael reported that we are seeing an ftrace bug on bootup when KASAN
is enabled and we are using -fpatchable-function-entry:
ftrace: allocating 47780 entries in 18 pages
ftrace-powerpc: 0xc0000000020b3d5c: No module provided for non-kernel address
------------[ ftrace bug ]------------
ftrace faulted on modifying
[<c0000000020b3d5c>] 0xc0000000020b3d5c
Initializing ftrace call sites
ftrace record flags: 0
(0)
expected tramp: c00000000008cef4
------------[ cut here ]------------
WARNING: CPU: 0 PID: 0 at kernel/trace/ftrace.c:2180 ftrace_bug+0x3c0/0x424
Modules linked in:
CPU: 0 PID: 0 Comm: swapper Not tainted 6.5.0-rc3-00120-g0f71dcfb4aef #860
Hardware name: IBM pSeries (emulated by qemu) POWER9 (raw) 0x4e1202 0xf000005 of:SLOF,HEAD hv:linux,kvm pSeries
NIP: c0000000003aa81c LR: c0000000003aa818 CTR: 0000000000000000
REGS: c0000000033cfab0 TRAP: 0700 Not tainted (6.5.0-rc3-00120-g0f71dcfb4aef)
MSR: 8000000002021033 <SF,VEC,ME,IR,DR,RI,LE> CR: 28028240 XER: 00000000
CFAR: c0000000002781a8 IRQMASK: 3
...
NIP [c0000000003aa81c] ftrace_bug+0x3c0/0x424
LR [c0000000003aa818] ftrace_bug+0x3bc/0x424
Call Trace:
ftrace_bug+0x3bc/0x424 (unreliable)
ftrace_process_locs+0x5f4/0x8a0
ftrace_init+0xc0/0x1d0
start_kernel+0x1d8/0x484
With CONFIG_FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY=y and
CONFIG_KASAN=y, compiler emits nops in functions that it generates for
registering and unregistering global variables (unlike with -pg and
-mprofile-kernel where calls to _mcount() are not generated in those
functions). Those functions then end up in INIT_TEXT and EXIT_TEXT
respectively. We don't expect to see any profiled functions in
EXIT_TEXT, so ftrace_init_nop() assumes that all addresses that aren't
in the core kernel text belong to a module. Since these functions do
not match that criterion, we see the above bug.
Address this by having ftrace ignore all locations in the text exit
sections of vmlinux.
Fixes: 0f71dcfb4aef ("powerpc/ftrace: Add support for -fpatchable-function-entry")
Cc: stable(a)vger.kernel.org # v6.6+
Reported-by: Michael Ellerman <mpe(a)ellerman.id.au>
Signed-off-by: Naveen N Rao <naveen(a)kernel.org>
Reviewed-by: Benjamin Gray <bgray(a)linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe(a)ellerman.id.au>
Link: https://msgid.link/20240213175410.1091313-1-naveen@kernel.org
diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h
index 1ebd2ca97f12..107fc5a48456 100644
--- a/arch/powerpc/include/asm/ftrace.h
+++ b/arch/powerpc/include/asm/ftrace.h
@@ -20,14 +20,6 @@
#ifndef __ASSEMBLY__
extern void _mcount(void);
-static inline unsigned long ftrace_call_adjust(unsigned long addr)
-{
- if (IS_ENABLED(CONFIG_ARCH_USING_PATCHABLE_FUNCTION_ENTRY))
- addr += MCOUNT_INSN_SIZE;
-
- return addr;
-}
-
unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip,
unsigned long sp);
@@ -142,8 +134,10 @@ static inline u8 this_cpu_get_ftrace_enabled(void) { return 1; }
#ifdef CONFIG_FUNCTION_TRACER
extern unsigned int ftrace_tramp_text[], ftrace_tramp_init[];
void ftrace_free_init_tramp(void);
+unsigned long ftrace_call_adjust(unsigned long addr);
#else
static inline void ftrace_free_init_tramp(void) { }
+static inline unsigned long ftrace_call_adjust(unsigned long addr) { return addr; }
#endif
#endif /* !__ASSEMBLY__ */
diff --git a/arch/powerpc/include/asm/sections.h b/arch/powerpc/include/asm/sections.h
index ea26665f82cf..f43f3a6b0051 100644
--- a/arch/powerpc/include/asm/sections.h
+++ b/arch/powerpc/include/asm/sections.h
@@ -14,6 +14,7 @@ typedef struct func_desc func_desc_t;
extern char __head_end[];
extern char __srwx_boundary[];
+extern char __exittext_begin[], __exittext_end[];
/* Patch sites */
extern s32 patch__call_flush_branch_caches1;
diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c
index 82010629cf88..d8d6b4fd9a14 100644
--- a/arch/powerpc/kernel/trace/ftrace.c
+++ b/arch/powerpc/kernel/trace/ftrace.c
@@ -27,10 +27,22 @@
#include <asm/ftrace.h>
#include <asm/syscall.h>
#include <asm/inst.h>
+#include <asm/sections.h>
#define NUM_FTRACE_TRAMPS 2
static unsigned long ftrace_tramps[NUM_FTRACE_TRAMPS];
+unsigned long ftrace_call_adjust(unsigned long addr)
+{
+ if (addr >= (unsigned long)__exittext_begin && addr < (unsigned long)__exittext_end)
+ return 0;
+
+ if (IS_ENABLED(CONFIG_ARCH_USING_PATCHABLE_FUNCTION_ENTRY))
+ addr += MCOUNT_INSN_SIZE;
+
+ return addr;
+}
+
static ppc_inst_t ftrace_create_branch_inst(unsigned long ip, unsigned long addr, int link)
{
ppc_inst_t op;
diff --git a/arch/powerpc/kernel/trace/ftrace_64_pg.c b/arch/powerpc/kernel/trace/ftrace_64_pg.c
index 7b85c3b460a3..12fab1803bcf 100644
--- a/arch/powerpc/kernel/trace/ftrace_64_pg.c
+++ b/arch/powerpc/kernel/trace/ftrace_64_pg.c
@@ -37,6 +37,11 @@
#define NUM_FTRACE_TRAMPS 8
static unsigned long ftrace_tramps[NUM_FTRACE_TRAMPS];
+unsigned long ftrace_call_adjust(unsigned long addr)
+{
+ return addr;
+}
+
static ppc_inst_t
ftrace_call_replace(unsigned long ip, unsigned long addr, int link)
{
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index 1c5970df3233..f420df7888a7 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -281,7 +281,9 @@ SECTIONS
* to deal with references from __bug_table
*/
.exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
+ __exittext_begin = .;
EXIT_TEXT
+ __exittext_end = .;
}
. = ALIGN(PAGE_SIZE);
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x aad98efd0b121f63a2e1c221dcb4d4850128c697
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021931-venue-await-dbc3@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
aad98efd0b12 ("powerpc/64: Set task pt_regs->link to the LR value on scv entry")
e754f4d13e39 ("powerpc/64: move interrupt return asm to interrupt_64.S")
59dc5bfca0cb ("powerpc/64s: avoid reloading (H)SRR registers if they are still valid")
1df7d5e4baea ("powerpc/64s: introduce different functions to return from SRR vs HSRR interrupts")
ac3d085368b3 ("powerpc/signal32: Remove impossible #ifdef combinations")
69d4d6e5fd9f ("powerpc: Don't use 'struct ppc_inst' to reference instruction location")
e90a21ea801d ("powerpc/lib/code-patching: Don't use struct 'ppc_inst' for runnable code in tests.")
6c0d181daabc ("powerpc/lib/code-patching: Make instr_is_branch_to_addr() static")
18c85964b10b ("powerpc: Do not dereference code as 'struct ppc_inst' (uprobe, code-patching, feature-fixups)")
f30becb5e9ec ("powerpc: Replace PPC_INST_NOP by PPC_RAW_NOP()")
ef909ba95414 ("powerpc/lib/feature-fixups: Use PPC_RAW_xxx() macros")
5a03e1e9728e ("powerpc/ftrace: Use PPC_RAW_MFLR() and PPC_RAW_NOP()")
e73045975601 ("powerpc/security: Use PPC_RAW_BLR() and PPC_RAW_NOP()")
47b04699d070 ("powerpc/modules: Use PPC_RAW_xx() macros")
1c9debbc2eb5 ("powerpc/signal: Use PPC_RAW_xx() macros")
82123a3d1d5a ("powerpc/kprobes: Fix validation of prefixed instructions across page boundary")
d72500f99284 ("powerpc/64s/syscall: Fix ptrace syscall info with scv syscalls")
5b48ba2fbd77 ("powerpc/64s: Fix stf mitigation patching w/strict RWX & hash")
49b39ec248af ("powerpc/64s: Fix entry flush patching w/strict RWX & hash")
2c8c89b95831 ("powerpc/pseries: Fix hcall tracing recursion in pv queued spinlocks")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From aad98efd0b121f63a2e1c221dcb4d4850128c697 Mon Sep 17 00:00:00 2001
From: Naveen N Rao <naveen(a)kernel.org>
Date: Fri, 2 Feb 2024 21:13:16 +0530
Subject: [PATCH] powerpc/64: Set task pt_regs->link to the LR value on scv
entry
Nysal reported that userspace backtraces are missing in offcputime bcc
tool. As an example:
$ sudo ./bcc/tools/offcputime.py -uU
Tracing off-CPU time (us) of user threads by user stack... Hit Ctrl-C to end.
^C
write
- python (9107)
8
write
- sudo (9105)
9
mmap
- python (9107)
16
clock_nanosleep
- multipathd (697)
3001604
The offcputime bcc tool attaches a bpf program to a kprobe on
finish_task_switch(), which is usually hit on a syscall from userspace.
With the switch to system call vectored, we started setting
pt_regs->link to zero. This is because system call vectored behaves like
a function call with LR pointing to the system call return address, and
with no modification to SRR0/SRR1. The LR value does indicate our next
instruction, so it is being saved as pt_regs->nip, and pt_regs->link is
being set to zero. This is not a problem by itself, but BPF uses perf
callchain infrastructure for capturing stack traces, and that stores LR
as the second entry in the stack trace. perf has code to cope with the
second entry being zero, and skips over it. However, generic userspace
unwinders assume that a zero entry indicates end of the stack trace,
resulting in a truncated userspace stack trace.
Rather than fixing all userspace unwinders to ignore/skip past the
second entry, store the real LR value in pt_regs->link so that there
continues to be a valid, though duplicate, entry in the stack trace.
With this change:
$ sudo ./bcc/tools/offcputime.py -uU
Tracing off-CPU time (us) of user threads by user stack... Hit Ctrl-C to end.
^C
write
write
[unknown]
[unknown]
[unknown]
[unknown]
[unknown]
PyObject_VectorcallMethod
[unknown]
[unknown]
PyObject_CallOneArg
PyFile_WriteObject
PyFile_WriteString
[unknown]
[unknown]
PyObject_Vectorcall
_PyEval_EvalFrameDefault
PyEval_EvalCode
[unknown]
[unknown]
[unknown]
_PyRun_SimpleFileObject
_PyRun_AnyFileObject
Py_RunMain
[unknown]
Py_BytesMain
[unknown]
__libc_start_main
- python (1293)
7
write
write
[unknown]
sudo_ev_loop_v1
sudo_ev_dispatch_v1
[unknown]
[unknown]
[unknown]
[unknown]
__libc_start_main
- sudo (1291)
7
syscall
syscall
bpf_open_perf_buffer_opts
[unknown]
[unknown]
[unknown]
[unknown]
_PyObject_MakeTpCall
PyObject_Vectorcall
_PyEval_EvalFrameDefault
PyEval_EvalCode
[unknown]
[unknown]
[unknown]
_PyRun_SimpleFileObject
_PyRun_AnyFileObject
Py_RunMain
[unknown]
Py_BytesMain
[unknown]
__libc_start_main
- python (1293)
11
clock_nanosleep
clock_nanosleep
nanosleep
sleep
[unknown]
[unknown]
__clone
- multipathd (698)
3001661
Fixes: 7fa95f9adaee ("powerpc/64s: system call support for scv/rfscv instructions")
Cc: stable(a)vger.kernel.org
Reported-by: "Nysal Jan K.A" <nysal(a)linux.ibm.com>
Signed-off-by: Naveen N Rao <naveen(a)kernel.org>
Signed-off-by: Michael Ellerman <mpe(a)ellerman.id.au>
Link: https://msgid.link/20240202154316.395276-1-naveen@kernel.org
diff --git a/arch/powerpc/kernel/interrupt_64.S b/arch/powerpc/kernel/interrupt_64.S
index bd863702d812..1ad059a9e2fe 100644
--- a/arch/powerpc/kernel/interrupt_64.S
+++ b/arch/powerpc/kernel/interrupt_64.S
@@ -52,7 +52,8 @@ _ASM_NOKPROBE_SYMBOL(system_call_vectored_\name)
mr r10,r1
ld r1,PACAKSAVE(r13)
std r10,0(r1)
- std r11,_NIP(r1)
+ std r11,_LINK(r1)
+ std r11,_NIP(r1) /* Saved LR is also the next instruction */
std r12,_MSR(r1)
std r0,GPR0(r1)
std r10,GPR1(r1)
@@ -70,7 +71,6 @@ _ASM_NOKPROBE_SYMBOL(system_call_vectored_\name)
std r9,GPR13(r1)
SAVE_NVGPRS(r1)
std r11,_XER(r1)
- std r11,_LINK(r1)
std r11,_CTR(r1)
li r11,\trapnr
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x ea73179e64131bcd29ba6defd33732abdf8ca14b
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021901-stallion-swaddling-5833@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
ea73179e6413 ("powerpc/ftrace: Ignore ftrace locations in exit text sections")
2ec36570c358 ("powerpc/ftrace: Fix indentation in ftrace.h")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From ea73179e64131bcd29ba6defd33732abdf8ca14b Mon Sep 17 00:00:00 2001
From: Naveen N Rao <naveen(a)kernel.org>
Date: Tue, 13 Feb 2024 23:24:10 +0530
Subject: [PATCH] powerpc/ftrace: Ignore ftrace locations in exit text sections
Michael reported that we are seeing an ftrace bug on bootup when KASAN
is enabled and we are using -fpatchable-function-entry:
ftrace: allocating 47780 entries in 18 pages
ftrace-powerpc: 0xc0000000020b3d5c: No module provided for non-kernel address
------------[ ftrace bug ]------------
ftrace faulted on modifying
[<c0000000020b3d5c>] 0xc0000000020b3d5c
Initializing ftrace call sites
ftrace record flags: 0
(0)
expected tramp: c00000000008cef4
------------[ cut here ]------------
WARNING: CPU: 0 PID: 0 at kernel/trace/ftrace.c:2180 ftrace_bug+0x3c0/0x424
Modules linked in:
CPU: 0 PID: 0 Comm: swapper Not tainted 6.5.0-rc3-00120-g0f71dcfb4aef #860
Hardware name: IBM pSeries (emulated by qemu) POWER9 (raw) 0x4e1202 0xf000005 of:SLOF,HEAD hv:linux,kvm pSeries
NIP: c0000000003aa81c LR: c0000000003aa818 CTR: 0000000000000000
REGS: c0000000033cfab0 TRAP: 0700 Not tainted (6.5.0-rc3-00120-g0f71dcfb4aef)
MSR: 8000000002021033 <SF,VEC,ME,IR,DR,RI,LE> CR: 28028240 XER: 00000000
CFAR: c0000000002781a8 IRQMASK: 3
...
NIP [c0000000003aa81c] ftrace_bug+0x3c0/0x424
LR [c0000000003aa818] ftrace_bug+0x3bc/0x424
Call Trace:
ftrace_bug+0x3bc/0x424 (unreliable)
ftrace_process_locs+0x5f4/0x8a0
ftrace_init+0xc0/0x1d0
start_kernel+0x1d8/0x484
With CONFIG_FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY=y and
CONFIG_KASAN=y, compiler emits nops in functions that it generates for
registering and unregistering global variables (unlike with -pg and
-mprofile-kernel where calls to _mcount() are not generated in those
functions). Those functions then end up in INIT_TEXT and EXIT_TEXT
respectively. We don't expect to see any profiled functions in
EXIT_TEXT, so ftrace_init_nop() assumes that all addresses that aren't
in the core kernel text belong to a module. Since these functions do
not match that criterion, we see the above bug.
Address this by having ftrace ignore all locations in the text exit
sections of vmlinux.
Fixes: 0f71dcfb4aef ("powerpc/ftrace: Add support for -fpatchable-function-entry")
Cc: stable(a)vger.kernel.org # v6.6+
Reported-by: Michael Ellerman <mpe(a)ellerman.id.au>
Signed-off-by: Naveen N Rao <naveen(a)kernel.org>
Reviewed-by: Benjamin Gray <bgray(a)linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe(a)ellerman.id.au>
Link: https://msgid.link/20240213175410.1091313-1-naveen@kernel.org
diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h
index 1ebd2ca97f12..107fc5a48456 100644
--- a/arch/powerpc/include/asm/ftrace.h
+++ b/arch/powerpc/include/asm/ftrace.h
@@ -20,14 +20,6 @@
#ifndef __ASSEMBLY__
extern void _mcount(void);
-static inline unsigned long ftrace_call_adjust(unsigned long addr)
-{
- if (IS_ENABLED(CONFIG_ARCH_USING_PATCHABLE_FUNCTION_ENTRY))
- addr += MCOUNT_INSN_SIZE;
-
- return addr;
-}
-
unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip,
unsigned long sp);
@@ -142,8 +134,10 @@ static inline u8 this_cpu_get_ftrace_enabled(void) { return 1; }
#ifdef CONFIG_FUNCTION_TRACER
extern unsigned int ftrace_tramp_text[], ftrace_tramp_init[];
void ftrace_free_init_tramp(void);
+unsigned long ftrace_call_adjust(unsigned long addr);
#else
static inline void ftrace_free_init_tramp(void) { }
+static inline unsigned long ftrace_call_adjust(unsigned long addr) { return addr; }
#endif
#endif /* !__ASSEMBLY__ */
diff --git a/arch/powerpc/include/asm/sections.h b/arch/powerpc/include/asm/sections.h
index ea26665f82cf..f43f3a6b0051 100644
--- a/arch/powerpc/include/asm/sections.h
+++ b/arch/powerpc/include/asm/sections.h
@@ -14,6 +14,7 @@ typedef struct func_desc func_desc_t;
extern char __head_end[];
extern char __srwx_boundary[];
+extern char __exittext_begin[], __exittext_end[];
/* Patch sites */
extern s32 patch__call_flush_branch_caches1;
diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c
index 82010629cf88..d8d6b4fd9a14 100644
--- a/arch/powerpc/kernel/trace/ftrace.c
+++ b/arch/powerpc/kernel/trace/ftrace.c
@@ -27,10 +27,22 @@
#include <asm/ftrace.h>
#include <asm/syscall.h>
#include <asm/inst.h>
+#include <asm/sections.h>
#define NUM_FTRACE_TRAMPS 2
static unsigned long ftrace_tramps[NUM_FTRACE_TRAMPS];
+unsigned long ftrace_call_adjust(unsigned long addr)
+{
+ if (addr >= (unsigned long)__exittext_begin && addr < (unsigned long)__exittext_end)
+ return 0;
+
+ if (IS_ENABLED(CONFIG_ARCH_USING_PATCHABLE_FUNCTION_ENTRY))
+ addr += MCOUNT_INSN_SIZE;
+
+ return addr;
+}
+
static ppc_inst_t ftrace_create_branch_inst(unsigned long ip, unsigned long addr, int link)
{
ppc_inst_t op;
diff --git a/arch/powerpc/kernel/trace/ftrace_64_pg.c b/arch/powerpc/kernel/trace/ftrace_64_pg.c
index 7b85c3b460a3..12fab1803bcf 100644
--- a/arch/powerpc/kernel/trace/ftrace_64_pg.c
+++ b/arch/powerpc/kernel/trace/ftrace_64_pg.c
@@ -37,6 +37,11 @@
#define NUM_FTRACE_TRAMPS 8
static unsigned long ftrace_tramps[NUM_FTRACE_TRAMPS];
+unsigned long ftrace_call_adjust(unsigned long addr)
+{
+ return addr;
+}
+
static ppc_inst_t
ftrace_call_replace(unsigned long ip, unsigned long addr, int link)
{
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index 1c5970df3233..f420df7888a7 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -281,7 +281,9 @@ SECTIONS
* to deal with references from __bug_table
*/
.exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
+ __exittext_begin = .;
EXIT_TEXT
+ __exittext_end = .;
}
. = ALIGN(PAGE_SIZE);
The patch below does not apply to the 6.7-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.7.y
git checkout FETCH_HEAD
git cherry-pick -x ea73179e64131bcd29ba6defd33732abdf8ca14b
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021900-amusement-national-c29f@gregkh' --subject-prefix 'PATCH 6.7.y' HEAD^..
Possible dependencies:
ea73179e6413 ("powerpc/ftrace: Ignore ftrace locations in exit text sections")
2ec36570c358 ("powerpc/ftrace: Fix indentation in ftrace.h")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From ea73179e64131bcd29ba6defd33732abdf8ca14b Mon Sep 17 00:00:00 2001
From: Naveen N Rao <naveen(a)kernel.org>
Date: Tue, 13 Feb 2024 23:24:10 +0530
Subject: [PATCH] powerpc/ftrace: Ignore ftrace locations in exit text sections
Michael reported that we are seeing an ftrace bug on bootup when KASAN
is enabled and we are using -fpatchable-function-entry:
ftrace: allocating 47780 entries in 18 pages
ftrace-powerpc: 0xc0000000020b3d5c: No module provided for non-kernel address
------------[ ftrace bug ]------------
ftrace faulted on modifying
[<c0000000020b3d5c>] 0xc0000000020b3d5c
Initializing ftrace call sites
ftrace record flags: 0
(0)
expected tramp: c00000000008cef4
------------[ cut here ]------------
WARNING: CPU: 0 PID: 0 at kernel/trace/ftrace.c:2180 ftrace_bug+0x3c0/0x424
Modules linked in:
CPU: 0 PID: 0 Comm: swapper Not tainted 6.5.0-rc3-00120-g0f71dcfb4aef #860
Hardware name: IBM pSeries (emulated by qemu) POWER9 (raw) 0x4e1202 0xf000005 of:SLOF,HEAD hv:linux,kvm pSeries
NIP: c0000000003aa81c LR: c0000000003aa818 CTR: 0000000000000000
REGS: c0000000033cfab0 TRAP: 0700 Not tainted (6.5.0-rc3-00120-g0f71dcfb4aef)
MSR: 8000000002021033 <SF,VEC,ME,IR,DR,RI,LE> CR: 28028240 XER: 00000000
CFAR: c0000000002781a8 IRQMASK: 3
...
NIP [c0000000003aa81c] ftrace_bug+0x3c0/0x424
LR [c0000000003aa818] ftrace_bug+0x3bc/0x424
Call Trace:
ftrace_bug+0x3bc/0x424 (unreliable)
ftrace_process_locs+0x5f4/0x8a0
ftrace_init+0xc0/0x1d0
start_kernel+0x1d8/0x484
With CONFIG_FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY=y and
CONFIG_KASAN=y, compiler emits nops in functions that it generates for
registering and unregistering global variables (unlike with -pg and
-mprofile-kernel where calls to _mcount() are not generated in those
functions). Those functions then end up in INIT_TEXT and EXIT_TEXT
respectively. We don't expect to see any profiled functions in
EXIT_TEXT, so ftrace_init_nop() assumes that all addresses that aren't
in the core kernel text belong to a module. Since these functions do
not match that criterion, we see the above bug.
Address this by having ftrace ignore all locations in the text exit
sections of vmlinux.
Fixes: 0f71dcfb4aef ("powerpc/ftrace: Add support for -fpatchable-function-entry")
Cc: stable(a)vger.kernel.org # v6.6+
Reported-by: Michael Ellerman <mpe(a)ellerman.id.au>
Signed-off-by: Naveen N Rao <naveen(a)kernel.org>
Reviewed-by: Benjamin Gray <bgray(a)linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe(a)ellerman.id.au>
Link: https://msgid.link/20240213175410.1091313-1-naveen@kernel.org
diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h
index 1ebd2ca97f12..107fc5a48456 100644
--- a/arch/powerpc/include/asm/ftrace.h
+++ b/arch/powerpc/include/asm/ftrace.h
@@ -20,14 +20,6 @@
#ifndef __ASSEMBLY__
extern void _mcount(void);
-static inline unsigned long ftrace_call_adjust(unsigned long addr)
-{
- if (IS_ENABLED(CONFIG_ARCH_USING_PATCHABLE_FUNCTION_ENTRY))
- addr += MCOUNT_INSN_SIZE;
-
- return addr;
-}
-
unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip,
unsigned long sp);
@@ -142,8 +134,10 @@ static inline u8 this_cpu_get_ftrace_enabled(void) { return 1; }
#ifdef CONFIG_FUNCTION_TRACER
extern unsigned int ftrace_tramp_text[], ftrace_tramp_init[];
void ftrace_free_init_tramp(void);
+unsigned long ftrace_call_adjust(unsigned long addr);
#else
static inline void ftrace_free_init_tramp(void) { }
+static inline unsigned long ftrace_call_adjust(unsigned long addr) { return addr; }
#endif
#endif /* !__ASSEMBLY__ */
diff --git a/arch/powerpc/include/asm/sections.h b/arch/powerpc/include/asm/sections.h
index ea26665f82cf..f43f3a6b0051 100644
--- a/arch/powerpc/include/asm/sections.h
+++ b/arch/powerpc/include/asm/sections.h
@@ -14,6 +14,7 @@ typedef struct func_desc func_desc_t;
extern char __head_end[];
extern char __srwx_boundary[];
+extern char __exittext_begin[], __exittext_end[];
/* Patch sites */
extern s32 patch__call_flush_branch_caches1;
diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c
index 82010629cf88..d8d6b4fd9a14 100644
--- a/arch/powerpc/kernel/trace/ftrace.c
+++ b/arch/powerpc/kernel/trace/ftrace.c
@@ -27,10 +27,22 @@
#include <asm/ftrace.h>
#include <asm/syscall.h>
#include <asm/inst.h>
+#include <asm/sections.h>
#define NUM_FTRACE_TRAMPS 2
static unsigned long ftrace_tramps[NUM_FTRACE_TRAMPS];
+unsigned long ftrace_call_adjust(unsigned long addr)
+{
+ if (addr >= (unsigned long)__exittext_begin && addr < (unsigned long)__exittext_end)
+ return 0;
+
+ if (IS_ENABLED(CONFIG_ARCH_USING_PATCHABLE_FUNCTION_ENTRY))
+ addr += MCOUNT_INSN_SIZE;
+
+ return addr;
+}
+
static ppc_inst_t ftrace_create_branch_inst(unsigned long ip, unsigned long addr, int link)
{
ppc_inst_t op;
diff --git a/arch/powerpc/kernel/trace/ftrace_64_pg.c b/arch/powerpc/kernel/trace/ftrace_64_pg.c
index 7b85c3b460a3..12fab1803bcf 100644
--- a/arch/powerpc/kernel/trace/ftrace_64_pg.c
+++ b/arch/powerpc/kernel/trace/ftrace_64_pg.c
@@ -37,6 +37,11 @@
#define NUM_FTRACE_TRAMPS 8
static unsigned long ftrace_tramps[NUM_FTRACE_TRAMPS];
+unsigned long ftrace_call_adjust(unsigned long addr)
+{
+ return addr;
+}
+
static ppc_inst_t
ftrace_call_replace(unsigned long ip, unsigned long addr, int link)
{
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index 1c5970df3233..f420df7888a7 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -281,7 +281,9 @@ SECTIONS
* to deal with references from __bug_table
*/
.exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
+ __exittext_begin = .;
EXIT_TEXT
+ __exittext_end = .;
}
. = ALIGN(PAGE_SIZE);
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x b35f8dbbce818b02c730dc85133dc7754266e084
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021931-frisbee-commence-08ec@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
b35f8dbbce81 ("serial: max310x: prevent infinite while() loop in port startup")
93cd256ab224 ("serial: max310x: improve crystal stable clock detection")
0419373333c2 ("serial: max310x: set default value when reading clock ready bit")
6ef281daf020 ("serial: max310x: use a separate regmap for each port")
285e76fc049c ("serial: max310x: use regmap methods for SPI batch operations")
c808fab604ca ("serial: max310x: Make use of device properties")
b7382c73b2d7 ("tty: max310x: Don't pass stacked buffers to SPI")
3a10e3dd52e8 ("serial: max310x: Fix to avoid potential NULL pointer dereference")
f233ea4327d7 ("serial: max310x: Correction of the initial setting of the MODE1 bits for various supported ICs.")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b35f8dbbce818b02c730dc85133dc7754266e084 Mon Sep 17 00:00:00 2001
From: Hugo Villeneuve <hvilleneuve(a)dimonoff.com>
Date: Tue, 16 Jan 2024 16:30:01 -0500
Subject: [PATCH] serial: max310x: prevent infinite while() loop in port
startup
If there is a problem after resetting a port, the do/while() loop that
checks the default value of DIVLSB register may run forever and spam the
I2C bus.
Add a delay before each read of DIVLSB, and a maximum number of tries to
prevent that situation from happening.
Also fail probe if port reset is unsuccessful.
Fixes: 10d8b34a4217 ("serial: max310x: Driver rework")
Cc: stable(a)vger.kernel.org
Signed-off-by: Hugo Villeneuve <hvilleneuve(a)dimonoff.com>
Link: https://lore.kernel.org/r/20240116213001.3691629-5-hugo@hugovil.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/tty/serial/max310x.c b/drivers/tty/serial/max310x.c
index 552e153a24e0..10bf6d75bf9e 100644
--- a/drivers/tty/serial/max310x.c
+++ b/drivers/tty/serial/max310x.c
@@ -237,6 +237,10 @@
#define MAX310x_REV_MASK (0xf8)
#define MAX310X_WRITE_BIT 0x80
+/* Port startup definitions */
+#define MAX310X_PORT_STARTUP_WAIT_RETRIES 20 /* Number of retries */
+#define MAX310X_PORT_STARTUP_WAIT_DELAY_MS 10 /* Delay between retries */
+
/* Crystal-related definitions */
#define MAX310X_XTAL_WAIT_RETRIES 20 /* Number of retries */
#define MAX310X_XTAL_WAIT_DELAY_MS 10 /* Delay between retries */
@@ -1346,6 +1350,9 @@ static int max310x_probe(struct device *dev, const struct max310x_devtype *devty
goto out_clk;
for (i = 0; i < devtype->nr; i++) {
+ bool started = false;
+ unsigned int try = 0, val = 0;
+
/* Reset port */
regmap_write(regmaps[i], MAX310X_MODE2_REG,
MAX310X_MODE2_RST_BIT);
@@ -1354,8 +1361,17 @@ static int max310x_probe(struct device *dev, const struct max310x_devtype *devty
/* Wait for port startup */
do {
- regmap_read(regmaps[i], MAX310X_BRGDIVLSB_REG, &ret);
- } while (ret != 0x01);
+ msleep(MAX310X_PORT_STARTUP_WAIT_DELAY_MS);
+ regmap_read(regmaps[i], MAX310X_BRGDIVLSB_REG, &val);
+
+ if (val == 0x01)
+ started = true;
+ } while (!started && (++try < MAX310X_PORT_STARTUP_WAIT_RETRIES));
+
+ if (!started) {
+ ret = dev_err_probe(dev, -EAGAIN, "port reset failed\n");
+ goto out_uart;
+ }
regmap_write(regmaps[i], MAX310X_MODE1_REG, devtype->mode1);
}
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x b35f8dbbce818b02c730dc85133dc7754266e084
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021930-subprime-fondue-f8d3@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
b35f8dbbce81 ("serial: max310x: prevent infinite while() loop in port startup")
93cd256ab224 ("serial: max310x: improve crystal stable clock detection")
0419373333c2 ("serial: max310x: set default value when reading clock ready bit")
6ef281daf020 ("serial: max310x: use a separate regmap for each port")
285e76fc049c ("serial: max310x: use regmap methods for SPI batch operations")
c808fab604ca ("serial: max310x: Make use of device properties")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b35f8dbbce818b02c730dc85133dc7754266e084 Mon Sep 17 00:00:00 2001
From: Hugo Villeneuve <hvilleneuve(a)dimonoff.com>
Date: Tue, 16 Jan 2024 16:30:01 -0500
Subject: [PATCH] serial: max310x: prevent infinite while() loop in port
startup
If there is a problem after resetting a port, the do/while() loop that
checks the default value of the DIVLSB register may run forever and spam
the I2C bus.
Add a delay before each read of DIVLSB, and a maximum number of tries to
prevent that situation from happening.
Also fail probe if port reset is unsuccessful.
Fixes: 10d8b34a4217 ("serial: max310x: Driver rework")
Cc: stable(a)vger.kernel.org
Signed-off-by: Hugo Villeneuve <hvilleneuve(a)dimonoff.com>
Link: https://lore.kernel.org/r/20240116213001.3691629-5-hugo@hugovil.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/tty/serial/max310x.c b/drivers/tty/serial/max310x.c
index 552e153a24e0..10bf6d75bf9e 100644
--- a/drivers/tty/serial/max310x.c
+++ b/drivers/tty/serial/max310x.c
@@ -237,6 +237,10 @@
#define MAX310x_REV_MASK (0xf8)
#define MAX310X_WRITE_BIT 0x80
+/* Port startup definitions */
+#define MAX310X_PORT_STARTUP_WAIT_RETRIES 20 /* Number of retries */
+#define MAX310X_PORT_STARTUP_WAIT_DELAY_MS 10 /* Delay between retries */
+
/* Crystal-related definitions */
#define MAX310X_XTAL_WAIT_RETRIES 20 /* Number of retries */
#define MAX310X_XTAL_WAIT_DELAY_MS 10 /* Delay between retries */
@@ -1346,6 +1350,9 @@ static int max310x_probe(struct device *dev, const struct max310x_devtype *devty
goto out_clk;
for (i = 0; i < devtype->nr; i++) {
+ bool started = false;
+ unsigned int try = 0, val = 0;
+
/* Reset port */
regmap_write(regmaps[i], MAX310X_MODE2_REG,
MAX310X_MODE2_RST_BIT);
@@ -1354,8 +1361,17 @@ static int max310x_probe(struct device *dev, const struct max310x_devtype *devty
/* Wait for port startup */
do {
- regmap_read(regmaps[i], MAX310X_BRGDIVLSB_REG, &ret);
- } while (ret != 0x01);
+ msleep(MAX310X_PORT_STARTUP_WAIT_DELAY_MS);
+ regmap_read(regmaps[i], MAX310X_BRGDIVLSB_REG, &val);
+
+ if (val == 0x01)
+ started = true;
+ } while (!started && (++try < MAX310X_PORT_STARTUP_WAIT_RETRIES));
+
+ if (!started) {
+ ret = dev_err_probe(dev, -EAGAIN, "port reset failed\n");
+ goto out_uart;
+ }
regmap_write(regmaps[i], MAX310X_MODE1_REG, devtype->mode1);
}
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x b35f8dbbce818b02c730dc85133dc7754266e084
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021929-cosponsor-flick-22e3@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
b35f8dbbce81 ("serial: max310x: prevent infinite while() loop in port startup")
93cd256ab224 ("serial: max310x: improve crystal stable clock detection")
0419373333c2 ("serial: max310x: set default value when reading clock ready bit")
6ef281daf020 ("serial: max310x: use a separate regmap for each port")
285e76fc049c ("serial: max310x: use regmap methods for SPI batch operations")
c808fab604ca ("serial: max310x: Make use of device properties")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b35f8dbbce818b02c730dc85133dc7754266e084 Mon Sep 17 00:00:00 2001
From: Hugo Villeneuve <hvilleneuve(a)dimonoff.com>
Date: Tue, 16 Jan 2024 16:30:01 -0500
Subject: [PATCH] serial: max310x: prevent infinite while() loop in port
startup
If there is a problem after resetting a port, the do/while() loop that
checks the default value of the DIVLSB register may run forever and spam
the I2C bus.
Add a delay before each read of DIVLSB, and a maximum number of tries to
prevent that situation from happening.
Also fail probe if port reset is unsuccessful.
Fixes: 10d8b34a4217 ("serial: max310x: Driver rework")
Cc: stable(a)vger.kernel.org
Signed-off-by: Hugo Villeneuve <hvilleneuve(a)dimonoff.com>
Link: https://lore.kernel.org/r/20240116213001.3691629-5-hugo@hugovil.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/tty/serial/max310x.c b/drivers/tty/serial/max310x.c
index 552e153a24e0..10bf6d75bf9e 100644
--- a/drivers/tty/serial/max310x.c
+++ b/drivers/tty/serial/max310x.c
@@ -237,6 +237,10 @@
#define MAX310x_REV_MASK (0xf8)
#define MAX310X_WRITE_BIT 0x80
+/* Port startup definitions */
+#define MAX310X_PORT_STARTUP_WAIT_RETRIES 20 /* Number of retries */
+#define MAX310X_PORT_STARTUP_WAIT_DELAY_MS 10 /* Delay between retries */
+
/* Crystal-related definitions */
#define MAX310X_XTAL_WAIT_RETRIES 20 /* Number of retries */
#define MAX310X_XTAL_WAIT_DELAY_MS 10 /* Delay between retries */
@@ -1346,6 +1350,9 @@ static int max310x_probe(struct device *dev, const struct max310x_devtype *devty
goto out_clk;
for (i = 0; i < devtype->nr; i++) {
+ bool started = false;
+ unsigned int try = 0, val = 0;
+
/* Reset port */
regmap_write(regmaps[i], MAX310X_MODE2_REG,
MAX310X_MODE2_RST_BIT);
@@ -1354,8 +1361,17 @@ static int max310x_probe(struct device *dev, const struct max310x_devtype *devty
/* Wait for port startup */
do {
- regmap_read(regmaps[i], MAX310X_BRGDIVLSB_REG, &ret);
- } while (ret != 0x01);
+ msleep(MAX310X_PORT_STARTUP_WAIT_DELAY_MS);
+ regmap_read(regmaps[i], MAX310X_BRGDIVLSB_REG, &val);
+
+ if (val == 0x01)
+ started = true;
+ } while (!started && (++try < MAX310X_PORT_STARTUP_WAIT_RETRIES));
+
+ if (!started) {
+ ret = dev_err_probe(dev, -EAGAIN, "port reset failed\n");
+ goto out_uart;
+ }
regmap_write(regmaps[i], MAX310X_MODE1_REG, devtype->mode1);
}
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x b35f8dbbce818b02c730dc85133dc7754266e084
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021928-flanked-outboard-6a4c@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
b35f8dbbce81 ("serial: max310x: prevent infinite while() loop in port startup")
93cd256ab224 ("serial: max310x: improve crystal stable clock detection")
0419373333c2 ("serial: max310x: set default value when reading clock ready bit")
6ef281daf020 ("serial: max310x: use a separate regmap for each port")
285e76fc049c ("serial: max310x: use regmap methods for SPI batch operations")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b35f8dbbce818b02c730dc85133dc7754266e084 Mon Sep 17 00:00:00 2001
From: Hugo Villeneuve <hvilleneuve(a)dimonoff.com>
Date: Tue, 16 Jan 2024 16:30:01 -0500
Subject: [PATCH] serial: max310x: prevent infinite while() loop in port
startup
If there is a problem after resetting a port, the do/while() loop that
checks the default value of the DIVLSB register may run forever and spam
the I2C bus.
Add a delay before each read of DIVLSB, and a maximum number of tries to
prevent that situation from happening.
Also fail probe if port reset is unsuccessful.
Fixes: 10d8b34a4217 ("serial: max310x: Driver rework")
Cc: stable(a)vger.kernel.org
Signed-off-by: Hugo Villeneuve <hvilleneuve(a)dimonoff.com>
Link: https://lore.kernel.org/r/20240116213001.3691629-5-hugo@hugovil.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/tty/serial/max310x.c b/drivers/tty/serial/max310x.c
index 552e153a24e0..10bf6d75bf9e 100644
--- a/drivers/tty/serial/max310x.c
+++ b/drivers/tty/serial/max310x.c
@@ -237,6 +237,10 @@
#define MAX310x_REV_MASK (0xf8)
#define MAX310X_WRITE_BIT 0x80
+/* Port startup definitions */
+#define MAX310X_PORT_STARTUP_WAIT_RETRIES 20 /* Number of retries */
+#define MAX310X_PORT_STARTUP_WAIT_DELAY_MS 10 /* Delay between retries */
+
/* Crystal-related definitions */
#define MAX310X_XTAL_WAIT_RETRIES 20 /* Number of retries */
#define MAX310X_XTAL_WAIT_DELAY_MS 10 /* Delay between retries */
@@ -1346,6 +1350,9 @@ static int max310x_probe(struct device *dev, const struct max310x_devtype *devty
goto out_clk;
for (i = 0; i < devtype->nr; i++) {
+ bool started = false;
+ unsigned int try = 0, val = 0;
+
/* Reset port */
regmap_write(regmaps[i], MAX310X_MODE2_REG,
MAX310X_MODE2_RST_BIT);
@@ -1354,8 +1361,17 @@ static int max310x_probe(struct device *dev, const struct max310x_devtype *devty
/* Wait for port startup */
do {
- regmap_read(regmaps[i], MAX310X_BRGDIVLSB_REG, &ret);
- } while (ret != 0x01);
+ msleep(MAX310X_PORT_STARTUP_WAIT_DELAY_MS);
+ regmap_read(regmaps[i], MAX310X_BRGDIVLSB_REG, &val);
+
+ if (val == 0x01)
+ started = true;
+ } while (!started && (++try < MAX310X_PORT_STARTUP_WAIT_RETRIES));
+
+ if (!started) {
+ ret = dev_err_probe(dev, -EAGAIN, "port reset failed\n");
+ goto out_uart;
+ }
regmap_write(regmaps[i], MAX310X_MODE1_REG, devtype->mode1);
}
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x 8afa6c6decea37e7cb473d2c60473f37f46cea35
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021913-outmatch-nuclear-240b@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
8afa6c6decea ("serial: max310x: fail probe if clock crystal is unstable")
93cd256ab224 ("serial: max310x: improve crystal stable clock detection")
0419373333c2 ("serial: max310x: set default value when reading clock ready bit")
d4d6f03c4fb3 ("serial: max310x: Try to get crystal clock rate from property")
974e454d6f96 ("serial: max310x: Use devm_clk_get_optional() to get the input clock")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 8afa6c6decea37e7cb473d2c60473f37f46cea35 Mon Sep 17 00:00:00 2001
From: Hugo Villeneuve <hvilleneuve(a)dimonoff.com>
Date: Tue, 16 Jan 2024 16:30:00 -0500
Subject: [PATCH] serial: max310x: fail probe if clock crystal is unstable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
A stable clock is really required in order to use this UART, so log an
error message and bail out if the chip reports that the clock is not
stable.
Fixes: 4cf9a888fd3c ("serial: max310x: Check the clock readiness")
Cc: stable(a)vger.kernel.org
Suggested-by: Jan Kundrát <jan.kundrat(a)cesnet.cz>
Link: https://www.spinics.net/lists/linux-serial/msg35773.html
Signed-off-by: Hugo Villeneuve <hvilleneuve(a)dimonoff.com>
Link: https://lore.kernel.org/r/20240116213001.3691629-4-hugo@hugovil.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/tty/serial/max310x.c b/drivers/tty/serial/max310x.c
index c0eb0615d945..552e153a24e0 100644
--- a/drivers/tty/serial/max310x.c
+++ b/drivers/tty/serial/max310x.c
@@ -587,7 +587,7 @@ static int max310x_update_best_err(unsigned long f, long *besterr)
return 1;
}
-static u32 max310x_set_ref_clk(struct device *dev, struct max310x_port *s,
+static s32 max310x_set_ref_clk(struct device *dev, struct max310x_port *s,
unsigned long freq, bool xtal)
{
unsigned int div, clksrc, pllcfg = 0;
@@ -657,7 +657,8 @@ static u32 max310x_set_ref_clk(struct device *dev, struct max310x_port *s,
} while (!stable && (++try < MAX310X_XTAL_WAIT_RETRIES));
if (!stable)
- dev_warn(dev, "clock is not stable yet\n");
+ return dev_err_probe(dev, -EAGAIN,
+ "clock is not stable\n");
}
return bestfreq;
@@ -1282,7 +1283,7 @@ static int max310x_probe(struct device *dev, const struct max310x_devtype *devty
{
int i, ret, fmin, fmax, freq;
struct max310x_port *s;
- u32 uartclk = 0;
+ s32 uartclk = 0;
bool xtal;
for (i = 0; i < devtype->nr; i++)
@@ -1360,6 +1361,11 @@ static int max310x_probe(struct device *dev, const struct max310x_devtype *devty
}
uartclk = max310x_set_ref_clk(dev, s, freq, xtal);
+ if (uartclk < 0) {
+ ret = uartclk;
+ goto out_uart;
+ }
+
dev_dbg(dev, "Reference clock set to %i Hz\n", uartclk);
for (i = 0; i < devtype->nr; i++) {
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 8afa6c6decea37e7cb473d2c60473f37f46cea35
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021912-contend-garter-a81d@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
8afa6c6decea ("serial: max310x: fail probe if clock crystal is unstable")
93cd256ab224 ("serial: max310x: improve crystal stable clock detection")
0419373333c2 ("serial: max310x: set default value when reading clock ready bit")
d4d6f03c4fb3 ("serial: max310x: Try to get crystal clock rate from property")
974e454d6f96 ("serial: max310x: Use devm_clk_get_optional() to get the input clock")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 8afa6c6decea37e7cb473d2c60473f37f46cea35 Mon Sep 17 00:00:00 2001
From: Hugo Villeneuve <hvilleneuve(a)dimonoff.com>
Date: Tue, 16 Jan 2024 16:30:00 -0500
Subject: [PATCH] serial: max310x: fail probe if clock crystal is unstable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
A stable clock is really required in order to use this UART, so log an
error message and bail out if the chip reports that the clock is not
stable.
Fixes: 4cf9a888fd3c ("serial: max310x: Check the clock readiness")
Cc: stable(a)vger.kernel.org
Suggested-by: Jan Kundrát <jan.kundrat(a)cesnet.cz>
Link: https://www.spinics.net/lists/linux-serial/msg35773.html
Signed-off-by: Hugo Villeneuve <hvilleneuve(a)dimonoff.com>
Link: https://lore.kernel.org/r/20240116213001.3691629-4-hugo@hugovil.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/tty/serial/max310x.c b/drivers/tty/serial/max310x.c
index c0eb0615d945..552e153a24e0 100644
--- a/drivers/tty/serial/max310x.c
+++ b/drivers/tty/serial/max310x.c
@@ -587,7 +587,7 @@ static int max310x_update_best_err(unsigned long f, long *besterr)
return 1;
}
-static u32 max310x_set_ref_clk(struct device *dev, struct max310x_port *s,
+static s32 max310x_set_ref_clk(struct device *dev, struct max310x_port *s,
unsigned long freq, bool xtal)
{
unsigned int div, clksrc, pllcfg = 0;
@@ -657,7 +657,8 @@ static u32 max310x_set_ref_clk(struct device *dev, struct max310x_port *s,
} while (!stable && (++try < MAX310X_XTAL_WAIT_RETRIES));
if (!stable)
- dev_warn(dev, "clock is not stable yet\n");
+ return dev_err_probe(dev, -EAGAIN,
+ "clock is not stable\n");
}
return bestfreq;
@@ -1282,7 +1283,7 @@ static int max310x_probe(struct device *dev, const struct max310x_devtype *devty
{
int i, ret, fmin, fmax, freq;
struct max310x_port *s;
- u32 uartclk = 0;
+ s32 uartclk = 0;
bool xtal;
for (i = 0; i < devtype->nr; i++)
@@ -1360,6 +1361,11 @@ static int max310x_probe(struct device *dev, const struct max310x_devtype *devty
}
uartclk = max310x_set_ref_clk(dev, s, freq, xtal);
+ if (uartclk < 0) {
+ ret = uartclk;
+ goto out_uart;
+ }
+
dev_dbg(dev, "Reference clock set to %i Hz\n", uartclk);
for (i = 0; i < devtype->nr; i++) {
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 8afa6c6decea37e7cb473d2c60473f37f46cea35
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021911-shininess-bucket-c9fd@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
8afa6c6decea ("serial: max310x: fail probe if clock crystal is unstable")
93cd256ab224 ("serial: max310x: improve crystal stable clock detection")
0419373333c2 ("serial: max310x: set default value when reading clock ready bit")
d4d6f03c4fb3 ("serial: max310x: Try to get crystal clock rate from property")
974e454d6f96 ("serial: max310x: Use devm_clk_get_optional() to get the input clock")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 8afa6c6decea37e7cb473d2c60473f37f46cea35 Mon Sep 17 00:00:00 2001
From: Hugo Villeneuve <hvilleneuve(a)dimonoff.com>
Date: Tue, 16 Jan 2024 16:30:00 -0500
Subject: [PATCH] serial: max310x: fail probe if clock crystal is unstable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
A stable clock is really required in order to use this UART, so log an
error message and bail out if the chip reports that the clock is not
stable.
Fixes: 4cf9a888fd3c ("serial: max310x: Check the clock readiness")
Cc: stable(a)vger.kernel.org
Suggested-by: Jan Kundrát <jan.kundrat(a)cesnet.cz>
Link: https://www.spinics.net/lists/linux-serial/msg35773.html
Signed-off-by: Hugo Villeneuve <hvilleneuve(a)dimonoff.com>
Link: https://lore.kernel.org/r/20240116213001.3691629-4-hugo@hugovil.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/tty/serial/max310x.c b/drivers/tty/serial/max310x.c
index c0eb0615d945..552e153a24e0 100644
--- a/drivers/tty/serial/max310x.c
+++ b/drivers/tty/serial/max310x.c
@@ -587,7 +587,7 @@ static int max310x_update_best_err(unsigned long f, long *besterr)
return 1;
}
-static u32 max310x_set_ref_clk(struct device *dev, struct max310x_port *s,
+static s32 max310x_set_ref_clk(struct device *dev, struct max310x_port *s,
unsigned long freq, bool xtal)
{
unsigned int div, clksrc, pllcfg = 0;
@@ -657,7 +657,8 @@ static u32 max310x_set_ref_clk(struct device *dev, struct max310x_port *s,
} while (!stable && (++try < MAX310X_XTAL_WAIT_RETRIES));
if (!stable)
- dev_warn(dev, "clock is not stable yet\n");
+ return dev_err_probe(dev, -EAGAIN,
+ "clock is not stable\n");
}
return bestfreq;
@@ -1282,7 +1283,7 @@ static int max310x_probe(struct device *dev, const struct max310x_devtype *devty
{
int i, ret, fmin, fmax, freq;
struct max310x_port *s;
- u32 uartclk = 0;
+ s32 uartclk = 0;
bool xtal;
for (i = 0; i < devtype->nr; i++)
@@ -1360,6 +1361,11 @@ static int max310x_probe(struct device *dev, const struct max310x_devtype *devty
}
uartclk = max310x_set_ref_clk(dev, s, freq, xtal);
+ if (uartclk < 0) {
+ ret = uartclk;
+ goto out_uart;
+ }
+
dev_dbg(dev, "Reference clock set to %i Hz\n", uartclk);
for (i = 0; i < devtype->nr; i++) {
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x 30926783a46841c2d1bbf3f74067ba85d304fd0d
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021931-chop-dumping-2981@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
30926783a468 ("serial: core: Fix atomicity violation in uart_tiocmget")
559c7ff4e324 ("serial: core: Use port lock wrappers")
84a9582fd203 ("serial: core: Start managing serial controllers to enable runtime PM")
51e45fba14bf ("serial: core: lock port for start_rx() in uart_resume_port()")
abcb0cf1f5b2 ("serial: core: lock port for stop_rx() in uart_suspend_port()")
d5b3d02d0b10 ("serial: Make uart_remove_one_port() return void")
63f4c3456171 ("serial: core: Disable uart_start() on uart_remove_one_port()")
826736a6c7c8 ("serial: Rename uart_change_speed() to uart_change_line_settings()")
8e90cf29aef7 ("serial: Move uart_change_speed() earlier")
b300fb26c59a ("tty: Convert ->carrier_raised() and callchains to bool")
515be7baeddb ("tty: Cleanup tty_port_set_initialized() bool parameter")
7c7f9bc986e6 ("serial: Deassert Transmit Enable on probe in driver-specific way")
a12c68920918 ("Merge 7e2cd21e02b3 ("Merge tag 'tty-6.0-rc7' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/tty") into tty-next")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 30926783a46841c2d1bbf3f74067ba85d304fd0d Mon Sep 17 00:00:00 2001
From: Gui-Dong Han <2045gemini(a)gmail.com>
Date: Fri, 12 Jan 2024 19:36:24 +0800
Subject: [PATCH] serial: core: Fix atomicity violation in uart_tiocmget
In uart_tiocmget():
result = uport->mctrl;
uart_port_lock_irq(uport);
result |= uport->ops->get_mctrl(uport);
uart_port_unlock_irq(uport);
...
return result;
In uart_update_mctrl():
uart_port_lock_irqsave(port, &flags);
...
port->mctrl = (old & ~clear) | set;
...
port->ops->set_mctrl(port, port->mctrl);
...
uart_port_unlock_irqrestore(port, flags);
An atomicity violation is identified due to the concurrent execution of
uart_tiocmget() and uart_update_mctrl(). After assigning
result = uport->mctrl, the mctrl value may change in uart_update_mctrl(),
leading to a mismatch between the value returned by
uport->ops->get_mctrl(uport) and the mctrl value previously read.
This can result in uart_tiocmget() returning an incorrect value.
This possible bug was found by an experimental static analysis tool
developed by our team, BassCheck[1]. This tool analyzes the locking APIs
to extract function pairs that can be concurrently executed, and then
analyzes the instructions in the paired functions to identify possible
concurrency bugs including data races and atomicity violations. The above
possible bug was reported when our tool analyzed the source code of
Linux 5.17.
To address this issue, it is suggested to move the line
result = uport->mctrl inside the uart_port_lock block to ensure atomicity
and prevent the mctrl value from being altered during the execution of
uart_tiocmget(). With this patch applied, our tool no longer reports the
bug, with the kernel configuration allyesconfig for x86_64. Due to the
absence of the requisite hardware, we are unable to conduct runtime
testing of the patch. Therefore, our verification is solely based on code
logic analysis.
[1] https://sites.google.com/view/basscheck/
Fixes: c5f4644e6c8b ("[PATCH] Serial: Adjust serial locking")
Cc: stable(a)vger.kernel.org
Signed-off-by: Gui-Dong Han <2045gemini(a)gmail.com>
Link: https://lore.kernel.org/r/20240112113624.17048-1-2045gemini@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index b56ed8c376b2..d6a58a9e072a 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -1084,8 +1084,8 @@ static int uart_tiocmget(struct tty_struct *tty)
goto out;
if (!tty_io_error(tty)) {
- result = uport->mctrl;
uart_port_lock_irq(uport);
+ result = uport->mctrl;
result |= uport->ops->get_mctrl(uport);
uart_port_unlock_irq(uport);
}
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 30926783a46841c2d1bbf3f74067ba85d304fd0d
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021929-blooming-drainer-0214@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
30926783a468 ("serial: core: Fix atomicity violation in uart_tiocmget")
559c7ff4e324 ("serial: core: Use port lock wrappers")
84a9582fd203 ("serial: core: Start managing serial controllers to enable runtime PM")
51e45fba14bf ("serial: core: lock port for start_rx() in uart_resume_port()")
abcb0cf1f5b2 ("serial: core: lock port for stop_rx() in uart_suspend_port()")
d5b3d02d0b10 ("serial: Make uart_remove_one_port() return void")
63f4c3456171 ("serial: core: Disable uart_start() on uart_remove_one_port()")
826736a6c7c8 ("serial: Rename uart_change_speed() to uart_change_line_settings()")
8e90cf29aef7 ("serial: Move uart_change_speed() earlier")
b300fb26c59a ("tty: Convert ->carrier_raised() and callchains to bool")
515be7baeddb ("tty: Cleanup tty_port_set_initialized() bool parameter")
7c7f9bc986e6 ("serial: Deassert Transmit Enable on probe in driver-specific way")
a12c68920918 ("Merge 7e2cd21e02b3 ("Merge tag 'tty-6.0-rc7' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/tty") into tty-next")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 30926783a46841c2d1bbf3f74067ba85d304fd0d Mon Sep 17 00:00:00 2001
From: Gui-Dong Han <2045gemini(a)gmail.com>
Date: Fri, 12 Jan 2024 19:36:24 +0800
Subject: [PATCH] serial: core: Fix atomicity violation in uart_tiocmget
In uart_tiocmget():
result = uport->mctrl;
uart_port_lock_irq(uport);
result |= uport->ops->get_mctrl(uport);
uart_port_unlock_irq(uport);
...
return result;
In uart_update_mctrl():
uart_port_lock_irqsave(port, &flags);
...
port->mctrl = (old & ~clear) | set;
...
port->ops->set_mctrl(port, port->mctrl);
...
uart_port_unlock_irqrestore(port, flags);
An atomicity violation is identified due to the concurrent execution of
uart_tiocmget() and uart_update_mctrl(). After assigning
result = uport->mctrl, the mctrl value may change in uart_update_mctrl(),
leading to a mismatch between the value returned by
uport->ops->get_mctrl(uport) and the mctrl value previously read.
This can result in uart_tiocmget() returning an incorrect value.
This possible bug was found by an experimental static analysis tool
developed by our team, BassCheck[1]. This tool analyzes the locking APIs
to extract function pairs that can be concurrently executed, and then
analyzes the instructions in the paired functions to identify possible
concurrency bugs including data races and atomicity violations. The above
possible bug was reported when our tool analyzed the source code of
Linux 5.17.
To address this issue, it is suggested to move the line
result = uport->mctrl inside the uart_port_lock block to ensure atomicity
and prevent the mctrl value from being altered during the execution of
uart_tiocmget(). With this patch applied, our tool no longer reports the
bug, with the kernel configuration allyesconfig for x86_64. Due to the
absence of the requisite hardware, we are unable to conduct runtime
testing of the patch. Therefore, our verification is solely based on code
logic analysis.
[1] https://sites.google.com/view/basscheck/
Fixes: c5f4644e6c8b ("[PATCH] Serial: Adjust serial locking")
Cc: stable(a)vger.kernel.org
Signed-off-by: Gui-Dong Han <2045gemini(a)gmail.com>
Link: https://lore.kernel.org/r/20240112113624.17048-1-2045gemini@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index b56ed8c376b2..d6a58a9e072a 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -1084,8 +1084,8 @@ static int uart_tiocmget(struct tty_struct *tty)
goto out;
if (!tty_io_error(tty)) {
- result = uport->mctrl;
uart_port_lock_irq(uport);
+ result = uport->mctrl;
result |= uport->ops->get_mctrl(uport);
uart_port_unlock_irq(uport);
}
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 30926783a46841c2d1bbf3f74067ba85d304fd0d
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021928-karma-varsity-4f37@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
30926783a468 ("serial: core: Fix atomicity violation in uart_tiocmget")
559c7ff4e324 ("serial: core: Use port lock wrappers")
84a9582fd203 ("serial: core: Start managing serial controllers to enable runtime PM")
51e45fba14bf ("serial: core: lock port for start_rx() in uart_resume_port()")
abcb0cf1f5b2 ("serial: core: lock port for stop_rx() in uart_suspend_port()")
d5b3d02d0b10 ("serial: Make uart_remove_one_port() return void")
63f4c3456171 ("serial: core: Disable uart_start() on uart_remove_one_port()")
826736a6c7c8 ("serial: Rename uart_change_speed() to uart_change_line_settings()")
8e90cf29aef7 ("serial: Move uart_change_speed() earlier")
b300fb26c59a ("tty: Convert ->carrier_raised() and callchains to bool")
515be7baeddb ("tty: Cleanup tty_port_set_initialized() bool parameter")
7c7f9bc986e6 ("serial: Deassert Transmit Enable on probe in driver-specific way")
a12c68920918 ("Merge 7e2cd21e02b3 ("Merge tag 'tty-6.0-rc7' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/tty") into tty-next")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 30926783a46841c2d1bbf3f74067ba85d304fd0d Mon Sep 17 00:00:00 2001
From: Gui-Dong Han <2045gemini(a)gmail.com>
Date: Fri, 12 Jan 2024 19:36:24 +0800
Subject: [PATCH] serial: core: Fix atomicity violation in uart_tiocmget
In uart_tiocmget():
result = uport->mctrl;
uart_port_lock_irq(uport);
result |= uport->ops->get_mctrl(uport);
uart_port_unlock_irq(uport);
...
return result;
In uart_update_mctrl():
uart_port_lock_irqsave(port, &flags);
...
port->mctrl = (old & ~clear) | set;
...
port->ops->set_mctrl(port, port->mctrl);
...
uart_port_unlock_irqrestore(port, flags);
An atomicity violation is identified due to the concurrent execution of
uart_tiocmget() and uart_update_mctrl(). After assigning
result = uport->mctrl, the mctrl value may change in uart_update_mctrl(),
leading to a mismatch between the value returned by
uport->ops->get_mctrl(uport) and the mctrl value previously read.
This can result in uart_tiocmget() returning an incorrect value.
This possible bug is found by an experimental static analysis tool
developed by our team, BassCheck[1]. This tool analyzes the locking APIs
to extract function pairs that can be concurrently executed, and then
analyzes the instructions in the paired functions to identify possible
concurrency bugs including data races and atomicity violations. The above
possible bug is reported when our tool analyzes the source code of
Linux 5.17.
To address this issue, it is suggested to move the line
result = uport->mctrl inside the uart_port_lock block to ensure atomicity
and prevent the mctrl value from being altered during the execution of
uart_tiocmget(). With this patch applied, our tool no longer reports the
bug, with the kernel configuration allyesconfig for x86_64. Due to the
absence of the requisite hardware, we are unable to conduct runtime
testing of the patch. Therefore, our verification is solely based on code
logic analysis.
[1] https://sites.google.com/view/basscheck/
Fixes: c5f4644e6c8b ("[PATCH] Serial: Adjust serial locking")
Cc: stable(a)vger.kernel.org
Signed-off-by: Gui-Dong Han <2045gemini(a)gmail.com>
Link: https://lore.kernel.org/r/20240112113624.17048-1-2045gemini@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index b56ed8c376b2..d6a58a9e072a 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -1084,8 +1084,8 @@ static int uart_tiocmget(struct tty_struct *tty)
goto out;
if (!tty_io_error(tty)) {
- result = uport->mctrl;
uart_port_lock_irq(uport);
+ result = uport->mctrl;
result |= uport->ops->get_mctrl(uport);
uart_port_unlock_irq(uport);
}
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 30926783a46841c2d1bbf3f74067ba85d304fd0d
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021926-frayed-faculty-a92b@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
30926783a468 ("serial: core: Fix atomicity violation in uart_tiocmget")
559c7ff4e324 ("serial: core: Use port lock wrappers")
84a9582fd203 ("serial: core: Start managing serial controllers to enable runtime PM")
51e45fba14bf ("serial: core: lock port for start_rx() in uart_resume_port()")
abcb0cf1f5b2 ("serial: core: lock port for stop_rx() in uart_suspend_port()")
d5b3d02d0b10 ("serial: Make uart_remove_one_port() return void")
63f4c3456171 ("serial: core: Disable uart_start() on uart_remove_one_port()")
826736a6c7c8 ("serial: Rename uart_change_speed() to uart_change_line_settings()")
8e90cf29aef7 ("serial: Move uart_change_speed() earlier")
b300fb26c59a ("tty: Convert ->carrier_raised() and callchains to bool")
515be7baeddb ("tty: Cleanup tty_port_set_initialized() bool parameter")
7c7f9bc986e6 ("serial: Deassert Transmit Enable on probe in driver-specific way")
a12c68920918 ("Merge 7e2cd21e02b3 ("Merge tag 'tty-6.0-rc7' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/tty") into tty-next")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 30926783a46841c2d1bbf3f74067ba85d304fd0d Mon Sep 17 00:00:00 2001
From: Gui-Dong Han <2045gemini(a)gmail.com>
Date: Fri, 12 Jan 2024 19:36:24 +0800
Subject: [PATCH] serial: core: Fix atomicity violation in uart_tiocmget
In uart_tiocmget():
result = uport->mctrl;
uart_port_lock_irq(uport);
result |= uport->ops->get_mctrl(uport);
uart_port_unlock_irq(uport);
...
return result;
In uart_update_mctrl():
uart_port_lock_irqsave(port, &flags);
...
port->mctrl = (old & ~clear) | set;
...
port->ops->set_mctrl(port, port->mctrl);
...
uart_port_unlock_irqrestore(port, flags);
An atomicity violation is identified due to the concurrent execution of
uart_tiocmget() and uart_update_mctrl(). After assigning
result = uport->mctrl, the mctrl value may change in uart_update_mctrl(),
leading to a mismatch between the value returned by
uport->ops->get_mctrl(uport) and the mctrl value previously read.
This can result in uart_tiocmget() returning an incorrect value.
This possible bug is found by an experimental static analysis tool
developed by our team, BassCheck[1]. This tool analyzes the locking APIs
to extract function pairs that can be concurrently executed, and then
analyzes the instructions in the paired functions to identify possible
concurrency bugs including data races and atomicity violations. The above
possible bug is reported when our tool analyzes the source code of
Linux 5.17.
To address this issue, it is suggested to move the line
result = uport->mctrl inside the uart_port_lock block to ensure atomicity
and prevent the mctrl value from being altered during the execution of
uart_tiocmget(). With this patch applied, our tool no longer reports the
bug, with the kernel configuration allyesconfig for x86_64. Due to the
absence of the requisite hardware, we are unable to conduct runtime
testing of the patch. Therefore, our verification is solely based on code
logic analysis.
[1] https://sites.google.com/view/basscheck/
Fixes: c5f4644e6c8b ("[PATCH] Serial: Adjust serial locking")
Cc: stable(a)vger.kernel.org
Signed-off-by: Gui-Dong Han <2045gemini(a)gmail.com>
Link: https://lore.kernel.org/r/20240112113624.17048-1-2045gemini@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index b56ed8c376b2..d6a58a9e072a 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -1084,8 +1084,8 @@ static int uart_tiocmget(struct tty_struct *tty)
goto out;
if (!tty_io_error(tty)) {
- result = uport->mctrl;
uart_port_lock_irq(uport);
+ result = uport->mctrl;
result |= uport->ops->get_mctrl(uport);
uart_port_unlock_irq(uport);
}
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 30926783a46841c2d1bbf3f74067ba85d304fd0d
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021924-immerse-spur-af58@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
30926783a468 ("serial: core: Fix atomicity violation in uart_tiocmget")
559c7ff4e324 ("serial: core: Use port lock wrappers")
84a9582fd203 ("serial: core: Start managing serial controllers to enable runtime PM")
51e45fba14bf ("serial: core: lock port for start_rx() in uart_resume_port()")
abcb0cf1f5b2 ("serial: core: lock port for stop_rx() in uart_suspend_port()")
d5b3d02d0b10 ("serial: Make uart_remove_one_port() return void")
63f4c3456171 ("serial: core: Disable uart_start() on uart_remove_one_port()")
826736a6c7c8 ("serial: Rename uart_change_speed() to uart_change_line_settings()")
8e90cf29aef7 ("serial: Move uart_change_speed() earlier")
b300fb26c59a ("tty: Convert ->carrier_raised() and callchains to bool")
515be7baeddb ("tty: Cleanup tty_port_set_initialized() bool parameter")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 30926783a46841c2d1bbf3f74067ba85d304fd0d Mon Sep 17 00:00:00 2001
From: Gui-Dong Han <2045gemini(a)gmail.com>
Date: Fri, 12 Jan 2024 19:36:24 +0800
Subject: [PATCH] serial: core: Fix atomicity violation in uart_tiocmget
In uart_tiocmget():
result = uport->mctrl;
uart_port_lock_irq(uport);
result |= uport->ops->get_mctrl(uport);
uart_port_unlock_irq(uport);
...
return result;
In uart_update_mctrl():
uart_port_lock_irqsave(port, &flags);
...
port->mctrl = (old & ~clear) | set;
...
port->ops->set_mctrl(port, port->mctrl);
...
uart_port_unlock_irqrestore(port, flags);
An atomicity violation is identified due to the concurrent execution of
uart_tiocmget() and uart_update_mctrl(). After assigning
result = uport->mctrl, the mctrl value may change in uart_update_mctrl(),
leading to a mismatch between the value returned by
uport->ops->get_mctrl(uport) and the mctrl value previously read.
This can result in uart_tiocmget() returning an incorrect value.
This possible bug is found by an experimental static analysis tool
developed by our team, BassCheck[1]. This tool analyzes the locking APIs
to extract function pairs that can be concurrently executed, and then
analyzes the instructions in the paired functions to identify possible
concurrency bugs including data races and atomicity violations. The above
possible bug is reported when our tool analyzes the source code of
Linux 5.17.
To address this issue, it is suggested to move the line
result = uport->mctrl inside the uart_port_lock block to ensure atomicity
and prevent the mctrl value from being altered during the execution of
uart_tiocmget(). With this patch applied, our tool no longer reports the
bug, with the kernel configuration allyesconfig for x86_64. Due to the
absence of the requisite hardware, we are unable to conduct runtime
testing of the patch. Therefore, our verification is solely based on code
logic analysis.
[1] https://sites.google.com/view/basscheck/
Fixes: c5f4644e6c8b ("[PATCH] Serial: Adjust serial locking")
Cc: stable(a)vger.kernel.org
Signed-off-by: Gui-Dong Han <2045gemini(a)gmail.com>
Link: https://lore.kernel.org/r/20240112113624.17048-1-2045gemini@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index b56ed8c376b2..d6a58a9e072a 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -1084,8 +1084,8 @@ static int uart_tiocmget(struct tty_struct *tty)
goto out;
if (!tty_io_error(tty)) {
- result = uport->mctrl;
uart_port_lock_irq(uport);
+ result = uport->mctrl;
result |= uport->ops->get_mctrl(uport);
uart_port_unlock_irq(uport);
}
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 30926783a46841c2d1bbf3f74067ba85d304fd0d
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021922-tribute-surround-40c2@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
30926783a468 ("serial: core: Fix atomicity violation in uart_tiocmget")
559c7ff4e324 ("serial: core: Use port lock wrappers")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 30926783a46841c2d1bbf3f74067ba85d304fd0d Mon Sep 17 00:00:00 2001
From: Gui-Dong Han <2045gemini(a)gmail.com>
Date: Fri, 12 Jan 2024 19:36:24 +0800
Subject: [PATCH] serial: core: Fix atomicity violation in uart_tiocmget
In uart_tiocmget():
result = uport->mctrl;
uart_port_lock_irq(uport);
result |= uport->ops->get_mctrl(uport);
uart_port_unlock_irq(uport);
...
return result;
In uart_update_mctrl():
uart_port_lock_irqsave(port, &flags);
...
port->mctrl = (old & ~clear) | set;
...
port->ops->set_mctrl(port, port->mctrl);
...
uart_port_unlock_irqrestore(port, flags);
An atomicity violation is identified due to the concurrent execution of
uart_tiocmget() and uart_update_mctrl(). After assigning
result = uport->mctrl, the mctrl value may change in uart_update_mctrl(),
leading to a mismatch between the value returned by
uport->ops->get_mctrl(uport) and the mctrl value previously read.
This can result in uart_tiocmget() returning an incorrect value.
This possible bug is found by an experimental static analysis tool
developed by our team, BassCheck[1]. This tool analyzes the locking APIs
to extract function pairs that can be concurrently executed, and then
analyzes the instructions in the paired functions to identify possible
concurrency bugs including data races and atomicity violations. The above
possible bug is reported when our tool analyzes the source code of
Linux 5.17.
To address this issue, it is suggested to move the line
result = uport->mctrl inside the uart_port_lock block to ensure atomicity
and prevent the mctrl value from being altered during the execution of
uart_tiocmget(). With this patch applied, our tool no longer reports the
bug, with the kernel configuration allyesconfig for x86_64. Due to the
absence of the requisite hardware, we are unable to conduct runtime
testing of the patch. Therefore, our verification is solely based on code
logic analysis.
[1] https://sites.google.com/view/basscheck/
Fixes: c5f4644e6c8b ("[PATCH] Serial: Adjust serial locking")
Cc: stable(a)vger.kernel.org
Signed-off-by: Gui-Dong Han <2045gemini(a)gmail.com>
Link: https://lore.kernel.org/r/20240112113624.17048-1-2045gemini@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index b56ed8c376b2..d6a58a9e072a 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -1084,8 +1084,8 @@ static int uart_tiocmget(struct tty_struct *tty)
goto out;
if (!tty_io_error(tty)) {
- result = uport->mctrl;
uart_port_lock_irq(uport);
+ result = uport->mctrl;
result |= uport->ops->get_mctrl(uport);
uart_port_unlock_irq(uport);
}
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x cefa98e806fd4e2a5e2047457a11ae5f17b8f621
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021957-battering-gallows-35b2@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
cefa98e806fd ("nfp: flower: add hardware offload check for post ct entry")
3e44d19934b9 ("nfp: flower: add goto_chain_index for ct entry")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From cefa98e806fd4e2a5e2047457a11ae5f17b8f621 Mon Sep 17 00:00:00 2001
From: Hui Zhou <hui.zhou(a)corigine.com>
Date: Wed, 24 Jan 2024 17:19:08 +0200
Subject: [PATCH] nfp: flower: add hardware offload check for post ct entry
The nfp offload flow pay will not allocate a mask id when the out port
is an openvswitch internal port. This is because these flows are used to
configure the pre_tun table and are never actually sent to the firmware
as an add-flow message. When a tc rule's action contains ct and
the post ct entry's out port is an openvswitch internal port, the merged
offload flow pay with the wrong mask id of 0 will be sent to the
firmware. The nfp cannot support hardware offload in this
situation, so return EOPNOTSUPP.
Fixes: bd0fe7f96a3c ("nfp: flower-ct: add zone table entry when handling pre/post_ct flows")
CC: stable(a)vger.kernel.org # 5.14+
Signed-off-by: Hui Zhou <hui.zhou(a)corigine.com>
Signed-off-by: Louis Peens <louis.peens(a)corigine.com>
Link: https://lore.kernel.org/r/20240124151909.31603-2-louis.peens@corigine.com
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
diff --git a/drivers/net/ethernet/netronome/nfp/flower/conntrack.c b/drivers/net/ethernet/netronome/nfp/flower/conntrack.c
index 2967bab72505..726d8cdf0b9c 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/conntrack.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/conntrack.c
@@ -1864,10 +1864,30 @@ int nfp_fl_ct_handle_post_ct(struct nfp_flower_priv *priv,
{
struct flow_rule *rule = flow_cls_offload_flow_rule(flow);
struct nfp_fl_ct_flow_entry *ct_entry;
+ struct flow_action_entry *ct_goto;
struct nfp_fl_ct_zone_entry *zt;
+ struct flow_action_entry *act;
bool wildcarded = false;
struct flow_match_ct ct;
- struct flow_action_entry *ct_goto;
+ int i;
+
+ flow_action_for_each(i, act, &rule->action) {
+ switch (act->id) {
+ case FLOW_ACTION_REDIRECT:
+ case FLOW_ACTION_REDIRECT_INGRESS:
+ case FLOW_ACTION_MIRRED:
+ case FLOW_ACTION_MIRRED_INGRESS:
+ if (act->dev->rtnl_link_ops &&
+ !strcmp(act->dev->rtnl_link_ops->kind, "openvswitch")) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "unsupported offload: out port is openvswitch internal port");
+ return -EOPNOTSUPP;
+ }
+ break;
+ default:
+ break;
+ }
+ }
flow_rule_match_ct(rule, &ct);
if (!ct.mask->ct_zone) {
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x cefa98e806fd4e2a5e2047457a11ae5f17b8f621
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021956-product-barstool-9b9c@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
cefa98e806fd ("nfp: flower: add hardware offload check for post ct entry")
3e44d19934b9 ("nfp: flower: add goto_chain_index for ct entry")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From cefa98e806fd4e2a5e2047457a11ae5f17b8f621 Mon Sep 17 00:00:00 2001
From: Hui Zhou <hui.zhou(a)corigine.com>
Date: Wed, 24 Jan 2024 17:19:08 +0200
Subject: [PATCH] nfp: flower: add hardware offload check for post ct entry
The nfp offload flow pay will not allocate a mask id when the out port
is an openvswitch internal port. This is because these flows are used to
configure the pre_tun table and are never actually sent to the firmware
as an add-flow message. When a tc rule's action contains ct and
the post ct entry's out port is an openvswitch internal port, the merged
offload flow pay with the wrong mask id of 0 will be sent to the
firmware. The nfp cannot support hardware offload in this
situation, so return EOPNOTSUPP.
Fixes: bd0fe7f96a3c ("nfp: flower-ct: add zone table entry when handling pre/post_ct flows")
CC: stable(a)vger.kernel.org # 5.14+
Signed-off-by: Hui Zhou <hui.zhou(a)corigine.com>
Signed-off-by: Louis Peens <louis.peens(a)corigine.com>
Link: https://lore.kernel.org/r/20240124151909.31603-2-louis.peens@corigine.com
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
diff --git a/drivers/net/ethernet/netronome/nfp/flower/conntrack.c b/drivers/net/ethernet/netronome/nfp/flower/conntrack.c
index 2967bab72505..726d8cdf0b9c 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/conntrack.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/conntrack.c
@@ -1864,10 +1864,30 @@ int nfp_fl_ct_handle_post_ct(struct nfp_flower_priv *priv,
{
struct flow_rule *rule = flow_cls_offload_flow_rule(flow);
struct nfp_fl_ct_flow_entry *ct_entry;
+ struct flow_action_entry *ct_goto;
struct nfp_fl_ct_zone_entry *zt;
+ struct flow_action_entry *act;
bool wildcarded = false;
struct flow_match_ct ct;
- struct flow_action_entry *ct_goto;
+ int i;
+
+ flow_action_for_each(i, act, &rule->action) {
+ switch (act->id) {
+ case FLOW_ACTION_REDIRECT:
+ case FLOW_ACTION_REDIRECT_INGRESS:
+ case FLOW_ACTION_MIRRED:
+ case FLOW_ACTION_MIRRED_INGRESS:
+ if (act->dev->rtnl_link_ops &&
+ !strcmp(act->dev->rtnl_link_ops->kind, "openvswitch")) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "unsupported offload: out port is openvswitch internal port");
+ return -EOPNOTSUPP;
+ }
+ break;
+ default:
+ break;
+ }
+ }
flow_rule_match_ct(rule, &ct);
if (!ct.mask->ct_zone) {
The patch below does not apply to the 6.7-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.7.y
git checkout FETCH_HEAD
git cherry-pick -x d16df040c8dad25c962b4404d2d534bfea327c6a
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021917-retract-untaken-e83c@gregkh' --subject-prefix 'PATCH 6.7.y' HEAD^..
Possible dependencies:
d16df040c8da ("drm/amdgpu: make damage clips support configurable")
b8b39de64627 ("drm/amd/pm: setup the framework to support Wifi RFI mitigation feature")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From d16df040c8dad25c962b4404d2d534bfea327c6a Mon Sep 17 00:00:00 2001
From: Hamza Mahfooz <hamza.mahfooz(a)amd.com>
Date: Thu, 8 Feb 2024 16:23:29 -0500
Subject: [PATCH] drm/amdgpu: make damage clips support configurable
We have observed that there are quite a number of PSR-SU panels on the
market that are unable to keep up with what user space throws at them,
resulting in hangs and random black screens. So, make damage clips
support configurable and disable it by default for PSR-SU displays.
Cc: stable(a)vger.kernel.org
Reviewed-by: Mario Limonciello <mario.limonciello(a)amd.com>
Signed-off-by: Hamza Mahfooz <hamza.mahfooz(a)amd.com>
Signed-off-by: Alex Deucher <alexander.deucher(a)amd.com>
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 6dce81a061ab..517117a0796f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -200,6 +200,7 @@ extern uint amdgpu_dc_debug_mask;
extern uint amdgpu_dc_visual_confirm;
extern uint amdgpu_dm_abm_level;
extern int amdgpu_backlight;
+extern int amdgpu_damage_clips;
extern struct amdgpu_mgpu_info mgpu_info;
extern int amdgpu_ras_enable;
extern uint amdgpu_ras_mask;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 211501ea9169..586f4d03039d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -211,6 +211,7 @@ int amdgpu_seamless = -1; /* auto */
uint amdgpu_debug_mask;
int amdgpu_agp = -1; /* auto */
int amdgpu_wbrf = -1;
+int amdgpu_damage_clips = -1; /* auto */
static void amdgpu_drv_delayed_reset_work_handler(struct work_struct *work);
@@ -859,6 +860,18 @@ int amdgpu_backlight = -1;
MODULE_PARM_DESC(backlight, "Backlight control (0 = pwm, 1 = aux, -1 auto (default))");
module_param_named(backlight, amdgpu_backlight, bint, 0444);
+/**
+ * DOC: damageclips (int)
+ * Enable or disable damage clips support. If damage clips support is disabled,
+ * we will force full frame updates, irrespective of what user space sends to
+ * us.
+ *
+ * Defaults to -1 (where it is enabled unless a PSR-SU display is detected).
+ */
+MODULE_PARM_DESC(damageclips,
+ "Damage clips support (0 = disable, 1 = enable, -1 auto (default))");
+module_param_named(damageclips, amdgpu_damage_clips, int, 0444);
+
/**
* DOC: tmz (int)
* Trusted Memory Zone (TMZ) is a method to protect data being written
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index 59d2eee72a32..d5ef07af9906 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -5219,6 +5219,7 @@ static void fill_dc_dirty_rects(struct drm_plane *plane,
struct drm_plane_state *new_plane_state,
struct drm_crtc_state *crtc_state,
struct dc_flip_addrs *flip_addrs,
+ bool is_psr_su,
bool *dirty_regions_changed)
{
struct dm_crtc_state *dm_crtc_state = to_dm_crtc_state(crtc_state);
@@ -5243,6 +5244,10 @@ static void fill_dc_dirty_rects(struct drm_plane *plane,
num_clips = drm_plane_get_damage_clips_count(new_plane_state);
clips = drm_plane_get_damage_clips(new_plane_state);
+ if (num_clips && (!amdgpu_damage_clips || (amdgpu_damage_clips < 0 &&
+ is_psr_su)))
+ goto ffu;
+
if (!dm_crtc_state->mpo_requested) {
if (!num_clips || num_clips > DC_MAX_DIRTY_RECTS)
goto ffu;
@@ -8298,6 +8303,8 @@ static void amdgpu_dm_commit_planes(struct drm_atomic_state *state,
fill_dc_dirty_rects(plane, old_plane_state,
new_plane_state, new_crtc_state,
&bundle->flip_addrs[planes_count],
+ acrtc_state->stream->link->psr_settings.psr_version ==
+ DC_PSR_VERSION_SU_1,
&dirty_rects_changed);
/*
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x d16df040c8dad25c962b4404d2d534bfea327c6a
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021917-swarm-federal-1cdf@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
d16df040c8da ("drm/amdgpu: make damage clips support configurable")
b8b39de64627 ("drm/amd/pm: setup the framework to support Wifi RFI mitigation feature")
2e9b152325f6 ("drm/amdgpu: optimize RLC powerdown notification on Vangogh")
6ba5b613837c ("drm/amdgpu: add a module parameter to control the AGP aperture")
564ca1b53ece ("drm/amdgpu/gmc11: fix logic typo in AGP check")
56d3de7da67a ("drm/amdgpu: add power up/down UMSCH ppt callback")
fe6cd9152464 ("drm/amd/swsmu: add smu14 ip support")
67318cb84341 ("drm/amdgpu/gmc11: set gart placement GC11")
917f91d8d8e8 ("drm/amdgpu/gmc: add a way to force a particular placement for GART")
d07f1c20dd7c ("drm/amd/pm: add xgmi plpd mode selecting interface for smu v13.0.6")
10d9ee96ce05 ("drm/amd/pm: add plpd_mode in smu_context to indicate current mode")
b2e1cbe6281f ("drm/amdgpu/gmc11: disable AGP on GC 11.5")
de59b69932e6 ("drm/amdgpu/gmc: set a default disable value for AGP")
29495d81457a ("drm/amdgpu/gmc6-8: properly disable the AGP aperture")
25396684b57f ("drm/amd/pm: add smu_13_0_6 mca dump support")
bcd8dc49c0b9 ("drm/amd/pm: update smu_v13_0_6 ppsmc header")
cad2fb19bbfa ("drm/amd/pm: Fix clock reporting for SMUv13.0.6")
4e8303cf2c4d ("drm/amdgpu: Use function for IP version check")
887db1e49a73 ("drm/amdgpu: Merge debug module parameters")
df38fe12a22c ("drm/amd/pm: enable smu_v13_0_6 mca debug mode when UMC RAS feature is enabled")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From d16df040c8dad25c962b4404d2d534bfea327c6a Mon Sep 17 00:00:00 2001
From: Hamza Mahfooz <hamza.mahfooz(a)amd.com>
Date: Thu, 8 Feb 2024 16:23:29 -0500
Subject: [PATCH] drm/amdgpu: make damage clips support configurable
We have observed that there are quite a number of PSR-SU panels on the
market that are unable to keep up with what user space throws at them,
resulting in hangs and random black screens. So, make damage clips
support configurable and disable it by default for PSR-SU displays.
Cc: stable(a)vger.kernel.org
Reviewed-by: Mario Limonciello <mario.limonciello(a)amd.com>
Signed-off-by: Hamza Mahfooz <hamza.mahfooz(a)amd.com>
Signed-off-by: Alex Deucher <alexander.deucher(a)amd.com>
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 6dce81a061ab..517117a0796f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -200,6 +200,7 @@ extern uint amdgpu_dc_debug_mask;
extern uint amdgpu_dc_visual_confirm;
extern uint amdgpu_dm_abm_level;
extern int amdgpu_backlight;
+extern int amdgpu_damage_clips;
extern struct amdgpu_mgpu_info mgpu_info;
extern int amdgpu_ras_enable;
extern uint amdgpu_ras_mask;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 211501ea9169..586f4d03039d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -211,6 +211,7 @@ int amdgpu_seamless = -1; /* auto */
uint amdgpu_debug_mask;
int amdgpu_agp = -1; /* auto */
int amdgpu_wbrf = -1;
+int amdgpu_damage_clips = -1; /* auto */
static void amdgpu_drv_delayed_reset_work_handler(struct work_struct *work);
@@ -859,6 +860,18 @@ int amdgpu_backlight = -1;
MODULE_PARM_DESC(backlight, "Backlight control (0 = pwm, 1 = aux, -1 auto (default))");
module_param_named(backlight, amdgpu_backlight, bint, 0444);
+/**
+ * DOC: damageclips (int)
+ * Enable or disable damage clips support. If damage clips support is disabled,
+ * we will force full frame updates, irrespective of what user space sends to
+ * us.
+ *
+ * Defaults to -1 (where it is enabled unless a PSR-SU display is detected).
+ */
+MODULE_PARM_DESC(damageclips,
+ "Damage clips support (0 = disable, 1 = enable, -1 auto (default))");
+module_param_named(damageclips, amdgpu_damage_clips, int, 0444);
+
/**
* DOC: tmz (int)
* Trusted Memory Zone (TMZ) is a method to protect data being written
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index 59d2eee72a32..d5ef07af9906 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -5219,6 +5219,7 @@ static void fill_dc_dirty_rects(struct drm_plane *plane,
struct drm_plane_state *new_plane_state,
struct drm_crtc_state *crtc_state,
struct dc_flip_addrs *flip_addrs,
+ bool is_psr_su,
bool *dirty_regions_changed)
{
struct dm_crtc_state *dm_crtc_state = to_dm_crtc_state(crtc_state);
@@ -5243,6 +5244,10 @@ static void fill_dc_dirty_rects(struct drm_plane *plane,
num_clips = drm_plane_get_damage_clips_count(new_plane_state);
clips = drm_plane_get_damage_clips(new_plane_state);
+ if (num_clips && (!amdgpu_damage_clips || (amdgpu_damage_clips < 0 &&
+ is_psr_su)))
+ goto ffu;
+
if (!dm_crtc_state->mpo_requested) {
if (!num_clips || num_clips > DC_MAX_DIRTY_RECTS)
goto ffu;
@@ -8298,6 +8303,8 @@ static void amdgpu_dm_commit_planes(struct drm_atomic_state *state,
fill_dc_dirty_rects(plane, old_plane_state,
new_plane_state, new_crtc_state,
&bundle->flip_addrs[planes_count],
+ acrtc_state->stream->link->psr_settings.psr_version ==
+ DC_PSR_VERSION_SU_1,
&dirty_rects_changed);
/*