The patch below does not apply to the 5.10, 5.4, 4.19, 4.14, 4.9, or 4.4
stable trees.
If someone wants it applied to any of them, or to any other stable or
longterm tree, then please email the backport, including the original git
commit id, to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From fe6f45f6ba22d625a8500cbad0237c60dd3117ee Mon Sep 17 00:00:00 2001
From: Yang Yingliang <yangyingliang(a)huawei.com>
Date: Tue, 12 Oct 2021 14:36:24 +0800
Subject: [PATCH] iio: core: check return value when calling dev_set_name()
I got a null-ptr-deref report when doing fault injection testing:
BUG: kernel NULL pointer dereference, address: 0000000000000000
RIP: 0010:strlen+0x0/0x20
Call Trace:
start_creating+0x199/0x2f0
debugfs_create_dir+0x25/0x430
__iio_device_register+0x4da/0x1b40 [industrialio]
__devm_iio_device_register+0x22/0x80 [industrialio]
max1027_probe+0x639/0x860 [max1027]
spi_probe+0x183/0x210
really_probe+0x285/0xc30
If dev_set_name() fails, dev_name() is NULL. Check the return value
of dev_set_name() to avoid the null-ptr-deref.
Reported-by: Hulk Robot <hulkci(a)huawei.com>
Fixes: e553f182d55b ("staging: iio: core: Introduce debugfs support...")
Signed-off-by: Yang Yingliang <yangyingliang(a)huawei.com>
Cc: <Stable(a)vger.kernel.org>
Link: https://lore.kernel.org/r/20211012063624.3167460-1-yangyingliang@huawei.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c
index 2dbb37e09b8c..48fda6a79076 100644
--- a/drivers/iio/industrialio-core.c
+++ b/drivers/iio/industrialio-core.c
@@ -1664,7 +1664,13 @@ struct iio_dev *iio_device_alloc(struct device *parent, int sizeof_priv)
kfree(iio_dev_opaque);
return NULL;
}
- dev_set_name(&indio_dev->dev, "iio:device%d", iio_dev_opaque->id);
+
+ if (dev_set_name(&indio_dev->dev, "iio:device%d", iio_dev_opaque->id)) {
+ ida_simple_remove(&iio_ida, iio_dev_opaque->id);
+ kfree(iio_dev_opaque);
+ return NULL;
+ }
+
INIT_LIST_HEAD(&iio_dev_opaque->buffer_list);
INIT_LIST_HEAD(&iio_dev_opaque->ioctl_handlers);
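For reference, a minimal sketch of the failure path this closes (a
hypothetical helper, not code from the patch): dev_set_name() builds the
kobject name with kvasprintf(), so under fault injection it can fail with
-ENOMEM and leave the name NULL; dev_name() then returns NULL, and the
debugfs_create_dir(dev_name(...)) call later in __iio_device_register()
hands that NULL straight to strlen().

#include <linux/device.h>
#include <linux/debugfs.h>

/* Hypothetical helper illustrating the pattern the fix establishes. */
static int example_name_and_register(struct device *dev, int id)
{
        int ret;

        /* dev_set_name() can fail (e.g. -ENOMEM); never ignore it */
        ret = dev_set_name(dev, "iio:device%d", id);
        if (ret)
                return ret;

        /* only now is dev_name(dev) guaranteed to be non-NULL */
        debugfs_create_dir(dev_name(dev), NULL);
        return 0;
}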
The patch below does not apply to the 5.4 or 4.19 stable trees.
If someone wants it applied to either of them, or to any other stable or
longterm tree, then please email the backport, including the original git
commit id, to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From e775eb9fc2a4107f03222fa48bc95c2c82427e64 Mon Sep 17 00:00:00 2001
From: Meng Li <Meng.Li(a)windriver.com>
Date: Tue, 19 Oct 2021 10:32:41 +0800
Subject: [PATCH] soc: fsl: dpio: replace smp_processor_id with
raw_smp_processor_id
When debug kernel configs are enabled, there will be a call trace as below:
BUG: using smp_processor_id() in preemptible [00000000] code: swapper/0/1
caller is debug_smp_processor_id+0x20/0x30
CPU: 6 PID: 1 Comm: swapper/0 Not tainted 5.10.63-yocto-standard #1
Hardware name: NXP Layerscape LX2160ARDB (DT)
Call trace:
dump_backtrace+0x0/0x1a0
show_stack+0x24/0x30
dump_stack+0xf0/0x13c
check_preemption_disabled+0x100/0x110
debug_smp_processor_id+0x20/0x30
dpaa2_io_query_fq_count+0xdc/0x154
dpaa2_eth_stop+0x144/0x314
__dev_close_many+0xdc/0x160
__dev_change_flags+0xe8/0x220
dev_change_flags+0x30/0x70
ic_close_devs+0x50/0x78
ip_auto_config+0xed0/0xf10
do_one_initcall+0xac/0x460
kernel_init_freeable+0x30c/0x378
kernel_init+0x20/0x128
ret_from_fork+0x10/0x38
Based on the comment in the context, it doesn't matter whether
preemption is disabled or not. So replace smp_processor_id()
with raw_smp_processor_id() to avoid the above call trace.
Fixes: c89105c9b390 ("staging: fsl-mc: Move DPIO from staging to drivers/soc/fsl")
Cc: stable(a)vger.kernel.org
Signed-off-by: Meng Li <Meng.Li(a)windriver.com>
Signed-off-by: Li Yang <leoyang.li(a)nxp.com>
diff --git a/drivers/soc/fsl/dpio/dpio-service.c b/drivers/soc/fsl/dpio/dpio-service.c
index 7351f3030550..779c319a4b82 100644
--- a/drivers/soc/fsl/dpio/dpio-service.c
+++ b/drivers/soc/fsl/dpio/dpio-service.c
@@ -59,7 +59,7 @@ static inline struct dpaa2_io *service_select_by_cpu(struct dpaa2_io *d,
* potentially being migrated away.
*/
if (cpu < 0)
- cpu = smp_processor_id();
+ cpu = raw_smp_processor_id();
/* If a specific cpu was requested, pick it up immediately */
return dpio_by_cpu[cpu];
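For reference, a sketch of the distinction (hypothetical helper mirroring
the hunk above): with CONFIG_DEBUG_PREEMPT, smp_processor_id() called from
preemptible context prints exactly the "BUG: using smp_processor_id() in
preemptible" splat quoted above, while raw_smp_processor_id() skips the
check. The raw variant is correct here because any valid CPU index works
as a hint, and being migrated right after the read is harmless.

#include <linux/smp.h>

struct dpaa2_io;

/* Hypothetical sketch: the caller only needs *some* valid CPU index,
 * so migration immediately after the read is acceptable. */
static struct dpaa2_io *pick_any_cpu_service(struct dpaa2_io **dpio_by_cpu)
{
        int cpu = raw_smp_processor_id();       /* no preemption check */

        return dpio_by_cpu[cpu];
}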
The patch below does not apply to the 5.15, 5.14, or 5.10 stable trees.
If someone wants it applied to any of them, or to any other stable or
longterm tree, then please email the backport, including the original git
commit id, to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 1869023e24c0de73a160a424dac4621cefd628ae Mon Sep 17 00:00:00 2001
From: Andrew Gabbasov <andrew_gabbasov(a)mentor.com>
Date: Wed, 22 Sep 2021 13:48:30 -0500
Subject: [PATCH] memory: renesas-rpc-if: Avoid unaligned bus access for
HyperFlash
HyperFlash devices in Renesas SoCs use 2-byte addressing, according
to HW manual paragraph 62.3.3 (which officially describes Serial Flash
access, but seems to be applicable to HyperFlash too), and 1-byte bus
read operations to 2-byte-unaligned addresses in external address space
read mode work incorrectly (they return the other byte from the same word).
The function memcpy_fromio(), used by the driver to read data from the
bus, on the ARM64 architecture (to which Renesas cores belong) uses
8-byte bus accesses for appropriately aligned addresses and 1-byte
accesses for other addresses. This results in incorrect data being read
from HyperFlash in the unaligned cases.
This issue can be reproduced using something like the following commands
(where mtd1 is a partition on HyperFlash storage, defined properly
in a device tree):
[Correct fragment, read from HyperFlash]
root@rcar-gen3:~# dd if=/dev/mtd1 of=/tmp/zz bs=32 count=1
root@rcar-gen3:~# hexdump -C /tmp/zz
00000000 f4 03 00 aa f5 03 01 aa f6 03 02 aa f7 03 03 aa |................|
00000010 00 00 80 d2 40 20 18 d5 00 06 81 d2 a0 18 a6 f2 |....@ ..........|
00000020
[Incorrect read of the same fragment: see the difference at offsets 8-11]
root@rcar-gen3:~# dd if=/dev/mtd1 of=/tmp/zz bs=12 count=1
root@rcar-gen3:~# hexdump -C /tmp/zz
00000000 f4 03 00 aa f5 03 01 aa 03 03 aa aa |............|
0000000c
Fix this issue by creating a local replacement for the copying function
that performs only properly aligned bus accesses and is used for reading
from HyperFlash.
Fixes: ca7d8b980b67f ("memory: add Renesas RPC-IF driver")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Gabbasov <andrew_gabbasov(a)mentor.com>
Link: https://lore.kernel.org/r/20210922184830.29147-1-andrew_gabbasov@mentor.com
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski(a)canonical.com>
diff --git a/drivers/memory/renesas-rpc-if.c b/drivers/memory/renesas-rpc-if.c
index 77a011d5ff8c..7435baad0007 100644
--- a/drivers/memory/renesas-rpc-if.c
+++ b/drivers/memory/renesas-rpc-if.c
@@ -185,7 +185,6 @@ static int rpcif_reg_read(void *context, unsigned int reg, unsigned int *val)
*val = readl(rpc->base + reg);
return 0;
-
}
static int rpcif_reg_write(void *context, unsigned int reg, unsigned int val)
@@ -545,6 +544,48 @@ err_out:
}
EXPORT_SYMBOL(rpcif_manual_xfer);
+static void memcpy_fromio_readw(void *to,
+ const void __iomem *from,
+ size_t count)
+{
+ const int maxw = (IS_ENABLED(CONFIG_64BIT)) ? 8 : 4;
+ u8 buf[2];
+
+ if (count && ((unsigned long)from & 1)) {
+ *(u16 *)buf = __raw_readw((void __iomem *)((unsigned long)from & ~1));
+ *(u8 *)to = buf[1];
+ from++;
+ to++;
+ count--;
+ }
+ while (count >= 2 && !IS_ALIGNED((unsigned long)from, maxw)) {
+ *(u16 *)to = __raw_readw(from);
+ from += 2;
+ to += 2;
+ count -= 2;
+ }
+ while (count >= maxw) {
+#ifdef CONFIG_64BIT
+ *(u64 *)to = __raw_readq(from);
+#else
+ *(u32 *)to = __raw_readl(from);
+#endif
+ from += maxw;
+ to += maxw;
+ count -= maxw;
+ }
+ while (count >= 2) {
+ *(u16 *)to = __raw_readw(from);
+ from += 2;
+ to += 2;
+ count -= 2;
+ }
+ if (count) {
+ *(u16 *)buf = __raw_readw(from);
+ *(u8 *)to = buf[0];
+ }
+}
+
ssize_t rpcif_dirmap_read(struct rpcif *rpc, u64 offs, size_t len, void *buf)
{
loff_t from = offs & (RPCIF_DIRMAP_SIZE - 1);
@@ -566,7 +607,10 @@ ssize_t rpcif_dirmap_read(struct rpcif *rpc, u64 offs, size_t len, void *buf)
regmap_write(rpc->regmap, RPCIF_DRDMCR, rpc->dummy);
regmap_write(rpc->regmap, RPCIF_DRDRENR, rpc->ddr);
- memcpy_fromio(buf, rpc->dirmap + from, len);
+ if (rpc->bus_size == 2)
+ memcpy_fromio_readw(buf, rpc->dirmap + from, len);
+ else
+ memcpy_fromio(buf, rpc->dirmap + from, len);
pm_runtime_put(rpc->dev);
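To make the access pattern of memcpy_fromio_readw() concrete, here is an
illustrative trace (hypothetical offsets, 64-bit kernel, so maxw = 8) of
a 5-byte read starting at the odd offset 0x1003:

/*
 *   __raw_readw(0x1002)  -> keep the high byte   (unaligned head)
 *   __raw_readw(0x1004)  -> 2 bytes              (16-bit loop)
 *   __raw_readw(0x1006)  -> 2 bytes              (16-bit loop)
 *
 * Every bus access is a 2-byte read at an even address, as the
 * HyperFlash interface requires. Plain memcpy_fromio() would have
 * issued a 1-byte read for the unaligned head and returned the wrong
 * byte, producing corruption like the hexdump difference above.
 */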
The patch below does not apply to the 5.14 or 5.10 stable trees.
If someone wants it applied to either of them, or to any other stable or
longterm tree, then please email the backport, including the original git
commit id, to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 6b51b02a3a0ac49dfe302818d0746a799545e4e9 Mon Sep 17 00:00:00 2001
From: Christian König <christian.koenig(a)amd.com>
Date: Tue, 15 Jun 2021 13:12:33 +0200
Subject: [PATCH] dma-buf: fix and rework dma_buf_poll v7
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Daniel pointed me towards this function, and there are multiple obvious
problems in the implementation.
First of all, the retry loop is not working as intended. In general, the
retry only makes sense if you grab the reference first and then check the
sequence values.
Then, we should always also wait for the exclusive fence.
It's also good practice to keep the reference around when installing
callbacks to fences you don't own.
And last, the whole implementation was unnecessarily complex and rather
hard to understand, which could lead to unexpected behavior of the IOCTL.
Fix all this by reworking the implementation from scratch, dropping the
whole RCU approach and taking the lock instead.
Only mildly tested and needs a thoughtful review of the code.
Pushing through drm-misc-next to avoid merge conflicts and give the code
another round of testing.
v2: fix the reference counting as well
v3: keep the excl fence handling as is for stable
v4: back to testing all fences, drop RCU
v5: handle in and out separately
v6: add missing clear of events
v7: change coding style as suggested by Michel, drop unused variables
Signed-off-by: Christian König <christian.koenig(a)amd.com>
Reviewed-by: Daniel Vetter <daniel.vetter(a)ffwll.ch>
Tested-by: Michel Dänzer <mdaenzer(a)redhat.com>
CC: stable(a)vger.kernel.org
Link: https://patchwork.freedesktop.org/patch/msgid/20210720131110.88512-1-christ…
diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
index 474de2d988ca..61e20ae7b08b 100644
--- a/drivers/dma-buf/dma-buf.c
+++ b/drivers/dma-buf/dma-buf.c
@@ -74,7 +74,7 @@ static void dma_buf_release(struct dentry *dentry)
* If you hit this BUG() it means someone dropped their ref to the
* dma-buf while still having pending operation to the buffer.
*/
- BUG_ON(dmabuf->cb_shared.active || dmabuf->cb_excl.active);
+ BUG_ON(dmabuf->cb_in.active || dmabuf->cb_out.active);
dma_buf_stats_teardown(dmabuf);
dmabuf->ops->release(dmabuf);
@@ -206,16 +206,55 @@ static void dma_buf_poll_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
wake_up_locked_poll(dcb->poll, dcb->active);
dcb->active = 0;
spin_unlock_irqrestore(&dcb->poll->lock, flags);
+ dma_fence_put(fence);
+}
+
+static bool dma_buf_poll_shared(struct dma_resv *resv,
+ struct dma_buf_poll_cb_t *dcb)
+{
+ struct dma_resv_list *fobj = dma_resv_shared_list(resv);
+ struct dma_fence *fence;
+ int i, r;
+
+ if (!fobj)
+ return false;
+
+ for (i = 0; i < fobj->shared_count; ++i) {
+ fence = rcu_dereference_protected(fobj->shared[i],
+ dma_resv_held(resv));
+ dma_fence_get(fence);
+ r = dma_fence_add_callback(fence, &dcb->cb, dma_buf_poll_cb);
+ if (!r)
+ return true;
+ dma_fence_put(fence);
+ }
+
+ return false;
+}
+
+static bool dma_buf_poll_excl(struct dma_resv *resv,
+ struct dma_buf_poll_cb_t *dcb)
+{
+ struct dma_fence *fence = dma_resv_excl_fence(resv);
+ int r;
+
+ if (!fence)
+ return false;
+
+ dma_fence_get(fence);
+ r = dma_fence_add_callback(fence, &dcb->cb, dma_buf_poll_cb);
+ if (!r)
+ return true;
+ dma_fence_put(fence);
+
+ return false;
}
static __poll_t dma_buf_poll(struct file *file, poll_table *poll)
{
struct dma_buf *dmabuf;
struct dma_resv *resv;
- struct dma_resv_list *fobj;
- struct dma_fence *fence_excl;
__poll_t events;
- unsigned shared_count, seq;
dmabuf = file->private_data;
if (!dmabuf || !dmabuf->resv)
@@ -229,101 +268,50 @@ static __poll_t dma_buf_poll(struct file *file, poll_table *poll)
if (!events)
return 0;
-retry:
- seq = read_seqcount_begin(&resv->seq);
- rcu_read_lock();
-
- fobj = rcu_dereference(resv->fence);
- if (fobj)
- shared_count = fobj->shared_count;
- else
- shared_count = 0;
- fence_excl = dma_resv_excl_fence(resv);
- if (read_seqcount_retry(&resv->seq, seq)) {
- rcu_read_unlock();
- goto retry;
- }
-
- if (fence_excl && (!(events & EPOLLOUT) || shared_count == 0)) {
- struct dma_buf_poll_cb_t *dcb = &dmabuf->cb_excl;
- __poll_t pevents = EPOLLIN;
+ dma_resv_lock(resv, NULL);
- if (shared_count == 0)
- pevents |= EPOLLOUT;
+ if (events & EPOLLOUT) {
+ struct dma_buf_poll_cb_t *dcb = &dmabuf->cb_out;
+ /* Check that callback isn't busy */
spin_lock_irq(&dmabuf->poll.lock);
- if (dcb->active) {
- dcb->active |= pevents;
- events &= ~pevents;
- } else
- dcb->active = pevents;
+ if (dcb->active)
+ events &= ~EPOLLOUT;
+ else
+ dcb->active = EPOLLOUT;
spin_unlock_irq(&dmabuf->poll.lock);
- if (events & pevents) {
- if (!dma_fence_get_rcu(fence_excl)) {
- /* force a recheck */
- events &= ~pevents;
- dma_buf_poll_cb(NULL, &dcb->cb);
- } else if (!dma_fence_add_callback(fence_excl, &dcb->cb,
- dma_buf_poll_cb)) {
- events &= ~pevents;
- dma_fence_put(fence_excl);
- } else {
- /*
- * No callback queued, wake up any additional
- * waiters.
- */
- dma_fence_put(fence_excl);
+ if (events & EPOLLOUT) {
+ if (!dma_buf_poll_shared(resv, dcb) &&
+ !dma_buf_poll_excl(resv, dcb))
+ /* No callback queued, wake up any other waiters */
dma_buf_poll_cb(NULL, &dcb->cb);
- }
+ else
+ events &= ~EPOLLOUT;
}
}
- if ((events & EPOLLOUT) && shared_count > 0) {
- struct dma_buf_poll_cb_t *dcb = &dmabuf->cb_shared;
- int i;
+ if (events & EPOLLIN) {
+ struct dma_buf_poll_cb_t *dcb = &dmabuf->cb_in;
- /* Only queue a new callback if no event has fired yet */
+ /* Check that callback isn't busy */
spin_lock_irq(&dmabuf->poll.lock);
if (dcb->active)
- events &= ~EPOLLOUT;
+ events &= ~EPOLLIN;
else
- dcb->active = EPOLLOUT;
+ dcb->active = EPOLLIN;
spin_unlock_irq(&dmabuf->poll.lock);
- if (!(events & EPOLLOUT))
- goto out;
-
- for (i = 0; i < shared_count; ++i) {
- struct dma_fence *fence = rcu_dereference(fobj->shared[i]);
-
- if (!dma_fence_get_rcu(fence)) {
- /*
- * fence refcount dropped to zero, this means
- * that fobj has been freed
- *
- * call dma_buf_poll_cb and force a recheck!
- */
- events &= ~EPOLLOUT;
+ if (events & EPOLLIN) {
+ if (!dma_buf_poll_excl(resv, dcb))
+ /* No callback queued, wake up any other waiters */
dma_buf_poll_cb(NULL, &dcb->cb);
- break;
- }
- if (!dma_fence_add_callback(fence, &dcb->cb,
- dma_buf_poll_cb)) {
- dma_fence_put(fence);
- events &= ~EPOLLOUT;
- break;
- }
- dma_fence_put(fence);
+ else
+ events &= ~EPOLLIN;
}
-
- /* No callback queued, wake up any additional waiters. */
- if (i == shared_count)
- dma_buf_poll_cb(NULL, &dcb->cb);
}
-out:
- rcu_read_unlock();
+ dma_resv_unlock(resv);
return events;
}
@@ -566,8 +554,8 @@ struct dma_buf *dma_buf_export(const struct dma_buf_export_info *exp_info)
dmabuf->owner = exp_info->owner;
spin_lock_init(&dmabuf->name_lock);
init_waitqueue_head(&dmabuf->poll);
- dmabuf->cb_excl.poll = dmabuf->cb_shared.poll = &dmabuf->poll;
- dmabuf->cb_excl.active = dmabuf->cb_shared.active = 0;
+ dmabuf->cb_in.poll = dmabuf->cb_out.poll = &dmabuf->poll;
+ dmabuf->cb_in.active = dmabuf->cb_out.active = 0;
if (!resv) {
resv = (struct dma_resv *)&dmabuf[1];
diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
index 66470c37e471..02c2eb874da6 100644
--- a/include/linux/dma-buf.h
+++ b/include/linux/dma-buf.h
@@ -440,7 +440,7 @@ struct dma_buf {
wait_queue_head_t *poll;
__poll_t active;
- } cb_excl, cb_shared;
+ } cb_in, cb_out;
#ifdef CONFIG_DMABUF_SYSFS_STATS
/**
* @sysfs_entry:
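From userspace, the semantics this rework pins down look like this (a
sketch, assuming dmabuf_fd is a dma-buf file descriptor obtained
elsewhere): POLLIN becomes ready once the exclusive (write) fence has
signaled, so the buffer is safe to read; POLLOUT becomes ready once all
fences, shared and exclusive, have signaled, so it is safe to write.

#include <poll.h>

/* Hypothetical helper: block until the dma-buf is safe to read. */
static int wait_for_read_access(int dmabuf_fd)
{
        struct pollfd pfd = {
                .fd = dmabuf_fd,
                .events = POLLIN,       /* wait for writers to finish */
        };

        return poll(&pfd, 1, -1);       /* 1 when ready, -1 on error */
}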
The patch below does not apply to the 5.4, 4.19, 4.14, 4.9, or 4.4
stable trees.
If someone wants it applied to any of them, or to any other stable or
longterm tree, then please email the backport, including the original git
commit id, to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 928265e3601cde78c7e0a3e518a93b27defed3b1 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki(a)intel.com>
Date: Fri, 22 Oct 2021 14:58:23 +0200
Subject: [PATCH] PM: sleep: Do not let "syscore" devices runtime-suspend
during system transitions
There is no reason to allow "syscore" devices to runtime-suspend
during system-wide PM transitions, because they are subject to the
same possible failure modes as any other devices in that respect.
Accordingly, change device_prepare() and device_complete() to call
pm_runtime_get_noresume() and pm_runtime_put(), respectively, for
"syscore" devices too.
Fixes: 057d51a1268f ("Merge branch 'pm-sleep'")
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki(a)intel.com>
Cc: 3.10+ <stable(a)vger.kernel.org> # 3.10+
Reviewed-by: Ulf Hansson <ulf.hansson(a)linaro.org>
diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index cbea78e79f3d..fca6eab871fc 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -1051,7 +1051,7 @@ static void device_complete(struct device *dev, pm_message_t state)
const char *info = NULL;
if (dev->power.syscore)
- return;
+ goto out;
device_lock(dev);
@@ -1081,6 +1081,7 @@ static void device_complete(struct device *dev, pm_message_t state)
device_unlock(dev);
+out:
pm_runtime_put(dev);
}
@@ -1794,9 +1795,6 @@ static int device_prepare(struct device *dev, pm_message_t state)
int (*callback)(struct device *) = NULL;
int ret = 0;
- if (dev->power.syscore)
- return 0;
-
/*
* If a device's parent goes into runtime suspend at the wrong time,
* it won't be possible to resume the device. To prevent this we
@@ -1805,6 +1803,9 @@ static int device_prepare(struct device *dev, pm_message_t state)
*/
pm_runtime_get_noresume(dev);
+ if (dev->power.syscore)
+ return 0;
+
device_lock(dev);
dev->power.wakeup_path = false;
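Condensed, the invariant the two hunks establish looks like this (a
sketch of the resulting flow, not the full functions): the runtime-PM
reference is now taken unconditionally, and only the lock/callback work
is skipped for "syscore" devices, so device_complete() must drop the
reference for them as well.

static int device_prepare(struct device *dev, pm_message_t state)
{
        /* taken for every device, syscore or not, so it cannot
         * runtime-suspend during the system-wide transition */
        pm_runtime_get_noresume(dev);

        if (dev->power.syscore)
                return 0;       /* ref dropped in device_complete() */

        /* ... lock the device and run the ->prepare() callback ... */
        return 0;
}

static void device_complete(struct device *dev, pm_message_t state)
{
        if (dev->power.syscore)
                goto out;

        /* ... run the ->complete() callback and unlock the device ... */
out:
        pm_runtime_put(dev);    /* balances device_prepare() */
}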
The patch below does not apply to the 5.15 or 5.14 stable trees.
If someone wants it applied to either of them, or to any other stable or
longterm tree, then please email the backport, including the original git
commit id, to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 3c08b0931eedd04c530040499fadeccab50ed646 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj(a)kernel.org>
Date: Thu, 14 Oct 2021 13:20:22 -1000
Subject: [PATCH] blk-cgroup: blk_cgroup_bio_start() should use irq-safe
operations on blkg->iostat_cpu
c3df5fb57fe8 ("cgroup: rstat: fix A-A deadlock on 32bit around
u64_stats_sync") made u64_stats updates irq-safe to avoid A-A deadlocks.
Unfortunately, the conversion missed one in blk_cgroup_bio_start(). Fix it.
Fixes: 2d146aa3aa84 ("mm: memcontrol: switch to rstat")
Cc: stable(a)vger.kernel.org # v5.13+
Reported-by: syzbot+9738c8815b375ce482a1(a)syzkaller.appspotmail.com
Signed-off-by: Tejun Heo <tj(a)kernel.org>
Link: https://lore.kernel.org/r/YWi7NrQdVlxD6J9W@slm.duckdns.org
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 38b9f7684952..9a1c5839dd46 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1897,10 +1897,11 @@ void blk_cgroup_bio_start(struct bio *bio)
{
int rwd = blk_cgroup_io_type(bio), cpu;
struct blkg_iostat_set *bis;
+ unsigned long flags;
cpu = get_cpu();
bis = per_cpu_ptr(bio->bi_blkg->iostat_cpu, cpu);
- u64_stats_update_begin(&bis->sync);
+ flags = u64_stats_update_begin_irqsave(&bis->sync);
/*
* If the bio is flagged with BIO_CGROUP_ACCT it means this is a split
@@ -1912,7 +1913,7 @@ void blk_cgroup_bio_start(struct bio *bio)
}
bis->cur.ios[rwd]++;
- u64_stats_update_end(&bis->sync);
+ u64_stats_update_end_irqrestore(&bis->sync, flags);
if (cgroup_subsys_on_dfl(io_cgrp_subsys))
cgroup_rstat_updated(bio->bi_blkg->blkcg->css.cgroup, cpu);
put_cpu();
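The begin_irqsave/end_irqrestore pair comes from <linux/u64_stats_sync.h>:
on 32-bit it disables interrupts around the seqcount write, so an IRQ on the
same CPU cannot re-enter the critical section and spin on itself (the A-A
case above), while on 64-bit it compiles down to nearly nothing. A generic
writer/reader sketch with illustrative names, not the blk-cgroup code itself:

#include <linux/u64_stats_sync.h>

struct demo_stats {
	u64_stats_t bytes;
	struct u64_stats_sync sync;
};

static void demo_init(struct demo_stats *s)
{
	u64_stats_init(&s->sync);	/* must run before first use */
}

static void demo_add(struct demo_stats *s, u64 n)
{
	unsigned long flags;

	/* IRQ-safe on 32-bit; a no-op cost on 64-bit. */
	flags = u64_stats_update_begin_irqsave(&s->sync);
	u64_stats_add(&s->bytes, n);
	u64_stats_update_end_irqrestore(&s->sync, flags);
}

static u64 demo_read(struct demo_stats *s)
{
	unsigned int seq;
	u64 v;

	do {
		seq = u64_stats_fetch_begin(&s->sync);
		v = u64_stats_read(&s->bytes);
	} while (u64_stats_fetch_retry(&s->sync, seq));

	return v;
}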
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 3c08b0931eedd04c530040499fadeccab50ed646 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj(a)kernel.org>
Date: Thu, 14 Oct 2021 13:20:22 -1000
Subject: [PATCH] blk-cgroup: blk_cgroup_bio_start() should use irq-safe
operations on blkg->iostat_cpu
c3df5fb57fe8 ("cgroup: rstat: fix A-A deadlock on 32bit around
u64_stats_sync") made u64_stats updates irq-safe to avoid A-A deadlocks.
Unfortunately, the conversion missed one in blk_cgroup_bio_start(). Fix it.
Fixes: 2d146aa3aa84 ("mm: memcontrol: switch to rstat")
Cc: stable(a)vger.kernel.org # v5.13+
Reported-by: syzbot+9738c8815b375ce482a1(a)syzkaller.appspotmail.com
Signed-off-by: Tejun Heo <tj(a)kernel.org>
Link: https://lore.kernel.org/r/YWi7NrQdVlxD6J9W@slm.duckdns.org
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 38b9f7684952..9a1c5839dd46 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1897,10 +1897,11 @@ void blk_cgroup_bio_start(struct bio *bio)
{
int rwd = blk_cgroup_io_type(bio), cpu;
struct blkg_iostat_set *bis;
+ unsigned long flags;
cpu = get_cpu();
bis = per_cpu_ptr(bio->bi_blkg->iostat_cpu, cpu);
- u64_stats_update_begin(&bis->sync);
+ flags = u64_stats_update_begin_irqsave(&bis->sync);
/*
* If the bio is flagged with BIO_CGROUP_ACCT it means this is a split
@@ -1912,7 +1913,7 @@ void blk_cgroup_bio_start(struct bio *bio)
}
bis->cur.ios[rwd]++;
- u64_stats_update_end(&bis->sync);
+ u64_stats_update_end_irqrestore(&bis->sync, flags);
if (cgroup_subsys_on_dfl(io_cgrp_subsys))
cgroup_rstat_updated(bio->bi_blkg->blkcg->css.cgroup, cpu);
put_cpu();
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From a7fda04bc9b6ad9da8e19c9e6e3b1dab773d068a Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzysztof.kozlowski(a)canonical.com>
Date: Fri, 8 Oct 2021 13:37:14 +0200
Subject: [PATCH] regulator: dt-bindings: samsung,s5m8767: correct
s5m8767,pmic-buck-default-dvs-idx property
The driver was always parsing "s5m8767,pmic-buck-default-dvs-idx", not
"s5m8767,pmic-buck234-default-dvs-idx".
Cc: <stable(a)vger.kernel.org>
Fixes: 26aec009f6b6 ("regulator: add device tree support for s5m8767")
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski(a)canonical.com>
Acked-by: Rob Herring <robh(a)kernel.org>
Message-Id: <20211008113723.134648-3-krzysztof.kozlowski(a)canonical.com>
Signed-off-by: Mark Brown <broonie(a)kernel.org>
diff --git a/Documentation/devicetree/bindings/regulator/samsung,s5m8767.txt b/Documentation/devicetree/bindings/regulator/samsung,s5m8767.txt
index d9cff1614f7a..6cd83d920155 100644
--- a/Documentation/devicetree/bindings/regulator/samsung,s5m8767.txt
+++ b/Documentation/devicetree/bindings/regulator/samsung,s5m8767.txt
@@ -39,7 +39,7 @@ Optional properties of the main device node (the parent!):
Additional properties required if either of the optional properties are used:
- - s5m8767,pmic-buck234-default-dvs-idx: Default voltage setting selected from
+ - s5m8767,pmic-buck-default-dvs-idx: Default voltage setting selected from
the possible 8 options selectable by the dvs gpios. The value of this
property should be between 0 and 7. If not specified or if out of range, the
default value of this property is set to 0.
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From b16bef60a9112b1e6daf3afd16484eb06e7ce792 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzysztof.kozlowski(a)canonical.com>
Date: Fri, 8 Oct 2021 13:37:13 +0200
Subject: [PATCH] regulator: s5m8767: do not use reset value as DVS voltage if
GPIO DVS is disabled
Before commit 04f9f068a619 ("regulator: s5m8767: Modify parsing method of
the voltage table of buck2/3/4"), the driver and its bindings required
providing at least one safe/default voltage for the DVS registers if DVS
GPIO is not enabled.
IOW, if s5m8767,pmic-buck2-uses-gpio-dvs is missing, the
s5m8767,pmic-buck2-dvs-voltage should still be present and contain one
voltage.
This requirement was coming from driver behavior matching this condition
(none of DVS GPIO is enabled): it was always initializing the DVS
selector pins to 0 and keeping the DVS enable setting at reset value
(enabled). Therefore if none of DVS GPIO is enabled in devicetree,
driver was configuring the first DVS voltage for buck[234].
The mentioned commit 04f9f068a619 ("regulator: s5m8767: Modify parsing
method of the voltage table of buck2/3/4") broke it because DVS voltage
won't be parsed from devicetree if DVS GPIO is not enabled. After the
change, driver will configure bucks to use the register reset value as
voltage which might have unpleasant effects.
Fix this by relaxing the bindings constraint: if DVS GPIO is not enabled
in devicetree (therefore DVS voltage is also not parsed), explicitly
disable it.
Cc: <stable(a)vger.kernel.org>
Fixes: 04f9f068a619 ("regulator: s5m8767: Modify parsing method of the voltage table of buck2/3/4")
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski(a)canonical.com>
Acked-by: Rob Herring <robh(a)kernel.org>
Message-Id: <20211008113723.134648-2-krzysztof.kozlowski(a)canonical.com>
Signed-off-by: Mark Brown <broonie(a)kernel.org>
diff --git a/Documentation/devicetree/bindings/regulator/samsung,s5m8767.txt b/Documentation/devicetree/bindings/regulator/samsung,s5m8767.txt
index 093edda0c8df..d9cff1614f7a 100644
--- a/Documentation/devicetree/bindings/regulator/samsung,s5m8767.txt
+++ b/Documentation/devicetree/bindings/regulator/samsung,s5m8767.txt
@@ -13,6 +13,14 @@ common regulator binding documented in:
Required properties of the main device node (the parent!):
+ - s5m8767,pmic-buck-ds-gpios: GPIO specifiers for three host gpio's used
+ for selecting GPIO DVS lines. It is one-to-one mapped to dvs gpio lines.
+
+ [1] If either of the 's5m8767,pmic-buck[2/3/4]-uses-gpio-dvs' optional
+ property is specified, then all the eight voltage values for the
+ 's5m8767,pmic-buck[2/3/4]-dvs-voltage' should be specified.
+
+Optional properties of the main device node (the parent!):
- s5m8767,pmic-buck2-dvs-voltage: A set of 8 voltage values in micro-volt (uV)
units for buck2 when changing voltage using gpio dvs. Refer to [1] below
for additional information.
@@ -25,19 +33,6 @@ Required properties of the main device node (the parent!):
units for buck4 when changing voltage using gpio dvs. Refer to [1] below
for additional information.
- - s5m8767,pmic-buck-ds-gpios: GPIO specifiers for three host gpio's used
- for selecting GPIO DVS lines. It is one-to-one mapped to dvs gpio lines.
-
- [1] If none of the 's5m8767,pmic-buck[2/3/4]-uses-gpio-dvs' optional
- property is specified, the 's5m8767,pmic-buck[2/3/4]-dvs-voltage'
- property should specify atleast one voltage level (which would be a
- safe operating voltage).
-
- If either of the 's5m8767,pmic-buck[2/3/4]-uses-gpio-dvs' optional
- property is specified, then all the eight voltage values for the
- 's5m8767,pmic-buck[2/3/4]-dvs-voltage' should be specified.
-
-Optional properties of the main device node (the parent!):
- s5m8767,pmic-buck2-uses-gpio-dvs: 'buck2' can be controlled by gpio dvs.
- s5m8767,pmic-buck3-uses-gpio-dvs: 'buck3' can be controlled by gpio dvs.
- s5m8767,pmic-buck4-uses-gpio-dvs: 'buck4' can be controlled by gpio dvs.
diff --git a/drivers/regulator/s5m8767.c b/drivers/regulator/s5m8767.c
index 7c111bbdc2af..35269f998210 100644
--- a/drivers/regulator/s5m8767.c
+++ b/drivers/regulator/s5m8767.c
@@ -850,18 +850,15 @@ static int s5m8767_pmic_probe(struct platform_device *pdev)
/* DS4 GPIO */
gpio_direction_output(pdata->buck_ds[2], 0x0);
- if (pdata->buck2_gpiodvs || pdata->buck3_gpiodvs ||
- pdata->buck4_gpiodvs) {
- regmap_update_bits(s5m8767->iodev->regmap_pmic,
- S5M8767_REG_BUCK2CTRL, 1 << 1,
- (pdata->buck2_gpiodvs) ? (1 << 1) : (0 << 1));
- regmap_update_bits(s5m8767->iodev->regmap_pmic,
- S5M8767_REG_BUCK3CTRL, 1 << 1,
- (pdata->buck3_gpiodvs) ? (1 << 1) : (0 << 1));
- regmap_update_bits(s5m8767->iodev->regmap_pmic,
- S5M8767_REG_BUCK4CTRL, 1 << 1,
- (pdata->buck4_gpiodvs) ? (1 << 1) : (0 << 1));
- }
+ regmap_update_bits(s5m8767->iodev->regmap_pmic,
+ S5M8767_REG_BUCK2CTRL, 1 << 1,
+ (pdata->buck2_gpiodvs) ? (1 << 1) : (0 << 1));
+ regmap_update_bits(s5m8767->iodev->regmap_pmic,
+ S5M8767_REG_BUCK3CTRL, 1 << 1,
+ (pdata->buck3_gpiodvs) ? (1 << 1) : (0 << 1));
+ regmap_update_bits(s5m8767->iodev->regmap_pmic,
+ S5M8767_REG_BUCK4CTRL, 1 << 1,
+ (pdata->buck4_gpiodvs) ? (1 << 1) : (0 << 1));
/* Initialize GPIO DVS registers */
for (i = 0; i < 8; i++) {
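regmap_update_bits() performs a read-modify-write confined to the mask,
which is why the patch can drop the surrounding if (): with no DVS GPIO
enabled in devicetree, bit 1 of each BUCKxCTRL register (which gates GPIO
DVS) is now explicitly cleared rather than being left at its enabled reset
value. An illustrative sketch of that call shape, with a made-up helper name:

#include <linux/bits.h>
#include <linux/regmap.h>

static int demo_set_gpiodvs(struct regmap *map, unsigned int ctrl_reg,
			    bool use_gpio_dvs)
{
	/* Read-modify-write: only the bit under the mask changes. */
	return regmap_update_bits(map, ctrl_reg, BIT(1),
				  use_gpio_dvs ? BIT(1) : 0);
}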
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From db05ddf7f321634c5659a0cf7ea56594e22365f7 Mon Sep 17 00:00:00 2001
From: Corey Minyard <cminyard(a)mvista.com>
Date: Mon, 20 Sep 2021 06:25:37 -0500
Subject: [PATCH] ipmi:watchdog: Set panic count to proper value on a panic
Since commit 2033f6858970 ("ipmi: Free receive messages when in an oops")
was added, sending the messages on a panic produces two decrements, not
one, but the watchdog code had a bug where it didn't set the count
accordingly.
Reported-by: Anton Lundin <glance(a)acc.umu.se>
Cc: <Stable(a)vger.kernel.org> # v5.4+
Fixes: 2033f6858970 ("ipmi: Free receive messages when in an oops")
Signed-off-by: Corey Minyard <cminyard(a)mvista.com>
diff --git a/drivers/char/ipmi/ipmi_watchdog.c b/drivers/char/ipmi/ipmi_watchdog.c
index e4ff3b50de7f..f855a9665c28 100644
--- a/drivers/char/ipmi/ipmi_watchdog.c
+++ b/drivers/char/ipmi/ipmi_watchdog.c
@@ -497,7 +497,7 @@ static void panic_halt_ipmi_heartbeat(void)
msg.cmd = IPMI_WDOG_RESET_TIMER;
msg.data = NULL;
msg.data_len = 0;
- atomic_inc(&panic_done_count);
+ atomic_add(2, &panic_done_count);
rv = ipmi_request_supply_msgs(watchdog_user,
(struct ipmi_addr *) &addr,
0,
@@ -507,7 +507,7 @@ static void panic_halt_ipmi_heartbeat(void)
&panic_halt_heartbeat_recv_msg,
1);
if (rv)
- atomic_dec(&panic_done_count);
+ atomic_sub(2, &panic_done_count);
}
static struct ipmi_smi_msg panic_halt_smi_msg = {
@@ -531,12 +531,12 @@ static void panic_halt_ipmi_set_timeout(void)
/* Wait for the messages to be free. */
while (atomic_read(&panic_done_count) != 0)
ipmi_poll_interface(watchdog_user);
- atomic_inc(&panic_done_count);
+ atomic_add(2, &panic_done_count);
rv = __ipmi_set_timeout(&panic_halt_smi_msg,
&panic_halt_recv_msg,
&send_heartbeat_now);
if (rv) {
- atomic_dec(&panic_done_count);
+ atomic_sub(2, &panic_done_count);
pr_warn("Unable to extend the watchdog timeout\n");
} else {
if (send_heartbeat_now)
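The arithmetic behind the change: per the commit message, both the SMI
message and the receive message are freed on the panic path since
2033f6858970, and each free decrements panic_done_count once, so every
request in flight has two completions to wait for. A minimal sketch of that
accounting, with illustrative helper names:

#include <linux/atomic.h>

static atomic_t panic_done_count = ATOMIC_INIT(0);

static void panic_request_queued(void)
{
	atomic_add(2, &panic_done_count);	/* smi msg + recv msg */
}

static void panic_request_failed(void)
{
	atomic_sub(2, &panic_done_count);	/* neither free will happen */
}

/* Message-free hook: runs twice per completed request. */
static void panic_msg_freed(void)
{
	atomic_dec(&panic_done_count);
}

/* The panic path then polls the interface until the count drains to 0. */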
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From db05ddf7f321634c5659a0cf7ea56594e22365f7 Mon Sep 17 00:00:00 2001
From: Corey Minyard <cminyard(a)mvista.com>
Date: Mon, 20 Sep 2021 06:25:37 -0500
Subject: [PATCH] ipmi:watchdog: Set panic count to proper value on a panic
Since commit 2033f6858970 ("ipmi: Free receive messages when in an oops")
was added, sending the messages on a panic produces two decrements, not
one, but the watchdog code had a bug where it didn't set the count
accordingly.
Reported-by: Anton Lundin <glance(a)acc.umu.se>
Cc: <Stable(a)vger.kernel.org> # v5.4+
Fixes: 2033f6858970 ("ipmi: Free receive messages when in an oops")
Signed-off-by: Corey Minyard <cminyard(a)mvista.com>
diff --git a/drivers/char/ipmi/ipmi_watchdog.c b/drivers/char/ipmi/ipmi_watchdog.c
index e4ff3b50de7f..f855a9665c28 100644
--- a/drivers/char/ipmi/ipmi_watchdog.c
+++ b/drivers/char/ipmi/ipmi_watchdog.c
@@ -497,7 +497,7 @@ static void panic_halt_ipmi_heartbeat(void)
msg.cmd = IPMI_WDOG_RESET_TIMER;
msg.data = NULL;
msg.data_len = 0;
- atomic_inc(&panic_done_count);
+ atomic_add(2, &panic_done_count);
rv = ipmi_request_supply_msgs(watchdog_user,
(struct ipmi_addr *) &addr,
0,
@@ -507,7 +507,7 @@ static void panic_halt_ipmi_heartbeat(void)
&panic_halt_heartbeat_recv_msg,
1);
if (rv)
- atomic_dec(&panic_done_count);
+ atomic_sub(2, &panic_done_count);
}
static struct ipmi_smi_msg panic_halt_smi_msg = {
@@ -531,12 +531,12 @@ static void panic_halt_ipmi_set_timeout(void)
/* Wait for the messages to be free. */
while (atomic_read(&panic_done_count) != 0)
ipmi_poll_interface(watchdog_user);
- atomic_inc(&panic_done_count);
+ atomic_add(2, &panic_done_count);
rv = __ipmi_set_timeout(&panic_halt_smi_msg,
&panic_halt_recv_msg,
&send_heartbeat_now);
if (rv) {
- atomic_dec(&panic_done_count);
+ atomic_sub(2, &panic_done_count);
pr_warn("Unable to extend the watchdog timeout\n");
} else {
if (send_heartbeat_now)
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From cbfcd13be5cb2a07868afe67520ed181956579a7 Mon Sep 17 00:00:00 2001
From: Ondrej Mosnacek <omosnace(a)redhat.com>
Date: Wed, 28 Jul 2021 16:03:13 +0200
Subject: [PATCH] selinux: fix race condition when computing ocontext SIDs
Current code contains a lot of racy patterns when converting an
ocontext's context structure to an SID. This is being done in a "lazy"
fashion, such that the SID is looked up in the SID table only when it's
first needed and then cached in the "sid" field of the ocontext
structure. However, this is done without any locking or memory barriers
and is thus unsafe.
Between commits 24ed7fdae669 ("selinux: use separate table for initial
SID lookup") and 66f8e2f03c02 ("selinux: sidtab reverse lookup hash
table"), this race condition lead to an actual observable bug, because a
pointer to the shared sid field was passed directly to
sidtab_context_to_sid(), which was using this location to also store an
intermediate value, which could have been read by other threads and
interpreted as an SID. In practice this caused e.g. new mounts to get a
wrong (seemingly random) filesystem context, leading to strange denials.
This bug has been spotted in the wild at least twice, see [1] and [2].
Fix the race condition by making all the racy functions use a common
helper that ensures the ocontext::sid accesses are made safely using the
appropriate SMP constructs.
Note that security_netif_sid() was populating the sid field of both
contexts stored in the ocontext, but only the first one was actually
used. The SELinux wiki's documentation on the "netifcon" policy
statement [3] suggests that using only the first context is intentional.
I kept only the handling of the first context here, as there is really
no point in doing the SID lookup for the unused one.
I wasn't able to reproduce the bug mentioned above on any kernel that
includes commit 66f8e2f03c02, even though it has been reported that the
issue occurs with that commit, too, just less frequently. Thus, I wasn't
able to verify that this patch fixes the issue, but it makes sense to
avoid the race condition regardless.
[1] https://github.com/containers/container-selinux/issues/89
[2] https://lists.fedoraproject.org/archives/list/selinux@lists.fedoraproject.o…
[3] https://selinuxproject.org/page/NetworkStatements#netifcon
Cc: stable(a)vger.kernel.org
Cc: Xinjie Zheng <xinjie(a)google.com>
Reported-by: Sujithra Periasamy <sujithra(a)google.com>
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Ondrej Mosnacek <omosnace(a)redhat.com>
Signed-off-by: Paul Moore <paul(a)paul-moore.com>
diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c
index e5f1b2757a83..c4931bf6f92a 100644
--- a/security/selinux/ss/services.c
+++ b/security/selinux/ss/services.c
@@ -2376,6 +2376,43 @@ err_policy:
return rc;
}
+/**
+ * ocontext_to_sid - Helper to safely get sid for an ocontext
+ * @sidtab: SID table
+ * @c: ocontext structure
+ * @index: index of the context entry (0 or 1)
+ * @out_sid: pointer to the resulting SID value
+ *
+ * For all ocontexts except OCON_ISID the SID fields are populated
+ * on-demand when needed. Since updating the SID value is an SMP-sensitive
+ * operation, this helper must be used to do that safely.
+ *
+ * WARNING: This function may return -ESTALE, indicating that the caller
+ * must retry the operation after re-acquiring the policy pointer!
+ */
+static int ocontext_to_sid(struct sidtab *sidtab, struct ocontext *c,
+ size_t index, u32 *out_sid)
+{
+ int rc;
+ u32 sid;
+
+ /* Ensure the associated sidtab entry is visible to this thread. */
+ sid = smp_load_acquire(&c->sid[index]);
+ if (!sid) {
+ rc = sidtab_context_to_sid(sidtab, &c->context[index], &sid);
+ if (rc)
+ return rc;
+
+ /*
+ * Ensure the new sidtab entry is visible to other threads
+ * when they see the SID.
+ */
+ smp_store_release(&c->sid[index], sid);
+ }
+ *out_sid = sid;
+ return 0;
+}
+
/**
* security_port_sid - Obtain the SID for a port.
* @state: SELinux state
@@ -2414,17 +2451,13 @@ retry:
}
if (c) {
- if (!c->sid[0]) {
- rc = sidtab_context_to_sid(sidtab, &c->context[0],
- &c->sid[0]);
- if (rc == -ESTALE) {
- rcu_read_unlock();
- goto retry;
- }
- if (rc)
- goto out;
+ rc = ocontext_to_sid(sidtab, c, 0, out_sid);
+ if (rc == -ESTALE) {
+ rcu_read_unlock();
+ goto retry;
}
- *out_sid = c->sid[0];
+ if (rc)
+ goto out;
} else {
*out_sid = SECINITSID_PORT;
}
@@ -2473,18 +2506,13 @@ retry:
}
if (c) {
- if (!c->sid[0]) {
- rc = sidtab_context_to_sid(sidtab,
- &c->context[0],
- &c->sid[0]);
- if (rc == -ESTALE) {
- rcu_read_unlock();
- goto retry;
- }
- if (rc)
- goto out;
+ rc = ocontext_to_sid(sidtab, c, 0, out_sid);
+ if (rc == -ESTALE) {
+ rcu_read_unlock();
+ goto retry;
}
- *out_sid = c->sid[0];
+ if (rc)
+ goto out;
} else
*out_sid = SECINITSID_UNLABELED;
@@ -2533,17 +2561,13 @@ retry:
}
if (c) {
- if (!c->sid[0]) {
- rc = sidtab_context_to_sid(sidtab, &c->context[0],
- &c->sid[0]);
- if (rc == -ESTALE) {
- rcu_read_unlock();
- goto retry;
- }
- if (rc)
- goto out;
+ rc = ocontext_to_sid(sidtab, c, 0, out_sid);
+ if (rc == -ESTALE) {
+ rcu_read_unlock();
+ goto retry;
}
- *out_sid = c->sid[0];
+ if (rc)
+ goto out;
} else
*out_sid = SECINITSID_UNLABELED;
@@ -2587,25 +2611,13 @@ retry:
}
if (c) {
- if (!c->sid[0] || !c->sid[1]) {
- rc = sidtab_context_to_sid(sidtab, &c->context[0],
- &c->sid[0]);
- if (rc == -ESTALE) {
- rcu_read_unlock();
- goto retry;
- }
- if (rc)
- goto out;
- rc = sidtab_context_to_sid(sidtab, &c->context[1],
- &c->sid[1]);
- if (rc == -ESTALE) {
- rcu_read_unlock();
- goto retry;
- }
- if (rc)
- goto out;
+ rc = ocontext_to_sid(sidtab, c, 0, if_sid);
+ if (rc == -ESTALE) {
+ rcu_read_unlock();
+ goto retry;
}
- *if_sid = c->sid[0];
+ if (rc)
+ goto out;
} else
*if_sid = SECINITSID_NETIF;
@@ -2697,18 +2709,13 @@ retry:
}
if (c) {
- if (!c->sid[0]) {
- rc = sidtab_context_to_sid(sidtab,
- &c->context[0],
- &c->sid[0]);
- if (rc == -ESTALE) {
- rcu_read_unlock();
- goto retry;
- }
- if (rc)
- goto out;
+ rc = ocontext_to_sid(sidtab, c, 0, out_sid);
+ if (rc == -ESTALE) {
+ rcu_read_unlock();
+ goto retry;
}
- *out_sid = c->sid[0];
+ if (rc)
+ goto out;
} else {
*out_sid = SECINITSID_NODE;
}
@@ -2873,7 +2880,7 @@ static inline int __security_genfs_sid(struct selinux_policy *policy,
u16 sclass;
struct genfs *genfs;
struct ocontext *c;
- int rc, cmp = 0;
+ int cmp = 0;
while (path[0] == '/' && path[1] == '/')
path++;
@@ -2887,9 +2894,8 @@ static inline int __security_genfs_sid(struct selinux_policy *policy,
break;
}
- rc = -ENOENT;
if (!genfs || cmp)
- goto out;
+ return -ENOENT;
for (c = genfs->head; c; c = c->next) {
len = strlen(c->u.name);
@@ -2898,20 +2904,10 @@ static inline int __security_genfs_sid(struct selinux_policy *policy,
break;
}
- rc = -ENOENT;
if (!c)
- goto out;
-
- if (!c->sid[0]) {
- rc = sidtab_context_to_sid(sidtab, &c->context[0], &c->sid[0]);
- if (rc)
- goto out;
- }
+ return -ENOENT;
- *sid = c->sid[0];
- rc = 0;
-out:
- return rc;
+ return ocontext_to_sid(sidtab, c, 0, sid);
}
/**
@@ -2996,17 +2992,13 @@ retry:
if (c) {
sbsec->behavior = c->v.behavior;
- if (!c->sid[0]) {
- rc = sidtab_context_to_sid(sidtab, &c->context[0],
- &c->sid[0]);
- if (rc == -ESTALE) {
- rcu_read_unlock();
- goto retry;
- }
- if (rc)
- goto out;
+ rc = ocontext_to_sid(sidtab, c, 0, &sbsec->sid);
+ if (rc == -ESTALE) {
+ rcu_read_unlock();
+ goto retry;
}
- sbsec->sid = c->sid[0];
+ if (rc)
+ goto out;
} else {
rc = __security_genfs_sid(policy, fstype, "/",
SECCLASS_DIR, &sbsec->sid);
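ocontext_to_sid() is an instance of the general lazily-published value
pattern: readers load with acquire semantics, the first computing thread
publishes with release semantics, and a duplicated computation is harmless
because sidtab_context_to_sid() returns the same SID for the same context.
A generic sketch of the pattern, not SELinux code:

#include <linux/types.h>
#include <linux/errno.h>
#include <asm/barrier.h>

/* 0 doubles as the "not yet computed" sentinel, as with SIDs. */
static int lazy_get(u32 *cache, int (*compute)(u32 *out), u32 *out)
{
	u32 v = smp_load_acquire(cache);	/* pairs with release below */

	if (!v) {
		int rc = compute(&v);

		if (rc)
			return rc;
		/* Everything compute() initialised is ordered before
		 * the value becomes visible to other readers. */
		smp_store_release(cache, v);
	}
	*out = v;
	return 0;
}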
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From cbfcd13be5cb2a07868afe67520ed181956579a7 Mon Sep 17 00:00:00 2001
From: Ondrej Mosnacek <omosnace(a)redhat.com>
Date: Wed, 28 Jul 2021 16:03:13 +0200
Subject: [PATCH] selinux: fix race condition when computing ocontext SIDs
Current code contains a lot of racy patterns when converting an
ocontext's context structure to an SID. This is being done in a "lazy"
fashion, such that the SID is looked up in the SID table only when it's
first needed and then cached in the "sid" field of the ocontext
structure. However, this is done without any locking or memory barriers
and is thus unsafe.
Between commits 24ed7fdae669 ("selinux: use separate table for initial
SID lookup") and 66f8e2f03c02 ("selinux: sidtab reverse lookup hash
table"), this race condition lead to an actual observable bug, because a
pointer to the shared sid field was passed directly to
sidtab_context_to_sid(), which was using this location to also store an
intermediate value, which could have been read by other threads and
interpreted as an SID. In practice this caused e.g. new mounts to get a
wrong (seemingly random) filesystem context, leading to strange denials.
This bug has been spotted in the wild at least twice, see [1] and [2].
Fix the race condition by making all the racy functions use a common
helper that ensures the ocontext::sid accesses are made safely using the
appropriate SMP constructs.
Note that security_netif_sid() was populating the sid field of both
contexts stored in the ocontext, but only the first one was actually
used. The SELinux wiki's documentation on the "netifcon" policy
statement [3] suggests that using only the first context is intentional.
I kept only the handling of the first context here, as there is really
no point in doing the SID lookup for the unused one.
I wasn't able to reproduce the bug mentioned above on any kernel that
includes commit 66f8e2f03c02, even though it has been reported that the
issue occurs with that commit, too, just less frequently. Thus, I wasn't
able to verify that this patch fixes the issue, but it makes sense to
avoid the race condition regardless.
[1] https://github.com/containers/container-selinux/issues/89
[2] https://lists.fedoraproject.org/archives/list/selinux@lists.fedoraproject.o…
[3] https://selinuxproject.org/page/NetworkStatements#netifcon
Cc: stable(a)vger.kernel.org
Cc: Xinjie Zheng <xinjie(a)google.com>
Reported-by: Sujithra Periasamy <sujithra(a)google.com>
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Ondrej Mosnacek <omosnace(a)redhat.com>
Signed-off-by: Paul Moore <paul(a)paul-moore.com>
diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c
index e5f1b2757a83..c4931bf6f92a 100644
--- a/security/selinux/ss/services.c
+++ b/security/selinux/ss/services.c
@@ -2376,6 +2376,43 @@ err_policy:
return rc;
}
+/**
+ * ocontext_to_sid - Helper to safely get sid for an ocontext
+ * @sidtab: SID table
+ * @c: ocontext structure
+ * @index: index of the context entry (0 or 1)
+ * @out_sid: pointer to the resulting SID value
+ *
+ * For all ocontexts except OCON_ISID the SID fields are populated
+ * on-demand when needed. Since updating the SID value is an SMP-sensitive
+ * operation, this helper must be used to do that safely.
+ *
+ * WARNING: This function may return -ESTALE, indicating that the caller
+ * must retry the operation after re-acquiring the policy pointer!
+ */
+static int ocontext_to_sid(struct sidtab *sidtab, struct ocontext *c,
+ size_t index, u32 *out_sid)
+{
+ int rc;
+ u32 sid;
+
+ /* Ensure the associated sidtab entry is visible to this thread. */
+ sid = smp_load_acquire(&c->sid[index]);
+ if (!sid) {
+ rc = sidtab_context_to_sid(sidtab, &c->context[index], &sid);
+ if (rc)
+ return rc;
+
+ /*
+ * Ensure the new sidtab entry is visible to other threads
+ * when they see the SID.
+ */
+ smp_store_release(&c->sid[index], sid);
+ }
+ *out_sid = sid;
+ return 0;
+}
+
/**
* security_port_sid - Obtain the SID for a port.
* @state: SELinux state
@@ -2414,17 +2451,13 @@ retry:
}
if (c) {
- if (!c->sid[0]) {
- rc = sidtab_context_to_sid(sidtab, &c->context[0],
- &c->sid[0]);
- if (rc == -ESTALE) {
- rcu_read_unlock();
- goto retry;
- }
- if (rc)
- goto out;
+ rc = ocontext_to_sid(sidtab, c, 0, out_sid);
+ if (rc == -ESTALE) {
+ rcu_read_unlock();
+ goto retry;
}
- *out_sid = c->sid[0];
+ if (rc)
+ goto out;
} else {
*out_sid = SECINITSID_PORT;
}
@@ -2473,18 +2506,13 @@ retry:
}
if (c) {
- if (!c->sid[0]) {
- rc = sidtab_context_to_sid(sidtab,
- &c->context[0],
- &c->sid[0]);
- if (rc == -ESTALE) {
- rcu_read_unlock();
- goto retry;
- }
- if (rc)
- goto out;
+ rc = ocontext_to_sid(sidtab, c, 0, out_sid);
+ if (rc == -ESTALE) {
+ rcu_read_unlock();
+ goto retry;
}
- *out_sid = c->sid[0];
+ if (rc)
+ goto out;
} else
*out_sid = SECINITSID_UNLABELED;
@@ -2533,17 +2561,13 @@ retry:
}
if (c) {
- if (!c->sid[0]) {
- rc = sidtab_context_to_sid(sidtab, &c->context[0],
- &c->sid[0]);
- if (rc == -ESTALE) {
- rcu_read_unlock();
- goto retry;
- }
- if (rc)
- goto out;
+ rc = ocontext_to_sid(sidtab, c, 0, out_sid);
+ if (rc == -ESTALE) {
+ rcu_read_unlock();
+ goto retry;
}
- *out_sid = c->sid[0];
+ if (rc)
+ goto out;
} else
*out_sid = SECINITSID_UNLABELED;
@@ -2587,25 +2611,13 @@ retry:
}
if (c) {
- if (!c->sid[0] || !c->sid[1]) {
- rc = sidtab_context_to_sid(sidtab, &c->context[0],
- &c->sid[0]);
- if (rc == -ESTALE) {
- rcu_read_unlock();
- goto retry;
- }
- if (rc)
- goto out;
- rc = sidtab_context_to_sid(sidtab, &c->context[1],
- &c->sid[1]);
- if (rc == -ESTALE) {
- rcu_read_unlock();
- goto retry;
- }
- if (rc)
- goto out;
+ rc = ocontext_to_sid(sidtab, c, 0, if_sid);
+ if (rc == -ESTALE) {
+ rcu_read_unlock();
+ goto retry;
}
- *if_sid = c->sid[0];
+ if (rc)
+ goto out;
} else
*if_sid = SECINITSID_NETIF;
@@ -2697,18 +2709,13 @@ retry:
}
if (c) {
- if (!c->sid[0]) {
- rc = sidtab_context_to_sid(sidtab,
- &c->context[0],
- &c->sid[0]);
- if (rc == -ESTALE) {
- rcu_read_unlock();
- goto retry;
- }
- if (rc)
- goto out;
+ rc = ocontext_to_sid(sidtab, c, 0, out_sid);
+ if (rc == -ESTALE) {
+ rcu_read_unlock();
+ goto retry;
}
- *out_sid = c->sid[0];
+ if (rc)
+ goto out;
} else {
*out_sid = SECINITSID_NODE;
}
@@ -2873,7 +2880,7 @@ static inline int __security_genfs_sid(struct selinux_policy *policy,
u16 sclass;
struct genfs *genfs;
struct ocontext *c;
- int rc, cmp = 0;
+ int cmp = 0;
while (path[0] == '/' && path[1] == '/')
path++;
@@ -2887,9 +2894,8 @@ static inline int __security_genfs_sid(struct selinux_policy *policy,
break;
}
- rc = -ENOENT;
if (!genfs || cmp)
- goto out;
+ return -ENOENT;
for (c = genfs->head; c; c = c->next) {
len = strlen(c->u.name);
@@ -2898,20 +2904,10 @@ static inline int __security_genfs_sid(struct selinux_policy *policy,
break;
}
- rc = -ENOENT;
if (!c)
- goto out;
-
- if (!c->sid[0]) {
- rc = sidtab_context_to_sid(sidtab, &c->context[0], &c->sid[0]);
- if (rc)
- goto out;
- }
+ return -ENOENT;
- *sid = c->sid[0];
- rc = 0;
-out:
- return rc;
+ return ocontext_to_sid(sidtab, c, 0, sid);
}
/**
@@ -2996,17 +2992,13 @@ retry:
if (c) {
sbsec->behavior = c->v.behavior;
- if (!c->sid[0]) {
- rc = sidtab_context_to_sid(sidtab, &c->context[0],
- &c->sid[0]);
- if (rc == -ESTALE) {
- rcu_read_unlock();
- goto retry;
- }
- if (rc)
- goto out;
+ rc = ocontext_to_sid(sidtab, c, 0, &sbsec->sid);
+ if (rc == -ESTALE) {
+ rcu_read_unlock();
+ goto retry;
}
- sbsec->sid = c->sid[0];
+ if (rc)
+ goto out;
} else {
rc = __security_genfs_sid(policy, fstype, "/",
SECCLASS_DIR, &sbsec->sid);
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From cbfcd13be5cb2a07868afe67520ed181956579a7 Mon Sep 17 00:00:00 2001
From: Ondrej Mosnacek <omosnace(a)redhat.com>
Date: Wed, 28 Jul 2021 16:03:13 +0200
Subject: [PATCH] selinux: fix race condition when computing ocontext SIDs
Current code contains a lot of racy patterns when converting an
ocontext's context structure to an SID. This is being done in a "lazy"
fashion, such that the SID is looked up in the SID table only when it's
first needed and then cached in the "sid" field of the ocontext
structure. However, this is done without any locking or memory barriers
and is thus unsafe.
Between commits 24ed7fdae669 ("selinux: use separate table for initial
SID lookup") and 66f8e2f03c02 ("selinux: sidtab reverse lookup hash
table"), this race condition lead to an actual observable bug, because a
pointer to the shared sid field was passed directly to
sidtab_context_to_sid(), which was using this location to also store an
intermediate value, which could have been read by other threads and
interpreted as an SID. In practice this caused e.g. new mounts to get a
wrong (seemingly random) filesystem context, leading to strange denials.
This bug has been spotted in the wild at least twice, see [1] and [2].
Fix the race condition by making all the racy functions use a common
helper that ensures the ocontext::sid accesses are made safely using the
appropriate SMP constructs.
Note that security_netif_sid() was populating the sid field of both
contexts stored in the ocontext, but only the first one was actually
used. The SELinux wiki's documentation on the "netifcon" policy
statement [3] suggests that using only the first context is intentional.
I kept only the handling of the first context here, as there is really
no point in doing the SID lookup for the unused one.
I wasn't able to reproduce the bug mentioned above on any kernel that
includes commit 66f8e2f03c02, even though it has been reported that the
issue occurs with that commit, too, just less frequently. Thus, I wasn't
able to verify that this patch fixes the issue, but it makes sense to
avoid the race condition regardless.
[1] https://github.com/containers/container-selinux/issues/89
[2] https://lists.fedoraproject.org/archives/list/selinux@lists.fedoraproject.o…
[3] https://selinuxproject.org/page/NetworkStatements#netifcon
Cc: stable(a)vger.kernel.org
Cc: Xinjie Zheng <xinjie(a)google.com>
Reported-by: Sujithra Periasamy <sujithra(a)google.com>
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Ondrej Mosnacek <omosnace(a)redhat.com>
Signed-off-by: Paul Moore <paul(a)paul-moore.com>
diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c
index e5f1b2757a83..c4931bf6f92a 100644
--- a/security/selinux/ss/services.c
+++ b/security/selinux/ss/services.c
@@ -2376,6 +2376,43 @@ err_policy:
return rc;
}
+/**
+ * ocontext_to_sid - Helper to safely get sid for an ocontext
+ * @sidtab: SID table
+ * @c: ocontext structure
+ * @index: index of the context entry (0 or 1)
+ * @out_sid: pointer to the resulting SID value
+ *
+ * For all ocontexts except OCON_ISID the SID fields are populated
+ * on-demand when needed. Since updating the SID value is an SMP-sensitive
+ * operation, this helper must be used to do that safely.
+ *
+ * WARNING: This function may return -ESTALE, indicating that the caller
+ * must retry the operation after re-acquiring the policy pointer!
+ */
+static int ocontext_to_sid(struct sidtab *sidtab, struct ocontext *c,
+ size_t index, u32 *out_sid)
+{
+ int rc;
+ u32 sid;
+
+ /* Ensure the associated sidtab entry is visible to this thread. */
+ sid = smp_load_acquire(&c->sid[index]);
+ if (!sid) {
+ rc = sidtab_context_to_sid(sidtab, &c->context[index], &sid);
+ if (rc)
+ return rc;
+
+ /*
+ * Ensure the new sidtab entry is visible to other threads
+ * when they see the SID.
+ */
+ smp_store_release(&c->sid[index], sid);
+ }
+ *out_sid = sid;
+ return 0;
+}
+
/**
* security_port_sid - Obtain the SID for a port.
* @state: SELinux state
@@ -2414,17 +2451,13 @@ retry:
}
if (c) {
- if (!c->sid[0]) {
- rc = sidtab_context_to_sid(sidtab, &c->context[0],
- &c->sid[0]);
- if (rc == -ESTALE) {
- rcu_read_unlock();
- goto retry;
- }
- if (rc)
- goto out;
+ rc = ocontext_to_sid(sidtab, c, 0, out_sid);
+ if (rc == -ESTALE) {
+ rcu_read_unlock();
+ goto retry;
}
- *out_sid = c->sid[0];
+ if (rc)
+ goto out;
} else {
*out_sid = SECINITSID_PORT;
}
@@ -2473,18 +2506,13 @@ retry:
}
if (c) {
- if (!c->sid[0]) {
- rc = sidtab_context_to_sid(sidtab,
- &c->context[0],
- &c->sid[0]);
- if (rc == -ESTALE) {
- rcu_read_unlock();
- goto retry;
- }
- if (rc)
- goto out;
+ rc = ocontext_to_sid(sidtab, c, 0, out_sid);
+ if (rc == -ESTALE) {
+ rcu_read_unlock();
+ goto retry;
}
- *out_sid = c->sid[0];
+ if (rc)
+ goto out;
} else
*out_sid = SECINITSID_UNLABELED;
@@ -2533,17 +2561,13 @@ retry:
}
if (c) {
- if (!c->sid[0]) {
- rc = sidtab_context_to_sid(sidtab, &c->context[0],
- &c->sid[0]);
- if (rc == -ESTALE) {
- rcu_read_unlock();
- goto retry;
- }
- if (rc)
- goto out;
+ rc = ocontext_to_sid(sidtab, c, 0, out_sid);
+ if (rc == -ESTALE) {
+ rcu_read_unlock();
+ goto retry;
}
- *out_sid = c->sid[0];
+ if (rc)
+ goto out;
} else
*out_sid = SECINITSID_UNLABELED;
@@ -2587,25 +2611,13 @@ retry:
}
if (c) {
- if (!c->sid[0] || !c->sid[1]) {
- rc = sidtab_context_to_sid(sidtab, &c->context[0],
- &c->sid[0]);
- if (rc == -ESTALE) {
- rcu_read_unlock();
- goto retry;
- }
- if (rc)
- goto out;
- rc = sidtab_context_to_sid(sidtab, &c->context[1],
- &c->sid[1]);
- if (rc == -ESTALE) {
- rcu_read_unlock();
- goto retry;
- }
- if (rc)
- goto out;
+ rc = ocontext_to_sid(sidtab, c, 0, if_sid);
+ if (rc == -ESTALE) {
+ rcu_read_unlock();
+ goto retry;
}
- *if_sid = c->sid[0];
+ if (rc)
+ goto out;
} else
*if_sid = SECINITSID_NETIF;
@@ -2697,18 +2709,13 @@ retry:
}
if (c) {
- if (!c->sid[0]) {
- rc = sidtab_context_to_sid(sidtab,
- &c->context[0],
- &c->sid[0]);
- if (rc == -ESTALE) {
- rcu_read_unlock();
- goto retry;
- }
- if (rc)
- goto out;
+ rc = ocontext_to_sid(sidtab, c, 0, out_sid);
+ if (rc == -ESTALE) {
+ rcu_read_unlock();
+ goto retry;
}
- *out_sid = c->sid[0];
+ if (rc)
+ goto out;
} else {
*out_sid = SECINITSID_NODE;
}
@@ -2873,7 +2880,7 @@ static inline int __security_genfs_sid(struct selinux_policy *policy,
u16 sclass;
struct genfs *genfs;
struct ocontext *c;
- int rc, cmp = 0;
+ int cmp = 0;
while (path[0] == '/' && path[1] == '/')
path++;
@@ -2887,9 +2894,8 @@ static inline int __security_genfs_sid(struct selinux_policy *policy,
break;
}
- rc = -ENOENT;
if (!genfs || cmp)
- goto out;
+ return -ENOENT;
for (c = genfs->head; c; c = c->next) {
len = strlen(c->u.name);
@@ -2898,20 +2904,10 @@ static inline int __security_genfs_sid(struct selinux_policy *policy,
break;
}
- rc = -ENOENT;
if (!c)
- goto out;
-
- if (!c->sid[0]) {
- rc = sidtab_context_to_sid(sidtab, &c->context[0], &c->sid[0]);
- if (rc)
- goto out;
- }
+ return -ENOENT;
- *sid = c->sid[0];
- rc = 0;
-out:
- return rc;
+ return ocontext_to_sid(sidtab, c, 0, sid);
}
/**
@@ -2996,17 +2992,13 @@ retry:
if (c) {
sbsec->behavior = c->v.behavior;
- if (!c->sid[0]) {
- rc = sidtab_context_to_sid(sidtab, &c->context[0],
- &c->sid[0]);
- if (rc == -ESTALE) {
- rcu_read_unlock();
- goto retry;
- }
- if (rc)
- goto out;
+ rc = ocontext_to_sid(sidtab, c, 0, &sbsec->sid);
+ if (rc == -ESTALE) {
+ rcu_read_unlock();
+ goto retry;
}
- sbsec->sid = c->sid[0];
+ if (rc)
+ goto out;
} else {
rc = __security_genfs_sid(policy, fstype, "/",
SECCLASS_DIR, &sbsec->sid);
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From cbfcd13be5cb2a07868afe67520ed181956579a7 Mon Sep 17 00:00:00 2001
From: Ondrej Mosnacek <omosnace(a)redhat.com>
Date: Wed, 28 Jul 2021 16:03:13 +0200
Subject: [PATCH] selinux: fix race condition when computing ocontext SIDs
Current code contains a lot of racy patterns when converting an
ocontext's context structure to an SID. This is being done in a "lazy"
fashion, such that the SID is looked up in the SID table only when it's
first needed and then cached in the "sid" field of the ocontext
structure. However, this is done without any locking or memory barriers
and is thus unsafe.
Between commits 24ed7fdae669 ("selinux: use separate table for initial
SID lookup") and 66f8e2f03c02 ("selinux: sidtab reverse lookup hash
table"), this race condition lead to an actual observable bug, because a
pointer to the shared sid field was passed directly to
sidtab_context_to_sid(), which was using this location to also store an
intermediate value, which could have been read by other threads and
interpreted as an SID. In practice this caused e.g. new mounts to get a
wrong (seemingly random) filesystem context, leading to strange denials.
This bug has been spotted in the wild at least twice, see [1] and [2].
Fix the race condition by making all the racy functions use a common
helper that ensures the ocontext::sid accesses are made safely using the
appropriate SMP constructs.
Note that security_netif_sid() was populating the sid field of both
contexts stored in the ocontext, but only the first one was actually
used. The SELinux wiki's documentation on the "netifcon" policy
statement [3] suggests that using only the first context is intentional.
I kept only the handling of the first context here, as there is really
no point in doing the SID lookup for the unused one.
I wasn't able to reproduce the bug mentioned above on any kernel that
includes commit 66f8e2f03c02, even though it has been reported that the
issue occurs with that commit, too, just less frequently. Thus, I wasn't
able to verify that this patch fixes the issue, but it makes sense to
avoid the race condition regardless.
[1] https://github.com/containers/container-selinux/issues/89
[2] https://lists.fedoraproject.org/archives/list/selinux@lists.fedoraproject.o…
[3] https://selinuxproject.org/page/NetworkStatements#netifcon
Cc: stable(a)vger.kernel.org
Cc: Xinjie Zheng <xinjie(a)google.com>
Reported-by: Sujithra Periasamy <sujithra(a)google.com>
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Ondrej Mosnacek <omosnace(a)redhat.com>
Signed-off-by: Paul Moore <paul(a)paul-moore.com>
diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c
index e5f1b2757a83..c4931bf6f92a 100644
--- a/security/selinux/ss/services.c
+++ b/security/selinux/ss/services.c
@@ -2376,6 +2376,43 @@ err_policy:
return rc;
}
+/**
+ * ocontext_to_sid - Helper to safely get sid for an ocontext
+ * @sidtab: SID table
+ * @c: ocontext structure
+ * @index: index of the context entry (0 or 1)
+ * @out_sid: pointer to the resulting SID value
+ *
+ * For all ocontexts except OCON_ISID the SID fields are populated
+ * on-demand when needed. Since updating the SID value is an SMP-sensitive
+ * operation, this helper must be used to do that safely.
+ *
+ * WARNING: This function may return -ESTALE, indicating that the caller
+ * must retry the operation after re-acquiring the policy pointer!
+ */
+static int ocontext_to_sid(struct sidtab *sidtab, struct ocontext *c,
+ size_t index, u32 *out_sid)
+{
+ int rc;
+ u32 sid;
+
+ /* Ensure the associated sidtab entry is visible to this thread. */
+ sid = smp_load_acquire(&c->sid[index]);
+ if (!sid) {
+ rc = sidtab_context_to_sid(sidtab, &c->context[index], &sid);
+ if (rc)
+ return rc;
+
+ /*
+ * Ensure the new sidtab entry is visible to other threads
+ * when they see the SID.
+ */
+ smp_store_release(&c->sid[index], sid);
+ }
+ *out_sid = sid;
+ return 0;
+}
+
/**
* security_port_sid - Obtain the SID for a port.
* @state: SELinux state
@@ -2414,17 +2451,13 @@ retry:
}
if (c) {
- if (!c->sid[0]) {
- rc = sidtab_context_to_sid(sidtab, &c->context[0],
- &c->sid[0]);
- if (rc == -ESTALE) {
- rcu_read_unlock();
- goto retry;
- }
- if (rc)
- goto out;
+ rc = ocontext_to_sid(sidtab, c, 0, out_sid);
+ if (rc == -ESTALE) {
+ rcu_read_unlock();
+ goto retry;
}
- *out_sid = c->sid[0];
+ if (rc)
+ goto out;
} else {
*out_sid = SECINITSID_PORT;
}
@@ -2473,18 +2506,13 @@ retry:
}
if (c) {
- if (!c->sid[0]) {
- rc = sidtab_context_to_sid(sidtab,
- &c->context[0],
- &c->sid[0]);
- if (rc == -ESTALE) {
- rcu_read_unlock();
- goto retry;
- }
- if (rc)
- goto out;
+ rc = ocontext_to_sid(sidtab, c, 0, out_sid);
+ if (rc == -ESTALE) {
+ rcu_read_unlock();
+ goto retry;
}
- *out_sid = c->sid[0];
+ if (rc)
+ goto out;
} else
*out_sid = SECINITSID_UNLABELED;
@@ -2533,17 +2561,13 @@ retry:
}
if (c) {
- if (!c->sid[0]) {
- rc = sidtab_context_to_sid(sidtab, &c->context[0],
- &c->sid[0]);
- if (rc == -ESTALE) {
- rcu_read_unlock();
- goto retry;
- }
- if (rc)
- goto out;
+ rc = ocontext_to_sid(sidtab, c, 0, out_sid);
+ if (rc == -ESTALE) {
+ rcu_read_unlock();
+ goto retry;
}
- *out_sid = c->sid[0];
+ if (rc)
+ goto out;
} else
*out_sid = SECINITSID_UNLABELED;
@@ -2587,25 +2611,13 @@ retry:
}
if (c) {
- if (!c->sid[0] || !c->sid[1]) {
- rc = sidtab_context_to_sid(sidtab, &c->context[0],
- &c->sid[0]);
- if (rc == -ESTALE) {
- rcu_read_unlock();
- goto retry;
- }
- if (rc)
- goto out;
- rc = sidtab_context_to_sid(sidtab, &c->context[1],
- &c->sid[1]);
- if (rc == -ESTALE) {
- rcu_read_unlock();
- goto retry;
- }
- if (rc)
- goto out;
+ rc = ocontext_to_sid(sidtab, c, 0, if_sid);
+ if (rc == -ESTALE) {
+ rcu_read_unlock();
+ goto retry;
}
- *if_sid = c->sid[0];
+ if (rc)
+ goto out;
} else
*if_sid = SECINITSID_NETIF;
@@ -2697,18 +2709,13 @@ retry:
}
if (c) {
- if (!c->sid[0]) {
- rc = sidtab_context_to_sid(sidtab,
- &c->context[0],
- &c->sid[0]);
- if (rc == -ESTALE) {
- rcu_read_unlock();
- goto retry;
- }
- if (rc)
- goto out;
+ rc = ocontext_to_sid(sidtab, c, 0, out_sid);
+ if (rc == -ESTALE) {
+ rcu_read_unlock();
+ goto retry;
}
- *out_sid = c->sid[0];
+ if (rc)
+ goto out;
} else {
*out_sid = SECINITSID_NODE;
}
@@ -2873,7 +2880,7 @@ static inline int __security_genfs_sid(struct selinux_policy *policy,
u16 sclass;
struct genfs *genfs;
struct ocontext *c;
- int rc, cmp = 0;
+ int cmp = 0;
while (path[0] == '/' && path[1] == '/')
path++;
@@ -2887,9 +2894,8 @@ static inline int __security_genfs_sid(struct selinux_policy *policy,
break;
}
- rc = -ENOENT;
if (!genfs || cmp)
- goto out;
+ return -ENOENT;
for (c = genfs->head; c; c = c->next) {
len = strlen(c->u.name);
@@ -2898,20 +2904,10 @@ static inline int __security_genfs_sid(struct selinux_policy *policy,
break;
}
- rc = -ENOENT;
if (!c)
- goto out;
-
- if (!c->sid[0]) {
- rc = sidtab_context_to_sid(sidtab, &c->context[0], &c->sid[0]);
- if (rc)
- goto out;
- }
+ return -ENOENT;
- *sid = c->sid[0];
- rc = 0;
-out:
- return rc;
+ return ocontext_to_sid(sidtab, c, 0, sid);
}
/**
@@ -2996,17 +2992,13 @@ retry:
if (c) {
sbsec->behavior = c->v.behavior;
- if (!c->sid[0]) {
- rc = sidtab_context_to_sid(sidtab, &c->context[0],
- &c->sid[0]);
- if (rc == -ESTALE) {
- rcu_read_unlock();
- goto retry;
- }
- if (rc)
- goto out;
+ rc = ocontext_to_sid(sidtab, c, 0, &sbsec->sid);
+ if (rc == -ESTALE) {
+ rcu_read_unlock();
+ goto retry;
}
- sbsec->sid = c->sid[0];
+ if (rc)
+ goto out;
} else {
rc = __security_genfs_sid(policy, fstype, "/",
SECCLASS_DIR, &sbsec->sid);
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From cbfcd13be5cb2a07868afe67520ed181956579a7 Mon Sep 17 00:00:00 2001
From: Ondrej Mosnacek <omosnace(a)redhat.com>
Date: Wed, 28 Jul 2021 16:03:13 +0200
Subject: [PATCH] selinux: fix race condition when computing ocontext SIDs
Current code contains a lot of racy patterns when converting an
ocontext's context structure to an SID. This is being done in a "lazy"
fashion, such that the SID is looked up in the SID table only when it's
first needed and then cached in the "sid" field of the ocontext
structure. However, this is done without any locking or memory barriers
and is thus unsafe.
Between commits 24ed7fdae669 ("selinux: use separate table for initial
SID lookup") and 66f8e2f03c02 ("selinux: sidtab reverse lookup hash
table"), this race condition lead to an actual observable bug, because a
pointer to the shared sid field was passed directly to
sidtab_context_to_sid(), which was using this location to also store an
intermediate value, which could have been read by other threads and
interpreted as an SID. In practice this caused e.g. new mounts to get a
wrong (seemingly random) filesystem context, leading to strange denials.
This bug has been spotted in the wild at least twice, see [1] and [2].
Fix the race condition by making all the racy functions use a common
helper that ensures the ocontext::sid accesses are made safely using the
appropriate SMP constructs.
Note that security_netif_sid() was populating the sid field of both
contexts stored in the ocontext, but only the first one was actually
used. The SELinux wiki's documentation on the "netifcon" policy
statement [3] suggests that using only the first context is intentional.
I kept only the handling of the first context here, as there is really
no point in doing the SID lookup for the unused one.
I wasn't able to reproduce the bug mentioned above on any kernel that
includes commit 66f8e2f03c02, even though it has been reported that the
issue occurs with that commit, too, just less frequently. Thus, I wasn't
able to verify that this patch fixes the issue, but it makes sense to
avoid the race condition regardless.
[1] https://github.com/containers/container-selinux/issues/89
[2] https://lists.fedoraproject.org/archives/list/selinux@lists.fedoraproject.o…
[3] https://selinuxproject.org/page/NetworkStatements#netifcon
Cc: stable(a)vger.kernel.org
Cc: Xinjie Zheng <xinjie(a)google.com>
Reported-by: Sujithra Periasamy <sujithra(a)google.com>
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Ondrej Mosnacek <omosnace(a)redhat.com>
Signed-off-by: Paul Moore <paul(a)paul-moore.com>
diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c
index e5f1b2757a83..c4931bf6f92a 100644
--- a/security/selinux/ss/services.c
+++ b/security/selinux/ss/services.c
@@ -2376,6 +2376,43 @@ err_policy:
return rc;
}
+/**
+ * ocontext_to_sid - Helper to safely get sid for an ocontext
+ * @sidtab: SID table
+ * @c: ocontext structure
+ * @index: index of the context entry (0 or 1)
+ * @out_sid: pointer to the resulting SID value
+ *
+ * For all ocontexts except OCON_ISID the SID fields are populated
+ * on-demand when needed. Since updating the SID value is an SMP-sensitive
+ * operation, this helper must be used to do that safely.
+ *
+ * WARNING: This function may return -ESTALE, indicating that the caller
+ * must retry the operation after re-acquiring the policy pointer!
+ */
+static int ocontext_to_sid(struct sidtab *sidtab, struct ocontext *c,
+ size_t index, u32 *out_sid)
+{
+ int rc;
+ u32 sid;
+
+ /* Ensure the associated sidtab entry is visible to this thread. */
+ sid = smp_load_acquire(&c->sid[index]);
+ if (!sid) {
+ rc = sidtab_context_to_sid(sidtab, &c->context[index], &sid);
+ if (rc)
+ return rc;
+
+ /*
+ * Ensure the new sidtab entry is visible to other threads
+ * when they see the SID.
+ */
+ smp_store_release(&c->sid[index], sid);
+ }
+ *out_sid = sid;
+ return 0;
+}
+
/**
* security_port_sid - Obtain the SID for a port.
* @state: SELinux state
@@ -2414,17 +2451,13 @@ retry:
}
if (c) {
- if (!c->sid[0]) {
- rc = sidtab_context_to_sid(sidtab, &c->context[0],
- &c->sid[0]);
- if (rc == -ESTALE) {
- rcu_read_unlock();
- goto retry;
- }
- if (rc)
- goto out;
+ rc = ocontext_to_sid(sidtab, c, 0, out_sid);
+ if (rc == -ESTALE) {
+ rcu_read_unlock();
+ goto retry;
}
- *out_sid = c->sid[0];
+ if (rc)
+ goto out;
} else {
*out_sid = SECINITSID_PORT;
}
@@ -2473,18 +2506,13 @@ retry:
}
if (c) {
- if (!c->sid[0]) {
- rc = sidtab_context_to_sid(sidtab,
- &c->context[0],
- &c->sid[0]);
- if (rc == -ESTALE) {
- rcu_read_unlock();
- goto retry;
- }
- if (rc)
- goto out;
+ rc = ocontext_to_sid(sidtab, c, 0, out_sid);
+ if (rc == -ESTALE) {
+ rcu_read_unlock();
+ goto retry;
}
- *out_sid = c->sid[0];
+ if (rc)
+ goto out;
} else
*out_sid = SECINITSID_UNLABELED;
@@ -2533,17 +2561,13 @@ retry:
}
if (c) {
- if (!c->sid[0]) {
- rc = sidtab_context_to_sid(sidtab, &c->context[0],
- &c->sid[0]);
- if (rc == -ESTALE) {
- rcu_read_unlock();
- goto retry;
- }
- if (rc)
- goto out;
+ rc = ocontext_to_sid(sidtab, c, 0, out_sid);
+ if (rc == -ESTALE) {
+ rcu_read_unlock();
+ goto retry;
}
- *out_sid = c->sid[0];
+ if (rc)
+ goto out;
} else
*out_sid = SECINITSID_UNLABELED;
@@ -2587,25 +2611,13 @@ retry:
}
if (c) {
- if (!c->sid[0] || !c->sid[1]) {
- rc = sidtab_context_to_sid(sidtab, &c->context[0],
- &c->sid[0]);
- if (rc == -ESTALE) {
- rcu_read_unlock();
- goto retry;
- }
- if (rc)
- goto out;
- rc = sidtab_context_to_sid(sidtab, &c->context[1],
- &c->sid[1]);
- if (rc == -ESTALE) {
- rcu_read_unlock();
- goto retry;
- }
- if (rc)
- goto out;
+ rc = ocontext_to_sid(sidtab, c, 0, if_sid);
+ if (rc == -ESTALE) {
+ rcu_read_unlock();
+ goto retry;
}
- *if_sid = c->sid[0];
+ if (rc)
+ goto out;
} else
*if_sid = SECINITSID_NETIF;
@@ -2697,18 +2709,13 @@ retry:
}
if (c) {
- if (!c->sid[0]) {
- rc = sidtab_context_to_sid(sidtab,
- &c->context[0],
- &c->sid[0]);
- if (rc == -ESTALE) {
- rcu_read_unlock();
- goto retry;
- }
- if (rc)
- goto out;
+ rc = ocontext_to_sid(sidtab, c, 0, out_sid);
+ if (rc == -ESTALE) {
+ rcu_read_unlock();
+ goto retry;
}
- *out_sid = c->sid[0];
+ if (rc)
+ goto out;
} else {
*out_sid = SECINITSID_NODE;
}
@@ -2873,7 +2880,7 @@ static inline int __security_genfs_sid(struct selinux_policy *policy,
u16 sclass;
struct genfs *genfs;
struct ocontext *c;
- int rc, cmp = 0;
+ int cmp = 0;
while (path[0] == '/' && path[1] == '/')
path++;
@@ -2887,9 +2894,8 @@ static inline int __security_genfs_sid(struct selinux_policy *policy,
break;
}
- rc = -ENOENT;
if (!genfs || cmp)
- goto out;
+ return -ENOENT;
for (c = genfs->head; c; c = c->next) {
len = strlen(c->u.name);
@@ -2898,20 +2904,10 @@ static inline int __security_genfs_sid(struct selinux_policy *policy,
break;
}
- rc = -ENOENT;
if (!c)
- goto out;
-
- if (!c->sid[0]) {
- rc = sidtab_context_to_sid(sidtab, &c->context[0], &c->sid[0]);
- if (rc)
- goto out;
- }
+ return -ENOENT;
- *sid = c->sid[0];
- rc = 0;
-out:
- return rc;
+ return ocontext_to_sid(sidtab, c, 0, sid);
}
/**
@@ -2996,17 +2992,13 @@ retry:
if (c) {
sbsec->behavior = c->v.behavior;
- if (!c->sid[0]) {
- rc = sidtab_context_to_sid(sidtab, &c->context[0],
- &c->sid[0]);
- if (rc == -ESTALE) {
- rcu_read_unlock();
- goto retry;
- }
- if (rc)
- goto out;
+ rc = ocontext_to_sid(sidtab, c, 0, &sbsec->sid);
+ if (rc == -ESTALE) {
+ rcu_read_unlock();
+ goto retry;
}
- sbsec->sid = c->sid[0];
+ if (rc)
+ goto out;
} else {
rc = __security_genfs_sid(policy, fstype, "/",
SECCLASS_DIR, &sbsec->sid);
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 4114978dcd24e72415276bba60ff4ff355970bbc Mon Sep 17 00:00:00 2001
From: Sean Young <sean(a)mess.org>
Date: Tue, 14 Sep 2021 16:57:46 +0200
Subject: [PATCH] media: ir_toy: prevent device from hanging during transmit
If the IR Toy is receiving IR while a transmit is done, it may end up
hanging. We can prevent this from happening by re-entering sample mode
just before issuing the transmit command.
Link: https://github.com/bengtmartensson/HarcHardware/discussions/25
Cc: stable(a)vger.kernel.org
[mchehab: renamed: s/STATE_RESET/STATE_COMMAND_NO_RESP/ ]
Signed-off-by: Sean Young <sean(a)mess.org>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei(a)kernel.org>
diff --git a/drivers/media/rc/ir_toy.c b/drivers/media/rc/ir_toy.c
index d2d9346eb8f5..71aced52248f 100644
--- a/drivers/media/rc/ir_toy.c
+++ b/drivers/media/rc/ir_toy.c
@@ -26,6 +26,7 @@ static const u8 COMMAND_VERSION[] = { 'v' };
// End transmit and repeat reset command so we exit sump mode
static const u8 COMMAND_RESET[] = { 0xff, 0xff, 0, 0, 0, 0, 0 };
static const u8 COMMAND_SMODE_ENTER[] = { 's' };
+static const u8 COMMAND_SMODE_EXIT[] = { 0 };
static const u8 COMMAND_TXSTART[] = { 0x26, 0x24, 0x25, 0x03 };
#define REPLY_XMITCOUNT 't'
@@ -317,12 +318,30 @@ static int irtoy_tx(struct rc_dev *rc, uint *txbuf, uint count)
buf[i] = cpu_to_be16(v);
}
- buf[count] = cpu_to_be16(0xffff);
+ buf[count] = 0xffff;
irtoy->tx_buf = buf;
irtoy->tx_len = size;
irtoy->emitted = 0;
+ // There is an issue where if the unit is receiving IR while the
+ // first TXSTART command is sent, the device might end up hanging
+ // with its led on. It does not respond to any command when this
+ // happens. To work around this, re-enter sample mode.
+ err = irtoy_command(irtoy, COMMAND_SMODE_EXIT,
+ sizeof(COMMAND_SMODE_EXIT), STATE_COMMAND_NO_RESP);
+ if (err) {
+ dev_err(irtoy->dev, "exit sample mode: %d\n", err);
+ return err;
+ }
+
+ err = irtoy_command(irtoy, COMMAND_SMODE_ENTER,
+ sizeof(COMMAND_SMODE_ENTER), STATE_COMMAND);
+ if (err) {
+ dev_err(irtoy->dev, "enter sample mode: %d\n", err);
+ return err;
+ }
+
err = irtoy_command(irtoy, COMMAND_TXSTART, sizeof(COMMAND_TXSTART),
STATE_TX);
kfree(buf);
The patch below does not apply to the 5.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 4114978dcd24e72415276bba60ff4ff355970bbc Mon Sep 17 00:00:00 2001
From: Sean Young <sean(a)mess.org>
Date: Tue, 14 Sep 2021 16:57:46 +0200
Subject: [PATCH] media: ir_toy: prevent device from hanging during transmit
If the IR Toy is receiving IR while a transmit is done, it may end up
hanging. We can prevent this from happening by re-entering sample mode
just before issuing the transmit command.
Link: https://github.com/bengtmartensson/HarcHardware/discussions/25
Cc: stable(a)vger.kernel.org
[mchehab: renamed: s/STATE_RESET/STATE_COMMAND_NO_RESP/ ]
Signed-off-by: Sean Young <sean(a)mess.org>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei(a)kernel.org>
diff --git a/drivers/media/rc/ir_toy.c b/drivers/media/rc/ir_toy.c
index d2d9346eb8f5..71aced52248f 100644
--- a/drivers/media/rc/ir_toy.c
+++ b/drivers/media/rc/ir_toy.c
@@ -26,6 +26,7 @@ static const u8 COMMAND_VERSION[] = { 'v' };
// End transmit and repeat reset command so we exit sump mode
static const u8 COMMAND_RESET[] = { 0xff, 0xff, 0, 0, 0, 0, 0 };
static const u8 COMMAND_SMODE_ENTER[] = { 's' };
+static const u8 COMMAND_SMODE_EXIT[] = { 0 };
static const u8 COMMAND_TXSTART[] = { 0x26, 0x24, 0x25, 0x03 };
#define REPLY_XMITCOUNT 't'
@@ -317,12 +318,30 @@ static int irtoy_tx(struct rc_dev *rc, uint *txbuf, uint count)
buf[i] = cpu_to_be16(v);
}
- buf[count] = cpu_to_be16(0xffff);
+ buf[count] = 0xffff;
irtoy->tx_buf = buf;
irtoy->tx_len = size;
irtoy->emitted = 0;
+ // There is an issue where if the unit is receiving IR while the
+ // first TXSTART command is sent, the device might end up hanging
+ // with its led on. It does not respond to any command when this
+ // happens. To work around this, re-enter sample mode.
+ err = irtoy_command(irtoy, COMMAND_SMODE_EXIT,
+ sizeof(COMMAND_SMODE_EXIT), STATE_COMMAND_NO_RESP);
+ if (err) {
+ dev_err(irtoy->dev, "exit sample mode: %d\n", err);
+ return err;
+ }
+
+ err = irtoy_command(irtoy, COMMAND_SMODE_ENTER,
+ sizeof(COMMAND_SMODE_ENTER), STATE_COMMAND);
+ if (err) {
+ dev_err(irtoy->dev, "enter sample mode: %d\n", err);
+ return err;
+ }
+
err = irtoy_command(irtoy, COMMAND_TXSTART, sizeof(COMMAND_TXSTART),
STATE_TX);
kfree(buf);
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 4114978dcd24e72415276bba60ff4ff355970bbc Mon Sep 17 00:00:00 2001
From: Sean Young <sean(a)mess.org>
Date: Tue, 14 Sep 2021 16:57:46 +0200
Subject: [PATCH] media: ir_toy: prevent device from hanging during transmit
If the IR Toy is receiving IR while a transmit is done, it may end up
hanging. We can prevent this from happening by re-entering sample mode
just before issuing the transmit command.
Link: https://github.com/bengtmartensson/HarcHardware/discussions/25
Cc: stable(a)vger.kernel.org
[mchehab: renamed: s/STATE_RESET/STATE_COMMAND_NO_RESP/ ]
Signed-off-by: Sean Young <sean(a)mess.org>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei(a)kernel.org>
diff --git a/drivers/media/rc/ir_toy.c b/drivers/media/rc/ir_toy.c
index d2d9346eb8f5..71aced52248f 100644
--- a/drivers/media/rc/ir_toy.c
+++ b/drivers/media/rc/ir_toy.c
@@ -26,6 +26,7 @@ static const u8 COMMAND_VERSION[] = { 'v' };
// End transmit and repeat reset command so we exit sump mode
static const u8 COMMAND_RESET[] = { 0xff, 0xff, 0, 0, 0, 0, 0 };
static const u8 COMMAND_SMODE_ENTER[] = { 's' };
+static const u8 COMMAND_SMODE_EXIT[] = { 0 };
static const u8 COMMAND_TXSTART[] = { 0x26, 0x24, 0x25, 0x03 };
#define REPLY_XMITCOUNT 't'
@@ -317,12 +318,30 @@ static int irtoy_tx(struct rc_dev *rc, uint *txbuf, uint count)
buf[i] = cpu_to_be16(v);
}
- buf[count] = cpu_to_be16(0xffff);
+ buf[count] = 0xffff;
irtoy->tx_buf = buf;
irtoy->tx_len = size;
irtoy->emitted = 0;
+ // There is an issue where if the unit is receiving IR while the
+ // first TXSTART command is sent, the device might end up hanging
+ // with its led on. It does not respond to any command when this
+ // happens. To work around this, re-enter sample mode.
+ err = irtoy_command(irtoy, COMMAND_SMODE_EXIT,
+ sizeof(COMMAND_SMODE_EXIT), STATE_COMMAND_NO_RESP);
+ if (err) {
+ dev_err(irtoy->dev, "exit sample mode: %d\n", err);
+ return err;
+ }
+
+ err = irtoy_command(irtoy, COMMAND_SMODE_ENTER,
+ sizeof(COMMAND_SMODE_ENTER), STATE_COMMAND);
+ if (err) {
+ dev_err(irtoy->dev, "enter sample mode: %d\n", err);
+ return err;
+ }
+
err = irtoy_command(irtoy, COMMAND_TXSTART, sizeof(COMMAND_TXSTART),
STATE_TX);
kfree(buf);
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From ec5a4919fa7b7d8c7a2af1c7e799b1fe4be84343 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc(a)google.com>
Date: Fri, 8 Oct 2021 17:11:05 -0700
Subject: [PATCH] KVM: VMX: Unregister posted interrupt wakeup handler on
hardware unsetup
Unregister KVM's posted interrupt wakeup handler during unsetup so that a
spurious interrupt that arrives after kvm_intel.ko is unloaded doesn't
call into freed memory.
Fixes: bf9f6ac8d749 ("KVM: Update Posted-Interrupts Descriptor when vCPU is blocked")
Cc: stable(a)vger.kernel.org
Signed-off-by: Sean Christopherson <seanjc(a)google.com>
Message-Id: <20211009001107.3936588-3-seanjc(a)google.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 2f677e72d864..c11688f64e80 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7555,6 +7555,8 @@ static void vmx_migrate_timers(struct kvm_vcpu *vcpu)
static void hardware_unsetup(void)
{
+ kvm_set_posted_intr_wakeup_handler(NULL);
+
if (nested)
nested_vmx_hardware_unsetup();
@@ -7885,8 +7887,6 @@ static __init int hardware_setup(void)
vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit;
}
- kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
-
kvm_mce_cap_supported |= MCG_LMCE_P;
if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
@@ -7910,6 +7910,9 @@ static __init int hardware_setup(void)
r = alloc_kvm_area();
if (r)
nested_vmx_hardware_unsetup();
+
+ kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
+
return r;
}
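The mechanism behind the fix, reduced to a sketch — the real code is in
arch/x86/kernel/irq.c and differs in detail, and the dispatch function
name below is a simplification for illustration:

	/* The wakeup IPI vector dispatches through a bare pointer. */
	void (*kvm_posted_intr_wakeup_handler)(void);

	void kvm_set_posted_intr_wakeup_handler(void (*handler)(void))
	{
		kvm_posted_intr_wakeup_handler = handler;
	}

	/* Vector entry (sketch): if kvm_intel is unloaded while the
	 * pointer still targets pi_wakeup_handler(), a late spurious IPI
	 * calls into freed module text - hence clearing the pointer
	 * first thing in hardware_unsetup() above. */
	static void kvm_posted_intr_wakeup_ipi(void)
	{
		if (kvm_posted_intr_wakeup_handler)
			kvm_posted_intr_wakeup_handler();
	}

The second hunk moves registration to the very end of hardware_setup()
for the same reason: nothing may be reachable through the pointer until
setup can no longer fail in a way that frees the handler.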
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From ec5a4919fa7b7d8c7a2af1c7e799b1fe4be84343 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc(a)google.com>
Date: Fri, 8 Oct 2021 17:11:05 -0700
Subject: [PATCH] KVM: VMX: Unregister posted interrupt wakeup handler on
hardware unsetup
Unregister KVM's posted interrupt wakeup handler during unsetup so that a
spurious interrupt that arrives after kvm_intel.ko is unloaded doesn't
call into freed memory.
Fixes: bf9f6ac8d749 ("KVM: Update Posted-Interrupts Descriptor when vCPU is blocked")
Cc: stable(a)vger.kernel.org
Signed-off-by: Sean Christopherson <seanjc(a)google.com>
Message-Id: <20211009001107.3936588-3-seanjc(a)google.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 2f677e72d864..c11688f64e80 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7555,6 +7555,8 @@ static void vmx_migrate_timers(struct kvm_vcpu *vcpu)
static void hardware_unsetup(void)
{
+ kvm_set_posted_intr_wakeup_handler(NULL);
+
if (nested)
nested_vmx_hardware_unsetup();
@@ -7885,8 +7887,6 @@ static __init int hardware_setup(void)
vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit;
}
- kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
-
kvm_mce_cap_supported |= MCG_LMCE_P;
if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
@@ -7910,6 +7910,9 @@ static __init int hardware_setup(void)
r = alloc_kvm_area();
if (r)
nested_vmx_hardware_unsetup();
+
+ kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
+
return r;
}
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From ec5a4919fa7b7d8c7a2af1c7e799b1fe4be84343 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc(a)google.com>
Date: Fri, 8 Oct 2021 17:11:05 -0700
Subject: [PATCH] KVM: VMX: Unregister posted interrupt wakeup handler on
hardware unsetup
Unregister KVM's posted interrupt wakeup handler during unsetup so that a
spurious interrupt that arrives after kvm_intel.ko is unloaded doesn't
call into freed memory.
Fixes: bf9f6ac8d749 ("KVM: Update Posted-Interrupts Descriptor when vCPU is blocked")
Cc: stable(a)vger.kernel.org
Signed-off-by: Sean Christopherson <seanjc(a)google.com>
Message-Id: <20211009001107.3936588-3-seanjc(a)google.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 2f677e72d864..c11688f64e80 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7555,6 +7555,8 @@ static void vmx_migrate_timers(struct kvm_vcpu *vcpu)
static void hardware_unsetup(void)
{
+ kvm_set_posted_intr_wakeup_handler(NULL);
+
if (nested)
nested_vmx_hardware_unsetup();
@@ -7885,8 +7887,6 @@ static __init int hardware_setup(void)
vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit;
}
- kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
-
kvm_mce_cap_supported |= MCG_LMCE_P;
if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
@@ -7910,6 +7910,9 @@ static __init int hardware_setup(void)
r = alloc_kvm_area();
if (r)
nested_vmx_hardware_unsetup();
+
+ kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
+
return r;
}
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From ec5a4919fa7b7d8c7a2af1c7e799b1fe4be84343 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc(a)google.com>
Date: Fri, 8 Oct 2021 17:11:05 -0700
Subject: [PATCH] KVM: VMX: Unregister posted interrupt wakeup handler on
hardware unsetup
Unregister KVM's posted interrupt wakeup handler during unsetup so that a
spurious interrupt that arrives after kvm_intel.ko is unloaded doesn't
call into freed memory.
Fixes: bf9f6ac8d749 ("KVM: Update Posted-Interrupts Descriptor when vCPU is blocked")
Cc: stable(a)vger.kernel.org
Signed-off-by: Sean Christopherson <seanjc(a)google.com>
Message-Id: <20211009001107.3936588-3-seanjc(a)google.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 2f677e72d864..c11688f64e80 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7555,6 +7555,8 @@ static void vmx_migrate_timers(struct kvm_vcpu *vcpu)
static void hardware_unsetup(void)
{
+ kvm_set_posted_intr_wakeup_handler(NULL);
+
if (nested)
nested_vmx_hardware_unsetup();
@@ -7885,8 +7887,6 @@ static __init int hardware_setup(void)
vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit;
}
- kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
-
kvm_mce_cap_supported |= MCG_LMCE_P;
if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
@@ -7910,6 +7910,9 @@ static __init int hardware_setup(void)
r = alloc_kvm_area();
if (r)
nested_vmx_hardware_unsetup();
+
+ kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
+
return r;
}
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From ec5a4919fa7b7d8c7a2af1c7e799b1fe4be84343 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc(a)google.com>
Date: Fri, 8 Oct 2021 17:11:05 -0700
Subject: [PATCH] KVM: VMX: Unregister posted interrupt wakeup handler on
hardware unsetup
Unregister KVM's posted interrupt wakeup handler during unsetup so that a
spurious interrupt that arrives after kvm_intel.ko is unloaded doesn't
call into freed memory.
Fixes: bf9f6ac8d749 ("KVM: Update Posted-Interrupts Descriptor when vCPU is blocked")
Cc: stable(a)vger.kernel.org
Signed-off-by: Sean Christopherson <seanjc(a)google.com>
Message-Id: <20211009001107.3936588-3-seanjc(a)google.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 2f677e72d864..c11688f64e80 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7555,6 +7555,8 @@ static void vmx_migrate_timers(struct kvm_vcpu *vcpu)
static void hardware_unsetup(void)
{
+ kvm_set_posted_intr_wakeup_handler(NULL);
+
if (nested)
nested_vmx_hardware_unsetup();
@@ -7885,8 +7887,6 @@ static __init int hardware_setup(void)
vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit;
}
- kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
-
kvm_mce_cap_supported |= MCG_LMCE_P;
if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
@@ -7910,6 +7910,9 @@ static __init int hardware_setup(void)
r = alloc_kvm_area();
if (r)
nested_vmx_hardware_unsetup();
+
+ kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
+
return r;
}
The patch below does not apply to the 5.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 2bb2e00ed9787e52580bb651264b8d6a2b7a9dd2 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana(a)suse.com>
Date: Wed, 13 Oct 2021 10:12:49 +0100
Subject: [PATCH] btrfs: fix deadlock between chunk allocation and chunk btree
modifications
When a task is doing some modification to the chunk btree and it is not in
the context of a chunk allocation or a chunk removal, it can deadlock with
another task that is currently allocating a new data or metadata chunk.
These contexts are the following:
* When relocating a system chunk, when we need to COW the extent buffers
that belong to the chunk btree;
* When adding a new device (ioctl), where we need to add a new device item
to the chunk btree;
* When removing a device (ioctl), where we need to remove a device item
from the chunk btree;
* When resizing a device (ioctl), where we need to update a device item in
the chunk btree and may need to relocate a system chunk that lies beyond
the new device size when shrinking a device.
The problem happens due to a sequence of steps like the following:
1) Task A starts a data or metadata chunk allocation and it locks the
chunk mutex;
2) Task B is relocating a system chunk, and when it needs to COW an extent
buffer of the chunk btree, it has locked both that extent buffer as
well as its parent extent buffer;
3) Since there is not enough available system space, either because none
of the existing system block groups have enough free space or because
the only one with enough free space is in RO mode due to the relocation,
task B triggers a new system chunk allocation. It blocks when trying to
acquire the chunk mutex, currently held by task A;
4) Task A enters btrfs_chunk_alloc_add_chunk_item(), in order to insert
the new chunk item into the chunk btree and update the existing device
items there. But in order to do that, it has to lock the extent buffer
that task B locked at step 2, or its parent extent buffer, but task B
is waiting on the chunk mutex, which is currently locked by task A,
therefore resulting in a deadlock.
One example report when the deadlock happens with system chunk relocation:
INFO: task kworker/u9:5:546 blocked for more than 143 seconds.
Not tainted 5.15.0-rc3+ #1
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
task:kworker/u9:5 state:D stack:25936 pid: 546 ppid: 2 flags:0x00004000
Workqueue: events_unbound btrfs_async_reclaim_metadata_space
Call Trace:
context_switch kernel/sched/core.c:4940 [inline]
__schedule+0xcd9/0x2530 kernel/sched/core.c:6287
schedule+0xd3/0x270 kernel/sched/core.c:6366
rwsem_down_read_slowpath+0x4ee/0x9d0 kernel/locking/rwsem.c:993
__down_read_common kernel/locking/rwsem.c:1214 [inline]
__down_read kernel/locking/rwsem.c:1223 [inline]
down_read_nested+0xe6/0x440 kernel/locking/rwsem.c:1590
__btrfs_tree_read_lock+0x31/0x350 fs/btrfs/locking.c:47
btrfs_tree_read_lock fs/btrfs/locking.c:54 [inline]
btrfs_read_lock_root_node+0x8a/0x320 fs/btrfs/locking.c:191
btrfs_search_slot_get_root fs/btrfs/ctree.c:1623 [inline]
btrfs_search_slot+0x13b4/0x2140 fs/btrfs/ctree.c:1728
btrfs_update_device+0x11f/0x500 fs/btrfs/volumes.c:2794
btrfs_chunk_alloc_add_chunk_item+0x34d/0xea0 fs/btrfs/volumes.c:5504
do_chunk_alloc fs/btrfs/block-group.c:3408 [inline]
btrfs_chunk_alloc+0x84d/0xf50 fs/btrfs/block-group.c:3653
flush_space+0x54e/0xd80 fs/btrfs/space-info.c:670
btrfs_async_reclaim_metadata_space+0x396/0xa90 fs/btrfs/space-info.c:953
process_one_work+0x9df/0x16d0 kernel/workqueue.c:2297
worker_thread+0x90/0xed0 kernel/workqueue.c:2444
kthread+0x3e5/0x4d0 kernel/kthread.c:319
ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295
INFO: task syz-executor:9107 blocked for more than 143 seconds.
Not tainted 5.15.0-rc3+ #1
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
task:syz-executor state:D stack:23200 pid: 9107 ppid: 7792 flags:0x00004004
Call Trace:
context_switch kernel/sched/core.c:4940 [inline]
__schedule+0xcd9/0x2530 kernel/sched/core.c:6287
schedule+0xd3/0x270 kernel/sched/core.c:6366
schedule_preempt_disabled+0xf/0x20 kernel/sched/core.c:6425
__mutex_lock_common kernel/locking/mutex.c:669 [inline]
__mutex_lock+0xc96/0x1680 kernel/locking/mutex.c:729
btrfs_chunk_alloc+0x31a/0xf50 fs/btrfs/block-group.c:3631
find_free_extent_update_loop fs/btrfs/extent-tree.c:3986 [inline]
find_free_extent+0x25cb/0x3a30 fs/btrfs/extent-tree.c:4335
btrfs_reserve_extent+0x1f1/0x500 fs/btrfs/extent-tree.c:4415
btrfs_alloc_tree_block+0x203/0x1120 fs/btrfs/extent-tree.c:4813
__btrfs_cow_block+0x412/0x1620 fs/btrfs/ctree.c:415
btrfs_cow_block+0x2f6/0x8c0 fs/btrfs/ctree.c:570
btrfs_search_slot+0x1094/0x2140 fs/btrfs/ctree.c:1768
relocate_tree_block fs/btrfs/relocation.c:2694 [inline]
relocate_tree_blocks+0xf73/0x1770 fs/btrfs/relocation.c:2757
relocate_block_group+0x47e/0xc70 fs/btrfs/relocation.c:3673
btrfs_relocate_block_group+0x48a/0xc60 fs/btrfs/relocation.c:4070
btrfs_relocate_chunk+0x96/0x280 fs/btrfs/volumes.c:3181
__btrfs_balance fs/btrfs/volumes.c:3911 [inline]
btrfs_balance+0x1f03/0x3cd0 fs/btrfs/volumes.c:4301
btrfs_ioctl_balance+0x61e/0x800 fs/btrfs/ioctl.c:4137
btrfs_ioctl+0x39ea/0x7b70 fs/btrfs/ioctl.c:4949
vfs_ioctl fs/ioctl.c:51 [inline]
__do_sys_ioctl fs/ioctl.c:874 [inline]
__se_sys_ioctl fs/ioctl.c:860 [inline]
__x64_sys_ioctl+0x193/0x200 fs/ioctl.c:860
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0xae
So fix this by making sure that whenever we try to modify the chunk btree
and we are neither in a chunk allocation context nor in a chunk remove
context, we reserve system space before modifying the chunk btree.
Reported-by: Hao Sun <sunhao.th(a)gmail.com>
Link: https://lore.kernel.org/linux-btrfs/CACkBjsax51i4mu6C0C3vJqQN3NR_iVuucoeG3U…
Fixes: 79bd37120b1495 ("btrfs: rework chunk allocation to avoid exhaustion of the system chunk array")
CC: stable(a)vger.kernel.org # 5.14+
Reviewed-by: Josef Bacik <josef(a)toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index de9aeb3733cf..f971d043469c 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -3425,25 +3425,6 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
goto out;
}
- /*
- * If this is a system chunk allocation then stop right here and do not
- * add the chunk item to the chunk btree. This is to prevent a deadlock
- * because this system chunk allocation can be triggered while COWing
- * some extent buffer of the chunk btree and while holding a lock on a
- * parent extent buffer, in which case attempting to insert the chunk
- * item (or update the device item) would result in a deadlock on that
- * parent extent buffer. In this case defer the chunk btree updates to
- * the second phase of chunk allocation and keep our reservation until
- * the second phase completes.
- *
- * This is a rare case and can only be triggered by the very few cases
- * we have where we need to touch the chunk btree outside chunk allocation
- * and chunk removal. These cases are basically adding a device, removing
- * a device or resizing a device.
- */
- if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
- return 0;
-
ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
/*
* Normally we are not expected to fail with -ENOSPC here, since we have
@@ -3576,14 +3557,14 @@ out:
* This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of
* the system chunk array due to concurrent allocations") provides more details.
*
- * For allocation of system chunks, we defer the updates and insertions into the
- * chunk btree to phase 2. This is to prevent deadlocks on extent buffers because
- * if the chunk allocation is triggered while COWing an extent buffer of the
- * chunk btree, we are holding a lock on the parent of that extent buffer and
- * doing the chunk btree updates and insertions can require locking that parent.
- * This is for the very few and rare cases where we update the chunk btree that
- * are not chunk allocation or chunk removal: adding a device, removing a device
- * or resizing a device.
+ * Allocation of system chunks does not happen through this function. A task that
+ * needs to update the chunk btree (the only btree that uses system chunks) must
+ * preallocate chunk space by calling either check_system_chunk() or
+ * btrfs_reserve_chunk_metadata() - the former is used when allocating a data or
+ * metadata chunk or when removing a chunk, while the latter is used before doing
+ * a modification to the chunk btree - use cases for the latter are adding,
+ * removing and resizing a device as well as relocation of a system chunk.
+ * See the comment below for more details.
*
* The reservation of system space, done through check_system_chunk(), as well
* as all the updates and insertions into the chunk btree must be done while
@@ -3620,11 +3601,27 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
if (trans->allocating_chunk)
return -ENOSPC;
/*
- * If we are removing a chunk, don't re-enter or we would deadlock.
- * System space reservation and system chunk allocation is done by the
- * chunk remove operation (btrfs_remove_chunk()).
+ * Allocation of system chunks can not happen through this path, as we
+ * could end up in a deadlock if we are allocating a data or metadata
+ * chunk and there is another task modifying the chunk btree.
+ *
+ * This is because while we are holding the chunk mutex, we will attempt
+ * to add the new chunk item to the chunk btree or update an existing
+ * device item in the chunk btree, while the other task that is modifying
+ * the chunk btree is attempting to COW an extent buffer while holding a
+ * lock on it and on its parent - if the COW operation triggers a system
+ * chunk allocation, then we can deadlock because we are holding the
+ * chunk mutex and we may need to access that extent buffer or its parent
+ * in order to add the chunk item or update a device item.
+ *
+ * Tasks that want to modify the chunk tree should reserve system space
+ * before updating the chunk btree, by calling either
+ * btrfs_reserve_chunk_metadata() or check_system_chunk().
+ * It's possible that after a task reserves the space, it still ends up
+ * here - this happens in the cases described above at do_chunk_alloc().
+ * The task will have to either retry or fail.
*/
- if (trans->removing_chunk)
+ if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
return -ENOSPC;
space_info = btrfs_find_space_info(fs_info, flags);
@@ -3723,17 +3720,14 @@ static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
return num_dev;
}
-/*
- * Reserve space in the system space for allocating or removing a chunk
- */
-void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
+static void reserve_chunk_space(struct btrfs_trans_handle *trans,
+ u64 bytes,
+ u64 type)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_space_info *info;
u64 left;
- u64 thresh;
int ret = 0;
- u64 num_devs;
/*
* Needed because we can end up allocating a system chunk and for an
@@ -3746,19 +3740,13 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
left = info->total_bytes - btrfs_space_info_used(info, true);
spin_unlock(&info->lock);
- num_devs = get_profile_num_devs(fs_info, type);
-
- /* num_devs device items to update and 1 chunk item to add or remove */
- thresh = btrfs_calc_metadata_size(fs_info, num_devs) +
- btrfs_calc_insert_metadata_size(fs_info, 1);
-
- if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
+ if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
- left, thresh, type);
+ left, bytes, type);
btrfs_dump_space_info(fs_info, info, 0, 0);
}
- if (left < thresh) {
+ if (left < bytes) {
u64 flags = btrfs_system_alloc_profile(fs_info);
struct btrfs_block_group *bg;
@@ -3767,21 +3755,20 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
* needing it, as we might not need to COW all nodes/leafs from
* the paths we visit in the chunk tree (they were already COWed
* or created in the current transaction for example).
- *
- * Also, if our caller is allocating a system chunk, do not
- * attempt to insert the chunk item in the chunk btree, as we
- * could deadlock on an extent buffer since our caller may be
- * COWing an extent buffer from the chunk btree.
*/
bg = btrfs_create_chunk(trans, flags);
if (IS_ERR(bg)) {
ret = PTR_ERR(bg);
- } else if (!(type & BTRFS_BLOCK_GROUP_SYSTEM)) {
+ } else {
/*
* If we fail to add the chunk item here, we end up
* trying again at phase 2 of chunk allocation, at
* btrfs_create_pending_block_groups(). So ignore
- * any error here.
+ * any error here. An ENOSPC here could happen, due to
+ * the cases described at do_chunk_alloc() - the system
+ * block group we just created was just turned into RO
+ * mode by a scrub for example, or a running discard
+ * temporarily removed its free space entries, etc.
*/
btrfs_chunk_alloc_add_chunk_item(trans, bg);
}
@@ -3790,12 +3777,61 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
if (!ret) {
ret = btrfs_block_rsv_add(fs_info->chunk_root,
&fs_info->chunk_block_rsv,
- thresh, BTRFS_RESERVE_NO_FLUSH);
+ bytes, BTRFS_RESERVE_NO_FLUSH);
if (!ret)
- trans->chunk_bytes_reserved += thresh;
+ trans->chunk_bytes_reserved += bytes;
}
}
+/*
+ * Reserve space in the system space for allocating or removing a chunk.
+ * The caller must be holding fs_info->chunk_mutex.
+ */
+void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ const u64 num_devs = get_profile_num_devs(fs_info, type);
+ u64 bytes;
+
+ /* num_devs device items to update and 1 chunk item to add or remove. */
+ bytes = btrfs_calc_metadata_size(fs_info, num_devs) +
+ btrfs_calc_insert_metadata_size(fs_info, 1);
+
+ reserve_chunk_space(trans, bytes, type);
+}
+
+/*
+ * Reserve space in the system space, if needed, for doing a modification to the
+ * chunk btree.
+ *
+ * @trans: A transaction handle.
+ * @is_item_insertion: Indicate if the modification is for inserting a new item
+ * in the chunk btree or if it's for the deletion or update
+ * of an existing item.
+ *
+ * This is used in a context where we need to update the chunk btree outside
+ * block group allocation and removal, to avoid a deadlock with a concurrent
+ * task that is allocating a metadata or data block group and therefore needs to
+ * update the chunk btree while holding the chunk mutex. After the update to the
+ * chunk btree is done, btrfs_trans_release_chunk_metadata() should be called.
+ *
+ */
+void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
+ bool is_item_insertion)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ u64 bytes;
+
+ if (is_item_insertion)
+ bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
+ else
+ bytes = btrfs_calc_metadata_size(fs_info, 1);
+
+ mutex_lock(&fs_info->chunk_mutex);
+ reserve_chunk_space(trans, bytes, BTRFS_BLOCK_GROUP_SYSTEM);
+ mutex_unlock(&fs_info->chunk_mutex);
+}
+
void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
{
struct btrfs_block_group *block_group;
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 07f977d3816c..5878b7ce3b78 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -293,6 +293,8 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
enum btrfs_chunk_alloc_enum force);
int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type);
void check_system_chunk(struct btrfs_trans_handle *trans, const u64 type);
+void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
+ bool is_item_insertion);
u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags);
void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
int btrfs_free_block_groups(struct btrfs_fs_info *info);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index fed823596248..33a0ee7ac590 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2692,8 +2692,12 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
list_add_tail(&node->list, &rc->backref_cache.changed);
} else {
path->lowest_level = node->level;
+ if (root == root->fs_info->chunk_root)
+ btrfs_reserve_chunk_metadata(trans, false);
ret = btrfs_search_slot(trans, root, key, path, 0, 1);
btrfs_release_path(path);
+ if (root == root->fs_info->chunk_root)
+ btrfs_trans_release_chunk_metadata(trans);
if (ret > 0)
ret = 0;
}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index debba6f04858..9eab8a741166 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1847,8 +1847,10 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
key.type = BTRFS_DEV_ITEM_KEY;
key.offset = device->devid;
+ btrfs_reserve_chunk_metadata(trans, true);
ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
&key, sizeof(*dev_item));
+ btrfs_trans_release_chunk_metadata(trans);
if (ret)
goto out;
@@ -1921,7 +1923,9 @@ static int btrfs_rm_dev_item(struct btrfs_device *device)
key.type = BTRFS_DEV_ITEM_KEY;
key.offset = device->devid;
+ btrfs_reserve_chunk_metadata(trans, false);
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ btrfs_trans_release_chunk_metadata(trans);
if (ret) {
if (ret > 0)
ret = -ENOENT;
@@ -2513,7 +2517,9 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
key.type = BTRFS_DEV_ITEM_KEY;
while (1) {
+ btrfs_reserve_chunk_metadata(trans, false);
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ btrfs_trans_release_chunk_metadata(trans);
if (ret < 0)
goto error;
@@ -2862,6 +2868,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
struct btrfs_super_block *super_copy = fs_info->super_copy;
u64 old_total;
u64 diff;
+ int ret;
if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
return -EACCES;
@@ -2890,7 +2897,11 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
&trans->transaction->dev_update_list);
mutex_unlock(&fs_info->chunk_mutex);
- return btrfs_update_device(trans, device);
+ btrfs_reserve_chunk_metadata(trans, false);
+ ret = btrfs_update_device(trans, device);
+ btrfs_trans_release_chunk_metadata(trans);
+
+ return ret;
}
static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
@@ -4925,8 +4936,10 @@ again:
round_down(old_total - diff, fs_info->sectorsize));
mutex_unlock(&fs_info->chunk_mutex);
+ btrfs_reserve_chunk_metadata(trans, false);
/* Now btrfs_update_device() will change the on-disk size. */
ret = btrfs_update_device(trans, device);
+ btrfs_trans_release_chunk_metadata(trans);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
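For whoever does the 5.14 backport: every call site this patch touches
follows the same pairing, reduced to the sketch below using only names
from the diff above (btrfs_trans_release_chunk_metadata() predates this
patch and is already present in the stable trees this is aimed at):

	/* Reserve system space before modifying the chunk btree outside
	 * chunk allocation/removal; release it right after. The bool
	 * selects the reservation size: true when inserting a new item,
	 * false when updating or deleting an existing one. */
	btrfs_reserve_chunk_metadata(trans, false);
	ret = btrfs_update_device(trans, device);
	btrfs_trans_release_chunk_metadata(trans);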
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 2bb2e00ed9787e52580bb651264b8d6a2b7a9dd2 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana(a)suse.com>
Date: Wed, 13 Oct 2021 10:12:49 +0100
Subject: [PATCH] btrfs: fix deadlock between chunk allocation and chunk btree
modifications
When a task is doing some modification to the chunk btree and it is not in
the context of a chunk allocation or a chunk removal, it can deadlock with
another task that is currently allocating a new data or metadata chunk.
These contexts are the following:
* When relocating a system chunk, when we need to COW the extent buffers
that belong to the chunk btree;
* When adding a new device (ioctl), where we need to add a new device item
to the chunk btree;
* When removing a device (ioctl), where we need to remove a device item
from the chunk btree;
* When resizing a device (ioctl), where we need to update a device item in
the chunk btree and may need to relocate a system chunk that lies beyond
the new device size when shrinking a device.
The problem happens due to a sequence of steps like the following:
1) Task A starts a data or metadata chunk allocation and it locks the
chunk mutex;
2) Task B is relocating a system chunk, and when it needs to COW an extent
buffer of the chunk btree, it has locked both that extent buffer as
well as its parent extent buffer;
3) Since there is not enough available system space, either because none
of the existing system block groups have enough free space or because
the only one with enough free space is in RO mode due to the relocation,
task B triggers a new system chunk allocation. It blocks when trying to
acquire the chunk mutex, currently held by task A;
4) Task A enters btrfs_chunk_alloc_add_chunk_item(), in order to insert
the new chunk item into the chunk btree and update the existing device
items there. But in order to do that, it has to lock the extent buffer
that task B locked at step 2, or its parent extent buffer, but task B
is waiting on the chunk mutex, which is currently locked by task A,
therefore resulting in a deadlock.
One example report when the deadlock happens with system chunk relocation:
INFO: task kworker/u9:5:546 blocked for more than 143 seconds.
Not tainted 5.15.0-rc3+ #1
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
task:kworker/u9:5 state:D stack:25936 pid: 546 ppid: 2 flags:0x00004000
Workqueue: events_unbound btrfs_async_reclaim_metadata_space
Call Trace:
context_switch kernel/sched/core.c:4940 [inline]
__schedule+0xcd9/0x2530 kernel/sched/core.c:6287
schedule+0xd3/0x270 kernel/sched/core.c:6366
rwsem_down_read_slowpath+0x4ee/0x9d0 kernel/locking/rwsem.c:993
__down_read_common kernel/locking/rwsem.c:1214 [inline]
__down_read kernel/locking/rwsem.c:1223 [inline]
down_read_nested+0xe6/0x440 kernel/locking/rwsem.c:1590
__btrfs_tree_read_lock+0x31/0x350 fs/btrfs/locking.c:47
btrfs_tree_read_lock fs/btrfs/locking.c:54 [inline]
btrfs_read_lock_root_node+0x8a/0x320 fs/btrfs/locking.c:191
btrfs_search_slot_get_root fs/btrfs/ctree.c:1623 [inline]
btrfs_search_slot+0x13b4/0x2140 fs/btrfs/ctree.c:1728
btrfs_update_device+0x11f/0x500 fs/btrfs/volumes.c:2794
btrfs_chunk_alloc_add_chunk_item+0x34d/0xea0 fs/btrfs/volumes.c:5504
do_chunk_alloc fs/btrfs/block-group.c:3408 [inline]
btrfs_chunk_alloc+0x84d/0xf50 fs/btrfs/block-group.c:3653
flush_space+0x54e/0xd80 fs/btrfs/space-info.c:670
btrfs_async_reclaim_metadata_space+0x396/0xa90 fs/btrfs/space-info.c:953
process_one_work+0x9df/0x16d0 kernel/workqueue.c:2297
worker_thread+0x90/0xed0 kernel/workqueue.c:2444
kthread+0x3e5/0x4d0 kernel/kthread.c:319
ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295
INFO: task syz-executor:9107 blocked for more than 143 seconds.
Not tainted 5.15.0-rc3+ #1
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
task:syz-executor state:D stack:23200 pid: 9107 ppid: 7792 flags:0x00004004
Call Trace:
context_switch kernel/sched/core.c:4940 [inline]
__schedule+0xcd9/0x2530 kernel/sched/core.c:6287
schedule+0xd3/0x270 kernel/sched/core.c:6366
schedule_preempt_disabled+0xf/0x20 kernel/sched/core.c:6425
__mutex_lock_common kernel/locking/mutex.c:669 [inline]
__mutex_lock+0xc96/0x1680 kernel/locking/mutex.c:729
btrfs_chunk_alloc+0x31a/0xf50 fs/btrfs/block-group.c:3631
find_free_extent_update_loop fs/btrfs/extent-tree.c:3986 [inline]
find_free_extent+0x25cb/0x3a30 fs/btrfs/extent-tree.c:4335
btrfs_reserve_extent+0x1f1/0x500 fs/btrfs/extent-tree.c:4415
btrfs_alloc_tree_block+0x203/0x1120 fs/btrfs/extent-tree.c:4813
__btrfs_cow_block+0x412/0x1620 fs/btrfs/ctree.c:415
btrfs_cow_block+0x2f6/0x8c0 fs/btrfs/ctree.c:570
btrfs_search_slot+0x1094/0x2140 fs/btrfs/ctree.c:1768
relocate_tree_block fs/btrfs/relocation.c:2694 [inline]
relocate_tree_blocks+0xf73/0x1770 fs/btrfs/relocation.c:2757
relocate_block_group+0x47e/0xc70 fs/btrfs/relocation.c:3673
btrfs_relocate_block_group+0x48a/0xc60 fs/btrfs/relocation.c:4070
btrfs_relocate_chunk+0x96/0x280 fs/btrfs/volumes.c:3181
__btrfs_balance fs/btrfs/volumes.c:3911 [inline]
btrfs_balance+0x1f03/0x3cd0 fs/btrfs/volumes.c:4301
btrfs_ioctl_balance+0x61e/0x800 fs/btrfs/ioctl.c:4137
btrfs_ioctl+0x39ea/0x7b70 fs/btrfs/ioctl.c:4949
vfs_ioctl fs/ioctl.c:51 [inline]
__do_sys_ioctl fs/ioctl.c:874 [inline]
__se_sys_ioctl fs/ioctl.c:860 [inline]
__x64_sys_ioctl+0x193/0x200 fs/ioctl.c:860
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0xae
So fix this by making sure that whenever we try to modify the chunk btree
and we are neither in a chunk allocation context nor in a chunk remove
context, we reserve system space before modifying the chunk btree.
Reported-by: Hao Sun <sunhao.th(a)gmail.com>
Link: https://lore.kernel.org/linux-btrfs/CACkBjsax51i4mu6C0C3vJqQN3NR_iVuucoeG3U…
Fixes: 79bd37120b1495 ("btrfs: rework chunk allocation to avoid exhaustion of the system chunk array")
CC: stable(a)vger.kernel.org # 5.14+
Reviewed-by: Josef Bacik <josef(a)toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index de9aeb3733cf..f971d043469c 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -3425,25 +3425,6 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
goto out;
}
- /*
- * If this is a system chunk allocation then stop right here and do not
- * add the chunk item to the chunk btree. This is to prevent a deadlock
- * because this system chunk allocation can be triggered while COWing
- * some extent buffer of the chunk btree and while holding a lock on a
- * parent extent buffer, in which case attempting to insert the chunk
- * item (or update the device item) would result in a deadlock on that
- * parent extent buffer. In this case defer the chunk btree updates to
- * the second phase of chunk allocation and keep our reservation until
- * the second phase completes.
- *
- * This is a rare case and can only be triggered by the very few cases
- * we have where we need to touch the chunk btree outside chunk allocation
- * and chunk removal. These cases are basically adding a device, removing
- * a device or resizing a device.
- */
- if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
- return 0;
-
ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
/*
* Normally we are not expected to fail with -ENOSPC here, since we have
@@ -3576,14 +3557,14 @@ out:
* This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of
* the system chunk array due to concurrent allocations") provides more details.
*
- * For allocation of system chunks, we defer the updates and insertions into the
- * chunk btree to phase 2. This is to prevent deadlocks on extent buffers because
- * if the chunk allocation is triggered while COWing an extent buffer of the
- * chunk btree, we are holding a lock on the parent of that extent buffer and
- * doing the chunk btree updates and insertions can require locking that parent.
- * This is for the very few and rare cases where we update the chunk btree that
- * are not chunk allocation or chunk removal: adding a device, removing a device
- * or resizing a device.
+ * Allocation of system chunks does not happen through this function. A task that
+ * needs to update the chunk btree (the only btree that uses system chunks) must
+ * preallocate chunk space by calling either check_system_chunk() or
+ * btrfs_reserve_chunk_metadata() - the former is used when allocating a data or
+ * metadata chunk or when removing a chunk, while the latter is used before doing
+ * a modification to the chunk btree - use cases for the latter are adding,
+ * removing and resizing a device as well as relocation of a system chunk.
+ * See the comment below for more details.
*
* The reservation of system space, done through check_system_chunk(), as well
* as all the updates and insertions into the chunk btree must be done while
@@ -3620,11 +3601,27 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
if (trans->allocating_chunk)
return -ENOSPC;
/*
- * If we are removing a chunk, don't re-enter or we would deadlock.
- * System space reservation and system chunk allocation is done by the
- * chunk remove operation (btrfs_remove_chunk()).
+ * Allocation of system chunks can not happen through this path, as we
+ * could end up in a deadlock if we are allocating a data or metadata
+ * chunk and there is another task modifying the chunk btree.
+ *
+ * This is because while we are holding the chunk mutex, we will attempt
+ * to add the new chunk item to the chunk btree or update an existing
+ * device item in the chunk btree, while the other task that is modifying
+ * the chunk btree is attempting to COW an extent buffer while holding a
+ * lock on it and on its parent - if the COW operation triggers a system
+ * chunk allocation, then we can deadlock because we are holding the
+ * chunk mutex and we may need to access that extent buffer or its parent
+ * in order to add the chunk item or update a device item.
+ *
+ * Tasks that want to modify the chunk tree should reserve system space
+ * before updating the chunk btree, by calling either
+ * btrfs_reserve_chunk_metadata() or check_system_chunk().
+ * It's possible that after a task reserves the space, it still ends up
+ * here - this happens in the cases described above at do_chunk_alloc().
+ * The task will have to either retry or fail.
*/
- if (trans->removing_chunk)
+ if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
return -ENOSPC;
space_info = btrfs_find_space_info(fs_info, flags);
@@ -3723,17 +3720,14 @@ static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
return num_dev;
}
-/*
- * Reserve space in the system space for allocating or removing a chunk
- */
-void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
+static void reserve_chunk_space(struct btrfs_trans_handle *trans,
+ u64 bytes,
+ u64 type)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_space_info *info;
u64 left;
- u64 thresh;
int ret = 0;
- u64 num_devs;
/*
* Needed because we can end up allocating a system chunk and for an
@@ -3746,19 +3740,13 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
left = info->total_bytes - btrfs_space_info_used(info, true);
spin_unlock(&info->lock);
- num_devs = get_profile_num_devs(fs_info, type);
-
- /* num_devs device items to update and 1 chunk item to add or remove */
- thresh = btrfs_calc_metadata_size(fs_info, num_devs) +
- btrfs_calc_insert_metadata_size(fs_info, 1);
-
- if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
+ if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
- left, thresh, type);
+ left, bytes, type);
btrfs_dump_space_info(fs_info, info, 0, 0);
}
- if (left < thresh) {
+ if (left < bytes) {
u64 flags = btrfs_system_alloc_profile(fs_info);
struct btrfs_block_group *bg;
@@ -3767,21 +3755,20 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
* needing it, as we might not need to COW all nodes/leafs from
* the paths we visit in the chunk tree (they were already COWed
* or created in the current transaction for example).
- *
- * Also, if our caller is allocating a system chunk, do not
- * attempt to insert the chunk item in the chunk btree, as we
- * could deadlock on an extent buffer since our caller may be
- * COWing an extent buffer from the chunk btree.
*/
bg = btrfs_create_chunk(trans, flags);
if (IS_ERR(bg)) {
ret = PTR_ERR(bg);
- } else if (!(type & BTRFS_BLOCK_GROUP_SYSTEM)) {
+ } else {
/*
* If we fail to add the chunk item here, we end up
* trying again at phase 2 of chunk allocation, at
* btrfs_create_pending_block_groups(). So ignore
- * any error here.
+ * any error here. An ENOSPC here could happen, due to
+ * the cases described at do_chunk_alloc() - the system
+ * block group we just created was just turned into RO
+ * mode by a scrub for example, or a running discard
+ * temporarily removed its free space entries, etc.
*/
btrfs_chunk_alloc_add_chunk_item(trans, bg);
}
@@ -3790,12 +3777,61 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
if (!ret) {
ret = btrfs_block_rsv_add(fs_info->chunk_root,
&fs_info->chunk_block_rsv,
- thresh, BTRFS_RESERVE_NO_FLUSH);
+ bytes, BTRFS_RESERVE_NO_FLUSH);
if (!ret)
- trans->chunk_bytes_reserved += thresh;
+ trans->chunk_bytes_reserved += bytes;
}
}
+/*
+ * Reserve space in the system space for allocating or removing a chunk.
+ * The caller must be holding fs_info->chunk_mutex.
+ */
+void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ const u64 num_devs = get_profile_num_devs(fs_info, type);
+ u64 bytes;
+
+ /* num_devs device items to update and 1 chunk item to add or remove. */
+ bytes = btrfs_calc_metadata_size(fs_info, num_devs) +
+ btrfs_calc_insert_metadata_size(fs_info, 1);
+
+ reserve_chunk_space(trans, bytes, type);
+}
+
+/*
+ * Reserve space in the system space, if needed, for doing a modification to the
+ * chunk btree.
+ *
+ * @trans: A transaction handle.
+ * @is_item_insertion: Indicate if the modification is for inserting a new item
+ * in the chunk btree or if it's for the deletion or update
+ * of an existing item.
+ *
+ * This is used in a context where we need to update the chunk btree outside
+ * block group allocation and removal, to avoid a deadlock with a concurrent
+ * task that is allocating a metadata or data block group and therefore needs to
+ * update the chunk btree while holding the chunk mutex. After the update to the
+ * chunk btree is done, btrfs_trans_release_chunk_metadata() should be called.
+ *
+ */
+void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
+ bool is_item_insertion)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ u64 bytes;
+
+ if (is_item_insertion)
+ bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
+ else
+ bytes = btrfs_calc_metadata_size(fs_info, 1);
+
+ mutex_lock(&fs_info->chunk_mutex);
+ reserve_chunk_space(trans, bytes, BTRFS_BLOCK_GROUP_SYSTEM);
+ mutex_unlock(&fs_info->chunk_mutex);
+}
+
void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
{
struct btrfs_block_group *block_group;
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 07f977d3816c..5878b7ce3b78 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -293,6 +293,8 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
enum btrfs_chunk_alloc_enum force);
int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type);
void check_system_chunk(struct btrfs_trans_handle *trans, const u64 type);
+void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
+ bool is_item_insertion);
u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags);
void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
int btrfs_free_block_groups(struct btrfs_fs_info *info);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index fed823596248..33a0ee7ac590 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2692,8 +2692,12 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
list_add_tail(&node->list, &rc->backref_cache.changed);
} else {
path->lowest_level = node->level;
+ if (root == root->fs_info->chunk_root)
+ btrfs_reserve_chunk_metadata(trans, false);
ret = btrfs_search_slot(trans, root, key, path, 0, 1);
btrfs_release_path(path);
+ if (root == root->fs_info->chunk_root)
+ btrfs_trans_release_chunk_metadata(trans);
if (ret > 0)
ret = 0;
}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index debba6f04858..9eab8a741166 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1847,8 +1847,10 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
key.type = BTRFS_DEV_ITEM_KEY;
key.offset = device->devid;
+ btrfs_reserve_chunk_metadata(trans, true);
ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
&key, sizeof(*dev_item));
+ btrfs_trans_release_chunk_metadata(trans);
if (ret)
goto out;
@@ -1921,7 +1923,9 @@ static int btrfs_rm_dev_item(struct btrfs_device *device)
key.type = BTRFS_DEV_ITEM_KEY;
key.offset = device->devid;
+ btrfs_reserve_chunk_metadata(trans, false);
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ btrfs_trans_release_chunk_metadata(trans);
if (ret) {
if (ret > 0)
ret = -ENOENT;
@@ -2513,7 +2517,9 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
key.type = BTRFS_DEV_ITEM_KEY;
while (1) {
+ btrfs_reserve_chunk_metadata(trans, false);
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ btrfs_trans_release_chunk_metadata(trans);
if (ret < 0)
goto error;
@@ -2862,6 +2868,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
struct btrfs_super_block *super_copy = fs_info->super_copy;
u64 old_total;
u64 diff;
+ int ret;
if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
return -EACCES;
@@ -2890,7 +2897,11 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
&trans->transaction->dev_update_list);
mutex_unlock(&fs_info->chunk_mutex);
- return btrfs_update_device(trans, device);
+ btrfs_reserve_chunk_metadata(trans, false);
+ ret = btrfs_update_device(trans, device);
+ btrfs_trans_release_chunk_metadata(trans);
+
+ return ret;
}
static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
@@ -4925,8 +4936,10 @@ again:
round_down(old_total - diff, fs_info->sectorsize));
mutex_unlock(&fs_info->chunk_mutex);
+ btrfs_reserve_chunk_metadata(trans, false);
/* Now btrfs_update_device() will change the on-disk size. */
ret = btrfs_update_device(trans, device);
+ btrfs_trans_release_chunk_metadata(trans);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
> commit 8615ff6dd1ac9e01b6fcf0fc0652353f79f524ed
> Author: Yang Shi <shy828301(a)gmail.com>
> Date: Thu Oct 28 14:36:11 2021 -0700
>
> mm: filemap: check if THP has hwpoisoned subpage for PMD page fault
>
> commit eac96c3efdb593df1a57bb5b95dbe037bfa9a522 upstream.
For the sake of testing: besides breaking systemd-journal, postgresql is
another service that would hang forever at 100% CPU on arm64 (odroid-c4)
running Ubuntu 20.04.
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From b968e84b509da593c50dc3db679e1d33de701f78 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz(a)infradead.org>
Date: Fri, 17 Sep 2021 11:20:04 +0200
Subject: [PATCH] x86/iopl: Fake iopl(3) CLI/STI usage
Since commit c8137ace5638 ("x86/iopl: Restrict iopl() permission
scope") it's possible to emulate iopl(3) using ioperm(), except for
the CLI/STI usage.
Userspace CLI/STI usage is very dubious (read broken), since any
exception taken during that window can lead to rescheduling anyway (or
worse). The IOPL(2) manpage even states that usage of CLI/STI is highly
discouraged and might even crash the system.
Of course, that won't stop people and HP has the dubious honour of
being the first vendor to be found using this in their hp-health
package.
In order to enable this 'software' to still 'work', have the #GP treat
the CLI/STI instructions as NOPs when iopl(3). Warn the user that
their program is doing dubious things.
Fixes: a24ca9976843 ("x86/iopl: Remove legacy IOPL option")
Reported-by: Ondrej Zary <linux(a)zary.sk>
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Reviewed-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: stable(a)kernel.org # v5.5+
Link: https://lkml.kernel.org/r/20210918090641.GD5106@worktop.programming.kicks-a…
diff --git a/arch/x86/include/asm/insn-eval.h b/arch/x86/include/asm/insn-eval.h
index 91d7182ad2d6..4ec3613551e3 100644
--- a/arch/x86/include/asm/insn-eval.h
+++ b/arch/x86/include/asm/insn-eval.h
@@ -21,6 +21,7 @@ int insn_get_modrm_rm_off(struct insn *insn, struct pt_regs *regs);
int insn_get_modrm_reg_off(struct insn *insn, struct pt_regs *regs);
unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx);
int insn_get_code_seg_params(struct pt_regs *regs);
+int insn_get_effective_ip(struct pt_regs *regs, unsigned long *ip);
int insn_fetch_from_user(struct pt_regs *regs,
unsigned char buf[MAX_INSN_SIZE]);
int insn_fetch_from_user_inatomic(struct pt_regs *regs,
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 9ad2acaaae9b..577f342dbfb2 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -518,6 +518,7 @@ struct thread_struct {
*/
unsigned long iopl_emul;
+ unsigned int iopl_warn:1;
unsigned int sig_on_uaccess_err:1;
/*
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 1d9463e3096b..f2f733bcb2b9 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -132,6 +132,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg,
frame->ret_addr = (unsigned long) ret_from_fork;
p->thread.sp = (unsigned long) fork_frame;
p->thread.io_bitmap = NULL;
+ p->thread.iopl_warn = 0;
memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
#ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a58800973aed..f3f3034b06f3 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -528,6 +528,36 @@ static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs,
#define GPFSTR "general protection fault"
+static bool fixup_iopl_exception(struct pt_regs *regs)
+{
+ struct thread_struct *t = &current->thread;
+ unsigned char byte;
+ unsigned long ip;
+
+ if (!IS_ENABLED(CONFIG_X86_IOPL_IOPERM) || t->iopl_emul != 3)
+ return false;
+
+ if (insn_get_effective_ip(regs, &ip))
+ return false;
+
+ if (get_user(byte, (const char __user *)ip))
+ return false;
+
+ if (byte != 0xfa && byte != 0xfb)
+ return false;
+
+ if (!t->iopl_warn && printk_ratelimit()) {
+ pr_err("%s[%d] attempts to use CLI/STI, pretending it's a NOP, ip:%lx",
+ current->comm, task_pid_nr(current), ip);
+ print_vma_addr(KERN_CONT " in ", ip);
+ pr_cont("\n");
+ t->iopl_warn = 1;
+ }
+
+ regs->ip += 1;
+ return true;
+}
+
DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
{
char desc[sizeof(GPFSTR) + 50 + 2*sizeof(unsigned long) + 1] = GPFSTR;
@@ -553,6 +583,9 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
tsk = current;
if (user_mode(regs)) {
+ if (fixup_iopl_exception(regs))
+ goto exit;
+
tsk->thread.error_code = error_code;
tsk->thread.trap_nr = X86_TRAP_GP;
diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c
index a1d24fdc07cf..eb3ccffb9b9d 100644
--- a/arch/x86/lib/insn-eval.c
+++ b/arch/x86/lib/insn-eval.c
@@ -1417,7 +1417,7 @@ void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs)
}
}
-static int insn_get_effective_ip(struct pt_regs *regs, unsigned long *ip)
+int insn_get_effective_ip(struct pt_regs *regs, unsigned long *ip)
{
unsigned long seg_base = 0;
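[Editor's note: for illustration, a minimal userspace sketch of the
dubious pattern this fixup emulates. Hedged: assumes root, an x86
kernel with CONFIG_X86_IOPL_IOPERM=y and this patch applied; with iopl
emulation, EFLAGS.IOPL stays 0, so CLI/STI trap with #GP and are now
turned into NOPs instead of killing the task.]

#include <stdio.h>
#include <sys/io.h>

int main(void)
{
	if (iopl(3)) {			/* request I/O privilege level 3 */
		perror("iopl");
		return 1;
	}
	asm volatile("cli");		/* traps with #GP; the fixup NOPs it */
	/* ... port I/O the program believes is uninterruptible ... */
	asm volatile("sti");
	return 0;
}

On a patched kernel this runs to completion and logs the rate-limited
"attempts to use CLI/STI, pretending it's a NOP" message once per task.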
The patch below does not apply to the 5.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 432cce21b66c49b1951d046d3ffc5d52fdad6265 Mon Sep 17 00:00:00 2001
From: Sachi King <nakato(a)nakato.io>
Date: Sat, 2 Oct 2021 14:18:39 +1000
Subject: [PATCH] platform/x86: amd-pmc: Add alternative acpi id for PMC
controller
The Surface Laptop 4 AMD uses AMD0005 to identify this controller
instead of the appropriate ACPI ID AMDI0005. Include AMD0005 in the
ACPI ID list.
Link: https://github.com/linux-surface/acpidumps/tree/master/surface_laptop_4_amd
Link: https://gist.github.com/nakato/2a1a7df1a45fe680d7a08c583e1bf863
Cc: <stable(a)vger.kernel.org> # 5.14+
Signed-off-by: Sachi King <nakato(a)nakato.io>
Reviewed-by: Mario Limonciello <mario.limonciello(a)amd.com>
Link: https://lore.kernel.org/r/20211002041840.2058647-1-nakato@nakato.io
Signed-off-by: Hans de Goede <hdegoede(a)redhat.com>
diff --git a/drivers/platform/x86/amd-pmc.c b/drivers/platform/x86/amd-pmc.c
index 548432a2ea65..b7b6ad4629a7 100644
--- a/drivers/platform/x86/amd-pmc.c
+++ b/drivers/platform/x86/amd-pmc.c
@@ -555,6 +555,7 @@ static const struct acpi_device_id amd_pmc_acpi_ids[] = {
{"AMDI0006", 0},
{"AMDI0007", 0},
{"AMD0004", 0},
+ {"AMD0005", 0},
{ }
};
MODULE_DEVICE_TABLE(acpi, amd_pmc_acpi_ids);
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 432cce21b66c49b1951d046d3ffc5d52fdad6265 Mon Sep 17 00:00:00 2001
From: Sachi King <nakato(a)nakato.io>
Date: Sat, 2 Oct 2021 14:18:39 +1000
Subject: [PATCH] platform/x86: amd-pmc: Add alternative acpi id for PMC
controller
The Surface Laptop 4 AMD uses AMD0005 to identify this controller
instead of the appropriate ACPI ID AMDI0005. Include AMD0005 in the
ACPI ID list.
Link: https://github.com/linux-surface/acpidumps/tree/master/surface_laptop_4_amd
Link: https://gist.github.com/nakato/2a1a7df1a45fe680d7a08c583e1bf863
Cc: <stable(a)vger.kernel.org> # 5.14+
Signed-off-by: Sachi King <nakato(a)nakato.io>
Reviewed-by: Mario Limonciello <mario.limonciello(a)amd.com>
Link: https://lore.kernel.org/r/20211002041840.2058647-1-nakato@nakato.io
Signed-off-by: Hans de Goede <hdegoede(a)redhat.com>
diff --git a/drivers/platform/x86/amd-pmc.c b/drivers/platform/x86/amd-pmc.c
index 548432a2ea65..b7b6ad4629a7 100644
--- a/drivers/platform/x86/amd-pmc.c
+++ b/drivers/platform/x86/amd-pmc.c
@@ -555,6 +555,7 @@ static const struct acpi_device_id amd_pmc_acpi_ids[] = {
{"AMDI0006", 0},
{"AMDI0007", 0},
{"AMD0004", 0},
+ {"AMD0005", 0},
{ }
};
MODULE_DEVICE_TABLE(acpi, amd_pmc_acpi_ids);
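[Editor's note: a hedged sketch of the ACPI match-table pattern the
one-line change above extends. The driver skeleton and all names here
are hypothetical; only the table/MODULE_DEVICE_TABLE shape mirrors the
real driver. The driver core binds the platform driver to any device
whose ACPI _HID matches a table entry, so listing the firmware's
AMD0005 variant is enough to probe on the Surface Laptop 4.]

#include <linux/acpi.h>
#include <linux/module.h>
#include <linux/platform_device.h>

static const struct acpi_device_id example_pmc_acpi_ids[] = {
	{ "AMDI0005", 0 },	/* canonical ACPI ID */
	{ "AMD0005", 0 },	/* firmware variant seen on Surface Laptop 4 */
	{ }
};
MODULE_DEVICE_TABLE(acpi, example_pmc_acpi_ids);

static int example_pmc_probe(struct platform_device *pdev)
{
	dev_info(&pdev->dev, "matched via ACPI ID\n");
	return 0;
}

static struct platform_driver example_pmc_driver = {
	.probe = example_pmc_probe,
	.driver = {
		.name = "example-pmc",
		.acpi_match_table = example_pmc_acpi_ids,
	},
};
module_platform_driver(example_pmc_driver);
MODULE_LICENSE("GPL");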
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 95384b3e47afa04d7dfa014f6a52662852645578 Mon Sep 17 00:00:00 2001
From: "Zephaniah E. Loss-Cutler-Hull" <zephaniah(a)gmail.com>
Date: Mon, 4 Oct 2021 21:48:55 -0700
Subject: [PATCH] platform/x86: gigabyte-wmi: add support for B550 AORUS ELITE
AX V2
This works just fine on my system.
Signed-off-by: Zephaniah E. Loss-Cutler-Hull <zephaniah(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Link: https://lore.kernel.org/r/20211005044855.1429724-1-zephaniah@gmail.com
Signed-off-by: Hans de Goede <hdegoede(a)redhat.com>
diff --git a/drivers/platform/x86/gigabyte-wmi.c b/drivers/platform/x86/gigabyte-wmi.c
index d53634c8a6e0..658bab4b7964 100644
--- a/drivers/platform/x86/gigabyte-wmi.c
+++ b/drivers/platform/x86/gigabyte-wmi.c
@@ -141,6 +141,7 @@ static u8 gigabyte_wmi_detect_sensor_usability(struct wmi_device *wdev)
static const struct dmi_system_id gigabyte_wmi_known_working_platforms[] = {
DMI_EXACT_MATCH_GIGABYTE_BOARD_NAME("B450M S2H V2"),
+ DMI_EXACT_MATCH_GIGABYTE_BOARD_NAME("B550 AORUS ELITE AX V2"),
DMI_EXACT_MATCH_GIGABYTE_BOARD_NAME("B550 AORUS ELITE"),
DMI_EXACT_MATCH_GIGABYTE_BOARD_NAME("B550 AORUS ELITE V2"),
DMI_EXACT_MATCH_GIGABYTE_BOARD_NAME("B550 GAMING X V2"),
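[Editor's note: a hedged sketch of the DMI allow-list mechanism behind
the macro above. That the driver macro expands to an exact match on
board vendor plus board name is an assumption based on its name;
dmi_check_system() and DMI_EXACT_MATCH() are the standard kernel APIs.]

#include <linux/dmi.h>

static const struct dmi_system_id known_working_boards[] = {
	{
		.matches = {
			DMI_EXACT_MATCH(DMI_BOARD_VENDOR, "Gigabyte Technology Co., Ltd."),
			DMI_EXACT_MATCH(DMI_BOARD_NAME, "B550 AORUS ELITE AX V2"),
		},
	},
	{ }
};

static bool board_is_known_working(void)
{
	/* dmi_check_system() returns the number of matching entries */
	return dmi_check_system(known_working_boards) != 0;
}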
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 415de44076640483648d6c0f6d645a9ee61328ad Mon Sep 17 00:00:00 2001
From: Jane Malalane <jane.malalane(a)citrix.com>
Date: Thu, 21 Oct 2021 11:47:44 +0100
Subject: [PATCH] x86/cpu: Fix migration safety with X86_BUG_NULL_SEL
Currently, Linux probes for X86_BUG_NULL_SEL unconditionally which
makes it unsafe to migrate in a virtualised environment as the
properties across the migration pool might differ.
To be specific, the case which goes wrong is:
1. Zen1 (or earlier) and Zen2 (or later) in a migration pool
2. Linux boots on Zen2, probes and finds the absence of X86_BUG_NULL_SEL
3. Linux is then migrated to Zen1
Linux is now running on a X86_BUG_NULL_SEL-impacted CPU while believing
that the bug is fixed.
The only way to address the problem is to fully trust the "no longer
affected" CPUID bit when virtualised, because in the above case it would
be deliberately cleared to indicate "you might migrate to somewhere
which has this behaviour".
Zen3 adds the NullSelectorClearsBase CPUID bit to indicate that loading
a NULL segment selector zeroes the base and limit fields, as well as
the attributes. Zen2 also has this behaviour but doesn't have the NSCB
bit.
[ bp: Minor touchups. ]
Signed-off-by: Jane Malalane <jane.malalane(a)citrix.com>
Signed-off-by: Borislav Petkov <bp(a)suse.de>
CC: <stable(a)vger.kernel.org>
Link: https://lkml.kernel.org/r/20211021104744.24126-1-jane.malalane@citrix.com
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 2131af9f2fa2..4edb6f0f628c 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -989,6 +989,8 @@ static void init_amd(struct cpuinfo_x86 *c)
if (cpu_has(c, X86_FEATURE_IRPERF) &&
!cpu_has_amd_erratum(c, amd_erratum_1054))
msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT);
+
+ check_null_seg_clears_base(c);
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 325d6022599b..1bfeb186452a 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1397,9 +1397,8 @@ void __init early_cpu_init(void)
early_identify_cpu(&boot_cpu_data);
}
-static void detect_null_seg_behavior(struct cpuinfo_x86 *c)
+static bool detect_null_seg_behavior(void)
{
-#ifdef CONFIG_X86_64
/*
* Empirically, writing zero to a segment selector on AMD does
* not clear the base, whereas writing zero to a segment
@@ -1420,10 +1419,43 @@ static void detect_null_seg_behavior(struct cpuinfo_x86 *c)
wrmsrl(MSR_FS_BASE, 1);
loadsegment(fs, 0);
rdmsrl(MSR_FS_BASE, tmp);
- if (tmp != 0)
- set_cpu_bug(c, X86_BUG_NULL_SEG);
wrmsrl(MSR_FS_BASE, old_base);
-#endif
+ return tmp == 0;
+}
+
+void check_null_seg_clears_base(struct cpuinfo_x86 *c)
+{
+ /* BUG_NULL_SEG is only relevant with 64bit userspace */
+ if (!IS_ENABLED(CONFIG_X86_64))
+ return;
+
+ /* Zen3 CPUs advertise Null Selector Clears Base in CPUID. */
+ if (c->extended_cpuid_level >= 0x80000021 &&
+ cpuid_eax(0x80000021) & BIT(6))
+ return;
+
+ /*
+ * CPUID bit above wasn't set. If this kernel is still running
+ * as a HV guest, then the HV has decided not to advertize
+ * that CPUID bit for whatever reason. For example, one
+ * member of the migration pool might be vulnerable. Which
+ * means, the bug is present: set the BUG flag and return.
+ */
+ if (cpu_has(c, X86_FEATURE_HYPERVISOR)) {
+ set_cpu_bug(c, X86_BUG_NULL_SEG);
+ return;
+ }
+
+ /*
+ * Zen2 CPUs also have this behaviour, but no CPUID bit.
+ * 0x18 is the respective family for Hygon.
+ */
+ if ((c->x86 == 0x17 || c->x86 == 0x18) &&
+ detect_null_seg_behavior())
+ return;
+
+ /* All the remaining ones are affected */
+ set_cpu_bug(c, X86_BUG_NULL_SEG);
}
static void generic_identify(struct cpuinfo_x86 *c)
@@ -1459,8 +1491,6 @@ static void generic_identify(struct cpuinfo_x86 *c)
get_model_name(c); /* Default name */
- detect_null_seg_behavior(c);
-
/*
* ESPFIX is a strange bug. All real CPUs have it. Paravirt
* systems that run Linux at CPL > 0 may or may not have the
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 95521302630d..ee6f23f7587d 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -75,6 +75,7 @@ extern int detect_extended_topology_early(struct cpuinfo_x86 *c);
extern int detect_extended_topology(struct cpuinfo_x86 *c);
extern int detect_ht_early(struct cpuinfo_x86 *c);
extern void detect_ht(struct cpuinfo_x86 *c);
+extern void check_null_seg_clears_base(struct cpuinfo_x86 *c);
unsigned int aperfmperf_get_khz(int cpu);
diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c
index 6d50136f7ab9..3fcdda4c1e11 100644
--- a/arch/x86/kernel/cpu/hygon.c
+++ b/arch/x86/kernel/cpu/hygon.c
@@ -335,6 +335,8 @@ static void init_hygon(struct cpuinfo_x86 *c)
/* Hygon CPUs don't reset SS attributes on SYSRET, Xen does. */
if (!cpu_has(c, X86_FEATURE_XENPV))
set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
+
+ check_null_seg_clears_base(c);
}
static void cpu_detect_tlb_hygon(struct cpuinfo_x86 *c)
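[Editor's note: as an aside, a hedged userspace sketch that queries the
same NSCB bit the kernel checks here, CPUID leaf 0x80000021, EAX bit 6.
Assumes GCC/Clang's <cpuid.h>; on pre-Zen3 hardware the leaf simply is
not implemented.]

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* highest supported extended CPUID leaf */
	if (__get_cpuid_max(0x80000000, NULL) < 0x80000021) {
		puts("CPUID leaf 0x80000021 not implemented");
		return 0;
	}
	__cpuid(0x80000021, eax, ebx, ecx, edx);
	printf("NullSelectorClearsBase (NSCB): %s\n",
	       (eax & (1u << 6)) ? "set" : "clear");
	return 0;
}

Note the kernel deliberately does not trust the raw segment-load probe
when X86_FEATURE_HYPERVISOR is set, for exactly the migration reason
described above.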
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 415de44076640483648d6c0f6d645a9ee61328ad Mon Sep 17 00:00:00 2001
From: Jane Malalane <jane.malalane(a)citrix.com>
Date: Thu, 21 Oct 2021 11:47:44 +0100
Subject: [PATCH] x86/cpu: Fix migration safety with X86_BUG_NULL_SEL
Currently, Linux probes for X86_BUG_NULL_SEL unconditionally which
makes it unsafe to migrate in a virtualised environment as the
properties across the migration pool might differ.
To be specific, the case which goes wrong is:
1. Zen1 (or earlier) and Zen2 (or later) in a migration pool
2. Linux boots on Zen2, probes and finds the absence of X86_BUG_NULL_SEL
3. Linux is then migrated to Zen1
Linux is now running on a X86_BUG_NULL_SEL-impacted CPU while believing
that the bug is fixed.
The only way to address the problem is to fully trust the "no longer
affected" CPUID bit when virtualised, because in the above case it would
be deliberately cleared to indicate "you might migrate to somewhere
which has this behaviour".
Zen3 adds the NullSelectorClearsBase CPUID bit to indicate that loading
a NULL segment selector zeroes the base and limit fields, as well as
the attributes. Zen2 also has this behaviour but doesn't have the NSCB
bit.
[ bp: Minor touchups. ]
Signed-off-by: Jane Malalane <jane.malalane(a)citrix.com>
Signed-off-by: Borislav Petkov <bp(a)suse.de>
CC: <stable(a)vger.kernel.org>
Link: https://lkml.kernel.org/r/20211021104744.24126-1-jane.malalane@citrix.com
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 2131af9f2fa2..4edb6f0f628c 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -989,6 +989,8 @@ static void init_amd(struct cpuinfo_x86 *c)
if (cpu_has(c, X86_FEATURE_IRPERF) &&
!cpu_has_amd_erratum(c, amd_erratum_1054))
msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT);
+
+ check_null_seg_clears_base(c);
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 325d6022599b..1bfeb186452a 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1397,9 +1397,8 @@ void __init early_cpu_init(void)
early_identify_cpu(&boot_cpu_data);
}
-static void detect_null_seg_behavior(struct cpuinfo_x86 *c)
+static bool detect_null_seg_behavior(void)
{
-#ifdef CONFIG_X86_64
/*
* Empirically, writing zero to a segment selector on AMD does
* not clear the base, whereas writing zero to a segment
@@ -1420,10 +1419,43 @@ static void detect_null_seg_behavior(struct cpuinfo_x86 *c)
wrmsrl(MSR_FS_BASE, 1);
loadsegment(fs, 0);
rdmsrl(MSR_FS_BASE, tmp);
- if (tmp != 0)
- set_cpu_bug(c, X86_BUG_NULL_SEG);
wrmsrl(MSR_FS_BASE, old_base);
-#endif
+ return tmp == 0;
+}
+
+void check_null_seg_clears_base(struct cpuinfo_x86 *c)
+{
+ /* BUG_NULL_SEG is only relevant with 64bit userspace */
+ if (!IS_ENABLED(CONFIG_X86_64))
+ return;
+
+ /* Zen3 CPUs advertise Null Selector Clears Base in CPUID. */
+ if (c->extended_cpuid_level >= 0x80000021 &&
+ cpuid_eax(0x80000021) & BIT(6))
+ return;
+
+ /*
+ * CPUID bit above wasn't set. If this kernel is still running
+ * as a HV guest, then the HV has decided not to advertize
+ * that CPUID bit for whatever reason. For example, one
+ * member of the migration pool might be vulnerable. Which
+ * means, the bug is present: set the BUG flag and return.
+ */
+ if (cpu_has(c, X86_FEATURE_HYPERVISOR)) {
+ set_cpu_bug(c, X86_BUG_NULL_SEG);
+ return;
+ }
+
+ /*
+ * Zen2 CPUs also have this behaviour, but no CPUID bit.
+ * 0x18 is the respective family for Hygon.
+ */
+ if ((c->x86 == 0x17 || c->x86 == 0x18) &&
+ detect_null_seg_behavior())
+ return;
+
+ /* All the remaining ones are affected */
+ set_cpu_bug(c, X86_BUG_NULL_SEG);
}
static void generic_identify(struct cpuinfo_x86 *c)
@@ -1459,8 +1491,6 @@ static void generic_identify(struct cpuinfo_x86 *c)
get_model_name(c); /* Default name */
- detect_null_seg_behavior(c);
-
/*
* ESPFIX is a strange bug. All real CPUs have it. Paravirt
* systems that run Linux at CPL > 0 may or may not have the
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 95521302630d..ee6f23f7587d 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -75,6 +75,7 @@ extern int detect_extended_topology_early(struct cpuinfo_x86 *c);
extern int detect_extended_topology(struct cpuinfo_x86 *c);
extern int detect_ht_early(struct cpuinfo_x86 *c);
extern void detect_ht(struct cpuinfo_x86 *c);
+extern void check_null_seg_clears_base(struct cpuinfo_x86 *c);
unsigned int aperfmperf_get_khz(int cpu);
diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c
index 6d50136f7ab9..3fcdda4c1e11 100644
--- a/arch/x86/kernel/cpu/hygon.c
+++ b/arch/x86/kernel/cpu/hygon.c
@@ -335,6 +335,8 @@ static void init_hygon(struct cpuinfo_x86 *c)
/* Hygon CPUs don't reset SS attributes on SYSRET, Xen does. */
if (!cpu_has(c, X86_FEATURE_XENPV))
set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
+
+ check_null_seg_clears_base(c);
}
static void cpu_detect_tlb_hygon(struct cpuinfo_x86 *c)
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 415de44076640483648d6c0f6d645a9ee61328ad Mon Sep 17 00:00:00 2001
From: Jane Malalane <jane.malalane(a)citrix.com>
Date: Thu, 21 Oct 2021 11:47:44 +0100
Subject: [PATCH] x86/cpu: Fix migration safety with X86_BUG_NULL_SEL
Currently, Linux probes for X86_BUG_NULL_SEL unconditionally which
makes it unsafe to migrate in a virtualised environment as the
properties across the migration pool might differ.
To be specific, the case which goes wrong is:
1. Zen1 (or earlier) and Zen2 (or later) in a migration pool
2. Linux boots on Zen2, probes and finds the absence of X86_BUG_NULL_SEL
3. Linux is then migrated to Zen1
Linux is now running on a X86_BUG_NULL_SEL-impacted CPU while believing
that the bug is fixed.
The only way to address the problem is to fully trust the "no longer
affected" CPUID bit when virtualised, because in the above case it would
be deliberately cleared to indicate "you might migrate to somewhere
which has this behaviour".
Zen3 adds the NullSelectorClearsBase CPUID bit to indicate that loading
a NULL segment selector zeroes the base and limit fields, as well as
the attributes. Zen2 also has this behaviour but doesn't have the NSCB
bit.
[ bp: Minor touchups. ]
Signed-off-by: Jane Malalane <jane.malalane(a)citrix.com>
Signed-off-by: Borislav Petkov <bp(a)suse.de>
CC: <stable(a)vger.kernel.org>
Link: https://lkml.kernel.org/r/20211021104744.24126-1-jane.malalane@citrix.com
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 2131af9f2fa2..4edb6f0f628c 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -989,6 +989,8 @@ static void init_amd(struct cpuinfo_x86 *c)
if (cpu_has(c, X86_FEATURE_IRPERF) &&
!cpu_has_amd_erratum(c, amd_erratum_1054))
msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT);
+
+ check_null_seg_clears_base(c);
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 325d6022599b..1bfeb186452a 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1397,9 +1397,8 @@ void __init early_cpu_init(void)
early_identify_cpu(&boot_cpu_data);
}
-static void detect_null_seg_behavior(struct cpuinfo_x86 *c)
+static bool detect_null_seg_behavior(void)
{
-#ifdef CONFIG_X86_64
/*
* Empirically, writing zero to a segment selector on AMD does
* not clear the base, whereas writing zero to a segment
@@ -1420,10 +1419,43 @@ static void detect_null_seg_behavior(struct cpuinfo_x86 *c)
wrmsrl(MSR_FS_BASE, 1);
loadsegment(fs, 0);
rdmsrl(MSR_FS_BASE, tmp);
- if (tmp != 0)
- set_cpu_bug(c, X86_BUG_NULL_SEG);
wrmsrl(MSR_FS_BASE, old_base);
-#endif
+ return tmp == 0;
+}
+
+void check_null_seg_clears_base(struct cpuinfo_x86 *c)
+{
+ /* BUG_NULL_SEG is only relevant with 64bit userspace */
+ if (!IS_ENABLED(CONFIG_X86_64))
+ return;
+
+ /* Zen3 CPUs advertise Null Selector Clears Base in CPUID. */
+ if (c->extended_cpuid_level >= 0x80000021 &&
+ cpuid_eax(0x80000021) & BIT(6))
+ return;
+
+ /*
+ * CPUID bit above wasn't set. If this kernel is still running
+ * as a HV guest, then the HV has decided not to advertize
+ * that CPUID bit for whatever reason. For example, one
+ * member of the migration pool might be vulnerable. Which
+ * means, the bug is present: set the BUG flag and return.
+ */
+ if (cpu_has(c, X86_FEATURE_HYPERVISOR)) {
+ set_cpu_bug(c, X86_BUG_NULL_SEG);
+ return;
+ }
+
+ /*
+ * Zen2 CPUs also have this behaviour, but no CPUID bit.
+ * 0x18 is the respective family for Hygon.
+ */
+ if ((c->x86 == 0x17 || c->x86 == 0x18) &&
+ detect_null_seg_behavior())
+ return;
+
+ /* All the remaining ones are affected */
+ set_cpu_bug(c, X86_BUG_NULL_SEG);
}
static void generic_identify(struct cpuinfo_x86 *c)
@@ -1459,8 +1491,6 @@ static void generic_identify(struct cpuinfo_x86 *c)
get_model_name(c); /* Default name */
- detect_null_seg_behavior(c);
-
/*
* ESPFIX is a strange bug. All real CPUs have it. Paravirt
* systems that run Linux at CPL > 0 may or may not have the
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 95521302630d..ee6f23f7587d 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -75,6 +75,7 @@ extern int detect_extended_topology_early(struct cpuinfo_x86 *c);
extern int detect_extended_topology(struct cpuinfo_x86 *c);
extern int detect_ht_early(struct cpuinfo_x86 *c);
extern void detect_ht(struct cpuinfo_x86 *c);
+extern void check_null_seg_clears_base(struct cpuinfo_x86 *c);
unsigned int aperfmperf_get_khz(int cpu);
diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c
index 6d50136f7ab9..3fcdda4c1e11 100644
--- a/arch/x86/kernel/cpu/hygon.c
+++ b/arch/x86/kernel/cpu/hygon.c
@@ -335,6 +335,8 @@ static void init_hygon(struct cpuinfo_x86 *c)
/* Hygon CPUs don't reset SS attributes on SYSRET, Xen does. */
if (!cpu_has(c, X86_FEATURE_XENPV))
set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
+
+ check_null_seg_clears_base(c);
}
static void cpu_detect_tlb_hygon(struct cpuinfo_x86 *c)
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 415de44076640483648d6c0f6d645a9ee61328ad Mon Sep 17 00:00:00 2001
From: Jane Malalane <jane.malalane(a)citrix.com>
Date: Thu, 21 Oct 2021 11:47:44 +0100
Subject: [PATCH] x86/cpu: Fix migration safety with X86_BUG_NULL_SEL
Currently, Linux probes for X86_BUG_NULL_SEL unconditionally which
makes it unsafe to migrate in a virtualised environment as the
properties across the migration pool might differ.
To be specific, the case which goes wrong is:
1. Zen1 (or earlier) and Zen2 (or later) in a migration pool
2. Linux boots on Zen2, probes and finds the absence of X86_BUG_NULL_SEL
3. Linux is then migrated to Zen1
Linux is now running on a X86_BUG_NULL_SEL-impacted CPU while believing
that the bug is fixed.
The only way to address the problem is to fully trust the "no longer
affected" CPUID bit when virtualised, because in the above case it would
be deliberately cleared to indicate "you might migrate to somewhere
which has this behaviour".
Zen3 adds the NullSelectorClearsBase CPUID bit to indicate that loading
a NULL segment selector zeroes the base and limit fields, as well as
the attributes. Zen2 also has this behaviour but doesn't have the NSCB
bit.
[ bp: Minor touchups. ]
Signed-off-by: Jane Malalane <jane.malalane(a)citrix.com>
Signed-off-by: Borislav Petkov <bp(a)suse.de>
CC: <stable(a)vger.kernel.org>
Link: https://lkml.kernel.org/r/20211021104744.24126-1-jane.malalane@citrix.com
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 2131af9f2fa2..4edb6f0f628c 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -989,6 +989,8 @@ static void init_amd(struct cpuinfo_x86 *c)
if (cpu_has(c, X86_FEATURE_IRPERF) &&
!cpu_has_amd_erratum(c, amd_erratum_1054))
msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT);
+
+ check_null_seg_clears_base(c);
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 325d6022599b..1bfeb186452a 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1397,9 +1397,8 @@ void __init early_cpu_init(void)
early_identify_cpu(&boot_cpu_data);
}
-static void detect_null_seg_behavior(struct cpuinfo_x86 *c)
+static bool detect_null_seg_behavior(void)
{
-#ifdef CONFIG_X86_64
/*
* Empirically, writing zero to a segment selector on AMD does
* not clear the base, whereas writing zero to a segment
@@ -1420,10 +1419,43 @@ static void detect_null_seg_behavior(struct cpuinfo_x86 *c)
wrmsrl(MSR_FS_BASE, 1);
loadsegment(fs, 0);
rdmsrl(MSR_FS_BASE, tmp);
- if (tmp != 0)
- set_cpu_bug(c, X86_BUG_NULL_SEG);
wrmsrl(MSR_FS_BASE, old_base);
-#endif
+ return tmp == 0;
+}
+
+void check_null_seg_clears_base(struct cpuinfo_x86 *c)
+{
+ /* BUG_NULL_SEG is only relevant with 64bit userspace */
+ if (!IS_ENABLED(CONFIG_X86_64))
+ return;
+
+ /* Zen3 CPUs advertise Null Selector Clears Base in CPUID. */
+ if (c->extended_cpuid_level >= 0x80000021 &&
+ cpuid_eax(0x80000021) & BIT(6))
+ return;
+
+ /*
+ * CPUID bit above wasn't set. If this kernel is still running
+ * as a HV guest, then the HV has decided not to advertize
+ * that CPUID bit for whatever reason. For example, one
+ * member of the migration pool might be vulnerable. Which
+ * means, the bug is present: set the BUG flag and return.
+ */
+ if (cpu_has(c, X86_FEATURE_HYPERVISOR)) {
+ set_cpu_bug(c, X86_BUG_NULL_SEG);
+ return;
+ }
+
+ /*
+ * Zen2 CPUs also have this behaviour, but no CPUID bit.
+ * 0x18 is the respective family for Hygon.
+ */
+ if ((c->x86 == 0x17 || c->x86 == 0x18) &&
+ detect_null_seg_behavior())
+ return;
+
+ /* All the remaining ones are affected */
+ set_cpu_bug(c, X86_BUG_NULL_SEG);
}
static void generic_identify(struct cpuinfo_x86 *c)
@@ -1459,8 +1491,6 @@ static void generic_identify(struct cpuinfo_x86 *c)
get_model_name(c); /* Default name */
- detect_null_seg_behavior(c);
-
/*
* ESPFIX is a strange bug. All real CPUs have it. Paravirt
* systems that run Linux at CPL > 0 may or may not have the
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 95521302630d..ee6f23f7587d 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -75,6 +75,7 @@ extern int detect_extended_topology_early(struct cpuinfo_x86 *c);
extern int detect_extended_topology(struct cpuinfo_x86 *c);
extern int detect_ht_early(struct cpuinfo_x86 *c);
extern void detect_ht(struct cpuinfo_x86 *c);
+extern void check_null_seg_clears_base(struct cpuinfo_x86 *c);
unsigned int aperfmperf_get_khz(int cpu);
diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c
index 6d50136f7ab9..3fcdda4c1e11 100644
--- a/arch/x86/kernel/cpu/hygon.c
+++ b/arch/x86/kernel/cpu/hygon.c
@@ -335,6 +335,8 @@ static void init_hygon(struct cpuinfo_x86 *c)
/* Hygon CPUs don't reset SS attributes on SYSRET, Xen does. */
if (!cpu_has(c, X86_FEATURE_XENPV))
set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
+
+ check_null_seg_clears_base(c);
}
static void cpu_detect_tlb_hygon(struct cpuinfo_x86 *c)
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 39fec6889d15a658c3a3ebb06fd69d3584ddffd3 Mon Sep 17 00:00:00 2001
From: Shaoying Xu <shaoyi(a)amazon.com>
Date: Thu, 2 Sep 2021 16:44:12 +0000
Subject: [PATCH] ext4: fix lazy initialization next schedule time computation
in more granular unit
The ext4 file system sets up lazy inode table initialization by default
once it is mounted. However, it has an issue in computing the next
schedule time: the timeout ends up the same number of jiffies but a
different real time in seconds under different HZ values. Fix this by
measuring the elapsed time in a more granular unit, nanoseconds, and
make the next schedule time independent of the HZ value.
Fixes: bfff68738f1c ("ext4: add support for lazy inode table initialization")
Signed-off-by: Shaoying Xu <shaoyi(a)amazon.com>
Cc: stable(a)vger.kernel.org
Signed-off-by: Theodore Ts'o <tytso(a)mit.edu>
Link: https://lore.kernel.org/r/20210902164412.9994-2-shaoyi@amazon.com
Signed-off-by: Theodore Ts'o <tytso(a)mit.edu>
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 88d5d274a868..8a67e5f3f576 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3263,9 +3263,9 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
struct super_block *sb = elr->lr_super;
ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
ext4_group_t group = elr->lr_next_group;
- unsigned long timeout = 0;
unsigned int prefetch_ios = 0;
int ret = 0;
+ u64 start_time;
if (elr->lr_mode == EXT4_LI_MODE_PREFETCH_BBITMAP) {
elr->lr_next_group = ext4_mb_prefetch(sb, group,
@@ -3302,14 +3302,13 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
ret = 1;
if (!ret) {
- timeout = jiffies;
+ start_time = ktime_get_real_ns();
ret = ext4_init_inode_table(sb, group,
elr->lr_timeout ? 0 : 1);
trace_ext4_lazy_itable_init(sb, group);
if (elr->lr_timeout == 0) {
- timeout = (jiffies - timeout) *
- EXT4_SB(elr->lr_super)->s_li_wait_mult;
- elr->lr_timeout = timeout;
+ elr->lr_timeout = nsecs_to_jiffies((ktime_get_real_ns() - start_time) *
+ EXT4_SB(elr->lr_super)->s_li_wait_mult);
}
elr->lr_next_sched = jiffies + elr->lr_timeout;
elr->lr_next_group = group + 1;
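[Editor's note: a small hedged illustration of the jiffies/real-time
scaling the patch works around, in plain C. It mirrors the
nsecs_to_jiffies() conversion (roughly ns * HZ / 1e9) used by the fix:
jiffies are not a HZ-independent unit, so a duration should be measured
on a nanosecond clock first and converted to jiffies once.]

#include <stdio.h>

int main(void)
{
	const unsigned long long elapsed_ns = 50ULL * 1000 * 1000; /* 50 ms */
	const unsigned int hz_values[] = { 100, 250, 1000 };

	for (int i = 0; i < 3; i++) {
		unsigned int hz = hz_values[i];
		/* approximation of nsecs_to_jiffies(): ns * HZ / NSEC_PER_SEC */
		unsigned long long j = elapsed_ns * hz / 1000000000ULL;
		printf("HZ=%-4u: 50 ms corresponds to %llu jiffies\n", hz, j);
	}
	return 0;
}

The same wall time maps to 5, 12 or 50 jiffies depending on HZ, which
is why deriving lr_timeout from ktime_get_real_ns() and converting once
gives a schedule time that no longer depends on the kernel's HZ choice.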
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 39fec6889d15a658c3a3ebb06fd69d3584ddffd3 Mon Sep 17 00:00:00 2001
From: Shaoying Xu <shaoyi(a)amazon.com>
Date: Thu, 2 Sep 2021 16:44:12 +0000
Subject: [PATCH] ext4: fix lazy initialization next schedule time computation
in more granular unit
The ext4 file system sets up lazy inode table initialization by default
once it is mounted. However, it has an issue in computing the next
schedule time: the timeout ends up the same number of jiffies but a
different real time in seconds under different HZ values. Fix this by
measuring the elapsed time in a more granular unit, nanoseconds, and
make the next schedule time independent of the HZ value.
Fixes: bfff68738f1c ("ext4: add support for lazy inode table initialization")
Signed-off-by: Shaoying Xu <shaoyi(a)amazon.com>
Cc: stable(a)vger.kernel.org
Signed-off-by: Theodore Ts'o <tytso(a)mit.edu>
Link: https://lore.kernel.org/r/20210902164412.9994-2-shaoyi@amazon.com
Signed-off-by: Theodore Ts'o <tytso(a)mit.edu>
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 88d5d274a868..8a67e5f3f576 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3263,9 +3263,9 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
struct super_block *sb = elr->lr_super;
ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
ext4_group_t group = elr->lr_next_group;
- unsigned long timeout = 0;
unsigned int prefetch_ios = 0;
int ret = 0;
+ u64 start_time;
if (elr->lr_mode == EXT4_LI_MODE_PREFETCH_BBITMAP) {
elr->lr_next_group = ext4_mb_prefetch(sb, group,
@@ -3302,14 +3302,13 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
ret = 1;
if (!ret) {
- timeout = jiffies;
+ start_time = ktime_get_real_ns();
ret = ext4_init_inode_table(sb, group,
elr->lr_timeout ? 0 : 1);
trace_ext4_lazy_itable_init(sb, group);
if (elr->lr_timeout == 0) {
- timeout = (jiffies - timeout) *
- EXT4_SB(elr->lr_super)->s_li_wait_mult;
- elr->lr_timeout = timeout;
+ elr->lr_timeout = nsecs_to_jiffies((ktime_get_real_ns() - start_time) *
+ EXT4_SB(elr->lr_super)->s_li_wait_mult);
}
elr->lr_next_sched = jiffies + elr->lr_timeout;
elr->lr_next_group = group + 1;
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 39fec6889d15a658c3a3ebb06fd69d3584ddffd3 Mon Sep 17 00:00:00 2001
From: Shaoying Xu <shaoyi(a)amazon.com>
Date: Thu, 2 Sep 2021 16:44:12 +0000
Subject: [PATCH] ext4: fix lazy initialization next schedule time computation
in more granular unit
The ext4 file system sets up lazy inode table initialization by default
once it is mounted. However, it has an issue in computing the next
schedule time: the timeout ends up the same number of jiffies but a
different real time in seconds under different HZ values. Fix this by
measuring the elapsed time in a more granular unit, nanoseconds, and
make the next schedule time independent of the HZ value.
Fixes: bfff68738f1c ("ext4: add support for lazy inode table initialization")
Signed-off-by: Shaoying Xu <shaoyi(a)amazon.com>
Cc: stable(a)vger.kernel.org
Signed-off-by: Theodore Ts'o <tytso(a)mit.edu>
Link: https://lore.kernel.org/r/20210902164412.9994-2-shaoyi@amazon.com
Signed-off-by: Theodore Ts'o <tytso(a)mit.edu>
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 88d5d274a868..8a67e5f3f576 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3263,9 +3263,9 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
struct super_block *sb = elr->lr_super;
ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
ext4_group_t group = elr->lr_next_group;
- unsigned long timeout = 0;
unsigned int prefetch_ios = 0;
int ret = 0;
+ u64 start_time;
if (elr->lr_mode == EXT4_LI_MODE_PREFETCH_BBITMAP) {
elr->lr_next_group = ext4_mb_prefetch(sb, group,
@@ -3302,14 +3302,13 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
ret = 1;
if (!ret) {
- timeout = jiffies;
+ start_time = ktime_get_real_ns();
ret = ext4_init_inode_table(sb, group,
elr->lr_timeout ? 0 : 1);
trace_ext4_lazy_itable_init(sb, group);
if (elr->lr_timeout == 0) {
- timeout = (jiffies - timeout) *
- EXT4_SB(elr->lr_super)->s_li_wait_mult;
- elr->lr_timeout = timeout;
+ elr->lr_timeout = nsecs_to_jiffies((ktime_get_real_ns() - start_time) *
+ EXT4_SB(elr->lr_super)->s_li_wait_mult);
}
elr->lr_next_sched = jiffies + elr->lr_timeout;
elr->lr_next_group = group + 1;
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 39fec6889d15a658c3a3ebb06fd69d3584ddffd3 Mon Sep 17 00:00:00 2001
From: Shaoying Xu <shaoyi(a)amazon.com>
Date: Thu, 2 Sep 2021 16:44:12 +0000
Subject: [PATCH] ext4: fix lazy initialization next schedule time computation
in more granular unit
The ext4 file system sets up lazy inode table initialization by default
once it is mounted. However, it has an issue in computing the next
schedule time: the timeout ends up the same number of jiffies but a
different real time in seconds under different HZ values. Fix this by
measuring the elapsed time in a more granular unit, nanoseconds, and
make the next schedule time independent of the HZ value.
Fixes: bfff68738f1c ("ext4: add support for lazy inode table initialization")
Signed-off-by: Shaoying Xu <shaoyi(a)amazon.com>
Cc: stable(a)vger.kernel.org
Signed-off-by: Theodore Ts'o <tytso(a)mit.edu>
Link: https://lore.kernel.org/r/20210902164412.9994-2-shaoyi@amazon.com
Signed-off-by: Theodore Ts'o <tytso(a)mit.edu>
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 88d5d274a868..8a67e5f3f576 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3263,9 +3263,9 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
struct super_block *sb = elr->lr_super;
ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
ext4_group_t group = elr->lr_next_group;
- unsigned long timeout = 0;
unsigned int prefetch_ios = 0;
int ret = 0;
+ u64 start_time;
if (elr->lr_mode == EXT4_LI_MODE_PREFETCH_BBITMAP) {
elr->lr_next_group = ext4_mb_prefetch(sb, group,
@@ -3302,14 +3302,13 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
ret = 1;
if (!ret) {
- timeout = jiffies;
+ start_time = ktime_get_real_ns();
ret = ext4_init_inode_table(sb, group,
elr->lr_timeout ? 0 : 1);
trace_ext4_lazy_itable_init(sb, group);
if (elr->lr_timeout == 0) {
- timeout = (jiffies - timeout) *
- EXT4_SB(elr->lr_super)->s_li_wait_mult;
- elr->lr_timeout = timeout;
+ elr->lr_timeout = nsecs_to_jiffies((ktime_get_real_ns() - start_time) *
+ EXT4_SB(elr->lr_super)->s_li_wait_mult);
}
elr->lr_next_sched = jiffies + elr->lr_timeout;
elr->lr_next_group = group + 1;
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 39fec6889d15a658c3a3ebb06fd69d3584ddffd3 Mon Sep 17 00:00:00 2001
From: Shaoying Xu <shaoyi(a)amazon.com>
Date: Thu, 2 Sep 2021 16:44:12 +0000
Subject: [PATCH] ext4: fix lazy initialization next schedule time computation
in more granular unit
The ext4 file system sets up lazy inode table initialization by default
once it is mounted. However, it has an issue in computing the next
schedule time: the timeout ends up the same number of jiffies but a
different real time in seconds under different HZ values. Fix this by
measuring the elapsed time in a more granular unit, nanoseconds, and
make the next schedule time independent of the HZ value.
Fixes: bfff68738f1c ("ext4: add support for lazy inode table initialization")
Signed-off-by: Shaoying Xu <shaoyi(a)amazon.com>
Cc: stable(a)vger.kernel.org
Signed-off-by: Theodore Ts'o <tytso(a)mit.edu>
Link: https://lore.kernel.org/r/20210902164412.9994-2-shaoyi@amazon.com
Signed-off-by: Theodore Ts'o <tytso(a)mit.edu>
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 88d5d274a868..8a67e5f3f576 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3263,9 +3263,9 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
struct super_block *sb = elr->lr_super;
ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
ext4_group_t group = elr->lr_next_group;
- unsigned long timeout = 0;
unsigned int prefetch_ios = 0;
int ret = 0;
+ u64 start_time;
if (elr->lr_mode == EXT4_LI_MODE_PREFETCH_BBITMAP) {
elr->lr_next_group = ext4_mb_prefetch(sb, group,
@@ -3302,14 +3302,13 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
ret = 1;
if (!ret) {
- timeout = jiffies;
+ start_time = ktime_get_real_ns();
ret = ext4_init_inode_table(sb, group,
elr->lr_timeout ? 0 : 1);
trace_ext4_lazy_itable_init(sb, group);
if (elr->lr_timeout == 0) {
- timeout = (jiffies - timeout) *
- EXT4_SB(elr->lr_super)->s_li_wait_mult;
- elr->lr_timeout = timeout;
+ elr->lr_timeout = nsecs_to_jiffies((ktime_get_real_ns() - start_time) *
+ EXT4_SB(elr->lr_super)->s_li_wait_mult);
}
elr->lr_next_sched = jiffies + elr->lr_timeout;
elr->lr_next_group = group + 1;
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 3ab7992018455ac63c33e9b3eaa7264e293e40f4 Mon Sep 17 00:00:00 2001
From: Pavel Skripkin <paskripkin(a)gmail.com>
Date: Sun, 24 Oct 2021 17:03:15 +0300
Subject: [PATCH] ALSA: mixer: fix deadlock in snd_mixer_oss_set_volume
Commit 411cef6adfb3 ("ALSA: mixer: oss: Fix racy access to slots")
added mutex protection in snd_mixer_oss_set_volume(). The second
mutex_lock() in the same function looks like a typo; fix it.
Reported-by: syzbot+ace149a75a9a0a399ac7(a)syzkaller.appspotmail.com
Fixes: 411cef6adfb3 ("ALSA: mixer: oss: Fix racy access to slots")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Pavel Skripkin <paskripkin(a)gmail.com>
Link: https://lore.kernel.org/r/20211024140315.16704-1-paskripkin@gmail.com
Signed-off-by: Takashi Iwai <tiwai(a)suse.de>
diff --git a/sound/core/oss/mixer_oss.c b/sound/core/oss/mixer_oss.c
index d5ddc154a735..9620115cfdc0 100644
--- a/sound/core/oss/mixer_oss.c
+++ b/sound/core/oss/mixer_oss.c
@@ -313,7 +313,7 @@ static int snd_mixer_oss_set_volume(struct snd_mixer_oss_file *fmixer,
pslot->volume[1] = right;
result = (left & 0xff) | ((right & 0xff) << 8);
unlock:
- mutex_lock(&mixer->reg_mutex);
+ mutex_unlock(&mixer->reg_mutex);
return result;
}
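[Editor's note: in essence the fix restores the usual lock/unlock
pairing. A hedged minimal sketch of the pattern, with hypothetical
names rather than the actual driver code: every exit path after
mutex_lock() must call mutex_unlock(), and a second mutex_lock() on the
same non-recursive mutex self-deadlocks.]

#include <linux/mutex.h>

static DEFINE_MUTEX(example_reg_mutex);

static int example_set_volume(int left, int right)
{
	int result;

	mutex_lock(&example_reg_mutex);
	/* ... validate the slot and store left/right ... */
	result = (left & 0xff) | ((right & 0xff) << 8);
	mutex_unlock(&example_reg_mutex); /* mutex_lock() here would deadlock */
	return result;
}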
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 3ab7992018455ac63c33e9b3eaa7264e293e40f4 Mon Sep 17 00:00:00 2001
From: Pavel Skripkin <paskripkin(a)gmail.com>
Date: Sun, 24 Oct 2021 17:03:15 +0300
Subject: [PATCH] ALSA: mixer: fix deadlock in snd_mixer_oss_set_volume
Commit 411cef6adfb3 ("ALSA: mixer: oss: Fix racy access to slots")
added mutex protection in snd_mixer_oss_set_volume(). The second
mutex_lock() in the same function looks like a typo; fix it.
Reported-by: syzbot+ace149a75a9a0a399ac7(a)syzkaller.appspotmail.com
Fixes: 411cef6adfb3 ("ALSA: mixer: oss: Fix racy access to slots")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Pavel Skripkin <paskripkin(a)gmail.com>
Link: https://lore.kernel.org/r/20211024140315.16704-1-paskripkin@gmail.com
Signed-off-by: Takashi Iwai <tiwai(a)suse.de>
diff --git a/sound/core/oss/mixer_oss.c b/sound/core/oss/mixer_oss.c
index d5ddc154a735..9620115cfdc0 100644
--- a/sound/core/oss/mixer_oss.c
+++ b/sound/core/oss/mixer_oss.c
@@ -313,7 +313,7 @@ static int snd_mixer_oss_set_volume(struct snd_mixer_oss_file *fmixer,
pslot->volume[1] = right;
result = (left & 0xff) | ((right & 0xff) << 8);
unlock:
- mutex_lock(&mixer->reg_mutex);
+ mutex_unlock(&mixer->reg_mutex);
return result;
}
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 3ab7992018455ac63c33e9b3eaa7264e293e40f4 Mon Sep 17 00:00:00 2001
From: Pavel Skripkin <paskripkin(a)gmail.com>
Date: Sun, 24 Oct 2021 17:03:15 +0300
Subject: [PATCH] ALSA: mixer: fix deadlock in snd_mixer_oss_set_volume
Commit 411cef6adfb3 ("ALSA: mixer: oss: Fix racy access to slots")
added mutex protection in snd_mixer_oss_set_volume(). The second
mutex_lock() in the same function looks like a typo; fix it.
Reported-by: syzbot+ace149a75a9a0a399ac7(a)syzkaller.appspotmail.com
Fixes: 411cef6adfb3 ("ALSA: mixer: oss: Fix racy access to slots")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Pavel Skripkin <paskripkin(a)gmail.com>
Link: https://lore.kernel.org/r/20211024140315.16704-1-paskripkin@gmail.com
Signed-off-by: Takashi Iwai <tiwai(a)suse.de>
diff --git a/sound/core/oss/mixer_oss.c b/sound/core/oss/mixer_oss.c
index d5ddc154a735..9620115cfdc0 100644
--- a/sound/core/oss/mixer_oss.c
+++ b/sound/core/oss/mixer_oss.c
@@ -313,7 +313,7 @@ static int snd_mixer_oss_set_volume(struct snd_mixer_oss_file *fmixer,
pslot->volume[1] = right;
result = (left & 0xff) | ((right & 0xff) << 8);
unlock:
- mutex_lock(&mixer->reg_mutex);
+ mutex_unlock(&mixer->reg_mutex);
return result;
}
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 3ab7992018455ac63c33e9b3eaa7264e293e40f4 Mon Sep 17 00:00:00 2001
From: Pavel Skripkin <paskripkin(a)gmail.com>
Date: Sun, 24 Oct 2021 17:03:15 +0300
Subject: [PATCH] ALSA: mixer: fix deadlock in snd_mixer_oss_set_volume
Commit 411cef6adfb3 ("ALSA: mixer: oss: Fix racy access to slots")
added mutex protection in snd_mixer_oss_set_volume(). The second
mutex_lock() in the same function looks like a typo; fix it.
Reported-by: syzbot+ace149a75a9a0a399ac7(a)syzkaller.appspotmail.com
Fixes: 411cef6adfb3 ("ALSA: mixer: oss: Fix racy access to slots")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Pavel Skripkin <paskripkin(a)gmail.com>
Link: https://lore.kernel.org/r/20211024140315.16704-1-paskripkin@gmail.com
Signed-off-by: Takashi Iwai <tiwai(a)suse.de>
diff --git a/sound/core/oss/mixer_oss.c b/sound/core/oss/mixer_oss.c
index d5ddc154a735..9620115cfdc0 100644
--- a/sound/core/oss/mixer_oss.c
+++ b/sound/core/oss/mixer_oss.c
@@ -313,7 +313,7 @@ static int snd_mixer_oss_set_volume(struct snd_mixer_oss_file *fmixer,
pslot->volume[1] = right;
result = (left & 0xff) | ((right & 0xff) << 8);
unlock:
- mutex_lock(&mixer->reg_mutex);
+ mutex_unlock(&mixer->reg_mutex);
return result;
}
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 3ab7992018455ac63c33e9b3eaa7264e293e40f4 Mon Sep 17 00:00:00 2001
From: Pavel Skripkin <paskripkin(a)gmail.com>
Date: Sun, 24 Oct 2021 17:03:15 +0300
Subject: [PATCH] ALSA: mixer: fix deadlock in snd_mixer_oss_set_volume
Commit 411cef6adfb3 ("ALSA: mixer: oss: Fix racy access to slots")
added mutex protection in snd_mixer_oss_set_volume(). The second
mutex_lock() in the same function looks like a typo; fix it.
Reported-by: syzbot+ace149a75a9a0a399ac7(a)syzkaller.appspotmail.com
Fixes: 411cef6adfb3 ("ALSA: mixer: oss: Fix racy access to slots")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Pavel Skripkin <paskripkin(a)gmail.com>
Link: https://lore.kernel.org/r/20211024140315.16704-1-paskripkin@gmail.com
Signed-off-by: Takashi Iwai <tiwai(a)suse.de>
diff --git a/sound/core/oss/mixer_oss.c b/sound/core/oss/mixer_oss.c
index d5ddc154a735..9620115cfdc0 100644
--- a/sound/core/oss/mixer_oss.c
+++ b/sound/core/oss/mixer_oss.c
@@ -313,7 +313,7 @@ static int snd_mixer_oss_set_volume(struct snd_mixer_oss_file *fmixer,
pslot->volume[1] = right;
result = (left & 0xff) | ((right & 0xff) << 8);
unlock:
- mutex_lock(&mixer->reg_mutex);
+ mutex_unlock(&mixer->reg_mutex);
return result;
}
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 3ab7992018455ac63c33e9b3eaa7264e293e40f4 Mon Sep 17 00:00:00 2001
From: Pavel Skripkin <paskripkin(a)gmail.com>
Date: Sun, 24 Oct 2021 17:03:15 +0300
Subject: [PATCH] ALSA: mixer: fix deadlock in snd_mixer_oss_set_volume
Commit 411cef6adfb3 ("ALSA: mixer: oss: Fix racy access to slots")
added mutex protection in snd_mixer_oss_set_volume(). The second
mutex_lock() in the same function looks like a typo; fix it.
Reported-by: syzbot+ace149a75a9a0a399ac7(a)syzkaller.appspotmail.com
Fixes: 411cef6adfb3 ("ALSA: mixer: oss: Fix racy access to slots")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Pavel Skripkin <paskripkin(a)gmail.com>
Link: https://lore.kernel.org/r/20211024140315.16704-1-paskripkin@gmail.com
Signed-off-by: Takashi Iwai <tiwai(a)suse.de>
diff --git a/sound/core/oss/mixer_oss.c b/sound/core/oss/mixer_oss.c
index d5ddc154a735..9620115cfdc0 100644
--- a/sound/core/oss/mixer_oss.c
+++ b/sound/core/oss/mixer_oss.c
@@ -313,7 +313,7 @@ static int snd_mixer_oss_set_volume(struct snd_mixer_oss_file *fmixer,
pslot->volume[1] = right;
result = (left & 0xff) | ((right & 0xff) << 8);
unlock:
- mutex_lock(&mixer->reg_mutex);
+ mutex_unlock(&mixer->reg_mutex);
return result;
}
The CONSOLE_POLL mode is used for tools like k(g)db. In this kind of
setup, the polled port often shares a serial device with the normal
system console. This is usually no problem because the polling helpers
can consume input values directly (when in kgdb context) and the normal
Linux handlers can only consume new input values after kgdb has
switched back.
This is no longer true when RX DMA is enabled for UARTDM controllers.
Single input values can no longer be received correctly. Instead, the
following seems to happen:
* on the 1st input, some old input is read (continuously)
* on the 2nd input, two old input values are read (continuously)
* on the 3rd input, three old input values are read (continuously)
* on the 4th input, the 4 previous input values are received
This then repeats for each group of 4 input values.
The behavior changes slightly depending on what state the controller
was in when the first input was received. But it makes working with
kgdb basically impossible because control messages are always corrupted
when kgdboc tries to parse them.
RX DMA should therefore be off when CONSOLE_POLL is enabled to avoid
these kinds of problems. No such problem was noticed for TX DMA.
Cc: stable(a)vger.kernel.org
Fixes: 99693945013a ("tty: serial: msm: Add RX DMA support")
Signed-off-by: Sven Eckelmann <sven(a)narfation.org>
---
I've already described this problem in a mail two weeks ago
https://lore.kernel.org/all/4119639.d4o71su6xY@sven-desktop/
There was no feedback so I would like to propose the minimal version which
was described in that mail.
---
drivers/tty/serial/msm_serial.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/tty/serial/msm_serial.c b/drivers/tty/serial/msm_serial.c
index fcef7a961430..489d19274f9a 100644
--- a/drivers/tty/serial/msm_serial.c
+++ b/drivers/tty/serial/msm_serial.c
@@ -598,6 +598,9 @@ static void msm_start_rx_dma(struct msm_port *msm_port)
u32 val;
int ret;
+ if (IS_ENABLED(CONFIG_CONSOLE_POLL))
+ return;
+
if (!dma->chan)
return;
--
2.30.2
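[Editor's note: the guard above relies on IS_ENABLED(), which folds to
a compile-time constant 0/1, so with CONFIG_CONSOLE_POLL set the DMA
setup path becomes dead code the compiler can drop. A hedged minimal
illustration of the pattern, with hypothetical names:]

#include <linux/kconfig.h>

struct example_port { /* ... */ };

static void example_start_rx_dma(struct example_port *port)
{
	if (IS_ENABLED(CONFIG_CONSOLE_POLL))
		return;	/* keep polled debuggers such as kgdb working */
	/* ... program and submit the RX DMA descriptor ... */
}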
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 6e866a462867b60841202e900f10936a0478608c Mon Sep 17 00:00:00 2001
From: Helge Deller <deller(a)gmx.de>
Date: Sun, 31 Oct 2021 21:58:12 +0100
Subject: [PATCH] parisc: Fix set_fixmap() on PA1.x CPUs
Fix a kernel crash which happens on PA1.x CPUs while initializing the
FTRACE/KPROBE breakpoints. The PTE table entries for the fixmap area
were not created correctly.
Signed-off-by: Helge Deller <deller(a)gmx.de>
Fixes: ccfbc68d41c2 ("parisc: add set_fixmap()/clear_fixmap()")
Cc: stable(a)vger.kernel.org # v5.2+
diff --git a/arch/parisc/mm/fixmap.c b/arch/parisc/mm/fixmap.c
index 24426a7e1a5e..cc15d737fda6 100644
--- a/arch/parisc/mm/fixmap.c
+++ b/arch/parisc/mm/fixmap.c
@@ -20,12 +20,9 @@ void notrace set_fixmap(enum fixed_addresses idx, phys_addr_t phys)
pte_t *pte;
if (pmd_none(*pmd))
- pmd = pmd_alloc(NULL, pud, vaddr);
-
- pte = pte_offset_kernel(pmd, vaddr);
- if (pte_none(*pte))
pte = pte_alloc_kernel(pmd, vaddr);
+ pte = pte_offset_kernel(pmd, vaddr);
set_pte_at(&init_mm, vaddr, pte, __mk_pte(phys, PAGE_KERNEL_RWX));
flush_tlb_kernel_range(vaddr, vaddr + PAGE_SIZE);
}
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From cd8a36a90babf958082b87bc6b4df5dd70901eba Mon Sep 17 00:00:00 2001
From: James Smart <jsmart2021(a)gmail.com>
Date: Fri, 10 Sep 2021 16:31:53 -0700
Subject: [PATCH] scsi: lpfc: Fix FCP I/O flush functionality for TMF routines
A prior patch inadvertently caused lpfc_sli_sum_iocb() to exclude counting
of outstanding aborted I/Os and ABORT IOCBs. Thus,
lpfc_reset_flush_io_context(), called from any TMF routine, does not
properly wait to flush all outstanding FCP IOCBs, leading to a block
layer crash on an invalid scsi_cmnd->request pointer.
kernel BUG at ../block/blk-core.c:1489!
RIP: 0010:blk_requeue_request+0xaf/0xc0
...
Call Trace:
<IRQ>
__scsi_queue_insert+0x90/0xe0 [scsi_mod]
blk_done_softirq+0x7e/0x90
__do_softirq+0xd2/0x280
irq_exit+0xd5/0xe0
do_IRQ+0x4c/0xd0
common_interrupt+0x87/0x87
</IRQ>
Fix by separating out the LPFC_IO_FCP, LPFC_IO_ON_TXCMPLQ,
LPFC_DRIVER_ABORTED, and CMD_ABORT_XRI_CN || CMD_CLOSE_XRI_CN checks into a
new lpfc_sli_validate_fcp_iocb_for_abort() routine when determining to
build an ABORT iocb.
Restore lpfc_reset_flush_io_context() functionality by including counting
of outstanding aborted IOCBs and ABORT IOCBs in lpfc_sli_sum_iocb().
Link: https://lore.kernel.org/r/20210910233159.115896-9-jsmart2021@gmail.com
Fixes: e1364711359f ("scsi: lpfc: Fix illegal memory access on Abort IOCBs")
Cc: <stable(a)vger.kernel.org> # v5.12+
Co-developed-by: Justin Tee <justin.tee(a)broadcom.com>
Signed-off-by: Justin Tee <justin.tee(a)broadcom.com>
Signed-off-by: James Smart <jsmart2021(a)gmail.com>
Signed-off-by: Martin K. Petersen <martin.petersen(a)oracle.com>
diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c
index 546c851938bc..e8f6ad484768 100644
--- a/drivers/scsi/lpfc/lpfc_sli.c
+++ b/drivers/scsi/lpfc/lpfc_sli.c
@@ -12485,15 +12485,54 @@ lpfc_sli_hba_iocb_abort(struct lpfc_hba *phba)
}
/**
- * lpfc_sli_validate_fcp_iocb - find commands associated with a vport or LUN
+ * lpfc_sli_validate_fcp_iocb_for_abort - filter iocbs appropriate for FCP aborts
+ * @iocbq: Pointer to iocb object.
+ * @vport: Pointer to driver virtual port object.
+ *
+ * This function acts as an iocb filter for functions which abort FCP iocbs.
+ *
+ * Return values
+ * -ENODEV, if a null iocb or vport ptr is encountered
+ * -EINVAL, if the iocb is not an FCP I/O, not on the TX cmpl queue, premarked as
+ * driver already started the abort process, or is an abort iocb itself
+ * 0, passes criteria for aborting the FCP I/O iocb
+ **/
+static int
+lpfc_sli_validate_fcp_iocb_for_abort(struct lpfc_iocbq *iocbq,
+ struct lpfc_vport *vport)
+{
+ IOCB_t *icmd = NULL;
+
+ /* No null ptr vports */
+ if (!iocbq || iocbq->vport != vport)
+ return -ENODEV;
+
+ /* iocb must be for FCP IO, already exists on the TX cmpl queue,
+ * can't be premarked as driver aborted, nor be an ABORT iocb itself
+ */
+ icmd = &iocbq->iocb;
+ if (!(iocbq->iocb_flag & LPFC_IO_FCP) ||
+ !(iocbq->iocb_flag & LPFC_IO_ON_TXCMPLQ) ||
+ (iocbq->iocb_flag & LPFC_DRIVER_ABORTED) ||
+ (icmd->ulpCommand == CMD_ABORT_XRI_CN ||
+ icmd->ulpCommand == CMD_CLOSE_XRI_CN))
+ return -EINVAL;
+
+ return 0;
+}
+
+/**
+ * lpfc_sli_validate_fcp_iocb - validate commands associated with a SCSI target
* @iocbq: Pointer to driver iocb object.
* @vport: Pointer to driver virtual port object.
* @tgt_id: SCSI ID of the target.
* @lun_id: LUN ID of the scsi device.
* @ctx_cmd: LPFC_CTX_LUN/LPFC_CTX_TGT/LPFC_CTX_HOST
*
- * This function acts as an iocb filter for functions which abort or count
- * all FCP iocbs pending on a lun/SCSI target/SCSI host. It will return
+ * This function acts as an iocb filter for validating a lun/SCSI target/SCSI
+ * host.
+ *
+ * It will return
* 0 if the filtering criteria is met for the given iocb and will return
* 1 if the filtering criteria is not met.
* If ctx_cmd == LPFC_CTX_LUN, the function returns 0 only if the
@@ -12512,22 +12551,8 @@ lpfc_sli_validate_fcp_iocb(struct lpfc_iocbq *iocbq, struct lpfc_vport *vport,
lpfc_ctx_cmd ctx_cmd)
{
struct lpfc_io_buf *lpfc_cmd;
- IOCB_t *icmd = NULL;
int rc = 1;
- if (!iocbq || iocbq->vport != vport)
- return rc;
-
- if (!(iocbq->iocb_flag & LPFC_IO_FCP) ||
- !(iocbq->iocb_flag & LPFC_IO_ON_TXCMPLQ) ||
- iocbq->iocb_flag & LPFC_DRIVER_ABORTED)
- return rc;
-
- icmd = &iocbq->iocb;
- if (icmd->ulpCommand == CMD_ABORT_XRI_CN ||
- icmd->ulpCommand == CMD_CLOSE_XRI_CN)
- return rc;
-
lpfc_cmd = container_of(iocbq, struct lpfc_io_buf, cur_iocbq);
if (lpfc_cmd->pCmd == NULL)
@@ -12582,17 +12607,33 @@ lpfc_sli_sum_iocb(struct lpfc_vport *vport, uint16_t tgt_id, uint64_t lun_id,
{
struct lpfc_hba *phba = vport->phba;
struct lpfc_iocbq *iocbq;
+ IOCB_t *icmd = NULL;
int sum, i;
+ unsigned long iflags;
- spin_lock_irq(&phba->hbalock);
+ spin_lock_irqsave(&phba->hbalock, iflags);
for (i = 1, sum = 0; i <= phba->sli.last_iotag; i++) {
iocbq = phba->sli.iocbq_lookup[i];
- if (lpfc_sli_validate_fcp_iocb (iocbq, vport, tgt_id, lun_id,
- ctx_cmd) == 0)
+ if (!iocbq || iocbq->vport != vport)
+ continue;
+ if (!(iocbq->iocb_flag & LPFC_IO_FCP) ||
+ !(iocbq->iocb_flag & LPFC_IO_ON_TXCMPLQ))
+ continue;
+
+ /* Include counting outstanding aborts */
+ icmd = &iocbq->iocb;
+ if (icmd->ulpCommand == CMD_ABORT_XRI_CN ||
+ icmd->ulpCommand == CMD_CLOSE_XRI_CN) {
+ sum++;
+ continue;
+ }
+
+ if (lpfc_sli_validate_fcp_iocb(iocbq, vport, tgt_id, lun_id,
+ ctx_cmd) == 0)
sum++;
}
- spin_unlock_irq(&phba->hbalock);
+ spin_unlock_irqrestore(&phba->hbalock, iflags);
return sum;
}
@@ -12659,7 +12700,11 @@ lpfc_sli_abort_fcp_cmpl(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
*
* This function sends an abort command for every SCSI command
* associated with the given virtual port pending on the ring
- * filtered by lpfc_sli_validate_fcp_iocb function.
+ * filtered by lpfc_sli_validate_fcp_iocb_for_abort and then
+ * lpfc_sli_validate_fcp_iocb function. The ordering for validation before
+ * submitting abort iocbs must be lpfc_sli_validate_fcp_iocb_for_abort
+ * followed by lpfc_sli_validate_fcp_iocb.
+ *
* When abort_cmd == LPFC_CTX_LUN, the function sends abort only to the
* FCP iocbs associated with lun specified by tgt_id and lun_id
* parameters
@@ -12691,6 +12736,9 @@ lpfc_sli_abort_iocb(struct lpfc_vport *vport, u16 tgt_id, u64 lun_id,
for (i = 1; i <= phba->sli.last_iotag; i++) {
iocbq = phba->sli.iocbq_lookup[i];
+ if (lpfc_sli_validate_fcp_iocb_for_abort(iocbq, vport))
+ continue;
+
if (lpfc_sli_validate_fcp_iocb(iocbq, vport, tgt_id, lun_id,
abort_cmd) != 0)
continue;
@@ -12723,7 +12771,11 @@ lpfc_sli_abort_iocb(struct lpfc_vport *vport, u16 tgt_id, u64 lun_id,
*
* This function sends an abort command for every SCSI command
* associated with the given virtual port pending on the ring
- * filtered by lpfc_sli_validate_fcp_iocb function.
+ * filtered by lpfc_sli_validate_fcp_iocb_for_abort and then
+ * lpfc_sli_validate_fcp_iocb function. The ordering for validation before
+ * submitting abort iocbs must be lpfc_sli_validate_fcp_iocb_for_abort
+ * followed by lpfc_sli_validate_fcp_iocb.
+ *
* When taskmgmt_cmd == LPFC_CTX_LUN, the function sends abort only to the
* FCP iocbs associated with lun specified by tgt_id and lun_id
* parameters
@@ -12761,6 +12813,9 @@ lpfc_sli_abort_taskmgmt(struct lpfc_vport *vport, struct lpfc_sli_ring *pring,
for (i = 1; i <= phba->sli.last_iotag; i++) {
iocbq = phba->sli.iocbq_lookup[i];
+ if (lpfc_sli_validate_fcp_iocb_for_abort(iocbq, vport))
+ continue;
+
if (lpfc_sli_validate_fcp_iocb(iocbq, vport, tgt_id, lun_id,
cmd) != 0)
continue;
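For readers outside the driver, the ordering constraint in the hunks above can be made concrete with a small self-contained C sketch. This is not lpfc code: the struct fields and helpers below are illustrative stand-ins for the driver's flags and routines (LPFC_IO_FCP, LPFC_IO_ON_TXCMPLQ, LPFC_DRIVER_ABORTED, CMD_ABORT_XRI_CN/CMD_CLOSE_XRI_CN), showing only the pattern of an abort-eligibility filter running before the lun/target/host context filter.
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
/* Illustrative stand-ins for driver iocb state (assumed, not lpfc code). */
struct iocb {
    bool is_fcp;         /* LPFC_IO_FCP set                 */
    bool on_txcmplq;     /* LPFC_IO_ON_TXCMPLQ set          */
    bool driver_aborted; /* LPFC_DRIVER_ABORTED set         */
    bool is_abort_cmd;   /* ABORT_XRI_CN / CLOSE_XRI_CN     */
    int tgt_id;
};
/* Stage 1: abort eligibility, after lpfc_sli_validate_fcp_iocb_for_abort(). */
static int validate_for_abort(const struct iocb *io)
{
    if (!io)
        return -ENODEV;
    if (!io->is_fcp || !io->on_txcmplq ||
        io->driver_aborted || io->is_abort_cmd)
        return -EINVAL;
    return 0;
}
/* Stage 2: context match (target id only here), after the slimmed-down
 * lpfc_sli_validate_fcp_iocb(); 0 means the filter criteria are met. */
static int validate_context(const struct iocb *io, int tgt_id)
{
    return io->tgt_id == tgt_id ? 0 : 1;
}
int main(void)
{
    struct iocb table[] = {
        { true, true, false, false, 1 }, /* abortable I/O     */
        { true, true, true,  false, 1 }, /* abort in progress */
        { true, true, false, true,  1 }, /* an ABORT iocb     */
    };
    int aborts = 0;
    for (unsigned int i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
        /* Eligibility first, context second: the required ordering. */
        if (validate_for_abort(&table[i]))
            continue;
        if (validate_context(&table[i], 1) != 0)
            continue;
        aborts++; /* the driver would build an abort iocb here */
    }
    printf("would abort %d command(s)\n", aborts); /* prints 1 */
    return 0;
}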
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From cd8a36a90babf958082b87bc6b4df5dd70901eba Mon Sep 17 00:00:00 2001
From: James Smart <jsmart2021(a)gmail.com>
Date: Fri, 10 Sep 2021 16:31:53 -0700
Subject: [PATCH] scsi: lpfc: Fix FCP I/O flush functionality for TMF routines
A prior patch inadvertently caused lpfc_sli_sum_iocb() to exclude counting
of outstanding aborted I/Os and ABORT IOCBs. Thus,
lpfc_reset_flush_io_context() called from any TMF routine does not properly
wait to flush all outstanding FCP IOCBs leading to a block layer crash on
an invalid scsi_cmnd->request pointer.
kernel BUG at ../block/blk-core.c:1489!
RIP: 0010:blk_requeue_request+0xaf/0xc0
...
Call Trace:
<IRQ>
__scsi_queue_insert+0x90/0xe0 [scsi_mod]
blk_done_softirq+0x7e/0x90
__do_softirq+0xd2/0x280
irq_exit+0xd5/0xe0
do_IRQ+0x4c/0xd0
common_interrupt+0x87/0x87
</IRQ>
Fix by separating out the LPFC_IO_FCP, LPFC_IO_ON_TXCMPLQ,
LPFC_DRIVER_ABORTED, and CMD_ABORT_XRI_CN || CMD_CLOSE_XRI_CN checks into a
new lpfc_sli_validate_fcp_iocb_for_abort() routine when determining to
build an ABORT iocb.
Restore lpfc_reset_flush_io_context() functionality by including counting
of outstanding aborted IOCBs and ABORT IOCBs in lpfc_sli_sum_iocb().
Link: https://lore.kernel.org/r/20210910233159.115896-9-jsmart2021@gmail.com
Fixes: e1364711359f ("scsi: lpfc: Fix illegal memory access on Abort IOCBs")
Cc: <stable(a)vger.kernel.org> # v5.12+
Co-developed-by: Justin Tee <justin.tee(a)broadcom.com>
Signed-off-by: Justin Tee <justin.tee(a)broadcom.com>
Signed-off-by: James Smart <jsmart2021(a)gmail.com>
Signed-off-by: Martin K. Petersen <martin.petersen(a)oracle.com>
diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c
index 546c851938bc..e8f6ad484768 100644
--- a/drivers/scsi/lpfc/lpfc_sli.c
+++ b/drivers/scsi/lpfc/lpfc_sli.c
@@ -12485,15 +12485,54 @@ lpfc_sli_hba_iocb_abort(struct lpfc_hba *phba)
}
/**
- * lpfc_sli_validate_fcp_iocb - find commands associated with a vport or LUN
+ * lpfc_sli_validate_fcp_iocb_for_abort - filter iocbs appropriate for FCP aborts
+ * @iocbq: Pointer to iocb object.
+ * @vport: Pointer to driver virtual port object.
+ *
+ * This function acts as an iocb filter for functions which abort FCP iocbs.
+ *
+ * Return values
+ * -ENODEV, if a null iocb or vport ptr is encountered
+ * -EINVAL, if the iocb is not an FCP I/O, not on the TX cmpl queue, premarked as
+ * driver already started the abort process, or is an abort iocb itself
+ * 0, passes criteria for aborting the FCP I/O iocb
+ **/
+static int
+lpfc_sli_validate_fcp_iocb_for_abort(struct lpfc_iocbq *iocbq,
+ struct lpfc_vport *vport)
+{
+ IOCB_t *icmd = NULL;
+
+ /* No null ptr vports */
+ if (!iocbq || iocbq->vport != vport)
+ return -ENODEV;
+
+ /* iocb must be for FCP IO, already exists on the TX cmpl queue,
+ * can't be premarked as driver aborted, nor be an ABORT iocb itself
+ */
+ icmd = &iocbq->iocb;
+ if (!(iocbq->iocb_flag & LPFC_IO_FCP) ||
+ !(iocbq->iocb_flag & LPFC_IO_ON_TXCMPLQ) ||
+ (iocbq->iocb_flag & LPFC_DRIVER_ABORTED) ||
+ (icmd->ulpCommand == CMD_ABORT_XRI_CN ||
+ icmd->ulpCommand == CMD_CLOSE_XRI_CN))
+ return -EINVAL;
+
+ return 0;
+}
+
+/**
+ * lpfc_sli_validate_fcp_iocb - validate commands associated with a SCSI target
* @iocbq: Pointer to driver iocb object.
* @vport: Pointer to driver virtual port object.
* @tgt_id: SCSI ID of the target.
* @lun_id: LUN ID of the scsi device.
* @ctx_cmd: LPFC_CTX_LUN/LPFC_CTX_TGT/LPFC_CTX_HOST
*
- * This function acts as an iocb filter for functions which abort or count
- * all FCP iocbs pending on a lun/SCSI target/SCSI host. It will return
+ * This function acts as an iocb filter for validating a lun/SCSI target/SCSI
+ * host.
+ *
+ * It will return
* 0 if the filtering criteria is met for the given iocb and will return
* 1 if the filtering criteria is not met.
* If ctx_cmd == LPFC_CTX_LUN, the function returns 0 only if the
@@ -12512,22 +12551,8 @@ lpfc_sli_validate_fcp_iocb(struct lpfc_iocbq *iocbq, struct lpfc_vport *vport,
lpfc_ctx_cmd ctx_cmd)
{
struct lpfc_io_buf *lpfc_cmd;
- IOCB_t *icmd = NULL;
int rc = 1;
- if (!iocbq || iocbq->vport != vport)
- return rc;
-
- if (!(iocbq->iocb_flag & LPFC_IO_FCP) ||
- !(iocbq->iocb_flag & LPFC_IO_ON_TXCMPLQ) ||
- iocbq->iocb_flag & LPFC_DRIVER_ABORTED)
- return rc;
-
- icmd = &iocbq->iocb;
- if (icmd->ulpCommand == CMD_ABORT_XRI_CN ||
- icmd->ulpCommand == CMD_CLOSE_XRI_CN)
- return rc;
-
lpfc_cmd = container_of(iocbq, struct lpfc_io_buf, cur_iocbq);
if (lpfc_cmd->pCmd == NULL)
@@ -12582,17 +12607,33 @@ lpfc_sli_sum_iocb(struct lpfc_vport *vport, uint16_t tgt_id, uint64_t lun_id,
{
struct lpfc_hba *phba = vport->phba;
struct lpfc_iocbq *iocbq;
+ IOCB_t *icmd = NULL;
int sum, i;
+ unsigned long iflags;
- spin_lock_irq(&phba->hbalock);
+ spin_lock_irqsave(&phba->hbalock, iflags);
for (i = 1, sum = 0; i <= phba->sli.last_iotag; i++) {
iocbq = phba->sli.iocbq_lookup[i];
- if (lpfc_sli_validate_fcp_iocb (iocbq, vport, tgt_id, lun_id,
- ctx_cmd) == 0)
+ if (!iocbq || iocbq->vport != vport)
+ continue;
+ if (!(iocbq->iocb_flag & LPFC_IO_FCP) ||
+ !(iocbq->iocb_flag & LPFC_IO_ON_TXCMPLQ))
+ continue;
+
+ /* Include counting outstanding aborts */
+ icmd = &iocbq->iocb;
+ if (icmd->ulpCommand == CMD_ABORT_XRI_CN ||
+ icmd->ulpCommand == CMD_CLOSE_XRI_CN) {
+ sum++;
+ continue;
+ }
+
+ if (lpfc_sli_validate_fcp_iocb(iocbq, vport, tgt_id, lun_id,
+ ctx_cmd) == 0)
sum++;
}
- spin_unlock_irq(&phba->hbalock);
+ spin_unlock_irqrestore(&phba->hbalock, iflags);
return sum;
}
@@ -12659,7 +12700,11 @@ lpfc_sli_abort_fcp_cmpl(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
*
* This function sends an abort command for every SCSI command
* associated with the given virtual port pending on the ring
- * filtered by lpfc_sli_validate_fcp_iocb function.
+ * filtered by lpfc_sli_validate_fcp_iocb_for_abort and then
+ * lpfc_sli_validate_fcp_iocb function. The ordering for validation before
+ * submitting abort iocbs must be lpfc_sli_validate_fcp_iocb_for_abort
+ * followed by lpfc_sli_validate_fcp_iocb.
+ *
* When abort_cmd == LPFC_CTX_LUN, the function sends abort only to the
* FCP iocbs associated with lun specified by tgt_id and lun_id
* parameters
@@ -12691,6 +12736,9 @@ lpfc_sli_abort_iocb(struct lpfc_vport *vport, u16 tgt_id, u64 lun_id,
for (i = 1; i <= phba->sli.last_iotag; i++) {
iocbq = phba->sli.iocbq_lookup[i];
+ if (lpfc_sli_validate_fcp_iocb_for_abort(iocbq, vport))
+ continue;
+
if (lpfc_sli_validate_fcp_iocb(iocbq, vport, tgt_id, lun_id,
abort_cmd) != 0)
continue;
@@ -12723,7 +12771,11 @@ lpfc_sli_abort_iocb(struct lpfc_vport *vport, u16 tgt_id, u64 lun_id,
*
* This function sends an abort command for every SCSI command
* associated with the given virtual port pending on the ring
- * filtered by lpfc_sli_validate_fcp_iocb function.
+ * filtered by lpfc_sli_validate_fcp_iocb_for_abort and then
+ * lpfc_sli_validate_fcp_iocb function. The ordering for validation before
+ * submitting abort iocbs must be lpfc_sli_validate_fcp_iocb_for_abort
+ * followed by lpfc_sli_validate_fcp_iocb.
+ *
* When taskmgmt_cmd == LPFC_CTX_LUN, the function sends abort only to the
* FCP iocbs associated with lun specified by tgt_id and lun_id
* parameters
@@ -12761,6 +12813,9 @@ lpfc_sli_abort_taskmgmt(struct lpfc_vport *vport, struct lpfc_sli_ring *pring,
for (i = 1; i <= phba->sli.last_iotag; i++) {
iocbq = phba->sli.iocbq_lookup[i];
+ if (lpfc_sli_validate_fcp_iocb_for_abort(iocbq, vport))
+ continue;
+
if (lpfc_sli_validate_fcp_iocb(iocbq, vport, tgt_id, lun_id,
cmd) != 0)
continue;
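The other half of the fix is in lpfc_sli_sum_iocb(): outstanding ABORT/CLOSE iocbs must be counted as well, so that lpfc_reset_flush_io_context() keeps waiting for them instead of declaring the flush complete early. A minimal userspace sketch of that counting pattern follows; it is an assumed simplification, with a pthread mutex standing in for phba->hbalock (the driver itself switches to spin_lock_irqsave() so the routine is safe regardless of the caller's interrupt state).
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
/* Assumed simplification of an iocbq_lookup[] entry. */
struct cmd {
    bool is_fcp;      /* LPFC_IO_FCP              */
    bool on_txcmplq;  /* LPFC_IO_ON_TXCMPLQ       */
    bool is_abort;    /* ABORT/CLOSE iocb         */
    bool matches_ctx; /* lun/target/host filter   */
};
static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
/* Counting pattern after the fixed lpfc_sli_sum_iocb(). */
static int sum_outstanding(const struct cmd *cmds, int n)
{
    int sum = 0;
    pthread_mutex_lock(&table_lock); /* spin_lock_irqsave() in the driver */
    for (int i = 0; i < n; i++) {
        const struct cmd *c = &cmds[i];
        if (!c->is_fcp || !c->on_txcmplq)
            continue;
        /* The fix: outstanding aborts count too, so a flush routine
         * polling this sum keeps waiting until they complete. */
        if (c->is_abort) {
            sum++;
            continue;
        }
        if (c->matches_ctx)
            sum++;
    }
    pthread_mutex_unlock(&table_lock);
    return sum;
}
int main(void)
{
    struct cmd cmds[] = {
        { true,  true, false, true  }, /* FCP I/O in context     */
        { true,  true, true,  false }, /* outstanding ABORT iocb */
        { false, true, false, true  }, /* not FCP: ignored       */
    };
    printf("outstanding: %d\n", sum_outstanding(cmds, 3)); /* prints 2 */
    return 0;
}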
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 5ae17501bc62a49b0b193dcce003f16375f16654 Mon Sep 17 00:00:00 2001
From: "Ewan D. Milne" <emilne(a)redhat.com>
Date: Fri, 29 Oct 2021 15:43:10 -0400
Subject: [PATCH] scsi: core: Avoid leaving shost->last_reset with stale value
if EH does not run
The changes to issue the abort from the scmd->abort_work instead of the EH
thread introduced a problem if eh_deadline is used. If aborting the
command(s) is successful, and there are never any scmds added to the
shost->eh_cmd_q, there is no code path which will reset the ->last_reset
value back to zero.
The effect of this is that after a successful abort with no EH thread
activity, a subsequent timeout, perhaps a long time later, might
immediately be considered past a user-set eh_deadline time, and the host
will be reset with no attempt at recovery.
Fix this by resetting ->last_reset back to zero in scmd_eh_abort_handler()
if it is determined that the EH thread will not run to do this.
Thanks to Gopinath Marappan for investigating this problem.
Link: https://lore.kernel.org/r/20211029194311.17504-2-emilne@redhat.com
Fixes: e494f6a72839 ("[SCSI] improved eh timeout handler")
Cc: stable(a)vger.kernel.org
Signed-off-by: Ewan D. Milne <emilne(a)redhat.com>
Signed-off-by: Martin K. Petersen <martin.petersen(a)oracle.com>
diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
index 17aef936bc90..2cb7163e24cc 100644
--- a/drivers/scsi/hosts.c
+++ b/drivers/scsi/hosts.c
@@ -387,6 +387,7 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize)
shost->shost_state = SHOST_CREATED;
INIT_LIST_HEAD(&shost->__devices);
INIT_LIST_HEAD(&shost->__targets);
+ INIT_LIST_HEAD(&shost->eh_abort_list);
INIT_LIST_HEAD(&shost->eh_cmd_q);
INIT_LIST_HEAD(&shost->starved_list);
init_waitqueue_head(&shost->host_wait);
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index 3de03925550e..bdf782d9cb86 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -133,6 +133,23 @@ static bool scsi_eh_should_retry_cmd(struct scsi_cmnd *cmd)
return true;
}
+static void scsi_eh_complete_abort(struct scsi_cmnd *scmd, struct Scsi_Host *shost)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(shost->host_lock, flags);
+ list_del_init(&scmd->eh_entry);
+ /*
+ * If the abort succeeds, and there is no further
+ * EH action, clear the ->last_reset time.
+ */
+ if (list_empty(&shost->eh_abort_list) &&
+ list_empty(&shost->eh_cmd_q))
+ if (shost->eh_deadline != -1)
+ shost->last_reset = 0;
+ spin_unlock_irqrestore(shost->host_lock, flags);
+}
+
/**
* scmd_eh_abort_handler - Handle command aborts
* @work: command to be aborted.
@@ -150,6 +167,7 @@ scmd_eh_abort_handler(struct work_struct *work)
container_of(work, struct scsi_cmnd, abort_work.work);
struct scsi_device *sdev = scmd->device;
enum scsi_disposition rtn;
+ unsigned long flags;
if (scsi_host_eh_past_deadline(sdev->host)) {
SCSI_LOG_ERROR_RECOVERY(3,
@@ -173,12 +191,14 @@ scmd_eh_abort_handler(struct work_struct *work)
SCSI_LOG_ERROR_RECOVERY(3,
scmd_printk(KERN_WARNING, scmd,
"retry aborted command\n"));
+ scsi_eh_complete_abort(scmd, sdev->host);
scsi_queue_insert(scmd, SCSI_MLQUEUE_EH_RETRY);
return;
} else {
SCSI_LOG_ERROR_RECOVERY(3,
scmd_printk(KERN_WARNING, scmd,
"finish aborted command\n"));
+ scsi_eh_complete_abort(scmd, sdev->host);
scsi_finish_command(scmd);
return;
}
@@ -191,6 +211,9 @@ scmd_eh_abort_handler(struct work_struct *work)
}
}
+ spin_lock_irqsave(sdev->host->host_lock, flags);
+ list_del_init(&scmd->eh_entry);
+ spin_unlock_irqrestore(sdev->host->host_lock, flags);
scsi_eh_scmd_add(scmd);
}
@@ -221,6 +244,8 @@ scsi_abort_command(struct scsi_cmnd *scmd)
spin_lock_irqsave(shost->host_lock, flags);
if (shost->eh_deadline != -1 && !shost->last_reset)
shost->last_reset = jiffies;
+ BUG_ON(!list_empty(&scmd->eh_entry));
+ list_add_tail(&scmd->eh_entry, &shost->eh_abort_list);
spin_unlock_irqrestore(shost->host_lock, flags);
scmd->eh_eflags |= SCSI_EH_ABORT_SCHEDULED;
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index d0b7c6dc74f8..c851c05d6091 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1143,6 +1143,7 @@ void scsi_init_command(struct scsi_device *dev, struct scsi_cmnd *cmd)
cmd->sense_buffer = buf;
cmd->prot_sdb = prot;
cmd->flags = flags;
+ INIT_LIST_HEAD(&cmd->eh_entry);
INIT_DELAYED_WORK(&cmd->abort_work, scmd_eh_abort_handler);
cmd->jiffies_at_alloc = jiffies_at_alloc;
cmd->retries = retries;
diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h
index 7958a604f979..29ac40cf1aae 100644
--- a/include/scsi/scsi_cmnd.h
+++ b/include/scsi/scsi_cmnd.h
@@ -73,7 +73,7 @@ enum scsi_cmnd_submitter {
struct scsi_cmnd {
struct scsi_request req;
struct scsi_device *device;
- struct list_head eh_entry; /* entry for the host eh_cmd_q */
+ struct list_head eh_entry; /* entry for the host eh_abort_list/eh_cmd_q */
struct delayed_work abort_work;
struct rcu_head rcu;
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index ae715959f886..ebe059badba0 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -551,6 +551,7 @@ struct Scsi_Host {
struct mutex scan_mutex;/* serialize scanning activity */
+ struct list_head eh_abort_list;
struct list_head eh_cmd_q;
struct task_struct * ehandler; /* Error recovery thread. */
struct completion * eh_action; /* Wait for specific actions on the
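The essence of this fix is an epoch-style use of shost->last_reset: it is opened when the first abort is scheduled, and must now be explicitly closed once no error-handling work remains. The self-contained C sketch below models that bookkeeping; the struct and counters are assumptions standing in for the host lock, jiffies, and the two list_empty() checks in the real code.
#include <stdio.h>
/* Assumed model of the eh_deadline bookkeeping; the real code guards
 * these fields with shost->host_lock and uses jiffies for time. */
struct host {
    long last_reset;    /* 0 means "no error-handling epoch open"   */
    int eh_deadline;    /* -1 means the deadline feature is off     */
    int aborts_pending; /* stand-in for !list_empty(&eh_abort_list) */
    int eh_cmds_queued; /* stand-in for !list_empty(&eh_cmd_q)      */
};
/* Mirrors scsi_abort_command(): open the epoch on the first abort. */
static void schedule_abort(struct host *h, long now)
{
    if (h->eh_deadline != -1 && !h->last_reset)
        h->last_reset = now;
    h->aborts_pending++;
}
/* Mirrors scsi_eh_complete_abort(): close the epoch when no EH work
 * remains; without this, last_reset goes stale and a much later
 * timeout can look instantly past the user-set eh_deadline. */
static void complete_abort(struct host *h)
{
    h->aborts_pending--;
    if (!h->aborts_pending && !h->eh_cmds_queued && h->eh_deadline != -1)
        h->last_reset = 0;
}
int main(void)
{
    struct host h = { 0, 10, 0, 0 };
    schedule_abort(&h, 1000);
    complete_abort(&h);
    printf("last_reset after a lone successful abort: %ld\n",
           h.last_reset); /* 0 again, not a stale 1000 */
    return 0;
}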
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 5ae17501bc62a49b0b193dcce003f16375f16654 Mon Sep 17 00:00:00 2001
From: "Ewan D. Milne" <emilne(a)redhat.com>
Date: Fri, 29 Oct 2021 15:43:10 -0400
Subject: [PATCH] scsi: core: Avoid leaving shost->last_reset with stale value
if EH does not run
The changes to issue the abort from the scmd->abort_work instead of the EH
thread introduced a problem if eh_deadline is used. If aborting the
command(s) is successful, and there are never any scmds added to the
shost->eh_cmd_q, there is no code path which will reset the ->last_reset
value back to zero.
The effect of this is that after a successful abort with no EH thread
activity, a subsequent timeout, perhaps a long time later, might
immediately be considered past a user-set eh_deadline time, and the host
will be reset with no attempt at recovery.
Fix this by resetting ->last_reset back to zero in scmd_eh_abort_handler()
if it is determined that the EH thread will not run to do this.
Thanks to Gopinath Marappan for investigating this problem.
Link: https://lore.kernel.org/r/20211029194311.17504-2-emilne@redhat.com
Fixes: e494f6a72839 ("[SCSI] improved eh timeout handler")
Cc: stable(a)vger.kernel.org
Signed-off-by: Ewan D. Milne <emilne(a)redhat.com>
Signed-off-by: Martin K. Petersen <martin.petersen(a)oracle.com>
diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
index 17aef936bc90..2cb7163e24cc 100644
--- a/drivers/scsi/hosts.c
+++ b/drivers/scsi/hosts.c
@@ -387,6 +387,7 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize)
shost->shost_state = SHOST_CREATED;
INIT_LIST_HEAD(&shost->__devices);
INIT_LIST_HEAD(&shost->__targets);
+ INIT_LIST_HEAD(&shost->eh_abort_list);
INIT_LIST_HEAD(&shost->eh_cmd_q);
INIT_LIST_HEAD(&shost->starved_list);
init_waitqueue_head(&shost->host_wait);
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index 3de03925550e..bdf782d9cb86 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -133,6 +133,23 @@ static bool scsi_eh_should_retry_cmd(struct scsi_cmnd *cmd)
return true;
}
+static void scsi_eh_complete_abort(struct scsi_cmnd *scmd, struct Scsi_Host *shost)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(shost->host_lock, flags);
+ list_del_init(&scmd->eh_entry);
+ /*
+ * If the abort succeeds, and there is no further
+ * EH action, clear the ->last_reset time.
+ */
+ if (list_empty(&shost->eh_abort_list) &&
+ list_empty(&shost->eh_cmd_q))
+ if (shost->eh_deadline != -1)
+ shost->last_reset = 0;
+ spin_unlock_irqrestore(shost->host_lock, flags);
+}
+
/**
* scmd_eh_abort_handler - Handle command aborts
* @work: command to be aborted.
@@ -150,6 +167,7 @@ scmd_eh_abort_handler(struct work_struct *work)
container_of(work, struct scsi_cmnd, abort_work.work);
struct scsi_device *sdev = scmd->device;
enum scsi_disposition rtn;
+ unsigned long flags;
if (scsi_host_eh_past_deadline(sdev->host)) {
SCSI_LOG_ERROR_RECOVERY(3,
@@ -173,12 +191,14 @@ scmd_eh_abort_handler(struct work_struct *work)
SCSI_LOG_ERROR_RECOVERY(3,
scmd_printk(KERN_WARNING, scmd,
"retry aborted command\n"));
+ scsi_eh_complete_abort(scmd, sdev->host);
scsi_queue_insert(scmd, SCSI_MLQUEUE_EH_RETRY);
return;
} else {
SCSI_LOG_ERROR_RECOVERY(3,
scmd_printk(KERN_WARNING, scmd,
"finish aborted command\n"));
+ scsi_eh_complete_abort(scmd, sdev->host);
scsi_finish_command(scmd);
return;
}
@@ -191,6 +211,9 @@ scmd_eh_abort_handler(struct work_struct *work)
}
}
+ spin_lock_irqsave(sdev->host->host_lock, flags);
+ list_del_init(&scmd->eh_entry);
+ spin_unlock_irqrestore(sdev->host->host_lock, flags);
scsi_eh_scmd_add(scmd);
}
@@ -221,6 +244,8 @@ scsi_abort_command(struct scsi_cmnd *scmd)
spin_lock_irqsave(shost->host_lock, flags);
if (shost->eh_deadline != -1 && !shost->last_reset)
shost->last_reset = jiffies;
+ BUG_ON(!list_empty(&scmd->eh_entry));
+ list_add_tail(&scmd->eh_entry, &shost->eh_abort_list);
spin_unlock_irqrestore(shost->host_lock, flags);
scmd->eh_eflags |= SCSI_EH_ABORT_SCHEDULED;
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index d0b7c6dc74f8..c851c05d6091 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1143,6 +1143,7 @@ void scsi_init_command(struct scsi_device *dev, struct scsi_cmnd *cmd)
cmd->sense_buffer = buf;
cmd->prot_sdb = prot;
cmd->flags = flags;
+ INIT_LIST_HEAD(&cmd->eh_entry);
INIT_DELAYED_WORK(&cmd->abort_work, scmd_eh_abort_handler);
cmd->jiffies_at_alloc = jiffies_at_alloc;
cmd->retries = retries;
diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h
index 7958a604f979..29ac40cf1aae 100644
--- a/include/scsi/scsi_cmnd.h
+++ b/include/scsi/scsi_cmnd.h
@@ -73,7 +73,7 @@ enum scsi_cmnd_submitter {
struct scsi_cmnd {
struct scsi_request req;
struct scsi_device *device;
- struct list_head eh_entry; /* entry for the host eh_cmd_q */
+ struct list_head eh_entry; /* entry for the host eh_abort_list/eh_cmd_q */
struct delayed_work abort_work;
struct rcu_head rcu;
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index ae715959f886..ebe059badba0 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -551,6 +551,7 @@ struct Scsi_Host {
struct mutex scan_mutex;/* serialize scanning activity */
+ struct list_head eh_abort_list;
struct list_head eh_cmd_q;
struct task_struct * ehandler; /* Error recovery thread. */
struct completion * eh_action; /* Wait for specific actions on the
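The patch also leans on the standard list_head lifecycle for scmd->eh_entry: initialized in scsi_init_command(), added in scsi_abort_command() behind BUG_ON(!list_empty(...)), and removed with list_del_init() so the entry can be safely tested and reused. Below is a minimal userspace re-implementation of that idiom; the names mirror the kernel API for readability, but this is a sketch, not <linux/list.h>.
#include <assert.h>
#include <stdio.h>
/* Tiny circular doubly-linked list, after the kernel idiom. */
struct list_head { struct list_head *prev, *next; };
static void INIT_LIST_HEAD(struct list_head *h) { h->prev = h->next = h; }
static int list_empty(const struct list_head *h) { return h->next == h; }
static void list_add_tail(struct list_head *n, struct list_head *h)
{
    n->prev = h->prev; n->next = h;
    h->prev->next = n; h->prev = n;
}
static void list_del_init(struct list_head *n)
{
    n->prev->next = n->next; n->next->prev = n->prev;
    INIT_LIST_HEAD(n); /* self-pointing again: re-testable, re-addable */
}
int main(void)
{
    struct list_head eh_abort_list, cmd_entry;
    INIT_LIST_HEAD(&eh_abort_list);
    INIT_LIST_HEAD(&cmd_entry);                /* scsi_init_command()      */
    assert(list_empty(&cmd_entry));            /* the BUG_ON() invariant   */
    list_add_tail(&cmd_entry, &eh_abort_list); /* scsi_abort_command()     */
    list_del_init(&cmd_entry);                 /* scsi_eh_complete_abort() */
    printf("abort list empty again: %d\n", list_empty(&eh_abort_list));
    return 0;
}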
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 5ae17501bc62a49b0b193dcce003f16375f16654 Mon Sep 17 00:00:00 2001
From: "Ewan D. Milne" <emilne(a)redhat.com>
Date: Fri, 29 Oct 2021 15:43:10 -0400
Subject: [PATCH] scsi: core: Avoid leaving shost->last_reset with stale value
if EH does not run
The changes to issue the abort from the scmd->abort_work instead of the EH
thread introduced a problem if eh_deadline is used. If aborting the
command(s) is successful, and there are never any scmds added to the
shost->eh_cmd_q, there is no code path which will reset the ->last_reset
value back to zero.
The effect of this is that after a successful abort with no EH thread
activity, a subsequent timeout, perhaps a long time later, might
immediately be considered past a user-set eh_deadline time, and the host
will be reset with no attempt at recovery.
Fix this by resetting ->last_reset back to zero in scmd_eh_abort_handler()
if it is determined that the EH thread will not run to do this.
Thanks to Gopinath Marappan for investigating this problem.
Link: https://lore.kernel.org/r/20211029194311.17504-2-emilne@redhat.com
Fixes: e494f6a72839 ("[SCSI] improved eh timeout handler")
Cc: stable(a)vger.kernel.org
Signed-off-by: Ewan D. Milne <emilne(a)redhat.com>
Signed-off-by: Martin K. Petersen <martin.petersen(a)oracle.com>
diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
index 17aef936bc90..2cb7163e24cc 100644
--- a/drivers/scsi/hosts.c
+++ b/drivers/scsi/hosts.c
@@ -387,6 +387,7 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize)
shost->shost_state = SHOST_CREATED;
INIT_LIST_HEAD(&shost->__devices);
INIT_LIST_HEAD(&shost->__targets);
+ INIT_LIST_HEAD(&shost->eh_abort_list);
INIT_LIST_HEAD(&shost->eh_cmd_q);
INIT_LIST_HEAD(&shost->starved_list);
init_waitqueue_head(&shost->host_wait);
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index 3de03925550e..bdf782d9cb86 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -133,6 +133,23 @@ static bool scsi_eh_should_retry_cmd(struct scsi_cmnd *cmd)
return true;
}
+static void scsi_eh_complete_abort(struct scsi_cmnd *scmd, struct Scsi_Host *shost)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(shost->host_lock, flags);
+ list_del_init(&scmd->eh_entry);
+ /*
+ * If the abort succeeds, and there is no further
+ * EH action, clear the ->last_reset time.
+ */
+ if (list_empty(&shost->eh_abort_list) &&
+ list_empty(&shost->eh_cmd_q))
+ if (shost->eh_deadline != -1)
+ shost->last_reset = 0;
+ spin_unlock_irqrestore(shost->host_lock, flags);
+}
+
/**
* scmd_eh_abort_handler - Handle command aborts
* @work: command to be aborted.
@@ -150,6 +167,7 @@ scmd_eh_abort_handler(struct work_struct *work)
container_of(work, struct scsi_cmnd, abort_work.work);
struct scsi_device *sdev = scmd->device;
enum scsi_disposition rtn;
+ unsigned long flags;
if (scsi_host_eh_past_deadline(sdev->host)) {
SCSI_LOG_ERROR_RECOVERY(3,
@@ -173,12 +191,14 @@ scmd_eh_abort_handler(struct work_struct *work)
SCSI_LOG_ERROR_RECOVERY(3,
scmd_printk(KERN_WARNING, scmd,
"retry aborted command\n"));
+ scsi_eh_complete_abort(scmd, sdev->host);
scsi_queue_insert(scmd, SCSI_MLQUEUE_EH_RETRY);
return;
} else {
SCSI_LOG_ERROR_RECOVERY(3,
scmd_printk(KERN_WARNING, scmd,
"finish aborted command\n"));
+ scsi_eh_complete_abort(scmd, sdev->host);
scsi_finish_command(scmd);
return;
}
@@ -191,6 +211,9 @@ scmd_eh_abort_handler(struct work_struct *work)
}
}
+ spin_lock_irqsave(sdev->host->host_lock, flags);
+ list_del_init(&scmd->eh_entry);
+ spin_unlock_irqrestore(sdev->host->host_lock, flags);
scsi_eh_scmd_add(scmd);
}
@@ -221,6 +244,8 @@ scsi_abort_command(struct scsi_cmnd *scmd)
spin_lock_irqsave(shost->host_lock, flags);
if (shost->eh_deadline != -1 && !shost->last_reset)
shost->last_reset = jiffies;
+ BUG_ON(!list_empty(&scmd->eh_entry));
+ list_add_tail(&scmd->eh_entry, &shost->eh_abort_list);
spin_unlock_irqrestore(shost->host_lock, flags);
scmd->eh_eflags |= SCSI_EH_ABORT_SCHEDULED;
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index d0b7c6dc74f8..c851c05d6091 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1143,6 +1143,7 @@ void scsi_init_command(struct scsi_device *dev, struct scsi_cmnd *cmd)
cmd->sense_buffer = buf;
cmd->prot_sdb = prot;
cmd->flags = flags;
+ INIT_LIST_HEAD(&cmd->eh_entry);
INIT_DELAYED_WORK(&cmd->abort_work, scmd_eh_abort_handler);
cmd->jiffies_at_alloc = jiffies_at_alloc;
cmd->retries = retries;
diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h
index 7958a604f979..29ac40cf1aae 100644
--- a/include/scsi/scsi_cmnd.h
+++ b/include/scsi/scsi_cmnd.h
@@ -73,7 +73,7 @@ enum scsi_cmnd_submitter {
struct scsi_cmnd {
struct scsi_request req;
struct scsi_device *device;
- struct list_head eh_entry; /* entry for the host eh_cmd_q */
+ struct list_head eh_entry; /* entry for the host eh_abort_list/eh_cmd_q */
struct delayed_work abort_work;
struct rcu_head rcu;
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index ae715959f886..ebe059badba0 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -551,6 +551,7 @@ struct Scsi_Host {
struct mutex scan_mutex;/* serialize scanning activity */
+ struct list_head eh_abort_list;
struct list_head eh_cmd_q;
struct task_struct * ehandler; /* Error recovery thread. */
struct completion * eh_action; /* Wait for specific actions on the
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 5ae17501bc62a49b0b193dcce003f16375f16654 Mon Sep 17 00:00:00 2001
From: "Ewan D. Milne" <emilne(a)redhat.com>
Date: Fri, 29 Oct 2021 15:43:10 -0400
Subject: [PATCH] scsi: core: Avoid leaving shost->last_reset with stale value
if EH does not run
The changes to issue the abort from the scmd->abort_work instead of the EH
thread introduced a problem if eh_deadline is used. If aborting the
command(s) is successful, and there are never any scmds added to the
shost->eh_cmd_q, there is no code path which will reset the ->last_reset
value back to zero.
The effect of this is that after a successful abort with no EH thread
activity, a subsequent timeout, perhaps a long time later, might
immediately be considered past a user-set eh_deadline time, and the host
will be reset with no attempt at recovery.
Fix this by resetting ->last_reset back to zero in scmd_eh_abort_handler()
if it is determined that the EH thread will not run to do this.
Thanks to Gopinath Marappan for investigating this problem.
Link: https://lore.kernel.org/r/20211029194311.17504-2-emilne@redhat.com
Fixes: e494f6a72839 ("[SCSI] improved eh timeout handler")
Cc: stable(a)vger.kernel.org
Signed-off-by: Ewan D. Milne <emilne(a)redhat.com>
Signed-off-by: Martin K. Petersen <martin.petersen(a)oracle.com>
diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
index 17aef936bc90..2cb7163e24cc 100644
--- a/drivers/scsi/hosts.c
+++ b/drivers/scsi/hosts.c
@@ -387,6 +387,7 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize)
shost->shost_state = SHOST_CREATED;
INIT_LIST_HEAD(&shost->__devices);
INIT_LIST_HEAD(&shost->__targets);
+ INIT_LIST_HEAD(&shost->eh_abort_list);
INIT_LIST_HEAD(&shost->eh_cmd_q);
INIT_LIST_HEAD(&shost->starved_list);
init_waitqueue_head(&shost->host_wait);
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index 3de03925550e..bdf782d9cb86 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -133,6 +133,23 @@ static bool scsi_eh_should_retry_cmd(struct scsi_cmnd *cmd)
return true;
}
+static void scsi_eh_complete_abort(struct scsi_cmnd *scmd, struct Scsi_Host *shost)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(shost->host_lock, flags);
+ list_del_init(&scmd->eh_entry);
+ /*
+ * If the abort succeeds, and there is no further
+ * EH action, clear the ->last_reset time.
+ */
+ if (list_empty(&shost->eh_abort_list) &&
+ list_empty(&shost->eh_cmd_q))
+ if (shost->eh_deadline != -1)
+ shost->last_reset = 0;
+ spin_unlock_irqrestore(shost->host_lock, flags);
+}
+
/**
* scmd_eh_abort_handler - Handle command aborts
* @work: command to be aborted.
@@ -150,6 +167,7 @@ scmd_eh_abort_handler(struct work_struct *work)
container_of(work, struct scsi_cmnd, abort_work.work);
struct scsi_device *sdev = scmd->device;
enum scsi_disposition rtn;
+ unsigned long flags;
if (scsi_host_eh_past_deadline(sdev->host)) {
SCSI_LOG_ERROR_RECOVERY(3,
@@ -173,12 +191,14 @@ scmd_eh_abort_handler(struct work_struct *work)
SCSI_LOG_ERROR_RECOVERY(3,
scmd_printk(KERN_WARNING, scmd,
"retry aborted command\n"));
+ scsi_eh_complete_abort(scmd, sdev->host);
scsi_queue_insert(scmd, SCSI_MLQUEUE_EH_RETRY);
return;
} else {
SCSI_LOG_ERROR_RECOVERY(3,
scmd_printk(KERN_WARNING, scmd,
"finish aborted command\n"));
+ scsi_eh_complete_abort(scmd, sdev->host);
scsi_finish_command(scmd);
return;
}
@@ -191,6 +211,9 @@ scmd_eh_abort_handler(struct work_struct *work)
}
}
+ spin_lock_irqsave(sdev->host->host_lock, flags);
+ list_del_init(&scmd->eh_entry);
+ spin_unlock_irqrestore(sdev->host->host_lock, flags);
scsi_eh_scmd_add(scmd);
}
@@ -221,6 +244,8 @@ scsi_abort_command(struct scsi_cmnd *scmd)
spin_lock_irqsave(shost->host_lock, flags);
if (shost->eh_deadline != -1 && !shost->last_reset)
shost->last_reset = jiffies;
+ BUG_ON(!list_empty(&scmd->eh_entry));
+ list_add_tail(&scmd->eh_entry, &shost->eh_abort_list);
spin_unlock_irqrestore(shost->host_lock, flags);
scmd->eh_eflags |= SCSI_EH_ABORT_SCHEDULED;
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index d0b7c6dc74f8..c851c05d6091 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1143,6 +1143,7 @@ void scsi_init_command(struct scsi_device *dev, struct scsi_cmnd *cmd)
cmd->sense_buffer = buf;
cmd->prot_sdb = prot;
cmd->flags = flags;
+ INIT_LIST_HEAD(&cmd->eh_entry);
INIT_DELAYED_WORK(&cmd->abort_work, scmd_eh_abort_handler);
cmd->jiffies_at_alloc = jiffies_at_alloc;
cmd->retries = retries;
diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h
index 7958a604f979..29ac40cf1aae 100644
--- a/include/scsi/scsi_cmnd.h
+++ b/include/scsi/scsi_cmnd.h
@@ -73,7 +73,7 @@ enum scsi_cmnd_submitter {
struct scsi_cmnd {
struct scsi_request req;
struct scsi_device *device;
- struct list_head eh_entry; /* entry for the host eh_cmd_q */
+ struct list_head eh_entry; /* entry for the host eh_abort_list/eh_cmd_q */
struct delayed_work abort_work;
struct rcu_head rcu;
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index ae715959f886..ebe059badba0 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -551,6 +551,7 @@ struct Scsi_Host {
struct mutex scan_mutex;/* serialize scanning activity */
+ struct list_head eh_abort_list;
struct list_head eh_cmd_q;
struct task_struct * ehandler; /* Error recovery thread. */
struct completion * eh_action; /* Wait for specific actions on the
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 5ae17501bc62a49b0b193dcce003f16375f16654 Mon Sep 17 00:00:00 2001
From: "Ewan D. Milne" <emilne(a)redhat.com>
Date: Fri, 29 Oct 2021 15:43:10 -0400
Subject: [PATCH] scsi: core: Avoid leaving shost->last_reset with stale value
if EH does not run
The changes to issue the abort from the scmd->abort_work instead of the EH
thread introduced a problem if eh_deadline is used. If aborting the
command(s) is successful, and there are never any scmds added to the
shost->eh_cmd_q, there is no code path which will reset the ->last_reset
value back to zero.
The effect of this is that after a successful abort with no EH thread
activity, a subsequent timeout, perhaps a long time later, might
immediately be considered past a user-set eh_deadline time, and the host
will be reset with no attempt at recovery.
Fix this by resetting ->last_reset back to zero in scmd_eh_abort_handler()
if it is determined that the EH thread will not run to do this.
Thanks to Gopinath Marappan for investigating this problem.
Link: https://lore.kernel.org/r/20211029194311.17504-2-emilne@redhat.com
Fixes: e494f6a72839 ("[SCSI] improved eh timeout handler")
Cc: stable(a)vger.kernel.org
Signed-off-by: Ewan D. Milne <emilne(a)redhat.com>
Signed-off-by: Martin K. Petersen <martin.petersen(a)oracle.com>
diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
index 17aef936bc90..2cb7163e24cc 100644
--- a/drivers/scsi/hosts.c
+++ b/drivers/scsi/hosts.c
@@ -387,6 +387,7 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize)
shost->shost_state = SHOST_CREATED;
INIT_LIST_HEAD(&shost->__devices);
INIT_LIST_HEAD(&shost->__targets);
+ INIT_LIST_HEAD(&shost->eh_abort_list);
INIT_LIST_HEAD(&shost->eh_cmd_q);
INIT_LIST_HEAD(&shost->starved_list);
init_waitqueue_head(&shost->host_wait);
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index 3de03925550e..bdf782d9cb86 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -133,6 +133,23 @@ static bool scsi_eh_should_retry_cmd(struct scsi_cmnd *cmd)
return true;
}
+static void scsi_eh_complete_abort(struct scsi_cmnd *scmd, struct Scsi_Host *shost)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(shost->host_lock, flags);
+ list_del_init(&scmd->eh_entry);
+ /*
+ * If the abort succeeds, and there is no further
+ * EH action, clear the ->last_reset time.
+ */
+ if (list_empty(&shost->eh_abort_list) &&
+ list_empty(&shost->eh_cmd_q))
+ if (shost->eh_deadline != -1)
+ shost->last_reset = 0;
+ spin_unlock_irqrestore(shost->host_lock, flags);
+}
+
/**
* scmd_eh_abort_handler - Handle command aborts
* @work: command to be aborted.
@@ -150,6 +167,7 @@ scmd_eh_abort_handler(struct work_struct *work)
container_of(work, struct scsi_cmnd, abort_work.work);
struct scsi_device *sdev = scmd->device;
enum scsi_disposition rtn;
+ unsigned long flags;
if (scsi_host_eh_past_deadline(sdev->host)) {
SCSI_LOG_ERROR_RECOVERY(3,
@@ -173,12 +191,14 @@ scmd_eh_abort_handler(struct work_struct *work)
SCSI_LOG_ERROR_RECOVERY(3,
scmd_printk(KERN_WARNING, scmd,
"retry aborted command\n"));
+ scsi_eh_complete_abort(scmd, sdev->host);
scsi_queue_insert(scmd, SCSI_MLQUEUE_EH_RETRY);
return;
} else {
SCSI_LOG_ERROR_RECOVERY(3,
scmd_printk(KERN_WARNING, scmd,
"finish aborted command\n"));
+ scsi_eh_complete_abort(scmd, sdev->host);
scsi_finish_command(scmd);
return;
}
@@ -191,6 +211,9 @@ scmd_eh_abort_handler(struct work_struct *work)
}
}
+ spin_lock_irqsave(sdev->host->host_lock, flags);
+ list_del_init(&scmd->eh_entry);
+ spin_unlock_irqrestore(sdev->host->host_lock, flags);
scsi_eh_scmd_add(scmd);
}
@@ -221,6 +244,8 @@ scsi_abort_command(struct scsi_cmnd *scmd)
spin_lock_irqsave(shost->host_lock, flags);
if (shost->eh_deadline != -1 && !shost->last_reset)
shost->last_reset = jiffies;
+ BUG_ON(!list_empty(&scmd->eh_entry));
+ list_add_tail(&scmd->eh_entry, &shost->eh_abort_list);
spin_unlock_irqrestore(shost->host_lock, flags);
scmd->eh_eflags |= SCSI_EH_ABORT_SCHEDULED;
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index d0b7c6dc74f8..c851c05d6091 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1143,6 +1143,7 @@ void scsi_init_command(struct scsi_device *dev, struct scsi_cmnd *cmd)
cmd->sense_buffer = buf;
cmd->prot_sdb = prot;
cmd->flags = flags;
+ INIT_LIST_HEAD(&cmd->eh_entry);
INIT_DELAYED_WORK(&cmd->abort_work, scmd_eh_abort_handler);
cmd->jiffies_at_alloc = jiffies_at_alloc;
cmd->retries = retries;
diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h
index 7958a604f979..29ac40cf1aae 100644
--- a/include/scsi/scsi_cmnd.h
+++ b/include/scsi/scsi_cmnd.h
@@ -73,7 +73,7 @@ enum scsi_cmnd_submitter {
struct scsi_cmnd {
struct scsi_request req;
struct scsi_device *device;
- struct list_head eh_entry; /* entry for the host eh_cmd_q */
+ struct list_head eh_entry; /* entry for the host eh_abort_list/eh_cmd_q */
struct delayed_work abort_work;
struct rcu_head rcu;
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index ae715959f886..ebe059badba0 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -551,6 +551,7 @@ struct Scsi_Host {
struct mutex scan_mutex;/* serialize scanning activity */
+ struct list_head eh_abort_list;
struct list_head eh_cmd_q;
struct task_struct * ehandler; /* Error recovery thread. */
struct completion * eh_action; /* Wait for specific actions on the