The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 61ca49a9105faefa003b37542cebad8722f8ae22 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov(a)gmail.com>
Date: Mon, 26 Apr 2021 19:11:37 +0200
Subject: [PATCH] libceph: don't set global_id until we get an auth ticket
With the introduction of enforcing mode, setting global_id as soon
as we get it in the first MAuth reply will result in EACCES if the
connection is reset before we get the second MAuth reply containing
an auth ticket -- because on retry we would attempt to reclaim that
global_id with no auth ticket at hand.
Neither ceph_auth_client nor ceph_mon_client depend on global_id
being set ealy, so just delay the setting until we get and process
the second MAuth reply. While at it, complain if the monitor sends
a zero global_id or changes our global_id as the session is likely
to fail after that.
Cc: stable(a)vger.kernel.org # needs backporting for < 5.11
Signed-off-by: Ilya Dryomov <idryomov(a)gmail.com>
Reviewed-by: Sage Weil <sage(a)redhat.com>
diff --git a/net/ceph/auth.c b/net/ceph/auth.c
index eb261aa5fe18..de407e8feb97 100644
--- a/net/ceph/auth.c
+++ b/net/ceph/auth.c
@@ -36,6 +36,20 @@ static int init_protocol(struct ceph_auth_client *ac, int proto)
}
}
+static void set_global_id(struct ceph_auth_client *ac, u64 global_id)
+{
+ dout("%s global_id %llu\n", __func__, global_id);
+
+ if (!global_id)
+ pr_err("got zero global_id\n");
+
+ if (ac->global_id && global_id != ac->global_id)
+ pr_err("global_id changed from %llu to %llu\n", ac->global_id,
+ global_id);
+
+ ac->global_id = global_id;
+}
+
/*
* setup, teardown.
*/
@@ -222,11 +236,6 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
payload_end = payload + payload_len;
- if (global_id && ac->global_id != global_id) {
- dout(" set global_id %lld -> %lld\n", ac->global_id, global_id);
- ac->global_id = global_id;
- }
-
if (ac->negotiating) {
/* server does not support our protocols? */
if (!protocol && result < 0) {
@@ -253,11 +262,16 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
ret = ac->ops->handle_reply(ac, result, payload, payload_end,
NULL, NULL, NULL, NULL);
- if (ret == -EAGAIN)
+ if (ret == -EAGAIN) {
ret = build_request(ac, true, reply_buf, reply_len);
- else if (ret)
+ goto out;
+ } else if (ret) {
pr_err("auth protocol '%s' mauth authentication failed: %d\n",
ceph_auth_proto_name(ac->protocol), result);
+ goto out;
+ }
+
+ set_global_id(ac, global_id);
out:
mutex_unlock(&ac->mutex);
@@ -484,15 +498,11 @@ int ceph_auth_handle_reply_done(struct ceph_auth_client *ac,
int ret;
mutex_lock(&ac->mutex);
- if (global_id && ac->global_id != global_id) {
- dout("%s global_id %llu -> %llu\n", __func__, ac->global_id,
- global_id);
- ac->global_id = global_id;
- }
-
ret = ac->ops->handle_reply(ac, 0, reply, reply + reply_len,
session_key, session_key_len,
con_secret, con_secret_len);
+ if (!ret)
+ set_global_id(ac, global_id);
mutex_unlock(&ac->mutex);
return ret;
}
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 61ca49a9105faefa003b37542cebad8722f8ae22 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov(a)gmail.com>
Date: Mon, 26 Apr 2021 19:11:37 +0200
Subject: [PATCH] libceph: don't set global_id until we get an auth ticket
With the introduction of enforcing mode, setting global_id as soon
as we get it in the first MAuth reply will result in EACCES if the
connection is reset before we get the second MAuth reply containing
an auth ticket -- because on retry we would attempt to reclaim that
global_id with no auth ticket at hand.
Neither ceph_auth_client nor ceph_mon_client depend on global_id
being set ealy, so just delay the setting until we get and process
the second MAuth reply. While at it, complain if the monitor sends
a zero global_id or changes our global_id as the session is likely
to fail after that.
Cc: stable(a)vger.kernel.org # needs backporting for < 5.11
Signed-off-by: Ilya Dryomov <idryomov(a)gmail.com>
Reviewed-by: Sage Weil <sage(a)redhat.com>
diff --git a/net/ceph/auth.c b/net/ceph/auth.c
index eb261aa5fe18..de407e8feb97 100644
--- a/net/ceph/auth.c
+++ b/net/ceph/auth.c
@@ -36,6 +36,20 @@ static int init_protocol(struct ceph_auth_client *ac, int proto)
}
}
+static void set_global_id(struct ceph_auth_client *ac, u64 global_id)
+{
+ dout("%s global_id %llu\n", __func__, global_id);
+
+ if (!global_id)
+ pr_err("got zero global_id\n");
+
+ if (ac->global_id && global_id != ac->global_id)
+ pr_err("global_id changed from %llu to %llu\n", ac->global_id,
+ global_id);
+
+ ac->global_id = global_id;
+}
+
/*
* setup, teardown.
*/
@@ -222,11 +236,6 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
payload_end = payload + payload_len;
- if (global_id && ac->global_id != global_id) {
- dout(" set global_id %lld -> %lld\n", ac->global_id, global_id);
- ac->global_id = global_id;
- }
-
if (ac->negotiating) {
/* server does not support our protocols? */
if (!protocol && result < 0) {
@@ -253,11 +262,16 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
ret = ac->ops->handle_reply(ac, result, payload, payload_end,
NULL, NULL, NULL, NULL);
- if (ret == -EAGAIN)
+ if (ret == -EAGAIN) {
ret = build_request(ac, true, reply_buf, reply_len);
- else if (ret)
+ goto out;
+ } else if (ret) {
pr_err("auth protocol '%s' mauth authentication failed: %d\n",
ceph_auth_proto_name(ac->protocol), result);
+ goto out;
+ }
+
+ set_global_id(ac, global_id);
out:
mutex_unlock(&ac->mutex);
@@ -484,15 +498,11 @@ int ceph_auth_handle_reply_done(struct ceph_auth_client *ac,
int ret;
mutex_lock(&ac->mutex);
- if (global_id && ac->global_id != global_id) {
- dout("%s global_id %llu -> %llu\n", __func__, ac->global_id,
- global_id);
- ac->global_id = global_id;
- }
-
ret = ac->ops->handle_reply(ac, 0, reply, reply + reply_len,
session_key, session_key_len,
con_secret, con_secret_len);
+ if (!ret)
+ set_global_id(ac, global_id);
mutex_unlock(&ac->mutex);
return ret;
}
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From be94215be1ab19e5d38f50962f611c88d4bfc83a Mon Sep 17 00:00:00 2001
From: Xiang Chen <chenxiang66(a)hisilicon.com>
Date: Thu, 1 Apr 2021 15:34:46 +0800
Subject: [PATCH] mtd: spi-nor: core: Fix an issue of releasing resources
during read/write
If rmmod the driver during read or write, the driver will release the
resources which are used during read or write, so it is possible to
refer to NULL pointer.
Use the testcase "mtd_debug read /dev/mtd0 0xc00000 0x400000 dest_file &
sleep 0.5;rmmod spi_hisi_sfc_v3xx.ko", the issue can be reproduced in
hisi_sfc_v3xx driver.
To avoid the issue, fill the interface _get_device and _put_device of
mtd_info to grab the reference to the spi controller driver module, so
the request of rmmod the driver is rejected before read/write is finished.
Fixes: b199489d37b2 ("mtd: spi-nor: add the framework for SPI NOR")
Signed-off-by: Xiang Chen <chenxiang66(a)hisilicon.com>
Signed-off-by: Yicong Yang <yangyicong(a)hisilicon.com>
Signed-off-by: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Tested-by: Michael Walle <michael(a)walle.cc>
Tested-by: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Reviewed-by: Michael Walle <michael(a)walle.cc>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/1617262486-4223-1-git-send-email-yangyicong@hisil…
diff --git a/drivers/mtd/spi-nor/core.c b/drivers/mtd/spi-nor/core.c
index 8cf3cf92129e..bd2c7717eb10 100644
--- a/drivers/mtd/spi-nor/core.c
+++ b/drivers/mtd/spi-nor/core.c
@@ -2905,6 +2905,37 @@ static void spi_nor_resume(struct mtd_info *mtd)
dev_err(dev, "resume() failed\n");
}
+static int spi_nor_get_device(struct mtd_info *mtd)
+{
+ struct mtd_info *master = mtd_get_master(mtd);
+ struct spi_nor *nor = mtd_to_spi_nor(master);
+ struct device *dev;
+
+ if (nor->spimem)
+ dev = nor->spimem->spi->controller->dev.parent;
+ else
+ dev = nor->dev;
+
+ if (!try_module_get(dev->driver->owner))
+ return -ENODEV;
+
+ return 0;
+}
+
+static void spi_nor_put_device(struct mtd_info *mtd)
+{
+ struct mtd_info *master = mtd_get_master(mtd);
+ struct spi_nor *nor = mtd_to_spi_nor(master);
+ struct device *dev;
+
+ if (nor->spimem)
+ dev = nor->spimem->spi->controller->dev.parent;
+ else
+ dev = nor->dev;
+
+ module_put(dev->driver->owner);
+}
+
void spi_nor_restore(struct spi_nor *nor)
{
/* restore the addressing mode */
@@ -3099,6 +3130,8 @@ int spi_nor_scan(struct spi_nor *nor, const char *name,
mtd->_read = spi_nor_read;
mtd->_suspend = spi_nor_suspend;
mtd->_resume = spi_nor_resume;
+ mtd->_get_device = spi_nor_get_device;
+ mtd->_put_device = spi_nor_put_device;
if (info->flags & USE_FSR)
nor->flags |= SNOR_F_USE_FSR;
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From be94215be1ab19e5d38f50962f611c88d4bfc83a Mon Sep 17 00:00:00 2001
From: Xiang Chen <chenxiang66(a)hisilicon.com>
Date: Thu, 1 Apr 2021 15:34:46 +0800
Subject: [PATCH] mtd: spi-nor: core: Fix an issue of releasing resources
during read/write
If rmmod the driver during read or write, the driver will release the
resources which are used during read or write, so it is possible to
refer to NULL pointer.
Use the testcase "mtd_debug read /dev/mtd0 0xc00000 0x400000 dest_file &
sleep 0.5;rmmod spi_hisi_sfc_v3xx.ko", the issue can be reproduced in
hisi_sfc_v3xx driver.
To avoid the issue, fill the interface _get_device and _put_device of
mtd_info to grab the reference to the spi controller driver module, so
the request of rmmod the driver is rejected before read/write is finished.
Fixes: b199489d37b2 ("mtd: spi-nor: add the framework for SPI NOR")
Signed-off-by: Xiang Chen <chenxiang66(a)hisilicon.com>
Signed-off-by: Yicong Yang <yangyicong(a)hisilicon.com>
Signed-off-by: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Tested-by: Michael Walle <michael(a)walle.cc>
Tested-by: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Reviewed-by: Michael Walle <michael(a)walle.cc>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/1617262486-4223-1-git-send-email-yangyicong@hisil…
diff --git a/drivers/mtd/spi-nor/core.c b/drivers/mtd/spi-nor/core.c
index 8cf3cf92129e..bd2c7717eb10 100644
--- a/drivers/mtd/spi-nor/core.c
+++ b/drivers/mtd/spi-nor/core.c
@@ -2905,6 +2905,37 @@ static void spi_nor_resume(struct mtd_info *mtd)
dev_err(dev, "resume() failed\n");
}
+static int spi_nor_get_device(struct mtd_info *mtd)
+{
+ struct mtd_info *master = mtd_get_master(mtd);
+ struct spi_nor *nor = mtd_to_spi_nor(master);
+ struct device *dev;
+
+ if (nor->spimem)
+ dev = nor->spimem->spi->controller->dev.parent;
+ else
+ dev = nor->dev;
+
+ if (!try_module_get(dev->driver->owner))
+ return -ENODEV;
+
+ return 0;
+}
+
+static void spi_nor_put_device(struct mtd_info *mtd)
+{
+ struct mtd_info *master = mtd_get_master(mtd);
+ struct spi_nor *nor = mtd_to_spi_nor(master);
+ struct device *dev;
+
+ if (nor->spimem)
+ dev = nor->spimem->spi->controller->dev.parent;
+ else
+ dev = nor->dev;
+
+ module_put(dev->driver->owner);
+}
+
void spi_nor_restore(struct spi_nor *nor)
{
/* restore the addressing mode */
@@ -3099,6 +3130,8 @@ int spi_nor_scan(struct spi_nor *nor, const char *name,
mtd->_read = spi_nor_read;
mtd->_suspend = spi_nor_suspend;
mtd->_resume = spi_nor_resume;
+ mtd->_get_device = spi_nor_get_device;
+ mtd->_put_device = spi_nor_put_device;
if (info->flags & USE_FSR)
nor->flags |= SNOR_F_USE_FSR;
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From be94215be1ab19e5d38f50962f611c88d4bfc83a Mon Sep 17 00:00:00 2001
From: Xiang Chen <chenxiang66(a)hisilicon.com>
Date: Thu, 1 Apr 2021 15:34:46 +0800
Subject: [PATCH] mtd: spi-nor: core: Fix an issue of releasing resources
during read/write
If rmmod the driver during read or write, the driver will release the
resources which are used during read or write, so it is possible to
refer to NULL pointer.
Use the testcase "mtd_debug read /dev/mtd0 0xc00000 0x400000 dest_file &
sleep 0.5;rmmod spi_hisi_sfc_v3xx.ko", the issue can be reproduced in
hisi_sfc_v3xx driver.
To avoid the issue, fill the interface _get_device and _put_device of
mtd_info to grab the reference to the spi controller driver module, so
the request of rmmod the driver is rejected before read/write is finished.
Fixes: b199489d37b2 ("mtd: spi-nor: add the framework for SPI NOR")
Signed-off-by: Xiang Chen <chenxiang66(a)hisilicon.com>
Signed-off-by: Yicong Yang <yangyicong(a)hisilicon.com>
Signed-off-by: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Tested-by: Michael Walle <michael(a)walle.cc>
Tested-by: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Reviewed-by: Michael Walle <michael(a)walle.cc>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/1617262486-4223-1-git-send-email-yangyicong@hisil…
diff --git a/drivers/mtd/spi-nor/core.c b/drivers/mtd/spi-nor/core.c
index 8cf3cf92129e..bd2c7717eb10 100644
--- a/drivers/mtd/spi-nor/core.c
+++ b/drivers/mtd/spi-nor/core.c
@@ -2905,6 +2905,37 @@ static void spi_nor_resume(struct mtd_info *mtd)
dev_err(dev, "resume() failed\n");
}
+static int spi_nor_get_device(struct mtd_info *mtd)
+{
+ struct mtd_info *master = mtd_get_master(mtd);
+ struct spi_nor *nor = mtd_to_spi_nor(master);
+ struct device *dev;
+
+ if (nor->spimem)
+ dev = nor->spimem->spi->controller->dev.parent;
+ else
+ dev = nor->dev;
+
+ if (!try_module_get(dev->driver->owner))
+ return -ENODEV;
+
+ return 0;
+}
+
+static void spi_nor_put_device(struct mtd_info *mtd)
+{
+ struct mtd_info *master = mtd_get_master(mtd);
+ struct spi_nor *nor = mtd_to_spi_nor(master);
+ struct device *dev;
+
+ if (nor->spimem)
+ dev = nor->spimem->spi->controller->dev.parent;
+ else
+ dev = nor->dev;
+
+ module_put(dev->driver->owner);
+}
+
void spi_nor_restore(struct spi_nor *nor)
{
/* restore the addressing mode */
@@ -3099,6 +3130,8 @@ int spi_nor_scan(struct spi_nor *nor, const char *name,
mtd->_read = spi_nor_read;
mtd->_suspend = spi_nor_suspend;
mtd->_resume = spi_nor_resume;
+ mtd->_get_device = spi_nor_get_device;
+ mtd->_put_device = spi_nor_put_device;
if (info->flags & USE_FSR)
nor->flags |= SNOR_F_USE_FSR;
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 8c9af478c06bb1ab1422f90d8ecbc53defd44bc3 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt(a)goodmis.org>
Date: Wed, 5 May 2021 10:38:24 -0400
Subject: [PATCH] ftrace: Handle commands when closing set_ftrace_filter file
# echo switch_mm:traceoff > /sys/kernel/tracing/set_ftrace_filter
will cause switch_mm to stop tracing by the traceoff command.
# echo -n switch_mm:traceoff > /sys/kernel/tracing/set_ftrace_filter
does nothing.
The reason is that the parsing in the write function only processes
commands if it finished parsing (there is white space written after the
command). That's to handle:
write(fd, "switch_mm:", 10);
write(fd, "traceoff", 8);
cases, where the command is broken over multiple writes.
The problem is if the file descriptor is closed, then the write call is
not processed, and the command needs to be processed in the release code.
The release code can handle matching of functions, but does not handle
commands.
Cc: stable(a)vger.kernel.org
Fixes: eda1e32855656 ("tracing: handle broken names in ftrace filter")
Signed-off-by: Steven Rostedt (VMware) <rostedt(a)goodmis.org>
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 057e962ca5ce..c57508445faa 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5591,7 +5591,10 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
parser = &iter->parser;
if (trace_parser_loaded(parser)) {
- ftrace_match_records(iter->hash, parser->buffer, parser->idx);
+ int enable = !(iter->flags & FTRACE_ITER_NOTRACE);
+
+ ftrace_process_regex(iter, parser->buffer,
+ parser->idx, enable);
}
trace_parser_put(parser);
These tests deliberately access these arrays out of bounds,
which will cause the dynamic local bounds checks inserted by
CONFIG_UBSAN_LOCAL_BOUNDS to fail and panic the kernel. To avoid this
problem, access the arrays via volatile pointers, which will prevent
the compiler from being able to determine the array bounds.
These accesses use volatile pointers to char (char *volatile) rather
than the more conventional pointers to volatile char (volatile char *)
because we want to prevent the compiler from making inferences about
the pointer itself (i.e. its array bounds), not the data that it
refers to.
Signed-off-by: Peter Collingbourne <pcc(a)google.com>
Cc: stable(a)vger.kernel.org
Link: https://linux-review.googlesource.com/id/I90b1713fbfa1bf68ff895aef099ea77b9…
---
lib/test_kasan.c | 29 +++++++++++++++++++++++------
1 file changed, 23 insertions(+), 6 deletions(-)
diff --git a/lib/test_kasan.c b/lib/test_kasan.c
index dc05cfc2d12f..cacbbbdef768 100644
--- a/lib/test_kasan.c
+++ b/lib/test_kasan.c
@@ -654,8 +654,20 @@ static char global_array[10];
static void kasan_global_oob(struct kunit *test)
{
- volatile int i = 3;
- char *p = &global_array[ARRAY_SIZE(global_array) + i];
+ /*
+ * Deliberate out-of-bounds access. To prevent CONFIG_UBSAN_LOCAL_BOUNDS
+ * from failing here and panicing the kernel, access the array via a
+ * volatile pointer, which will prevent the compiler from being able to
+ * determine the array bounds.
+ *
+ * This access uses a volatile pointer to char (char *volatile) rather
+ * than the more conventional pointer to volatile char (volatile char *)
+ * because we want to prevent the compiler from making inferences about
+ * the pointer itself (i.e. its array bounds), not the data that it
+ * refers to.
+ */
+ char *volatile array = global_array;
+ char *p = &array[ARRAY_SIZE(global_array) + 3];
/* Only generic mode instruments globals. */
KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC);
@@ -703,8 +715,9 @@ static void ksize_uaf(struct kunit *test)
static void kasan_stack_oob(struct kunit *test)
{
char stack_array[10];
- volatile int i = OOB_TAG_OFF;
- char *p = &stack_array[ARRAY_SIZE(stack_array) + i];
+ /* See comment in kasan_global_oob. */
+ char *volatile array = stack_array;
+ char *p = &array[ARRAY_SIZE(stack_array) + OOB_TAG_OFF];
KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_STACK);
@@ -715,7 +728,9 @@ static void kasan_alloca_oob_left(struct kunit *test)
{
volatile int i = 10;
char alloca_array[i];
- char *p = alloca_array - 1;
+ /* See comment in kasan_global_oob. */
+ char *volatile array = alloca_array;
+ char *p = array - 1;
/* Only generic mode instruments dynamic allocas. */
KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC);
@@ -728,7 +743,9 @@ static void kasan_alloca_oob_right(struct kunit *test)
{
volatile int i = 10;
char alloca_array[i];
- char *p = alloca_array + i;
+ /* See comment in kasan_global_oob. */
+ char *volatile array = alloca_array;
+ char *p = array + i;
/* Only generic mode instruments dynamic allocas. */
KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC);
--
2.31.1.607.g51e8a6a459-goog
do_mq_timedreceive calls wq_sleep with a stack local address. The
sender (do_mq_timedsend) uses this address to later call
pipelined_send.
This leads to a very hard to trigger race where a do_mq_timedreceive call
might return and leave do_mq_timedsend to rely on an invalid address,
causing the following crash:
[ 240.739977] RIP: 0010:wake_q_add_safe+0x13/0x60
[ 240.739991] Call Trace:
[ 240.739999] __x64_sys_mq_timedsend+0x2a9/0x490
[ 240.740003] ? auditd_test_task+0x38/0x40
[ 240.740007] ? auditd_test_task+0x38/0x40
[ 240.740011] do_syscall_64+0x80/0x680
[ 240.740017] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 240.740019] RIP: 0033:0x7f5928e40343
The race occurs as:
1. do_mq_timedreceive calls wq_sleep with the address of
`struct ext_wait_queue` on function stack (aliased as `ewq_addr` here)
- it holds a valid `struct ext_wait_queue *` as long as the stack has
not been overwritten.
2. `ewq_addr` gets added to info->e_wait_q[RECV].list in wq_add, and
do_mq_timedsend receives it via wq_get_first_waiter(info, RECV) to call
__pipelined_op.
3. Sender calls __pipelined_op::smp_store_release(&this->state, STATE_READY).
Here is where the race window begins. (`this` is `ewq_addr`.)
4. If the receiver wakes up now in do_mq_timedreceive::wq_sleep, it
will see `state == STATE_READY` and break. `ewq_addr` gets removed from
info->e_wait_q[RECV].list.
5. do_mq_timedreceive returns, and `ewq_addr` is no longer guaranteed
to be a `struct ext_wait_queue *` since it was on do_mq_timedreceive's
stack. (Although the address may not get overwritten until another
function happens to touch it, which means it can persist around for an
indefinite time.)
6. do_mq_timedsend::__pipelined_op() still believes `ewq_addr` is a
`struct ext_wait_queue *`, and uses it to find a task_struct to pass
to the wake_q_add_safe call. In the lucky case where nothing has
overwritten `ewq_addr` yet, `ewq_addr->task` is the right task_struct.
In the unlucky case, __pipelined_op::wake_q_add_safe gets handed a
bogus address as the receiver's task_struct causing the crash.
do_mq_timedsend::__pipelined_op() should not dereference `this` after
setting STATE_READY, as the receiver counterpart is now free to return.
Change __pipelined_op to call wake_q_add before setting STATE_READY
which ensures that the receiver's task_struct can still be found via
`this`.
Fixes: c5b2cbdbdac563 ("ipc/mqueue.c: update/document memory barriers")
Signed-off-by: Varad Gautam <varad.gautam(a)suse.com>
Reported-by: Matthias von Faber <matthias.vonfaber(a)aox-tech.de>
Cc: <stable(a)vger.kernel.org> # 5.6
Cc: Christian Brauner <christian.brauner(a)ubuntu.com>
Cc: Oleg Nesterov <oleg(a)redhat.com>
Cc: "Eric W. Biederman" <ebiederm(a)xmission.com>
Cc: Manfred Spraul <manfred(a)colorfullife.com>
Cc: Andrew Morton <akpm(a)linux-foundation.org>
Cc: Davidlohr Bueso <dbueso(a)suse.de>
---
v2: Call wake_q_add before smp_store_release, instead of using a
get_task_struct/wake_q_add_safe combination across
smp_store_release. (Davidlohr Bueso)
ipc/mqueue.c | 33 ++++++++++++++++++++++++---------
1 file changed, 24 insertions(+), 9 deletions(-)
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 8031464ed4ae..bfcb6f81a824 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -78,11 +78,13 @@ struct posix_msg_tree_node {
* MQ_BARRIER:
* To achieve proper release/acquire memory barrier pairing, the state is set to
* STATE_READY with smp_store_release(), and it is read with READ_ONCE followed
- * by smp_acquire__after_ctrl_dep(). In addition, wake_q_add_safe() is used.
+ * by smp_acquire__after_ctrl_dep(). The state change to STATE_READY must be
+ * the last write operation, after which the blocked task can immediately
+ * return and exit.
*
* This prevents the following races:
*
- * 1) With the simple wake_q_add(), the task could be gone already before
+ * 1) With wake_q_add(), the task could be gone already before
* the increase of the reference happens
* Thread A
* Thread B
@@ -97,10 +99,25 @@ struct posix_msg_tree_node {
* sys_exit()
* get_task_struct() // UaF
*
- * Solution: Use wake_q_add_safe() and perform the get_task_struct() before
- * the smp_store_release() that does ->state = STATE_READY.
+ * 2) With wake_q_add(), the receiver task could have returned from the
+ * syscall and had its stack-allocated waiter overwritten before the
+ * sender could add it to the wake_q
+ * Thread A
+ * Thread B
+ * WRITE_ONCE(wait.state, STATE_NONE);
+ * schedule_hrtimeout()
+ * ->state = STATE_READY
+ * <timeout returns>
+ * if (wait.state == STATE_READY) return;
+ * sysret to user space
+ * overwrite receiver's stack
+ * wake_q_add(A)
+ * if (cmpxchg()) // corrupted waiter
*
- * 2) Without proper _release/_acquire barriers, the woken up task
+ * Solution: Queue the task for wakeup before the smp_store_release() that
+ * does ->state = STATE_READY.
+ *
+ * 3) Without proper _release/_acquire barriers, the woken up task
* could read stale data
*
* Thread A
@@ -116,7 +133,7 @@ struct posix_msg_tree_node {
*
* Solution: use _release and _acquire barriers.
*
- * 3) There is intentionally no barrier when setting current->state
+ * 4) There is intentionally no barrier when setting current->state
* to TASK_INTERRUPTIBLE: spin_unlock(&info->lock) provides the
* release memory barrier, and the wakeup is triggered when holding
* info->lock, i.e. spin_lock(&info->lock) provided a pairing
@@ -1005,11 +1022,9 @@ static inline void __pipelined_op(struct wake_q_head *wake_q,
struct ext_wait_queue *this)
{
list_del(&this->list);
- get_task_struct(this->task);
-
+ wake_q_add(wake_q, this->task);
/* see MQ_BARRIER for purpose/pairing */
smp_store_release(&this->state, STATE_READY);
- wake_q_add_safe(wake_q, this->task);
}
/* pipelined_send() - send a message directly to the task waiting in
--
2.30.2