The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 2c197870e4701610ec3b1143808d4e31152caf30 Mon Sep 17 00:00:00 2001
From: Julian Wiedmann <jwi(a)linux.ibm.com>
Date: Mon, 31 May 2021 18:40:06 +0300
Subject: [PATCH] s390/qdio: fix roll-back after timeout on ESTABLISH ccw
When qdio_establish() times out while waiting for the ESTABLISH ccw to
complete, it calls qdio_shutdown() to roll back all of its previous
actions. But at this point the qdio_irq's state is still
QDIO_IRQ_STATE_INACTIVE, so qdio_shutdown() will exit immediately
without doing any actual work.
Which means that eg. the qdio_irq's thinint-indicator stays registered,
and cdev->handler isn't restored to its old value. And since
commit 954d6235be41 ("s390/qdio: make thinint registration symmetric")
the qdio_irq also stays on the tiq_list, so on the next qdio_establish()
we might get a helpful BUG from the list-debugging code:
...
[ 4633.512591] list_add double add: new=00000000005a4110, prev=00000001b357db78, next=00000000005a4110.
[ 4633.512621] ------------[ cut here ]------------
[ 4633.512623] kernel BUG at lib/list_debug.c:29!
...
[ 4633.512796] [<00000001b2c6ee9a>] __list_add_valid+0x82/0xa0
[ 4633.512798] ([<00000001b2c6ee96>] __list_add_valid+0x7e/0xa0)
[ 4633.512800] [<00000001b2fcecce>] qdio_establish_thinint+0x116/0x190
[ 4633.512805] [<00000001b2fcbe58>] qdio_establish+0x128/0x498
...
Fix this by extracting a goto-chain from the existing error exits in
qdio_establish(), and check the return value of the wait_event_...()
to detect the timeout condition.
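For reference, wait_event_interruptible_timeout() returns the remaining
jiffies (> 0) once the condition becomes true, 0 on timeout, and
-ERESTARTSYS when interrupted by a signal. A minimal sketch of the
resulting errno mapping, as a hypothetical helper that is not part of the
patch itself:

/*
 * Illustrative helper only: translate the wait_event_interruptible_timeout()
 * return value into the errno used on the establish error path.
 */
static int establish_wait_to_errno(long ret)
{
	if (ret > 0)
		return 0;		/* condition became true within the timeout */
	if (ret == -ERESTARTSYS)
		return -EINTR;		/* interrupted by a signal */
	return -ETIME;			/* timed out, ESTABLISH did not complete */
}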
Fixes: 779e6e1c724d ("[S390] qdio: new qdio driver.")
Root-caused-by: Benjamin Block <bblock(a)linux.ibm.com>
Signed-off-by: Julian Wiedmann <jwi(a)linux.ibm.com>
Reviewed-by: Benjamin Block <bblock(a)linux.ibm.com>
Cc: <stable(a)vger.kernel.org> # 2.6.27
Signed-off-by: Heiko Carstens <hca(a)linux.ibm.com>
diff --git a/drivers/s390/cio/qdio_main.c b/drivers/s390/cio/qdio_main.c
index 3052fab00597..32c8c46b19b6 100644
--- a/drivers/s390/cio/qdio_main.c
+++ b/drivers/s390/cio/qdio_main.c
@@ -1083,6 +1083,7 @@ int qdio_establish(struct ccw_device *cdev,
{
struct qdio_irq *irq_ptr = cdev->private->qdio_data;
struct subchannel_id schid;
+ long timeout;
int rc;
ccw_device_get_schid(cdev, &schid);
@@ -1111,11 +1112,8 @@ int qdio_establish(struct ccw_device *cdev,
qdio_setup_irq(irq_ptr, init_data);
rc = qdio_establish_thinint(irq_ptr);
- if (rc) {
- qdio_shutdown_irq(irq_ptr);
- mutex_unlock(&irq_ptr->setup_mutex);
- return rc;
- }
+ if (rc)
+ goto err_thinint;
/* establish q */
irq_ptr->ccw.cmd_code = irq_ptr->equeue.cmd;
@@ -1131,15 +1129,16 @@ int qdio_establish(struct ccw_device *cdev,
if (rc) {
DBF_ERROR("%4x est IO ERR", irq_ptr->schid.sch_no);
DBF_ERROR("rc:%4x", rc);
- qdio_shutdown_thinint(irq_ptr);
- qdio_shutdown_irq(irq_ptr);
- mutex_unlock(&irq_ptr->setup_mutex);
- return rc;
+ goto err_ccw_start;
}
- wait_event_interruptible_timeout(cdev->private->wait_q,
- irq_ptr->state == QDIO_IRQ_STATE_ESTABLISHED ||
- irq_ptr->state == QDIO_IRQ_STATE_ERR, HZ);
+ timeout = wait_event_interruptible_timeout(cdev->private->wait_q,
+ irq_ptr->state == QDIO_IRQ_STATE_ESTABLISHED ||
+ irq_ptr->state == QDIO_IRQ_STATE_ERR, HZ);
+ if (timeout <= 0) {
+ rc = (timeout == -ERESTARTSYS) ? -EINTR : -ETIME;
+ goto err_ccw_timeout;
+ }
if (irq_ptr->state != QDIO_IRQ_STATE_ESTABLISHED) {
mutex_unlock(&irq_ptr->setup_mutex);
@@ -1156,6 +1155,14 @@ int qdio_establish(struct ccw_device *cdev,
qdio_print_subchannel_info(irq_ptr);
qdio_setup_debug_entries(irq_ptr);
return 0;
+
+err_ccw_timeout:
+err_ccw_start:
+ qdio_shutdown_thinint(irq_ptr);
+err_thinint:
+ qdio_shutdown_irq(irq_ptr);
+ mutex_unlock(&irq_ptr->setup_mutex);
+ return rc;
}
EXPORT_SYMBOL_GPL(qdio_establish);
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 2c197870e4701610ec3b1143808d4e31152caf30 Mon Sep 17 00:00:00 2001
From: Julian Wiedmann <jwi(a)linux.ibm.com>
Date: Mon, 31 May 2021 18:40:06 +0300
Subject: [PATCH] s390/qdio: fix roll-back after timeout on ESTABLISH ccw
When qdio_establish() times out while waiting for the ESTABLISH ccw to
complete, it calls qdio_shutdown() to roll back all of its previous
actions. But at this point the qdio_irq's state is still
QDIO_IRQ_STATE_INACTIVE, so qdio_shutdown() will exit immediately
without doing any actual work.
Which means that eg. the qdio_irq's thinint-indicator stays registered,
and cdev->handler isn't restored to its old value. And since
commit 954d6235be41 ("s390/qdio: make thinint registration symmetric")
the qdio_irq also stays on the tiq_list, so on the next qdio_establish()
we might get a helpful BUG from the list-debugging code:
...
[ 4633.512591] list_add double add: new=00000000005a4110, prev=00000001b357db78, next=00000000005a4110.
[ 4633.512621] ------------[ cut here ]------------
[ 4633.512623] kernel BUG at lib/list_debug.c:29!
...
[ 4633.512796] [<00000001b2c6ee9a>] __list_add_valid+0x82/0xa0
[ 4633.512798] ([<00000001b2c6ee96>] __list_add_valid+0x7e/0xa0)
[ 4633.512800] [<00000001b2fcecce>] qdio_establish_thinint+0x116/0x190
[ 4633.512805] [<00000001b2fcbe58>] qdio_establish+0x128/0x498
...
Fix this by extracting a goto-chain from the existing error exits in
qdio_establish(), and check the return value of the wait_event_...()
to detect the timeout condition.
Fixes: 779e6e1c724d ("[S390] qdio: new qdio driver.")
Root-caused-by: Benjamin Block <bblock(a)linux.ibm.com>
Signed-off-by: Julian Wiedmann <jwi(a)linux.ibm.com>
Reviewed-by: Benjamin Block <bblock(a)linux.ibm.com>
Cc: <stable(a)vger.kernel.org> # 2.6.27
Signed-off-by: Heiko Carstens <hca(a)linux.ibm.com>
diff --git a/drivers/s390/cio/qdio_main.c b/drivers/s390/cio/qdio_main.c
index 3052fab00597..32c8c46b19b6 100644
--- a/drivers/s390/cio/qdio_main.c
+++ b/drivers/s390/cio/qdio_main.c
@@ -1083,6 +1083,7 @@ int qdio_establish(struct ccw_device *cdev,
{
struct qdio_irq *irq_ptr = cdev->private->qdio_data;
struct subchannel_id schid;
+ long timeout;
int rc;
ccw_device_get_schid(cdev, &schid);
@@ -1111,11 +1112,8 @@ int qdio_establish(struct ccw_device *cdev,
qdio_setup_irq(irq_ptr, init_data);
rc = qdio_establish_thinint(irq_ptr);
- if (rc) {
- qdio_shutdown_irq(irq_ptr);
- mutex_unlock(&irq_ptr->setup_mutex);
- return rc;
- }
+ if (rc)
+ goto err_thinint;
/* establish q */
irq_ptr->ccw.cmd_code = irq_ptr->equeue.cmd;
@@ -1131,15 +1129,16 @@ int qdio_establish(struct ccw_device *cdev,
if (rc) {
DBF_ERROR("%4x est IO ERR", irq_ptr->schid.sch_no);
DBF_ERROR("rc:%4x", rc);
- qdio_shutdown_thinint(irq_ptr);
- qdio_shutdown_irq(irq_ptr);
- mutex_unlock(&irq_ptr->setup_mutex);
- return rc;
+ goto err_ccw_start;
}
- wait_event_interruptible_timeout(cdev->private->wait_q,
- irq_ptr->state == QDIO_IRQ_STATE_ESTABLISHED ||
- irq_ptr->state == QDIO_IRQ_STATE_ERR, HZ);
+ timeout = wait_event_interruptible_timeout(cdev->private->wait_q,
+ irq_ptr->state == QDIO_IRQ_STATE_ESTABLISHED ||
+ irq_ptr->state == QDIO_IRQ_STATE_ERR, HZ);
+ if (timeout <= 0) {
+ rc = (timeout == -ERESTARTSYS) ? -EINTR : -ETIME;
+ goto err_ccw_timeout;
+ }
if (irq_ptr->state != QDIO_IRQ_STATE_ESTABLISHED) {
mutex_unlock(&irq_ptr->setup_mutex);
@@ -1156,6 +1155,14 @@ int qdio_establish(struct ccw_device *cdev,
qdio_print_subchannel_info(irq_ptr);
qdio_setup_debug_entries(irq_ptr);
return 0;
+
+err_ccw_timeout:
+err_ccw_start:
+ qdio_shutdown_thinint(irq_ptr);
+err_thinint:
+ qdio_shutdown_irq(irq_ptr);
+ mutex_unlock(&irq_ptr->setup_mutex);
+ return rc;
}
EXPORT_SYMBOL_GPL(qdio_establish);
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 2c197870e4701610ec3b1143808d4e31152caf30 Mon Sep 17 00:00:00 2001
From: Julian Wiedmann <jwi(a)linux.ibm.com>
Date: Mon, 31 May 2021 18:40:06 +0300
Subject: [PATCH] s390/qdio: fix roll-back after timeout on ESTABLISH ccw
When qdio_establish() times out while waiting for the ESTABLISH ccw to
complete, it calls qdio_shutdown() to roll back all of its previous
actions. But at this point the qdio_irq's state is still
QDIO_IRQ_STATE_INACTIVE, so qdio_shutdown() will exit immediately
without doing any actual work.
Which means that eg. the qdio_irq's thinint-indicator stays registered,
and cdev->handler isn't restored to its old value. And since
commit 954d6235be41 ("s390/qdio: make thinint registration symmetric")
the qdio_irq also stays on the tiq_list, so on the next qdio_establish()
we might get a helpful BUG from the list-debugging code:
...
[ 4633.512591] list_add double add: new=00000000005a4110, prev=00000001b357db78, next=00000000005a4110.
[ 4633.512621] ------------[ cut here ]------------
[ 4633.512623] kernel BUG at lib/list_debug.c:29!
...
[ 4633.512796] [<00000001b2c6ee9a>] __list_add_valid+0x82/0xa0
[ 4633.512798] ([<00000001b2c6ee96>] __list_add_valid+0x7e/0xa0)
[ 4633.512800] [<00000001b2fcecce>] qdio_establish_thinint+0x116/0x190
[ 4633.512805] [<00000001b2fcbe58>] qdio_establish+0x128/0x498
...
Fix this by extracting a goto-chain from the existing error exits in
qdio_establish(), and check the return value of the wait_event_...()
to detect the timeout condition.
Fixes: 779e6e1c724d ("[S390] qdio: new qdio driver.")
Root-caused-by: Benjamin Block <bblock(a)linux.ibm.com>
Signed-off-by: Julian Wiedmann <jwi(a)linux.ibm.com>
Reviewed-by: Benjamin Block <bblock(a)linux.ibm.com>
Cc: <stable(a)vger.kernel.org> # 2.6.27
Signed-off-by: Heiko Carstens <hca(a)linux.ibm.com>
diff --git a/drivers/s390/cio/qdio_main.c b/drivers/s390/cio/qdio_main.c
index 3052fab00597..32c8c46b19b6 100644
--- a/drivers/s390/cio/qdio_main.c
+++ b/drivers/s390/cio/qdio_main.c
@@ -1083,6 +1083,7 @@ int qdio_establish(struct ccw_device *cdev,
{
struct qdio_irq *irq_ptr = cdev->private->qdio_data;
struct subchannel_id schid;
+ long timeout;
int rc;
ccw_device_get_schid(cdev, &schid);
@@ -1111,11 +1112,8 @@ int qdio_establish(struct ccw_device *cdev,
qdio_setup_irq(irq_ptr, init_data);
rc = qdio_establish_thinint(irq_ptr);
- if (rc) {
- qdio_shutdown_irq(irq_ptr);
- mutex_unlock(&irq_ptr->setup_mutex);
- return rc;
- }
+ if (rc)
+ goto err_thinint;
/* establish q */
irq_ptr->ccw.cmd_code = irq_ptr->equeue.cmd;
@@ -1131,15 +1129,16 @@ int qdio_establish(struct ccw_device *cdev,
if (rc) {
DBF_ERROR("%4x est IO ERR", irq_ptr->schid.sch_no);
DBF_ERROR("rc:%4x", rc);
- qdio_shutdown_thinint(irq_ptr);
- qdio_shutdown_irq(irq_ptr);
- mutex_unlock(&irq_ptr->setup_mutex);
- return rc;
+ goto err_ccw_start;
}
- wait_event_interruptible_timeout(cdev->private->wait_q,
- irq_ptr->state == QDIO_IRQ_STATE_ESTABLISHED ||
- irq_ptr->state == QDIO_IRQ_STATE_ERR, HZ);
+ timeout = wait_event_interruptible_timeout(cdev->private->wait_q,
+ irq_ptr->state == QDIO_IRQ_STATE_ESTABLISHED ||
+ irq_ptr->state == QDIO_IRQ_STATE_ERR, HZ);
+ if (timeout <= 0) {
+ rc = (timeout == -ERESTARTSYS) ? -EINTR : -ETIME;
+ goto err_ccw_timeout;
+ }
if (irq_ptr->state != QDIO_IRQ_STATE_ESTABLISHED) {
mutex_unlock(&irq_ptr->setup_mutex);
@@ -1156,6 +1155,14 @@ int qdio_establish(struct ccw_device *cdev,
qdio_print_subchannel_info(irq_ptr);
qdio_setup_debug_entries(irq_ptr);
return 0;
+
+err_ccw_timeout:
+err_ccw_start:
+ qdio_shutdown_thinint(irq_ptr);
+err_thinint:
+ qdio_shutdown_irq(irq_ptr);
+ mutex_unlock(&irq_ptr->setup_mutex);
+ return rc;
}
EXPORT_SYMBOL_GPL(qdio_establish);
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 2c197870e4701610ec3b1143808d4e31152caf30 Mon Sep 17 00:00:00 2001
From: Julian Wiedmann <jwi(a)linux.ibm.com>
Date: Mon, 31 May 2021 18:40:06 +0300
Subject: [PATCH] s390/qdio: fix roll-back after timeout on ESTABLISH ccw
When qdio_establish() times out while waiting for the ESTABLISH ccw to
complete, it calls qdio_shutdown() to roll back all of its previous
actions. But at this point the qdio_irq's state is still
QDIO_IRQ_STATE_INACTIVE, so qdio_shutdown() will exit immediately
without doing any actual work.
Which means that eg. the qdio_irq's thinint-indicator stays registered,
and cdev->handler isn't restored to its old value. And since
commit 954d6235be41 ("s390/qdio: make thinint registration symmetric")
the qdio_irq also stays on the tiq_list, so on the next qdio_establish()
we might get a helpful BUG from the list-debugging code:
...
[ 4633.512591] list_add double add: new=00000000005a4110, prev=00000001b357db78, next=00000000005a4110.
[ 4633.512621] ------------[ cut here ]------------
[ 4633.512623] kernel BUG at lib/list_debug.c:29!
...
[ 4633.512796] [<00000001b2c6ee9a>] __list_add_valid+0x82/0xa0
[ 4633.512798] ([<00000001b2c6ee96>] __list_add_valid+0x7e/0xa0)
[ 4633.512800] [<00000001b2fcecce>] qdio_establish_thinint+0x116/0x190
[ 4633.512805] [<00000001b2fcbe58>] qdio_establish+0x128/0x498
...
Fix this by extracting a goto-chain from the existing error exits in
qdio_establish(), and check the return value of the wait_event_...()
to detect the timeout condition.
Fixes: 779e6e1c724d ("[S390] qdio: new qdio driver.")
Root-caused-by: Benjamin Block <bblock(a)linux.ibm.com>
Signed-off-by: Julian Wiedmann <jwi(a)linux.ibm.com>
Reviewed-by: Benjamin Block <bblock(a)linux.ibm.com>
Cc: <stable(a)vger.kernel.org> # 2.6.27
Signed-off-by: Heiko Carstens <hca(a)linux.ibm.com>
diff --git a/drivers/s390/cio/qdio_main.c b/drivers/s390/cio/qdio_main.c
index 3052fab00597..32c8c46b19b6 100644
--- a/drivers/s390/cio/qdio_main.c
+++ b/drivers/s390/cio/qdio_main.c
@@ -1083,6 +1083,7 @@ int qdio_establish(struct ccw_device *cdev,
{
struct qdio_irq *irq_ptr = cdev->private->qdio_data;
struct subchannel_id schid;
+ long timeout;
int rc;
ccw_device_get_schid(cdev, &schid);
@@ -1111,11 +1112,8 @@ int qdio_establish(struct ccw_device *cdev,
qdio_setup_irq(irq_ptr, init_data);
rc = qdio_establish_thinint(irq_ptr);
- if (rc) {
- qdio_shutdown_irq(irq_ptr);
- mutex_unlock(&irq_ptr->setup_mutex);
- return rc;
- }
+ if (rc)
+ goto err_thinint;
/* establish q */
irq_ptr->ccw.cmd_code = irq_ptr->equeue.cmd;
@@ -1131,15 +1129,16 @@ int qdio_establish(struct ccw_device *cdev,
if (rc) {
DBF_ERROR("%4x est IO ERR", irq_ptr->schid.sch_no);
DBF_ERROR("rc:%4x", rc);
- qdio_shutdown_thinint(irq_ptr);
- qdio_shutdown_irq(irq_ptr);
- mutex_unlock(&irq_ptr->setup_mutex);
- return rc;
+ goto err_ccw_start;
}
- wait_event_interruptible_timeout(cdev->private->wait_q,
- irq_ptr->state == QDIO_IRQ_STATE_ESTABLISHED ||
- irq_ptr->state == QDIO_IRQ_STATE_ERR, HZ);
+ timeout = wait_event_interruptible_timeout(cdev->private->wait_q,
+ irq_ptr->state == QDIO_IRQ_STATE_ESTABLISHED ||
+ irq_ptr->state == QDIO_IRQ_STATE_ERR, HZ);
+ if (timeout <= 0) {
+ rc = (timeout == -ERESTARTSYS) ? -EINTR : -ETIME;
+ goto err_ccw_timeout;
+ }
if (irq_ptr->state != QDIO_IRQ_STATE_ESTABLISHED) {
mutex_unlock(&irq_ptr->setup_mutex);
@@ -1156,6 +1155,14 @@ int qdio_establish(struct ccw_device *cdev,
qdio_print_subchannel_info(irq_ptr);
qdio_setup_debug_entries(irq_ptr);
return 0;
+
+err_ccw_timeout:
+err_ccw_start:
+ qdio_shutdown_thinint(irq_ptr);
+err_thinint:
+ qdio_shutdown_irq(irq_ptr);
+ mutex_unlock(&irq_ptr->setup_mutex);
+ return rc;
}
EXPORT_SYMBOL_GPL(qdio_establish);
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 2c197870e4701610ec3b1143808d4e31152caf30 Mon Sep 17 00:00:00 2001
From: Julian Wiedmann <jwi(a)linux.ibm.com>
Date: Mon, 31 May 2021 18:40:06 +0300
Subject: [PATCH] s390/qdio: fix roll-back after timeout on ESTABLISH ccw
When qdio_establish() times out while waiting for the ESTABLISH ccw to
complete, it calls qdio_shutdown() to roll back all of its previous
actions. But at this point the qdio_irq's state is still
QDIO_IRQ_STATE_INACTIVE, so qdio_shutdown() will exit immediately
without doing any actual work.
Which means that eg. the qdio_irq's thinint-indicator stays registered,
and cdev->handler isn't restored to its old value. And since
commit 954d6235be41 ("s390/qdio: make thinint registration symmetric")
the qdio_irq also stays on the tiq_list, so on the next qdio_establish()
we might get a helpful BUG from the list-debugging code:
...
[ 4633.512591] list_add double add: new=00000000005a4110, prev=00000001b357db78, next=00000000005a4110.
[ 4633.512621] ------------[ cut here ]------------
[ 4633.512623] kernel BUG at lib/list_debug.c:29!
...
[ 4633.512796] [<00000001b2c6ee9a>] __list_add_valid+0x82/0xa0
[ 4633.512798] ([<00000001b2c6ee96>] __list_add_valid+0x7e/0xa0)
[ 4633.512800] [<00000001b2fcecce>] qdio_establish_thinint+0x116/0x190
[ 4633.512805] [<00000001b2fcbe58>] qdio_establish+0x128/0x498
...
Fix this by extracting a goto-chain from the existing error exits in
qdio_establish(), and check the return value of the wait_event_...()
to detect the timeout condition.
Fixes: 779e6e1c724d ("[S390] qdio: new qdio driver.")
Root-caused-by: Benjamin Block <bblock(a)linux.ibm.com>
Signed-off-by: Julian Wiedmann <jwi(a)linux.ibm.com>
Reviewed-by: Benjamin Block <bblock(a)linux.ibm.com>
Cc: <stable(a)vger.kernel.org> # 2.6.27
Signed-off-by: Heiko Carstens <hca(a)linux.ibm.com>
diff --git a/drivers/s390/cio/qdio_main.c b/drivers/s390/cio/qdio_main.c
index 3052fab00597..32c8c46b19b6 100644
--- a/drivers/s390/cio/qdio_main.c
+++ b/drivers/s390/cio/qdio_main.c
@@ -1083,6 +1083,7 @@ int qdio_establish(struct ccw_device *cdev,
{
struct qdio_irq *irq_ptr = cdev->private->qdio_data;
struct subchannel_id schid;
+ long timeout;
int rc;
ccw_device_get_schid(cdev, &schid);
@@ -1111,11 +1112,8 @@ int qdio_establish(struct ccw_device *cdev,
qdio_setup_irq(irq_ptr, init_data);
rc = qdio_establish_thinint(irq_ptr);
- if (rc) {
- qdio_shutdown_irq(irq_ptr);
- mutex_unlock(&irq_ptr->setup_mutex);
- return rc;
- }
+ if (rc)
+ goto err_thinint;
/* establish q */
irq_ptr->ccw.cmd_code = irq_ptr->equeue.cmd;
@@ -1131,15 +1129,16 @@ int qdio_establish(struct ccw_device *cdev,
if (rc) {
DBF_ERROR("%4x est IO ERR", irq_ptr->schid.sch_no);
DBF_ERROR("rc:%4x", rc);
- qdio_shutdown_thinint(irq_ptr);
- qdio_shutdown_irq(irq_ptr);
- mutex_unlock(&irq_ptr->setup_mutex);
- return rc;
+ goto err_ccw_start;
}
- wait_event_interruptible_timeout(cdev->private->wait_q,
- irq_ptr->state == QDIO_IRQ_STATE_ESTABLISHED ||
- irq_ptr->state == QDIO_IRQ_STATE_ERR, HZ);
+ timeout = wait_event_interruptible_timeout(cdev->private->wait_q,
+ irq_ptr->state == QDIO_IRQ_STATE_ESTABLISHED ||
+ irq_ptr->state == QDIO_IRQ_STATE_ERR, HZ);
+ if (timeout <= 0) {
+ rc = (timeout == -ERESTARTSYS) ? -EINTR : -ETIME;
+ goto err_ccw_timeout;
+ }
if (irq_ptr->state != QDIO_IRQ_STATE_ESTABLISHED) {
mutex_unlock(&irq_ptr->setup_mutex);
@@ -1156,6 +1155,14 @@ int qdio_establish(struct ccw_device *cdev,
qdio_print_subchannel_info(irq_ptr);
qdio_setup_debug_entries(irq_ptr);
return 0;
+
+err_ccw_timeout:
+err_ccw_start:
+ qdio_shutdown_thinint(irq_ptr);
+err_thinint:
+ qdio_shutdown_irq(irq_ptr);
+ mutex_unlock(&irq_ptr->setup_mutex);
+ return rc;
}
EXPORT_SYMBOL_GPL(qdio_establish);
Backport summary
----------------
679c782de14b ("bpf/verifier: per-register parent pointers")
* Context patch for 2039f26f3aca5 ("bpf: Fix leakage due to
insufficient speculative store bypass mitigation").
* Context adjustments because of the code added by post-4.19 commit:
f92a819b4cbef ("bpf: prevent out of bounds speculation on pointer
arithmetic").
0bae2d4d62d5 ("bpf: correct slot_type marking logic to allow more stack slot sharing")
* Context patch for 2039f26f3aca5 ("bpf: Fix leakage due to
insufficient speculative store bypass mitigation").
* Minor context adjustment in selftest.
2011fccfb61b ("bpf: Support variable offset stack access from helpers")
* Context patch for 2039f26f3aca5 ("bpf: Fix leakage due to
insufficient speculative store bypass mitigation").
* 4.19 does not have the reg_state(env, regno) helper defined, so
replace the call with "cur_regs(env) + regno" (see the sketch after
this list).
f2bcd05ec7b8 ("bpf: Reject indirect var_off stack access in raw mode")
* Follow-up fix for 2011fccfb61bb ("bpf: Support variable offset stack
access from helpers").
* Clean cherry-pick.
088ec26d9c2d ("bpf: Reject indirect var_off stack access in unpriv")
* Follow-up fix for 2011fccfb61bb ("bpf: Support variable offset stack
access from helpers").
* Drop comment in retrieve_ptr_limit(), as it was made obsolete by
post-4.19 commit 45bfdd767e235 ("bpf: Tighten speculative pointer
arithmetic mask").
107c26a70ca8 ("bpf: Sanity check max value for var_off stack access")
* Follow-up fix for 2011fccfb61bb ("bpf: Support variable offset stack
access from helpers").
* Clean cherry-pick.
8ff80e96e3cc ("selftests/bpf: Test variable offset stack access")
* Selftest follow-up for 2011fccfb61bb ("bpf: Support variable offset
stack access from helpers").
* Post-4.19, the verifier tests were split into different
files, in 4.19 they are still all in test_verifier.c, so apply the
changes manually.
f7cf25b2026d ("bpf: track spill/fill of constants")
* Context patch for 2039f26f3aca5 ("bpf: Fix leakage due to
insufficient speculative store bypass mitigation").
* Drop verbose_linfo() calls, as the function is not implemented in 4.19.
* Adjust mark_reg_read() calls to match the prototype in 4.19.
(mark_reg_read() was changed to take 4 parameters in post-4.19 commit
5327ed3d44b75 ("bpf: verifier: mark verified-insn with sub-register
zext flag"), but backporting it is out of scope for this patch series).
fc559a70d57c ("selftests/bpf: fix tests due to const spill/fill")
* Selftest follow-up for f7cf25b2026d ("bpf: track spill/fill of constants").
* Post-4.19, the verifier tests were split into different
files, in 4.19 they are still all in test_verifier.c, so apply the
changes manually.
f5e81d111750 ("bpf: Introduce BPF nospec instruction for mitigating Spectre v4")
* Contextual adjustments.
* Drop arch/powerpc/net/bpf_jit_comp32.c changes, as the file is not
present in 4.19.
* Drop riscv changes, as the arch/riscv/net/bpf_jit_comp.c file is not
present in 4.19.
2039f26f3aca ("bpf: Fix leakage due to insufficient speculative store bypass mitigation")
* Contextual adjustments.
* Apply check_stack_write_fixed_off() changes in check_stack_write().
* Replace env->bypass_spec_v4 -> env->allow_ptr_leaks.
c9e73e3d2b1e ("bpf: verifier: Allocate idmap scratch in verifier env")
e042aa532c84 ("bpf: Fix pointer arithmetic mask tightening under state pruning")
* Contextual adjustments.
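As context for the reg_state() substitution noted above for commit
2011fccfb61b: the post-4.19 helper is a one-line wrapper around
cur_regs(), so the open-coded 4.19 form is equivalent. A sketch of the
helper, reproduced from memory rather than verbatim:

/* Sketch of the post-4.19 helper; 4.19 open-codes the same expression. */
static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno)
{
	return cur_regs(env) + regno;	/* equivalent to the 4.19 substitution */
}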
With this patch series, all bpf verifier selftests pass (tested in qemu for x86_64):
root@intel-x86-64:~# ./test_verifier
...
#663/p pass modified ctx pointer to helper, 3 OK
#664/p mov64 src == dst OK
#665/p mov64 src != dst OK
#666/u calls: ctx read at start of subprog OK
#666/p calls: ctx read at start of subprog OK
Summary: 932 PASSED, 0 SKIPPED, 0 FAILED
Alexei Starovoitov (2):
bpf: track spill/fill of constants
selftests/bpf: fix tests due to const spill/fill
Andrey Ignatov (5):
bpf: Support variable offset stack access from helpers
bpf: Reject indirect var_off stack access in raw mode
bpf: Reject indirect var_off stack access in unpriv mode
bpf: Sanity check max value for var_off stack access
selftests/bpf: Test variable offset stack access
Daniel Borkmann (3):
bpf: Introduce BPF nospec instruction for mitigating Spectre v4
bpf: Fix leakage due to insufficient speculative store bypass
mitigation
bpf: Fix pointer arithmetic mask tightening under state pruning
Edward Cree (1):
bpf/verifier: per-register parent pointers
Jiong Wang (1):
bpf: correct slot_type marking logic to allow more stack slot sharing
Lorenz Bauer (1):
bpf: verifier: Allocate idmap scratch in verifier env
arch/arm/net/bpf_jit_32.c | 3 +
arch/arm64/net/bpf_jit_comp.c | 13 +
arch/mips/net/ebpf_jit.c | 3 +
arch/powerpc/net/bpf_jit_comp64.c | 6 +
arch/s390/net/bpf_jit_comp.c | 5 +
arch/sparc/net/bpf_jit_comp_64.c | 3 +
arch/x86/net/bpf_jit_comp.c | 7 +
arch/x86/net/bpf_jit_comp32.c | 6 +
include/linux/bpf_verifier.h | 19 +-
include/linux/filter.h | 15 +
kernel/bpf/core.c | 18 +-
kernel/bpf/disasm.c | 16 +-
kernel/bpf/verifier.c | 498 ++++++++++----------
tools/testing/selftests/bpf/test_verifier.c | 144 +++++-
14 files changed, 468 insertions(+), 288 deletions(-)
--
2.25.1
From: Joerg Roedel <jroedel(a)suse.de>
The trampoline_pgd only maps the 0xfffffff000000000-0xffffffffffffffff
range of kernel memory (with 4-level paging). This range contains the
kernel's text+data+bss mappings and the module mapping space, but not the
direct mapping and the vmalloc area.
This is enough to get an application processor out of real-mode, but
for code that switches back to real-mode the trampoline_pgd is missing
important parts of the address space. For example, consider this code
from arch/x86/kernel/reboot.c, function machine_real_restart() for a
64-bit kernel:
#ifdef CONFIG_X86_32
load_cr3(initial_page_table);
#else
write_cr3(real_mode_header->trampoline_pgd);
/* Exiting long mode will fail if CR4.PCIDE is set. */
if (boot_cpu_has(X86_FEATURE_PCID))
cr4_clear_bits(X86_CR4_PCIDE);
#endif
/* Jump to the identity-mapped low memory code */
#ifdef CONFIG_X86_32
asm volatile("jmpl *%0" : :
"rm" (real_mode_header->machine_real_restart_asm),
"a" (type));
#else
asm volatile("ljmpl *%0" : :
"m" (real_mode_header->machine_real_restart_asm),
"D" (type));
#endif
The code switches to the trampoline_pgd, which unmaps the direct mapping
and also the kernel stack. The call to cr4_clear_bits() will find no
stack and crash the machine. The real_mode_header pointer below points
into the direct mapping, and dereferencing it also causes a crash.
The reason this does not always crash is only that kernel mappings are
global and the CR3 switch does not flush those mappings. But if these
mappings are not in the TLB already, the above code will crash before it
can jump to the real-mode stub.
Extend the trampoline_pgd to contain all kernel mappings to prevent
these crashes and to make code which runs on this page-table more
robust.
Cc: stable(a)vger.kernel.org
Signed-off-by: Joerg Roedel <jroedel(a)suse.de>
---
arch/x86/realmode/init.c | 12 +++++++++++-
1 file changed, 11 insertions(+), 1 deletion(-)
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index 31b5856010cb..7a08c96cb42a 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -72,6 +72,7 @@ static void __init setup_real_mode(void)
#ifdef CONFIG_X86_64
u64 *trampoline_pgd;
u64 efer;
+ int i;
#endif
base = (unsigned char *)real_mode_header;
@@ -128,8 +129,17 @@ static void __init setup_real_mode(void)
trampoline_header->flags = 0;
trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
+
+ /*
+ * Map all of kernel memory into the trampoline PGD so that it includes
+ * the direct mapping and vmalloc space. This is needed to keep the
+ * stack and real_mode_header mapped when switching to this page table.
+ */
+ for (i = pgd_index(__PAGE_OFFSET); i < PTRS_PER_PGD; i++)
+ trampoline_pgd[i] = init_top_pgt[i].pgd;
+
+ /* Map the real mode stub as virtual == physical */
trampoline_pgd[0] = trampoline_pgd_entry.pgd;
- trampoline_pgd[511] = init_top_pgt[511].pgd;
#endif
sme_sev_setup_real_mode(trampoline_header);
--
2.33.0
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 6f93e834fa7c5faa0372e46828b4b2a966ac61d7 Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain(a)oracle.com>
Date: Tue, 10 Aug 2021 23:23:44 +0800
Subject: [PATCH] btrfs: fix upper limit for max_inline for page size 64K
The mount option max_inline ranges from 0 to the sectorsize (which is
now equal to page size). But we parse the mount options too early and
before the actual sectorsize is read from the superblock. So the upper
limit of max_inline is unaware of the actual sectorsize and is limited
by the temporary sectorsize 4096, even on a system where the default
sectorsize is 64K.
Fix this by reading the superblock sectorsize before the mount option
parse.
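To illustrate why the ordering matters: the option parser clamps
max_inline against fs_info->sectorsize, so the superblock value has to be
in place before the options are parsed. A simplified sketch of that clamp
(assumed behaviour, not the exact parser code):

/*
 * Simplified illustration: if sectorsize still holds the temporary 4K value
 * when the options are parsed, a requested max_inline of 64K is clamped to
 * 4K even though the filesystem's real sectorsize is 64K.
 */
static u64 clamp_max_inline(u64 requested, u64 sectorsize)
{
	return requested < sectorsize ? requested : sectorsize;
}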
Reported-by: Alexander Tsvetkov <alexander.tsvetkov(a)oracle.com>
CC: stable(a)vger.kernel.org # 5.4+
Signed-off-by: Anand Jain <anand.jain(a)oracle.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2f9515dccce0..355ea88d5c5f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3314,6 +3314,30 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
*/
fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
+ /*
+ * Flag our filesystem as having big metadata blocks if they are bigger
+ * than the page size.
+ */
+ if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) {
+ if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
+ btrfs_info(fs_info,
+ "flagging fs with big metadata feature");
+ features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
+ }
+
+ /* Set up fs_info before parsing mount options */
+ nodesize = btrfs_super_nodesize(disk_super);
+ sectorsize = btrfs_super_sectorsize(disk_super);
+ stripesize = sectorsize;
+ fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
+ fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
+
+ fs_info->nodesize = nodesize;
+ fs_info->sectorsize = sectorsize;
+ fs_info->sectorsize_bits = ilog2(sectorsize);
+ fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
+ fs_info->stripesize = stripesize;
+
ret = btrfs_parse_options(fs_info, options, sb->s_flags);
if (ret) {
err = ret;
@@ -3340,30 +3364,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
btrfs_info(fs_info, "has skinny extents");
- /*
- * flag our filesystem as having big metadata blocks if
- * they are bigger than the page size
- */
- if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) {
- if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
- btrfs_info(fs_info,
- "flagging fs with big metadata feature");
- features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
- }
-
- nodesize = btrfs_super_nodesize(disk_super);
- sectorsize = btrfs_super_sectorsize(disk_super);
- stripesize = sectorsize;
- fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
- fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
-
- /* Cache block sizes */
- fs_info->nodesize = nodesize;
- fs_info->sectorsize = sectorsize;
- fs_info->sectorsize_bits = ilog2(sectorsize);
- fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
- fs_info->stripesize = stripesize;
-
/*
* mixed block groups end up with duplicate but slightly offset
* extent buffers for the same range. It leads to corruptions
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 6f93e834fa7c5faa0372e46828b4b2a966ac61d7 Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain(a)oracle.com>
Date: Tue, 10 Aug 2021 23:23:44 +0800
Subject: [PATCH] btrfs: fix upper limit for max_inline for page size 64K
The mount option max_inline ranges from 0 to the sectorsize (which is
now equal to page size). But we parse the mount options too early and
before the actual sectorsize is read from the superblock. So the upper
limit of max_inline is unaware of the actual sectorsize and is limited
by the temporary sectorsize 4096, even on a system where the default
sectorsize is 64K.
Fix this by reading the superblock sectorsize before the mount option
parse.
Reported-by: Alexander Tsvetkov <alexander.tsvetkov(a)oracle.com>
CC: stable(a)vger.kernel.org # 5.4+
Signed-off-by: Anand Jain <anand.jain(a)oracle.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2f9515dccce0..355ea88d5c5f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3314,6 +3314,30 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
*/
fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
+ /*
+ * Flag our filesystem as having big metadata blocks if they are bigger
+ * than the page size.
+ */
+ if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) {
+ if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
+ btrfs_info(fs_info,
+ "flagging fs with big metadata feature");
+ features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
+ }
+
+ /* Set up fs_info before parsing mount options */
+ nodesize = btrfs_super_nodesize(disk_super);
+ sectorsize = btrfs_super_sectorsize(disk_super);
+ stripesize = sectorsize;
+ fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
+ fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
+
+ fs_info->nodesize = nodesize;
+ fs_info->sectorsize = sectorsize;
+ fs_info->sectorsize_bits = ilog2(sectorsize);
+ fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
+ fs_info->stripesize = stripesize;
+
ret = btrfs_parse_options(fs_info, options, sb->s_flags);
if (ret) {
err = ret;
@@ -3340,30 +3364,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
btrfs_info(fs_info, "has skinny extents");
- /*
- * flag our filesystem as having big metadata blocks if
- * they are bigger than the page size
- */
- if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) {
- if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
- btrfs_info(fs_info,
- "flagging fs with big metadata feature");
- features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
- }
-
- nodesize = btrfs_super_nodesize(disk_super);
- sectorsize = btrfs_super_sectorsize(disk_super);
- stripesize = sectorsize;
- fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
- fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
-
- /* Cache block sizes */
- fs_info->nodesize = nodesize;
- fs_info->sectorsize = sectorsize;
- fs_info->sectorsize_bits = ilog2(sectorsize);
- fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
- fs_info->stripesize = stripesize;
-
/*
* mixed block groups end up with duplicate but slightly offset
* extent buffers for the same range. It leads to corruptions
Hi,
please add the following patches to the 5.14 queue. All apply cleanly, thanks.
03fe78cc2942c55 btrfs: use delalloc_bytes to determine flush amount for shrink_delalloc
e16460707e94c3d btrfs: wait on async extents when flushing delalloc
93c60b17f2b5fca btrfs: reduce the preemptive flushing threshold to 90%
114623979405abf btrfs: do not do preemptive flushing if the majority is global rsv
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From e16460707e94c3d4c1b5418cb68b28b8efa903b2 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef(a)toxicpanda.com>
Date: Wed, 14 Jul 2021 14:47:21 -0400
Subject: [PATCH] btrfs: wait on async extents when flushing delalloc
I've been debugging an early ENOSPC problem in production and finally
root caused it to this problem. When we switched to the per-inode in
38d715f494f2 ("btrfs: use btrfs_start_delalloc_roots in
shrink_delalloc") I pulled out the async extent handling, because we
were doing the correct thing by calling filemap_flush() if we had async
extents set. This would properly wait on any async extents by locking
the page in the second flush, thus making sure our ordered extents were
properly set up.
However when I switched us back to page based flushing, I used
sync_inode(), which allows us to pass in our own wbc. The problem here
is that sync_inode() is smarter than the filemap_* helpers, it tries to
avoid calling writepages at all. This means that our second call could
skip calling do_writepages altogether, and thus not wait on the pagelock
for the async helpers. This means we could come back before any ordered
extents were created and then simply continue on in our flushing
mechanisms and ENOSPC out when we have plenty of space to use.
Fix this by putting back the async pages logic in shrink_delalloc. This
allows us to bulk write out everything that we need to, and then we can
wait in one place for the async helpers to catch up, and then wait on
any ordered extents that are created.
Fixes: e076ab2a2ca7 ("btrfs: shrink delalloc pages instead of full inodes")
CC: stable(a)vger.kernel.org # 5.10+
Reviewed-by: Nikolay Borisov <nborisov(a)suse.com>
Signed-off-by: Josef Bacik <josef(a)toxicpanda.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 73b6413cc9af..25eb214f56ac 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9879,10 +9879,6 @@ static int start_delalloc_inodes(struct btrfs_root *root,
&work->work);
} else {
ret = sync_inode(inode, wbc);
- if (!ret &&
- test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags))
- ret = sync_inode(inode, wbc);
btrfs_add_delayed_iput(inode);
if (ret || wbc->nr_to_write <= 0)
goto out;
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index eb90a262563f..d9c8d738678f 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -532,9 +532,49 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
while ((delalloc_bytes || ordered_bytes) && loops < 3) {
u64 temp = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
long nr_pages = min_t(u64, temp, LONG_MAX);
+ int async_pages;
btrfs_start_delalloc_roots(fs_info, nr_pages, true);
+ /*
+ * We need to make sure any outstanding async pages are now
+ * processed before we continue. This is because things like
+ * sync_inode() try to be smart and skip writing if the inode is
+ * marked clean. We don't use filemap_fwrite for flushing
+ * because we want to control how many pages we write out at a
+ * time, thus this is the only safe way to make sure we've
+ * waited for outstanding compressed workers to have started
+ * their jobs and thus have ordered extents set up properly.
+ *
+ * This exists because we do not want to wait for each
+ * individual inode to finish its async work, we simply want to
+ * start the IO on everybody, and then come back here and wait
+ * for all of the async work to catch up. Once we're done with
+ * that we know we'll have ordered extents for everything and we
+ * can decide if we wait for that or not.
+ *
+ * If we choose to replace this in the future, make absolutely
+ * sure that the proper waiting is being done in the async case,
+ * as there have been bugs in that area before.
+ */
+ async_pages = atomic_read(&fs_info->async_delalloc_pages);
+ if (!async_pages)
+ goto skip_async;
+
+ /*
+ * We don't want to wait forever, if we wrote less pages in this
+ * loop than we have outstanding, only wait for that number of
+ * pages, otherwise we can wait for all async pages to finish
+ * before continuing.
+ */
+ if (async_pages > nr_pages)
+ async_pages -= nr_pages;
+ else
+ async_pages = 0;
+ wait_event(fs_info->async_submit_wait,
+ atomic_read(&fs_info->async_delalloc_pages) <=
+ async_pages);
+skip_async:
loops++;
if (wait_ordered && !trans) {
btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 03fe78cc2942c55cc13be5ca42578750f17204a1 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef(a)toxicpanda.com>
Date: Wed, 14 Jul 2021 14:47:20 -0400
Subject: [PATCH] btrfs: use delalloc_bytes to determine flush amount for
shrink_delalloc
We have been hitting some early ENOSPC issues in production with more
recent kernels, and I tracked it down to us simply not flushing delalloc
as aggressively as we should be. With tracing I was seeing us failing
all tickets with all of the block rsvs at or around 0, with very little
pinned space, but still around 120MiB of outstanding bytes_may_used.
Upon further investigation I saw that we were flushing around 14 pages
per shrink call for delalloc, despite having around 2GiB of delalloc
outstanding.
Consider the example of an 8-way machine, with all CPUs trying to create a
file in parallel, which at the time of this commit requires 5 items to
do. Assuming a 16k leaf size, we have 10MiB of total metadata reclaim
size waiting on reservations. Now assume we have 128MiB of delalloc
outstanding. With our current math we would set items to 20, and then
set to_reclaim to 20 * 256k, or 5MiB.
Assuming that we went through this loop all 3 times, for both
FLUSH_DELALLOC and FLUSH_DELALLOC_WAIT, and then did the full loop
twice, we'd only flush 60MiB of the 128MiB delalloc space. This could
leave a fair bit of delalloc reservations still hanging around by the
time we go to ENOSPC out all the remaining tickets.
Fix this two ways. First, change the calculations to be a fraction of
the total delalloc bytes on the system. Prior to this change we were
calculating based on dirty inodes so our math made more sense, now it's
just completely unrelated to what we're actually doing.
Second add a FLUSH_DELALLOC_FULL state, that we hold off until we've
gone through the flush states at least once. This will empty the system
of all delalloc so we're sure to be truly out of space when we start
failing tickets.
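A rough sketch of the sizing change described above, using the 256 KiB
per-item figure from the commit message (illustrative helpers, not the
exact kernel code):

#define EXTENT_SIZE_PER_ITEM	(256ULL * 1024)	/* 256 KiB per reclaim item */

/* Old sizing: a fixed amount per item, independent of the delalloc backlog. */
static u64 old_to_reclaim(u64 items)
{
	return items * EXTENT_SIZE_PER_ITEM;
}

/* New sizing: never flush less than 1/8 of the outstanding delalloc bytes. */
static u64 new_to_reclaim(u64 to_reclaim, u64 delalloc_bytes)
{
	u64 fraction = delalloc_bytes >> 3;

	return to_reclaim > fraction ? to_reclaim : fraction;
}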
I'm tagging stable 5.10 and forward, because this is where we started
using the page stuff heavily again. This affects earlier kernel
versions as well, but would be a pain to backport to them as the
flushing mechanisms aren't the same.
CC: stable(a)vger.kernel.org # 5.10+
Reviewed-by: Nikolay Borisov <nborisov(a)suse.com>
Signed-off-by: Josef Bacik <josef(a)toxicpanda.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 3cccf0f05666..fd3084feb4b5 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2779,10 +2779,11 @@ enum btrfs_flush_state {
FLUSH_DELAYED_REFS = 4,
FLUSH_DELALLOC = 5,
FLUSH_DELALLOC_WAIT = 6,
- ALLOC_CHUNK = 7,
- ALLOC_CHUNK_FORCE = 8,
- RUN_DELAYED_IPUTS = 9,
- COMMIT_TRANS = 10,
+ FLUSH_DELALLOC_FULL = 7,
+ ALLOC_CHUNK = 8,
+ ALLOC_CHUNK_FORCE = 9,
+ RUN_DELAYED_IPUTS = 10,
+ COMMIT_TRANS = 11,
};
int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index fbd492fe87f9..eb90a262563f 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -493,6 +493,11 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
long time_left;
int loops;
+ delalloc_bytes = percpu_counter_sum_positive(&fs_info->delalloc_bytes);
+ ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes);
+ if (delalloc_bytes == 0 && ordered_bytes == 0)
+ return;
+
/* Calc the number of the pages we need flush for space reservation */
if (to_reclaim == U64_MAX) {
items = U64_MAX;
@@ -500,22 +505,21 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
/*
* to_reclaim is set to however much metadata we need to
* reclaim, but reclaiming that much data doesn't really track
- * exactly, so increase the amount to reclaim by 2x in order to
- * make sure we're flushing enough delalloc to hopefully reclaim
- * some metadata reservations.
+ * exactly. What we really want to do is reclaim full inode's
+ * worth of reservations, however that's not available to us
+ * here. We will take a fraction of the delalloc bytes for our
+ * flushing loops and hope for the best. Delalloc will expand
+ * the amount we write to cover an entire dirty extent, which
+ * will reclaim the metadata reservation for that range. If
+ * it's not enough subsequent flush stages will be more
+ * aggressive.
*/
+ to_reclaim = max(to_reclaim, delalloc_bytes >> 3);
items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2;
- to_reclaim = items * EXTENT_SIZE_PER_ITEM;
}
trans = (struct btrfs_trans_handle *)current->journal_info;
- delalloc_bytes = percpu_counter_sum_positive(
- &fs_info->delalloc_bytes);
- ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes);
- if (delalloc_bytes == 0 && ordered_bytes == 0)
- return;
-
/*
* If we are doing more ordered than delalloc we need to just wait on
* ordered extents, otherwise we'll waste time trying to flush delalloc
@@ -595,8 +599,11 @@ static void flush_space(struct btrfs_fs_info *fs_info,
break;
case FLUSH_DELALLOC:
case FLUSH_DELALLOC_WAIT:
+ case FLUSH_DELALLOC_FULL:
+ if (state == FLUSH_DELALLOC_FULL)
+ num_bytes = U64_MAX;
shrink_delalloc(fs_info, space_info, num_bytes,
- state == FLUSH_DELALLOC_WAIT, for_preempt);
+ state != FLUSH_DELALLOC, for_preempt);
break;
case FLUSH_DELAYED_REFS_NR:
case FLUSH_DELAYED_REFS:
@@ -906,6 +913,14 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
commit_cycles--;
}
+ /*
+ * We do not want to empty the system of delalloc unless we're
+ * under heavy pressure, so allow one trip through the flushing
+ * logic before we start doing a FLUSH_DELALLOC_FULL.
+ */
+ if (flush_state == FLUSH_DELALLOC_FULL && !commit_cycles)
+ flush_state++;
+
/*
* We don't want to force a chunk allocation until we've tried
* pretty hard to reclaim space. Think of the case where we
@@ -1069,7 +1084,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
* so if we now have space to allocate do the force chunk allocation.
*/
static const enum btrfs_flush_state data_flush_states[] = {
- FLUSH_DELALLOC_WAIT,
+ FLUSH_DELALLOC_FULL,
RUN_DELAYED_IPUTS,
COMMIT_TRANS,
ALLOC_CHUNK_FORCE,
@@ -1158,6 +1173,7 @@ static const enum btrfs_flush_state evict_flush_states[] = {
FLUSH_DELAYED_REFS,
FLUSH_DELALLOC,
FLUSH_DELALLOC_WAIT,
+ FLUSH_DELALLOC_FULL,
ALLOC_CHUNK,
COMMIT_TRANS,
};
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index c7d19eadecc5..8f58fd95efc7 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -94,6 +94,7 @@ struct btrfs_space_info;
EM( FLUSH_DELAYED_ITEMS, "FLUSH_DELAYED_ITEMS") \
EM( FLUSH_DELALLOC, "FLUSH_DELALLOC") \
EM( FLUSH_DELALLOC_WAIT, "FLUSH_DELALLOC_WAIT") \
+ EM( FLUSH_DELALLOC_FULL, "FLUSH_DELALLOC_FULL") \
EM( FLUSH_DELAYED_REFS_NR, "FLUSH_DELAYED_REFS_NR") \
EM( FLUSH_DELAYED_REFS, "FLUSH_ELAYED_REFS") \
EM( ALLOC_CHUNK, "ALLOC_CHUNK") \
The GIC driver uses an RMW sequence to update the affinity, and
relies on the gic_lock_irqsave/gic_unlock_irqrestore sequences
to update it atomically.
But these sequences only expand into anything meaningful if
the BL_SWITCHER option is selected, which almost never happens.
It also turns out that using an RMW and locks is just as silly,
as the GIC distributor supports byte accesses for the GICD_TARGETRn
registers, which when used make the update atomic by definition.
Drop the terminally broken code and replace it by a byte write.
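To illustrate, both addressing schemes land on the same per-interrupt
GICD_TARGETRn byte; the byte write simply avoids the read-modify-write.
A sketch of the address arithmetic, assuming the usual 0x800 distributor
offset for the target registers and little-endian byte order (not the
driver code):

/* Old scheme: 32-bit register covering four interrupts, plus a byte shift. */
static unsigned long target_byte_rmw(unsigned long dist_base, unsigned int irq)
{
	unsigned long reg = dist_base + 0x800 + (irq & ~3u);
	unsigned int shift = (irq % 4) * 8;

	return reg + shift / 8;		/* the byte the RMW actually changes */
}

/* New scheme: address the per-interrupt byte directly and write it once. */
static unsigned long target_byte_direct(unsigned long dist_base, unsigned int irq)
{
	return dist_base + 0x800 + irq;
}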
Fixes: 04c8b0f82c7d ("irqchip/gic: Make locking a BL_SWITCHER only feature")
Cc: stable(a)vger.kernel.org
Signed-off-by: Marc Zyngier <maz(a)kernel.org>
---
drivers/irqchip/irq-gic.c | 14 +++-----------
1 file changed, 3 insertions(+), 11 deletions(-)
diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c
index 00de05abd3c3..c17fabd6741e 100644
--- a/drivers/irqchip/irq-gic.c
+++ b/drivers/irqchip/irq-gic.c
@@ -329,10 +329,8 @@ static int gic_irq_set_vcpu_affinity(struct irq_data *d, void *vcpu)
static int gic_set_affinity(struct irq_data *d, const struct cpumask *mask_val,
bool force)
{
- void __iomem *reg = gic_dist_base(d) + GIC_DIST_TARGET + (gic_irq(d) & ~3);
- unsigned int cpu, shift = (gic_irq(d) % 4) * 8;
- u32 val, mask, bit;
- unsigned long flags;
+ void __iomem *reg = gic_dist_base(d) + GIC_DIST_TARGET + gic_irq(d);
+ unsigned int cpu;
if (!force)
cpu = cpumask_any_and(mask_val, cpu_online_mask);
@@ -342,13 +340,7 @@ static int gic_set_affinity(struct irq_data *d, const struct cpumask *mask_val,
if (cpu >= NR_GIC_CPU_IF || cpu >= nr_cpu_ids)
return -EINVAL;
- gic_lock_irqsave(flags);
- mask = 0xff << shift;
- bit = gic_cpu_map[cpu] << shift;
- val = readl_relaxed(reg) & ~mask;
- writel_relaxed(val | bit, reg);
- gic_unlock_irqrestore(flags);
-
+ writeb_relaxed(gic_cpu_map[cpu], reg);
irq_data_update_effective_affinity(d, cpumask_of(cpu));
return IRQ_SET_MASK_OK_DONE;
--
2.27.0
Adds a mutex guard to the VMSA updating code. Also adds a check to skip a
vCPU if it has already been LAUNCH_UPDATE_VMSA'd, which should allow
userspace to retry this ioctl until all the vCPUs can be successfully
LAUNCH_UPDATE_VMSA'd. Because this operation cannot be undone, we cannot
unwind if one vCPU fails.
Fixes: ad73109ae7ec ("KVM: SVM: Provide support to launch and run an SEV-ES guest")
Signed-off-by: Peter Gonda <pgonda(a)google.com>
Cc: Marc Orr <marcorr(a)google.com>
Cc: Paolo Bonzini <pbonzini(a)redhat.com>
Cc: Sean Christopherson <seanjc(a)google.com>
Cc: Brijesh Singh <brijesh.singh(a)amd.com>
Cc: kvm(a)vger.kernel.org
Cc: stable(a)vger.kernel.org
Cc: linux-kernel(a)vger.kernel.org
---
arch/x86/kvm/svm/sev.c | 24 +++++++++++++++++++-----
1 file changed, 19 insertions(+), 5 deletions(-)
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 75e0b21ad07c..9a2ebd0328ca 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -598,22 +598,29 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm)
static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
- struct sev_data_launch_update_vmsa vmsa;
+ struct sev_data_launch_update_vmsa vmsa = {0};
struct kvm_vcpu *vcpu;
int i, ret;
if (!sev_es_guest(kvm))
return -ENOTTY;
- vmsa.reserved = 0;
-
kvm_for_each_vcpu(i, vcpu, kvm) {
struct vcpu_svm *svm = to_svm(vcpu);
+ ret = mutex_lock_killable(&vcpu->mutex);
+ if (ret)
+ goto out_unlock;
+
+ /* Skip to the next vCPU if this one has already be updated. */
+ ret = sev_es_sync_vmsa(svm);
+ if (svm->vcpu.arch.guest_state_protected)
+ goto unlock;
+
/* Perform some pre-encryption checks against the VMSA */
ret = sev_es_sync_vmsa(svm);
if (ret)
- return ret;
+ goto out_unlock;
/*
* The LAUNCH_UPDATE_VMSA command will perform in-place
@@ -629,12 +636,19 @@ static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)
ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa,
&argp->error);
if (ret)
- return ret;
+ goto out_unlock;
svm->vcpu.arch.guest_state_protected = true;
+
+unlock:
+ mutex_unlock(&vcpu->mutex);
}
return 0;
+
+out_unlock:
+ mutex_unlock(&vcpu->mutex);
+ return ret;
}
static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
--
2.33.0.464.g1972c5931b-goog
Copying an ASID into new vCPUs will not work for SEV-ES since the vCPUs
VMSAs need to be setup and measured before SEV_LAUNCH_FINISH. Return an
error if a user tries to KVM_CAP_VM_COPY_ENC_CONTEXT_FROM from an
SEV-ES guest. The destination VM is already checked for SEV and SEV-ES
with sev_guest(), so this ioctl already fails if the destination is SEV
enabled.
Enabling mirroring a VM or copying its encryption context with an SEV-ES
VM is more involved and should happen in its own feature patch if that's
needed. This is because the vCPUs of SEV-ES VMs need to be updated with
LAUNCH_UPDATE_VMSA before LAUNCH_FINISH. This needs KVM changes because
the mirror VM has all its SEV ioctls blocked and the original VM doesn't
know about the mirror's vCPUs.
Fixes: 54526d1fd593 ("KVM: x86: Support KVM VMs sharing SEV context")
V2:
* Updated changelog with more information and added stable CC.
Signed-off-by: Peter Gonda <pgonda(a)google.com>
Cc: Marc Orr <marcorr(a)google.com>
Cc: Paolo Bonzini <pbonzini(a)redhat.com>
Cc: Sean Christopherson <seanjc(a)google.com>
Cc: Nathan Tempelman <natet(a)google.com>
Cc: Brijesh Singh <brijesh.singh(a)amd.com>
Cc: kvm(a)vger.kernel.org
Cc: stable(a)vger.kernel.org
Cc: linux-kernel(a)vger.kernel.org
---
arch/x86/kvm/svm/sev.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 75e0b21ad07c..8a279027425f 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -1728,7 +1728,7 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
source_kvm = source_kvm_file->private_data;
mutex_lock(&source_kvm->lock);
- if (!sev_guest(source_kvm)) {
+ if (!sev_guest(source_kvm) || sev_es_guest(source_kvm)) {
ret = -EINVAL;
goto e_source_unlock;
}
--
2.33.0.309.g3052b89438-goog
On Tue, Sep 14, 2021 at 11:32 AM Sean Christopherson <seanjc(a)google.com> wrote:
>
> On Tue, Sep 14, 2021, Peter Gonda wrote:
> > Copying an ASID into new vCPUs will not work for SEV-ES since the vCPUs
> > VMSAs need to be setup and measured before SEV_LAUNCH_FINISH. Return an
> > error if a users tries to KVM_CAP_VM_COPY_ENC_CONTEXT_FROM from an
> > SEV-ES guest.
>
> What happens if userspace does KVM_CAP_VM_COPY_ENC_CONTEXT_FROM before the source
> has created vCPUs, i.e. before it has done SEV_LAUNCH_FINISH?
That's not enough. If you wanted to be able to mirror SEV-ES you'd
also need to call LAUNCH_UPDATE_VMSA on the mirror's vCPUs before
SEV_LAUNCH_FINISH. That is do-able but I was writing a small change to
fix this bug. If mirroring of SEV-ES is wanted it's a much bigger
change.
>
> Might be worth noting that the destination cannot be an SEV guest, and therefore
> can't be an SEV-ES guest either.
sev_guest() implies sev_es_guest() so I think this case is covered.
>
> > Fixes: 54526d1fd593 ("KVM: x86: Support KVM VMs sharing SEV context")
>
> Cc: stable(a)vger.kernel.org
Oops. I'll update in the V2 if needed. Added to this thread for now.
If the IR Toy is receiving IR while a transmit is done, it may end up
hanging. We can prevent this from happening by re-entering sample mode
just before issuing the transmit command.
Link: https://github.com/bengtmartensson/HarcHardware/discussions/25
Cc: stable(a)vger.kernel.org
Signed-off-by: Sean Young <sean(a)mess.org>
---
drivers/media/rc/ir_toy.c | 21 ++++++++++++++++++++-
1 file changed, 20 insertions(+), 1 deletion(-)
diff --git a/drivers/media/rc/ir_toy.c b/drivers/media/rc/ir_toy.c
index d2d9346eb8f5..71aced52248f 100644
--- a/drivers/media/rc/ir_toy.c
+++ b/drivers/media/rc/ir_toy.c
@@ -26,6 +26,7 @@ static const u8 COMMAND_VERSION[] = { 'v' };
// End transmit and repeat reset command so we exit sump mode
static const u8 COMMAND_RESET[] = { 0xff, 0xff, 0, 0, 0, 0, 0 };
static const u8 COMMAND_SMODE_ENTER[] = { 's' };
+static const u8 COMMAND_SMODE_EXIT[] = { 0 };
static const u8 COMMAND_TXSTART[] = { 0x26, 0x24, 0x25, 0x03 };
#define REPLY_XMITCOUNT 't'
@@ -317,12 +318,30 @@ static int irtoy_tx(struct rc_dev *rc, uint *txbuf, uint count)
buf[i] = cpu_to_be16(v);
}
- buf[count] = cpu_to_be16(0xffff);
+ buf[count] = 0xffff;
irtoy->tx_buf = buf;
irtoy->tx_len = size;
irtoy->emitted = 0;
+ // There is an issue where if the unit is receiving IR while the
+ // first TXSTART command is sent, the device might end up hanging
+ // with its led on. It does not respond to any command when this
+ // happens. To work around this, re-enter sample mode.
+ err = irtoy_command(irtoy, COMMAND_SMODE_EXIT,
+ sizeof(COMMAND_SMODE_EXIT), STATE_COMMAND_NO_RESP);
+ if (err) {
+ dev_err(irtoy->dev, "exit sample mode: %d\n", err);
+ return err;
+ }
+
+ err = irtoy_command(irtoy, COMMAND_SMODE_ENTER,
+ sizeof(COMMAND_SMODE_ENTER), STATE_COMMAND);
+ if (err) {
+ dev_err(irtoy->dev, "enter sample mode: %d\n", err);
+ return err;
+ }
+
err = irtoy_command(irtoy, COMMAND_TXSTART, sizeof(COMMAND_TXSTART),
STATE_TX);
kfree(buf);
--
2.31.1
From: Douglas Anderson <dianders(a)chromium.org>
[ Upstream commit a70e558c151043ce46a5e5999f4310e0b3551f57 ]
This is really just a revert of commit 58074b08c04a ("drm/bridge:
ti-sn65dsi86: Read EDID blob over DDC"), resolving conflicts.
The old code failed to read the EDID properly in a very important
case: before the bridge's pre_enable() was called. The way things need
to work:
1. Read the EDID.
2. Based on the EDID, decide on video settings and pixel clock.
3. Enable the bridge w/ the desired settings.
The way things were working:
1. Try to read the EDID but fail; fall back to hardcoded values.
2. Based on hardcoded values, decide on video settings and pixel clock.
3. Enable the bridge w/ the desired settings.
4. Try again to read the EDID, it works now!
5. Realize that the hardcoded settings weren't quite right.
6. Disable / reenable the bridge w/ the right settings.
The reasons for the failures were twofold:
a) Since we never ran the bridge chip's pre-enable then we never set
the bit to ignore HPD. This meant the bridge chip didn't even _try_
to go out on the bus and communicate with the panel.
b) Even if we fixed things to ignore HPD, the EDID still wouldn't read
if the panel wasn't on.
Instead of reverting the code, we could fix it to set the HPD bit and
also power on the panel. However, it also works nicely to just let the
panel code read the EDID. Now that we've split the driver up we can
expose the DDC AUX channel bus to the panel node. The panel can take
charge of reading the EDID.
NOTE: in order for things to work, anyone that needs to read the EDID
will need to instantiate their panel using the new DP AUX bus (AKA by
listing their panel under the "aux-bus" node of the bridge chip in the
device tree).
In the future if we want to use the bridge chip to provide a full
external DP port (which won't have a panel) then we will have to
conditionally add EDID reading back in.
Suggested-by: Andrzej Hajda <a.hajda(a)samsung.com>
Signed-off-by: Douglas Anderson <dianders(a)chromium.org>
Reviewed-by: Bjorn Andersson <bjorn.andersson(a)linaro.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20210611101711.v10.9.I9330684…
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/gpu/drm/bridge/ti-sn65dsi86.c | 22 ----------------------
1 file changed, 22 deletions(-)
diff --git a/drivers/gpu/drm/bridge/ti-sn65dsi86.c b/drivers/gpu/drm/bridge/ti-sn65dsi86.c
index 45a2969afb2b..aef850296756 100644
--- a/drivers/gpu/drm/bridge/ti-sn65dsi86.c
+++ b/drivers/gpu/drm/bridge/ti-sn65dsi86.c
@@ -124,7 +124,6 @@
* @connector: Our connector.
* @host_node: Remote DSI node.
* @dsi: Our MIPI DSI source.
- * @edid: Detected EDID of eDP panel.
* @refclk: Our reference clock.
* @panel: Our panel.
* @enable_gpio: The GPIO we toggle to enable the bridge.
@@ -154,7 +153,6 @@ struct ti_sn65dsi86 {
struct drm_dp_aux aux;
struct drm_bridge bridge;
struct drm_connector connector;
- struct edid *edid;
struct device_node *host_node;
struct mipi_dsi_device *dsi;
struct clk *refclk;
@@ -403,24 +401,6 @@ connector_to_ti_sn65dsi86(struct drm_connector *connector)
static int ti_sn_bridge_connector_get_modes(struct drm_connector *connector)
{
struct ti_sn65dsi86 *pdata = connector_to_ti_sn65dsi86(connector);
- struct edid *edid = pdata->edid;
- int num, ret;
-
- if (!edid) {
- pm_runtime_get_sync(pdata->dev);
- edid = pdata->edid = drm_get_edid(connector, &pdata->aux.ddc);
- pm_runtime_put_autosuspend(pdata->dev);
- }
-
- if (edid && drm_edid_is_valid(edid)) {
- ret = drm_connector_update_edid_property(connector, edid);
- if (!ret) {
- num = drm_add_edid_modes(connector, edid);
- if (num)
- return num;
- }
- }
-
return drm_panel_get_modes(pdata->panel, connector);
}
@@ -1358,8 +1338,6 @@ static void ti_sn_bridge_remove(struct auxiliary_device *adev)
mipi_dsi_device_unregister(pdata->dsi);
}
- kfree(pdata->edid);
-
drm_bridge_remove(&pdata->bridge);
of_node_put(pdata->host_node);
--
2.30.2
Hi Greg & Sasha,
tl;dr: Please add 2dce224f469f ("netns: protect netns ID lookups with RCU") to the stable releases from v5.4 and older. It fixes a spin_unlock_bh() in peernet2id() called with IRQs off. I think this neat side-effect of commit 2dce224f469f was quite unintentional, hence no Fixes: tag or CC: stable.
The details:
From bugzilla.redhat.com/show_bug.cgi?id=1384179 (an ancient 4.9.0-0.rc0 kernel):
dump_stack+0x86/0xc3
__warn+0xcb/0xf0
warn_slowpath_null+0x1d/0x20
__local_bh_enable_ip+0x9d/0xc0
_raw_spin_unlock_bh+0x35/0x40
peernet2id+0x54/0x80
netlink_broadcast_filtered+0x220/0x3c0
netlink_broadcast+0x1d/0x20
audit_log+0x6a/0x90
security_set_bools+0xee/0x200
[]
Note, security_set_bools() calls write_lock_irq(). peernet2id() calls spin_unlock_bh().
From an internal (UEK) stack trace based on the v4.14.35 kernel (LTS 4.14.231):
queued_spin_lock_slowpath+0xb/0xf
_raw_spin_lock_irqsave+0x46/0x48
send_mad+0x3d2/0x590 [ib_core]
ib_sa_path_rec_get+0x223/0x4d0 [ib_core]
path_rec_start+0xa3/0x140 [ib_ipoib]
ipoib_start_xmit+0x2b0/0x6a0 [ib_ipoib]
dev_hard_start_xmit+0xb2/0x237
sch_direct_xmit+0x114/0x1bf
__dev_queue_xmit+0x592/0x818
dev_queue_xmit+0x10/0x12
arp_xmit+0x38/0xa6
arp_send_dst.part.16+0x61/0x84
arp_process+0x825/0x889
arp_rcv+0x140/0x1c9
__netif_receive_skb_core+0x401/0xb39
__netif_receive_skb+0x18/0x59
netif_receive_skb_internal+0x45/0x119
napi_gro_receive+0xd8/0xf6
ipoib_ib_handle_rx_wc+0x1ca/0x520 [ib_ipoib]
ipoib_poll+0xcd/0x150 [ib_ipoib]
net_rx_action+0x289/0x3f4
__do_softirq+0xe1/0x2b5
do_softirq_own_stack+0x2a/0x35
</IRQ>
do_softirq+0x4d/0x6a
__local_bh_enable_ip+0x57/0x59
_raw_spin_unlock_bh+0x23/0x25
peernet2id+0x51/0x73
netlink_broadcast_filtered+0x223/0x41b
netlink_broadcast+0x1d/0x1f
rdma_nl_multicast+0x22/0x30 [ib_core]
send_mad+0x3e5/0x590 [ib_core]
ib_sa_path_rec_get+0x223/0x4d0 [ib_core]
rdma_resolve_route+0x287/0x810 [rdma_cm]
rds_rdma_cm_event_handler_cmn+0x311/0x7d0 [rds_rdma]
rds_rdma_cm_event_handler_worker+0x22/0x30 [rds_rdma]
process_one_work+0x169/0x3a6
worker_thread+0x4d/0x3e5
kthread+0x105/0x138
ret_from_fork+0x24/0x49
Here, pay attention to ib_nl_make_request(), which calls spin_lock_irqsave() on a global lock just before calling rdma_nl_multicast(). Thereafter, peernet2id() re-enables SoftIRQs, ipoib starts in softirq context and follows the same path, and ends up trying to acquire the same global lock again.
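Roughly speaking (a sketch, not the exact upstream diff), commit
2dce224f469f turns the nsid lookup from a BH-spinlock section into an RCU
read-side section, so it no longer re-enables softirqs behind the caller's
back:

    /* before: the unlock may run pending softirqs right here */
    spin_lock_bh(&net->nsid_lock);
    id = __peernet2id(net, peer);
    spin_unlock_bh(&net->nsid_lock);

    /* after: pure RCU read side, no BH/IRQ state is touched */
    rcu_read_lock();
    id = __peernet2id(net, peer);
    rcu_read_unlock();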
I have tried to repro this with no luck. But stack traces seldom lie ;-)
Thxs, Håkon
This is a note to let you know that I've just added the patch titled
misc: genwqe: Fixes DMA mask setting
to my char-misc git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git
in the char-misc-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
>From 8d753db5c227d1f403c4bc9cae4ae02c862413cd Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet(a)wanadoo.fr>
Date: Wed, 8 Sep 2021 21:55:56 +0200
Subject: misc: genwqe: Fixes DMA mask setting
Commit 505b08777d78 ("misc: genwqe: Use dma_set_mask_and_coherent to simplify code")
unintentionally changed the logic of the DMA mask setup.
Instead of a ||, a && should have been used to keep the behaviour the same.
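To spell out the logic (illustrative snippet, not a quote of the driver):
with ||, a successful 64-bit mask is immediately overridden by the 32-bit
one, and a failed 64-bit mask is reported as an error without even trying
the 32-bit fallback. The && restores the usual idiom:

    /* try 64-bit DMA first, fall back to 32-bit, fail only if both fail */
    if (dma_set_mask_and_coherent(&pci_dev->dev, DMA_BIT_MASK(64)) &&
        dma_set_mask_and_coherent(&pci_dev->dev, DMA_BIT_MASK(32))) {
            dev_err(&pci_dev->dev,
                    "err: neither DMA32 nor DMA64 supported\n");
            return -EIO;    /* error code illustrative */
    }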
Fixes: 505b08777d78 ("misc: genwqe: Use dma_set_mask_and_coherent to simplify code")
Cc: stable <stable(a)vger.kernel.org>
Signed-off-by: Christophe JAILLET <christophe.jaillet(a)wanadoo.fr>
Link: https://lore.kernel.org/r/be49835baa8ba6daba5813b399edf6300f7fdbda.16311308…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/misc/genwqe/card_base.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/misc/genwqe/card_base.c b/drivers/misc/genwqe/card_base.c
index 2e1befbd1ad9..693981891870 100644
--- a/drivers/misc/genwqe/card_base.c
+++ b/drivers/misc/genwqe/card_base.c
@@ -1090,7 +1090,7 @@ static int genwqe_pci_setup(struct genwqe_dev *cd)
/* check for 64-bit DMA address supported (DAC) */
/* check for 32-bit DMA address supported (SAC) */
- if (dma_set_mask_and_coherent(&pci_dev->dev, DMA_BIT_MASK(64)) ||
+ if (dma_set_mask_and_coherent(&pci_dev->dev, DMA_BIT_MASK(64)) &&
dma_set_mask_and_coherent(&pci_dev->dev, DMA_BIT_MASK(32))) {
dev_err(&pci_dev->dev,
"err: neither DMA32 nor DMA64 supported\n");
--
2.33.0
This is a note to let you know that I've just added the patch titled
usb: gadget: f_uac2: Add missing companion descriptor for feedback EP
to my usb git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git
in the usb-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
>From 595091a1426a3b2625dad322f69fe569dc9d8943 Mon Sep 17 00:00:00 2001
From: Jack Pham <jackp(a)codeaurora.org>
Date: Thu, 9 Sep 2021 10:48:10 -0700
Subject: usb: gadget: f_uac2: Add missing companion descriptor for feedback EP
The f_uac2 function fails to enumerate when connected in SuperSpeed
due to the feedback endpoint missing the companion descriptor.
Add a new ss_epin_fback_desc_comp descriptor and append it behind the
ss_epin_fback_desc both in the static definition of the ss_audio_desc
structure as well as its dynamic construction in setup_headers().
Fixes: 24f779dac8f3 ("usb: gadget: f_uac2/u_audio: add feedback endpoint support")
Cc: stable <stable(a)vger.kernel.org>
Signed-off-by: Jack Pham <jackp(a)codeaurora.org>
Link: https://lore.kernel.org/r/20210909174811.12534-2-jackp@codeaurora.org
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/usb/gadget/function/f_uac2.c | 16 +++++++++++++++-
1 file changed, 15 insertions(+), 1 deletion(-)
diff --git a/drivers/usb/gadget/function/f_uac2.c b/drivers/usb/gadget/function/f_uac2.c
index 3c34995276e7..d89c1ebb07f4 100644
--- a/drivers/usb/gadget/function/f_uac2.c
+++ b/drivers/usb/gadget/function/f_uac2.c
@@ -406,6 +406,14 @@ static struct usb_endpoint_descriptor ss_epin_fback_desc = {
.bInterval = 4,
};
+static struct usb_ss_ep_comp_descriptor ss_epin_fback_desc_comp = {
+ .bLength = sizeof(ss_epin_fback_desc_comp),
+ .bDescriptorType = USB_DT_SS_ENDPOINT_COMP,
+ .bMaxBurst = 0,
+ .bmAttributes = 0,
+ .wBytesPerInterval = cpu_to_le16(4),
+};
+
/* Audio Streaming IN Interface - Alt0 */
static struct usb_interface_descriptor std_as_in_if0_desc = {
@@ -597,6 +605,7 @@ static struct usb_descriptor_header *ss_audio_desc[] = {
(struct usb_descriptor_header *)&ss_epout_desc_comp,
(struct usb_descriptor_header *)&as_iso_out_desc,
(struct usb_descriptor_header *)&ss_epin_fback_desc,
+ (struct usb_descriptor_header *)&ss_epin_fback_desc_comp,
(struct usb_descriptor_header *)&std_as_in_if0_desc,
(struct usb_descriptor_header *)&std_as_in_if1_desc,
@@ -705,6 +714,7 @@ static void setup_headers(struct f_uac2_opts *opts,
{
struct usb_ss_ep_comp_descriptor *epout_desc_comp = NULL;
struct usb_ss_ep_comp_descriptor *epin_desc_comp = NULL;
+ struct usb_ss_ep_comp_descriptor *epin_fback_desc_comp = NULL;
struct usb_endpoint_descriptor *epout_desc;
struct usb_endpoint_descriptor *epin_desc;
struct usb_endpoint_descriptor *epin_fback_desc;
@@ -730,6 +740,7 @@ static void setup_headers(struct f_uac2_opts *opts,
epout_desc_comp = &ss_epout_desc_comp;
epin_desc_comp = &ss_epin_desc_comp;
epin_fback_desc = &ss_epin_fback_desc;
+ epin_fback_desc_comp = &ss_epin_fback_desc_comp;
ep_int_desc = &ss_ep_int_desc;
}
@@ -773,8 +784,11 @@ static void setup_headers(struct f_uac2_opts *opts,
headers[i++] = USBDHDR(&as_iso_out_desc);
- if (EPOUT_FBACK_IN_EN(opts))
+ if (EPOUT_FBACK_IN_EN(opts)) {
headers[i++] = USBDHDR(epin_fback_desc);
+ if (epin_fback_desc_comp)
+ headers[i++] = USBDHDR(epin_fback_desc_comp);
+ }
}
if (EPIN_EN(opts)) {
--
2.33.0
This is a note to let you know that I've just added the patch titled
usb: gadget: f_uac2: Populate SS descriptors' wBytesPerInterval
to my usb git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git
in the usb-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
>From f0e8a206a2a53a919e1709c654cb65d519f7befb Mon Sep 17 00:00:00 2001
From: Jack Pham <jackp(a)codeaurora.org>
Date: Thu, 9 Sep 2021 10:48:11 -0700
Subject: usb: gadget: f_uac2: Populate SS descriptors' wBytesPerInterval
For Isochronous endpoints, the SS companion descriptor's
wBytesPerInterval field is required to reserve bus time in order
to transmit the required payload during the service interval.
If left at 0, the UAC2 function is unable to transact data on its
playback or capture endpoints in SuperSpeed mode.
Since f_uac2 currently does not support any bursting, this value can
be exactly equal to the calculated wMaxPacketSize.
Tested with Windows 10 as a host.
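As a rough illustration (not spec wording), the bus-time reservation for a
periodic SuperSpeed endpoint without bursting reduces to one max-size
packet per service interval:

    /* bMaxBurst = 0 and no Mult, so: */
    wBytesPerInterval = (bMaxBurst + 1) * wMaxPacketSize
                      = wMaxPacketSize;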
Fixes: f8cb3d556be3 ("usb: f_uac2: adds support for SS and SSP")
Cc: stable <stable(a)vger.kernel.org>
Signed-off-by: Jack Pham <jackp(a)codeaurora.org>
Link: https://lore.kernel.org/r/20210909174811.12534-3-jackp@codeaurora.org
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/usb/gadget/function/f_uac2.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/usb/gadget/function/f_uac2.c b/drivers/usb/gadget/function/f_uac2.c
index d89c1ebb07f4..be864560bfea 100644
--- a/drivers/usb/gadget/function/f_uac2.c
+++ b/drivers/usb/gadget/function/f_uac2.c
@@ -1178,6 +1178,9 @@ afunc_bind(struct usb_configuration *cfg, struct usb_function *fn)
agdev->out_ep_maxpsize = max_t(u16, agdev->out_ep_maxpsize,
le16_to_cpu(ss_epout_desc.wMaxPacketSize));
+ ss_epin_desc_comp.wBytesPerInterval = ss_epin_desc.wMaxPacketSize;
+ ss_epout_desc_comp.wBytesPerInterval = ss_epout_desc.wMaxPacketSize;
+
// HS and SS endpoint addresses are copied from autoconfigured FS descriptors
hs_ep_int_desc.bEndpointAddress = fs_ep_int_desc.bEndpointAddress;
hs_epout_desc.bEndpointAddress = fs_epout_desc.bEndpointAddress;
--
2.33.0
This is a note to let you know that I've just added the patch titled
usb: dwc2: gadget: Fix ISOC transfer complete handling for DDMA
to my usb git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git
in the usb-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
>From dbe2518b2d8eabffa74dbf7d9fdd7dacddab7fc0 Mon Sep 17 00:00:00 2001
From: Minas Harutyunyan <Minas.Harutyunyan(a)synopsys.com>
Date: Sat, 11 Sep 2021 22:58:30 +0400
Subject: usb: dwc2: gadget: Fix ISOC transfer complete handling for DDMA
When the last descriptor in a descriptor list completes with an
XferComplete interrupt, the core switches to handling the next
descriptor and asserts a BNA interrupt. Both interrupts are set by the
time the dwc2_hsotg_epint() handler is called. Each interrupt should be
handled separately, first the XferComplete interrupt and then the BNA
interrupt, otherwise the last completed transfer will not be given back
to the function driver as a completed request.
Fixes: 729cac693eec ("usb: dwc2: Change ISOC DDMA flow")
Cc: stable <stable(a)vger.kernel.org>
Signed-off-by: Minas Harutyunyan <Minas.Harutyunyan(a)synopsys.com>
Link: https://lore.kernel.org/r/a36981accc26cd674c5d8f8da6164344b94ec1fe.16313865…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/usb/dwc2/gadget.c | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/drivers/usb/dwc2/gadget.c b/drivers/usb/dwc2/gadget.c
index f09cbdfac9df..11d85a6e0b0d 100644
--- a/drivers/usb/dwc2/gadget.c
+++ b/drivers/usb/dwc2/gadget.c
@@ -3067,9 +3067,7 @@ static void dwc2_hsotg_epint(struct dwc2_hsotg *hsotg, unsigned int idx,
/* In DDMA handle isochronous requests separately */
if (using_desc_dma(hsotg) && hs_ep->isochronous) {
- /* XferCompl set along with BNA */
- if (!(ints & DXEPINT_BNAINTR))
- dwc2_gadget_complete_isoc_request_ddma(hs_ep);
+ dwc2_gadget_complete_isoc_request_ddma(hs_ep);
} else if (dir_in) {
/*
* We get OutDone from the FIFO, so we only
--
2.33.0
This is a note to let you know that I've just added the patch titled
usb: dwc3: core: balance phy init and exit
to my usb git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git
in the usb-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
>From 8cfac9a6744fcb143cb3e94ce002f09fd17fadbb Mon Sep 17 00:00:00 2001
From: Li Jun <jun.li(a)nxp.com>
Date: Wed, 8 Sep 2021 10:28:19 +0800
Subject: usb: dwc3: core: balance phy init and exit
After we started doing a core soft reset on every usb role switch,
phy init is invoked each time we switch to device mode, but its
counterpart de-init is missing. This means the actual phy init cannot
be done when we really want to re-initialize the phy, e.g. on system
resume, because the init counter maintained by the phy core is not 0.
Since phy init is actually redundant for role switching, move it out
of the core soft reset and into dwc3 core init, which is the only
place it is required.
Fixes: f88359e1588b ("usb: dwc3: core: Do core softreset when switch mode")
Cc: <stable(a)vger.kernel.org>
Tested-by: faqiang.zhu <faqiang.zhu(a)nxp.com>
Tested-by: John Stultz <john.stultz(a)linaro.org> #HiKey960
Acked-by: Felipe Balbi <balbi(a)kernel.org>
Signed-off-by: Li Jun <jun.li(a)nxp.com>
Link: https://lore.kernel.org/r/1631068099-13559-1-git-send-email-jun.li@nxp.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/usb/dwc3/core.c | 30 +++++++++++++-----------------
1 file changed, 13 insertions(+), 17 deletions(-)
diff --git a/drivers/usb/dwc3/core.c b/drivers/usb/dwc3/core.c
index 01866dcb953b..0104a80b185e 100644
--- a/drivers/usb/dwc3/core.c
+++ b/drivers/usb/dwc3/core.c
@@ -264,19 +264,6 @@ static int dwc3_core_soft_reset(struct dwc3 *dwc)
{
u32 reg;
int retries = 1000;
- int ret;
-
- usb_phy_init(dwc->usb2_phy);
- usb_phy_init(dwc->usb3_phy);
- ret = phy_init(dwc->usb2_generic_phy);
- if (ret < 0)
- return ret;
-
- ret = phy_init(dwc->usb3_generic_phy);
- if (ret < 0) {
- phy_exit(dwc->usb2_generic_phy);
- return ret;
- }
/*
* We're resetting only the device side because, if we're in host mode,
@@ -310,9 +297,6 @@ static int dwc3_core_soft_reset(struct dwc3 *dwc)
udelay(1);
} while (--retries);
- phy_exit(dwc->usb3_generic_phy);
- phy_exit(dwc->usb2_generic_phy);
-
return -ETIMEDOUT;
done:
@@ -982,9 +966,21 @@ static int dwc3_core_init(struct dwc3 *dwc)
dwc->phys_ready = true;
}
+ usb_phy_init(dwc->usb2_phy);
+ usb_phy_init(dwc->usb3_phy);
+ ret = phy_init(dwc->usb2_generic_phy);
+ if (ret < 0)
+ goto err0a;
+
+ ret = phy_init(dwc->usb3_generic_phy);
+ if (ret < 0) {
+ phy_exit(dwc->usb2_generic_phy);
+ goto err0a;
+ }
+
ret = dwc3_core_soft_reset(dwc);
if (ret)
- goto err0a;
+ goto err1;
if (hw_mode == DWC3_GHWPARAMS0_MODE_DRD &&
!DWC3_VER_IS_WITHIN(DWC3, ANY, 194A)) {
--
2.33.0
This is a note to let you know that I've just added the patch titled
Revert "USB: bcma: Add a check for devm_gpiod_get"
to my usb git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git
in the usb-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
>From d91adc5322ab53df4b6d1989242bfb6c63163eb2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= <rafal(a)milecki.pl>
Date: Tue, 31 Aug 2021 08:54:19 +0200
Subject: Revert "USB: bcma: Add a check for devm_gpiod_get"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
This reverts commit f3de5d857bb2362b00e2a8d4bc886cd49dcb66db.
That commit broke USB on all routers that have USB always powered on and
don't require toggling any GPIO. It's a majority of devices actually.
The original code worked and seemed safe: vcc GPIO is optional and
bcma_hci_platform_power_gpio() takes care of checking the pointer before
using it.
This revert fixes:
[ 10.801127] bcma_hcd: probe of bcma0:11 failed with error -2
Fixes: f3de5d857bb2 ("USB: bcma: Add a check for devm_gpiod_get")
Cc: stable <stable(a)vger.kernel.org>
Cc: Chuhong Yuan <hslester96(a)gmail.com>
Signed-off-by: Rafał Miłecki <rafal(a)milecki.pl>
Link: https://lore.kernel.org/r/20210831065419.18371-1-zajec5@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/usb/host/bcma-hcd.c | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/drivers/usb/host/bcma-hcd.c b/drivers/usb/host/bcma-hcd.c
index 337b425dd4b0..2df52f75f6b3 100644
--- a/drivers/usb/host/bcma-hcd.c
+++ b/drivers/usb/host/bcma-hcd.c
@@ -406,12 +406,9 @@ static int bcma_hcd_probe(struct bcma_device *core)
return -ENOMEM;
usb_dev->core = core;
- if (core->dev.of_node) {
+ if (core->dev.of_node)
usb_dev->gpio_desc = devm_gpiod_get(&core->dev, "vcc",
GPIOD_OUT_HIGH);
- if (IS_ERR(usb_dev->gpio_desc))
- return PTR_ERR(usb_dev->gpio_desc);
- }
switch (core->id.id) {
case BCMA_CORE_USB20_HOST:
--
2.33.0
This is a note to let you know that I've just added the patch titled
usb: cdns3: fix race condition before setting doorbell
to my usb git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git
in the usb-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
>From b69ec50b3e55c4b2a85c8bc46763eaf330605847 Mon Sep 17 00:00:00 2001
From: Pawel Laszczak <pawell(a)cadence.com>
Date: Tue, 7 Sep 2021 08:26:19 +0200
Subject: usb: cdns3: fix race condition before setting doorbell
For DEV_VER_V3 controllers there exists a race condition between
clearing the ep_sts.EP_STS_TRBERR bit and setting the ep_cmd.EP_CMD_DRDY
bit. Setting EP_CMD_DRDY is ignored by the controller while
EP_STS_TRBERR is set, and between these two instructions there is a
small window in which EP_STS_TRBERR can be set again. In such a case the
transfer will not start after ringing the doorbell.
Fixes: 7733f6c32e36 ("usb: cdns3: Add Cadence USB3 DRD Driver")
cc: <stable(a)vger.kernel.org> # 5.12.x
Tested-by: Aswath Govindraju <a-govindraju(a)ti.com>
Reviewed-by: Aswath Govindraju <a-govindraju(a)ti.com>
Signed-off-by: Pawel Laszczak <pawell(a)cadence.com>
Link: https://lore.kernel.org/r/20210907062619.34622-1-pawell@gli-login.cadence.c…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/usb/cdns3/cdns3-gadget.c | 14 ++++++++++++++
1 file changed, 14 insertions(+)
diff --git a/drivers/usb/cdns3/cdns3-gadget.c b/drivers/usb/cdns3/cdns3-gadget.c
index 5d8c982019af..1f3b4a142212 100644
--- a/drivers/usb/cdns3/cdns3-gadget.c
+++ b/drivers/usb/cdns3/cdns3-gadget.c
@@ -1100,6 +1100,19 @@ static int cdns3_ep_run_stream_transfer(struct cdns3_endpoint *priv_ep,
return 0;
}
+static void cdns3_rearm_drdy_if_needed(struct cdns3_endpoint *priv_ep)
+{
+ struct cdns3_device *priv_dev = priv_ep->cdns3_dev;
+
+ if (priv_dev->dev_ver < DEV_VER_V3)
+ return;
+
+ if (readl(&priv_dev->regs->ep_sts) & EP_STS_TRBERR) {
+ writel(EP_STS_TRBERR, &priv_dev->regs->ep_sts);
+ writel(EP_CMD_DRDY, &priv_dev->regs->ep_cmd);
+ }
+}
+
/**
* cdns3_ep_run_transfer - start transfer on no-default endpoint hardware
* @priv_ep: endpoint object
@@ -1351,6 +1364,7 @@ static int cdns3_ep_run_transfer(struct cdns3_endpoint *priv_ep,
/*clearing TRBERR and EP_STS_DESCMIS before seting DRDY*/
writel(EP_STS_TRBERR | EP_STS_DESCMIS, &priv_dev->regs->ep_sts);
writel(EP_CMD_DRDY, &priv_dev->regs->ep_cmd);
+ cdns3_rearm_drdy_if_needed(priv_ep);
trace_cdns3_doorbell_epx(priv_ep->name,
readl(&priv_dev->regs->ep_traddr));
}
--
2.33.0
This is a note to let you know that I've just added the patch titled
usb: gadget: u_audio: EP-OUT bInterval in fback frequency
to my usb git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git
in the usb-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
>From f5dfd98a80ff8d50cf4ae2820857d7f5a46cbab9 Mon Sep 17 00:00:00 2001
From: Pavel Hofman <pavel.hofman(a)ivitera.com>
Date: Mon, 6 Sep 2021 15:08:22 +0200
Subject: usb: gadget: u_audio: EP-OUT bInterval in fback frequency
Increase the bit shift in the feedback frequency calculation by the
EP-OUT bInterval value.
Tests have revealed that the Win10 and OSX UAC2 drivers require the
feedback frequency to be based on the actual packet interval instead of
on the USB2 microframe; otherwise they ignore the feedback value. The
Linux snd-usb-audio driver detects the applied bit shift automatically.
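A worked example of the resulting arithmetic (illustrative only;
fback_example() is a made-up helper mirroring the driver's formula):

    /* Q16.16 feedback value, as computed by u_audio_set_fback_frequency() */
    static u32 fback_example(u32 c_srate, u32 pitch, u8 bInterval)
    {
            u64 freq = c_srate;

            freq <<= 4 + (bInterval - 1);   /* scale to the packet interval */
            return DIV_ROUND_CLOSEST_ULL(freq * pitch, 1953125);
    }

    /*
     * fback_example(48000, 1000000, 1) == 393216  == 6  << 16
     *   (6 samples per 125 us microframe, i.e. 48000 / 8000)
     * fback_example(48000, 1000000, 4) == 3145728 == 48 << 16
     *   (48 samples per 1 ms packet interval, i.e. 48000 / 1000)
     */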
Tested-by: Henrik Enquist <henrik.enquist(a)gmail.com>
Signed-off-by: Pavel Hofman <pavel.hofman(a)ivitera.com>
Cc: stable <stable(a)vger.kernel.org>
Link: https://lore.kernel.org/r/20210906130822.12256-1-pavel.hofman@ivitera.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/usb/gadget/function/u_audio.c | 13 ++++++++++---
1 file changed, 10 insertions(+), 3 deletions(-)
diff --git a/drivers/usb/gadget/function/u_audio.c b/drivers/usb/gadget/function/u_audio.c
index 32ef22857083..ad16163b5ff8 100644
--- a/drivers/usb/gadget/function/u_audio.c
+++ b/drivers/usb/gadget/function/u_audio.c
@@ -96,11 +96,13 @@ static const struct snd_pcm_hardware uac_pcm_hardware = {
};
static void u_audio_set_fback_frequency(enum usb_device_speed speed,
+ struct usb_ep *out_ep,
unsigned long long freq,
unsigned int pitch,
void *buf)
{
u32 ff = 0;
+ const struct usb_endpoint_descriptor *ep_desc;
/*
* Because the pitch base is 1000000, the final divider here
@@ -128,8 +130,13 @@ static void u_audio_set_fback_frequency(enum usb_device_speed speed,
* byte fromat (that is Q16.16)
*
* ff = (freq << 16) / 8000
+ *
+ * Win10 and OSX UAC2 drivers require number of samples per packet
+ * in order to honor the feedback value.
+ * Linux snd-usb-audio detects the applied bit-shift automatically.
*/
- freq <<= 4;
+ ep_desc = out_ep->desc;
+ freq <<= 4 + (ep_desc->bInterval - 1);
}
ff = DIV_ROUND_CLOSEST_ULL((freq * pitch), 1953125);
@@ -267,7 +274,7 @@ static void u_audio_iso_fback_complete(struct usb_ep *ep,
pr_debug("%s: iso_complete status(%d) %d/%d\n",
__func__, status, req->actual, req->length);
- u_audio_set_fback_frequency(audio_dev->gadget->speed,
+ u_audio_set_fback_frequency(audio_dev->gadget->speed, audio_dev->out_ep,
params->c_srate, prm->pitch,
req->buf);
@@ -526,7 +533,7 @@ int u_audio_start_capture(struct g_audio *audio_dev)
* be meauserd at start of playback
*/
prm->pitch = 1000000;
- u_audio_set_fback_frequency(audio_dev->gadget->speed,
+ u_audio_set_fback_frequency(audio_dev->gadget->speed, ep,
params->c_srate, prm->pitch,
req_fback->buf);
--
2.33.0
This is a note to let you know that I've just added the patch titled
usb: gadget: r8a66597: fix a loop in set_feature()
to my usb git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git
in the usb-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
>From 17956b53ebff6a490baf580a836cbd3eae94892b Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter(a)oracle.com>
Date: Mon, 6 Sep 2021 12:42:21 +0300
Subject: usb: gadget: r8a66597: fix a loop in set_feature()
This loop is supposed to loop until it reads something other than
CS_IDST or until it times out after 30,000 attempts. But because of
the || vs && bug, it will never time out and instead it will loop a
minimum of 30,000 times.
This bug is quite old but the code is only used in USB_DEVICE_TEST_MODE
so it probably doesn't affect regular usage.
Fixes: 96fe53ef5498 ("usb: gadget: r8a66597-udc: add support for TEST_MODE")
Cc: stable <stable(a)vger.kernel.org>
Reviewed-by: Yoshihiro Shimoda <yoshihiro.shimoda.uh(a)renesas.com>
Acked-by: Felipe Balbi <balbi(a)kernel.org>
Signed-off-by: Dan Carpenter <dan.carpenter(a)oracle.com>
Link: https://lore.kernel.org/r/20210906094221.GA10957@kili
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/usb/gadget/udc/r8a66597-udc.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/usb/gadget/udc/r8a66597-udc.c b/drivers/usb/gadget/udc/r8a66597-udc.c
index 65cae4883454..38e4d6b505a0 100644
--- a/drivers/usb/gadget/udc/r8a66597-udc.c
+++ b/drivers/usb/gadget/udc/r8a66597-udc.c
@@ -1250,7 +1250,7 @@ static void set_feature(struct r8a66597 *r8a66597, struct usb_ctrlrequest *ctrl)
do {
tmp = r8a66597_read(r8a66597, INTSTS0) & CTSQ;
udelay(1);
- } while (tmp != CS_IDST || timeout-- > 0);
+ } while (tmp != CS_IDST && timeout-- > 0);
if (tmp == CS_IDST)
r8a66597_bset(r8a66597,
--
2.33.0
The following commit has been merged into the x86/urgent branch of tip:
Commit-ID: 81065b35e2486c024c7aa86caed452e1f01a59d4
Gitweb: https://git.kernel.org/tip/81065b35e2486c024c7aa86caed452e1f01a59d4
Author: Tony Luck <tony.luck(a)intel.com>
AuthorDate: Mon, 13 Sep 2021 14:52:39 -07:00
Committer: Borislav Petkov <bp(a)suse.de>
CommitterDate: Tue, 14 Sep 2021 10:27:03 +02:00
x86/mce: Avoid infinite loop for copy from user recovery
There are two cases for machine check recovery:
1) The machine check was triggered by ring3 (application) code.
This is the simpler case. The machine check handler simply queues
work to be executed on return to user. That code unmaps the page
from all users and arranges to send a SIGBUS to the task that
triggered the poison.
2) The machine check was triggered in kernel code that is covered by
an exception table entry. In this case the machine check handler
still queues a work entry to unmap the page, etc. but this will
not be called right away because the #MC handler returns to the
fix up code address in the exception table entry.
Problems occur if the kernel triggers another machine check before the
return to user processes the first queued work item.
Specifically, the work is queued using the ->mce_kill_me callback
structure in the task struct for the current thread. Attempting to queue
a second work item using this same callback results in a loop in the
linked list of work functions to call. So when the kernel does return to
user, it enters an infinite loop processing the same entry forever.
There are some legitimate scenarios where the kernel may take a second
machine check before returning to the user.
1) Some code (e.g. futex) first tries a get_user() with page faults
disabled. If this fails, the code retries with page faults enabled
expecting that this will resolve the page fault.
2) Copy from user code retries a copy in byte-at-time mode to check
whether any additional bytes can be copied.
On the other side of the fence are some bad drivers that do not check
the return value from individual get_user() calls and may access
multiple user addresses without noticing that some/all calls have
failed.
Fix by adding a counter (current->mce_count) to keep track of repeated
machine checks before task_work() is called. First machine check saves
the address information and calls task_work_add(). Subsequent machine
checks before that task_work callback is executed check that the address
is in the same page as the first machine check (since the callback will
offline exactly one page).
Expected worst case is four machine checks before moving on (e.g. one
user access with page faults disabled, then a repeat to the same address
with page faults enabled ... repeat in copy tail bytes). Just in case
there is some code that loops forever, enforce a limit of 10.
[ bp: Massage commit message, drop noinstr, fix typo, extend panic
messages. ]
Fixes: 5567d11c21a1 ("x86/mce: Send #MC singal from task work")
Signed-off-by: Tony Luck <tony.luck(a)intel.com>
Signed-off-by: Borislav Petkov <bp(a)suse.de>
Cc: <stable(a)vger.kernel.org>
Link: https://lkml.kernel.org/r/YT/IJ9ziLqmtqEPu@agluck-desk2.amr.corp.intel.com
---
arch/x86/kernel/cpu/mce/core.c | 43 ++++++++++++++++++++++++---------
include/linux/sched.h | 1 +-
2 files changed, 33 insertions(+), 11 deletions(-)
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 8cb7816..193204a 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1253,6 +1253,9 @@ static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *fin
static void kill_me_now(struct callback_head *ch)
{
+ struct task_struct *p = container_of(ch, struct task_struct, mce_kill_me);
+
+ p->mce_count = 0;
force_sig(SIGBUS);
}
@@ -1262,6 +1265,7 @@ static void kill_me_maybe(struct callback_head *cb)
int flags = MF_ACTION_REQUIRED;
int ret;
+ p->mce_count = 0;
pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr);
if (!p->mce_ripv)
@@ -1290,17 +1294,34 @@ static void kill_me_maybe(struct callback_head *cb)
}
}
-static void queue_task_work(struct mce *m, int kill_current_task)
+static void queue_task_work(struct mce *m, char *msg, int kill_current_task)
{
- current->mce_addr = m->addr;
- current->mce_kflags = m->kflags;
- current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV);
- current->mce_whole_page = whole_page(m);
+ int count = ++current->mce_count;
- if (kill_current_task)
- current->mce_kill_me.func = kill_me_now;
- else
- current->mce_kill_me.func = kill_me_maybe;
+ /* First call, save all the details */
+ if (count == 1) {
+ current->mce_addr = m->addr;
+ current->mce_kflags = m->kflags;
+ current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV);
+ current->mce_whole_page = whole_page(m);
+
+ if (kill_current_task)
+ current->mce_kill_me.func = kill_me_now;
+ else
+ current->mce_kill_me.func = kill_me_maybe;
+ }
+
+ /* Ten is likely overkill. Don't expect more than two faults before task_work() */
+ if (count > 10)
+ mce_panic("Too many consecutive machine checks while accessing user data", m, msg);
+
+ /* Second or later call, make sure page address matches the one from first call */
+ if (count > 1 && (current->mce_addr >> PAGE_SHIFT) != (m->addr >> PAGE_SHIFT))
+ mce_panic("Consecutive machine checks to different user pages", m, msg);
+
+ /* Do not call task_work_add() more than once */
+ if (count > 1)
+ return;
task_work_add(current, ¤t->mce_kill_me, TWA_RESUME);
}
@@ -1438,7 +1459,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
/* If this triggers there is no way to recover. Die hard. */
BUG_ON(!on_thread_stack() || !user_mode(regs));
- queue_task_work(&m, kill_current_task);
+ queue_task_work(&m, msg, kill_current_task);
} else {
/*
@@ -1456,7 +1477,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
}
if (m.kflags & MCE_IN_KERNEL_COPYIN)
- queue_task_work(&m, kill_current_task);
+ queue_task_work(&m, msg, kill_current_task);
}
out:
mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1780260..361c7bc 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1468,6 +1468,7 @@ struct task_struct {
mce_whole_page : 1,
__mce_reserved : 62;
struct callback_head mce_kill_me;
+ int mce_count;
#endif
#ifdef CONFIG_KRETPROBES
This is a note to let you know that I've just added the patch titled
serial: mvebu-uart: fix driver's tx_empty callback
to my tty git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/tty.git
in the tty-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
>From 74e1eb3b4a1ef2e564b4bdeb6e92afe844e900de Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pali=20Roh=C3=A1r?= <pali(a)kernel.org>
Date: Sat, 11 Sep 2021 15:20:17 +0200
Subject: serial: mvebu-uart: fix driver's tx_empty callback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The driver's tx_empty callback should signal when the transmit shift
register is empty, i.e. when the last character has been sent.
The STAT_TX_FIFO_EMP bit signals only that the HW transmit FIFO is empty,
which happens when the last byte is loaded into the transmit shift
register. The STAT_TX_EMP bit signals when both the HW transmit FIFO and
the transmit shift register are empty.
So replace STAT_TX_FIFO_EMP check by STAT_TX_EMP in mvebu_uart_tx_empty()
callback function.
Fixes: 30530791a7a0 ("serial: mvebu-uart: initial support for Armada-3700 serial port")
Cc: stable <stable(a)vger.kernel.org>
Signed-off-by: Pali Rohár <pali(a)kernel.org>
Link: https://lore.kernel.org/r/20210911132017.25505-1-pali@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/tty/serial/mvebu-uart.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/tty/serial/mvebu-uart.c b/drivers/tty/serial/mvebu-uart.c
index 231de29a6452..ab226da75f7b 100644
--- a/drivers/tty/serial/mvebu-uart.c
+++ b/drivers/tty/serial/mvebu-uart.c
@@ -163,7 +163,7 @@ static unsigned int mvebu_uart_tx_empty(struct uart_port *port)
st = readl(port->membase + UART_STAT);
spin_unlock_irqrestore(&port->lock, flags);
- return (st & STAT_TX_FIFO_EMP) ? TIOCSER_TEMT : 0;
+ return (st & STAT_TX_EMP) ? TIOCSER_TEMT : 0;
}
static unsigned int mvebu_uart_get_mctrl(struct uart_port *port)
--
2.33.0
This is a note to let you know that I've just added the patch titled
serial: 8250: 8250_omap: Fix RX_LVL register offset
to my tty git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/tty.git
in the tty-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
>From 79e9e30a9292a62d25ab75488d3886108db1eaad Mon Sep 17 00:00:00 2001
From: Nishanth Menon <nm(a)ti.com>
Date: Fri, 3 Sep 2021 00:05:50 -0500
Subject: serial: 8250: 8250_omap: Fix RX_LVL register offset
Commit b67e830d38fa ("serial: 8250: 8250_omap: Fix possible interrupt
storm on K3 SoCs") introduced a fixup including a register read of
RX_LVL; however, we should be using the word offset rather than the byte
offset, since our registers are on a 4 byte boundary (port.regshift = 2)
for 8250_omap.
Fixes: b67e830d38fa ("serial: 8250: 8250_omap: Fix possible interrupt storm on K3 SoCs")
Cc: stable <stable(a)vger.kernel.org>
Cc: Jan Kiszka <jan.kiszka(a)siemens.com>
Cc: Vignesh Raghavendra <vigneshr(a)ti.com>
Signed-off-by: Nishanth Menon <nm(a)ti.com>
Link: https://lore.kernel.org/r/20210903050550.29050-1-nm@ti.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/tty/serial/8250/8250_omap.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/tty/serial/8250/8250_omap.c b/drivers/tty/serial/8250/8250_omap.c
index 891fd8345e25..73e5f1dbd075 100644
--- a/drivers/tty/serial/8250/8250_omap.c
+++ b/drivers/tty/serial/8250/8250_omap.c
@@ -106,7 +106,7 @@
#define UART_OMAP_EFR2_TIMEOUT_BEHAVE BIT(6)
/* RX FIFO occupancy indicator */
-#define UART_OMAP_RX_LVL 0x64
+#define UART_OMAP_RX_LVL 0x19
struct omap8250_priv {
int line;
--
2.33.0
The gauge requires us to clear the status bits manually for some alerts
to be properly dismissed. Previously the IRQ was configured to react only
on the falling edge, which wasn't technically correct (the ALRT line is
active low), but it had the happy side effect of preventing interrupt
storms on uncleared alerts.
Fixes: 7fbf6b731bca ("power: supply: max17042: Do not enforce (incorrect) interrupt trigger type")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Sebastian Krzyszkowiak <sebastian.krzyszkowiak(a)puri.sm>
---
drivers/power/supply/max17042_battery.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/power/supply/max17042_battery.c b/drivers/power/supply/max17042_battery.c
index 8dffae76b6a3..c53980c8432a 100644
--- a/drivers/power/supply/max17042_battery.c
+++ b/drivers/power/supply/max17042_battery.c
@@ -876,6 +876,9 @@ static irqreturn_t max17042_thread_handler(int id, void *dev)
max17042_set_soc_threshold(chip, 1);
}
+ regmap_clear_bits(chip->regmap, MAX17042_STATUS,
+ 0xFFFF & ~(STATUS_POR_BIT | STATUS_BST_BIT));
+
power_supply_changed(chip->battery);
return IRQ_HANDLED;
}
--
2.33.0
This is a note to let you know that I've just added the patch titled
binder: make sure fd closes complete
to my char-misc git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git
in the char-misc-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
>From 5fdb55c1ac9585eb23bb2541d5819224429e103d Mon Sep 17 00:00:00 2001
From: Todd Kjos <tkjos(a)google.com>
Date: Mon, 30 Aug 2021 12:51:46 -0700
Subject: binder: make sure fd closes complete
During BC_FREE_BUFFER processing, the BINDER_TYPE_FDA object
cleanup may close 1 or more fds. The close operations are
completed using the task work mechanism -- which means the thread
needs to return to userspace or the file object may never be
dereferenced -- which can lead to hung processes.
Force the binder thread back to userspace if an fd is closed during
BC_FREE_BUFFER handling.
Fixes: 80cd795630d6 ("binder: fix use-after-free due to ksys_close() during fdget()")
Cc: stable <stable(a)vger.kernel.org>
Reviewed-by: Martijn Coenen <maco(a)android.com>
Acked-by: Christian Brauner <christian.brauner(a)ubuntu.com>
Signed-off-by: Todd Kjos <tkjos(a)google.com>
Link: https://lore.kernel.org/r/20210830195146.587206-1-tkjos@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/android/binder.c | 23 +++++++++++++++++------
1 file changed, 17 insertions(+), 6 deletions(-)
diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index 1a68c2f590cf..9edacc8b9768 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -1852,6 +1852,7 @@ static void binder_deferred_fd_close(int fd)
}
static void binder_transaction_buffer_release(struct binder_proc *proc,
+ struct binder_thread *thread,
struct binder_buffer *buffer,
binder_size_t failed_at,
bool is_failure)
@@ -2011,8 +2012,16 @@ static void binder_transaction_buffer_release(struct binder_proc *proc,
&proc->alloc, &fd, buffer,
offset, sizeof(fd));
WARN_ON(err);
- if (!err)
+ if (!err) {
binder_deferred_fd_close(fd);
+ /*
+ * Need to make sure the thread goes
+ * back to userspace to complete the
+ * deferred close
+ */
+ if (thread)
+ thread->looper_need_return = true;
+ }
}
} break;
default:
@@ -3104,7 +3113,7 @@ static void binder_transaction(struct binder_proc *proc,
err_copy_data_failed:
binder_free_txn_fixups(t);
trace_binder_transaction_failed_buffer_release(t->buffer);
- binder_transaction_buffer_release(target_proc, t->buffer,
+ binder_transaction_buffer_release(target_proc, NULL, t->buffer,
buffer_offset, true);
if (target_node)
binder_dec_node_tmpref(target_node);
@@ -3183,7 +3192,9 @@ static void binder_transaction(struct binder_proc *proc,
* Cleanup buffer and free it.
*/
static void
-binder_free_buf(struct binder_proc *proc, struct binder_buffer *buffer)
+binder_free_buf(struct binder_proc *proc,
+ struct binder_thread *thread,
+ struct binder_buffer *buffer)
{
binder_inner_proc_lock(proc);
if (buffer->transaction) {
@@ -3211,7 +3222,7 @@ binder_free_buf(struct binder_proc *proc, struct binder_buffer *buffer)
binder_node_inner_unlock(buf_node);
}
trace_binder_transaction_buffer_release(buffer);
- binder_transaction_buffer_release(proc, buffer, 0, false);
+ binder_transaction_buffer_release(proc, thread, buffer, 0, false);
binder_alloc_free_buf(&proc->alloc, buffer);
}
@@ -3413,7 +3424,7 @@ static int binder_thread_write(struct binder_proc *proc,
proc->pid, thread->pid, (u64)data_ptr,
buffer->debug_id,
buffer->transaction ? "active" : "finished");
- binder_free_buf(proc, buffer);
+ binder_free_buf(proc, thread, buffer);
break;
}
@@ -4106,7 +4117,7 @@ static int binder_thread_read(struct binder_proc *proc,
buffer->transaction = NULL;
binder_cleanup_transaction(t, "fd fixups failed",
BR_FAILED_REPLY);
- binder_free_buf(proc, buffer);
+ binder_free_buf(proc, thread, buffer);
binder_debug(BINDER_DEBUG_FAILED_TRANSACTION,
"%d:%d %stransaction %d fd fixups failed %d/%d, line %d\n",
proc->pid, thread->pid,
--
2.33.0
This is a note to let you know that I've just added the patch titled
binder: fix freeze race
to my char-misc git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git
in the char-misc-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
>From b564171ade70570b7f335fa8ed17adb28409e3ac Mon Sep 17 00:00:00 2001
From: Li Li <dualli(a)google.com>
Date: Fri, 10 Sep 2021 09:42:10 -0700
Subject: binder: fix freeze race
Currently cgroup freezer is used to freeze the application threads, and
BINDER_FREEZE is used to freeze the corresponding binder interface.
There's already a mechanism in ioctl(BINDER_FREEZE) to wait for any
existing transactions to drain out before actually freezing the binder
interface.
But freezing an app requires 2 steps, freezing the binder interface with
ioctl(BINDER_FREEZE) and then freezing the application main threads with
cgroupfs. This is not an atomic operation. The following race issue
might happen.
1) Binder interface is frozen by ioctl(BINDER_FREEZE);
2) Main thread A initiates a new sync binder transaction to process B;
3) Main thread A is frozen by "echo 1 > cgroup.freeze";
4) The response from process B reaches the frozen thread, which will
unexpectedly fail.
This patch provides a mechanism to check if there's any new pending
transaction happening between ioctl(BINDER_FREEZE) and freezing the
main thread. If there's any, the main thread freezing operation can
be rolled back to finish the pending transaction.
Furthermore, the response might reach the binder driver before the
rollback actually happens, which would still cause a failed transaction.
Since the other process doesn't wait for a reply to its response, this
failure can be avoided by treating the response transaction like a
oneway/async one, allowing it to reach the frozen thread. It will then
be consumed when the thread gets unfrozen later.
NOTE: This patch reuses the existing definition of struct
binder_frozen_status_info but expands the bit assignments of __u32
member sync_recv.
To ensure backward compatibility, bit 0 of sync_recv still indicates
there's an outstanding sync binder transaction. This patch adds new
information to bit 1 of sync_recv, indicating the binder transaction
happens exactly when there's a race.
If an existing userspace app runs on a new kernel, a sync binder call
will set bit 0 of sync_recv so ioctl(BINDER_GET_FROZEN_INFO) still
return the expected value (true). The app just doesn't check bit 1
intentionally so it doesn't have the ability to tell if there's a race.
This behavior is aligned with what happens on an old kernel which
doesn't set bit 1 at all.
A new userspace app can 1) check bit 0 to know if there's a sync binder
transaction happened when being frozen - same as before; and 2) check
bit 1 to know if that sync binder transaction happened exactly when
there's a race - new information for the rollback decision.
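A hypothetical userspace sequence (sketch only; freeze_cgroup() and
unfreeze_cgroup() are made-up helpers) showing how bit 1 can drive the
rollback decision:

    struct binder_freeze_info freeze = {
            .pid = pid, .enable = 1, .timeout_ms = 100 };
    struct binder_frozen_status_info status = { .pid = pid };

    ioctl(binder_fd, BINDER_FREEZE, &freeze);       /* 1) freeze binder  */
    freeze_cgroup(pid);                             /* 2) freeze threads */

    ioctl(binder_fd, BINDER_GET_FROZEN_INFO, &status);
    if (status.sync_recv & 0x2) {                   /* bit 1: race seen  */
            unfreeze_cgroup(pid);
            freeze.enable = 0;
            ioctl(binder_fd, BINDER_FREEZE, &freeze);   /* roll back     */
    }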
Fixes: 432ff1e91694 ("binder: BINDER_FREEZE ioctl")
Acked-by: Todd Kjos <tkjos(a)google.com>
Cc: stable <stable(a)vger.kernel.org>
Signed-off-by: Li Li <dualli(a)google.com>
Test: stress test with apps being frozen and initiating binder calls at
the same time, confirmed the pending transactions succeeded.
Link: https://lore.kernel.org/r/20210910164210.2282716-2-dualli@chromium.org
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/android/binder.c | 35 ++++++++++++++++++++++++-----
drivers/android/binder_internal.h | 2 ++
include/uapi/linux/android/binder.h | 7 ++++++
3 files changed, 38 insertions(+), 6 deletions(-)
diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index d9030cb6b1e4..1a68c2f590cf 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -3038,9 +3038,8 @@ static void binder_transaction(struct binder_proc *proc,
if (reply) {
binder_enqueue_thread_work(thread, tcomplete);
binder_inner_proc_lock(target_proc);
- if (target_thread->is_dead || target_proc->is_frozen) {
- return_error = target_thread->is_dead ?
- BR_DEAD_REPLY : BR_FROZEN_REPLY;
+ if (target_thread->is_dead) {
+ return_error = BR_DEAD_REPLY;
binder_inner_proc_unlock(target_proc);
goto err_dead_proc_or_thread;
}
@@ -4648,6 +4647,22 @@ static int binder_ioctl_get_node_debug_info(struct binder_proc *proc,
return 0;
}
+static bool binder_txns_pending_ilocked(struct binder_proc *proc)
+{
+ struct rb_node *n;
+ struct binder_thread *thread;
+
+ if (proc->outstanding_txns > 0)
+ return true;
+
+ for (n = rb_first(&proc->threads); n; n = rb_next(n)) {
+ thread = rb_entry(n, struct binder_thread, rb_node);
+ if (thread->transaction_stack)
+ return true;
+ }
+ return false;
+}
+
static int binder_ioctl_freeze(struct binder_freeze_info *info,
struct binder_proc *target_proc)
{
@@ -4679,8 +4694,13 @@ static int binder_ioctl_freeze(struct binder_freeze_info *info,
(!target_proc->outstanding_txns),
msecs_to_jiffies(info->timeout_ms));
- if (!ret && target_proc->outstanding_txns)
- ret = -EAGAIN;
+ /* Check pending transactions that wait for reply */
+ if (ret >= 0) {
+ binder_inner_proc_lock(target_proc);
+ if (binder_txns_pending_ilocked(target_proc))
+ ret = -EAGAIN;
+ binder_inner_proc_unlock(target_proc);
+ }
if (ret < 0) {
binder_inner_proc_lock(target_proc);
@@ -4696,6 +4716,7 @@ static int binder_ioctl_get_freezer_info(
{
struct binder_proc *target_proc;
bool found = false;
+ __u32 txns_pending;
info->sync_recv = 0;
info->async_recv = 0;
@@ -4705,7 +4726,9 @@ static int binder_ioctl_get_freezer_info(
if (target_proc->pid == info->pid) {
found = true;
binder_inner_proc_lock(target_proc);
- info->sync_recv |= target_proc->sync_recv;
+ txns_pending = binder_txns_pending_ilocked(target_proc);
+ info->sync_recv |= target_proc->sync_recv |
+ (txns_pending << 1);
info->async_recv |= target_proc->async_recv;
binder_inner_proc_unlock(target_proc);
}
diff --git a/drivers/android/binder_internal.h b/drivers/android/binder_internal.h
index 810c0b84d3f8..402c4d4362a8 100644
--- a/drivers/android/binder_internal.h
+++ b/drivers/android/binder_internal.h
@@ -378,6 +378,8 @@ struct binder_ref {
* binder transactions
* (protected by @inner_lock)
* @sync_recv: process received sync transactions since last frozen
+ * bit 0: received sync transaction after being frozen
+ * bit 1: new pending sync transaction during freezing
* (protected by @inner_lock)
* @async_recv: process received async transactions since last frozen
* (protected by @inner_lock)
diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h
index 20e435fe657a..3246f2c74696 100644
--- a/include/uapi/linux/android/binder.h
+++ b/include/uapi/linux/android/binder.h
@@ -225,7 +225,14 @@ struct binder_freeze_info {
struct binder_frozen_status_info {
__u32 pid;
+
+ /* process received sync transactions since last frozen
+ * bit 0: received sync transaction after being frozen
+ * bit 1: new pending sync transaction during freezing
+ */
__u32 sync_recv;
+
+ /* process received async transactions since last frozen */
__u32 async_recv;
};
--
2.33.0
Hello,
We ran automated tests on a recent commit from this kernel tree:
Kernel repo: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git
Commit: 82fd0e61d8e0 - ASoC: rt5682: Remove unused variable in rt5682_i2c_remove()
The results of these automated tests are provided below.
Overall result: PASSED
Merge: OK
Compile: OK
Tests: OK
Targeted tests: NO
All kernel binaries, config files, and logs are available for download here:
https://arr-cki-prod-datawarehouse-public.s3.amazonaws.com/index.html?prefi…
Please reply to this email if you have any questions about the tests that we
ran or if you have any suggestions on how to make future tests more effective.
,-. ,-.
( C ) ( K ) Continuous
`-',-.`-' Kernel
( I ) Integration
`-'
______________________________________________________________________________
Compile testing
---------------
We compiled the kernel for 4 architectures:
aarch64:
make options: make -j24 INSTALL_MOD_STRIP=1 targz-pkg
ppc64le:
make options: make -j24 INSTALL_MOD_STRIP=1 targz-pkg
s390x:
make options: make -j24 INSTALL_MOD_STRIP=1 targz-pkg
x86_64:
make options: make -j24 INSTALL_MOD_STRIP=1 targz-pkg
Hardware testing
----------------
We booted each kernel and ran the following tests:
aarch64:
Host 1:
⚡ Internal infrastructure issues prevented one or more tests (marked
with ⚡⚡⚡) from running on this architecture.
This is not the fault of the kernel that was tested.
✅ Boot test
✅ Reboot test
✅ ACPI table test
✅ ACPI enabled test
✅ LTP - cve
✅ LTP - sched
⚡⚡⚡ LTP - syscalls
⚡⚡⚡ LTP - can
⚡⚡⚡ LTP - commands
⚡⚡⚡ LTP - containers
⚡⚡⚡ LTP - dio
⚡⚡⚡ LTP - fs
⚡⚡⚡ LTP - fsx
⚡⚡⚡ LTP - math
⚡⚡⚡ LTP - hugetlb
⚡⚡⚡ LTP - mm
⚡⚡⚡ LTP - nptl
⚡⚡⚡ LTP - pty
⚡⚡⚡ LTP - ipc
⚡⚡⚡ LTP - tracing
⚡⚡⚡ CIFS Connectathon
⚡⚡⚡ POSIX pjd-fstest suites
⚡⚡⚡ NFS Connectathon
⚡⚡⚡ Loopdev Sanity
⚡⚡⚡ jvm - jcstress tests
⚡⚡⚡ Memory: fork_mem
⚡⚡⚡ Memory function: memfd_create
⚡⚡⚡ AMTU (Abstract Machine Test Utility)
⚡⚡⚡ Networking bridge: sanity
⚡⚡⚡ Ethernet drivers sanity
⚡⚡⚡ Networking socket: fuzz
⚡⚡⚡ Networking route: pmtu
⚡⚡⚡ Networking route_func - local
⚡⚡⚡ Networking route_func - forward
⚡⚡⚡ Networking TCP: keepalive test
⚡⚡⚡ Networking UDP: socket
⚡⚡⚡ Networking cki netfilter test
⚡⚡⚡ Networking tunnel: geneve basic test
⚡⚡⚡ Networking tunnel: gre basic
⚡⚡⚡ L2TP basic test
⚡⚡⚡ Networking tunnel: vxlan basic
⚡⚡⚡ Networking ipsec: basic netns - transport
⚡⚡⚡ Networking ipsec: basic netns - tunnel
⚡⚡⚡ Libkcapi AF_ALG test
⚡⚡⚡ pciutils: update pci ids test
⚡⚡⚡ ALSA PCM loopback test
⚡⚡⚡ ALSA Control (mixer) Userspace Element test
⚡⚡⚡ storage: dm/common
⚡⚡⚡ storage: SCSI VPD
⚡⚡⚡ trace: ftrace/tracer
🚧 ⚡⚡⚡ xarray-idr-radixtree-test
🚧 ⚡⚡⚡ i2c: i2cdetect sanity
🚧 ⚡⚡⚡ Firmware test suite
🚧 ⚡⚡⚡ Memory function: kaslr
🚧 ⚡⚡⚡ Networking: igmp conformance test
🚧 ⚡⚡⚡ audit: audit testsuite test
🚧 ⚡⚡⚡ lvm cache test
🚧 ⚡⚡⚡ lvm snapper test
Host 2:
⚡ Internal infrastructure issues prevented one or more tests (marked
with ⚡⚡⚡) from running on this architecture.
This is not the fault of the kernel that was tested.
⚡⚡⚡ Boot test
⚡⚡⚡ Reboot test
⚡⚡⚡ xfstests - ext4
⚡⚡⚡ xfstests - xfs
⚡⚡⚡ IPMI driver test
⚡⚡⚡ IPMItool loop stress test
⚡⚡⚡ selinux-policy: serge-testsuite
⚡⚡⚡ Storage blktests - blk
⚡⚡⚡ Storage block - filesystem fio test
⚡⚡⚡ Storage block - queue scheduler test
⚡⚡⚡ storage: software RAID testing
⚡⚡⚡ Storage: swraid mdadm raid_module test
🚧 ⚡⚡⚡ Podman system integration test - as root
🚧 ⚡⚡⚡ Podman system integration test - as user
🚧 ⚡⚡⚡ xfstests - btrfs
🚧 ⚡⚡⚡ Storage blktests - nvme-tcp
🚧 ⚡⚡⚡ Storage blktests - nvmeof-mp
🚧 ⚡⚡⚡ Storage blktests - srp
🚧 ⚡⚡⚡ stress: stress-ng
ppc64le:
Host 1:
⚡ Internal infrastructure issues prevented one or more tests (marked
with ⚡⚡⚡) from running on this architecture.
This is not the fault of the kernel that was tested.
✅ Boot test
✅ Reboot test
✅ LTP - cve
✅ LTP - sched
⚡⚡⚡ LTP - syscalls
⚡⚡⚡ LTP - can
⚡⚡⚡ LTP - commands
⚡⚡⚡ LTP - containers
⚡⚡⚡ LTP - dio
⚡⚡⚡ LTP - fs
⚡⚡⚡ LTP - fsx
⚡⚡⚡ LTP - math
⚡⚡⚡ LTP - hugetlb
⚡⚡⚡ LTP - mm
⚡⚡⚡ LTP - nptl
⚡⚡⚡ LTP - pty
⚡⚡⚡ LTP - ipc
⚡⚡⚡ LTP - tracing
⚡⚡⚡ CIFS Connectathon
⚡⚡⚡ POSIX pjd-fstest suites
⚡⚡⚡ NFS Connectathon
⚡⚡⚡ Loopdev Sanity
⚡⚡⚡ jvm - jcstress tests
⚡⚡⚡ Memory: fork_mem
⚡⚡⚡ Memory function: memfd_create
⚡⚡⚡ AMTU (Abstract Machine Test Utility)
⚡⚡⚡ Networking bridge: sanity
⚡⚡⚡ Ethernet drivers sanity
⚡⚡⚡ Networking socket: fuzz
⚡⚡⚡ Networking route: pmtu
⚡⚡⚡ Networking route_func - local
⚡⚡⚡ Networking route_func - forward
⚡⚡⚡ Networking TCP: keepalive test
⚡⚡⚡ Networking UDP: socket
⚡⚡⚡ Networking cki netfilter test
⚡⚡⚡ Networking tunnel: geneve basic test
⚡⚡⚡ Networking tunnel: gre basic
⚡⚡⚡ L2TP basic test
⚡⚡⚡ Networking tunnel: vxlan basic
⚡⚡⚡ Networking ipsec: basic netns - tunnel
⚡⚡⚡ Libkcapi AF_ALG test
⚡⚡⚡ pciutils: update pci ids test
⚡⚡⚡ ALSA PCM loopback test
⚡⚡⚡ ALSA Control (mixer) Userspace Element test
⚡⚡⚡ storage: dm/common
⚡⚡⚡ trace: ftrace/tracer
🚧 ⚡⚡⚡ xarray-idr-radixtree-test
🚧 ⚡⚡⚡ Memory function: kaslr
🚧 ⚡⚡⚡ audit: audit testsuite test
🚧 ⚡⚡⚡ lvm cache test
🚧 ⚡⚡⚡ lvm snapper test
Host 2:
✅ Boot test
✅ Reboot test
✅ xfstests - ext4
✅ xfstests - xfs
✅ IPMI driver test
✅ IPMItool loop stress test
✅ selinux-policy: serge-testsuite
✅ Storage blktests - blk
✅ Storage block - filesystem fio test
✅ Storage block - queue scheduler test
✅ storage: software RAID testing
✅ Storage: swraid mdadm raid_module test
🚧 ✅ Podman system integration test - as root
🚧 ✅ Podman system integration test - as user
🚧 ✅ xfstests - btrfs
🚧 ✅ Storage blktests - nvme-tcp
🚧 ❌ Storage blktests - nvmeof-mp
🚧 ❌ Storage blktests - srp
🚧 ✅ Storage: lvm device-mapper test - upstream
Host 3:
✅ Boot test
✅ Reboot test
✅ LTP - cve
✅ LTP - sched
✅ LTP - syscalls
✅ LTP - can
✅ LTP - commands
✅ LTP - containers
✅ LTP - dio
✅ LTP - fs
✅ LTP - fsx
✅ LTP - math
✅ LTP - hugetlb
✅ LTP - mm
✅ LTP - nptl
✅ LTP - pty
✅ LTP - ipc
✅ LTP - tracing
✅ CIFS Connectathon
✅ POSIX pjd-fstest suites
✅ NFS Connectathon
✅ Loopdev Sanity
✅ jvm - jcstress tests
✅ Memory: fork_mem
✅ Memory function: memfd_create
✅ AMTU (Abstract Machine Test Utility)
✅ Networking bridge: sanity
✅ Ethernet drivers sanity
✅ Networking socket: fuzz
✅ Networking route: pmtu
✅ Networking route_func - local
✅ Networking route_func - forward
✅ Networking TCP: keepalive test
✅ Networking UDP: socket
✅ Networking cki netfilter test
✅ Networking tunnel: geneve basic test
✅ Networking tunnel: gre basic
✅ L2TP basic test
✅ Networking tunnel: vxlan basic
✅ Networking ipsec: basic netns - tunnel
✅ Libkcapi AF_ALG test
✅ pciutils: update pci ids test
✅ ALSA PCM loopback test
✅ ALSA Control (mixer) Userspace Element test
✅ storage: dm/common
✅ trace: ftrace/tracer
🚧 ✅ xarray-idr-radixtree-test
🚧 ✅ Memory function: kaslr
🚧 ✅ audit: audit testsuite test
🚧 ✅ lvm cache test
🚧 ✅ lvm snapper test
s390x:
Host 1:
⚡ Internal infrastructure issues prevented one or more tests (marked
with ⚡⚡⚡) from running on this architecture.
This is not the fault of the kernel that was tested.
✅ Boot test
✅ Reboot test
✅ LTP - cve
✅ LTP - sched
✅ LTP - syscalls
✅ LTP - can
✅ LTP - commands
✅ LTP - containers
✅ LTP - dio
✅ LTP - fs
✅ LTP - fsx
✅ LTP - math
✅ LTP - hugetlb
✅ LTP - mm
✅ LTP - nptl
✅ LTP - pty
✅ LTP - ipc
⚡⚡⚡ LTP - tracing
✅ CIFS Connectathon
✅ POSIX pjd-fstest suites
✅ NFS Connectathon
✅ Loopdev Sanity
✅ jvm - jcstress tests
✅ Memory: fork_mem
✅ Memory function: memfd_create
✅ AMTU (Abstract Machine Test Utility)
✅ Networking bridge: sanity
✅ Ethernet drivers sanity
✅ Networking route: pmtu
✅ Networking route_func - local
✅ Networking route_func - forward
✅ Networking TCP: keepalive test
✅ Networking UDP: socket
✅ Networking cki netfilter test
✅ Networking tunnel: geneve basic test
✅ Networking tunnel: gre basic
✅ L2TP basic test
✅ Networking tunnel: vxlan basic
✅ Networking ipsec: basic netns - transport
✅ Networking ipsec: basic netns - tunnel
✅ Libkcapi AF_ALG test
✅ storage: dm/common
✅ trace: ftrace/tracer
🚧 ❌ xarray-idr-radixtree-test
🚧 ✅ Memory function: kaslr
🚧 ✅ audit: audit testsuite test
🚧 ✅ lvm cache test
🚧 ✅ lvm snapper test
Host 2:
⚡ Internal infrastructure issues prevented one or more tests (marked
with ⚡⚡⚡) from running on this architecture.
This is not the fault of the kernel that was tested.
⚡⚡⚡ Boot test
⚡⚡⚡ Reboot test
⚡⚡⚡ selinux-policy: serge-testsuite
⚡⚡⚡ Storage blktests - blk
⚡⚡⚡ Storage: swraid mdadm raid_module test
🚧 ⚡⚡⚡ Podman system integration test - as root
🚧 ⚡⚡⚡ Podman system integration test - as user
🚧 ⚡⚡⚡ Storage blktests - nvme-tcp
🚧 ⚡⚡⚡ Storage blktests - nvmeof-mp
🚧 ⚡⚡⚡ Storage blktests - srp
🚧 ⚡⚡⚡ stress: stress-ng
Host 3:
⚡ Internal infrastructure issues prevented one or more tests (marked
with ⚡⚡⚡) from running on this architecture.
This is not the fault of the kernel that was tested.
✅ Boot test
✅ Reboot test
✅ selinux-policy: serge-testsuite
✅ Storage blktests - blk
✅ Storage: swraid mdadm raid_module test
🚧 ✅ Podman system integration test - as root
🚧 ✅ Podman system integration test - as user
🚧 ✅ Storage blktests - nvme-tcp
🚧 ❌ Storage blktests - nvmeof-mp
🚧 ❌ Storage blktests - srp
🚧 ⚡⚡⚡ stress: stress-ng
x86_64:
Host 1:
⚡ Internal infrastructure issues prevented one or more tests (marked
with ⚡⚡⚡) from running on this architecture.
This is not the fault of the kernel that was tested.
⚡⚡⚡ Boot test
⚡⚡⚡ Reboot test
⚡⚡⚡ xfstests - ext4
⚡⚡⚡ xfstests - xfs
⚡⚡⚡ xfstests - nfsv4.2
⚡⚡⚡ xfstests - cifsv3.11
⚡⚡⚡ IPMI driver test
⚡⚡⚡ IPMItool loop stress test
⚡⚡⚡ selinux-policy: serge-testsuite
⚡⚡⚡ power-management: cpupower/sanity test
⚡⚡⚡ Storage blktests - blk
⚡⚡⚡ Storage block - filesystem fio test
⚡⚡⚡ Storage block - queue scheduler test
⚡⚡⚡ storage: software RAID testing
⚡⚡⚡ Storage: swraid mdadm raid_module test
🚧 ⚡⚡⚡ Podman system integration test - as root
🚧 ⚡⚡⚡ Podman system integration test - as user
🚧 ⚡⚡⚡ CPU: Idle Test
🚧 ⚡⚡⚡ xfstests - btrfs
🚧 ⚡⚡⚡ Storage blktests - nvme-tcp
🚧 ⚡⚡⚡ Storage blktests - nvmeof-mp
🚧 ⚡⚡⚡ Storage blktests - srp
🚧 ⚡⚡⚡ Storage: lvm device-mapper test - upstream
🚧 ⚡⚡⚡ stress: stress-ng
Host 2:
✅ Boot test
✅ Reboot test
✅ ACPI table test
✅ LTP - cve
✅ LTP - sched
✅ LTP - syscalls
✅ LTP - can
✅ LTP - commands
✅ LTP - containers
✅ LTP - dio
✅ LTP - fs
✅ LTP - fsx
✅ LTP - math
✅ LTP - hugetlb
✅ LTP - mm
✅ LTP - nptl
✅ LTP - pty
✅ LTP - ipc
✅ LTP - tracing
✅ CIFS Connectathon
✅ POSIX pjd-fstest suites
✅ NFS Connectathon
✅ Loopdev Sanity
✅ jvm - jcstress tests
✅ Memory: fork_mem
✅ Memory function: memfd_create
✅ AMTU (Abstract Machine Test Utility)
✅ Networking bridge: sanity
✅ Ethernet drivers sanity
✅ Networking socket: fuzz
✅ Networking route: pmtu
✅ Networking route_func - local
✅ Networking route_func - forward
✅ Networking TCP: keepalive test
✅ Networking UDP: socket
✅ Networking cki netfilter test
✅ Networking tunnel: geneve basic test
✅ Networking tunnel: gre basic
✅ L2TP basic test
✅ Networking tunnel: vxlan basic
✅ Networking ipsec: basic netns - transport
✅ Networking ipsec: basic netns - tunnel
✅ Libkcapi AF_ALG test
✅ pciutils: sanity smoke test
✅ pciutils: update pci ids test
✅ ALSA PCM loopback test
✅ ALSA Control (mixer) Userspace Element test
✅ storage: dm/common
✅ storage: SCSI VPD
✅ trace: ftrace/tracer
🚧 ✅ xarray-idr-radixtree-test
🚧 ✅ i2c: i2cdetect sanity
🚧 ✅ Firmware test suite
🚧 ✅ Memory function: kaslr
🚧 ✅ Networking: igmp conformance test
🚧 ✅ audit: audit testsuite test
🚧 ✅ lvm cache test
🚧 ✅ lvm snapper test
Host 3:
⚡ Internal infrastructure issues prevented one or more tests (marked
with ⚡⚡⚡) from running on this architecture.
This is not the fault of the kernel that was tested.
✅ Boot test
✅ Reboot test
✅ xfstests - ext4
✅ xfstests - xfs
✅ xfstests - nfsv4.2
✅ xfstests - cifsv3.11
✅ IPMI driver test
✅ IPMItool loop stress test
⚡⚡⚡ selinux-policy: serge-testsuite
⚡⚡⚡ power-management: cpupower/sanity test
⚡⚡⚡ Storage blktests - blk
⚡⚡⚡ Storage block - filesystem fio test
⚡⚡⚡ Storage block - queue scheduler test
⚡⚡⚡ storage: software RAID testing
⚡⚡⚡ Storage: swraid mdadm raid_module test
🚧 ⚡⚡⚡ Podman system integration test - as root
🚧 ⚡⚡⚡ Podman system integration test - as user
🚧 ⚡⚡⚡ CPU: Idle Test
🚧 ⚡⚡⚡ xfstests - btrfs
🚧 ⚡⚡⚡ Storage blktests - nvme-tcp
🚧 ⚡⚡⚡ Storage blktests - nvmeof-mp
🚧 ⚡⚡⚡ Storage blktests - srp
🚧 ⚡⚡⚡ Storage: lvm device-mapper test - upstream
🚧 ⚡⚡⚡ stress: stress-ng
Host 4:
⚡ Internal infrastructure issues prevented one or more tests (marked
with ⚡⚡⚡) from running on this architecture.
This is not the fault of the kernel that was tested.
⚡⚡⚡ Boot test
⚡⚡⚡ Reboot test
⚡⚡⚡ xfstests - ext4
⚡⚡⚡ xfstests - xfs
⚡⚡⚡ xfstests - nfsv4.2
⚡⚡⚡ xfstests - cifsv3.11
⚡⚡⚡ IPMI driver test
⚡⚡⚡ IPMItool loop stress test
⚡⚡⚡ selinux-policy: serge-testsuite
⚡⚡⚡ power-management: cpupower/sanity test
⚡⚡⚡ Storage blktests - blk
⚡⚡⚡ Storage block - filesystem fio test
⚡⚡⚡ Storage block - queue scheduler test
⚡⚡⚡ storage: software RAID testing
⚡⚡⚡ Storage: swraid mdadm raid_module test
🚧 ⚡⚡⚡ Podman system integration test - as root
🚧 ⚡⚡⚡ Podman system integration test - as user
🚧 ⚡⚡⚡ CPU: Idle Test
🚧 ⚡⚡⚡ xfstests - btrfs
🚧 ⚡⚡⚡ Storage blktests - nvme-tcp
🚧 ⚡⚡⚡ Storage blktests - nvmeof-mp
🚧 ⚡⚡⚡ Storage blktests - srp
🚧 ⚡⚡⚡ Storage: lvm device-mapper test - upstream
🚧 ⚡⚡⚡ stress: stress-ng
Test sources: https://gitlab.com/cki-project/kernel-tests
💚 Pull requests are welcome for new tests or improvements to existing tests!
Aborted tests
-------------
Tests that didn't complete running successfully are marked with ⚡⚡⚡.
If this was caused by an infrastructure issue, we try to mark that
explicitly in the report.
Waived tests
------------
If the test run included waived tests, they are marked with 🚧. Such tests are
executed but their results are not taken into account. Tests are waived when
their results are not reliable enough, e.g. when they're just introduced or are
being fixed.
Testing timeout
---------------
We aim to provide a report within a reasonable timeframe. Tests that haven't
finished running yet are marked with ⏱.
Targeted tests
--------------
Test runs for patches always include a set of base tests, plus some
tests chosen based on the file paths modified by the patch. The latter
are called "targeted tests". If no targeted tests are run, that means
no patch-specific tests are available. Please consider contributing a
targeted test for related patches to increase test coverage. See
https://docs.engineering.redhat.com/x/_wEZB for more details.
Hello,
We ran automated tests on a recent commit from this kernel tree:
Kernel repo: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git
Commit: 08d4da79178d - Linux 5.14.3-rc1
The results of these automated tests are provided below.
Overall result: PASSED
Merge: OK
Compile: OK
Tests: OK
Targeted tests: NO
All kernel binaries, config files, and logs are available for download here:
https://arr-cki-prod-datawarehouse-public.s3.amazonaws.com/index.html?prefi…
Please reply to this email if you have any questions about the tests that we
ran or if you have any suggestions on how to make future tests more effective.
     ,-.   ,-.
    ( C ) ( K )  Continuous
     `-',-.`-'   Kernel
       ( I )     Integration
        `-'
______________________________________________________________________________
Compile testing
---------------
We compiled the kernel for 4 architectures:
aarch64:
make options: make -j24 INSTALL_MOD_STRIP=1 targz-pkg
ppc64le:
make options: make -j24 INSTALL_MOD_STRIP=1 targz-pkg
s390x:
make options: make -j24 INSTALL_MOD_STRIP=1 targz-pkg
x86_64:
make options: make -j24 INSTALL_MOD_STRIP=1 targz-pkg
Hardware testing
----------------
We booted each kernel and ran the following tests:
aarch64:
Host 1:
⚡ Internal infrastructure issues prevented one or more tests (marked
with ⚡⚡⚡) from running on this architecture.
This is not the fault of the kernel that was tested.
✅ Boot test
✅ Reboot test
✅ ACPI table test
✅ ACPI enabled test
✅ LTP - cve
✅ LTP - sched
⚡⚡⚡ LTP - syscalls
⚡⚡⚡ LTP - can
⚡⚡⚡ LTP - commands
⚡⚡⚡ LTP - containers
⚡⚡⚡ LTP - dio
⚡⚡⚡ LTP - fs
⚡⚡⚡ LTP - fsx
⚡⚡⚡ LTP - math
⚡⚡⚡ LTP - hugetlb
⚡⚡⚡ LTP - mm
⚡⚡⚡ LTP - nptl
⚡⚡⚡ LTP - pty
⚡⚡⚡ LTP - ipc
⚡⚡⚡ LTP - tracing
⚡⚡⚡ CIFS Connectathon
⚡⚡⚡ POSIX pjd-fstest suites
⚡⚡⚡ NFS Connectathon
⚡⚡⚡ Loopdev Sanity
⚡⚡⚡ jvm - jcstress tests
⚡⚡⚡ Memory: fork_mem
⚡⚡⚡ Memory function: memfd_create
⚡⚡⚡ AMTU (Abstract Machine Test Utility)
⚡⚡⚡ Networking bridge: sanity
⚡⚡⚡ Ethernet drivers sanity
⚡⚡⚡ Networking socket: fuzz
⚡⚡⚡ Networking route: pmtu
⚡⚡⚡ Networking route_func - local
⚡⚡⚡ Networking route_func - forward
⚡⚡⚡ Networking TCP: keepalive test
⚡⚡⚡ Networking UDP: socket
⚡⚡⚡ Networking cki netfilter test
⚡⚡⚡ Networking tunnel: geneve basic test
⚡⚡⚡ Networking tunnel: gre basic
⚡⚡⚡ L2TP basic test
⚡⚡⚡ Networking tunnel: vxlan basic
⚡⚡⚡ Networking ipsec: basic netns - transport
⚡⚡⚡ Networking ipsec: basic netns - tunnel
⚡⚡⚡ Libkcapi AF_ALG test
⚡⚡⚡ pciutils: update pci ids test
⚡⚡⚡ ALSA PCM loopback test
⚡⚡⚡ ALSA Control (mixer) Userspace Element test
⚡⚡⚡ storage: dm/common
⚡⚡⚡ storage: SCSI VPD
⚡⚡⚡ trace: ftrace/tracer
🚧 ⚡⚡⚡ xarray-idr-radixtree-test
🚧 ⚡⚡⚡ i2c: i2cdetect sanity
🚧 ⚡⚡⚡ Firmware test suite
🚧 ⚡⚡⚡ Memory function: kaslr
🚧 ⚡⚡⚡ Networking: igmp conformance test
🚧 ⚡⚡⚡ audit: audit testsuite test
🚧 ⚡⚡⚡ lvm cache test
🚧 ⚡⚡⚡ lvm snapper test
Host 2:
⚡ Internal infrastructure issues prevented one or more tests (marked
with ⚡⚡⚡) from running on this architecture.
This is not the fault of the kernel that was tested.
⚡⚡⚡ Boot test
⚡⚡⚡ Reboot test
⚡⚡⚡ xfstests - ext4
⚡⚡⚡ xfstests - xfs
⚡⚡⚡ IPMI driver test
⚡⚡⚡ IPMItool loop stress test
⚡⚡⚡ selinux-policy: serge-testsuite
⚡⚡⚡ Storage blktests - blk
⚡⚡⚡ Storage block - filesystem fio test
⚡⚡⚡ Storage block - queue scheduler test
⚡⚡⚡ storage: software RAID testing
⚡⚡⚡ Storage: swraid mdadm raid_module test
🚧 ⚡⚡⚡ Podman system integration test - as root
🚧 ⚡⚡⚡ Podman system integration test - as user
🚧 ⚡⚡⚡ xfstests - btrfs
🚧 ⚡⚡⚡ Storage blktests - nvme-tcp
🚧 ⚡⚡⚡ Storage blktests - nvmeof-mp
🚧 ⚡⚡⚡ Storage blktests - srp
🚧 ⚡⚡⚡ stress: stress-ng
ppc64le:
Host 1:
✅ Boot test
✅ Reboot test
✅ LTP - cve
✅ LTP - sched
✅ LTP - syscalls
✅ LTP - can
✅ LTP - commands
✅ LTP - containers
✅ LTP - dio
✅ LTP - fs
✅ LTP - fsx
✅ LTP - math
✅ LTP - hugetlb
✅ LTP - mm
✅ LTP - nptl
✅ LTP - pty
✅ LTP - ipc
✅ LTP - tracing
✅ CIFS Connectathon
✅ POSIX pjd-fstest suites
✅ NFS Connectathon
✅ Loopdev Sanity
✅ jvm - jcstress tests
✅ Memory: fork_mem
✅ Memory function: memfd_create
✅ AMTU (Abstract Machine Test Utility)
✅ Networking bridge: sanity
✅ Ethernet drivers sanity
✅ Networking socket: fuzz
✅ Networking route: pmtu
✅ Networking route_func - local
✅ Networking route_func - forward
✅ Networking TCP: keepalive test
✅ Networking UDP: socket
✅ Networking cki netfilter test
✅ Networking tunnel: geneve basic test
✅ Networking tunnel: gre basic
✅ L2TP basic test
✅ Networking tunnel: vxlan basic
✅ Networking ipsec: basic netns - tunnel
✅ Libkcapi AF_ALG test
✅ pciutils: update pci ids test
✅ ALSA PCM loopback test
✅ ALSA Control (mixer) Userspace Element test
✅ storage: dm/common
✅ trace: ftrace/tracer
🚧 ✅ xarray-idr-radixtree-test
🚧 ✅ Memory function: kaslr
🚧 ✅ audit: audit testsuite test
🚧 ✅ lvm cache test
🚧 ✅ lvm snapper test
Host 2:
✅ Boot test
✅ Reboot test
✅ xfstests - ext4
✅ xfstests - xfs
✅ IPMI driver test
✅ IPMItool loop stress test
✅ selinux-policy: serge-testsuite
✅ Storage blktests - blk
✅ Storage block - filesystem fio test
✅ Storage block - queue scheduler test
✅ storage: software RAID testing
✅ Storage: swraid mdadm raid_module test
🚧 ✅ Podman system integration test - as root
🚧 ✅ Podman system integration test - as user
🚧 ✅ xfstests - btrfs
🚧 ✅ Storage blktests - nvme-tcp
🚧 ❌ Storage blktests - nvmeof-mp
🚧 ❌ Storage blktests - srp
🚧 ✅ Storage: lvm device-mapper test - upstream
s390x:
Host 1:
⚡ Internal infrastructure issues prevented one or more tests (marked
with ⚡⚡⚡) from running on this architecture.
This is not the fault of the kernel that was tested.
⚡⚡⚡ Boot test
⚡⚡⚡ Reboot test
⚡⚡⚡ selinux-policy: serge-testsuite
⚡⚡⚡ Storage blktests - blk
⚡⚡⚡ Storage: swraid mdadm raid_module test
🚧 ⚡⚡⚡ Podman system integration test - as root
🚧 ⚡⚡⚡ Podman system integration test - as user
🚧 ⚡⚡⚡ Storage blktests - nvme-tcp
🚧 ⚡⚡⚡ Storage blktests - nvmeof-mp
🚧 ⚡⚡⚡ Storage blktests - srp
🚧 ⚡⚡⚡ stress: stress-ng
Host 2:
⚡ Internal infrastructure issues prevented one or more tests (marked
with ⚡⚡⚡) from running on this architecture.
This is not the fault of the kernel that was tested.
✅ Boot test
✅ Reboot test
✅ LTP - cve
✅ LTP - sched
✅ LTP - syscalls
✅ LTP - can
✅ LTP - commands
✅ LTP - containers
✅ LTP - dio
✅ LTP - fs
✅ LTP - fsx
✅ LTP - math
✅ LTP - hugetlb
✅ LTP - mm
✅ LTP - nptl
✅ LTP - pty
✅ LTP - ipc
✅ LTP - tracing
✅ CIFS Connectathon
✅ POSIX pjd-fstest suites
✅ NFS Connectathon
✅ Loopdev Sanity
✅ jvm - jcstress tests
✅ Memory: fork_mem
✅ Memory function: memfd_create
✅ AMTU (Abstract Machine Test Utility)
✅ Networking bridge: sanity
✅ Ethernet drivers sanity
✅ Networking route: pmtu
✅ Networking route_func - local
✅ Networking route_func - forward
✅ Networking TCP: keepalive test
✅ Networking UDP: socket
✅ Networking cki netfilter test
✅ Networking tunnel: geneve basic test
✅ Networking tunnel: gre basic
✅ L2TP basic test
✅ Networking tunnel: vxlan basic
✅ Networking ipsec: basic netns - transport
✅ Networking ipsec: basic netns - tunnel
⚡⚡⚡ Libkcapi AF_ALG test
✅ storage: dm/common
✅ trace: ftrace/tracer
🚧 ❌ xarray-idr-radixtree-test
🚧 ✅ Memory function: kaslr
🚧 ✅ audit: audit testsuite test
🚧 ✅ lvm cache test
🚧 ✅ lvm snapper test
Host 3:
✅ Boot test
✅ Reboot test
✅ selinux-policy: serge-testsuite
✅ Storage blktests - blk
✅ Storage: swraid mdadm raid_module test
🚧 ✅ Podman system integration test - as root
🚧 ✅ Podman system integration test - as user
🚧 ✅ Storage blktests - nvme-tcp
🚧 ❌ Storage blktests - nvmeof-mp
🚧 ❌ Storage blktests - srp
🚧 ✅ stress: stress-ng
x86_64:
Host 1:
⚡ Internal infrastructure issues prevented one or more tests (marked
with ⚡⚡⚡) from running on this architecture.
This is not the fault of the kernel that was tested.
⚡⚡⚡ Boot test
⚡⚡⚡ Reboot test
⚡⚡⚡ xfstests - ext4
⚡⚡⚡ xfstests - xfs
⚡⚡⚡ xfstests - nfsv4.2
⚡⚡⚡ xfstests - cifsv3.11
⚡⚡⚡ IPMI driver test
⚡⚡⚡ IPMItool loop stress test
⚡⚡⚡ selinux-policy: serge-testsuite
⚡⚡⚡ power-management: cpupower/sanity test
⚡⚡⚡ Storage blktests - blk
⚡⚡⚡ Storage block - filesystem fio test
⚡⚡⚡ Storage block - queue scheduler test
⚡⚡⚡ storage: software RAID testing
⚡⚡⚡ Storage: swraid mdadm raid_module test
🚧 ⚡⚡⚡ Podman system integration test - as root
🚧 ⚡⚡⚡ Podman system integration test - as user
🚧 ⚡⚡⚡ CPU: Idle Test
🚧 ⚡⚡⚡ xfstests - btrfs
🚧 ⚡⚡⚡ Storage blktests - nvme-tcp
🚧 ⚡⚡⚡ Storage blktests - nvmeof-mp
🚧 ⚡⚡⚡ Storage blktests - srp
🚧 ⚡⚡⚡ Storage: lvm device-mapper test - upstream
🚧 ⚡⚡⚡ stress: stress-ng
Host 2:
✅ Boot test
✅ Reboot test
✅ ACPI table test
✅ LTP - cve
✅ LTP - sched
✅ LTP - syscalls
✅ LTP - can
✅ LTP - commands
✅ LTP - containers
✅ LTP - dio
✅ LTP - fs
✅ LTP - fsx
✅ LTP - math
✅ LTP - hugetlb
✅ LTP - mm
✅ LTP - nptl
✅ LTP - pty
✅ LTP - ipc
✅ LTP - tracing
✅ CIFS Connectathon
✅ POSIX pjd-fstest suites
✅ NFS Connectathon
✅ Loopdev Sanity
✅ jvm - jcstress tests
✅ Memory: fork_mem
✅ Memory function: memfd_create
✅ AMTU (Abstract Machine Test Utility)
✅ Networking bridge: sanity
✅ Ethernet drivers sanity
✅ Networking socket: fuzz
✅ Networking route: pmtu
✅ Networking route_func - local
✅ Networking route_func - forward
✅ Networking TCP: keepalive test
✅ Networking UDP: socket
✅ Networking cki netfilter test
✅ Networking tunnel: geneve basic test
✅ Networking tunnel: gre basic
✅ L2TP basic test
✅ Networking tunnel: vxlan basic
✅ Networking ipsec: basic netns - transport
✅ Networking ipsec: basic netns - tunnel
✅ Libkcapi AF_ALG test
✅ pciutils: sanity smoke test
✅ pciutils: update pci ids test
✅ ALSA PCM loopback test
✅ ALSA Control (mixer) Userspace Element test
✅ storage: dm/common
✅ storage: SCSI VPD
✅ trace: ftrace/tracer
🚧 ❌ xarray-idr-radixtree-test
🚧 ✅ i2c: i2cdetect sanity
🚧 ✅ Firmware test suite
🚧 ✅ Memory function: kaslr
🚧 ✅ Networking: igmp conformance test
🚧 ✅ audit: audit testsuite test
🚧 ✅ lvm cache test
🚧 ✅ lvm snapper test
Host 3:
✅ Boot test
✅ Reboot test
✅ xfstests - ext4
✅ xfstests - xfs
✅ xfstests - nfsv4.2
✅ xfstests - cifsv3.11
✅ IPMI driver test
✅ IPMItool loop stress test
✅ selinux-policy: serge-testsuite
✅ power-management: cpupower/sanity test
✅ Storage blktests - blk
✅ Storage block - filesystem fio test
✅ Storage block - queue scheduler test
✅ storage: software RAID testing
✅ Storage: swraid mdadm raid_module test
🚧 ✅ Podman system integration test - as root
🚧 ✅ Podman system integration test - as user
🚧 ✅ CPU: Idle Test
🚧 ✅ xfstests - btrfs
🚧 ✅ Storage blktests - nvme-tcp
🚧 ❌ Storage blktests - nvmeof-mp
🚧 ❌ Storage blktests - srp
🚧 ✅ Storage: lvm device-mapper test - upstream
🚧 ✅ stress: stress-ng
Test sources: https://gitlab.com/cki-project/kernel-tests
💚 Pull requests are welcome for new tests or improvements to existing tests!
Aborted tests
-------------
Tests that didn't complete running successfully are marked with ⚡⚡⚡.
If this was caused by an infrastructure issue, we try to mark that
explicitly in the report.
Waived tests
------------
If the test run included waived tests, they are marked with 🚧. Such tests are
executed but their results are not taken into account. Tests are waived when
their results are not reliable enough, e.g. when they're just introduced or are
being fixed.
Testing timeout
---------------
We aim to provide a report within a reasonable timeframe. Tests that haven't
finished running yet are marked with ⏱.
Targeted tests
--------------
Test runs for patches always include a set of base tests, plus some
tests chosen based on the file paths modified by the patch. The latter
are called "targeted tests". If no targeted tests are run, that means
no patch-specific tests are available. Please consider contributing a
targeted test for related patches to increase test coverage. See
https://docs.engineering.redhat.com/x/_wEZB for more details.
From: Joerg Roedel <jroedel(a)suse.de>
For now, kexec is not supported when running as an SEV-ES guest. Doing
so requires additional hypervisor support and special code to hand
over the CPUs to the new kernel in a safe way.
Until this is implemented, do not support kexec in SEV-ES guests.
Cc: stable(a)vger.kernel.org # v5.10+
Signed-off-by: Joerg Roedel <jroedel(a)suse.de>
---
arch/x86/kernel/machine_kexec_64.c | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 131f30fdcfbd..a8e16a411b40 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -591,3 +591,11 @@ void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages)
*/
set_memory_encrypted((unsigned long)vaddr, pages);
}
+
+/*
+ * Kexec is not supported in SEV-ES guests yet
+ */
+bool arch_kexec_supported(void)
+{
+ return !sev_es_active();
+}
--
2.33.0
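The quoted patch adds only the x86 override; the generic consumer of the hook is not shown. For context, a minimal sketch of how such a hook is typically wired up follows. The weak default, the kexec_check_supported() helper, and the error code are assumptions made for illustration, not part of the patch above.
#include <linux/errno.h>
#include <linux/kexec.h>
/* Generic fallback: architectures without restrictions keep kexec enabled. */
bool __weak arch_kexec_supported(void)
{
	return true;
}
/* Hypothetical early check in the kexec load path. */
static int kexec_check_supported(void)
{
	if (!arch_kexec_supported())
		return -EOPNOTSUPP;	/* e.g. an SEV-ES guest cannot hand over its CPUs */
	return 0;
}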
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 76224355db7570cbe6b6f75c8929a1558828dd55 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi(a)redhat.com>
Date: Tue, 17 Aug 2021 21:05:16 +0200
Subject: [PATCH] fuse: truncate pagecache on atomic_o_trunc
fuse_finish_open() will be called with FUSE_NOWRITE in case of atomic
O_TRUNC. This can deadlock with fuse_wait_on_page_writeback() in
fuse_launder_page() triggered by invalidate_inode_pages2().
Fix by replacing invalidate_inode_pages2() in fuse_finish_open() with a
truncate_pagecache() call. This makes sense regardless of FOPEN_KEEP_CACHE
or fc->writeback cache, so do it unconditionally.
Reported-by: Xie Yongji <xieyongji(a)bytedance.com>
Reported-and-tested-by: syzbot+bea44a5189836d956894(a)syzkaller.appspotmail.com
Fixes: e4648309b85a ("fuse: truncate pending writes on O_TRUNC")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Miklos Szeredi <mszeredi(a)redhat.com>
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 97f860cfc195..5e5efb66c7d7 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -198,12 +198,11 @@ void fuse_finish_open(struct inode *inode, struct file *file)
struct fuse_file *ff = file->private_data;
struct fuse_conn *fc = get_fuse_conn(inode);
- if (!(ff->open_flags & FOPEN_KEEP_CACHE))
- invalidate_inode_pages2(inode->i_mapping);
if (ff->open_flags & FOPEN_STREAM)
stream_open(inode, file);
else if (ff->open_flags & FOPEN_NONSEEKABLE)
nonseekable_open(inode, file);
+
if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) {
struct fuse_inode *fi = get_fuse_inode(inode);
@@ -211,10 +210,14 @@ void fuse_finish_open(struct inode *inode, struct file *file)
fi->attr_version = atomic64_inc_return(&fc->attr_version);
i_size_write(inode, 0);
spin_unlock(&fi->lock);
+ truncate_pagecache(inode, 0);
fuse_invalidate_attr(inode);
if (fc->writeback_cache)
file_update_time(file);
+ } else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) {
+ invalidate_inode_pages2(inode->i_mapping);
}
+
if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache)
fuse_link_write_file(file);
}
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 76224355db7570cbe6b6f75c8929a1558828dd55 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi(a)redhat.com>
Date: Tue, 17 Aug 2021 21:05:16 +0200
Subject: [PATCH] fuse: truncate pagecache on atomic_o_trunc
fuse_finish_open() will be called with FUSE_NOWRITE in case of atomic
O_TRUNC. This can deadlock with fuse_wait_on_page_writeback() in
fuse_launder_page() triggered by invalidate_inode_pages2().
Fix by replacing invalidate_inode_pages2() in fuse_finish_open() with a
truncate_pagecache() call. This makes sense regardless of FOPEN_KEEP_CACHE
or fc->writeback cache, so do it unconditionally.
Reported-by: Xie Yongji <xieyongji(a)bytedance.com>
Reported-and-tested-by: syzbot+bea44a5189836d956894(a)syzkaller.appspotmail.com
Fixes: e4648309b85a ("fuse: truncate pending writes on O_TRUNC")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Miklos Szeredi <mszeredi(a)redhat.com>
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 97f860cfc195..5e5efb66c7d7 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -198,12 +198,11 @@ void fuse_finish_open(struct inode *inode, struct file *file)
struct fuse_file *ff = file->private_data;
struct fuse_conn *fc = get_fuse_conn(inode);
- if (!(ff->open_flags & FOPEN_KEEP_CACHE))
- invalidate_inode_pages2(inode->i_mapping);
if (ff->open_flags & FOPEN_STREAM)
stream_open(inode, file);
else if (ff->open_flags & FOPEN_NONSEEKABLE)
nonseekable_open(inode, file);
+
if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) {
struct fuse_inode *fi = get_fuse_inode(inode);
@@ -211,10 +210,14 @@ void fuse_finish_open(struct inode *inode, struct file *file)
fi->attr_version = atomic64_inc_return(&fc->attr_version);
i_size_write(inode, 0);
spin_unlock(&fi->lock);
+ truncate_pagecache(inode, 0);
fuse_invalidate_attr(inode);
if (fc->writeback_cache)
file_update_time(file);
+ } else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) {
+ invalidate_inode_pages2(inode->i_mapping);
}
+
if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache)
fuse_link_write_file(file);
}
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 76224355db7570cbe6b6f75c8929a1558828dd55 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi(a)redhat.com>
Date: Tue, 17 Aug 2021 21:05:16 +0200
Subject: [PATCH] fuse: truncate pagecache on atomic_o_trunc
fuse_finish_open() will be called with FUSE_NOWRITE in case of atomic
O_TRUNC. This can deadlock with fuse_wait_on_page_writeback() in
fuse_launder_page() triggered by invalidate_inode_pages2().
Fix by replacing invalidate_inode_pages2() in fuse_finish_open() with a
truncate_pagecache() call. This makes sense regardless of FOPEN_KEEP_CACHE
or fc->writeback cache, so do it unconditionally.
Reported-by: Xie Yongji <xieyongji(a)bytedance.com>
Reported-and-tested-by: syzbot+bea44a5189836d956894(a)syzkaller.appspotmail.com
Fixes: e4648309b85a ("fuse: truncate pending writes on O_TRUNC")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Miklos Szeredi <mszeredi(a)redhat.com>
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 97f860cfc195..5e5efb66c7d7 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -198,12 +198,11 @@ void fuse_finish_open(struct inode *inode, struct file *file)
struct fuse_file *ff = file->private_data;
struct fuse_conn *fc = get_fuse_conn(inode);
- if (!(ff->open_flags & FOPEN_KEEP_CACHE))
- invalidate_inode_pages2(inode->i_mapping);
if (ff->open_flags & FOPEN_STREAM)
stream_open(inode, file);
else if (ff->open_flags & FOPEN_NONSEEKABLE)
nonseekable_open(inode, file);
+
if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) {
struct fuse_inode *fi = get_fuse_inode(inode);
@@ -211,10 +210,14 @@ void fuse_finish_open(struct inode *inode, struct file *file)
fi->attr_version = atomic64_inc_return(&fc->attr_version);
i_size_write(inode, 0);
spin_unlock(&fi->lock);
+ truncate_pagecache(inode, 0);
fuse_invalidate_attr(inode);
if (fc->writeback_cache)
file_update_time(file);
+ } else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) {
+ invalidate_inode_pages2(inode->i_mapping);
}
+
if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache)
fuse_link_write_file(file);
}
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From ecc53c48c13d995e6fe5559e30ffee48d92784fd Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe(a)kernel.dk>
Date: Sun, 29 Aug 2021 16:13:03 -0600
Subject: [PATCH] io-wq: check max_worker limits if a worker transitions bound
state
For the two places where new workers are created, we diligently check if
we are allowed to create a new worker. If we're currently at the limit
of how many workers of a given type we can have, then we don't create
any new ones.
If you have a mixed workload with various types of bound and unbounded
work, then it can happen that a worker finishes one type of work and
is then transitioned to the other type. For this case, we don't check
if we are actually allowed to do so. This can cause io-wq to temporarily
exceed the allowed number of workers for a given type.
When retrieving work, check that the types match. If they don't, check
if we are allowed to transition to the other type. If not, then don't
handle the new work.
Cc: stable(a)vger.kernel.org
Reported-by: Johannes Lundberg <johalun0(a)gmail.com>
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 4b5fc621ab39..da3ad45028f9 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -424,7 +424,28 @@ static void io_wait_on_hash(struct io_wqe *wqe, unsigned int hash)
spin_unlock(&wq->hash->wait.lock);
}
-static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
+/*
+ * We can always run the work if the worker is currently the same type as
+ * the work (eg both are bound, or both are unbound). If they are not the
+ * same, only allow it if incrementing the worker count would be allowed.
+ */
+static bool io_worker_can_run_work(struct io_worker *worker,
+ struct io_wq_work *work)
+{
+ struct io_wqe_acct *acct;
+
+ if (!(worker->flags & IO_WORKER_F_BOUND) !=
+ !(work->flags & IO_WQ_WORK_UNBOUND))
+ return true;
+
+ /* not the same type, check if we'd go over the limit */
+ acct = io_work_get_acct(worker->wqe, work);
+ return acct->nr_workers < acct->max_workers;
+}
+
+static struct io_wq_work *io_get_next_work(struct io_wqe *wqe,
+ struct io_worker *worker,
+ bool *stalled)
__must_hold(wqe->lock)
{
struct io_wq_work_node *node, *prev;
@@ -436,6 +457,9 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
work = container_of(node, struct io_wq_work, list);
+ if (!io_worker_can_run_work(worker, work))
+ break;
+
/* not hashed, can run anytime */
if (!io_wq_is_hashed(work)) {
wq_list_del(&wqe->work_list, node, prev);
@@ -462,6 +486,7 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
raw_spin_unlock(&wqe->lock);
io_wait_on_hash(wqe, stall_hash);
raw_spin_lock(&wqe->lock);
+ *stalled = true;
}
return NULL;
@@ -501,6 +526,7 @@ static void io_worker_handle_work(struct io_worker *worker)
do {
struct io_wq_work *work;
+ bool stalled;
get_next:
/*
* If we got some work, mark us as busy. If we didn't, but
@@ -509,10 +535,11 @@ static void io_worker_handle_work(struct io_worker *worker)
* can't make progress, any work completion or insertion will
* clear the stalled flag.
*/
- work = io_get_next_work(wqe);
+ stalled = false;
+ work = io_get_next_work(wqe, worker, &stalled);
if (work)
__io_worker_busy(wqe, worker, work);
- else if (!wq_list_empty(&wqe->work_list))
+ else if (stalled)
wqe->flags |= IO_WQE_FLAG_STALLED;
raw_spin_unlock_irq(&wqe->lock);
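The type check in io_worker_can_run_work() above is easy to misread because one side tests the worker's BOUND flag while the other tests the work item's UNBOUND flag. The stand-alone program below is not kernel code and the flag values are made up for the demo; it only reproduces the boolean structure of the quoted check and prints the four combinations, showing that matching types always allow the work while mismatches fall through to the max_workers limit check.
#include <stdbool.h>
#include <stdio.h>

#define IO_WORKER_F_BOUND  (1 << 0)   /* demo: worker currently serves bound work */
#define IO_WQ_WORK_UNBOUND (1 << 0)   /* demo: work item is unbounded */

static bool same_type(unsigned worker_flags, unsigned work_flags)
{
	/* Same construct as the patch: one flag is BOUND, the other UNBOUND. */
	return !(worker_flags & IO_WORKER_F_BOUND) !=
	       !(work_flags & IO_WQ_WORK_UNBOUND);
}

int main(void)
{
	printf("bound worker,   bound work   -> same: %d\n", same_type(IO_WORKER_F_BOUND, 0));
	printf("bound worker,   unbound work -> same: %d\n", same_type(IO_WORKER_F_BOUND, IO_WQ_WORK_UNBOUND));
	printf("unbound worker, bound work   -> same: %d\n", same_type(0, 0));
	printf("unbound worker, unbound work -> same: %d\n", same_type(0, IO_WQ_WORK_UNBOUND));
	return 0;
}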
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 94ffb0a282872c2f4b14f757fa1aef2302aeaabb Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe(a)kernel.dk>
Date: Mon, 30 Aug 2021 11:55:22 -0600
Subject: [PATCH] io-wq: fix race between adding work and activating a free
worker
The attempt to find and activate a free worker for new work is currently
combined with creating a new one if we don't find one, but that opens
io-wq up to a race where the worker that is found and activated can
put itself to sleep without knowing that it has been selected to perform
this new work.
Fix this by moving the activation into where we add the new work item,
then we can retain it within the wqe->lock scope and eliminate the race
with the worker itself checking inside the lock, but sleeping outside of
it.
Cc: stable(a)vger.kernel.org
Reported-by: Andres Freund <andres(a)anarazel.de>
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/fs/io-wq.c b/fs/io-wq.c
index cd9bd095fb1b..94f8f2ecb8e5 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -236,9 +236,9 @@ static bool io_wqe_activate_free_worker(struct io_wqe *wqe)
* We need a worker. If we find a free one, we're good. If not, and we're
* below the max number of workers, create one.
*/
-static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct)
+static void io_wqe_create_worker(struct io_wqe *wqe, struct io_wqe_acct *acct)
{
- bool ret;
+ bool do_create = false, first = false;
/*
* Most likely an attempt to queue unbounded work on an io_wq that
@@ -247,26 +247,18 @@ static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct)
if (unlikely(!acct->max_workers))
pr_warn_once("io-wq is not configured for unbound workers");
- rcu_read_lock();
- ret = io_wqe_activate_free_worker(wqe);
- rcu_read_unlock();
-
- if (!ret) {
- bool do_create = false, first = false;
-
- raw_spin_lock(&wqe->lock);
- if (acct->nr_workers < acct->max_workers) {
- if (!acct->nr_workers)
- first = true;
- acct->nr_workers++;
- do_create = true;
- }
- raw_spin_unlock(&wqe->lock);
- if (do_create) {
- atomic_inc(&acct->nr_running);
- atomic_inc(&wqe->wq->worker_refs);
- create_io_worker(wqe->wq, wqe, acct->index, first);
- }
+ raw_spin_lock(&wqe->lock);
+ if (acct->nr_workers < acct->max_workers) {
+ if (!acct->nr_workers)
+ first = true;
+ acct->nr_workers++;
+ do_create = true;
+ }
+ raw_spin_unlock(&wqe->lock);
+ if (do_create) {
+ atomic_inc(&acct->nr_running);
+ atomic_inc(&wqe->wq->worker_refs);
+ create_io_worker(wqe->wq, wqe, acct->index, first);
}
}
@@ -794,7 +786,8 @@ static void io_wqe_insert_work(struct io_wqe *wqe, struct io_wq_work *work)
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
{
struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
- bool do_wake;
+ unsigned work_flags = work->flags;
+ bool do_create;
/*
* If io-wq is exiting for this task, or if the request has explicitly
@@ -809,12 +802,16 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
raw_spin_lock(&wqe->lock);
io_wqe_insert_work(wqe, work);
wqe->flags &= ~IO_WQE_FLAG_STALLED;
- do_wake = (work->flags & IO_WQ_WORK_CONCURRENT) ||
- !atomic_read(&acct->nr_running);
+
+ rcu_read_lock();
+ do_create = !io_wqe_activate_free_worker(wqe);
+ rcu_read_unlock();
+
raw_spin_unlock(&wqe->lock);
- if (do_wake)
- io_wqe_wake_worker(wqe, acct);
+ if (do_create && ((work_flags & IO_WQ_WORK_CONCURRENT) ||
+ !atomic_read(&acct->nr_running)))
+ io_wqe_create_worker(wqe, acct);
}
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work)
The patch below does not apply to the 5.13-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 94ffb0a282872c2f4b14f757fa1aef2302aeaabb Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe(a)kernel.dk>
Date: Mon, 30 Aug 2021 11:55:22 -0600
Subject: [PATCH] io-wq: fix race between adding work and activating a free
worker
The attempt to find and activate a free worker for new work is currently
combined with creating a new one if we don't find one, but that opens
io-wq up to a race where the worker that is found and activated can
put itself to sleep without knowing that it has been selected to perform
this new work.
Fix this by moving the activation into where we add the new work item,
then we can retain it within the wqe->lock scope and eliminate the race
with the worker itself checking inside the lock, but sleeping outside of
it.
Cc: stable(a)vger.kernel.org
Reported-by: Andres Freund <andres(a)anarazel.de>
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/fs/io-wq.c b/fs/io-wq.c
index cd9bd095fb1b..94f8f2ecb8e5 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -236,9 +236,9 @@ static bool io_wqe_activate_free_worker(struct io_wqe *wqe)
* We need a worker. If we find a free one, we're good. If not, and we're
* below the max number of workers, create one.
*/
-static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct)
+static void io_wqe_create_worker(struct io_wqe *wqe, struct io_wqe_acct *acct)
{
- bool ret;
+ bool do_create = false, first = false;
/*
* Most likely an attempt to queue unbounded work on an io_wq that
@@ -247,26 +247,18 @@ static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct)
if (unlikely(!acct->max_workers))
pr_warn_once("io-wq is not configured for unbound workers");
- rcu_read_lock();
- ret = io_wqe_activate_free_worker(wqe);
- rcu_read_unlock();
-
- if (!ret) {
- bool do_create = false, first = false;
-
- raw_spin_lock(&wqe->lock);
- if (acct->nr_workers < acct->max_workers) {
- if (!acct->nr_workers)
- first = true;
- acct->nr_workers++;
- do_create = true;
- }
- raw_spin_unlock(&wqe->lock);
- if (do_create) {
- atomic_inc(&acct->nr_running);
- atomic_inc(&wqe->wq->worker_refs);
- create_io_worker(wqe->wq, wqe, acct->index, first);
- }
+ raw_spin_lock(&wqe->lock);
+ if (acct->nr_workers < acct->max_workers) {
+ if (!acct->nr_workers)
+ first = true;
+ acct->nr_workers++;
+ do_create = true;
+ }
+ raw_spin_unlock(&wqe->lock);
+ if (do_create) {
+ atomic_inc(&acct->nr_running);
+ atomic_inc(&wqe->wq->worker_refs);
+ create_io_worker(wqe->wq, wqe, acct->index, first);
}
}
@@ -794,7 +786,8 @@ static void io_wqe_insert_work(struct io_wqe *wqe, struct io_wq_work *work)
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
{
struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
- bool do_wake;
+ unsigned work_flags = work->flags;
+ bool do_create;
/*
* If io-wq is exiting for this task, or if the request has explicitly
@@ -809,12 +802,16 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
raw_spin_lock(&wqe->lock);
io_wqe_insert_work(wqe, work);
wqe->flags &= ~IO_WQE_FLAG_STALLED;
- do_wake = (work->flags & IO_WQ_WORK_CONCURRENT) ||
- !atomic_read(&acct->nr_running);
+
+ rcu_read_lock();
+ do_create = !io_wqe_activate_free_worker(wqe);
+ rcu_read_unlock();
+
raw_spin_unlock(&wqe->lock);
- if (do_wake)
- io_wqe_wake_worker(wqe, acct);
+ if (do_create && ((work_flags & IO_WQ_WORK_CONCURRENT) ||
+ !atomic_read(&acct->nr_running)))
+ io_wqe_create_worker(wqe, acct);
}
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work)
The patch below does not apply to the 5.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 94ffb0a282872c2f4b14f757fa1aef2302aeaabb Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe(a)kernel.dk>
Date: Mon, 30 Aug 2021 11:55:22 -0600
Subject: [PATCH] io-wq: fix race between adding work and activating a free
worker
The attempt to find and activate a free worker for new work is currently
combined with creating a new one if we don't find one, but that opens
io-wq up to a race where the worker that is found and activated can
put itself to sleep without knowing that it has been selected to perform
this new work.
Fix this by moving the activation into where we add the new work item,
then we can retain it within the wqe->lock scope and eliminate the race
with the worker itself checking inside the lock, but sleeping outside of
it.
Cc: stable(a)vger.kernel.org
Reported-by: Andres Freund <andres(a)anarazel.de>
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/fs/io-wq.c b/fs/io-wq.c
index cd9bd095fb1b..94f8f2ecb8e5 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -236,9 +236,9 @@ static bool io_wqe_activate_free_worker(struct io_wqe *wqe)
* We need a worker. If we find a free one, we're good. If not, and we're
* below the max number of workers, create one.
*/
-static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct)
+static void io_wqe_create_worker(struct io_wqe *wqe, struct io_wqe_acct *acct)
{
- bool ret;
+ bool do_create = false, first = false;
/*
* Most likely an attempt to queue unbounded work on an io_wq that
@@ -247,26 +247,18 @@ static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct)
if (unlikely(!acct->max_workers))
pr_warn_once("io-wq is not configured for unbound workers");
- rcu_read_lock();
- ret = io_wqe_activate_free_worker(wqe);
- rcu_read_unlock();
-
- if (!ret) {
- bool do_create = false, first = false;
-
- raw_spin_lock(&wqe->lock);
- if (acct->nr_workers < acct->max_workers) {
- if (!acct->nr_workers)
- first = true;
- acct->nr_workers++;
- do_create = true;
- }
- raw_spin_unlock(&wqe->lock);
- if (do_create) {
- atomic_inc(&acct->nr_running);
- atomic_inc(&wqe->wq->worker_refs);
- create_io_worker(wqe->wq, wqe, acct->index, first);
- }
+ raw_spin_lock(&wqe->lock);
+ if (acct->nr_workers < acct->max_workers) {
+ if (!acct->nr_workers)
+ first = true;
+ acct->nr_workers++;
+ do_create = true;
+ }
+ raw_spin_unlock(&wqe->lock);
+ if (do_create) {
+ atomic_inc(&acct->nr_running);
+ atomic_inc(&wqe->wq->worker_refs);
+ create_io_worker(wqe->wq, wqe, acct->index, first);
}
}
@@ -794,7 +786,8 @@ static void io_wqe_insert_work(struct io_wqe *wqe, struct io_wq_work *work)
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
{
struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
- bool do_wake;
+ unsigned work_flags = work->flags;
+ bool do_create;
/*
* If io-wq is exiting for this task, or if the request has explicitly
@@ -809,12 +802,16 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
raw_spin_lock(&wqe->lock);
io_wqe_insert_work(wqe, work);
wqe->flags &= ~IO_WQE_FLAG_STALLED;
- do_wake = (work->flags & IO_WQ_WORK_CONCURRENT) ||
- !atomic_read(&acct->nr_running);
+
+ rcu_read_lock();
+ do_create = !io_wqe_activate_free_worker(wqe);
+ rcu_read_unlock();
+
raw_spin_unlock(&wqe->lock);
- if (do_wake)
- io_wqe_wake_worker(wqe, acct);
+ if (do_create && ((work_flags & IO_WQ_WORK_CONCURRENT) ||
+ !atomic_read(&acct->nr_running)))
+ io_wqe_create_worker(wqe, acct);
}
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work)
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 87df7fb922d18e96992aa5e824aa34b2065fef59 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe(a)kernel.dk>
Date: Mon, 30 Aug 2021 07:45:47 -0600
Subject: [PATCH] io-wq: fix wakeup race when adding new work
When new work is added, io_wqe_enqueue() checks if we need to wake or
create a new worker. But that check is done outside the lock that
otherwise synchronizes us with a worker going to sleep, so we can end
up in the following situation:
CPU0                                CPU1
lock
insert work
unlock
atomic_read(nr_running) != 0
                                    lock
                                    atomic_dec(nr_running)
no wakeup needed
Hold the wqe lock around the "need to wakeup" check. Then we can also get
rid of the temporary work_flags variable, as we know the work will remain
valid as long as we hold the lock.
Cc: stable(a)vger.kernel.org
Reported-by: Andres Freund <andres(a)anarazel.de>
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 13aeb48a0964..cd9bd095fb1b 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -794,7 +794,7 @@ static void io_wqe_insert_work(struct io_wqe *wqe, struct io_wq_work *work)
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
{
struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
- int work_flags;
+ bool do_wake;
/*
* If io-wq is exiting for this task, or if the request has explicitly
@@ -806,14 +806,14 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
return;
}
- work_flags = work->flags;
raw_spin_lock(&wqe->lock);
io_wqe_insert_work(wqe, work);
wqe->flags &= ~IO_WQE_FLAG_STALLED;
+ do_wake = (work->flags & IO_WQ_WORK_CONCURRENT) ||
+ !atomic_read(&acct->nr_running);
raw_spin_unlock(&wqe->lock);
- if ((work_flags & IO_WQ_WORK_CONCURRENT) ||
- !atomic_read(&acct->nr_running))
+ if (do_wake)
io_wqe_wake_worker(wqe, acct);
}
The patch below does not apply to the 5.13-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 87df7fb922d18e96992aa5e824aa34b2065fef59 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe(a)kernel.dk>
Date: Mon, 30 Aug 2021 07:45:47 -0600
Subject: [PATCH] io-wq: fix wakeup race when adding new work
When new work is added, io_wqe_enqueue() checks if we need to wake or
create a new worker. But that check is done outside the lock that
otherwise synchronizes us with a worker going to sleep, so we can end
up in the following situation:
CPU0                                CPU1
lock
insert work
unlock
atomic_read(nr_running) != 0
                                    lock
                                    atomic_dec(nr_running)
no wakeup needed
Hold the wqe lock around the "need to wakeup" check. Then we can also get
rid of the temporary work_flags variable, as we know the work will remain
valid as long as we hold the lock.
Cc: stable(a)vger.kernel.org
Reported-by: Andres Freund <andres(a)anarazel.de>
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 13aeb48a0964..cd9bd095fb1b 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -794,7 +794,7 @@ static void io_wqe_insert_work(struct io_wqe *wqe, struct io_wq_work *work)
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
{
struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
- int work_flags;
+ bool do_wake;
/*
* If io-wq is exiting for this task, or if the request has explicitly
@@ -806,14 +806,14 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
return;
}
- work_flags = work->flags;
raw_spin_lock(&wqe->lock);
io_wqe_insert_work(wqe, work);
wqe->flags &= ~IO_WQE_FLAG_STALLED;
+ do_wake = (work->flags & IO_WQ_WORK_CONCURRENT) ||
+ !atomic_read(&acct->nr_running);
raw_spin_unlock(&wqe->lock);
- if ((work_flags & IO_WQ_WORK_CONCURRENT) ||
- !atomic_read(&acct->nr_running))
+ if (do_wake)
io_wqe_wake_worker(wqe, acct);
}
The patch below does not apply to the 5.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 87df7fb922d18e96992aa5e824aa34b2065fef59 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe(a)kernel.dk>
Date: Mon, 30 Aug 2021 07:45:47 -0600
Subject: [PATCH] io-wq: fix wakeup race when adding new work
When new work is added, io_wqe_enqueue() checks if we need to wake or
create a new worker. But that check is done outside the lock that
otherwise synchronizes us with a worker going to sleep, so we can end
up in the following situation:
CPU0                                CPU1
lock
insert work
unlock
atomic_read(nr_running) != 0
                                    lock
                                    atomic_dec(nr_running)
no wakeup needed
Hold the wqe lock around the "need to wakeup" check. Then we can also get
rid of the temporary work_flags variable, as we know the work will remain
valid as long as we hold the lock.
Cc: stable(a)vger.kernel.org
Reported-by: Andres Freund <andres(a)anarazel.de>
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 13aeb48a0964..cd9bd095fb1b 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -794,7 +794,7 @@ static void io_wqe_insert_work(struct io_wqe *wqe, struct io_wq_work *work)
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
{
struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
- int work_flags;
+ bool do_wake;
/*
* If io-wq is exiting for this task, or if the request has explicitly
@@ -806,14 +806,14 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
return;
}
- work_flags = work->flags;
raw_spin_lock(&wqe->lock);
io_wqe_insert_work(wqe, work);
wqe->flags &= ~IO_WQE_FLAG_STALLED;
+ do_wake = (work->flags & IO_WQ_WORK_CONCURRENT) ||
+ !atomic_read(&acct->nr_running);
raw_spin_unlock(&wqe->lock);
- if ((work_flags & IO_WQ_WORK_CONCURRENT) ||
- !atomic_read(&acct->nr_running))
+ if (do_wake)
io_wqe_wake_worker(wqe, acct);
}
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 3134cc8beb69d0db9de651081707c4651c011621 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz(a)kernel.org>
Date: Thu, 19 Aug 2021 19:03:05 +0100
Subject: [PATCH] KVM: arm64: vgic: Resample HW pending state on deactivation
When a mapped level interrupt (a timer, for example) is deactivated
by the guest, the corresponding host interrupt is equally deactivated.
However, the fate of the pending state still needs to be dealt
with in SW.
This is especially true when the interrupt was in the active+pending
state in the virtual distributor at the point where the guest
was entered. On exit, the pending state is potentially stale
(the guest may have put the interrupt in a non-pending state).
If we don't do anything, the interrupt will be spuriously injected
in the guest. Although this shouldn't have any ill effect (spurious
interrupts are always possible), we can improve the emulation by
detecting the deactivation-while-pending case and resampling the
interrupt.
While we're at it, move the logic into a common helper that can
be shared between the two GIC implementations.
Fixes: e40cc57bac79 ("KVM: arm/arm64: vgic: Support level-triggered mapped interrupts")
Reported-by: Raghavendra Rao Ananta <rananta(a)google.com>
Tested-by: Raghavendra Rao Ananta <rananta(a)google.com>
Reviewed-by: Oliver Upton <oupton(a)google.com>
Signed-off-by: Marc Zyngier <maz(a)kernel.org>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20210819180305.1670525-1-maz@kernel.org
diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c
index 2c580204f1dc..95a18cec14a3 100644
--- a/arch/arm64/kvm/vgic/vgic-v2.c
+++ b/arch/arm64/kvm/vgic/vgic-v2.c
@@ -60,6 +60,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
u32 val = cpuif->vgic_lr[lr];
u32 cpuid, intid = val & GICH_LR_VIRTUALID;
struct vgic_irq *irq;
+ bool deactivated;
/* Extract the source vCPU id from the LR */
cpuid = val & GICH_LR_PHYSID_CPUID;
@@ -75,7 +76,8 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
raw_spin_lock(&irq->irq_lock);
- /* Always preserve the active bit */
+ /* Always preserve the active bit, note deactivation */
+ deactivated = irq->active && !(val & GICH_LR_ACTIVE_BIT);
irq->active = !!(val & GICH_LR_ACTIVE_BIT);
if (irq->active && vgic_irq_is_sgi(intid))
@@ -96,36 +98,8 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
if (irq->config == VGIC_CONFIG_LEVEL && !(val & GICH_LR_STATE))
irq->pending_latch = false;
- /*
- * Level-triggered mapped IRQs are special because we only
- * observe rising edges as input to the VGIC.
- *
- * If the guest never acked the interrupt we have to sample
- * the physical line and set the line level, because the
- * device state could have changed or we simply need to
- * process the still pending interrupt later.
- *
- * If this causes us to lower the level, we have to also clear
- * the physical active state, since we will otherwise never be
- * told when the interrupt becomes asserted again.
- *
- * Another case is when the interrupt requires a helping hand
- * on deactivation (no HW deactivation, for example).
- */
- if (vgic_irq_is_mapped_level(irq)) {
- bool resample = false;
-
- if (val & GICH_LR_PENDING_BIT) {
- irq->line_level = vgic_get_phys_line_level(irq);
- resample = !irq->line_level;
- } else if (vgic_irq_needs_resampling(irq) &&
- !(irq->active || irq->pending_latch)) {
- resample = true;
- }
-
- if (resample)
- vgic_irq_set_phys_active(irq, false);
- }
+ /* Handle resampling for mapped interrupts if required */
+ vgic_irq_handle_resampling(irq, deactivated, val & GICH_LR_PENDING_BIT);
raw_spin_unlock(&irq->irq_lock);
vgic_put_irq(vcpu->kvm, irq);
diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
index 66004f61cd83..21a6207fb2ee 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -46,6 +46,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
u32 intid, cpuid;
struct vgic_irq *irq;
bool is_v2_sgi = false;
+ bool deactivated;
cpuid = val & GICH_LR_PHYSID_CPUID;
cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT;
@@ -68,7 +69,8 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
raw_spin_lock(&irq->irq_lock);
- /* Always preserve the active bit */
+ /* Always preserve the active bit, note deactivation */
+ deactivated = irq->active && !(val & ICH_LR_ACTIVE_BIT);
irq->active = !!(val & ICH_LR_ACTIVE_BIT);
if (irq->active && is_v2_sgi)
@@ -89,36 +91,8 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
if (irq->config == VGIC_CONFIG_LEVEL && !(val & ICH_LR_STATE))
irq->pending_latch = false;
- /*
- * Level-triggered mapped IRQs are special because we only
- * observe rising edges as input to the VGIC.
- *
- * If the guest never acked the interrupt we have to sample
- * the physical line and set the line level, because the
- * device state could have changed or we simply need to
- * process the still pending interrupt later.
- *
- * If this causes us to lower the level, we have to also clear
- * the physical active state, since we will otherwise never be
- * told when the interrupt becomes asserted again.
- *
- * Another case is when the interrupt requires a helping hand
- * on deactivation (no HW deactivation, for example).
- */
- if (vgic_irq_is_mapped_level(irq)) {
- bool resample = false;
-
- if (val & ICH_LR_PENDING_BIT) {
- irq->line_level = vgic_get_phys_line_level(irq);
- resample = !irq->line_level;
- } else if (vgic_irq_needs_resampling(irq) &&
- !(irq->active || irq->pending_latch)) {
- resample = true;
- }
-
- if (resample)
- vgic_irq_set_phys_active(irq, false);
- }
+ /* Handle resampling for mapped interrupts if required */
+ vgic_irq_handle_resampling(irq, deactivated, val & ICH_LR_PENDING_BIT);
raw_spin_unlock(&irq->irq_lock);
vgic_put_irq(vcpu->kvm, irq);
diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c
index 81cec508d413..5dad4996cfb2 100644
--- a/arch/arm64/kvm/vgic/vgic.c
+++ b/arch/arm64/kvm/vgic/vgic.c
@@ -1021,3 +1021,41 @@ bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int vintid)
return map_is_active;
}
+
+/*
+ * Level-triggered mapped IRQs are special because we only observe rising
+ * edges as input to the VGIC.
+ *
+ * If the guest never acked the interrupt we have to sample the physical
+ * line and set the line level, because the device state could have changed
+ * or we simply need to process the still pending interrupt later.
+ *
+ * We could also have entered the guest with the interrupt active+pending.
+ * On the next exit, we need to re-evaluate the pending state, as it could
+ * otherwise result in a spurious interrupt by injecting a now potentially
+ * stale pending state.
+ *
+ * If this causes us to lower the level, we have to also clear the physical
+ * active state, since we will otherwise never be told when the interrupt
+ * becomes asserted again.
+ *
+ * Another case is when the interrupt requires a helping hand on
+ * deactivation (no HW deactivation, for example).
+ */
+void vgic_irq_handle_resampling(struct vgic_irq *irq,
+ bool lr_deactivated, bool lr_pending)
+{
+ if (vgic_irq_is_mapped_level(irq)) {
+ bool resample = false;
+
+ if (unlikely(vgic_irq_needs_resampling(irq))) {
+ resample = !(irq->active || irq->pending_latch);
+ } else if (lr_pending || (lr_deactivated && irq->line_level)) {
+ irq->line_level = vgic_get_phys_line_level(irq);
+ resample = !irq->line_level;
+ }
+
+ if (resample)
+ vgic_irq_set_phys_active(irq, false);
+ }
+}
diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h
index dc1f3d1657ee..14a9218641f5 100644
--- a/arch/arm64/kvm/vgic/vgic.h
+++ b/arch/arm64/kvm/vgic/vgic.h
@@ -169,6 +169,8 @@ void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active);
bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
unsigned long flags);
void vgic_kick_vcpus(struct kvm *kvm);
+void vgic_irq_handle_resampling(struct vgic_irq *irq,
+ bool lr_deactivated, bool lr_pending);
int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr,
phys_addr_t addr, phys_addr_t alignment);
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 3134cc8beb69d0db9de651081707c4651c011621 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz(a)kernel.org>
Date: Thu, 19 Aug 2021 19:03:05 +0100
Subject: [PATCH] KVM: arm64: vgic: Resample HW pending state on deactivation
When a mapped level interrupt (a timer, for example) is deactivated
by the guest, the corresponding host interrupt is equally deactivated.
However, the fate of the pending state still needs to be dealt
with in SW.
This is especially true when the interrupt was in the active+pending
state in the virtual distributor at the point where the guest
was entered. On exit, the pending state is potentially stale
(the guest may have put the interrupt in a non-pending state).
If we don't do anything, the interrupt will be spuriously injected
in the guest. Although this shouldn't have any ill effect (spurious
interrupts are always possible), we can improve the emulation by
detecting the deactivation-while-pending case and resampling the
interrupt.
While we're at it, move the logic into a common helper that can
be shared between the two GIC implementations.
Fixes: e40cc57bac79 ("KVM: arm/arm64: vgic: Support level-triggered mapped interrupts")
Reported-by: Raghavendra Rao Ananta <rananta(a)google.com>
Tested-by: Raghavendra Rao Ananta <rananta(a)google.com>
Reviewed-by: Oliver Upton <oupton(a)google.com>
Signed-off-by: Marc Zyngier <maz(a)kernel.org>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20210819180305.1670525-1-maz@kernel.org
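For anyone preparing a backport, here is a condensed, non-authoritative sketch of
the logic this patch introduces, using the names that appear in the diff below.
The wrapper name sketch_handle_resampling() is illustrative only; the helper the
patch actually adds is vgic_irq_handle_resampling().
    /*
     * Caller side, per list register (LR), in the v2/v3 fold functions:
     *
     *   deactivated = irq->active && !(lr_val & ACTIVE_BIT);
     *   vgic_irq_handle_resampling(irq, deactivated, lr_val & PENDING_BIT);
     *
     * Helper side; only mapped level-triggered IRQs are affected.
     */
    static void sketch_handle_resampling(struct vgic_irq *irq,
                                         bool lr_deactivated, bool lr_pending)
    {
            bool resample = false;
            if (!vgic_irq_is_mapped_level(irq))
                    return;
            if (vgic_irq_needs_resampling(irq)) {
                    /* SW-resampled IRQ: resample once neither active nor pending. */
                    resample = !(irq->active || irq->pending_latch);
            } else if (lr_pending || (lr_deactivated && irq->line_level)) {
                    /*
                     * Still pending in the LR, or just deactivated while the
                     * line was believed high: re-read the physical line.
                     */
                    irq->line_level = vgic_get_phys_line_level(irq);
                    resample = !irq->line_level;
            }
            if (resample)
                    vgic_irq_set_phys_active(irq, false);
    }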
diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c
index 2c580204f1dc..95a18cec14a3 100644
--- a/arch/arm64/kvm/vgic/vgic-v2.c
+++ b/arch/arm64/kvm/vgic/vgic-v2.c
@@ -60,6 +60,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
u32 val = cpuif->vgic_lr[lr];
u32 cpuid, intid = val & GICH_LR_VIRTUALID;
struct vgic_irq *irq;
+ bool deactivated;
/* Extract the source vCPU id from the LR */
cpuid = val & GICH_LR_PHYSID_CPUID;
@@ -75,7 +76,8 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
raw_spin_lock(&irq->irq_lock);
- /* Always preserve the active bit */
+ /* Always preserve the active bit, note deactivation */
+ deactivated = irq->active && !(val & GICH_LR_ACTIVE_BIT);
irq->active = !!(val & GICH_LR_ACTIVE_BIT);
if (irq->active && vgic_irq_is_sgi(intid))
@@ -96,36 +98,8 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
if (irq->config == VGIC_CONFIG_LEVEL && !(val & GICH_LR_STATE))
irq->pending_latch = false;
- /*
- * Level-triggered mapped IRQs are special because we only
- * observe rising edges as input to the VGIC.
- *
- * If the guest never acked the interrupt we have to sample
- * the physical line and set the line level, because the
- * device state could have changed or we simply need to
- * process the still pending interrupt later.
- *
- * If this causes us to lower the level, we have to also clear
- * the physical active state, since we will otherwise never be
- * told when the interrupt becomes asserted again.
- *
- * Another case is when the interrupt requires a helping hand
- * on deactivation (no HW deactivation, for example).
- */
- if (vgic_irq_is_mapped_level(irq)) {
- bool resample = false;
-
- if (val & GICH_LR_PENDING_BIT) {
- irq->line_level = vgic_get_phys_line_level(irq);
- resample = !irq->line_level;
- } else if (vgic_irq_needs_resampling(irq) &&
- !(irq->active || irq->pending_latch)) {
- resample = true;
- }
-
- if (resample)
- vgic_irq_set_phys_active(irq, false);
- }
+ /* Handle resampling for mapped interrupts if required */
+ vgic_irq_handle_resampling(irq, deactivated, val & GICH_LR_PENDING_BIT);
raw_spin_unlock(&irq->irq_lock);
vgic_put_irq(vcpu->kvm, irq);
diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
index 66004f61cd83..21a6207fb2ee 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -46,6 +46,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
u32 intid, cpuid;
struct vgic_irq *irq;
bool is_v2_sgi = false;
+ bool deactivated;
cpuid = val & GICH_LR_PHYSID_CPUID;
cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT;
@@ -68,7 +69,8 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
raw_spin_lock(&irq->irq_lock);
- /* Always preserve the active bit */
+ /* Always preserve the active bit, note deactivation */
+ deactivated = irq->active && !(val & ICH_LR_ACTIVE_BIT);
irq->active = !!(val & ICH_LR_ACTIVE_BIT);
if (irq->active && is_v2_sgi)
@@ -89,36 +91,8 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
if (irq->config == VGIC_CONFIG_LEVEL && !(val & ICH_LR_STATE))
irq->pending_latch = false;
- /*
- * Level-triggered mapped IRQs are special because we only
- * observe rising edges as input to the VGIC.
- *
- * If the guest never acked the interrupt we have to sample
- * the physical line and set the line level, because the
- * device state could have changed or we simply need to
- * process the still pending interrupt later.
- *
- * If this causes us to lower the level, we have to also clear
- * the physical active state, since we will otherwise never be
- * told when the interrupt becomes asserted again.
- *
- * Another case is when the interrupt requires a helping hand
- * on deactivation (no HW deactivation, for example).
- */
- if (vgic_irq_is_mapped_level(irq)) {
- bool resample = false;
-
- if (val & ICH_LR_PENDING_BIT) {
- irq->line_level = vgic_get_phys_line_level(irq);
- resample = !irq->line_level;
- } else if (vgic_irq_needs_resampling(irq) &&
- !(irq->active || irq->pending_latch)) {
- resample = true;
- }
-
- if (resample)
- vgic_irq_set_phys_active(irq, false);
- }
+ /* Handle resampling for mapped interrupts if required */
+ vgic_irq_handle_resampling(irq, deactivated, val & ICH_LR_PENDING_BIT);
raw_spin_unlock(&irq->irq_lock);
vgic_put_irq(vcpu->kvm, irq);
diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c
index 81cec508d413..5dad4996cfb2 100644
--- a/arch/arm64/kvm/vgic/vgic.c
+++ b/arch/arm64/kvm/vgic/vgic.c
@@ -1021,3 +1021,41 @@ bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int vintid)
return map_is_active;
}
+
+/*
+ * Level-triggered mapped IRQs are special because we only observe rising
+ * edges as input to the VGIC.
+ *
+ * If the guest never acked the interrupt we have to sample the physical
+ * line and set the line level, because the device state could have changed
+ * or we simply need to process the still pending interrupt later.
+ *
+ * We could also have entered the guest with the interrupt active+pending.
+ * On the next exit, we need to re-evaluate the pending state, as it could
+ * otherwise result in a spurious interrupt by injecting a now potentially
+ * stale pending state.
+ *
+ * If this causes us to lower the level, we have to also clear the physical
+ * active state, since we will otherwise never be told when the interrupt
+ * becomes asserted again.
+ *
+ * Another case is when the interrupt requires a helping hand on
+ * deactivation (no HW deactivation, for example).
+ */
+void vgic_irq_handle_resampling(struct vgic_irq *irq,
+ bool lr_deactivated, bool lr_pending)
+{
+ if (vgic_irq_is_mapped_level(irq)) {
+ bool resample = false;
+
+ if (unlikely(vgic_irq_needs_resampling(irq))) {
+ resample = !(irq->active || irq->pending_latch);
+ } else if (lr_pending || (lr_deactivated && irq->line_level)) {
+ irq->line_level = vgic_get_phys_line_level(irq);
+ resample = !irq->line_level;
+ }
+
+ if (resample)
+ vgic_irq_set_phys_active(irq, false);
+ }
+}
diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h
index dc1f3d1657ee..14a9218641f5 100644
--- a/arch/arm64/kvm/vgic/vgic.h
+++ b/arch/arm64/kvm/vgic/vgic.h
@@ -169,6 +169,8 @@ void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active);
bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
unsigned long flags);
void vgic_kick_vcpus(struct kvm *kvm);
+void vgic_irq_handle_resampling(struct vgic_irq *irq,
+ bool lr_deactivated, bool lr_pending);
int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr,
phys_addr_t addr, phys_addr_t alignment);
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 3134cc8beb69d0db9de651081707c4651c011621 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz(a)kernel.org>
Date: Thu, 19 Aug 2021 19:03:05 +0100
Subject: [PATCH] KVM: arm64: vgic: Resample HW pending state on deactivation
When a mapped level interrupt (a timer, for example) is deactivated
by the guest, the corresponding host interrupt is equally deactivated.
However, the fate of the pending state still needs to be dealt
with in SW.
This is especially true when the interrupt was in the active+pending
state in the virtual distributor at the point where the guest
was entered. On exit, the pending state is potentially stale
(the guest may have put the interrupt in a non-pending state).
If we don't do anything, the interrupt will be spuriously injected
in the guest. Although this shouldn't have any ill effect (spurious
interrupts are always possible), we can improve the emulation by
detecting the deactivation-while-pending case and resampling the
interrupt.
While we're at it, move the logic into a common helper that can
be shared between the two GIC implementations.
Fixes: e40cc57bac79 ("KVM: arm/arm64: vgic: Support level-triggered mapped interrupts")
Reported-by: Raghavendra Rao Ananta <rananta(a)google.com>
Tested-by: Raghavendra Rao Ananta <rananta(a)google.com>
Reviewed-by: Oliver Upton <oupton(a)google.com>
Signed-off-by: Marc Zyngier <maz(a)kernel.org>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20210819180305.1670525-1-maz@kernel.org
diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c
index 2c580204f1dc..95a18cec14a3 100644
--- a/arch/arm64/kvm/vgic/vgic-v2.c
+++ b/arch/arm64/kvm/vgic/vgic-v2.c
@@ -60,6 +60,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
u32 val = cpuif->vgic_lr[lr];
u32 cpuid, intid = val & GICH_LR_VIRTUALID;
struct vgic_irq *irq;
+ bool deactivated;
/* Extract the source vCPU id from the LR */
cpuid = val & GICH_LR_PHYSID_CPUID;
@@ -75,7 +76,8 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
raw_spin_lock(&irq->irq_lock);
- /* Always preserve the active bit */
+ /* Always preserve the active bit, note deactivation */
+ deactivated = irq->active && !(val & GICH_LR_ACTIVE_BIT);
irq->active = !!(val & GICH_LR_ACTIVE_BIT);
if (irq->active && vgic_irq_is_sgi(intid))
@@ -96,36 +98,8 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
if (irq->config == VGIC_CONFIG_LEVEL && !(val & GICH_LR_STATE))
irq->pending_latch = false;
- /*
- * Level-triggered mapped IRQs are special because we only
- * observe rising edges as input to the VGIC.
- *
- * If the guest never acked the interrupt we have to sample
- * the physical line and set the line level, because the
- * device state could have changed or we simply need to
- * process the still pending interrupt later.
- *
- * If this causes us to lower the level, we have to also clear
- * the physical active state, since we will otherwise never be
- * told when the interrupt becomes asserted again.
- *
- * Another case is when the interrupt requires a helping hand
- * on deactivation (no HW deactivation, for example).
- */
- if (vgic_irq_is_mapped_level(irq)) {
- bool resample = false;
-
- if (val & GICH_LR_PENDING_BIT) {
- irq->line_level = vgic_get_phys_line_level(irq);
- resample = !irq->line_level;
- } else if (vgic_irq_needs_resampling(irq) &&
- !(irq->active || irq->pending_latch)) {
- resample = true;
- }
-
- if (resample)
- vgic_irq_set_phys_active(irq, false);
- }
+ /* Handle resampling for mapped interrupts if required */
+ vgic_irq_handle_resampling(irq, deactivated, val & GICH_LR_PENDING_BIT);
raw_spin_unlock(&irq->irq_lock);
vgic_put_irq(vcpu->kvm, irq);
diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
index 66004f61cd83..21a6207fb2ee 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -46,6 +46,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
u32 intid, cpuid;
struct vgic_irq *irq;
bool is_v2_sgi = false;
+ bool deactivated;
cpuid = val & GICH_LR_PHYSID_CPUID;
cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT;
@@ -68,7 +69,8 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
raw_spin_lock(&irq->irq_lock);
- /* Always preserve the active bit */
+ /* Always preserve the active bit, note deactivation */
+ deactivated = irq->active && !(val & ICH_LR_ACTIVE_BIT);
irq->active = !!(val & ICH_LR_ACTIVE_BIT);
if (irq->active && is_v2_sgi)
@@ -89,36 +91,8 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
if (irq->config == VGIC_CONFIG_LEVEL && !(val & ICH_LR_STATE))
irq->pending_latch = false;
- /*
- * Level-triggered mapped IRQs are special because we only
- * observe rising edges as input to the VGIC.
- *
- * If the guest never acked the interrupt we have to sample
- * the physical line and set the line level, because the
- * device state could have changed or we simply need to
- * process the still pending interrupt later.
- *
- * If this causes us to lower the level, we have to also clear
- * the physical active state, since we will otherwise never be
- * told when the interrupt becomes asserted again.
- *
- * Another case is when the interrupt requires a helping hand
- * on deactivation (no HW deactivation, for example).
- */
- if (vgic_irq_is_mapped_level(irq)) {
- bool resample = false;
-
- if (val & ICH_LR_PENDING_BIT) {
- irq->line_level = vgic_get_phys_line_level(irq);
- resample = !irq->line_level;
- } else if (vgic_irq_needs_resampling(irq) &&
- !(irq->active || irq->pending_latch)) {
- resample = true;
- }
-
- if (resample)
- vgic_irq_set_phys_active(irq, false);
- }
+ /* Handle resampling for mapped interrupts if required */
+ vgic_irq_handle_resampling(irq, deactivated, val & ICH_LR_PENDING_BIT);
raw_spin_unlock(&irq->irq_lock);
vgic_put_irq(vcpu->kvm, irq);
diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c
index 81cec508d413..5dad4996cfb2 100644
--- a/arch/arm64/kvm/vgic/vgic.c
+++ b/arch/arm64/kvm/vgic/vgic.c
@@ -1021,3 +1021,41 @@ bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int vintid)
return map_is_active;
}
+
+/*
+ * Level-triggered mapped IRQs are special because we only observe rising
+ * edges as input to the VGIC.
+ *
+ * If the guest never acked the interrupt we have to sample the physical
+ * line and set the line level, because the device state could have changed
+ * or we simply need to process the still pending interrupt later.
+ *
+ * We could also have entered the guest with the interrupt active+pending.
+ * On the next exit, we need to re-evaluate the pending state, as it could
+ * otherwise result in a spurious interrupt by injecting a now potentially
+ * stale pending state.
+ *
+ * If this causes us to lower the level, we have to also clear the physical
+ * active state, since we will otherwise never be told when the interrupt
+ * becomes asserted again.
+ *
+ * Another case is when the interrupt requires a helping hand on
+ * deactivation (no HW deactivation, for example).
+ */
+void vgic_irq_handle_resampling(struct vgic_irq *irq,
+ bool lr_deactivated, bool lr_pending)
+{
+ if (vgic_irq_is_mapped_level(irq)) {
+ bool resample = false;
+
+ if (unlikely(vgic_irq_needs_resampling(irq))) {
+ resample = !(irq->active || irq->pending_latch);
+ } else if (lr_pending || (lr_deactivated && irq->line_level)) {
+ irq->line_level = vgic_get_phys_line_level(irq);
+ resample = !irq->line_level;
+ }
+
+ if (resample)
+ vgic_irq_set_phys_active(irq, false);
+ }
+}
diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h
index dc1f3d1657ee..14a9218641f5 100644
--- a/arch/arm64/kvm/vgic/vgic.h
+++ b/arch/arm64/kvm/vgic/vgic.h
@@ -169,6 +169,8 @@ void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active);
bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
unsigned long flags);
void vgic_kick_vcpus(struct kvm *kvm);
+void vgic_irq_handle_resampling(struct vgic_irq *irq,
+ bool lr_deactivated, bool lr_pending);
int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr,
phys_addr_t addr, phys_addr_t alignment);
The patch below does not apply to the 5.13-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 3134cc8beb69d0db9de651081707c4651c011621 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz(a)kernel.org>
Date: Thu, 19 Aug 2021 19:03:05 +0100
Subject: [PATCH] KVM: arm64: vgic: Resample HW pending state on deactivation
When a mapped level interrupt (a timer, for example) is deactivated
by the guest, the corresponding host interrupt is equally deactivated.
However, the fate of the pending state still needs to be dealt
with in SW.
This is especially true when the interrupt was in the active+pending
state in the virtual distributor at the point where the guest
was entered. On exit, the pending state is potentially stale
(the guest may have put the interrupt in a non-pending state).
If we don't do anything, the interrupt will be spuriously injected
in the guest. Although this shouldn't have any ill effect (spurious
interrupts are always possible), we can improve the emulation by
detecting the deactivation-while-pending case and resampling the
interrupt.
While we're at it, move the logic into a common helper that can
be shared between the two GIC implementations.
Fixes: e40cc57bac79 ("KVM: arm/arm64: vgic: Support level-triggered mapped interrupts")
Reported-by: Raghavendra Rao Ananta <rananta(a)google.com>
Tested-by: Raghavendra Rao Ananta <rananta(a)google.com>
Reviewed-by: Oliver Upton <oupton(a)google.com>
Signed-off-by: Marc Zyngier <maz(a)kernel.org>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20210819180305.1670525-1-maz@kernel.org
diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c
index 2c580204f1dc..95a18cec14a3 100644
--- a/arch/arm64/kvm/vgic/vgic-v2.c
+++ b/arch/arm64/kvm/vgic/vgic-v2.c
@@ -60,6 +60,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
u32 val = cpuif->vgic_lr[lr];
u32 cpuid, intid = val & GICH_LR_VIRTUALID;
struct vgic_irq *irq;
+ bool deactivated;
/* Extract the source vCPU id from the LR */
cpuid = val & GICH_LR_PHYSID_CPUID;
@@ -75,7 +76,8 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
raw_spin_lock(&irq->irq_lock);
- /* Always preserve the active bit */
+ /* Always preserve the active bit, note deactivation */
+ deactivated = irq->active && !(val & GICH_LR_ACTIVE_BIT);
irq->active = !!(val & GICH_LR_ACTIVE_BIT);
if (irq->active && vgic_irq_is_sgi(intid))
@@ -96,36 +98,8 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
if (irq->config == VGIC_CONFIG_LEVEL && !(val & GICH_LR_STATE))
irq->pending_latch = false;
- /*
- * Level-triggered mapped IRQs are special because we only
- * observe rising edges as input to the VGIC.
- *
- * If the guest never acked the interrupt we have to sample
- * the physical line and set the line level, because the
- * device state could have changed or we simply need to
- * process the still pending interrupt later.
- *
- * If this causes us to lower the level, we have to also clear
- * the physical active state, since we will otherwise never be
- * told when the interrupt becomes asserted again.
- *
- * Another case is when the interrupt requires a helping hand
- * on deactivation (no HW deactivation, for example).
- */
- if (vgic_irq_is_mapped_level(irq)) {
- bool resample = false;
-
- if (val & GICH_LR_PENDING_BIT) {
- irq->line_level = vgic_get_phys_line_level(irq);
- resample = !irq->line_level;
- } else if (vgic_irq_needs_resampling(irq) &&
- !(irq->active || irq->pending_latch)) {
- resample = true;
- }
-
- if (resample)
- vgic_irq_set_phys_active(irq, false);
- }
+ /* Handle resampling for mapped interrupts if required */
+ vgic_irq_handle_resampling(irq, deactivated, val & GICH_LR_PENDING_BIT);
raw_spin_unlock(&irq->irq_lock);
vgic_put_irq(vcpu->kvm, irq);
diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
index 66004f61cd83..21a6207fb2ee 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -46,6 +46,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
u32 intid, cpuid;
struct vgic_irq *irq;
bool is_v2_sgi = false;
+ bool deactivated;
cpuid = val & GICH_LR_PHYSID_CPUID;
cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT;
@@ -68,7 +69,8 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
raw_spin_lock(&irq->irq_lock);
- /* Always preserve the active bit */
+ /* Always preserve the active bit, note deactivation */
+ deactivated = irq->active && !(val & ICH_LR_ACTIVE_BIT);
irq->active = !!(val & ICH_LR_ACTIVE_BIT);
if (irq->active && is_v2_sgi)
@@ -89,36 +91,8 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
if (irq->config == VGIC_CONFIG_LEVEL && !(val & ICH_LR_STATE))
irq->pending_latch = false;
- /*
- * Level-triggered mapped IRQs are special because we only
- * observe rising edges as input to the VGIC.
- *
- * If the guest never acked the interrupt we have to sample
- * the physical line and set the line level, because the
- * device state could have changed or we simply need to
- * process the still pending interrupt later.
- *
- * If this causes us to lower the level, we have to also clear
- * the physical active state, since we will otherwise never be
- * told when the interrupt becomes asserted again.
- *
- * Another case is when the interrupt requires a helping hand
- * on deactivation (no HW deactivation, for example).
- */
- if (vgic_irq_is_mapped_level(irq)) {
- bool resample = false;
-
- if (val & ICH_LR_PENDING_BIT) {
- irq->line_level = vgic_get_phys_line_level(irq);
- resample = !irq->line_level;
- } else if (vgic_irq_needs_resampling(irq) &&
- !(irq->active || irq->pending_latch)) {
- resample = true;
- }
-
- if (resample)
- vgic_irq_set_phys_active(irq, false);
- }
+ /* Handle resampling for mapped interrupts if required */
+ vgic_irq_handle_resampling(irq, deactivated, val & ICH_LR_PENDING_BIT);
raw_spin_unlock(&irq->irq_lock);
vgic_put_irq(vcpu->kvm, irq);
diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c
index 81cec508d413..5dad4996cfb2 100644
--- a/arch/arm64/kvm/vgic/vgic.c
+++ b/arch/arm64/kvm/vgic/vgic.c
@@ -1021,3 +1021,41 @@ bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int vintid)
return map_is_active;
}
+
+/*
+ * Level-triggered mapped IRQs are special because we only observe rising
+ * edges as input to the VGIC.
+ *
+ * If the guest never acked the interrupt we have to sample the physical
+ * line and set the line level, because the device state could have changed
+ * or we simply need to process the still pending interrupt later.
+ *
+ * We could also have entered the guest with the interrupt active+pending.
+ * On the next exit, we need to re-evaluate the pending state, as it could
+ * otherwise result in a spurious interrupt by injecting a now potentially
+ * stale pending state.
+ *
+ * If this causes us to lower the level, we have to also clear the physical
+ * active state, since we will otherwise never be told when the interrupt
+ * becomes asserted again.
+ *
+ * Another case is when the interrupt requires a helping hand on
+ * deactivation (no HW deactivation, for example).
+ */
+void vgic_irq_handle_resampling(struct vgic_irq *irq,
+ bool lr_deactivated, bool lr_pending)
+{
+ if (vgic_irq_is_mapped_level(irq)) {
+ bool resample = false;
+
+ if (unlikely(vgic_irq_needs_resampling(irq))) {
+ resample = !(irq->active || irq->pending_latch);
+ } else if (lr_pending || (lr_deactivated && irq->line_level)) {
+ irq->line_level = vgic_get_phys_line_level(irq);
+ resample = !irq->line_level;
+ }
+
+ if (resample)
+ vgic_irq_set_phys_active(irq, false);
+ }
+}
diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h
index dc1f3d1657ee..14a9218641f5 100644
--- a/arch/arm64/kvm/vgic/vgic.h
+++ b/arch/arm64/kvm/vgic/vgic.h
@@ -169,6 +169,8 @@ void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active);
bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
unsigned long flags);
void vgic_kick_vcpus(struct kvm *kvm);
+void vgic_irq_handle_resampling(struct vgic_irq *irq,
+ bool lr_deactivated, bool lr_pending);
int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr,
phys_addr_t addr, phys_addr_t alignment);
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From f7782bb8d818d8f47c26b22079db10599922787a Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc(a)google.com>
Date: Tue, 10 Aug 2021 07:45:26 -0700
Subject: [PATCH] KVM: nVMX: Unconditionally clear nested.pi_pending on nested
VM-Enter
Clear nested.pi_pending on nested VM-Enter even if L2 will run without
posted interrupts enabled. If nested.pi_pending is left set from a
previous L2, vmx_complete_nested_posted_interrupt() will pick up the
stale flag and exit to userspace with an "internal emulation error" due to
the new L2 not having a valid nested.pi_desc.
Arguably, vmx_complete_nested_posted_interrupt() should first check for
posted interrupts being enabled, but it's also completely reasonable that
KVM wouldn't screw up a fundamental flag. Not to mention that the mere
existence of nested.pi_pending is a long-standing bug as KVM shouldn't
move the posted interrupt out of the IRR until it's actually processed,
e.g. KVM effectively drops an interrupt when it performs a nested VM-Exit
with a "pending" posted interrupt. Fixing the mess is a future problem.
Prior to vmx_complete_nested_posted_interrupt() interpreting a null PI
descriptor as an error, this was a benign bug as the null PI descriptor
effectively served as a check on PI not being enabled. Even then, the
new flow did not become problematic until KVM started checking the result
of kvm_check_nested_events().
Fixes: 705699a13994 ("KVM: nVMX: Enable nested posted interrupt processing")
Fixes: 966eefb89657 ("KVM: nVMX: Disable vmcs02 posted interrupts if vmcs12 PID isn't mappable")
Fixes: 47d3530f86c0 ("KVM: x86: Exit to userspace when kvm_check_nested_events fails")
Cc: stable(a)vger.kernel.org
Cc: Jim Mattson <jmattson(a)google.com>
Signed-off-by: Sean Christopherson <seanjc(a)google.com>
Message-Id: <20210810144526.2662272-1-seanjc(a)google.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
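The essential change, condensed as a sketch for backporters (the authoritative
hunk is in the diff below): the pi_pending reset is hoisted out of the
conditional so it happens on every nested VM-Enter.
    /* In prepare_vmcs02_early(), simplified: */
    vmx->nested.pi_pending = false;          /* now cleared unconditionally */
    if (nested_cpu_has_posted_intr(vmcs12))
            vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
    else
            exec_control &= ~PIN_BASED_POSTED_INTR;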
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 264a9f4c9179..bc6327950657 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2187,12 +2187,11 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0
~PIN_BASED_VMX_PREEMPTION_TIMER);
/* Posted interrupts setting is only taken from vmcs12. */
- if (nested_cpu_has_posted_intr(vmcs12)) {
+ vmx->nested.pi_pending = false;
+ if (nested_cpu_has_posted_intr(vmcs12))
vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
- vmx->nested.pi_pending = false;
- } else {
+ else
exec_control &= ~PIN_BASED_POSTED_INTR;
- }
pin_controls_set(vmx, exec_control);
/*
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From f7782bb8d818d8f47c26b22079db10599922787a Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc(a)google.com>
Date: Tue, 10 Aug 2021 07:45:26 -0700
Subject: [PATCH] KVM: nVMX: Unconditionally clear nested.pi_pending on nested
VM-Enter
Clear nested.pi_pending on nested VM-Enter even if L2 will run without
posted interrupts enabled. If nested.pi_pending is left set from a
previous L2, vmx_complete_nested_posted_interrupt() will pick up the
stale flag and exit to userspace with an "internal emulation error" due to
the new L2 not having a valid nested.pi_desc.
Arguably, vmx_complete_nested_posted_interrupt() should first check for
posted interrupts being enabled, but it's also completely reasonable that
KVM wouldn't screw up a fundamental flag. Not to mention that the mere
existence of nested.pi_pending is a long-standing bug as KVM shouldn't
move the posted interrupt out of the IRR until it's actually processed,
e.g. KVM effectively drops an interrupt when it performs a nested VM-Exit
with a "pending" posted interrupt. Fixing the mess is a future problem.
Prior to vmx_complete_nested_posted_interrupt() interpreting a null PI
descriptor as an error, this was a benign bug as the null PI descriptor
effectively served as a check on PI not being enabled. Even then, the
new flow did not become problematic until KVM started checking the result
of kvm_check_nested_events().
Fixes: 705699a13994 ("KVM: nVMX: Enable nested posted interrupt processing")
Fixes: 966eefb89657 ("KVM: nVMX: Disable vmcs02 posted interrupts if vmcs12 PID isn't mappable")
Fixes: 47d3530f86c0 ("KVM: x86: Exit to userspace when kvm_check_nested_events fails")
Cc: stable(a)vger.kernel.org
Cc: Jim Mattson <jmattson(a)google.com>
Signed-off-by: Sean Christopherson <seanjc(a)google.com>
Message-Id: <20210810144526.2662272-1-seanjc(a)google.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 264a9f4c9179..bc6327950657 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2187,12 +2187,11 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0
~PIN_BASED_VMX_PREEMPTION_TIMER);
/* Posted interrupts setting is only taken from vmcs12. */
- if (nested_cpu_has_posted_intr(vmcs12)) {
+ vmx->nested.pi_pending = false;
+ if (nested_cpu_has_posted_intr(vmcs12))
vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
- vmx->nested.pi_pending = false;
- } else {
+ else
exec_control &= ~PIN_BASED_POSTED_INTR;
- }
pin_controls_set(vmx, exec_control);
/*
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From f7782bb8d818d8f47c26b22079db10599922787a Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc(a)google.com>
Date: Tue, 10 Aug 2021 07:45:26 -0700
Subject: [PATCH] KVM: nVMX: Unconditionally clear nested.pi_pending on nested
VM-Enter
Clear nested.pi_pending on nested VM-Enter even if L2 will run without
posted interrupts enabled. If nested.pi_pending is left set from a
previous L2, vmx_complete_nested_posted_interrupt() will pick up the
stale flag and exit to userspace with an "internal emulation error" due to
the new L2 not having a valid nested.pi_desc.
Arguably, vmx_complete_nested_posted_interrupt() should first check for
posted interrupts being enabled, but it's also completely reasonable that
KVM wouldn't screw up a fundamental flag. Not to mention that the mere
existence of nested.pi_pending is a long-standing bug as KVM shouldn't
move the posted interrupt out of the IRR until it's actually processed,
e.g. KVM effectively drops an interrupt when it performs a nested VM-Exit
with a "pending" posted interrupt. Fixing the mess is a future problem.
Prior to vmx_complete_nested_posted_interrupt() interpreting a null PI
descriptor as an error, this was a benign bug as the null PI descriptor
effectively served as a check on PI not being enabled. Even then, the
new flow did not become problematic until KVM started checking the result
of kvm_check_nested_events().
Fixes: 705699a13994 ("KVM: nVMX: Enable nested posted interrupt processing")
Fixes: 966eefb89657 ("KVM: nVMX: Disable vmcs02 posted interrupts if vmcs12 PID isn't mappable")
Fixes: 47d3530f86c0 ("KVM: x86: Exit to userspace when kvm_check_nested_events fails")
Cc: stable(a)vger.kernel.org
Cc: Jim Mattson <jmattson(a)google.com>
Signed-off-by: Sean Christopherson <seanjc(a)google.com>
Message-Id: <20210810144526.2662272-1-seanjc(a)google.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 264a9f4c9179..bc6327950657 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2187,12 +2187,11 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0
~PIN_BASED_VMX_PREEMPTION_TIMER);
/* Posted interrupts setting is only taken from vmcs12. */
- if (nested_cpu_has_posted_intr(vmcs12)) {
+ vmx->nested.pi_pending = false;
+ if (nested_cpu_has_posted_intr(vmcs12))
vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
- vmx->nested.pi_pending = false;
- } else {
+ else
exec_control &= ~PIN_BASED_POSTED_INTR;
- }
pin_controls_set(vmx, exec_control);
/*
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From f7782bb8d818d8f47c26b22079db10599922787a Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc(a)google.com>
Date: Tue, 10 Aug 2021 07:45:26 -0700
Subject: [PATCH] KVM: nVMX: Unconditionally clear nested.pi_pending on nested
VM-Enter
Clear nested.pi_pending on nested VM-Enter even if L2 will run without
posted interrupts enabled. If nested.pi_pending is left set from a
previous L2, vmx_complete_nested_posted_interrupt() will pick up the
stale flag and exit to userspace with an "internal emulation error" due to
the new L2 not having a valid nested.pi_desc.
Arguably, vmx_complete_nested_posted_interrupt() should first check for
posted interrupts being enabled, but it's also completely reasonable that
KVM wouldn't screw up a fundamental flag. Not to mention that the mere
existence of nested.pi_pending is a long-standing bug as KVM shouldn't
move the posted interrupt out of the IRR until it's actually processed,
e.g. KVM effectively drops an interrupt when it performs a nested VM-Exit
with a "pending" posted interrupt. Fixing the mess is a future problem.
Prior to vmx_complete_nested_posted_interrupt() interpreting a null PI
descriptor as an error, this was a benign bug as the null PI descriptor
effectively served as a check on PI not being enabled. Even then, the
new flow did not become problematic until KVM started checking the result
of kvm_check_nested_events().
Fixes: 705699a13994 ("KVM: nVMX: Enable nested posted interrupt processing")
Fixes: 966eefb89657 ("KVM: nVMX: Disable vmcs02 posted interrupts if vmcs12 PID isn't mappable")
Fixes: 47d3530f86c0 ("KVM: x86: Exit to userspace when kvm_check_nested_events fails")
Cc: stable(a)vger.kernel.org
Cc: Jim Mattson <jmattson(a)google.com>
Signed-off-by: Sean Christopherson <seanjc(a)google.com>
Message-Id: <20210810144526.2662272-1-seanjc(a)google.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 264a9f4c9179..bc6327950657 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2187,12 +2187,11 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0
~PIN_BASED_VMX_PREEMPTION_TIMER);
/* Posted interrupts setting is only taken from vmcs12. */
- if (nested_cpu_has_posted_intr(vmcs12)) {
+ vmx->nested.pi_pending = false;
+ if (nested_cpu_has_posted_intr(vmcs12))
vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
- vmx->nested.pi_pending = false;
- } else {
+ else
exec_control &= ~PIN_BASED_POSTED_INTR;
- }
pin_controls_set(vmx, exec_control);
/*
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 81b4b56d4f8130bbb99cf4e2b48082e5b4cfccb9 Mon Sep 17 00:00:00 2001
From: Maxim Levitsky <mlevitsk(a)redhat.com>
Date: Thu, 26 Aug 2021 12:57:49 +0300
Subject: [PATCH] KVM: VMX: avoid running vmx_handle_exit_irqoff in case of
emulation
If we are emulating an invalid guest state, we don't have a correct
exit reason, and thus we shouldn't do anything in this function.
Signed-off-by: Maxim Levitsky <mlevitsk(a)redhat.com>
Message-Id: <20210826095750.1650467-2-mlevitsk(a)redhat.com>
Cc: stable(a)vger.kernel.org
Fixes: 95b5a48c4f2b ("KVM: VMX: Handle NMIs, #MCs and async #PFs in common irqs-disabled fn", 2019-06-18)
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index fada1055f325..0c2c0d5ae873 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6382,6 +6382,9 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ if (vmx->emulation_required)
+ return;
+
if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
handle_external_interrupt_irqoff(vcpu);
else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI)
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From ec607a564f70519b340f7eb4cfc0f4a6b55285ac Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini(a)redhat.com>
Date: Fri, 6 Aug 2021 07:05:58 -0400
Subject: [PATCH] KVM: x86: clamp host mapping level to max_level in
kvm_mmu_max_mapping_level
This change started as a way to make kvm_mmu_hugepage_adjust a bit simpler,
but it does fix two bugs as well.
One bug is in zapping collapsible PTEs. If some large page sizes are
disallowed but not all of them, kvm_mmu_max_mapping_level will return the
host mapping level and the small PTEs will be zapped up to that level.
However, if e.g. 1GB are prohibited, we can still zap 4KB mapping and
preserve the 2MB ones. This can happen for example when NX huge pages
are in use.
The second would happen when userspace backs guest memory
with a 1gb hugepage but only assigns a subset of the page to
the guest. 1gb pages would be disallowed by the memslot, but
not 2mb. kvm_mmu_max_mapping_level() would fall through to the
host_pfn_mapping_level() logic, see the 1gb hugepage, and map the whole
thing into the guest.
Fixes: 2f57b7051fe8 ("KVM: x86/mmu: Persist gfn_lpage_is_disallowed() to max_level")
Cc: stable(a)vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
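A sketch of the clamp this patch adds, with a worked instance of the first bug
described above (illustrative only; the real code is in the diff below).
    /*
     * Example: the memslot disallows 1GiB mappings (e.g. because of NX huge
     * pages) but allows 2MiB, while the host backs the gfn with a 1GiB page:
     *
     *   max_level  == PG_LEVEL_2M  after the linfo loop,
     *   host_level == PG_LEVEL_1G  from host_pfn_mapping_level(),
     *
     * so kvm_mmu_max_mapping_level() must return
     *   min(host_level, max_level) == PG_LEVEL_2M
     * instead of the raw host level, which is what the patch does:
     */
    host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot);
    return min(host_level, max_level);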
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 54cb15e4b550..bfd2705a7291 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -2910,6 +2910,7 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
kvm_pfn_t pfn, int max_level)
{
struct kvm_lpage_info *linfo;
+ int host_level;
max_level = min(max_level, max_huge_page_level);
for ( ; max_level > PG_LEVEL_4K; max_level--) {
@@ -2921,7 +2922,8 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
if (max_level == PG_LEVEL_4K)
return PG_LEVEL_4K;
- return host_pfn_mapping_level(kvm, gfn, pfn, slot);
+ host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot);
+ return min(host_level, max_level);
}
int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
@@ -2945,17 +2947,12 @@ int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
if (!slot)
return PG_LEVEL_4K;
- level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, gfn, pfn, max_level);
- if (level == PG_LEVEL_4K)
- return level;
-
- *req_level = level = min(level, max_level);
-
/*
* Enforce the iTLB multihit workaround after capturing the requested
* level, which will be used to do precise, accurate accounting.
*/
- if (huge_page_disallowed)
+ *req_level = level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, gfn, pfn, max_level);
+ if (level == PG_LEVEL_4K || huge_page_disallowed)
return PG_LEVEL_4K;
/*
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From a3e03bc1368c1bc16e19b001fc96dc7430573cc8 Mon Sep 17 00:00:00 2001
From: Halil Pasic <pasic(a)linux.ibm.com>
Date: Fri, 27 Aug 2021 14:54:29 +0200
Subject: [PATCH] KVM: s390: index kvm->arch.idle_mask by vcpu_idx
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
While in practice vcpu->vcpu_idx == vcpu->vcpu_id is often true, it may
not always be, and we must not rely on this. Reason is that KVM decides
the vcpu_idx, userspace decides the vcpu_id, thus the two might not
match.
Currently kvm->arch.idle_mask is indexed by vcpu_id, which implies
that code like
for_each_set_bit(vcpu_id, kvm->arch.idle_mask, online_vcpus) {
vcpu = kvm_get_vcpu(kvm, vcpu_id);
do_stuff(vcpu);
}
is not legit. Reason is that kvm_get_vcpu expects a vcpu_idx, not a
vcpu_id. The trouble is, we do actually use kvm->arch.idle_mask like
this. To fix this problem we have two options. Either use
kvm_get_vcpu_by_id(vcpu_id), which would loop to find the right vcpu_id,
or switch to indexing via vcpu_idx. The latter is preferable for obvious
reasons.
Let us switch from indexing kvm->arch.idle_mask by vcpu_id to
indexing it by vcpu_idx. To keep gisa_int.kicked_mask indexed by the
same index as idle_mask, let's make the same change for it as well.
Fixes: 1ee0bc559dc3 ("KVM: s390: get rid of local_int array")
Signed-off-by: Halil Pasic <pasic(a)linux.ibm.com>
Reviewed-by: Christian Bornträger <borntraeger(a)de.ibm.com>
Reviewed-by: Claudio Imbrenda <imbrenda(a)linux.ibm.com>
Cc: <stable(a)vger.kernel.org> # 3.15+
Link: https://lore.kernel.org/r/20210827125429.1912577-1-pasic@linux.ibm.com
Signed-off-by: Christian Borntraeger <borntraeger(a)de.ibm.com>
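Condensed illustration of the indexing mix-up described above (a sketch; the
complete changes are in the diff below).
    /*
     * kvm_get_vcpu() takes a vcpu *index*, so iterating a bitmap keyed by
     * vcpu_id and feeding the bit position straight into kvm_get_vcpu() is
     * only correct when id == idx, which userspace is free to violate:
     *
     *   for_each_set_bit(vcpu_id, kvm->arch.idle_mask, online_vcpus)
     *           vcpu = kvm_get_vcpu(kvm, vcpu_id);      // id used as an idx
     *
     * The fix keys idle_mask (and gisa_int.kicked_mask) by vcpu_idx instead,
     * e.g. in __set_cpu_idle():
     *
     *   set_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask);
     *
     * so a set bit position is always valid input for kvm_get_vcpu().
     */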
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 118d5450c523..611f18ecde91 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -963,6 +963,7 @@ struct kvm_arch{
atomic64_t cmma_dirty_pages;
/* subset of available cpu features enabled by user space */
DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS);
+ /* indexed by vcpu_idx */
DECLARE_BITMAP(idle_mask, KVM_MAX_VCPUS);
struct kvm_s390_gisa_interrupt gisa_int;
struct kvm_s390_pv pv;
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index d548d60caed2..16256e17a544 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -419,13 +419,13 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu)
static void __set_cpu_idle(struct kvm_vcpu *vcpu)
{
kvm_s390_set_cpuflags(vcpu, CPUSTAT_WAIT);
- set_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask);
+ set_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask);
}
static void __unset_cpu_idle(struct kvm_vcpu *vcpu)
{
kvm_s390_clear_cpuflags(vcpu, CPUSTAT_WAIT);
- clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask);
+ clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask);
}
static void __reset_intercept_indicators(struct kvm_vcpu *vcpu)
@@ -3050,18 +3050,18 @@ int kvm_s390_get_irq_state(struct kvm_vcpu *vcpu, __u8 __user *buf, int len)
static void __airqs_kick_single_vcpu(struct kvm *kvm, u8 deliverable_mask)
{
- int vcpu_id, online_vcpus = atomic_read(&kvm->online_vcpus);
+ int vcpu_idx, online_vcpus = atomic_read(&kvm->online_vcpus);
struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int;
struct kvm_vcpu *vcpu;
- for_each_set_bit(vcpu_id, kvm->arch.idle_mask, online_vcpus) {
- vcpu = kvm_get_vcpu(kvm, vcpu_id);
+ for_each_set_bit(vcpu_idx, kvm->arch.idle_mask, online_vcpus) {
+ vcpu = kvm_get_vcpu(kvm, vcpu_idx);
if (psw_ioint_disabled(vcpu))
continue;
deliverable_mask &= (u8)(vcpu->arch.sie_block->gcr[6] >> 24);
if (deliverable_mask) {
/* lately kicked but not yet running */
- if (test_and_set_bit(vcpu_id, gi->kicked_mask))
+ if (test_and_set_bit(vcpu_idx, gi->kicked_mask))
return;
kvm_s390_vcpu_wakeup(vcpu);
return;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 5b45c83ced21..e144c8046ceb 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -4026,7 +4026,7 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu)
kvm_s390_patch_guest_per_regs(vcpu);
}
- clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.gisa_int.kicked_mask);
+ clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.gisa_int.kicked_mask);
vcpu->arch.sie_block->icptcode = 0;
cpuflags = atomic_read(&vcpu->arch.sie_block->cpuflags);
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 9fad25109b0d..ecd741ee3276 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -79,7 +79,7 @@ static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu)
static inline int is_vcpu_idle(struct kvm_vcpu *vcpu)
{
- return test_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask);
+ return test_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask);
}
static inline int kvm_is_ucontrol(struct kvm *kvm)
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From a3e03bc1368c1bc16e19b001fc96dc7430573cc8 Mon Sep 17 00:00:00 2001
From: Halil Pasic <pasic(a)linux.ibm.com>
Date: Fri, 27 Aug 2021 14:54:29 +0200
Subject: [PATCH] KVM: s390: index kvm->arch.idle_mask by vcpu_idx
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
While in practice vcpu->vcpu_idx == vcpu->vcpu_id is often true, it may
not always be, and we must not rely on this. Reason is that KVM decides
the vcpu_idx, userspace decides the vcpu_id, thus the two might not
match.
Currently kvm->arch.idle_mask is indexed by vcpu_id, which implies
that code like
for_each_set_bit(vcpu_id, kvm->arch.idle_mask, online_vcpus) {
vcpu = kvm_get_vcpu(kvm, vcpu_id);
do_stuff(vcpu);
}
is not legit. Reason is that kvm_get_vcpu expects a vcpu_idx, not a
vcpu_id. The trouble is, we do actually use kvm->arch.idle_mask like
this. To fix this problem we have two options. Either use
kvm_get_vcpu_by_id(vcpu_id), which would loop to find the right vcpu_id,
or switch to indexing via vcpu_idx. The latter is preferable for obvious
reasons.
Let us switch from indexing kvm->arch.idle_mask by vcpu_id to
indexing it by vcpu_idx. To keep gisa_int.kicked_mask indexed by the
same index as idle_mask, let's make the same change for it as well.
Fixes: 1ee0bc559dc3 ("KVM: s390: get rid of local_int array")
Signed-off-by: Halil Pasic <pasic(a)linux.ibm.com>
Reviewed-by: Christian Bornträger <borntraeger(a)de.ibm.com>
Reviewed-by: Claudio Imbrenda <imbrenda(a)linux.ibm.com>
Cc: <stable(a)vger.kernel.org> # 3.15+
Link: https://lore.kernel.org/r/20210827125429.1912577-1-pasic@linux.ibm.com
Signed-off-by: Christian Borntraeger <borntraeger(a)de.ibm.com>
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 118d5450c523..611f18ecde91 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -963,6 +963,7 @@ struct kvm_arch{
atomic64_t cmma_dirty_pages;
/* subset of available cpu features enabled by user space */
DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS);
+ /* indexed by vcpu_idx */
DECLARE_BITMAP(idle_mask, KVM_MAX_VCPUS);
struct kvm_s390_gisa_interrupt gisa_int;
struct kvm_s390_pv pv;
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index d548d60caed2..16256e17a544 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -419,13 +419,13 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu)
static void __set_cpu_idle(struct kvm_vcpu *vcpu)
{
kvm_s390_set_cpuflags(vcpu, CPUSTAT_WAIT);
- set_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask);
+ set_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask);
}
static void __unset_cpu_idle(struct kvm_vcpu *vcpu)
{
kvm_s390_clear_cpuflags(vcpu, CPUSTAT_WAIT);
- clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask);
+ clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask);
}
static void __reset_intercept_indicators(struct kvm_vcpu *vcpu)
@@ -3050,18 +3050,18 @@ int kvm_s390_get_irq_state(struct kvm_vcpu *vcpu, __u8 __user *buf, int len)
static void __airqs_kick_single_vcpu(struct kvm *kvm, u8 deliverable_mask)
{
- int vcpu_id, online_vcpus = atomic_read(&kvm->online_vcpus);
+ int vcpu_idx, online_vcpus = atomic_read(&kvm->online_vcpus);
struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int;
struct kvm_vcpu *vcpu;
- for_each_set_bit(vcpu_id, kvm->arch.idle_mask, online_vcpus) {
- vcpu = kvm_get_vcpu(kvm, vcpu_id);
+ for_each_set_bit(vcpu_idx, kvm->arch.idle_mask, online_vcpus) {
+ vcpu = kvm_get_vcpu(kvm, vcpu_idx);
if (psw_ioint_disabled(vcpu))
continue;
deliverable_mask &= (u8)(vcpu->arch.sie_block->gcr[6] >> 24);
if (deliverable_mask) {
/* lately kicked but not yet running */
- if (test_and_set_bit(vcpu_id, gi->kicked_mask))
+ if (test_and_set_bit(vcpu_idx, gi->kicked_mask))
return;
kvm_s390_vcpu_wakeup(vcpu);
return;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 5b45c83ced21..e144c8046ceb 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -4026,7 +4026,7 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu)
kvm_s390_patch_guest_per_regs(vcpu);
}
- clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.gisa_int.kicked_mask);
+ clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.gisa_int.kicked_mask);
vcpu->arch.sie_block->icptcode = 0;
cpuflags = atomic_read(&vcpu->arch.sie_block->cpuflags);
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 9fad25109b0d..ecd741ee3276 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -79,7 +79,7 @@ static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu)
static inline int is_vcpu_idle(struct kvm_vcpu *vcpu)
{
- return test_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask);
+ return test_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask);
}
static inline int kvm_is_ucontrol(struct kvm *kvm)
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From a3e03bc1368c1bc16e19b001fc96dc7430573cc8 Mon Sep 17 00:00:00 2001
From: Halil Pasic <pasic(a)linux.ibm.com>
Date: Fri, 27 Aug 2021 14:54:29 +0200
Subject: [PATCH] KVM: s390: index kvm->arch.idle_mask by vcpu_idx
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
While in practice vcpu->vcpu_idx == vcpu->vcpu_id is often true, it may
not always be, and we must not rely on this. Reason is that KVM decides
the vcpu_idx, userspace decides the vcpu_id, thus the two might not
match.
Currently kvm->arch.idle_mask is indexed by vcpu_id, which implies
that code like
for_each_set_bit(vcpu_id, kvm->arch.idle_mask, online_vcpus) {
vcpu = kvm_get_vcpu(kvm, vcpu_id);
do_stuff(vcpu);
}
is not legit. Reason is that kvm_get_vcpu expects a vcpu_idx, not a
vcpu_id. The trouble is, we do actually use kvm->arch.idle_mask like
this. To fix this problem we have two options. Either use
kvm_get_vcpu_by_id(vcpu_id), which would loop to find the right vcpu_id,
or switch to indexing via vcpu_idx. The latter is preferable for obvious
reasons.
Let us switch from indexing kvm->arch.idle_mask by vcpu_id to
indexing it by vcpu_idx. To keep gisa_int.kicked_mask indexed by the
same index as idle_mask, let's make the same change for it as well.
Fixes: 1ee0bc559dc3 ("KVM: s390: get rid of local_int array")
Signed-off-by: Halil Pasic <pasic(a)linux.ibm.com>
Reviewed-by: Christian Bornträger <borntraeger(a)de.ibm.com>
Reviewed-by: Claudio Imbrenda <imbrenda(a)linux.ibm.com>
Cc: <stable(a)vger.kernel.org> # 3.15+
Link: https://lore.kernel.org/r/20210827125429.1912577-1-pasic@linux.ibm.com
Signed-off-by: Christian Borntraeger <borntraeger(a)de.ibm.com>
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 118d5450c523..611f18ecde91 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -963,6 +963,7 @@ struct kvm_arch{
atomic64_t cmma_dirty_pages;
/* subset of available cpu features enabled by user space */
DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS);
+ /* indexed by vcpu_idx */
DECLARE_BITMAP(idle_mask, KVM_MAX_VCPUS);
struct kvm_s390_gisa_interrupt gisa_int;
struct kvm_s390_pv pv;
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index d548d60caed2..16256e17a544 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -419,13 +419,13 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu)
static void __set_cpu_idle(struct kvm_vcpu *vcpu)
{
kvm_s390_set_cpuflags(vcpu, CPUSTAT_WAIT);
- set_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask);
+ set_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask);
}
static void __unset_cpu_idle(struct kvm_vcpu *vcpu)
{
kvm_s390_clear_cpuflags(vcpu, CPUSTAT_WAIT);
- clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask);
+ clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask);
}
static void __reset_intercept_indicators(struct kvm_vcpu *vcpu)
@@ -3050,18 +3050,18 @@ int kvm_s390_get_irq_state(struct kvm_vcpu *vcpu, __u8 __user *buf, int len)
static void __airqs_kick_single_vcpu(struct kvm *kvm, u8 deliverable_mask)
{
- int vcpu_id, online_vcpus = atomic_read(&kvm->online_vcpus);
+ int vcpu_idx, online_vcpus = atomic_read(&kvm->online_vcpus);
struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int;
struct kvm_vcpu *vcpu;
- for_each_set_bit(vcpu_id, kvm->arch.idle_mask, online_vcpus) {
- vcpu = kvm_get_vcpu(kvm, vcpu_id);
+ for_each_set_bit(vcpu_idx, kvm->arch.idle_mask, online_vcpus) {
+ vcpu = kvm_get_vcpu(kvm, vcpu_idx);
if (psw_ioint_disabled(vcpu))
continue;
deliverable_mask &= (u8)(vcpu->arch.sie_block->gcr[6] >> 24);
if (deliverable_mask) {
/* lately kicked but not yet running */
- if (test_and_set_bit(vcpu_id, gi->kicked_mask))
+ if (test_and_set_bit(vcpu_idx, gi->kicked_mask))
return;
kvm_s390_vcpu_wakeup(vcpu);
return;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 5b45c83ced21..e144c8046ceb 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -4026,7 +4026,7 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu)
kvm_s390_patch_guest_per_regs(vcpu);
}
- clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.gisa_int.kicked_mask);
+ clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.gisa_int.kicked_mask);
vcpu->arch.sie_block->icptcode = 0;
cpuflags = atomic_read(&vcpu->arch.sie_block->cpuflags);
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 9fad25109b0d..ecd741ee3276 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -79,7 +79,7 @@ static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu)
static inline int is_vcpu_idle(struct kvm_vcpu *vcpu)
{
- return test_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask);
+ return test_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask);
}
static inline int kvm_is_ucontrol(struct kvm *kvm)
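For anyone preparing the backport, here is a minimal sketch of the two lookup
patterns discussed above; do_stuff() is the placeholder from the commit
message, and kvm_get_vcpu_by_id() is assumed to exist in the target tree.
/* After this patch the mask is indexed by vcpu_idx, so the plain
 * kvm_get_vcpu() lookup is correct:
 */
for_each_set_bit(vcpu_idx, kvm->arch.idle_mask, online_vcpus) {
	vcpu = kvm_get_vcpu(kvm, vcpu_idx);
	do_stuff(vcpu);
}
/* A backport that keeps the mask indexed by vcpu_id would instead have
 * to resolve the id with kvm_get_vcpu_by_id(), which walks the vcpus
 * to find the matching id:
 */
for_each_set_bit(vcpu_id, kvm->arch.idle_mask, online_vcpus) {
	vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
	if (vcpu)
		do_stuff(vcpu);
}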
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From a3e03bc1368c1bc16e19b001fc96dc7430573cc8 Mon Sep 17 00:00:00 2001
From: Halil Pasic <pasic(a)linux.ibm.com>
Date: Fri, 27 Aug 2021 14:54:29 +0200
Subject: [PATCH] KVM: s390: index kvm->arch.idle_mask by vcpu_idx
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
While in practice vcpu->vcpu_idx == vcpu->vcpu_id is often true, it may
not always be, and we must not rely on this. The reason is that KVM decides
the vcpu_idx while userspace decides the vcpu_id, so the two might not
match.
Currently kvm->arch.idle_mask is indexed by vcpu_id, which implies
that code like
for_each_set_bit(vcpu_id, kvm->arch.idle_mask, online_vcpus) {
vcpu = kvm_get_vcpu(kvm, vcpu_id);
do_stuff(vcpu);
}
is not legit. The reason is that kvm_get_vcpu expects a vcpu_idx, not a
vcpu_id. The trouble is, we do actually use kvm->arch.idle_mask like
this. To fix this problem we have two options: either use
kvm_get_vcpu_by_id(vcpu_id), which would loop to find the right vcpu_id,
or switch to indexing via vcpu_idx. The latter is preferable for obvious
reasons.
Let us switch from indexing kvm->arch.idle_mask by vcpu_id to
indexing it by vcpu_idx. To keep gisa_int.kicked_mask indexed by the
same index as idle_mask, let's make the same change for it as well.
Fixes: 1ee0bc559dc3 ("KVM: s390: get rid of local_int array")
Signed-off-by: Halil Pasic <pasic(a)linux.ibm.com>
Reviewed-by: Christian Bornträger <borntraeger(a)de.ibm.com>
Reviewed-by: Claudio Imbrenda <imbrenda(a)linux.ibm.com>
Cc: <stable(a)vger.kernel.org> # 3.15+
Link: https://lore.kernel.org/r/20210827125429.1912577-1-pasic@linux.ibm.com
Signed-off-by: Christian Borntraeger <borntraeger(a)de.ibm.com>
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 118d5450c523..611f18ecde91 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -963,6 +963,7 @@ struct kvm_arch{
atomic64_t cmma_dirty_pages;
/* subset of available cpu features enabled by user space */
DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS);
+ /* indexed by vcpu_idx */
DECLARE_BITMAP(idle_mask, KVM_MAX_VCPUS);
struct kvm_s390_gisa_interrupt gisa_int;
struct kvm_s390_pv pv;
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index d548d60caed2..16256e17a544 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -419,13 +419,13 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu)
static void __set_cpu_idle(struct kvm_vcpu *vcpu)
{
kvm_s390_set_cpuflags(vcpu, CPUSTAT_WAIT);
- set_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask);
+ set_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask);
}
static void __unset_cpu_idle(struct kvm_vcpu *vcpu)
{
kvm_s390_clear_cpuflags(vcpu, CPUSTAT_WAIT);
- clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask);
+ clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask);
}
static void __reset_intercept_indicators(struct kvm_vcpu *vcpu)
@@ -3050,18 +3050,18 @@ int kvm_s390_get_irq_state(struct kvm_vcpu *vcpu, __u8 __user *buf, int len)
static void __airqs_kick_single_vcpu(struct kvm *kvm, u8 deliverable_mask)
{
- int vcpu_id, online_vcpus = atomic_read(&kvm->online_vcpus);
+ int vcpu_idx, online_vcpus = atomic_read(&kvm->online_vcpus);
struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int;
struct kvm_vcpu *vcpu;
- for_each_set_bit(vcpu_id, kvm->arch.idle_mask, online_vcpus) {
- vcpu = kvm_get_vcpu(kvm, vcpu_id);
+ for_each_set_bit(vcpu_idx, kvm->arch.idle_mask, online_vcpus) {
+ vcpu = kvm_get_vcpu(kvm, vcpu_idx);
if (psw_ioint_disabled(vcpu))
continue;
deliverable_mask &= (u8)(vcpu->arch.sie_block->gcr[6] >> 24);
if (deliverable_mask) {
/* lately kicked but not yet running */
- if (test_and_set_bit(vcpu_id, gi->kicked_mask))
+ if (test_and_set_bit(vcpu_idx, gi->kicked_mask))
return;
kvm_s390_vcpu_wakeup(vcpu);
return;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 5b45c83ced21..e144c8046ceb 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -4026,7 +4026,7 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu)
kvm_s390_patch_guest_per_regs(vcpu);
}
- clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.gisa_int.kicked_mask);
+ clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.gisa_int.kicked_mask);
vcpu->arch.sie_block->icptcode = 0;
cpuflags = atomic_read(&vcpu->arch.sie_block->cpuflags);
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 9fad25109b0d..ecd741ee3276 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -79,7 +79,7 @@ static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu)
static inline int is_vcpu_idle(struct kvm_vcpu *vcpu)
{
- return test_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask);
+ return test_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask);
}
static inline int kvm_is_ucontrol(struct kvm *kvm)
The patch below does not apply to the 5.13-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 2ae2eb9dde18979b40629dd413b9adbd6c894cdf Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence(a)gmail.com>
Date: Thu, 9 Sep 2021 13:56:27 +0100
Subject: [PATCH] io_uring: fail links of cancelled timeouts
When we cancel a timeout, we should mark it with REQ_F_FAIL so that
linked requests are cancelled as well, rather than queued for further
execution.
Cc: stable(a)vger.kernel.org
Signed-off-by: Pavel Begunkov <asml.silence(a)gmail.com>
Link: https://lore.kernel.org/r/fff625b44eeced3a5cae79f60e6acf3fbdf8f990.16311921…
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/fs/io_uring.c b/fs/io_uring.c
index b21a423a4de8..ffd91844b2e5 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1482,6 +1482,8 @@ static void io_kill_timeout(struct io_kiocb *req, int status)
struct io_timeout_data *io = req->async_data;
if (hrtimer_try_to_cancel(&io->timer) != -1) {
+ if (status)
+ req_set_fail(req);
atomic_set(&req->ctx->cq_timeouts,
atomic_read(&req->ctx->cq_timeouts) + 1);
list_del_init(&req->timeout.list);
The patch below does not apply to the 5.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 89c2b3b74918200e46699338d7bcc19b1ea12110 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence(a)gmail.com>
Date: Mon, 23 Aug 2021 11:18:45 +0100
Subject: [PATCH] io_uring: reexpand under-reexpanded iters
[ 74.211232] BUG: KASAN: stack-out-of-bounds in iov_iter_revert+0x809/0x900
[ 74.212778] Read of size 8 at addr ffff888025dc78b8 by task
syz-executor.0/828
[ 74.214756] CPU: 0 PID: 828 Comm: syz-executor.0 Not tainted
5.14.0-rc3-next-20210730 #1
[ 74.216525] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
[ 74.219033] Call Trace:
[ 74.219683] dump_stack_lvl+0x8b/0xb3
[ 74.220706] print_address_description.constprop.0+0x1f/0x140
[ 74.224226] kasan_report.cold+0x7f/0x11b
[ 74.226085] iov_iter_revert+0x809/0x900
[ 74.227960] io_write+0x57d/0xe40
[ 74.232647] io_issue_sqe+0x4da/0x6a80
[ 74.242578] __io_queue_sqe+0x1ac/0xe60
[ 74.245358] io_submit_sqes+0x3f6e/0x76a0
[ 74.248207] __do_sys_io_uring_enter+0x90c/0x1a20
[ 74.257167] do_syscall_64+0x3b/0x90
[ 74.257984] entry_SYSCALL_64_after_hwframe+0x44/0xae
old_size = iov_iter_count();
...
iov_iter_revert(old_size - iov_iter_count());
If iov_iter_revert() is done based on the initial size as above, and the
iter is truncated and not reexpanded in the middle, it miscalculates
borders, causing problems. This trace is due to no one reexpanding after
generic_write_checks().
Now iters store how many bytes have been truncated, so reexpand them to
the initial state right before reverting.
Cc: stable(a)vger.kernel.org
Reported-by: Palash Oswal <oswalpalash(a)gmail.com>
Reported-by: Sudip Mukherjee <sudipm.mukherjee(a)gmail.com>
Reported-and-tested-by: syzbot+9671693590ef5aad8953(a)syzkaller.appspotmail.com
Signed-off-by: Pavel Begunkov <asml.silence(a)gmail.com>
Signed-off-by: Al Viro <viro(a)zeniv.linux.org.uk>
diff --git a/fs/io_uring.c b/fs/io_uring.c
index d94fb5835a20..71639d39caf9 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3281,6 +3281,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
if (req->flags & REQ_F_NOWAIT)
goto done;
/* some cases will consume bytes even on error returns */
+ iov_iter_reexpand(iter, iter->count + iter->truncated);
iov_iter_revert(iter, io_size - iov_iter_count(iter));
ret = 0;
} else if (ret == -EIOCBQUEUED) {
@@ -3420,6 +3421,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
} else {
copy_iov:
/* some cases will consume bytes even on error returns */
+ iov_iter_reexpand(iter, iter->count + iter->truncated);
iov_iter_revert(iter, io_size - iov_iter_count(iter));
ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
return ret ?: -EAGAIN;
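Reduced to a sketch (do_the_write(), limit and truncated_bytes are
illustrative placeholders; the iov_iter helpers are the real ones), the
pattern behind the trace and the fix looks like this:
size_t old_size = iov_iter_count(iter);	/* captured before any truncation */

iov_iter_truncate(iter, limit);		/* e.g. via generic_write_checks() */
ret = do_the_write(iter);		/* consumes some bytes from the iter */

/*
 * Buggy: old_size still includes the truncated tail, so this reverts
 * further back than what was actually consumed:
 *
 *	iov_iter_revert(iter, old_size - iov_iter_count(iter));
 *
 * Fixed: give the truncated bytes back first, then the subtraction is
 * exactly the number of bytes consumed.
 */
iov_iter_reexpand(iter, iov_iter_count(iter) + truncated_bytes);
iov_iter_revert(iter, old_size - iov_iter_count(iter));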
The patch below does not apply to the 5.13-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 89c2b3b74918200e46699338d7bcc19b1ea12110 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence(a)gmail.com>
Date: Mon, 23 Aug 2021 11:18:45 +0100
Subject: [PATCH] io_uring: reexpand under-reexpanded iters
[ 74.211232] BUG: KASAN: stack-out-of-bounds in iov_iter_revert+0x809/0x900
[ 74.212778] Read of size 8 at addr ffff888025dc78b8 by task
syz-executor.0/828
[ 74.214756] CPU: 0 PID: 828 Comm: syz-executor.0 Not tainted
5.14.0-rc3-next-20210730 #1
[ 74.216525] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
[ 74.219033] Call Trace:
[ 74.219683] dump_stack_lvl+0x8b/0xb3
[ 74.220706] print_address_description.constprop.0+0x1f/0x140
[ 74.224226] kasan_report.cold+0x7f/0x11b
[ 74.226085] iov_iter_revert+0x809/0x900
[ 74.227960] io_write+0x57d/0xe40
[ 74.232647] io_issue_sqe+0x4da/0x6a80
[ 74.242578] __io_queue_sqe+0x1ac/0xe60
[ 74.245358] io_submit_sqes+0x3f6e/0x76a0
[ 74.248207] __do_sys_io_uring_enter+0x90c/0x1a20
[ 74.257167] do_syscall_64+0x3b/0x90
[ 74.257984] entry_SYSCALL_64_after_hwframe+0x44/0xae
old_size = iov_iter_count();
...
iov_iter_revert(old_size - iov_iter_count());
If iov_iter_revert() is done based on the initial size as above, and the
iter is truncated and not reexpanded in the middle, it miscalculates
borders, causing problems. This trace is due to no one reexpanding after
generic_write_checks().
Now iters store how many bytes have been truncated, so reexpand them to
the initial state right before reverting.
Cc: stable(a)vger.kernel.org
Reported-by: Palash Oswal <oswalpalash(a)gmail.com>
Reported-by: Sudip Mukherjee <sudipm.mukherjee(a)gmail.com>
Reported-and-tested-by: syzbot+9671693590ef5aad8953(a)syzkaller.appspotmail.com
Signed-off-by: Pavel Begunkov <asml.silence(a)gmail.com>
Signed-off-by: Al Viro <viro(a)zeniv.linux.org.uk>
diff --git a/fs/io_uring.c b/fs/io_uring.c
index d94fb5835a20..71639d39caf9 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3281,6 +3281,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
if (req->flags & REQ_F_NOWAIT)
goto done;
/* some cases will consume bytes even on error returns */
+ iov_iter_reexpand(iter, iter->count + iter->truncated);
iov_iter_revert(iter, io_size - iov_iter_count(iter));
ret = 0;
} else if (ret == -EIOCBQUEUED) {
@@ -3420,6 +3421,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
} else {
copy_iov:
/* some cases will consume bytes even on error returns */
+ iov_iter_reexpand(iter, iter->count + iter->truncated);
iov_iter_revert(iter, io_size - iov_iter_count(iter));
ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
return ret ?: -EAGAIN;
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 2ae2eb9dde18979b40629dd413b9adbd6c894cdf Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence(a)gmail.com>
Date: Thu, 9 Sep 2021 13:56:27 +0100
Subject: [PATCH] io_uring: fail links of cancelled timeouts
When we cancel a timeout, we should mark it with REQ_F_FAIL so that
linked requests are cancelled as well, rather than queued for further
execution.
Cc: stable(a)vger.kernel.org
Signed-off-by: Pavel Begunkov <asml.silence(a)gmail.com>
Link: https://lore.kernel.org/r/fff625b44eeced3a5cae79f60e6acf3fbdf8f990.16311921…
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/fs/io_uring.c b/fs/io_uring.c
index b21a423a4de8..ffd91844b2e5 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1482,6 +1482,8 @@ static void io_kill_timeout(struct io_kiocb *req, int status)
struct io_timeout_data *io = req->async_data;
if (hrtimer_try_to_cancel(&io->timer) != -1) {
+ if (status)
+ req_set_fail(req);
atomic_set(&req->ctx->cq_timeouts,
atomic_read(&req->ctx->cq_timeouts) + 1);
list_del_init(&req->timeout.list);
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 89c2b3b74918200e46699338d7bcc19b1ea12110 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence(a)gmail.com>
Date: Mon, 23 Aug 2021 11:18:45 +0100
Subject: [PATCH] io_uring: reexpand under-reexpanded iters
[ 74.211232] BUG: KASAN: stack-out-of-bounds in iov_iter_revert+0x809/0x900
[ 74.212778] Read of size 8 at addr ffff888025dc78b8 by task
syz-executor.0/828
[ 74.214756] CPU: 0 PID: 828 Comm: syz-executor.0 Not tainted
5.14.0-rc3-next-20210730 #1
[ 74.216525] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
[ 74.219033] Call Trace:
[ 74.219683] dump_stack_lvl+0x8b/0xb3
[ 74.220706] print_address_description.constprop.0+0x1f/0x140
[ 74.224226] kasan_report.cold+0x7f/0x11b
[ 74.226085] iov_iter_revert+0x809/0x900
[ 74.227960] io_write+0x57d/0xe40
[ 74.232647] io_issue_sqe+0x4da/0x6a80
[ 74.242578] __io_queue_sqe+0x1ac/0xe60
[ 74.245358] io_submit_sqes+0x3f6e/0x76a0
[ 74.248207] __do_sys_io_uring_enter+0x90c/0x1a20
[ 74.257167] do_syscall_64+0x3b/0x90
[ 74.257984] entry_SYSCALL_64_after_hwframe+0x44/0xae
old_size = iov_iter_count();
...
iov_iter_revert(old_size - iov_iter_count());
If iov_iter_revert() is done based on the initial size as above, and the
iter is truncated and not reexpanded in the middle, it miscalculates
borders, causing problems. This trace is due to no one reexpanding after
generic_write_checks().
Now iters store how many bytes have been truncated, so reexpand them to
the initial state right before reverting.
Cc: stable(a)vger.kernel.org
Reported-by: Palash Oswal <oswalpalash(a)gmail.com>
Reported-by: Sudip Mukherjee <sudipm.mukherjee(a)gmail.com>
Reported-and-tested-by: syzbot+9671693590ef5aad8953(a)syzkaller.appspotmail.com
Signed-off-by: Pavel Begunkov <asml.silence(a)gmail.com>
Signed-off-by: Al Viro <viro(a)zeniv.linux.org.uk>
diff --git a/fs/io_uring.c b/fs/io_uring.c
index d94fb5835a20..71639d39caf9 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3281,6 +3281,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
if (req->flags & REQ_F_NOWAIT)
goto done;
/* some cases will consume bytes even on error returns */
+ iov_iter_reexpand(iter, iter->count + iter->truncated);
iov_iter_revert(iter, io_size - iov_iter_count(iter));
ret = 0;
} else if (ret == -EIOCBQUEUED) {
@@ -3420,6 +3421,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
} else {
copy_iov:
/* some cases will consume bytes even on error returns */
+ iov_iter_reexpand(iter, iter->count + iter->truncated);
iov_iter_revert(iter, io_size - iov_iter_count(iter));
ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
return ret ?: -EAGAIN;
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From dadebc350da2bef62593b1df007a6e0b90baf42a Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence(a)gmail.com>
Date: Mon, 23 Aug 2021 13:30:44 +0100
Subject: [PATCH] io_uring: fix io_try_cancel_userdata race for iowq
WARNING: CPU: 1 PID: 5870 at fs/io_uring.c:5975 io_try_cancel_userdata+0x30f/0x540 fs/io_uring.c:5975
CPU: 0 PID: 5870 Comm: iou-wrk-5860 Not tainted 5.14.0-rc6-next-20210820-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
RIP: 0010:io_try_cancel_userdata+0x30f/0x540 fs/io_uring.c:5975
Call Trace:
io_async_cancel fs/io_uring.c:6014 [inline]
io_issue_sqe+0x22d5/0x65a0 fs/io_uring.c:6407
io_wq_submit_work+0x1dc/0x300 fs/io_uring.c:6511
io_worker_handle_work+0xa45/0x1840 fs/io-wq.c:533
io_wqe_worker+0x2cc/0xbb0 fs/io-wq.c:582
ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295
io_try_cancel_userdata() can be called from io_async_cancel() executing
in the io-wq context, so the warning fires; it is there to alert
anyone accessing task->io_uring->io_wq in a racy way. However,
io_wq_put_and_exit() always waits for all threads to complete first,
so the only detail left is to zero tctx->io_wq after the context is
removed.
Note: one small assumption is that when IO_WQ_WORK_CANCEL is set, the
executor won't touch ->io_wq, because io_wq_destroy() might cancel
leftover pending requests in such a way.
Cc: stable(a)vger.kernel.org
Reported-by: syzbot+b0c9d1588ae92866515f(a)syzkaller.appspotmail.com
Signed-off-by: Pavel Begunkov <asml.silence(a)gmail.com>
Link: https://lore.kernel.org/r/dfdd37a80cfa9ffd3e59538929c99cdd55d8699e.16297217…
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 827e60ae4909..6859438c4e09 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -5863,7 +5863,7 @@ static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr)
struct io_ring_ctx *ctx = req->ctx;
int ret;
- WARN_ON_ONCE(req->task != current);
+ WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current);
ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx);
if (ret != -ENOENT)
@@ -6369,6 +6369,7 @@ static void io_wq_submit_work(struct io_wq_work *work)
if (timeout)
io_queue_linked_timeout(timeout);
+ /* either cancelled or io-wq is dying, so don't touch tctx->iowq */
if (work->flags & IO_WQ_WORK_CANCEL)
ret = -ECANCELED;
@@ -9184,8 +9185,8 @@ static void io_uring_clean_tctx(struct io_uring_task *tctx)
* Must be after io_uring_del_task_file() (removes nodes under
* uring_lock) to avoid race with io_uring_try_cancel_iowq().
*/
- tctx->io_wq = NULL;
io_wq_put_and_exit(wq);
+ tctx->io_wq = NULL;
}
}
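The ordering matters because the cancellation side inspects tctx->io_wq under
uring_lock; a rough sketch of both sides (simplified, not the exact upstream
code):
/* Cancelling side (io_uring_try_cancel_iowq()-like), under uring_lock: */
mutex_lock(&ctx->uring_lock);
if (tctx->io_wq)
	io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true);
mutex_unlock(&ctx->uring_lock);

/* Exiting side: io_wq_put_and_exit() first waits for every worker,
 * including any that is still running io_async_cancel() and therefore
 * still dereferencing task->io_uring->io_wq, so the pointer may only
 * be cleared once that wait has finished.
 */
io_wq_put_and_exit(wq);
tctx->io_wq = NULL;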
The patch below does not apply to the 5.13-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From dadebc350da2bef62593b1df007a6e0b90baf42a Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence(a)gmail.com>
Date: Mon, 23 Aug 2021 13:30:44 +0100
Subject: [PATCH] io_uring: fix io_try_cancel_userdata race for iowq
WARNING: CPU: 1 PID: 5870 at fs/io_uring.c:5975 io_try_cancel_userdata+0x30f/0x540 fs/io_uring.c:5975
CPU: 0 PID: 5870 Comm: iou-wrk-5860 Not tainted 5.14.0-rc6-next-20210820-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
RIP: 0010:io_try_cancel_userdata+0x30f/0x540 fs/io_uring.c:5975
Call Trace:
io_async_cancel fs/io_uring.c:6014 [inline]
io_issue_sqe+0x22d5/0x65a0 fs/io_uring.c:6407
io_wq_submit_work+0x1dc/0x300 fs/io_uring.c:6511
io_worker_handle_work+0xa45/0x1840 fs/io-wq.c:533
io_wqe_worker+0x2cc/0xbb0 fs/io-wq.c:582
ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295
io_try_cancel_userdata() can be called from io_async_cancel() executing
in the io-wq context, so the warning fires; it is there to alert
anyone accessing task->io_uring->io_wq in a racy way. However,
io_wq_put_and_exit() always waits for all threads to complete first,
so the only detail left is to zero tctx->io_wq after the context is
removed.
Note: one small assumption is that when IO_WQ_WORK_CANCEL is set, the
executor won't touch ->io_wq, because io_wq_destroy() might cancel
leftover pending requests in such a way.
Cc: stable(a)vger.kernel.org
Reported-by: syzbot+b0c9d1588ae92866515f(a)syzkaller.appspotmail.com
Signed-off-by: Pavel Begunkov <asml.silence(a)gmail.com>
Link: https://lore.kernel.org/r/dfdd37a80cfa9ffd3e59538929c99cdd55d8699e.16297217…
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 827e60ae4909..6859438c4e09 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -5863,7 +5863,7 @@ static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr)
struct io_ring_ctx *ctx = req->ctx;
int ret;
- WARN_ON_ONCE(req->task != current);
+ WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current);
ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx);
if (ret != -ENOENT)
@@ -6369,6 +6369,7 @@ static void io_wq_submit_work(struct io_wq_work *work)
if (timeout)
io_queue_linked_timeout(timeout);
+ /* either cancelled or io-wq is dying, so don't touch tctx->iowq */
if (work->flags & IO_WQ_WORK_CANCEL)
ret = -ECANCELED;
@@ -9184,8 +9185,8 @@ static void io_uring_clean_tctx(struct io_uring_task *tctx)
* Must be after io_uring_del_task_file() (removes nodes under
* uring_lock) to avoid race with io_uring_try_cancel_iowq().
*/
- tctx->io_wq = NULL;
io_wq_put_and_exit(wq);
+ tctx->io_wq = NULL;
}
}
The patch below does not apply to the 5.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From dadebc350da2bef62593b1df007a6e0b90baf42a Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence(a)gmail.com>
Date: Mon, 23 Aug 2021 13:30:44 +0100
Subject: [PATCH] io_uring: fix io_try_cancel_userdata race for iowq
WARNING: CPU: 1 PID: 5870 at fs/io_uring.c:5975 io_try_cancel_userdata+0x30f/0x540 fs/io_uring.c:5975
CPU: 0 PID: 5870 Comm: iou-wrk-5860 Not tainted 5.14.0-rc6-next-20210820-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
RIP: 0010:io_try_cancel_userdata+0x30f/0x540 fs/io_uring.c:5975
Call Trace:
io_async_cancel fs/io_uring.c:6014 [inline]
io_issue_sqe+0x22d5/0x65a0 fs/io_uring.c:6407
io_wq_submit_work+0x1dc/0x300 fs/io_uring.c:6511
io_worker_handle_work+0xa45/0x1840 fs/io-wq.c:533
io_wqe_worker+0x2cc/0xbb0 fs/io-wq.c:582
ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295
io_try_cancel_userdata() can be called from io_async_cancel() executing
in the io-wq context, so the warning fires; it is there to alert
anyone accessing task->io_uring->io_wq in a racy way. However,
io_wq_put_and_exit() always waits for all threads to complete first,
so the only detail left is to zero tctx->io_wq after the context is
removed.
Note: one small assumption is that when IO_WQ_WORK_CANCEL is set, the
executor won't touch ->io_wq, because io_wq_destroy() might cancel
leftover pending requests in such a way.
Cc: stable(a)vger.kernel.org
Reported-by: syzbot+b0c9d1588ae92866515f(a)syzkaller.appspotmail.com
Signed-off-by: Pavel Begunkov <asml.silence(a)gmail.com>
Link: https://lore.kernel.org/r/dfdd37a80cfa9ffd3e59538929c99cdd55d8699e.16297217…
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 827e60ae4909..6859438c4e09 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -5863,7 +5863,7 @@ static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr)
struct io_ring_ctx *ctx = req->ctx;
int ret;
- WARN_ON_ONCE(req->task != current);
+ WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current);
ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx);
if (ret != -ENOENT)
@@ -6369,6 +6369,7 @@ static void io_wq_submit_work(struct io_wq_work *work)
if (timeout)
io_queue_linked_timeout(timeout);
+ /* either cancelled or io-wq is dying, so don't touch tctx->iowq */
if (work->flags & IO_WQ_WORK_CANCEL)
ret = -ECANCELED;
@@ -9184,8 +9185,8 @@ static void io_uring_clean_tctx(struct io_uring_task *tctx)
* Must be after io_uring_del_task_file() (removes nodes under
* uring_lock) to avoid race with io_uring_try_cancel_iowq().
*/
- tctx->io_wq = NULL;
io_wq_put_and_exit(wq);
+ tctx->io_wq = NULL;
}
}
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 0bea96f59ba40e63c0ae93ad6a02417b95f22f4d Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence(a)gmail.com>
Date: Fri, 20 Aug 2021 10:36:36 +0100
Subject: [PATCH] io_uring: place fixed tables under memcg limits
Fixed tables may be quite large, so place all of them, together with the
allocated tags, under memcg limits.
Cc: stable(a)vger.kernel.org
Signed-off-by: Pavel Begunkov <asml.silence(a)gmail.com>
Link: https://lore.kernel.org/r/b3ac9f5da9821bb59837b5fe25e8ef4be982218c.16294516…
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 9a021fe6c461..c6f1afa5ec04 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7138,14 +7138,14 @@ static void **io_alloc_page_table(size_t size)
size_t init_size = size;
void **table;
- table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL);
+ table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
if (!table)
return NULL;
for (i = 0; i < nr_tables; i++) {
unsigned int this_size = min_t(size_t, size, PAGE_SIZE);
- table[i] = kzalloc(this_size, GFP_KERNEL);
+ table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
if (!table[i]) {
io_free_page_table(table, init_size);
return NULL;
@@ -7336,7 +7336,8 @@ static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
{
- table->files = kvcalloc(nr_files, sizeof(table->files[0]), GFP_KERNEL);
+ table->files = kvcalloc(nr_files, sizeof(table->files[0]),
+ GFP_KERNEL_ACCOUNT);
return !!table->files;
}
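The whole change is the allocation flag; for context, a minimal sketch of
what GFP_KERNEL_ACCOUNT buys here (names as in the diff above):
/* GFP_KERNEL_ACCOUNT is GFP_KERNEL | __GFP_ACCOUNT: the allocation is
 * charged to the caller's memory cgroup, so a task registering huge
 * fixed tables is limited by its own memcg instead of consuming
 * unaccounted kernel memory.
 */
table->files = kvcalloc(nr_files, sizeof(table->files[0]),
			GFP_KERNEL_ACCOUNT);
if (!table->files)
	return false;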
The patch below does not apply to the 5.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 0bea96f59ba40e63c0ae93ad6a02417b95f22f4d Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence(a)gmail.com>
Date: Fri, 20 Aug 2021 10:36:36 +0100
Subject: [PATCH] io_uring: place fixed tables under memcg limits
Fixed tables may be quite large, so place all of them, together with the
allocated tags, under memcg limits.
Cc: stable(a)vger.kernel.org
Signed-off-by: Pavel Begunkov <asml.silence(a)gmail.com>
Link: https://lore.kernel.org/r/b3ac9f5da9821bb59837b5fe25e8ef4be982218c.16294516…
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 9a021fe6c461..c6f1afa5ec04 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7138,14 +7138,14 @@ static void **io_alloc_page_table(size_t size)
size_t init_size = size;
void **table;
- table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL);
+ table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
if (!table)
return NULL;
for (i = 0; i < nr_tables; i++) {
unsigned int this_size = min_t(size_t, size, PAGE_SIZE);
- table[i] = kzalloc(this_size, GFP_KERNEL);
+ table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
if (!table[i]) {
io_free_page_table(table, init_size);
return NULL;
@@ -7336,7 +7336,8 @@ static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
{
- table->files = kvcalloc(nr_files, sizeof(table->files[0]), GFP_KERNEL);
+ table->files = kvcalloc(nr_files, sizeof(table->files[0]),
+ GFP_KERNEL_ACCOUNT);
return !!table->files;
}
The patch below does not apply to the 5.13-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 0bea96f59ba40e63c0ae93ad6a02417b95f22f4d Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence(a)gmail.com>
Date: Fri, 20 Aug 2021 10:36:36 +0100
Subject: [PATCH] io_uring: place fixed tables under memcg limits
Fixed tables may be quite large, so place all of them, together with the
allocated tags, under memcg limits.
Cc: stable(a)vger.kernel.org
Signed-off-by: Pavel Begunkov <asml.silence(a)gmail.com>
Link: https://lore.kernel.org/r/b3ac9f5da9821bb59837b5fe25e8ef4be982218c.16294516…
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 9a021fe6c461..c6f1afa5ec04 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7138,14 +7138,14 @@ static void **io_alloc_page_table(size_t size)
size_t init_size = size;
void **table;
- table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL);
+ table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
if (!table)
return NULL;
for (i = 0; i < nr_tables; i++) {
unsigned int this_size = min_t(size_t, size, PAGE_SIZE);
- table[i] = kzalloc(this_size, GFP_KERNEL);
+ table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
if (!table[i]) {
io_free_page_table(table, init_size);
return NULL;
@@ -7336,7 +7336,8 @@ static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
{
- table->files = kvcalloc(nr_files, sizeof(table->files[0]), GFP_KERNEL);
+ table->files = kvcalloc(nr_files, sizeof(table->files[0]),
+ GFP_KERNEL_ACCOUNT);
return !!table->files;
}
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 3a1b8a4e843f96b636431450d8d79061605cf74b Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence(a)gmail.com>
Date: Fri, 20 Aug 2021 10:36:35 +0100
Subject: [PATCH] io_uring: limit fixed table size by RLIMIT_NOFILE
Limit the number of files in io_uring fixed tables by RLIMIT_NOFILE;
that's the first and simplest restriction that we should impose.
Cc: stable(a)vger.kernel.org
Suggested-by: Jens Axboe <axboe(a)kernel.dk>
Signed-off-by: Pavel Begunkov <asml.silence(a)gmail.com>
Link: https://lore.kernel.org/r/b2756c340aed7d6c0b302c26dab50c6c5907f4ce.16294516…
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 706ac8c03b95..9a021fe6c461 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7733,6 +7733,8 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
return -EINVAL;
if (nr_args > IORING_MAX_FIXED_FILES)
return -EMFILE;
+ if (nr_args > rlimit(RLIMIT_NOFILE))
+ return -EMFILE;
ret = io_rsrc_node_switch_start(ctx);
if (ret)
return ret;
This series backports some patches that failed to apply to 4.19-stable
due to the prototype of inode_operations::getattr having changed in
v5.12, as well as several other conflicts. Please apply to 4.19-stable.
Eric Biggers (4):
fscrypt: add fscrypt_symlink_getattr() for computing st_size
ext4: report correct st_size for encrypted symlinks
f2fs: report correct st_size for encrypted symlinks
ubifs: report correct st_size for encrypted symlinks
fs/crypto/hooks.c | 44 +++++++++++++++++++++++++++++++++
fs/ext4/symlink.c | 11 ++++++++-
fs/f2fs/namei.c | 11 ++++++++-
fs/ubifs/file.c | 12 ++++++++-
include/linux/fscrypt_notsupp.h | 6 +++++
include/linux/fscrypt_supp.h | 1 +
6 files changed, 82 insertions(+), 3 deletions(-)
--
2.33.0.153.gba50c8fa24-goog
From: Jaegeuk Kim <jaegeuk(a)kernel.org>
commit dddd3d65293a52c2c3850c19b1e5115712e534d8 upstream.
We must flush all the dirty data when enabling checkpointing back. Let's guarantee
that first by adding retry logic around sync_inodes_sb(). In addition to that,
this patch makes fsync flush data while checkpointing is disabled, which can
mitigate sync_inodes_sb() failures in advance.
Reviewed-by: Chao Yu <chao(a)kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk(a)kernel.org>
---
fs/f2fs/file.c | 5 ++---
fs/f2fs/super.c | 11 ++++++++++-
2 files changed, 12 insertions(+), 4 deletions(-)
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 5c74b2997197..6ee8b1e0e174 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -259,8 +259,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
};
unsigned int seq_id = 0;
- if (unlikely(f2fs_readonly(inode->i_sb) ||
- is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
+ if (unlikely(f2fs_readonly(inode->i_sb)))
return 0;
trace_f2fs_sync_file_enter(inode);
@@ -274,7 +273,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
ret = file_write_and_wait_range(file, start, end);
clear_inode_flag(inode, FI_NEED_IPU);
- if (ret) {
+ if (ret || is_sbi_flag_set(sbi, SBI_CP_DISABLED)) {
trace_f2fs_sync_file_exit(inode, cp_reason, datasync, ret);
return ret;
}
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index c52988067887..476b2c497d28 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1764,8 +1764,17 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi)
{
+ int retry = DEFAULT_RETRY_IO_COUNT;
+
/* we should flush all the data to keep data consistency */
- sync_inodes_sb(sbi->sb);
+ do {
+ sync_inodes_sb(sbi->sb);
+ cond_resched();
+ congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
+ } while (get_pages(sbi, F2FS_DIRTY_DATA) && retry--);
+
+ if (unlikely(retry < 0))
+ f2fs_warn(sbi, "checkpoint=enable has some unwritten data.");
down_write(&sbi->gc_lock);
f2fs_dirty_to_prefree(sbi);
--
2.33.0.153.gba50c8fa24-goog
commit 2f32c147a3816d789722c0bd242a9431332ec3ed upstream.
This patch adds a missing PCI ID to support the wireless device in the
Galaxy Book Flex2 Alpha, which is shipping and in the wild. It has been
tested to work on 5.13+. While it has been shipping in Fedora kernels,
Samsung and a couple of users requested that it be added to stable.
Justin
From: Guillaume Nault <gnault(a)redhat.com>
__peernet2id() can be protected by RCU as it only calls idr_for_each(),
which is RCU-safe, and never modifies the nsid table.
rtnl_net_dumpid() can also do lockless lookups. It does two nested
idr_for_each() calls on nsid tables (one direct call and one indirect
call because of rtnl_net_dumpid_one() calling __peernet2id()). The
netnsid tables are never updated. Therefore it is safe to not take the
nsid_lock and run within an RCU-critical section instead.
Signed-off-by: Guillaume Nault <gnault(a)redhat.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
A nice side-effect of replacing spin_{lock,unlock}_bh() with
rcu_read_{lock,unlock}() in peernet2id() is that it avoids the
situation where SoftIRQs get enabled whilst IRQs are turned off.
From bugzilla.redhat.com/show_bug.cgi?id=1384179 (an ancient
4.9.0-0.rc0 kernel):
dump_stack+0x86/0xc3
__warn+0xcb/0xf0
warn_slowpath_null+0x1d/0x20
__local_bh_enable_ip+0x9d/0xc0
_raw_spin_unlock_bh+0x35/0x40
peernet2id+0x54/0x80
netlink_broadcast_filtered+0x220/0x3c0
netlink_broadcast+0x1d/0x20
audit_log+0x6a/0x90
security_set_bools+0xee/0x200
[]
Note, security_set_bools() calls write_lock_irq(). peernet2id() calls
spin_unlock_bh().
From an internal (UEK) stack trace based on the v4.14.35 kernel (LTS
4.14.231):
queued_spin_lock_slowpath+0xb/0xf
_raw_spin_lock_irqsave+0x46/0x48
send_mad+0x3d2/0x590 [ib_core]
ib_sa_path_rec_get+0x223/0x4d0 [ib_core]
path_rec_start+0xa3/0x140 [ib_ipoib]
ipoib_start_xmit+0x2b0/0x6a0 [ib_ipoib]
dev_hard_start_xmit+0xb2/0x237
sch_direct_xmit+0x114/0x1bf
__dev_queue_xmit+0x592/0x818
dev_queue_xmit+0x10/0x12
arp_xmit+0x38/0xa6
arp_send_dst.part.16+0x61/0x84
arp_process+0x825/0x889
arp_rcv+0x140/0x1c9
__netif_receive_skb_core+0x401/0xb39
__netif_receive_skb+0x18/0x59
netif_receive_skb_internal+0x45/0x119
napi_gro_receive+0xd8/0xf6
ipoib_ib_handle_rx_wc+0x1ca/0x520 [ib_ipoib]
ipoib_poll+0xcd/0x150 [ib_ipoib]
net_rx_action+0x289/0x3f4
__do_softirq+0xe1/0x2b5
do_softirq_own_stack+0x2a/0x35
</IRQ>
do_softirq+0x4d/0x6a
__local_bh_enable_ip+0x57/0x59
_raw_spin_unlock_bh+0x23/0x25
peernet2id+0x51/0x73
netlink_broadcast_filtered+0x223/0x41b
netlink_broadcast+0x1d/0x1f
rdma_nl_multicast+0x22/0x30 [ib_core]
send_mad+0x3e5/0x590 [ib_core]
ib_sa_path_rec_get+0x223/0x4d0 [ib_core]
rdma_resolve_route+0x287/0x810 [rdma_cm]
rds_rdma_cm_event_handler_cmn+0x311/0x7d0 [rds_rdma]
rds_rdma_cm_event_handler_worker+0x22/0x30 [rds_rdma]
process_one_work+0x169/0x3a6
worker_thread+0x4d/0x3e5
kthread+0x105/0x138
ret_from_fork+0x24/0x49
Here, pay attention to ib_nl_make_request(), which calls
spin_lock_irqsave() on a global lock just before calling
rdma_nl_multicast(). Thereafter, peernet2id() enables SoftIRQs, and the
ipoib softirq path runs, follows the same call chain, and ends up trying
to acquire the same global lock again.
(cherry picked from commit 2dce224f469f060b9998a5a869151ef83c08ce77)
Fixes: fba143c66abb ("netns: avoid disabling irq for netns id")
Signed-off-by: Håkon Bugge <haakon.bugge(a)oracle.com>
Conflicts:
net/core/net_namespace.c
* Due to context differences because v5.4 lacks commit
4905294162bd ("netns: Remove __peernet2id_alloc()").
Only comments affected.
---
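For context on the two traces quoted above, a sketch of the calling shape
(request_lock is an illustrative name, not the exact ib_core code):
spin_lock_irqsave(&request_lock, flags);	/* hard IRQs now off */
rdma_nl_multicast(net, skb, group, GFP_ATOMIC);	/* -> netlink_broadcast()
						 *    -> peernet2id() */
spin_unlock_irqrestore(&request_lock, flags);

/*
 * With the old spin_lock_bh()/spin_unlock_bh() pair in peernet2id(),
 * the _bh unlock goes through __local_bh_enable_ip(), which may run
 * pending softirqs right there, while hard IRQs are still disabled and
 * request_lock is still held.  A softirq that ends up on the same path
 * (the ipoib trace) then spins on request_lock forever.  The
 * rcu_read_lock()/rcu_read_unlock() pair has no such side effect.
 */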
net/core/net_namespace.c | 28 ++++++++++------------------
1 file changed, 10 insertions(+), 18 deletions(-)
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index c303873..9bf1551 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -211,9 +211,9 @@ static int net_eq_idr(int id, void *net, void *peer)
return 0;
}
-/* Should be called with nsid_lock held. If a new id is assigned, the bool alloc
- * is set to true, thus the caller knows that the new id must be notified via
- * rtnl.
+/* Must be called from RCU-critical section or with nsid_lock held. If
+ * a new id is assigned, the bool alloc is set to true, thus the
+ * caller knows that the new id must be notified via rtnl.
*/
static int __peernet2id_alloc(struct net *net, struct net *peer, bool *alloc)
{
@@ -237,7 +237,7 @@ static int __peernet2id_alloc(struct net *net, struct net *peer, bool *alloc)
return NETNSA_NSID_NOT_ASSIGNED;
}
-/* should be called with nsid_lock held */
+/* Must be called from RCU-critical section or with nsid_lock held */
static int __peernet2id(struct net *net, struct net *peer)
{
bool no = false;
@@ -281,9 +281,10 @@ int peernet2id(struct net *net, struct net *peer)
{
int id;
- spin_lock_bh(&net->nsid_lock);
+ rcu_read_lock();
id = __peernet2id(net, peer);
- spin_unlock_bh(&net->nsid_lock);
+ rcu_read_unlock();
+
return id;
}
EXPORT_SYMBOL(peernet2id);
@@ -962,6 +963,7 @@ struct rtnl_net_dump_cb {
int s_idx;
};
+/* Runs in RCU-critical section. */
static int rtnl_net_dumpid_one(int id, void *peer, void *data)
{
struct rtnl_net_dump_cb *net_cb = (struct rtnl_net_dump_cb *)data;
@@ -1046,19 +1048,9 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb)
goto end;
}
- spin_lock_bh(&net_cb.tgt_net->nsid_lock);
- if (net_cb.fillargs.add_ref &&
- !net_eq(net_cb.ref_net, net_cb.tgt_net) &&
- !spin_trylock_bh(&net_cb.ref_net->nsid_lock)) {
- spin_unlock_bh(&net_cb.tgt_net->nsid_lock);
- err = -EAGAIN;
- goto end;
- }
+ rcu_read_lock();
idr_for_each(&net_cb.tgt_net->netns_ids, rtnl_net_dumpid_one, &net_cb);
- if (net_cb.fillargs.add_ref &&
- !net_eq(net_cb.ref_net, net_cb.tgt_net))
- spin_unlock_bh(&net_cb.ref_net->nsid_lock);
- spin_unlock_bh(&net_cb.tgt_net->nsid_lock);
+ rcu_read_unlock();
cb->args[0] = net_cb.idx;
end:
--
1.8.3.1