From: Maximilian Heyne <mheyne(a)amazon.de>
Commit fa765c4b4aed2d64266b694520ecb025c862c5a9 upstream
shutdown_pirq and startup_pirq are not taking the
irq_mapping_update_lock because they can't due to lock inversion. Both
are called with the irq_desc->lock being taking. The lock order,
however, is first irq_mapping_update_lock and then irq_desc->lock.
This opens multiple races:
- shutdown_pirq can be interrupted by a function that allocates an event
channel:
CPU0 CPU1
shutdown_pirq {
xen_evtchn_close(e)
__startup_pirq {
EVTCHNOP_bind_pirq
-> returns just freed evtchn e
set_evtchn_to_irq(e, irq)
}
xen_irq_info_cleanup() {
set_evtchn_to_irq(e, -1)
}
}
Assume here event channel e refers here to the same event channel
number.
After this race the evtchn_to_irq mapping for e is invalid (-1).
- __startup_pirq races with __unbind_from_irq in a similar way. Because
__startup_pirq doesn't take irq_mapping_update_lock it can grab the
evtchn that __unbind_from_irq is currently freeing and cleaning up. In
this case even though the event channel is allocated, its mapping can
be unset in evtchn_to_irq.
The fix is to first cleanup the mappings and then close the event
channel. In this way, when an event channel gets allocated it's
potential previous evtchn_to_irq mappings are guaranteed to be unset already.
This is also the reverse order of the allocation where first the event
channel is allocated and then the mappings are setup.
On a 5.10 kernel prior to commit 3fcdaf3d7634 ("xen/events: modify internal
[un]bind interfaces"), we hit a BUG like the following during probing of NVMe
devices. The issue is that during nvme_setup_io_queues, pci_free_irq
is called for every device which results in a call to shutdown_pirq.
With many nvme devices it's therefore likely to hit this race during
boot because there will be multiple calls to shutdown_pirq and
startup_pirq are running potentially in parallel.
------------[ cut here ]------------
blkfront: xvda: barrier or flush: disabled; persistent grants: enabled; indirect descriptors: enabled; bounce buffer: enabled
kernel BUG at drivers/xen/events/events_base.c:499!
invalid opcode: 0000 [#1] SMP PTI
CPU: 44 PID: 375 Comm: kworker/u257:23 Not tainted 5.10.201-191.748.amzn2.x86_64 #1
Hardware name: Xen HVM domU, BIOS 4.11.amazon 08/24/2006
Workqueue: nvme-reset-wq nvme_reset_work
RIP: 0010:bind_evtchn_to_cpu+0xdf/0xf0
Code: 5d 41 5e c3 cc cc cc cc 44 89 f7 e8 2b 55 ad ff 49 89 c5 48 85 c0 0f 84 64 ff ff ff 4c 8b 68 30 41 83 fe ff 0f 85 60 ff ff ff <0f> 0b 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 0f 1f 44 00 00
RSP: 0000:ffffc9000d533b08 EFLAGS: 00010046
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000006
RDX: 0000000000000028 RSI: 00000000ffffffff RDI: 00000000ffffffff
RBP: ffff888107419680 R08: 0000000000000000 R09: ffffffff82d72b00
R10: 0000000000000000 R11: 0000000000000000 R12: 00000000000001ed
R13: 0000000000000000 R14: 00000000ffffffff R15: 0000000000000002
FS: 0000000000000000(0000) GS:ffff88bc8b500000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000000 CR3: 0000000002610001 CR4: 00000000001706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
? show_trace_log_lvl+0x1c1/0x2d9
? show_trace_log_lvl+0x1c1/0x2d9
? set_affinity_irq+0xdc/0x1c0
? __die_body.cold+0x8/0xd
? die+0x2b/0x50
? do_trap+0x90/0x110
? bind_evtchn_to_cpu+0xdf/0xf0
? do_error_trap+0x65/0x80
? bind_evtchn_to_cpu+0xdf/0xf0
? exc_invalid_op+0x4e/0x70
? bind_evtchn_to_cpu+0xdf/0xf0
? asm_exc_invalid_op+0x12/0x20
? bind_evtchn_to_cpu+0xdf/0xf0
? bind_evtchn_to_cpu+0xc5/0xf0
set_affinity_irq+0xdc/0x1c0
irq_do_set_affinity+0x1d7/0x1f0
irq_setup_affinity+0xd6/0x1a0
irq_startup+0x8a/0xf0
__setup_irq+0x639/0x6d0
? nvme_suspend+0x150/0x150
request_threaded_irq+0x10c/0x180
? nvme_suspend+0x150/0x150
pci_request_irq+0xa8/0xf0
? __blk_mq_free_request+0x74/0xa0
queue_request_irq+0x6f/0x80
nvme_create_queue+0x1af/0x200
nvme_create_io_queues+0xbd/0xf0
nvme_setup_io_queues+0x246/0x320
? nvme_irq_check+0x30/0x30
nvme_reset_work+0x1c8/0x400
process_one_work+0x1b0/0x350
worker_thread+0x49/0x310
? process_one_work+0x350/0x350
kthread+0x11b/0x140
? __kthread_bind_mask+0x60/0x60
ret_from_fork+0x22/0x30
Modules linked in:
---[ end trace a11715de1eee1873 ]---
Fixes: d46a78b05c0e ("xen: implement pirq type event channels")
Cc: stable(a)vger.kernel.org
Co-debugged-by: Andrew Panyakin <apanyaki(a)amazon.com>
Signed-off-by: Maximilian Heyne <mheyne(a)amazon.de>
[apanyaki: backport to v5.10-stable]
Signed-off-by: Andrew Paniakin <apanyaki(a)amazon.com>
---
Compare to upstream patch this one does not have close_evtchn flag
because there is no need to handle static event channels.
This feature was added only in 58f6259b7a08f ("xen/evtchn: Introduce new
IOCTL to bind static evtchn")
drivers/xen/events/events_base.c | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c
index 24e39984914f..f608663bdfd5 100644
--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -885,8 +885,8 @@ static void shutdown_pirq(struct irq_data *data)
return;
do_mask(info, EVT_MASK_REASON_EXPLICIT);
- xen_evtchn_close(evtchn);
xen_irq_info_cleanup(info);
+ xen_evtchn_close(evtchn);
}
static void enable_pirq(struct irq_data *data)
@@ -929,8 +929,6 @@ static void __unbind_from_irq(unsigned int irq)
if (VALID_EVTCHN(evtchn)) {
unsigned int cpu = cpu_from_irq(irq);
- xen_evtchn_close(evtchn);
-
switch (type_from_irq(irq)) {
case IRQT_VIRQ:
per_cpu(virq_to_irq, cpu)[virq_from_irq(irq)] = -1;
@@ -943,6 +941,7 @@ static void __unbind_from_irq(unsigned int irq)
}
xen_irq_info_cleanup(info);
+ xen_evtchn_close(evtchn);
}
xen_free_irq(irq);
--
2.40.1
During the handoff from earlycon to the real console driver, we have
two separate drivers operating on the same device concurrently. In the
case of the 8250 driver these concurrent accesses cause problems due
to the driver's use of banked registers, controlled by LCR.DLAB. It is
possible for the setup(), config_port(), pm() and set_mctrl() callbacks
to set DLAB, which can cause the earlycon code that intends to access
TX to instead access DLL, leading to missed output and corruption on
the serial line due to unintended modifications to the baud rate.
In particular, for setup() we have:
univ8250_console_setup()
-> serial8250_console_setup()
-> uart_set_options()
-> serial8250_set_termios()
-> serial8250_do_set_termios()
-> serial8250_do_set_divisor()
For config_port() we have:
serial8250_config_port()
-> autoconfig()
For pm() we have:
serial8250_pm()
-> serial8250_do_pm()
-> serial8250_set_sleep()
For set_mctrl() we have (for some devices):
serial8250_set_mctrl()
-> omap8250_set_mctrl()
-> __omap8250_set_mctrl()
To avoid such problems, let's make it so that the console is locked
during pre-registration calls to these callbacks, which will prevent
the earlycon driver from running concurrently.
Remove the partial solution to this problem in the 8250 driver
that locked the console only during autoconfig_irq(), as this would
result in a deadlock with the new approach. The console continues
to be locked during autoconfig_irq() because it can only be called
through uart_configure_port().
Although this patch introduces more locking than strictly necessary
(and in particular it also locks during the call to rs485_config()
which is not affected by this issue as far as I can tell), it follows
the principle that it is the responsibility of the generic console
code to manage the earlycon handoff by ensuring that earlycon and real
console driver code cannot run concurrently, and not the individual
drivers.
Signed-off-by: Peter Collingbourne <pcc(a)google.com>
Reviewed-by: John Ogness <john.ogness(a)linutronix.de>
Link: https://linux-review.googlesource.com/id/I7cf8124dcebf8618e6b2ee543fa5b2553…
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Cc: stable(a)vger.kernel.org
---
drivers/tty/serial/8250/8250_port.c | 6 ------
drivers/tty/serial/serial_core.c | 12 ++++++++++++
kernel/printk/printk.c | 21 ++++++++++++++++++---
3 files changed, 30 insertions(+), 9 deletions(-)
diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c
index 8ca061d3bbb9..1d65055dde27 100644
--- a/drivers/tty/serial/8250/8250_port.c
+++ b/drivers/tty/serial/8250/8250_port.c
@@ -1329,9 +1329,6 @@ static void autoconfig_irq(struct uart_8250_port *up)
inb_p(ICP);
}
- if (uart_console(port))
- console_lock();
-
/* forget possible initially masked and pending IRQ */
probe_irq_off(probe_irq_on());
save_mcr = serial8250_in_MCR(up);
@@ -1371,9 +1368,6 @@ static void autoconfig_irq(struct uart_8250_port *up)
if (port->flags & UPF_FOURPORT)
outb_p(save_ICP, ICP);
- if (uart_console(port))
- console_unlock();
-
port->irq = (irq > 0) ? irq : 0;
}
diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index d6a58a9e072a..ff85ebd3a007 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -2608,7 +2608,12 @@ uart_configure_port(struct uart_driver *drv, struct uart_state *state,
port->type = PORT_UNKNOWN;
flags |= UART_CONFIG_TYPE;
}
+ /* Synchronize with possible boot console. */
+ if (uart_console(port))
+ console_lock();
port->ops->config_port(port, flags);
+ if (uart_console(port))
+ console_unlock();
}
if (port->type != PORT_UNKNOWN) {
@@ -2616,6 +2621,10 @@ uart_configure_port(struct uart_driver *drv, struct uart_state *state,
uart_report_port(drv, port);
+ /* Synchronize with possible boot console. */
+ if (uart_console(port))
+ console_lock();
+
/* Power up port for set_mctrl() */
uart_change_pm(state, UART_PM_STATE_ON);
@@ -2632,6 +2641,9 @@ uart_configure_port(struct uart_driver *drv, struct uart_state *state,
uart_rs485_config(port);
+ if (uart_console(port))
+ console_unlock();
+
/*
* If this driver supports console, and it hasn't been
* successfully registered yet, try to re-register it.
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index f2444b581e16..f51e4e5a869d 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3263,6 +3263,21 @@ static int __init keep_bootcon_setup(char *str)
early_param("keep_bootcon", keep_bootcon_setup);
+static int console_call_setup(struct console *newcon, char *options)
+{
+ int err;
+
+ if (!newcon->setup)
+ return 0;
+
+ /* Synchronize with possible boot console. */
+ console_lock();
+ err = newcon->setup(newcon, options);
+ console_unlock();
+
+ return err;
+}
+
/*
* This is called by register_console() to try to match
* the newly registered console with any of the ones selected
@@ -3298,8 +3313,8 @@ static int try_enable_preferred_console(struct console *newcon,
if (_braille_register_console(newcon, c))
return 0;
- if (newcon->setup &&
- (err = newcon->setup(newcon, c->options)) != 0)
+ err = console_call_setup(newcon, c->options);
+ if (err != 0)
return err;
}
newcon->flags |= CON_ENABLED;
@@ -3325,7 +3340,7 @@ static void try_enable_default_console(struct console *newcon)
if (newcon->index < 0)
newcon->index = 0;
- if (newcon->setup && newcon->setup(newcon, NULL) != 0)
+ if (console_call_setup(newcon, NULL) != 0)
return;
newcon->flags |= CON_ENABLED;
--
2.44.0.rc1.240.g4c46232300-goog
During the handoff from earlycon to the real console driver, we have
two separate drivers operating on the same device concurrently. In the
case of the 8250 driver these concurrent accesses cause problems due
to the driver's use of banked registers, controlled by LCR.DLAB. It is
possible for the setup(), config_port(), pm() and set_mctrl() callbacks
to set DLAB, which can cause the earlycon code that intends to access
TX to instead access DLL, leading to missed output and corruption on
the serial line due to unintended modifications to the baud rate.
In particular, for setup() we have:
univ8250_console_setup()
-> serial8250_console_setup()
-> uart_set_options()
-> serial8250_set_termios()
-> serial8250_do_set_termios()
-> serial8250_do_set_divisor()
For config_port() we have:
serial8250_config_port()
-> autoconfig()
For pm() we have:
serial8250_pm()
-> serial8250_do_pm()
-> serial8250_set_sleep()
For set_mctrl() we have (for some devices):
serial8250_set_mctrl()
-> omap8250_set_mctrl()
-> __omap8250_set_mctrl()
To avoid such problems, let's make it so that the console is locked
during pre-registration calls to these callbacks, which will prevent
the earlycon driver from running concurrently.
Remove the partial solution to this problem in the 8250 driver
that locked the console only during autoconfig_irq(), as this would
result in a deadlock with the new approach. The console continues
to be locked during autoconfig_irq() because it can only be called
through uart_configure_port().
Although this patch introduces more locking than strictly necessary
(and in particular it also locks during the call to rs485_config()
which is not affected by this issue as far as I can tell), it follows
the principle that it is the responsibility of the generic console
code to manage the earlycon handoff by ensuring that earlycon and real
console driver code cannot run concurrently, and not the individual
drivers.
Signed-off-by: Peter Collingbourne <pcc(a)google.com>
Reviewed-by: John Ogness <john.ogness(a)linutronix.de>
Link: https://linux-review.googlesource.com/id/I7cf8124dcebf8618e6b2ee543fa5b2553…
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Cc: stable(a)vger.kernel.org
---
v3:
- switch to if (err)
v2:
- add some comments
drivers/tty/serial/8250/8250_port.c | 6 ------
drivers/tty/serial/serial_core.c | 12 ++++++++++++
kernel/printk/printk.c | 21 ++++++++++++++++++---
3 files changed, 30 insertions(+), 9 deletions(-)
diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c
index 8ca061d3bbb9..1d65055dde27 100644
--- a/drivers/tty/serial/8250/8250_port.c
+++ b/drivers/tty/serial/8250/8250_port.c
@@ -1329,9 +1329,6 @@ static void autoconfig_irq(struct uart_8250_port *up)
inb_p(ICP);
}
- if (uart_console(port))
- console_lock();
-
/* forget possible initially masked and pending IRQ */
probe_irq_off(probe_irq_on());
save_mcr = serial8250_in_MCR(up);
@@ -1371,9 +1368,6 @@ static void autoconfig_irq(struct uart_8250_port *up)
if (port->flags & UPF_FOURPORT)
outb_p(save_ICP, ICP);
- if (uart_console(port))
- console_unlock();
-
port->irq = (irq > 0) ? irq : 0;
}
diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index d6a58a9e072a..ff85ebd3a007 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -2608,7 +2608,12 @@ uart_configure_port(struct uart_driver *drv, struct uart_state *state,
port->type = PORT_UNKNOWN;
flags |= UART_CONFIG_TYPE;
}
+ /* Synchronize with possible boot console. */
+ if (uart_console(port))
+ console_lock();
port->ops->config_port(port, flags);
+ if (uart_console(port))
+ console_unlock();
}
if (port->type != PORT_UNKNOWN) {
@@ -2616,6 +2621,10 @@ uart_configure_port(struct uart_driver *drv, struct uart_state *state,
uart_report_port(drv, port);
+ /* Synchronize with possible boot console. */
+ if (uart_console(port))
+ console_lock();
+
/* Power up port for set_mctrl() */
uart_change_pm(state, UART_PM_STATE_ON);
@@ -2632,6 +2641,9 @@ uart_configure_port(struct uart_driver *drv, struct uart_state *state,
uart_rs485_config(port);
+ if (uart_console(port))
+ console_unlock();
+
/*
* If this driver supports console, and it hasn't been
* successfully registered yet, try to re-register it.
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index f2444b581e16..89f2aa2e1172 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3263,6 +3263,21 @@ static int __init keep_bootcon_setup(char *str)
early_param("keep_bootcon", keep_bootcon_setup);
+static int console_call_setup(struct console *newcon, char *options)
+{
+ int err;
+
+ if (!newcon->setup)
+ return 0;
+
+ /* Synchronize with possible boot console. */
+ console_lock();
+ err = newcon->setup(newcon, options);
+ console_unlock();
+
+ return err;
+}
+
/*
* This is called by register_console() to try to match
* the newly registered console with any of the ones selected
@@ -3298,8 +3313,8 @@ static int try_enable_preferred_console(struct console *newcon,
if (_braille_register_console(newcon, c))
return 0;
- if (newcon->setup &&
- (err = newcon->setup(newcon, c->options)) != 0)
+ err = console_call_setup(newcon, c->options);
+ if (err)
return err;
}
newcon->flags |= CON_ENABLED;
@@ -3325,7 +3340,7 @@ static void try_enable_default_console(struct console *newcon)
if (newcon->index < 0)
newcon->index = 0;
- if (newcon->setup && newcon->setup(newcon, NULL) != 0)
+ if (console_call_setup(newcon, NULL) != 0)
return;
newcon->flags |= CON_ENABLED;
--
2.44.0.278.ge034bb2e1d-goog
From: Maximilian Heyne <mheyne(a)amazon.de>
Commit fa765c4b4aed2d64266b694520ecb025c862c5a9 upstream
shutdown_pirq and startup_pirq are not taking the
irq_mapping_update_lock because they can't due to lock inversion. Both
are called with the irq_desc->lock being taking. The lock order,
however, is first irq_mapping_update_lock and then irq_desc->lock.
This opens multiple races:
- shutdown_pirq can be interrupted by a function that allocates an event
channel:
CPU0 CPU1
shutdown_pirq {
xen_evtchn_close(e)
__startup_pirq {
EVTCHNOP_bind_pirq
-> returns just freed evtchn e
set_evtchn_to_irq(e, irq)
}
xen_irq_info_cleanup() {
set_evtchn_to_irq(e, -1)
}
}
Assume here event channel e refers here to the same event channel
number.
After this race the evtchn_to_irq mapping for e is invalid (-1).
- __startup_pirq races with __unbind_from_irq in a similar way. Because
__startup_pirq doesn't take irq_mapping_update_lock it can grab the
evtchn that __unbind_from_irq is currently freeing and cleaning up. In
this case even though the event channel is allocated, its mapping can
be unset in evtchn_to_irq.
The fix is to first cleanup the mappings and then close the event
channel. In this way, when an event channel gets allocated it's
potential previous evtchn_to_irq mappings are guaranteed to be unset already.
This is also the reverse order of the allocation where first the event
channel is allocated and then the mappings are setup.
On a 5.10 kernel prior to commit 3fcdaf3d7634 ("xen/events: modify internal
[un]bind interfaces"), we hit a BUG like the following during probing of NVMe
devices. The issue is that during nvme_setup_io_queues, pci_free_irq
is called for every device which results in a call to shutdown_pirq.
With many nvme devices it's therefore likely to hit this race during
boot because there will be multiple calls to shutdown_pirq and
startup_pirq are running potentially in parallel.
------------[ cut here ]------------
blkfront: xvda: barrier or flush: disabled; persistent grants: enabled; indirect descriptors: enabled; bounce buffer: enabled
kernel BUG at drivers/xen/events/events_base.c:499!
invalid opcode: 0000 [#1] SMP PTI
CPU: 44 PID: 375 Comm: kworker/u257:23 Not tainted 5.10.201-191.748.amzn2.x86_64 #1
Hardware name: Xen HVM domU, BIOS 4.11.amazon 08/24/2006
Workqueue: nvme-reset-wq nvme_reset_work
RIP: 0010:bind_evtchn_to_cpu+0xdf/0xf0
Code: 5d 41 5e c3 cc cc cc cc 44 89 f7 e8 2b 55 ad ff 49 89 c5 48 85 c0 0f 84 64 ff ff ff 4c 8b 68 30 41 83 fe ff 0f 85 60 ff ff ff <0f> 0b 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 0f 1f 44 00 00
RSP: 0000:ffffc9000d533b08 EFLAGS: 00010046
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000006
RDX: 0000000000000028 RSI: 00000000ffffffff RDI: 00000000ffffffff
RBP: ffff888107419680 R08: 0000000000000000 R09: ffffffff82d72b00
R10: 0000000000000000 R11: 0000000000000000 R12: 00000000000001ed
R13: 0000000000000000 R14: 00000000ffffffff R15: 0000000000000002
FS: 0000000000000000(0000) GS:ffff88bc8b500000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000000 CR3: 0000000002610001 CR4: 00000000001706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
? show_trace_log_lvl+0x1c1/0x2d9
? show_trace_log_lvl+0x1c1/0x2d9
? set_affinity_irq+0xdc/0x1c0
? __die_body.cold+0x8/0xd
? die+0x2b/0x50
? do_trap+0x90/0x110
? bind_evtchn_to_cpu+0xdf/0xf0
? do_error_trap+0x65/0x80
? bind_evtchn_to_cpu+0xdf/0xf0
? exc_invalid_op+0x4e/0x70
? bind_evtchn_to_cpu+0xdf/0xf0
? asm_exc_invalid_op+0x12/0x20
? bind_evtchn_to_cpu+0xdf/0xf0
? bind_evtchn_to_cpu+0xc5/0xf0
set_affinity_irq+0xdc/0x1c0
irq_do_set_affinity+0x1d7/0x1f0
irq_setup_affinity+0xd6/0x1a0
irq_startup+0x8a/0xf0
__setup_irq+0x639/0x6d0
? nvme_suspend+0x150/0x150
request_threaded_irq+0x10c/0x180
? nvme_suspend+0x150/0x150
pci_request_irq+0xa8/0xf0
? __blk_mq_free_request+0x74/0xa0
queue_request_irq+0x6f/0x80
nvme_create_queue+0x1af/0x200
nvme_create_io_queues+0xbd/0xf0
nvme_setup_io_queues+0x246/0x320
? nvme_irq_check+0x30/0x30
nvme_reset_work+0x1c8/0x400
process_one_work+0x1b0/0x350
worker_thread+0x49/0x310
? process_one_work+0x350/0x350
kthread+0x11b/0x140
? __kthread_bind_mask+0x60/0x60
ret_from_fork+0x22/0x30
Modules linked in:
---[ end trace a11715de1eee1873 ]---
Fixes: d46a78b05c0e ("xen: implement pirq type event channels")
Cc: stable(a)vger.kernel.org
Co-debugged-by: Andrew Panyakin <apanyaki(a)amazon.com>
Signed-off-by: Maximilian Heyne <mheyne(a)amazon.de>
[apanyaki: backport to v5.15-stable]
Signed-off-by: Andrew Paniakin <apanyaki(a)amazon.com>
---
Compare to upstream patch this one does not have close_evtchn flag
because there is no need to handle static event channels.
This feature was added only in 58f6259b7a08f ("xen/evtchn: Introduce new
IOCTL to bind static evtchn")
drivers/xen/events/events_base.c | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c
index ee691b20d4a3..04ff194fecf4 100644
--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -936,8 +936,8 @@ static void shutdown_pirq(struct irq_data *data)
return;
do_mask(info, EVT_MASK_REASON_EXPLICIT);
- xen_evtchn_close(evtchn);
xen_irq_info_cleanup(info);
+ xen_evtchn_close(evtchn);
}
static void enable_pirq(struct irq_data *data)
@@ -981,8 +981,6 @@ static void __unbind_from_irq(unsigned int irq)
unsigned int cpu = cpu_from_irq(irq);
struct xenbus_device *dev;
- xen_evtchn_close(evtchn);
-
switch (type_from_irq(irq)) {
case IRQT_VIRQ:
per_cpu(virq_to_irq, cpu)[virq_from_irq(irq)] = -1;
@@ -1000,6 +998,7 @@ static void __unbind_from_irq(unsigned int irq)
}
xen_irq_info_cleanup(info);
+ xen_evtchn_close(evtchn);
}
xen_free_irq(irq);
--
2.40.1
From: Maximilian Heyne <mheyne(a)amazon.de>
Commit fa765c4b4aed2d64266b694520ecb025c862c5a9 upstream
shutdown_pirq and startup_pirq are not taking the
irq_mapping_update_lock because they can't due to lock inversion. Both
are called with the irq_desc->lock being taking. The lock order,
however, is first irq_mapping_update_lock and then irq_desc->lock.
This opens multiple races:
- shutdown_pirq can be interrupted by a function that allocates an event
channel:
CPU0 CPU1
shutdown_pirq {
xen_evtchn_close(e)
__startup_pirq {
EVTCHNOP_bind_pirq
-> returns just freed evtchn e
set_evtchn_to_irq(e, irq)
}
xen_irq_info_cleanup() {
set_evtchn_to_irq(e, -1)
}
}
Assume here event channel e refers here to the same event channel
number.
After this race the evtchn_to_irq mapping for e is invalid (-1).
- __startup_pirq races with __unbind_from_irq in a similar way. Because
__startup_pirq doesn't take irq_mapping_update_lock it can grab the
evtchn that __unbind_from_irq is currently freeing and cleaning up. In
this case even though the event channel is allocated, its mapping can
be unset in evtchn_to_irq.
The fix is to first cleanup the mappings and then close the event
channel. In this way, when an event channel gets allocated it's
potential previous evtchn_to_irq mappings are guaranteed to be unset already.
This is also the reverse order of the allocation where first the event
channel is allocated and then the mappings are setup.
On a 5.10 kernel prior to commit 3fcdaf3d7634 ("xen/events: modify internal
[un]bind interfaces"), we hit a BUG like the following during probing of NVMe
devices. The issue is that during nvme_setup_io_queues, pci_free_irq
is called for every device which results in a call to shutdown_pirq.
With many nvme devices it's therefore likely to hit this race during
boot because there will be multiple calls to shutdown_pirq and
startup_pirq are running potentially in parallel.
------------[ cut here ]------------
blkfront: xvda: barrier or flush: disabled; persistent grants: enabled; indirect descriptors: enabled; bounce buffer: enabled
kernel BUG at drivers/xen/events/events_base.c:499!
invalid opcode: 0000 [#1] SMP PTI
CPU: 44 PID: 375 Comm: kworker/u257:23 Not tainted 5.10.201-191.748.amzn2.x86_64 #1
Hardware name: Xen HVM domU, BIOS 4.11.amazon 08/24/2006
Workqueue: nvme-reset-wq nvme_reset_work
RIP: 0010:bind_evtchn_to_cpu+0xdf/0xf0
Code: 5d 41 5e c3 cc cc cc cc 44 89 f7 e8 2b 55 ad ff 49 89 c5 48 85 c0 0f 84 64 ff ff ff 4c 8b 68 30 41 83 fe ff 0f 85 60 ff ff ff <0f> 0b 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 0f 1f 44 00 00
RSP: 0000:ffffc9000d533b08 EFLAGS: 00010046
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000006
RDX: 0000000000000028 RSI: 00000000ffffffff RDI: 00000000ffffffff
RBP: ffff888107419680 R08: 0000000000000000 R09: ffffffff82d72b00
R10: 0000000000000000 R11: 0000000000000000 R12: 00000000000001ed
R13: 0000000000000000 R14: 00000000ffffffff R15: 0000000000000002
FS: 0000000000000000(0000) GS:ffff88bc8b500000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000000 CR3: 0000000002610001 CR4: 00000000001706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
? show_trace_log_lvl+0x1c1/0x2d9
? show_trace_log_lvl+0x1c1/0x2d9
? set_affinity_irq+0xdc/0x1c0
? __die_body.cold+0x8/0xd
? die+0x2b/0x50
? do_trap+0x90/0x110
? bind_evtchn_to_cpu+0xdf/0xf0
? do_error_trap+0x65/0x80
? bind_evtchn_to_cpu+0xdf/0xf0
? exc_invalid_op+0x4e/0x70
? bind_evtchn_to_cpu+0xdf/0xf0
? asm_exc_invalid_op+0x12/0x20
? bind_evtchn_to_cpu+0xdf/0xf0
? bind_evtchn_to_cpu+0xc5/0xf0
set_affinity_irq+0xdc/0x1c0
irq_do_set_affinity+0x1d7/0x1f0
irq_setup_affinity+0xd6/0x1a0
irq_startup+0x8a/0xf0
__setup_irq+0x639/0x6d0
? nvme_suspend+0x150/0x150
request_threaded_irq+0x10c/0x180
? nvme_suspend+0x150/0x150
pci_request_irq+0xa8/0xf0
? __blk_mq_free_request+0x74/0xa0
queue_request_irq+0x6f/0x80
nvme_create_queue+0x1af/0x200
nvme_create_io_queues+0xbd/0xf0
nvme_setup_io_queues+0x246/0x320
? nvme_irq_check+0x30/0x30
nvme_reset_work+0x1c8/0x400
process_one_work+0x1b0/0x350
worker_thread+0x49/0x310
? process_one_work+0x350/0x350
kthread+0x11b/0x140
? __kthread_bind_mask+0x60/0x60
ret_from_fork+0x22/0x30
Modules linked in:
---[ end trace a11715de1eee1873 ]---
Fixes: d46a78b05c0e ("xen: implement pirq type event channels")
Cc: stable(a)vger.kernel.org
Co-debugged-by: Andrew Panyakin <apanyaki(a)amazon.com>
Signed-off-by: Maximilian Heyne <mheyne(a)amazon.de>
---
Compare to upstream patch this one does not have close_evtchn flag
because there is no need to handle static event channels.
This feature was added only in 58f6259b7a08f ("xen/evtchn: Introduce new
IOCTL to bind static evtchn")
drivers/xen/events/events_base.c | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c
index 00f8e349921d..96b96516c980 100644
--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -937,8 +937,8 @@ static void shutdown_pirq(struct irq_data *data)
return;
do_mask(info, EVT_MASK_REASON_EXPLICIT);
- xen_evtchn_close(evtchn);
xen_irq_info_cleanup(info);
+ xen_evtchn_close(evtchn);
}
static void enable_pirq(struct irq_data *data)
@@ -982,8 +982,6 @@ static void __unbind_from_irq(unsigned int irq)
unsigned int cpu = cpu_from_irq(irq);
struct xenbus_device *dev;
- xen_evtchn_close(evtchn);
-
switch (type_from_irq(irq)) {
case IRQT_VIRQ:
per_cpu(virq_to_irq, cpu)[virq_from_irq(irq)] = -1;
@@ -1001,6 +999,7 @@ static void __unbind_from_irq(unsigned int irq)
}
xen_irq_info_cleanup(info);
+ xen_evtchn_close(evtchn);
}
xen_free_irq(irq);
--
2.40.1
[BUG]
Currently btrfs can create subvolume with an invalid qgroup inherit
without triggering any error:
# mkfs.btrfs -O quota -f $dev
# mount $dev $mnt
# btrfs subvolume create -i 2/0 $mnt/subv1
# btrfs qgroup show -prce --sync $mnt
Qgroupid Referenced Exclusive Path
-------- ---------- --------- ----
0/5 16.00KiB 16.00KiB <toplevel>
0/256 16.00KiB 16.00KiB subv1
[CAUSE]
We only do a very basic size check for btrfs_qgroup_inherit structure,
but never really verify if the values are correct.
Thus in btrfs_qgroup_inherit() function, we have to skip non-existing
qgroups, and never return any error.
[FIX]
Fix the behavior and introduce extra checks:
- Introduce early check for btrfs_qgroup_inherit structure
Not only the size, but also all the qgroup ids would be verifyed.
And the timing is very early, so we can return error early.
This early check is very important for snapshot creation, as snapshot
is delayed to transaction commit.
- Drop support for btrfs_qgroup_inherit::num_ref_copies and
num_excl_copies
Those two members are used to specify to copy refr/excl numbers from
other qgroups.
This would definitely mark qgroup inconsistent, and btrfs-progs has
dropped the support for them for a long time.
It's time to drop the support for kernel.
- Verify the supported btrfs_qgroup_inherit::flags
Just in case we want to add extra flags for btrfs_qgroup_inherit.
Now above subvolume creation would fail with -ENOENT other than silently
ignore the non-existing qgroup.
CC: stable(a)vger.kernel.org
Signed-off-by: Qu Wenruo <wqu(a)suse.com>
---
fs/btrfs/ioctl.c | 16 +++---------
fs/btrfs/qgroup.c | 52 ++++++++++++++++++++++++++++++++++++++
fs/btrfs/qgroup.h | 3 +++
include/uapi/linux/btrfs.h | 1 +
4 files changed, 59 insertions(+), 13 deletions(-)
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 8b80fbea1e72..c19ce2e292dc 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1382,7 +1382,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
readonly = true;
if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) {
- u64 nums;
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(file_inode(file));
if (vol_args->size < sizeof(*inherit) ||
vol_args->size > PAGE_SIZE) {
@@ -1395,19 +1395,9 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
goto free_args;
}
- if (inherit->num_qgroups > PAGE_SIZE ||
- inherit->num_ref_copies > PAGE_SIZE ||
- inherit->num_excl_copies > PAGE_SIZE) {
- ret = -EINVAL;
+ ret = btrfs_qgroup_check_inherit(fs_info, inherit, vol_args->size);
+ if (ret < 0)
goto free_inherit;
- }
-
- nums = inherit->num_qgroups + 2 * inherit->num_ref_copies +
- 2 * inherit->num_excl_copies;
- if (vol_args->size != struct_size(inherit, qgroups, nums)) {
- ret = -EINVAL;
- goto free_inherit;
- }
}
ret = __btrfs_ioctl_snap_create(file, file_mnt_idmap(file),
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 4fa83c76b37b..66968092b554 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -3046,6 +3046,58 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
return ret;
}
+int btrfs_qgroup_check_inherit(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_inherit *inherit,
+ size_t size)
+{
+ if (inherit->flags & ~BTRFS_QGROUP_INHERIT_FLAGS_SUPP)
+ return -EOPNOTSUPP;
+ if (size < sizeof(*inherit) || size > PAGE_SIZE)
+ return -EINVAL;
+
+ /*
+ * In the past we allow btrfs_qgroup_inherit to specify to copy
+ * refr/excl numbers directly from other qgroups.
+ * This behavior has been disable in btrfs-progs for a very long time,
+ * but here we should also disable them for kernel, as this behavior
+ * is known to mark qgroup inconsistent, and a rescan would wipe out the
+ * change anyway.
+ *
+ * So here we just reject any btrfs_qgroup_inherit with num_ref_copies or
+ * num_excl_copies.
+ */
+ if (inherit->num_ref_copies || inherit->num_excl_copies)
+ return -EINVAL;
+
+ if (inherit->num_qgroups > PAGE_SIZE)
+ return -EINVAL;
+
+ if (size != struct_size(inherit, qgroups, inherit->num_qgroups))
+ return -EINVAL;
+
+ /*
+ * Now check all the remaining qgroups, they should all:
+ * - Exist
+ * - Be higher level qgroups.
+ */
+ for (int i = 0; i < inherit->num_qgroups; i++) {
+ struct btrfs_qgroup *qgroup;
+ u64 qgroupid = inherit->qgroups[i];
+
+ if (btrfs_qgroup_level(qgroupid) == 0)
+ return -EINVAL;
+
+ spin_lock(&fs_info->qgroup_lock);
+ qgroup = find_qgroup_rb(fs_info, qgroupid);
+ if (!qgroup) {
+ spin_unlock(&fs_info->qgroup_lock);
+ return -ENOENT;
+ }
+ spin_unlock(&fs_info->qgroup_lock);
+ }
+ return 0;
+}
+
static int qgroup_auto_inherit(struct btrfs_fs_info *fs_info,
u64 inode_rootid,
struct btrfs_qgroup_inherit **inherit)
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 1f664261c064..706640be0ec2 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -350,6 +350,9 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
struct ulist *new_roots);
int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans);
int btrfs_run_qgroups(struct btrfs_trans_handle *trans);
+int btrfs_qgroup_check_inherit(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_inherit *inherit,
+ size_t size);
int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
u64 objectid, u64 inode_rootid,
struct btrfs_qgroup_inherit *inherit);
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index f8bc34a6bcfa..cdf6ad872149 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -92,6 +92,7 @@ struct btrfs_qgroup_limit {
* struct btrfs_qgroup_inherit.flags
*/
#define BTRFS_QGROUP_INHERIT_SET_LIMITS (1ULL << 0)
+#define BTRFS_QGROUP_INHERIT_FLAGS_SUPP (BTRFS_QGROUP_INHERIT_SET_LIMITS)
struct btrfs_qgroup_inherit {
__u64 flags;
--
2.43.2