The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x fa765c4b4aed2d64266b694520ecb025c862c5a9
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021911-king-sugar-1024@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
fa765c4b4aed ("xen/events: close evtchn after mapping cleanup")
3fcdaf3d7634 ("xen/events: modify internal [un]bind interfaces")
5dd9ad32d775 ("xen/events: drop xen_allocate_irqs_dynamic()")
3bdb0ac350fe ("xen/events: remove some simple helpers from events_base.c")
686464514fbe ("xen/events: reduce externally visible helper functions")
e64e7c74b99e ("xen/events: avoid using info_for_irq() in xen_send_IPI_one()")
9e90e58c11b7 ("xen: evtchn: Allow shared registration of IRQ handers")
87797fad6cce ("xen/events: replace evtchn_rwlock with RCU")
58f6259b7a08 ("xen/evtchn: Introduce new IOCTL to bind static evtchn")
04d684875b30 ("xen: xen_debug_interrupt prototype to global header")
073352e951f6 ("genirq: Add and use an irq_data_update_affinity helper")
961343d78226 ("genirq: Refactor accessors to use irq_data_get_affinity_mask")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From fa765c4b4aed2d64266b694520ecb025c862c5a9 Mon Sep 17 00:00:00 2001
From: Maximilian Heyne <mheyne(a)amazon.de>
Date: Wed, 24 Jan 2024 16:31:28 +0000
Subject: [PATCH] xen/events: close evtchn after mapping cleanup
shutdown_pirq and startup_pirq are not taking the
irq_mapping_update_lock because they can't due to lock inversion. Both
are called with the irq_desc->lock being taking. The lock order,
however, is first irq_mapping_update_lock and then irq_desc->lock.
This opens multiple races:
- shutdown_pirq can be interrupted by a function that allocates an event
channel:
CPU0 CPU1
shutdown_pirq {
xen_evtchn_close(e)
__startup_pirq {
EVTCHNOP_bind_pirq
-> returns just freed evtchn e
set_evtchn_to_irq(e, irq)
}
xen_irq_info_cleanup() {
set_evtchn_to_irq(e, -1)
}
}
Assume here event channel e refers here to the same event channel
number.
After this race the evtchn_to_irq mapping for e is invalid (-1).
- __startup_pirq races with __unbind_from_irq in a similar way. Because
__startup_pirq doesn't take irq_mapping_update_lock it can grab the
evtchn that __unbind_from_irq is currently freeing and cleaning up. In
this case even though the event channel is allocated, its mapping can
be unset in evtchn_to_irq.
The fix is to first cleanup the mappings and then close the event
channel. In this way, when an event channel gets allocated it's
potential previous evtchn_to_irq mappings are guaranteed to be unset already.
This is also the reverse order of the allocation where first the event
channel is allocated and then the mappings are setup.
On a 5.10 kernel prior to commit 3fcdaf3d7634 ("xen/events: modify internal
[un]bind interfaces"), we hit a BUG like the following during probing of NVMe
devices. The issue is that during nvme_setup_io_queues, pci_free_irq
is called for every device which results in a call to shutdown_pirq.
With many nvme devices it's therefore likely to hit this race during
boot because there will be multiple calls to shutdown_pirq and
startup_pirq are running potentially in parallel.
------------[ cut here ]------------
blkfront: xvda: barrier or flush: disabled; persistent grants: enabled; indirect descriptors: enabled; bounce buffer: enabled
kernel BUG at drivers/xen/events/events_base.c:499!
invalid opcode: 0000 [#1] SMP PTI
CPU: 44 PID: 375 Comm: kworker/u257:23 Not tainted 5.10.201-191.748.amzn2.x86_64 #1
Hardware name: Xen HVM domU, BIOS 4.11.amazon 08/24/2006
Workqueue: nvme-reset-wq nvme_reset_work
RIP: 0010:bind_evtchn_to_cpu+0xdf/0xf0
Code: 5d 41 5e c3 cc cc cc cc 44 89 f7 e8 2b 55 ad ff 49 89 c5 48 85 c0 0f 84 64 ff ff ff 4c 8b 68 30 41 83 fe ff 0f 85 60 ff ff ff <0f> 0b 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 0f 1f 44 00 00
RSP: 0000:ffffc9000d533b08 EFLAGS: 00010046
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000006
RDX: 0000000000000028 RSI: 00000000ffffffff RDI: 00000000ffffffff
RBP: ffff888107419680 R08: 0000000000000000 R09: ffffffff82d72b00
R10: 0000000000000000 R11: 0000000000000000 R12: 00000000000001ed
R13: 0000000000000000 R14: 00000000ffffffff R15: 0000000000000002
FS: 0000000000000000(0000) GS:ffff88bc8b500000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000000 CR3: 0000000002610001 CR4: 00000000001706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
? show_trace_log_lvl+0x1c1/0x2d9
? show_trace_log_lvl+0x1c1/0x2d9
? set_affinity_irq+0xdc/0x1c0
? __die_body.cold+0x8/0xd
? die+0x2b/0x50
? do_trap+0x90/0x110
? bind_evtchn_to_cpu+0xdf/0xf0
? do_error_trap+0x65/0x80
? bind_evtchn_to_cpu+0xdf/0xf0
? exc_invalid_op+0x4e/0x70
? bind_evtchn_to_cpu+0xdf/0xf0
? asm_exc_invalid_op+0x12/0x20
? bind_evtchn_to_cpu+0xdf/0xf0
? bind_evtchn_to_cpu+0xc5/0xf0
set_affinity_irq+0xdc/0x1c0
irq_do_set_affinity+0x1d7/0x1f0
irq_setup_affinity+0xd6/0x1a0
irq_startup+0x8a/0xf0
__setup_irq+0x639/0x6d0
? nvme_suspend+0x150/0x150
request_threaded_irq+0x10c/0x180
? nvme_suspend+0x150/0x150
pci_request_irq+0xa8/0xf0
? __blk_mq_free_request+0x74/0xa0
queue_request_irq+0x6f/0x80
nvme_create_queue+0x1af/0x200
nvme_create_io_queues+0xbd/0xf0
nvme_setup_io_queues+0x246/0x320
? nvme_irq_check+0x30/0x30
nvme_reset_work+0x1c8/0x400
process_one_work+0x1b0/0x350
worker_thread+0x49/0x310
? process_one_work+0x350/0x350
kthread+0x11b/0x140
? __kthread_bind_mask+0x60/0x60
ret_from_fork+0x22/0x30
Modules linked in:
---[ end trace a11715de1eee1873 ]---
Fixes: d46a78b05c0e ("xen: implement pirq type event channels")
Cc: stable(a)vger.kernel.org
Co-debugged-by: Andrew Panyakin <apanyaki(a)amazon.com>
Signed-off-by: Maximilian Heyne <mheyne(a)amazon.de>
Reviewed-by: Juergen Gross <jgross(a)suse.com>
Link: https://lore.kernel.org/r/20240124163130.31324-1-mheyne@amazon.de
Signed-off-by: Juergen Gross <jgross(a)suse.com>
diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c
index b8cfea7812d6..3b9f080109d7 100644
--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -923,8 +923,8 @@ static void shutdown_pirq(struct irq_data *data)
return;
do_mask(info, EVT_MASK_REASON_EXPLICIT);
- xen_evtchn_close(evtchn);
xen_irq_info_cleanup(info);
+ xen_evtchn_close(evtchn);
}
static void enable_pirq(struct irq_data *data)
@@ -956,6 +956,7 @@ EXPORT_SYMBOL_GPL(xen_irq_from_gsi);
static void __unbind_from_irq(struct irq_info *info, unsigned int irq)
{
evtchn_port_t evtchn;
+ bool close_evtchn = false;
if (!info) {
xen_irq_free_desc(irq);
@@ -975,7 +976,7 @@ static void __unbind_from_irq(struct irq_info *info, unsigned int irq)
struct xenbus_device *dev;
if (!info->is_static)
- xen_evtchn_close(evtchn);
+ close_evtchn = true;
switch (info->type) {
case IRQT_VIRQ:
@@ -995,6 +996,9 @@ static void __unbind_from_irq(struct irq_info *info, unsigned int irq)
}
xen_irq_info_cleanup(info);
+
+ if (close_evtchn)
+ xen_evtchn_close(evtchn);
}
xen_free_irq(info);
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x fa765c4b4aed2d64266b694520ecb025c862c5a9
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021908-polymer-backyard-ce35@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
fa765c4b4aed ("xen/events: close evtchn after mapping cleanup")
3fcdaf3d7634 ("xen/events: modify internal [un]bind interfaces")
5dd9ad32d775 ("xen/events: drop xen_allocate_irqs_dynamic()")
3bdb0ac350fe ("xen/events: remove some simple helpers from events_base.c")
686464514fbe ("xen/events: reduce externally visible helper functions")
e64e7c74b99e ("xen/events: avoid using info_for_irq() in xen_send_IPI_one()")
9e90e58c11b7 ("xen: evtchn: Allow shared registration of IRQ handers")
87797fad6cce ("xen/events: replace evtchn_rwlock with RCU")
58f6259b7a08 ("xen/evtchn: Introduce new IOCTL to bind static evtchn")
04d684875b30 ("xen: xen_debug_interrupt prototype to global header")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From fa765c4b4aed2d64266b694520ecb025c862c5a9 Mon Sep 17 00:00:00 2001
From: Maximilian Heyne <mheyne(a)amazon.de>
Date: Wed, 24 Jan 2024 16:31:28 +0000
Subject: [PATCH] xen/events: close evtchn after mapping cleanup
shutdown_pirq and startup_pirq are not taking the
irq_mapping_update_lock because they can't due to lock inversion. Both
are called with the irq_desc->lock being taking. The lock order,
however, is first irq_mapping_update_lock and then irq_desc->lock.
This opens multiple races:
- shutdown_pirq can be interrupted by a function that allocates an event
channel:
CPU0 CPU1
shutdown_pirq {
xen_evtchn_close(e)
__startup_pirq {
EVTCHNOP_bind_pirq
-> returns just freed evtchn e
set_evtchn_to_irq(e, irq)
}
xen_irq_info_cleanup() {
set_evtchn_to_irq(e, -1)
}
}
Assume here event channel e refers here to the same event channel
number.
After this race the evtchn_to_irq mapping for e is invalid (-1).
- __startup_pirq races with __unbind_from_irq in a similar way. Because
__startup_pirq doesn't take irq_mapping_update_lock it can grab the
evtchn that __unbind_from_irq is currently freeing and cleaning up. In
this case even though the event channel is allocated, its mapping can
be unset in evtchn_to_irq.
The fix is to first cleanup the mappings and then close the event
channel. In this way, when an event channel gets allocated it's
potential previous evtchn_to_irq mappings are guaranteed to be unset already.
This is also the reverse order of the allocation where first the event
channel is allocated and then the mappings are setup.
On a 5.10 kernel prior to commit 3fcdaf3d7634 ("xen/events: modify internal
[un]bind interfaces"), we hit a BUG like the following during probing of NVMe
devices. The issue is that during nvme_setup_io_queues, pci_free_irq
is called for every device which results in a call to shutdown_pirq.
With many nvme devices it's therefore likely to hit this race during
boot because there will be multiple calls to shutdown_pirq and
startup_pirq are running potentially in parallel.
------------[ cut here ]------------
blkfront: xvda: barrier or flush: disabled; persistent grants: enabled; indirect descriptors: enabled; bounce buffer: enabled
kernel BUG at drivers/xen/events/events_base.c:499!
invalid opcode: 0000 [#1] SMP PTI
CPU: 44 PID: 375 Comm: kworker/u257:23 Not tainted 5.10.201-191.748.amzn2.x86_64 #1
Hardware name: Xen HVM domU, BIOS 4.11.amazon 08/24/2006
Workqueue: nvme-reset-wq nvme_reset_work
RIP: 0010:bind_evtchn_to_cpu+0xdf/0xf0
Code: 5d 41 5e c3 cc cc cc cc 44 89 f7 e8 2b 55 ad ff 49 89 c5 48 85 c0 0f 84 64 ff ff ff 4c 8b 68 30 41 83 fe ff 0f 85 60 ff ff ff <0f> 0b 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 0f 1f 44 00 00
RSP: 0000:ffffc9000d533b08 EFLAGS: 00010046
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000006
RDX: 0000000000000028 RSI: 00000000ffffffff RDI: 00000000ffffffff
RBP: ffff888107419680 R08: 0000000000000000 R09: ffffffff82d72b00
R10: 0000000000000000 R11: 0000000000000000 R12: 00000000000001ed
R13: 0000000000000000 R14: 00000000ffffffff R15: 0000000000000002
FS: 0000000000000000(0000) GS:ffff88bc8b500000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000000 CR3: 0000000002610001 CR4: 00000000001706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
? show_trace_log_lvl+0x1c1/0x2d9
? show_trace_log_lvl+0x1c1/0x2d9
? set_affinity_irq+0xdc/0x1c0
? __die_body.cold+0x8/0xd
? die+0x2b/0x50
? do_trap+0x90/0x110
? bind_evtchn_to_cpu+0xdf/0xf0
? do_error_trap+0x65/0x80
? bind_evtchn_to_cpu+0xdf/0xf0
? exc_invalid_op+0x4e/0x70
? bind_evtchn_to_cpu+0xdf/0xf0
? asm_exc_invalid_op+0x12/0x20
? bind_evtchn_to_cpu+0xdf/0xf0
? bind_evtchn_to_cpu+0xc5/0xf0
set_affinity_irq+0xdc/0x1c0
irq_do_set_affinity+0x1d7/0x1f0
irq_setup_affinity+0xd6/0x1a0
irq_startup+0x8a/0xf0
__setup_irq+0x639/0x6d0
? nvme_suspend+0x150/0x150
request_threaded_irq+0x10c/0x180
? nvme_suspend+0x150/0x150
pci_request_irq+0xa8/0xf0
? __blk_mq_free_request+0x74/0xa0
queue_request_irq+0x6f/0x80
nvme_create_queue+0x1af/0x200
nvme_create_io_queues+0xbd/0xf0
nvme_setup_io_queues+0x246/0x320
? nvme_irq_check+0x30/0x30
nvme_reset_work+0x1c8/0x400
process_one_work+0x1b0/0x350
worker_thread+0x49/0x310
? process_one_work+0x350/0x350
kthread+0x11b/0x140
? __kthread_bind_mask+0x60/0x60
ret_from_fork+0x22/0x30
Modules linked in:
---[ end trace a11715de1eee1873 ]---
Fixes: d46a78b05c0e ("xen: implement pirq type event channels")
Cc: stable(a)vger.kernel.org
Co-debugged-by: Andrew Panyakin <apanyaki(a)amazon.com>
Signed-off-by: Maximilian Heyne <mheyne(a)amazon.de>
Reviewed-by: Juergen Gross <jgross(a)suse.com>
Link: https://lore.kernel.org/r/20240124163130.31324-1-mheyne@amazon.de
Signed-off-by: Juergen Gross <jgross(a)suse.com>
diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c
index b8cfea7812d6..3b9f080109d7 100644
--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -923,8 +923,8 @@ static void shutdown_pirq(struct irq_data *data)
return;
do_mask(info, EVT_MASK_REASON_EXPLICIT);
- xen_evtchn_close(evtchn);
xen_irq_info_cleanup(info);
+ xen_evtchn_close(evtchn);
}
static void enable_pirq(struct irq_data *data)
@@ -956,6 +956,7 @@ EXPORT_SYMBOL_GPL(xen_irq_from_gsi);
static void __unbind_from_irq(struct irq_info *info, unsigned int irq)
{
evtchn_port_t evtchn;
+ bool close_evtchn = false;
if (!info) {
xen_irq_free_desc(irq);
@@ -975,7 +976,7 @@ static void __unbind_from_irq(struct irq_info *info, unsigned int irq)
struct xenbus_device *dev;
if (!info->is_static)
- xen_evtchn_close(evtchn);
+ close_evtchn = true;
switch (info->type) {
case IRQT_VIRQ:
@@ -995,6 +996,9 @@ static void __unbind_from_irq(struct irq_info *info, unsigned int irq)
}
xen_irq_info_cleanup(info);
+
+ if (close_evtchn)
+ xen_evtchn_close(evtchn);
}
xen_free_irq(info);
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x fa765c4b4aed2d64266b694520ecb025c862c5a9
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021905-anger-exclusion-a2ae@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
fa765c4b4aed ("xen/events: close evtchn after mapping cleanup")
3fcdaf3d7634 ("xen/events: modify internal [un]bind interfaces")
5dd9ad32d775 ("xen/events: drop xen_allocate_irqs_dynamic()")
3bdb0ac350fe ("xen/events: remove some simple helpers from events_base.c")
686464514fbe ("xen/events: reduce externally visible helper functions")
e64e7c74b99e ("xen/events: avoid using info_for_irq() in xen_send_IPI_one()")
9e90e58c11b7 ("xen: evtchn: Allow shared registration of IRQ handers")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From fa765c4b4aed2d64266b694520ecb025c862c5a9 Mon Sep 17 00:00:00 2001
From: Maximilian Heyne <mheyne(a)amazon.de>
Date: Wed, 24 Jan 2024 16:31:28 +0000
Subject: [PATCH] xen/events: close evtchn after mapping cleanup
shutdown_pirq and startup_pirq are not taking the
irq_mapping_update_lock because they can't due to lock inversion. Both
are called with the irq_desc->lock being taking. The lock order,
however, is first irq_mapping_update_lock and then irq_desc->lock.
This opens multiple races:
- shutdown_pirq can be interrupted by a function that allocates an event
channel:
CPU0 CPU1
shutdown_pirq {
xen_evtchn_close(e)
__startup_pirq {
EVTCHNOP_bind_pirq
-> returns just freed evtchn e
set_evtchn_to_irq(e, irq)
}
xen_irq_info_cleanup() {
set_evtchn_to_irq(e, -1)
}
}
Assume here event channel e refers here to the same event channel
number.
After this race the evtchn_to_irq mapping for e is invalid (-1).
- __startup_pirq races with __unbind_from_irq in a similar way. Because
__startup_pirq doesn't take irq_mapping_update_lock it can grab the
evtchn that __unbind_from_irq is currently freeing and cleaning up. In
this case even though the event channel is allocated, its mapping can
be unset in evtchn_to_irq.
The fix is to first cleanup the mappings and then close the event
channel. In this way, when an event channel gets allocated it's
potential previous evtchn_to_irq mappings are guaranteed to be unset already.
This is also the reverse order of the allocation where first the event
channel is allocated and then the mappings are setup.
On a 5.10 kernel prior to commit 3fcdaf3d7634 ("xen/events: modify internal
[un]bind interfaces"), we hit a BUG like the following during probing of NVMe
devices. The issue is that during nvme_setup_io_queues, pci_free_irq
is called for every device which results in a call to shutdown_pirq.
With many nvme devices it's therefore likely to hit this race during
boot because there will be multiple calls to shutdown_pirq and
startup_pirq are running potentially in parallel.
------------[ cut here ]------------
blkfront: xvda: barrier or flush: disabled; persistent grants: enabled; indirect descriptors: enabled; bounce buffer: enabled
kernel BUG at drivers/xen/events/events_base.c:499!
invalid opcode: 0000 [#1] SMP PTI
CPU: 44 PID: 375 Comm: kworker/u257:23 Not tainted 5.10.201-191.748.amzn2.x86_64 #1
Hardware name: Xen HVM domU, BIOS 4.11.amazon 08/24/2006
Workqueue: nvme-reset-wq nvme_reset_work
RIP: 0010:bind_evtchn_to_cpu+0xdf/0xf0
Code: 5d 41 5e c3 cc cc cc cc 44 89 f7 e8 2b 55 ad ff 49 89 c5 48 85 c0 0f 84 64 ff ff ff 4c 8b 68 30 41 83 fe ff 0f 85 60 ff ff ff <0f> 0b 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 0f 1f 44 00 00
RSP: 0000:ffffc9000d533b08 EFLAGS: 00010046
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000006
RDX: 0000000000000028 RSI: 00000000ffffffff RDI: 00000000ffffffff
RBP: ffff888107419680 R08: 0000000000000000 R09: ffffffff82d72b00
R10: 0000000000000000 R11: 0000000000000000 R12: 00000000000001ed
R13: 0000000000000000 R14: 00000000ffffffff R15: 0000000000000002
FS: 0000000000000000(0000) GS:ffff88bc8b500000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000000 CR3: 0000000002610001 CR4: 00000000001706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
? show_trace_log_lvl+0x1c1/0x2d9
? show_trace_log_lvl+0x1c1/0x2d9
? set_affinity_irq+0xdc/0x1c0
? __die_body.cold+0x8/0xd
? die+0x2b/0x50
? do_trap+0x90/0x110
? bind_evtchn_to_cpu+0xdf/0xf0
? do_error_trap+0x65/0x80
? bind_evtchn_to_cpu+0xdf/0xf0
? exc_invalid_op+0x4e/0x70
? bind_evtchn_to_cpu+0xdf/0xf0
? asm_exc_invalid_op+0x12/0x20
? bind_evtchn_to_cpu+0xdf/0xf0
? bind_evtchn_to_cpu+0xc5/0xf0
set_affinity_irq+0xdc/0x1c0
irq_do_set_affinity+0x1d7/0x1f0
irq_setup_affinity+0xd6/0x1a0
irq_startup+0x8a/0xf0
__setup_irq+0x639/0x6d0
? nvme_suspend+0x150/0x150
request_threaded_irq+0x10c/0x180
? nvme_suspend+0x150/0x150
pci_request_irq+0xa8/0xf0
? __blk_mq_free_request+0x74/0xa0
queue_request_irq+0x6f/0x80
nvme_create_queue+0x1af/0x200
nvme_create_io_queues+0xbd/0xf0
nvme_setup_io_queues+0x246/0x320
? nvme_irq_check+0x30/0x30
nvme_reset_work+0x1c8/0x400
process_one_work+0x1b0/0x350
worker_thread+0x49/0x310
? process_one_work+0x350/0x350
kthread+0x11b/0x140
? __kthread_bind_mask+0x60/0x60
ret_from_fork+0x22/0x30
Modules linked in:
---[ end trace a11715de1eee1873 ]---
Fixes: d46a78b05c0e ("xen: implement pirq type event channels")
Cc: stable(a)vger.kernel.org
Co-debugged-by: Andrew Panyakin <apanyaki(a)amazon.com>
Signed-off-by: Maximilian Heyne <mheyne(a)amazon.de>
Reviewed-by: Juergen Gross <jgross(a)suse.com>
Link: https://lore.kernel.org/r/20240124163130.31324-1-mheyne@amazon.de
Signed-off-by: Juergen Gross <jgross(a)suse.com>
diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c
index b8cfea7812d6..3b9f080109d7 100644
--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -923,8 +923,8 @@ static void shutdown_pirq(struct irq_data *data)
return;
do_mask(info, EVT_MASK_REASON_EXPLICIT);
- xen_evtchn_close(evtchn);
xen_irq_info_cleanup(info);
+ xen_evtchn_close(evtchn);
}
static void enable_pirq(struct irq_data *data)
@@ -956,6 +956,7 @@ EXPORT_SYMBOL_GPL(xen_irq_from_gsi);
static void __unbind_from_irq(struct irq_info *info, unsigned int irq)
{
evtchn_port_t evtchn;
+ bool close_evtchn = false;
if (!info) {
xen_irq_free_desc(irq);
@@ -975,7 +976,7 @@ static void __unbind_from_irq(struct irq_info *info, unsigned int irq)
struct xenbus_device *dev;
if (!info->is_static)
- xen_evtchn_close(evtchn);
+ close_evtchn = true;
switch (info->type) {
case IRQT_VIRQ:
@@ -995,6 +996,9 @@ static void __unbind_from_irq(struct irq_info *info, unsigned int irq)
}
xen_irq_info_cleanup(info);
+
+ if (close_evtchn)
+ xen_evtchn_close(evtchn);
}
xen_free_irq(info);
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x f814bdda774c183b0cc15ec8f3b6e7c6f4527ba5
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021925-onscreen-cancel-9be1@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
f814bdda774c ("blk-wbt: Fix detection of dirty-throttled tasks")
ba91c849fa50 ("blk-rq-qos: store a gendisk instead of request_queue in struct rq_qos")
3963d84df797 ("blk-rq-qos: constify rq_qos_ops")
ce57b558604e ("blk-rq-qos: make rq_qos_add and rq_qos_del more useful")
b494f9c566ba ("blk-rq-qos: move rq_qos_add and rq_qos_del out of line")
de185b56e8a6 ("blk-cgroup: pass a gendisk to blkcg_schedule_throttle")
9df3e65139b9 ("blk-iocost: simplify ioc_name")
14a6e2eb7df5 ("block: don't allow the same type rq_qos add more than once")
5cf9c91ba927 ("block: serialize all debugfs operations using q->debugfs_mutex")
8a177a36da6c ("blk-iolatency: Fix inflight count imbalances and IO hangs on offline")
c97ab271576d ("blk-cgroup: remove unneeded includes from <linux/blk-cgroup.h>")
7f20ba7c42fd ("blk-cgroup: remove pointless CONFIG_BLOCK ifdefs")
bbb1ebe7a909 ("blk-cgroup: replace bio_blkcg with bio_blkcg_css")
dec223c92a46 ("blk-cgroup: move struct blkcg to block/blk-cgroup.h")
397c9f46ee4d ("blk-cgroup: move blkcg_{pin,unpin}_online out of line")
216889aad362 ("blk-cgroup: move blk_cgroup_congested out line")
d589ae0d4460 ("Merge tag 'for-5.18/block-2022-04-01' of git://git.kernel.dk/linux-block")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f814bdda774c183b0cc15ec8f3b6e7c6f4527ba5 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack(a)suse.cz>
Date: Tue, 23 Jan 2024 18:58:26 +0100
Subject: [PATCH] blk-wbt: Fix detection of dirty-throttled tasks
The detection of dirty-throttled tasks in blk-wbt has been subtly broken
since its beginning in 2016. Namely if we are doing cgroup writeback and
the throttled task is not in the root cgroup, balance_dirty_pages() will
set dirty_sleep for the non-root bdi_writeback structure. However
blk-wbt checks dirty_sleep only in the root cgroup bdi_writeback
structure. Thus detection of recently throttled tasks is not working in
this case (we noticed this when we switched to cgroup v2 and suddently
writeback was slow).
Since blk-wbt has no easy way to get to proper bdi_writeback and
furthermore its intention has always been to work on the whole device
rather than on individual cgroups, just move the dirty_sleep timestamp
from bdi_writeback to backing_dev_info. That fixes the checking for
recently throttled task and saves memory for everybody as a bonus.
CC: stable(a)vger.kernel.org
Fixes: b57d74aff9ab ("writeback: track if we're sleeping on progress in balance_dirty_pages()")
Signed-off-by: Jan Kara <jack(a)suse.cz>
Link: https://lore.kernel.org/r/20240123175826.21452-1-jack@suse.cz
[axboe: fixup indentation errors]
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 5ba3cd574eac..0c0e270a8265 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -163,9 +163,9 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
*/
static bool wb_recent_wait(struct rq_wb *rwb)
{
- struct bdi_writeback *wb = &rwb->rqos.disk->bdi->wb;
+ struct backing_dev_info *bdi = rwb->rqos.disk->bdi;
- return time_before(jiffies, wb->dirty_sleep + HZ);
+ return time_before(jiffies, bdi->last_bdp_sleep + HZ);
}
static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb,
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index ae12696ec492..2ad261082bba 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -141,8 +141,6 @@ struct bdi_writeback {
struct delayed_work dwork; /* work item used for writeback */
struct delayed_work bw_dwork; /* work item used for bandwidth estimate */
- unsigned long dirty_sleep; /* last wait */
-
struct list_head bdi_node; /* anchored at bdi->wb_list */
#ifdef CONFIG_CGROUP_WRITEBACK
@@ -179,6 +177,11 @@ struct backing_dev_info {
* any dirty wbs, which is depended upon by bdi_has_dirty().
*/
atomic_long_t tot_write_bandwidth;
+ /*
+ * Jiffies when last process was dirty throttled on this bdi. Used by
+ * blk-wbt.
+ */
+ unsigned long last_bdp_sleep;
struct bdi_writeback wb; /* the root writeback info for this bdi */
struct list_head wb_list; /* list of all wbs */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 1e3447bccdb1..e039d05304dd 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -436,7 +436,6 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
INIT_LIST_HEAD(&wb->work_list);
INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn);
- wb->dirty_sleep = jiffies;
err = fprop_local_init_percpu(&wb->completions, gfp);
if (err)
@@ -921,6 +920,7 @@ int bdi_init(struct backing_dev_info *bdi)
INIT_LIST_HEAD(&bdi->bdi_list);
INIT_LIST_HEAD(&bdi->wb_list);
init_waitqueue_head(&bdi->wb_waitq);
+ bdi->last_bdp_sleep = jiffies;
return cgwb_bdi_init(bdi);
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index cd4e4ae77c40..cc37fa7f3364 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1921,7 +1921,7 @@ static int balance_dirty_pages(struct bdi_writeback *wb,
break;
}
__set_current_state(TASK_KILLABLE);
- wb->dirty_sleep = now;
+ bdi->last_bdp_sleep = jiffies;
io_schedule_timeout(pause);
current->dirty_paused_when = now + pause;
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x f814bdda774c183b0cc15ec8f3b6e7c6f4527ba5
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021924-handcart-displease-1607@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
f814bdda774c ("blk-wbt: Fix detection of dirty-throttled tasks")
ba91c849fa50 ("blk-rq-qos: store a gendisk instead of request_queue in struct rq_qos")
3963d84df797 ("blk-rq-qos: constify rq_qos_ops")
ce57b558604e ("blk-rq-qos: make rq_qos_add and rq_qos_del more useful")
b494f9c566ba ("blk-rq-qos: move rq_qos_add and rq_qos_del out of line")
de185b56e8a6 ("blk-cgroup: pass a gendisk to blkcg_schedule_throttle")
9df3e65139b9 ("blk-iocost: simplify ioc_name")
14a6e2eb7df5 ("block: don't allow the same type rq_qos add more than once")
5cf9c91ba927 ("block: serialize all debugfs operations using q->debugfs_mutex")
8a177a36da6c ("blk-iolatency: Fix inflight count imbalances and IO hangs on offline")
c97ab271576d ("blk-cgroup: remove unneeded includes from <linux/blk-cgroup.h>")
7f20ba7c42fd ("blk-cgroup: remove pointless CONFIG_BLOCK ifdefs")
bbb1ebe7a909 ("blk-cgroup: replace bio_blkcg with bio_blkcg_css")
dec223c92a46 ("blk-cgroup: move struct blkcg to block/blk-cgroup.h")
397c9f46ee4d ("blk-cgroup: move blkcg_{pin,unpin}_online out of line")
216889aad362 ("blk-cgroup: move blk_cgroup_congested out line")
d589ae0d4460 ("Merge tag 'for-5.18/block-2022-04-01' of git://git.kernel.dk/linux-block")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f814bdda774c183b0cc15ec8f3b6e7c6f4527ba5 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack(a)suse.cz>
Date: Tue, 23 Jan 2024 18:58:26 +0100
Subject: [PATCH] blk-wbt: Fix detection of dirty-throttled tasks
The detection of dirty-throttled tasks in blk-wbt has been subtly broken
since its beginning in 2016. Namely if we are doing cgroup writeback and
the throttled task is not in the root cgroup, balance_dirty_pages() will
set dirty_sleep for the non-root bdi_writeback structure. However
blk-wbt checks dirty_sleep only in the root cgroup bdi_writeback
structure. Thus detection of recently throttled tasks is not working in
this case (we noticed this when we switched to cgroup v2 and suddently
writeback was slow).
Since blk-wbt has no easy way to get to proper bdi_writeback and
furthermore its intention has always been to work on the whole device
rather than on individual cgroups, just move the dirty_sleep timestamp
from bdi_writeback to backing_dev_info. That fixes the checking for
recently throttled task and saves memory for everybody as a bonus.
CC: stable(a)vger.kernel.org
Fixes: b57d74aff9ab ("writeback: track if we're sleeping on progress in balance_dirty_pages()")
Signed-off-by: Jan Kara <jack(a)suse.cz>
Link: https://lore.kernel.org/r/20240123175826.21452-1-jack@suse.cz
[axboe: fixup indentation errors]
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 5ba3cd574eac..0c0e270a8265 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -163,9 +163,9 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
*/
static bool wb_recent_wait(struct rq_wb *rwb)
{
- struct bdi_writeback *wb = &rwb->rqos.disk->bdi->wb;
+ struct backing_dev_info *bdi = rwb->rqos.disk->bdi;
- return time_before(jiffies, wb->dirty_sleep + HZ);
+ return time_before(jiffies, bdi->last_bdp_sleep + HZ);
}
static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb,
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index ae12696ec492..2ad261082bba 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -141,8 +141,6 @@ struct bdi_writeback {
struct delayed_work dwork; /* work item used for writeback */
struct delayed_work bw_dwork; /* work item used for bandwidth estimate */
- unsigned long dirty_sleep; /* last wait */
-
struct list_head bdi_node; /* anchored at bdi->wb_list */
#ifdef CONFIG_CGROUP_WRITEBACK
@@ -179,6 +177,11 @@ struct backing_dev_info {
* any dirty wbs, which is depended upon by bdi_has_dirty().
*/
atomic_long_t tot_write_bandwidth;
+ /*
+ * Jiffies when last process was dirty throttled on this bdi. Used by
+ * blk-wbt.
+ */
+ unsigned long last_bdp_sleep;
struct bdi_writeback wb; /* the root writeback info for this bdi */
struct list_head wb_list; /* list of all wbs */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 1e3447bccdb1..e039d05304dd 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -436,7 +436,6 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
INIT_LIST_HEAD(&wb->work_list);
INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn);
- wb->dirty_sleep = jiffies;
err = fprop_local_init_percpu(&wb->completions, gfp);
if (err)
@@ -921,6 +920,7 @@ int bdi_init(struct backing_dev_info *bdi)
INIT_LIST_HEAD(&bdi->bdi_list);
INIT_LIST_HEAD(&bdi->wb_list);
init_waitqueue_head(&bdi->wb_waitq);
+ bdi->last_bdp_sleep = jiffies;
return cgwb_bdi_init(bdi);
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index cd4e4ae77c40..cc37fa7f3364 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1921,7 +1921,7 @@ static int balance_dirty_pages(struct bdi_writeback *wb,
break;
}
__set_current_state(TASK_KILLABLE);
- wb->dirty_sleep = now;
+ bdi->last_bdp_sleep = jiffies;
io_schedule_timeout(pause);
current->dirty_paused_when = now + pause;
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x f814bdda774c183b0cc15ec8f3b6e7c6f4527ba5
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021923-onboard-pork-7d99@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
f814bdda774c ("blk-wbt: Fix detection of dirty-throttled tasks")
ba91c849fa50 ("blk-rq-qos: store a gendisk instead of request_queue in struct rq_qos")
3963d84df797 ("blk-rq-qos: constify rq_qos_ops")
ce57b558604e ("blk-rq-qos: make rq_qos_add and rq_qos_del more useful")
b494f9c566ba ("blk-rq-qos: move rq_qos_add and rq_qos_del out of line")
de185b56e8a6 ("blk-cgroup: pass a gendisk to blkcg_schedule_throttle")
9df3e65139b9 ("blk-iocost: simplify ioc_name")
14a6e2eb7df5 ("block: don't allow the same type rq_qos add more than once")
5cf9c91ba927 ("block: serialize all debugfs operations using q->debugfs_mutex")
8a177a36da6c ("blk-iolatency: Fix inflight count imbalances and IO hangs on offline")
c97ab271576d ("blk-cgroup: remove unneeded includes from <linux/blk-cgroup.h>")
7f20ba7c42fd ("blk-cgroup: remove pointless CONFIG_BLOCK ifdefs")
bbb1ebe7a909 ("blk-cgroup: replace bio_blkcg with bio_blkcg_css")
dec223c92a46 ("blk-cgroup: move struct blkcg to block/blk-cgroup.h")
397c9f46ee4d ("blk-cgroup: move blkcg_{pin,unpin}_online out of line")
216889aad362 ("blk-cgroup: move blk_cgroup_congested out line")
d589ae0d4460 ("Merge tag 'for-5.18/block-2022-04-01' of git://git.kernel.dk/linux-block")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f814bdda774c183b0cc15ec8f3b6e7c6f4527ba5 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack(a)suse.cz>
Date: Tue, 23 Jan 2024 18:58:26 +0100
Subject: [PATCH] blk-wbt: Fix detection of dirty-throttled tasks
The detection of dirty-throttled tasks in blk-wbt has been subtly broken
since its beginning in 2016. Namely if we are doing cgroup writeback and
the throttled task is not in the root cgroup, balance_dirty_pages() will
set dirty_sleep for the non-root bdi_writeback structure. However
blk-wbt checks dirty_sleep only in the root cgroup bdi_writeback
structure. Thus detection of recently throttled tasks is not working in
this case (we noticed this when we switched to cgroup v2 and suddently
writeback was slow).
Since blk-wbt has no easy way to get to proper bdi_writeback and
furthermore its intention has always been to work on the whole device
rather than on individual cgroups, just move the dirty_sleep timestamp
from bdi_writeback to backing_dev_info. That fixes the checking for
recently throttled task and saves memory for everybody as a bonus.
CC: stable(a)vger.kernel.org
Fixes: b57d74aff9ab ("writeback: track if we're sleeping on progress in balance_dirty_pages()")
Signed-off-by: Jan Kara <jack(a)suse.cz>
Link: https://lore.kernel.org/r/20240123175826.21452-1-jack@suse.cz
[axboe: fixup indentation errors]
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 5ba3cd574eac..0c0e270a8265 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -163,9 +163,9 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
*/
static bool wb_recent_wait(struct rq_wb *rwb)
{
- struct bdi_writeback *wb = &rwb->rqos.disk->bdi->wb;
+ struct backing_dev_info *bdi = rwb->rqos.disk->bdi;
- return time_before(jiffies, wb->dirty_sleep + HZ);
+ return time_before(jiffies, bdi->last_bdp_sleep + HZ);
}
static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb,
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index ae12696ec492..2ad261082bba 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -141,8 +141,6 @@ struct bdi_writeback {
struct delayed_work dwork; /* work item used for writeback */
struct delayed_work bw_dwork; /* work item used for bandwidth estimate */
- unsigned long dirty_sleep; /* last wait */
-
struct list_head bdi_node; /* anchored at bdi->wb_list */
#ifdef CONFIG_CGROUP_WRITEBACK
@@ -179,6 +177,11 @@ struct backing_dev_info {
* any dirty wbs, which is depended upon by bdi_has_dirty().
*/
atomic_long_t tot_write_bandwidth;
+ /*
+ * Jiffies when last process was dirty throttled on this bdi. Used by
+ * blk-wbt.
+ */
+ unsigned long last_bdp_sleep;
struct bdi_writeback wb; /* the root writeback info for this bdi */
struct list_head wb_list; /* list of all wbs */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 1e3447bccdb1..e039d05304dd 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -436,7 +436,6 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
INIT_LIST_HEAD(&wb->work_list);
INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn);
- wb->dirty_sleep = jiffies;
err = fprop_local_init_percpu(&wb->completions, gfp);
if (err)
@@ -921,6 +920,7 @@ int bdi_init(struct backing_dev_info *bdi)
INIT_LIST_HEAD(&bdi->bdi_list);
INIT_LIST_HEAD(&bdi->wb_list);
init_waitqueue_head(&bdi->wb_waitq);
+ bdi->last_bdp_sleep = jiffies;
return cgwb_bdi_init(bdi);
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index cd4e4ae77c40..cc37fa7f3364 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1921,7 +1921,7 @@ static int balance_dirty_pages(struct bdi_writeback *wb,
break;
}
__set_current_state(TASK_KILLABLE);
- wb->dirty_sleep = now;
+ bdi->last_bdp_sleep = jiffies;
io_schedule_timeout(pause);
current->dirty_paused_when = now + pause;
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x f814bdda774c183b0cc15ec8f3b6e7c6f4527ba5
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021921-unspoiled-despite-59cc@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
f814bdda774c ("blk-wbt: Fix detection of dirty-throttled tasks")
ba91c849fa50 ("blk-rq-qos: store a gendisk instead of request_queue in struct rq_qos")
3963d84df797 ("blk-rq-qos: constify rq_qos_ops")
ce57b558604e ("blk-rq-qos: make rq_qos_add and rq_qos_del more useful")
b494f9c566ba ("blk-rq-qos: move rq_qos_add and rq_qos_del out of line")
de185b56e8a6 ("blk-cgroup: pass a gendisk to blkcg_schedule_throttle")
9df3e65139b9 ("blk-iocost: simplify ioc_name")
14a6e2eb7df5 ("block: don't allow the same type rq_qos add more than once")
5cf9c91ba927 ("block: serialize all debugfs operations using q->debugfs_mutex")
8a177a36da6c ("blk-iolatency: Fix inflight count imbalances and IO hangs on offline")
c97ab271576d ("blk-cgroup: remove unneeded includes from <linux/blk-cgroup.h>")
7f20ba7c42fd ("blk-cgroup: remove pointless CONFIG_BLOCK ifdefs")
bbb1ebe7a909 ("blk-cgroup: replace bio_blkcg with bio_blkcg_css")
dec223c92a46 ("blk-cgroup: move struct blkcg to block/blk-cgroup.h")
397c9f46ee4d ("blk-cgroup: move blkcg_{pin,unpin}_online out of line")
216889aad362 ("blk-cgroup: move blk_cgroup_congested out line")
d589ae0d4460 ("Merge tag 'for-5.18/block-2022-04-01' of git://git.kernel.dk/linux-block")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f814bdda774c183b0cc15ec8f3b6e7c6f4527ba5 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack(a)suse.cz>
Date: Tue, 23 Jan 2024 18:58:26 +0100
Subject: [PATCH] blk-wbt: Fix detection of dirty-throttled tasks
The detection of dirty-throttled tasks in blk-wbt has been subtly broken
since its beginning in 2016. Namely if we are doing cgroup writeback and
the throttled task is not in the root cgroup, balance_dirty_pages() will
set dirty_sleep for the non-root bdi_writeback structure. However
blk-wbt checks dirty_sleep only in the root cgroup bdi_writeback
structure. Thus detection of recently throttled tasks is not working in
this case (we noticed this when we switched to cgroup v2 and suddently
writeback was slow).
Since blk-wbt has no easy way to get to proper bdi_writeback and
furthermore its intention has always been to work on the whole device
rather than on individual cgroups, just move the dirty_sleep timestamp
from bdi_writeback to backing_dev_info. That fixes the checking for
recently throttled task and saves memory for everybody as a bonus.
CC: stable(a)vger.kernel.org
Fixes: b57d74aff9ab ("writeback: track if we're sleeping on progress in balance_dirty_pages()")
Signed-off-by: Jan Kara <jack(a)suse.cz>
Link: https://lore.kernel.org/r/20240123175826.21452-1-jack@suse.cz
[axboe: fixup indentation errors]
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 5ba3cd574eac..0c0e270a8265 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -163,9 +163,9 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
*/
static bool wb_recent_wait(struct rq_wb *rwb)
{
- struct bdi_writeback *wb = &rwb->rqos.disk->bdi->wb;
+ struct backing_dev_info *bdi = rwb->rqos.disk->bdi;
- return time_before(jiffies, wb->dirty_sleep + HZ);
+ return time_before(jiffies, bdi->last_bdp_sleep + HZ);
}
static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb,
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index ae12696ec492..2ad261082bba 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -141,8 +141,6 @@ struct bdi_writeback {
struct delayed_work dwork; /* work item used for writeback */
struct delayed_work bw_dwork; /* work item used for bandwidth estimate */
- unsigned long dirty_sleep; /* last wait */
-
struct list_head bdi_node; /* anchored at bdi->wb_list */
#ifdef CONFIG_CGROUP_WRITEBACK
@@ -179,6 +177,11 @@ struct backing_dev_info {
* any dirty wbs, which is depended upon by bdi_has_dirty().
*/
atomic_long_t tot_write_bandwidth;
+ /*
+ * Jiffies when last process was dirty throttled on this bdi. Used by
+ * blk-wbt.
+ */
+ unsigned long last_bdp_sleep;
struct bdi_writeback wb; /* the root writeback info for this bdi */
struct list_head wb_list; /* list of all wbs */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 1e3447bccdb1..e039d05304dd 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -436,7 +436,6 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
INIT_LIST_HEAD(&wb->work_list);
INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn);
- wb->dirty_sleep = jiffies;
err = fprop_local_init_percpu(&wb->completions, gfp);
if (err)
@@ -921,6 +920,7 @@ int bdi_init(struct backing_dev_info *bdi)
INIT_LIST_HEAD(&bdi->bdi_list);
INIT_LIST_HEAD(&bdi->wb_list);
init_waitqueue_head(&bdi->wb_waitq);
+ bdi->last_bdp_sleep = jiffies;
return cgwb_bdi_init(bdi);
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index cd4e4ae77c40..cc37fa7f3364 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1921,7 +1921,7 @@ static int balance_dirty_pages(struct bdi_writeback *wb,
break;
}
__set_current_state(TASK_KILLABLE);
- wb->dirty_sleep = now;
+ bdi->last_bdp_sleep = jiffies;
io_schedule_timeout(pause);
current->dirty_paused_when = now + pause;
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x f814bdda774c183b0cc15ec8f3b6e7c6f4527ba5
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021920-latitude-quilt-dfa8@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
f814bdda774c ("blk-wbt: Fix detection of dirty-throttled tasks")
ba91c849fa50 ("blk-rq-qos: store a gendisk instead of request_queue in struct rq_qos")
3963d84df797 ("blk-rq-qos: constify rq_qos_ops")
ce57b558604e ("blk-rq-qos: make rq_qos_add and rq_qos_del more useful")
b494f9c566ba ("blk-rq-qos: move rq_qos_add and rq_qos_del out of line")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f814bdda774c183b0cc15ec8f3b6e7c6f4527ba5 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack(a)suse.cz>
Date: Tue, 23 Jan 2024 18:58:26 +0100
Subject: [PATCH] blk-wbt: Fix detection of dirty-throttled tasks
The detection of dirty-throttled tasks in blk-wbt has been subtly broken
since its beginning in 2016. Namely if we are doing cgroup writeback and
the throttled task is not in the root cgroup, balance_dirty_pages() will
set dirty_sleep for the non-root bdi_writeback structure. However
blk-wbt checks dirty_sleep only in the root cgroup bdi_writeback
structure. Thus detection of recently throttled tasks is not working in
this case (we noticed this when we switched to cgroup v2 and suddently
writeback was slow).
Since blk-wbt has no easy way to get to proper bdi_writeback and
furthermore its intention has always been to work on the whole device
rather than on individual cgroups, just move the dirty_sleep timestamp
from bdi_writeback to backing_dev_info. That fixes the checking for
recently throttled task and saves memory for everybody as a bonus.
CC: stable(a)vger.kernel.org
Fixes: b57d74aff9ab ("writeback: track if we're sleeping on progress in balance_dirty_pages()")
Signed-off-by: Jan Kara <jack(a)suse.cz>
Link: https://lore.kernel.org/r/20240123175826.21452-1-jack@suse.cz
[axboe: fixup indentation errors]
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 5ba3cd574eac..0c0e270a8265 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -163,9 +163,9 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
*/
static bool wb_recent_wait(struct rq_wb *rwb)
{
- struct bdi_writeback *wb = &rwb->rqos.disk->bdi->wb;
+ struct backing_dev_info *bdi = rwb->rqos.disk->bdi;
- return time_before(jiffies, wb->dirty_sleep + HZ);
+ return time_before(jiffies, bdi->last_bdp_sleep + HZ);
}
static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb,
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index ae12696ec492..2ad261082bba 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -141,8 +141,6 @@ struct bdi_writeback {
struct delayed_work dwork; /* work item used for writeback */
struct delayed_work bw_dwork; /* work item used for bandwidth estimate */
- unsigned long dirty_sleep; /* last wait */
-
struct list_head bdi_node; /* anchored at bdi->wb_list */
#ifdef CONFIG_CGROUP_WRITEBACK
@@ -179,6 +177,11 @@ struct backing_dev_info {
* any dirty wbs, which is depended upon by bdi_has_dirty().
*/
atomic_long_t tot_write_bandwidth;
+ /*
+ * Jiffies when last process was dirty throttled on this bdi. Used by
+ * blk-wbt.
+ */
+ unsigned long last_bdp_sleep;
struct bdi_writeback wb; /* the root writeback info for this bdi */
struct list_head wb_list; /* list of all wbs */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 1e3447bccdb1..e039d05304dd 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -436,7 +436,6 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
INIT_LIST_HEAD(&wb->work_list);
INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn);
- wb->dirty_sleep = jiffies;
err = fprop_local_init_percpu(&wb->completions, gfp);
if (err)
@@ -921,6 +920,7 @@ int bdi_init(struct backing_dev_info *bdi)
INIT_LIST_HEAD(&bdi->bdi_list);
INIT_LIST_HEAD(&bdi->wb_list);
init_waitqueue_head(&bdi->wb_waitq);
+ bdi->last_bdp_sleep = jiffies;
return cgwb_bdi_init(bdi);
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index cd4e4ae77c40..cc37fa7f3364 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1921,7 +1921,7 @@ static int balance_dirty_pages(struct bdi_writeback *wb,
break;
}
__set_current_state(TASK_KILLABLE);
- wb->dirty_sleep = now;
+ bdi->last_bdp_sleep = jiffies;
io_schedule_timeout(pause);
current->dirty_paused_when = now + pause;