Freezing the request queue from inside sysfs store callbacks may cause a deadlock in combination with the dm-multipath driver and the queue_if_no_path option. Additionally, freezing the request queue slows down system boot on systems where sysfs attributes are set synchronously.
Fix this by removing the blk_mq_freeze_queue() / blk_mq_unfreeze_queue() calls from the store callbacks that do not strictly need these calls. Add the __data_racy annotation to request_queue.rq_timeout to suppress KCSAN data race reports about the rq_timeout reads.
This patch may cause a small delay in applying the new settings.
For all the attributes affected by this patch, I/O will complete correctly whether the old or the new value of the attribute is used.
This patch affects the following sysfs attributes:
* io_poll_delay
* io_timeout
* nomerges
* read_ahead_kb
* rq_affinity
Here is an example of a deadlock triggered by running test srp/002 if this patch is not applied:
task:multipathd
Call Trace:
 <TASK>
 __schedule+0x8c1/0x1bf0
 schedule+0xdd/0x270
 schedule_preempt_disabled+0x1c/0x30
 __mutex_lock+0xb89/0x1650
 mutex_lock_nested+0x1f/0x30
 dm_table_set_restrictions+0x823/0xdf0
 __bind+0x166/0x590
 dm_swap_table+0x2a7/0x490
 do_resume+0x1b1/0x610
 dev_suspend+0x55/0x1a0
 ctl_ioctl+0x3a5/0x7e0
 dm_ctl_ioctl+0x12/0x20
 __x64_sys_ioctl+0x127/0x1a0
 x64_sys_call+0xe2b/0x17d0
 do_syscall_64+0x96/0x3a0
 entry_SYSCALL_64_after_hwframe+0x4b/0x53
 </TASK>
task:(udev-worker)
Call Trace:
 <TASK>
 __schedule+0x8c1/0x1bf0
 schedule+0xdd/0x270
 blk_mq_freeze_queue_wait+0xf2/0x140
 blk_mq_freeze_queue_nomemsave+0x23/0x30
 queue_ra_store+0x14e/0x290
 queue_attr_store+0x23e/0x2c0
 sysfs_kf_write+0xde/0x140
 kernfs_fop_write_iter+0x3b2/0x630
 vfs_write+0x4fd/0x1390
 ksys_write+0xfd/0x230
 __x64_sys_write+0x76/0xc0
 x64_sys_call+0x276/0x17d0
 do_syscall_64+0x96/0x3a0
 entry_SYSCALL_64_after_hwframe+0x4b/0x53
 </TASK>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Ming Lei <ming.lei@redhat.com>
Cc: Nilay Shroff <nilay@linux.ibm.com>
Cc: Martin Wilck <mwilck@suse.com>
Cc: Benjamin Marzinski <bmarzins@redhat.com>
Cc: stable@vger.kernel.org
Fixes: af2814149883 ("block: freeze the queue in queue_attr_store")
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
 block/blk-sysfs.c | 26 ++++++++------------------
 include/linux/blkdev.h | 2 +-
 2 files changed, 9 insertions(+), 19 deletions(-)
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 76c47fe9b8d6..8684c57498cc 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -143,21 +143,22 @@ queue_ra_store(struct gendisk *disk, const char *page, size_t count) { unsigned long ra_kb; ssize_t ret; - unsigned int memflags; struct request_queue *q = disk->queue;
ret = queue_var_store(&ra_kb, page, count); if (ret < 0) return ret; /* - * ->ra_pages is protected by ->limits_lock because it is usually - * calculated from the queue limits by queue_limits_commit_update. + * The ->ra_pages change below is protected by ->limits_lock because it + * is usually calculated from the queue limits by + * queue_limits_commit_update(). + * + * bdi->ra_pages reads are not serialized against bdi->ra_pages writes. + * Use WRITE_ONCE() to write bdi->ra_pages once. */ mutex_lock(&q->limits_lock); - memflags = blk_mq_freeze_queue(q); - disk->bdi->ra_pages = ra_kb >> (PAGE_SHIFT - 10); + WRITE_ONCE(disk->bdi->ra_pages, ra_kb >> (PAGE_SHIFT - 10)); mutex_unlock(&q->limits_lock); - blk_mq_unfreeze_queue(q, memflags);
return ret; } @@ -375,21 +376,18 @@ static ssize_t queue_nomerges_store(struct gendisk *disk, const char *page, size_t count) { unsigned long nm; - unsigned int memflags; struct request_queue *q = disk->queue; ssize_t ret = queue_var_store(&nm, page, count);
if (ret < 0) return ret;
- memflags = blk_mq_freeze_queue(q); blk_queue_flag_clear(QUEUE_FLAG_NOMERGES, q); blk_queue_flag_clear(QUEUE_FLAG_NOXMERGES, q); if (nm == 2) blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q); else if (nm) blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q); - blk_mq_unfreeze_queue(q, memflags);
return ret; } @@ -409,7 +407,6 @@ queue_rq_affinity_store(struct gendisk *disk, const char *page, size_t count) #ifdef CONFIG_SMP struct request_queue *q = disk->queue; unsigned long val; - unsigned int memflags;
ret = queue_var_store(&val, page, count); if (ret < 0) @@ -421,7 +418,6 @@ queue_rq_affinity_store(struct gendisk *disk, const char *page, size_t count) * are accessed individually using atomic test_bit operation. So we * don't grab any lock while updating these flags. */ - memflags = blk_mq_freeze_queue(q); if (val == 2) { blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, q); blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, q); @@ -432,7 +428,6 @@ queue_rq_affinity_store(struct gendisk *disk, const char *page, size_t count) blk_queue_flag_clear(QUEUE_FLAG_SAME_COMP, q); blk_queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q); } - blk_mq_unfreeze_queue(q, memflags); #endif return ret; } @@ -446,11 +441,9 @@ static ssize_t queue_poll_delay_store(struct gendisk *disk, const char *page, static ssize_t queue_poll_store(struct gendisk *disk, const char *page, size_t count) { - unsigned int memflags; ssize_t ret = count; struct request_queue *q = disk->queue;
- memflags = blk_mq_freeze_queue(q); if (!(q->limits.features & BLK_FEAT_POLL)) { ret = -EINVAL; goto out; @@ -459,7 +452,6 @@ static ssize_t queue_poll_store(struct gendisk *disk, const char *page, pr_info_ratelimited("writes to the poll attribute are ignored.\n"); pr_info_ratelimited("please use driver specific parameters instead.\n"); out: - blk_mq_unfreeze_queue(q, memflags); return ret; }
@@ -472,7 +464,7 @@ static ssize_t queue_io_timeout_show(struct gendisk *disk, char *page) static ssize_t queue_io_timeout_store(struct gendisk *disk, const char *page, size_t count) { - unsigned int val, memflags; + unsigned int val; int err; struct request_queue *q = disk->queue;
@@ -480,9 +472,7 @@ static ssize_t queue_io_timeout_store(struct gendisk *disk, const char *page, if (err || val == 0) return -EINVAL;
- memflags = blk_mq_freeze_queue(q); blk_queue_rq_timeout(q, msecs_to_jiffies(val)); - blk_mq_unfreeze_queue(q, memflags);
return count; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2fff8a80dbd2..cb4ba09959ee 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -495,7 +495,7 @@ struct request_queue { */ unsigned long queue_flags;
- unsigned int rq_timeout; + unsigned int __data_racy rq_timeout;
unsigned int queue_depth;
Hi Bart,
On 11/15/25 2:34 AM, Bart Van Assche wrote:
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2fff8a80dbd2..cb4ba09959ee 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -495,7 +495,7 @@ struct request_queue { */ unsigned long queue_flags;
- unsigned int rq_timeout;
+ unsigned int __data_racy rq_timeout;
unsigned int queue_depth;
This change looks good to me. However, as I mentioned earlier, introducing __data_racy would break the kernel build. So are you going to raise a separate bug report to fix it?
AS .tmp_vmlinux2.kallsyms.o LD vmlinux.unstripped BTFIDS vmlinux.unstripped WARN: multiple IDs found for 'task_struct': 116, 10183 - using 116 WARN: multiple IDs found for 'module': 190, 10190 - using 190 WARN: multiple IDs found for 'vm_area_struct': 324, 10227 - using 324 WARN: multiple IDs found for 'inode': 956, 10314 - using 956 WARN: multiple IDs found for 'path': 989, 10344 - using 989 WARN: multiple IDs found for 'file': 765, 10375 - using 765 WARN: multiple IDs found for 'cgroup': 1030, 10409 - using 1030 WARN: multiple IDs found for 'seq_file': 1358, 10593 - using 1358 WARN: multiple IDs found for 'bpf_prog': 2054, 10984 - using 2054 WARN: multiple IDs found for 'bpf_map': 2134, 11012 - using 2134 [...] [...] make[2]: *** [scripts/Makefile.vmlinux:72: vmlinux.unstripped] Error 255 make[2]: *** Deleting file 'vmlinux.unstripped' make[1]: *** [/home/src/linux/Makefile:1242: vmlinux] Error 2 make: *** [Makefile:248: __sub-make] Error 2
Thanks, --Nilay
On 11/17/25 1:01 AM, Nilay Shroff wrote:
This change looks good to me. However, as I mentioned earlier, introducing __data_racy would break the kernel build. So are you going to raise a separate bug report to fix it?
AS .tmp_vmlinux2.kallsyms.o LD vmlinux.unstripped BTFIDS vmlinux.unstripped WARN: multiple IDs found for 'task_struct': 116, 10183 - using 116 WARN: multiple IDs found for 'module': 190, 10190 - using 190 WARN: multiple IDs found for 'vm_area_struct': 324, 10227 - using 324 WARN: multiple IDs found for 'inode': 956, 10314 - using 956 WARN: multiple IDs found for 'path': 989, 10344 - using 989 WARN: multiple IDs found for 'file': 765, 10375 - using 765 WARN: multiple IDs found for 'cgroup': 1030, 10409 - using 1030 WARN: multiple IDs found for 'seq_file': 1358, 10593 - using 1358 WARN: multiple IDs found for 'bpf_prog': 2054, 10984 - using 2054 WARN: multiple IDs found for 'bpf_map': 2134, 11012 - using 2134 [...] [...] make[2]: *** [scripts/Makefile.vmlinux:72: vmlinux.unstripped] Error 255 make[2]: *** Deleting file 'vmlinux.unstripped' make[1]: *** [/home/src/linux/Makefile:1242: vmlinux] Error 2 make: *** [Makefile:248: __sub-make] Error 2
The kernel build is already broken without my patch series. Anyway, I have reported this. In the kernel documentation I found the following:
**Please do NOT report BPF issues to bugzilla.kernel.org since it is a guarantee that the reported issue will be overlooked.**
So I sent an email to the BPF mailing list reporting that the kernel build fails if both CONFIG_DEBUG_INFO_BTF and CONFIG_KCSAN are enabled for Linus' master branch (commit e7c375b18160 ("Merge tag 'vfs-6.18-rc7.fixes' of gitolite.kernel.org:pub/scm/linux/kernel/git/vfs/vfs")). See also https://lore.kernel.org/bpf/2412725b-916c-47bd-91c3-c2d57e3e6c7b@acm.org/.
Bart.
On 11/18/25 2:14 AM, Bart Van Assche wrote:
On 11/17/25 1:01 AM, Nilay Shroff wrote:
This change looks good to me. However, as I mentioned earlier, introducing __data_racy would break the kernel build. So are you going to raise a separate bug report to fix it?
AS .tmp_vmlinux2.kallsyms.o LD vmlinux.unstripped BTFIDS vmlinux.unstripped WARN: multiple IDs found for 'task_struct': 116, 10183 - using 116 WARN: multiple IDs found for 'module': 190, 10190 - using 190 WARN: multiple IDs found for 'vm_area_struct': 324, 10227 - using 324 WARN: multiple IDs found for 'inode': 956, 10314 - using 956 WARN: multiple IDs found for 'path': 989, 10344 - using 989 WARN: multiple IDs found for 'file': 765, 10375 - using 765 WARN: multiple IDs found for 'cgroup': 1030, 10409 - using 1030 WARN: multiple IDs found for 'seq_file': 1358, 10593 - using 1358 WARN: multiple IDs found for 'bpf_prog': 2054, 10984 - using 2054 WARN: multiple IDs found for 'bpf_map': 2134, 11012 - using 2134 [...] [...] make[2]: *** [scripts/Makefile.vmlinux:72: vmlinux.unstripped] Error 255 make[2]: *** Deleting file 'vmlinux.unstripped' make[1]: *** [/home/src/linux/Makefile:1242: vmlinux] Error 2 make: *** [Makefile:248: __sub-make] Error 2
The kernel build is already broken without my patch series. Anyway, I have reported this. In the kernel documentation I found the following:
**Please do NOT report BPF issues to bugzilla.kernel.org since it is a guarantee that the reported issue will be overlooked.**
So I sent an email to the BPF mailing list reporting that the kernel build fails if both CONFIG_DEBUG_INFO_BTF and CONFIG_KCSAN are enabled for Linus' master branch (commit e7c375b18160 ("Merge tag 'vfs-6.18-rc7.fixes' of gitolite.kernel.org:pub/scm/linux/kernel/git/vfs/vfs")). See also https://lore.kernel.org/bpf/2412725b-916c-47bd-91c3-c2d57e3e6c7b@acm.org/.
Okay, sounds good, though I couldn't reproduce this build failure without __data_racy on my platform. Anyway, I will review your patchset.
Thanks, --Nilay
On 11/15/25 2:34 AM, Bart Van Assche wrote:
Freezing the request queue from inside sysfs store callbacks may cause a deadlock in combination with the dm-multipath driver and the queue_if_no_path option. Additionally, freezing the request queue slows down system boot on systems where sysfs attributes are set synchronously.
Fix this by removing the blk_mq_freeze_queue() / blk_mq_unfreeze_queue() calls from the store callbacks that do not strictly need these calls. Add the __data_racy annotation to request_queue.rq_timeout to suppress KCSAN data race reports about the rq_timeout reads.
This patch may cause a small delay in applying the new settings.
For all the attributes affected by this patch, I/O will complete correctly whether the old or the new value of the attribute is used.
This patch affects the following sysfs attributes:
- io_poll_delay
- io_timeout
- nomerges
- read_ahead_kb
- rq_affinity
Looks good to me:
Reviewed-by: Nilay Shroff <nilay@linux.ibm.com>
linux-stable-mirror@lists.linaro.org