With /sys/fs/cgroup/blkio/blkio.throttle.read_bps_device set to throttle  the through put of a USB device (easily triggered when set to 4096),  removing the USB device causes the kernel to panic:  [ 81.053401] Unable to handle kernel paging request at virtual address 3032343c [ 81.060710] pgd = 423a14c2 [ 81.063450] [3032343c] *pgd=00000000 [ 81.067072] Internal error: Oops: 805 [#1] SMP ARM [ 81.071913] Modules linked in: nls_ascii nls_cp437 vfat fat sg sd_mod uas usb_storage iptable_nat nf_nat_ipv4 nf_nat iptable_mangle iptable_filter ip6t_REJECT nf_reject_ipv6 nft_counter ipt_REJECT nf_reject_ipv4 xt_tcpudp xt_state xt_conntrack nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 nft_compat nf_tables nfnetlink btrfs zstd_compress libcrc32c zlib_deflate zstd_decompress xxhash xor ofpart m25p80 dw_hdmi_ahb_audio spi_nor dw_hdmi_cec raid6_pq imx_media_csi(C) imx_media_ic(C) imx_media_capture(C) videobuf2_dma_contig videobuf2_memops videobuf2_v4l2 videobuf2_common imx_media_vdic(C) imx_thermal snd_soc_imx_sgtl5000 snd_soc_sgtl5000 snd_soc_imx_audmux imx6_mipi_csi2(C) ov5640 snd_soc_fsl_ssi imx_pcm_dma imx_pcm_fiq snd_soc_core imx2_wdt snd_pcm_dmaengine snd_pcm snd_timer flexcan can_dev spi_imx [ 81.143117] snd soundcore pwm_imx etnaviv dw_hdmi_imx dw_hdmi imx_media(C) gpu_sched imx_media_common(C) imx_ldb parallel_display v4l2_fwnode cec imxdrm imx_ipu_v3 panel_simple drm_kms_helper evdev drm imx6q_cpufreq fb_sys_fops pwm_bl ip_tables x_tables autofs4 ext4 crc16 mbcache jbd2 crc32c_generic fscrypto ecb cls_cgroup ahci_imx libahci_platform libahci libata scsi_mod ci_hdrc_imx ci_hdrc ulpi ehci_hcd udc_core i2c_imx sdhci_esdhc_imx sdhci_pltfm sdhci usbcore usbmisc_imx phy_mxs_usb anatop_regulator dwc3_haps clk_pwm micrel [ 81.190475] CPU: 3 PID: 0 Comm: swapper/3 Tainted: G C 4.19.0-3-armmp #1 Debian 4.19.20-1co5 [ 81.200309] Hardware name: Freescale i.MX6 Quad/DualLite (Device Tree) [ 81.206907] PC is at expire_timers+0x70/0x15c [ 81.211309] LR is at 
run_timer_softirq+0xb4/0x1dc [ 81.216061] pc : [<c03d7430>] lr : [<c03d75d0>] psr: 200f0193 [ 81.222386] sp : ee991dd8 ip : ee991e10 fp : ee991e0c [ 81.227663] r10: ffffe000 r9 : c1105dc8 r8 : 00000000 [ 81.234650] r7 : 00000200 r6 : ee991e10 r5 : ef6d0480 r4 : ed02a62c [ 81.242951] r3 : 30323438 r2 : ee991e10 r1 : ee991e10 r0 : ef6d0480 [ 81.251238] Flags: nzCv IRQs off FIQs on Mode SVC_32 ISA ARM Segment none [ 81.260212] Control: 10c5387d Table: 1331804a DAC: 00000051 [ 81.267678] Process swapper/3 (pid: 0, stack limit = 0x02c38c69) [ 81.275447] Stack: (0xee991dd8 to 0xee992000) [ 81.281596] 1dc0: c11f7db0 c1105df4 [ 81.291569] 1de0: 00000038 ee991e10 ef6d0480 ee991e10 c1104d00 c1013480 c1105dc8 2e6bd000 [ 81.301529] 1e00: ee991e64 ee991e10 c03d75d0 c03d73cc 30323438 ee991e20 c03da6e0 c03ebb40 [ 81.311471] 1e20: 200f0193 0000000f 00000000 00000012 df137679 6fdeee9d ef6d1808 00000001 [ 81.321400] 1e40: 00000002 c1104084 ffffe000 00000100 ee990000 00000282 ee991ecc ee991e68 [ 81.331299] 1e60: c0302994 c03d7528 00000001 00000010 ee990000 c0b06478 00200042 c0c6ec40 [ 81.341170] 1e80: c1104d00 ffff2a31 0000000a c1017500 c11f7a38 c100f368 c1105df4 c1104080 [ 81.351032] 1ea0: c03c19e0 c10174b8 00000000 00000000 00000001 ee8d8400 ee990000 c0c727f0 [ 81.360855] 1ec0: ee991edc ee991ed0 c0355400 c030285c ee991f04 ee991ee0 c03bc334 c0355330 [ 81.370660] 1ee0: c11067e0 f400010c f4000100 ee991f30 f4001100 ee990000 ee991f2c ee991f08 [ 81.380446] 1f00: c03025a0 c03bc2d0 c030abd4 600f0013 ffffffff ee991f64 00000001 ee990000 [ 81.390212] 1f20: ee991f8c ee991f30 c0301a0c c0302554 00000000 0006313c ef6d0448 c03225c0 [ 81.399968] 1f40: ffffe000 c1105df4 c1105e3c 00000008 00000001 c11f7534 c0c727f0 ee991f8c [ 81.409703] 1f60: ee991f90 ee991f80 c030abd0 c030abd4 600f0013 ffffffff 00000051 00000000 [ 81.419435] 1f80: ee991f9c ee991f90 c0aeefd0 c030ab98 ee991fcc ee991fa0 c038398c c0aeefa4 [ 81.429127] 1fa0: 00000001 00000086 00000003 10c0387d c121a928 1020406a 412fc09a 00000000 [ 
81.438816] 1fc0: ee991fdc ee991fd0 c0383c88 c0383870 ee991ff4 ee991fe0 c0312e80 c0383c6c [ 81.448505] 1fe0: 3e98006a 00000051 00000000 ee991ff8 10302c6c c0312d2c 00000000 00000000 [ 81.458198] [<c03d7430>] (expire_timers) from [<c03d75d0>] (run_timer_softirq+0xb4/0x1dc) [ 81.467904] [<c03d75d0>] (run_timer_softirq) from [<c0302994>] (__do_softirq+0x144/0x390) [ 81.477631] [<c0302994>] (__do_softirq) from [<c0355400>] (irq_exit+0xdc/0x118) [ 81.486487] [<c0355400>] (irq_exit) from [<c03bc334>] (__handle_domain_irq+0x70/0xc4) [ 81.495863] [<c03bc334>] (__handle_domain_irq) from [<c03025a0>] (gic_handle_irq+0x58/0x94) [ 81.505790] [<c03025a0>] (gic_handle_irq) from [<c0301a0c>] (__irq_svc+0x6c/0x90) [ 81.514850] Exception stack(0xee991f30 to 0xee991f78) [ 81.521498] 1f20: 00000000 0006313c ef6d0448 c03225c0 [ 81.531312] 1f40: ffffe000 c1105df4 c1105e3c 00000008 00000001 c11f7534 c0c727f0 ee991f8c [ 81.541142] 1f60: ee991f90 ee991f80 c030abd0 c030abd4 600f0013 ffffffff [ 81.549413] [<c0301a0c>] (__irq_svc) from [<c030abd4>] (arch_cpu_idle+0x48/0x4c) [ 81.558468] [<c030abd4>] (arch_cpu_idle) from [<c0aeefd0>] (default_idle_call+0x38/0x3c) [ 81.568240] [<c0aeefd0>] (default_idle_call) from [<c038398c>] (do_idle+0x128/0x164) [ 81.577651] [<c038398c>] (do_idle) from [<c0383c88>] (cpu_startup_entry+0x28/0x30) [ 81.586895] [<c0383c88>] (cpu_startup_entry) from [<c0312e80>] (secondary_start_kernel+0x160/0x188) [ 81.599226] [<c0312e80>] (secondary_start_kernel) from [<10302c6c>] (0x10302c6c) [ 81.608299] Code: e5943000 e5942004 e3530000 e5823000 (15832004) [ 81.616097] ---[ end trace caf7673330a23b6c ]--- [ 81.622393] Kernel panic - not syncing: Fatal exception in interrupt [ 81.630434] CPU2: stopping [ 81.634784] CPU: 2 PID: 0 Comm: swapper/2 Tainted: G D C 4.19.0-3-armmp #1 Debian 4.19.20-1co5 [ 81.647844] Hardware name: Freescale i.MX6 Quad/DualLite (Device Tree) [ 81.656075] [<c0315cf8>] (unwind_backtrace) from [<c030ee34>] (show_stack+0x20/0x24) [ 81.665525] [<c030ee34>] 
(show_stack) from [<c0ad3300>] (dump_stack+0x94/0xa8) [ 81.674442] [<c0ad3300>] (dump_stack) from [<c0313498>] (handle_IPI+0x364/0x39c) [ 81.683517] [<c0313498>] (handle_IPI) from [<c03025d8>] (gic_handle_irq+0x90/0x94) [ 81.692744] [<c03025d8>] (gic_handle_irq) from [<c0301a0c>] (__irq_svc+0x6c/0x90) [ 81.701883] Exception stack(0xee98ff30 to 0xee98ff78) [ 81.708563] ff20: 00000000 000527c0 ef6bf448 c03225c0 [ 81.718410] ff40: ffffe000 c1105df4 c1105e3c 00000004 00000001 c11f7534 c0c727f0 ee98ff8c [ 81.728241] ff60: ee98ff90 ee98ff80 c030abd0 c030abd4 60070013 ffffffff [ 81.745568] [<c030abd4>] (arch_cpu_idle) from [<c0aeefd0>] (default_idle_call+0x38/0x3c) [ 81.755318] [<c0aeefd0>] (default_idle_call) from [<c038398c>] (do_idle+0x128/0x164) [ 81.764702] [<c038398c>] (do_idle) from [<c0383c88>] (cpu_startup_entry+0x28/0x30) [ 81.773908] [<c0383c88>] (cpu_startup_entry) from [<c0312e80>] (secondary_start_kernel+0x160/0x188) [ 81.786168] [<c0312e80>] (secondary_start_kernel) from [<10302c6c>] (0x10302c6c) [ 81.795213] CPU0: stopping [ 81.799551] CPU: 0 PID: 0 Comm: swapper/0 Tainted: G D C 4.19.0-3-armmp #1 Debian 4.19.20-1co5 [ 81.812590] Hardware name: Freescale i.MX6 Quad/DualLite (Device Tree) [ 81.820790] [<c0315cf8>] (unwind_backtrace) from [<c030ee34>] (show_stack+0x20/0x24) [ 81.830200] [<c030ee34>] (show_stack) from [<c0ad3300>] (dump_stack+0x94/0xa8) [ 81.839078] [<c0ad3300>] (dump_stack) from [<c0313498>] (handle_IPI+0x364/0x39c) [ 81.848108] [<c0313498>] (handle_IPI) from [<c03025d8>] (gic_handle_irq+0x90/0x94) [ 81.857305] [<c03025d8>] (gic_handle_irq) from [<c0301a0c>] (__irq_svc+0x6c/0x90) [ 81.866408] Exception stack(0xc1101ee0 to 0xc1101f28) [ 81.873088] 1ee0: 00000000 00066914 ef69d448 c03225c0 ffffe000 c1105df4 c1105e3c 00000001 [ 81.882924] 1f00: 00000001 c11f7534 c0c727f0 c1101f3c c1101f40 c1101f30 c030abd0 c030abd4 [ 81.892746] 1f20: 60030013 ffffffff [ 81.897853] [<c0301a0c>] (__irq_svc) from [<c030abd4>] (arch_cpu_idle+0x48/0x4c) [ 81.906884] 
[<c030abd4>] (arch_cpu_idle) from [<c0aeefd0>] (default_idle_call+0x38/0x3c) [ 81.916614] [<c0aeefd0>] (default_idle_call) from [<c038398c>] (do_idle+0x128/0x164) [ 81.926012] [<c038398c>] (do_idle) from [<c0383c88>] (cpu_startup_entry+0x28/0x30) [ 81.935254] [<c0383c88>] (cpu_startup_entry) from [<c0ae85ec>] (rest_init+0xb8/0xbc) [ 81.944671] [<c0ae85ec>] (rest_init) from [<c0f01098>] (start_kernel+0x4d4/0x500) [ 81.953804] CPU1: stopping [ 81.958129] CPU: 1 PID: 0 Comm: swapper/1 Tainted: G D C 4.19.0-3-armmp #1 Debian 4.19.20-1co5 [ 81.971109] Hardware name: Freescale i.MX6 Quad/DualLite (Device Tree) [ 81.979303] [<c0315cf8>] (unwind_backtrace) from [<c030ee34>] (show_stack+0x20/0x24) [ 81.988712] [<c030ee34>] (show_stack) from [<c0ad3300>] (dump_stack+0x94/0xa8) [ 81.997602] [<c0ad3300>] (dump_stack) from [<c0313498>] (handle_IPI+0x364/0x39c) [ 82.006650] [<c0313498>] (handle_IPI) from [<c03025d8>] (gic_handle_irq+0x90/0x94) [ 82.015856] [<c03025d8>] (gic_handle_irq) from [<c0301a0c>] (__irq_svc+0x6c/0x90) [ 82.024949] Exception stack(0xee98df30 to 0xee98df78) [ 82.031595] df20: 00000000 00062e98 ef6ae448 c03225c0 [ 82.041400] df40: ffffe000 c1105df4 c1105e3c 00000002 00000001 c11f7534 c0c727f0 ee98df8c [ 82.051197] df60: ee98df90 ee98df80 c030abd0 c030abd4 60010013 ffffffff [ 82.059422] [<c0301a0c>] (__irq_svc) from [<c030abd4>] (arch_cpu_idle+0x48/0x4c) [ 82.068425] [<c030abd4>] (arch_cpu_idle) from [<c0aeefd0>] (default_idle_call+0x38/0x3c) [ 82.078130] [<c0aeefd0>] (default_idle_call) from [<c038398c>] (do_idle+0x128/0x164) [ 82.087486] [<c038398c>] (do_idle) from [<c0383c88>] (cpu_startup_entry+0x28/0x30) [ 82.096673] [<c0383c88>] (cpu_startup_entry) from [<c0312e80>] (secondary_start_kernel+0x160/0x188) [ 82.108947] [<c0312e80>] (secondary_start_kernel) from [<10302c6c>] (0x10302c6c) [ 82.117993] ---[ end Kernel panic - not syncing: Fatal exception in interrupt ]---
This is fixed in the upstream kernel by 47cdee29ef9d. This fix has a few issues addressed by c3e2219216c9 and c326f846ebc2a which have been included in this series.
Tested by setting up the following udev rule:
KERNEL=="sd*", SUBSYSTEMS=="usb", ACTION=="add", ENV{DEVTYPE}=="disk", ENV{MAJOR}=="8", RUN+="/bin/sh -c 'echo $env{MAJOR}:$env{MINOR} 4096 > /sys/fs/cgroup/blkio/blkio.throttle.read_bps_device'"
With the udev rule operational, the test consists of inserting a USB flash drive and removing it after ~5 seconds.
Ming Lei (3): block: move blk_exit_queue into __blk_release_queue block: free sched's request pool in blk_cleanup_queue blk-mq: remove WARN_ON(!q->elevator) from blk_mq_sched_free_requests
block/blk-core.c | 50 ++++++++++++-------------------------------- block/blk-mq-sched.c | 29 ++++++++++++++++++++++--- block/blk-mq-sched.h | 1 + block/blk-sysfs.c | 47 ++++++++++++++++++++++++++++------------- block/blk.h | 11 ++++++++-- block/elevator.c | 2 +- 6 files changed, 82 insertions(+), 58 deletions(-)
From: Ming Lei ming.lei@redhat.com
commit 47cdee29ef9d94e485eb08f962c74943023a5271 upstream.
Commit 498f6650aec8 ("block: Fix a race between the cgroup code and request queue initialization") moves what blk_exit_queue does into blk_cleanup_queue() for fixing issue caused by changing back queue lock.
However, after the legacy request IO path is killed, the driver queue lock won't be used at all, and there is no longer any case for changing back the queue lock. So the issue addressed by commit 498f6650aec8 doesn't exist any more.
So move blk_exit_queue into __blk_release_queue.
This patch basically reverts the following two commits:
498f6650aec8 block: Fix a race between the cgroup code and request queue initialization 24ecc3585348 block: Ensure that a request queue is dissociated from the cgroup controller
Cc: Bart Van Assche bvanassche@acm.org Reviewed-by: Christoph Hellwig hch@lst.de Signed-off-by: Ming Lei ming.lei@redhat.com Signed-off-by: Jens Axboe axboe@kernel.dk (cherry picked/backported from commit 47cdee29ef9d94e485eb08f962c74943023a5271) Signed-off-by: Martyn Welch martyn.welch@collabora.com --- block/blk-core.c | 37 ------------------------------------- block/blk-sysfs.c | 47 ++++++++++++++++++++++++++++++++--------------- block/blk.h | 1 - 3 files changed, 32 insertions(+), 53 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c index ea33d6abdcfc..28df78c5db34 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -716,35 +716,6 @@ void blk_set_queue_dying(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_set_queue_dying);
-/* Unconfigure the I/O scheduler and dissociate from the cgroup controller. */ -void blk_exit_queue(struct request_queue *q) -{ - /* - * Since the I/O scheduler exit code may access cgroup information, - * perform I/O scheduler exit before disassociating from the block - * cgroup controller. - */ - if (q->elevator) { - ioc_clear_queue(q); - elevator_exit(q, q->elevator); - q->elevator = NULL; - } - - /* - * Remove all references to @q from the block cgroup controller before - * restoring @q->queue_lock to avoid that restoring this pointer causes - * e.g. blkcg_print_blkgs() to crash. - */ - blkcg_exit_queue(q); - - /* - * Since the cgroup code may dereference the @q->backing_dev_info - * pointer, only decrease its reference count after having removed the - * association with the block cgroup controller. - */ - bdi_put(q->backing_dev_info); -} - /** * blk_cleanup_queue - shutdown a request queue * @q: request queue to shutdown @@ -810,14 +781,6 @@ void blk_cleanup_queue(struct request_queue *q) del_timer_sync(&q->backing_dev_info->laptop_mode_wb_timer); blk_sync_queue(q);
- /* - * I/O scheduler exit is only safe after the sysfs scheduler attribute - * has been removed. - */ - WARN_ON_ONCE(q->kobj.state_in_sysfs); - - blk_exit_queue(q); - if (q->mq_ops) blk_mq_exit_queue(q);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 8286640d4d66..1d0e5463f3b6 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -794,6 +794,36 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head) kmem_cache_free(blk_requestq_cachep, q); }
+/* Unconfigure the I/O scheduler and dissociate from the cgroup controller. */ +static void blk_exit_queue(struct request_queue *q) +{ + /* + * Since the I/O scheduler exit code may access cgroup information, + * perform I/O scheduler exit before disassociating from the block + * cgroup controller. + */ + if (q->elevator) { + ioc_clear_queue(q); + elevator_exit(q, q->elevator); + q->elevator = NULL; + } + + /* + * Remove all references to @q from the block cgroup controller before + * restoring @q->queue_lock to avoid that restoring this pointer causes + * e.g. blkcg_print_blkgs() to crash. + */ + blkcg_exit_queue(q); + + /* + * Since the cgroup code may dereference the @q->backing_dev_info + * pointer, only decrease its reference count after having removed the + * association with the block cgroup controller. + */ + bdi_put(q->backing_dev_info); +} + + /** * __blk_release_queue - release a request queue when it is no longer needed * @work: pointer to the release_work member of the request queue to be released @@ -819,21 +849,6 @@ static void __blk_release_queue(struct work_struct *work) blk_stat_remove_callback(q, q->poll_cb); blk_stat_free_callback(q->poll_cb);
- if (!blk_queue_dead(q)) { - /* - * Last reference was dropped without having called - * blk_cleanup_queue(). - */ - WARN_ONCE(blk_queue_init_done(q), - "request queue %p has been registered but blk_cleanup_queue() has not been called for that queue\n", - q); - blk_exit_queue(q); - } - - WARN(blk_queue_root_blkg(q), - "request queue %p is being released but it has not yet been removed from the blkcg controller\n", - q); - blk_free_queue_stats(q->stats);
if (q->mq_ops) @@ -844,6 +859,8 @@ static void __blk_release_queue(struct work_struct *work) if (q->queue_tags) __blk_queue_free_tags(q);
+ blk_exit_queue(q); + if (!q->mq_ops) { if (q->exit_rq_fn) q->exit_rq_fn(q, q->fq->flush_rq); diff --git a/block/blk.h b/block/blk.h index 1a5b67b57e6b..9af0b67ef332 100644 --- a/block/blk.h +++ b/block/blk.h @@ -137,7 +137,6 @@ void blk_free_flush_queue(struct blk_flush_queue *q); int blk_init_rl(struct request_list *rl, struct request_queue *q, gfp_t gfp_mask); void blk_exit_rl(struct request_queue *q, struct request_list *rl); -void blk_exit_queue(struct request_queue *q); void blk_rq_bio_prep(struct request_queue *q, struct request *rq, struct bio *bio); void blk_queue_bypass_start(struct request_queue *q);
From: Ming Lei ming.lei@redhat.com
commit c3e2219216c92919a6bd1711f340f5faa98695e6 upstream.
In theory, IO scheduler belongs to request queue, and the request pool of sched tags belongs to the request queue too.
However, the current tags allocation interfaces are re-used for both driver tags and sched tags, and driver tags is definitely host wide, and doesn't belong to any request queue, same with its request pool. So we need tagset instance for freeing request of sched tags.
Meanwhile, blk_mq_free_tag_set() often follows blk_cleanup_queue() in the non-BLK_MQ_F_TAG_SHARED case, which requires that the request pool of sched tags be freed before calling blk_mq_free_tag_set().
Commit 47cdee29ef9d94e ("block: move blk_exit_queue into __blk_release_queue") moves blk_exit_queue into __blk_release_queue for simplifying the fast path in generic_make_request(), which then causes an oops while freeing requests of sched tags in __blk_release_queue().
Fix the above issue by moving the freeing of the request pool of sched tags into blk_cleanup_queue(). This is safe because the queue has been frozen and there are no in-queue requests at that time. Freeing sched tags has to be kept in the queue's release handler because there might be un-completed dispatch activity which might refer to sched tags.
Cc: Bart Van Assche bvanassche@acm.org Cc: Christoph Hellwig hch@lst.de Fixes: 47cdee29ef9d94e485eb08f962c74943023a5271 ("block: move blk_exit_queue into __blk_release_queue") Tested-by: Yi Zhang yi.zhang@redhat.com Reported-by: kernel test robot rong.a.chen@intel.com Signed-off-by: Ming Lei ming.lei@redhat.com Signed-off-by: Jens Axboe axboe@kernel.dk (cherry picked/backported from commit c3e2219216c92919a6bd1711f340f5faa98695e6) Signed-off-by: Martyn Welch martyn.welch@collabora.com --- block/blk-core.c | 13 +++++++++++++ block/blk-mq-sched.c | 30 +++++++++++++++++++++++++++--- block/blk-mq-sched.h | 1 + block/blk-sysfs.c | 2 +- block/blk.h | 10 +++++++++- block/elevator.c | 2 +- 6 files changed, 52 insertions(+), 6 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c index 28df78c5db34..747e87816283 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -784,6 +784,19 @@ void blk_cleanup_queue(struct request_queue *q) if (q->mq_ops) blk_mq_exit_queue(q);
+ /* + * In theory, request pool of sched_tags belongs to request queue. + * However, the current implementation requires tag_set for freeing + * requests, so free the pool now. + * + * Queue has become frozen, there can't be any in-queue requests, so + * it is safe to free requests now. + */ + mutex_lock(&q->sysfs_lock); + if (q->elevator) + blk_mq_sched_free_requests(q); + mutex_unlock(&q->sysfs_lock); + percpu_ref_exit(&q->q_usage_counter);
spin_lock_irq(lock); diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index da1de190a3b1..105f8c124604 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -453,14 +453,18 @@ static int blk_mq_sched_alloc_tags(struct request_queue *q, return ret; }
+/* called in queue's release handler, tagset has gone away */ static void blk_mq_sched_tags_teardown(struct request_queue *q) { - struct blk_mq_tag_set *set = q->tag_set; struct blk_mq_hw_ctx *hctx; int i;
- queue_for_each_hw_ctx(q, hctx, i) - blk_mq_sched_free_tags(set, hctx, i); + queue_for_each_hw_ctx(q, hctx, i) { + if (hctx->sched_tags) { + blk_mq_free_rq_map(hctx->sched_tags); + hctx->sched_tags = NULL; + } + } }
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) @@ -501,6 +505,7 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) ret = e->ops.mq.init_hctx(hctx, i); if (ret) { eq = q->elevator; + blk_mq_sched_free_requests(q); blk_mq_exit_sched(q, eq); kobject_put(&eq->kobj); return ret; @@ -512,11 +517,30 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) return 0;
err: + blk_mq_sched_free_requests(q); blk_mq_sched_tags_teardown(q); q->elevator = NULL; return ret; }
+/* + * called in either blk_queue_cleanup or elevator_switch, tagset + * is required for freeing requests + */ +void blk_mq_sched_free_requests(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + lockdep_assert_held(&q->sysfs_lock); + WARN_ON(!q->elevator); + + queue_for_each_hw_ctx(q, hctx, i) { + if (hctx->sched_tags) + blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i); + } +} + void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) { struct blk_mq_hw_ctx *hctx; diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index fe660764b8d1..d97e13fe8ece 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -28,6 +28,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e); void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e); +void blk_mq_sched_free_requests(struct request_queue *q);
static inline bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 1d0e5463f3b6..718222dc6f83 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -804,7 +804,7 @@ static void blk_exit_queue(struct request_queue *q) */ if (q->elevator) { ioc_clear_queue(q); - elevator_exit(q, q->elevator); + __elevator_exit(q, q->elevator); q->elevator = NULL; }
diff --git a/block/blk.h b/block/blk.h index 9af0b67ef332..169a2ec6325a 100644 --- a/block/blk.h +++ b/block/blk.h @@ -5,6 +5,7 @@ #include <linux/idr.h> #include <linux/blk-mq.h> #include "blk-mq.h" +#include "blk-mq-sched.h"
/* Amount of time in which a process may batch requests */ #define BLK_BATCH_TIME (HZ/50UL) @@ -242,10 +243,17 @@ int elevator_init(struct request_queue *); int elevator_init_mq(struct request_queue *q); int elevator_switch_mq(struct request_queue *q, struct elevator_type *new_e); -void elevator_exit(struct request_queue *, struct elevator_queue *); +void __elevator_exit(struct request_queue *, struct elevator_queue *); int elv_register_queue(struct request_queue *q); void elv_unregister_queue(struct request_queue *q);
+static inline void elevator_exit(struct request_queue *q, + struct elevator_queue *e) +{ + blk_mq_sched_free_requests(q); + __elevator_exit(q, e); +} + struct hd_struct *__disk_get_part(struct gendisk *disk, int partno);
#ifdef CONFIG_FAIL_IO_TIMEOUT diff --git a/block/elevator.c b/block/elevator.c index fae58b2f906f..4dcb925eac13 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -240,7 +240,7 @@ int elevator_init(struct request_queue *q) return err; }
-void elevator_exit(struct request_queue *q, struct elevator_queue *e) +void __elevator_exit(struct request_queue *q, struct elevator_queue *e) { mutex_lock(&e->sysfs_lock); if (e->uses_mq && e->type->ops.mq.exit_sched)
From: Ming Lei ming.lei@redhat.com
commit c326f846ebc2a30eca386b85dffba96e23803d81 upstream.
blk_mq_sched_free_requests() may be called in failure path in which q->elevator may not be setup yet, so remove WARN_ON(!q->elevator) from blk_mq_sched_free_requests for avoiding the false positive.
This function is actually safe to call in case of !q->elevator because hctx->sched_tags is checked.
Cc: Bart Van Assche bvanassche@acm.org Cc: Christoph Hellwig hch@lst.de Cc: Yi Zhang yi.zhang@redhat.com Fixes: c3e2219216c9 ("block: free sched's request pool in blk_cleanup_queue") Reported-by: syzbot+b9d0d56867048c7bcfde@syzkaller.appspotmail.com Signed-off-by: Ming Lei ming.lei@redhat.com Signed-off-by: Jens Axboe axboe@kernel.dk (cherry picked from commit c326f846ebc2a30eca386b85dffba96e23803d81) Signed-off-by: Martyn Welch martyn.welch@collabora.com --- block/blk-mq-sched.c | 1 - 1 file changed, 1 deletion(-)
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 105f8c124604..156e6444dc8f 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -533,7 +533,6 @@ void blk_mq_sched_free_requests(struct request_queue *q) int i;
lockdep_assert_held(&q->sysfs_lock); - WARN_ON(!q->elevator);
queue_for_each_hw_ctx(q, hctx, i) { if (hctx->sched_tags)
linux-stable-mirror@lists.linaro.org