From: Yu Kuai yukuai3@huawei.com
This patchset backport three patches to fix a crash found by our test:
BUG: kernel NULL pointer dereference, address: 00000000000001a0 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP CPU: 1 PID: 1317 Comm: mount Not tainted 5.10.0-16691-gf6076432827d-dirty #169 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-4 RIP: 0010:__blk_mq_sched_bio_merge+0x9d/0x1a0 Code: 87 1e 9d 89 d0 25 00 00 00 01 0f 85 ad 00 00 00 48 83 05 25 a1 37 0c 01 3 RSP: 0018:ffffc90000473b50 EFLAGS: 00010202 RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffffc90000473b98 RDX: 0000000000001000 RSI: ffff8881080c7500 RDI: ffff888103a9cc18 RBP: ffff88813bc80000 R08: 0000000000000001 R09: 0000000000000000 R10: ffff88810710be30 R11: 0000000000000000 R12: ffff888103a9cc18 R13: ffff8881080c7500 R14: 0000000000000001 R15: 0000000000000000 FS: 00007f51bcdbb040(0000) GS:ffff88813bc80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000000001a0 CR3: 000000010d715000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace: blk_mq_submit_bio+0x115/0xd80 submit_bio_noacct+0x4ff/0x610 submit_bio+0xaa/0x1a0 submit_bh_wbc+0x1cb/0x2f0 submit_bh+0x17/0x20 ext4_read_bh+0x63/0x170 ext4_read_bh_lock+0x2c/0xd0 __ext4_sb_bread_gfp.isra.0+0xa0/0xf0 ext4_fill_super+0x21f/0x5610 ? pointer+0x31b/0x5a0 ? vsnprintf+0x131/0x7d0 mount_bdev+0x233/0x280 ? ext4_calculate_overhead+0x660/0x660 ext4_mount+0x19/0x30 legacy_get_tree+0x35/0x90 vfs_get_tree+0x29/0x100 ? capable+0x1d/0x30 path_mount+0x8a7/0x1150 do_mount+0x8d/0xc0 __se_sys_mount+0x14a/0x220 __x64_sys_mount+0x29/0x40 do_syscall_64+0x45/0x70 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f51bbe1623a Code: 48 8b 0d 51 dc 2b 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 8 RSP: 002b:00007fff173ae898 EFLAGS: 00000246 ORIG_RAX: 00000000000000a5 RAX: ffffffffffffffda RBX: 000056169a120030 RCX: 00007f51bbe1623a RDX: 000056169a120210 RSI: 000056169a120250 RDI: 000056169a120230 RBP: 0000000000000000 R08: 0000000000000000 R09: 00007fff173ad798 R10: 00000000c0ed0000 R11: 0000000000000246 R12: 000056169a120230 R13: 000056169a120210 R14: 0000000000000000 R15: 00007f51bcbac184 Modules linked in: dm_service_time dm_multipath CR2: 00000000000001a0 ---[ end trace ac5d86e09fdc7c98 ]--- RIP: 0010:__blk_mq_sched_bio_merge+0x9d/0x1a0 Code: 87 1e 9d 89 d0 25 00 00 00 01 0f 85 ad 00 00 00 48 83 05 25 a1 37 0c 01 3 RSP: 0018:ffffc90000473b50 EFLAGS: 00010202 RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffffc90000473b98 RDX: 0000000000001000 RSI: ffff8881080c7500 RDI: ffff888103a9cc18 RBP: ffff88813bc80000 R08: 0000000000000001 R09: 0000000000000000 R10: ffff88810710be30 R11: 0000000000000000 R12: ffff888103a9cc18 R13: ffff8881080c7500 R14: 0000000000000001 R15: 0000000000000000 FS: 00007f51bcdbb040(0000) GS:ffff88813bc80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f10e97a5000 CR3: 000000010d715000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Kernel panic - not syncing: Fatal exception Kernel Offset: disabled ---[ end Kernel panic - not syncing: Fatal exception ]---
root cause: t1 dm-mpath t2 mount
alloc_dev md->queue = blk_alloc_queue add_disk_no_queue_reg
dm_setup_md_queue case DM_TYPE_REQUEST_BASED -> multipath md->disk->fops = &dm_rq_blk_dops; ext4_fill_super ┊__ext4_sb_bread_gfp ┊ ext4_read_bh ┊ submit_bio -> queue is not initialized yet ┊ __blk_mq_sched_bio_merge ┊ ctx = blk_mq_get_ctx(q); -> ctx is NULL dm_mq_init_request_queue
Patch 3 is the fix patch, and patch 1,2 is needed to backport patch 3.
Please noted that there are lots of conficts between 5.10 and mainline, and I made plenty adaptations in these patches.
I already tested this patchset with dmtest create/remove tests:
dmtest run --suite thin-provisioning -t /Creation\Deletion/
Christoph Hellwig (3): block: look up holders by bdev block: support delayed holder registration dm: delay registering the gendisk
block/genhd.c | 13 +++++ drivers/md/dm.c | 24 +++++---- fs/block_dev.c | 105 +++++++++++++++++++++++++++----------- include/linux/blk_types.h | 3 -- include/linux/genhd.h | 9 +++- 5 files changed, 110 insertions(+), 44 deletions(-)
From: Christoph Hellwig hch@lst.de
commit 0dbcfe247f22a6d73302dfa691c48b3c14d31c4c upstream.
Invert they way the holder relations are tracked. This very slightly reduces the memory overhead for partitioned devices.
Signed-off-by: Christoph Hellwig hch@lst.de Signed-off-by: Yu Kuai yukuai3@huawei.com --- block/genhd.c | 3 +++ fs/block_dev.c | 31 +++++++++++++++++++------------ include/linux/blk_types.h | 3 --- include/linux/genhd.h | 4 +++- 4 files changed, 25 insertions(+), 16 deletions(-)
diff --git a/block/genhd.c b/block/genhd.c index 796baf761202..2b11a2735285 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1760,6 +1760,9 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) disk_to_dev(disk)->class = &block_class; disk_to_dev(disk)->type = &disk_type; device_initialize(disk_to_dev(disk)); +#ifdef CONFIG_SYSFS + INIT_LIST_HEAD(&disk->slave_bdevs); +#endif return disk;
out_free_part0: diff --git a/fs/block_dev.c b/fs/block_dev.c index 29f020c4b2d0..a202c76fcf7f 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -823,9 +823,6 @@ static void init_once(void *foo)
memset(bdev, 0, sizeof(*bdev)); mutex_init(&bdev->bd_mutex); -#ifdef CONFIG_SYSFS - INIT_LIST_HEAD(&bdev->bd_holder_disks); -#endif bdev->bd_bdi = &noop_backing_dev_info; inode_init_once(&ei->vfs_inode); /* Initialize mutex for freeze. */ @@ -1188,7 +1185,7 @@ EXPORT_SYMBOL(bd_abort_claiming); #ifdef CONFIG_SYSFS struct bd_holder_disk { struct list_head list; - struct gendisk *disk; + struct block_device *bdev; int refcnt; };
@@ -1197,8 +1194,8 @@ static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev, { struct bd_holder_disk *holder;
- list_for_each_entry(holder, &bdev->bd_holder_disks, list) - if (holder->disk == disk) + list_for_each_entry(holder, &disk->slave_bdevs, list) + if (holder->bdev == bdev) return holder; return NULL; } @@ -1244,9 +1241,13 @@ static void del_symlink(struct kobject *from, struct kobject *to) int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) { struct bd_holder_disk *holder; + struct block_device *bdev_holder = bdget_disk(disk, 0); int ret = 0;
- mutex_lock(&bdev->bd_mutex); + if (WARN_ON_ONCE(!bdev_holder)) + return -ENOENT; + + mutex_lock(&bdev_holder->bd_mutex);
WARN_ON_ONCE(!bdev->bd_holder);
@@ -1267,7 +1268,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) }
INIT_LIST_HEAD(&holder->list); - holder->disk = disk; + holder->bdev = bdev; holder->refcnt = 1;
ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); @@ -1283,7 +1284,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) */ kobject_get(bdev->bd_part->holder_dir);
- list_add(&holder->list, &bdev->bd_holder_disks); + list_add(&holder->list, &disk->slave_bdevs); goto out_unlock;
out_del: @@ -1291,7 +1292,8 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) out_free: kfree(holder); out_unlock: - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&bdev_holder->bd_mutex); + bdput(bdev_holder); return ret; } EXPORT_SYMBOL_GPL(bd_link_disk_holder); @@ -1309,8 +1311,12 @@ EXPORT_SYMBOL_GPL(bd_link_disk_holder); void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) { struct bd_holder_disk *holder; + struct block_device *bdev_holder = bdget_disk(disk, 0);
- mutex_lock(&bdev->bd_mutex); + if (WARN_ON_ONCE(!bdev_holder)) + return; + + mutex_lock(&bdev_holder->bd_mutex);
holder = bd_find_holder_disk(bdev, disk);
@@ -1323,7 +1329,8 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) kfree(holder); }
- mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&bdev_holder->bd_mutex); + bdput(bdev_holder); } EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); #endif diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index d9b69bbde5cc..1b84ecb34c18 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -29,9 +29,6 @@ struct block_device { void * bd_holder; int bd_holders; bool bd_write_holder; -#ifdef CONFIG_SYSFS - struct list_head bd_holder_disks; -#endif struct block_device * bd_contains; u8 bd_partno; struct hd_struct * bd_part; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 03da3f603d30..3e5049a527e6 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -195,7 +195,9 @@ struct gendisk { #define GD_NEED_PART_SCAN 0 struct rw_semaphore lookup_sem; struct kobject *slave_dir; - +#ifdef CONFIG_SYSFS + struct list_head slave_bdevs; +#endif struct timer_rand_state *random; atomic_t sync_io; /* RAID */ struct disk_events *ev;
On Fri, Jul 29, 2022 at 02:23:54PM +0800, Yu Kuai wrote:
From: Christoph Hellwig hch@lst.de
commit 0dbcfe247f22a6d73302dfa691c48b3c14d31c4c upstream.
Invert they way the holder relations are tracked. This very slightly reduces the memory overhead for partitioned devices.
Signed-off-by: Christoph Hellwig hch@lst.de Signed-off-by: Yu Kuai yukuai3@huawei.com
block/genhd.c | 3 +++ fs/block_dev.c | 31 +++++++++++++++++++------------ include/linux/blk_types.h | 3 --- include/linux/genhd.h | 4 +++- 4 files changed, 25 insertions(+), 16 deletions(-)
diff --git a/block/genhd.c b/block/genhd.c index 796baf761202..2b11a2735285 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1760,6 +1760,9 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) disk_to_dev(disk)->class = &block_class; disk_to_dev(disk)->type = &disk_type; device_initialize(disk_to_dev(disk)); +#ifdef CONFIG_SYSFS
- INIT_LIST_HEAD(&disk->slave_bdevs);
+#endif return disk; out_free_part0: diff --git a/fs/block_dev.c b/fs/block_dev.c index 29f020c4b2d0..a202c76fcf7f 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -823,9 +823,6 @@ static void init_once(void *foo) memset(bdev, 0, sizeof(*bdev)); mutex_init(&bdev->bd_mutex); -#ifdef CONFIG_SYSFS
- INIT_LIST_HEAD(&bdev->bd_holder_disks);
-#endif bdev->bd_bdi = &noop_backing_dev_info; inode_init_once(&ei->vfs_inode); /* Initialize mutex for freeze. */ @@ -1188,7 +1185,7 @@ EXPORT_SYMBOL(bd_abort_claiming); #ifdef CONFIG_SYSFS struct bd_holder_disk { struct list_head list;
- struct gendisk *disk;
- struct block_device *bdev; int refcnt;
}; @@ -1197,8 +1194,8 @@ static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev, { struct bd_holder_disk *holder;
- list_for_each_entry(holder, &bdev->bd_holder_disks, list)
if (holder->disk == disk)
- list_for_each_entry(holder, &disk->slave_bdevs, list)
return NULL;if (holder->bdev == bdev) return holder;
} @@ -1244,9 +1241,13 @@ static void del_symlink(struct kobject *from, struct kobject *to) int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) { struct bd_holder_disk *holder;
- struct block_device *bdev_holder = bdget_disk(disk, 0); int ret = 0;
- mutex_lock(&bdev->bd_mutex);
- if (WARN_ON_ONCE(!bdev_holder))
return -ENOENT;
- mutex_lock(&bdev_holder->bd_mutex);
WARN_ON_ONCE(!bdev->bd_holder); @@ -1267,7 +1268,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) } INIT_LIST_HEAD(&holder->list);
- holder->disk = disk;
- holder->bdev = bdev; holder->refcnt = 1;
ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); @@ -1283,7 +1284,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) */ kobject_get(bdev->bd_part->holder_dir);
- list_add(&holder->list, &bdev->bd_holder_disks);
- list_add(&holder->list, &disk->slave_bdevs); goto out_unlock;
out_del: @@ -1291,7 +1292,8 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) out_free: kfree(holder); out_unlock:
- mutex_unlock(&bdev->bd_mutex);
- mutex_unlock(&bdev_holder->bd_mutex);
- bdput(bdev_holder); return ret;
} EXPORT_SYMBOL_GPL(bd_link_disk_holder); @@ -1309,8 +1311,12 @@ EXPORT_SYMBOL_GPL(bd_link_disk_holder); void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) { struct bd_holder_disk *holder;
- struct block_device *bdev_holder = bdget_disk(disk, 0);
- mutex_lock(&bdev->bd_mutex);
- if (WARN_ON_ONCE(!bdev_holder))
return;
- mutex_lock(&bdev_holder->bd_mutex);
holder = bd_find_holder_disk(bdev, disk); @@ -1323,7 +1329,8 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) kfree(holder); }
- mutex_unlock(&bdev->bd_mutex);
- mutex_unlock(&bdev_holder->bd_mutex);
- bdput(bdev_holder);
} EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); #endif diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index d9b69bbde5cc..1b84ecb34c18 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -29,9 +29,6 @@ struct block_device { void * bd_holder; int bd_holders; bool bd_write_holder; -#ifdef CONFIG_SYSFS
- struct list_head bd_holder_disks;
-#endif struct block_device * bd_contains; u8 bd_partno; struct hd_struct * bd_part; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 03da3f603d30..3e5049a527e6 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -195,7 +195,9 @@ struct gendisk { #define GD_NEED_PART_SCAN 0 struct rw_semaphore lookup_sem; struct kobject *slave_dir;
+#ifdef CONFIG_SYSFS
- struct list_head slave_bdevs;
+#endif
This is very different from the upstream version, and forces the change onto everyone, not just those who had CONFIG_BLOCK_HOLDER_DEPRECATED enabled like was done in the main kernel tree.
Why force this on all and not just use the same option?
I would need a strong ACK from the original developers and maintainers of these subsystems before being able to take these into the 5.10 tree.
What prevents you from just using 5.15 for those systems that run into these issues?
thanks,
greg k-h
Hi, Greg
在 2022/08/01 19:19, Greg KH 写道:
On Fri, Jul 29, 2022 at 02:23:54PM +0800, Yu Kuai wrote:
From: Christoph Hellwig hch@lst.de
commit 0dbcfe247f22a6d73302dfa691c48b3c14d31c4c upstream.
Invert they way the holder relations are tracked. This very slightly reduces the memory overhead for partitioned devices.
Signed-off-by: Christoph Hellwig hch@lst.de Signed-off-by: Yu Kuai yukuai3@huawei.com
block/genhd.c | 3 +++ fs/block_dev.c | 31 +++++++++++++++++++------------ include/linux/blk_types.h | 3 --- include/linux/genhd.h | 4 +++- 4 files changed, 25 insertions(+), 16 deletions(-)
diff --git a/block/genhd.c b/block/genhd.c index 796baf761202..2b11a2735285 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1760,6 +1760,9 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) disk_to_dev(disk)->class = &block_class; disk_to_dev(disk)->type = &disk_type; device_initialize(disk_to_dev(disk)); +#ifdef CONFIG_SYSFS
- INIT_LIST_HEAD(&disk->slave_bdevs);
+#endif return disk; out_free_part0: diff --git a/fs/block_dev.c b/fs/block_dev.c index 29f020c4b2d0..a202c76fcf7f 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -823,9 +823,6 @@ static void init_once(void *foo) memset(bdev, 0, sizeof(*bdev)); mutex_init(&bdev->bd_mutex); -#ifdef CONFIG_SYSFS
- INIT_LIST_HEAD(&bdev->bd_holder_disks);
-#endif bdev->bd_bdi = &noop_backing_dev_info; inode_init_once(&ei->vfs_inode); /* Initialize mutex for freeze. */ @@ -1188,7 +1185,7 @@ EXPORT_SYMBOL(bd_abort_claiming); #ifdef CONFIG_SYSFS struct bd_holder_disk { struct list_head list;
- struct gendisk *disk;
- struct block_device *bdev; int refcnt; };
@@ -1197,8 +1194,8 @@ static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev, { struct bd_holder_disk *holder;
- list_for_each_entry(holder, &bdev->bd_holder_disks, list)
if (holder->disk == disk)
- list_for_each_entry(holder, &disk->slave_bdevs, list)
return NULL; }if (holder->bdev == bdev) return holder;
@@ -1244,9 +1241,13 @@ static void del_symlink(struct kobject *from, struct kobject *to) int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) { struct bd_holder_disk *holder;
- struct block_device *bdev_holder = bdget_disk(disk, 0); int ret = 0;
- mutex_lock(&bdev->bd_mutex);
- if (WARN_ON_ONCE(!bdev_holder))
return -ENOENT;
- mutex_lock(&bdev_holder->bd_mutex);
WARN_ON_ONCE(!bdev->bd_holder); @@ -1267,7 +1268,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) } INIT_LIST_HEAD(&holder->list);
- holder->disk = disk;
- holder->bdev = bdev; holder->refcnt = 1;
ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); @@ -1283,7 +1284,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) */ kobject_get(bdev->bd_part->holder_dir);
- list_add(&holder->list, &bdev->bd_holder_disks);
- list_add(&holder->list, &disk->slave_bdevs); goto out_unlock;
out_del: @@ -1291,7 +1292,8 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) out_free: kfree(holder); out_unlock:
- mutex_unlock(&bdev->bd_mutex);
- mutex_unlock(&bdev_holder->bd_mutex);
- bdput(bdev_holder); return ret; } EXPORT_SYMBOL_GPL(bd_link_disk_holder);
@@ -1309,8 +1311,12 @@ EXPORT_SYMBOL_GPL(bd_link_disk_holder); void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) { struct bd_holder_disk *holder;
- struct block_device *bdev_holder = bdget_disk(disk, 0);
- mutex_lock(&bdev->bd_mutex);
- if (WARN_ON_ONCE(!bdev_holder))
return;
- mutex_lock(&bdev_holder->bd_mutex);
holder = bd_find_holder_disk(bdev, disk); @@ -1323,7 +1329,8 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) kfree(holder); }
- mutex_unlock(&bdev->bd_mutex);
- mutex_unlock(&bdev_holder->bd_mutex);
- bdput(bdev_holder); } EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); #endif
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index d9b69bbde5cc..1b84ecb34c18 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -29,9 +29,6 @@ struct block_device { void * bd_holder; int bd_holders; bool bd_write_holder; -#ifdef CONFIG_SYSFS
- struct list_head bd_holder_disks;
-#endif struct block_device * bd_contains; u8 bd_partno; struct hd_struct * bd_part; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 03da3f603d30..3e5049a527e6 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -195,7 +195,9 @@ struct gendisk { #define GD_NEED_PART_SCAN 0 struct rw_semaphore lookup_sem; struct kobject *slave_dir;
+#ifdef CONFIG_SYSFS
- struct list_head slave_bdevs;
+#endif
This is very different from the upstream version, and forces the change onto everyone, not just those who had CONFIG_BLOCK_HOLDER_DEPRECATED enabled like was done in the main kernel tree.
Why force this on all and not just use the same option?
I would need a strong ACK from the original developers and maintainers of these subsystems before being able to take these into the 5.10 tree.
Yes, I agree that Christoph must take a look first.
What prevents you from just using 5.15 for those systems that run into these issues?
The null pointer problem is reported by our product(related to dm- mpath), and they had been using 5.10 for a long time. And switching kernel for commercial will require lots of work, it's not an option for now.
Thanks, Kuai
On Mon, Aug 01, 2022 at 08:25:27PM +0800, Yu Kuai wrote:
Hi, Greg
在 2022/08/01 19:19, Greg KH 写道:
On Fri, Jul 29, 2022 at 02:23:54PM +0800, Yu Kuai wrote:
From: Christoph Hellwig hch@lst.de
commit 0dbcfe247f22a6d73302dfa691c48b3c14d31c4c upstream.
Invert they way the holder relations are tracked. This very slightly reduces the memory overhead for partitioned devices.
Signed-off-by: Christoph Hellwig hch@lst.de Signed-off-by: Yu Kuai yukuai3@huawei.com
block/genhd.c | 3 +++ fs/block_dev.c | 31 +++++++++++++++++++------------ include/linux/blk_types.h | 3 --- include/linux/genhd.h | 4 +++- 4 files changed, 25 insertions(+), 16 deletions(-)
diff --git a/block/genhd.c b/block/genhd.c index 796baf761202..2b11a2735285 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1760,6 +1760,9 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) disk_to_dev(disk)->class = &block_class; disk_to_dev(disk)->type = &disk_type; device_initialize(disk_to_dev(disk)); +#ifdef CONFIG_SYSFS
- INIT_LIST_HEAD(&disk->slave_bdevs);
+#endif return disk; out_free_part0: diff --git a/fs/block_dev.c b/fs/block_dev.c index 29f020c4b2d0..a202c76fcf7f 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -823,9 +823,6 @@ static void init_once(void *foo) memset(bdev, 0, sizeof(*bdev)); mutex_init(&bdev->bd_mutex); -#ifdef CONFIG_SYSFS
- INIT_LIST_HEAD(&bdev->bd_holder_disks);
-#endif bdev->bd_bdi = &noop_backing_dev_info; inode_init_once(&ei->vfs_inode); /* Initialize mutex for freeze. */ @@ -1188,7 +1185,7 @@ EXPORT_SYMBOL(bd_abort_claiming); #ifdef CONFIG_SYSFS struct bd_holder_disk { struct list_head list;
- struct gendisk *disk;
- struct block_device *bdev; int refcnt; };
@@ -1197,8 +1194,8 @@ static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev, { struct bd_holder_disk *holder;
- list_for_each_entry(holder, &bdev->bd_holder_disks, list)
if (holder->disk == disk)
- list_for_each_entry(holder, &disk->slave_bdevs, list)
return NULL; }if (holder->bdev == bdev) return holder;
@@ -1244,9 +1241,13 @@ static void del_symlink(struct kobject *from, struct kobject *to) int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) { struct bd_holder_disk *holder;
- struct block_device *bdev_holder = bdget_disk(disk, 0); int ret = 0;
- mutex_lock(&bdev->bd_mutex);
- if (WARN_ON_ONCE(!bdev_holder))
return -ENOENT;
- mutex_lock(&bdev_holder->bd_mutex); WARN_ON_ONCE(!bdev->bd_holder);
@@ -1267,7 +1268,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) } INIT_LIST_HEAD(&holder->list);
- holder->disk = disk;
- holder->bdev = bdev; holder->refcnt = 1; ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
@@ -1283,7 +1284,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) */ kobject_get(bdev->bd_part->holder_dir);
- list_add(&holder->list, &bdev->bd_holder_disks);
- list_add(&holder->list, &disk->slave_bdevs); goto out_unlock; out_del:
@@ -1291,7 +1292,8 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) out_free: kfree(holder); out_unlock:
- mutex_unlock(&bdev->bd_mutex);
- mutex_unlock(&bdev_holder->bd_mutex);
- bdput(bdev_holder); return ret; } EXPORT_SYMBOL_GPL(bd_link_disk_holder);
@@ -1309,8 +1311,12 @@ EXPORT_SYMBOL_GPL(bd_link_disk_holder); void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) { struct bd_holder_disk *holder;
- struct block_device *bdev_holder = bdget_disk(disk, 0);
- mutex_lock(&bdev->bd_mutex);
- if (WARN_ON_ONCE(!bdev_holder))
return;
- mutex_lock(&bdev_holder->bd_mutex); holder = bd_find_holder_disk(bdev, disk);
@@ -1323,7 +1329,8 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) kfree(holder); }
- mutex_unlock(&bdev->bd_mutex);
- mutex_unlock(&bdev_holder->bd_mutex);
- bdput(bdev_holder); } EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); #endif
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index d9b69bbde5cc..1b84ecb34c18 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -29,9 +29,6 @@ struct block_device { void * bd_holder; int bd_holders; bool bd_write_holder; -#ifdef CONFIG_SYSFS
- struct list_head bd_holder_disks;
-#endif struct block_device * bd_contains; u8 bd_partno; struct hd_struct * bd_part; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 03da3f603d30..3e5049a527e6 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -195,7 +195,9 @@ struct gendisk { #define GD_NEED_PART_SCAN 0 struct rw_semaphore lookup_sem; struct kobject *slave_dir;
+#ifdef CONFIG_SYSFS
- struct list_head slave_bdevs;
+#endif
This is very different from the upstream version, and forces the change onto everyone, not just those who had CONFIG_BLOCK_HOLDER_DEPRECATED enabled like was done in the main kernel tree.
Why force this on all and not just use the same option?
I would need a strong ACK from the original developers and maintainers of these subsystems before being able to take these into the 5.10 tree.
Yes, I agree that Christoph must take a look first.
What prevents you from just using 5.15 for those systems that run into these issues?
The null pointer problem is reported by our product(related to dm- mpath), and they had been using 5.10 for a long time. And switching kernel for commercial will require lots of work, it's not an option for now.
It should be the same validation and verification and testing path for 5.15.y as for an intrusive kernel change as this one, right? What makes it any different to prevent 5.15 from being used?
thanks,
greg k-h
Hi, Greg
在 2022/08/01 21:17, Greg KH 写道:
On Mon, Aug 01, 2022 at 08:25:27PM +0800, Yu Kuai wrote:
Hi, Greg
在 2022/08/01 19:19, Greg KH 写道:
On Fri, Jul 29, 2022 at 02:23:54PM +0800, Yu Kuai wrote:
From: Christoph Hellwig hch@lst.de
commit 0dbcfe247f22a6d73302dfa691c48b3c14d31c4c upstream.
Invert they way the holder relations are tracked. This very slightly reduces the memory overhead for partitioned devices.
Signed-off-by: Christoph Hellwig hch@lst.de Signed-off-by: Yu Kuai yukuai3@huawei.com
block/genhd.c | 3 +++ fs/block_dev.c | 31 +++++++++++++++++++------------ include/linux/blk_types.h | 3 --- include/linux/genhd.h | 4 +++- 4 files changed, 25 insertions(+), 16 deletions(-)
diff --git a/block/genhd.c b/block/genhd.c index 796baf761202..2b11a2735285 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1760,6 +1760,9 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) disk_to_dev(disk)->class = &block_class; disk_to_dev(disk)->type = &disk_type; device_initialize(disk_to_dev(disk)); +#ifdef CONFIG_SYSFS
- INIT_LIST_HEAD(&disk->slave_bdevs);
+#endif return disk; out_free_part0: diff --git a/fs/block_dev.c b/fs/block_dev.c index 29f020c4b2d0..a202c76fcf7f 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -823,9 +823,6 @@ static void init_once(void *foo) memset(bdev, 0, sizeof(*bdev)); mutex_init(&bdev->bd_mutex); -#ifdef CONFIG_SYSFS
- INIT_LIST_HEAD(&bdev->bd_holder_disks);
-#endif bdev->bd_bdi = &noop_backing_dev_info; inode_init_once(&ei->vfs_inode); /* Initialize mutex for freeze. */ @@ -1188,7 +1185,7 @@ EXPORT_SYMBOL(bd_abort_claiming); #ifdef CONFIG_SYSFS struct bd_holder_disk { struct list_head list;
- struct gendisk *disk;
- struct block_device *bdev; int refcnt; };
@@ -1197,8 +1194,8 @@ static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev, { struct bd_holder_disk *holder;
- list_for_each_entry(holder, &bdev->bd_holder_disks, list)
if (holder->disk == disk)
- list_for_each_entry(holder, &disk->slave_bdevs, list)
return NULL; }if (holder->bdev == bdev) return holder;
@@ -1244,9 +1241,13 @@ static void del_symlink(struct kobject *from, struct kobject *to) int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) { struct bd_holder_disk *holder;
- struct block_device *bdev_holder = bdget_disk(disk, 0); int ret = 0;
- mutex_lock(&bdev->bd_mutex);
- if (WARN_ON_ONCE(!bdev_holder))
return -ENOENT;
- mutex_lock(&bdev_holder->bd_mutex); WARN_ON_ONCE(!bdev->bd_holder);
@@ -1267,7 +1268,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) } INIT_LIST_HEAD(&holder->list);
- holder->disk = disk;
- holder->bdev = bdev; holder->refcnt = 1; ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
@@ -1283,7 +1284,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) */ kobject_get(bdev->bd_part->holder_dir);
- list_add(&holder->list, &bdev->bd_holder_disks);
- list_add(&holder->list, &disk->slave_bdevs); goto out_unlock; out_del:
@@ -1291,7 +1292,8 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) out_free: kfree(holder); out_unlock:
- mutex_unlock(&bdev->bd_mutex);
- mutex_unlock(&bdev_holder->bd_mutex);
- bdput(bdev_holder); return ret; } EXPORT_SYMBOL_GPL(bd_link_disk_holder);
@@ -1309,8 +1311,12 @@ EXPORT_SYMBOL_GPL(bd_link_disk_holder); void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) { struct bd_holder_disk *holder;
- struct block_device *bdev_holder = bdget_disk(disk, 0);
- mutex_lock(&bdev->bd_mutex);
- if (WARN_ON_ONCE(!bdev_holder))
return;
- mutex_lock(&bdev_holder->bd_mutex); holder = bd_find_holder_disk(bdev, disk);
@@ -1323,7 +1329,8 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) kfree(holder); }
- mutex_unlock(&bdev->bd_mutex);
- mutex_unlock(&bdev_holder->bd_mutex);
- bdput(bdev_holder); } EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); #endif
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index d9b69bbde5cc..1b84ecb34c18 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -29,9 +29,6 @@ struct block_device { void * bd_holder; int bd_holders; bool bd_write_holder; -#ifdef CONFIG_SYSFS
- struct list_head bd_holder_disks;
-#endif struct block_device * bd_contains; u8 bd_partno; struct hd_struct * bd_part; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 03da3f603d30..3e5049a527e6 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -195,7 +195,9 @@ struct gendisk { #define GD_NEED_PART_SCAN 0 struct rw_semaphore lookup_sem; struct kobject *slave_dir;
+#ifdef CONFIG_SYSFS
- struct list_head slave_bdevs;
+#endif
This is very different from the upstream version, and forces the change onto everyone, not just those who had CONFIG_BLOCK_HOLDER_DEPRECATED enabled like was done in the main kernel tree.
Why force this on all and not just use the same option?
I would need a strong ACK from the original developers and maintainers of these subsystems before being able to take these into the 5.10 tree.
Yes, I agree that Christoph must take a look first.
What prevents you from just using 5.15 for those systems that run into these issues?
The null pointer problem is reported by our product(related to dm- mpath), and they had been using 5.10 for a long time. And switching kernel for commercial will require lots of work, it's not an option for now.
It should be the same validation and verification and testing path for 5.15.y as for an intrusive kernel change as this one, right? What makes it any different to prevent 5.15 from being used?
No, they are not the same, if we try to use 5.15.y, we have to test all other stuff that our product is used currently(ext4, block, scsi and lots of other modules). With this patch, we only need to make sure dm works fine.
Thanks, Kuai
On Mon, Aug 01, 2022 at 09:39:30PM +0800, Yu Kuai wrote:
Hi, Greg
在 2022/08/01 21:17, Greg KH 写道:
On Mon, Aug 01, 2022 at 08:25:27PM +0800, Yu Kuai wrote:
Hi, Greg
在 2022/08/01 19:19, Greg KH 写道:
On Fri, Jul 29, 2022 at 02:23:54PM +0800, Yu Kuai wrote:
From: Christoph Hellwig hch@lst.de
commit 0dbcfe247f22a6d73302dfa691c48b3c14d31c4c upstream.
Invert they way the holder relations are tracked. This very slightly reduces the memory overhead for partitioned devices.
Signed-off-by: Christoph Hellwig hch@lst.de Signed-off-by: Yu Kuai yukuai3@huawei.com
block/genhd.c | 3 +++ fs/block_dev.c | 31 +++++++++++++++++++------------ include/linux/blk_types.h | 3 --- include/linux/genhd.h | 4 +++- 4 files changed, 25 insertions(+), 16 deletions(-)
diff --git a/block/genhd.c b/block/genhd.c index 796baf761202..2b11a2735285 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1760,6 +1760,9 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) disk_to_dev(disk)->class = &block_class; disk_to_dev(disk)->type = &disk_type; device_initialize(disk_to_dev(disk)); +#ifdef CONFIG_SYSFS
- INIT_LIST_HEAD(&disk->slave_bdevs);
+#endif return disk; out_free_part0: diff --git a/fs/block_dev.c b/fs/block_dev.c index 29f020c4b2d0..a202c76fcf7f 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -823,9 +823,6 @@ static void init_once(void *foo) memset(bdev, 0, sizeof(*bdev)); mutex_init(&bdev->bd_mutex); -#ifdef CONFIG_SYSFS
- INIT_LIST_HEAD(&bdev->bd_holder_disks);
-#endif bdev->bd_bdi = &noop_backing_dev_info; inode_init_once(&ei->vfs_inode); /* Initialize mutex for freeze. */ @@ -1188,7 +1185,7 @@ EXPORT_SYMBOL(bd_abort_claiming); #ifdef CONFIG_SYSFS struct bd_holder_disk { struct list_head list;
- struct gendisk *disk;
- struct block_device *bdev; int refcnt; };
@@ -1197,8 +1194,8 @@ static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev, { struct bd_holder_disk *holder;
- list_for_each_entry(holder, &bdev->bd_holder_disks, list)
if (holder->disk == disk)
- list_for_each_entry(holder, &disk->slave_bdevs, list)
return NULL; }if (holder->bdev == bdev) return holder;
@@ -1244,9 +1241,13 @@ static void del_symlink(struct kobject *from, struct kobject *to) int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) { struct bd_holder_disk *holder;
- struct block_device *bdev_holder = bdget_disk(disk, 0); int ret = 0;
- mutex_lock(&bdev->bd_mutex);
- if (WARN_ON_ONCE(!bdev_holder))
return -ENOENT;
- mutex_lock(&bdev_holder->bd_mutex); WARN_ON_ONCE(!bdev->bd_holder);
@@ -1267,7 +1268,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) } INIT_LIST_HEAD(&holder->list);
- holder->disk = disk;
- holder->bdev = bdev; holder->refcnt = 1; ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
@@ -1283,7 +1284,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) */ kobject_get(bdev->bd_part->holder_dir);
- list_add(&holder->list, &bdev->bd_holder_disks);
- list_add(&holder->list, &disk->slave_bdevs); goto out_unlock; out_del:
@@ -1291,7 +1292,8 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) out_free: kfree(holder); out_unlock:
- mutex_unlock(&bdev->bd_mutex);
- mutex_unlock(&bdev_holder->bd_mutex);
- bdput(bdev_holder); return ret; } EXPORT_SYMBOL_GPL(bd_link_disk_holder);
@@ -1309,8 +1311,12 @@ EXPORT_SYMBOL_GPL(bd_link_disk_holder); void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) { struct bd_holder_disk *holder;
- struct block_device *bdev_holder = bdget_disk(disk, 0);
- mutex_lock(&bdev->bd_mutex);
- if (WARN_ON_ONCE(!bdev_holder))
return;
- mutex_lock(&bdev_holder->bd_mutex); holder = bd_find_holder_disk(bdev, disk);
@@ -1323,7 +1329,8 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) kfree(holder); }
- mutex_unlock(&bdev->bd_mutex);
- mutex_unlock(&bdev_holder->bd_mutex);
- bdput(bdev_holder); } EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); #endif
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index d9b69bbde5cc..1b84ecb34c18 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -29,9 +29,6 @@ struct block_device { void * bd_holder; int bd_holders; bool bd_write_holder; -#ifdef CONFIG_SYSFS
- struct list_head bd_holder_disks;
-#endif struct block_device * bd_contains; u8 bd_partno; struct hd_struct * bd_part; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 03da3f603d30..3e5049a527e6 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -195,7 +195,9 @@ struct gendisk { #define GD_NEED_PART_SCAN 0 struct rw_semaphore lookup_sem; struct kobject *slave_dir;
+#ifdef CONFIG_SYSFS
- struct list_head slave_bdevs;
+#endif
This is very different from the upstream version, and forces the change onto everyone, not just those who had CONFIG_BLOCK_HOLDER_DEPRECATED enabled like was done in the main kernel tree.
Why force this on all and not just use the same option?
I would need a strong ACK from the original developers and maintainers of these subsystems before being able to take these into the 5.10 tree.
Yes, I agree that Christoph must take a look first.
What prevents you from just using 5.15 for those systems that run into these issues?
The null pointer problem is reported by our product(related to dm- mpath), and they had been using 5.10 for a long time. And switching kernel for commercial will require lots of work, it's not an option for now.
It should be the same validation and verification and testing path for 5.15.y as for an intrusive kernel change as this one, right? What makes it any different to prevent 5.15 from being used?
No, they are not the same, if we try to use 5.15.y, we have to test all other stuff that our product is used currently(ext4, block, scsi and lots of other modules). With this patch, we only need to make sure dm works fine.
That is a very odd way to test a monolithic kernel image, where any change in any part can affect any other part of the kernel. You touched the block layer, why you wouldn't think that all block-related things would also have to be tested is very strange to me as those are directly related. And while you are at it, you should test all other subsystems as the system is released as a whole, not as individual changes that are not unrelated.
I think you need to revisit your testing strategy, good luck!
greg k-h
On Mon, Aug 01, 2022 at 01:19:09PM +0200, Greg KH wrote:
This is very different from the upstream version, and forces the change onto everyone, not just those who had CONFIG_BLOCK_HOLDER_DEPRECATED enabled like was done in the main kernel tree.
Why force this on all and not just use the same option?
I'm really worried about backports that are significantly different from the original commit. To the point where if they are so different and we don't have a grave security or data integrity bug I'm really not very much in favor of backporting them at all.
On Mon, Aug 01, 2022 at 08:04:58PM +0200, Christoph Hellwig wrote:
On Mon, Aug 01, 2022 at 01:19:09PM +0200, Greg KH wrote:
This is very different from the upstream version, and forces the change onto everyone, not just those who had CONFIG_BLOCK_HOLDER_DEPRECATED enabled like was done in the main kernel tree.
Why force this on all and not just use the same option?
I'm really worried about backports that are significantly different from the original commit. To the point where if they are so different and we don't have a grave security or data integrity bug I'm really not very much in favor of backporting them at all.
I agree, I'll drop this from my review queue now, thanks.
greg k-h
Hi, Christoph
在 2022/08/02 13:11, Greg KH 写道:
On Mon, Aug 01, 2022 at 08:04:58PM +0200, Christoph Hellwig wrote:
On Mon, Aug 01, 2022 at 01:19:09PM +0200, Greg KH wrote:
This is very different from the upstream version, and forces the change onto everyone, not just those who had CONFIG_BLOCK_HOLDER_DEPRECATED enabled like was done in the main kernel tree.
Why force this on all and not just use the same option?
I'm really worried about backports that are significantly different from the original commit. To the point where if they are so different and we don't have a grave security or data integrity bug I'm really not very much in favor of backporting them at all.
I do understand that backporting these patches is not good, thanks for taking time on this patchset.
I decided to push following patch in our version:
diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 588e8b43efab..c047d5fcb325 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2149,12 +2149,16 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
switch (type) { case DM_TYPE_REQUEST_BASED: - md->disk->fops = &dm_rq_blk_dops; r = dm_mq_init_request_queue(md, t); if (r) { DMERR("Cannot initialize queue for request-based dm mapped device"); return r; } + /* + * Change the fops after queue is initialized, so that bio won't + * issued by rq-based path until that. + */ + md->disk->fops = &dm_rq_blk_dops; break; case DM_TYPE_BIO_BASED: case DM_TYPE_DAX_BIO_BASED:
This way only modify dm, and the problem can be fixed. If you guys think this is OK, I can send a patch for LTS. Otherwise, if you guys still think the null-ptr-def problem is uncommon and doesn't worth to fix, please ignore this mail.
Thanks, Kuai
I agree, I'll drop this from my review queue now, thanks.
greg k-h .
From: Christoph Hellwig hch@lst.de
commit d626338735909bc2b2e7cafc332f44ed41cfdeee upstream.
device mapper needs to register holders before it is ready to do I/O. Currently it does so by registering the disk early, which can leave the disk and queue in a weird half state where the queue is registered with the disk, except for sysfs and the elevator. And this state has been a bit promlematic before, and will get more so when sorting out the responsibilities between the queue and the disk.
Support registering holders on an initialized but not registered disk instead by delaying the sysfs registration until the disk is registered.
Signed-off-by: Christoph Hellwig hch@lst.de Reviewed-by: Mike Snitzer snitzer@redhat.com Signed-off-by: Yu Kuai yukuai3@huawei.com --- block/genhd.c | 10 ++++++ fs/block_dev.c | 74 +++++++++++++++++++++++++++++++++---------- include/linux/genhd.h | 5 +++ 3 files changed, 72 insertions(+), 17 deletions(-)
diff --git a/block/genhd.c b/block/genhd.c index 2b11a2735285..da4642182702 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -732,6 +732,16 @@ static void register_disk(struct device *parent, struct gendisk *disk, disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj); disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
+ /* + * XXX: this is a mess, can't wait for real error handling in add_disk. + * Make sure ->slave_dir is NULL if we failed some of the registration + * so that the cleanup in bd_unlink_disk_holder works properly. + */ + if (bd_register_pending_holders(disk) < 0) { + kobject_put(disk->slave_dir); + disk->slave_dir = NULL; + } + if (disk->flags & GENHD_FL_HIDDEN) return;
diff --git a/fs/block_dev.c b/fs/block_dev.c index a202c76fcf7f..8dc894c0f5f3 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1210,6 +1210,19 @@ static void del_symlink(struct kobject *from, struct kobject *to) sysfs_remove_link(from, kobject_name(to)); }
+static int __link_disk_holder(struct block_device *bdev, struct gendisk *disk) +{ + int ret; + + ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); + if (ret) + return ret; + ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); + if (ret) + del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); + return ret; +} + /** * bd_link_disk_holder - create symlinks between holding disk and slave bdev * @bdev: the claimed slave bdev @@ -1252,7 +1265,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) WARN_ON_ONCE(!bdev->bd_holder);
/* FIXME: remove the following once add_disk() handles errors */ - if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir)) + if (WARN_ON(!bdev->bd_part->holder_dir)) goto out_unlock;
holder = bd_find_holder_disk(bdev, disk); @@ -1271,13 +1284,13 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) holder->bdev = bdev; holder->refcnt = 1;
- ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); - if (ret) - goto out_free; - - ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); - if (ret) - goto out_del; + if (disk->slave_dir) { + ret = __link_disk_holder(bdev, disk); + if (ret) { + kfree(holder); + goto out_unlock; + } + } /* * bdev could be deleted beneath us which would implicitly destroy * the holder directory. Hold on to it. @@ -1285,12 +1298,6 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) kobject_get(bdev->bd_part->holder_dir);
list_add(&holder->list, &disk->slave_bdevs); - goto out_unlock; - -out_del: - del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); -out_free: - kfree(holder); out_unlock: mutex_unlock(&bdev_holder->bd_mutex); bdput(bdev_holder); @@ -1298,6 +1305,13 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) } EXPORT_SYMBOL_GPL(bd_link_disk_holder);
+static void __unlink_disk_holder(struct block_device *bdev, + struct gendisk *disk) +{ + del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); + del_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); +} + /** * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder() * @bdev: the calimed slave bdev @@ -1321,9 +1335,8 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) holder = bd_find_holder_disk(bdev, disk);
if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { - del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); - del_symlink(bdev->bd_part->holder_dir, - &disk_to_dev(disk)->kobj); + if (disk->slave_dir) + __unlink_disk_holder(bdev, disk); kobject_put(bdev->bd_part->holder_dir); list_del_init(&holder->list); kfree(holder); @@ -1335,6 +1348,33 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); #endif
+int bd_register_pending_holders(struct gendisk *disk) +{ + struct bd_holder_disk *holder; + struct block_device *bdev = bdget_disk(disk, 0); + int ret; + + if (WARN_ON_ONCE(!bdev)) + return -ENOENT; + + mutex_lock(&bdev->bd_mutex); + list_for_each_entry(holder, &disk->slave_bdevs, list) { + ret = __link_disk_holder(holder->bdev, disk); + if (ret) + goto out_undo; + } + mutex_unlock(&bdev->bd_mutex); + bdput(bdev); + return 0; + +out_undo: + list_for_each_entry_continue_reverse(holder, &disk->slave_bdevs, list) + __unlink_disk_holder(holder->bdev, disk); + mutex_unlock(&bdev->bd_mutex); + bdput(bdev); + return ret; +} + /** * check_disk_size_change - checks for disk size change and adjusts bdev size. * @disk: struct gendisk to check diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 3e5049a527e6..3249f4b46700 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -385,6 +385,7 @@ long compat_blkdev_ioctl(struct file *, unsigned, unsigned long); #ifdef CONFIG_SYSFS int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk); void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk); +int bd_register_pending_holders(struct gendisk *disk); #else static inline int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) @@ -395,6 +396,10 @@ static inline void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) { } +static inline int bd_register_pending_holders(struct gendisk *disk) +{ + return 0; +} #endif /* CONFIG_SYSFS */
#ifdef CONFIG_BLOCK
From: Christoph Hellwig hch@lst.de
commit 89f871af1b26d98d983cba7ed0e86effa45ba5f8 upstream.
device mapper is currently the only outlier that tries to call register_disk after add_disk, leading to fairly inconsistent state of these block layer data structures. Instead change device-mapper to just register the gendisk later now that the holder mechanism can cope with that.
Note that this introduces a user visible change: the dm kobject is now only visible after the initial table has been loaded.
Signed-off-by: Christoph Hellwig hch@lst.de Reviewed-by: Mike Snitzer snitzer@redhat.com Signed-off-by: Yu Kuai yukuai3@huawei.com --- drivers/md/dm.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c index ab0e2338e47e..85efe2f1995f 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1795,7 +1795,12 @@ static void cleanup_mapped_device(struct mapped_device *md) spin_lock(&_minor_lock); md->disk->private_data = NULL; spin_unlock(&_minor_lock); - del_gendisk(md->disk); + if (dm_get_md_type(md) != DM_TYPE_NONE) { + dm_sysfs_exit(md); + del_gendisk(md->disk); + } else { + md->disk->queue = NULL; + } put_disk(md->disk); }
@@ -1900,7 +1905,6 @@ static struct mapped_device *alloc_dev(int minor) } }
- add_disk_no_queue_reg(md->disk); format_dev_t(md->name, MKDEV(_major, minor));
md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0); @@ -2098,19 +2102,12 @@ static struct dm_table *__unbind(struct mapped_device *md) */ int dm_create(int minor, struct mapped_device **result) { - int r; struct mapped_device *md;
md = alloc_dev(minor); if (!md) return -ENXIO;
- r = dm_sysfs_init(md); - if (r) { - free_dev(md); - return r; - } - *result = md; return 0; } @@ -2188,8 +2185,14 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) return r; } dm_table_set_restrictions(t, md->queue, &limits); - blk_register_queue(md->disk);
+ add_disk(md->disk); + r = dm_sysfs_init(md); + if (r) { + del_gendisk(md->disk); + return r; + } + md->type = type; return 0; }
@@ -2295,7 +2298,6 @@ static void __dm_destroy(struct mapped_device *md, bool wait) DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)", dm_device_name(md), atomic_read(&md->holders));
- dm_sysfs_exit(md); dm_table_destroy(__unbind(md)); free_dev(md); }
linux-stable-mirror@lists.linaro.org