This patch avoids that self-removal triggers the following deadlock:
====================================================== WARNING: possible circular locking dependency detected 4.18.0-rc2-dbg+ #5 Not tainted ------------------------------------------------------ modprobe/6539 is trying to acquire lock: 000000008323c4cd (kn->count#202){++++}, at: kernfs_remove_by_name_ns+0x45/0x90
but task is already holding lock: 00000000a6ec2c69 (&shost->scan_mutex){+.+.}, at: scsi_remove_host+0x21/0x150 [scsi_mod]
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #1 (&shost->scan_mutex){+.+.}: __mutex_lock+0xfe/0xc70 mutex_lock_nested+0x1b/0x20 scsi_remove_device+0x26/0x40 [scsi_mod] sdev_store_delete+0x27/0x30 [scsi_mod] dev_attr_store+0x3e/0x50 sysfs_kf_write+0x87/0xa0 kernfs_fop_write+0x190/0x230 __vfs_write+0xd2/0x3b0 vfs_write+0x101/0x270 ksys_write+0xab/0x120 __x64_sys_write+0x43/0x50 do_syscall_64+0x77/0x230 entry_SYSCALL_64_after_hwframe+0x49/0xbe
-> #0 (kn->count#202){++++}: lock_acquire+0xd2/0x260 __kernfs_remove+0x424/0x4a0 kernfs_remove_by_name_ns+0x45/0x90 remove_files.isra.1+0x3a/0x90 sysfs_remove_group+0x5c/0xc0 sysfs_remove_groups+0x39/0x60 device_remove_attrs+0x82/0xb0 device_del+0x251/0x580 __scsi_remove_device+0x19f/0x1d0 [scsi_mod] scsi_forget_host+0x37/0xb0 [scsi_mod] scsi_remove_host+0x9b/0x150 [scsi_mod] sdebug_driver_remove+0x4b/0x150 [scsi_debug] device_release_driver_internal+0x241/0x360 device_release_driver+0x12/0x20 bus_remove_device+0x1bc/0x290 device_del+0x259/0x580 device_unregister+0x1a/0x70 sdebug_remove_adapter+0x8b/0xf0 [scsi_debug] scsi_debug_exit+0x76/0xe8 [scsi_debug] __x64_sys_delete_module+0x1c1/0x280 do_syscall_64+0x77/0x230 entry_SYSCALL_64_after_hwframe+0x49/0xbe
other info that might help us debug this:
Possible unsafe locking scenario:
CPU0 CPU1 ---- ---- lock(&shost->scan_mutex); lock(kn->count#202); lock(&shost->scan_mutex); lock(kn->count#202);
*** DEADLOCK ***
2 locks held by modprobe/6539: #0: 00000000efaf9298 (&dev->mutex){....}, at: device_release_driver_internal+0x68/0x360 #1: 00000000a6ec2c69 (&shost->scan_mutex){+.+.}, at: scsi_remove_host+0x21/0x150 [scsi_mod]
stack backtrace: CPU: 10 PID: 6539 Comm: modprobe Not tainted 4.18.0-rc2-dbg+ #5 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.0.0-prebuilt.qemu-project.org 04/01/2014 Call Trace: dump_stack+0xa4/0xf5 print_circular_bug.isra.34+0x213/0x221 __lock_acquire+0x1a7e/0x1b50 lock_acquire+0xd2/0x260 __kernfs_remove+0x424/0x4a0 kernfs_remove_by_name_ns+0x45/0x90 remove_files.isra.1+0x3a/0x90 sysfs_remove_group+0x5c/0xc0 sysfs_remove_groups+0x39/0x60 device_remove_attrs+0x82/0xb0 device_del+0x251/0x580 __scsi_remove_device+0x19f/0x1d0 [scsi_mod] scsi_forget_host+0x37/0xb0 [scsi_mod] scsi_remove_host+0x9b/0x150 [scsi_mod] sdebug_driver_remove+0x4b/0x150 [scsi_debug] device_release_driver_internal+0x241/0x360 device_release_driver+0x12/0x20 bus_remove_device+0x1bc/0x290 device_del+0x259/0x580 device_unregister+0x1a/0x70 sdebug_remove_adapter+0x8b/0xf0 [scsi_debug] scsi_debug_exit+0x76/0xe8 [scsi_debug] __x64_sys_delete_module+0x1c1/0x280 do_syscall_64+0x77/0x230 entry_SYSCALL_64_after_hwframe+0x49/0xbe
See also https://www.mail-archive.com/linux-scsi@vger.kernel.org/msg54525.html.
Suggested-by: Eric W. Biederman ebiederm@xmission.com Fixes: ac0ece9174ac ("scsi: use device_remove_file_self() instead of device_schedule_callback()") Signed-off-by: Bart Van Assche bart.vanassche@wdc.com Cc: Eric W. Biederman ebiederm@xmission.com Cc: Tejun Heo tj@kernel.org Cc: Hannes Reinecke hare@suse.com Cc: Johannes Thumshirn jthumshirn@suse.de Cc: Ingo Molnar mingo@kernel.org Cc: Oleg Nesterov oleg@redhat.com Cc: stable@vger.kernel.org --- drivers/scsi/scsi_sysfs.c | 48 +++++++++++++++++++++++++++++++++++---- kernel/task_work.c | 1 + 2 files changed, 45 insertions(+), 4 deletions(-)
diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c index de122354d09a..c43f645900d4 100644 --- a/drivers/scsi/scsi_sysfs.c +++ b/drivers/scsi/scsi_sysfs.c @@ -12,6 +12,7 @@ #include <linux/blkdev.h> #include <linux/device.h> #include <linux/pm_runtime.h> +#include <linux/task_work.h>
#include <scsi/scsi.h> #include <scsi/scsi_device.h> @@ -718,14 +719,53 @@ store_rescan_field (struct device *dev, struct device_attribute *attr, } static DEVICE_ATTR(rescan, S_IWUSR, NULL, store_rescan_field);
+struct remove_dev_work { + struct callback_head head; + struct scsi_device *sdev; +}; + +static void delete_sdev(struct callback_head *head) +{ + struct remove_dev_work *work = container_of(head, typeof(*work), head); + struct scsi_device *sdev = work->sdev; + + scsi_remove_device(sdev); + kfree(work); + scsi_device_put(sdev); +} + static ssize_t sdev_store_delete(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - if (device_remove_file_self(dev, attr)) - scsi_remove_device(to_scsi_device(dev)); - return count; -}; + struct scsi_device *sdev = to_scsi_device(dev); + struct remove_dev_work *work; + int ret; + + ret = scsi_device_get(sdev); + if (ret < 0) + goto out; + ret = -ENOMEM; + work = kmalloc(sizeof(*work), GFP_KERNEL); + if (!work) + goto put; + work->head.func = delete_sdev; + work->sdev = sdev; + ret = task_work_add(current, &work->head, false); + if (ret < 0) + goto free; + ret = count; + +out: + return ret; + +free: + kfree(work); + +put: + scsi_device_put(sdev); + goto out; +} static DEVICE_ATTR(delete, S_IWUSR, NULL, sdev_store_delete);
static ssize_t diff --git a/kernel/task_work.c b/kernel/task_work.c index 0fef395662a6..75dc496b9997 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -40,6 +40,7 @@ task_work_add(struct task_struct *task, struct callback_head *work, bool notify) set_notify_resume(task); return 0; } +EXPORT_SYMBOL_GPL(task_work_add);
/** * task_work_cancel - cancel a pending work added by task_work_add()
From my limited insight into this:
Looks good, Reviewed-by: Johannes Thumshirn jthumshirn@suse.de
Bart Van Assche bart.vanassche@wdc.com 于2018年7月25日周三 下午7:39写道:
This patch avoids that self-removal triggers the following deadlock:
====================================================== WARNING: possible circular locking dependency detected 4.18.0-rc2-dbg+ #5 Not tainted
modprobe/6539 is trying to acquire lock: 000000008323c4cd (kn->count#202){++++}, at: kernfs_remove_by_name_ns+0x45/0x90
but task is already holding lock: 00000000a6ec2c69 (&shost->scan_mutex){+.+.}, at: scsi_remove_host+0x21/0x150 [scsi_mod]
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #1 (&shost->scan_mutex){+.+.}: __mutex_lock+0xfe/0xc70 mutex_lock_nested+0x1b/0x20 scsi_remove_device+0x26/0x40 [scsi_mod] sdev_store_delete+0x27/0x30 [scsi_mod] dev_attr_store+0x3e/0x50 sysfs_kf_write+0x87/0xa0 kernfs_fop_write+0x190/0x230 __vfs_write+0xd2/0x3b0 vfs_write+0x101/0x270 ksys_write+0xab/0x120 __x64_sys_write+0x43/0x50 do_syscall_64+0x77/0x230 entry_SYSCALL_64_after_hwframe+0x49/0xbe
-> #0 (kn->count#202){++++}: lock_acquire+0xd2/0x260 __kernfs_remove+0x424/0x4a0 kernfs_remove_by_name_ns+0x45/0x90 remove_files.isra.1+0x3a/0x90 sysfs_remove_group+0x5c/0xc0 sysfs_remove_groups+0x39/0x60 device_remove_attrs+0x82/0xb0 device_del+0x251/0x580 __scsi_remove_device+0x19f/0x1d0 [scsi_mod] scsi_forget_host+0x37/0xb0 [scsi_mod] scsi_remove_host+0x9b/0x150 [scsi_mod] sdebug_driver_remove+0x4b/0x150 [scsi_debug] device_release_driver_internal+0x241/0x360 device_release_driver+0x12/0x20 bus_remove_device+0x1bc/0x290 device_del+0x259/0x580 device_unregister+0x1a/0x70 sdebug_remove_adapter+0x8b/0xf0 [scsi_debug] scsi_debug_exit+0x76/0xe8 [scsi_debug] __x64_sys_delete_module+0x1c1/0x280 do_syscall_64+0x77/0x230 entry_SYSCALL_64_after_hwframe+0x49/0xbe
other info that might help us debug this:
Possible unsafe locking scenario:
CPU0 CPU1 ---- ----
lock(&shost->scan_mutex); lock(kn->count#202); lock(&shost->scan_mutex); lock(kn->count#202);
*** DEADLOCK ***
2 locks held by modprobe/6539: #0: 00000000efaf9298 (&dev->mutex){....}, at: device_release_driver_internal+0x68/0x360 #1: 00000000a6ec2c69 (&shost->scan_mutex){+.+.}, at: scsi_remove_host+0x21/0x150 [scsi_mod]
stack backtrace: CPU: 10 PID: 6539 Comm: modprobe Not tainted 4.18.0-rc2-dbg+ #5 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.0.0-prebuilt.qemu-project.org 04/01/2014 Call Trace: dump_stack+0xa4/0xf5 print_circular_bug.isra.34+0x213/0x221 __lock_acquire+0x1a7e/0x1b50 lock_acquire+0xd2/0x260 __kernfs_remove+0x424/0x4a0 kernfs_remove_by_name_ns+0x45/0x90 remove_files.isra.1+0x3a/0x90 sysfs_remove_group+0x5c/0xc0 sysfs_remove_groups+0x39/0x60 device_remove_attrs+0x82/0xb0 device_del+0x251/0x580 __scsi_remove_device+0x19f/0x1d0 [scsi_mod] scsi_forget_host+0x37/0xb0 [scsi_mod] scsi_remove_host+0x9b/0x150 [scsi_mod] sdebug_driver_remove+0x4b/0x150 [scsi_debug] device_release_driver_internal+0x241/0x360 device_release_driver+0x12/0x20 bus_remove_device+0x1bc/0x290 device_del+0x259/0x580 device_unregister+0x1a/0x70 sdebug_remove_adapter+0x8b/0xf0 [scsi_debug] scsi_debug_exit+0x76/0xe8 [scsi_debug] __x64_sys_delete_module+0x1c1/0x280 do_syscall_64+0x77/0x230 entry_SYSCALL_64_after_hwframe+0x49/0xbe
See also https://www.mail-archive.com/linux-scsi@vger.kernel.org/msg54525.html.
Suggested-by: Eric W. Biederman ebiederm@xmission.com Fixes: ac0ece9174ac ("scsi: use device_remove_file_self() instead of device_schedule_callback()") Signed-off-by: Bart Van Assche bart.vanassche@wdc.com Cc: Eric W. Biederman ebiederm@xmission.com Cc: Tejun Heo tj@kernel.org Cc: Hannes Reinecke hare@suse.com Cc: Johannes Thumshirn jthumshirn@suse.de Cc: Ingo Molnar mingo@kernel.org Cc: Oleg Nesterov oleg@redhat.com Cc: stable@vger.kernel.org
Looks good to me! Reviewed-by: Jack Wang jinpu.wang@profitbricks.com
Hello,
ISTR giving the same feedback before.
On Wed, Jul 25, 2018 at 10:38:28AM -0700, Bart Van Assche wrote:
+struct remove_dev_work {
- struct callback_head head;
- struct scsi_device *sdev;
+};
+static void delete_sdev(struct callback_head *head) +{
- struct remove_dev_work *work = container_of(head, typeof(*work), head);
- struct scsi_device *sdev = work->sdev;
- scsi_remove_device(sdev);
- kfree(work);
- scsi_device_put(sdev);
+}
static ssize_t sdev_store_delete(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) {
- if (device_remove_file_self(dev, attr))
scsi_remove_device(to_scsi_device(dev));
- return count;
-};
- struct scsi_device *sdev = to_scsi_device(dev);
- struct remove_dev_work *work;
- int ret;
- ret = scsi_device_get(sdev);
- if (ret < 0)
goto out;
- ret = -ENOMEM;
- work = kmalloc(sizeof(*work), GFP_KERNEL);
- if (!work)
goto put;
- work->head.func = delete_sdev;
- work->sdev = sdev;
- ret = task_work_add(current, &work->head, false);
- if (ret < 0)
goto free;
- ret = count;
+out:
- return ret;
+free:
- kfree(work);
+put:
- scsi_device_put(sdev);
- goto out;
+}
Making removal asynchronous this way sometimes causes issues because whether the user sees the device released or not is racy. kernfs/sysfs have mechanisms to deal with these cases - remove_self and kernfs_break_active_protection(). Have you looked at those?
Thanks.
On Thu, 2018-07-26 at 06:35 -0700, Tejun Heo wrote:
Making removal asynchronous this way sometimes causes issues because whether the user sees the device released or not is racy. kernfs/sysfs have mechanisms to deal with these cases - remove_self and kernfs_break_active_protection(). Have you looked at those?
Hello Tejun,
The call stack in the patch description shows that sdev_store_delete() is involved in the deadlock. The implementation of that function is as follows:
static ssize_t sdev_store_delete(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { if (device_remove_file_self(dev, attr)) scsi_remove_device(to_scsi_device(dev)); return count; };
device_remove_file_self() calls sysfs_remove_file_self() and that last function calls kernfs_remove_self(). In other words, kernfs_remove_self() is already being used. Please let me know if I misunderstood your comment.
Thanks,
Bart.
Hello,
On Thu, Jul 26, 2018 at 02:09:41PM +0000, Bart Van Assche wrote:
On Thu, 2018-07-26 at 06:35 -0700, Tejun Heo wrote:
Making removal asynchronous this way sometimes causes issues because whether the user sees the device released or not is racy. kernfs/sysfs have mechanisms to deal with these cases - remove_self and kernfs_break_active_protection(). Have you looked at those?
Hello Tejun,
The call stack in the patch description shows that sdev_store_delete() is involved in the deadlock. The implementation of that function is as follows:
static ssize_t sdev_store_delete(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { if (device_remove_file_self(dev, attr)) scsi_remove_device(to_scsi_device(dev)); return count; };
device_remove_file_self() calls sysfs_remove_file_self() and that last function calls kernfs_remove_self(). In other words, kernfs_remove_self() is already being used. Please let me know if I misunderstood your comment.
So, here, because scsi_remove_device() is the one involved in the circular dependency, just breaking the dependency chain on the file itself (self removal) isn't enough. You can wrap the whole operation with kernfs_break_active_protection() to also move scsi_remove_device() invocation outside the kernfs synchronization. This will need to be piped through sysfs but shouldn't be too complex.
Thanks.
On Thu, 2018-07-26 at 07:14 -0700, tj@kernel.org wrote:
Hello,
On Thu, Jul 26, 2018 at 02:09:41PM +0000, Bart Van Assche wrote:
On Thu, 2018-07-26 at 06:35 -0700, Tejun Heo wrote:
Making removal asynchronous this way sometimes causes issues because whether the user sees the device released or not is racy. kernfs/sysfs have mechanisms to deal with these cases - remove_self and kernfs_break_active_protection(). Have you looked at those?
Hello Tejun,
The call stack in the patch description shows that sdev_store_delete() is involved in the deadlock. The implementation of that function is as follows:
static ssize_t sdev_store_delete(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { if (device_remove_file_self(dev, attr)) scsi_remove_device(to_scsi_device(dev)); return count; };
device_remove_file_self() calls sysfs_remove_file_self() and that last function calls kernfs_remove_self(). In other words, kernfs_remove_self() is already being used. Please let me know if I misunderstood your comment.
So, here, because scsi_remove_device() is the one involved in the circular dependency, just breaking the dependency chain on the file itself (self removal) isn't enough. You can wrap the whole operation with kernfs_break_active_protection() to also move scsi_remove_device() invocation outside the kernfs synchronization. This will need to be piped through sysfs but shouldn't be too complex.
Hello Tejun,
I have tried to implement the above but my implementation triggered a kernel warning. Can you have a look at the attached patches and see what I did wrong?
Thanks,
Bart.
The kernel warning I ran into is as follows:
kernfs_put: 6:0:0:0/delete: released with incorrect active_ref 2147483647 WARNING: CPU: 6 PID: 1014 at fs/kernfs/dir.c:527 kernfs_put+0x136/0x180 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.0.0-prebuilt.qemu-project.org 04/01/2014 RIP: 0010:kernfs_put+0x136/0x180 Call Trace: evict+0xc1/0x190 __dentry_kill+0xc4/0x150 shrink_dentry_list+0x9e/0x1c0 shrink_dcache_parent+0x63/0x70 d_invalidate+0x42/0xb0 lookup_fast+0x278/0x2a0 walk_component+0x38/0x450 link_path_walk+0x2a8/0x4f0 path_openat+0x89/0x13a0 do_filp_open+0x8c/0xf0 do_sys_open+0x1a6/0x230 do_syscall_64+0x4f/0x110 entry_SYSCALL_64_after_hwframe+0x44/0xa9
Hello, Bart.
On Thu, Jul 26, 2018 at 09:57:40PM +0000, Bart Van Assche wrote: ...
@@ -440,11 +445,21 @@ bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr) return false; ret = kernfs_remove_self(kn);
- if (ret && cb) {
kernfs_break_active_protection(kn);
cb(kobj, attr, data);
kernfs_break_active_protection(kn);
unbreak?
Also, wouldn't it be better to just expose sysfs_break/unbreak and then do sth like the following from scsi?
kobject_get(); sysfs_break_active_protection(); do normal sysfs removal; sysfs_unbreak..(); kobject_put();
Thanks.
On Mon, 2018-07-30 at 07:13 -0700, tj@kernel.org wrote:
Also, wouldn't it be better to just expose sysfs_break/unbreak and then do sth like the following from scsi?
kobject_get();
sysfs_break_active_protection(); do normal sysfs removal; sysfs_unbreak..(); kobject_put();
Hello Tejun,
It's not clear to me how the sysfs_break_active_protection() should obtain the struct kernfs_node pointer to the attribute. Calling that function before device_remove_file_self() causes a double call to kernfs_break_active_protection(), which is wrong. Calling kernfs_find_and_get(kobj->sd, attr->name) after the attribute has been removed results in a NULL pointer because the attribute that that call tries to look up has already been removed. Should I proceed with the approach proposed in the patches attached to a previous e-mail?
Thanks,
Bart.
Hello, Bart.
On Mon, Jul 30, 2018 at 05:28:11PM +0000, Bart Van Assche wrote:
It's not clear to me how the sysfs_break_active_protection() should obtain the struct kernfs_node pointer to the attribute. Calling that function before device_remove_file_self() causes a double call to kernfs_break_active_protection(), which is wrong. Calling kernfs_find_and_get(kobj->sd, attr->name) after the
So, if you braek active protection explicitly, there's no need to call remove_self(). It can just use regular remove.
attribute has been removed results in a NULL pointer because the attribute that that call tries to look up has already been removed. Should I proceed with the approach proposed in the patches attached to a previous e-mail?
I don't have an objection for that.
Thanks.
On Mon, 2018-07-30 at 10:31 -0700, tj@kernel.org wrote:
On Mon, Jul 30, 2018 at 05:28:11PM +0000, Bart Van Assche wrote:
It's not clear to me how the sysfs_break_active_protection() should obtain the struct kernfs_node pointer to the attribute. Calling that function before device_remove_file_self() causes a double call to kernfs_break_active_protection(), which is wrong. Calling kernfs_find_and_get(kobj->sd, attr->name) after the
So, if you braek active protection explicitly, there's no need to call remove_self(). It can just use regular remove.
But how to avoid that scsi_remove_device(to_scsi_device(dev)) gets called multiple times when using device_remove_self() and in case of concurrent writes into the SCSI device "delete" sysfs attribute?
Thanks,
Bart.
Hello,
On Mon, Jul 30, 2018 at 05:50:02PM +0000, Bart Van Assche wrote:
On Mon, 2018-07-30 at 10:31 -0700, tj@kernel.org wrote:
On Mon, Jul 30, 2018 at 05:28:11PM +0000, Bart Van Assche wrote:
It's not clear to me how the sysfs_break_active_protection() should obtain the struct kernfs_node pointer to the attribute. Calling that function before device_remove_file_self() causes a double call to kernfs_break_active_protection(), which is wrong. Calling kernfs_find_and_get(kobj->sd, attr->name) after the
So, if you braek active protection explicitly, there's no need to call remove_self(). It can just use regular remove.
But how to avoid that scsi_remove_device(to_scsi_device(dev)) gets called multiple times when using device_remove_self() and in case of concurrent writes into the SCSI device "delete" sysfs attribute?
So, scsi_remove_device() internally protects using scan_mutex and if the whole thing is wrapped with break_active_prot, I don't think you need to call remove_file_self at all, right?
Thanks.
On Thu, 2018-07-26 at 06:35 -0700, Tejun Heo wrote:
Making removal asynchronous this way sometimes causes issues because whether the user sees the device released or not is racy.
Hello Tejun,
How could this change cause any user-visible changes? My understanding is that any work queued with task_work_add() is executed before system call execution leaves kernel context and returns back to user space context.
Thanks,
Bart.
On Sun, Jul 29, 2018 at 04:03:41AM +0000, Bart Van Assche wrote:
On Thu, 2018-07-26 at 06:35 -0700, Tejun Heo wrote:
Making removal asynchronous this way sometimes causes issues because whether the user sees the device released or not is racy.
Hello Tejun,
How could this change cause any user-visible changes? My understanding is that any work queued with task_work_add() is executed before system call execution leaves kernel context and returns back to user space context.
Ah, you're right. This isn't workqueue. Hmm... yeah, needing to allocate sth in removal path is a bit icky but, it should be okay I guess. I *think* the kernfs active break/unbreak is likely gonna be cleaner tho.
Thanks.
linux-stable-mirror@lists.linaro.org