Rebased on linux-5.10.y, with stable tags added. The first patch was dropped before; most of the others depend on it to make things right.
Pavel Begunkov (11):
  kernel/io_uring: cancel io_uring before task works
  io_uring: inline io_uring_attempt_task_drop()
  io_uring: add warn_once for io_uring_flush()
  io_uring: stop SQPOLL submit on creator's death
  io_uring: fix null-deref in io_disable_sqo_submit
  io_uring: do sqo disable on install_fd error
  io_uring: fix false positive sqo warning on flush
  io_uring: fix uring_flush in exit_files() warning
  io_uring: fix skipping disabling sqo on exec
  io_uring: dont kill fasync under completion_lock
  io_uring: fix sleeping under spin in __io_clean_op
 fs/file.c     |   2 -
 fs/io_uring.c | 119 +++++++++++++++++++++++++++++++++++---------------
 kernel/exit.c |   2 +
 3 files changed, 86 insertions(+), 37 deletions(-)
[ Upstream commit b1b6b5a30dce872f500dc43f067cba8e7f86fc7d ]
For cancelling io_uring requests we need either to be able to run the currently enqueued task_works, or to have task_work shut down by that moment. Otherwise io_uring_cancel_files() may be left waiting for requests that won't ever complete.

Go with the first way and do cancellations before setting PF_EXITING, and so before putting the task_work infrastructure into a transition state where task_work_run() had better not be called.
Cc: stable@vger.kernel.org # 5.5+
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/file.c     | 2 --
 kernel/exit.c | 2 ++
 2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/fs/file.c b/fs/file.c
index 4559b5fec3bd..21c0893f2f1d 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -21,7 +21,6 @@
 #include <linux/rcupdate.h>
 #include <linux/close_range.h>
 #include <net/sock.h>
-#include <linux/io_uring.h>
 
 unsigned int sysctl_nr_open __read_mostly = 1024*1024;
 unsigned int sysctl_nr_open_min = BITS_PER_LONG;
@@ -453,7 +452,6 @@ void exit_files(struct task_struct *tsk)
        struct files_struct * files = tsk->files;
 
        if (files) {
-               io_uring_files_cancel(files);
                task_lock(tsk);
                tsk->files = NULL;
                task_unlock(tsk);
diff --git a/kernel/exit.c b/kernel/exit.c
index 1f236ed375f8..d13d67fc5f4e 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -63,6 +63,7 @@
 #include <linux/random.h>
 #include <linux/rcuwait.h>
 #include <linux/compat.h>
+#include <linux/io_uring.h>
 
 #include <linux/uaccess.h>
 #include <asm/unistd.h>
@@ -762,6 +763,7 @@ void __noreturn do_exit(long code)
                schedule();
        }
 
+       io_uring_files_cancel(tsk->files);
        exit_signals(tsk);              /* sets PF_EXITING */
 
        /* sync mm's RSS info before statistics gathering */
[ Upstream commit 4f793dc40bc605b97624fd36baf085b3c35e8bfd ]
A simple preparation change inlining io_uring_attempt_task_drop() into io_uring_flush().
Cc: stable@vger.kernel.org # 5.5+
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 29 +++++++++++------------------
 1 file changed, 11 insertions(+), 18 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 265aea2cd7bc..6c89d38076d0 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -8804,23 +8804,6 @@ static void io_uring_del_task_file(struct file *file)
                fput(file);
 }
 
-/*
- * Drop task note for this file if we're the only ones that hold it after
- * pending fput()
- */
-static void io_uring_attempt_task_drop(struct file *file)
-{
-       if (!current->io_uring)
-               return;
-       /*
-        * fput() is pending, will be 2 if the only other ref is our potential
-        * task file note. If the task is exiting, drop regardless of count.
-        */
-       if (fatal_signal_pending(current) || (current->flags & PF_EXITING) ||
-           atomic_long_read(&file->f_count) == 2)
-               io_uring_del_task_file(file);
-}
-
 static void io_uring_remove_task_files(struct io_uring_task *tctx)
 {
        struct file *file;
@@ -8912,7 +8895,17 @@ void __io_uring_task_cancel(void)
 
 static int io_uring_flush(struct file *file, void *data)
 {
-       io_uring_attempt_task_drop(file);
+       if (!current->io_uring)
+               return 0;
+
+       /*
+        * fput() is pending, will be 2 if the only other ref is our potential
+        * task file note. If the task is exiting, drop regardless of count.
+        */
+       if (fatal_signal_pending(current) || (current->flags & PF_EXITING) ||
+           atomic_long_read(&file->f_count) == 2)
+               io_uring_del_task_file(file);
+
        return 0;
 }
[ Upstream commit 6b5733eb638b7068ab7cb34e663b55a1d1892d85 ]
files_cancel() should cancel all relevant requests and drop file notes, so we should never have file notes after that, including on-exit fput and flush. Add a WARN_ONCE to be sure.
Cc: stable@vger.kernel.org # 5.5+
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 6c89d38076d0..4dfba3d44a3c 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -8895,17 +8895,23 @@ void __io_uring_task_cancel(void)
 
 static int io_uring_flush(struct file *file, void *data)
 {
-       if (!current->io_uring)
+       struct io_uring_task *tctx = current->io_uring;
+
+       if (!tctx)
                return 0;
 
+       /* we should have cancelled and erased it before PF_EXITING */
+       WARN_ON_ONCE((current->flags & PF_EXITING) &&
+                    xa_load(&tctx->xa, (unsigned long)file));
+
        /*
         * fput() is pending, will be 2 if the only other ref is our potential
         * task file note. If the task is exiting, drop regardless of count.
         */
-       if (fatal_signal_pending(current) || (current->flags & PF_EXITING) ||
-           atomic_long_read(&file->f_count) == 2)
-               io_uring_del_task_file(file);
+       if (atomic_long_read(&file->f_count) != 2)
+               return 0;
 
+       io_uring_del_task_file(file);
        return 0;
 }
[ Upstream commit d9d05217cb6990b9a56e13b56e7a1b71e2551f6c ]
When the creator of an SQPOLL io_uring dies (i.e. sqo_task), we don't want its internals like ->files and ->mm to be poked by the SQPOLL task; it has never been nice and recently got racy. That can happen when the owner undergoes destruction while the SQPOLL task tries to submit new requests in parallel, and so calls io_sq_thread_acquire*().

This patch halts SQPOLL submissions when sqo_task dies by introducing an sqo_dead flag. Once set, the SQPOLL task must not do any submission, which is synchronised by uring_lock as well as the new flag.

The tricky part is making sure that disabling always happens: either the ring is discovered by the creator's do_exit() -> cancel path, or, if the final close() happens before that, disabling is done by the creator there. The latter is guaranteed by the fact that for SQPOLL the creator task, and only it, holds exactly one file note, so either that note pins the ring up to do_exit() or it is removed by the creator on the final put in flush (see the comments in io_uring_flush() around file->f_count == 2).
One more place that can trigger io_sq_thread_acquire_*() is __io_req_task_submit(). Shoot off requests on sqo_dead there, even though actually we don't need to. That's because cancellation of sqo_task should wait for the request before going any further.
note 1: io_disable_sqo_submit() does io_ring_set_wakeup_flag() so the caller would enter the ring to get an error, but it still doesn't guarantee that the flag won't be cleared.
note 2: if final __userspace__ close happens not from the creator task, the file note will pin the ring until the task dies.
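For illustration only, not part of the patch: a rough sketch of how a submitter would observe the new behaviour from userspace, assuming a ring created with IORING_SETUP_SQPOLL whose creator has since died. Ring setup and most error handling are omitted; names here are placeholders.

#define _GNU_SOURCE
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

/* enter an SQPOLL ring; once the creator is dead this now fails fast */
static int enter_sqpoll_ring(int ring_fd, unsigned int to_submit)
{
        int ret = syscall(__NR_io_uring_enter, ring_fd, to_submit, 0,
                          IORING_ENTER_SQ_WAIT, NULL, 0);

        if (ret < 0 && errno == EOWNERDEAD)
                fprintf(stderr, "SQPOLL creator died, ring unusable: %s\n",
                        strerror(errno));
        return ret;
}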
Cc: stable@vger.kernel.org # 5.5+
Fixes: b1b6b5a30dce8 ("kernel/io_uring: cancel io_uring before task works")
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 58 ++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 50 insertions(+), 8 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 4dfba3d44a3c..723e1eb5349a 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -260,6 +260,7 @@ struct io_ring_ctx {
                unsigned int            drain_next: 1;
                unsigned int            eventfd_async: 1;
                unsigned int            restricted: 1;
+               unsigned int            sqo_dead: 1;
 
                /*
                 * Ring buffer of indices into array of io_uring_sqe, which is
@@ -2063,11 +2064,9 @@ static void io_req_task_cancel(struct callback_head *cb)
 static void __io_req_task_submit(struct io_kiocb *req)
 {
        struct io_ring_ctx *ctx = req->ctx;
-       bool fail;
 
-       fail = __io_sq_thread_acquire_mm(ctx);
        mutex_lock(&ctx->uring_lock);
-       if (!fail)
+       if (!ctx->sqo_dead && !__io_sq_thread_acquire_mm(ctx))
                __io_queue_sqe(req, NULL);
        else
                __io_req_task_cancel(req, -EFAULT);
@@ -6765,7 +6764,7 @@ static enum sq_ret __io_sq_thread(struct io_ring_ctx *ctx,
                to_submit = 8;
 
        mutex_lock(&ctx->uring_lock);
-       if (likely(!percpu_ref_is_dying(&ctx->refs)))
+       if (likely(!percpu_ref_is_dying(&ctx->refs) && !ctx->sqo_dead))
                ret = io_submit_sqes(ctx, to_submit);
        mutex_unlock(&ctx->uring_lock);
 
@@ -8456,6 +8455,10 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
        mutex_lock(&ctx->uring_lock);
        percpu_ref_kill(&ctx->refs);
        /* if force is set, the ring is going away. always drop after that */
+
+       if (WARN_ON_ONCE((ctx->flags & IORING_SETUP_SQPOLL) && !ctx->sqo_dead))
+               ctx->sqo_dead = 1;
+
        ctx->cq_overflow_flushed = 1;
        if (ctx->rings)
                __io_cqring_overflow_flush(ctx, true, NULL, NULL);
@@ -8714,6 +8717,18 @@ static bool __io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
        return ret;
 }
 
+static void io_disable_sqo_submit(struct io_ring_ctx *ctx)
+{
+       WARN_ON_ONCE(ctx->sqo_task != current);
+
+       mutex_lock(&ctx->uring_lock);
+       ctx->sqo_dead = 1;
+       mutex_unlock(&ctx->uring_lock);
+
+       /* make sure callers enter the ring to get error */
+       io_ring_set_wakeup_flag(ctx);
+}
+
 /*
  * We need to iteratively cancel requests, in case a request has dependent
  * hard links. These persist even for failure of cancelations, hence keep
@@ -8725,6 +8740,8 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
        struct task_struct *task = current;
 
        if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) {
+               /* for SQPOLL only sqo_task has task notes */
+               io_disable_sqo_submit(ctx);
                task = ctx->sq_data->thread;
                atomic_inc(&task->io_uring->in_idle);
                io_sq_thread_park(ctx->sq_data);
@@ -8896,6 +8913,7 @@ void __io_uring_task_cancel(void)
 static int io_uring_flush(struct file *file, void *data)
 {
        struct io_uring_task *tctx = current->io_uring;
+       struct io_ring_ctx *ctx = file->private_data;
 
        if (!tctx)
                return 0;
@@ -8911,7 +8929,16 @@ static int io_uring_flush(struct file *file, void *data)
        if (atomic_long_read(&file->f_count) != 2)
                return 0;
 
-       io_uring_del_task_file(file);
+       if (ctx->flags & IORING_SETUP_SQPOLL) {
+               /* there is only one file note, which is owned by sqo_task */
+               WARN_ON_ONCE((ctx->sqo_task == current) ==
+                            !xa_load(&tctx->xa, (unsigned long)file));
+
+               io_disable_sqo_submit(ctx);
+       }
+
+       if (!(ctx->flags & IORING_SETUP_SQPOLL) || ctx->sqo_task == current)
+               io_uring_del_task_file(file);
        return 0;
 }
 
@@ -8985,8 +9012,9 @@ static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
 
 #endif /* !CONFIG_MMU */
 
-static void io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
+static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
 {
+       int ret = 0;
        DEFINE_WAIT(wait);
 
        do {
@@ -8995,6 +9023,11 @@ static void io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
 
                prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE);
 
+               if (unlikely(ctx->sqo_dead)) {
+                       ret = -EOWNERDEAD;
+                       goto out;
+               }
+
                if (!io_sqring_full(ctx))
                        break;
 
@@ -9002,6 +9035,8 @@ static void io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
        } while (!signal_pending(current));
 
        finish_wait(&ctx->sqo_sq_wait, &wait);
+out:
+       return ret;
 }
 
 SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
@@ -9045,10 +9080,16 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
        if (ctx->flags & IORING_SETUP_SQPOLL) {
                io_cqring_overflow_flush(ctx, false, NULL, NULL);
 
+               ret = -EOWNERDEAD;
+               if (unlikely(ctx->sqo_dead))
+                       goto out;
                if (flags & IORING_ENTER_SQ_WAKEUP)
                        wake_up(&ctx->sq_data->wait);
-               if (flags & IORING_ENTER_SQ_WAIT)
-                       io_sqpoll_wait_sq(ctx);
+               if (flags & IORING_ENTER_SQ_WAIT) {
+                       ret = io_sqpoll_wait_sq(ctx);
+                       if (ret)
+                               goto out;
+               }
                submitted = to_submit;
        } else if (to_submit) {
                ret = io_uring_add_task_file(ctx, f.file);
@@ -9467,6 +9508,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
        trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
        return ret;
 err:
+       io_disable_sqo_submit(ctx);
        io_ring_ctx_wait_and_kill(ctx);
        return ret;
 }
[ Upstream commit b4411616c26f26c4017b8fa4d3538b1a02028733 ]
general protection fault, probably for non-canonical address
        0xdffffc0000000022: 0000 [#1]
KASAN: null-ptr-deref in range [0x0000000000000110-0x0000000000000117]
RIP: 0010:io_ring_set_wakeup_flag fs/io_uring.c:6929 [inline]
RIP: 0010:io_disable_sqo_submit+0xdb/0x130 fs/io_uring.c:8891
Call Trace:
 io_uring_create fs/io_uring.c:9711 [inline]
 io_uring_setup+0x12b1/0x38e0 fs/io_uring.c:9739
 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46
 entry_SYSCALL_64_after_hwframe+0x44/0xa9
io_disable_sqo_submit() might be called before the user rings were allocated; don't do io_ring_set_wakeup_flag() in that case.
Cc: stable@vger.kernel.org # 5.5+
Reported-by: syzbot+ab412638aeb652ded540@syzkaller.appspotmail.com
Fixes: d9d05217cb69 ("io_uring: stop SQPOLL submit on creator's death")
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 723e1eb5349a..f1f1de815755 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -8726,7 +8726,8 @@ static void io_disable_sqo_submit(struct io_ring_ctx *ctx)
        mutex_unlock(&ctx->uring_lock);
 
        /* make sure callers enter the ring to get error */
-       io_ring_set_wakeup_flag(ctx);
+       if (ctx->rings)
+               io_ring_set_wakeup_flag(ctx);
 }
 
 /*
[ Upstream commit 06585c497b55045ec21aa8128e340f6a6587351c ]
WARNING: CPU: 0 PID: 8494 at fs/io_uring.c:8717
        io_ring_ctx_wait_and_kill+0x4f2/0x600 fs/io_uring.c:8717
Call Trace:
 io_uring_release+0x3e/0x50 fs/io_uring.c:8759
 __fput+0x283/0x920 fs/file_table.c:280
 task_work_run+0xdd/0x190 kernel/task_work.c:140
 tracehook_notify_resume include/linux/tracehook.h:189 [inline]
 exit_to_user_mode_loop kernel/entry/common.c:174 [inline]
 exit_to_user_mode_prepare+0x249/0x250 kernel/entry/common.c:201
 __syscall_exit_to_user_mode_work kernel/entry/common.c:291 [inline]
 syscall_exit_to_user_mode+0x19/0x50 kernel/entry/common.c:302
 entry_SYSCALL_64_after_hwframe+0x44/0xa9
A failed io_uring_install_fd() is a special case: we don't do io_ring_ctx_wait_and_kill() directly but defer it to fput(), though we still need to call io_disable_sqo_submit() beforehand.

Note: this doesn't fix any real problem, just a warning. That's because the SQ ring won't be available to userspace in this case, so SQPOLL won't submit anything anyway.
Cc: stable@vger.kernel.org # 5.5+
Reported-by: syzbot+9c9c35374c0ecac06516@syzkaller.appspotmail.com
Fixes: d9d05217cb69 ("io_uring: stop SQPOLL submit on creator's death")
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 1 +
 1 file changed, 1 insertion(+)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index f1f1de815755..2acea64656f3 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -9501,6 +9501,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
         */
        ret = io_uring_install_fd(ctx, file);
        if (ret < 0) {
+               io_disable_sqo_submit(ctx);
                /* fput will clean it up */
                fput(file);
                return ret;
[ Upstream commit 6b393a1ff1746a1c91bd95cbb2d79b104d8f15ac ]
WARNING: CPU: 1 PID: 9094 at fs/io_uring.c:8884
        io_disable_sqo_submit+0x106/0x130 fs/io_uring.c:8884
Call Trace:
 io_uring_flush+0x28b/0x3a0 fs/io_uring.c:9099
 filp_close+0xb4/0x170 fs/open.c:1280
 close_fd+0x5c/0x80 fs/file.c:626
 __do_sys_close fs/open.c:1299 [inline]
 __se_sys_close fs/open.c:1297 [inline]
 __x64_sys_close+0x2f/0xa0 fs/open.c:1297
 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46
 entry_SYSCALL_64_after_hwframe+0x44/0xa9
io_uring's final close() may be triggered by any task, not only the creator. That is well handled by io_uring_flush(), including the SQPOLL case, though a warning in io_disable_sqo_submit() will then fire fallaciously. Fix it by moving this warning out to the only call site that matters.
Cc: stable@vger.kernel.org # 5.5+
Reported-by: syzbot+2f5d1785dc624932da78@syzkaller.appspotmail.com
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 2acea64656f3..e8d0bea702a3 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -8719,8 +8719,6 @@ static bool __io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
 
 static void io_disable_sqo_submit(struct io_ring_ctx *ctx)
 {
-       WARN_ON_ONCE(ctx->sqo_task != current);
-
        mutex_lock(&ctx->uring_lock);
        ctx->sqo_dead = 1;
        mutex_unlock(&ctx->uring_lock);
@@ -8742,6 +8740,7 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
 
        if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) {
                /* for SQPOLL only sqo_task has task notes */
+               WARN_ON_ONCE(ctx->sqo_task != current);
                io_disable_sqo_submit(ctx);
                task = ctx->sq_data->thread;
                atomic_inc(&task->io_uring->in_idle);
[ Upstream commit 4325cb498cb743dacaa3edbec398c5255f476ef6 ]
WARNING: CPU: 1 PID: 11100 at fs/io_uring.c:9096
        io_uring_flush+0x326/0x3a0 fs/io_uring.c:9096
RIP: 0010:io_uring_flush+0x326/0x3a0 fs/io_uring.c:9096
Call Trace:
 filp_close+0xb4/0x170 fs/open.c:1280
 close_files fs/file.c:401 [inline]
 put_files_struct fs/file.c:416 [inline]
 put_files_struct+0x1cc/0x350 fs/file.c:413
 exit_files+0x7e/0xa0 fs/file.c:433
 do_exit+0xc22/0x2ae0 kernel/exit.c:820
 do_group_exit+0x125/0x310 kernel/exit.c:922
 get_signal+0x3e9/0x20a0 kernel/signal.c:2770
 arch_do_signal_or_restart+0x2a8/0x1eb0 arch/x86/kernel/signal.c:811
 handle_signal_work kernel/entry/common.c:147 [inline]
 exit_to_user_mode_loop kernel/entry/common.c:171 [inline]
 exit_to_user_mode_prepare+0x148/0x250 kernel/entry/common.c:201
 __syscall_exit_to_user_mode_work kernel/entry/common.c:291 [inline]
 syscall_exit_to_user_mode+0x19/0x50 kernel/entry/common.c:302
 entry_SYSCALL_64_after_hwframe+0x44/0xa9
An SQPOLL ring creator task may have gotten rid of its file note during exit and called io_disable_sqo_submit(), but the io_uring is still left referenced through the fdtable, which will be put during close_files() and trigger a false positive warning.

First, split the warning in two for more clarity about which case is hit, and then add an sqo_dead check to handle the described case.
Cc: stable@vger.kernel.org # 5.5+
Reported-by: syzbot+a32b546d58dde07875a1@syzkaller.appspotmail.com
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index e8d0bea702a3..12fa5e09cefa 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -8931,7 +8931,10 @@ static int io_uring_flush(struct file *file, void *data)
 
        if (ctx->flags & IORING_SETUP_SQPOLL) {
                /* there is only one file note, which is owned by sqo_task */
-               WARN_ON_ONCE((ctx->sqo_task == current) ==
+               WARN_ON_ONCE(ctx->sqo_task != current &&
+                            xa_load(&tctx->xa, (unsigned long)file));
+               /* sqo_dead check is for when this happens after cancellation */
+               WARN_ON_ONCE(ctx->sqo_task == current && !ctx->sqo_dead &&
                             !xa_load(&tctx->xa, (unsigned long)file));
 
                io_disable_sqo_submit(ctx);
[ Upstream commit 0b5cd6c32b14413bf87e10ee62be3162588dcbe6 ]
If there are no requests at the time __io_uring_task_cancel() is called, tctx_inflight() returns zero and it terminates without ever getting a chance to go through __io_uring_files_cancel() and do io_disable_sqo_submit(). And we absolutely want SQPOLL submission disabled by the time cancellation ends.
Cc: stable@vger.kernel.org # 5.5+
Reported-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Fixes: d9d05217cb69 ("io_uring: stop SQPOLL submit on creator's death")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 4 ++++
 1 file changed, 4 insertions(+)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 12fa5e09cefa..5ead8b6aeda2 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -8886,6 +8886,10 @@ void __io_uring_task_cancel(void)
        /* make sure overflow events are dropped */
        atomic_inc(&tctx->in_idle);
 
+       /* trigger io_disable_sqo_submit() */
+       if (tctx->sqpoll)
+               __io_uring_files_cancel(NULL);
+
        do {
                /* read completions before cancelations */
                inflight = tctx_inflight(tctx);
[ Upstream commit 4aa84f2ffa81f71e15e5cffc2cc6090dbee78f8e ]
CPU0                    CPU1
----                    ----
lock(&new->fa_lock);
                        local_irq_disable();
                        lock(&ctx->completion_lock);
                        lock(&new->fa_lock);
<Interrupt>
  lock(&ctx->completion_lock);

 *** DEADLOCK ***
Move kill_fasync() out of io_commit_cqring() to io_cqring_ev_posted(), so that completion_lock is not held while doing it. That saves us from the reported deadlock, and it's also nice to shorten the locking time and untangle nested locks (compl_lock -> wq_head::lock).
Cc: stable@vger.kernel.org # 5.5+
Reported-by: syzbot+91ca3f25bd7f795f019c@syzkaller.appspotmail.com
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 5ead8b6aeda2..1c5d71829bf5 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1213,11 +1213,6 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx)
 
        /* order cqe stores with ring update */
        smp_store_release(&rings->cq.tail, ctx->cached_cq_tail);
-
-       if (wq_has_sleeper(&ctx->cq_wait)) {
-               wake_up_interruptible(&ctx->cq_wait);
-               kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
-       }
 }
 
 static void io_put_identity(struct io_uring_task *tctx, struct io_kiocb *req)
@@ -1584,6 +1579,10 @@ static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
 
 static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
 {
+       if (wq_has_sleeper(&ctx->cq_wait)) {
+               wake_up_interruptible(&ctx->cq_wait);
+               kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
+       }
        if (waitqueue_active(&ctx->wait))
                wake_up(&ctx->wait);
        if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait))
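For context, not part of the patch: the fasync path being moved here is what raises SIGIO in userspace when completions are posted. A minimal sketch of arming it on a ring fd, assuming standard fcntl() semantics (error reporting trimmed):

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

/* request SIGIO on CQE posting; delivered via kill_fasync() above */
static int arm_ring_sigio(int ring_fd)
{
        int flags;

        /* route SIGIO for this fd to the calling process */
        if (fcntl(ring_fd, F_SETOWN, getpid()) < 0)
                return -1;
        flags = fcntl(ring_fd, F_GETFL);
        if (flags < 0)
                return -1;
        /* O_ASYNC enables async notification on this file */
        return fcntl(ring_fd, F_SETFL, flags | O_ASYNC);
}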
[ Upstream commit 9d5c8190683a462dbc787658467a0da17011ea5f ]
[ 27.629441] BUG: sleeping function called from invalid context at fs/file.c:402
[ 27.631317] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 1012, name: io_wqe_worker-0
[ 27.633220] 1 lock held by io_wqe_worker-0/1012:
[ 27.634286]  #0: ffff888105e26c98 (&ctx->completion_lock){....}-{2:2}, at: __io_req_complete.part.102+0x30/0x70
[ 27.649249] Call Trace:
[ 27.649874]  dump_stack+0xac/0xe3
[ 27.650666]  ___might_sleep+0x284/0x2c0
[ 27.651566]  put_files_struct+0xb8/0x120
[ 27.652481]  __io_clean_op+0x10c/0x2a0
[ 27.653362]  __io_cqring_fill_event+0x2c1/0x350
[ 27.654399]  __io_req_complete.part.102+0x41/0x70
[ 27.655464]  io_openat2+0x151/0x300
[ 27.656297]  io_issue_sqe+0x6c/0x14e0
[ 27.660991]  io_wq_submit_work+0x7f/0x240
[ 27.662890]  io_worker_handle_work+0x501/0x8a0
[ 27.664836]  io_wqe_worker+0x158/0x520
[ 27.667726]  kthread+0x134/0x180
[ 27.669641]  ret_from_fork+0x1f/0x30
Instead of cleaning files on overflow, return back overflow cancellation into io_uring_cancel_files(). Previously it was racy to clean REQ_F_OVERFLOW flag, but we got rid of it, and can do it through repetitive attempts targeting all matching requests.
Cc: stable@vger.kernel.org # 5.9+
Reported-by: Abaci <abaci@linux.alibaba.com>
Reported-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 1c5d71829bf5..f77821626a92 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -970,6 +970,7 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
 static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
                             const struct iovec *fast_iov,
                             struct iov_iter *iter, bool force);
+static void io_req_drop_files(struct io_kiocb *req);
 
 static struct kmem_cache *req_cachep;
 
@@ -990,8 +991,7 @@ EXPORT_SYMBOL(io_uring_get_socket);
 
 static inline void io_clean_op(struct io_kiocb *req)
 {
-       if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED |
-                         REQ_F_INFLIGHT))
+       if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED))
                __io_clean_op(req);
 }
 
@@ -1255,6 +1255,8 @@ static void io_req_clean_work(struct io_kiocb *req)
                        free_fs_struct(fs);
                req->work.flags &= ~IO_WQ_WORK_FS;
        }
+       if (req->flags & REQ_F_INFLIGHT)
+               io_req_drop_files(req);
 
        io_put_identity(req->task->io_uring, req);
 }
@@ -5929,9 +5931,6 @@ static void __io_clean_op(struct io_kiocb *req)
                }
                req->flags &= ~REQ_F_NEED_CLEANUP;
        }
-
-       if (req->flags & REQ_F_INFLIGHT)
-               io_req_drop_files(req);
 }
 
 static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock,
@@ -8669,6 +8668,8 @@ static bool io_uring_cancel_files(struct io_ring_ctx *ctx,
                        break;
                /* cancel this request, or head link requests */
                io_attempt_cancel(ctx, cancel_req);
+               io_cqring_overflow_flush(ctx, true, task, files);
+
                io_put_req(cancel_req);
                /* cancellations _may_ trigger task work */
                io_run_task_work();
On 1/26/21 4:16 AM, Pavel Begunkov wrote:
> Rebased on linux-5.10.y, with stable tags added. The first patch was
> dropped before; most of the others depend on it to make things right.
> 
> Pavel Begunkov (11):
>   kernel/io_uring: cancel io_uring before task works
>   io_uring: inline io_uring_attempt_task_drop()
>   io_uring: add warn_once for io_uring_flush()
>   io_uring: stop SQPOLL submit on creator's death
>   io_uring: fix null-deref in io_disable_sqo_submit
>   io_uring: do sqo disable on install_fd error
>   io_uring: fix false positive sqo warning on flush
>   io_uring: fix uring_flush in exit_files() warning
>   io_uring: fix skipping disabling sqo on exec
>   io_uring: dont kill fasync under completion_lock
>   io_uring: fix sleeping under spin in __io_clean_op
> 
>  fs/file.c     |   2 -
>  fs/io_uring.c | 119 +++++++++++++++++++++++++++++++++++---------------
>  kernel/exit.c |   2 +
>  3 files changed, 86 insertions(+), 37 deletions(-)
Thanks for doing this work, Pavel. We'll need a few followups once the current io_uring-5.11 has been merged to Linus's tree, but that's really minor compared to getting this series together.
Greg/Sasha, please apply.