5.15 backport of io_uring pollfree patches.
Jens Axboe (2): io_uring: remove poll entry from list when canceling all io_uring: bump poll refs to full 31-bits
Jiapeng Chong (1): io_uring: Remove unused function req_ref_put
Pavel Begunkov (10): io_uring: correct fill events helpers types io_uring: clean cqe filling functions io_uring: refactor poll update io_uring: move common poll bits io_uring: kill poll linking optimisation io_uring: inline io_poll_complete io_uring: poll rework io_uring: fail links when poll fails io_uring: fix wrong arm_poll error handling io_uring: fix UAF due to missing POLLFREE handling
fs/io_uring.c | 746 +++++++++++++++++++++++--------------------------- 1 file changed, 347 insertions(+), 399 deletions(-)
[ upstream commit 54daa9b2d80ab35824464b35a99f716e1cdf2ccb ]
CQE result is a 32-bit integer, so the functions generating CQEs are better to accept not long but ints. Convert io_cqring_fill_event() and other helpers.
Signed-off-by: Pavel Begunkov asml.silence@gmail.com Link: https://lore.kernel.org/r/7ca6f15255e9117eae28adcac272744cae29b113.163337330... Signed-off-by: Jens Axboe axboe@kernel.dk [pavel: backport] Signed-off-by: Pavel Begunkov asml.silence@gmail.com --- fs/io_uring.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c index 9bff14c5e2b2..b5718278ae61 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1080,7 +1080,7 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, - long res, unsigned int cflags); + s32 res, u32 cflags); static void io_put_req(struct io_kiocb *req); static void io_put_req_deferred(struct io_kiocb *req); static void io_dismantle_req(struct io_kiocb *req); @@ -1763,7 +1763,7 @@ static __cold void io_uring_drop_tctx_refs(struct task_struct *task) }
static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, - long res, unsigned int cflags) + s32 res, u32 cflags) { struct io_overflow_cqe *ocqe;
@@ -1791,7 +1791,7 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, }
static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, - long res, unsigned int cflags) + s32 res, u32 cflags) { struct io_uring_cqe *cqe;
@@ -1814,13 +1814,13 @@ static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data
/* not as hot to bloat with inlining */ static noinline bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, - long res, unsigned int cflags) + s32 res, u32 cflags) { return __io_cqring_fill_event(ctx, user_data, res, cflags); }
-static void io_req_complete_post(struct io_kiocb *req, long res, - unsigned int cflags) +static void io_req_complete_post(struct io_kiocb *req, s32 res, + u32 cflags) { struct io_ring_ctx *ctx = req->ctx;
@@ -1861,8 +1861,8 @@ static inline bool io_req_needs_clean(struct io_kiocb *req) return req->flags & IO_REQ_CLEAN_FLAGS; }
-static void io_req_complete_state(struct io_kiocb *req, long res, - unsigned int cflags) +static inline void io_req_complete_state(struct io_kiocb *req, s32 res, + u32 cflags) { if (io_req_needs_clean(req)) io_clean_op(req); @@ -1872,7 +1872,7 @@ static void io_req_complete_state(struct io_kiocb *req, long res, }
static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags, - long res, unsigned cflags) + s32 res, u32 cflags) { if (issue_flags & IO_URING_F_COMPLETE_DEFER) io_req_complete_state(req, res, cflags); @@ -1880,12 +1880,12 @@ static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags, io_req_complete_post(req, res, cflags); }
-static inline void io_req_complete(struct io_kiocb *req, long res) +static inline void io_req_complete(struct io_kiocb *req, s32 res) { __io_req_complete(req, 0, res, 0); }
-static void io_req_complete_failed(struct io_kiocb *req, long res) +static void io_req_complete_failed(struct io_kiocb *req, s32 res) { req_set_fail(req); io_req_complete_post(req, res, 0); @@ -2707,7 +2707,7 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res) static void io_req_task_complete(struct io_kiocb *req, bool *locked) { unsigned int cflags = io_put_rw_kbuf(req); - long res = req->result; + int res = req->result;
if (*locked) { struct io_ring_ctx *ctx = req->ctx;
[ upstream commmit 913a571affedd17239c4d4ea90c8874b32fc2191 ]
Split io_cqring_fill_event() into a couple of more targeted functions. The first on is io_fill_cqe_aux() for completions that are not associated with request completions and doing the ->cq_extra accounting. Examples are additional CQEs from multishot poll and rsrc notifications.
The second is io_fill_cqe_req(), should be called when it's a normal request completion. Nothing more to it at the moment, will be used in later patches.
The last one is inlined __io_fill_cqe() for a finer grained control, should be used with caution and in hottest places.
Signed-off-by: Pavel Begunkov asml.silence@gmail.com Link: https://lore.kernel.org/r/59a9117a4a44fc9efcf04b3afa51e0d080f5943c.163655911... Signed-off-by: Jens Axboe axboe@kernel.dk [pavel: backport] Signed-off-by: Pavel Begunkov asml.silence@gmail.com --- fs/io_uring.c | 57 ++++++++++++++++++++++++++------------------------- 1 file changed, 29 insertions(+), 28 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c index b5718278ae61..7812a8e81f3e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1079,8 +1079,8 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, bool cancel_all); static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
-static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, - s32 res, u32 cflags); +static void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags); + static void io_put_req(struct io_kiocb *req); static void io_put_req_deferred(struct io_kiocb *req); static void io_dismantle_req(struct io_kiocb *req); @@ -1515,7 +1515,7 @@ static void io_kill_timeout(struct io_kiocb *req, int status) atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); list_del_init(&req->timeout.list); - io_cqring_fill_event(req->ctx, req->user_data, status, 0); + io_fill_cqe_req(req, status, 0); io_put_req_deferred(req); } } @@ -1790,8 +1790,8 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, return true; }
-static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, - s32 res, u32 cflags) +static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data, + s32 res, u32 cflags) { struct io_uring_cqe *cqe;
@@ -1812,11 +1812,16 @@ static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data return io_cqring_event_overflow(ctx, user_data, res, cflags); }
-/* not as hot to bloat with inlining */ -static noinline bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, - s32 res, u32 cflags) +static noinline void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags) +{ + __io_fill_cqe(req->ctx, req->user_data, res, cflags); +} + +static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, + s32 res, u32 cflags) { - return __io_cqring_fill_event(ctx, user_data, res, cflags); + ctx->cq_extra++; + return __io_fill_cqe(ctx, user_data, res, cflags); }
static void io_req_complete_post(struct io_kiocb *req, s32 res, @@ -1825,7 +1830,7 @@ static void io_req_complete_post(struct io_kiocb *req, s32 res, struct io_ring_ctx *ctx = req->ctx;
spin_lock(&ctx->completion_lock); - __io_cqring_fill_event(ctx, req->user_data, res, cflags); + __io_fill_cqe(ctx, req->user_data, res, cflags); /* * If we're the last reference to this request, add to our locked * free_list cache. @@ -2051,8 +2056,7 @@ static bool io_kill_linked_timeout(struct io_kiocb *req) link->timeout.head = NULL; if (hrtimer_try_to_cancel(&io->timer) != -1) { list_del(&link->timeout.list); - io_cqring_fill_event(link->ctx, link->user_data, - -ECANCELED, 0); + io_fill_cqe_req(link, -ECANCELED, 0); io_put_req_deferred(link); return true; } @@ -2076,7 +2080,7 @@ static void io_fail_links(struct io_kiocb *req) link->link = NULL;
trace_io_uring_fail_link(req, link); - io_cqring_fill_event(link->ctx, link->user_data, res, 0); + io_fill_cqe_req(link, res, 0); io_put_req_deferred(link); link = nxt; } @@ -2093,8 +2097,7 @@ static bool io_disarm_next(struct io_kiocb *req) req->flags &= ~REQ_F_ARM_LTIMEOUT; if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { io_remove_next_linked(req); - io_cqring_fill_event(link->ctx, link->user_data, - -ECANCELED, 0); + io_fill_cqe_req(link, -ECANCELED, 0); io_put_req_deferred(link); posted = true; } @@ -2370,8 +2373,8 @@ static void io_submit_flush_completions(struct io_ring_ctx *ctx) for (i = 0; i < nr; i++) { struct io_kiocb *req = state->compl_reqs[i];
- __io_cqring_fill_event(ctx, req->user_data, req->result, - req->compl.cflags); + __io_fill_cqe(ctx, req->user_data, req->result, + req->compl.cflags); } io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); @@ -2482,8 +2485,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, req = list_first_entry(done, struct io_kiocb, inflight_entry); list_del(&req->inflight_entry);
- __io_cqring_fill_event(ctx, req->user_data, req->result, - io_put_rw_kbuf(req)); + io_fill_cqe_req(req, req->result, io_put_rw_kbuf(req)); (*nr_events)++;
if (req_ref_put_and_test(req)) @@ -5407,13 +5409,13 @@ static bool __io_poll_complete(struct io_kiocb *req, __poll_t mask) } if (req->poll.events & EPOLLONESHOT) flags = 0; - if (!io_cqring_fill_event(ctx, req->user_data, error, flags)) { + + if (!(flags & IORING_CQE_F_MORE)) { + io_fill_cqe_req(req, error, flags); + } else if (!io_fill_cqe_aux(ctx, req->user_data, error, flags)) { req->poll.events |= EPOLLONESHOT; flags = 0; } - if (flags & IORING_CQE_F_MORE) - ctx->cq_extra++; - return !(flags & IORING_CQE_F_MORE); }
@@ -5740,9 +5742,9 @@ static bool io_poll_remove_one(struct io_kiocb *req) do_complete = __io_poll_remove_one(req, io_poll_get_single(req), true);
if (do_complete) { - io_cqring_fill_event(req->ctx, req->user_data, -ECANCELED, 0); - io_commit_cqring(req->ctx); req_set_fail(req); + io_fill_cqe_req(req, -ECANCELED, 0); + io_commit_cqring(req->ctx); io_put_req_deferred(req); } return do_complete; @@ -6039,7 +6041,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) return PTR_ERR(req);
req_set_fail(req); - io_cqring_fill_event(ctx, req->user_data, -ECANCELED, 0); + io_fill_cqe_req(req, -ECANCELED, 0); io_put_req_deferred(req); return 0; } @@ -8265,8 +8267,7 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
io_ring_submit_lock(ctx, lock_ring); spin_lock(&ctx->completion_lock); - io_cqring_fill_event(ctx, prsrc->tag, 0, 0); - ctx->cq_extra++; + io_fill_cqe_aux(ctx, prsrc->tag, 0, 0); io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); io_cqring_ev_posted(ctx);
[ upstream commmit 2bbb146d96f4b45e17d6aeede300796bc1a96d68 ]
Clean up io_poll_update() and unify cancellation paths for remove and update.
Signed-off-by: Pavel Begunkov asml.silence@gmail.com Link: https://lore.kernel.org/r/5937138b6265a1285220e2fab1b28132c1d73ce3.163960518... Signed-off-by: Jens Axboe axboe@kernel.dk [pavel: backport] Signed-off-by: Pavel Begunkov asml.silence@gmail.com --- fs/io_uring.c | 62 +++++++++++++++++++++------------------------------ 1 file changed, 26 insertions(+), 36 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c index 7812a8e81f3e..7d58da54664b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5925,61 +5925,51 @@ static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags) struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *preq; bool completing; - int ret; + int ret2, ret = 0;
spin_lock(&ctx->completion_lock); preq = io_poll_find(ctx, req->poll_update.old_user_data, true); if (!preq) { ret = -ENOENT; - goto err; - } - - if (!req->poll_update.update_events && !req->poll_update.update_user_data) { - completing = true; - ret = io_poll_remove_one(preq) ? 0 : -EALREADY; - goto err; +fail: + spin_unlock(&ctx->completion_lock); + goto out; } - + io_poll_remove_double(preq); /* * Don't allow racy completion with singleshot, as we cannot safely * update those. For multishot, if we're racing with completion, just * let completion re-add it. */ - io_poll_remove_double(preq); completing = !__io_poll_remove_one(preq, &preq->poll, false); if (completing && (preq->poll.events & EPOLLONESHOT)) { ret = -EALREADY; - goto err; - } - /* we now have a detached poll request. reissue. */ - ret = 0; -err: - if (ret < 0) { - spin_unlock(&ctx->completion_lock); - req_set_fail(req); - io_req_complete(req, ret); - return 0; - } - /* only mask one event flags, keep behavior flags */ - if (req->poll_update.update_events) { - preq->poll.events &= ~0xffff; - preq->poll.events |= req->poll_update.events & 0xffff; - preq->poll.events |= IO_POLL_UNMASK; + goto fail; } - if (req->poll_update.update_user_data) - preq->user_data = req->poll_update.new_user_data; spin_unlock(&ctx->completion_lock);
- /* complete update request, we're done with it */ - io_req_complete(req, ret); - - if (!completing) { - ret = io_poll_add(preq, issue_flags); - if (ret < 0) { - req_set_fail(preq); - io_req_complete(preq, ret); + if (req->poll_update.update_events || req->poll_update.update_user_data) { + /* only mask one event flags, keep behavior flags */ + if (req->poll_update.update_events) { + preq->poll.events &= ~0xffff; + preq->poll.events |= req->poll_update.events & 0xffff; + preq->poll.events |= IO_POLL_UNMASK; } + if (req->poll_update.update_user_data) + preq->user_data = req->poll_update.new_user_data; + + ret2 = io_poll_add(preq, issue_flags); + /* successfully updated, don't complete poll request */ + if (!ret2) + goto out; } + req_set_fail(preq); + io_req_complete(preq, -ECANCELED); +out: + if (ret < 0) + req_set_fail(req); + /* complete update request, we're done with it */ + io_req_complete(req, ret); return 0; }
[ upstream commmit 5641897a5e8fb8abeb07e89c71a788d3db3ec75e ]
Move some poll helpers/etc up, we'll need them there shortly
Signed-off-by: Pavel Begunkov asml.silence@gmail.com Link: https://lore.kernel.org/r/6c5c3dba24c86aad5cd389a54a8c7412e6a0621d.163960518... Signed-off-by: Jens Axboe axboe@kernel.dk [pavel: backport] Signed-off-by: Pavel Begunkov asml.silence@gmail.com --- fs/io_uring.c | 74 +++++++++++++++++++++++++-------------------------- 1 file changed, 37 insertions(+), 37 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c index 7d58da54664b..8fa257b62ba7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5312,6 +5312,43 @@ struct io_poll_table { int error; };
+static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req) +{ + /* pure poll stashes this in ->async_data, poll driven retry elsewhere */ + if (req->opcode == IORING_OP_POLL_ADD) + return req->async_data; + return req->apoll->double_poll; +} + +static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req) +{ + if (req->opcode == IORING_OP_POLL_ADD) + return &req->poll; + return &req->apoll->poll; +} + +static void io_poll_req_insert(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + struct hlist_head *list; + + list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)]; + hlist_add_head(&req->hash_node, list); +} + +static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events, + wait_queue_func_t wake_func) +{ + poll->head = NULL; + poll->done = false; + poll->canceled = false; +#define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP) + /* mask in events that we always want/need */ + poll->events = events | IO_POLL_UNMASK; + INIT_LIST_HEAD(&poll->wait.entry); + init_waitqueue_func_entry(&poll->wait, wake_func); +} + static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, __poll_t mask, io_req_tw_func_t func) { @@ -5360,21 +5397,6 @@ static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll) return false; }
-static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req) -{ - /* pure poll stashes this in ->async_data, poll driven retry elsewhere */ - if (req->opcode == IORING_OP_POLL_ADD) - return req->async_data; - return req->apoll->double_poll; -} - -static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req) -{ - if (req->opcode == IORING_OP_POLL_ADD) - return &req->poll; - return &req->apoll->poll; -} - static void io_poll_remove_double(struct io_kiocb *req) __must_hold(&req->ctx->completion_lock) { @@ -5499,19 +5521,6 @@ static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, return 1; }
-static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events, - wait_queue_func_t wake_func) -{ - poll->head = NULL; - poll->done = false; - poll->canceled = false; -#define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP) - /* mask in events that we always want/need */ - poll->events = events | IO_POLL_UNMASK; - INIT_LIST_HEAD(&poll->wait.entry); - init_waitqueue_func_entry(&poll->wait, wake_func); -} - static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, struct wait_queue_head *head, struct io_poll_iocb **poll_ptr) @@ -5606,15 +5615,6 @@ static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync, return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func); }
-static void io_poll_req_insert(struct io_kiocb *req) -{ - struct io_ring_ctx *ctx = req->ctx; - struct hlist_head *list; - - list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)]; - hlist_add_head(&req->hash_node, list); -} - static __poll_t __io_arm_poll_handler(struct io_kiocb *req, struct io_poll_iocb *poll, struct io_poll_table *ipt, __poll_t mask,
[ upstream commmit ab1dab960b8352cee082db0f8a54dc92a948bfd7 ]
With IORING_FEAT_FAST_POLL in place, io_put_req_find_next() for poll requests doesn't make much sense, and in any case re-adding it shouldn't be a problem considering batching in tctx_task_work(). We can remove it.
Signed-off-by: Pavel Begunkov asml.silence@gmail.com Link: https://lore.kernel.org/r/15699682bf81610ec901d4e79d6da64baa9f70be.163960518... Signed-off-by: Jens Axboe axboe@kernel.dk [pavel: backport] Signed-off-by: Pavel Begunkov asml.silence@gmail.com --- fs/io_uring.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c index 8fa257b62ba7..6e80876d8894 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5454,7 +5454,6 @@ static inline bool io_poll_complete(struct io_kiocb *req, __poll_t mask) static void io_poll_task_func(struct io_kiocb *req, bool *locked) { struct io_ring_ctx *ctx = req->ctx; - struct io_kiocb *nxt;
if (io_poll_rewait(req, &req->poll)) { spin_unlock(&ctx->completion_lock); @@ -5478,11 +5477,8 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked) spin_unlock(&ctx->completion_lock); io_cqring_ev_posted(ctx);
- if (done) { - nxt = io_put_req_find_next(req); - if (nxt) - io_req_task_submit(nxt, locked); - } + if (done) + io_put_req(req); } }
[ upstream commmit eb6e6f0690c846f7de46181bab3954c12c96e11e ]
Inline io_poll_complete(), it's simple and doesn't have any particular purpose.
Signed-off-by: Pavel Begunkov asml.silence@gmail.com Link: https://lore.kernel.org/r/933d7ee3e4450749a2d892235462c8f18d030293.163337330... Signed-off-by: Jens Axboe axboe@kernel.dk [pavel: backport] Signed-off-by: Pavel Begunkov asml.silence@gmail.com --- fs/io_uring.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c index 6e80876d8894..39d39dfaa55a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5441,16 +5441,6 @@ static bool __io_poll_complete(struct io_kiocb *req, __poll_t mask) return !(flags & IORING_CQE_F_MORE); }
-static inline bool io_poll_complete(struct io_kiocb *req, __poll_t mask) - __must_hold(&req->ctx->completion_lock) -{ - bool done; - - done = __io_poll_complete(req, mask); - io_commit_cqring(req->ctx); - return done; -} - static void io_poll_task_func(struct io_kiocb *req, bool *locked) { struct io_ring_ctx *ctx = req->ctx; @@ -5904,7 +5894,8 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
if (mask) { /* no async, we'd stolen it */ ipt.error = 0; - done = io_poll_complete(req, mask); + done = __io_poll_complete(req, mask); + io_commit_cqring(req->ctx); } spin_unlock(&ctx->completion_lock);
[ upstream commmit aa43477b040251f451db0d844073ac00a8ab66ee ]
It's not possible to go forward with the current state of io_uring polling, we need a more straightforward and easier synchronisation. There are a lot of problems with how it is at the moment, including missing events on rewait.
The main idea here is to introduce a notion of request ownership while polling, no one but the owner can modify any part but ->poll_refs of struct io_kiocb, that grants us protection against all sorts of races.
Main users of such exclusivity are poll task_work handler, so before queueing a tw one should have/acquire ownership, which will be handed off to the tw handler. The other user is __io_arm_poll_handler() do initial poll arming. It starts taking the ownership, so tw handlers won't be run until it's released later in the function after vfs_poll. note: also prevents races in __io_queue_proc(). Poll wake/etc. may not be able to get ownership, then they need to increase the poll refcount and the task_work should notice it and retry if necessary, see io_poll_check_events(). There is also IO_POLL_CANCEL_FLAG flag to notify that we want to kill request.
It makes cancellations more reliable, enables double multishot polling, fixes double poll rewait, fixes missing poll events and fixes another bunch of races.
Even though it adds some overhead for new refcounting, and there are a couple of nice performance wins: - no req->refs refcounting for poll requests anymore - if the data is already there (once measured for some test to be 1-2% of all apoll requests), it removes it doesn't add atomics and removes spin_lock/unlock pair. - works well with multishots, we don't do remove from queue / add to queue for each new poll event.
Signed-off-by: Pavel Begunkov asml.silence@gmail.com Link: https://lore.kernel.org/r/6b652927c77ed9580ea4330ac5612f0e0848c946.163960518... Signed-off-by: Jens Axboe axboe@kernel.dk [pavel: backport] Signed-off-by: Pavel Begunkov asml.silence@gmail.com --- fs/io_uring.c | 525 ++++++++++++++++++++++---------------------------- 1 file changed, 227 insertions(+), 298 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c index 39d39dfaa55a..bc4741061258 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -486,8 +486,6 @@ struct io_poll_iocb { struct file *file; struct wait_queue_head *head; __poll_t events; - bool done; - bool canceled; struct wait_queue_entry wait; };
@@ -885,6 +883,9 @@ struct io_kiocb {
/* store used ubuf, so we can prevent reloading */ struct io_mapped_ubuf *imu; + /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ + struct io_buffer *kbuf; + atomic_t poll_refs; };
struct io_tctx_node { @@ -5312,6 +5313,25 @@ struct io_poll_table { int error; };
+#define IO_POLL_CANCEL_FLAG BIT(31) +#define IO_POLL_REF_MASK ((1u << 20)-1) + +/* + * If refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. We can + * bump it and acquire ownership. It's disallowed to modify requests while not + * owning it, that prevents from races for enqueueing task_work's and b/w + * arming poll and wakeups. + */ +static inline bool io_poll_get_ownership(struct io_kiocb *req) +{ + return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK); +} + +static void io_poll_mark_cancelled(struct io_kiocb *req) +{ + atomic_or(IO_POLL_CANCEL_FLAG, &req->poll_refs); +} + static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req) { /* pure poll stashes this in ->async_data, poll driven retry elsewhere */ @@ -5340,8 +5360,6 @@ static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events, wait_queue_func_t wake_func) { poll->head = NULL; - poll->done = false; - poll->canceled = false; #define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP) /* mask in events that we always want/need */ poll->events = events | IO_POLL_UNMASK; @@ -5349,161 +5367,168 @@ static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events, init_waitqueue_func_entry(&poll->wait, wake_func); }
-static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, - __poll_t mask, io_req_tw_func_t func) +static inline void io_poll_remove_entry(struct io_poll_iocb *poll) { - /* for instances that support it check for an event match first: */ - if (mask && !(mask & poll->events)) - return 0; - - trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask); + struct wait_queue_head *head = poll->head;
+ spin_lock_irq(&head->lock); list_del_init(&poll->wait.entry); + poll->head = NULL; + spin_unlock_irq(&head->lock); +}
- req->result = mask; - req->io_task_work.func = func; +static void io_poll_remove_entries(struct io_kiocb *req) +{ + struct io_poll_iocb *poll = io_poll_get_single(req); + struct io_poll_iocb *poll_double = io_poll_get_double(req);
- /* - * If this fails, then the task is exiting. When a task exits, the - * work gets canceled, so just cancel this request as well instead - * of executing it. We can't safely execute it anyway, as we may not - * have the needed state needed for it anyway. - */ - io_req_task_work_add(req); - return 1; + if (poll->head) + io_poll_remove_entry(poll); + if (poll_double && poll_double->head) + io_poll_remove_entry(poll_double); }
-static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll) - __acquires(&req->ctx->completion_lock) +/* + * All poll tw should go through this. Checks for poll events, manages + * references, does rewait, etc. + * + * Returns a negative error on failure. >0 when no action require, which is + * either spurious wakeup or multishot CQE is served. 0 when it's done with + * the request, then the mask is stored in req->result. + */ +static int io_poll_check_events(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; + struct io_poll_iocb *poll = io_poll_get_single(req); + int v;
/* req->task == current here, checking PF_EXITING is safe */ if (unlikely(req->task->flags & PF_EXITING)) - WRITE_ONCE(poll->canceled, true); + io_poll_mark_cancelled(req);
- if (!req->result && !READ_ONCE(poll->canceled)) { - struct poll_table_struct pt = { ._key = poll->events }; + do { + v = atomic_read(&req->poll_refs);
- req->result = vfs_poll(req->file, &pt) & poll->events; - } + /* tw handler should be the owner, and so have some references */ + if (WARN_ON_ONCE(!(v & IO_POLL_REF_MASK))) + return 0; + if (v & IO_POLL_CANCEL_FLAG) + return -ECANCELED;
- spin_lock(&ctx->completion_lock); - if (!req->result && !READ_ONCE(poll->canceled)) { - add_wait_queue(poll->head, &poll->wait); - return true; - } + if (!req->result) { + struct poll_table_struct pt = { ._key = poll->events };
- return false; -} + req->result = vfs_poll(req->file, &pt) & poll->events; + }
-static void io_poll_remove_double(struct io_kiocb *req) - __must_hold(&req->ctx->completion_lock) -{ - struct io_poll_iocb *poll = io_poll_get_double(req); + /* multishot, just fill an CQE and proceed */ + if (req->result && !(poll->events & EPOLLONESHOT)) { + __poll_t mask = mangle_poll(req->result & poll->events); + bool filled;
- lockdep_assert_held(&req->ctx->completion_lock); + spin_lock(&ctx->completion_lock); + filled = io_fill_cqe_aux(ctx, req->user_data, mask, + IORING_CQE_F_MORE); + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); + if (unlikely(!filled)) + return -ECANCELED; + io_cqring_ev_posted(ctx); + } else if (req->result) { + return 0; + }
- if (poll && poll->head) { - struct wait_queue_head *head = poll->head; + /* + * Release all references, retry if someone tried to restart + * task_work while we were executing it. + */ + } while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs));
- spin_lock_irq(&head->lock); - list_del_init(&poll->wait.entry); - if (poll->wait.private) - req_ref_put(req); - poll->head = NULL; - spin_unlock_irq(&head->lock); - } + return 1; }
-static bool __io_poll_complete(struct io_kiocb *req, __poll_t mask) - __must_hold(&req->ctx->completion_lock) +static void io_poll_task_func(struct io_kiocb *req, bool *locked) { struct io_ring_ctx *ctx = req->ctx; - unsigned flags = IORING_CQE_F_MORE; - int error; + int ret; + + ret = io_poll_check_events(req); + if (ret > 0) + return;
- if (READ_ONCE(req->poll.canceled)) { - error = -ECANCELED; - req->poll.events |= EPOLLONESHOT; + if (!ret) { + req->result = mangle_poll(req->result & req->poll.events); } else { - error = mangle_poll(mask); + req->result = ret; + req_set_fail(req); } - if (req->poll.events & EPOLLONESHOT) - flags = 0;
- if (!(flags & IORING_CQE_F_MORE)) { - io_fill_cqe_req(req, error, flags); - } else if (!io_fill_cqe_aux(ctx, req->user_data, error, flags)) { - req->poll.events |= EPOLLONESHOT; - flags = 0; - } - return !(flags & IORING_CQE_F_MORE); + io_poll_remove_entries(req); + spin_lock(&ctx->completion_lock); + hash_del(&req->hash_node); + spin_unlock(&ctx->completion_lock); + io_req_complete_post(req, req->result, 0); }
-static void io_poll_task_func(struct io_kiocb *req, bool *locked) +static void io_apoll_task_func(struct io_kiocb *req, bool *locked) { struct io_ring_ctx *ctx = req->ctx; + int ret;
- if (io_poll_rewait(req, &req->poll)) { - spin_unlock(&ctx->completion_lock); - } else { - bool done; + ret = io_poll_check_events(req); + if (ret > 0) + return;
- if (req->poll.done) { - spin_unlock(&ctx->completion_lock); - return; - } - done = __io_poll_complete(req, req->result); - if (done) { - io_poll_remove_double(req); - hash_del(&req->hash_node); - req->poll.done = true; - } else { - req->result = 0; - add_wait_queue(req->poll.head, &req->poll.wait); - } - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); + io_poll_remove_entries(req); + spin_lock(&ctx->completion_lock); + hash_del(&req->hash_node); + spin_unlock(&ctx->completion_lock);
- if (done) - io_put_req(req); - } + if (!ret) + io_req_task_submit(req, locked); + else + io_req_complete_failed(req, ret); +} + +static void __io_poll_execute(struct io_kiocb *req, int mask) +{ + req->result = mask; + if (req->opcode == IORING_OP_POLL_ADD) + req->io_task_work.func = io_poll_task_func; + else + req->io_task_work.func = io_apoll_task_func; + + trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask); + io_req_task_work_add(req); }
-static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, - int sync, void *key) +static inline void io_poll_execute(struct io_kiocb *req, int res) +{ + if (io_poll_get_ownership(req)) + __io_poll_execute(req, res); +} + +static void io_poll_cancel_req(struct io_kiocb *req) +{ + io_poll_mark_cancelled(req); + /* kick tw, which should complete the request */ + io_poll_execute(req, 0); +} + +static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, + void *key) { struct io_kiocb *req = wait->private; - struct io_poll_iocb *poll = io_poll_get_single(req); + struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb, + wait); __poll_t mask = key_to_poll(key); - unsigned long flags;
- /* for instances that support it check for an event match first: */ + /* for instances that support it check for an event match first */ if (mask && !(mask & poll->events)) return 0; - if (!(poll->events & EPOLLONESHOT)) - return poll->wait.func(&poll->wait, mode, sync, key); - - list_del_init(&wait->entry);
- if (poll->head) { - bool done; - - spin_lock_irqsave(&poll->head->lock, flags); - done = list_empty(&poll->wait.entry); - if (!done) - list_del_init(&poll->wait.entry); - /* make sure double remove sees this as being gone */ - wait->private = NULL; - spin_unlock_irqrestore(&poll->head->lock, flags); - if (!done) { - /* use wait func handler, so it matches the rq type */ - poll->wait.func(&poll->wait, mode, sync, key); - } - } - req_ref_put(req); + if (io_poll_get_ownership(req)) + __io_poll_execute(req, mask); return 1; }
@@ -5519,10 +5544,10 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, * if this happens. */ if (unlikely(pt->nr_entries)) { - struct io_poll_iocb *poll_one = poll; + struct io_poll_iocb *first = poll;
/* double add on the same waitqueue head, ignore */ - if (poll_one->head == head) + if (first->head == head) return; /* already have a 2nd entry, fail a third attempt */ if (*poll_ptr) { @@ -5531,25 +5556,19 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, pt->error = -EINVAL; return; } - /* - * Can't handle multishot for double wait for now, turn it - * into one-shot mode. - */ - if (!(poll_one->events & EPOLLONESHOT)) - poll_one->events |= EPOLLONESHOT; + poll = kmalloc(sizeof(*poll), GFP_ATOMIC); if (!poll) { pt->error = -ENOMEM; return; } - io_init_poll_iocb(poll, poll_one->events, io_poll_double_wake); - req_ref_get(req); - poll->wait.private = req; + io_init_poll_iocb(poll, first->events, first->wait.func); *poll_ptr = poll; }
pt->nr_entries++; poll->head = head; + poll->wait.private = req;
if (poll->events & EPOLLEXCLUSIVE) add_wait_queue_exclusive(head, &poll->wait); @@ -5557,61 +5576,24 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, add_wait_queue(head, &poll->wait); }
-static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, +static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, struct poll_table_struct *p) { struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); - struct async_poll *apoll = pt->req->apoll;
- __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll); -} - -static void io_async_task_func(struct io_kiocb *req, bool *locked) -{ - struct async_poll *apoll = req->apoll; - struct io_ring_ctx *ctx = req->ctx; - - trace_io_uring_task_run(req->ctx, req, req->opcode, req->user_data); - - if (io_poll_rewait(req, &apoll->poll)) { - spin_unlock(&ctx->completion_lock); - return; - } - - hash_del(&req->hash_node); - io_poll_remove_double(req); - apoll->poll.done = true; - spin_unlock(&ctx->completion_lock); - - if (!READ_ONCE(apoll->poll.canceled)) - io_req_task_submit(req, locked); - else - io_req_complete_failed(req, -ECANCELED); -} - -static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync, - void *key) -{ - struct io_kiocb *req = wait->private; - struct io_poll_iocb *poll = &req->apoll->poll; - - trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data, - key_to_poll(key)); - - return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func); + __io_queue_proc(&pt->req->poll, pt, head, + (struct io_poll_iocb **) &pt->req->async_data); }
-static __poll_t __io_arm_poll_handler(struct io_kiocb *req, - struct io_poll_iocb *poll, - struct io_poll_table *ipt, __poll_t mask, - wait_queue_func_t wake_func) - __acquires(&ctx->completion_lock) +static int __io_arm_poll_handler(struct io_kiocb *req, + struct io_poll_iocb *poll, + struct io_poll_table *ipt, __poll_t mask) { struct io_ring_ctx *ctx = req->ctx; - bool cancel = false; + int v;
INIT_HLIST_NODE(&req->hash_node); - io_init_poll_iocb(poll, mask, wake_func); + io_init_poll_iocb(poll, mask, io_poll_wake); poll->file = req->file; poll->wait.private = req;
@@ -5620,31 +5602,54 @@ static __poll_t __io_arm_poll_handler(struct io_kiocb *req, ipt->error = 0; ipt->nr_entries = 0;
+ /* + * Take the ownership to delay any tw execution up until we're done + * with poll arming. see io_poll_get_ownership(). + */ + atomic_set(&req->poll_refs, 1); mask = vfs_poll(req->file, &ipt->pt) & poll->events; - if (unlikely(!ipt->nr_entries) && !ipt->error) - ipt->error = -EINVAL; + + if (mask && (poll->events & EPOLLONESHOT)) { + io_poll_remove_entries(req); + /* no one else has access to the req, forget about the ref */ + return mask; + } + if (!mask && unlikely(ipt->error || !ipt->nr_entries)) { + io_poll_remove_entries(req); + if (!ipt->error) + ipt->error = -EINVAL; + return 0; + }
spin_lock(&ctx->completion_lock); - if (ipt->error || (mask && (poll->events & EPOLLONESHOT))) - io_poll_remove_double(req); - if (likely(poll->head)) { - spin_lock_irq(&poll->head->lock); - if (unlikely(list_empty(&poll->wait.entry))) { - if (ipt->error) - cancel = true; - ipt->error = 0; - mask = 0; - } - if ((mask && (poll->events & EPOLLONESHOT)) || ipt->error) - list_del_init(&poll->wait.entry); - else if (cancel) - WRITE_ONCE(poll->canceled, true); - else if (!poll->done) /* actually waiting for an event */ - io_poll_req_insert(req); - spin_unlock_irq(&poll->head->lock); + io_poll_req_insert(req); + spin_unlock(&ctx->completion_lock); + + if (mask) { + /* can't multishot if failed, just queue the event we've got */ + if (unlikely(ipt->error || !ipt->nr_entries)) + poll->events |= EPOLLONESHOT; + __io_poll_execute(req, mask); + return 0; }
- return mask; + /* + * Release ownership. If someone tried to queue a tw while it was + * locked, kick it off for them. + */ + v = atomic_dec_return(&req->poll_refs); + if (unlikely(v & IO_POLL_REF_MASK)) + __io_poll_execute(req, 0); + return 0; +} + +static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, + struct poll_table_struct *p) +{ + struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); + struct async_poll *apoll = pt->req->apoll; + + __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll); }
enum { @@ -5659,7 +5664,8 @@ static int io_arm_poll_handler(struct io_kiocb *req) struct io_ring_ctx *ctx = req->ctx; struct async_poll *apoll; struct io_poll_table ipt; - __poll_t ret, mask = EPOLLONESHOT | POLLERR | POLLPRI; + __poll_t mask = EPOLLONESHOT | POLLERR | POLLPRI; + int ret;
if (!req->file || !file_can_poll(req->file)) return IO_APOLL_ABORTED; @@ -5686,11 +5692,8 @@ static int io_arm_poll_handler(struct io_kiocb *req) req->apoll = apoll; req->flags |= REQ_F_POLLED; ipt.pt._qproc = io_async_queue_proc; - io_req_set_refcount(req);
- ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, - io_async_wake); - spin_unlock(&ctx->completion_lock); + ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask); if (ret || ipt.error) return ret ? IO_APOLL_READY : IO_APOLL_ABORTED;
@@ -5699,43 +5702,6 @@ static int io_arm_poll_handler(struct io_kiocb *req) return IO_APOLL_OK; }
-static bool __io_poll_remove_one(struct io_kiocb *req, - struct io_poll_iocb *poll, bool do_cancel) - __must_hold(&req->ctx->completion_lock) -{ - bool do_complete = false; - - if (!poll->head) - return false; - spin_lock_irq(&poll->head->lock); - if (do_cancel) - WRITE_ONCE(poll->canceled, true); - if (!list_empty(&poll->wait.entry)) { - list_del_init(&poll->wait.entry); - do_complete = true; - } - spin_unlock_irq(&poll->head->lock); - hash_del(&req->hash_node); - return do_complete; -} - -static bool io_poll_remove_one(struct io_kiocb *req) - __must_hold(&req->ctx->completion_lock) -{ - bool do_complete; - - io_poll_remove_double(req); - do_complete = __io_poll_remove_one(req, io_poll_get_single(req), true); - - if (do_complete) { - req_set_fail(req); - io_fill_cqe_req(req, -ECANCELED, 0); - io_commit_cqring(req->ctx); - io_put_req_deferred(req); - } - return do_complete; -} - /* * Returns true if we found and killed one or more poll requests */ @@ -5744,7 +5710,8 @@ static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, { struct hlist_node *tmp; struct io_kiocb *req; - int posted = 0, i; + bool found = false; + int i;
spin_lock(&ctx->completion_lock); for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { @@ -5752,16 +5719,14 @@ static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
list = &ctx->cancel_hash[i]; hlist_for_each_entry_safe(req, tmp, list, hash_node) { - if (io_match_task_safe(req, tsk, cancel_all)) - posted += io_poll_remove_one(req); + if (io_match_task_safe(req, tsk, cancel_all)) { + io_poll_cancel_req(req); + found = true; + } } } spin_unlock(&ctx->completion_lock); - - if (posted) - io_cqring_ev_posted(ctx); - - return posted != 0; + return found; }
static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr, @@ -5782,19 +5747,26 @@ static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr, return NULL; }
+static bool io_poll_disarm(struct io_kiocb *req) + __must_hold(&ctx->completion_lock) +{ + if (!io_poll_get_ownership(req)) + return false; + io_poll_remove_entries(req); + hash_del(&req->hash_node); + return true; +} + static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr, bool poll_only) __must_hold(&ctx->completion_lock) { - struct io_kiocb *req; + struct io_kiocb *req = io_poll_find(ctx, sqe_addr, poll_only);
- req = io_poll_find(ctx, sqe_addr, poll_only); if (!req) return -ENOENT; - if (io_poll_remove_one(req)) - return 0; - - return -EALREADY; + io_poll_cancel_req(req); + return 0; }
static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe, @@ -5844,23 +5816,6 @@ static int io_poll_update_prep(struct io_kiocb *req, return 0; }
-static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, - void *key) -{ - struct io_kiocb *req = wait->private; - struct io_poll_iocb *poll = &req->poll; - - return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func); -} - -static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, - struct poll_table_struct *p) -{ - struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); - - __io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) &pt->req->async_data); -} - static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_poll_iocb *poll = &req->poll; @@ -5882,57 +5837,31 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) { struct io_poll_iocb *poll = &req->poll; - struct io_ring_ctx *ctx = req->ctx; struct io_poll_table ipt; - __poll_t mask; - bool done; + int ret;
ipt.pt._qproc = io_poll_queue_proc;
- mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events, - io_poll_wake); - - if (mask) { /* no async, we'd stolen it */ - ipt.error = 0; - done = __io_poll_complete(req, mask); - io_commit_cqring(req->ctx); - } - spin_unlock(&ctx->completion_lock); - - if (mask) { - io_cqring_ev_posted(ctx); - if (done) - io_put_req(req); - } - return ipt.error; + ret = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events); + ret = ret ?: ipt.error; + if (ret) + __io_req_complete(req, issue_flags, ret, 0); + return 0; }
static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *preq; - bool completing; int ret2, ret = 0;
spin_lock(&ctx->completion_lock); preq = io_poll_find(ctx, req->poll_update.old_user_data, true); - if (!preq) { - ret = -ENOENT; -fail: + if (!preq || !io_poll_disarm(preq)) { spin_unlock(&ctx->completion_lock); + ret = preq ? -EALREADY : -ENOENT; goto out; } - io_poll_remove_double(preq); - /* - * Don't allow racy completion with singleshot, as we cannot safely - * update those. For multishot, if we're racing with completion, just - * let completion re-add it. - */ - completing = !__io_poll_remove_one(preq, &preq->poll, false); - if (completing && (preq->poll.events & EPOLLONESHOT)) { - ret = -EALREADY; - goto fail; - } spin_unlock(&ctx->completion_lock);
if (req->poll_update.update_events || req->poll_update.update_user_data) {
From: Jiapeng Chong jiapeng.chong@linux.alibaba.com
[ upstream commmit c84b8a3fef663933007e885535591b9d30bdc860 ]
Fix the following clang warnings:
fs/io_uring.c:1195:20: warning: unused function 'req_ref_put' [-Wunused-function].
Fixes: aa43477b0402 ("io_uring: poll rework") Reported-by: Abaci Robot abaci@linux.alibaba.com Signed-off-by: Jiapeng Chong jiapeng.chong@linux.alibaba.com Link: https://lore.kernel.org/r/20220113162005.3011-1-jiapeng.chong@linux.alibaba.... Signed-off-by: Jens Axboe axboe@kernel.dk [pavel: backport] Signed-off-by: Pavel Begunkov asml.silence@gmail.com --- fs/io_uring.c | 6 ------ 1 file changed, 6 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c index bc4741061258..c4d34b4e8fb6 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1155,12 +1155,6 @@ static inline bool req_ref_put_and_test(struct io_kiocb *req) return atomic_dec_and_test(&req->refs); }
-static inline void req_ref_put(struct io_kiocb *req) -{ - WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); - WARN_ON_ONCE(req_ref_put_and_test(req)); -} - static inline void req_ref_get(struct io_kiocb *req) { WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
From: Jens Axboe axboe@kernel.dk
[ upstream commmit 61bc84c4008812d784c398cfb54118c1ba396dfc ]
When the ring is exiting, as part of the shutdown, poll requests are removed. But io_poll_remove_all() does not remove entries when finding them, and since completions are done out-of-band, we can find and remove the same entry multiple times.
We do guard the poll execution by poll ownership, but that does not exclude us from reissuing a new one once the previous removal ownership goes away.
This can race with poll execution as well, where we then end up seeing req->apoll be NULL because a previous task_work requeue finished the request.
Remove the poll entry when we find it and get ownership of it. This prevents multiple invocations from finding it.
Fixes: aa43477b0402 ("io_uring: poll rework") Reported-by: Dylan Yudaken dylany@fb.com Signed-off-by: Jens Axboe axboe@kernel.dk [pavel: backport] Signed-off-by: Pavel Begunkov asml.silence@gmail.com --- fs/io_uring.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/fs/io_uring.c b/fs/io_uring.c index c4d34b4e8fb6..4ebc47ebb77a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5714,6 +5714,7 @@ static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, list = &ctx->cancel_hash[i]; hlist_for_each_entry_safe(req, tmp, list, hash_node) { if (io_match_task_safe(req, tsk, cancel_all)) { + hlist_del_init(&req->hash_node); io_poll_cancel_req(req); found = true; }
From: Jens Axboe axboe@kernel.dk
[ upstream commmit e2c0cb7c0cc72939b61a7efee376206725796625 ]
The previous commit:
1bc84c40088 ("io_uring: remove poll entry from list when canceling all")
removed a potential overflow condition for the poll references. They are currently limited to 20-bits, even if we have 31-bits available. The upper bit is used to mark for cancelation.
Bump the poll ref space to 31-bits, making that kind of situation much harder to trigger in general. We'll separately add overflow checking and handling.
Fixes: aa43477b0402 ("io_uring: poll rework") Signed-off-by: Jens Axboe axboe@kernel.dk [pavel: backport] Signed-off-by: Pavel Begunkov asml.silence@gmail.com --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c index 4ebc47ebb77a..ac9fa3364c45 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5308,7 +5308,7 @@ struct io_poll_table { };
#define IO_POLL_CANCEL_FLAG BIT(31) -#define IO_POLL_REF_MASK ((1u << 20)-1) +#define IO_POLL_REF_MASK GENMASK(30, 0)
/* * If refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. We can
[ upstream commmit c487a5ad48831afa6784b368ec40d0ee50f2fe1b ]
Don't forget to cancel all linked requests of poll request when __io_arm_poll_handler() failed.
Fixes: aa43477b04025 ("io_uring: poll rework") Signed-off-by: Pavel Begunkov asml.silence@gmail.com Link: https://lore.kernel.org/r/a78aad962460f9fdfe4aa4c0b62425c88f9415bc.165585224... Signed-off-by: Jens Axboe axboe@kernel.dk [pavel: backport] Signed-off-by: Pavel Begunkov asml.silence@gmail.com --- fs/io_uring.c | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/fs/io_uring.c b/fs/io_uring.c index ac9fa3364c45..16b0ea1b5bf7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5838,6 +5838,8 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) ipt.pt._qproc = io_poll_queue_proc;
ret = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events); + if (!ret && ipt.error) + req_set_fail(req); ret = ret ?: ipt.error; if (ret) __io_req_complete(req, issue_flags, ret, 0);
[ upstream commmit 9d2ad2947a53abf5e5e6527a9eeed50a3a4cbc72 ]
Leaving ip.error set when a request was punted to task_work execution is problematic, don't forget to clear it.
Fixes: aa43477b04025 ("io_uring: poll rework") Signed-off-by: Pavel Begunkov asml.silence@gmail.com Link: https://lore.kernel.org/r/a6c84ef4182c6962380aebe11b35bdcb25b0ccfb.165585224... Signed-off-by: Jens Axboe axboe@kernel.dk [pavel: backport] Signed-off-by: Pavel Begunkov asml.silence@gmail.com --- fs/io_uring.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c index 16b0ea1b5bf7..6bcc3798591e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5621,8 +5621,10 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
if (mask) { /* can't multishot if failed, just queue the event we've got */ - if (unlikely(ipt->error || !ipt->nr_entries)) + if (unlikely(ipt->error || !ipt->nr_entries)) { poll->events |= EPOLLONESHOT; + ipt->error = 0; + } __io_poll_execute(req, mask); return 0; }
[ upstream commmit 791f3465c4afde02d7f16cf7424ca87070b69396 ]
Fixes a problem described in 50252e4b5e989 ("aio: fix use-after-free due to missing POLLFREE handling") and copies the approach used there.
In short, we have to forcibly eject a poll entry when we meet POLLFREE. We can't rely on io_poll_get_ownership() as can't wait for potentially running tw handlers, so we use the fact that wqs are RCU freed. See Eric's patch and comments for more details.
Reported-by: Eric Biggers ebiggers@google.com Link: https://lore.kernel.org/r/20211209010455.42744-6-ebiggers@kernel.org Reported-and-tested-by: syzbot+5426c7ed6868c705ca14@syzkaller.appspotmail.com Fixes: 221c5eb233823 ("io_uring: add support for IORING_OP_POLL") Signed-off-by: Pavel Begunkov asml.silence@gmail.com Link: https://lore.kernel.org/r/4ed56b6f548f7ea337603a82315750449412748a.164216125... [axboe: drop non-functional change from patch] Signed-off-by: Jens Axboe axboe@kernel.dk [pavel: backport] Signed-off-by: Pavel Begunkov asml.silence@gmail.com --- fs/io_uring.c | 58 ++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 50 insertions(+), 8 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c index 6bcc3798591e..e586b0e6d725 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5363,12 +5363,14 @@ static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
static inline void io_poll_remove_entry(struct io_poll_iocb *poll) { - struct wait_queue_head *head = poll->head; + struct wait_queue_head *head = smp_load_acquire(&poll->head);
- spin_lock_irq(&head->lock); - list_del_init(&poll->wait.entry); - poll->head = NULL; - spin_unlock_irq(&head->lock); + if (head) { + spin_lock_irq(&head->lock); + list_del_init(&poll->wait.entry); + poll->head = NULL; + spin_unlock_irq(&head->lock); + } }
static void io_poll_remove_entries(struct io_kiocb *req) @@ -5376,10 +5378,26 @@ static void io_poll_remove_entries(struct io_kiocb *req) struct io_poll_iocb *poll = io_poll_get_single(req); struct io_poll_iocb *poll_double = io_poll_get_double(req);
- if (poll->head) - io_poll_remove_entry(poll); - if (poll_double && poll_double->head) + /* + * While we hold the waitqueue lock and the waitqueue is nonempty, + * wake_up_pollfree() will wait for us. However, taking the waitqueue + * lock in the first place can race with the waitqueue being freed. + * + * We solve this as eventpoll does: by taking advantage of the fact that + * all users of wake_up_pollfree() will RCU-delay the actual free. If + * we enter rcu_read_lock() and see that the pointer to the queue is + * non-NULL, we can then lock it without the memory being freed out from + * under us. + * + * Keep holding rcu_read_lock() as long as we hold the queue lock, in + * case the caller deletes the entry from the queue, leaving it empty. + * In that case, only RCU prevents the queue memory from being freed. + */ + rcu_read_lock(); + io_poll_remove_entry(poll); + if (poll_double) io_poll_remove_entry(poll_double); + rcu_read_unlock(); }
/* @@ -5517,6 +5535,30 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, wait); __poll_t mask = key_to_poll(key);
+ if (unlikely(mask & POLLFREE)) { + io_poll_mark_cancelled(req); + /* we have to kick tw in case it's not already */ + io_poll_execute(req, 0); + + /* + * If the waitqueue is being freed early but someone is already + * holds ownership over it, we have to tear down the request as + * best we can. That means immediately removing the request from + * its waitqueue and preventing all further accesses to the + * waitqueue via the request. + */ + list_del_init(&poll->wait.entry); + + /* + * Careful: this *must* be the last step, since as soon + * as req->head is NULL'ed out, the request can be + * completed and freed, since aio_poll_complete_work() + * will no longer need to take the waitqueue lock. + */ + smp_store_release(&poll->head, NULL); + return 1; + } + /* for instances that support it check for an event match first */ if (mask && !(mask & poll->events)) return 0;
On Mon, Aug 29, 2022 at 02:30:11PM +0100, Pavel Begunkov wrote:
5.15 backport of io_uring pollfree patches.
Jens Axboe (2): io_uring: remove poll entry from list when canceling all io_uring: bump poll refs to full 31-bits
Jiapeng Chong (1): io_uring: Remove unused function req_ref_put
Pavel Begunkov (10): io_uring: correct fill events helpers types io_uring: clean cqe filling functions io_uring: refactor poll update io_uring: move common poll bits io_uring: kill poll linking optimisation io_uring: inline io_poll_complete io_uring: poll rework io_uring: fail links when poll fails io_uring: fix wrong arm_poll error handling io_uring: fix UAF due to missing POLLFREE handling
fs/io_uring.c | 746 +++++++++++++++++++++++--------------------------- 1 file changed, 347 insertions(+), 399 deletions(-)
-- 2.37.2
All now queued up, thanks.
greg k-h
linux-stable-mirror@lists.linaro.org