summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
authorJens Axboe <axboe@kernel.dk>2020-04-03 13:44:44 -0600
committerJens Axboe <axboe@kernel.dk>2020-04-03 13:44:44 -0600
commit08b24342b2c0f2661d7c3691f5369465e8c62ef6 (patch)
tree83966e4d23c2c7108bd844898d991e980f1bc04e /fs
parentc52abf563049e787c1341cdf15c7dbe1bfbc951b (diff)
parentaa96bf8a9ee33457b7e3ea43e97dfa1e3a15ab20 (diff)
downloadlinux-next-08b24342b2c0f2661d7c3691f5369465e8c62ef6.tar.gz
Merge branch 'io_uring-5.7' into for-next
* io_uring-5.7: io_uring: use io-wq manager as backup task if task is exiting io_uring: grab task reference for poll requests io_uring: retry poll if we got woken with non-matching mask io_uring: add missing finish_wait() in io_sq_thread() io_uring: refactor file register/unregister/update handling
Diffstat (limited to 'fs')
-rw-r--r--fs/io-wq.c12
-rw-r--r--fs/io-wq.h2
-rw-r--r--fs/io_uring.c249
3 files changed, 167 insertions, 96 deletions
diff --git a/fs/io-wq.c b/fs/io-wq.c
index cc5cf2209fb0..4023c9846860 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -17,6 +17,7 @@
#include <linux/kthread.h>
#include <linux/rculist_nulls.h>
#include <linux/fs_struct.h>
+#include <linux/task_work.h>
#include "io-wq.h"
@@ -716,6 +717,9 @@ static int io_wq_manager(void *data)
complete(&wq->done);
while (!kthread_should_stop()) {
+ if (current->task_works)
+ task_work_run();
+
for_each_node(node) {
struct io_wqe *wqe = wq->wqes[node];
bool fork_worker[2] = { false, false };
@@ -738,6 +742,9 @@ static int io_wq_manager(void *data)
schedule_timeout(HZ);
}
+ if (current->task_works)
+ task_work_run();
+
return 0;
err:
set_bit(IO_WQ_BIT_ERROR, &wq->state);
@@ -1124,3 +1131,8 @@ void io_wq_destroy(struct io_wq *wq)
if (refcount_dec_and_test(&wq->use_refs))
__io_wq_destroy(wq);
}
+
+struct task_struct *io_wq_get_task(struct io_wq *wq)
+{
+ return wq->manager;
+}
diff --git a/fs/io-wq.h b/fs/io-wq.h
index 3ee7356d6be5..5ba12de7572f 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -136,6 +136,8 @@ typedef bool (work_cancel_fn)(struct io_wq_work *, void *);
enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
void *data);
+struct task_struct *io_wq_get_task(struct io_wq *wq);
+
#if defined(CONFIG_IO_WQ)
extern void io_wq_worker_sleeping(struct task_struct *);
extern void io_wq_worker_running(struct task_struct *);
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 358f97be9c7b..2460c3333f70 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -186,14 +186,23 @@ struct fixed_file_table {
struct file **files;
};
+struct fixed_file_ref_node {
+ struct percpu_ref refs;
+ struct list_head node;
+ struct list_head file_list;
+ struct fixed_file_data *file_data;
+ struct work_struct work;
+};
+
struct fixed_file_data {
struct fixed_file_table *table;
struct io_ring_ctx *ctx;
+ struct percpu_ref *cur_refs;
struct percpu_ref refs;
- struct llist_head put_llist;
- struct work_struct ref_work;
struct completion done;
+ struct list_head ref_list;
+ spinlock_t lock;
};
struct io_buffer {
@@ -606,10 +615,8 @@ struct io_kiocb {
struct list_head list;
unsigned int flags;
refcount_t refs;
- union {
- struct task_struct *task;
- unsigned long fsize;
- };
+ struct task_struct *task;
+ unsigned long fsize;
u64 user_data;
u32 result;
u32 sequence;
@@ -618,6 +625,8 @@ struct io_kiocb {
struct list_head inflight_entry;
+ struct percpu_ref *fixed_file_refs;
+
union {
/*
* Only commands that never go async can use the below fields,
@@ -848,7 +857,6 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
struct io_uring_files_update *ip,
unsigned nr_args);
static int io_grab_files(struct io_kiocb *req);
-static void io_ring_file_ref_flush(struct fixed_file_data *data);
static void io_cleanup_req(struct io_kiocb *req);
static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
int fd, struct file **out_file, bool fixed);
@@ -1326,6 +1334,7 @@ got_it:
req->flags = 0;
/* one is dropped after submission, the other at completion */
refcount_set(&req->refs, 2);
+ req->task = NULL;
req->result = 0;
INIT_IO_WORK(&req->work, io_wq_submit_work);
return req;
@@ -1341,7 +1350,7 @@ static inline void io_put_file(struct io_kiocb *req, struct file *file,
bool fixed)
{
if (fixed)
- percpu_ref_put(&req->ctx->file_data->refs);
+ percpu_ref_put(req->fixed_file_refs);
else
fput(file);
}
@@ -1362,6 +1371,8 @@ static void __io_req_aux_free(struct io_kiocb *req)
kfree(req->io);
if (req->file)
io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
+ if (req->task)
+ put_task_struct(req->task);
io_req_work_drop_env(req);
}
@@ -1393,21 +1404,18 @@ struct req_batch {
static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
{
- int fixed_refs = rb->to_free;
-
if (!rb->to_free)
return;
if (rb->need_iter) {
int i, inflight = 0;
unsigned long flags;
- fixed_refs = 0;
for (i = 0; i < rb->to_free; i++) {
struct io_kiocb *req = rb->reqs[i];
if (req->flags & REQ_F_FIXED_FILE) {
req->file = NULL;
- fixed_refs++;
+ percpu_ref_put(req->fixed_file_refs);
}
if (req->flags & REQ_F_INFLIGHT)
inflight++;
@@ -1433,8 +1441,6 @@ static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
}
do_free:
kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs);
- if (fixed_refs)
- percpu_ref_put_many(&ctx->file_data->refs, fixed_refs);
percpu_ref_put_many(&ctx->refs, rb->to_free);
rb->to_free = rb->need_iter = 0;
}
@@ -4114,6 +4120,7 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
__poll_t mask, task_work_func_t func)
{
struct task_struct *tsk;
+ int ret;
/* for instances that support it check for an event match first: */
if (mask && !(mask & poll->events))
@@ -4127,11 +4134,15 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
req->result = mask;
init_task_work(&req->task_work, func);
/*
- * If this fails, then the task is exiting. If that is the case, then
- * the exit check will ultimately cancel these work items. Hence we
- * don't need to check here and handle it specifically.
+ * If this fails, then the task is exiting. Punt to one of the io-wq
+ * threads to ensure the work gets run, we can't always rely on exit
+ * cancelation taking care of this.
*/
- task_work_add(tsk, &req->task_work, true);
+ ret = task_work_add(tsk, &req->task_work, true);
+ if (unlikely(ret)) {
+ tsk = io_wq_get_task(req->ctx->io_wq);
+ task_work_add(tsk, &req->task_work, true);
+ }
wake_up_process(tsk);
return 1;
}
@@ -4251,10 +4262,7 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
req->flags |= REQ_F_POLLED;
memcpy(&apoll->work, &req->work, sizeof(req->work));
- /*
- * Don't need a reference here, as we're adding it to the task
- * task_works list. If the task exits, the list is pruned.
- */
+ get_task_struct(current);
req->task = current;
req->apoll = apoll;
INIT_HLIST_NODE(&req->hash_node);
@@ -4407,8 +4415,20 @@ static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt)
{
struct io_ring_ctx *ctx = req->ctx;
+ struct io_poll_iocb *poll = &req->poll;
+
+ if (!req->result && !READ_ONCE(poll->canceled)) {
+ struct poll_table_struct pt = { ._key = poll->events };
+
+ req->result = vfs_poll(req->file, &pt) & poll->events;
+ }
spin_lock_irq(&ctx->completion_lock);
+ if (!req->result && !READ_ONCE(poll->canceled)) {
+ add_wait_queue(poll->head, &poll->wait);
+ spin_unlock_irq(&ctx->completion_lock);
+ return;
+ }
hash_del(&req->hash_node);
io_poll_complete(req, req->result, 0);
req->flags |= REQ_F_COMP_LOCKED;
@@ -4465,10 +4485,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
events = READ_ONCE(sqe->poll_events);
poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
- /*
- * Don't need a reference here, as we're adding it to the task
- * task_works list. If the task exits, the list is pruned.
- */
+ get_task_struct(current);
req->task = current;
return 0;
}
@@ -5331,7 +5348,8 @@ static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
file = io_file_from_index(ctx, fd);
if (!file)
return -EBADF;
- percpu_ref_get(&ctx->file_data->refs);
+ req->fixed_file_refs = ctx->file_data->cur_refs;
+ percpu_ref_get(req->fixed_file_refs);
} else {
trace_io_uring_file_get(ctx, fd);
file = __io_file_get(state, fd);
@@ -5962,6 +5980,7 @@ static int io_sq_thread(void *data)
}
if (current->task_works) {
task_work_run();
+ finish_wait(&ctx->sqo_wait, &wait);
continue;
}
if (signal_pending(current))
@@ -6124,43 +6143,36 @@ static void io_file_ref_kill(struct percpu_ref *ref)
complete(&data->done);
}
-static void io_file_ref_exit_and_free(struct work_struct *work)
-{
- struct fixed_file_data *data;
-
- data = container_of(work, struct fixed_file_data, ref_work);
-
- /*
- * Ensure any percpu-ref atomic switch callback has run, it could have
- * been in progress when the files were being unregistered. Once
- * that's done, we can safely exit and free the ref and containing
- * data structure.
- */
- rcu_barrier();
- percpu_ref_exit(&data->refs);
- kfree(data);
-}
-
static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
struct fixed_file_data *data = ctx->file_data;
+ struct fixed_file_ref_node *ref_node = NULL;
unsigned nr_tables, i;
+ unsigned long flags;
if (!data)
return -ENXIO;
- percpu_ref_kill_and_confirm(&data->refs, io_file_ref_kill);
- flush_work(&data->ref_work);
+ spin_lock_irqsave(&data->lock, flags);
+ if (!list_empty(&data->ref_list))
+ ref_node = list_first_entry(&data->ref_list,
+ struct fixed_file_ref_node, node);
+ spin_unlock_irqrestore(&data->lock, flags);
+ if (ref_node)
+ percpu_ref_kill(&ref_node->refs);
+
+ percpu_ref_kill(&data->refs);
+
+ /* wait for all refs nodes to complete */
wait_for_completion(&data->done);
- io_ring_file_ref_flush(data);
__io_sqe_files_unregister(ctx);
nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
for (i = 0; i < nr_tables; i++)
kfree(data->table[i].files);
kfree(data->table);
- INIT_WORK(&data->ref_work, io_file_ref_exit_and_free);
- queue_work(system_wq, &data->ref_work);
+ percpu_ref_exit(&data->refs);
+ kfree(data);
ctx->file_data = NULL;
ctx->nr_user_files = 0;
return 0;
@@ -6385,46 +6397,72 @@ static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file)
}
struct io_file_put {
- struct llist_node llist;
+ struct list_head list;
struct file *file;
};
-static void io_ring_file_ref_flush(struct fixed_file_data *data)
+static void io_file_put_work(struct work_struct *work)
{
+ struct fixed_file_ref_node *ref_node;
+ struct fixed_file_data *file_data;
+ struct io_ring_ctx *ctx;
struct io_file_put *pfile, *tmp;
- struct llist_node *node;
+ unsigned long flags;
- while ((node = llist_del_all(&data->put_llist)) != NULL) {
- llist_for_each_entry_safe(pfile, tmp, node, llist) {
- io_ring_file_put(data->ctx, pfile->file);
- kfree(pfile);
- }
+ ref_node = container_of(work, struct fixed_file_ref_node, work);
+ file_data = ref_node->file_data;
+ ctx = file_data->ctx;
+
+ list_for_each_entry_safe(pfile, tmp, &ref_node->file_list, list) {
+ list_del_init(&pfile->list);
+ io_ring_file_put(ctx, pfile->file);
+ kfree(pfile);
}
+
+ spin_lock_irqsave(&file_data->lock, flags);
+ list_del_init(&ref_node->node);
+ spin_unlock_irqrestore(&file_data->lock, flags);
+
+ percpu_ref_exit(&ref_node->refs);
+ kfree(ref_node);
+ percpu_ref_put(&file_data->refs);
}
-static void io_ring_file_ref_switch(struct work_struct *work)
+static void io_file_data_ref_zero(struct percpu_ref *ref)
{
- struct fixed_file_data *data;
+ struct fixed_file_ref_node *ref_node;
+
+ ref_node = container_of(ref, struct fixed_file_ref_node, refs);
- data = container_of(work, struct fixed_file_data, ref_work);
- io_ring_file_ref_flush(data);
- percpu_ref_switch_to_percpu(&data->refs);
+ queue_work(system_wq, &ref_node->work);
}
-static void io_file_data_ref_zero(struct percpu_ref *ref)
+static struct fixed_file_ref_node *alloc_fixed_file_ref_node(
+ struct io_ring_ctx *ctx)
{
- struct fixed_file_data *data;
+ struct fixed_file_ref_node *ref_node;
- data = container_of(ref, struct fixed_file_data, refs);
+ ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
+ if (!ref_node)
+ return ERR_PTR(-ENOMEM);
+
+ if (percpu_ref_init(&ref_node->refs, io_file_data_ref_zero,
+ 0, GFP_KERNEL)) {
+ kfree(ref_node);
+ return ERR_PTR(-ENOMEM);
+ }
+ INIT_LIST_HEAD(&ref_node->node);
+ INIT_LIST_HEAD(&ref_node->file_list);
+ INIT_WORK(&ref_node->work, io_file_put_work);
+ ref_node->file_data = ctx->file_data;
+ return ref_node;
- /*
- * We can't safely switch from inside this context, punt to wq. If
- * the table ref is going away, the table is being unregistered.
- * Don't queue up the async work for that case, the caller will
- * handle it.
- */
- if (!percpu_ref_is_dying(&data->refs))
- queue_work(system_wq, &data->ref_work);
+}
+
+static void destroy_fixed_file_ref_node(struct fixed_file_ref_node *ref_node)
+{
+ percpu_ref_exit(&ref_node->refs);
+ kfree(ref_node);
}
static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
@@ -6435,6 +6473,8 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
struct file *file;
int fd, ret = 0;
unsigned i;
+ struct fixed_file_ref_node *ref_node;
+ unsigned long flags;
if (ctx->file_data)
return -EBUSY;
@@ -6448,6 +6488,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
return -ENOMEM;
ctx->file_data->ctx = ctx;
init_completion(&ctx->file_data->done);
+ INIT_LIST_HEAD(&ctx->file_data->ref_list);
nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
ctx->file_data->table = kcalloc(nr_tables,
@@ -6459,15 +6500,13 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
return -ENOMEM;
}
- if (percpu_ref_init(&ctx->file_data->refs, io_file_data_ref_zero,
+ if (percpu_ref_init(&ctx->file_data->refs, io_file_ref_kill,
PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
kfree(ctx->file_data->table);
kfree(ctx->file_data);
ctx->file_data = NULL;
return -ENOMEM;
}
- ctx->file_data->put_llist.first = NULL;
- INIT_WORK(&ctx->file_data->ref_work, io_ring_file_ref_switch);
if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) {
percpu_ref_exit(&ctx->file_data->refs);
@@ -6530,9 +6569,22 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
}
ret = io_sqe_files_scm(ctx);
- if (ret)
+ if (ret) {
+ io_sqe_files_unregister(ctx);
+ return ret;
+ }
+
+ ref_node = alloc_fixed_file_ref_node(ctx);
+ if (IS_ERR(ref_node)) {
io_sqe_files_unregister(ctx);
+ return PTR_ERR(ref_node);
+ }
+ ctx->file_data->cur_refs = &ref_node->refs;
+ spin_lock_irqsave(&ctx->file_data->lock, flags);
+ list_add(&ref_node->node, &ctx->file_data->ref_list);
+ spin_unlock_irqrestore(&ctx->file_data->lock, flags);
+ percpu_ref_get(&ctx->file_data->refs);
return ret;
}
@@ -6579,30 +6631,21 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
#endif
}
-static void io_atomic_switch(struct percpu_ref *ref)
-{
- struct fixed_file_data *data;
-
- /*
- * Juggle reference to ensure we hit zero, if needed, so we can
- * switch back to percpu mode
- */
- data = container_of(ref, struct fixed_file_data, refs);
- percpu_ref_put(&data->refs);
- percpu_ref_get(&data->refs);
-}
-
static int io_queue_file_removal(struct fixed_file_data *data,
- struct file *file)
+ struct file *file)
{
struct io_file_put *pfile;
+ struct percpu_ref *refs = data->cur_refs;
+ struct fixed_file_ref_node *ref_node;
pfile = kzalloc(sizeof(*pfile), GFP_KERNEL);
if (!pfile)
return -ENOMEM;
+ ref_node = container_of(refs, struct fixed_file_ref_node, refs);
pfile->file = file;
- llist_add(&pfile->llist, &data->put_llist);
+ list_add(&pfile->list, &ref_node->file_list);
+
return 0;
}
@@ -6611,17 +6654,23 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
unsigned nr_args)
{
struct fixed_file_data *data = ctx->file_data;
- bool ref_switch = false;
+ struct fixed_file_ref_node *ref_node;
struct file *file;
__s32 __user *fds;
int fd, i, err;
__u32 done;
+ unsigned long flags;
+ bool needs_switch = false;
if (check_add_overflow(up->offset, nr_args, &done))
return -EOVERFLOW;
if (done > ctx->nr_user_files)
return -EINVAL;
+ ref_node = alloc_fixed_file_ref_node(ctx);
+ if (IS_ERR(ref_node))
+ return PTR_ERR(ref_node);
+
done = 0;
fds = u64_to_user_ptr(up->fds);
while (nr_args) {
@@ -6642,7 +6691,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
if (err)
break;
table->files[index] = NULL;
- ref_switch = true;
+ needs_switch = true;
}
if (fd != -1) {
file = fget(fd);
@@ -6673,11 +6722,19 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
up->offset++;
}
- if (ref_switch)
- percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch);
+ if (needs_switch) {
+ percpu_ref_kill(data->cur_refs);
+ spin_lock_irqsave(&data->lock, flags);
+ list_add(&ref_node->node, &data->ref_list);
+ data->cur_refs = &ref_node->refs;
+ spin_unlock_irqrestore(&data->lock, flags);
+ percpu_ref_get(&ctx->file_data->refs);
+ } else
+ destroy_fixed_file_ref_node(ref_node);
return done ? done : err;
}
+
static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
unsigned nr_args)
{