From 9b8284921513fc1ea57d87777283a59b05862f03 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 18 Sep 2020 20:13:06 -0600 Subject: io_uring: reference ->nsproxy for file table commands If we don't get and assign the namespace for the async work, then certain paths just don't work properly (like /dev/stdin, /proc/mounts, etc). Anything that references the current namespace of the given task should be assigned for async work on behalf of that task. Cc: stable@vger.kernel.org # v5.5+ Reported-by: Al Viro Signed-off-by: Jens Axboe --- fs/io-wq.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs/io-wq.c') diff --git a/fs/io-wq.c b/fs/io-wq.c index 414beb543883..09e20bbf0d37 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -60,6 +60,7 @@ struct io_worker { const struct cred *cur_creds; const struct cred *saved_creds; struct files_struct *restore_files; + struct nsproxy *restore_nsproxy; struct fs_struct *restore_fs; }; @@ -153,6 +154,7 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker) task_lock(current); current->files = worker->restore_files; + current->nsproxy = worker->restore_nsproxy; task_unlock(current); } @@ -318,6 +320,7 @@ static void io_worker_start(struct io_wqe *wqe, struct io_worker *worker) worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING); worker->restore_files = current->files; + worker->restore_nsproxy = current->nsproxy; worker->restore_fs = current->fs; io_wqe_inc_running(wqe, worker); } @@ -454,6 +457,7 @@ static void io_impersonate_work(struct io_worker *worker, if (work->files && current->files != work->files) { task_lock(current); current->files = work->files; + current->nsproxy = work->nsproxy; task_unlock(current); } if (work->fs && current->fs != work->fs) -- cgit v1.2.3 From 95da84659226d75698a1ab958be0af21d9cc2a9c Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 1 Sep 2020 10:41:46 +0200 Subject: io_wq: Make io_wqe::lock a raw_spinlock_t During a context switch the scheduler invokes wq_worker_sleeping() with disabled preemption. Disabling preemption is needed because it protects access to `worker->sleeping'. As an optimisation it avoids invoking schedule() within the schedule path as part of possible wake up (thus preempt_enable_no_resched() afterwards). The io-wq has been added to the mix in the same section with disabled preemption. This breaks on PREEMPT_RT because io_wq_worker_sleeping() acquires a spinlock_t. Also within the schedule() the spinlock_t must be acquired after tsk_is_pi_blocked() otherwise it will block on the sleeping lock again while scheduling out. While playing with `io_uring-bench' I didn't notice a significant latency spike after converting io_wqe::lock to a raw_spinlock_t. The latency was more or less the same. In order to keep the spinlock_t it would have to be moved after the tsk_is_pi_blocked() check which would introduce a branch instruction into the hot path. The lock is used to maintain the `work_list' and wakes one task up at most. Should io_wqe_cancel_pending_work() cause latency spikes, while searching for a specific item, then it would need to drop the lock during iterations. revert_creds() is also invoked under the lock. According to debug cred::non_rcu is 0. Otherwise it should be moved outside of the locked section because put_cred_rcu()->free_uid() acquires a sleeping lock. Convert io_wqe::lock to a raw_spinlock_t.c Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Jens Axboe --- fs/io-wq.c | 52 ++++++++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 26 deletions(-) (limited to 'fs/io-wq.c') diff --git a/fs/io-wq.c b/fs/io-wq.c index 09e20bbf0d37..c1a7ef85844b 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -88,7 +88,7 @@ enum { */ struct io_wqe { struct { - spinlock_t lock; + raw_spinlock_t lock; struct io_wq_work_list work_list; unsigned long hash_map; unsigned flags; @@ -149,7 +149,7 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker) if (current->files != worker->restore_files) { __acquire(&wqe->lock); - spin_unlock_irq(&wqe->lock); + raw_spin_unlock_irq(&wqe->lock); dropped_lock = true; task_lock(current); @@ -168,7 +168,7 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker) if (worker->mm) { if (!dropped_lock) { __acquire(&wqe->lock); - spin_unlock_irq(&wqe->lock); + raw_spin_unlock_irq(&wqe->lock); dropped_lock = true; } __set_current_state(TASK_RUNNING); @@ -222,17 +222,17 @@ static void io_worker_exit(struct io_worker *worker) worker->flags = 0; preempt_enable(); - spin_lock_irq(&wqe->lock); + raw_spin_lock_irq(&wqe->lock); hlist_nulls_del_rcu(&worker->nulls_node); list_del_rcu(&worker->all_list); if (__io_worker_unuse(wqe, worker)) { __release(&wqe->lock); - spin_lock_irq(&wqe->lock); + raw_spin_lock_irq(&wqe->lock); } acct->nr_workers--; nr_workers = wqe->acct[IO_WQ_ACCT_BOUND].nr_workers + wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers; - spin_unlock_irq(&wqe->lock); + raw_spin_unlock_irq(&wqe->lock); /* all workers gone, wq exit can proceed */ if (!nr_workers && refcount_dec_and_test(&wqe->wq->refs)) @@ -508,7 +508,7 @@ get_next: else if (!wq_list_empty(&wqe->work_list)) wqe->flags |= IO_WQE_FLAG_STALLED; - spin_unlock_irq(&wqe->lock); + raw_spin_unlock_irq(&wqe->lock); if (!work) break; io_assign_current_work(worker, work); @@ -542,17 +542,17 @@ get_next: io_wqe_enqueue(wqe, linked); if (hash != -1U && !next_hashed) { - spin_lock_irq(&wqe->lock); + raw_spin_lock_irq(&wqe->lock); wqe->hash_map &= ~BIT_ULL(hash); wqe->flags &= ~IO_WQE_FLAG_STALLED; /* skip unnecessary unlock-lock wqe->lock */ if (!work) goto get_next; - spin_unlock_irq(&wqe->lock); + raw_spin_unlock_irq(&wqe->lock); } } while (work); - spin_lock_irq(&wqe->lock); + raw_spin_lock_irq(&wqe->lock); } while (1); } @@ -567,7 +567,7 @@ static int io_wqe_worker(void *data) while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) { set_current_state(TASK_INTERRUPTIBLE); loop: - spin_lock_irq(&wqe->lock); + raw_spin_lock_irq(&wqe->lock); if (io_wqe_run_queue(wqe)) { __set_current_state(TASK_RUNNING); io_worker_handle_work(worker); @@ -578,7 +578,7 @@ loop: __release(&wqe->lock); goto loop; } - spin_unlock_irq(&wqe->lock); + raw_spin_unlock_irq(&wqe->lock); if (signal_pending(current)) flush_signals(current); if (schedule_timeout(WORKER_IDLE_TIMEOUT)) @@ -590,11 +590,11 @@ loop: } if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) { - spin_lock_irq(&wqe->lock); + raw_spin_lock_irq(&wqe->lock); if (!wq_list_empty(&wqe->work_list)) io_worker_handle_work(worker); else - spin_unlock_irq(&wqe->lock); + raw_spin_unlock_irq(&wqe->lock); } io_worker_exit(worker); @@ -634,9 +634,9 @@ void io_wq_worker_sleeping(struct task_struct *tsk) worker->flags &= ~IO_WORKER_F_RUNNING; - spin_lock_irq(&wqe->lock); + raw_spin_lock_irq(&wqe->lock); io_wqe_dec_running(wqe, worker); - spin_unlock_irq(&wqe->lock); + raw_spin_unlock_irq(&wqe->lock); } static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) @@ -660,7 +660,7 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) return false; } - spin_lock_irq(&wqe->lock); + raw_spin_lock_irq(&wqe->lock); hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); list_add_tail_rcu(&worker->all_list, &wqe->all_list); worker->flags |= IO_WORKER_F_FREE; @@ -669,7 +669,7 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) if (!acct->nr_workers && (worker->flags & IO_WORKER_F_BOUND)) worker->flags |= IO_WORKER_F_FIXED; acct->nr_workers++; - spin_unlock_irq(&wqe->lock); + raw_spin_unlock_irq(&wqe->lock); if (index == IO_WQ_ACCT_UNBOUND) atomic_inc(&wq->user->processes); @@ -724,12 +724,12 @@ static int io_wq_manager(void *data) if (!node_online(node)) continue; - spin_lock_irq(&wqe->lock); + raw_spin_lock_irq(&wqe->lock); if (io_wqe_need_worker(wqe, IO_WQ_ACCT_BOUND)) fork_worker[IO_WQ_ACCT_BOUND] = true; if (io_wqe_need_worker(wqe, IO_WQ_ACCT_UNBOUND)) fork_worker[IO_WQ_ACCT_UNBOUND] = true; - spin_unlock_irq(&wqe->lock); + raw_spin_unlock_irq(&wqe->lock); if (fork_worker[IO_WQ_ACCT_BOUND]) create_io_worker(wq, wqe, IO_WQ_ACCT_BOUND); if (fork_worker[IO_WQ_ACCT_UNBOUND]) @@ -825,10 +825,10 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) } work_flags = work->flags; - spin_lock_irqsave(&wqe->lock, flags); + raw_spin_lock_irqsave(&wqe->lock, flags); io_wqe_insert_work(wqe, work); wqe->flags &= ~IO_WQE_FLAG_STALLED; - spin_unlock_irqrestore(&wqe->lock, flags); + raw_spin_unlock_irqrestore(&wqe->lock, flags); if ((work_flags & IO_WQ_WORK_CONCURRENT) || !atomic_read(&acct->nr_running)) @@ -955,13 +955,13 @@ static void io_wqe_cancel_pending_work(struct io_wqe *wqe, unsigned long flags; retry: - spin_lock_irqsave(&wqe->lock, flags); + raw_spin_lock_irqsave(&wqe->lock, flags); wq_list_for_each(node, prev, &wqe->work_list) { work = container_of(node, struct io_wq_work, list); if (!match->fn(work, match->data)) continue; io_wqe_remove_pending(wqe, work, prev); - spin_unlock_irqrestore(&wqe->lock, flags); + raw_spin_unlock_irqrestore(&wqe->lock, flags); io_run_cancel(work, wqe); match->nr_pending++; if (!match->cancel_all) @@ -970,7 +970,7 @@ retry: /* not safe to continue after unlock */ goto retry; } - spin_unlock_irqrestore(&wqe->lock, flags); + raw_spin_unlock_irqrestore(&wqe->lock, flags); } static void io_wqe_cancel_running_work(struct io_wqe *wqe, @@ -1078,7 +1078,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) } atomic_set(&wqe->acct[IO_WQ_ACCT_UNBOUND].nr_running, 0); wqe->wq = wq; - spin_lock_init(&wqe->lock); + raw_spin_lock_init(&wqe->lock); INIT_WQ_LIST(&wqe->work_list); INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0); INIT_LIST_HEAD(&wqe->all_list); -- cgit v1.2.3 From 91d8f5191e8fe6fc6a87aa5353b36f5a7409fbec Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Wed, 16 Sep 2020 13:41:05 -0700 Subject: io_uring: add blkcg accounting to offloaded operations There are a few operations that are offloaded to the worker threads. In this case, we lose process context and end up in kthread context. This results in ios to be not accounted to the issuing cgroup and consequently end up as issued by root. Just like others, adopt the personality of the blkcg too when issuing via the workqueues. For the SQPOLL thread, it will live and attach in the inited cgroup's context. Signed-off-by: Dennis Zhou Signed-off-by: Jens Axboe --- fs/io-wq.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'fs/io-wq.c') diff --git a/fs/io-wq.c b/fs/io-wq.c index c1a7ef85844b..1be0d70f8673 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "io-wq.h" @@ -57,6 +58,9 @@ struct io_worker { struct rcu_head rcu; struct mm_struct *mm; +#ifdef CONFIG_BLK_CGROUP + struct cgroup_subsys_state *blkcg_css; +#endif const struct cred *cur_creds; const struct cred *saved_creds; struct files_struct *restore_files; @@ -177,6 +181,13 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker) worker->mm = NULL; } +#ifdef CONFIG_BLK_CGROUP + if (worker->blkcg_css) { + kthread_associate_blkcg(NULL); + worker->blkcg_css = NULL; + } +#endif + return dropped_lock; } @@ -439,6 +450,17 @@ static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work) work->flags |= IO_WQ_WORK_CANCEL; } +static inline void io_wq_switch_blkcg(struct io_worker *worker, + struct io_wq_work *work) +{ +#ifdef CONFIG_BLK_CGROUP + if (work->blkcg_css != worker->blkcg_css) { + kthread_associate_blkcg(work->blkcg_css); + worker->blkcg_css = work->blkcg_css; + } +#endif +} + static void io_wq_switch_creds(struct io_worker *worker, struct io_wq_work *work) { @@ -467,6 +489,7 @@ static void io_impersonate_work(struct io_worker *worker, if (worker->cur_creds != work->creds) io_wq_switch_creds(worker, work); current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->fsize; + io_wq_switch_blkcg(worker, work); } static void io_assign_current_work(struct io_worker *worker, -- cgit v1.2.3 From c4068bf898ddaef791049a366828d9b84b467bda Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Sat, 26 Sep 2020 21:26:55 +0800 Subject: io-wq: fix use-after-free in io_wq_worker_running The smart syzbot has found a reproducer for the following issue: ================================================================== BUG: KASAN: use-after-free in instrument_atomic_write include/linux/instrumented.h:71 [inline] BUG: KASAN: use-after-free in atomic_inc include/asm-generic/atomic-instrumented.h:240 [inline] BUG: KASAN: use-after-free in io_wqe_inc_running fs/io-wq.c:301 [inline] BUG: KASAN: use-after-free in io_wq_worker_running+0xde/0x110 fs/io-wq.c:613 Write of size 4 at addr ffff8882183db08c by task io_wqe_worker-0/7771 CPU: 0 PID: 7771 Comm: io_wqe_worker-0 Not tainted 5.9.0-rc4-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x198/0x1fd lib/dump_stack.c:118 print_address_description.constprop.0.cold+0xae/0x497 mm/kasan/report.c:383 __kasan_report mm/kasan/report.c:513 [inline] kasan_report.cold+0x1f/0x37 mm/kasan/report.c:530 check_memory_region_inline mm/kasan/generic.c:186 [inline] check_memory_region+0x13d/0x180 mm/kasan/generic.c:192 instrument_atomic_write include/linux/instrumented.h:71 [inline] atomic_inc include/asm-generic/atomic-instrumented.h:240 [inline] io_wqe_inc_running fs/io-wq.c:301 [inline] io_wq_worker_running+0xde/0x110 fs/io-wq.c:613 schedule_timeout+0x148/0x250 kernel/time/timer.c:1879 io_wqe_worker+0x517/0x10e0 fs/io-wq.c:580 kthread+0x3b5/0x4a0 kernel/kthread.c:292 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:294 Allocated by task 7768: kasan_save_stack+0x1b/0x40 mm/kasan/common.c:48 kasan_set_track mm/kasan/common.c:56 [inline] __kasan_kmalloc.constprop.0+0xbf/0xd0 mm/kasan/common.c:461 kmem_cache_alloc_node_trace+0x17b/0x3f0 mm/slab.c:3594 kmalloc_node include/linux/slab.h:572 [inline] kzalloc_node include/linux/slab.h:677 [inline] io_wq_create+0x57b/0xa10 fs/io-wq.c:1064 io_init_wq_offload fs/io_uring.c:7432 [inline] io_sq_offload_start fs/io_uring.c:7504 [inline] io_uring_create fs/io_uring.c:8625 [inline] io_uring_setup+0x1836/0x28e0 fs/io_uring.c:8694 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Freed by task 21: kasan_save_stack+0x1b/0x40 mm/kasan/common.c:48 kasan_set_track+0x1c/0x30 mm/kasan/common.c:56 kasan_set_free_info+0x1b/0x30 mm/kasan/generic.c:355 __kasan_slab_free+0xd8/0x120 mm/kasan/common.c:422 __cache_free mm/slab.c:3418 [inline] kfree+0x10e/0x2b0 mm/slab.c:3756 __io_wq_destroy fs/io-wq.c:1138 [inline] io_wq_destroy+0x2af/0x460 fs/io-wq.c:1146 io_finish_async fs/io_uring.c:6836 [inline] io_ring_ctx_free fs/io_uring.c:7870 [inline] io_ring_exit_work+0x1e4/0x6d0 fs/io_uring.c:7954 process_one_work+0x94c/0x1670 kernel/workqueue.c:2269 worker_thread+0x64c/0x1120 kernel/workqueue.c:2415 kthread+0x3b5/0x4a0 kernel/kthread.c:292 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:294 The buggy address belongs to the object at ffff8882183db000 which belongs to the cache kmalloc-1k of size 1024 The buggy address is located 140 bytes inside of 1024-byte region [ffff8882183db000, ffff8882183db400) The buggy address belongs to the page: page:000000009bada22b refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x2183db flags: 0x57ffe0000000200(slab) raw: 057ffe0000000200 ffffea0008604c48 ffffea00086a8648 ffff8880aa040700 raw: 0000000000000000 ffff8882183db000 0000000100000002 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff8882183daf80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ffff8882183db000: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb >ffff8882183db080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff8882183db100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff8882183db180: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ================================================================== which is down to the comment below, /* all workers gone, wq exit can proceed */ if (!nr_workers && refcount_dec_and_test(&wqe->wq->refs)) complete(&wqe->wq->done); because there might be multiple cases of wqe in a wq and we would wait for every worker in every wqe to go home before releasing wq's resources on destroying. To that end, rework wq's refcount by making it independent of the tracking of workers because after all they are two different things, and keeping it balanced when workers come and go. Note the manager kthread, like other workers, now holds a grab to wq during its lifetime. Finally to help destroy wq, check IO_WQ_BIT_EXIT upon creating worker and do nothing for exiting wq. Cc: stable@vger.kernel.org # v5.5+ Reported-by: syzbot+45fa0a195b941764e0f0@syzkaller.appspotmail.com Reported-by: syzbot+9af99580130003da82b1@syzkaller.appspotmail.com Cc: Pavel Begunkov Signed-off-by: Hillf Danton Signed-off-by: Jens Axboe --- fs/io-wq.c | 116 ++++++++++++++++++++++++++++++------------------------------- 1 file changed, 58 insertions(+), 58 deletions(-) (limited to 'fs/io-wq.c') diff --git a/fs/io-wq.c b/fs/io-wq.c index 1be0d70f8673..98f9c74b25d2 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -213,7 +213,6 @@ static void io_worker_exit(struct io_worker *worker) { struct io_wqe *wqe = worker->wqe; struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker); - unsigned nr_workers; /* * If we're not at zero, someone else is holding a brief reference @@ -241,15 +240,11 @@ static void io_worker_exit(struct io_worker *worker) raw_spin_lock_irq(&wqe->lock); } acct->nr_workers--; - nr_workers = wqe->acct[IO_WQ_ACCT_BOUND].nr_workers + - wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers; raw_spin_unlock_irq(&wqe->lock); - /* all workers gone, wq exit can proceed */ - if (!nr_workers && refcount_dec_and_test(&wqe->wq->refs)) - complete(&wqe->wq->done); - kfree_rcu(worker, rcu); + if (refcount_dec_and_test(&wqe->wq->refs)) + complete(&wqe->wq->done); } static inline bool io_wqe_run_queue(struct io_wqe *wqe) @@ -664,7 +659,7 @@ void io_wq_worker_sleeping(struct task_struct *tsk) static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) { - struct io_wqe_acct *acct =&wqe->acct[index]; + struct io_wqe_acct *acct = &wqe->acct[index]; struct io_worker *worker; worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node); @@ -697,6 +692,7 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) if (index == IO_WQ_ACCT_UNBOUND) atomic_inc(&wq->user->processes); + refcount_inc(&wq->refs); wake_up_process(worker->task); return true; } @@ -712,28 +708,63 @@ static inline bool io_wqe_need_worker(struct io_wqe *wqe, int index) return acct->nr_workers < acct->max_workers; } +static bool io_wqe_worker_send_sig(struct io_worker *worker, void *data) +{ + send_sig(SIGINT, worker->task, 1); + return false; +} + +/* + * Iterate the passed in list and call the specific function for each + * worker that isn't exiting + */ +static bool io_wq_for_each_worker(struct io_wqe *wqe, + bool (*func)(struct io_worker *, void *), + void *data) +{ + struct io_worker *worker; + bool ret = false; + + list_for_each_entry_rcu(worker, &wqe->all_list, all_list) { + if (io_worker_get(worker)) { + /* no task if node is/was offline */ + if (worker->task) + ret = func(worker, data); + io_worker_release(worker); + if (ret) + break; + } + } + + return ret; +} + +static bool io_wq_worker_wake(struct io_worker *worker, void *data) +{ + wake_up_process(worker->task); + return false; +} + /* * Manager thread. Tasked with creating new workers, if we need them. */ static int io_wq_manager(void *data) { struct io_wq *wq = data; - int workers_to_create = num_possible_nodes(); int node; /* create fixed workers */ - refcount_set(&wq->refs, workers_to_create); + refcount_set(&wq->refs, 1); for_each_node(node) { if (!node_online(node)) continue; - if (!create_io_worker(wq, wq->wqes[node], IO_WQ_ACCT_BOUND)) - goto err; - workers_to_create--; + if (create_io_worker(wq, wq->wqes[node], IO_WQ_ACCT_BOUND)) + continue; + set_bit(IO_WQ_BIT_ERROR, &wq->state); + set_bit(IO_WQ_BIT_EXIT, &wq->state); + goto out; } - while (workers_to_create--) - refcount_dec(&wq->refs); - complete(&wq->done); while (!kthread_should_stop()) { @@ -765,12 +796,18 @@ static int io_wq_manager(void *data) if (current->task_works) task_work_run(); - return 0; -err: - set_bit(IO_WQ_BIT_ERROR, &wq->state); - set_bit(IO_WQ_BIT_EXIT, &wq->state); - if (refcount_sub_and_test(workers_to_create, &wq->refs)) +out: + if (refcount_dec_and_test(&wq->refs)) { complete(&wq->done); + return 0; + } + /* if ERROR is set and we get here, we have workers to wake */ + if (test_bit(IO_WQ_BIT_ERROR, &wq->state)) { + rcu_read_lock(); + for_each_node(node) + io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL); + rcu_read_unlock(); + } return 0; } @@ -877,37 +914,6 @@ void io_wq_hash_work(struct io_wq_work *work, void *val) work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT)); } -static bool io_wqe_worker_send_sig(struct io_worker *worker, void *data) -{ - send_sig(SIGINT, worker->task, 1); - return false; -} - -/* - * Iterate the passed in list and call the specific function for each - * worker that isn't exiting - */ -static bool io_wq_for_each_worker(struct io_wqe *wqe, - bool (*func)(struct io_worker *, void *), - void *data) -{ - struct io_worker *worker; - bool ret = false; - - list_for_each_entry_rcu(worker, &wqe->all_list, all_list) { - if (io_worker_get(worker)) { - /* no task if node is/was offline */ - if (worker->task) - ret = func(worker, data); - io_worker_release(worker); - if (ret) - break; - } - } - - return ret; -} - void io_wq_cancel_all(struct io_wq *wq) { int node; @@ -1140,12 +1146,6 @@ bool io_wq_get(struct io_wq *wq, struct io_wq_data *data) return refcount_inc_not_zero(&wq->use_refs); } -static bool io_wq_worker_wake(struct io_worker *worker, void *data) -{ - wake_up_process(worker->task); - return false; -} - static void __io_wq_destroy(struct io_wq *wq) { int node; -- cgit v1.2.3 From 145cc8c665f406cc189cfcf15a9875689e2c73b8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 26 Sep 2020 12:37:46 -0600 Subject: io-wq: kill unused IO_WORKER_F_EXITING This flag is no longer used, remove it. Signed-off-by: Jens Axboe --- fs/io-wq.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs/io-wq.c') diff --git a/fs/io-wq.c b/fs/io-wq.c index 98f9c74b25d2..0a182f1333e8 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -27,9 +27,8 @@ enum { IO_WORKER_F_UP = 1, /* up and active */ IO_WORKER_F_RUNNING = 2, /* account as running */ IO_WORKER_F_FREE = 4, /* worker on free list */ - IO_WORKER_F_EXITING = 8, /* worker exiting */ - IO_WORKER_F_FIXED = 16, /* static idle worker */ - IO_WORKER_F_BOUND = 32, /* is doing bounded work */ + IO_WORKER_F_FIXED = 8, /* static idle worker */ + IO_WORKER_F_BOUND = 16, /* is doing bounded work */ }; enum { -- cgit v1.2.3 From a8b595b22d31f83b715511f59012f152a269d83b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 15 Oct 2020 10:13:07 -0600 Subject: io-wq: assign NUMA node locality if appropriate There was an assumption that kthread_create_on_node() would properly set NUMA affinities in terms of CPUs allowed, but it doesn't. Make sure we do this when creating an io-wq context on NUMA. Cc: stable@vger.kernel.org Stefan Metzmacher Signed-off-by: Jens Axboe --- fs/io-wq.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/io-wq.c') diff --git a/fs/io-wq.c b/fs/io-wq.c index 0a182f1333e8..149fd2f0927e 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -676,6 +676,7 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) kfree(worker); return false; } + kthread_bind_mask(worker->task, cpumask_of_node(wqe->node)); raw_spin_lock_irq(&wqe->lock); hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); -- cgit v1.2.3 From 0f203765880c4416675726be558b65da4a7604e2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 14 Oct 2020 09:23:55 -0600 Subject: io_uring: pass required context in as flags We have a number of bits that decide what context to inherit. Set up io-wq flags for these instead. This is in preparation for always having the various members set, but not always needing them for all requests. No intended functional changes in this patch. Signed-off-by: Jens Axboe --- fs/io-wq.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs/io-wq.c') diff --git a/fs/io-wq.c b/fs/io-wq.c index 149fd2f0927e..e636898f8a1f 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -448,6 +448,8 @@ static inline void io_wq_switch_blkcg(struct io_worker *worker, struct io_wq_work *work) { #ifdef CONFIG_BLK_CGROUP + if (!(work->flags & IO_WQ_WORK_BLKCG)) + return; if (work->blkcg_css != worker->blkcg_css) { kthread_associate_blkcg(work->blkcg_css); worker->blkcg_css = work->blkcg_css; @@ -470,17 +472,17 @@ static void io_wq_switch_creds(struct io_worker *worker, static void io_impersonate_work(struct io_worker *worker, struct io_wq_work *work) { - if (work->files && current->files != work->files) { + if ((work->flags & IO_WQ_WORK_FILES) && current->files != work->files) { task_lock(current); current->files = work->files; current->nsproxy = work->nsproxy; task_unlock(current); } - if (work->fs && current->fs != work->fs) + if ((work->flags & IO_WQ_WORK_FS) && current->fs != work->fs) current->fs = work->fs; - if (work->mm != worker->mm) + if ((work->flags & IO_WQ_WORK_MM) && work->mm != worker->mm) io_wq_switch_mm(worker, work); - if (worker->cur_creds != work->creds) + if ((work->flags & IO_WQ_WORK_CREDS) && worker->cur_creds != work->creds) io_wq_switch_creds(worker, work); current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->fsize; io_wq_switch_blkcg(worker, work); -- cgit v1.2.3 From dfead8a8e2c494b947480bac90a6f9792f08bc12 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 14 Oct 2020 10:12:37 -0600 Subject: io_uring: rely solely on work flags to determine personality. We solely rely on work->work_flags now, so use that for proper checking and clearing/dropping of various identity items. Signed-off-by: Jens Axboe --- fs/io-wq.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs/io-wq.c') diff --git a/fs/io-wq.c b/fs/io-wq.c index e636898f8a1f..b7d8e544a804 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -429,14 +429,10 @@ static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work) mmput(worker->mm); worker->mm = NULL; } - if (!work->mm) - return; if (mmget_not_zero(work->mm)) { kthread_use_mm(work->mm); worker->mm = work->mm; - /* hang on to this mm */ - work->mm = NULL; return; } -- cgit v1.2.3 From 98447d65b4a7a59f8ea37dc6e5d743247d9a7b01 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 14 Oct 2020 10:48:51 -0600 Subject: io_uring: move io identity items into separate struct io-wq contains a pointer to the identity, which we just hold in io_kiocb for now. This is in preparation for putting this outside io_kiocb. The only exception is struct files_struct, which we'll need different rules for to avoid a circular dependency. No functional changes in this patch. Signed-off-by: Jens Axboe --- fs/io-wq.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) (limited to 'fs/io-wq.c') diff --git a/fs/io-wq.c b/fs/io-wq.c index b7d8e544a804..0c852b75384d 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -430,9 +430,9 @@ static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work) worker->mm = NULL; } - if (mmget_not_zero(work->mm)) { - kthread_use_mm(work->mm); - worker->mm = work->mm; + if (mmget_not_zero(work->identity->mm)) { + kthread_use_mm(work->identity->mm); + worker->mm = work->identity->mm; return; } @@ -446,9 +446,9 @@ static inline void io_wq_switch_blkcg(struct io_worker *worker, #ifdef CONFIG_BLK_CGROUP if (!(work->flags & IO_WQ_WORK_BLKCG)) return; - if (work->blkcg_css != worker->blkcg_css) { - kthread_associate_blkcg(work->blkcg_css); - worker->blkcg_css = work->blkcg_css; + if (work->identity->blkcg_css != worker->blkcg_css) { + kthread_associate_blkcg(work->identity->blkcg_css); + worker->blkcg_css = work->identity->blkcg_css; } #endif } @@ -456,9 +456,9 @@ static inline void io_wq_switch_blkcg(struct io_worker *worker, static void io_wq_switch_creds(struct io_worker *worker, struct io_wq_work *work) { - const struct cred *old_creds = override_creds(work->creds); + const struct cred *old_creds = override_creds(work->identity->creds); - worker->cur_creds = work->creds; + worker->cur_creds = work->identity->creds; if (worker->saved_creds) put_cred(old_creds); /* creds set by previous switch */ else @@ -468,19 +468,21 @@ static void io_wq_switch_creds(struct io_worker *worker, static void io_impersonate_work(struct io_worker *worker, struct io_wq_work *work) { - if ((work->flags & IO_WQ_WORK_FILES) && current->files != work->files) { + if ((work->flags & IO_WQ_WORK_FILES) && + current->files != work->identity->files) { task_lock(current); - current->files = work->files; - current->nsproxy = work->nsproxy; + current->files = work->identity->files; + current->nsproxy = work->identity->nsproxy; task_unlock(current); } - if ((work->flags & IO_WQ_WORK_FS) && current->fs != work->fs) - current->fs = work->fs; - if ((work->flags & IO_WQ_WORK_MM) && work->mm != worker->mm) + if ((work->flags & IO_WQ_WORK_FS) && current->fs != work->identity->fs) + current->fs = work->identity->fs; + if ((work->flags & IO_WQ_WORK_MM) && work->identity->mm != worker->mm) io_wq_switch_mm(worker, work); - if ((work->flags & IO_WQ_WORK_CREDS) && worker->cur_creds != work->creds) + if ((work->flags & IO_WQ_WORK_CREDS) && + worker->cur_creds != work->identity->creds) io_wq_switch_creds(worker, work); - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->fsize; + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->identity->fsize; io_wq_switch_blkcg(worker, work); } -- cgit v1.2.3 From 4ea33a976bfe79293965d0815e1914e4b6e58967 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 15 Oct 2020 13:46:44 -0600 Subject: io-wq: inherit audit loginuid and sessionid Make sure the async io-wq workers inherit the loginuid and sessionid from the original task, and restore them to unset once we're done with the async work item. While at it, disable the ability for kernel threads to write to their own loginuid. Signed-off-by: Jens Axboe --- fs/io-wq.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs/io-wq.c') diff --git a/fs/io-wq.c b/fs/io-wq.c index 0c852b75384d..7cb3b4cb9b11 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "io-wq.h" @@ -484,6 +485,10 @@ static void io_impersonate_work(struct io_worker *worker, io_wq_switch_creds(worker, work); current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->identity->fsize; io_wq_switch_blkcg(worker, work); +#ifdef CONFIG_AUDIT + current->loginuid = work->identity->loginuid; + current->sessionid = work->identity->sessionid; +#endif } static void io_assign_current_work(struct io_worker *worker, @@ -496,6 +501,11 @@ static void io_assign_current_work(struct io_worker *worker, cond_resched(); } +#ifdef CONFIG_AUDIT + current->loginuid = KUIDT_INIT(AUDIT_UID_UNSET); + current->sessionid = AUDIT_SID_UNSET; +#endif + spin_lock_irq(&worker->lock); worker->cur_work = work; spin_unlock_irq(&worker->lock); -- cgit v1.2.3 From 69228338c9c3f0519f0daeca362a730130211c83 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 20 Oct 2020 14:28:41 -0600 Subject: io_uring: unify fsize with def->work_flags This one was missed in the earlier conversion, should be included like any of the other IO identity flags. Make sure we restore to RLIM_INIFITY when dropping the personality again. Fixes: 98447d65b4a7 ("io_uring: move io identity items into separate struct") Signed-off-by: Jens Axboe --- fs/io-wq.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs/io-wq.c') diff --git a/fs/io-wq.c b/fs/io-wq.c index 7cb3b4cb9b11..4012ff541b7b 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -187,7 +187,8 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker) worker->blkcg_css = NULL; } #endif - + if (current->signal->rlim[RLIMIT_FSIZE].rlim_cur != RLIM_INFINITY) + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; return dropped_lock; } @@ -483,7 +484,10 @@ static void io_impersonate_work(struct io_worker *worker, if ((work->flags & IO_WQ_WORK_CREDS) && worker->cur_creds != work->identity->creds) io_wq_switch_creds(worker, work); - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->identity->fsize; + if (work->flags & IO_WQ_WORK_FSIZE) + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->identity->fsize; + else if (current->signal->rlim[RLIMIT_FSIZE].rlim_cur != RLIM_INFINITY) + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; io_wq_switch_blkcg(worker, work); #ifdef CONFIG_AUDIT current->loginuid = work->identity->loginuid; -- cgit v1.2.3 From 43c01fbefdf110e8713d6c0ff71055f06b9888a0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 22 Oct 2020 09:02:50 -0600 Subject: io-wq: re-set NUMA node affinities if CPUs come online We correctly set io-wq NUMA node affinities when the io-wq context is setup, but if an entire node CPU set is offlined and then brought back online, the per node affinities are broken. Ensure that we set them again whenever a CPU comes online. This ensures that we always track the right node affinity. The usual cpuhp notifiers are used to drive it. Reported-by: Zhang Qiang Signed-off-by: Jens Axboe --- fs/io-wq.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 56 insertions(+), 4 deletions(-) (limited to 'fs/io-wq.c') diff --git a/fs/io-wq.c b/fs/io-wq.c index 4012ff541b7b..02894df7656d 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -19,7 +19,9 @@ #include #include #include +#include +#include "../kernel/sched/sched.h" #include "io-wq.h" #define WORKER_IDLE_TIMEOUT (5 * HZ) @@ -123,9 +125,13 @@ struct io_wq { refcount_t refs; struct completion done; + struct hlist_node cpuhp_node; + refcount_t use_refs; }; +static enum cpuhp_state io_wq_online; + static bool io_worker_get(struct io_worker *worker) { return refcount_inc_not_zero(&worker->ref); @@ -1091,10 +1097,12 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) return ERR_PTR(-ENOMEM); wq->wqes = kcalloc(nr_node_ids, sizeof(struct io_wqe *), GFP_KERNEL); - if (!wq->wqes) { - kfree(wq); - return ERR_PTR(-ENOMEM); - } + if (!wq->wqes) + goto err_wq; + + ret = cpuhp_state_add_instance_nocalls(io_wq_online, &wq->cpuhp_node); + if (ret) + goto err_wqes; wq->free_work = data->free_work; wq->do_work = data->do_work; @@ -1102,6 +1110,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) /* caller must already hold a reference to this */ wq->user = data->user; + ret = -ENOMEM; for_each_node(node) { struct io_wqe *wqe; int alloc_node = node; @@ -1145,9 +1154,12 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) ret = PTR_ERR(wq->manager); complete(&wq->done); err: + cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node); for_each_node(node) kfree(wq->wqes[node]); +err_wqes: kfree(wq->wqes); +err_wq: kfree(wq); return ERR_PTR(ret); } @@ -1164,6 +1176,8 @@ static void __io_wq_destroy(struct io_wq *wq) { int node; + cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node); + set_bit(IO_WQ_BIT_EXIT, &wq->state); if (wq->manager) kthread_stop(wq->manager); @@ -1191,3 +1205,41 @@ struct task_struct *io_wq_get_task(struct io_wq *wq) { return wq->manager; } + +static bool io_wq_worker_affinity(struct io_worker *worker, void *data) +{ + struct task_struct *task = worker->task; + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(task, &rf); + do_set_cpus_allowed(task, cpumask_of_node(worker->wqe->node)); + task->flags |= PF_NO_SETAFFINITY; + task_rq_unlock(rq, task, &rf); + return false; +} + +static int io_wq_cpu_online(unsigned int cpu, struct hlist_node *node) +{ + struct io_wq *wq = hlist_entry_safe(node, struct io_wq, cpuhp_node); + int i; + + rcu_read_lock(); + for_each_node(i) + io_wq_for_each_worker(wq->wqes[i], io_wq_worker_affinity, NULL); + rcu_read_unlock(); + return 0; +} + +static __init int io_wq_init(void) +{ + int ret; + + ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "io-wq/online", + io_wq_cpu_online, NULL); + if (ret < 0) + return ret; + io_wq_online = ret; + return 0; +} +subsys_initcall(io_wq_init); -- cgit v1.2.3 From 3dd1680d1418f22f7ddaf98a4eab66285a099b3e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 30 Oct 2020 09:36:41 -0600 Subject: io-wq: cancel request if it's asking for files and we don't have them This can't currently happen, but will be possible shortly. Handle missing files just like we do not being able to grab a needed mm, and mark the request as needing cancelation. Signed-off-by: Jens Axboe --- fs/io-wq.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs/io-wq.c') diff --git a/fs/io-wq.c b/fs/io-wq.c index 02894df7656d..b53c055bea6a 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -482,6 +482,10 @@ static void io_impersonate_work(struct io_worker *worker, current->files = work->identity->files; current->nsproxy = work->identity->nsproxy; task_unlock(current); + if (!work->identity->files) { + /* failed grabbing files, ensure work gets cancelled */ + work->flags |= IO_WQ_WORK_CANCEL; + } } if ((work->flags & IO_WQ_WORK_FS) && current->fs != work->identity->fs) current->fs = work->identity->fs; -- cgit v1.2.3