From fd844ba9ae59b51e34e77105d79f8eca780b3bd6 Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Wed, 17 Jun 2020 14:17:42 +0200 Subject: sched/core: Check cpus_mask, not cpus_ptr in __set_cpus_allowed_ptr(), to fix mask corruption This function is concerned with the long-term CPU mask, not the transitory mask the task might have while migrate disabled. Before this patch, if a task was migrate-disabled at the time __set_cpus_allowed_ptr() was called, and the new mask happened to be equal to the CPU that the task was running on, then the mask update would be lost. Signed-off-by: Scott Wood Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200617121742.cpxppyi7twxmpin7@linutronix.de --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8f360326861e..9eeac94224db 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1637,7 +1637,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, goto out; } - if (cpumask_equal(p->cpus_ptr, new_mask)) + if (cpumask_equal(&p->cpus_mask, new_mask)) goto out; /* -- cgit v1.2.3 From ce9bc3b27f2a21a7969b41ffb04df8cf61bd1592 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Wed, 17 Jun 2020 09:29:19 +0200 Subject: sched/deadline: Initialize ->dl_boosted syzbot reported the following warning triggered via SYSC_sched_setattr(): WARNING: CPU: 0 PID: 6973 at kernel/sched/deadline.c:593 setup_new_dl_entity /kernel/sched/deadline.c:594 [inline] WARNING: CPU: 0 PID: 6973 at kernel/sched/deadline.c:593 enqueue_dl_entity /kernel/sched/deadline.c:1370 [inline] WARNING: CPU: 0 PID: 6973 at kernel/sched/deadline.c:593 enqueue_task_dl+0x1c17/0x2ba0 /kernel/sched/deadline.c:1441 This happens because the ->dl_boosted flag is currently not initialized by __dl_clear_params() (unlike the other flags) and setup_new_dl_entity() rightfully complains about it. Initialize dl_boosted to 0. Fixes: 2d3d891d3344 ("sched/deadline: Add SCHED_DEADLINE inheritance logic") Reported-by: syzbot+5ac8bac25f95e8b221e7@syzkaller.appspotmail.com Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Tested-by: Daniel Wagner Link: https://lkml.kernel.org/r/20200617072919.818409-1-juri.lelli@redhat.com --- kernel/sched/deadline.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 504d2f51b0d6..f63f337c7147 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2692,6 +2692,7 @@ void __dl_clear_params(struct task_struct *p) dl_se->dl_bw = 0; dl_se->dl_density = 0; + dl_se->dl_boosted = 0; dl_se->dl_throttled = 0; dl_se->dl_yielded = 0; dl_se->dl_non_contending = 0; -- cgit v1.2.3 From 740797ce3a124b7dd22b7fb832d87bc8fba1cf6f Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Mon, 19 Nov 2018 16:32:01 +0100 Subject: sched/core: Fix PI boosting between RT and DEADLINE tasks syzbot reported the following warning: WARNING: CPU: 1 PID: 6351 at kernel/sched/deadline.c:628 enqueue_task_dl+0x22da/0x38a0 kernel/sched/deadline.c:1504 At deadline.c:628 we have: 623 static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) 624 { 625 struct dl_rq *dl_rq = dl_rq_of_se(dl_se); 626 struct rq *rq = rq_of_dl_rq(dl_rq); 627 628 WARN_ON(dl_se->dl_boosted); 629 WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline)); [...] } Which means that setup_new_dl_entity() has been called on a task currently boosted. This shouldn't happen though, as setup_new_dl_entity() is only called when the 'dynamic' deadline of the new entity is in the past w.r.t. rq_clock and boosted tasks shouldn't verify this condition. Digging through the PI code I noticed that what above might in fact happen if an RT tasks blocks on an rt_mutex hold by a DEADLINE task. In the first branch of boosting conditions we check only if a pi_task 'dynamic' deadline is earlier than mutex holder's and in this case we set mutex holder to be dl_boosted. However, since RT 'dynamic' deadlines are only initialized if such tasks get boosted at some point (or if they become DEADLINE of course), in general RT 'dynamic' deadlines are usually equal to 0 and this verifies the aforementioned condition. Fix it by checking that the potential donor task is actually (even if temporary because in turn boosted) running at DEADLINE priority before using its 'dynamic' deadline value. Fixes: 2d3d891d3344 ("sched/deadline: Add SCHED_DEADLINE inheritance logic") Reported-by: syzbot+119ba87189432ead09b4@syzkaller.appspotmail.com Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Daniel Bristot de Oliveira Tested-by: Daniel Wagner Link: https://lkml.kernel.org/r/20181119153201.GB2119@localhost.localdomain --- kernel/sched/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9eeac94224db..c1ba2e569fc9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4533,7 +4533,8 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) */ if (dl_prio(prio)) { if (!dl_prio(p->normal_prio) || - (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { + (pi_task && dl_prio(pi_task->prio) && + dl_entity_preempt(&pi_task->dl, &p->dl))) { p->dl.dl_boosted = 1; queue_flag |= ENQUEUE_REPLENISH; } else -- cgit v1.2.3 From b6e13e85829f032411b896bd2f0d6cbe4b0a3c4a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Jun 2020 12:01:23 +0200 Subject: sched/core: Fix ttwu() race Paul reported rcutorture occasionally hitting a NULL deref: sched_ttwu_pending() ttwu_do_wakeup() check_preempt_curr() := check_preempt_wakeup() find_matching_se() is_same_group() if (se->cfs_rq == pse->cfs_rq) <-- *BOOM* Debugging showed that this only appears to happen when we take the new code-path from commit: 2ebb17717550 ("sched/core: Offload wakee task activation if it the wakee is descheduling") and only when @cpu == smp_processor_id(). Something which should not be possible, because p->on_cpu can only be true for remote tasks. Similarly, without the new code-path from commit: c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu") this would've unconditionally hit: smp_cond_load_acquire(&p->on_cpu, !VAL); and if: 'cpu == smp_processor_id() && p->on_cpu' is possible, this would result in an instant live-lock (with IRQs disabled), something that hasn't been reported. The NULL deref can be explained however if the task_cpu(p) load at the beginning of try_to_wake_up() returns an old value, and this old value happens to be smp_processor_id(). Further assume that the p->on_cpu load accurately returns 1, it really is still running, just not here. Then, when we enqueue the task locally, we can crash in exactly the observed manner because p->se.cfs_rq != rq->cfs_rq, because p's cfs_rq is from the wrong CPU, therefore we'll iterate into the non-existant parents and NULL deref. The closest semi-plausible scenario I've managed to contrive is somewhat elaborate (then again, actual reproduction takes many CPU hours of rcutorture, so it can't be anything obvious): X->cpu = 1 rq(1)->curr = X CPU0 CPU1 CPU2 // switch away from X LOCK rq(1)->lock smp_mb__after_spinlock dequeue_task(X) X->on_rq = 9 switch_to(Z) X->on_cpu = 0 UNLOCK rq(1)->lock // migrate X to cpu 0 LOCK rq(1)->lock dequeue_task(X) set_task_cpu(X, 0) X->cpu = 0 UNLOCK rq(1)->lock LOCK rq(0)->lock enqueue_task(X) X->on_rq = 1 UNLOCK rq(0)->lock // switch to X LOCK rq(0)->lock smp_mb__after_spinlock switch_to(X) X->on_cpu = 1 UNLOCK rq(0)->lock // X goes sleep X->state = TASK_UNINTERRUPTIBLE smp_mb(); // wake X ttwu() LOCK X->pi_lock smp_mb__after_spinlock if (p->state) cpu = X->cpu; // =? 1 smp_rmb() // X calls schedule() LOCK rq(0)->lock smp_mb__after_spinlock dequeue_task(X) X->on_rq = 0 if (p->on_rq) smp_rmb(); if (p->on_cpu && ttwu_queue_wakelist(..)) [*] smp_cond_load_acquire(&p->on_cpu, !VAL) cpu = select_task_rq(X, X->wake_cpu, ...) if (X->cpu != cpu) switch_to(Y) X->on_cpu = 0 UNLOCK rq(0)->lock However I'm having trouble convincing myself that's actually possible on x86_64 -- after all, every LOCK implies an smp_mb() there, so if ttwu observes ->state != RUNNING, it must also observe ->cpu != 1. (Most of the previous ttwu() races were found on very large PowerPC) Nevertheless, this fully explains the observed failure case. Fix it by ordering the task_cpu(p) load after the p->on_cpu load, which is easy since nothing actually uses @cpu before this. Fixes: c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu") Reported-by: Paul E. McKenney Tested-by: Paul E. McKenney Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200622125649.GC576871@hirez.programming.kicks-ass.net --- kernel/sched/core.c | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c1ba2e569fc9..60791b991b21 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2293,8 +2293,15 @@ void sched_ttwu_pending(void *arg) rq_lock_irqsave(rq, &rf); update_rq_clock(rq); - llist_for_each_entry_safe(p, t, llist, wake_entry) + llist_for_each_entry_safe(p, t, llist, wake_entry) { + if (WARN_ON_ONCE(p->on_cpu)) + smp_cond_load_acquire(&p->on_cpu, !VAL); + + if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq))) + set_task_cpu(p, cpu_of(rq)); + ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf); + } rq_unlock_irqrestore(rq, &rf); } @@ -2378,6 +2385,9 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags) static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) { if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) { + if (WARN_ON_ONCE(cpu == smp_processor_id())) + return false; + sched_clock_cpu(cpu); /* Sync clocks across CPUs */ __ttwu_queue_wakelist(p, cpu, wake_flags); return true; @@ -2528,7 +2538,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) goto out; success = 1; - cpu = task_cpu(p); trace_sched_waking(p); p->state = TASK_RUNNING; trace_sched_wakeup(p); @@ -2550,7 +2559,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) /* We're going to change ->state: */ success = 1; - cpu = task_cpu(p); /* * Ensure we load p->on_rq _after_ p->state, otherwise it would @@ -2614,8 +2622,21 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * which potentially sends an IPI instead of spinning on p->on_cpu to * let the waker make forward progress. This is safe because IRQs are * disabled and the IPI will deliver after on_cpu is cleared. + * + * Ensure we load task_cpu(p) after p->on_cpu: + * + * set_task_cpu(p, cpu); + * STORE p->cpu = @cpu + * __schedule() (switch to task 'p') + * LOCK rq->lock + * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu) + * STORE p->on_cpu = 1 LOAD p->cpu + * + * to ensure we observe the correct CPU on which the task is currently + * scheduling. */ - if (READ_ONCE(p->on_cpu) && ttwu_queue_wakelist(p, cpu, wake_flags | WF_ON_RQ)) + if (smp_load_acquire(&p->on_cpu) && + ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_RQ)) goto unlock; /* @@ -2635,6 +2656,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) psi_ttwu_dequeue(p); set_task_cpu(p, cpu); } +#else + cpu = task_cpu(p); #endif /* CONFIG_SMP */ ttwu_queue(p, cpu, wake_flags); @@ -2642,7 +2665,7 @@ unlock: raw_spin_unlock_irqrestore(&p->pi_lock, flags); out: if (success) - ttwu_stat(p, cpu, wake_flags); + ttwu_stat(p, task_cpu(p), wake_flags); preempt_enable(); return success; -- cgit v1.2.3 From 739f70b476cf05c5a424b42a8b5728914345610c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Jun 2020 12:01:24 +0200 Subject: sched/core: s/WF_ON_RQ/WQ_ON_CPU/ Use a better name for this poorly named flag, to avoid confusion... Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Mel Gorman Link: https://lkml.kernel.org/r/20200622100825.785115830@infradead.org --- kernel/sched/core.c | 4 ++-- kernel/sched/sched.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 60791b991b21..f778067de277 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2376,7 +2376,7 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags) * the soon-to-be-idle CPU as the current CPU is likely busy. * nr_running is checked to avoid unnecessary task stacking. */ - if ((wake_flags & WF_ON_RQ) && cpu_rq(cpu)->nr_running <= 1) + if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1) return true; return false; @@ -2636,7 +2636,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * scheduling. */ if (smp_load_acquire(&p->on_cpu) && - ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_RQ)) + ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU)) goto unlock; /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1d4e94c1e5fe..877fb08eb1b0 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1682,7 +1682,7 @@ static inline int task_on_rq_migrating(struct task_struct *p) #define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */ #define WF_FORK 0x02 /* Child wakeup after fork */ #define WF_MIGRATED 0x04 /* Internal use, task got migrated */ -#define WF_ON_RQ 0x08 /* Wakee is on_rq */ +#define WF_ON_CPU 0x08 /* Wakee is on_cpu */ /* * To aid in avoiding the subversion of "niceness" due to uneven distribution -- cgit v1.2.3 From 8c4890d1c3358fb8023d46e1e554c41d54f02878 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Jun 2020 12:01:25 +0200 Subject: smp, irq_work: Continue smp_call_function*() and irq_work*() integration Instead of relying on BUG_ON() to ensure the various data structures line up, use a bunch of horrible unions to make it all automatic. Much of the union magic is to ensure irq_work and smp_call_function do not (yet) see the members of their respective data structures change name. Suggested-by: Linus Torvalds Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20200622100825.844455025@infradead.org --- kernel/sched/core.c | 6 +++--- kernel/smp.c | 18 ------------------ 2 files changed, 3 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f778067de277..ca5db40392d4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2293,7 +2293,7 @@ void sched_ttwu_pending(void *arg) rq_lock_irqsave(rq, &rf); update_rq_clock(rq); - llist_for_each_entry_safe(p, t, llist, wake_entry) { + llist_for_each_entry_safe(p, t, llist, wake_entry.llist) { if (WARN_ON_ONCE(p->on_cpu)) smp_cond_load_acquire(&p->on_cpu, !VAL); @@ -2329,7 +2329,7 @@ static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED); WRITE_ONCE(rq->ttwu_pending, 1); - __smp_call_single_queue(cpu, &p->wake_entry); + __smp_call_single_queue(cpu, &p->wake_entry.llist); } void wake_up_if_idle(int cpu) @@ -2786,7 +2786,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #endif init_numa_balancing(clone_flags, p); #ifdef CONFIG_SMP - p->wake_entry_type = CSD_TYPE_TTWU; + p->wake_entry.u_flags = CSD_TYPE_TTWU; #endif } diff --git a/kernel/smp.c b/kernel/smp.c index 472c2b274c65..aa17eedff5be 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -669,24 +669,6 @@ void __init smp_init(void) { int num_nodes, num_cpus; - /* - * Ensure struct irq_work layout matches so that - * flush_smp_call_function_queue() can do horrible things. - */ - BUILD_BUG_ON(offsetof(struct irq_work, llnode) != - offsetof(struct __call_single_data, llist)); - BUILD_BUG_ON(offsetof(struct irq_work, func) != - offsetof(struct __call_single_data, func)); - BUILD_BUG_ON(offsetof(struct irq_work, flags) != - offsetof(struct __call_single_data, flags)); - - /* - * Assert the CSD_TYPE_TTWU layout is similar enough - * for task_struct to be on the @call_single_queue. - */ - BUILD_BUG_ON(offsetof(struct task_struct, wake_entry_type) - offsetof(struct task_struct, wake_entry) != - offsetof(struct __call_single_data, flags) - offsetof(struct __call_single_data, llist)); - idle_threads_init(); cpuhp_threads_init(); -- cgit v1.2.3 From e21cf43406a190adfcc4bfe592768066fb3aaa9b Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 24 Jun 2020 17:44:22 +0200 Subject: sched/cfs: change initial value of runnable_avg Some performance regression on reaim benchmark have been raised with commit 070f5e860ee2 ("sched/fair: Take into account runnable_avg to classify group") The problem comes from the init value of runnable_avg which is initialized with max value. This can be a problem if the newly forked task is finally a short task because the group of CPUs is wrongly set to overloaded and tasks are pulled less agressively. Set initial value of runnable_avg equals to util_avg to reflect that there is no waiting time so far. Fixes: 070f5e860ee2 ("sched/fair: Take into account runnable_avg to classify group") Reported-by: kernel test robot Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200624154422.29166-1-vincent.guittot@linaro.org --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cbcb2f71599b..658aa7a2ae6f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -806,7 +806,7 @@ void post_init_entity_util_avg(struct task_struct *p) } } - sa->runnable_avg = cpu_scale; + sa->runnable_avg = sa->util_avg; if (p->sched_class != &fair_sched_class) { /* -- cgit v1.2.3