From b5bf9a90bbebffba888c9144c5a8a10317b04064 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Apr 2018 14:51:01 +0200 Subject: sched/core: Introduce set_special_state() Gaurav reported a perceived problem with TASK_PARKED, which turned out to be a broken wait-loop pattern in __kthread_parkme(), but the reported issue can (and does) in fact happen for states that do not do condition based sleeps. When the 'current->state = TASK_RUNNING' store of a previous (concurrent) try_to_wake_up() collides with the setting of a 'special' sleep state, we can loose the sleep state. Normal condition based wait-loops are immune to this problem, but for sleep states that are not condition based are subject to this problem. There already is a fix for TASK_DEAD. Abstract that and also apply it to TASK_STOPPED and TASK_TRACED, both of which are also without condition based wait-loop. Reported-by: Gaurav Kohli Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Oleg Nesterov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/sched.h | 50 +++++++++++++++++++++++++++++++++++++++----- include/linux/sched/signal.h | 2 +- 2 files changed, 46 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index b3d697f3b573..c2413703f45d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -112,17 +112,36 @@ struct task_group; #ifdef CONFIG_DEBUG_ATOMIC_SLEEP +/* + * Special states are those that do not use the normal wait-loop pattern. See + * the comment with set_special_state(). + */ +#define is_special_task_state(state) \ + ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_DEAD)) + #define __set_current_state(state_value) \ do { \ + WARN_ON_ONCE(is_special_task_state(state_value));\ current->task_state_change = _THIS_IP_; \ current->state = (state_value); \ } while (0) + #define set_current_state(state_value) \ do { \ + WARN_ON_ONCE(is_special_task_state(state_value));\ current->task_state_change = _THIS_IP_; \ smp_store_mb(current->state, (state_value)); \ } while (0) +#define set_special_state(state_value) \ + do { \ + unsigned long flags; /* may shadow */ \ + WARN_ON_ONCE(!is_special_task_state(state_value)); \ + raw_spin_lock_irqsave(¤t->pi_lock, flags); \ + current->task_state_change = _THIS_IP_; \ + current->state = (state_value); \ + raw_spin_unlock_irqrestore(¤t->pi_lock, flags); \ + } while (0) #else /* * set_current_state() includes a barrier so that the write of current->state @@ -144,8 +163,8 @@ struct task_group; * * The above is typically ordered against the wakeup, which does: * - * need_sleep = false; - * wake_up_state(p, TASK_UNINTERRUPTIBLE); + * need_sleep = false; + * wake_up_state(p, TASK_UNINTERRUPTIBLE); * * Where wake_up_state() (and all other wakeup primitives) imply enough * barriers to order the store of the variable against wakeup. @@ -154,12 +173,33 @@ struct task_group; * once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a * TASK_RUNNING store which can collide with __set_current_state(TASK_RUNNING). * - * This is obviously fine, since they both store the exact same value. + * However, with slightly different timing the wakeup TASK_RUNNING store can + * also collide with the TASK_UNINTERRUPTIBLE store. Loosing that store is not + * a problem either because that will result in one extra go around the loop + * and our @cond test will save the day. * * Also see the comments of try_to_wake_up(). */ -#define __set_current_state(state_value) do { current->state = (state_value); } while (0) -#define set_current_state(state_value) smp_store_mb(current->state, (state_value)) +#define __set_current_state(state_value) \ + current->state = (state_value) + +#define set_current_state(state_value) \ + smp_store_mb(current->state, (state_value)) + +/* + * set_special_state() should be used for those states when the blocking task + * can not use the regular condition based wait-loop. In that case we must + * serialize against wakeups such that any possible in-flight TASK_RUNNING stores + * will not collide with our state change. + */ +#define set_special_state(state_value) \ + do { \ + unsigned long flags; /* may shadow */ \ + raw_spin_lock_irqsave(¤t->pi_lock, flags); \ + current->state = (state_value); \ + raw_spin_unlock_irqrestore(¤t->pi_lock, flags); \ + } while (0) + #endif /* Task command name length: */ diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index a7ce74c74e49..113d1ad1ced7 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -280,7 +280,7 @@ static inline void kernel_signal_stop(void) { spin_lock_irq(¤t->sighand->siglock); if (current->jobctl & JOBCTL_STOP_DEQUEUED) - __set_current_state(TASK_STOPPED); + set_special_state(TASK_STOPPED); spin_unlock_irq(¤t->sighand->siglock); schedule(); -- cgit v1.2.3