diff options
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r-- | kernel/sched/fair.c | 440 |
1 files changed, 314 insertions, 126 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 23663318fb81..e6d1dd4e9d68 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -268,33 +268,11 @@ const struct sched_class fair_sched_class; */ #ifdef CONFIG_FAIR_GROUP_SCHED -static inline struct task_struct *task_of(struct sched_entity *se) -{ - SCHED_WARN_ON(!entity_is_task(se)); - return container_of(se, struct task_struct, se); -} /* Walk up scheduling entities hierarchy */ #define for_each_sched_entity(se) \ for (; se; se = se->parent) -static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) -{ - return p->se.cfs_rq; -} - -/* runqueue on which this entity is (to be) queued */ -static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) -{ - return se->cfs_rq; -} - -/* runqueue "owned" by this group */ -static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) -{ - return grp->my_q; -} - static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len) { if (!path) @@ -455,33 +433,9 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) #else /* !CONFIG_FAIR_GROUP_SCHED */ -static inline struct task_struct *task_of(struct sched_entity *se) -{ - return container_of(se, struct task_struct, se); -} - #define for_each_sched_entity(se) \ for (; se; se = NULL) -static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) -{ - return &task_rq(p)->cfs; -} - -static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) -{ - struct task_struct *p = task_of(se); - struct rq *rq = task_rq(p); - - return &rq->cfs; -} - -/* runqueue "owned" by this group */ -static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) -{ - return NULL; -} - static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len) { if (path) @@ -1039,11 +993,14 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) { struct task_struct *tsk = task_of(se); + unsigned int state; - if (tsk->state & TASK_INTERRUPTIBLE) + /* XXX racy against TTWU */ + state = READ_ONCE(tsk->__state); + if (state & TASK_INTERRUPTIBLE) __schedstat_set(se->statistics.sleep_start, rq_clock(rq_of(cfs_rq))); - if (tsk->state & TASK_UNINTERRUPTIBLE) + if (state & TASK_UNINTERRUPTIBLE) __schedstat_set(se->statistics.block_start, rq_clock(rq_of(cfs_rq))); } @@ -1107,7 +1064,7 @@ struct numa_group { static struct numa_group *deref_task_numa_group(struct task_struct *p) { return rcu_dereference_check(p->numa_group, p == current || - (lockdep_is_held(&task_rq(p)->lock) && !READ_ONCE(p->on_cpu))); + (lockdep_is_held(__rq_lockp(task_rq(p))) && !READ_ONCE(p->on_cpu))); } static struct numa_group *deref_curr_numa_group(struct task_struct *p) @@ -3139,7 +3096,7 @@ void reweight_task(struct task_struct *p, int prio) * * tg->weight * grq->load.weight * ge->load.weight = ----------------------------- (1) - * \Sum grq->load.weight + * \Sum grq->load.weight * * Now, because computing that sum is prohibitively expensive to compute (been * there, done that) we approximate it with this average stuff. The average @@ -3153,7 +3110,7 @@ void reweight_task(struct task_struct *p, int prio) * * tg->weight * grq->avg.load_avg * ge->load.weight = ------------------------------ (3) - * tg->load_avg + * tg->load_avg * * Where: tg->load_avg ~= \Sum grq->avg.load_avg * @@ -3169,7 +3126,7 @@ void reweight_task(struct task_struct *p, int prio) * * tg->weight * grq->load.weight * ge->load.weight = ----------------------------- = tg->weight (4) - * grp->load.weight + * grp->load.weight * * That is, the sum collapses because all other CPUs are idle; the UP scenario. * @@ -3188,7 +3145,7 @@ void reweight_task(struct task_struct *p, int prio) * * tg->weight * grq->load.weight * ge->load.weight = ----------------------------- (6) - * tg_load_avg' + * tg_load_avg' * * Where: * @@ -3341,6 +3298,15 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) if (child_cfs_rq_on_list(cfs_rq)) return false; + /* + * _avg must be null when _sum are null because _avg = _sum / divider + * Make sure that rounding and/or propagation of PELT values never + * break this. + */ + SCHED_WARN_ON(cfs_rq->avg.load_avg || + cfs_rq->avg.util_avg || + cfs_rq->avg.runnable_avg); + return true; } @@ -3594,9 +3560,12 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq load_sum = (s64)se_weight(se) * runnable_sum; load_avg = div_s64(load_sum, divider); + se->avg.load_sum = runnable_sum; + delta = load_avg - se->avg.load_avg; + if (!delta) + return; - se->avg.load_sum = runnable_sum; se->avg.load_avg = load_avg; add_positive(&cfs_rq->avg.load_avg, delta); @@ -4476,6 +4445,8 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) static void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { + clear_buddies(cfs_rq, se); + /* 'current' is not kept within the tree. */ if (se->on_rq) { /* @@ -4535,7 +4506,7 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) * Avoid running the skip buddy, if running something else can * be done without getting too unfair. */ - if (cfs_rq->skip == se) { + if (cfs_rq->skip && cfs_rq->skip == se) { struct sched_entity *second; if (se == curr) { @@ -4562,8 +4533,6 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) se = cfs_rq->last; } - clear_buddies(cfs_rq, se); - return se; } @@ -4685,8 +4654,11 @@ static inline u64 sched_cfs_bandwidth_slice(void) */ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) { - if (cfs_b->quota != RUNTIME_INF) - cfs_b->runtime = cfs_b->quota; + if (unlikely(cfs_b->quota == RUNTIME_INF)) + return; + + cfs_b->runtime += cfs_b->quota; + cfs_b->runtime = min(cfs_b->runtime, cfs_b->quota + cfs_b->burst); } static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) @@ -5047,6 +5019,9 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u throttled = !list_empty(&cfs_b->throttled_cfs_rq); cfs_b->nr_periods += overrun; + /* Refill extra burst quota even if cfs_b->idle */ + __refill_cfs_bandwidth_runtime(cfs_b); + /* * idle depends on !throttled (for the case of a large deficit), and if * we're going inactive then everything else can be deferred @@ -5054,8 +5029,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u if (cfs_b->idle && !throttled) goto out_deactivate; - __refill_cfs_bandwidth_runtime(cfs_b); - if (!throttled) { /* mark as potentially idle for the upcoming period */ cfs_b->idle = 1; @@ -5305,6 +5278,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) if (new < max_cfs_quota_period) { cfs_b->period = ns_to_ktime(new); cfs_b->quota *= 2; + cfs_b->burst *= 2; pr_warn_ratelimited( "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n", @@ -5336,6 +5310,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) cfs_b->runtime = 0; cfs_b->quota = RUNTIME_INF; cfs_b->period = ns_to_ktime(default_cfs_period()); + cfs_b->burst = 0; INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); @@ -5385,7 +5360,7 @@ static void __maybe_unused update_runtime_enabled(struct rq *rq) { struct task_group *tg; - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); rcu_read_lock(); list_for_each_entry_rcu(tg, &task_groups, list) { @@ -5404,7 +5379,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) { struct task_group *tg; - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); rcu_read_lock(); list_for_each_entry_rcu(tg, &task_groups, list) { @@ -5992,11 +5967,15 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this /* Traverse only the allowed CPUs */ for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) { + struct rq *rq = cpu_rq(i); + + if (!sched_core_cookie_match(rq, p)) + continue; + if (sched_idle_cpu(i)) return i; if (available_idle_cpu(i)) { - struct rq *rq = cpu_rq(i); struct cpuidle_state *idle = idle_get_state(rq); if (idle && idle->exit_latency < min_exit_latency) { /* @@ -6082,9 +6061,10 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p return new_cpu; } -static inline int __select_idle_cpu(int cpu) +static inline int __select_idle_cpu(int cpu, struct task_struct *p) { - if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) + if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) && + sched_cpu_cookie_match(cpu_rq(cpu), p)) return cpu; return -1; @@ -6154,7 +6134,7 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu int cpu; if (!static_branch_likely(&sched_smt_present)) - return __select_idle_cpu(core); + return __select_idle_cpu(core, p); for_each_cpu(cpu, cpu_smt_mask(core)) { if (!available_idle_cpu(cpu)) { @@ -6210,7 +6190,7 @@ static inline bool test_idle_cores(int cpu, bool def) static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu) { - return __select_idle_cpu(core); + return __select_idle_cpu(core, p); } static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) @@ -6229,9 +6209,10 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool { struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); int i, cpu, idle_cpu = -1, nr = INT_MAX; + struct rq *this_rq = this_rq(); int this = smp_processor_id(); struct sched_domain *this_sd; - u64 time; + u64 time = 0; this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); if (!this_sd) @@ -6241,12 +6222,21 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool if (sched_feat(SIS_PROP) && !has_idle_core) { u64 avg_cost, avg_idle, span_avg; + unsigned long now = jiffies; /* - * Due to large variance we need a large fuzz factor; - * hackbench in particularly is sensitive here. + * If we're busy, the assumption that the last idle period + * predicts the future is flawed; age away the remaining + * predicted idle time. */ - avg_idle = this_rq()->avg_idle / 512; + if (unlikely(this_rq->wake_stamp < now)) { + while (this_rq->wake_stamp < now && this_rq->wake_avg_idle) { + this_rq->wake_stamp++; + this_rq->wake_avg_idle >>= 1; + } + } + + avg_idle = this_rq->wake_avg_idle; avg_cost = this_sd->avg_scan_cost + 1; span_avg = sd->span_weight * avg_idle; @@ -6267,7 +6257,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool } else { if (!--nr) return -1; - idle_cpu = __select_idle_cpu(cpu); + idle_cpu = __select_idle_cpu(cpu, p); if ((unsigned int)idle_cpu < nr_cpumask_bits) break; } @@ -6278,6 +6268,13 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool if (sched_feat(SIS_PROP) && !has_idle_core) { time = cpu_clock(this) - time; + + /* + * Account for the scan cost of wakeups against the average + * idle time. + */ + this_rq->wake_avg_idle -= min(this_rq->wake_avg_idle, time); + update_avg(&this_sd->avg_scan_cost, time); } @@ -6345,6 +6342,11 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) task_util = uclamp_task_util(p); } + /* + * per-cpu select_idle_mask usage + */ + lockdep_assert_irqs_disabled(); + if ((available_idle_cpu(target) || sched_idle_cpu(target)) && asym_fits_capacity(task_util, target)) return target; @@ -6620,8 +6622,11 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) struct cpumask *pd_mask = perf_domain_span(pd); unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask)); unsigned long max_util = 0, sum_util = 0; + unsigned long _cpu_cap = cpu_cap; int cpu; + _cpu_cap -= arch_scale_thermal_pressure(cpumask_first(pd_mask)); + /* * The capacity state of CPUs of the current rd can be driven by CPUs * of another rd if they belong to the same pd. So, account for the @@ -6657,8 +6662,10 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) * is already enough to scale the EM reported power * consumption at the (eventually clamped) cpu_capacity. */ - sum_util += effective_cpu_util(cpu, util_running, cpu_cap, - ENERGY_UTIL, NULL); + cpu_util = effective_cpu_util(cpu, util_running, cpu_cap, + ENERGY_UTIL, NULL); + + sum_util += min(cpu_util, _cpu_cap); /* * Performance domain frequency: utilization clamping @@ -6669,10 +6676,10 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) */ cpu_util = effective_cpu_util(cpu, util_freq, cpu_cap, FREQUENCY_UTIL, tsk); - max_util = max(max_util, cpu_util); + max_util = max(max_util, min(cpu_util, _cpu_cap)); } - return em_cpu_energy(pd->em_pd, max_util, sum_util); + return em_cpu_energy(pd->em_pd, max_util, sum_util, _cpu_cap); } /* @@ -6718,15 +6725,15 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) { unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX; struct root_domain *rd = cpu_rq(smp_processor_id())->rd; + int cpu, best_energy_cpu = prev_cpu, target = -1; unsigned long cpu_cap, util, base_energy = 0; - int cpu, best_energy_cpu = prev_cpu; struct sched_domain *sd; struct perf_domain *pd; rcu_read_lock(); pd = rcu_dereference(rd->pd); if (!pd || READ_ONCE(rd->overutilized)) - goto fail; + goto unlock; /* * Energy-aware wake-up happens on the lowest sched_domain starting @@ -6736,7 +6743,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) sd = sd->parent; if (!sd) - goto fail; + goto unlock; + + target = prev_cpu; sync_entity_load_avg(&p->se); if (!task_util_est(p)) @@ -6744,13 +6753,10 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) for (; pd; pd = pd->next) { unsigned long cur_delta, spare_cap, max_spare_cap = 0; + bool compute_prev_delta = false; unsigned long base_energy_pd; int max_spare_cap_cpu = -1; - /* Compute the 'base' energy of the pd, without @p */ - base_energy_pd = compute_energy(p, -1, pd); - base_energy += base_energy_pd; - for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) { if (!cpumask_test_cpu(cpu, p->cpus_ptr)) continue; @@ -6771,26 +6777,40 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) if (!fits_capacity(util, cpu_cap)) continue; - /* Always use prev_cpu as a candidate. */ if (cpu == prev_cpu) { - prev_delta = compute_energy(p, prev_cpu, pd); - prev_delta -= base_energy_pd; - best_delta = min(best_delta, prev_delta); - } - - /* - * Find the CPU with the maximum spare capacity in - * the performance domain - */ - if (spare_cap > max_spare_cap) { + /* Always use prev_cpu as a candidate. */ + compute_prev_delta = true; + } else if (spare_cap > max_spare_cap) { + /* + * Find the CPU with the maximum spare capacity + * in the performance domain. + */ max_spare_cap = spare_cap; max_spare_cap_cpu = cpu; } } - /* Evaluate the energy impact of using this CPU. */ - if (max_spare_cap_cpu >= 0 && max_spare_cap_cpu != prev_cpu) { + if (max_spare_cap_cpu < 0 && !compute_prev_delta) + continue; + + /* Compute the 'base' energy of the pd, without @p */ + base_energy_pd = compute_energy(p, -1, pd); + base_energy += base_energy_pd; + + /* Evaluate the energy impact of using prev_cpu. */ + if (compute_prev_delta) { + prev_delta = compute_energy(p, prev_cpu, pd); + if (prev_delta < base_energy_pd) + goto unlock; + prev_delta -= base_energy_pd; + best_delta = min(best_delta, prev_delta); + } + + /* Evaluate the energy impact of using max_spare_cap_cpu. */ + if (max_spare_cap_cpu >= 0) { cur_delta = compute_energy(p, max_spare_cap_cpu, pd); + if (cur_delta < base_energy_pd) + goto unlock; cur_delta -= base_energy_pd; if (cur_delta < best_delta) { best_delta = cur_delta; @@ -6798,25 +6818,22 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) } } } -unlock: rcu_read_unlock(); /* * Pick the best CPU if prev_cpu cannot be used, or if it saves at * least 6% of the energy used by prev_cpu. */ - if (prev_delta == ULONG_MAX) - return best_energy_cpu; - - if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4)) - return best_energy_cpu; + if ((prev_delta == ULONG_MAX) || + (prev_delta - best_delta) > ((prev_delta + base_energy) >> 4)) + target = best_energy_cpu; - return prev_cpu; + return target; -fail: +unlock: rcu_read_unlock(); - return -1; + return target; } /* @@ -6828,8 +6845,6 @@ fail: * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set. * * Returns the target CPU number. - * - * preempt must be disabled. */ static int select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) @@ -6842,6 +6857,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) /* SD_flags and WF_flags share the first nibble */ int sd_flag = wake_flags & 0xF; + /* + * required for stable ->cpus_allowed + */ + lockdep_assert_held(&p->pi_lock); if (wake_flags & WF_TTWU) { record_wakee(p); @@ -6906,7 +6925,7 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) * min_vruntime -- the latter is done by enqueue_entity() when placing * the task on the new runqueue. */ - if (p->state == TASK_WAKING) { + if (READ_ONCE(p->__state) == TASK_WAKING) { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 min_vruntime; @@ -6931,7 +6950,7 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old' * rq->lock and can modify state directly. */ - lockdep_assert_held(&task_rq(p)->lock); + lockdep_assert_rq_held(task_rq(p)); detach_entity_cfs_rq(&p->se); } else { @@ -7135,6 +7154,39 @@ preempt: set_last_buddy(se); } +#ifdef CONFIG_SMP +static struct task_struct *pick_task_fair(struct rq *rq) +{ + struct sched_entity *se; + struct cfs_rq *cfs_rq; + +again: + cfs_rq = &rq->cfs; + if (!cfs_rq->nr_running) + return NULL; + + do { + struct sched_entity *curr = cfs_rq->curr; + + /* When we pick for a remote RQ, we'll not have done put_prev_entity() */ + if (curr) { + if (curr->on_rq) + update_curr(cfs_rq); + else + curr = NULL; + + if (unlikely(check_cfs_rq_runtime(cfs_rq))) + goto again; + } + + se = pick_next_entity(cfs_rq, curr); + cfs_rq = group_cfs_rq(se); + } while (cfs_rq); + + return task_of(se); +} +#endif + struct task_struct * pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { @@ -7558,7 +7610,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env) { s64 delta; - lockdep_assert_held(&env->src_rq->lock); + lockdep_assert_rq_held(env->src_rq); if (p->sched_class != &fair_sched_class) return 0; @@ -7580,6 +7632,14 @@ static int task_hot(struct task_struct *p, struct lb_env *env) if (sysctl_sched_migration_cost == -1) return 1; + + /* + * Don't migrate task if the task's cookie does not match + * with the destination CPU's core cookie. + */ + if (!sched_core_cookie_match(cpu_rq(env->dst_cpu), p)) + return 1; + if (sysctl_sched_migration_cost == 0) return 0; @@ -7656,7 +7716,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) { int tsk_cache_hot; - lockdep_assert_held(&env->src_rq->lock); + lockdep_assert_rq_held(env->src_rq); /* * We do not migrate tasks that are: @@ -7745,7 +7805,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) */ static void detach_task(struct task_struct *p, struct lb_env *env) { - lockdep_assert_held(&env->src_rq->lock); + lockdep_assert_rq_held(env->src_rq); deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, env->dst_cpu); @@ -7761,7 +7821,7 @@ static struct task_struct *detach_one_task(struct lb_env *env) { struct task_struct *p; - lockdep_assert_held(&env->src_rq->lock); + lockdep_assert_rq_held(env->src_rq); list_for_each_entry_reverse(p, &env->src_rq->cfs_tasks, se.group_node) { @@ -7797,7 +7857,7 @@ static int detach_tasks(struct lb_env *env) struct task_struct *p; int detached = 0; - lockdep_assert_held(&env->src_rq->lock); + lockdep_assert_rq_held(env->src_rq); /* * Source run queue has been emptied by another CPU, clear @@ -7927,7 +7987,7 @@ next: */ static void attach_task(struct rq *rq, struct task_struct *p) { - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); BUG_ON(task_rq(p) != rq); activate_task(rq, p, ENQUEUE_NOCLOCK); @@ -8893,6 +8953,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) p->cpus_ptr)) continue; + /* Skip over this group if no cookie matched */ + if (!sched_group_cookie_match(cpu_rq(this_cpu), p, group)) + continue; + local_group = cpumask_test_cpu(this_cpu, sched_group_span(group)); @@ -9821,7 +9885,7 @@ more_balance: if (need_active_balance(&env)) { unsigned long flags; - raw_spin_lock_irqsave(&busiest->lock, flags); + raw_spin_rq_lock_irqsave(busiest, flags); /* * Don't kick the active_load_balance_cpu_stop, @@ -9829,8 +9893,7 @@ more_balance: * moved to this_cpu: */ if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) { - raw_spin_unlock_irqrestore(&busiest->lock, - flags); + raw_spin_rq_unlock_irqrestore(busiest, flags); goto out_one_pinned; } @@ -9847,7 +9910,7 @@ more_balance: busiest->push_cpu = this_cpu; active_balance = 1; } - raw_spin_unlock_irqrestore(&busiest->lock, flags); + raw_spin_rq_unlock_irqrestore(busiest, flags); if (active_balance) { stop_one_cpu_nowait(cpu_of(busiest), @@ -10632,6 +10695,14 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) u64 curr_cost = 0; update_misfit_status(NULL, this_rq); + + /* + * There is a task waiting to run. No need to search for one. + * Return 0; the task will be enqueued when switching to idle. + */ + if (this_rq->ttwu_pending) + return 0; + /* * We must set idle_stamp _before_ calling idle_balance(), such that we * measure the duration of idle_balance() as idle time. @@ -10664,7 +10735,7 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) goto out; } - raw_spin_unlock(&this_rq->lock); + raw_spin_rq_unlock(this_rq); update_blocked_averages(this_cpu); rcu_read_lock(); @@ -10697,12 +10768,13 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) * Stop searching for tasks to pull if there are * now runnable tasks on this rq. */ - if (pulled_task || this_rq->nr_running > 0) + if (pulled_task || this_rq->nr_running > 0 || + this_rq->ttwu_pending) break; } rcu_read_unlock(); - raw_spin_lock(&this_rq->lock); + raw_spin_rq_lock(this_rq); if (curr_cost > this_rq->max_idle_balance_cost) this_rq->max_idle_balance_cost = curr_cost; @@ -10795,6 +10867,119 @@ static void rq_offline_fair(struct rq *rq) #endif /* CONFIG_SMP */ +#ifdef CONFIG_SCHED_CORE +static inline bool +__entity_slice_used(struct sched_entity *se, int min_nr_tasks) +{ + u64 slice = sched_slice(cfs_rq_of(se), se); + u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime; + + return (rtime * min_nr_tasks > slice); +} + +#define MIN_NR_TASKS_DURING_FORCEIDLE 2 +static inline void task_tick_core(struct rq *rq, struct task_struct *curr) +{ + if (!sched_core_enabled(rq)) + return; + + /* + * If runqueue has only one task which used up its slice and + * if the sibling is forced idle, then trigger schedule to + * give forced idle task a chance. + * + * sched_slice() considers only this active rq and it gets the + * whole slice. But during force idle, we have siblings acting + * like a single runqueue and hence we need to consider runnable + * tasks on this CPU and the forced idle CPU. Ideally, we should + * go through the forced idle rq, but that would be a perf hit. + * We can assume that the forced idle CPU has at least + * MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check + * if we need to give up the CPU. + */ + if (rq->core->core_forceidle && rq->cfs.nr_running == 1 && + __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE)) + resched_curr(rq); +} + +/* + * se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed. + */ +static void se_fi_update(struct sched_entity *se, unsigned int fi_seq, bool forceidle) +{ + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + if (forceidle) { + if (cfs_rq->forceidle_seq == fi_seq) + break; + cfs_rq->forceidle_seq = fi_seq; + } + + cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime; + } +} + +void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi) +{ + struct sched_entity *se = &p->se; + + if (p->sched_class != &fair_sched_class) + return; + + se_fi_update(se, rq->core->core_forceidle_seq, in_fi); +} + +bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool in_fi) +{ + struct rq *rq = task_rq(a); + struct sched_entity *sea = &a->se; + struct sched_entity *seb = &b->se; + struct cfs_rq *cfs_rqa; + struct cfs_rq *cfs_rqb; + s64 delta; + + SCHED_WARN_ON(task_rq(b)->core != rq->core); + +#ifdef CONFIG_FAIR_GROUP_SCHED + /* + * Find an se in the hierarchy for tasks a and b, such that the se's + * are immediate siblings. + */ + while (sea->cfs_rq->tg != seb->cfs_rq->tg) { + int sea_depth = sea->depth; + int seb_depth = seb->depth; + + if (sea_depth >= seb_depth) + sea = parent_entity(sea); + if (sea_depth <= seb_depth) + seb = parent_entity(seb); + } + + se_fi_update(sea, rq->core->core_forceidle_seq, in_fi); + se_fi_update(seb, rq->core->core_forceidle_seq, in_fi); + + cfs_rqa = sea->cfs_rq; + cfs_rqb = seb->cfs_rq; +#else + cfs_rqa = &task_rq(a)->cfs; + cfs_rqb = &task_rq(b)->cfs; +#endif + + /* + * Find delta after normalizing se's vruntime with its cfs_rq's + * min_vruntime_fi, which would have been updated in prior calls + * to se_fi_update(). + */ + delta = (s64)(sea->vruntime - seb->vruntime) + + (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi); + + return delta > 0; +} +#else +static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {} +#endif + /* * scheduler tick hitting a task of our scheduling class. * @@ -10818,6 +11003,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) update_misfit_status(curr, rq); update_overutilized_status(task_rq(curr)); + + task_tick_core(rq, curr); } /* @@ -10903,7 +11090,7 @@ static inline bool vruntime_normalized(struct task_struct *p) * waiting for actually being woken up by sched_ttwu_pending(). */ if (!se->sum_exec_runtime || - (p->state == TASK_WAKING && p->sched_remote_wakeup)) + (READ_ONCE(p->__state) == TASK_WAKING && p->sched_remote_wakeup)) return true; return false; @@ -11189,9 +11376,9 @@ void unregister_fair_sched_group(struct task_group *tg) rq = cpu_rq(cpu); - raw_spin_lock_irqsave(&rq->lock, flags); + raw_spin_rq_lock_irqsave(rq, flags); list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); - raw_spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_rq_unlock_irqrestore(rq, flags); } } @@ -11313,6 +11500,7 @@ DEFINE_SCHED_CLASS(fair) = { #ifdef CONFIG_SMP .balance = balance_fair, + .pick_task = pick_task_fair, .select_task_rq = select_task_rq_fair, .migrate_task_rq = migrate_task_rq_fair, |