diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2021-04-28 13:33:57 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2021-04-28 13:33:57 -0700 |
commit | 16b3d0cf5bad844daaf436ad2e9061de0fe36e5c (patch) | |
tree | d553a51e6d95fb166df7fa62264e9a27e4c438a4 /kernel/sched/fair.c | |
parent | 42dec9a936e7696bea1f27d3c5a0068cd9aa95fd (diff) | |
parent | 2ea46c6fc9452ac100ad907b051d797225847e33 (diff) |
Merge tag 'sched-core-2021-04-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
- Clean up SCHED_DEBUG: move the decades old mess of sysctl, procfs and
debugfs interfaces to a unified debugfs interface.
- Signals: Allow caching one sigqueue object per task, to improve
performance & latencies.
- Improve newidle_balance() irq-off latencies on systems with a large
number of CPU cgroups.
- Improve energy-aware scheduling
- Improve the PELT metrics for certain workloads
- Reintroduce select_idle_smt() to improve load-balancing locality -
but without the previous regressions
- Add 'scheduler latency debugging': warn after long periods of pending
need_resched. This is an opt-in feature that requires the enabling of
the LATENCY_WARN scheduler feature, or the use of the
resched_latency_warn_ms=xx boot parameter.
- CPU hotplug fixes for HP-rollback, and for the 'fail' interface. Fix
remaining balance_push() vs. hotplug holes/races
- PSI fixes, plus allow /proc/pressure/ files to be written by
CAP_SYS_RESOURCE tasks as well
- Fix/improve various load-balancing corner cases vs. capacity margins
- Fix sched topology on systems with NUMA diameter of 3 or above
- Fix PF_KTHREAD vs to_kthread() race
- Minor rseq optimizations
- Misc cleanups, optimizations, fixes and smaller updates
* tag 'sched-core-2021-04-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (61 commits)
cpumask/hotplug: Fix cpu_dying() state tracking
kthread: Fix PF_KTHREAD vs to_kthread() race
sched/debug: Fix cgroup_path[] serialization
sched,psi: Handle potential task count underflow bugs more gracefully
sched: Warn on long periods of pending need_resched
sched/fair: Move update_nohz_stats() to the CONFIG_NO_HZ_COMMON block to simplify the code & fix an unused function warning
sched/debug: Rename the sched_debug parameter to sched_verbose
sched,fair: Alternative sched_slice()
sched: Move /proc/sched_debug to debugfs
sched,debug: Convert sysctl sched_domains to debugfs
debugfs: Implement debugfs_create_str()
sched,preempt: Move preempt_dynamic to debug.c
sched: Move SCHED_DEBUG sysctl to debugfs
sched: Don't make LATENCYTOP select SCHED_DEBUG
sched: Remove sched_schedstats sysctl out from under SCHED_DEBUG
sched/numa: Allow runtime enabling/disabling of NUMA balance without SCHED_DEBUG
sched: Use cpu_dying() to fix balance_push vs hotplug-rollback
cpumask: Introduce DYING mask
cpumask: Make cpu_{online,possible,present,active}() inline
rseq: Optimise rseq_get_rseq_cs() and clear_rseq_cs()
...
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r-- | kernel/sched/fair.c | 380 |
1 files changed, 207 insertions, 173 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 794c2cb945f8..1d75af1ecfb4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -49,7 +49,7 @@ static unsigned int normalized_sysctl_sched_latency = 6000000ULL; * * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) */ -enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; +unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; /* * Minimal preemption granularity for CPU-bound tasks: @@ -113,6 +113,13 @@ int __weak arch_asym_cpu_priority(int cpu) */ #define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024) +/* + * The margin used when comparing CPU capacities. + * is 'cap1' noticeably greater than 'cap2' + * + * (default: ~5%) + */ +#define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078) #endif #ifdef CONFIG_CFS_BANDWIDTH @@ -229,22 +236,25 @@ static void __update_inv_weight(struct load_weight *lw) static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw) { u64 fact = scale_load_down(weight); + u32 fact_hi = (u32)(fact >> 32); int shift = WMULT_SHIFT; + int fs; __update_inv_weight(lw); - if (unlikely(fact >> 32)) { - while (fact >> 32) { - fact >>= 1; - shift--; - } + if (unlikely(fact_hi)) { + fs = fls(fact_hi); + shift -= fs; + fact >>= fs; } fact = mul_u32_u32(fact, lw->inv_weight); - while (fact >> 32) { - fact >>= 1; - shift--; + fact_hi = (u32)(fact >> 32); + if (fact_hi) { + fs = fls(fact_hi); + shift -= fs; + fact >>= fs; } return mul_u64_u32_shr(delta_exec, fact, shift); @@ -624,15 +634,10 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) * Scheduling class statistics methods: */ -int sched_proc_update_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +int sched_update_scaling(void) { - int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); unsigned int factor = get_update_sysctl_factor(); - if (ret || !write) - return ret; - sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency, sysctl_sched_min_granularity); @@ -682,7 +687,13 @@ static u64 __sched_period(unsigned long nr_running) */ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { - u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq); + unsigned int nr_running = cfs_rq->nr_running; + u64 slice; + + if (sched_feat(ALT_PERIOD)) + nr_running = rq_of(cfs_rq)->cfs.h_nr_running; + + slice = __sched_period(nr_running + !se->on_rq); for_each_sched_entity(se) { struct load_weight *load; @@ -699,6 +710,10 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) } slice = __calc_delta(slice, se->load.weight, load); } + + if (sched_feat(BASE_SLICE)) + slice = max(slice, (u64)sysctl_sched_min_granularity); + return slice; } @@ -1122,7 +1137,7 @@ static unsigned int task_nr_scan_windows(struct task_struct *p) return rss / nr_scan_pages; } -/* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */ +/* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */ #define MAX_SCAN_WINDOW 2560 static unsigned int task_scan_min(struct task_struct *p) @@ -2574,7 +2589,7 @@ no_join: } /* - * Get rid of NUMA staticstics associated with a task (either current or dead). + * Get rid of NUMA statistics associated with a task (either current or dead). * If @final is set, the task is dead and has reached refcount zero, so we can * safely free all relevant data structures. Otherwise, there might be * concurrent reads from places like load balancing and procfs, and we should @@ -3941,13 +3956,15 @@ static inline void util_est_dequeue(struct cfs_rq *cfs_rq, trace_sched_util_est_cfs_tp(cfs_rq); } +#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100) + /* * Check if a (signed) value is within a specified (unsigned) margin, * based on the observation that: * * abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1) * - * NOTE: this only works when value + maring < INT_MAX. + * NOTE: this only works when value + margin < INT_MAX. */ static inline bool within_margin(int value, int margin) { @@ -3958,7 +3975,7 @@ static inline void util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) { - long last_ewma_diff; + long last_ewma_diff, last_enqueued_diff; struct util_est ue; if (!sched_feat(UTIL_EST)) @@ -3979,6 +3996,8 @@ static inline void util_est_update(struct cfs_rq *cfs_rq, if (ue.enqueued & UTIL_AVG_UNCHANGED) return; + last_enqueued_diff = ue.enqueued; + /* * Reset EWMA on utilization increases, the moving average is used only * to smooth utilization decreases. @@ -3992,12 +4011,17 @@ static inline void util_est_update(struct cfs_rq *cfs_rq, } /* - * Skip update of task's estimated utilization when its EWMA is + * Skip update of task's estimated utilization when its members are * already ~1% close to its last activation value. */ last_ewma_diff = ue.enqueued - ue.ewma; - if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100))) + last_enqueued_diff -= ue.enqueued; + if (within_margin(last_ewma_diff, UTIL_EST_MARGIN)) { + if (!within_margin(last_enqueued_diff, UTIL_EST_MARGIN)) + goto done; + return; + } /* * To avoid overestimation of actual task utilization, skip updates if @@ -4244,7 +4268,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) /* * When bandwidth control is enabled, cfs might have been removed * because of a parent been throttled but cfs->nr_running > 1. Try to - * add it unconditionnally. + * add it unconditionally. */ if (cfs_rq->nr_running == 1 || cfs_bandwidth_used()) list_add_leaf_cfs_rq(cfs_rq); @@ -5299,7 +5323,7 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) * bits doesn't do much. */ -/* cpu online calback */ +/* cpu online callback */ static void __maybe_unused update_runtime_enabled(struct rq *rq) { struct task_group *tg; @@ -6098,6 +6122,24 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu return -1; } +/* + * Scan the local SMT mask for idle CPUs. + */ +static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) +{ + int cpu; + + for_each_cpu(cpu, cpu_smt_mask(target)) { + if (!cpumask_test_cpu(cpu, p->cpus_ptr) || + !cpumask_test_cpu(cpu, sched_domain_span(sd))) + continue; + if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) + return cpu; + } + + return -1; +} + #else /* CONFIG_SCHED_SMT */ static inline void set_idle_cores(int cpu, int val) @@ -6114,6 +6156,11 @@ static inline int select_idle_core(struct task_struct *p, int core, struct cpuma return __select_idle_cpu(core); } +static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) +{ + return -1; +} + #endif /* CONFIG_SCHED_SMT */ /* @@ -6121,11 +6168,10 @@ static inline int select_idle_core(struct task_struct *p, int core, struct cpuma * comparing the average scan cost (tracked in sd->avg_scan_cost) against the * average idle time for this rq (as found in rq->avg_idle). */ -static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target) +static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target) { struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); int i, cpu, idle_cpu = -1, nr = INT_MAX; - bool smt = test_idle_cores(target, false); int this = smp_processor_id(); struct sched_domain *this_sd; u64 time; @@ -6136,7 +6182,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); - if (sched_feat(SIS_PROP) && !smt) { + if (sched_feat(SIS_PROP) && !has_idle_core) { u64 avg_cost, avg_idle, span_avg; /* @@ -6156,7 +6202,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t } for_each_cpu_wrap(cpu, cpus, target) { - if (smt) { + if (has_idle_core) { i = select_idle_core(p, cpu, cpus, &idle_cpu); if ((unsigned int)i < nr_cpumask_bits) return i; @@ -6170,10 +6216,10 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t } } - if (smt) + if (has_idle_core) set_idle_cores(this, false); - if (sched_feat(SIS_PROP) && !smt) { + if (sched_feat(SIS_PROP) && !has_idle_core) { time = cpu_clock(this) - time; update_avg(&this_sd->avg_scan_cost, time); } @@ -6228,6 +6274,7 @@ static inline bool asym_fits_capacity(int task_util, int cpu) */ static int select_idle_sibling(struct task_struct *p, int prev, int target) { + bool has_idle_core = false; struct sched_domain *sd; unsigned long task_util; int i, recent_used_cpu; @@ -6307,7 +6354,17 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if (!sd) return target; - i = select_idle_cpu(p, sd, target); + if (sched_smt_active()) { + has_idle_core = test_idle_cores(target, false); + + if (!has_idle_core && cpus_share_cache(prev, target)) { + i = select_idle_smt(p, sd, prev); + if ((unsigned int)i < nr_cpumask_bits) + return i; + } + } + + i = select_idle_cpu(p, sd, has_idle_core, target); if ((unsigned)i < nr_cpumask_bits) return i; @@ -6471,7 +6528,7 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) * util_avg should already be correct. */ if (task_cpu(p) == cpu && dst_cpu != cpu) - sub_positive(&util, task_util(p)); + lsub_positive(&util, task_util(p)); else if (task_cpu(p) != cpu && dst_cpu == cpu) util += task_util(p); @@ -6518,8 +6575,24 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) * its pd list and will not be accounted by compute_energy(). */ for_each_cpu_and(cpu, pd_mask, cpu_online_mask) { - unsigned long cpu_util, util_cfs = cpu_util_next(cpu, p, dst_cpu); - struct task_struct *tsk = cpu == dst_cpu ? p : NULL; + unsigned long util_freq = cpu_util_next(cpu, p, dst_cpu); + unsigned long cpu_util, util_running = util_freq; + struct task_struct *tsk = NULL; + + /* + * When @p is placed on @cpu: + * + * util_running = max(cpu_util, cpu_util_est) + + * max(task_util, _task_util_est) + * + * while cpu_util_next is: max(cpu_util + task_util, + * cpu_util_est + _task_util_est) + */ + if (cpu == dst_cpu) { + tsk = p; + util_running = + cpu_util_next(cpu, p, -1) + task_util_est(p); + } /* * Busy time computation: utilization clamping is not @@ -6527,7 +6600,7 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) * is already enough to scale the EM reported power * consumption at the (eventually clamped) cpu_capacity. */ - sum_util += effective_cpu_util(cpu, util_cfs, cpu_cap, + sum_util += effective_cpu_util(cpu, util_running, cpu_cap, ENERGY_UTIL, NULL); /* @@ -6537,7 +6610,7 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) * NOTE: in case RT tasks are running, by default the * FREQUENCY_UTIL's utilization can be max OPP. */ - cpu_util = effective_cpu_util(cpu, util_cfs, cpu_cap, + cpu_util = effective_cpu_util(cpu, util_freq, cpu_cap, FREQUENCY_UTIL, tsk); max_util = max(max_util, cpu_util); } @@ -6935,7 +7008,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ /* * This is possible from callers such as attach_tasks(), in which we - * unconditionally check_prempt_curr() after an enqueue (which may have + * unconditionally check_preempt_curr() after an enqueue (which may have * lead to a throttle). This both saves work and prevents false * next-buddy nomination below. */ @@ -7392,8 +7465,7 @@ enum migration_type { #define LBF_NEED_BREAK 0x02 #define LBF_DST_PINNED 0x04 #define LBF_SOME_PINNED 0x08 -#define LBF_NOHZ_STATS 0x10 -#define LBF_NOHZ_AGAIN 0x20 +#define LBF_ACTIVE_LB 0x10 struct lb_env { struct sched_domain *sd; @@ -7539,6 +7611,10 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) return 0; + /* Disregard pcpu kthreads; they are where they need to be. */ + if (kthread_is_per_cpu(p)) + return 0; + if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) { int cpu; @@ -7551,10 +7627,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * our sched_group. We may want to revisit it if we couldn't * meet load balance goals by pulling other tasks on src_cpu. * - * Avoid computing new_dst_cpu for NEWLY_IDLE or if we have - * already computed one in current iteration. + * Avoid computing new_dst_cpu + * - for NEWLY_IDLE + * - if we have already computed one in current iteration + * - if it's an active balance */ - if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED)) + if (env->idle == CPU_NEWLY_IDLE || + env->flags & (LBF_DST_PINNED | LBF_ACTIVE_LB)) return 0; /* Prevent to re-select dst_cpu via env's CPUs: */ @@ -7569,7 +7648,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) return 0; } - /* Record that we found atleast one task that could run on dst_cpu */ + /* Record that we found at least one task that could run on dst_cpu */ env->flags &= ~LBF_ALL_PINNED; if (task_running(env->src_rq, p)) { @@ -7579,10 +7658,14 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* * Aggressive migration if: - * 1) destination numa is preferred - * 2) task is cache cold, or - * 3) too many balance attempts have failed. + * 1) active balance + * 2) destination numa is preferred + * 3) task is cache cold, or + * 4) too many balance attempts have failed. */ + if (env->flags & LBF_ACTIVE_LB) + return 1; + tsk_cache_hot = migrate_degrades_locality(p, env); if (tsk_cache_hot == -1) tsk_cache_hot = task_hot(p, env); @@ -7659,6 +7742,15 @@ static int detach_tasks(struct lb_env *env) lockdep_assert_held(&env->src_rq->lock); + /* + * Source run queue has been emptied by another CPU, clear + * LBF_ALL_PINNED flag as we will not test any task. + */ + if (env->src_rq->nr_running <= 1) { + env->flags &= ~LBF_ALL_PINNED; + return 0; + } + if (env->imbalance <= 0) return 0; @@ -7708,8 +7800,7 @@ static int detach_tasks(struct lb_env *env) * scheduler fails to find a good waiting task to * migrate. */ - - if ((load >> env->sd->nr_balance_failed) > env->imbalance) + if (shr_bound(load, env->sd->nr_balance_failed) > env->imbalance) goto next; env->imbalance -= load; @@ -7854,16 +7945,20 @@ static inline bool others_have_blocked(struct rq *rq) return false; } -static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) +static inline void update_blocked_load_tick(struct rq *rq) { - rq->last_blocked_load_update_tick = jiffies; + WRITE_ONCE(rq->last_blocked_load_update_tick, jiffies); +} +static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) +{ if (!has_blocked) rq->has_blocked_load = 0; } #else static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; } static inline bool others_have_blocked(struct rq *rq) { return false; } +static inline void update_blocked_load_tick(struct rq *rq) {} static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {} #endif @@ -8024,6 +8119,7 @@ static void update_blocked_averages(int cpu) struct rq_flags rf; rq_lock_irqsave(rq, &rf); + update_blocked_load_tick(rq); update_rq_clock(rq); decayed |= __update_blocked_others(rq, &done); @@ -8311,26 +8407,6 @@ group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs) return false; } -/* - * group_smaller_min_cpu_capacity: Returns true if sched_group sg has smaller - * per-CPU capacity than sched_group ref. - */ -static inline bool -group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref) -{ - return fits_capacity(sg->sgc->min_capacity, ref->sgc->min_capacity); -} - -/* - * group_smaller_max_cpu_capacity: Returns true if sched_group sg has smaller - * per-CPU capacity_orig than sched_group ref. - */ -static inline bool -group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref) -{ - return fits_capacity(sg->sgc->max_capacity, ref->sgc->max_capacity); -} - static inline enum group_type group_classify(unsigned int imbalance_pct, struct sched_group *group, @@ -8354,28 +8430,6 @@ group_type group_classify(unsigned int imbalance_pct, return group_has_spare; } -static bool update_nohz_stats(struct rq *rq, bool force) -{ -#ifdef CONFIG_NO_HZ_COMMON - unsigned int cpu = rq->cpu; - - if (!rq->has_blocked_load) - return false; - - if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask)) - return false; - - if (!force && !time_after(jiffies, rq->last_blocked_load_update_tick)) - return true; - - update_blocked_averages(cpu); - - return rq->has_blocked_load; -#else - return false; -#endif -} - /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @env: The load balancing environment. @@ -8397,9 +8451,6 @@ static inline void update_sg_lb_stats(struct lb_env *env, for_each_cpu_and(i, sched_group_span(group), env->cpus) { struct rq *rq = cpu_rq(i); - if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false)) - env->flags |= LBF_NOHZ_AGAIN; - sgs->group_load += cpu_load(rq); sgs->group_util += cpu_util(i); sgs->group_runnable += cpu_runnable(rq); @@ -8489,7 +8540,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, * internally or be covered by avg_load imbalance (eventually). */ if (sgs->group_type == group_misfit_task && - (!group_smaller_max_cpu_capacity(sg, sds->local) || + (!capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) || sds->local_stat.group_type != group_has_spare)) return false; @@ -8573,7 +8624,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, */ if ((env->sd->flags & SD_ASYM_CPUCAPACITY) && (sgs->group_type <= group_fully_busy) && - (group_smaller_min_cpu_capacity(sds->local, sg))) + (capacity_greater(sg->sgc->min_capacity, capacity_of(env->dst_cpu)))) return false; return true; @@ -8940,11 +8991,6 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd struct sg_lb_stats tmp_sgs; int sg_status = 0; -#ifdef CONFIG_NO_HZ_COMMON - if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked)) - env->flags |= LBF_NOHZ_STATS; -#endif - do { struct sg_lb_stats *sgs = &tmp_sgs; int local_group; @@ -8981,14 +9027,6 @@ next_group: /* Tag domain that child domain prefers tasks go to siblings first */ sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING; -#ifdef CONFIG_NO_HZ_COMMON - if ((env->flags & LBF_NOHZ_AGAIN) && - cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) { - - WRITE_ONCE(nohz.next_blocked, - jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD)); - } -#endif if (env->sd->flags & SD_NUMA) env->fbq_type = fbq_classify_group(&sds->busiest_stat); @@ -9386,7 +9424,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, * average load. */ if (env->sd->flags & SD_ASYM_CPUCAPACITY && - capacity_of(env->dst_cpu) < capacity && + !capacity_greater(capacity_of(env->dst_cpu), capacity) && nr_running == 1) continue; @@ -9676,7 +9714,7 @@ more_balance: * load to given_cpu. In rare situations, this may cause * conflicts (balance_cpu and given_cpu/ilb_cpu deciding * _independently_ and at _same_ time to move some load to - * given_cpu) causing exceess load to be moved to given_cpu. + * given_cpu) causing excess load to be moved to given_cpu. * This however should not happen so much in practice and * moreover subsequent load balance cycles should correct the * excess load moved. @@ -9776,9 +9814,6 @@ more_balance: active_load_balance_cpu_stop, busiest, &busiest->active_balance_work); } - - /* We've kicked active balancing, force task migration. */ - sd->nr_balance_failed = sd->cache_nice_tries+1; } } else { sd->nr_balance_failed = 0; @@ -9820,7 +9855,7 @@ out_one_pinned: /* * newidle_balance() disregards balance intervals, so we could * repeatedly reach this code, which would lead to balance_interval - * skyrocketting in a short amount of time. Skip the balance_interval + * skyrocketing in a short amount of time. Skip the balance_interval * increase logic to avoid that. */ if (env.idle == CPU_NEWLY_IDLE) @@ -9928,13 +9963,7 @@ static int active_load_balance_cpu_stop(void *data) .src_cpu = busiest_rq->cpu, .src_rq = busiest_rq, .idle = CPU_IDLE, - /* - * can_migrate_task() doesn't need to compute new_dst_cpu - * for active balancing. Since we have CPU_IDLE, but no - * @dst_grpmask we need to make that test go away with lying - * about DST_PINNED. - */ - .flags = LBF_DST_PINNED, + .flags = LBF_ACTIVE_LB, }; schedstat_inc(sd->alb_count); @@ -10061,22 +10090,9 @@ out: * When the cpu is attached to null domain for ex, it will not be * updated. */ - if (likely(update_next_balance)) { + if (likely(update_next_balance)) rq->next_balance = next_balance; -#ifdef CONFIG_NO_HZ_COMMON - /* - * If this CPU has been elected to perform the nohz idle - * balance. Other idle CPUs have already rebalanced with - * nohz_idle_balance() and nohz.next_balance has been - * updated accordingly. This CPU is now running the idle load - * balance for itself and we need to update the - * nohz.next_balance accordingly. - */ - if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance)) - nohz.next_balance = rq->next_balance; -#endif - } } static inline int on_null_domain(struct rq *rq) @@ -10368,14 +10384,30 @@ out: WRITE_ONCE(nohz.has_blocked, 1); } +static bool update_nohz_stats(struct rq *rq) +{ + unsigned int cpu = rq->cpu; + + if (!rq->has_blocked_load) + return false; + + if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask)) + return false; + + if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick))) + return true; + + update_blocked_averages(cpu); + + return rq->has_blocked_load; +} + /* * Internal function that runs load balance for all idle cpus. The load balance * can be a simple update of blocked load or a complete load balance with * tasks movement depending of flags. - * The function returns false if the loop has stopped before running - * through all idle CPUs. */ -static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, +static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags, enum cpu_idle_type idle) { /* Earliest time when we have to do rebalance again */ @@ -10385,7 +10417,6 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, int update_next_balance = 0; int this_cpu = this_rq->cpu; int balance_cpu; - int ret = false; struct rq *rq; SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK); @@ -10406,8 +10437,12 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, */ smp_mb(); - for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { - if (balance_cpu == this_cpu || !idle_cpu(balance_cpu)) + /* + * Start with the next CPU after this_cpu so we will end with this_cpu and let a + * chance for other idle cpu to pull load. + */ + for_each_cpu_wrap(balance_cpu, nohz.idle_cpus_mask, this_cpu+1) { + if (!idle_cpu(balance_cpu)) continue; /* @@ -10422,7 +10457,7 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, rq = cpu_rq(balance_cpu); - has_blocked_load |= update_nohz_stats(rq, true); + has_blocked_load |= update_nohz_stats(rq); /* * If time for next balance is due, @@ -10453,27 +10488,13 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, if (likely(update_next_balance)) nohz.next_balance = next_balance; - /* Newly idle CPU doesn't need an update */ - if (idle != CPU_NEWLY_IDLE) { - update_blocked_averages(this_cpu); - has_blocked_load |= this_rq->has_blocked_load; - } - - if (flags & NOHZ_BALANCE_KICK) - rebalance_domains(this_rq, CPU_IDLE); - WRITE_ONCE(nohz.next_blocked, now + msecs_to_jiffies(LOAD_AVG_PERIOD)); - /* The full idle balance loop has been done */ - ret = true; - abort: /* There is still blocked load, enable periodic update */ if (has_blocked_load) WRITE_ONCE(nohz.has_blocked, 1); - - return ret; } /* @@ -10497,6 +10518,24 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) return true; } +/* + * Check if we need to run the ILB for updating blocked load before entering + * idle state. + */ +void nohz_run_idle_balance(int cpu) +{ + unsigned int flags; + + flags = atomic_fetch_andnot(NOHZ_NEWILB_KICK, nohz_flags(cpu)); + + /* + * Update the blocked load only if no SCHED_SOFTIRQ is about to happen + * (ie NOHZ_STATS_KICK set) and will do the same. + */ + if ((flags == NOHZ_NEWILB_KICK) && !need_resched()) + _nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK, CPU_IDLE); +} + static void nohz_newidle_balance(struct rq *this_rq) { int this_cpu = this_rq->cpu; @@ -10517,16 +10556,11 @@ static void nohz_newidle_balance(struct rq *this_rq) time_before(jiffies, READ_ONCE(nohz.next_blocked))) return; - raw_spin_unlock(&this_rq->lock); /* - * This CPU is going to be idle and blocked load of idle CPUs - * need to be updated. Run the ilb locally as it is a good - * candidate for ilb instead of waking up another idle CPU. - * Kick an normal ilb if we failed to do the update. + * Set the need to trigger ILB in order to update blocked load + * before entering idle state. */ - if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE)) - kick_ilb(NOHZ_STATS_KICK); - raw_spin_lock(&this_rq->lock); + atomic_or(NOHZ_NEWILB_KICK, nohz_flags(this_cpu)); } #else /* !CONFIG_NO_HZ_COMMON */ @@ -10587,8 +10621,6 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) update_next_balance(sd, &next_balance); rcu_read_unlock(); - nohz_newidle_balance(this_rq); - goto out; } @@ -10635,7 +10667,6 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) if (curr_cost > this_rq->max_idle_balance_cost) this_rq->max_idle_balance_cost = curr_cost; -out: /* * While browsing the domains, we released the rq lock, a task could * have been enqueued in the meantime. Since we're not going idle, @@ -10644,16 +10675,19 @@ out: if (this_rq->cfs.h_nr_running && !pulled_task) pulled_task = 1; - /* Move the next balance forward */ - if (time_after(this_rq->next_balance, next_balance)) - this_rq->next_balance = next_balance; - /* Is there a task of a high priority class? */ if (this_rq->nr_running != this_rq->cfs.h_nr_running) pulled_task = -1; +out: + /* Move the next balance forward */ + if (time_after(this_rq->next_balance, next_balance)) + this_rq->next_balance = next_balance; + if (pulled_task) this_rq->idle_stamp = 0; + else + nohz_newidle_balance(this_rq); rq_repin_lock(this_rq, rf); |