author    Linus Torvalds <torvalds@linux-foundation.org>  2021-04-28 13:33:57 -0700
committer Linus Torvalds <torvalds@linux-foundation.org>  2021-04-28 13:33:57 -0700
commit    16b3d0cf5bad844daaf436ad2e9061de0fe36e5c
tree      d553a51e6d95fb166df7fa62264e9a27e4c438a4  /kernel/sched/fair.c
parent    42dec9a936e7696bea1f27d3c5a0068cd9aa95fd
parent    2ea46c6fc9452ac100ad907b051d797225847e33
Merge tag 'sched-core-2021-04-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:

 - Clean up SCHED_DEBUG: move the decades old mess of sysctl, procfs and
   debugfs interfaces to a unified debugfs interface.

 - Signals: Allow caching one sigqueue object per task, to improve
   performance & latencies.

 - Improve newidle_balance() irq-off latencies on systems with a large
   number of CPU cgroups.

 - Improve energy-aware scheduling

 - Improve the PELT metrics for certain workloads

 - Reintroduce select_idle_smt() to improve load-balancing locality -
   but without the previous regressions

 - Add 'scheduler latency debugging': warn after long periods of pending
   need_resched. This is an opt-in feature that requires the enabling of
   the LATENCY_WARN scheduler feature, or the use of the
   resched_latency_warn_ms=xx boot parameter.

 - CPU hotplug fixes for HP-rollback, and for the 'fail' interface. Fix
   remaining balance_push() vs. hotplug holes/races

 - PSI fixes, plus allow /proc/pressure/ files to be written by
   CAP_SYS_RESOURCE tasks as well

 - Fix/improve various load-balancing corner cases vs. capacity margins

 - Fix sched topology on systems with NUMA diameter of 3 or above

 - Fix PF_KTHREAD vs to_kthread() race

 - Minor rseq optimizations

 - Misc cleanups, optimizations, fixes and smaller updates

* tag 'sched-core-2021-04-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (61 commits)
  cpumask/hotplug: Fix cpu_dying() state tracking
  kthread: Fix PF_KTHREAD vs to_kthread() race
  sched/debug: Fix cgroup_path[] serialization
  sched,psi: Handle potential task count underflow bugs more gracefully
  sched: Warn on long periods of pending need_resched
  sched/fair: Move update_nohz_stats() to the CONFIG_NO_HZ_COMMON block to simplify the code & fix an unused function warning
  sched/debug: Rename the sched_debug parameter to sched_verbose
  sched,fair: Alternative sched_slice()
  sched: Move /proc/sched_debug to debugfs
  sched,debug: Convert sysctl sched_domains to debugfs
  debugfs: Implement debugfs_create_str()
  sched,preempt: Move preempt_dynamic to debug.c
  sched: Move SCHED_DEBUG sysctl to debugfs
  sched: Don't make LATENCYTOP select SCHED_DEBUG
  sched: Remove sched_schedstats sysctl out from under SCHED_DEBUG
  sched/numa: Allow runtime enabling/disabling of NUMA balance without SCHED_DEBUG
  sched: Use cpu_dying() to fix balance_push vs hotplug-rollback
  cpumask: Introduce DYING mask
  cpumask: Make cpu_{online,possible,present,active}() inline
  rseq: Optimise rseq_get_rseq_cs() and clear_rseq_cs()
  ...
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--  kernel/sched/fair.c  380
1 files changed, 207 insertions, 173 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 794c2cb945f8..1d75af1ecfb4 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -49,7 +49,7 @@ static unsigned int normalized_sysctl_sched_latency = 6000000ULL;
*
* (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
*/
-enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
+unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
/*
* Minimal preemption granularity for CPU-bound tasks:
@@ -113,6 +113,13 @@ int __weak arch_asym_cpu_priority(int cpu)
*/
#define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024)
+/*
+ * The margin used when comparing CPU capacities.
+ * is 'cap1' noticeably greater than 'cap2'
+ *
+ * (default: ~5%)
+ */
+#define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078)
#endif
#ifdef CONFIG_CFS_BANDWIDTH
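[Editor's note] The two capacity macros above encode percentage margins in fixed-point arithmetic: fits_capacity() demands roughly 20% headroom (cap must stay below max * 1024/1280, i.e. ~80% of max), while the newly added capacity_greater() only treats cap1 as noticeably larger when it exceeds cap2 by about 5% (1078/1024 ≈ 1.053). A minimal standalone sketch of the arithmetic, with made-up capacity values on a 1024-scale system (plain userspace C, not kernel code):

```c
#include <stdio.h>

/* Same fixed-point comparisons as the kernel macros above. */
#define fits_capacity(cap, max)      ((cap) * 1280 < (max) * 1024)   /* ~20% headroom */
#define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078) /* ~5% margin    */

int main(void)
{
	/* Hypothetical capacity values, SCHED_CAPACITY_SCALE = 1024. */
	printf("fits_capacity(436, 1024)    = %d\n", fits_capacity(436UL, 1024UL));    /* 1: 436 is below ~80% of 1024 */
	printf("capacity_greater(1024, 990) = %d\n", capacity_greater(1024UL, 990UL)); /* 0: difference is within ~5%  */
	printf("capacity_greater(1024, 900) = %d\n", capacity_greater(1024UL, 900UL)); /* 1: clearly larger            */
	return 0;
}
```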
@@ -229,22 +236,25 @@ static void __update_inv_weight(struct load_weight *lw)
static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
{
u64 fact = scale_load_down(weight);
+ u32 fact_hi = (u32)(fact >> 32);
int shift = WMULT_SHIFT;
+ int fs;
__update_inv_weight(lw);
- if (unlikely(fact >> 32)) {
- while (fact >> 32) {
- fact >>= 1;
- shift--;
- }
+ if (unlikely(fact_hi)) {
+ fs = fls(fact_hi);
+ shift -= fs;
+ fact >>= fs;
}
fact = mul_u32_u32(fact, lw->inv_weight);
- while (fact >> 32) {
- fact >>= 1;
- shift--;
+ fact_hi = (u32)(fact >> 32);
+ if (fact_hi) {
+ fs = fls(fact_hi);
+ shift -= fs;
+ fact >>= fs;
}
return mul_u64_u32_shr(delta_exec, fact, shift);
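[Editor's note] The rewritten __calc_delta() above replaces the old shift-one-bit-at-a-time loops with a single fls() (find last set) step per normalization: whenever the upper 32 bits of fact are non-zero, the excess bits are shifted out at once and the final right shift is reduced by the same amount, so the weighted delta is still evaluated as a 32x32 multiply followed by a shift. A standalone sketch of that normalization idea, using userspace stand-ins for fls() and mul_u64_u32_shr() (illustrative only, not the kernel helpers; GCC/Clang builtins assumed):

```c
#include <stdint.h>
#include <stdio.h>

/* Userspace stand-ins for the kernel helpers used by __calc_delta(). */
static int fls32(uint32_t x)        /* 1-based index of the highest set bit, 0 if x == 0 */
{
	return x ? 32 - __builtin_clz(x) : 0;
}

static uint64_t mul_u64_u32_shr(uint64_t a, uint32_t b, unsigned int shift)
{
	return (uint64_t)(((unsigned __int128)a * b) >> shift);
}

/* delta ~= delta_exec * weight * inv_weight >> shift, keeping 'fact' within 32 bits. */
static uint64_t calc_delta_sketch(uint64_t delta_exec, uint64_t weight, uint32_t inv_weight)
{
	uint64_t fact = weight;
	int shift = 32;                 /* WMULT_SHIFT */
	uint32_t fact_hi = (uint32_t)(fact >> 32);
	int fs;

	if (fact_hi) {                  /* drop all excess high bits in one step */
		fs = fls32(fact_hi);
		shift -= fs;
		fact >>= fs;
	}

	fact = (uint64_t)(uint32_t)fact * inv_weight;
	fact_hi = (uint32_t)(fact >> 32);
	if (fact_hi) {
		fs = fls32(fact_hi);
		shift -= fs;
		fact >>= fs;
	}

	return mul_u64_u32_shr(delta_exec, (uint32_t)fact, shift);
}

int main(void)
{
	/* 6ms of runtime, entity weight 2048 against a queue weight of 4096
	 * (inv_weight = 2^32 / 4096 = 1 << 20) -> half of 6ms. */
	printf("%llu\n", (unsigned long long)calc_delta_sketch(6000000ULL, 2048, 1U << 20));
	return 0;
}
```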
@@ -624,15 +634,10 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
* Scheduling class statistics methods:
*/
-int sched_proc_update_handler(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
+int sched_update_scaling(void)
{
- int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
unsigned int factor = get_update_sysctl_factor();
- if (ret || !write)
- return ret;
-
sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
sysctl_sched_min_granularity);
@@ -682,7 +687,13 @@ static u64 __sched_period(unsigned long nr_running)
*/
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
+ unsigned int nr_running = cfs_rq->nr_running;
+ u64 slice;
+
+ if (sched_feat(ALT_PERIOD))
+ nr_running = rq_of(cfs_rq)->cfs.h_nr_running;
+
+ slice = __sched_period(nr_running + !se->on_rq);
for_each_sched_entity(se) {
struct load_weight *load;
@@ -699,6 +710,10 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
slice = __calc_delta(slice, se->load.weight, load);
}
+
+ if (sched_feat(BASE_SLICE))
+ slice = max(slice, (u64)sysctl_sched_min_granularity);
+
return slice;
}
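[Editor's note] With the two new scheduler features used above, sched_slice() can base the period on the root cfs_rq's h_nr_running (ALT_PERIOD) and clamp the result to the minimum granularity (BASE_SLICE). A simplified standalone sketch of that arithmetic, assuming the default tunables (6ms latency, 0.75ms minimum granularity, sched_nr_latency = 8) and ignoring the cgroup hierarchy walk that the real function performs:

```c
#include <stdint.h>
#include <stdio.h>

#define SCHED_LATENCY_NS         6000000ULL  /* default sysctl_sched_latency */
#define SCHED_MIN_GRANULARITY_NS  750000ULL  /* default sysctl_sched_min_granularity */
#define SCHED_NR_LATENCY                8    /* latency / min_granularity */

/* The period stretches once more tasks are runnable than fit in the latency target. */
static uint64_t sched_period(unsigned int nr_running)
{
	if (nr_running > SCHED_NR_LATENCY)
		return nr_running * SCHED_MIN_GRANULARITY_NS;
	return SCHED_LATENCY_NS;
}

/* Weighted share of the period for one entity, with a BASE_SLICE-style clamp. */
static uint64_t sched_slice_sketch(unsigned int nr_running,
				   uint64_t se_weight, uint64_t total_weight)
{
	uint64_t slice = sched_period(nr_running) * se_weight / total_weight;

	if (slice < SCHED_MIN_GRANULARITY_NS)   /* BASE_SLICE: never hand out less */
		slice = SCHED_MIN_GRANULARITY_NS;
	return slice;
}

int main(void)
{
	/* 16 runnable tasks, all weight 1024: period = 12ms, slice = 0.75ms each. */
	printf("%llu ns\n", (unsigned long long)sched_slice_sketch(16, 1024, 16 * 1024));
	/* 4 runnable tasks, one niced down to weight 335: its share (~0.59ms) is clamped to 0.75ms. */
	printf("%llu ns\n", (unsigned long long)sched_slice_sketch(4, 335, 3 * 1024 + 335));
	return 0;
}
```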
@@ -1122,7 +1137,7 @@ static unsigned int task_nr_scan_windows(struct task_struct *p)
return rss / nr_scan_pages;
}
-/* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
+/* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
#define MAX_SCAN_WINDOW 2560
static unsigned int task_scan_min(struct task_struct *p)
@@ -2574,7 +2589,7 @@ no_join:
}
/*
- * Get rid of NUMA staticstics associated with a task (either current or dead).
+ * Get rid of NUMA statistics associated with a task (either current or dead).
* If @final is set, the task is dead and has reached refcount zero, so we can
* safely free all relevant data structures. Otherwise, there might be
* concurrent reads from places like load balancing and procfs, and we should
@@ -3941,13 +3956,15 @@ static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
trace_sched_util_est_cfs_tp(cfs_rq);
}
+#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100)
+
/*
* Check if a (signed) value is within a specified (unsigned) margin,
* based on the observation that:
*
* abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
*
- * NOTE: this only works when value + maring < INT_MAX.
+ * NOTE: this only works when value + margin < INT_MAX.
*/
static inline bool within_margin(int value, int margin)
{
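[Editor's note] within_margin() relies on the unsigned-wraparound identity quoted in the comment: for value + margin < INT_MAX, abs(value) < margin is equivalent to (unsigned)(value + margin - 1) < (2 * margin - 1), because out-of-range negatives wrap to huge unsigned numbers and fall outside the window. A standalone check of the identity against the obvious abs() form (illustrative userspace C; margin = 10 matches UTIL_EST_MARGIN = SCHED_CAPACITY_SCALE / 100):

```c
#include <assert.h>
#include <stdbool.h>
#include <stdlib.h>

/* Same single-comparison form as the kernel helper above. */
static bool within_margin(int value, int margin)
{
	return (unsigned int)(value + margin - 1) < (unsigned int)(2 * margin - 1);
}

int main(void)
{
	int margin = 10;    /* SCHED_CAPACITY_SCALE / 100 */

	/* Exhaustively compare against the straightforward abs() form for a small range. */
	for (int v = -1000; v <= 1000; v++)
		assert(within_margin(v, margin) == (abs(v) < margin));

	return 0;
}
```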
@@ -3958,7 +3975,7 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
struct task_struct *p,
bool task_sleep)
{
- long last_ewma_diff;
+ long last_ewma_diff, last_enqueued_diff;
struct util_est ue;
if (!sched_feat(UTIL_EST))
@@ -3979,6 +3996,8 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
if (ue.enqueued & UTIL_AVG_UNCHANGED)
return;
+ last_enqueued_diff = ue.enqueued;
+
/*
* Reset EWMA on utilization increases, the moving average is used only
* to smooth utilization decreases.
@@ -3992,12 +4011,17 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
}
/*
- * Skip update of task's estimated utilization when its EWMA is
+ * Skip update of task's estimated utilization when its members are
* already ~1% close to its last activation value.
*/
last_ewma_diff = ue.enqueued - ue.ewma;
- if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
+ last_enqueued_diff -= ue.enqueued;
+ if (within_margin(last_ewma_diff, UTIL_EST_MARGIN)) {
+ if (!within_margin(last_enqueued_diff, UTIL_EST_MARGIN))
+ goto done;
+
return;
+ }
/*
* To avoid overestimation of actual task utilization, skip updates if
@@ -4244,7 +4268,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
/*
* When bandwidth control is enabled, cfs might have been removed
* because of a parent been throttled but cfs->nr_running > 1. Try to
- * add it unconditionnally.
+ * add it unconditionally.
*/
if (cfs_rq->nr_running == 1 || cfs_bandwidth_used())
list_add_leaf_cfs_rq(cfs_rq);
@@ -5299,7 +5323,7 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
* bits doesn't do much.
*/
-/* cpu online calback */
+/* cpu online callback */
static void __maybe_unused update_runtime_enabled(struct rq *rq)
{
struct task_group *tg;
@@ -6098,6 +6122,24 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
return -1;
}
+/*
+ * Scan the local SMT mask for idle CPUs.
+ */
+static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
+{
+ int cpu;
+
+ for_each_cpu(cpu, cpu_smt_mask(target)) {
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
+ !cpumask_test_cpu(cpu, sched_domain_span(sd)))
+ continue;
+ if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
+ return cpu;
+ }
+
+ return -1;
+}
+
#else /* CONFIG_SCHED_SMT */
static inline void set_idle_cores(int cpu, int val)
@@ -6114,6 +6156,11 @@ static inline int select_idle_core(struct task_struct *p, int core, struct cpuma
return __select_idle_cpu(core);
}
+static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
+{
+ return -1;
+}
+
#endif /* CONFIG_SCHED_SMT */
/*
@@ -6121,11 +6168,10 @@ static inline int select_idle_core(struct task_struct *p, int core, struct cpuma
* comparing the average scan cost (tracked in sd->avg_scan_cost) against the
* average idle time for this rq (as found in rq->avg_idle).
*/
-static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
+static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target)
{
struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
int i, cpu, idle_cpu = -1, nr = INT_MAX;
- bool smt = test_idle_cores(target, false);
int this = smp_processor_id();
struct sched_domain *this_sd;
u64 time;
@@ -6136,7 +6182,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
- if (sched_feat(SIS_PROP) && !smt) {
+ if (sched_feat(SIS_PROP) && !has_idle_core) {
u64 avg_cost, avg_idle, span_avg;
/*
@@ -6156,7 +6202,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
}
for_each_cpu_wrap(cpu, cpus, target) {
- if (smt) {
+ if (has_idle_core) {
i = select_idle_core(p, cpu, cpus, &idle_cpu);
if ((unsigned int)i < nr_cpumask_bits)
return i;
@@ -6170,10 +6216,10 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
}
}
- if (smt)
+ if (has_idle_core)
set_idle_cores(this, false);
- if (sched_feat(SIS_PROP) && !smt) {
+ if (sched_feat(SIS_PROP) && !has_idle_core) {
time = cpu_clock(this) - time;
update_avg(&this_sd->avg_scan_cost, time);
}
@@ -6228,6 +6274,7 @@ static inline bool asym_fits_capacity(int task_util, int cpu)
*/
static int select_idle_sibling(struct task_struct *p, int prev, int target)
{
+ bool has_idle_core = false;
struct sched_domain *sd;
unsigned long task_util;
int i, recent_used_cpu;
@@ -6307,7 +6354,17 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
if (!sd)
return target;
- i = select_idle_cpu(p, sd, target);
+ if (sched_smt_active()) {
+ has_idle_core = test_idle_cores(target, false);
+
+ if (!has_idle_core && cpus_share_cache(prev, target)) {
+ i = select_idle_smt(p, sd, prev);
+ if ((unsigned int)i < nr_cpumask_bits)
+ return i;
+ }
+ }
+
+ i = select_idle_cpu(p, sd, has_idle_core, target);
if ((unsigned)i < nr_cpumask_bits)
return i;
@@ -6471,7 +6528,7 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
* util_avg should already be correct.
*/
if (task_cpu(p) == cpu && dst_cpu != cpu)
- sub_positive(&util, task_util(p));
+ lsub_positive(&util, task_util(p));
else if (task_cpu(p) != cpu && dst_cpu == cpu)
util += task_util(p);
@@ -6518,8 +6575,24 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
* its pd list and will not be accounted by compute_energy().
*/
for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
- unsigned long cpu_util, util_cfs = cpu_util_next(cpu, p, dst_cpu);
- struct task_struct *tsk = cpu == dst_cpu ? p : NULL;
+ unsigned long util_freq = cpu_util_next(cpu, p, dst_cpu);
+ unsigned long cpu_util, util_running = util_freq;
+ struct task_struct *tsk = NULL;
+
+ /*
+ * When @p is placed on @cpu:
+ *
+ * util_running = max(cpu_util, cpu_util_est) +
+ * max(task_util, _task_util_est)
+ *
+ * while cpu_util_next is: max(cpu_util + task_util,
+ * cpu_util_est + _task_util_est)
+ */
+ if (cpu == dst_cpu) {
+ tsk = p;
+ util_running =
+ cpu_util_next(cpu, p, -1) + task_util_est(p);
+ }
/*
* Busy time computation: utilization clamping is not
@@ -6527,7 +6600,7 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
* is already enough to scale the EM reported power
* consumption at the (eventually clamped) cpu_capacity.
*/
- sum_util += effective_cpu_util(cpu, util_cfs, cpu_cap,
+ sum_util += effective_cpu_util(cpu, util_running, cpu_cap,
ENERGY_UTIL, NULL);
/*
@@ -6537,7 +6610,7 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
* NOTE: in case RT tasks are running, by default the
* FREQUENCY_UTIL's utilization can be max OPP.
*/
- cpu_util = effective_cpu_util(cpu, util_cfs, cpu_cap,
+ cpu_util = effective_cpu_util(cpu, util_freq, cpu_cap,
FREQUENCY_UTIL, tsk);
max_util = max(max_util, cpu_util);
}
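[Editor's note] The compute_energy() hunk above separates the utilization used for busy-time/energy accounting (util_running) from the one used for frequency selection (util_freq). The two formulas in the new comment diverge whenever the estimated and instantaneous values disagree: adding the task's full task_util_est() on top of the destination CPU's own maximum is always at least as large as the max-of-sums that cpu_util_next() returns, so the energy side never under-counts the task. A small worked example with made-up PELT numbers (pure illustration, no kernel interfaces):

```c
#include <stdio.h>

static unsigned long max_ul(unsigned long a, unsigned long b) { return a > b ? a : b; }

int main(void)
{
	/* Made-up PELT values for the destination CPU and the waking task. */
	unsigned long cpu_util = 300, cpu_util_est = 200;
	unsigned long task_util = 50, task_util_est = 150;

	/* cpu_util_next(cpu, p, cpu): max of the two sums -> used for FREQUENCY_UTIL */
	unsigned long util_freq = max_ul(cpu_util + task_util, cpu_util_est + task_util_est);

	/* max(cpu_util, cpu_util_est) + max(task_util, task_util_est) -> used for ENERGY_UTIL */
	unsigned long util_running = max_ul(cpu_util, cpu_util_est) + max_ul(task_util, task_util_est);

	printf("util_freq    = %lu\n", util_freq);     /* 350 */
	printf("util_running = %lu\n", util_running);  /* 450: never smaller than util_freq */
	return 0;
}
```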
@@ -6935,7 +7008,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
/*
* This is possible from callers such as attach_tasks(), in which we
- * unconditionally check_prempt_curr() after an enqueue (which may have
+ * unconditionally check_preempt_curr() after an enqueue (which may have
* lead to a throttle). This both saves work and prevents false
* next-buddy nomination below.
*/
@@ -7392,8 +7465,7 @@ enum migration_type {
#define LBF_NEED_BREAK 0x02
#define LBF_DST_PINNED 0x04
#define LBF_SOME_PINNED 0x08
-#define LBF_NOHZ_STATS 0x10
-#define LBF_NOHZ_AGAIN 0x20
+#define LBF_ACTIVE_LB 0x10
struct lb_env {
struct sched_domain *sd;
@@ -7539,6 +7611,10 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
return 0;
+ /* Disregard pcpu kthreads; they are where they need to be. */
+ if (kthread_is_per_cpu(p))
+ return 0;
+
if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
int cpu;
@@ -7551,10 +7627,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
* our sched_group. We may want to revisit it if we couldn't
* meet load balance goals by pulling other tasks on src_cpu.
*
- * Avoid computing new_dst_cpu for NEWLY_IDLE or if we have
- * already computed one in current iteration.
+ * Avoid computing new_dst_cpu
+ * - for NEWLY_IDLE
+ * - if we have already computed one in current iteration
+ * - if it's an active balance
*/
- if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED))
+ if (env->idle == CPU_NEWLY_IDLE ||
+ env->flags & (LBF_DST_PINNED | LBF_ACTIVE_LB))
return 0;
/* Prevent to re-select dst_cpu via env's CPUs: */
@@ -7569,7 +7648,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
return 0;
}
- /* Record that we found atleast one task that could run on dst_cpu */
+ /* Record that we found at least one task that could run on dst_cpu */
env->flags &= ~LBF_ALL_PINNED;
if (task_running(env->src_rq, p)) {
@@ -7579,10 +7658,14 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
/*
* Aggressive migration if:
- * 1) destination numa is preferred
- * 2) task is cache cold, or
- * 3) too many balance attempts have failed.
+ * 1) active balance
+ * 2) destination numa is preferred
+ * 3) task is cache cold, or
+ * 4) too many balance attempts have failed.
*/
+ if (env->flags & LBF_ACTIVE_LB)
+ return 1;
+
tsk_cache_hot = migrate_degrades_locality(p, env);
if (tsk_cache_hot == -1)
tsk_cache_hot = task_hot(p, env);
@@ -7659,6 +7742,15 @@ static int detach_tasks(struct lb_env *env)
lockdep_assert_held(&env->src_rq->lock);
+ /*
+ * Source run queue has been emptied by another CPU, clear
+ * LBF_ALL_PINNED flag as we will not test any task.
+ */
+ if (env->src_rq->nr_running <= 1) {
+ env->flags &= ~LBF_ALL_PINNED;
+ return 0;
+ }
+
if (env->imbalance <= 0)
return 0;
@@ -7708,8 +7800,7 @@ static int detach_tasks(struct lb_env *env)
* scheduler fails to find a good waiting task to
* migrate.
*/
-
- if ((load >> env->sd->nr_balance_failed) > env->imbalance)
+ if (shr_bound(load, env->sd->nr_balance_failed) > env->imbalance)
goto next;
env->imbalance -= load;
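[Editor's note] shr_bound() replaces the raw load >> nr_balance_failed above; shifting by an amount equal to or larger than the width of the type is undefined behaviour in C, so a long run of failed balance attempts should saturate instead of producing an unbounded shift. A hedged standalone sketch of an equivalent helper, assuming the kernel version clamps the shift to the last valid bit position:

```c
#include <stdio.h>

/* Clamp the shift so that shifting an unsigned long never hits undefined behaviour. */
static unsigned long shr_bound_sketch(unsigned long val, unsigned int shift)
{
	unsigned int max_shift = 8 * sizeof(unsigned long) - 1;

	return val >> (shift < max_shift ? shift : max_shift);
}

int main(void)
{
	unsigned long load = 2048;

	/* A few failed balance attempts halve the tolerated load each time... */
	printf("%lu\n", shr_bound_sketch(load, 3));    /* 256 */
	/* ...and an absurd failure count saturates instead of invoking UB. */
	printf("%lu\n", shr_bound_sketch(load, 200));  /* 0 */
	return 0;
}
```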
@@ -7854,16 +7945,20 @@ static inline bool others_have_blocked(struct rq *rq)
return false;
}
-static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
+static inline void update_blocked_load_tick(struct rq *rq)
{
- rq->last_blocked_load_update_tick = jiffies;
+ WRITE_ONCE(rq->last_blocked_load_update_tick, jiffies);
+}
+static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
+{
if (!has_blocked)
rq->has_blocked_load = 0;
}
#else
static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
static inline bool others_have_blocked(struct rq *rq) { return false; }
+static inline void update_blocked_load_tick(struct rq *rq) {}
static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
#endif
@@ -8024,6 +8119,7 @@ static void update_blocked_averages(int cpu)
struct rq_flags rf;
rq_lock_irqsave(rq, &rf);
+ update_blocked_load_tick(rq);
update_rq_clock(rq);
decayed |= __update_blocked_others(rq, &done);
@@ -8311,26 +8407,6 @@ group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
return false;
}
-/*
- * group_smaller_min_cpu_capacity: Returns true if sched_group sg has smaller
- * per-CPU capacity than sched_group ref.
- */
-static inline bool
-group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
-{
- return fits_capacity(sg->sgc->min_capacity, ref->sgc->min_capacity);
-}
-
-/*
- * group_smaller_max_cpu_capacity: Returns true if sched_group sg has smaller
- * per-CPU capacity_orig than sched_group ref.
- */
-static inline bool
-group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
-{
- return fits_capacity(sg->sgc->max_capacity, ref->sgc->max_capacity);
-}
-
static inline enum
group_type group_classify(unsigned int imbalance_pct,
struct sched_group *group,
@@ -8354,28 +8430,6 @@ group_type group_classify(unsigned int imbalance_pct,
return group_has_spare;
}
-static bool update_nohz_stats(struct rq *rq, bool force)
-{
-#ifdef CONFIG_NO_HZ_COMMON
- unsigned int cpu = rq->cpu;
-
- if (!rq->has_blocked_load)
- return false;
-
- if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
- return false;
-
- if (!force && !time_after(jiffies, rq->last_blocked_load_update_tick))
- return true;
-
- update_blocked_averages(cpu);
-
- return rq->has_blocked_load;
-#else
- return false;
-#endif
-}
-
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @env: The load balancing environment.
@@ -8397,9 +8451,6 @@ static inline void update_sg_lb_stats(struct lb_env *env,
for_each_cpu_and(i, sched_group_span(group), env->cpus) {
struct rq *rq = cpu_rq(i);
- if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
- env->flags |= LBF_NOHZ_AGAIN;
-
sgs->group_load += cpu_load(rq);
sgs->group_util += cpu_util(i);
sgs->group_runnable += cpu_runnable(rq);
@@ -8489,7 +8540,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
* internally or be covered by avg_load imbalance (eventually).
*/
if (sgs->group_type == group_misfit_task &&
- (!group_smaller_max_cpu_capacity(sg, sds->local) ||
+ (!capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) ||
sds->local_stat.group_type != group_has_spare))
return false;
@@ -8573,7 +8624,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
*/
if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
(sgs->group_type <= group_fully_busy) &&
- (group_smaller_min_cpu_capacity(sds->local, sg)))
+ (capacity_greater(sg->sgc->min_capacity, capacity_of(env->dst_cpu))))
return false;
return true;
@@ -8940,11 +8991,6 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
struct sg_lb_stats tmp_sgs;
int sg_status = 0;
-#ifdef CONFIG_NO_HZ_COMMON
- if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked))
- env->flags |= LBF_NOHZ_STATS;
-#endif
-
do {
struct sg_lb_stats *sgs = &tmp_sgs;
int local_group;
@@ -8981,14 +9027,6 @@ next_group:
/* Tag domain that child domain prefers tasks go to siblings first */
sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
-#ifdef CONFIG_NO_HZ_COMMON
- if ((env->flags & LBF_NOHZ_AGAIN) &&
- cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
-
- WRITE_ONCE(nohz.next_blocked,
- jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
- }
-#endif
if (env->sd->flags & SD_NUMA)
env->fbq_type = fbq_classify_group(&sds->busiest_stat);
@@ -9386,7 +9424,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
* average load.
*/
if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
- capacity_of(env->dst_cpu) < capacity &&
+ !capacity_greater(capacity_of(env->dst_cpu), capacity) &&
nr_running == 1)
continue;
@@ -9676,7 +9714,7 @@ more_balance:
* load to given_cpu. In rare situations, this may cause
* conflicts (balance_cpu and given_cpu/ilb_cpu deciding
* _independently_ and at _same_ time to move some load to
- * given_cpu) causing exceess load to be moved to given_cpu.
+ * given_cpu) causing excess load to be moved to given_cpu.
* This however should not happen so much in practice and
* moreover subsequent load balance cycles should correct the
* excess load moved.
@@ -9776,9 +9814,6 @@ more_balance:
active_load_balance_cpu_stop, busiest,
&busiest->active_balance_work);
}
-
- /* We've kicked active balancing, force task migration. */
- sd->nr_balance_failed = sd->cache_nice_tries+1;
}
} else {
sd->nr_balance_failed = 0;
@@ -9820,7 +9855,7 @@ out_one_pinned:
/*
* newidle_balance() disregards balance intervals, so we could
* repeatedly reach this code, which would lead to balance_interval
- * skyrocketting in a short amount of time. Skip the balance_interval
+ * skyrocketing in a short amount of time. Skip the balance_interval
* increase logic to avoid that.
*/
if (env.idle == CPU_NEWLY_IDLE)
@@ -9928,13 +9963,7 @@ static int active_load_balance_cpu_stop(void *data)
.src_cpu = busiest_rq->cpu,
.src_rq = busiest_rq,
.idle = CPU_IDLE,
- /*
- * can_migrate_task() doesn't need to compute new_dst_cpu
- * for active balancing. Since we have CPU_IDLE, but no
- * @dst_grpmask we need to make that test go away with lying
- * about DST_PINNED.
- */
- .flags = LBF_DST_PINNED,
+ .flags = LBF_ACTIVE_LB,
};
schedstat_inc(sd->alb_count);
@@ -10061,22 +10090,9 @@ out:
* When the cpu is attached to null domain for ex, it will not be
* updated.
*/
- if (likely(update_next_balance)) {
+ if (likely(update_next_balance))
rq->next_balance = next_balance;
-#ifdef CONFIG_NO_HZ_COMMON
- /*
- * If this CPU has been elected to perform the nohz idle
- * balance. Other idle CPUs have already rebalanced with
- * nohz_idle_balance() and nohz.next_balance has been
- * updated accordingly. This CPU is now running the idle load
- * balance for itself and we need to update the
- * nohz.next_balance accordingly.
- */
- if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
- nohz.next_balance = rq->next_balance;
-#endif
- }
}
static inline int on_null_domain(struct rq *rq)
@@ -10368,14 +10384,30 @@ out:
WRITE_ONCE(nohz.has_blocked, 1);
}
+static bool update_nohz_stats(struct rq *rq)
+{
+ unsigned int cpu = rq->cpu;
+
+ if (!rq->has_blocked_load)
+ return false;
+
+ if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
+ return false;
+
+ if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick)))
+ return true;
+
+ update_blocked_averages(cpu);
+
+ return rq->has_blocked_load;
+}
+
/*
* Internal function that runs load balance for all idle cpus. The load balance
* can be a simple update of blocked load or a complete load balance with
* tasks movement depending of flags.
- * The function returns false if the loop has stopped before running
- * through all idle CPUs.
*/
-static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
+static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
enum cpu_idle_type idle)
{
/* Earliest time when we have to do rebalance again */
@@ -10385,7 +10417,6 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
int update_next_balance = 0;
int this_cpu = this_rq->cpu;
int balance_cpu;
- int ret = false;
struct rq *rq;
SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
@@ -10406,8 +10437,12 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
*/
smp_mb();
- for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
- if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
+ /*
+ * Start with the next CPU after this_cpu so we will end with this_cpu and let a
+ * chance for other idle cpu to pull load.
+ */
+ for_each_cpu_wrap(balance_cpu, nohz.idle_cpus_mask, this_cpu+1) {
+ if (!idle_cpu(balance_cpu))
continue;
/*
@@ -10422,7 +10457,7 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
rq = cpu_rq(balance_cpu);
- has_blocked_load |= update_nohz_stats(rq, true);
+ has_blocked_load |= update_nohz_stats(rq);
/*
* If time for next balance is due,
@@ -10453,27 +10488,13 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
if (likely(update_next_balance))
nohz.next_balance = next_balance;
- /* Newly idle CPU doesn't need an update */
- if (idle != CPU_NEWLY_IDLE) {
- update_blocked_averages(this_cpu);
- has_blocked_load |= this_rq->has_blocked_load;
- }
-
- if (flags & NOHZ_BALANCE_KICK)
- rebalance_domains(this_rq, CPU_IDLE);
-
WRITE_ONCE(nohz.next_blocked,
now + msecs_to_jiffies(LOAD_AVG_PERIOD));
- /* The full idle balance loop has been done */
- ret = true;
-
abort:
/* There is still blocked load, enable periodic update */
if (has_blocked_load)
WRITE_ONCE(nohz.has_blocked, 1);
-
- return ret;
}
/*
@@ -10497,6 +10518,24 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
return true;
}
+/*
+ * Check if we need to run the ILB for updating blocked load before entering
+ * idle state.
+ */
+void nohz_run_idle_balance(int cpu)
+{
+ unsigned int flags;
+
+ flags = atomic_fetch_andnot(NOHZ_NEWILB_KICK, nohz_flags(cpu));
+
+ /*
+ * Update the blocked load only if no SCHED_SOFTIRQ is about to happen
+ * (ie NOHZ_STATS_KICK set) and will do the same.
+ */
+ if ((flags == NOHZ_NEWILB_KICK) && !need_resched())
+ _nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK, CPU_IDLE);
+}
+
static void nohz_newidle_balance(struct rq *this_rq)
{
int this_cpu = this_rq->cpu;
@@ -10517,16 +10556,11 @@ static void nohz_newidle_balance(struct rq *this_rq)
time_before(jiffies, READ_ONCE(nohz.next_blocked)))
return;
- raw_spin_unlock(&this_rq->lock);
/*
- * This CPU is going to be idle and blocked load of idle CPUs
- * need to be updated. Run the ilb locally as it is a good
- * candidate for ilb instead of waking up another idle CPU.
- * Kick an normal ilb if we failed to do the update.
+ * Set the need to trigger ILB in order to update blocked load
+ * before entering idle state.
*/
- if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
- kick_ilb(NOHZ_STATS_KICK);
- raw_spin_lock(&this_rq->lock);
+ atomic_or(NOHZ_NEWILB_KICK, nohz_flags(this_cpu));
}
#else /* !CONFIG_NO_HZ_COMMON */
@@ -10587,8 +10621,6 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
update_next_balance(sd, &next_balance);
rcu_read_unlock();
- nohz_newidle_balance(this_rq);
-
goto out;
}
@@ -10635,7 +10667,6 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
if (curr_cost > this_rq->max_idle_balance_cost)
this_rq->max_idle_balance_cost = curr_cost;
-out:
/*
* While browsing the domains, we released the rq lock, a task could
* have been enqueued in the meantime. Since we're not going idle,
@@ -10644,16 +10675,19 @@ out:
if (this_rq->cfs.h_nr_running && !pulled_task)
pulled_task = 1;
- /* Move the next balance forward */
- if (time_after(this_rq->next_balance, next_balance))
- this_rq->next_balance = next_balance;
-
/* Is there a task of a high priority class? */
if (this_rq->nr_running != this_rq->cfs.h_nr_running)
pulled_task = -1;
+out:
+ /* Move the next balance forward */
+ if (time_after(this_rq->next_balance, next_balance))
+ this_rq->next_balance = next_balance;
+
if (pulled_task)
this_rq->idle_stamp = 0;
+ else
+ nohz_newidle_balance(this_rq);
rq_repin_lock(this_rq, rf);