summaryrefslogtreecommitdiff
path: root/kernel/sched/fair.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--kernel/sched/fair.c227
1 files changed, 136 insertions, 91 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ba749f579714..3c8a379c357e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -801,7 +801,7 @@ void post_init_entity_util_avg(struct task_struct *p)
* For !fair tasks do:
*
update_cfs_rq_load_avg(now, cfs_rq);
- attach_entity_load_avg(cfs_rq, se, 0);
+ attach_entity_load_avg(cfs_rq, se);
switched_from_fair(rq, p);
*
* such that the next switched_to_fair() has the
@@ -3114,7 +3114,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
{
struct rq *rq = rq_of(cfs_rq);
- if (&rq->cfs == cfs_rq || (flags & SCHED_CPUFREQ_MIGRATION)) {
+ if (&rq->cfs == cfs_rq) {
/*
* There are a few boundary cases this might miss but it should
* get called often enough that that should (hopefully) not be
@@ -3366,16 +3366,17 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf
runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
- delta_sum = runnable_load_sum - se_weight(se) * se->avg.runnable_load_sum;
- delta_avg = runnable_load_avg - se->avg.runnable_load_avg;
-
- se->avg.runnable_load_sum = runnable_sum;
- se->avg.runnable_load_avg = runnable_load_avg;
if (se->on_rq) {
+ delta_sum = runnable_load_sum -
+ se_weight(se) * se->avg.runnable_load_sum;
+ delta_avg = runnable_load_avg - se->avg.runnable_load_avg;
add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg);
add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum);
}
+
+ se->avg.runnable_load_sum = runnable_sum;
+ se->avg.runnable_load_avg = runnable_load_avg;
}
static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
@@ -3515,12 +3516,11 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
* attach_entity_load_avg - attach this entity to its cfs_rq load avg
* @cfs_rq: cfs_rq to attach to
* @se: sched_entity to attach
- * @flags: migration hints
*
* Must call update_cfs_rq_load_avg() before this, since we rely on
* cfs_rq->avg.last_update_time being current.
*/
-static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
@@ -3556,7 +3556,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
- cfs_rq_util_change(cfs_rq, flags);
+ cfs_rq_util_change(cfs_rq, 0);
trace_pelt_cfs_tp(cfs_rq);
}
@@ -3614,7 +3614,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
*
* IOW we're enqueueing a task on a new CPU.
*/
- attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION);
+ attach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq, 0);
} else if (decayed) {
@@ -3711,6 +3711,20 @@ static inline unsigned long task_util_est(struct task_struct *p)
return max(task_util(p), _task_util_est(p));
}
+#ifdef CONFIG_UCLAMP_TASK
+static inline unsigned long uclamp_task_util(struct task_struct *p)
+{
+ return clamp(task_util_est(p),
+ uclamp_eff_value(p, UCLAMP_MIN),
+ uclamp_eff_value(p, UCLAMP_MAX));
+}
+#else
+static inline unsigned long uclamp_task_util(struct task_struct *p)
+{
+ return task_util_est(p);
+}
+#endif
+
static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
struct task_struct *p)
{
@@ -3822,7 +3836,7 @@ done:
static inline int task_fits_capacity(struct task_struct *p, long capacity)
{
- return fits_capacity(task_util_est(p), capacity);
+ return fits_capacity(uclamp_task_util(p), capacity);
}
static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
@@ -3857,7 +3871,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
static inline void remove_entity_load_avg(struct sched_entity *se) {}
static inline void
-attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {}
+attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
static inline void
detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
@@ -5196,6 +5210,20 @@ static inline void update_overutilized_status(struct rq *rq)
static inline void update_overutilized_status(struct rq *rq) { }
#endif
+/* Runqueue only has SCHED_IDLE tasks enqueued */
+static int sched_idle_rq(struct rq *rq)
+{
+ return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
+ rq->nr_running);
+}
+
+#ifdef CONFIG_SMP
+static int sched_idle_cpu(int cpu)
+{
+ return sched_idle_rq(cpu_rq(cpu));
+}
+#endif
+
/*
* The enqueue_task method is called before nr_running is
* increased. Here we update the fair scheduling stats and
@@ -5310,6 +5338,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
struct sched_entity *se = &p->se;
int task_sleep = flags & DEQUEUE_SLEEP;
int idle_h_nr_running = task_has_idle_policy(p);
+ bool was_sched_idle = sched_idle_rq(rq);
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
@@ -5356,6 +5385,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (!se)
sub_nr_running(rq, 1);
+ /* balance early to pull high priority tasks */
+ if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
+ rq->next_balance = jiffies;
+
util_est_dequeue(&rq->cfs, p, task_sleep);
hrtick_update(rq);
}
@@ -5378,15 +5411,6 @@ static struct {
#endif /* CONFIG_NO_HZ_COMMON */
-/* CPU only has SCHED_IDLE tasks enqueued */
-static int sched_idle_cpu(int cpu)
-{
- struct rq *rq = cpu_rq(cpu);
-
- return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
- rq->nr_running);
-}
-
static unsigned long cpu_load(struct rq *rq)
{
return cfs_rq_load_avg(&rq->cfs);
@@ -5588,7 +5612,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
unsigned int min_exit_latency = UINT_MAX;
u64 latest_idle_timestamp = 0;
int least_loaded_cpu = this_cpu;
- int shallowest_idle_cpu = -1, si_cpu = -1;
+ int shallowest_idle_cpu = -1;
int i;
/* Check if we have any choice: */
@@ -5597,6 +5621,9 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
/* Traverse only the allowed CPUs */
for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
+ if (sched_idle_cpu(i))
+ return i;
+
if (available_idle_cpu(i)) {
struct rq *rq = cpu_rq(i);
struct cpuidle_state *idle = idle_get_state(rq);
@@ -5619,12 +5646,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
latest_idle_timestamp = rq->idle_stamp;
shallowest_idle_cpu = i;
}
- } else if (shallowest_idle_cpu == -1 && si_cpu == -1) {
- if (sched_idle_cpu(i)) {
- si_cpu = i;
- continue;
- }
-
+ } else if (shallowest_idle_cpu == -1) {
load = cpu_load(cpu_rq(i));
if (load < min_load) {
min_load = load;
@@ -5633,11 +5655,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
}
}
- if (shallowest_idle_cpu != -1)
- return shallowest_idle_cpu;
- if (si_cpu != -1)
- return si_cpu;
- return least_loaded_cpu;
+ return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
}
static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
@@ -5790,7 +5808,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
*/
static int select_idle_smt(struct task_struct *p, int target)
{
- int cpu, si_cpu = -1;
+ int cpu;
if (!static_branch_likely(&sched_smt_present))
return -1;
@@ -5798,13 +5816,11 @@ static int select_idle_smt(struct task_struct *p, int target)
for_each_cpu(cpu, cpu_smt_mask(target)) {
if (!cpumask_test_cpu(cpu, p->cpus_ptr))
continue;
- if (available_idle_cpu(cpu))
+ if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
return cpu;
- if (si_cpu == -1 && sched_idle_cpu(cpu))
- si_cpu = cpu;
}
- return si_cpu;
+ return -1;
}
#else /* CONFIG_SCHED_SMT */
@@ -5828,12 +5844,13 @@ static inline int select_idle_smt(struct task_struct *p, int target)
*/
static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
{
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
struct sched_domain *this_sd;
u64 avg_cost, avg_idle;
u64 time, cost;
s64 delta;
int this = smp_processor_id();
- int cpu, nr = INT_MAX, si_cpu = -1;
+ int cpu, nr = INT_MAX;
this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
if (!this_sd)
@@ -5859,15 +5876,13 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
time = cpu_clock(this);
- for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
+ cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
+
+ for_each_cpu_wrap(cpu, cpus, target) {
if (!--nr)
- return si_cpu;
- if (!cpumask_test_cpu(cpu, p->cpus_ptr))
- continue;
- if (available_idle_cpu(cpu))
+ return -1;
+ if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
break;
- if (si_cpu == -1 && sched_idle_cpu(cpu))
- si_cpu = cpu;
}
time = cpu_clock(this) - time;
@@ -5896,6 +5911,20 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
(available_idle_cpu(prev) || sched_idle_cpu(prev)))
return prev;
+ /*
+ * Allow a per-cpu kthread to stack with the wakee if the
+ * kworker thread and the tasks previous CPUs are the same.
+ * The assumption is that the wakee queued work for the
+ * per-cpu kthread that is now complete and the wakeup is
+ * essentially a sync wakeup. An obvious example of this
+ * pattern is IO completions.
+ */
+ if (is_per_cpu_kthread(current) &&
+ prev == smp_processor_id() &&
+ this_rq()->nr_running <= 1) {
+ return prev;
+ }
+
/* Check a recently used CPU as a potential idle candidate: */
recent_used_cpu = p->recent_used_cpu;
if (recent_used_cpu != prev &&
@@ -6268,9 +6297,18 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
if (!cpumask_test_cpu(cpu, p->cpus_ptr))
continue;
- /* Skip CPUs that will be overutilized. */
util = cpu_util_next(cpu, p, cpu);
cpu_cap = capacity_of(cpu);
+ spare_cap = cpu_cap - util;
+
+ /*
+ * Skip CPUs that cannot satisfy the capacity request.
+ * IOW, placing the task there would make the CPU
+ * overutilized. Take uclamp into account to see how
+ * much capacity we can get out of the CPU; this is
+ * aligned with schedutil_cpu_util().
+ */
+ util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
if (!fits_capacity(util, cpu_cap))
continue;
@@ -6285,7 +6323,6 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
* Find the CPU with the maximum spare capacity in
* the performance domain
*/
- spare_cap = cpu_cap - util;
if (spare_cap > max_spare_cap) {
max_spare_cap = spare_cap;
max_spare_cap_cpu = cpu;
@@ -7780,29 +7817,11 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
*/
for_each_cpu(cpu, sched_group_span(sdg)) {
- struct sched_group_capacity *sgc;
- struct rq *rq = cpu_rq(cpu);
-
- /*
- * build_sched_domains() -> init_sched_groups_capacity()
- * gets here before we've attached the domains to the
- * runqueues.
- *
- * Use capacity_of(), which is set irrespective of domains
- * in update_cpu_capacity().
- *
- * This avoids capacity from being 0 and
- * causing divide-by-zero issues on boot.
- */
- if (unlikely(!rq->sd)) {
- capacity += capacity_of(cpu);
- } else {
- sgc = rq->sd->groups->sgc;
- capacity += sgc->capacity;
- }
+ unsigned long cpu_cap = capacity_of(cpu);
- min_capacity = min(capacity, min_capacity);
- max_capacity = max(capacity, max_capacity);
+ capacity += cpu_cap;
+ min_capacity = min(cpu_cap, min_capacity);
+ max_capacity = max(cpu_cap, max_capacity);
}
} else {
/*
@@ -8168,14 +8187,18 @@ static bool update_sd_pick_busiest(struct lb_env *env,
case group_has_spare:
/*
- * Select not overloaded group with lowest number of
- * idle cpus. We could also compare the spare capacity
- * which is more stable but it can end up that the
- * group has less spare capacity but finally more idle
+ * Select not overloaded group with lowest number of idle cpus
+ * and highest number of running tasks. We could also compare
+ * the spare capacity which is more stable but it can end up
+ * that the group has less spare capacity but finally more idle
* CPUs which means less opportunity to pull tasks.
*/
- if (sgs->idle_cpus >= busiest->idle_cpus)
+ if (sgs->idle_cpus > busiest->idle_cpus)
return false;
+ else if ((sgs->idle_cpus == busiest->idle_cpus) &&
+ (sgs->sum_nr_running <= busiest->sum_nr_running))
+ return false;
+
break;
}
@@ -8648,10 +8671,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
/*
* Try to use spare capacity of local group without overloading it or
* emptying busiest.
- * XXX Spreading tasks across NUMA nodes is not always the best policy
- * and special care should be taken for SD_NUMA domain level before
- * spreading the tasks. For now, load_balance() fully relies on
- * NUMA_BALANCING and fbq_classify_group/rq to override the decision.
*/
if (local->group_type == group_has_spare) {
if (busiest->group_type > group_fully_busy) {
@@ -8691,16 +8710,37 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
env->migration_type = migrate_task;
lsub_positive(&nr_diff, local->sum_nr_running);
env->imbalance = nr_diff >> 1;
- return;
- }
+ } else {
- /*
- * If there is no overload, we just want to even the number of
- * idle cpus.
- */
- env->migration_type = migrate_task;
- env->imbalance = max_t(long, 0, (local->idle_cpus -
+ /*
+ * If there is no overload, we just want to even the number of
+ * idle cpus.
+ */
+ env->migration_type = migrate_task;
+ env->imbalance = max_t(long, 0, (local->idle_cpus -
busiest->idle_cpus) >> 1);
+ }
+
+ /* Consider allowing a small imbalance between NUMA groups */
+ if (env->sd->flags & SD_NUMA) {
+ unsigned int imbalance_min;
+
+ /*
+ * Compute an allowed imbalance based on a simple
+ * pair of communicating tasks that should remain
+ * local and ignore them.
+ *
+ * NOTE: Generally this would have been based on
+ * the domain size and this was evaluated. However,
+ * the benefit is similar across a range of workloads
+ * and machines but scaling by the domain size adds
+ * the risk that lower domains have to be rebalanced.
+ */
+ imbalance_min = 2;
+ if (busiest->sum_nr_running <= imbalance_min)
+ env->imbalance = 0;
+ }
+
return;
}
@@ -9529,6 +9569,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
{
int continue_balancing = 1;
int cpu = rq->cpu;
+ int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
unsigned long interval;
struct sched_domain *sd;
/* Earliest time when we have to do rebalance again */
@@ -9565,7 +9606,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
break;
}
- interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
+ interval = get_sd_balance_interval(sd, busy);
need_serialize = sd->flags & SD_SERIALIZE;
if (need_serialize) {
@@ -9581,9 +9622,10 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
* state even if we migrated tasks. Update it.
*/
idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
+ busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
}
sd->last_balance = jiffies;
- interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
+ interval = get_sd_balance_interval(sd, busy);
}
if (need_serialize)
spin_unlock(&balancing);
@@ -10333,6 +10375,9 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
if (!task_on_rq_queued(p))
return;
+ if (rq->cfs.nr_running == 1)
+ return;
+
/*
* Reschedule if we are currently running on this runqueue and
* our priority decreased, or if we are not currently running on
@@ -10423,7 +10468,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
/* Synchronize entity with its cfs_rq */
update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
- attach_entity_load_avg(cfs_rq, se, 0);
+ attach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq, false);
propagate_entity_cfs_rq(se);
}