sched/core: Distribute tasks within affinity masks

Currently, when updating the affinity of tasks via either cpusets.cpus, or, sched_setaffinity(); tasks not currently running within the newly specified mask will be arbitrarily assigned to the first CPU within the mask. This (particularly in the case that we are restricting masks) can result in many tasks being assigned to the first CPUs of their new masks. This: 1) Can induce scheduling delays while the load-balancer has a chance to spread them between their new CPUs. 2) Can antogonize a poor load-balancer behavior where it has a difficult time recognizing that a cross-socket imbalance has been forced by an affinity mask. This change adds a new cpumask interface to allow iterated calls to distribute within the intersection of the provided masks. The cases that this mainly affects are: - modifying cpuset.cpus - when tasks join a cpuset - when modifying a task's affinity via sched_setaffinity(2) Signed-off-by: Paul Turner <pjt@google.com> Signed-off-by: Josh Don <joshdon@google.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Reviewed-by: Qais Yousef <qais.yousef@arm.com> Tested-by: Qais Yousef <qais.yousef@arm.com> Link: https://lkml.kernel.org/r/20200311010113.136465-1-joshdon@google.com
author: Paul Turner <pjt@google.com> 2020-03-10 18:01:13 -0700
committer: Peter Zijlstra <peterz@infradead.org> 2020-03-20 13:06:18 +0100
commit: 46a87b3851f0d6eb05e6d83d5c5a30df0eca8f76 (patch)
tree: b98cf01a4a12e708f063c07788f7a3a7b41a93f2
parent: fe61468b2cbc2b7ce5f8d3bf32ae5001d4c434e9 (diff)
3 files changed, 42 insertions, 1 deletions
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index d5cc88514aee..f0d895d6ac39 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -194,6 +194,11 @@ static inline unsigned int cpumask_local_spread(unsigned int i, int node)
 	return 0;
 }
 
+static inline int cpumask_any_and_distribute(const struct cpumask *src1p,
+					     const struct cpumask *src2p) {
+	return cpumask_next_and(-1, src1p, src2p);
+}
+
 #define for_each_cpu(cpu, mask)			\
 	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)
 #define for_each_cpu_not(cpu, mask)		\
@@ -245,6 +250,8 @@ static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp)
 int cpumask_next_and(int n, const struct cpumask *, const struct cpumask *);
 int cpumask_any_but(const struct cpumask *mask, unsigned int cpu);
 unsigned int cpumask_local_spread(unsigned int i, int node);
+int cpumask_any_and_distribute(const struct cpumask *src1p,
+			       const struct cpumask *src2p);
 
 /**
  * for_each_cpu - iterate over every cpu in a mask
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 978bf6f6e0ff..014d4f793313 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1650,7 +1650,12 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
 	if (cpumask_equal(p->cpus_ptr, new_mask))
 		goto out;
 
-	dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
+	/*
+	 * Picking a ~random cpu helps in cases where we are changing affinity
+	 * for groups of tasks (ie. cpuset), so that load balancing is not
+	 * immediately required to distribute the tasks within their new mask.
+	 */
+	dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask);
 	if (dest_cpu >= nr_cpu_ids) {
 		ret = -EINVAL;
 		goto out;
diff --git a/lib/cpumask.c b/lib/cpumask.c
index 0cb672eb107c..fb22fb266f93 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -232,3 +232,32 @@ unsigned int cpumask_local_spread(unsigned int i, int node)
 	BUG();
 }
 EXPORT_SYMBOL(cpumask_local_spread);
+
+static DEFINE_PER_CPU(int, distribute_cpu_mask_prev);
+
+/**
+ * Returns an arbitrary cpu within srcp1 & srcp2.
+ *
+ * Iterated calls using the same srcp1 and srcp2 will be distributed within
+ * their intersection.
+ *
+ * Returns >= nr_cpu_ids if the intersection is empty.
+ */
+int cpumask_any_and_distribute(const struct cpumask *src1p,
+			       const struct cpumask *src2p)
+{
+	int next, prev;
+
+	/* NOTE: our first selection will skip 0. */
+	prev = __this_cpu_read(distribute_cpu_mask_prev);
+
+	next = cpumask_next_and(prev, src1p, src2p);
+	if (next >= nr_cpu_ids)
+		next = cpumask_first_and(src1p, src2p);
+
+	if (next < nr_cpu_ids)
+		__this_cpu_write(distribute_cpu_mask_prev, next);
+
+	return next;
+}
+EXPORT_SYMBOL(cpumask_any_and_distribute);
author	Paul Turner <pjt@google.com>	2020-03-10 18:01:13 -0700
committer	Peter Zijlstra <peterz@infradead.org>	2020-03-20 13:06:18 +0100
commit	46a87b3851f0d6eb05e6d83d5c5a30df0eca8f76 (patch)
tree	b98cf01a4a12e708f063c07788f7a3a7b41a93f2
parent	fe61468b2cbc2b7ce5f8d3bf32ae5001d4c434e9 (diff)