5 files changed, 114 insertions, 120 deletions
diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
index 659879a56dba..dcb972a38fbc 100644
--- a/drivers/cpufreq/Kconfig
+++ b/drivers/cpufreq/Kconfig
@@ -3,6 +3,7 @@ menu "CPU Frequency scaling"
 config CPU_FREQ
 	bool "CPU Frequency scaling"
 	select SRCU
+	select IRQ_WORK
 	help
 	  CPU Frequency scaling allows you to change the clock speed of 
 	  CPUs on the fly. This is a nice method to save power, because 
diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c
index 8504a70a4785..bc002c8cba90 100644
--- a/drivers/cpufreq/cpufreq_conservative.c
+++ b/drivers/cpufreq/cpufreq_conservative.c
@@ -112,14 +112,12 @@ static void cs_check_cpu(int cpu, unsigned int load)
 	}
 }
 
-static unsigned int cs_dbs_timer(struct cpufreq_policy *policy, bool modify_all)
+static unsigned int cs_dbs_timer(struct cpufreq_policy *policy)
 {
 	struct dbs_data *dbs_data = policy->governor_data;
 	struct cs_dbs_tuners *cs_tuners = dbs_data->tuners;
 
-	if (modify_all)
-		dbs_check_cpu(dbs_data, policy->cpu);
-
+	dbs_check_cpu(dbs_data, policy->cpu);
 	return delay_for_sampling_rate(cs_tuners->sampling_rate);
 }
 
diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
index e0d111024d48..6bc2f50cc1d9 100644
--- a/drivers/cpufreq/cpufreq_governor.c
+++ b/drivers/cpufreq/cpufreq_governor.c
@@ -128,10 +128,10 @@ void dbs_check_cpu(struct dbs_data *dbs_data, int cpu)
 		 * dropped down. So we perform the copy only once, upon the
 		 * first wake-up from idle.)
 		 *
-		 * Detecting this situation is easy: the governor's deferrable
-		 * timer would not have fired during CPU-idle periods. Hence
-		 * an unusually large 'wall_time' (as compared to the sampling
-		 * rate) indicates this scenario.
+		 * Detecting this situation is easy: the governor's utilization
+		 * update handler would not have run during CPU-idle periods.
+		 * Hence, an unusually large 'wall_time' (as compared to the
+		 * sampling rate) indicates this scenario.
 		 *
 		 * prev_load can be zero in two cases and we must recalculate it
 		 * for both cases:
@@ -161,72 +161,48 @@ void dbs_check_cpu(struct dbs_data *dbs_data, int cpu)
 }
 EXPORT_SYMBOL_GPL(dbs_check_cpu);
 
-void gov_add_timers(struct cpufreq_policy *policy, unsigned int delay)
+void gov_set_update_util(struct cpu_common_dbs_info *shared,
+			 unsigned int delay_us)
 {
+	struct cpufreq_policy *policy = shared->policy;
 	struct dbs_data *dbs_data = policy->governor_data;
-	struct cpu_dbs_info *cdbs;
 	int cpu;
 
+	gov_update_sample_delay(shared, delay_us);
+	shared->last_sample_time = 0;
+
 	for_each_cpu(cpu, policy->cpus) {
-		cdbs = dbs_data->cdata->get_cpu_cdbs(cpu);
-		cdbs->timer.expires = jiffies + delay;
-		add_timer_on(&cdbs->timer, cpu);
+		struct cpu_dbs_info *cdbs = dbs_data->cdata->get_cpu_cdbs(cpu);
+
+		cpufreq_set_update_util_data(cpu, &cdbs->update_util);
 	}
 }
-EXPORT_SYMBOL_GPL(gov_add_timers);
+EXPORT_SYMBOL_GPL(gov_set_update_util);
 
-static inline void gov_cancel_timers(struct cpufreq_policy *policy)
+static inline void gov_clear_update_util(struct cpufreq_policy *policy)
 {
-	struct dbs_data *dbs_data = policy->governor_data;
-	struct cpu_dbs_info *cdbs;
 	int i;
 
-	for_each_cpu(i, policy->cpus) {
-		cdbs = dbs_data->cdata->get_cpu_cdbs(i);
-		del_timer_sync(&cdbs->timer);
-	}
+	for_each_cpu(i, policy->cpus)
+		cpufreq_set_update_util_data(i, NULL);
+
+	synchronize_rcu();
 }
 
-void gov_cancel_work(struct cpu_common_dbs_info *shared)
+static void gov_cancel_work(struct cpu_common_dbs_info *shared)
 {
-	/* Tell dbs_timer_handler() to skip queuing up work items. */
+	/* Tell dbs_update_util_handler() to skip queuing up work items. */
 	atomic_inc(&shared->skip_work);
 	/*
-	 * If dbs_timer_handler() is already running, it may not notice the
-	 * incremented skip_work, so wait for it to complete to prevent its work
-	 * item from being queued up after the cancel_work_sync() below.
-	 */
-	gov_cancel_timers(shared->policy);
-	/*
-	 * In case dbs_timer_handler() managed to run and spawn a work item
-	 * before the timers have been canceled, wait for that work item to
-	 * complete and then cancel all of the timers set up by it.  If
-	 * dbs_timer_handler() runs again at that point, it will see the
-	 * positive value of skip_work and won't spawn any more work items.
+	 * If dbs_update_util_handler() is already running, it may not notice
+	 * the incremented skip_work, so wait for it to complete to prevent its
+	 * work item from being queued up after the cancel_work_sync() below.
 	 */
+	gov_clear_update_util(shared->policy);
+	irq_work_sync(&shared->irq_work);
 	cancel_work_sync(&shared->work);
-	gov_cancel_timers(shared->policy);
 	atomic_set(&shared->skip_work, 0);
 }
-EXPORT_SYMBOL_GPL(gov_cancel_work);
-
-/* Will return if we need to evaluate cpu load again or not */
-static bool need_load_eval(struct cpu_common_dbs_info *shared,
-			   unsigned int sampling_rate)
-{
-	if (policy_is_shared(shared->policy)) {
-		ktime_t time_now = ktime_get();
-		s64 delta_us = ktime_us_delta(time_now, shared->time_stamp);
-
-		/* Do nothing if we recently have sampled */
-		if (delta_us < (s64)(sampling_rate / 2))
-			return false;
-		else
-			shared->time_stamp = time_now;
-	}
-
-	return true;
-}
 
 static void dbs_work_handler(struct work_struct *work)
 {
@@ -234,56 +210,70 @@ static void dbs_work_handler(struct work_struct *work)
 					cpu_common_dbs_info, work);
 	struct cpufreq_policy *policy;
 	struct dbs_data *dbs_data;
-	unsigned int sampling_rate, delay;
-	bool eval_load;
+	unsigned int delay;
 
 	policy = shared->policy;
 	dbs_data = policy->governor_data;
 
-	/* Kill all timers */
-	gov_cancel_timers(policy);
-
-	if (dbs_data->cdata->governor == GOV_CONSERVATIVE) {
-		struct cs_dbs_tuners *cs_tuners = dbs_data->tuners;
-
-		sampling_rate = cs_tuners->sampling_rate;
-	} else {
-		struct od_dbs_tuners *od_tuners = dbs_data->tuners;
-
-		sampling_rate = od_tuners->sampling_rate;
-	}
-
-	eval_load = need_load_eval(shared, sampling_rate);
-
 	/*
-	 * Make sure cpufreq_governor_limits() isn't evaluating load in
-	 * parallel.
+	 * Make sure cpufreq_governor_limits() isn't evaluating load or the
+	 * ondemand governor isn't updating the sampling rate in parallel.
 	 */
 	mutex_lock(&shared->timer_mutex);
-	delay = dbs_data->cdata->gov_dbs_timer(policy, eval_load);
+	delay = dbs_data->cdata->gov_dbs_timer(policy);
+	shared->sample_delay_ns = jiffies_to_nsecs(delay);
 	mutex_unlock(&shared->timer_mutex);
 
+	/*
+	 * If the atomic operation below is reordered with respect to the
+	 * sample delay modification, the utilization update handler may end
+	 * up using a stale sample delay value.
+	 */
+	smp_mb__before_atomic();
 	atomic_dec(&shared->skip_work);
+}
+
+static void dbs_irq_work(struct irq_work *irq_work)
+{
+	struct cpu_common_dbs_info *shared;
 
-	gov_add_timers(policy, delay);
+	shared = container_of(irq_work, struct cpu_common_dbs_info, irq_work);
+	schedule_work(&shared->work);
 }
 
-static void dbs_timer_handler(unsigned long data)
+static inline void gov_queue_irq_work(struct cpu_common_dbs_info *shared)
 {
-	struct cpu_dbs_info *cdbs = (struct cpu_dbs_info *)data;
+#ifdef CONFIG_SMP
+	irq_work_queue_on(&shared->irq_work, smp_processor_id());
+#else
+	irq_work_queue(&shared->irq_work);
+#endif
+}
+
+static void dbs_update_util_handler(struct update_util_data *data, u64 time,
+				    unsigned long util, unsigned long max)
+{
+	struct cpu_dbs_info *cdbs = container_of(data, struct cpu_dbs_info, update_util);
 	struct cpu_common_dbs_info *shared = cdbs->shared;
 
 	/*
-	 * Timer handler may not be allowed to queue the work at the moment,
-	 * because:
-	 * - Another timer handler has done that
-	 * - We are stopping the governor
-	 * - Or we are updating the sampling rate of the ondemand governor
+	 * The work may not be allowed to be queued up right now.
+	 * Possible reasons:
+	 * - Work has already been queued up or is in progress.
+	 * - The governor is being stopped.
+	 * - It is too early (too little time from the previous sample).
 	 */
-	if (atomic_inc_return(&shared->skip_work) > 1)
-		atomic_dec(&shared->skip_work);
-	else
-		queue_work(system_wq, &shared->work);
+	if (atomic_inc_return(&shared->skip_work) == 1) {
+		u64 delta_ns;
+
+		delta_ns = time - shared->last_sample_time;
+		if ((s64)delta_ns >= shared->sample_delay_ns) {
+			shared->last_sample_time = time;
+			gov_queue_irq_work(shared);
+			return;
+		}
+	}
+	atomic_dec(&shared->skip_work);
 }
 
 static void set_sampling_rate(struct dbs_data *dbs_data,
@@ -315,6 +305,7 @@ static int alloc_common_dbs_info(struct cpufreq_policy *policy,
 
 	mutex_init(&shared->timer_mutex);
 	atomic_set(&shared->skip_work, 0);
+	init_irq_work(&shared->irq_work, dbs_irq_work);
 	INIT_WORK(&shared->work, dbs_work_handler);
 	return 0;
 }
@@ -467,9 +458,6 @@ static int cpufreq_governor_start(struct cpufreq_policy *policy,
 		io_busy = od_tuners->io_is_busy;
 	}
 
-	shared->policy = policy;
-	shared->time_stamp = ktime_get();
-
 	for_each_cpu(j, policy->cpus) {
 		struct cpu_dbs_info *j_cdbs = cdata->get_cpu_cdbs(j);
 		unsigned int prev_load;
@@ -485,10 +473,9 @@ static int cpufreq_governor_start(struct cpufreq_policy *policy,
 		if (ignore_nice)
 			j_cdbs->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE];
 
-		__setup_timer(&j_cdbs->timer, dbs_timer_handler,
-			      (unsigned long)j_cdbs,
-			      TIMER_DEFERRABLE | TIMER_IRQSAFE);
+		j_cdbs->update_util.func = dbs_update_util_handler;
 	}
+	shared->policy = policy;
 
 	if (cdata->governor == GOV_CONSERVATIVE) {
 		struct cs_cpu_dbs_info_s *cs_dbs_info =
@@ -505,7 +492,7 @@ static int cpufreq_governor_start(struct cpufreq_policy *policy,
 		od_ops->powersave_bias_init_cpu(cpu);
 	}
 
-	gov_add_timers(policy, delay_for_sampling_rate(sampling_rate));
+	gov_set_update_util(shared, sampling_rate);
 	return 0;
 }
 
diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h
index 91e767a058a7..541777192dbc 100644
--- a/drivers/cpufreq/cpufreq_governor.h
+++ b/drivers/cpufreq/cpufreq_governor.h
@@ -18,6 +18,7 @@
 #define _CPUFREQ_GOVERNOR_H
 
 #include <linux/atomic.h>
+#include <linux/irq_work.h>
 #include <linux/cpufreq.h>
 #include <linux/kernel_stat.h>
 #include <linux/module.h>
@@ -138,11 +139,19 @@ struct cpu_common_dbs_info {
 	 */
 	struct mutex timer_mutex;
 
-	ktime_t time_stamp;
+	u64 last_sample_time;
+	s64 sample_delay_ns;
 	atomic_t skip_work;
+	struct irq_work irq_work;
 	struct work_struct work;
 };
 
+static inline void gov_update_sample_delay(struct cpu_common_dbs_info *shared,
+					   unsigned int delay_us)
+{
+	shared->sample_delay_ns = delay_us * NSEC_PER_USEC;
+}
+
 /* Per cpu structures */
 struct cpu_dbs_info {
 	u64 prev_cpu_idle;
@@ -155,7 +164,7 @@ struct cpu_dbs_info {
 	 * wake-up from idle.
 	 */
 	unsigned int prev_load;
-	struct timer_list timer;
+	struct update_util_data update_util;
 	struct cpu_common_dbs_info *shared;
 };
 
@@ -212,8 +221,7 @@ struct common_dbs_data {
 
 	struct cpu_dbs_info *(*get_cpu_cdbs)(int cpu);
 	void *(*get_cpu_dbs_info_s)(int cpu);
-	unsigned int (*gov_dbs_timer)(struct cpufreq_policy *policy,
-				      bool modify_all);
+	unsigned int (*gov_dbs_timer)(struct cpufreq_policy *policy);
 	void (*gov_check_cpu)(int cpu, unsigned int load);
 	int (*init)(struct dbs_data *dbs_data, bool notify);
 	void (*exit)(struct dbs_data *dbs_data, bool notify);
@@ -270,9 +278,6 @@ static ssize_t show_sampling_rate_min_gov_pol				\
 }
 
 extern struct mutex cpufreq_governor_lock;
-
-void gov_add_timers(struct cpufreq_policy *policy, unsigned int delay);
-void gov_cancel_work(struct cpu_common_dbs_info *shared);
 void dbs_check_cpu(struct dbs_data *dbs_data, int cpu);
 int cpufreq_governor_dbs(struct cpufreq_policy *policy,
 		struct common_dbs_data *cdata, unsigned int event);
diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index 929e193ac1c1..da7f3514d948 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -189,7 +189,7 @@ static void od_check_cpu(int cpu, unsigned int load)
 	}
 }
 
-static unsigned int od_dbs_timer(struct cpufreq_policy *policy, bool modify_all)
+static unsigned int od_dbs_timer(struct cpufreq_policy *policy)
 {
 	struct dbs_data *dbs_data = policy->governor_data;
 	unsigned int cpu = policy->cpu;
@@ -198,9 +198,6 @@ static unsigned int od_dbs_timer(struct cpufreq_policy *policy, bool modify_all)
 	struct od_dbs_tuners *od_tuners = dbs_data->tuners;
 	int delay = 0, sample_type = dbs_info->sample_type;
 
-	if (!modify_all)
-		goto max_delay;
-
 	/* Common NORMAL_SAMPLE setup */
 	dbs_info->sample_type = OD_NORMAL_SAMPLE;
 	if (sample_type == OD_SUB_SAMPLE) {
@@ -216,7 +213,6 @@ static unsigned int od_dbs_timer(struct cpufreq_policy *policy, bool modify_all)
 		}
 	}
 
-max_delay:
 	if (!delay)
 		delay = delay_for_sampling_rate(od_tuners->sampling_rate
 				* dbs_info->rate_mult);
@@ -262,7 +258,6 @@ static void update_sampling_rate(struct dbs_data *dbs_data,
 		struct od_cpu_dbs_info_s *dbs_info;
 		struct cpu_dbs_info *cdbs;
 		struct cpu_common_dbs_info *shared;
-		unsigned long next_sampling, appointed_at;
 
 		dbs_info = &per_cpu(od_cpu_dbs_info, cpu);
 		cdbs = &dbs_info->cdbs;
@@ -286,20 +281,28 @@ static void update_sampling_rate(struct dbs_data *dbs_data,
 		 * policy will be governed by dbs_data, otherwise there can be
 		 * multiple policies that are governed by the same dbs_data.
 		 */
-		if (dbs_data != policy->governor_data)
-			continue;
-
-		/*
-		 * Checking this for any CPU should be fine, timers for all of
-		 * them are scheduled together.
-		 */
-		next_sampling = jiffies + usecs_to_jiffies(new_rate);
-		appointed_at = dbs_info->cdbs.timer.expires;
-
-		if (time_before(next_sampling, appointed_at)) {
-			gov_cancel_work(shared);
-			gov_add_timers(policy, usecs_to_jiffies(new_rate));
-
+		if (dbs_data == policy->governor_data) {
+			mutex_lock(&shared->timer_mutex);
+			/*
+			 * On 32-bit architectures this may race with the
+			 * sample_delay_ns read in dbs_update_util_handler(),
+			 * but that really doesn't matter.  If the read returns
+			 * a value that's too big, the sample will be skipped,
+			 * but the next invocation of dbs_update_util_handler()
+			 * (when the update has been completed) will take a
+			 * sample.  If the returned value is too small, the
+			 * sample will be taken immediately, but that isn't a
+			 * problem, as we want the new rate to take effect
+			 * immediately anyway.
+			 *
+			 * If this runs in parallel with dbs_work_handler(), we
+			 * may end up overwriting the sample_delay_ns value that
+			 * it has just written, but the difference should not be
+			 * too big and it will be corrected next time a sample
+			 * is taken, so it shouldn't be significant.
+			 */
+			gov_update_sample_delay(shared, new_rate);
+			mutex_unlock(&shared->timer_mutex);
 		}
 	}