cpufreq: governor: Replace timers with utilization update callbacks

Instead of using a per-CPU deferrable timer for queuing up governor
work items, register a utilization update callback that will be
invoked from the scheduler on utilization changes.

The sampling rate is still the same as the one used for the
deferrable timers, and the added irq_work overhead should be offset by
the elimination of the timer overhead, so in theory the functional
impact of this patch should not be significant.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Tested-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index 929e193..da7f351 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -189,7 +189,7 @@
 	}
 }
 
-static unsigned int od_dbs_timer(struct cpufreq_policy *policy, bool modify_all)
+static unsigned int od_dbs_timer(struct cpufreq_policy *policy)
 {
 	struct dbs_data *dbs_data = policy->governor_data;
 	unsigned int cpu = policy->cpu;
@@ -198,9 +198,6 @@
 	struct od_dbs_tuners *od_tuners = dbs_data->tuners;
 	int delay = 0, sample_type = dbs_info->sample_type;
 
-	if (!modify_all)
-		goto max_delay;
-
 	/* Common NORMAL_SAMPLE setup */
 	dbs_info->sample_type = OD_NORMAL_SAMPLE;
 	if (sample_type == OD_SUB_SAMPLE) {
@@ -216,7 +213,6 @@
 		}
 	}
 
-max_delay:
 	if (!delay)
 		delay = delay_for_sampling_rate(od_tuners->sampling_rate
 				* dbs_info->rate_mult);
@@ -262,7 +258,6 @@
 		struct od_cpu_dbs_info_s *dbs_info;
 		struct cpu_dbs_info *cdbs;
 		struct cpu_common_dbs_info *shared;
-		unsigned long next_sampling, appointed_at;
 
 		dbs_info = &per_cpu(od_cpu_dbs_info, cpu);
 		cdbs = &dbs_info->cdbs;
@@ -286,20 +281,28 @@
 		 * policy will be governed by dbs_data, otherwise there can be
 		 * multiple policies that are governed by the same dbs_data.
 		 */
-		if (dbs_data != policy->governor_data)
-			continue;
-
-		/*
-		 * Checking this for any CPU should be fine, timers for all of
-		 * them are scheduled together.
-		 */
-		next_sampling = jiffies + usecs_to_jiffies(new_rate);
-		appointed_at = dbs_info->cdbs.timer.expires;
-
-		if (time_before(next_sampling, appointed_at)) {
-			gov_cancel_work(shared);
-			gov_add_timers(policy, usecs_to_jiffies(new_rate));
-
+		if (dbs_data == policy->governor_data) {
+			mutex_lock(&shared->timer_mutex);
+			/*
+			 * On 32-bit architectures this may race with the
+			 * sample_delay_ns read in dbs_update_util_handler(),
+			 * but that really doesn't matter.  If the read returns
+			 * a value that's too big, the sample will be skipped,
+			 * but the next invocation of dbs_update_util_handler()
+			 * (when the update has been completed) will take a
+			 * sample.  If the returned value is too small, the
+			 * sample will be taken immediately, but that isn't a
+			 * problem, as we want the new rate to take effect
+			 * immediately anyway.
+			 *
+			 * If this runs in parallel with dbs_work_handler(), we
+			 * may end up overwriting the sample_delay_ns value that
+			 * it has just written, but the difference should not be
+			 * too big and it will be corrected next time a sample
+			 * is taken, so it shouldn't be significant.
+			 */
+			gov_update_sample_delay(shared, new_rate);
+			mutex_unlock(&shared->timer_mutex);
 		}
 	}