From: Nicolas Pitre nicolas.pitre@linaro.org
Relying on pcpu->governor_enabled alone to ensure proper shutdown of the governor activities is flawed. The following race is perfectly possible:
CPU0 |CPU2 ---- +---- |cpufreq_interactive_timer() cpufreq_governor_interactive(GOV_STOP) |if (!pcpu->governor_enabled) pcpu->governor_enabled = 0; | return; [untaken] smp_wmb(); |[...] del_timer_sync(&pcpu->cpu_timer); |cpufreq_interactive_timer_resched() |mod_timer_pinned(&pcpu->cpu_timer)
Result: timer is pending again. It is also possible for del_timer_sync() to race against the invocation of the idle notifier callback which also has the potential to reactivate the timer.
To fix this issue, a read-write semaphore is used. Multiple readers are allowed as long as pcpu->governor_enabled is true. However it can be moved to false only after taking a write semaphore which would wait for any on-going asynchronous activities to complete and prevent any more of those activities to be initiated.
Signed-off-by: Nicolas Pitre nicolas.pitre@linaro.org --- drivers/cpufreq/cpufreq_interactive.c | 38 ++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 10 deletions(-)
diff --git a/drivers/cpufreq/cpufreq_interactive.c b/drivers/cpufreq/cpufreq_interactive.c index c82d9fe..061af7f 100644 --- a/drivers/cpufreq/cpufreq_interactive.c +++ b/drivers/cpufreq/cpufreq_interactive.c @@ -21,14 +21,13 @@ #include <linux/cpufreq.h> #include <linux/module.h> #include <linux/moduleparam.h> -#include <linux/mutex.h> +#include <linux/rwsem.h> #include <linux/sched.h> #include <linux/tick.h> #include <linux/time.h> #include <linux/timer.h> #include <linux/workqueue.h> #include <linux/kthread.h> -#include <linux/mutex.h> #include <linux/slab.h> #include <asm/cputime.h>
@@ -50,6 +49,7 @@ struct cpufreq_interactive_cpuinfo { unsigned int floor_freq; u64 floor_validate_time; u64 hispeed_validate_time; + struct rw_semaphore mutex; int governor_enabled; };
@@ -138,8 +138,8 @@ static void cpufreq_interactive_timer(unsigned long data) unsigned int index; unsigned long flags;
- smp_rmb(); - + if (!down_read_trylock(&pcpu->mutex)) + return; if (!pcpu->governor_enabled) goto exit;
@@ -258,6 +258,7 @@ rearm: }
exit: + up_read(&pcpu->mutex); return; }
@@ -267,8 +268,12 @@ static void cpufreq_interactive_idle_start(void) &per_cpu(cpuinfo, smp_processor_id()); int pending;
- if (!pcpu->governor_enabled) + if (!down_read_trylock(&pcpu->mutex)) + return; + if (!pcpu->governor_enabled) { + up_read(&pcpu->mutex); return; + }
pending = timer_pending(&pcpu->cpu_timer);
@@ -298,6 +303,7 @@ static void cpufreq_interactive_idle_start(void) } }
+ up_read(&pcpu->mutex); }
static void cpufreq_interactive_idle_end(void) @@ -305,8 +311,12 @@ static void cpufreq_interactive_idle_end(void) struct cpufreq_interactive_cpuinfo *pcpu = &per_cpu(cpuinfo, smp_processor_id());
- if (!pcpu->governor_enabled) + if (!down_read_trylock(&pcpu->mutex)) + return; + if (!pcpu->governor_enabled) { + up_read(&pcpu->mutex); return; + }
/* Arm the timer for 1-2 ticks later if not already. */ if (!timer_pending(&pcpu->cpu_timer)) { @@ -317,6 +327,8 @@ static void cpufreq_interactive_idle_end(void) del_timer(&pcpu->cpu_timer); cpufreq_interactive_timer(smp_processor_id()); } + + up_read(&pcpu->mutex); }
static int cpufreq_interactive_speedchange_task(void *data) @@ -351,10 +363,12 @@ static int cpufreq_interactive_speedchange_task(void *data) unsigned int max_freq = 0;
pcpu = &per_cpu(cpuinfo, cpu); - smp_rmb(); - - if (!pcpu->governor_enabled) + if (!down_read_trylock(&pcpu->mutex)) + continue; + if (!pcpu->governor_enabled) { + up_read(&pcpu->mutex); continue; + }
for_each_cpu(j, pcpu->policy->cpus) { struct cpufreq_interactive_cpuinfo *pjcpu = @@ -371,6 +385,8 @@ static int cpufreq_interactive_speedchange_task(void *data) trace_cpufreq_interactive_setspeed(cpu, pcpu->target_freq, pcpu->policy->cur); + + up_read(&pcpu->mutex); } }
@@ -690,9 +706,10 @@ static int cpufreq_governor_interactive(struct cpufreq_policy *policy, case CPUFREQ_GOV_STOP: for_each_cpu(j, policy->cpus) { pcpu = &per_cpu(cpuinfo, j); + down_write(&pcpu->mutex); pcpu->governor_enabled = 0; - smp_wmb(); del_timer_sync(&pcpu->cpu_timer); + up_write(&pcpu->mutex); }
if (atomic_dec_return(&active_count) > 0) @@ -736,6 +753,7 @@ static int __init cpufreq_interactive_init(void) init_timer_deferrable(&pcpu->cpu_timer); pcpu->cpu_timer.function = cpufreq_interactive_timer; pcpu->cpu_timer.data = i; + init_rwsem(&pcpu->mutex); }
spin_lock_init(&speedchange_cpumask_lock);
Thanks, I'll merge this to android-3.4 soon.
Todd
On Thu, Dec 13, 2012 at 9:47 AM, Viresh Kumar viresh.kumar@linaro.org wrote:
From: Nicolas Pitre nicolas.pitre@linaro.org
Relying on pcpu->governor_enabled alone to ensure proper shutdown of the governor activities is flawed. The following race is perfectly possible:
CPU0 |CPU2 ---- +---- |cpufreq_interactive_timer() cpufreq_governor_interactive(GOV_STOP) |if (!pcpu->governor_enabled) pcpu->governor_enabled = 0; | return; [untaken] smp_wmb(); |[...] del_timer_sync(&pcpu->cpu_timer); |cpufreq_interactive_timer_resched() |mod_timer_pinned(&pcpu->cpu_timer)
Result: timer is pending again. It is also possible for del_timer_sync() to race against the invocation of the idle notifier callback which also has the potential to reactivate the timer.
To fix this issue, a read-write semaphore is used. Multiple readers are allowed as long as pcpu->governor_enabled is true. However it can be moved to false only after taking a write semaphore which would wait for any on-going asynchronous activities to complete and prevent any more of those activities to be initiated.
Signed-off-by: Nicolas Pitre nicolas.pitre@linaro.org
drivers/cpufreq/cpufreq_interactive.c | 38 ++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 10 deletions(-)
diff --git a/drivers/cpufreq/cpufreq_interactive.c b/drivers/cpufreq/cpufreq_interactive.c index c82d9fe..061af7f 100644 --- a/drivers/cpufreq/cpufreq_interactive.c +++ b/drivers/cpufreq/cpufreq_interactive.c @@ -21,14 +21,13 @@ #include <linux/cpufreq.h> #include <linux/module.h> #include <linux/moduleparam.h> -#include <linux/mutex.h> +#include <linux/rwsem.h> #include <linux/sched.h> #include <linux/tick.h> #include <linux/time.h> #include <linux/timer.h> #include <linux/workqueue.h> #include <linux/kthread.h> -#include <linux/mutex.h> #include <linux/slab.h> #include <asm/cputime.h>
@@ -50,6 +49,7 @@ struct cpufreq_interactive_cpuinfo { unsigned int floor_freq; u64 floor_validate_time; u64 hispeed_validate_time;
struct rw_semaphore mutex; int governor_enabled;
};
@@ -138,8 +138,8 @@ static void cpufreq_interactive_timer(unsigned long data) unsigned int index; unsigned long flags;
smp_rmb();
if (!down_read_trylock(&pcpu->mutex))
return; if (!pcpu->governor_enabled) goto exit;
@@ -258,6 +258,7 @@ rearm: }
exit:
up_read(&pcpu->mutex); return;
}
@@ -267,8 +268,12 @@ static void cpufreq_interactive_idle_start(void) &per_cpu(cpuinfo, smp_processor_id()); int pending;
if (!pcpu->governor_enabled)
if (!down_read_trylock(&pcpu->mutex))
return;
if (!pcpu->governor_enabled) {
up_read(&pcpu->mutex); return;
} pending = timer_pending(&pcpu->cpu_timer);
@@ -298,6 +303,7 @@ static void cpufreq_interactive_idle_start(void) } }
up_read(&pcpu->mutex);
}
static void cpufreq_interactive_idle_end(void) @@ -305,8 +311,12 @@ static void cpufreq_interactive_idle_end(void) struct cpufreq_interactive_cpuinfo *pcpu = &per_cpu(cpuinfo, smp_processor_id());
if (!pcpu->governor_enabled)
if (!down_read_trylock(&pcpu->mutex))
return;
if (!pcpu->governor_enabled) {
up_read(&pcpu->mutex); return;
} /* Arm the timer for 1-2 ticks later if not already. */ if (!timer_pending(&pcpu->cpu_timer)) {
@@ -317,6 +327,8 @@ static void cpufreq_interactive_idle_end(void) del_timer(&pcpu->cpu_timer); cpufreq_interactive_timer(smp_processor_id()); }
up_read(&pcpu->mutex);
}
static int cpufreq_interactive_speedchange_task(void *data) @@ -351,10 +363,12 @@ static int cpufreq_interactive_speedchange_task(void *data) unsigned int max_freq = 0;
pcpu = &per_cpu(cpuinfo, cpu);
smp_rmb();
if (!pcpu->governor_enabled)
if (!down_read_trylock(&pcpu->mutex))
continue;
if (!pcpu->governor_enabled) {
up_read(&pcpu->mutex); continue;
} for_each_cpu(j, pcpu->policy->cpus) { struct cpufreq_interactive_cpuinfo *pjcpu =
@@ -371,6 +385,8 @@ static int cpufreq_interactive_speedchange_task(void *data) trace_cpufreq_interactive_setspeed(cpu, pcpu->target_freq, pcpu->policy->cur);
up_read(&pcpu->mutex); } }
@@ -690,9 +706,10 @@ static int cpufreq_governor_interactive(struct cpufreq_policy *policy, case CPUFREQ_GOV_STOP: for_each_cpu(j, policy->cpus) { pcpu = &per_cpu(cpuinfo, j);
down_write(&pcpu->mutex); pcpu->governor_enabled = 0;
smp_wmb(); del_timer_sync(&pcpu->cpu_timer);
up_write(&pcpu->mutex); } if (atomic_dec_return(&active_count) > 0)
@@ -736,6 +753,7 @@ static int __init cpufreq_interactive_init(void) init_timer_deferrable(&pcpu->cpu_timer); pcpu->cpu_timer.function = cpufreq_interactive_timer; pcpu->cpu_timer.data = i;
init_rwsem(&pcpu->mutex); } spin_lock_init(&speedchange_cpumask_lock);
-- 1.7.12.rc2.18.g61b472e
I put this up on AOSP Gerrit at https://android-review.googlesource.com/#/c/48233/ with a minor change to the semaphore field name to be more descriptive and to identify it as a semaphore.
After some out-of-band discussion it was agreed the race with the idle notifier definitely exists, but the race described with the timer function is unclear. As I understand it, del_timer_sync() spins waiting for the timer to stop running (and possibly rescheduling itself) and then deletes the timer. When try_to_del_timer_sync() returns successfully "the timer is not queued and the handler is not running on any CPU". Is there still a race here anyway?
Todd
On Tue, 18 Dec 2012, Todd Poynor wrote:
I put this up on AOSP Gerrit at https://android-review.googlesource.com/#/c/48233/ with a minor change to the semaphore field name to be more descriptive and to identify it as a semaphore.
After some out-of-band discussion it was agreed the race with the idle notifier definitely exists, but the race described with the timer function is unclear. As I understand it, del_timer_sync() spins waiting for the timer to stop running (and possibly rescheduling itself) and then deletes the timer. When try_to_del_timer_sync() returns successfully "the timer is not queued and the handler is not running on any CPU". Is there still a race here anyway?
After re-reading the code, I do agree.
What confused me is this comment for del_timer_sync():
* Synchronization rules: Callers must prevent restarting of the timer, * otherwise this function is meaningless. It must not be called from * interrupt contexts unless the timer is an irqsafe one. The caller must * not hold locks which would prevent completion of the timer's * handler. The timer's handler must not call add_timer_on(). [...]
However, the timer handler is using mod_timer_pinned(), not add_timer_on().
I stumbled upon the idle notifier race first, and initially my understanding of the timer code wasn't very clear. I used the timer self scheduling scenario to illustrate the race instead of the idle notifier simply because it was simpler.
Nicolas
On Tue, Dec 18, 2012 at 8:46 PM, Nicolas Pitre nicolas.pitre@linaro.org wrote:
On Tue, 18 Dec 2012, Todd Poynor wrote:
I put this up on AOSP Gerrit at https://android-review.googlesource.com/#/c/48233/ with a minor change to the semaphore field name to be more descriptive and to identify it as a semaphore.
After some out-of-band discussion it was agreed the race with the idle notifier definitely exists, but the race described with the timer function is unclear. As I understand it, del_timer_sync() spins waiting for the timer to stop running (and possibly rescheduling itself) and then deletes the timer. When try_to_del_timer_sync() returns successfully "the timer is not queued and the handler is not running on any CPU". Is there still a race here anyway?
After re-reading the code, I do agree.
What confused me is this comment for del_timer_sync():
- Synchronization rules: Callers must prevent restarting of the timer,
- otherwise this function is meaningless. It must not be called from
- interrupt contexts unless the timer is an irqsafe one. The caller must
- not hold locks which would prevent completion of the timer's
- handler. The timer's handler must not call add_timer_on(). [...]
However, the timer handler is using mod_timer_pinned(), not add_timer_on().
I stumbled upon the idle notifier race first, and initially my understanding of the timer code wasn't very clear. I used the timer self scheduling scenario to illustrate the race instead of the idle notifier simply because it was simpler.
OK, I'll fix up the commit description.
Thanks -- Todd
Nicolas