Hi Thomas,

This was discussed earlier here (well, not much :)):

https://lkml.org/lkml/2012/11/6/160

I am floating the idea again to get more attention on this patch. This is just
an idea for now and hasn't seen much testing.
Migration of timers from idle cores to non-idle ones for power saving works
well and saves a lot of power for us. What's currently not working is the
migration of running timers, or of timers which re-arm themselves.

There are complications with migrating a timer which schedules itself again
from its own handler: del_timer_sync() can't detect that the timer's handler
has not yet finished.
__mod_timer() migrates the timer with the following code:
...
spin_lock(&old_base->lock);
...
timer_set_base(timer, NULL);
spin_unlock(&old_base->lock); ->A
spin_lock(&new_base->lock); ->B
timer_set_base(timer, new_base);
...
After the unlock at point A, old_base->running_timer may get updated to the
next timer in the queue. After the lock at point B, lock_timer_base() will
return new_base, where a different timer might be the running timer at that
point in time.
Meanwhile, del_timer_sync() depends on the check below to decide whether a
timer's handler is currently running:
if (base->running_timer != timer)
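For context, del_timer_sync() essentially keeps retrying that check via
try_to_del_timer_sync() until the timer is seen as not running. Roughly (a
simplified sketch of the kernel/timer.c loop, lockdep annotations omitted):

int del_timer_sync(struct timer_list *timer)
{
	for (;;) {
		int ret = try_to_del_timer_sync(timer);
		if (ret >= 0)
			return ret;	/* 1: deactivated, 0: was not pending */
		cpu_relax();		/* < 0: handler still running, retry */
	}
}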
So there is a window where the timer's handler is running but the timer is not
marked as the running timer of any base (only its current base, i.e. new_base,
matters here). del_timer_sync() wouldn't know this and would go ahead and
delete the timer while its handler is still running.
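To spell the race out, one possible interleaving looks like this (a sketch;
the handler re-arms the timer on CPU0 while CPU1 runs del_timer_sync()):

CPU0 (handler running)                  CPU1 (del_timer_sync)
----------------------                  ---------------------
__run_timers():
  base->running_timer = timer;
  call_timer_fn(timer):
    handler re-arms via mod_timer():
      timer_set_base(timer, NULL);
      spin_unlock(&old_base->lock);     lock_timer_base() spins while
                                        timer->base is NULL ...
      spin_lock(&new_base->lock);
      timer_set_base(timer, new_base);  lock_timer_base() returns new_base;
                                        new_base->running_timer != timer,
                                        so the timer is detached although
                                        its handler still runs on CPU0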
The new approach marks such timers with a wait_for_migration_to_complete flag
(it can be named better, and some memory can be saved, as PeterZ suggested),
which del_timer_sync() then uses to see whether the timer is currently
migrating and therefore isn't marked as the running_timer of its base.
Benefits: obviously, saving power for a core which is otherwise interrupted
again and again in its idle loop by this timer event. Those interruptions also
prevent the core from entering deeper idle states where possible.
Please share your feedback on this approach.
Signed-off-by: Viresh Kumar <viresh.kumar(a)linaro.org>
---
include/linux/timer.h | 1 +
kernel/timer.c | 29 +++++++++++++----------------
2 files changed, 14 insertions(+), 16 deletions(-)
diff --git a/include/linux/timer.h b/include/linux/timer.h
index 8c5a197..ad00ebe 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -20,6 +20,7 @@ struct timer_list {
void (*function)(unsigned long);
unsigned long data;
+ int wait_for_migration_to_complete;
int slack;
diff --git a/kernel/timer.c b/kernel/timer.c
index 6582b82..30a93e6 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -748,21 +748,15 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
new_base = per_cpu(tvec_bases, cpu);
if (base != new_base) {
- /*
- * We are trying to schedule the timer on the local CPU.
- * However we can't change timer's base while it is running,
- * otherwise del_timer_sync() can't detect that the timer's
- * handler yet has not finished. This also guarantees that
- * the timer is serialized wrt itself.
- */
- if (likely(base->running_timer != timer)) {
- /* See the comment in lock_timer_base() */
- timer_set_base(timer, NULL);
- spin_unlock(&base->lock);
- base = new_base;
- spin_lock(&base->lock);
- timer_set_base(timer, base);
- }
+ if (base->running_timer == timer)
+ timer->wait_for_migration_to_complete = 1;
+
+ /* See the comment in lock_timer_base() */
+ timer_set_base(timer, NULL);
+ spin_unlock(&base->lock);
+ base = new_base;
+ spin_lock(&base->lock);
+ timer_set_base(timer, base);
}
timer->expires = expires;
@@ -992,7 +986,8 @@ int try_to_del_timer_sync(struct timer_list *timer)
base = lock_timer_base(timer, &flags);
- if (base->running_timer != timer) {
+ if ((base->running_timer != timer) &&
+ !timer->wait_for_migration_to_complete) {
timer_stats_timer_clear_start_info(timer);
ret = detach_if_pending(timer, base, true);
}
@@ -1185,6 +1180,8 @@ static inline void __run_timers(struct tvec_base *base)
call_timer_fn(timer, fn, data);
spin_lock_irq(&base->lock);
}
+ if (timer->wait_for_migration_to_complete)
+ timer->wait_for_migration_to_complete = 0;
}
}
base->running_timer = NULL;
--
1.7.12.rc2.18.g61b472e
commit 46a310b ("[CPUFREQ] Don't set stat->last_index to -1 if the pol->cur
has incorrect value.") tries to handle the case where policy->cur does not
match any entry in freq_table.
As indicated in that commit, the exact-match search in freq_table_get_index()
returns -1, which is stored in stat->last_index. However, as a result,
cpufreq_stat_notifier_trans(), which updates the statistics, fails to record
any *further* valid transitions that take place, because stat->last_index
stays at -1: the condition occurs at boot and is never resolved.
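In other words, the notifier currently does roughly this (simplified from
cpufreq_stat_notifier_trans(); the hunk removed below shows the real lines):

	old_index = stat->last_index;
	new_index = freq_table_get_index(stat, freq->new);

	/* We can't do stat->time_in_state[-1] = ..., so bail out. But once
	 * last_index is -1 (policy->cur matched nothing at boot), this early
	 * return is taken on every transition and the stats never recover.
	 */
	if (old_index == -1 || new_index == -1)
		return 0;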
To fix this issue, let's create another entry in time_in_state and trans_table
that tells the user the CPU was running at an unknown frequency for some time.

This is how the output looks on my ThinkPad (some entries removed to keep it
simple):
$ cat /sys/devices/system/cpu/cpu0/cpufreq/stats/time_in_state
2800000 46
2600000 138
1200000 65
1000000 152
800000 34803
unknown 0
$ cat /sys/devices/system/cpu/cpu0/cpufreq/stats/trans_table
   From  :    To
         :   2801000   2800000   2600000   2400000   2200000   2000000   unknown
  2801000:         0        15        20         9        13        17         0
  2800000:        13         0         4         1         0         1         0
  2600000:        26         1         0         5         1         1         0
  2400000:        11         0         6         0         1         1         0
  2200000:         8         1         5         3         0         0         0
  2000000:        11         1         2         1         2         0         0
  unknown:         0         0         0         0         0         0         0
Reported-by: Carlos Hernandez <ceh(a)ti.com>
Reported-and-tested-by: Nishanth Menon <nm(a)ti.com>
Signed-off-by: Viresh Kumar <viresh.kumar(a)linaro.org>
---
drivers/cpufreq/cpufreq_stats.c | 45 ++++++++++++++++++++++++++++-------------
1 file changed, 31 insertions(+), 14 deletions(-)
diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c
index 4cf0d28..ebb21cd 100644
--- a/drivers/cpufreq/cpufreq_stats.c
+++ b/drivers/cpufreq/cpufreq_stats.c
@@ -72,9 +72,13 @@ static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf)
return 0;
cpufreq_stats_update(stat->cpu);
for (i = 0; i < stat->state_num; i++) {
- len += sprintf(buf + len, "%u %llu\n", stat->freq_table[i],
- (unsigned long long)
- jiffies_64_to_clock_t(stat->time_in_state[i]));
+ if (stat->freq_table[i] == -1)
+ len += sprintf(buf + len, "unknown");
+ else
+ len += sprintf(buf + len, "%u", stat->freq_table[i]);
+
+ len += sprintf(buf + len, " %llu\n", (unsigned long long)
+ jiffies_64_to_clock_t(stat->time_in_state[i]));
}
return len;
}
@@ -94,8 +98,12 @@ static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf)
for (i = 0; i < stat->state_num; i++) {
if (len >= PAGE_SIZE)
break;
- len += snprintf(buf + len, PAGE_SIZE - len, "%9u ",
- stat->freq_table[i]);
+ if (stat->freq_table[i] == -1)
+ len += snprintf(buf + len, PAGE_SIZE - len, "%9s ",
+ "unknown");
+ else
+ len += snprintf(buf + len, PAGE_SIZE - len, "%9u ",
+ stat->freq_table[i]);
}
if (len >= PAGE_SIZE)
return PAGE_SIZE;
@@ -106,8 +114,12 @@ static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf)
if (len >= PAGE_SIZE)
break;
- len += snprintf(buf + len, PAGE_SIZE - len, "%9u: ",
- stat->freq_table[i]);
+ if (stat->freq_table[i] == -1)
+ len += snprintf(buf + len, PAGE_SIZE - len, "%9s: ",
+ "unknown");
+ else
+ len += snprintf(buf + len, PAGE_SIZE - len, "%9u: ",
+ stat->freq_table[i]);
for (j = 0; j < stat->state_num; j++) {
if (len >= PAGE_SIZE)
@@ -145,10 +157,12 @@ static struct attribute_group stats_attr_group = {
static int freq_table_get_index(struct cpufreq_stats *stat, unsigned int freq)
{
int index;
- for (index = 0; index < stat->max_state; index++)
+ for (index = 0; index < stat->max_state - 1; index++)
if (stat->freq_table[index] == freq)
return index;
- return -1;
+
+ /* Last state is INVALID, to mark out of table frequency */
+ return stat->max_state - 1;
}
/* should be called late in the CPU removal sequence so that the stats
@@ -222,6 +236,9 @@ static int cpufreq_stats_create_table(struct cpufreq_policy *policy,
count++;
}
+ /* An extra entry for Invalid frequencies */
+ count++;
+
alloc_size = count * sizeof(int) + count * sizeof(u64);
#ifdef CONFIG_CPU_FREQ_STAT_DETAILS
@@ -243,9 +260,13 @@ static int cpufreq_stats_create_table(struct cpufreq_policy *policy,
unsigned int freq = table[i].frequency;
if (freq == CPUFREQ_ENTRY_INVALID)
continue;
- if (freq_table_get_index(stat, freq) == -1)
+ if (freq_table_get_index(stat, freq) == stat->max_state - 1)
stat->freq_table[j++] = freq;
}
+
+ /* Mark Invalid freq as max value to indicate Invalid freq */
+ stat->freq_table[j++] = -1;
+
stat->state_num = j;
spin_lock(&cpufreq_stats_lock);
stat->last_time = get_jiffies_64();
@@ -315,10 +336,6 @@ static int cpufreq_stat_notifier_trans(struct notifier_block *nb,
old_index = stat->last_index;
new_index = freq_table_get_index(stat, freq->new);
- /* We can't do stat->time_in_state[-1]= .. */
- if (old_index == -1 || new_index == -1)
- return 0;
-
cpufreq_stats_update(freq->cpu);
if (old_index == new_index)
--
1.7.12.rc2.18.g61b472e