Hi Thomas,
On Wednesday 15 April 2015 04:43 AM, Thomas Gleixner wrote:
No new flags in the timer base, no concurrent expiry, no extra conditionals in the expiry path. Just a single extra check at the end of the softirq handler for this rare condition instead of imposing all that nonsense to the hotpath.
Here is what I could get to based on your suggestions:
diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 2d3f5c504939..c412511d34b8 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -78,6 +78,8 @@ struct tvec_root { struct tvec_base { spinlock_t lock; struct timer_list *running_timer; + struct list_head migration_list; + struct tvec_base *preferred_target; unsigned long timer_jiffies; unsigned long next_timer; unsigned long active_timers; @@ -785,10 +787,18 @@ __mod_timer(struct timer_list *timer, unsigned long expires, * We are trying to schedule the timer on the local CPU. * However we can't change timer's base while it is running, * otherwise del_timer_sync() can't detect that the timer's - * handler yet has not finished. This also guarantees that - * the timer is serialized wrt itself. + * handler yet has not finished. + * + * Move timer to migration_list which can be processed after all + * timers in current cycle are serviced. This also guarantees + * that the timer is serialized wrt itself. */ - if (likely(base->running_timer != timer)) { + if (unlikely(base->running_timer == timer)) { + timer->expires = expires; + base->preferred_target = new_base; + list_add_tail(&timer->entry, &base->migration_list); + goto out_unlock; + } else { /* See the comment in lock_timer_base() */ timer_set_base(timer, NULL); spin_unlock(&base->lock); @@ -1214,6 +1224,33 @@ static inline void __run_timers(struct tvec_base *base) spin_lock_irq(&base->lock); } } + + /* + * Timer handler re-armed timer and we didn't wanted to add that + * on a idle local CPU. Its handler has finished now, lets + * enqueue it again. + */ + if (unlikely(!list_empty(&base->migration_list))) { + struct tvec_base *new_base = base->preferred_target; + + do { + timer = list_first_entry(&base->migration_list, + struct timer_list, entry); + + __list_del(timer->entry.prev, timer->entry.next); + + /* See the comment in lock_timer_base() */ + timer_set_base(timer, NULL); + spin_unlock(&base->lock); + + spin_lock(&new_base->lock); + timer_set_base(timer, new_base); + internal_add_timer(new_base, timer); + spin_unlock(&new_base->lock); + + spin_lock(&base->lock); + } while (!list_empty(&base->migration_list)); + } } base->running_timer = NULL; spin_unlock_irq(&base->lock); @@ -1583,6 +1620,7 @@ static int init_timers_cpu(int cpu) for (j = 0; j < TVR_SIZE; j++) INIT_LIST_HEAD(base->tv1.vec + j);
+ INIT_LIST_HEAD(&base->migration_list); base->timer_jiffies = jiffies; base->next_timer = base->timer_jiffies; base->active_timers = 0;
Does this look any better ?
I tested this on Exynos (Dual ARM Cortex-A9) board, with ubuntu over it. System was fairly idle and I did 'dmesg > /dev/null' on one of the CPUs in an infinite loop, to get CPUs out of idle.
I have got few more concerns/diffs over this to further optimize things, but kept them separate so that I can drop them if they aren't correct.
1.) Should the above list_empty(migration_list) block be added out of the
while (time_after_eq(jiffies, base->timer_jiffies))
block ? So that we check it only once per timer interrupt.
Also ->running_timer is set to the last serviced timer and a del_timer_sync() might be waiting to remove it. But we continue with the migration list first, without clearing it first. Not sure if this is important at all..
2.) By the time we finish serving all pending timers, local CPU might not be idle anymore OR the target CPU may become idle.
diff --git a/kernel/time/timer.c b/kernel/time/timer.c index c412511d34b8..d75d31e9dc49 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1233,6 +1233,14 @@ static inline void __run_timers(struct tvec_base *base) if (unlikely(!list_empty(&base->migration_list))) { struct tvec_base *new_base = base->preferred_target;
+ if (!idle_cpu(base->cpu)) { + /* Local CPU not idle anymore */ + new_base = base; + } else if (idle_cpu(new_base->cpu)) { + /* Re-evaluate base, target CPU has gone idle */ + new_base = per_cpu(tvec_bases, get_nohz_timer_target(false)); + } + do { timer = list_first_entry(&base->migration_list, struct timer_list, entry);
The first if block is getting hit a lot of times in my tests. But the second one has never been hit (Probably because of only two CPUs, not sure).
2.) It might be better to update preferred_target every time we choose a difference base. This may help us avoid calling get_nohz_timer_target() in the second if block above.
diff --git a/kernel/time/timer.c b/kernel/time/timer.c index d75d31e9dc49..558cd98abd87 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -783,6 +783,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, new_base = per_cpu(tvec_bases, cpu);
if (base != new_base) { + base->preferred_target = new_base; + /* * We are trying to schedule the timer on the local CPU. * However we can't change timer's base while it is running, @@ -795,7 +797,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires, */ if (unlikely(base->running_timer == timer)) { timer->expires = expires; - base->preferred_target = new_base; list_add_tail(&timer->entry, &base->migration_list); goto out_unlock; } else {
-- viresh