On Wed, 22 Apr 2015, Eric Dumazet wrote:
On Wed, 2015-04-22 at 20:56 +0200, Thomas Gleixner wrote:
On Wed, 22 Apr 2015, Eric Dumazet wrote:
Check commit 4a8e320c929991c9480 ("net: sched: use pinned timers") for a specific example of the problems that can be raised.
If you have a problem with the core timer code then it should be fixed there and not worked around in some place which will ruin stuff for power saving interested users. I'm so tired of this 'I fix it in my sandbox' attitude, really. If the core code has a shortcoming we fix it there right away because you are probably not the only one who runs into that shortcoming. So if we don't fix it in the core we end up with a metric ton of slightly different (or broken) workarounds which affect the workload/system characteristics of other people.
Just for the record. Even the changelog of this commit is blatantly wrong:
"We can see that timers get migrated into a single cpu, presumably idle at the time timers are set up."
Spare me the obvious typo. A 'not' is missing.
:)
The timer migration moves timers to non-idle cpus to leave the idle ones alone for power saving's sake.
I can see and understand the reason why you want to avoid that, but I have to ask the question whether this pinning is the correct behaviour under all workloads and system characteristics. If yes, then the patch is the right answer, if no, then it is simply the wrong approach.
I take your 'Awesome' below as a no then.
but /proc/sys/kernel/timer_migration adds a fair overhead in many workloads.
get_nohz_timer_target() has to touch 3 cache lines per cpu...
And this is something we can fix and completely avoid if we think about it. Looking at the code I have to admit that the out of line call and the sysctl variable lookup is silly. But it's not rocket science to fix this.
Awesome.
Here you go. Completely untested, at least it compiles.
Thanks,
tglx ---
Index: linux/include/linux/sched.h =================================================================== --- linux.orig/include/linux/sched.h +++ linux/include/linux/sched.h @@ -343,14 +343,10 @@ extern int runqueue_is_locked(int cpu); #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) extern void nohz_balance_enter_idle(int cpu); extern void set_cpu_sd_state_idle(void); -extern int get_nohz_timer_target(int pinned); +extern int get_nohz_timer_target(void); #else static inline void nohz_balance_enter_idle(int cpu) { } static inline void set_cpu_sd_state_idle(void) { } -static inline int get_nohz_timer_target(int pinned) -{ - return smp_processor_id(); -} #endif
/* Index: linux/include/linux/sched/sysctl.h =================================================================== --- linux.orig/include/linux/sched/sysctl.h +++ linux/include/linux/sched/sysctl.h @@ -57,24 +57,12 @@ extern unsigned int sysctl_numa_balancin extern unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_nr_migrate; extern unsigned int sysctl_sched_time_avg; -extern unsigned int sysctl_timer_migration; extern unsigned int sysctl_sched_shares_window;
int sched_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos); #endif -#ifdef CONFIG_SCHED_DEBUG -static inline unsigned int get_sysctl_timer_migration(void) -{ - return sysctl_timer_migration; -} -#else -static inline unsigned int get_sysctl_timer_migration(void) -{ - return 1; -} -#endif
/* * control realtime throttling: Index: linux/include/linux/timer.h =================================================================== --- linux.orig/include/linux/timer.h +++ linux/include/linux/timer.h @@ -254,6 +254,16 @@ extern void run_local_timers(void); struct hrtimer; extern enum hrtimer_restart it_real_fn(struct hrtimer *);
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +#include <linux/sysctl.h> + +extern unsigned int sysctl_timer_migration; +int timer_migration_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); +extern struct static_key timer_migration_enabled; +#endif + unsigned long __round_jiffies(unsigned long j, int cpu); unsigned long __round_jiffies_relative(unsigned long j, int cpu); unsigned long round_jiffies(unsigned long j); Index: linux/kernel/sched/core.c =================================================================== --- linux.orig/kernel/sched/core.c +++ linux/kernel/sched/core.c @@ -593,13 +593,12 @@ void resched_cpu(int cpu) * selecting an idle cpu will add more delays to the timers than intended * (as that cpu's timer base may not be uptodate wrt jiffies etc). */ -int get_nohz_timer_target(int pinned) +int get_nohz_timer_target(void) { - int cpu = smp_processor_id(); - int i; + int i, cpu = smp_processor_id(); struct sched_domain *sd;
- if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu)) + if (!idle_cpu(cpu)) return cpu;
rcu_read_lock(); @@ -7088,8 +7087,6 @@ void __init sched_init_smp(void) } #endif /* CONFIG_SMP */
-const_debug unsigned int sysctl_timer_migration = 1; - int in_sched_functions(unsigned long addr) { return in_lock_functions(addr) || Index: linux/kernel/sysctl.c =================================================================== --- linux.orig/kernel/sysctl.c +++ linux/kernel/sysctl.c @@ -349,15 +349,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "timer_migration", - .data = &sysctl_timer_migration, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, - }, #endif /* CONFIG_SMP */ #ifdef CONFIG_NUMA_BALANCING { @@ -1132,6 +1123,15 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) + { + .procname = "timer_migration", + .data = &sysctl_timer_migration, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = timer_migration_handler, + }, +#endif { } };
Index: linux/kernel/time/hrtimer.c =================================================================== --- linux.orig/kernel/time/hrtimer.c +++ linux/kernel/time/hrtimer.c @@ -190,6 +190,23 @@ hrtimer_check_target(struct hrtimer *tim #endif }
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +static inline struct hrtimer_cpu_base *get_target_base(int pinned) +{ + if (pinned) + return this_cpu_ptr(&hrtimer_bases); + if (static_key_true(&timer_migration_enabled)) + return &per_cpu(hrtimer_bases, get_nohz_timer_target()); + return this_cpu_ptr(&hrtimer_bases); +} +#else + +static inline struct hrtimer_cpu_base *get_target_base(int pinned) +{ + return this_cpu_ptr(&hrtimer_bases); +} +#endif + /* * Switch the timer base to the current CPU when possible. */ @@ -197,14 +214,13 @@ static inline struct hrtimer_clock_base switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, int pinned) { + struct hrtimer_cpu_base *new_cpu_base, *this_base; struct hrtimer_clock_base *new_base; - struct hrtimer_cpu_base *new_cpu_base; - int this_cpu = smp_processor_id(); - int cpu = get_nohz_timer_target(pinned); int basenum = base->index;
+ this_base = this_cpu_ptr(&hrtimer_bases); + new_cpu_base = get_target_base(pinned); again: - new_cpu_base = &per_cpu(hrtimer_bases, cpu); new_base = &new_cpu_base->clock_base[basenum];
if (base != new_base) { @@ -225,17 +241,19 @@ again: raw_spin_unlock(&base->cpu_base->lock); raw_spin_lock(&new_base->cpu_base->lock);
- if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) { - cpu = this_cpu; + if (new_cpu_base != this_base && + hrtimer_check_target(timer, new_base)) { raw_spin_unlock(&new_base->cpu_base->lock); raw_spin_lock(&base->cpu_base->lock); + new_cpu_base = this_base; timer->base = base; goto again; } timer->base = new_base; } else { - if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) { - cpu = this_cpu; + if (new_cpu_base != this_base && + hrtimer_check_target(timer, new_base)) { + new_cpu_base = this_base; goto again; } } Index: linux/kernel/time/timer.c =================================================================== --- linux.orig/kernel/time/timer.c +++ linux/kernel/time/timer.c @@ -104,6 +104,49 @@ EXPORT_SYMBOL(boot_tvec_bases);
static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +unsigned int sysctl_timer_migration = 1; +struct static_key timer_migration_enabled = STATIC_KEY_INIT_TRUE; + +int timer_migration_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + static DEFINE_MUTEX(mutex); + bool keyon; + int ret; + + mutex_lock(&mutex); + ret = proc_dointvec(table, write, buffer, lenp, ppos); + if (ret || !write) + goto unlock; + + keyon = static_key_enabled(&timer_migration_enabled); + if (sysctl_timer_migration && !keyon ) + static_key_slow_inc(&timer_migration_enabled); + else if (!sysctl_timer_migration && keyon) + static_key_slow_dec(&timer_migration_enabled); + +unlock: + mutex_unlock(&mutex); + return ret; +} + +static inline struct tvec_base *get_target_base(int pinned) +{ + if (pinned) + return __this_cpu_read(tvec_bases); + if (static_key_true(&timer_migration_enabled)) + return per_cpu(tvec_bases, get_nohz_timer_target()); + return __this_cpu_read(tvec_bases); +} +#else +static inline struct tvec_base *get_target_base(int pinned) +{ + return __this_cpu_read(tvec_bases); +} +#endif + /* Functions below help us manage 'deferrable' flag */ static inline unsigned int tbase_get_deferrable(struct tvec_base *base) { @@ -774,7 +817,7 @@ __mod_timer(struct timer_list *timer, un { struct tvec_base *base, *new_base; unsigned long flags; - int ret = 0 , cpu; + int ret = 0;
timer_stats_timer_set_start_info(timer); BUG_ON(!timer->function); @@ -787,8 +830,7 @@ __mod_timer(struct timer_list *timer, un
debug_activate(timer, expires);
- cpu = get_nohz_timer_target(pinned); - new_base = per_cpu(tvec_bases, cpu); + new_base = get_target_base(pinned);
if (base != new_base) { /*