We now have a NOHZ kick to avoid the load of idle CPUs becoming stale. This is good, but it brings about CPU wakeups, which have an energy cost. As an alternative to waking CPUs up to decay blocked load, we can sometimes do it from newly idle balance. If the newly idle balance is on a domain that covers all the currently nohz-idle CPUs, we push the value of nohz.next_update into the future. That means that if such newly idle balances happen often enough, we never need to wake up a CPU just to update load.
Since we're doing this new update inside a for_each_domain, we need to do something to avoid doing multiple updates on the same CPU in the same idle_balance. A tick stamp is set on the rq in update_blocked_averages as a simple way to do this. Using a simple jiffies-based timestamp, as opposed to the last_update_time of the root cfs_rq's sched_avg, means we can do this without taking the rq lock.
Cc: Dietmar Eggemann dietmar.eggemann@arm.com Cc: Vincent Guittot vincent.guittot@linaro.org Cc: Ingo Molnar mingo@redhat.com Cc: Morten Rasmussen morten.rasmussen@arm.com Cc: Peter Zijlstra peterz@infradead.org Signed-off-by: Brendan Jackman brendan.jackman@arm.com --- kernel/sched/core.c | 1 + kernel/sched/fair.c | 40 ++++++++++++++++++++++++++++++++++------ kernel/sched/sched.h | 1 + 3 files changed, 36 insertions(+), 6 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d17c5da523a0..d8e71fd27806 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5923,6 +5923,7 @@ void __init sched_init(void) rq_attach_root(rq, &def_root_domain); #ifdef CONFIG_NO_HZ_COMMON rq->last_load_update_tick = jiffies; + rq->last_blocked_load_update_tick = jiffies; rq->nohz_flags = 0; #endif #ifdef CONFIG_NO_HZ_FULL diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9085caf49c76..0eacac05b834 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7062,6 +7062,7 @@ static void update_blocked_averages(int cpu) if (cfs_rq_is_decayed(cfs_rq)) list_del_leaf_cfs_rq(cfs_rq); } + rq->last_blocked_load_update_tick = jiffies; rq_unlock_irqrestore(rq, &rf); }
@@ -7121,6 +7122,7 @@ static inline void update_blocked_averages(int cpu) rq_lock_irqsave(rq, &rf); update_rq_clock(rq); update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); + rq->last_blocked_load_update_tick = jiffies; rq_unlock_irqrestore(rq, &rf); }
@@ -7615,6 +7617,15 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq) } #endif /* CONFIG_NUMA_BALANCING */
+#ifdef CONFIG_NO_HZ_COMMON +static struct { + cpumask_var_t idle_cpus_mask; + atomic_t nr_cpus; + unsigned long next_balance; /* in jiffy units */ + unsigned long next_update; /* in jiffy units */ +} nohz ____cacheline_aligned; +#endif + /** * update_sd_lb_stats - Update sched_domain's statistics for load balancing. * @env: The load balancing environment. @@ -7633,6 +7644,29 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd if (child && child->flags & SD_PREFER_SIBLING) prefer_sibling = 1;
+#ifdef CONFIG_NO_HZ_COMMON + if (env->idle == CPU_NEWLY_IDLE) { + int cpu; + + /* Update the stats of NOHZ idle CPUs in the sd */ + for_each_cpu_and(cpu, sched_domain_span(env->sd), + nohz.idle_cpus_mask) { + struct rq *rq = cpu_rq(cpu); + + /* ... Unless we've already done since the last tick */ + if (jiffies > rq->last_blocked_load_update_tick) + update_blocked_averages(cpu); + } + } + /* + * If we've just updated all of the NOHZ idle CPUs, then we can push + * back the next nohz.next_update, which will prevent an unnecessary + * wakeup for the nohz stats kick + */ + if (cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) + nohz.next_update = jiffies + LOAD_AVG_PERIOD; +#endif + load_idx = get_sd_load_idx(env->sd, env->idle);
do { @@ -8657,12 +8691,6 @@ static inline int on_null_domain(struct rq *rq) * needed, they will kick the idle load balancer, which then does idle * load balancing for all the idle CPUs. */ -static struct { - cpumask_var_t idle_cpus_mask; - atomic_t nr_cpus; - unsigned long next_balance; /* in jiffy units */ - unsigned long next_update; /* in jiffy units */ -} nohz ____cacheline_aligned;
static inline int find_new_ilb(void) { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6f95ef653f73..6be8938bb977 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -681,6 +681,7 @@ struct rq { #ifdef CONFIG_NO_HZ_COMMON #ifdef CONFIG_SMP unsigned long last_load_update_tick; + unsigned long last_blocked_load_update_tick; #endif /* CONFIG_SMP */ unsigned long nohz_flags; #endif /* CONFIG_NO_HZ_COMMON */ -- 2.14.1