Hi,
The first patch fixes a potential regression we may observe on UP systems, and the others make minor optimizations in the scheduler core.

They are all tested on 32-bit (exynos) and 64-bit (hikey) ARM platforms.
--
viresh
Viresh Kumar (6):
  sched: fair: Call cpufreq update util handlers less frequently on UP
  sched: Reuse put_prev_task()
  sched: fair: Pass rq to weighted_cpuload()
  sched: fair: Avoid checking cfs_rq->nr_running twice
  sched: fair: Drop always true parameter of update_cfs_rq_load_avg()
  sched: cpufreq: Optimize cpufreq_update_this_cpu() a bit
 kernel/sched/core.c  |   2 +-
 kernel/sched/fair.c  | 107 ++++++++++++++++++++++++++------------------------
 kernel/sched/sched.h |   2 +-
 3 files changed, 56 insertions(+), 55 deletions(-)
For SMP systems, update_load_avg() calls the cpufreq update util handlers only for the top level cfs_rq (i.e. rq->cfs).
But that is not the case on UP systems, where update_load_avg() calls the util handler for whichever cfs_rq it is called with. This results in far too many calls from the scheduler to the cpufreq governors when CONFIG_FAIR_GROUP_SCHED is enabled.

Reduce the frequency of these calls by copying the behavior of the SMP case, i.e. only call the util handlers for the top level cfs_rq.
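The resulting UP path mirrors the SMP one; condensed from the diff below (kernel context assumed, comments trimmed):

  static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
  {
  	/* Only the top level cfs_rq of the local CPU pokes cpufreq. */
  	if (&this_rq()->cfs == cfs_rq)
  		cpufreq_update_util(rq_of(cfs_rq), 0);
  }

  /* !CONFIG_SMP stub of update_load_avg() after this change: */
  static inline void update_load_avg(struct sched_entity *se, int not_used1)
  {
  	cfs_rq_util_change(cfs_rq_of(se));
  }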
Fixes: 536bd00cdbb7 ("sched/fair: Fix !CONFIG_SMP kernel cpufreq governor breakage")
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
I wasn't sure whether we should mark this for stable or not.
Cc: 4.6+ <stable@vger.kernel.org> # 4.6+
---
 kernel/sched/fair.c | 48 ++++++++++++++++++++++++------------------------
 1 file changed, 24 insertions(+), 24 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 47a0c552c77b..a0a97497c400 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2728,6 +2728,29 @@ static inline void update_cfs_shares(struct sched_entity *se)
 }
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
+static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
+{
+	if (&this_rq()->cfs == cfs_rq) {
+		/*
+		 * There are a few boundary cases this might miss but it should
+		 * get called often enough that that should (hopefully) not be
+		 * a real problem -- added to that it only calls on the local
+		 * CPU, so if we enqueue remotely we'll miss an update, but
+		 * the next tick/schedule should update.
+		 *
+		 * It will not get called when we go idle, because the idle
+		 * thread is a different class (!fair), nor will the utilization
+		 * number include things like RT tasks.
+		 *
+		 * As is, the util number is not freq-invariant (we'd have to
+		 * implement arch_scale_freq_capacity() for that).
+		 *
+		 * See cpu_util().
+		 */
+		cpufreq_update_util(rq_of(cfs_rq), 0);
+	}
+}
+
 #ifdef CONFIG_SMP
 /*
  * Approximate:
@@ -3215,29 +3238,6 @@ static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
 
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
-static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
-{
-	if (&this_rq()->cfs == cfs_rq) {
-		/*
-		 * There are a few boundary cases this might miss but it should
-		 * get called often enough that that should (hopefully) not be
-		 * a real problem -- added to that it only calls on the local
-		 * CPU, so if we enqueue remotely we'll miss an update, but
-		 * the next tick/schedule should update.
-		 *
-		 * It will not get called when we go idle, because the idle
-		 * thread is a different class (!fair), nor will the utilization
-		 * number include things like RT tasks.
-		 *
-		 * As is, the util number is not freq-invariant (we'd have to
-		 * implement arch_scale_freq_capacity() for that).
-		 *
-		 * See cpu_util().
-		 */
-		cpufreq_update_util(rq_of(cfs_rq), 0);
-	}
-}
-
 /*
  * Unsigned subtract and clamp on underflow.
  *
@@ -3483,7 +3483,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
 
 static inline void update_load_avg(struct sched_entity *se, int not_used1)
 {
-	cpufreq_update_util(rq_of(cfs_rq_of(se)), 0);
+	cfs_rq_util_change(cfs_rq_of(se));
 }
 
 static inline void
Reuse put_prev_task() instead of copying its implementation.
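For reference, put_prev_task() in kernel/sched/sched.h is roughly just this wrapper, so the two forms are equivalent:

  static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
  {
  	prev->sched_class->put_prev_task(rq, prev);
  }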
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 kernel/sched/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a8366cfebd31..8ddebb068585 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5683,7 +5683,7 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
 		 */
 		next = pick_next_task(rq, &fake_task, rf);
 		BUG_ON(!next);
-		next->sched_class->put_prev_task(rq, next);
+		put_prev_task(rq, next);
 
 		/*
 		 * Rules for changing task_struct::cpus_allowed are holding
weighted_cpuload() uses the CPU number passed to it to get a pointer to the runqueue. Almost all callers of weighted_cpuload() already have the rq pointer with them and can pass it directly. In some cases the callers actually obtain the CPU number by doing cpu_of(rq).
It would be simpler to pass rq to weighted_cpuload().
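For illustration, a typical caller goes from the first form below to the second (the real conversions are in the diff):

  /* before: the rq is already in hand, but we pass the CPU number */
  load = weighted_cpuload(cpu_of(this_rq));

  /* after: pass the rq directly */
  load = weighted_cpuload(this_rq);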
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 kernel/sched/fair.c | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a0a97497c400..fe03efd3880a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1378,7 +1378,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
 	       group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
 }
 
-static unsigned long weighted_cpuload(const int cpu);
+static unsigned long weighted_cpuload(struct rq *rq);
 static unsigned long source_load(int cpu, int type);
 static unsigned long target_load(int cpu, int type);
 static unsigned long capacity_of(int cpu);
@@ -1410,7 +1410,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
 		struct rq *rq = cpu_rq(cpu);
 
 		ns->nr_running += rq->nr_running;
-		ns->load += weighted_cpuload(cpu);
+		ns->load += weighted_cpuload(rq);
 		ns->compute_capacity += capacity_of(cpu);
 
 		cpus++;
@@ -5064,9 +5064,9 @@ static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
 }
 
 /* Used instead of source_load when we know the type == 0 */
-static unsigned long weighted_cpuload(const int cpu)
+static unsigned long weighted_cpuload(struct rq *rq)
 {
-	return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);
+	return cfs_rq_runnable_load_avg(&rq->cfs);
 }
 
 #ifdef CONFIG_NO_HZ_COMMON
@@ -5111,7 +5111,7 @@ static void cpu_load_update_idle(struct rq *this_rq)
 	/*
 	 * bail if there's load or we're actually up-to-date.
 	 */
-	if (weighted_cpuload(cpu_of(this_rq)))
+	if (weighted_cpuload(this_rq))
 		return;
 
 	cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
@@ -5132,7 +5132,7 @@ void cpu_load_update_nohz_start(void)
 	 * concurrently we'll exit nohz. And cpu_load write can race with
 	 * cpu_load_update_idle() but both updater would be writing the same.
 	 */
-	this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq));
+	this_rq->cpu_load[0] = weighted_cpuload(this_rq);
 }
 
 /*
@@ -5148,7 +5148,7 @@ void cpu_load_update_nohz_stop(void)
 	if (curr_jiffies == this_rq->last_load_update_tick)
 		return;
 
-	load = weighted_cpuload(cpu_of(this_rq));
+	load = weighted_cpuload(this_rq);
 	rq_lock(this_rq, &rf);
 	update_rq_clock(this_rq);
 	cpu_load_update_nohz(this_rq, curr_jiffies, load);
@@ -5174,7 +5174,7 @@ static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
  */
 void cpu_load_update_active(struct rq *this_rq)
 {
-	unsigned long load = weighted_cpuload(cpu_of(this_rq));
+	unsigned long load = weighted_cpuload(this_rq);
 
 	if (tick_nohz_tick_stopped())
 		cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
@@ -5192,7 +5192,7 @@ void cpu_load_update_active(struct rq *this_rq)
 static unsigned long source_load(int cpu, int type)
 {
 	struct rq *rq = cpu_rq(cpu);
-	unsigned long total = weighted_cpuload(cpu);
+	unsigned long total = weighted_cpuload(rq);
 
 	if (type == 0 || !sched_feat(LB_BIAS))
 		return total;
@@ -5207,7 +5207,7 @@ static unsigned long source_load(int cpu, int type)
 static unsigned long target_load(int cpu, int type)
 {
 	struct rq *rq = cpu_rq(cpu);
-	unsigned long total = weighted_cpuload(cpu);
+	unsigned long total = weighted_cpuload(rq);
 
 	if (type == 0 || !sched_feat(LB_BIAS))
 		return total;
@@ -5229,7 +5229,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
-	unsigned long load_avg = weighted_cpuload(cpu);
+	unsigned long load_avg = weighted_cpuload(rq);
 
 	if (nr_running)
 		return load_avg / nr_running;
@@ -5651,7 +5651,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 				shallowest_idle_cpu = i;
 			}
 		} else if (shallowest_idle_cpu == -1) {
-			load = weighted_cpuload(i);
+			load = weighted_cpuload(cpu_rq(i));
 			if (load < min_load || (load == min_load && i == this_cpu)) {
 				min_load = load;
 				least_loaded_cpu = i;
@@ -7439,7 +7439,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 		sgs->nr_numa_running += rq->nr_numa_running;
 		sgs->nr_preferred_running += rq->nr_preferred_running;
 #endif
-		sgs->sum_weighted_load += weighted_cpuload(i);
+		sgs->sum_weighted_load += weighted_cpuload(rq);
 		/*
 		 * No need to call idle_cpu() if nr_running is not 0
 		 */
@@ -7968,7 +7968,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
 
 		capacity = capacity_of(i);
 
-		wl = weighted_cpuload(i);
+		wl = weighted_cpuload(rq);
 
 		/*
 		 * When comparing with imbalance, use weighted_cpuload()
Rearrange pick_next_task_fair() a bit to avoid checking cfs_rq->nr_running twice for the case where FAIR_GROUP_SCHED is enabled and the previous task doesn't belong to the fair class.
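Schematically, the start of pick_next_task_fair() then looks like this (heavily condensed from the diff below; "..." elides unchanged code):

  again:
  	if (!cfs_rq->nr_running)
  		goto idle;

  #ifdef CONFIG_FAIR_GROUP_SCHED
  	if (prev->sched_class != &fair_sched_class)
  		goto simple;
  	...
  		/* On throttling, restart from the root cfs_rq. */
  		if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
  			cfs_rq = &rq->cfs;

  			if (!cfs_rq->nr_running)
  				goto idle;

  			goto simple;
  		}
  	...
  simple:
  #endif
  	put_prev_task(rq, prev);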
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 kernel/sched/fair.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fe03efd3880a..e519f6c03fe9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6267,10 +6267,10 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
 	int new_tasks;
 
 again:
-#ifdef CONFIG_FAIR_GROUP_SCHED
 	if (!cfs_rq->nr_running)
 		goto idle;
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
 	if (prev->sched_class != &fair_sched_class)
 		goto simple;
 
@@ -6300,11 +6300,17 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
 			/*
 			 * This call to check_cfs_rq_runtime() will do the
 			 * throttle and dequeue its entity in the parent(s).
-			 * Therefore the 'simple' nr_running test will indeed
+			 * Therefore the nr_running test will indeed
 			 * be correct.
 			 */
-			if (unlikely(check_cfs_rq_runtime(cfs_rq)))
+			if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
+				cfs_rq = &rq->cfs;
+
+				if (!cfs_rq->nr_running)
+					goto idle;
+
 				goto simple;
+			}
 		}
 
 		se = pick_next_entity(cfs_rq, curr);
@@ -6344,12 +6350,8 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
 
 	return p;
 simple:
-	cfs_rq = &rq->cfs;
 #endif
 
-	if (!cfs_rq->nr_running)
-		goto idle;
-
 	put_prev_task(rq, prev);
 
 	do {
update_freq is always true and there is no need to pass it to update_cfs_rq_load_avg(). Remove it.
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 kernel/sched/fair.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e519f6c03fe9..89d61d09d083 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -806,7 +806,7 @@ void post_init_entity_util_avg(struct sched_entity *se)
 		/*
 		 * For !fair tasks do:
 		 *
-		update_cfs_rq_load_avg(now, cfs_rq, false);
+		update_cfs_rq_load_avg(now, cfs_rq);
 		attach_entity_load_avg(cfs_rq, se);
 		switched_from_fair(rq, p);
 		 *
@@ -3259,7 +3259,6 @@ static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
  * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
  * @now: current time, as per cfs_rq_clock_task()
  * @cfs_rq: cfs_rq to update
- * @update_freq: should we call cfs_rq_util_change() or will the call do so
  *
  * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
  * avg. The immediate corollary is that all (fair) tasks must be attached, see
@@ -3273,7 +3272,7 @@ static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
  * call update_tg_load_avg() when this function returns true.
  */
 static inline int
-update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 {
 	struct sched_avg *sa = &cfs_rq->avg;
 	int decayed, removed_load = 0, removed_util = 0;
@@ -3301,7 +3300,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
 	cfs_rq->load_last_update_time_copy = sa->last_update_time;
 #endif
 
-	if (update_freq && (decayed || removed_util))
+	if (decayed || removed_util)
 		cfs_rq_util_change(cfs_rq);
 
 	return decayed || removed_load;
@@ -3329,7 +3328,7 @@ static inline void update_load_avg(struct sched_entity *se, int flags)
 	if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
 		__update_load_avg_se(now, cpu, cfs_rq, se);
 
-	decayed  = update_cfs_rq_load_avg(now, cfs_rq, true);
+	decayed  = update_cfs_rq_load_avg(now, cfs_rq);
 	decayed |= propagate_entity_load_avg(se);
 
 	if (decayed && (flags & UPDATE_TG))
@@ -3473,7 +3472,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
 #else /* CONFIG_SMP */
 
 static inline int
-update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 {
 	return 0;
 }
@@ -6995,7 +6994,7 @@ static void update_blocked_averages(int cpu)
 		if (throttled_hierarchy(cfs_rq))
 			continue;
 
-		if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))
+		if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
 			update_tg_load_avg(cfs_rq, 0);
 
 		/* Propagate pending load changes to the parent, if any: */
@@ -7068,7 +7067,7 @@ static inline void update_blocked_averages(int cpu)
 
 	rq_lock_irqsave(rq, &rf);
 	update_rq_clock(rq);
-	update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
+	update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
 	rq_unlock_irqrestore(rq, &rf);
 }
smp_processor_id() can result in a function call if CONFIG_DEBUG_PREEMPT is enabled. It is more straightforward to use the this_rq() macro instead.
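For context, these are the two definitions involved (approximate, quoted here only for illustration):

  /* kernel/sched/sched.h */
  #define this_rq()		this_cpu_ptr(&runqueues)

  /* include/linux/smp.h */
  #ifdef CONFIG_DEBUG_PREEMPT
    extern unsigned int debug_smp_processor_id(void);
  # define smp_processor_id() debug_smp_processor_id()
  #else
  # define smp_processor_id() raw_smp_processor_id()
  #endif

So with CONFIG_DEBUG_PREEMPT, smp_processor_id() becomes an out-of-line call to debug_smp_processor_id(), while this_rq() is a plain per-CPU pointer lookup.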
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 kernel/sched/sched.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index bcdf6a136c59..4220ec76d643 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1995,7 +1995,7 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
 
 static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags)
 {
-	if (cpu_of(rq) == smp_processor_id())
+	if (rq == this_rq())
 		cpufreq_update_util(rq, flags);
 }
 #else
On 24-05-17, 10:59, Viresh Kumar wrote:
> Hi,
>
> The first patch fixes a potential regression we may observe on UP systems,
> and the others make minor optimizations in the scheduler core.
>
> They are all tested on 32-bit (exynos) and 64-bit (hikey) ARM platforms.
Hi Ingo/Peter,
Do you have any feedback on this series?