From: Michael Turquette <mturquette@linaro.org>
{en,de}queue_task_fair are updated to track which cpus will have changed utilization values as a function of task queueing. The affected cpus are passed on to arch_eval_cpu_freq for further machine-specific processing based on a selectable policy.
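To make the interface concrete, here is a rough sketch of what an arch_eval_cpu_freq implementation might look like. This is illustrative only and not code from this series: the per-cpu capacity_request variable and the 25% headroom policy are invented for the example; get_cpu_usage() and capacity_of() are the helpers this patch declares in sched.h.

#include <linux/cpumask.h>
#include <linux/kernel.h>
#include <linux/percpu.h>

/* hypothetical per-cpu capacity request, consumed later by the scaling thread */
static DEFINE_PER_CPU(unsigned long, capacity_request);

void arch_eval_cpu_freq(struct cpumask *update_cpus)
{
	int cpu;

	/* record the capacity each affected cpu should be able to serve */
	for_each_cpu(cpu, update_cpus) {
		unsigned long usage = get_cpu_usage(cpu);

		/* leave ~25% headroom above current usage */
		usage += usage >> 2;
		per_cpu(capacity_request, cpu) = min(usage, capacity_of(cpu));
	}
}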
arch_scale_cpu_freq is called from run_rebalance_domains as a way to kick off the scaling process (via wake_up_process), so as to prevent re-entering the {en,de}queue code.
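For illustration, the wake-up side could be as simple as the following sketch. Again, this is not code from this series: freq_scale_task and freq_scale_thread_fn are invented names, and a real implementation would create the kthread at init time.

#include <linux/kthread.h>
#include <linux/sched.h>

/* hypothetical kthread that performs the potentially sleeping cpufreq call */
static struct task_struct *freq_scale_task;

void arch_scale_cpu_freq(void)
{
	/* called from softirq context, so only wake the worker here */
	if (freq_scale_task)
		wake_up_process(freq_scale_task);
}

static int freq_scale_thread_fn(void *data)
{
	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		schedule();
		/* translate the recorded capacity requests into a frequency target */
	}
	return 0;
}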
All of the call sites in this patch are up for discussion. Does it make sense to track which cpus have updated statistics in enqueue_task_fair? I chose this because I wanted to gather statistics for all affected cpus in the event that CONFIG_FAIR_GROUP_SCHED is enabled. As agreed at LPC14, the next version of this patch will focus on the simpler case of not using scheduler cgroups, which should remove a good chunk of this code, including the cpumask handling.
Also discussed at LPC14 is the fact that load_balance is a very interesting place to do this, since frequency can be considered in concert with task placement. Please put forth any ideas on a sensible way to do this.
Is run_rebalance_domains a logical place to change cpu frequency? What other call sites make sense?
Even for platforms that can target a cpu frequency without sleeping (x86, some ARM platforms with PM microcontrollers), it is currently necessary to always kick the frequency-target work out into a kthread. This is because of the rwsem usage in the cpufreq core, which might sleep. Replacing that lock type is probably a good idea.
Not-signed-off-by: Mike Turquette <mturquette@linaro.org>
Signed-off-by: Michael Turquette <mturquette@linaro.org>
---
 kernel/sched/fair.c  | 47 ++++++++++++++++++++++++++++++++++++++++++-----
 kernel/sched/sched.h |  2 ++
 2 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d9386b0..1043266 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4236,6 +4236,11 @@ static inline void hrtick_update(struct rq *rq)
 }
 #endif
 
+static inline bool energy_aware(void)
+{
+	return sched_feat(ENERGY_AWARE);
+}
+
 /*
  * The enqueue_task method is called before nr_running is
  * increased. Here we update the fair scheduling stats and
@@ -4246,6 +4251,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 {
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &p->se;
+	struct cpumask update_cpus;
+
+	cpumask_clear(&update_cpus);
 
 	for_each_sched_entity(se) {
 		if (se->on_rq)
@@ -4275,12 +4283,26 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 		update_cfs_shares(cfs_rq);
 		update_entity_load_avg(se, 1);
+		/* track cpus that need to be re-evaluated */
+		cpumask_set_cpu(cpu_of(rq_of(cfs_rq)), &update_cpus);
 	}
 
+	/* !CONFIG_FAIR_GROUP_SCHED */
 	if (!se) {
 		update_rq_runnable_avg(rq, rq->nr_running);
 		add_nr_running(rq, 1);
+
+		/*
+		 * FIXME for !CONFIG_FAIR_GROUP_SCHED it might be nice to
+		 * typedef update_cpus into an int and skip all of the cpumask
+		 * stuff
+		 */
+		cpumask_set_cpu(cpu_of(rq), &update_cpus);
 	}
+
+	if (energy_aware() && !cpumask_empty(&update_cpus))
+		arch_eval_cpu_freq(&update_cpus);
+
 	hrtick_update(rq);
 }
 
@@ -4296,6 +4318,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &p->se;
 	int task_sleep = flags & DEQUEUE_SLEEP;
+	struct cpumask update_cpus;
+
+	cpumask_clear(&update_cpus);
 
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
@@ -4336,12 +4361,26 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 		update_cfs_shares(cfs_rq);
 		update_entity_load_avg(se, 1);
+		/* track runqueues/cpus that need to be re-evaluated */
+		cpumask_set_cpu(cpu_of(rq_of(cfs_rq)), &update_cpus);
 	}
 
+	/* !CONFIG_FAIR_GROUP_SCHED */
 	if (!se) {
 		sub_nr_running(rq, 1);
 		update_rq_runnable_avg(rq, 1);
+
+		/*
+		 * FIXME for !CONFIG_FAIR_GROUP_SCHED it might be nice to
+		 * typedef update_cpus into an int and skip all of the cpumask
+		 * stuff
+		 */
+		cpumask_set_cpu(cpu_of(rq), &update_cpus);
 	}
+
+	if (energy_aware() && !cpumask_empty(&update_cpus))
+		arch_eval_cpu_freq(&update_cpus);
+
 	hrtick_update(rq);
 }
 
@@ -4615,11 +4654,6 @@ int get_cpu_usage(int cpu)
 }
 
-static inline bool energy_aware(void)
-{
-	return sched_feat(ENERGY_AWARE);
-}
-
 struct energy_env {
 	struct sched_group *sg_top;
 	struct sched_group *sg_cap;
@@ -8292,6 +8326,9 @@ static void run_rebalance_domains(struct softirq_action *h)
 	 * stopped.
 	 */
 	nohz_idle_balance(this_rq, idle);
+
+	if (energy_aware())
+		arch_scale_cpu_freq();
 }
 
 /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c357c77..167ba2a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -808,6 +808,8 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
 
 unsigned long capacity_of(int cpu);
 int get_cpu_usage(int cpu);
+void arch_scale_cpu_freq(void);
+void arch_eval_cpu_freq(struct cpumask *update_cpus);
 
 DECLARE_PER_CPU(struct sched_domain *, sd_llc);
 DECLARE_PER_CPU(int, sd_llc_size);
-- 
2.2.2