On Fri, Dec 16, 2016 at 10:37:23AM +0000, Morten Rasmussen wrote:
[...]
I'm not sure exactly how we would determine when we are 'global over-utilized' and distinguish it from 'outer over-utilized'. That requires a bit more pondering.
Regarding the flags in Thara's proposal: sd->overutilization can be a parent flag as well if you have more than two sched_domain levels. We need to consider more levels to have a scalable solution.
How about the code below? I tweaked it based on Thara's patch. The main change is to use SD Level 1's flag for 'inner over-utilized', but that part is absent for now: I searched the wiki page and found that the 'Assignment problem' algorithm is not easy to add into the scheduler.
So this patch only drafts the implementation for 'outer over-utilized', using the SD Level 2 flag, and uses rd->overutilized to indicate 'global over-utilized'. But I think it is easy to extend to support 'inner over-utilized' once we have a solution for the 'Assignment problem'.
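To summarize the intended flow (just a sketch of the patch below, using the helpers it introduces; it is not extra code to be merged):

	/* per-CPU events (tick / enqueue): flag the parent (SD Level 2) domain */
	if (cpu_overutilized(cpu) || rq->misfit_task)
		set_sd_overutilized(rq->sd);	/* sets sd->parent->groups->overutilized */

	/* EAS paths (wakeup, find_busiest_group, rebalance_domains) */
	if (energy_aware() && !is_sd_overutilized(sd))
		/* stay on the energy aware path */;

where is_sd_overutilized() returns true if either rd->overutilized or any group flag of the given domain is set.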
For the 'global over-utilized' criterion, I reuse Thara's definition:
+		if (sds->total_capacity * 1024 < sds->total_util * capacity_margin)
+			set_rd_overutilized(env->dst_rq->rd);
+		else
+			clear_rd_overutilized(env->dst_rq->rd);
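For reference, assuming capacity_margin keeps its default value of 1280 in this tree, the condition works out to:

	sds->total_capacity * 1024 < sds->total_util * capacity_margin
	=> sds->total_util > sds->total_capacity * 1024 / 1280
	=> sds->total_util > ~80% of sds->total_capacity

so the root domain flag is set once the summed utilization crosses roughly 80% of the summed capacity, and cleared again when it drops back below that point.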
---8<---
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index cf56241..eeaea72 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4200,6 +4200,83 @@ static inline void hrtick_update(struct rq *rq)
 #ifdef CONFIG_SMP
 static bool cpu_overutilized(int cpu);
 static inline unsigned long boosted_cpu_util(int cpu);
+
+/*
+ * Some thoughts on how to set the overutilized flags:
+ *
+ * ### SD Level 1
+ *
+ * If we can find an algorithm for the best combination between CPUs
+ * and tasks, we can use it to check whether we need to set the first
+ * schedule domain level's "overutilized" flag:
+ *
+ *	if (assign_algorithm())
+ *		sd->groups->overutilized = true;
+ *
+ * After setting this flag, load balance will happen only
+ * in SD Level 1, so this means it only takes effect inside
+ * the cluster.
+ *
+ * So far this part is unfortunately absent because the algorithm
+ * is complex [1]...
+ * [1] https://en.wikipedia.org/wiki/Assignment_problem
+ *
+ * ### SD Level 2
+ *
+ * If a CPU has a misfit task on it, there is no doubt we should
+ * migrate the task to another higher capacity CPU.
+ *
+ * Or, if one CPU is overutilized, we assume the scheduler has
+ * already done good enough work to explore the cluster's internal
+ * capacity, so an overutilized CPU means we finally need to seek
+ * another cluster to provide more computing capacity.
+ *
+ * For the upper two cases, we set the SD level 2 flag. Later this flag
+ * is used by any CPU triggering load balance in the same schedule domain.
+ *
+ * We can add stricter criteria for migration, like the destination
+ * CPU capacity >= source CPU capacity, so that we finally permit
+ * the task migration to a higher capacity CPU for this case.
+ *
+ * ### Root domain
+ *
+ * If the root domain flag is set, it means we explore performance
+ * as much as possible by spreading out tasks.
+ *
+ */
+static void set_sd_overutilized(struct sched_domain *sd)
+{
+	if (sd && sd->parent)
+		sd->parent->groups->overutilized = true;
+}
+
+static void set_rd_overutilized(struct root_domain *rd)
+{
+	rd->overutilized = true;
+}
+
+static void clear_rd_overutilized(struct root_domain *rd)
+{
+	rd->overutilized = false;
+}
+
+static bool is_sd_overutilized(struct sched_domain *sd)
+{
+	struct sched_group *group = sd->groups;
+	int cpu = smp_processor_id();
+
+	if (cpu_rq(cpu)->rd->overutilized)
+		return true;
+
+	do {
+		if (group->overutilized)
+			return true;
+
+	} while (group = group->next, group != sd->groups);
+
+	return false;
+}
+
 #else
 #define boosted_cpu_util(cpu) cpu_util(cpu)
 #endif
@@ -4228,6 +4305,7 @@ static void
 enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 {
 	struct cfs_rq *cfs_rq;
+	struct sched_domain *sd;
 	struct sched_entity *se = &p->se;
 #ifdef CONFIG_SMP
 	int task_new = flags & ENQUEUE_WAKEUP_NEW;
@@ -4292,11 +4370,12 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 	if (!se) {
 		walt_inc_cumulative_runnable_avg(rq, p);
-		if (!task_new && !rq->rd->overutilized &&
-		    cpu_overutilized(rq->cpu)) {
-			rq->rd->overutilized = true;
-			trace_sched_overutilized(true);
-		}
+
+		rcu_read_lock();
+		sd = rcu_dereference(rq->sd);
+		if (!task_new && cpu_overutilized(rq->cpu))
+			set_sd_overutilized(sd);
+		rcu_read_unlock();
 
 		/*
 		 * We want to potentially trigger a freq switch
@@ -5921,7 +6000,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 	}
 
 	if (!sd) {
-		if (energy_aware() && !cpu_rq(cpu)->rd->overutilized)
+		sd = rcu_dereference(cpu_rq(prev_cpu)->sd);
+		if (energy_aware() && !is_sd_overutilized(sd))
 			new_cpu = energy_aware_wake_cpu(p, prev_cpu, sync);
 		else if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
 			new_cpu = select_idle_sibling(p, new_cpu);
@@ -7003,6 +7083,7 @@ struct sd_lb_stats {
 	struct sched_group *local;	/* Local group in this sd */
 	unsigned long total_load;	/* Total load of all groups in sd */
 	unsigned long total_capacity;	/* Total capacity of all groups in sd */
+	unsigned long total_util;	/* Total util of all groups in sd */
 	unsigned long avg_load;	/* Average load across all groups in sd */
 
 	struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
@@ -7022,6 +7103,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
 		.local = NULL,
 		.total_load = 0UL,
 		.total_capacity = 0UL,
+		.total_util = 0UL,
 		.busiest_stat = {
 			.avg_load = 0UL,
 			.sum_nr_running = 0,
@@ -7343,10 +7425,11 @@ group_type group_classify(struct sched_group *group,
 static inline void update_sg_lb_stats(struct lb_env *env,
 			struct sched_group *group, int load_idx,
 			int local_group, struct sg_lb_stats *sgs,
-			bool *overload, bool *overutilized)
+			bool *overload)
 {
 	unsigned long load;
 	int i, nr_running;
+	bool overutilized = false;
 
 	memset(sgs, 0, sizeof(*sgs));
 
@@ -7379,7 +7462,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 			sgs->idle_cpus++;
 
 		if (cpu_overutilized(i)) {
-			*overutilized = true;
+			overutilized = true;
 			if (!sgs->group_misfit_task && rq->misfit_task)
 				sgs->group_misfit_task = capacity_of(i);
 		}
@@ -7396,6 +7479,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 
 	sgs->group_no_capacity = group_is_overloaded(env, sgs);
 	sgs->group_type = group_classify(group, sgs);
+
+	if (group->overutilized != overutilized)
+		group->overutilized = overutilized;
 }
 
 /**
@@ -7504,7 +7590,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 	struct sched_group *sg = env->sd->groups;
 	struct sg_lb_stats tmp_sgs;
 	int load_idx, prefer_sibling = 0;
-	bool overload = false, overutilized = false;
+	bool overload = false;
 
 	if (child && child->flags & SD_PREFER_SIBLING)
 		prefer_sibling = 1;
@@ -7526,7 +7612,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 		}
 
 		update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
-						&overload, &overutilized);
+						&overload);
 
 		if (local_group)
 			goto next_group;
@@ -7566,6 +7652,7 @@ next_group:
 		/* Now, start updating sd_lb_stats */
 		sds->total_load += sgs->group_load;
 		sds->total_capacity += sgs->group_capacity;
+		sds->total_util += sgs->group_util;
 
 		sg = sg->next;
 	} while (sg != env->sd->groups);
@@ -7580,18 +7667,16 @@ next_group:
 		if (env->dst_rq->rd->overload != overload)
 			env->dst_rq->rd->overload = overload;
 
-		/* Update over-utilization (tipping point, U >= 0) indicator */
-		if (env->dst_rq->rd->overutilized != overutilized) {
-			env->dst_rq->rd->overutilized = overutilized;
-			trace_sched_overutilized(overutilized);
-		}
-	} else {
-		if (!env->dst_rq->rd->overutilized && overutilized) {
-			env->dst_rq->rd->overutilized = true;
-			trace_sched_overutilized(true);
-		}
-	}
+		/*
+		 * If the overall util is greater than the overall capacity,
+		 * set the root domain's overutilized flag.
+		 */
+		if (sds->total_capacity * 1024 < sds->total_util * capacity_margin)
+			set_rd_overutilized(env->dst_rq->rd);
+		else
+			clear_rd_overutilized(env->dst_rq->rd);
+	}
 }
 
 /**
@@ -7834,7 +7919,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
 	 */
 	update_sd_lb_stats(env, &sds);
 
-	if (energy_aware() && !env->dst_rq->rd->overutilized)
+	if (energy_aware() && !is_sd_overutilized(env->sd))
 		goto out_balanced;
 
 	local = &sds.local_stat;
@@ -8748,6 +8833,10 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
 
 	rcu_read_lock();
 	for_each_domain(cpu, sd) {
+
+		if (energy_aware() && !is_sd_overutilized(sd))
+			continue;
+
 		/*
 		 * Decay the newidle max times here because this is a regular
 		 * visit to all the domains. Decay ~1% per second.
@@ -9039,6 +9128,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 {
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &curr->se;
+	struct sched_domain *sd;
 
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
@@ -9049,12 +9139,13 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 		task_tick_numa(rq, curr);
 
 #ifdef CONFIG_SMP
-	if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) {
-		rq->rd->overutilized = true;
-		trace_sched_overutilized(true);
-	}
-
 	rq->misfit_task = !task_fits_max(curr, rq->cpu);
+
+	rcu_read_lock();
+	sd = rcu_dereference(rq->sd);
+	if (cpu_overutilized(task_cpu(curr)) || rq->misfit_task)
+		set_sd_overutilized(sd);
+	rcu_read_unlock();
 #endif
 }
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 2f2b959..ca2cedb 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -916,6 +916,7 @@ struct sched_group {
 	unsigned int group_weight;
 	struct sched_group_capacity *sgc;
 	const struct sched_group_energy const *sge;
+	bool overutilized;
 
 	/*
 	 * The CPUs this group covers.