The current implementation of overutilization aborts energy aware scheduling if any cpu in the system is over-utilized. This patch introduces an over-utilization flag per sched group level instead of a single flag system wide. Load balancing is done at the sched domain where any of the sched groups is over utilized. If energy aware scheduling is enabled and no sched group in a sched domain is overutilized, load balancing is skipped for that sched domain and energy aware scheduling continues at that level.
The implementation is based on two points 1. For every cpu in every sched domain the first group is the group that contains the cpu itself. 2. sched groups are shared between cpus.
Thus if a sched group is overutilized the overutilized flag is set at the first sched group of the parent sched domain. This ensures a load balancing at the overutilized sched domain level. For example consider a big little system with two little cpu's (CPU A and CPU B) and two big cpu's (CPU C and CPU D). In this system, the hierarchy will be as follows CPU A SD level 1 - SG1 (CPUA), SG2 (CPUB) SD level 2 - SG5(CPUA, CPUB), SG6(CPU C, CPU D) RD
CPU B SD level 1 - SG2(CPUB), SG1 (CPUA) SD level 2 - SG5(CPU A, CPU B), SG6(CPU C, CPUD) RD
CPU C SD level 1 - SG3(CPU C), SG4 (CPUD) SD level 2 - SG6(CPUC, CPUD), SG5(CPUA, CPU B) RD
CPU D SD level 1 - SG4(CPU D), SG3(CPU C) SD level 2 - SG6(CPUC, CPU D), SG5(CPU A, CPU B) RD
In the above system if CPUA is overutilized, the overutilized flag is set at SG5(parent sched domain first sched group). Similarly if CPUB is overutilized, the flag is set at SG5. During load balancing, at SD level 1, the overutilized flag is checked at the parent sched domain, first sched group level(SG5). If there is no parent sched domain, then the flag is set/checked at the root domain. This ensures that load balancing happens irrespective of which cpu is over utilized in a sched domain.
Signed-off-by: Thara Gopinath thara.gopinath@linaro.org --- kernel/sched/fair.c | 108 ++++++++++++++++++++++++++++++++++++++++++--------- kernel/sched/sched.h | 1 + 2 files changed, 90 insertions(+), 19 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 01fa969..0c97e0a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4559,6 +4559,36 @@ static inline void hrtick_update(struct rq *rq)
static bool cpu_overutilized(int cpu);
+static bool +is_sd_overutilized(struct sched_domain *sd, struct root_domain *rd) +{ + if (sd && sd->parent) + return sd->parent->groups->overutilized; + + if (!rd) + return false; + + return rd->overutilized; +} + +static void +set_sd_overutilized(struct sched_domain *sd, struct root_domain *rd) +{ + if (sd && sd->parent) + sd->parent->groups->overutilized = true; + else if (rd) + rd->overutilized = true; +} + +static void +clear_sd_overutilized(struct sched_domain *sd, struct root_domain *rd) +{ + if (sd && sd->parent) + sd->parent->groups->overutilized = false; + else if (rd) + rd->overutilized = false; +} + /* * The enqueue_task method is called before nr_running is * increased. Here we update the fair scheduling stats and @@ -4568,6 +4598,7 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; + struct sched_domain *sd; struct sched_entity *se = &p->se; int task_new = !(flags & ENQUEUE_WAKEUP);
@@ -4603,9 +4634,12 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (!se) { add_nr_running(rq, 1); - if (!task_new && !rq->rd->overutilized && - cpu_overutilized(rq->cpu)) - rq->rd->overutilized = true; + rcu_read_lock(); + sd = rcu_dereference(rq->sd); + if (!task_new && !is_sd_overutilized(sd, rq->rd) && + cpu_overutilized(rq->cpu)) + set_sd_overutilized(sd, rq->rd); + rcu_read_unlock(); } hrtick_update(rq); } @@ -5989,8 +6023,6 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu) unsigned long max_spare = 0; struct sched_domain *sd;
- rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_ea, prev_cpu));
if (!sd) @@ -6028,7 +6060,6 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu) }
unlock: - rcu_read_unlock();
if (energy_cpu == prev_cpu && !cpu_overutilized(prev_cpu)) return prev_cpu; @@ -6063,10 +6094,16 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); }
- if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized)) - return select_energy_cpu_brute(p, prev_cpu); - rcu_read_lock(); + sd = rcu_dereference(cpu_rq(prev_cpu)->sd); + if (energy_aware() && + !is_sd_overutilized(sd, + cpu_rq(cpu)->rd)) { + new_cpu = select_energy_cpu_brute(p, prev_cpu); + goto unlock; + } + + sd = NULL; for_each_domain(cpu, tmp) { if (!(tmp->flags & SD_LOAD_BALANCE)) break; @@ -6131,6 +6168,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f } /* while loop will break here if sd == NULL */ } + +unlock: rcu_read_unlock();
return new_cpu; @@ -7178,6 +7217,7 @@ struct sd_lb_stats { struct sched_group *local; /* Local group in this sd */ unsigned long total_load; /* Total load of all groups in sd */ unsigned long total_capacity; /* Total capacity of all groups in sd */ + unsigned long total_util; /* Total util of all groups in sd */ unsigned long avg_load; /* Average load across all groups in sd */
struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */ @@ -7197,6 +7237,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) .local = NULL, .total_load = 0UL, .total_capacity = 0UL, + .total_util = 0UL, .busiest_stat = { .avg_load = 0UL, .sum_nr_running = 0, @@ -7692,6 +7733,7 @@ next_group: /* Now, start updating sd_lb_stats */ sds->total_load += sgs->group_load; sds->total_capacity += sgs->group_capacity; + sds->total_util += sgs->group_util;
sg = sg->next; } while (sg != env->sd->groups); @@ -7701,17 +7743,26 @@ next_group:
env->src_grp_nr_running = sds->busiest_stat.sum_nr_running;
+ /* Setting overutilized flag might not be necessary here + * Revisit + */ if (!lb_sd_parent(env->sd)) { /* update overload indicator if we are at root domain */ if (env->dst_rq->rd->overload != overload) env->dst_rq->rd->overload = overload; + }
- /* Update over-utilization (tipping point, U >= 0) indicator */ - if (env->dst_rq->rd->overutilized != overutilized) - env->dst_rq->rd->overutilized = overutilized; - } else { - if (!env->dst_rq->rd->overutilized && overutilized) - env->dst_rq->rd->overutilized = true; + if (overutilized) + set_sd_overutilized(env->sd, env->dst_rq->rd); + + /* If the domain util is greater that domain capacity, load balancing + * needs to be done at the next sched domain level as well + */ + if (sds->total_capacity * 1024 < sds->total_util * capacity_margin) { + /* If already at the highest domain nothing can be done */ + if (env->sd->parent) + set_sd_overutilized(env->sd->parent, + env->dst_rq->rd); } }
@@ -7932,8 +7983,11 @@ static struct sched_group *find_busiest_group(struct lb_env *env) */ update_sd_lb_stats(env, &sds);
- if (energy_aware() && !env->dst_rq->rd->overutilized) - goto out_balanced; + /* Is this check really required here?? Revisit */ + if (energy_aware()) { + if (!is_sd_overutilized(env->sd, env->dst_rq->rd)) + goto out_balanced; + }
local = &sds.local_stat; busiest = &sds.busiest_stat; @@ -8000,6 +8054,12 @@ static struct sched_group *find_busiest_group(struct lb_env *env) force_balance: /* Looks like there is an imbalance. Compute it */ calculate_imbalance(env, &sds); + + /* Is this the correct place to clear this flag? Should access + * to flag be locked? Revisit. + */ + clear_sd_overutilized(env->sd, env->dst_rq->rd); + return sds.busiest;
out_balanced: @@ -8790,6 +8850,11 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
rcu_read_lock(); for_each_domain(cpu, sd) { + if (energy_aware()) { + if (!is_sd_overutilized(sd, rq->rd)) + continue; + } + /* * Decay the newidle max times here because this is a regular * visit to all the domains. Decay ~1% per second. @@ -9083,6 +9148,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) { struct cfs_rq *cfs_rq; struct sched_entity *se = &curr->se; + struct sched_domain *sd;
for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); @@ -9092,8 +9158,12 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) if (static_branch_unlikely(&sched_numa_balancing)) task_tick_numa(rq, curr);
- if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) - rq->rd->overutilized = true; + rcu_read_lock(); + sd = rcu_dereference(rq->sd); + if (!is_sd_overutilized(sd, rq->rd) && + cpu_overutilized(task_cpu(curr))) + set_sd_overutilized(sd, rq->rd); + rcu_read_unlock(); }
/* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f99391d..90c48ac 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -913,6 +913,7 @@ struct sched_group { unsigned int group_weight; struct sched_group_capacity *sgc; const struct sched_group_energy const *sge; + bool overutilized;
/* * The CPUs this group covers. -- 2.1.4
Hi Thara,
Good to see the patch; please see some questions.
On Wed, Dec 07, 2016 at 05:22:37PM -0500, Thara Gopinath wrote:
The current implementation of overutilization aborts energy aware scheduling if any cpu in the system is over-utilized. This patch introduces an over-utilization flag per sched group level instead of a single flag system wide. Load balancing is done at the sched domain where any of the sched groups is over utilized. If energy aware scheduling is enabled and no sched group in a sched domain is overutilized, load balancing is skipped for that sched domain and energy aware scheduling continues at that level.
The implementation is based on two points
- For every cpu in every sched domain the first group is the group that contains the cpu itself.
- sched groups are shared between cpus.
Thus if a sched group is overutilized the overutilized flag is set at the first sched group of the parent sched domain. This ensures a load balancing at the overutilized sched domain level. For example consider a big little system with two little cpu's (CPU A and CPU B) and two big cpu's (CPU C and CPU D). In this system, the hierarchy will be as follows CPU A SD level 1 - SG1 (CPUA), SG2 (CPUB) SD level 2 - SG5(CPUA, CPUB), SG6(CPU C, CPU D) RD
CPU B SD level 1 - SG2(CPUB), SG1 (CPUA) SD level 2 - SG5(CPU A, CPU B), SG6(CPU C, CPUD) RD
CPU C SD level 1 - SG3(CPU C), SG4 (CPUD) SD level 2 - SG6(CPUC, CPUD), SG5(CPUA, CPU B) RD
CPU D SD level 1 - SG4(CPU D), SG3(CPU C) SD level 2 - SG6(CPUC, CPU D), SG5(CPU A, CPU B) RD
In the above system if CPUA is overutilized, the overutilized flag is set at SG5(parent sched domain first sched group). Similarly if CPUB is overutilized, the flag is set at SG5. During load balancing, at SD level 1, the overutilized flag is checked at the parent sched domain, first sched group level(SG5). If there is no parent sched domain, then the flag is set/checked at the root domain. This ensures that load balancing happens irrespective of which cpu is over utilized in a sched domain.
So for SD level 1, any CPU is overutilized then it will set overutilized flag for SD level 1.
What are the criteria for setting the root domain flag? From reading the code, even if the system has only one overutilized CPU it is still possible for the root domain's overutilized flag to be set, right?
Signed-off-by: Thara Gopinath thara.gopinath@linaro.org
kernel/sched/fair.c | 108 ++++++++++++++++++++++++++++++++++++++++++--------- kernel/sched/sched.h | 1 + 2 files changed, 90 insertions(+), 19 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 01fa969..0c97e0a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4559,6 +4559,36 @@ static inline void hrtick_update(struct rq *rq)
static bool cpu_overutilized(int cpu);
+static bool +is_sd_overutilized(struct sched_domain *sd, struct root_domain *rd) +{
- if (sd && sd->parent)
return sd->parent->groups->overutilized;
- if (!rd)
return false;
- return rd->overutilized;
+}
+static void +set_sd_overutilized(struct sched_domain *sd, struct root_domain *rd) +{
- if (sd && sd->parent)
sd->parent->groups->overutilized = true;
- else if (rd)
rd->overutilized = true;
+}
+static void +clear_sd_overutilized(struct sched_domain *sd, struct root_domain *rd) +{
- if (sd && sd->parent)
sd->parent->groups->overutilized = false;
- else if (rd)
rd->overutilized = false;
+}
/*
- The enqueue_task method is called before nr_running is
- increased. Here we update the fair scheduling stats and
@@ -4568,6 +4598,7 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq;
- struct sched_domain *sd; struct sched_entity *se = &p->se; int task_new = !(flags & ENQUEUE_WAKEUP);
@@ -4603,9 +4634,12 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (!se) { add_nr_running(rq, 1);
if (!task_new && !rq->rd->overutilized &&
cpu_overutilized(rq->cpu))
rq->rd->overutilized = true;
rcu_read_lock();
sd = rcu_dereference(rq->sd);
if (!task_new && !is_sd_overutilized(sd, rq->rd) &&
cpu_overutilized(rq->cpu))
set_sd_overutilized(sd, rq->rd);
} hrtick_update(rq);rcu_read_unlock();
} @@ -5989,8 +6023,6 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu) unsigned long max_spare = 0; struct sched_domain *sd;
rcu_read_lock();
sd = rcu_dereference(per_cpu(sd_ea, prev_cpu));
if (!sd)
@@ -6028,7 +6060,6 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu) }
unlock:
rcu_read_unlock();
if (energy_cpu == prev_cpu && !cpu_overutilized(prev_cpu)) return prev_cpu;
@@ -6063,10 +6094,16 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); }
- if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized))
return select_energy_cpu_brute(p, prev_cpu);
- rcu_read_lock();
- sd = rcu_dereference(cpu_rq(prev_cpu)->sd);
- if (energy_aware() &&
!is_sd_overutilized(sd,
cpu_rq(cpu)->rd)) {
new_cpu = select_energy_cpu_brute(p, prev_cpu);
goto unlock;
- }
- sd = NULL;
Is it better to place function select_energy_cpu_brute() out of rcu locking, like below?
rcu_read_lock(); sd = rcu_dereference(cpu_rq(prev_cpu)->sd); is_overutilized = is_sd_overutilized(sd, cpu_rq(cpu)->rd)); rcu_read_unlock();
if (energy_aware() && !is_overutilized) return select_energy_cpu_brute(p, prev_cpu);
for_each_domain(cpu, tmp) { if (!(tmp->flags & SD_LOAD_BALANCE)) break; @@ -6131,6 +6168,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f } /* while loop will break here if sd == NULL */ }
+unlock: rcu_read_unlock();
return new_cpu; @@ -7178,6 +7217,7 @@ struct sd_lb_stats { struct sched_group *local; /* Local group in this sd */ unsigned long total_load; /* Total load of all groups in sd */ unsigned long total_capacity; /* Total capacity of all groups in sd */
unsigned long total_util; /* Total util of all groups in sd */ unsigned long avg_load; /* Average load across all groups in sd */
struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
@@ -7197,6 +7237,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) .local = NULL, .total_load = 0UL, .total_capacity = 0UL,
.busiest_stat = { .avg_load = 0UL, .sum_nr_running = 0,.total_util = 0UL,
@@ -7692,6 +7733,7 @@ next_group: /* Now, start updating sd_lb_stats */ sds->total_load += sgs->group_load; sds->total_capacity += sgs->group_capacity;
sds->total_util += sgs->group_util;
sg = sg->next; } while (sg != env->sd->groups);
@@ -7701,17 +7743,26 @@ next_group:
env->src_grp_nr_running = sds->busiest_stat.sum_nr_running;
- /* Setting overutilized flag might not be necessary here
* Revisit
if (!lb_sd_parent(env->sd)) { /* update overload indicator if we are at root domain */ if (env->dst_rq->rd->overload != overload) env->dst_rq->rd->overload = overload;*/
- }
/* Update over-utilization (tipping point, U >= 0) indicator */
if (env->dst_rq->rd->overutilized != overutilized)
env->dst_rq->rd->overutilized = overutilized;
- } else {
if (!env->dst_rq->rd->overutilized && overutilized)
env->dst_rq->rd->overutilized = true;
- if (overutilized)
set_sd_overutilized(env->sd, env->dst_rq->rd);
If it's not overutilized, here should call function clear_sd_overutilized()? The old code clears root domain flag.
- /* If the domain util is greater that domain capacity, load balancing
* needs to be done at the next sched domain level as well
*/
- if (sds->total_capacity * 1024 < sds->total_util * capacity_margin) {
/* If already at the highest domain nothing can be done */
if (env->sd->parent)
set_sd_overutilized(env->sd->parent,
env->dst_rq->rd);
So usually this will set the root domain's flag only after the whole sched domain's util is greater than the domain capacity. If a CPU has one "misfit" task then the scheduler will not reach this condition, so the root domain's flag will not be set, which introduces a delay in migrating the task.
} }
@@ -7932,8 +7983,11 @@ static struct sched_group *find_busiest_group(struct lb_env *env) */ update_sd_lb_stats(env, &sds);
- if (energy_aware() && !env->dst_rq->rd->overutilized)
goto out_balanced;
/* Is this check really required here?? Revisit */
if (energy_aware()) {
if (!is_sd_overutilized(env->sd, env->dst_rq->rd))
goto out_balanced;
}
local = &sds.local_stat; busiest = &sds.busiest_stat;
@@ -8000,6 +8054,12 @@ static struct sched_group *find_busiest_group(struct lb_env *env) force_balance: /* Looks like there is an imbalance. Compute it */ calculate_imbalance(env, &sds);
- /* Is this the correct place to clear this flag? Should access
* to flag be locked? Revisit.
*/
- clear_sd_overutilized(env->sd, env->dst_rq->rd);
Have specific sequency to clear overutilized flag for root domain and SD level 2? Like firstly clean root domain flag and then clear SD level 2 flag.
return sds.busiest;
out_balanced: @@ -8790,6 +8850,11 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
rcu_read_lock(); for_each_domain(cpu, sd) {
if (energy_aware()) {
if (!is_sd_overutilized(sd, rq->rd))
continue;
}
- /*
- Decay the newidle max times here because this is a regular
- visit to all the domains. Decay ~1% per second.
@@ -9083,6 +9148,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) { struct cfs_rq *cfs_rq; struct sched_entity *se = &curr->se;
struct sched_domain *sd;
for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se);
@@ -9092,8 +9158,12 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) if (static_branch_unlikely(&sched_numa_balancing)) task_tick_numa(rq, curr);
- if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr)))
rq->rd->overutilized = true;
- rcu_read_lock();
- sd = rcu_dereference(rq->sd);
- if (!is_sd_overutilized(sd, rq->rd) &&
cpu_overutilized(task_cpu(curr)))
set_sd_overutilized(sd, rq->rd);
- rcu_read_unlock();
}
/* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f99391d..90c48ac 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -913,6 +913,7 @@ struct sched_group { unsigned int group_weight; struct sched_group_capacity *sgc; const struct sched_group_energy const *sge;
bool overutilized;
/*
- The CPUs this group covers.
-- 2.1.4
Hi Leo,
Thanks for the review. It will be great if you can test this patch out and let me know of any improvements/degradation.
On 12/08/2016 05:24 AM, Leo Yan wrote:
Hi Thara,
Good to see the patch; please see some questions.
On Wed, Dec 07, 2016 at 05:22:37PM -0500, Thara Gopinath wrote:
The current implementation of overutilization aborts energy aware scheduling if any cpu in the system is over-utilized. This patch introduces an over-utilization flag per sched group level instead of a single flag system wide. Load balancing is done at the sched domain where any of the sched groups is over utilized. If energy aware scheduling is enabled and no sched group in a sched domain is overutilized, load balancing is skipped for that sched domain and energy aware scheduling continues at that level.
The implementation is based on two points
- For every cpu in every sched domain the first group is the group that contains the cpu itself.
- sched groups are shared between cpus.
Thus if a sched group is overutilized the overutilized flag is set at the first sched group of the parent sched domain. This ensures a load balancing at the overutilized sched domain level. For example consider a big little system with two little cpu's (CPU A and CPU B) and two big cpu's (CPU C and CPU D). In this system, the hierarchy will be as follows CPU A SD level 1 - SG1 (CPUA), SG2 (CPUB) SD level 2 - SG5(CPUA, CPUB), SG6(CPU C, CPU D) RD
CPU B SD level 1 - SG2(CPUB), SG1 (CPUA) SD level 2 - SG5(CPU A, CPU B), SG6(CPU C, CPUD) RD
CPU C SD level 1 - SG3(CPU C), SG4 (CPUD) SD level 2 - SG6(CPUC, CPUD), SG5(CPUA, CPU B) RD
CPU D SD level 1 - SG4(CPU D), SG3(CPU C) SD level 2 - SG6(CPUC, CPU D), SG5(CPU A, CPU B) RD
In the above system if CPUA is overutilized, the overutilized flag is set at SG5(parent sched domain first sched group). Similarly if CPUB is overutilized, the flag is set at SG5. During load balancing, at SD level 1, the overutilized flag is checked at the parent sched domain, first sched group level(SG5). If there is no parent sched domain, then the flag is set/checked at the root domain. This ensures that load balancing happens irrespective of which cpu is over utilized in a sched domain.
So for SD level 1, any CPU is overutilized then it will set overutilized flag for SD level 1.
SD is cpu specific. If CPU A or CPU B is overutlized, the flag is set at SG5 level. This way irrespective of which cpu load balancing is happening from, load is balanced between CPU A and CPU B.
What's the criteria to set the root domain flag? I read the code, if only has one overutilized CPU in system and it still possible to set root domain's overutilized flag. right?
In the above example if both CPUA and CPU B are over-utilized(which means for both CPU A and CPU B, SD level 1 is over-utilized which in turn means SG5 is overutilized), the load balancing has to happen at the next level between SG5 and SG6. In this case the flag is set at the RD level.
Signed-off-by: Thara Gopinath thara.gopinath@linaro.org
kernel/sched/fair.c | 108 ++++++++++++++++++++++++++++++++++++++++++--------- kernel/sched/sched.h | 1 + 2 files changed, 90 insertions(+), 19 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 01fa969..0c97e0a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4559,6 +4559,36 @@ static inline void hrtick_update(struct rq *rq)
static bool cpu_overutilized(int cpu);
+static bool +is_sd_overutilized(struct sched_domain *sd, struct root_domain *rd) +{
- if (sd && sd->parent)
return sd->parent->groups->overutilized;
- if (!rd)
return false;
- return rd->overutilized;
+}
+static void +set_sd_overutilized(struct sched_domain *sd, struct root_domain *rd) +{
- if (sd && sd->parent)
sd->parent->groups->overutilized = true;
- else if (rd)
rd->overutilized = true;
+}
+static void +clear_sd_overutilized(struct sched_domain *sd, struct root_domain *rd) +{
- if (sd && sd->parent)
sd->parent->groups->overutilized = false;
- else if (rd)
rd->overutilized = false;
+}
/*
- The enqueue_task method is called before nr_running is
- increased. Here we update the fair scheduling stats and
@@ -4568,6 +4598,7 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq;
- struct sched_domain *sd; struct sched_entity *se = &p->se; int task_new = !(flags & ENQUEUE_WAKEUP);
@@ -4603,9 +4634,12 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (!se) { add_nr_running(rq, 1);
if (!task_new && !rq->rd->overutilized &&
cpu_overutilized(rq->cpu))
rq->rd->overutilized = true;
rcu_read_lock();
sd = rcu_dereference(rq->sd);
if (!task_new && !is_sd_overutilized(sd, rq->rd) &&
cpu_overutilized(rq->cpu))
set_sd_overutilized(sd, rq->rd);
} hrtick_update(rq);rcu_read_unlock();
} @@ -5989,8 +6023,6 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu) unsigned long max_spare = 0; struct sched_domain *sd;
rcu_read_lock();
sd = rcu_dereference(per_cpu(sd_ea, prev_cpu));
if (!sd)
@@ -6028,7 +6060,6 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu) }
unlock:
rcu_read_unlock();
if (energy_cpu == prev_cpu && !cpu_overutilized(prev_cpu)) return prev_cpu;
@@ -6063,10 +6094,16 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); }
- if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized))
return select_energy_cpu_brute(p, prev_cpu);
- rcu_read_lock();
- sd = rcu_dereference(cpu_rq(prev_cpu)->sd);
- if (energy_aware() &&
!is_sd_overutilized(sd,
cpu_rq(cpu)->rd)) {
new_cpu = select_energy_cpu_brute(p, prev_cpu);
goto unlock;
- }
- sd = NULL;
Is it better to place function select_energy_cpu_brute() out of rcu locking, like below?
I don't understand how this will change anything. In the above code, select_energy_cpu_brute() is rcu protected.
rcu_read_lock(); sd = rcu_dereference(cpu_rq(prev_cpu)->sd); is_overutilized = is_sd_overutilized(sd, cpu_rq(cpu)->rd)); rcu_read_unlock(); if (energy_aware() && !is_overutilized) return select_energy_cpu_brute(p, prev_cpu);
for_each_domain(cpu, tmp) { if (!(tmp->flags & SD_LOAD_BALANCE)) break; @@ -6131,6 +6168,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f } /* while loop will break here if sd == NULL */ }
+unlock: rcu_read_unlock();
return new_cpu; @@ -7178,6 +7217,7 @@ struct sd_lb_stats { struct sched_group *local; /* Local group in this sd */ unsigned long total_load; /* Total load of all groups in sd */ unsigned long total_capacity; /* Total capacity of all groups in sd */
unsigned long total_util; /* Total util of all groups in sd */ unsigned long avg_load; /* Average load across all groups in sd */
struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
@@ -7197,6 +7237,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) .local = NULL, .total_load = 0UL, .total_capacity = 0UL,
.busiest_stat = { .avg_load = 0UL, .sum_nr_running = 0,.total_util = 0UL,
@@ -7692,6 +7733,7 @@ next_group: /* Now, start updating sd_lb_stats */ sds->total_load += sgs->group_load; sds->total_capacity += sgs->group_capacity;
sds->total_util += sgs->group_util;
sg = sg->next; } while (sg != env->sd->groups);
@@ -7701,17 +7743,26 @@ next_group:
env->src_grp_nr_running = sds->busiest_stat.sum_nr_running;
- /* Setting overutilized flag might not be necessary here
* Revisit
if (!lb_sd_parent(env->sd)) { /* update overload indicator if we are at root domain */ if (env->dst_rq->rd->overload != overload) env->dst_rq->rd->overload = overload;*/
- }
/* Update over-utilization (tipping point, U >= 0) indicator */
if (env->dst_rq->rd->overutilized != overutilized)
env->dst_rq->rd->overutilized = overutilized;
- } else {
if (!env->dst_rq->rd->overutilized && overutilized)
env->dst_rq->rd->overutilized = true;
- if (overutilized)
set_sd_overutilized(env->sd, env->dst_rq->rd);
If it's not overutilized, here should call function clear_sd_overutilized()? The old code clears root domain flag.
Yes. May be we should. I am not sure about it. Because we clear the flag at a different place in this implementation.
- /* If the domain util is greater that domain capacity, load balancing
* needs to be done at the next sched domain level as well
*/
- if (sds->total_capacity * 1024 < sds->total_util * capacity_margin) {
/* If already at the highest domain nothing can be done */
if (env->sd->parent)
set_sd_overutilized(env->sd->parent,
env->dst_rq->rd);
So usually this will set root domain's flag after the whole schedule domain util greater than domain capacity. If CPU has one "misfit" task then scheduler will not reach this condition, so this will not set root domain's flag and introduce delay to migrate task.
Hmm yes. you are correct. we may have to handle misfit tasks separately.
} }
@@ -7932,8 +7983,11 @@ static struct sched_group *find_busiest_group(struct lb_env *env) */ update_sd_lb_stats(env, &sds);
- if (energy_aware() && !env->dst_rq->rd->overutilized)
goto out_balanced;
/* Is this check really required here?? Revisit */
if (energy_aware()) {
if (!is_sd_overutilized(env->sd, env->dst_rq->rd))
goto out_balanced;
}
local = &sds.local_stat; busiest = &sds.busiest_stat;
@@ -8000,6 +8054,12 @@ static struct sched_group *find_busiest_group(struct lb_env *env) force_balance: /* Looks like there is an imbalance. Compute it */ calculate_imbalance(env, &sds);
- /* Is this the correct place to clear this flag? Should access
* to flag be locked? Revisit.
*/
- clear_sd_overutilized(env->sd, env->dst_rq->rd);
Have specific sequency to clear overutilized flag for root domain and SD level 2? Like firstly clean root domain flag and then clear SD level 2 flag.
We don't need a specific sequence. As and when each sched domain get balanced, the corresponding flag will get cleared.
return sds.busiest;
out_balanced: @@ -8790,6 +8850,11 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
rcu_read_lock(); for_each_domain(cpu, sd) {
if (energy_aware()) {
if (!is_sd_overutilized(sd, rq->rd))
continue;
}
- /*
- Decay the newidle max times here because this is a regular
- visit to all the domains. Decay ~1% per second.
@@ -9083,6 +9148,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) { struct cfs_rq *cfs_rq; struct sched_entity *se = &curr->se;
struct sched_domain *sd;
for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se);
@@ -9092,8 +9158,12 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) if (static_branch_unlikely(&sched_numa_balancing)) task_tick_numa(rq, curr);
- if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr)))
rq->rd->overutilized = true;
- rcu_read_lock();
- sd = rcu_dereference(rq->sd);
- if (!is_sd_overutilized(sd, rq->rd) &&
cpu_overutilized(task_cpu(curr)))
set_sd_overutilized(sd, rq->rd);
- rcu_read_unlock();
}
/* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f99391d..90c48ac 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -913,6 +913,7 @@ struct sched_group { unsigned int group_weight; struct sched_group_capacity *sgc; const struct sched_group_energy const *sge;
bool overutilized;
/*
- The CPUs this group covers.
-- 2.1.4
-- Regards Thara
On Thu, Dec 08, 2016 at 11:42:50AM -0500, Thara Gopinath wrote:
Hi Leo,
Thanks for the review. It will be great if you can test this patch out and let me know of any improvements/degradation.
Yeah, will test it and let you know result.
On 12/08/2016 05:24 AM, Leo Yan wrote:
[...]
@@ -6063,10 +6094,16 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); }
- if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized))
return select_energy_cpu_brute(p, prev_cpu);
- rcu_read_lock();
- sd = rcu_dereference(cpu_rq(prev_cpu)->sd);
- if (energy_aware() &&
!is_sd_overutilized(sd,
cpu_rq(cpu)->rd)) {
new_cpu = select_energy_cpu_brute(p, prev_cpu);
goto unlock;
- }
- sd = NULL;
Is it better to place function select_energy_cpu_brute() out of rcu locking, like below?
I dont understand how this will change anything. In the above code, select_energy_cpu_brute() is rcu protected.
Your code moves function select_energy_cpu_brute() into RCU protection.
To be honest, I'm not sure whether this change will hurt performance or not; the usual practice is to avoid moving much work inside locked/RCU-protected sections.
[...]
@@ -7932,8 +7983,11 @@ static struct sched_group *find_busiest_group(struct lb_env *env) */ update_sd_lb_stats(env, &sds);
- if (energy_aware() && !env->dst_rq->rd->overutilized)
goto out_balanced;
/* Is this check really required here?? Revisit */
if (energy_aware()) {
if (!is_sd_overutilized(env->sd, env->dst_rq->rd))
goto out_balanced;
}
local = &sds.local_stat; busiest = &sds.busiest_stat;
@@ -8000,6 +8054,12 @@ static struct sched_group *find_busiest_group(struct lb_env *env) force_balance: /* Looks like there is an imbalance. Compute it */ calculate_imbalance(env, &sds);
- /* Is this the correct place to clear this flag? Should access
* to flag be locked? Revisit.
*/
- clear_sd_overutilized(env->sd, env->dst_rq->rd);
Is there a specific sequence to clear the overutilized flag for the root domain and SD level 2? Like firstly clearing the root domain flag and then clearing the SD level 2 flag.
We don't need a specific sequence. As and when each sched domain get balanced, the corresponding flag will get cleared.
Have you observed that the overutilized flag is easier to clear? If we clear the overutilized flag here, it means the overutilized flag is cleared after every load balance. This will let the overutilized flag be cleared very quickly, so it should have much less time to stay "overutilized" than before.
We could run rt-app cases for more analysis.
Thanks, Leo Yan
Hi Thara,
Thanks for sharing your proposal.
On Thu, Dec 08, 2016 at 11:42:50AM -0500, Thara Gopinath wrote:
On 12/08/2016 05:24 AM, Leo Yan wrote:
On Wed, Dec 07, 2016 at 05:22:37PM -0500, Thara Gopinath wrote:
@@ -7701,17 +7743,26 @@ next_group:
env->src_grp_nr_running = sds->busiest_stat.sum_nr_running;
- /* Setting overutilized flag might not be necessary here
- Revisit
- */ if (!lb_sd_parent(env->sd)) { /* update overload indicator if we are at root domain */ if (env->dst_rq->rd->overload != overload) env->dst_rq->rd->overload = overload;
- }
/* Update over-utilization (tipping point, U >= 0) indicator */
if (env->dst_rq->rd->overutilized != overutilized)
env->dst_rq->rd->overutilized = overutilized;
- } else {
if (!env->dst_rq->rd->overutilized && overutilized)
env->dst_rq->rd->overutilized = true;
- if (overutilized)
set_sd_overutilized(env->sd, env->dst_rq->rd);
If it's not overutilized, here should call function clear_sd_overutilized()? The old code clears root domain flag.
Yes. May be we should. I am not sure about it. Because we clear the flag at a different place in this implementation.
I think Leo is right. If we have just visited all cpus in the sched_domain and not found that any of them are over-utilized, we should be able to clear the flag for the domain. I don't see why we shouldn't if the intention with the flag is to indicate whether the domain is over-utilized or not?
- /* If the domain util is greater than domain capacity, load balancing
- needs to be done at the next sched domain level as well
- */
- if (sds->total_capacity * 1024 < sds->total_util * capacity_margin) {
/* If already at the highest domain nothing can be done */
if (env->sd->parent)
set_sd_overutilized(env->sd->parent,
env->dst_rq->rd);
So usually this will set root domain's flag after the whole schedule domain util greater than domain capacity. If CPU has one "misfit" task then scheduler will not reach this condition, so this will not set root domain's flag and introduce delay to migrate task.
Hmm yes. you are correct. we may have to handle misfit tasks separately.
I don't quite understand why we have to set the flag on the parent sched_domain here. It should be set anyway when update_sd_lb_stats() is called for the parent sched_domain in the next iteration of for_each_domain() in rebalance_domains(). But I see that we bail out early there, which I don't quite understand either. More on that in a separate reply ;-)
I agree with Leo that total_util doesn't flag some quite important cases as overutilized.
} }
@@ -7932,8 +7983,11 @@ static struct sched_group *find_busiest_group(struct lb_env *env) */ update_sd_lb_stats(env, &sds);
- if (energy_aware() && !env->dst_rq->rd->overutilized)
goto out_balanced;
/* Is this check really required here?? Revisit */
if (energy_aware()) {
if (!is_sd_overutilized(env->sd, env->dst_rq->rd))
goto out_balanced;
}
local = &sds.local_stat; busiest = &sds.busiest_stat;
@@ -8000,6 +8054,12 @@ static struct sched_group *find_busiest_group(struct lb_env *env) force_balance: /* Looks like there is an imbalance. Compute it */ calculate_imbalance(env, &sds);
- /* Is this the correct place to clear this flag? Should access
- to flag be locked? Revisit.
- */
- clear_sd_overutilized(env->sd, env->dst_rq->rd);
Is there a specific sequence to clear the overutilized flag for the root domain and SD level 2? Like firstly clearing the root domain flag and then clearing the SD level 2 flag.
We don't need a specific sequence. As and when each sched domain get balanced, the corresponding flag will get cleared.
IIUC, it isn't quite right to clear the flag here. The sched_domain may have more than two groups, for example a 4+4 big.LITTLE topology has four groups at the lowest level. Load-balance may resolve over-utilization in one sched_group, but one or more other groups might still be over-utilized
They won't be balanced this time anyway, so the flag might be re-enabled again at the next load-balance, so it may not be a problem, it just makes it a bit harder to understand.
Even in the case where we only have two sched_groups, load-balance isn't guaranteed to result in no groups being over-utilized. The source group may be so heavily loaded that it over-utilizes the destination group too after the load has been balanced.
Morten IMPORTANT NOTICE: The contents of this email and any attachments are confidential and may also be privileged. If you are not the intended recipient, please notify the sender immediately and do not disclose the contents to any other person, use it for any purpose, or store or copy the information in any medium. Thank you.
On 9 December 2016 at 18:06, Morten Rasmussen morten.rasmussen@arm.com wrote:
Hi Thara,
Thanks for sharing your proposal.
On Thu, Dec 08, 2016 at 11:42:50AM -0500, Thara Gopinath wrote:
On 12/08/2016 05:24 AM, Leo Yan wrote:
On Wed, Dec 07, 2016 at 05:22:37PM -0500, Thara Gopinath wrote:
@@ -7701,17 +7743,26 @@ next_group:
env->src_grp_nr_running = sds->busiest_stat.sum_nr_running;
- /* Setting overutilized flag might not be necessary here
- Revisit
- */ if (!lb_sd_parent(env->sd)) { /* update overload indicator if we are at root domain */ if (env->dst_rq->rd->overload != overload) env->dst_rq->rd->overload = overload;
- }
/* Update over-utilization (tipping point, U >= 0) indicator */
if (env->dst_rq->rd->overutilized != overutilized)
env->dst_rq->rd->overutilized = overutilized;
- } else {
if (!env->dst_rq->rd->overutilized && overutilized)
env->dst_rq->rd->overutilized = true;
- if (overutilized)
set_sd_overutilized(env->sd, env->dst_rq->rd);
If it's not overutilized, here should call function clear_sd_overutilized()? The old code clears root domain flag.
Yes. May be we should. I am not sure about it. Because we clear the flag at a different place in this implementation.
I think Leo is right. If we have just visited all cpus in the sched_domain and not found that any of them are over-utilized, we should be able to clear the flag for the domain. I don't see why we shouldn't if the intention with the flag is to indicate whether the domain is over-utilized or not?
- /* If the domain util is greater than domain capacity, load balancing
- needs to be done at the next sched domain level as well
- */
- if (sds->total_capacity * 1024 < sds->total_util * capacity_margin) {
/* If already at the highest domain nothing can be done */
if (env->sd->parent)
set_sd_overutilized(env->sd->parent,
env->dst_rq->rd);
So usually this will set root domain's flag after the whole schedule domain util greater than domain capacity. If CPU has one "misfit" task then scheduler will not reach this condition, so this will not set root domain's flag and introduce delay to migrate task.
Hmm yes. you are correct. we may have to handle misfit tasks separately.
I don't quite understand why we have to set the flag on the parent sched_domain here. It should be set anyways when update_sd_lb_stats() is
But the load balance will not be triggered if the flag is not set and update_sd_lb_stats will not be called. So once we have updated the statistics of the current domain and we consider that this domain is overutilized, we set the overutilized flag to the parent so the load balance will happen at the next level involving more cpus
called for the parent sched_domain in the next iteration of for_each_domain() in rebalance_domains(). But I see that we bail out early there, which I don't quite understand either. More on that in a separate reply ;-)
I agree with Leo that total_util doesn't flag some quite important cases as overutilized.
yes the case of a single task that requires more capacity that the max capacity_orig of cpus in the domain is not tested
} }
@@ -7932,8 +7983,11 @@ static struct sched_group *find_busiest_group(struct lb_env *env) */ update_sd_lb_stats(env, &sds);
- if (energy_aware() && !env->dst_rq->rd->overutilized)
goto out_balanced;
/* Is this check really required here?? Revisit */
if (energy_aware()) {
if (!is_sd_overutilized(env->sd, env->dst_rq->rd))
goto out_balanced;
}
local = &sds.local_stat; busiest = &sds.busiest_stat;
@@ -8000,6 +8054,12 @@ static struct sched_group *find_busiest_group(struct lb_env *env) force_balance: /* Looks like there is an imbalance. Compute it */ calculate_imbalance(env, &sds);
- /* Is this the correct place to clear this flag? Should access
- to flag be locked? Revisit.
- */
- clear_sd_overutilized(env->sd, env->dst_rq->rd);
Is there a specific sequence to clear the overutilized flag for the root domain and SD level 2? Like firstly clearing the root domain flag and then clearing the SD level 2 flag.
We don't need a specific sequence. As and when each sched domain get balanced, the corresponding flag will get cleared.
IIUC, it isn't quite right to clear the flag here. The sched_domain may have more than two groups, for example a 4+4 big.LITTLE topology has four groups at the lowest level. Load-balance may resolve over-utilization in one sched_group, but one or more other groups might still be over-utilized
They won't be balanced this time anyway, so the flag might be re-enabled again at the next load-balance, so it may not be a problem, it just makes it a bit harder to understand.
Even in the case where we only have two sched_groups, load-balance isn't guaranteed to result in no groups being over-utilized. The source group may be so heavily loaded that it over-utilizes the destination group too after the load has been balanced.
Morten IMPORTANT NOTICE: The contents of this email and any attachments are confidential and may also be privileged. If you are not the intended recipient, please notify the sender immediately and do not disclose the contents to any other person, use it for any purpose, or store or copy the information in any medium. Thank you.
On Mon, Dec 12, 2016 at 05:00:10PM +0100, Vincent Guittot wrote:
On 9 December 2016 at 18:06, Morten Rasmussen morten.rasmussen@arm.com wrote:
On Thu, Dec 08, 2016 at 11:42:50AM -0500, Thara Gopinath wrote:
On 12/08/2016 05:24 AM, Leo Yan wrote:
On Wed, Dec 07, 2016 at 05:22:37PM -0500, Thara Gopinath wrote:
- /* If the domain util is greater than domain capacity, load balancing
- needs to be done at the next sched domain level as well
- */
- if (sds->total_capacity * 1024 < sds->total_util * capacity_margin) {
/* If already at the highest domain nothing can be done */
if (env->sd->parent)
set_sd_overutilized(env->sd->parent,
env->dst_rq->rd);
So usually this will set root domain's flag after the whole schedule domain util greater than domain capacity. If CPU has one "misfit" task then scheduler will not reach this condition, so this will not set root domain's flag and introduce delay to migrate task.
Hmm yes. you are correct. we may have to handle misfit tasks separately.
I don't quite understand why we have to set the flag on the parent sched_domain here. It should be set anyways when update_sd_lb_stats() is
But the load balance will not be triggered if the flag is not set and update_sd_lb_stats will not be called. So once we have updated the statistics of the current domain and we consider that this domain is overutilized, we set the overutilized flag to the parent so the load balance will happen at the next level involving more cpus
Okay. IIUC, we one criteria for setting the flag at the current level, and another one for setting it at the next level to 'call for help'.
To me it seems that we have distinct scenarios that possibly require a flag each: One to tell that we need to balance within the domain, we are sure that we can fix things locally, and another one tell we need help from a another domain, i.e. we need to balance at the parent domain level.
called for the parent sched_domain in the next iteration of for_each_domain() in rebalance_domains(). But I see that we bail out early there, which I don't quite understand either. More on that in a separate reply ;-)
I agree with Leo that total_util doesn't flag some quite important cases as overutilized.
yes the case of a single task that requires more capacity that the max capacity_orig of cpus in the domain is not tested
Okay. I think we need to factor that in, otherwise the solution won't work for asymmetric cpu capacity systems.
Thanks, Morten IMPORTANT NOTICE: The contents of this email and any attachments are confidential and may also be privileged. If you are not the intended recipient, please notify the sender immediately and do not disclose the contents to any other person, use it for any purpose, or store or copy the information in any medium. Thank you.
Hello Morten,
Thanks for the review. Sorry for the delay in reply.
On 12/09/2016 12:06 PM, Morten Rasmussen wrote:
Hi Thara,
Thanks for sharing your proposal.
On Thu, Dec 08, 2016 at 11:42:50AM -0500, Thara Gopinath wrote:
On 12/08/2016 05:24 AM, Leo Yan wrote:
On Wed, Dec 07, 2016 at 05:22:37PM -0500, Thara Gopinath wrote:
@@ -7701,17 +7743,26 @@ next_group:
env->src_grp_nr_running = sds->busiest_stat.sum_nr_running;
- /* Setting overutilized flag might not be necessary here
- Revisit
- */ if (!lb_sd_parent(env->sd)) { /* update overload indicator if we are at root domain */ if (env->dst_rq->rd->overload != overload) env->dst_rq->rd->overload = overload;
- }
/* Update over-utilization (tipping point, U >= 0) indicator */
if (env->dst_rq->rd->overutilized != overutilized)
env->dst_rq->rd->overutilized = overutilized;
- } else {
if (!env->dst_rq->rd->overutilized && overutilized)
env->dst_rq->rd->overutilized = true;
- if (overutilized)
set_sd_overutilized(env->sd, env->dst_rq->rd);
If it's not overutilized, here should call function clear_sd_overutilized()? The old code clears root domain flag.
Yes. May be we should. I am not sure about it. Because we clear the flag at a different place in this implementation.
I think Leo is right. If we have just visited all cpus in the sched_domain and not found that any of them are over-utilized, we should be able to clear the flag for the domain. I don't see why we shouldn't if the intention with the flag is to indicate whether the domain is over-utilized or not?
Yes I will clear the flag here.
- /* If the domain util is greater than domain capacity, load balancing
- needs to be done at the next sched domain level as well
- */
- if (sds->total_capacity * 1024 < sds->total_util * capacity_margin) {
/* If already at the highest domain nothing can be done */
if (env->sd->parent)
set_sd_overutilized(env->sd->parent,
env->dst_rq->rd);
So usually this will set root domain's flag after the whole schedule domain util greater than domain capacity. If CPU has one "misfit" task then scheduler will not reach this condition, so this will not set root domain's flag and introduce delay to migrate task.
Hmm yes. you are correct. we may have to handle misfit tasks separately.
I don't quite understand why we have to set the flag on the parent sched_domain here. It should be set anyway when update_sd_lb_stats() is called for the parent sched_domain in the next iteration of for_each_domain() in rebalance_domains(). But I see that we bail out early there, which I don't quite understand either. More on that in a separate reply ;-)
We bail out in rebalance_domains if no group is over loaded in a sched domain. Setting the flag here ensures that the domain is not skipped in rebalance_domain.
I agree with Leo that total_util doesn't flag some quite important cases as overutilized.
} }
@@ -7932,8 +7983,11 @@ static struct sched_group *find_busiest_group(struct lb_env *env) */ update_sd_lb_stats(env, &sds);
- if (energy_aware() && !env->dst_rq->rd->overutilized)
goto out_balanced;
/* Is this check really required here?? Revisit */
if (energy_aware()) {
if (!is_sd_overutilized(env->sd, env->dst_rq->rd))
goto out_balanced;
}
local = &sds.local_stat; busiest = &sds.busiest_stat;
@@ -8000,6 +8054,12 @@ static struct sched_group *find_busiest_group(struct lb_env *env) force_balance: /* Looks like there is an imbalance. Compute it */ calculate_imbalance(env, &sds);
- /* Is this the correct place to clear this flag? Should access
- to flag be locked? Revisit.
- */
- clear_sd_overutilized(env->sd, env->dst_rq->rd);
Is there a specific sequence to clear the overutilized flag for the root domain and SD level 2? Like firstly clearing the root domain flag and then clearing the SD level 2 flag.
We don't need a specific sequence. As and when each sched domain get balanced, the corresponding flag will get cleared.
IIUC, it isn't quite right to clear the flag here. The sched_domain may have more than two groups, for example a 4+4 big.LITTLE topology has four groups at the lowest level. Load-balance may resolve over-utilization in one sched_group, but one or more other groups might still be over-utilized
I don't understand this. IIUC load balance is between the sched groups of a domain. Irrespective of how many groups are there call to load_balance() will balance the load between all the groups in the domain(affinity being an exception). Am I missing something here?
They won't be balanced this time anyway, so the flag might be re-enabled again at the next load-balance, so it may not be a problem, it just makes it a bit harder to understand.
Even in the case where we only have two sched_groups, load-balance isn't guaranteed to result in no groups being over-utilized. The source group may be so heavily loaded that it over-utilizes the destination group too after the load has been balanced.
This I agree. This is not the correct place. I can move this under out_balanced in load_balance function. Any other recommendation on where this flag can be conclusively cleared?
Morten IMPORTANT NOTICE: The contents of this email and any attachments are confidential and may also be privileged. If you are not the intended recipient, please notify the sender immediately and do not disclose the contents to any other person, use it for any purpose, or store or copy the information in any medium. Thank you.
-- Regards Thara
On Thu, Dec 15, 2016 at 03:37:12PM -0500, Thara Gopinath wrote:
Hello Morten,
Thanks for the review. Sorry for the delay in reply.
No problem.
On 12/09/2016 12:06 PM, Morten Rasmussen wrote:
On Thu, Dec 08, 2016 at 11:42:50AM -0500, Thara Gopinath wrote:
On 12/08/2016 05:24 AM, Leo Yan wrote:
On Wed, Dec 07, 2016 at 05:22:37PM -0500, Thara Gopinath wrote:
- /* If the domain util is greater than domain capacity, load balancing
- needs to be done at the next sched domain level as well
- */
- if (sds->total_capacity * 1024 < sds->total_util * capacity_margin) {
/* If already at the highest domain nothing can be done */
if (env->sd->parent)
set_sd_overutilized(env->sd->parent,
env->dst_rq->rd);
So usually this will set root domain's flag after the whole schedule domain util greater than domain capacity. If CPU has one "misfit" task then scheduler will not reach this condition, so this will not set root domain's flag and introduce delay to migrate task.
Hmm yes. you are correct. we may have to handle misfit tasks separately.
I don't quite understand why we have to set the flag on the parent sched_domain here. It should be set anyway when update_sd_lb_stats() is called for the parent sched_domain in the next iteration of for_each_domain() in rebalance_domains(). But I see that we bail out early there, which I don't quite understand either. More on that in a separate reply ;-)
We bail out in rebalance_domains if no group is over loaded in a sched domain. Setting the flag here ensures that the domain is not skipped in rebalance_domain.
Okay. I think idea of setting a flag on those domains that can safely be skipped is good, my concern is the conditions for setting it. Comparing total_util to total_capacity won't reveal 'misfit' tasks on big.LITTLE as Leo pointed out. I think your proposal should work fine on SMP platforms, but for big.LITTLE we need to set the flag for additional cases to deal with the 'misfit' tasks.
I agree with Leo that total_util doesn't flag some quite important cases as overutilized.
} }
@@ -7932,8 +7983,11 @@ static struct sched_group *find_busiest_group(struct lb_env *env) */ update_sd_lb_stats(env, &sds);
- if (energy_aware() && !env->dst_rq->rd->overutilized)
goto out_balanced;
/* Is this check really required here?? Revisit */
if (energy_aware()) {
if (!is_sd_overutilized(env->sd, env->dst_rq->rd))
goto out_balanced;
}
local = &sds.local_stat; busiest = &sds.busiest_stat;
@@ -8000,6 +8054,12 @@ static struct sched_group *find_busiest_group(struct lb_env *env) force_balance: /* Looks like there is an imbalance. Compute it */ calculate_imbalance(env, &sds);
- /* Is this the correct place to clear this flag? Should access
- to flag be locked? Revisit.
- */
- clear_sd_overutilized(env->sd, env->dst_rq->rd);
Is there a specific sequence to clear the overutilized flag for the root domain and SD level 2? Like firstly clearing the root domain flag and then clearing the SD level 2 flag.
We don't need a specific sequence. As and when each sched domain get balanced, the corresponding flag will get cleared.
IIUC, it isn't quite right to clear the flag here. The sched_domain may have more than two groups, for example a 4+4 big.LITTLE topology has four groups at the lowest level. Load-balance may resolve over-utilization in one sched_group, but one or more other groups might still be over-utilized
I don't understand this. IIUC load balance is between the sched groups of a domain. Irrespective of how many groups are there call to load_balance() will balance the load between all the groups in the domain(affinity being an exception). Am I missing something here?
You are right that load_balance() will eventually balance between all groups, but only after it has been called by a cpu in each group (minus one to be precise). load_balance() is pull-based, when it is called it will only pull tasks to the destination cpu, which is the cpu where the function call happens in most cases (nohz_idle_balance() is the exception). So you need to try to pull in all directions before things are balanced.
My point is that, pulling in one direction between two groups, doesn't guarantee that all over-utilization has been resolved. For example, if two groups are over-utilized, it will take two calls to load_balance() from non-over-utilized cpu(s) to resolve the problem as each call only pulls from one group. Clearing the over-utilized flag after the first load-balance() is therefore somewhat misleading.
They won't be balanced this time anyway, so the flag might be re-enabled again at the next load-balance, so it may not be a problem, it just makes it a bit harder to understand.
Even in the case where we only have two sched_groups, load-balance isn't guaranteed to result in no groups being over-utilized. The source group may be so heavily loaded that it over-utilizes the destination group too after the load has been balanced.
This I agree. This is not the correct place. I can move this under out_balanced in load_balance function. Any other recommendation on where this flag can be conclusively cleared?
If we want to keep the flag up until all over-utilization has been addressed, we can only clear it if all child domains have cleared theirs.
Maybe you can count the number of over-utilized child domains in update_sd_lb_stats(). If it is >1 then we know that we didn't fix all imbalances this time and we should leave the flag up. Would that work?
Morten IMPORTANT NOTICE: The contents of this email and any attachments are confidential and may also be privileged. If you are not the intended recipient, please notify the sender immediately and do not disclose the contents to any other person, use it for any purpose, or store or copy the information in any medium. Thank you.
Hi Morten,
On 12/16/2016 04:18 AM, Morten Rasmussen wrote:
On Thu, Dec 15, 2016 at 03:37:12PM -0500, Thara Gopinath wrote:
Hello Morten,
Thanks for the review. Sorry for the delay in reply.
No problem.
On 12/09/2016 12:06 PM, Morten Rasmussen wrote:
On Thu, Dec 08, 2016 at 11:42:50AM -0500, Thara Gopinath wrote:
On 12/08/2016 05:24 AM, Leo Yan wrote:
On Wed, Dec 07, 2016 at 05:22:37PM -0500, Thara Gopinath wrote:
- /* If the domain util is greater than domain capacity, load balancing
- needs to be done at the next sched domain level as well
- */
- if (sds->total_capacity * 1024 < sds->total_util * capacity_margin) {
/* If already at the highest domain nothing can be done */
if (env->sd->parent)
set_sd_overutilized(env->sd->parent,
env->dst_rq->rd);
So usually this will set root domain's flag after the whole schedule domain util greater than domain capacity. If CPU has one "misfit" task then scheduler will not reach this condition, so this will not set root domain's flag and introduce delay to migrate task.
Hmm yes. you are correct. we may have to handle misfit tasks separately.
I don't quite understand why we have to set the flag on the parent sched_domain here. It should be set anyway when update_sd_lb_stats() is called for the parent sched_domain in the next iteration of for_each_domain() in rebalance_domains(). But I see that we bail out early there, which I don't quite understand either. More on that in a separate reply ;-)
We bail out in rebalance_domains if no group is over loaded in a sched domain. Setting the flag here ensures that the domain is not skipped in rebalance_domain.
Okay. I think idea of setting a flag on those domains that can safely be skipped is good, my concern is the conditions for setting it. Comparing total_util to total_capacity won't reveal 'misfit' tasks on big.LITTLE as Leo pointed out. I think your proposal should work fine on SMP platforms, but for big.LITTLE we need to set the flag for additional cases to deal with the 'misfit' tasks.
Yes I agree with this one. I will try to incorporate this in the next version.
I agree with Leo that total_util doesn't flag some quite important cases as overutilized.
} }
@@ -7932,8 +7983,11 @@ static struct sched_group *find_busiest_group(struct lb_env *env) */ update_sd_lb_stats(env, &sds);
- if (energy_aware() && !env->dst_rq->rd->overutilized)
goto out_balanced;
/* Is this check really required here?? Revisit */
if (energy_aware()) {
if (!is_sd_overutilized(env->sd, env->dst_rq->rd))
goto out_balanced;
}
local = &sds.local_stat; busiest = &sds.busiest_stat;
@@ -8000,6 +8054,12 @@ static struct sched_group *find_busiest_group(struct lb_env *env) force_balance: /* Looks like there is an imbalance. Compute it */ calculate_imbalance(env, &sds);
- /* Is this the correct place to clear this flag? Should access
- to flag be locked? Revisit.
- */
- clear_sd_overutilized(env->sd, env->dst_rq->rd);
Is there a specific sequence to clear the overutilized flag for the root domain and SD level 2? Like firstly clearing the root domain flag and then clearing the SD level 2 flag.
We don't need a specific sequence. As and when each sched domain get balanced, the corresponding flag will get cleared.
IIUC, it isn't quite right to clear the flag here. The sched_domain may have more than two groups, for example a 4+4 big.LITTLE topology has four groups at the lowest level. Load-balance may resolve over-utilization in one sched_group, but one or more other groups might still be over-utilized
I don't understand this. IIUC load balance is between the sched groups of a domain. Irrespective of how many groups are there call to load_balance() will balance the load between all the groups in the domain(affinity being an exception). Am I missing something here?
You are right that load_balance() will eventually balance between all groups, but only after it has been called by a cpu in each group (minus one to be precise). load_balance() is pull-based, when it is called it will only pull tasks to the destination cpu, which is the cpu where the function call happens in most cases (nohz_idle_balance() is the exception). So you need to try to pull in all directions before things are balanced.
My point is that, pulling in one direction between two groups, doesn't guarantee that all over-utilization has been resolved. For example, if two groups are over-utilized, it will take two calls to load_balance() from non-over-utilized cpu(s) to resolve the problem as each call only pulls from one group. Clearing the over-utilized flag after the first load-balance() is therefore somewhat misleading.
They won't be balanced this time anyway, so the flag might be re-enabled again at the next load-balance, so it may not be a problem, it just makes it a bit harder to understand.
Even in the case where we only have two sched_groups, load-balance isn't guaranteed to result in no groups being over-utilized. The source group may be so heavily loaded that it over-utilizes the destination group too after the load has been balanced.
This I agree. This is not the correct place. I can move this under out_balanced in load_balance function. Any other recommendation on where this flag can be conclusively cleared?
If we want to keep the flag up until all over-utilization has been addressed, we can only clear it if all child domains have cleared theirs.
Maybe you can count the number of over-utilized child domains in update_sd_lb_stats(). If it is >1 then we know that we didn't fix all imbalances this time and we should leave the flag up. Would that work?
I have been thinking about this. I think clearing the flag in update_sd_lb if the group is not overutilized should be okay for now. If the group is overutilized the flag will remain set and the next cpu in the group can attempt a load balance. If the group is not overutilized, we anyways bail out in find_busiest_group. Do you think it makes sense?
Morten IMPORTANT NOTICE: The contents of this email and any attachments are confidential and may also be privileged. If you are not the intended recipient, please notify the sender immediately and do not disclose the contents to any other person, use it for any purpose, or store or copy the information in any medium. Thank you.
-- Regards Thara
On Wed, Dec 07, 2016 at 05:22:37PM -0500, Thara Gopinath wrote:
The current implementation of overutilization aborts energy aware scheduling if any cpu in the system is over-utilized. This patch introduces an over-utilization flag per sched group level instead of a single flag system wide. Load balancing is done at the sched domain where any of the sched groups is over utilized. If energy aware scheduling is enabled and no sched group in a sched domain is overutilized, load balancing is skipped for that sched domain and energy aware scheduling continues at that level.
The implementation is based on two points
- For every cpu in every sched domain the first group is the group that contains the cpu itself.
- sched groups are shared between cpus.
Thus if a sched group is overutilized the overutilized flag is set at the first sched group of the parent sched domain. This ensures a load balancing at the overutilized sched domain level. For example consider a big little system with two little cpu's (CPU A and CPU B) and two big cpu's (CPU C and CPU D). In this system, the hierarchy will be as follows CPU A SD level 1 - SG1 (CPUA), SG2 (CPUB) SD level 2 - SG5(CPUA, CPUB), SG6(CPU C, CPU D) RD
CPU B SD level 1 - SG2(CPUB), SG1 (CPUA) SD level 2 - SG5(CPU A, CPU B), SG6(CPU C, CPUD) RD
CPU C SD level 1 - SG3(CPU C), SG4 (CPUD) SD level 2 - SG6(CPUC, CPUD), SG5(CPUA, CPU B) RD
CPU D SD level 1 - SG4(CPU D), SG3(CPU C) SD level 2 - SG6(CPUC, CPU D), SG5(CPU A, CPU B) RD
In the above system if CPUA is overutilized, the overutilized flag is set at SG5(parent sched domain first sched group). Similarly if CPUB is overutilized, the flag is set at SG5. During load balancing, at SD level 1, the overutilized flag is checked at the parent sched domain, first sched group level(SG5). If there is no parent sched domain, then the flag is set/checked at the root domain. This ensures that load balancing happens irrespective of which cpu is over utilized in a sched domain.
Signed-off-by: Thara Gopinath thara.gopinath@linaro.org
kernel/sched/fair.c | 108 ++++++++++++++++++++++++++++++++++++++++++--------- kernel/sched/sched.h | 1 + 2 files changed, 90 insertions(+), 19 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 01fa969..0c97e0a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4559,6 +4559,36 @@ static inline void hrtick_update(struct rq *rq)
static bool cpu_overutilized(int cpu);
+static bool +is_sd_overutilized(struct sched_domain *sd, struct root_domain *rd) +{
if (sd && sd->parent)
return sd->parent->groups->overutilized;
if (!rd)
return false;
return rd->overutilized;
+}
+static void +set_sd_overutilized(struct sched_domain *sd, struct root_domain *rd) +{
if (sd && sd->parent)
sd->parent->groups->overutilized = true;
else if (rd)
rd->overutilized = true;
+}
+static void +clear_sd_overutilized(struct sched_domain *sd, struct root_domain *rd) +{
if (sd && sd->parent)
sd->parent->groups->overutilized = false;
else if (rd)
rd->overutilized = false;
+}
/*
- The enqueue_task method is called before nr_running is
- increased. Here we update the fair scheduling stats and
@@ -4568,6 +4598,7 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq;
struct sched_domain *sd; struct sched_entity *se = &p->se; int task_new = !(flags & ENQUEUE_WAKEUP);
@@ -4603,9 +4634,12 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (!se) { add_nr_running(rq, 1);
if (!task_new && !rq->rd->overutilized &&
cpu_overutilized(rq->cpu))
rq->rd->overutilized = true;
rcu_read_lock();
sd = rcu_dereference(rq->sd);
if (!task_new && !is_sd_overutilized(sd, rq->rd) &&
cpu_overutilized(rq->cpu))
set_sd_overutilized(sd, rq->rd);
rcu_read_unlock(); } hrtick_update(rq);
} @@ -5989,8 +6023,6 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu) unsigned long max_spare = 0; struct sched_domain *sd;
rcu_read_lock();
sd = rcu_dereference(per_cpu(sd_ea, prev_cpu)); if (!sd)
@@ -6028,7 +6060,6 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu) }
unlock:
rcu_read_unlock(); if (energy_cpu == prev_cpu && !cpu_overutilized(prev_cpu)) return prev_cpu;
@@ -6063,10 +6094,16 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); }
if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized))
return select_energy_cpu_brute(p, prev_cpu);
rcu_read_lock();
sd = rcu_dereference(cpu_rq(prev_cpu)->sd);
if (energy_aware() &&
!is_sd_overutilized(sd,
cpu_rq(cpu)->rd)) {
new_cpu = select_energy_cpu_brute(p, prev_cpu);
goto unlock;
This is slightly different from what we currently do. Now we may do energy-aware task placement even when some cpus might be over-utilized in other domains. The search space in select_energy_cpu_brute() is the sched_domain pointed at by the sd_ea pointer. For big.LITTLE systems like the one you describe in your example, this would point to the top level and therefore iterate over all cpus, including some of the ones in the other cluster not covered by the !is_sd_overutilized(cpu_rq(prev_cpu)->sd) check.
I don't think it is a major problem, as I don't recall select_energy_cpu_brute() making any assumptions about cpus not being over-utilized. In fact, they appear to be skipped already.
}
sd = NULL; for_each_domain(cpu, tmp) { if (!(tmp->flags & SD_LOAD_BALANCE)) break;
@@ -6131,6 +6168,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f } /* while loop will break here if sd == NULL */ }
+unlock: rcu_read_unlock();
return new_cpu;
@@ -7178,6 +7217,7 @@ struct sd_lb_stats { struct sched_group *local; /* Local group in this sd */ unsigned long total_load; /* Total load of all groups in sd */ unsigned long total_capacity; /* Total capacity of all groups in sd */
unsigned long total_util; /* Total util of all groups in sd */ unsigned long avg_load; /* Average load across all groups in sd */ struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
@@ -7197,6 +7237,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) .local = NULL, .total_load = 0UL, .total_capacity = 0UL,
.total_util = 0UL, .busiest_stat = { .avg_load = 0UL, .sum_nr_running = 0,
@@ -7692,6 +7733,7 @@ next_group: /* Now, start updating sd_lb_stats */ sds->total_load += sgs->group_load; sds->total_capacity += sgs->group_capacity;
sds->total_util += sgs->group_util; sg = sg->next; } while (sg != env->sd->groups);
@@ -7701,17 +7743,26 @@ next_group:
env->src_grp_nr_running = sds->busiest_stat.sum_nr_running;
/* Setting overutilized flag might not be necessary here
* Revisit
*/ if (!lb_sd_parent(env->sd)) { /* update overload indicator if we are at root domain */ if (env->dst_rq->rd->overload != overload) env->dst_rq->rd->overload = overload;
}
/* Update over-utilization (tipping point, U >= 0) indicator */
if (env->dst_rq->rd->overutilized != overutilized)
env->dst_rq->rd->overutilized = overutilized;
} else {
if (!env->dst_rq->rd->overutilized && overutilized)
env->dst_rq->rd->overutilized = true;
if (overutilized)
set_sd_overutilized(env->sd, env->dst_rq->rd);
/* If the domain util is greater than domain capacity, load balancing
* needs to be done at the next sched domain level as well
*/
if (sds->total_capacity * 1024 < sds->total_util * capacity_margin) {
/* If already at the highest domain nothing can be done */
if (env->sd->parent)
set_sd_overutilized(env->sd->parent,
env->dst_rq->rd);
As I said in my other reply. I don't quite understand why this total_util condition is necessary.
}
}
@@ -7932,8 +7983,11 @@ static struct sched_group *find_busiest_group(struct lb_env *env) */ update_sd_lb_stats(env, &sds);
if (energy_aware() && !env->dst_rq->rd->overutilized)
goto out_balanced;
/* Is this check really required here?? Revisit */
if (energy_aware()) {
if (!is_sd_overutilized(env->sd, env->dst_rq->rd))
goto out_balanced;
}
We have just analyzed all cpus in the sched_domain in update_sd_lb_stats() so it seems reasonable to skip this domain here if we didn't find any over-utilized cpus. Agreed.
local = &sds.local_stat; busiest = &sds.busiest_stat;
@@ -8000,6 +8054,12 @@ static struct sched_group *find_busiest_group(struct lb_env *env) force_balance: /* Looks like there is an imbalance. Compute it */ calculate_imbalance(env, &sds);
/* Is this the correct place to clear this flag? Should access
* to flag be locked? Revisit.
*/
clear_sd_overutilized(env->sd, env->dst_rq->rd);
As I said in my other reply, I think it isn't quite right to clear the flag here.
return sds.busiest;
out_balanced: @@ -8790,6 +8850,11 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
rcu_read_lock(); for_each_domain(cpu, sd) {
if (energy_aware()) {
if (!is_sd_overutilized(sd, rq->rd))
continue;
}
So here we're iterating through the sched_domains from lowest to highest. At each level we check if the domain has already been determined to be over-utilized. IIUC, this scheme is based on the total_util condition that sets the flag on the parent domain, otherwise we would have to iterate over the sched_groups to see if any of them had their over-utilized flag set. For example, if the little cpus are over-utilized and the big cpus have to discover this fact when they balance across clusters.
If we iterated over the groups instead, we would solve the case where a single cpu is over-utilized that Leo pointed out. Also I'm not sure if the setting the flag on the parent sched_domain scheme works with more than two sched_domain levels?
It is quite likely that I'm missing something. I haven't actually tested your patch yet.
Thanks, Morten IMPORTANT NOTICE: The contents of this email and any attachments are confidential and may also be privileged. If you are not the intended recipient, please notify the sender immediately and do not disclose the contents to any other person, use it for any purpose, or store or copy the information in any medium. Thank you.
On 12/09/2016 12:42 PM, Morten Rasmussen wrote:
On Wed, Dec 07, 2016 at 05:22:37PM -0500, Thara Gopinath wrote:
The current implementation of overutilization, aborts energy aware scheduling if any cpu in the system is over-utilized. This patch introduces over utilization flag per sched group level instead of a single flag system wide. Load balancing is done at the sched domain where any of the sched group is over utilized. If energy aware scheduling is enabled and no sched group in a sched domain is overuttilized, load balancing is skipped for that sched domain and energy aware scheduling continues at that level.
The implementation is based on two points
- For every cpu in every sched domain the first group is the group that contains the cpu itself.
- sched groups are shared between cpus.
Thus if a sched group is overutilized the overutilized flag is set at the first sched group of the parent sched domain. This ensures a load balancing at the overutilzed sched domain level. For example consider a big little system with two little cpu's (CPU A and CPU B) and two big cpu's (CPU C and CPU D). In this system, the hierarchy will be as follows CPU A SD level 1 - SG1 (CPUA), SG2 (CPUB) SD level 2 - SG5(CPUA, CPUB), SG6(CPU C, CPU D) RD
CPU B SD level 1 - SG2(CPUB), SG1 (CPUA) SD level 2 - SG5(CPU A, CPU B), SG6(CPU C, CPUD) RD
CPU C SD level 1 - SG3(CPU C), SG4 (CPUD) SD level 2 - SG6(CPUC, CPUD), SG5(CPUA, CPU B) RD
CPU D SD level 1 - SG4(CPU D), SG3(CPU C) SD level2 - SG6(CPUC, CPU D), SG5(CPU A, APU B) RD
In the above system if CPUA is overutilized, the overutilized flag is set at SG5(parent sched domain first sched group). Similarly if CPUB is overutilized, the flag is set at SG5. During load balancing, at SD level 1, the overutilized flag is checked at the parent sched domain, first sched group level(SG5). If there is no parent sched domain, then the flag is set/checked at the root domain. This ensures that load balancing happens irrespective of which cpu is over utilized in a sched domain.
Signed-off-by: Thara Gopinath thara.gopinath@linaro.org
kernel/sched/fair.c | 108 ++++++++++++++++++++++++++++++++++++++++++--------- kernel/sched/sched.h | 1 + 2 files changed, 90 insertions(+), 19 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 01fa969..0c97e0a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4559,6 +4559,36 @@ static inline void hrtick_update(struct rq *rq)
static bool cpu_overutilized(int cpu);
+static bool +is_sd_overutilized(struct sched_domain *sd, struct root_domain *rd) +{
if (sd && sd->parent)
return sd->parent->groups->overutilized;
if (!rd)
return false;
return rd->overutilized;
+}
+static void +set_sd_overutilized(struct sched_domain *sd, struct root_domain *rd) +{
if (sd && sd->parent)
sd->parent->groups->overutilized = true;
else if (rd)
rd->overutilized = true;
+}
+static void +clear_sd_overutilized(struct sched_domain *sd, struct root_domain *rd) +{
if (sd && sd->parent)
sd->parent->groups->overutilized = false;
else if (rd)
rd->overutilized = false;
+}
/*
- The enqueue_task method is called before nr_running is
- increased. Here we update the fair scheduling stats and
@@ -4568,6 +4598,7 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq;
struct sched_domain *sd; struct sched_entity *se = &p->se; int task_new = !(flags & ENQUEUE_WAKEUP);
@@ -4603,9 +4634,12 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (!se) { add_nr_running(rq, 1);
if (!task_new && !rq->rd->overutilized &&
cpu_overutilized(rq->cpu))
rq->rd->overutilized = true;
rcu_read_lock();
sd = rcu_dereference(rq->sd);
if (!task_new && !is_sd_overutilized(sd, rq->rd) &&
cpu_overutilized(rq->cpu))
set_sd_overutilized(sd, rq->rd);
rcu_read_unlock(); } hrtick_update(rq);
} @@ -5989,8 +6023,6 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu) unsigned long max_spare = 0; struct sched_domain *sd;
rcu_read_lock();
sd = rcu_dereference(per_cpu(sd_ea, prev_cpu)); if (!sd)
@@ -6028,7 +6060,6 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu) }
unlock:
rcu_read_unlock(); if (energy_cpu == prev_cpu && !cpu_overutilized(prev_cpu)) return prev_cpu;
@@ -6063,10 +6094,16 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); }
if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized))
return select_energy_cpu_brute(p, prev_cpu);
rcu_read_lock();
sd = rcu_dereference(cpu_rq(prev_cpu)->sd);
if (energy_aware() &&
!is_sd_overutilized(sd,
cpu_rq(cpu)->rd)) {
new_cpu = select_energy_cpu_brute(p, prev_cpu);
goto unlock;
This is slightly different from what we currently do. Now we may do energy-aware task placement even when some cpus might be over-utilized other domains. The search space in select_energy_cpu_brute() is the sched_domain pointed at by the sd_ea pointer. For big.LITTLE systems like the one you describe in your example, this would point to the top level and therefore iterate over all cpus, including some the ones in the other cluster not covered by the !is_sd_overutilized(cpu_rq(prev_cpu)->sd) check.
I don't think it is major problem, as I don't recall select_energy_cpu_brute() making any assumptions about cpus not being over-utilized. In fact, the appear to be skipped already.
}
sd = NULL; for_each_domain(cpu, tmp) { if (!(tmp->flags & SD_LOAD_BALANCE)) break;
@@ -6131,6 +6168,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f } /* while loop will break here if sd == NULL */ }
+unlock: rcu_read_unlock();
return new_cpu;
@@ -7178,6 +7217,7 @@ struct sd_lb_stats { struct sched_group *local; /* Local group in this sd */ unsigned long total_load; /* Total load of all groups in sd */ unsigned long total_capacity; /* Total capacity of all groups in sd */
unsigned long total_util; /* Total util of all groups in sd */ unsigned long avg_load; /* Average load across all groups in sd */ struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
@@ -7197,6 +7237,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) .local = NULL, .total_load = 0UL, .total_capacity = 0UL,
.total_util = 0UL, .busiest_stat = { .avg_load = 0UL, .sum_nr_running = 0,
@@ -7692,6 +7733,7 @@ next_group: /* Now, start updating sd_lb_stats */ sds->total_load += sgs->group_load; sds->total_capacity += sgs->group_capacity;
sds->total_util += sgs->group_util; sg = sg->next; } while (sg != env->sd->groups);
@@ -7701,17 +7743,26 @@ next_group:
env->src_grp_nr_running = sds->busiest_stat.sum_nr_running;
/* Setting overutilized flag might not be necessary here
* Revisit
*/ if (!lb_sd_parent(env->sd)) { /* update overload indicator if we are at root domain */ if (env->dst_rq->rd->overload != overload) env->dst_rq->rd->overload = overload;
}
/* Update over-utilization (tipping point, U >= 0) indicator */
if (env->dst_rq->rd->overutilized != overutilized)
env->dst_rq->rd->overutilized = overutilized;
} else {
if (!env->dst_rq->rd->overutilized && overutilized)
env->dst_rq->rd->overutilized = true;
if (overutilized)
set_sd_overutilized(env->sd, env->dst_rq->rd);
/* If the domain util is greater that domain capacity, load balancing
* needs to be done at the next sched domain level as well
*/
if (sds->total_capacity * 1024 < sds->total_util * capacity_margin) {
/* If already at the highest domain nothing can be done */
if (env->sd->parent)
set_sd_overutilized(env->sd->parent,
env->dst_rq->rd);
As I said in my other reply. I don't quite understand why this total_util condition is necessary.
}
}
@@ -7932,8 +7983,11 @@ static struct sched_group *find_busiest_group(struct lb_env *env) */ update_sd_lb_stats(env, &sds);
if (energy_aware() && !env->dst_rq->rd->overutilized)
goto out_balanced;
/* Is this check really required here?? Revisit */
if (energy_aware()) {
if (!is_sd_overutilized(env->sd, env->dst_rq->rd))
goto out_balanced;
}
We have just analyzed all cpus in the sched_domain in update_sd_lb_stats() so it seems reasonable to skip this domain here if we didn't find any over-utilized cpus. Agreed.
Ok.
local = &sds.local_stat; busiest = &sds.busiest_stat;
@@ -8000,6 +8054,12 @@ static struct sched_group *find_busiest_group(struct lb_env *env) force_balance: /* Looks like there is an imbalance. Compute it */ calculate_imbalance(env, &sds);
/* Is this the correct place to clear this flag? Should access
* to flag be locked? Revisit.
*/
clear_sd_overutilized(env->sd, env->dst_rq->rd);
As I said in my other reply, I think it isn't quite right to clear the flag here.
return sds.busiest;
out_balanced: @@ -8790,6 +8850,11 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
rcu_read_lock(); for_each_domain(cpu, sd) {
if (energy_aware()) {
if (!is_sd_overutilized(sd, rq->rd))
continue;
}
So here we're iterating through the sched_domains from lowest to highest. At each level we check if the domain has already been determined to be over-utilized. IIUC, this scheme is based on the total_util condition that sets the flag on the parent domain, otherwise we would have to iterate over the sched_groups to see if any of them had their over-utilized flag set. For example, if the little cpus are over-utilized and the big cpus have to discover this fact when they balance across clusters.
That is true. If little cpu's are over-utilized, load-balancing on the big cpu will miss it, in this implementation. I was under the assumption that load balancing on the little cpu correcting the imbalance is sufficient. There can be a time lag. I was more focusing on avoiding the loop in rebalance_domains. So is it not acceptable to wait until the load balance on the little cpu corrects itself out? Pardon me, if my question is too dumb as I am new to this.
Regards Thara
If we iterated over the groups instead, we would solve the case where a single cpu is over-utilized that Leo pointed out. Also I'm not sure if the setting the flag on the parent sched_domain scheme works with more than two sched_domain levels?
It is quite likely that I'm missing something. I haven't actually tested your patch yet.
Thanks, Morten IMPORTANT NOTICE: The contents of this email and any attachments are confidential and may also be privileged. If you are not the intended recipient, please notify the sender immediately and do not disclose the contents to any other person, use it for any purpose, or store or copy the information in any medium. Thank you.
-- Regards Thara
On Thu, Dec 15, 2016 at 03:51:01PM -0500, Thara Gopinath wrote:
On 12/09/2016 12:42 PM, Morten Rasmussen wrote:
On Wed, Dec 07, 2016 at 05:22:37PM -0500, Thara Gopinath wrote:
@@ -8790,6 +8850,11 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
rcu_read_lock(); for_each_domain(cpu, sd) {
if (energy_aware()) {
if (!is_sd_overutilized(sd, rq->rd))
continue;
}
So here were iterating through the sched_domains from lowest to highest. At each level we check if the domain has already been found determined to be over-utilized. IIUC, this scheme is based on the total_util condition that sets the flag on the parent domain, otherwise we would have to iterate over the sched_groups to see if any of them had their over-utilized flag set. For example, if the little cpus are over-utilized and the big cpus have to discover this fact when they balance across clusters.
That is true. If little cpu's are over-utilized,load-balancing on the big cpu will miss it , in this implementation. I was under the assumption that load balancing on the little cpu correcting the imbalance is sufficient. There can be a time lag. I was more focusing on avoiding the loop in rebalance_domains. So is it not acceptable to wait until the load balance on the little cpu corrects itself out? Pardon me , if my question is too dumb as I am new to this.
Please keep asking questions :-) They are all very valid.
IMHO, it is fine to wait for the little cpus to sort out the over-utilization amongst themselves if we are sure that they are indeed able to do so. In that case I agree with you. However, if they can't sort it out, like in the 'misfit' task case, we need to involve the big cpus.
I think the whole thing will become a bit more clear if we can sort out a proposal for how to deal with asymmetric cpu capacities (big.LITTLE). I think you need to introduce additional criteria for setting the flag for those systems, but you may be able to make criteria depend on the SD_ASYM_CPUCAPACITY so SMP can avoid them.
Thanks, Morten IMPORTANT NOTICE: The contents of this email and any attachments are confidential and may also be privileged. If you are not the intended recipient, please notify the sender immediately and do not disclose the contents to any other person, use it for any purpose, or store or copy the information in any medium. Thank you.
On 12/16/2016 05:07 AM, Morten Rasmussen wrote:
On Thu, Dec 15, 2016 at 03:51:01PM -0500, Thara Gopinath wrote:
On 12/09/2016 12:42 PM, Morten Rasmussen wrote:
On Wed, Dec 07, 2016 at 05:22:37PM -0500, Thara Gopinath wrote:
@@ -8790,6 +8850,11 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
rcu_read_lock(); for_each_domain(cpu, sd) {
if (energy_aware()) {
if (!is_sd_overutilized(sd, rq->rd))
continue;
}
So here were iterating through the sched_domains from lowest to highest. At each level we check if the domain has already been found determined to be over-utilized. IIUC, this scheme is based on the total_util condition that sets the flag on the parent domain, otherwise we would have to iterate over the sched_groups to see if any of them had their over-utilized flag set. For example, if the little cpus are over-utilized and the big cpus have to discover this fact when they balance across clusters.
That is true. If little cpu's are over-utilized,load-balancing on the big cpu will miss it , in this implementation. I was under the assumption that load balancing on the little cpu correcting the imbalance is sufficient. There can be a time lag. I was more focusing on avoiding the loop in rebalance_domains. So is it not acceptable to wait until the load balance on the little cpu corrects itself out? Pardon me , if my question is too dumb as I am new to this.
Please keep asking questions :-) They are all very valid.
IMHO, it is fine to wait for the little cpus to sort out the over-utilization amongst themselves if we are sure that they are indeed able to do so. In that case I agree with you. However, if they can't sort it out, like in the 'misfit' task case, we need to involve the big cpus.
I think the whole thing will become a bit more clear if we can sort out a proposal for how to deal with asymmetric cpu capacities (big.LITTLE). I think you need to introduce additional criteria for setting the flag for those systems, but you may be able to make criteria depend on the SD_ASYM_CPUCAPACITY so SMP can avoid them.
I will try to incorporate a check for misfit tasks in my next version. I will keep this check for time being as is.
Thanks, Morten IMPORTANT NOTICE: The contents of this email and any attachments are confidential and may also be privileged. If you are not the intended recipient, please notify the sender immediately and do not disclose the contents to any other person, use it for any purpose, or store or copy the information in any medium. Thank you.
-- Regards Thara
Hi Thara,
On Wed, Dec 07, 2016 at 05:22:37PM -0500, Thara Gopinath wrote:
The current implementation of overutilization, aborts energy aware scheduling if any cpu in the system is over-utilized. This patch introduces over utilization flag per sched group level instead of a single flag system wide. Load balancing is done at the sched domain where any of the sched group is over utilized. If energy aware scheduling is enabled and no sched group in a sched domain is overuttilized, load balancing is skipped for that sched domain and energy aware scheduling continues at that level.
The implementation is based on two points
- For every cpu in every sched domain the first group is the group that contains the cpu itself.
- sched groups are shared between cpus.
Thus if a sched group is overutilized the overutilized flag is set at the first sched group of the parent sched domain. This ensures a load balancing at the overutilzed sched domain level. For example consider a big little system with two little cpu's (CPU A and CPU B) and two big cpu's (CPU C and CPU D). In this system, the hierarchy will be as follows CPU A SD level 1 - SG1 (CPUA), SG2 (CPUB) SD level 2 - SG5(CPUA, CPUB), SG6(CPU C, CPU D) RD
CPU B SD level 1 - SG2(CPUB), SG1 (CPUA) SD level 2 - SG5(CPU A, CPU B), SG6(CPU C, CPUD) RD
CPU C SD level 1 - SG3(CPU C), SG4 (CPUD) SD level 2 - SG6(CPUC, CPUD), SG5(CPUA, CPU B) RD
CPU D SD level 1 - SG4(CPU D), SG3(CPU C) SD level2 - SG6(CPUC, CPU D), SG5(CPU A, APU B) RD
In the above system if CPUA is overutilized, the overutilized flag is set at SG5(parent sched domain first sched group). Similarly if CPUB is overutilized, the flag is set at SG5. During load balancing, at SD level 1, the overutilized flag is checked at the parent sched domain, first sched group level(SG5). If there is no parent sched domain, then the flag is set/checked at the root domain. This ensures that load balancing happens irrespective of which cpu is over utilized in a sched domain.
I did some verification for this patch on Juno, please note I verified this patch on EASv5.2 code but not the latest EAS code base; there are four test cases:
- Case 1: one ramp up task from duty cycle 10% to 90%, every step increases 10% [1];
Please see analysis result in [5]: The line with Magenta color: LITTLE cluster sched domain flag The line with Yellow color: Big cluster sched domain flag The line with Red color: Root domain flag
- Case 2: 4 middle workload tasks (util_avg ~= 300 < LITTLE core's capacity 447 * 0.8 = 358); check if task can spread out in LITTLE cluster [2];
- Case 3: 2 big tasks (util_avg = 870, > 1024 * 0.8); check if tasks can be spread out in big cluster [3];
- Case 4: 6 big tasks (util_avg = 870, > 1024 * 0.8); check if tasks can be spread out within two clusters [4];
- Below are summary from the plots:
During ramp up task running, the root domain's overutilized flag is not set [5], so a "misfit" task cannot rely on the "overutilized" flag to migrate the task from the LITTLE cluster to the big cluster;
If there are big tasks and after these tasks are migrated onto the big cluster, the LITTLE cluster "overutilized" flag cannot be cleared immediately; the flag is kept for a very long time until there is a chance to clear it in load balancing [5][7];
In the big cluster if every CPU is "overutilized", the "overutilized" flag for the big cluster is frequently set and cleared, whereas we should expect this value to stay "true"; In the LITTLE cluster if every CPU is "overutilized", the "overutilized" flag can stay "true" during this period [8];
For the LITTLE cluster "overutilized" flag, it only works after the whole system is "overutilized". This is for the 6 big tasks case, but for the 4 middle tasks case, the LITTLE cluster "overutilized" flag doesn't get set; so if there are several tasks on the LITTLE cluster, we cannot rely on the LITTLE cluster "overutilized" flag to spread tasks within the LITTLE cluster [6].
[1] http://people.linaro.org/~leo.yan/per_sched_domain_overutilized_flag/test_ov... [2] http://people.linaro.org/~leo.yan/per_sched_domain_overutilized_flag/test_ov... [3] http://people.linaro.org/~leo.yan/per_sched_domain_overutilized_flag/test_ov... [4] http://people.linaro.org/~leo.yan/per_sched_domain_overutilized_flag/test_ov... [5] http://people.linaro.org/~leo.yan/per_sched_domain_overutilized_flag/1_ramp_... [6] http://people.linaro.org/~leo.yan/per_sched_domain_overutilized_flag/4_middl... [7] http://people.linaro.org/~leo.yan/per_sched_domain_overutilized_flag/2_big_t... [8] http://people.linaro.org/~leo.yan/per_sched_domain_overutilized_flag/6_big_t...
Thanks, Leo Yan
Sorry for spamming, clarify two things.
On Mon, Dec 12, 2016 at 03:07:47PM +0800, Leo Yan wrote: [...]
Case 1: one ramp up task from duty cycle 10% to 90%, every step increases 10% [1];
Please see analysis result in [5]: The line with Magenta color: LITTLE cluster sched domain flag The line with Yellow color: Big cluster sched domain flag The line with Red color: Root domain flag
Case 2: 4 middle workload tasks (util_avg ~= 300 < LITTLE core's capacity 447 * 0.8 = 358); check if task can spread out in LITTLE cluster [2];
Though this error does not impact final conclusion, but should correct as LITTLE core's max capacity = 593; so 593 * 0.8 = 474.
[...]
For LITTLE cluster "overutilized" flag, it only works after whole system is "overutilized". This is for 6 big tasks case, but for 4 middle tasks case, LITTLE cluster "overutilized" flag doesn't set; so if there have several tasks on LITTLE cluster, we cannot rely on LITTLE cluster "overutilized" flag to spread tasks within LITTLE cluster [6].
I personally think this may not be an issue, because we can rely on the EAS core algorithm to select the best CPU within a cluster. So we don't rely on the "overutilized" flag to spread tasks in the LITTLE cluster. And after the whole system is "overutilized", then we disable the "EAS" path and use SMP traditional load balancing. Agree?
Thanks, Leo Yan
Hi Leo,
On Mon, Dec 12, 2016 at 03:22:05PM +0800, Leo Yan wrote:
For LITTLE cluster "overutilized" flag, it only works after whole system is "overutilized". This is for 6 big tasks case, but for 4 middle tasks case, LITTLE cluster "overutilized" flag doesn't set; so if there have several tasks on LITTLE cluster, we cannot rely on LITTLE cluster "overutilized" flag to spread tasks within LITTLE cluster [6].
I personally think this may not an issue, due we can reply on EAS core algorithm to select best CPU within cluster. So we doesn't rely on "overutilized" flag to spread tasks in LITTLE cluster. And after whole system is "overutilized", then we disable "EAS" path and use SMP traditional load balance. Agree?
For 4 middle-sized tasks the EAS-code should hopefully spread the tasks at wake-up. I'm more concerned about big tasks on the little cluster as those would only be migrated by the 'misfit'-code. IIUC, in case we have n_big_tasks < n_little_cpus currently scheduled on the little cluster, 'overutilized' won't be set at the root_domain and hence the big cluster won't try to pull any of the big tasks. IMPORTANT NOTICE: The contents of this email and any attachments are confidential and may also be privileged. If you are not the intended recipient, please notify the sender immediately and do not disclose the contents to any other person, use it for any purpose, or store or copy the information in any medium. Thank you.
Hi Morten,
On Mon, Dec 12, 2016 at 02:02:46PM +0000, Morten Rasmussen wrote:
Hi Leo,
On Mon, Dec 12, 2016 at 03:22:05PM +0800, Leo Yan wrote:
For LITTLE cluster "overutilized" flag, it only works after whole system is "overutilized". This is for 6 big tasks case, but for 4 middle tasks case, LITTLE cluster "overutilized" flag doesn't set; so if there have several tasks on LITTLE cluster, we cannot rely on LITTLE cluster "overutilized" flag to spread tasks within LITTLE cluster [6].
I personally think this may not an issue, due we can reply on EAS core algorithm to select best CPU within cluster. So we doesn't rely on "overutilized" flag to spread tasks in LITTLE cluster. And after whole system is "overutilized", then we disable "EAS" path and use SMP traditional load balance. Agree?
For 4 middle-sized tasks the EAS-code should hopefully spread the tasks at wake-up.
If we want to support spreading 4 middle-size tasks, the question is what's the criteria to set "overutilized" flag for LITTLE cluster's sched domain?
I'm more concerned about big tasks on the little cluster as those would only be migrated by the 'misfit'-code. IIUC, in case we have n_big_tasks < n_little_cpus currently scheduled on the little cluster, 'overutilized' won't be set at the root_domain and hence the big cluster won't try to pull any of the big tasks.
Correct, please see the ramp up task case in my other email: after the task becomes a 'misfit' task, the root domain's flag isn't set. So we should refine this part.
Thanks, Leo Yan
On Mon, Dec 12, 2016 at 10:35:24PM +0800, Leo Yan wrote:
Hi Morten,
On Mon, Dec 12, 2016 at 02:02:46PM +0000, Morten Rasmussen wrote:
Hi Leo,
On Mon, Dec 12, 2016 at 03:22:05PM +0800, Leo Yan wrote:
For the LITTLE cluster "overutilized" flag, it only works after the whole system is "overutilized". This is for the 6 big tasks case, but for the 4 middle tasks case, the LITTLE cluster "overutilized" flag doesn't get set; so if there are several tasks on the LITTLE cluster, we cannot rely on the LITTLE cluster "overutilized" flag to spread tasks within the LITTLE cluster [6].
I personally think this may not be an issue, because we can rely on the EAS core algorithm to select the best CPU within the cluster. So we don't rely on the "overutilized" flag to spread tasks in the LITTLE cluster. And after the whole system is "overutilized", we then disable the "EAS" path and use traditional SMP load balancing. Agree?
For 4 middle-sized tasks the EAS-code should hopefully spread the tasks at wake-up.
If we want to support spreading 4 middle-size tasks, the question is what's the criteria to set "overutilized" flag for LITTLE cluster's sched domain?
The EAS-code at wake-up should be fine as long as tasks do regularly wake up, i.e. no task has a utilization approaching or exceeding the compute capacity available. Otherwise, we may need help from the big cluster.
Ideally, the criteria for calling for help (setting the overutilized flag at the root_domain for this RFC patch) should be:
It is not possible to balance the tasks within the cluster such that every cpu has a minimum of spare cycles.
Figuring out whether it is possible to balance the tasks such that none of them are constrained in terms of available cpu cycles isn't easy. Until now we have taken a much more conservative approach by making the criteria: If any cpu is over-utilized. As this is very easy to determine and should cover all the cases covered by the ideal criteria above, although we will call for help in many cases where it isn't necessary.
A half-baked thought:
If we assume that the wake-up EAS-code does a good job, could we set the flag if the wake target cpu ends up being over-utilized? If EAS failed to find enough capacity for the task it must be due to one of the following reasons: 1. Because the task is too big to fit the raw capacity offered by the little cpus. 2. The utilization of other tasks leaves too little spare capacity left for the task, and it is not possible to reorganize the task distribution to get sufficient non-fragmented spare capacity. 3. Spare capacity is fragmented, but it would be possible to reorganize the tasks to provide the necessary spare capacity.
1. and 2. should be fine as those are cases where we do need help from the big cluster. 3. is more difficult as while it is theoretically possible to sort things out, it might take a long time to do so, in the meantime one or more tasks will suffer.
It might look like we bring things almost back to where they are, but not quite, as we would still only flag that the little cluster needs help rather than putting the entire system into over-utilization mode.
The RFC-patch already raises the flag for the little cluster if just one cpu is over-utilized, but it doesn't set it on the root_domain, so balancing will only be enabled within the little cluster. If we raise at the root_domain as well, it should work. No?
As I said, it is just a thought.
I'm more concerned about big tasks on the little cluster as those would only be migrated by the 'misfit'-code. IIUC, in case we have n_big_tasks < n_little_cpus currently scheduled on the little cluster, 'overutilized' won't be set at the root_domain and hence the big cluster won't try to pull any of the big tasks.
Correct, please see the ramp-up task case in another email of mine: after the task becomes a 'misfit' task, the root domain's flag doesn't get set. So this part should be refined.
Agreed.
Thanks, Morten IMPORTANT NOTICE: The contents of this email and any attachments are confidential and may also be privileged. If you are not the intended recipient, please notify the sender immediately and do not disclose the contents to any other person, use it for any purpose, or store or copy the information in any medium. Thank you.
On Mon, Dec 12, 2016 at 03:28:06PM +0000, Morten Rasmussen wrote:
[...]
For 4 middle-sized tasks the EAS-code should hopefully spread the tasks at wake-up.
If we want to support spreading 4 middle-size tasks, the question is what's the criteria to set "overutilized" flag for LITTLE cluster's sched domain?
The EAS-code at wake-up should be fine as long tasks do regularly wake up, i.e. no task has a utilization approaching or exceeding the computer capacity available. Otherwise, we may need help from the big cluster.
Ideally, the criteria for calling for help (setting the overutilized flag at the root_domain for this RFC patch) should be:
It is not possible to balance the tasks within the cluster such that every cpu has a minimum of spare cycles.
Figuring out the whether it is possible to balance the tasks such that none of them are constrained in term of available cpu cycles isn't easy. Until now we have taken a much more conservative approach by making the criteria: If any cpu is over-utilized. As this is very easy to determine and should cover all the cases covered by the ideal criteria above, although we will call for help in many cases where it isn't necessary.
A half-baked thought:
If we assume that the wake-up EAS-code does a good job, could we set the flag if the wake target cpu ends up being over-utilized? If EAS failed to find enough capacity for the task it must be due to one the following reasons: 1. Because the task is too big to fit the raw capacity offered by the little cpus. 2. The utilization of other tasks leaves too little spare capacity left for the task, and it is not possible to reorganize the task distribution to get sufficient non-fragmented spare capacity. 3. Spare capacity is fragmented, but it would be possible to reorganize the tasks to provide the necessary spare capacity.
- and 2. should be fine as those are cases where we do need help from
the big cluster. 3. is more difficult as while it is theoretically possible to sort things out, it might take a long time to do so, in the meantime one or more tasks will suffer.
Your good summary actually reminds me of another important thing: we have given multiple meanings to this single "overutilized" flag; as a result this requires the single flag (a bool value) to handle multiple cases.
So I try to summarize the semantics for the "overutilized" flag as below:
- Inner "overutilized": This kind of "overutilized" is a schedule domain internal issue and can be adjusted within the schedule domain; so the scheduler should find the best combination between tasks and CPUs in the schedule domain. This also quite matches the case which you mentioned in the upper item 3;
- Outer "overutilized": This kind of "overutilized" is when a schedule domain cannot adjust by itself and so asks other schedule domains to help pull tasks. Usually we expect this to be a higher capacity schedule domain pulling tasks so performance can improve; this quite matches the upper items 1/2;
- Global "overutilized": This kind of "overutilized" is when schedule domains should spread tasks as much as possible; this may happen for task migration from a higher capacity schedule domain to a lower capacity schedule domain;
IIUC, in the original code the "rd->overutilized" flag is used to indicate all three of these semantics; and in Thara's patch the "sd->overutilized" flag is used for the inner "overutilized" case, while "rd->overutilized" indicates the outer "overutilized" case and the global "overutilized" case. So it is difficult for Thara's patch to handle the 'misfit' situation.
I prefer to we can distinguish upper three semantics properly, something like below define macros:
#define SCHED_INNER_OVERUTILIZED 0x1 #define SCHED_OUTER_OVERUTILIZED 0x2 #define SCHED_GLOBAL_OVERUTILIZED 0x4
So we use all three macros for "sd->overutilized" and only use SCHED_GLOBAL_OVERUTILIZED for "rd->overutilized". So for example, if any schedule domain has set SCHED_OUTER_OVERUTILIZED, that means we could check the local schedule group with higher capacity than busiest schedule group and execute load balance.
Please feel free correct me if wrong.
Thanks, Leo Yan
On Tue, Dec 13, 2016 at 01:20:07AM +0800, Leo Yan wrote:
On Mon, Dec 12, 2016 at 03:28:06PM +0000, Morten Rasmussen wrote:
[...]
For 4 middle-sized tasks the EAS-code should hopefully spread the tasks at wake-up.
If we want to support spreading 4 middle-size tasks, the question is what's the criteria to set "overutilized" flag for LITTLE cluster's sched domain?
The EAS-code at wake-up should be fine as long tasks do regularly wake up, i.e. no task has a utilization approaching or exceeding the computer capacity available. Otherwise, we may need help from the big cluster.
Ideally, the criteria for calling for help (setting the overutilized flag at the root_domain for this RFC patch) should be:
It is not possible to balance the tasks within the cluster such that every cpu has a minimum of spare cycles.
Figuring out the whether it is possible to balance the tasks such that none of them are constrained in term of available cpu cycles isn't easy. Until now we have taken a much more conservative approach by making the criteria: If any cpu is over-utilized. As this is very easy to determine and should cover all the cases covered by the ideal criteria above, although we will call for help in many cases where it isn't necessary.
A half-baked thought:
If we assume that the wake-up EAS-code does a good job, could we set the flag if the wake target cpu ends up being over-utilized? If EAS failed to find enough capacity for the task it must be due to one the following reasons: 1. Because the task is too big to fit the raw capacity offered by the little cpus. 2. The utilization of other tasks leaves too little spare capacity left for the task, and it is not possible to reorganize the task distribution to get sufficient non-fragmented spare capacity. 3. Spare capacity is fragmented, but it would be possible to reorganize the tasks to provide the necessary spare capacity.
- and 2. should be fine as those are cases where we do need help from
the big cluster. 3. is more difficult as while it is theoretically possible to sort things out, it might take a long time to do so, in the meantime one or more tasks will suffer.
Your good summary actually reminds another important thing: we have given multiple meanings to "overutilized" this single one flag, as result this requires the single flag (a bool value) to handle multiple cases.
So I try to summary semantics for "overutilized" flag as below:
Inner "overutilized": This kind "overutilized" is the schedule domain internal issue and can be adjusted within schedule domain; so scheduler should find best combination between tasks and CPUs in the schedule domain, this also is quite match the case which you mantioned in upper item 3;
Outer "overutilized": This kind "overutilized" is that schedule domain cannot adjust by itself so ask other scheduler domains to help pull tasks. Usually we expects this is a higher capacity scheduler domain to pull tasks so can improve performance; This is quite match upper items 1/2;
Global "overutilized": This kind "overutilized" is that schedule domains should spread tasks as possible and this may happen for task migration from higher capacity schedule domain to lower capacity schedule domain;
IIUC, in original code "rd->overutilized" flag is used to indicate these three semantics; and in Thara's patch the "sd->overutilized" flag is used for inner "overutilized" case, and use "rd->overutilized" to indicate outer "overutilized" case and global "overutilized" case. So Thara's patch is difficult to handle situation for 'misfit'.
I prefer to we can distinguish upper three semantics properly, something like below define macros:
#define SCHED_INNER_OVERUTILIZED 0x1 #define SCHED_OUTER_OVERUTILIZED 0x2 #define SCHED_GLOBAL_OVERUTILIZED 0x4
So we use all three macros for "sd->overutilized" and only use SCHED_GLOBAL_OVERUTILIZED for "rd->overutilized". So for example, if any schedule domain has set SCHED_OUTER_OVERUTILIZED, that means we could check the local schedule group with higher capacity than busiest schedule group and execute load balance.
Please feel free correct me if wrong.
I should have read your email before I wrote the reply I just sent to Vincent and given you credit for your proposal. I fully agree with you that for asymmetric cpu capacity systems we have additional meanings of being 'over-utilized' that can be addressed in different ways.
I'm not sure exactly how we would determine when we are 'global over-utilized' and distinguish it from 'outer over-utilized'. That requires a bit more pondering.
Regarding the flags in Thara's proposal. sd->overutilization can be a parent flag as well if you have more than two sched_domain levels. We need to consider more levels to have a scalable solution.
Thanks, Morten IMPORTANT NOTICE: The contents of this email and any attachments are confidential and may also be privileged. If you are not the intended recipient, please notify the sender immediately and do not disclose the contents to any other person, use it for any purpose, or store or copy the information in any medium. Thank you.
On Fri, Dec 16, 2016 at 10:37:23AM +0000, Morten Rasmussen wrote:
[...]
I'm not sure exactly how we would determine when we are 'global over-utilized' and distinguish it from 'outer over-utilized'. That requires a bit more pondering.
Regarding the flags in Thara's proposal. sd->overutilization can be a parent flag as well if you have more than two sched_domain levels. We need to consider more levels to have a scalable solution.
How about below code? I tweaked some based on Thara's patch, the main change is to use SD Level 1's flag for 'inner over-utilized', this part is absent now, I did some search on wiki page and found the algorithm for 'Assignment problem' is not easy to add into scheduler.
So this patch just drafted the implementation for 'outer overutilized' by using SD Level 1 flag and use rd->overutilized to indicate 'global over-utilized'. But I think it's easy to extend to support 'inner overutilized' after we have solution for 'Assignment problem'.
For 'global over-utilized' criteria, I reuse Thara's definition:
+ if (sds->total_capacity * 1024 < sds->total_util * capacity_margin) + set_rd_overutilized(env->dst_rq->rd); + else + clear_rd_overutilized(env->dst_rq->rd);
---8<---
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cf56241..eeaea72 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4200,6 +4200,83 @@ static inline void hrtick_update(struct rq *rq) #ifdef CONFIG_SMP static bool cpu_overutilized(int cpu); static inline unsigned long boosted_cpu_util(int cpu); + +/* + * Some thoghts to set overutilized flag: + * + * ### SD Level 1 + * + * If we can find algorithm for best combination between CPUs + * and tasks we can use it to check if need set first schedule + * domain level's "overutilized" flag: + * + * if (assign_algorithm()) + * sd->groups->overutilized = true; + * + * After set this flag, the load balance will happen only + * in SD Level 1, so this means only take affact on clustser + * internally. + * + * So far unfortunately this part is absent due the algorithm + * is complex [1]...
+ * [1] https://en.wikipedia.org/wiki/Assignment_problem + * + * ### SD Level 2 + * + * If the CPU has misfit on it, it's no doubt to migrate task + * to another high capacity CPU. + * + * Or if one CPU is overutilized and we assume now scheduler has + * done good enough work to explore cluster internal capacity, so + * if one CPU is overutilized that means finally need seek another + * cluster to provide more computing capacity. + * + * For upper two cases, we set SD level 2 flag. So later this flag is + * used by any CPU trigger load balance in the same schedule domain. + * + * We can add more strick criteria for migration, like the dstination + * CPU acpaicty >= source CPU capacity, then we can finally permit + * the task migration to higher capacity CPU for this case. + * + * ### Root domain + * + * If set root domain flag, means explore performance as possible + * to spread out tasks. + * + */ +static void set_sd_overutilized(struct sched_domain *sd) +{ + if (sd && sd->parent) + sd->parent->groups->overutilized = true; +} + +static void set_rd_overutilized(struct root_domain *rd) +{ + rd->overutilized = true; +} + +static void clear_rd_overutilized(struct root_domain *rd) +{ + rd->overutilized = false; +} + +static bool is_sd_overutilized(struct sched_domain *sd) +{ + struct sched_group *group = sd->groups; + int cpu = smp_processor_id(); + + if (cpu_rq(cpu)->rd->overutilized) + return true; + + do { + if (group->overutilized) + return true; + + } while (group = group->next, group != sd->groups); + + return false; +} + #else #define boosted_cpu_util(cpu) cpu_util(cpu) #endif @@ -4228,6 +4305,7 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; + struct sched_domain *sd; struct sched_entity *se = &p->se; #ifdef CONFIG_SMP int task_new = flags & ENQUEUE_WAKEUP_NEW; @@ -4292,11 +4370,12 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (!se) { walt_inc_cumulative_runnable_avg(rq, p); - if (!task_new && !rq->rd->overutilized && - cpu_overutilized(rq->cpu)) { - rq->rd->overutilized = true; - trace_sched_overutilized(true); - } + + rcu_read_lock(); + sd = rcu_dereference(rq->sd); + if (!task_new && cpu_overutilized(rq->cpu)) + set_sd_overutilized(sd); + rcu_read_unlock();
/* * We want to potentially trigger a freq switch @@ -5921,7 +6000,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f }
if (!sd) { - if (energy_aware() && !cpu_rq(cpu)->rd->overutilized) + sd = rcu_dereference(cpu_rq(prev_cpu)->sd); + if (energy_aware() && !is_sd_overutilized(sd)) new_cpu = energy_aware_wake_cpu(p, prev_cpu, sync); else if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ new_cpu = select_idle_sibling(p, new_cpu); @@ -7003,6 +7083,7 @@ struct sd_lb_stats { struct sched_group *local; /* Local group in this sd */ unsigned long total_load; /* Total load of all groups in sd */ unsigned long total_capacity; /* Total capacity of all groups in sd */ + unsigned long total_util; /* Total util of all groups in sd */ unsigned long avg_load; /* Average load across all groups in sd */
struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */ @@ -7022,6 +7103,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) .local = NULL, .total_load = 0UL, .total_capacity = 0UL, + .total_util = 0UL, .busiest_stat = { .avg_load = 0UL, .sum_nr_running = 0, @@ -7343,10 +7425,11 @@ group_type group_classify(struct sched_group *group, static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, int load_idx, int local_group, struct sg_lb_stats *sgs, - bool *overload, bool *overutilized) + bool *overload) { unsigned long load; int i, nr_running; + bool overutilized = false;
memset(sgs, 0, sizeof(*sgs));
@@ -7379,7 +7462,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->idle_cpus++;
if (cpu_overutilized(i)) { - *overutilized = true; + overutilized = true; if (!sgs->group_misfit_task && rq->misfit_task) sgs->group_misfit_task = capacity_of(i); } @@ -7396,6 +7479,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->group_no_capacity = group_is_overloaded(env, sgs); sgs->group_type = group_classify(group, sgs); + + if (group->overutilized != overutilized) + group->overutilized = overutilized; }
/** @@ -7504,7 +7590,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd struct sched_group *sg = env->sd->groups; struct sg_lb_stats tmp_sgs; int load_idx, prefer_sibling = 0; - bool overload = false, overutilized = false; + bool overload = false;
if (child && child->flags & SD_PREFER_SIBLING) prefer_sibling = 1; @@ -7526,7 +7612,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd }
update_sg_lb_stats(env, sg, load_idx, local_group, sgs, - &overload, &overutilized); + &overload);
if (local_group) goto next_group; @@ -7566,6 +7652,7 @@ next_group: /* Now, start updating sd_lb_stats */ sds->total_load += sgs->group_load; sds->total_capacity += sgs->group_capacity; + sds->total_util += sgs->group_util;
sg = sg->next; } while (sg != env->sd->groups); @@ -7580,18 +7667,16 @@ next_group: if (env->dst_rq->rd->overload != overload) env->dst_rq->rd->overload = overload;
- /* Update over-utilization (tipping point, U >= 0) indicator */ - if (env->dst_rq->rd->overutilized != overutilized) { - env->dst_rq->rd->overutilized = overutilized; - trace_sched_overutilized(overutilized); - } - } else { - if (!env->dst_rq->rd->overutilized && overutilized) { - env->dst_rq->rd->overutilized = true; - trace_sched_overutilized(true); - } - } + /* + * If overall util is greater that overall capacity, + * set root domain's overutlized flag. + */ + if (sds->total_capacity * 1024 < sds->total_util * capacity_margin) + set_rd_overutilized(env->dst_rq->rd); + else + clear_rd_overutilized(env->dst_rq->rd);
+ } }
/** @@ -7834,7 +7919,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) */ update_sd_lb_stats(env, &sds);
- if (energy_aware() && !env->dst_rq->rd->overutilized) + if (energy_aware() && !is_sd_overutilized(env->sd)) goto out_balanced;
local = &sds.local_stat; @@ -8748,6 +8833,10 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
rcu_read_lock(); for_each_domain(cpu, sd) { + + if (energy_aware() && !is_sd_overutilized(sd)) + continue; + /* * Decay the newidle max times here because this is a regular * visit to all the domains. Decay ~1% per second. @@ -9039,6 +9128,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) { struct cfs_rq *cfs_rq; struct sched_entity *se = &curr->se; + struct sched_domain *sd;
for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); @@ -9049,12 +9139,13 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) task_tick_numa(rq, curr);
#ifdef CONFIG_SMP - if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) { - rq->rd->overutilized = true; - trace_sched_overutilized(true); - } - rq->misfit_task = !task_fits_max(curr, rq->cpu); + + rcu_read_lock(); + sd = rcu_dereference(rq->sd); + if (cpu_overutilized(task_cpu(curr)) || rq->misfit_task) + set_sd_overutilized(sd); + rcu_read_unlock(); #endif
} diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2f2b959..ca2cedb 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -916,6 +916,7 @@ struct sched_group { unsigned int group_weight; struct sched_group_capacity *sgc; const struct sched_group_energy const *sge; + bool overutilized;
/* * The CPUs this group covers.
On 12 December 2016 at 18:20, Leo Yan leo.yan@linaro.org wrote:
On Mon, Dec 12, 2016 at 03:28:06PM +0000, Morten Rasmussen wrote:
[...]
For 4 middle-sized tasks the EAS-code should hopefully spread the tasks at wake-up.
If we want to support spreading 4 middle-size tasks, the question is what's the criteria to set "overutilized" flag for LITTLE cluster's sched domain?
The EAS-code at wake-up should be fine as long tasks do regularly wake up, i.e. no task has a utilization approaching or exceeding the computer capacity available. Otherwise, we may need help from the big cluster.
Ideally, the criteria for calling for help (setting the overutilized flag at the root_domain for this RFC patch) should be:
It is not possible to balance the tasks within the cluster such that every cpu has a minimum of spare cycles.
Figuring out the whether it is possible to balance the tasks such that none of them are constrained in term of available cpu cycles isn't easy. Until now we have taken a much more conservative approach by making the criteria: If any cpu is over-utilized. As this is very easy to determine and should cover all the cases covered by the ideal criteria above, although we will call for help in many cases where it isn't necessary.
A half-baked thought:
If we assume that the wake-up EAS-code does a good job, could we set the flag if the wake target cpu ends up being over-utilized? If EAS failed to find enough capacity for the task it must be due to one the following reasons: 1. Because the task is too big to fit the raw capacity offered by the little cpus. 2. The utilization of other tasks leaves too little spare capacity left for the task, and it is not possible to reorganize the task distribution to get sufficient non-fragmented spare capacity. 3. Spare capacity is fragmented, but it would be possible to reorganize the tasks to provide the necessary spare capacity.
- and 2. should be fine as those are cases where we do need help from
the big cluster. 3. is more difficult as while it is theoretically possible to sort things out, it might take a long time to do so, in the meantime one or more tasks will suffer.
Your good summary actually reminds another important thing: we have given multiple meanings to "overutilized" this single one flag, as result this requires the single flag (a bool value) to handle multiple cases.
So I try to summary semantics for "overutilized" flag as below:
Inner "overutilized": This kind "overutilized" is the schedule domain internal issue and can be adjusted within schedule domain; so scheduler should find best combination between tasks and CPUs in the schedule domain, this also is quite match the case which you mantioned in upper item 3;
Outer "overutilized": This kind "overutilized" is that schedule domain cannot adjust by itself so ask other scheduler domains to help pull tasks. Usually we expects this is a higher capacity scheduler domain to pull tasks so can improve performance; This is quite match upper items 1/2;
Global "overutilized": This kind "overutilized" is that schedule domains should spread tasks as possible and this may happen for task migration from higher capacity schedule domain to lower capacity schedule domain;
I agree that we should make the distinction between overutilization that can be handled by the current sched_domain (inner) and that which must be addressed by upper levels (outer), but I don't see the need for a third global state which is redundant with outer
IIUC, in original code "rd->overutilized" flag is used to indicate these three semantics; and in Thara's patch the "sd->overutilized" flag is used for inner "overutilized" case, and use "rd->overutilized" to indicate outer "overutilized" case and global "overutilized" case. So Thara's patch is difficult to handle situation for 'misfit'.
I prefer to we can distinguish upper three semantics properly, something like below define macros:
#define SCHED_INNER_OVERUTILIZED 0x1 #define SCHED_OUTER_OVERUTILIZED 0x2 #define SCHED_GLOBAL_OVERUTILIZED 0x4
So we use all three macros for "sd->overutilized" and only use SCHED_GLOBAL_OVERUTILIZED for "rd->overutilized". So for example, if any schedule domain has set SCHED_OUTER_OVERUTILIZED, that means we could check the local schedule group with higher capacity than busiest schedule group and execute load balance.
Please feel free correct me if wrong.
Thanks, Leo Yan
On Mon, Dec 19, 2016 at 08:24:18AM +0100, Vincent Guittot wrote:
[...]
A half-baked thought:
If we assume that the wake-up EAS-code does a good job, could we set the flag if the wake target cpu ends up being over-utilized? If EAS failed to find enough capacity for the task it must be due to one the following reasons: 1. Because the task is too big to fit the raw capacity offered by the little cpus. 2. The utilization of other tasks leaves too little spare capacity left for the task, and it is not possible to reorganize the task distribution to get sufficient non-fragmented spare capacity. 3. Spare capacity is fragmented, but it would be possible to reorganize the tasks to provide the necessary spare capacity.
- and 2. should be fine as those are cases where we do need help from
the big cluster. 3. is more difficult as while it is theoretically possible to sort things out, it might take a long time to do so, in the meantime one or more tasks will suffer.
Your good summary actually reminds another important thing: we have given multiple meanings to "overutilized" this single one flag, as result this requires the single flag (a bool value) to handle multiple cases.
So I try to summary semantics for "overutilized" flag as below:
Inner "overutilized": This kind "overutilized" is the schedule domain internal issue and can be adjusted within schedule domain; so scheduler should find best combination between tasks and CPUs in the schedule domain, this also is quite match the case which you mantioned in upper item 3;
Outer "overutilized": This kind "overutilized" is that schedule domain cannot adjust by itself so ask other scheduler domains to help pull tasks. Usually we expects this is a higher capacity scheduler domain to pull tasks so can improve performance; This is quite match upper items 1/2;
Global "overutilized": This kind "overutilized" is that schedule domains should spread tasks as possible and this may happen for task migration from higher capacity schedule domain to lower capacity schedule domain;
I agree that we should make the disctinction between overutlization that can be handled by current sched_domain (inner) and those that must be addressed by upper levels (outer) but I don't see the need of a third global state which is redundant with outer
For the global state, one benefit I can think of is for benchmarks. If the whole system is quite busy we can set the global state and completely roll back to traditional SMP load balancing. For the outer "overutilized" flag, it only means there are tasks that should be moved out from a specific cluster to other clusters. Agree with this?
Thanks, Leo Yan
On 19 December 2016 at 15:22, Leo Yan leo.yan@linaro.org wrote:
On Mon, Dec 19, 2016 at 08:24:18AM +0100, Vincent Guittot wrote:
[...]
A half-baked thought:
If we assume that the wake-up EAS-code does a good job, could we set the flag if the wake target cpu ends up being over-utilized? If EAS failed to find enough capacity for the task it must be due to one the following reasons: 1. Because the task is too big to fit the raw capacity offered by the little cpus. 2. The utilization of other tasks leaves too little spare capacity left for the task, and it is not possible to reorganize the task distribution to get sufficient non-fragmented spare capacity. 3. Spare capacity is fragmented, but it would be possible to reorganize the tasks to provide the necessary spare capacity.
- and 2. should be fine as those are cases where we do need help from
the big cluster. 3. is more difficult as while it is theoretically possible to sort things out, it might take a long time to do so, in the meantime one or more tasks will suffer.
Your good summary actually reminds another important thing: we have given multiple meanings to "overutilized" this single one flag, as result this requires the single flag (a bool value) to handle multiple cases.
So I try to summary semantics for "overutilized" flag as below:
Inner "overutilized": This kind "overutilized" is the schedule domain internal issue and can be adjusted within schedule domain; so scheduler should find best combination between tasks and CPUs in the schedule domain, this also is quite match the case which you mantioned in upper item 3;
Outer "overutilized": This kind "overutilized" is that schedule domain cannot adjust by itself so ask other scheduler domains to help pull tasks. Usually we expects this is a higher capacity scheduler domain to pull tasks so can improve performance; This is quite match upper items 1/2;
Global "overutilized": This kind "overutilized" is that schedule domains should spread tasks as possible and this may happen for task migration from higher capacity schedule domain to lower capacity schedule domain;
I agree that we should make the disctinction between overutlization that can be handled by current sched_domain (inner) and those that must be addressed by upper levels (outer) but I don't see the need of a third global state which is redundant with outer
For global state, one benefit I can think out is for benchmark. If whole system is quite busy we can set global state and completely roll back to traditional SMP load balance. For outer "overutilize" flag, it only means there have tasks should be moved out from specific cluster to other clusters. Agree for this?
outer overutilized means outside the current sched_domain, so I still don't see the interest of a global state and how to set it compared to outer
Thanks, Leo Yan
On 12 December 2016 at 18:20, Leo Yan leo.yan@linaro.org wrote:
On Mon, Dec 12, 2016 at 03:28:06PM +0000, Morten Rasmussen wrote:
[...]
For 4 middle-sized tasks the EAS-code should hopefully spread the tasks at wake-up.
If we want to support spreading 4 middle-size tasks, the question is what's the criteria to set "overutilized" flag for LITTLE cluster's sched domain?
The EAS-code at wake-up should be fine as long as tasks do regularly wake up, i.e. no task has a utilization approaching or exceeding the compute capacity available. Otherwise, we may need help from the big cluster.
Ideally, the criteria for calling for help (setting the overutilized flag at the root_domain for this RFC patch) should be:
It is not possible to balance the tasks within the cluster such that every cpu has a minimum of spare cycles.
Figuring out the whether it is possible to balance the tasks such that none of them are constrained in term of available cpu cycles isn't easy. Until now we have taken a much more conservative approach by making the criteria: If any cpu is over-utilized. As this is very easy to determine and should cover all the cases covered by the ideal criteria above, although we will call for help in many cases where it isn't necessary.
A half-baked thought:
If we assume that the wake-up EAS-code does a good job, could we set the flag if the wake target cpu ends up being over-utilized? If EAS failed to find enough capacity for the task it must be due to one the following reasons: 1. Because the task is too big to fit the raw capacity offered by the little cpus. 2. The utilization of other tasks leaves too little spare capacity left for the task, and it is not possible to reorganize the task distribution to get sufficient non-fragmented spare capacity. 3. Spare capacity is fragmented, but it would be possible to reorganize the tasks to provide the necessary spare capacity.
1. and 2. should be fine as those are cases where we do need help from
the big cluster. 3. is more difficult as while it is theoretically possible to sort things out, it might take a long time to do so, in the meantime one or more tasks will suffer.
Your good summary actually reminds another important thing: we have given multiple meanings to "overutilized" this single one flag, as result this requires the single flag (a bool value) to handle multiple cases.
So I try to summary semantics for "overutilized" flag as below:
Inner "overutilized": This kind "overutilized" is the schedule domain internal issue and can be adjusted within schedule domain; so scheduler should find best combination between tasks and CPUs in the schedule domain, this also is quite match the case which you mantioned in upper item 3;
Outer "overutilized": This kind "overutilized" is that schedule domain cannot adjust by itself so ask other scheduler domains to help pull tasks. Usually we expects this is a higher capacity scheduler domain to pull tasks so can improve performance; This is quite match upper items 1/2;
Global "overutilized": This kind "overutilized" is that schedule domains should spread tasks as possible and this may happen for task migration from higher capacity schedule domain to lower capacity schedule domain;
IIUC, in original code "rd->overutilized" flag is used to indicate these three semantics; and in Thara's patch the "sd->overutilized" flag is used for inner "overutilized" case, and use "rd->overutilized" to indicate outer "overutilized" case and global "overutilized" case.
No in Thara's path, sd->overutilized and rd->overutilized have the exact same meaning, it is just that we rely on the parent to share the over utilization with the other cpu at the same level and the rd->overutilized is used as the parent of the last sd level but there is no difference in the usage
So it is difficult for Thara's patch to handle the 'misfit' situation.
I prefer to we can distinguish upper three semantics properly, something like below define macros:
#define SCHED_INNER_OVERUTILIZED 0x1 #define SCHED_OUTER_OVERUTILIZED 0x2 #define SCHED_GLOBAL_OVERUTILIZED 0x4
So we use all three macros for "sd->overutilized" and only use SCHED_GLOBAL_OVERUTILIZED for "rd->overutilized". So for example, if any schedule domain has set SCHED_OUTER_OVERUTILIZED, that means we could check the local schedule group with higher capacity than busiest schedule group and execute load balance.
Please feel free correct me if wrong.
Thanks, Leo Yan
On Mon, Dec 19, 2016 at 08:27:15AM +0100, Vincent Guittot wrote:
[...]
No in Thara's path, sd->overutilized and rd->overutilized have the exact same meaning, it is just that we rely on the parent to share the over utilization with the other cpu at the same level and the rd->overutilized is used as the parent of the last sd level but there is no difference in the usage
I think sd->overutilized and rd->overutilized have different visibility for CPUs. Please see below example:
CPU A SD level 1 - SG1 (CPUA), SG2 (CPUB) SD level 2 - SG5(CPUA, CPUB), SG6(CPU C, CPU D) RD
CPU B SD level 1 - SG2(CPUB), SG1 (CPUA) SD level 2 - SG5(CPU A, CPU B), SG6(CPU C, CPUD) RD
CPU C SD level 1 - SG3(CPU C), SG4 (CPUD) SD level 2 - SG6(CPUC, CPUD), SG5(CPUA, CPU B) RD
CPU D SD level 1 - SG4(CPU D), SG3(CPU C) SD level2 - SG6(CPUC, CPU D), SG5(CPU A, APU B) RD
If CPUA set its sd->overutilized flag into SG5, then later CPUC check sd->overutilized CPUC will only check the flags in SG6. So CPUA set sd->overutilized flag and this flag can be observed by CPUB, but CPUC cannot observe it.
But the rd->overutilized flag is visible to all CPUs. This is why I think the function is_sd_overutilized() should change as below: CPUC iterates over all "sd->overutilized" flags in the same schedule domain and finally finds that SG5's "overutilized" flag was set by CPUA.
static bool is_sd_overutilized(struct sched_domain *sd) { struct sched_group *group = sd->groups; int cpu = smp_processor_id();
if (cpu_rq(cpu)->rd->overutilized) return true;
do { if (group->overutilized) return true;
} while (group = group->next, group != sd->groups);
return false; }
Thanks, Leo Yan
Hello Leo,
On 12/19/2016 10:02 AM, Leo Yan wrote:
On Mon, Dec 19, 2016 at 08:27:15AM +0100, Vincent Guittot wrote:
[...]
No in Thara's path, sd->overutilized and rd->overutilized have the exact same meaning, it is just that we rely on the parent to share the over utilization with the other cpu at the same level and the rd->overutilized is used as the parent of the last sd level but there is no difference in the usage
I think sd->overutilized and rd->overutilized have different visibility for CPUs. Please see below example:
CPU A SD level 1 - SG1 (CPUA), SG2 (CPUB) SD level 2 - SG5(CPUA, CPUB), SG6(CPU C, CPU D) RD
CPU B SD level 1 - SG2(CPUB), SG1 (CPUA) SD level 2 - SG5(CPU A, CPU B), SG6(CPU C, CPUD) RD
CPU C SD level 1 - SG3(CPU C), SG4 (CPUD) SD level 2 - SG6(CPUC, CPUD), SG5(CPUA, CPU B) RD
CPU D SD level 1 - SG4(CPU D), SG3(CPU C) SD level2 - SG6(CPUC, CPU D), SG5(CPU A, APU B) RD
If CPUA set its sd->overutilized flag into SG5, then later CPUC check sd->overutilized CPUC will only check the flags in SG6. So CPUA set sd->overutilized flag and this flag can be observed by CPUB, but CPUC cannot observe it.
Yes you are correct. The intend here is to allow CPU A and CPU B to sort out the imbalance by themselves before involving CPU C. The update_sd_lb already takes care of setting this flag in RD if the imbalance cannot be sorted out inside SG5.
+ /* If the domain util is greater that domain capacity, load balancing + * needs to be done at the next sched domain level as well + */ + if (sds->total_capacity * 1024 < sds->total_util * capacity_margin) { + /* If already at the highest domain nothing can be done */ + if (env->sd->parent) + set_sd_overutilized(env->sd->parent, + env->dst_rq->rd);
But what is missing is handling of misfit task. Can we not handle misfit task as a separate condition in update_sd_lb? i.e in the above example if either CPU A or CPU B has a misfit task, set the overutilization flag for the next level SD which is equivalent to setting the flag in RD in this case.
Regards Thara
But for rd->overutilized flag, it is visible to all CPUs. This is why I think function is_sd_overutilized() should change as below, CPUC iterates all "sd->overutilized" flags in the same schedule domain and finally find SG5's "overutilized" flag is set CPUA.
static bool is_sd_overutilized(struct sched_domain *sd) { struct sched_group *group = sd->groups; int cpu = smp_processor_id();
if (cpu_rq(cpu)->rd->overutilized) return true; do { if (group->overutilized) return true; } while (group = group->next, group != sd->groups); return false;
}
Thanks, Leo Yan
-- Regards Thara
Hi Thara,
On Mon, Dec 19, 2016 at 10:17:29AM -0500, Thara Gopinath wrote:
[...]
But what is missing is handling of misfit task. Can we not handle misfit task as a separate condition in update_sd_lb? i.e in the above example if either CPU A or CPU B has a misfit task, set the overutilization flag for the next level SD which is equivalent to setting the flag in RD in this case.
Agree, we can do this for misfit task :)
IIUC, the idea of your patch is firstly to use SD level 2 flag to present "inner" overutilized, then later in load balance flow to check if need set rd->overutilized flag for outer 'overutilized'. So for 'misfit' case, we need wait until load balance flow to check it and set rd->overutilized flag.
This is why I suggest to use 'discrete' flags in corresponding SD level to present outer 'overutilized', so we can set flag at the first place for outer 'overutilized' but not delay until in load balance flow.
Thanks, Leo Yan
On 12/19/2016 07:42 PM, Leo Yan wrote:
Hi Thara,
On Mon, Dec 19, 2016 at 10:17:29AM -0500, Thara Gopinath wrote:
[...]
But what is missing is handling of misfit task. Can we not handle misfit task as a separate condition in update_sd_lb? i.e in the above example if either CPU A or CPU B has a misfit task, set the overutilization flag for the next level SD which is equivalent to setting the flag in RD in this case.
Agree, we can do this for misfit task :)
IIUC, the idea of your patch is firstly to use SD level 2 flag to present "inner" overutilized, then later in load balance flow to check if need set rd->overutilized flag for outer 'overutilized'. So for 'misfit' case, we need wait until load balance flow to check it and set rd->overutilized flag.
rd->overutilized is like the overutilized flag at any sched group level but for the highest sched_domain that does not have a parent. I am not sure if i understand inner and outer over utilized properly. Say in a system a cpu has four levels of sched domain - level1, level2, level3 and level4. What my patch proposes is as follows- If a load balance has to happen for this cpu at level1, the flag will be set at first sched group in level2. Similarly if load balance has to happen at level2, the flag will be set at thefirst sched group in level3. Following this, if a load balance has to happen at the highest level, ie level4, the flag will be set at rd.
This is why I suggest to use 'discrete' flags in corresponding SD level to present outer 'overutilized', so we can set flag at the first place for outer 'overutilized' but not delay until in load balance flow.
Instead of directly setting the flag at the highest level, should we not try to balance the load out at a lower level, if possible?
Thanks, Leo Yan
-- Regards Thara
On Mon, Dec 19, 2016 at 09:16:58PM -0500, Thara Gopinath wrote:
On 12/19/2016 07:42 PM, Leo Yan wrote:
On Mon, Dec 19, 2016 at 10:17:29AM -0500, Thara Gopinath wrote:
[...]
But what is missing is handling of misfit task. Can we not handle misfit task as a separate condition in update_sd_lb? i.e in the above example if either CPU A or CPU B has a misfit task, set the overutilization flag for the next level SD which is equivalent to setting the flag in RD in this case.
Agree, we can do this for misfit task :)
IIUC, the idea of your patch is firstly to use SD level 2 flag to present "inner" overutilized, then later in load balance flow to check if need set rd->overutilized flag for outer 'overutilized'. So for 'misfit' case, we need wait until load balance flow to check it and set rd->overutilized flag.
rd->overutilized is like the overutilized flag at any sched group level but for the highest sched_domain that does not have a parent. I am not sure if i understand inner and outer over utilized properly. Say in a system a cpu has four levels of sched domain - level1, level2, level3 and level4. What my patch proposes is as follows- If a load balance has to happen for this cpu at level1, the flag will be set at first sched group in level2. Similarly if load balance has to happen at level2, the flag will be set at thefirst sched group in level3. Following this, if a load balance has to happen at the highest level, ie level4, the flag will be set at rd.
E.g. in the above case, after setting the rd->overutilized flag, the scheduler cannot distinguish which specific schedule group the load balance requirement is _coming_ from. The rd->overutilized flag is an overall flag to indicate the load balance should happen within level 4, but we lose info such as which schedule group in level 4 has a performance issue that the scheduler should help with.
I recognize here have a big different understanding for how to use the 'overutilized' flag. One method is to use "overutilized" flag to indicate one specific schedule domain is over-utilized so need do load balance but we cannot know from these flags which schedule groups within SD have performance bottleneck.
Another method is to use the "overutilized" flag to indicate that one specific schedule group has a performance bottleneck, so any schedule group can set the "overutilized" flag for itself. Finally the scheduler can easily know which schedule groups have a bottleneck (the LB requirement from 'who') and should migrate tasks out from them. I personally think this can give us more chances to do subtle optimization with this info, e.g. we know "overutilized" happens in the LITTLE cluster so we can have a different strategy than when "overutilized" happens in the big cluster.
This is why I suggest to use 'discrete' flags in corresponding SD level to present outer 'overutilized', so we can set flag at the first place for outer 'overutilized' but not delay until in load balance flow.
Instead of directly setting the flag at the highest level, should we not try to balance the load out at a lower level, if possible?
For 'misfit' task, we don't need do load balance in SD level 1; For other case, we can firstly do load balance in SD level 1.
Thanks, Leo Yan
On 20 December 2016 at 05:02, Leo Yan leo.yan@linaro.org wrote:
On Mon, Dec 19, 2016 at 09:16:58PM -0500, Thara Gopinath wrote:
On 12/19/2016 07:42 PM, Leo Yan wrote:
On Mon, Dec 19, 2016 at 10:17:29AM -0500, Thara Gopinath wrote:
[...]
But what is missing is handling of misfit task. Can we not handle misfit task as a separate condition in update_sd_lb? i.e in the above example if either CPU A or CPU B has a misfit task, set the overutilization flag for the next level SD which is equivalent to setting the flag in RD in this case.
Agree, we can do this for misfit task :)
IIUC, the idea of your patch is firstly to use SD level 2 flag to present "inner" overutilized, then later in load balance flow to check if need set rd->overutilized flag for outer 'overutilized'. So for 'misfit' case, we need wait until load balance flow to check it and set rd->overutilized flag.
rd->overutilized is like the overutilized flag at any sched group level but for the highest sched_domain that does not have a parent. I am not sure if i understand inner and outer over utilized properly. Say in a system a cpu has four levels of sched domain - level1, level2, level3 and level4. What my patch proposes is as follows- If a load balance has to happen for this cpu at level1, the flag will be set at first sched group in level2. Similarly if load balance has to happen at level2, the flag will be set at thefirst sched group in level3. Following this, if a load balance has to happen at the highest level, ie level4, the flag will be set at rd.
E.g. in upper case after set rd->overutilized flag, the scheduler cannot distinguish the load blance requirement _coming_ from which specific schedule group. rd-overutilied flag is an overall flag to indicate the load balance should happen within level 4, but we lose info like in level 4 which schedule group has performance issue so scheduler should help it.
I recognize here have a big different understanding for how to use the 'overutilized' flag. One method is to use "overutilized" flag to indicate one specific schedule domain is over-utilized so need do load balance but we cannot know from these flags which schedule groups within SD have performance bottleneck.
That's what happens with normal load balancing. The load balance never tags a sched_group as overloaded but looks at the busiest sched_group in the sched_domain. The overutilization flag is just an optimization to say: don't waste your time doing load balance at this SD level because the sched_groups can handle their utilization by themselves
Another method is to use "overutilized" flag to indicate one specific schedule group has performance bootleneck so any schedule group can set "overutilized" flag for itself. Finally scheduler can easily know
not easily because it will have to parse all the sched_group in a sched_domain which is not scalable
which schedule groups have bottlenech (the LB requirement from 'who') and should migrate out tasks from them. I personally this can give us more chance to do subtle optimization with these infos, like we know "overutilized" happens in LITTLE cluster so we can have different strategy when "overutilized" happens in big cluster.
This is why I suggest to use 'discrete' flags in corresponding SD level to present outer 'overutilized', so we can set flag at the first place for outer 'overutilized' but not delay until in load balance flow.
Instead of directly setting the flag at the highest level, should we not try to balance the load out at a lower level, if possible?
For 'misfit' task, we don't need do load balance in SD level 1; For other case, we can firstly do load balance in SD level 1.
Thanks, Leo Yan
On 19 December 2016 at 16:02, Leo Yan leo.yan@linaro.org wrote:
On Mon, Dec 19, 2016 at 08:27:15AM +0100, Vincent Guittot wrote:
[...]
No in Thara's path, sd->overutilized and rd->overutilized have the exact same meaning, it is just that we rely on the parent to share the over utilization with the other cpu at the same level and the rd->overutilized is used as the parent of the last sd level but there is no difference in the usage
I think sd->overutilized and rd->overutilized have different visibility for CPUs. Please see below example:
CPU A SD level 1 - SG1 (CPUA), SG2 (CPUB) SD level 2 - SG5(CPUA, CPUB), SG6(CPU C, CPU D) RD
CPU B SD level 1 - SG2(CPUB), SG1 (CPUA) SD level 2 - SG5(CPU A, CPU B), SG6(CPU C, CPUD) RD
CPU C SD level 1 - SG3(CPU C), SG4 (CPUD) SD level 2 - SG6(CPUC, CPUD), SG5(CPUA, CPU B) RD
CPU D SD level 1 - SG4(CPU D), SG3(CPU C) SD level2 - SG6(CPUC, CPU D), SG5(CPU A, APU B) RD
If CPUA set its sd->overutilized flag into SG5, then later CPUC check sd->overutilized CPUC will only check the flags in SG6. So CPUA set sd->overutilized flag and this flag can be observed by CPUB, but CPUC cannot observe it.
yes and it's normal, we set the flag in SG5 to say that load balance is needed at sd_level1 between CPUA and CPUB. We use the SG at the parent level because it is shared between all CPUs involved in the child sd level. But the last sd level has no parent :-) so we use rd as the parent
But for rd->overutilized flag, it is visible to all CPUs. This is why I think function is_sd_overutilized() should change as below, CPUC iterates all "sd->overutilized" flags in the same schedule domain and
We use the SG at parent level to prevent this not scalable while loop
finally find SG5's "overutilized" flag is set CPUA.
static bool is_sd_overutilized(struct sched_domain *sd) { struct sched_group *group = sd->groups; int cpu = smp_processor_id();
if (cpu_rq(cpu)->rd->overutilized) return true; do { if (group->overutilized) return true; } while (group = group->next, group != sd->groups); return false;
}
Thanks, Leo Yan
On 12/19/2016 04:36 PM, Vincent Guittot wrote:
On 19 December 2016 at 16:02, Leo Yan leo.yan@linaro.org wrote:
On Mon, Dec 19, 2016 at 08:27:15AM +0100, Vincent Guittot wrote:
[...]
No in Thara's path, sd->overutilized and rd->overutilized have the exact same meaning, it is just that we rely on the parent to share the over utilization with the other cpu at the same level and the rd->overutilized is used as the parent of the last sd level but there is no difference in the usage
I think sd->overutilized and rd->overutilized have different visibility for CPUs. Please see below example:
CPU A SD level 1 - SG1 (CPUA), SG2 (CPUB) SD level 2 - SG5(CPUA, CPUB), SG6(CPU C, CPU D) RD
CPU B SD level 1 - SG2(CPUB), SG1 (CPUA) SD level 2 - SG5(CPU A, CPU B), SG6(CPU C, CPUD) RD
CPU C SD level 1 - SG3(CPU C), SG4 (CPUD) SD level 2 - SG6(CPUC, CPUD), SG5(CPUA, CPU B) RD
CPU D SD level 1 - SG4(CPU D), SG3(CPU C) SD level2 - SG6(CPUC, CPU D), SG5(CPU A, APU B) RD
If CPUA set its sd->overutilized flag into SG5, then later CPUC check sd->overutilized CPUC will only check the flags in SG6. So CPUA set sd->overutilized flag and this flag can be observed by CPUB, but CPUC cannot observe it.
yes and it's normal, we set flag into SG5 to say that load balance is need at sd_level1 between CPUA and CPUB. We use the SG at parent level because it is shared between all CPU involved in the child sd level. But the last sd level has not parent :-) so we use rd as the parent
But for rd->overutilized flag, it is visible to all CPUs. This is why I think function is_sd_overutilized() should change as below, CPUC iterates all "sd->overutilized" flags in the same schedule domain and
We use the SG at parent level to prevent this not scalable while loop
IMHO, wouldn't the newly introduced struct sched_domain_shared (commit 24fc7edb92ee "sched/core: Introduce 'struct sched_domain_shared'") be the perfect infrastructure for this kind of job? It is per-cpu data which is shared between all 'identical' sched domains.
It would allow us to not touch the root_domain for this business and thus this source of potential misunderstanding.
It's currently limited to solve another problem but was designed to be easily extended. It's not in EAS product code line but the latest EAS integration already has it.
What do you guys think?
On 19 December 2016 at 18:47, Dietmar Eggemann dietmar.eggemann@arm.com wrote:
On 12/19/2016 04:36 PM, Vincent Guittot wrote:
On 19 December 2016 at 16:02, Leo Yan leo.yan@linaro.org wrote:
On Mon, Dec 19, 2016 at 08:27:15AM +0100, Vincent Guittot wrote:
[...]
No in Thara's path, sd->overutilized and rd->overutilized have the exact same meaning, it is just that we rely on the parent to share the over utilization with the other cpu at the same level and the rd->overutilized is used as the parent of the last sd level but there is no difference in the usage
I think sd->overutilized and rd->overutilized have different visibility for CPUs. Please see below example:
CPU A SD level 1 - SG1 (CPUA), SG2 (CPUB) SD level 2 - SG5(CPUA, CPUB), SG6(CPU C, CPU D) RD
CPU B SD level 1 - SG2(CPUB), SG1 (CPUA) SD level 2 - SG5(CPU A, CPU B), SG6(CPU C, CPUD) RD
CPU C SD level 1 - SG3(CPU C), SG4 (CPUD) SD level 2 - SG6(CPUC, CPUD), SG5(CPUA, CPU B) RD
CPU D SD level 1 - SG4(CPU D), SG3(CPU C) SD level2 - SG6(CPUC, CPU D), SG5(CPU A, APU B) RD
If CPUA set its sd->overutilized flag into SG5, then later CPUC check sd->overutilized CPUC will only check the flags in SG6. So CPUA set sd->overutilized flag and this flag can be observed by CPUB, but CPUC cannot observe it.
yes and it's normal, we set flag into SG5 to say that load balance is need at sd_level1 between CPUA and CPUB. We use the SG at parent level because it is shared between all CPU involved in the child sd level. But the last sd level has not parent :-) so we use rd as the parent
But for rd->overutilized flag, it is visible to all CPUs. This is why I think function is_sd_overutilized() should change as below, CPUC iterates all "sd->overutilized" flags in the same schedule domain and
We use the SG at parent level to prevent this not scalable while loop
IMHO, wouldn't the newly introduced struct sched_domain_shared (commit 24fc7edb92ee "sched/core: Introduce 'struct sched_domain_shared'" be the perfect infrastructure for this kind of job? It is per-cpu data which is shared between all 'identical' sched domains.
It would allow us to not touch the root_domain for this business and thus this source of potential misunderstanding.
It's currently limited to solve another problem but was designed to be easily extended. It's not in EAS product code line but the latest EAS integration already has it.
What do you guys think?
Yes I agree that this shared struct should be used to share flags between CPUs, but it appeared quite recently and EAS was not available with this feature when the patch was done. The priority was to start review and discussion, but the next version should use it instead
On 12/19/2016 07:02 PM, Vincent Guittot wrote:
On 19 December 2016 at 18:47, Dietmar Eggemann dietmar.eggemann@arm.com wrote:
On 12/19/2016 04:36 PM, Vincent Guittot wrote:
On 19 December 2016 at 16:02, Leo Yan leo.yan@linaro.org wrote:
On Mon, Dec 19, 2016 at 08:27:15AM +0100, Vincent Guittot wrote:
[...]
Yes I agree that this shared struct should be used to share flags between CPUs but it has appeared quite recently and EAS was not available for with this feature when the patch has been done. The priority was to start review and discussion but the next version should use it instead
Fair enough. Totally agree on this one.
On 12/19/2016 12:47 PM, Dietmar Eggemann wrote:
On 12/19/2016 04:36 PM, Vincent Guittot wrote:
On 19 December 2016 at 16:02, Leo Yan leo.yan@linaro.org wrote:
On Mon, Dec 19, 2016 at 08:27:15AM +0100, Vincent Guittot wrote:
[...]
No in Thara's path, sd->overutilized and rd->overutilized have the exact same meaning, it is just that we rely on the parent to share the over utilization with the other cpu at the same level and the rd->overutilized is used as the parent of the last sd level but there is no difference in the usage
I think sd->overutilized and rd->overutilized have different visibility for CPUs. Please see below example:
CPU A SD level 1 - SG1 (CPUA), SG2 (CPUB) SD level 2 - SG5(CPUA, CPUB), SG6(CPU C, CPU D) RD
CPU B SD level 1 - SG2(CPUB), SG1 (CPUA) SD level 2 - SG5(CPU A, CPU B), SG6(CPU C, CPUD) RD
CPU C SD level 1 - SG3(CPU C), SG4 (CPUD) SD level 2 - SG6(CPUC, CPUD), SG5(CPUA, CPU B) RD
CPU D SD level 1 - SG4(CPU D), SG3(CPU C) SD level2 - SG6(CPUC, CPU D), SG5(CPU A, APU B) RD
If CPUA set its sd->overutilized flag into SG5, then later CPUC check sd->overutilized CPUC will only check the flags in SG6. So CPUA set sd->overutilized flag and this flag can be observed by CPUB, but CPUC cannot observe it.
yes and it's normal, we set flag into SG5 to say that load balance is need at sd_level1 between CPUA and CPUB. We use the SG at parent level because it is shared between all CPU involved in the child sd level. But the last sd level has not parent :-) so we use rd as the parent
But for rd->overutilized flag, it is visible to all CPUs. This is why I think function is_sd_overutilized() should change as below, CPUC iterates all "sd->overutilized" flags in the same schedule domain and
We use the SG at parent level to prevent this not scalable while loop
IMHO, wouldn't the newly introduced struct sched_domain_shared (commit 24fc7edb92ee "sched/core: Introduce 'struct sched_domain_shared'" be the perfect infrastructure for this kind of job? It is per-cpu data which is shared between all 'identical' sched domains.
It would allow us to not touch the root_domain for this business and thus this source of potential misunderstanding.
It's currently limited to solve another problem but was designed to be easily extended. It's not in EAS product code line but the latest EAS integration already has it.
What do you guys think?
Hello Dietmar,
Thanks for the review. As Vincent mentioned, this feature was not integrated with EAS when I was coding the patch. I will rebase for the next version.
Regards Thara
-- Regards Thara
Let me join the party here. Hope it's not too late ;)
On 12/07/2016 02:22 PM, Thara Gopinath wrote:
@@ -6063,10 +6094,16 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); }
- if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized))
return select_energy_cpu_brute(p, prev_cpu);
- rcu_read_lock();
- sd = rcu_dereference(cpu_rq(prev_cpu)->sd);
- if (energy_aware() &&
!is_sd_overutilized(sd,
cpu_rq(cpu)->rd)) {
IMHO skipping EA task placement based on the inner overutilized flag can still cause suboptimal placement when there is unavoidable load imbalance within a SD. For example, say there is a 2+2 big.LITTLE system and CPU0 and CPU1 are little CPUs. All other CPUs are idle but little CPU1 has a CPU-bound task which makes the little cluster's SD marked as overutilized. Under such a condition any new wake-up whose task's prev_cpu is either CPU0 or CPU1 will do non-EA placement. As a result small tasks can be on a big CPU and also a big task can be on a little CPU (probably an idle little CPU).
Overutilzed big CPUs' SD will have same issue for the tasks happened to be on a big CPU previously but better to be on a little CPU later.
What do you think? It can be tricky but it could be better to mark as overutilized when all the CPUs in the SD are actually overutilized, for both task placement and lb.
new_cpu = select_energy_cpu_brute(p, prev_cpu);
goto unlock;
- }
- sd = NULL;
Leo, BTW where is the EAS repo I can publicly access to sync with your tip? I'm on google's lsk-v4.4-16.09-android tree and hope I can post my patches not too late but looks like there are quite dependencies.
Thanks, Joonwoo
On Wed, Jan 18, 2017 at 02:26:53PM -0800, Joonwoo Park wrote:
Let me join the party here. Hope it's not too late ;)
Not at all :)
On 12/07/2016 02:22 PM, Thara Gopinath wrote:
@@ -6063,10 +6094,16 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); }
- if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized))
return select_energy_cpu_brute(p, prev_cpu);
- rcu_read_lock();
- sd = rcu_dereference(cpu_rq(prev_cpu)->sd);
- if (energy_aware() &&
!is_sd_overutilized(sd,
cpu_rq(cpu)->rd)) {
IMHO skipping EA task placement based on the inner overutilized flag can still result in suboptimal placement when there is unavoidable load imbalance within a SD. For example, say there is a 2+2 big.LITTLE system and CPU0 and CPU1 are little CPUs. All other CPUs are idle, but little CPU1 has a CPU-bound task which gets the little cluster's SD marked as overutilized. Under such a condition, any new wake-up for which the task's prev_cpu is either CPU0 or CPU1 will do non-EA placement. As a result, small tasks can end up on a big CPU and a big task can end up on a little CPU (probably an idle little CPU).
An overutilized big CPUs' SD will have the same issue for tasks that happened to be on a big CPU previously but would be better placed on a little CPU later.
Good point, and I agree. In the wakeup path, deferring the tipping point is important for power optimization by EA task placement.
So I think we should introduce 'global' overutilized flag. For 'inner' and 'outer' overutilized cases, scheduler will always use EA task placement but for 'global' overutilized we can skip it.
What do you think? It can be tricky, but it could be better to mark the SD as overutilized only when all the CPUs in the SD are actually overutilized, for both task placement and lb.
This is a bit of a rigid criterion.
IMHO the one obvious benefit we can get from this patch is power optimization, but we cannot hurt performance. At least we cannot introduce too much performance degradation with this patch.
If we wait for all CPUs in the SD to be marked as overutilized before lb, this may introduce a serious performance regression for benchmarks, e.g. Vellamo. But maybe there is some minor tuning we can try, like skipping task migration when the source sched group is not overutilized but the destination sched group is overutilized; so we can implement one-way migration for some specific cases.
new_cpu = select_energy_cpu_brute(p, prev_cpu);
goto unlock;
- }
- sd = NULL;
Leo, BTW where is the EAS repo I can publicly access to sync with your tip? I'm on google's lsk-v4.4-16.09-android tree and hope I can post my patches not too late, but it looks like there are quite a few dependencies.
This is Thara's patch :) I think Thara is working on the ARM development branch, since I saw Thara's patch includes some new functions introduced by the ARM development branch: git://www.linux-arm.org/linux-power.git, branch: origin/eas/next/integration_20161206_1147
Thara, please correct as needed.
For the production code line, from previous alignment it's better to work on Android common kernel 3.18 or 4.4. They have quite the same code base for EAS:
https://android.googlesource.com/kernel/common/+log/android-4.4 https://android.googlesource.com/kernel/common/+log/android-3.18
I also post my tweaked version of Thara's patch in case you are interested in it; I did some verification on Juno. Feedback and suggestions will be very helpful :)
---8<---
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f98e434..7371e56 100755 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4200,6 +4200,68 @@ static inline void hrtick_update(struct rq *rq) #ifdef CONFIG_SMP static bool cpu_overutilized(int cpu); static inline unsigned long boosted_cpu_util(int cpu); + +/* + * 1. Inner overutilized: + * + * The load balance will happen only in SD Level 1, so this means + * only take affact on clustser internally. + * + * 2. Outer overutilized: + * + * If the CPU has misfit on it, it's no doubt to migrate task + * to another high capacity CPU. + * + * Or if one CPU is overutilized and we assume now scheduler has + * done good enough work to explore cluster internal capacity, so + * if one CPU is overutilized that means finally need seek another + * cluster to provide more computing capacity. + * + * 3. Global overutilized: + * + * If set root domain flag, means explore performance as possible + * to spread out tasks. + * + */ +static void set_sd_overutilized(struct sched_domain *sd) +{ + if (sd) + sd->groups->overutilized = true; +} + +static void clear_sd_overutilized(struct sched_domain *sd) +{ + if (sd) + sd->groups->overutilized = false; +} + +static void set_rd_overutilized(struct root_domain *rd) +{ + rd->overutilized = true; +} + +static void clear_rd_overutilized(struct root_domain *rd) +{ + rd->overutilized = false; +} + +static bool is_sd_overutilized(struct sched_domain *sd) +{ + struct sched_group *group = sd->groups; + int cpu = smp_processor_id(); + + if (cpu_rq(cpu)->rd->overutilized) + return true; + + do { + if (group->overutilized) + return true; + + } while (group = group->next, group != sd->groups); + + return false; +} + #else #define boosted_cpu_util(cpu) cpu_util(cpu) #endif @@ -4228,6 +4290,7 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; + struct sched_domain *sd; struct sched_entity *se = &p->se; #ifdef CONFIG_SMP int task_new = 
flags & ENQUEUE_WAKEUP_NEW; @@ -4292,11 +4355,17 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (!se) { walt_inc_cumulative_runnable_avg(rq, p); - if (!task_new && !rq->rd->overutilized && - cpu_overutilized(rq->cpu)) { - rq->rd->overutilized = true; - trace_sched_overutilized(true); + + rcu_read_lock(); + sd = rcu_dereference(rq->sd); + if (!task_new) { + if (cpu_overutilized(rq->cpu) && sd) + set_sd_overutilized(sd); + + if (rq->misfit_task && sd && sd->parent) + set_sd_overutilized(sd->parent); } + rcu_read_unlock();
/* * We want to potentially trigger a freq switch @@ -7188,6 +7257,7 @@ struct sd_lb_stats { struct sched_group *local; /* Local group in this sd */ unsigned long total_load; /* Total load of all groups in sd */ unsigned long total_capacity; /* Total capacity of all groups in sd */ + unsigned long total_util; /* Total util of all groups in sd */ unsigned long avg_load; /* Average load across all groups in sd */
struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */ @@ -7207,6 +7277,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) .local = NULL, .total_load = 0UL, .total_capacity = 0UL, + .total_util = 0UL, .busiest_stat = { .avg_load = 0UL, .sum_nr_running = 0, @@ -7528,10 +7599,11 @@ group_type group_classify(struct sched_group *group, static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, int load_idx, int local_group, struct sg_lb_stats *sgs, - bool *overload, bool *overutilized) + bool *overload, bool *misfit) { unsigned long load; int i, nr_running; + bool overutilized = false;
memset(sgs, 0, sizeof(*sgs));
@@ -7564,10 +7636,13 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->idle_cpus++;
if (cpu_overutilized(i)) - *overutilized = true; + overutilized = true;
if (!sgs->group_misfit_task && rq->misfit_task) sgs->group_misfit_task = capacity_of(i); + + if (rq->misfit_task) + *misfit = true; }
/* Adjust by relative CPU capacity of the group */ @@ -7581,6 +7656,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->group_no_capacity = group_is_overloaded(env, sgs); sgs->group_type = group_classify(group, sgs); + + if (sgs->group_weight == 1) + group->overutilized = overutilized; }
/** @@ -7689,7 +7767,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd struct sched_group *sg = env->sd->groups; struct sg_lb_stats tmp_sgs; int load_idx, prefer_sibling = 0; - bool overload = false, overutilized = false; + bool overload = false, misfit = false;
if (child && child->flags & SD_PREFER_SIBLING) prefer_sibling = 1; @@ -7711,7 +7789,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd }
update_sg_lb_stats(env, sg, load_idx, local_group, sgs, - &overload, &overutilized); + &overload, &misfit);
if (local_group) goto next_group; @@ -7751,6 +7829,7 @@ next_group: /* Now, start updating sd_lb_stats */ sds->total_load += sgs->group_load; sds->total_capacity += sgs->group_capacity; + sds->total_util += sgs->group_util;
sg = sg->next; } while (sg != env->sd->groups); @@ -7765,18 +7844,28 @@ next_group: if (env->dst_rq->rd->overload != overload) env->dst_rq->rd->overload = overload;
- /* Update over-utilization (tipping point, U >= 0) indicator */ - if (env->dst_rq->rd->overutilized != overutilized) { - env->dst_rq->rd->overutilized = overutilized; - trace_sched_overutilized(overutilized); - } + /* + * If total utilization is more than half of capacity, + * this means at least the average CPU utilization is + * crossing half of max capacity CPU; so this is a quite + * high bar to set root domain's overutlized flag. + */ + if (sds->total_capacity < sds->total_util * 2) + set_rd_overutilized(env->dst_rq->rd); + else + clear_rd_overutilized(env->dst_rq->rd); } else { - if (!env->dst_rq->rd->overutilized && overutilized) { - env->dst_rq->rd->overutilized = true; - trace_sched_overutilized(true); - } + /* + * If the domain util is greater that domain capacity, + * load balancing needs to be done at the next sched + * domain level as well + */ + if ((sds->total_capacity * 1024 < + sds->total_util * capacity_margin) || misfit) + set_sd_overutilized(env->sd->parent); + else + clear_sd_overutilized(env->sd->parent); } - }
/** @@ -8019,7 +8108,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) */ update_sd_lb_stats(env, &sds);
- if (energy_aware() && !env->dst_rq->rd->overutilized) + if (energy_aware() && !is_sd_overutilized(env->sd)) goto out_balanced;
local = &sds.local_stat; @@ -8960,6 +9049,10 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
rcu_read_lock(); for_each_domain(cpu, sd) { + + if (energy_aware() && !is_sd_overutilized(sd)) + continue; + /* * Decay the newidle max times here because this is a regular * visit to all the domains. Decay ~1% per second. @@ -9255,6 +9348,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) { struct cfs_rq *cfs_rq; struct sched_entity *se = &curr->se; + struct sched_domain *sd;
for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); @@ -9265,12 +9359,18 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) task_tick_numa(rq, curr);
#ifdef CONFIG_SMP - if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) { - rq->rd->overutilized = true; - trace_sched_overutilized(true); - } - rq->misfit_task = !task_fits_max(curr, rq->cpu); + + rcu_read_lock(); + sd = rcu_dereference(rq->sd); + + if (cpu_overutilized(task_cpu(curr)) && sd) + set_sd_overutilized(sd); + + if (rq->misfit_task && sd && sd->parent) + set_sd_overutilized(sd->parent); + + rcu_read_unlock(); #endif
} diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2f2b959..ca2cedb 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -916,6 +916,7 @@ struct sched_group { unsigned int group_weight; struct sched_group_capacity *sgc; const struct sched_group_energy const *sge; + bool overutilized;
/* * The CPUs this group covers. -- 1.9.1
On 01/18/2017 07:51 PM, Leo Yan wrote:
On Wed, Jan 18, 2017 at 02:26:53PM -0800, Joonwoo Park wrote:
Let me join the party here. Hope it's not too late ;)
Not at all :)
On 12/07/2016 02:22 PM, Thara Gopinath wrote:
@@ -6063,10 +6094,16 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); }
- if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized))
return select_energy_cpu_brute(p, prev_cpu);
- rcu_read_lock();
- sd = rcu_dereference(cpu_rq(prev_cpu)->sd);
- if (energy_aware() &&
!is_sd_overutilized(sd,
cpu_rq(cpu)->rd)) {
IMHO skipping EA task placement based on the inner overutilized flag can still result in suboptimal placement when there is unavoidable load imbalance within a SD. For example, say there is a 2+2 big.LITTLE system and CPU0 and CPU1 are little CPUs. All other CPUs are idle, but little CPU1 has a CPU-bound task which gets the little cluster's SD marked as overutilized. Under such a condition, any new wake-up for which the task's prev_cpu is either CPU0 or CPU1 will do non-EA placement. As a result, small tasks can end up on a big CPU and a big task can end up on a little CPU (probably an idle little CPU).
An overutilized big CPUs' SD will have the same issue for tasks that happened to be on a big CPU previously but would be better placed on a little CPU later.
Good point, and I agree. In the wakeup path, deferring the tipping point is important for power optimization by EA task placement.
So I think we should introduce 'global' overutilized flag. For 'inner' and 'outer' overutilized cases, scheduler will always use EA task placement but for 'global' overutilized we can skip it.
What do you think? It can be tricky, but it could be better to mark the SD as overutilized only when all the CPUs in the SD are actually overutilized, for both task placement and lb.
This is a bit of a rigid criterion.
IMHO the one obvious benefit we can get from this patch is power optimization, but we cannot hurt performance. At least we cannot introduce too much performance degradation with this patch.
If we wait for all CPUs in the SD to be marked as overutilized before lb, this may introduce a serious performance regression for benchmarks, e.g. Vellamo. But maybe there is some minor tuning we can try, like skipping task migration when the source sched group is not overutilized but the destination sched group is overutilized; so we can implement one-way migration for some specific cases.
new_cpu = select_energy_cpu_brute(p, prev_cpu);
goto unlock;
- }
- sd = NULL;
Leo, BTW where is the EAS repo I can publicly access to sync with your tip? I'm on google's lsk-v4.4-16.09-android tree and hope I can post my patches not too late, but it looks like there are quite a few dependencies.
This is Thara's patch :) I think Thara is working on the ARM development branch, since I saw Thara's patch includes some new functions introduced by the ARM development branch: git://www.linux-arm.org/linux-power.git, branch: origin/eas/next/integration_20161206_1147
Thara, please correct as needed.
For the production code line, from previous alignment it's better to work on Android common kernel 3.18 or 4.4. They have quite the same code base for EAS:
https://android.googlesource.com/kernel/common/+log/android-4.4 https://android.googlesource.com/kernel/common/+log/android-3.18
I also post my tweaked version of Thara's patch in case you are interested in it; I did some verification on Juno. Feedback and suggestions will be very helpful :)
Thanks Leo. I will definitely try it out and let you know.
Joonwoo
---8<---
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f98e434..7371e56 100755 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4200,6 +4200,68 @@ static inline void hrtick_update(struct rq *rq) #ifdef CONFIG_SMP static bool cpu_overutilized(int cpu); static inline unsigned long boosted_cpu_util(int cpu);
+/*
- Inner overutilized:
- The load balance will happen only in SD Level 1, so this means
- only take affact on clustser internally.
- Outer overutilized:
- If the CPU has misfit on it, it's no doubt to migrate task
- to another high capacity CPU.
- Or if one CPU is overutilized and we assume now scheduler has
- done good enough work to explore cluster internal capacity, so
- if one CPU is overutilized that means finally need seek another
- cluster to provide more computing capacity.
- Global overutilized:
- If set root domain flag, means explore performance as possible
- to spread out tasks.
- */
+static void set_sd_overutilized(struct sched_domain *sd) +{
- if (sd)
sd->groups->overutilized = true;
+}
+static void clear_sd_overutilized(struct sched_domain *sd) +{
- if (sd)
sd->groups->overutilized = false;
+}
+static void set_rd_overutilized(struct root_domain *rd) +{
- rd->overutilized = true;
+}
+static void clear_rd_overutilized(struct root_domain *rd) +{
- rd->overutilized = false;
+}
+static bool is_sd_overutilized(struct sched_domain *sd) +{
- struct sched_group *group = sd->groups;
- int cpu = smp_processor_id();
- if (cpu_rq(cpu)->rd->overutilized)
return true;
- do {
if (group->overutilized)
return true;
- } while (group = group->next, group != sd->groups);
- return false;
+}
#else #define boosted_cpu_util(cpu) cpu_util(cpu) #endif @@ -4228,6 +4290,7 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq;
- struct sched_domain *sd; struct sched_entity *se = &p->se;
#ifdef CONFIG_SMP int task_new = flags & ENQUEUE_WAKEUP_NEW; @@ -4292,11 +4355,17 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (!se) { walt_inc_cumulative_runnable_avg(rq, p);
if (!task_new && !rq->rd->overutilized &&
cpu_overutilized(rq->cpu)) {
rq->rd->overutilized = true;
trace_sched_overutilized(true);
rcu_read_lock();
sd = rcu_dereference(rq->sd);
if (!task_new) {
if (cpu_overutilized(rq->cpu) && sd)
set_sd_overutilized(sd);
if (rq->misfit_task && sd && sd->parent)
set_sd_overutilized(sd->parent);
}
rcu_read_unlock();
/*
- We want to potentially trigger a freq switch
@@ -7188,6 +7257,7 @@ struct sd_lb_stats { struct sched_group *local; /* Local group in this sd */ unsigned long total_load; /* Total load of all groups in sd */ unsigned long total_capacity; /* Total capacity of all groups in sd */
unsigned long total_util; /* Total util of all groups in sd */ unsigned long avg_load; /* Average load across all groups in sd */
struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
@@ -7207,6 +7277,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) .local = NULL, .total_load = 0UL, .total_capacity = 0UL,
.busiest_stat = { .avg_load = 0UL, .sum_nr_running = 0,.total_util = 0UL,
@@ -7528,10 +7599,11 @@ group_type group_classify(struct sched_group *group, static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, int load_idx, int local_group, struct sg_lb_stats *sgs,
bool *overload, bool *overutilized)
bool *overload, bool *misfit)
{ unsigned long load; int i, nr_running;
bool overutilized = false;
memset(sgs, 0, sizeof(*sgs));
@@ -7564,10 +7636,13 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->idle_cpus++;
if (cpu_overutilized(i))
*overutilized = true;
overutilized = true;
if (!sgs->group_misfit_task && rq->misfit_task) sgs->group_misfit_task = capacity_of(i);
if (rq->misfit_task)
*misfit = true;
}
/* Adjust by relative CPU capacity of the group */
@@ -7581,6 +7656,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->group_no_capacity = group_is_overloaded(env, sgs); sgs->group_type = group_classify(group, sgs);
- if (sgs->group_weight == 1)
group->overutilized = overutilized;
}
/** @@ -7689,7 +7767,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd struct sched_group *sg = env->sd->groups; struct sg_lb_stats tmp_sgs; int load_idx, prefer_sibling = 0;
- bool overload = false, overutilized = false;
bool overload = false, misfit = false;
if (child && child->flags & SD_PREFER_SIBLING) prefer_sibling = 1;
@@ -7711,7 +7789,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd }
update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
&overload, &overutilized);
&overload, &misfit);
if (local_group) goto next_group;
@@ -7751,6 +7829,7 @@ next_group: /* Now, start updating sd_lb_stats */ sds->total_load += sgs->group_load; sds->total_capacity += sgs->group_capacity;
sds->total_util += sgs->group_util;
sg = sg->next; } while (sg != env->sd->groups);
@@ -7765,18 +7844,28 @@ next_group: if (env->dst_rq->rd->overload != overload) env->dst_rq->rd->overload = overload;
/* Update over-utilization (tipping point, U >= 0) indicator */
if (env->dst_rq->rd->overutilized != overutilized) {
env->dst_rq->rd->overutilized = overutilized;
trace_sched_overutilized(overutilized);
}
/*
* If total utilization is more than half of capacity,
* this means at least the average CPU utilization is
* crossing half of max capacity CPU; so this is a quite
* high bar to set root domain's overutlized flag.
*/
if (sds->total_capacity < sds->total_util * 2)
set_rd_overutilized(env->dst_rq->rd);
else
} else {clear_rd_overutilized(env->dst_rq->rd);
if (!env->dst_rq->rd->overutilized && overutilized) {
env->dst_rq->rd->overutilized = true;
trace_sched_overutilized(true);
}
/*
* If the domain util is greater that domain capacity,
* load balancing needs to be done at the next sched
* domain level as well
*/
if ((sds->total_capacity * 1024 <
sds->total_util * capacity_margin) || misfit)
set_sd_overutilized(env->sd->parent);
else
}clear_sd_overutilized(env->sd->parent);
}
/** @@ -8019,7 +8108,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) */ update_sd_lb_stats(env, &sds);
- if (energy_aware() && !env->dst_rq->rd->overutilized)
if (energy_aware() && !is_sd_overutilized(env->sd)) goto out_balanced;
local = &sds.local_stat;
@@ -8960,6 +9049,10 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
rcu_read_lock(); for_each_domain(cpu, sd) {
if (energy_aware() && !is_sd_overutilized(sd))
continue;
- /*
- Decay the newidle max times here because this is a regular
- visit to all the domains. Decay ~1% per second.
@@ -9255,6 +9348,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) { struct cfs_rq *cfs_rq; struct sched_entity *se = &curr->se;
struct sched_domain *sd;
for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se);
@@ -9265,12 +9359,18 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) task_tick_numa(rq, curr);
#ifdef CONFIG_SMP
- if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) {
rq->rd->overutilized = true;
trace_sched_overutilized(true);
- }
- rq->misfit_task = !task_fits_max(curr, rq->cpu);
- rcu_read_lock();
- sd = rcu_dereference(rq->sd);
- if (cpu_overutilized(task_cpu(curr)) && sd)
set_sd_overutilized(sd);
- if (rq->misfit_task && sd && sd->parent)
set_sd_overutilized(sd->parent);
- rcu_read_unlock();
#endif
} diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2f2b959..ca2cedb 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -916,6 +916,7 @@ struct sched_group { unsigned int group_weight; struct sched_group_capacity *sgc; const struct sched_group_energy const *sge;
bool overutilized;
/*
- The CPUs this group covers.