The current implementation of overutilization aborts energy-aware scheduling if any cpu in the system is over-utilized. This patch introduces an over-utilization flag per sched domain level instead of a single system-wide flag. Load balancing is done at a sched domain if any of its cpus is over-utilized. If energy-aware scheduling is enabled and no cpu in a sched domain is over-utilized, load balancing is skipped for that sched domain and energy-aware scheduling continues at that level.
The implementation takes advantage of the shared sched_domain structure (struct sched_domain_shared) that is common across all the sched domains at a level. The new flag is placed in this structure so that all the sched domains at the same level share the flag. In case of an over-utilized cpu, the flag gets set at the level1 sched_domain. The flag at the parent sched_domain level gets set in either of the two following scenarios:
1. There is a misfit task on one of the cpus in this sched_domain.
2. The total utilization of the domain is greater than the domain capacity.
The flag is cleared if no cpu in a sched domain is overutilized.
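Schematically, the update rules amount to the following (a pseudo-C sketch only, not code from the patch; the set/clear helpers are introduced by the patch below, while first_asym_parent() and the any_cpu_in_sd_overutilized condition are illustrative stand-ins):

/* Lowest level: set from enqueue/tick on this rq. */
if (cpu_overutilized(cpu))
	set_sd_overutilized(rq->sd);

/* During load balancing at level sd: */
if (any_cpu_in_sd_overutilized)
	set_sd_overutilized(sd);
else
	clear_sd_overutilized(sd);

if (misfit_task)				/* scenario 1 */
	set_sd_overutilized(first_asym_parent(sd));

/*
 * Scenario 2: with the default capacity_margin of 1280 this fires once
 * the domain utilization exceeds 80% of the domain capacity.
 */
if (sd_capacity * 1024 < sd_util * capacity_margin)
	set_sd_overutilized(sd->parent);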
This implementation can still have corner scenarios with respect to misfit tasks. For example, consider a sched group with n cpus and n+1 tasks that are each 70% utilized. Ideally this is a case for load balancing to happen at a parent sched domain. But neither is the total group utilization high enough for load balancing to be triggered in the parent domain, nor is there an over-utilized cpu running a single task, so no load balance is triggered in the parent domain. Then again, this could be a purely academic scenario, as during task wakeup these tasks will be placed more appropriately.
Signed-off-by: Thara Gopinath thara.gopinath@linaro.org
---
V1->V2:
- Removed overutilized flag from sched_group structure.
- In case of a misfit task, it is ensured that a load balance is
  triggered in a parent sched domain with asymmetric cpu capacities.
 include/linux/sched.h |   1 +
 kernel/sched/core.c   |   7 ++-
 kernel/sched/fair.c   | 138 +++++++++++++++++++++++++++++++++++++++++---------
 kernel/sched/sched.h  |   3 --
 4 files changed, 117 insertions(+), 32 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1c5122e..971842a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1112,6 +1112,7 @@ struct sched_domain_shared {
 	atomic_t	ref;
 	atomic_t	nr_busy_cpus;
 	int		has_idle_cores;
+	bool		overutilized;
 };

 struct sched_domain {

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 31a466f..e0a8758 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6659,11 +6659,10 @@ sd_init(struct sched_domain_topology_level *tl,
 	 * For all levels sharing cache; connect a sched_domain_shared
 	 * instance.
 	 */
-	if (sd->flags & SD_SHARE_PKG_RESOURCES) {
-		sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
-		atomic_inc(&sd->shared->ref);
+	sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
+	atomic_inc(&sd->shared->ref);
+	if (sd->flags & SD_SHARE_PKG_RESOURCES)
 		atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
-	}

 	sd->private = sdd;

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 489f6d3..9d2bb07 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4735,6 +4735,30 @@ static inline void hrtick_update(struct rq *rq)

 static bool cpu_overutilized(int cpu);

+static bool
+is_sd_overutilized(struct sched_domain *sd)
+{
+	if (sd)
+		return sd->shared->overutilized;
+	else
+		return false;
+}
+
+static void
+set_sd_overutilized(struct sched_domain *sd)
+{
+	if (sd)
+		sd->shared->overutilized = true;
+}
+
+static void
+clear_sd_overutilized(struct sched_domain *sd)
+{
+	if (sd)
+		sd->shared->overutilized = false;
+}
+
 /*
  * The enqueue_task method is called before nr_running is
  * increased. Here we update the fair scheduling stats and
@@ -4744,6 +4768,7 @@ static void
 enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 {
 	struct cfs_rq *cfs_rq;
+	struct sched_domain *sd;
 	struct sched_entity *se = &p->se;
 	int task_new = !(flags & ENQUEUE_WAKEUP);
@@ -4787,9 +4812,12 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	if (!se) {
 		add_nr_running(rq, 1);
-		if (!task_new && !rq->rd->overutilized &&
-		    cpu_overutilized(rq->cpu))
-			rq->rd->overutilized = true;
+		rcu_read_lock();
+		sd = rcu_dereference(rq->sd);
+		if (!task_new && !is_sd_overutilized(sd) &&
+		    cpu_overutilized(rq->cpu))
+			set_sd_overutilized(sd);
+		rcu_read_unlock();
 	}
 	hrtick_update(rq);
 }

@@ -6173,8 +6201,7 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu)
 	unsigned long max_spare = 0;
 	struct sched_domain *sd;

-	rcu_read_lock();
-
+	/* The rcu lock is/should be held in the caller function */
 	sd = rcu_dereference(per_cpu(sd_ea, prev_cpu));

 	if (!sd)
@@ -6212,8 +6239,6 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu)
 	}

 unlock:
-	rcu_read_unlock();
-
 	if (energy_cpu == prev_cpu && !cpu_overutilized(prev_cpu))
 		return prev_cpu;

@@ -6247,10 +6272,16 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 			      && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
 	}

-	if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized))
-		return select_energy_cpu_brute(p, prev_cpu);
-
 	rcu_read_lock();
+	sd = rcu_dereference(cpu_rq(prev_cpu)->sd);
+	if (energy_aware() &&
+	    !is_sd_overutilized(sd)) {
+		new_cpu = select_energy_cpu_brute(p, prev_cpu);
+		goto unlock;
+	}
+
+	sd = NULL;
+
 	for_each_domain(cpu, tmp) {
 		if (!(tmp->flags & SD_LOAD_BALANCE))
 			break;
@@ -6315,6 +6346,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 		}
 		/* while loop will break here if sd == NULL */
 	}
+
+unlock:
 	rcu_read_unlock();

 	return new_cpu;

@@ -7366,6 +7399,7 @@ struct sd_lb_stats {
 	struct sched_group *local;	/* Local group in this sd */
 	unsigned long total_load;	/* Total load of all groups in sd */
 	unsigned long total_capacity;	/* Total capacity of all groups in sd */
+	unsigned long total_util;	/* Total util of all groups in sd */
 	unsigned long avg_load;	/* Average load across all groups in sd */

 	struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */

@@ -7385,6 +7419,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
 		.local = NULL,
 		.total_load = 0UL,
 		.total_capacity = 0UL,
+		.total_util = 0UL,
 		.busiest_stat = {
 			.avg_load = 0UL,
 			.sum_nr_running = 0,

@@ -7664,7 +7699,7 @@ group_type group_classify(struct sched_group *group,
 static inline void update_sg_lb_stats(struct lb_env *env,
 			struct sched_group *group, int load_idx,
 			int local_group, struct sg_lb_stats *sgs,
-			bool *overload, bool *overutilized)
+			bool *overload, bool *overutilized, bool *misfit_task)
 {
 	unsigned long load;
 	int i, nr_running;

@@ -7699,8 +7734,16 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 		if (!nr_running && idle_cpu(i))
 			sgs->idle_cpus++;

-		if (cpu_overutilized(i))
+		if (cpu_overutilized(i)) {
 			*overutilized = true;
+			/*
+			 * If the cpu is overutilized and if there is only one
+			 * current task in cfs runqueue, it is potentially a misfit
+			 * task.
+			 */
+			if (rq->cfs.h_nr_running == 1)
+				*misfit_task = true;
+		}
 	}
 	/* Adjust by relative CPU capacity of the group */

@@ -7825,11 +7868,11 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
  */
 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
 {
-	struct sched_domain *child = env->sd->child;
+	struct sched_domain *child = env->sd->child, *sd;
 	struct sched_group *sg = env->sd->groups;
 	struct sg_lb_stats tmp_sgs;
 	int load_idx, prefer_sibling = 0;
-	bool overload = false, overutilized = false;
+	bool overload = false, overutilized = false, misfit_task = false;

 	if (child && child->flags & SD_PREFER_SIBLING)
 		prefer_sibling = 1;

@@ -7851,7 +7894,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 		}

 		update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
-				   &overload, &overutilized);
+				   &overload, &overutilized,
+				   &misfit_task);

 		if (local_group)
 			goto next_group;

@@ -7882,6 +7926,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 		/* Now, start updating sd_lb_stats */
 		sds->total_load += sgs->group_load;
 		sds->total_capacity += sgs->group_capacity;
+		sds->total_util += sgs->group_util;

 		sg = sg->next;
 	} while (sg != env->sd->groups);

@@ -7895,14 +7940,45 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 		/* update overload indicator if we are at root domain */
 		if (env->dst_rq->rd->overload != overload)
 			env->dst_rq->rd->overload = overload;
+	}

-		/* Update over-utilization (tipping point, U >= 0) indicator */
-		if (env->dst_rq->rd->overutilized != overutilized)
-			env->dst_rq->rd->overutilized = overutilized;
-	} else {
-		if (!env->dst_rq->rd->overutilized && overutilized)
-			env->dst_rq->rd->overutilized = true;
+	if (overutilized)
+		set_sd_overutilized(env->sd);
+	else
+		clear_sd_overutilized(env->sd);
+
+	/*
+	 * If there is a misfit task in one cpu in this sched_domain
+	 * it is likely that the imbalance cannot be sorted out among
+	 * the cpu's in this sched_domain. In this case set the
+	 * overutilized flag at the parent sched_domain.
+	 */
+	if (misfit_task) {
+
+		sd = env->sd->parent;
+
+		/*
+		 * In case of a misfit task, load balance at the parent
+		 * sched domain level will make sense only if the cpus
+		 * have a different capacity. If cpus at a domain level have
+		 * the same capacity, the misfit task cannot be well
+		 * accommodated in any of the cpus and there is no point in
+		 * trying a load balance at this level.
+		 */
+		while (sd) {
+			if (sd->flags & SD_ASYM_CPUCAPACITY) {
+				set_sd_overutilized(sd);
+				break;
+			}
+			sd = sd->parent;
+		}
+	}
+
+	/* If the domain util is greater than the domain capacity, load
+	 * balancing needs to be done at the next sched domain level as well.
+	 */
+	if (sds->total_capacity * 1024 < sds->total_util * capacity_margin)
+		set_sd_overutilized(env->sd->parent);
 }
 /**
@@ -8122,8 +8198,10 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
 	 */
 	update_sd_lb_stats(env, &sds);

-	if (energy_aware() && !env->dst_rq->rd->overutilized)
-		goto out_balanced;
+	if (energy_aware()) {
+		if (!is_sd_overutilized(env->sd))
+			goto out_balanced;
+	}

 	local = &sds.local_stat;
 	busiest = &sds.busiest_stat;

@@ -8981,6 +9059,11 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)

 	rcu_read_lock();
 	for_each_domain(cpu, sd) {
+		if (energy_aware()) {
+			if (!is_sd_overutilized(sd))
+				continue;
+		}
+
 		/*
 		 * Decay the newidle max times here because this is a regular
 		 * visit to all the domains. Decay ~1% per second.
@@ -9280,6 +9363,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 {
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &curr->se;
+	struct sched_domain *sd;

 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
@@ -9289,8 +9373,12 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 	if (static_branch_unlikely(&sched_numa_balancing))
 		task_tick_numa(rq, curr);

-	if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr)))
-		rq->rd->overutilized = true;
+	rcu_read_lock();
+	sd = rcu_dereference(rq->sd);
+	if (!is_sd_overutilized(sd) &&
+	    cpu_overutilized(task_cpu(curr)))
+		set_sd_overutilized(sd);
+	rcu_read_unlock();
 }

 /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fa98ab3..b24cefa 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -563,9 +563,6 @@ struct root_domain {
 	/* Indicate more than one runnable task for any CPU */
 	bool overload;

-	/* Indicate one or more cpus over-utilized (tipping point) */
-	bool overutilized;
-
 	/*
 	 * The bit corresponding to a CPU gets set here if such CPU has more
 	 * than one runnable -deadline task (as it is below for RT tasks).
--
2.1.4
Hi Thara,
I'm currently running your patch on 'EAS product line minus the two misfit patches':
"WIP: sched: Add group_misfit_task load-balance type" "WIP: sched: Consider misfit tasks when load-balancing"
on my Pixel phone to be able to compare the behaviour. That's why some of my questions/remarks might be very (too) specific :-)
My overall question is: should this patch replace the whole over-utilize/misfit approach, or do we still want to have extra 'misfit code' in the load balancer to manoeuvre differently through load_balance once we detect a misfit scenario? I guess that's the case, just trying to align here.
On 16/02/17 20:33, Thara Gopinath wrote:
> The current implementation of overutilization aborts energy-aware
> scheduling if any cpu in the system is over-utilized. This patch
> introduces an over-utilization flag per sched domain level instead of
> a single system-wide flag. Load balancing is done at a sched domain if
> any of its cpus is over-utilized. If energy-aware scheduling is
> enabled and no cpu in a sched domain is over-utilized, load balancing
> is skipped for that sched domain and energy-aware scheduling continues
> at that level.
>
> The implementation takes advantage of the shared sched_domain
> structure (struct sched_domain_shared) that is common across all the
> sched domains at a level. The new flag is placed in this structure so
> that all the sched domains at the same level share the flag. In case
> of an over-utilized cpu, the flag gets set at the level1 sched_domain.
> The flag at the parent sched_domain level gets
By level1 you mean 'sd = rq->sd' which has 'sd->child == NULL', right?
> set in either of the two following scenarios:
> 1. There is a misfit task on one of the cpus in this sched_domain.
... on the first sd in direction sd->parent where SD_ASYM_CPUCAPACITY is set
> 2. The total utilization of the domain is greater than the domain
>    capacity.
This one is only set on sd and potentially on sd->parent.
On an 'MC-DIE' system like Juno, the sd from 1. and the sd->parent from 2. would obviously be the same.

I'm currently struggling with the fact that we treat group over-utilization as the same thing as having a misfit task.

If the reason was '2.', then we should allow lb at sd->parent level, which is already done by not jumping to out_balanced in fbg().

If the reason was '1.', do we want to rely solely on the existing load balance code path (based on avg_load + some coverage of corner cases) or do we want to implement some specific misfit task handling within the lb path?

I'm asking because with this patch we get misfit detection in lb code, but not the special handling of a misfit task in the lb code path provided by the two misfit patches in the product code line mentioned above.
> The flag is cleared if no cpu in a sched domain is overutilized.
>
> This implementation can still have corner scenarios with respect to
> misfit tasks. For example, consider a sched group with n cpus and n+1
> tasks that are each 70% utilized. Ideally this is a case for load
> balancing to happen at a parent sched domain. But neither is the total
> group utilization high enough for load balancing to be triggered in
> the parent domain, nor is there an over-utilized cpu running a single
> task, so no load balance is triggered in the parent domain. Then
> again, this could be a purely academic scenario, as during task wakeup
> these tasks will be placed more appropriately.
IMHO, this example depends on the number of cpus and the capacity of the cpus in this cluster.
E.g. in case we deal with a big cluster in which all cpus still have their full original cpu capacity, this corner scenario happens for clusters with #cpus >= 7.

Otherwise the group utilization is still higher than the group capacity.
  task utilization                   : 0.7 * 1024
  security factor                    : 1280/1024
  original cpu capacity of a big cpu : 1024

  capacity (cluster of n big cpus) < utilization (of n+1 tasks)
                          1024 * n < (n+1) * (0.7 * 1024) * (1280/1024)
                          1024 * n < (n+1) * 0.7 * 1280
                          1024 * n < 896 * n + 896
                                 n < 7
I guess we can assume that a big cluster with all cpus at original cpu capacity is the worst case scenario here (the smallest #cpus for which this can happen). In this case it only depends on the percentage value of the task utilization. So maybe this is where the 'academic' comes in as well, besides the fact that with a 70% task, wakeup can do its job.
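A throwaway user-space check of the n < 7 boundary derived above (my own sketch; it just replays the arithmetic, with 896 = 0.7 * 1280):

#include <stdio.h>

/* Capacity of n big cpus vs. margined utilization of n+1 tasks at 70%. */
int main(void)
{
	for (int n = 4; n <= 8; n++) {
		unsigned long cap = 1024UL * n;
		unsigned long util = (n + 1) * 896UL;

		printf("n=%d: cap=%lu margined_util=%lu -> %s\n", n, cap, util,
		       cap < util ? "parent lb triggers" : "corner scenario");
	}
	return 0;
}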
> Signed-off-by: Thara Gopinath thara.gopinath@linaro.org
> ---
> V1->V2:
> - Removed overutilized flag from sched_group structure.
> - In case of a misfit task, it is ensured that a load balance is
>   triggered in a parent sched domain with asymmetric cpu capacities.
>
>  include/linux/sched.h |   1 +
>  kernel/sched/core.c   |   7 ++-
>  kernel/sched/fair.c   | 138 +++++++++++++++++++++++++++++++++++++++++---------
>  kernel/sched/sched.h  |   3 --
>  4 files changed, 117 insertions(+), 32 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 1c5122e..971842a 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1112,6 +1112,7 @@ struct sched_domain_shared {
> 	atomic_t	ref;
> 	atomic_t	nr_busy_cpus;
> 	int		has_idle_cores;
Maybe add the old comment from the root domain here?

	/* Indicate one or more cpus over-utilized (tipping point) */

> +	bool		overutilized;
>  };
I'm not sure ... was the idea to have _one_ struct sched_domain_shared for all sd's which then has all the data elements we want to share on all sd levels?
>  struct sched_domain {
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 31a466f..e0a8758 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -6659,11 +6659,10 @@ sd_init(struct sched_domain_topology_level *tl,
> 	 * For all levels sharing cache; connect a sched_domain_shared
> 	 * instance.
> 	 */
Nitpick: The original comment from Peter Z. doesn't make sense any more. We do this now regardless of SD_SHARE_PKG_RESOURCES being set or not.
> -	if (sd->flags & SD_SHARE_PKG_RESOURCES) {
> -		sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
> -		atomic_inc(&sd->shared->ref);
> +	sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
> +	atomic_inc(&sd->shared->ref);
> +	if (sd->flags & SD_SHARE_PKG_RESOURCES)
> 		atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
> -	}
>
> 	sd->private = sdd;
[...]
> @@ -4787,9 +4812,12 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>
> 	if (!se) {
> 		add_nr_running(rq, 1);
> -		if (!task_new && !rq->rd->overutilized &&
> -		    cpu_overutilized(rq->cpu))
> -			rq->rd->overutilized = true;
> +		rcu_read_lock();
> +		sd = rcu_dereference(rq->sd);
> +		if (!task_new && !is_sd_overutilized(sd) &&
> +		    cpu_overutilized(rq->cpu))
> +			set_sd_overutilized(sd);
> +		rcu_read_unlock();
> @@ -6173,8 +6201,7 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu)
> 	unsigned long max_spare = 0;
> 	struct sched_domain *sd;
>
> -	rcu_read_lock();
> -
> +	/* The rcu lock is/should be held in the caller function */
I guess the code is self explanatory here. IMHO, no need for this extra comment.
> 	sd = rcu_dereference(per_cpu(sd_ea, prev_cpu));
>
> 	if (!sd)
[...]
> @@ -6247,10 +6272,16 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
> 			      && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
> 	}
>
> -	if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized))
> -		return select_energy_cpu_brute(p, prev_cpu);
> -
> 	rcu_read_lock();
> +	sd = rcu_dereference(cpu_rq(prev_cpu)->sd);
> +	if (energy_aware() &&
> +	    !is_sd_overutilized(sd)) {
Nitpick: One line ?
[...]
> @@ -7699,8 +7734,16 @@ static inline void update_sg_lb_stats(struct lb_env *env,
> 		if (!nr_running && idle_cpu(i))
> 			sgs->idle_cpus++;
>
> -		if (cpu_overutilized(i))
> +		if (cpu_overutilized(i)) {
> 			*overutilized = true;
> +			/*
> +			 * If the cpu is overutilized and if there is only one
> +			 * current task in cfs runqueue, it is potentially a misfit
> +			 * task.
> +			 */
> +			if (rq->cfs.h_nr_running == 1)
> +				*misfit_task = true;
> +		}
In the product code line, we have misfit detection in pick_next_task_fair() and task_tick_fair().
We should run a test where initially we spawn n big tasks on a n cpus big.little system and see how quickly a task moves from a little cpu once a big cpu becomes available. Maybe you did already?
[...]
> @@ -7895,14 +7940,45 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
> 		/* update overload indicator if we are at root domain */
> 		if (env->dst_rq->rd->overload != overload)
> 			env->dst_rq->rd->overload = overload;
> +	}
>
> -		/* Update over-utilization (tipping point, U >= 0) indicator */
> -		if (env->dst_rq->rd->overutilized != overutilized)
> -			env->dst_rq->rd->overutilized = overutilized;
> -	} else {
> -		if (!env->dst_rq->rd->overutilized && overutilized)
> -			env->dst_rq->rd->overutilized = true;
> +	if (overutilized)
> +		set_sd_overutilized(env->sd);
> +	else
> +		clear_sd_overutilized(env->sd);
> +
> +	/*
> +	 * If there is a misfit task in one cpu in this sched_domain
> +	 * it is likely that the imbalance cannot be sorted out among
> +	 * the cpu's in this sched_domain. In this case set the
> +	 * overutilized flag at the parent sched_domain.
> +	 */
> +	if (misfit_task) {
> +
> +		sd = env->sd->parent;
> +
> +		/*
> +		 * In case of a misfit task, load balance at the parent
> +		 * sched domain level will make sense only if the cpus
> +		 * have a different capacity. If cpus at a domain level have
> +		 * the same capacity, the misfit task cannot be well
> +		 * accommodated in any of the cpus and there is no point in
> +		 * trying a load balance at this level.
> +		 */
> +		while (sd) {
> +			if (sd->flags & SD_ASYM_CPUCAPACITY) {
> +				set_sd_overutilized(sd);
> +				break;
> +			}
> +			sd = sd->parent;
> +		}
> +	}
> +
> +	/* If the domain util is greater than the domain capacity, load
> +	 * balancing needs to be done at the next sched domain level as well.
> +	 */
> +	if (sds->total_capacity * 1024 < sds->total_util * capacity_margin)
> +		set_sd_overutilized(env->sd->parent);
sched_domain_shared::overutilized is potentially set twice on DIE level, one time for misfit, one time for overutilized. This could be avoided by rearranging this code a little bit. Since this is the normal topology layout for today's big.LITTLE systems (SD_ASYM_CPUCAPACITY set on DIE level, MC sd level with sd->child = NULL and sd_mc->parent == sd_die), this would be nice to change.
>  }
> /**
> @@ -8122,8 +8198,10 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
> 	 */
> 	update_sd_lb_stats(env, &sds);
>
> -	if (energy_aware() && !env->dst_rq->rd->overutilized)
> -		goto out_balanced;
> +	if (energy_aware()) {
> +		if (!is_sd_overutilized(env->sd))

if (energy_aware() && !is_sd_overutilized(env->sd)) ?

> +			goto out_balanced;
> +	}
> 	local = &sds.local_stat;
> 	busiest = &sds.busiest_stat;
>
> @@ -8981,6 +9059,11 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
>
> 	rcu_read_lock();
> 	for_each_domain(cpu, sd) {
> +		if (energy_aware()) {
> +			if (!is_sd_overutilized(sd))
> +				continue;
> +		}

if (energy_aware() && !is_sd_overutilized(sd)) ?
[...]
On 20/02/17 20:24, Dietmar Eggemann wrote:
[...]
> On 16/02/17 20:33, Thara Gopinath wrote:
[...]
>> @@ -7895,14 +7940,45 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
>> 		/* update overload indicator if we are at root domain */
>> 		if (env->dst_rq->rd->overload != overload)
>> 			env->dst_rq->rd->overload = overload;
>> +	}
>>
>> -		/* Update over-utilization (tipping point, U >= 0) indicator */
>> -		if (env->dst_rq->rd->overutilized != overutilized)
>> -			env->dst_rq->rd->overutilized = overutilized;
>> -	} else {
>> -		if (!env->dst_rq->rd->overutilized && overutilized)
>> -			env->dst_rq->rd->overutilized = true;
>> +	if (overutilized)
>> +		set_sd_overutilized(env->sd);
>> +	else
>> +		clear_sd_overutilized(env->sd);
>> +
>> +	/*
>> +	 * If there is a misfit task in one cpu in this sched_domain
>> +	 * it is likely that the imbalance cannot be sorted out among
>> +	 * the cpu's in this sched_domain. In this case set the
>> +	 * overutilized flag at the parent sched_domain.
>> +	 */
>> +	if (misfit_task) {
>> +
>> +		sd = env->sd->parent;
>> +
>> +		/*
>> +		 * In case of a misfit task, load balance at the parent
>> +		 * sched domain level will make sense only if the cpus
>> +		 * have a different capacity. If cpus at a domain level have
>> +		 * the same capacity, the misfit task cannot be well
>> +		 * accommodated in any of the cpus and there is no point in
>> +		 * trying a load balance at this level.
>> +		 */
>> +		while (sd) {
>> +			if (sd->flags & SD_ASYM_CPUCAPACITY) {
>> +				set_sd_overutilized(sd);
>> +				break;
>> +			}
>> +			sd = sd->parent;
>> +		}
>> +	}
>> +
>> +	/* If the domain util is greater than the domain capacity, load
>> +	 * balancing needs to be done at the next sched domain level as well.
>> +	 */
>> +	if (sds->total_capacity * 1024 < sds->total_util * capacity_margin)
>> +		set_sd_overutilized(env->sd->parent);
> sched_domain_shared::overutilized is potentially set twice on DIE
> level, one time for misfit, one time for overutilized. This could be
> avoided by rearranging this code a little bit. Since this is the
> normal topology layout for today's big.LITTLE systems
> (SD_ASYM_CPUCAPACITY set on DIE level, MC sd level with sd->child =
> NULL and sd_mc->parent == sd_die), this would be nice to change.
Maybe something like this? Only lightly tested on Pixel (MC-DIE).
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index abd9dfa3f1ce..e2ff672cea52 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7817,7 +7817,7 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
  */
 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
 {
-	struct sched_domain *child = env->sd->child, *sd;
+	struct sched_domain *child = env->sd->child, *sd = env->sd;
 	struct sched_group *sg = env->sd->groups;
 	struct sg_lb_stats tmp_sgs;
 	int load_idx, prefer_sibling = 0;
@@ -7902,32 +7902,34 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 	 * the cpu's in this sched_domain. In this case set the
 	 * overutilized flag at the parent sched_domain.
 	 */
-	if (misfit_task) {
+	while (sd = sd->parent, sd) {

-		sd = env->sd->parent;
+		/*
+		 * If the domain util is greater than the domain capacity,
+		 * load balancing needs to be done at the next sched
+		 * domain level as well.
+		 */
+		if ((sd->child == env->sd) &&
+		    (sds->total_capacity * 1024 <
+		     sds->total_util * capacity_margin)) {
+			set_sd_overutilized(sd);
+			if (sd->flags & SD_ASYM_CPUCAPACITY)
+				break;
+		}

 		/*
 		 * In case of a misfit task, load balance at the parent
-		 * sched domain level will make sense only if the cpus
-		 * have a different capacity. If cpus at a domain level have
-		 * the same capacity, the misfit task cannot be well
-		 * accommodated in any of the cpus and there is no point in
-		 * trying a load balance at this level.
+		 * sched domain level will make sense only if the
+		 * cpus have a different capacity. If cpus at a domain
+		 * level have the same capacity, the misfit task cannot
+		 * be well accommodated in any of the cpus and there is
+		 * no point in trying a load balance at this level.
 		 */
-		while (sd) {
-			if (sd->flags & SD_ASYM_CPUCAPACITY) {
-				set_sd_overutilized(sd);
-				break;
-			}
-			sd = sd->parent;
+		if (misfit_task && sd->flags & SD_ASYM_CPUCAPACITY) {
+			set_sd_overutilized(sd);
+			break;
 		}
 	}
-
-	/* If the domain util is greater than the domain capacity, load
-	 * balancing needs to be done at the next sched domain level as well.
-	 */
-	if (sds->total_capacity * 1024 < sds->total_util * capacity_margin)
-		set_sd_overutilized(env->sd->parent);
 }
On 20/02/17 20:24, Dietmar Eggemann wrote:
Hi Thara,
I'm currently running your patch on 'EAS product line minus the two misfit patches':
"WIP: sched: Add group_misfit_task load-balance type" "WIP: sched: Consider misfit tasks when load-balancing"
on my Pixel phone to be able to compare the behaviour. That's why some of my questions/remarks might be very (too) specific :-)
As promised, I ran your patch on top of the EAS production code line:
https://android.googlesource.com/kernel/msm.git/+/android-msm-marlin-3.18-no...
I removed the 'misfit task' code in the product code line before I applied Peter Z.'s 'struct sched_domain_shared' patches and yours on top. So there is _no_ 'misfit task' handling in this stack.
The testcase is Jankbench w/o and w/ your patches using the schedfreq (sched) and schedutil governors.
I shared the Lisa ipython notebook here: https://drive.google.com/drive/folders/0B2f-ZAwV_YnmMWFjMWxZanNLaWM
android_jankbench_listview_50iter_2govs_2017-03-07_16-19-02 - baseline android_jankbench_listview_50iter_2govs_2017-03-07_16-22-32 - your patch
(even though the name implies 50 iterations, we only ran 1 for now)
E.g. cell [55] shows the 'total frame duration' and cell [65] the energy consumption.
To be able to see the influence of your implementation on the ratio between overutilized (OU) and non-overutilized (NON_OU) wakeups, we integrated per-rq counters in the wakeup path. Please note that the wakeup path in the EAS product codeline is different from the mainline wakeup path. We're planning to integrate these counters into an updated version of the EAS production code line mentioned above.
Fundamentally, the sum of the counters 'sis_attempts' (sis ... select_idle_sibling) and 'cas_attempts' (cas ... capacity-aware scheduling) gives the number of OU wakeups, and the counter 'secb_attempts' (secb ... select_energy_cpu_brute) gives the number of NON-OU wakeups:
Counters are for 1 iteration of Jankbench:
android_jankbench_listview_50iter_2govs_2017-03-07_16-19-02 - baseline
sis_attempts + cas_attempts = 9635 + 21163 = 30798
secb_attempts = 55360
android_jankbench_listview_50iter_2govs_2017-03-07_16-22-32 - w/ your patch
sis_attempts + cas_attempts = 7195 + 12574 = 19769
secb_attempts = 68785
(The counters are not shown in the Lisa notebook)
So it seems that the wakeup path runs more often in a NON_OU scenario, a result which was somewhat expected.
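(In numbers: the OU share of all wakeups drops from 30798 / (30798 + 55360) ~ 36% in the baseline to 19769 / (19769 + 68785) ~ 22% with your patch.)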
The test further hints that performance and energy consumption do not regress.
Promising ... so maybe we could integrate this patch in EASv1.3? But first we have to weld it to the 'misfit task' functionality, I guess ...
Cheers,
-- Dietmar
[...]
Hello Dietmar,
On 03/08/2017 05:58 AM, Dietmar Eggemann wrote:
> On 20/02/17 20:24, Dietmar Eggemann wrote:
>> Hi Thara,
>>
>> I'm currently running your patch on 'EAS product line minus the two
>> misfit patches':
>>
>> "WIP: sched: Add group_misfit_task load-balance type"
>> "WIP: sched: Consider misfit tasks when load-balancing"
>>
>> on my Pixel phone to be able to compare the behaviour. That's why some
>> of my questions/remarks might be very (too) specific :-)
>
> As promised, I ran your patch on top of the EAS production code line:
>
> https://android.googlesource.com/kernel/msm.git/+/android-msm-marlin-3.18-no...
>
> I removed the 'misfit task' code in the product code line before I
> applied Peter Z.'s 'struct sched_domain_shared' patches and yours on
> top. So there is _no_ 'misfit task' handling in this stack.
>
> The testcase is Jankbench w/o and w/ your patches using the schedfreq
> (sched) and schedutil governors.
>
> I shared the Lisa ipython notebook here:
> https://drive.google.com/drive/folders/0B2f-ZAwV_YnmMWFjMWxZanNLaWM
>
> android_jankbench_listview_50iter_2govs_2017-03-07_16-19-02 - baseline
> android_jankbench_listview_50iter_2govs_2017-03-07_16-22-32 - your patch
>
> (even though the name implies 50 iterations, we only ran 1 for now)
>
> E.g. cell [55] shows the 'total frame duration' and cell [65] the
> energy consumption.
>
> To be able to see the influence of your implementation on the ratio
> between overutilized (OU) and non-overutilized (NON_OU) wakeups, we
> integrated per-rq counters in the wakeup path. Please note that the
> wakeup path in the EAS product codeline is different from the mainline
> wakeup path. We're planning to integrate these counters into an
> updated version of the EAS production code line mentioned above.
>
> Fundamentally, the sum of the counters 'sis_attempts' (sis ...
> select_idle_sibling) and 'cas_attempts' (cas ... capacity-aware
> scheduling) gives the number of OU wakeups, and the counter
> 'secb_attempts' (secb ... select_energy_cpu_brute) gives the number of
> NON-OU wakeups:
>
> Counters are for 1 iteration of Jankbench:
>
> android_jankbench_listview_50iter_2govs_2017-03-07_16-19-02 - baseline
>
> sis_attempts + cas_attempts = 9635 + 21163 = 30798
> secb_attempts = 55360
>
> android_jankbench_listview_50iter_2govs_2017-03-07_16-22-32 - w/ your patch
>
> sis_attempts + cas_attempts = 7195 + 12574 = 19769
> secb_attempts = 68785
>
> (The counters are not shown in the Lisa notebook)
>
> So it seems that the wakeup path runs more often in a NON_OU scenario,
> a result which was somewhat expected.
>
> The test further hints that performance and energy consumption do not
> regress.
>
> Promising ... so maybe we could integrate this patch in EASv1.3? But
> first we have to weld it to the 'misfit task' functionality, I guess ...
Yes, the results are promising and on target with what was expected. I have not seen the WIP misfit patches, so I cannot comment on the usability of those with this patch. I work on the latest arm eas dev branch and these patches are not part of the mainline. Can you point me to those patches, please?
Regards Thara
> Cheers,
>
> -- Dietmar
[...]
-- Regards Thara
On 08/03/17 11:46, Thara Gopinath wrote:
[...]
> Yes, the results are promising and on target with what was expected. I
> have not seen the WIP misfit patches, so I cannot comment on the
> usability of those with this patch. I work on the latest arm eas dev
> branch and these patches are not part of the mainline. Can you point me
> to those patches, please?
I'll try to join the presentation later today (4pm CET) so we can discuss it then.
-- Dietmar
On Mar 8, 2017 1:37 PM, "Dietmar Eggemann" dietmar.eggemann@arm.com wrote:
> On 08/03/17 11:46, Thara Gopinath wrote:
> [...]
>> Yes, the results are promising and on target with what was expected. I
>> have not seen the WIP misfit patches, so I cannot comment on the
>> usability of those with this patch. I work on the latest arm eas dev
>> branch and these patches are not part of the mainline. Can you point me
>> to those patches, please?
>
> I'll try to join the presentation later today (4pm CET) so we can
> discuss it then.
Sure!
-- Dietmar
On 16/02/17 20:33, Thara Gopinath wrote:
[...]
> @@ -4787,9 +4812,12 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>
> 	if (!se) {
> 		add_nr_running(rq, 1);
> -		if (!task_new && !rq->rd->overutilized &&
> -		    cpu_overutilized(rq->cpu))
> -			rq->rd->overutilized = true;
> +		rcu_read_lock();
> +		sd = rcu_dereference(rq->sd);
> +		if (!task_new && !is_sd_overutilized(sd) &&
IMHO, this is problematic. is_sd_overutilized() returns false if the sd is not overutilized or sd == NULL.
Shouldn't we change all of these occurrences where we fetch the sd from rcu_dereference() to:
if (sd && ...)
This would allow to get rid of the if() condition in [set|clear|is]_sd_overutilized() as well. In case we use env->sd or for_each_domain() or similar we can make sure that sd != NULL.
Nitpick: Could we rename is_sd_overutilized() into sd_overutilized() to be in line with cpu_overutilized()?
Maybe something like this? Only lightly tested on Pixel.
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e2ff672cea52..4b8cbe172d11 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4383,27 +4383,22 @@ static void update_capacity_of(int cpu)
 }
 #endif

-static bool
-is_sd_overutilized(struct sched_domain *sd)
+static bool sd_overutilized(struct sched_domain *sd)
 {
-	if (sd)
-		return sd->shared->overutilized;
-	else
-		return false;
+	BUG_ON(!sd);
+	return sd->shared->overutilized;
 }

-static void
-set_sd_overutilized(struct sched_domain *sd)
+static void set_sd_overutilized(struct sched_domain *sd)
 {
-	if (sd)
-		sd->shared->overutilized = true;
+	BUG_ON(!sd);
+	sd->shared->overutilized = true;
 }

-static void
-clear_sd_overutilized(struct sched_domain *sd)
+static void clear_sd_overutilized(struct sched_domain *sd)
 {
-	if (sd)
-		sd->shared->overutilized = false;
+	BUG_ON(!sd);
+	sd->shared->overutilized = false;
 }

 /*
@@ -4491,7 +4486,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)

 		rcu_read_lock();
 		sd = rcu_dereference(rq->sd);
-		if (!task_new && !is_sd_overutilized(sd) &&
+		if (sd && !task_new && !sd_overutilized(sd) &&
 		    cpu_overutilized(rq->cpu))
 			set_sd_overutilized(sd);
 		rcu_read_unlock();
@@ -6152,7 +6147,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f

 	rcu_read_lock();
 	sd = rcu_dereference(cpu_rq(prev_cpu)->sd);
-	if (energy_aware() && !is_sd_overutilized(sd)) {
+	if (sd && energy_aware() && !sd_overutilized(sd)) {
 		new_cpu = select_energy_cpu_brute(p, prev_cpu, sync);
 		goto unlock;
 	}
@@ -8152,7 +8147,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
 	update_sd_lb_stats(env, &sds);

 	if (energy_aware()) {
-		if (!is_sd_overutilized(env->sd))
+		if (!sd_overutilized(env->sd))
 			goto out_balanced;
 	}
@@ -9063,7 +9058,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
 	rcu_read_lock();
 	for_each_domain(cpu, sd) {
 		if (energy_aware()) {
-			if (!is_sd_overutilized(sd))
+			if (!sd_overutilized(sd))
 				continue;
 		}
@@ -9371,8 +9366,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 #ifdef CONFIG_SMP
 	rcu_read_lock();
 	sd = rcu_dereference(rq->sd);
-	if (!is_sd_overutilized(sd) && cpu_overutilized(task_cpu(curr)))
-		set_sd_overutilized(sd);
+	if (sd && !sd_overutilized(sd) && cpu_overutilized(task_cpu(curr)))
+		set_sd_overutilized(sd);
 	rcu_read_unlock();
 #endif