Hi Leo,
Thanks for the review. It will be great if you can test this patch out and let me know of any improvements/degradation.
On 12/08/2016 05:24 AM, Leo Yan wrote:
Hi Thara,
Good to see the patch; please see some questions.
On Wed, Dec 07, 2016 at 05:22:37PM -0500, Thara Gopinath wrote:
The current implementation of overutilization aborts energy-aware scheduling if any cpu in the system is over-utilized. This patch introduces an over-utilization flag per sched group level instead of a single system-wide flag. Load balancing is done at the sched domain where any of the sched groups is over-utilized. If energy-aware scheduling is enabled and no sched group in a sched domain is overutilized, load balancing is skipped for that sched domain and energy-aware scheduling continues at that level.
The implementation is based on two points
- For every cpu in every sched domain the first group is the group that contains the cpu itself.
- sched groups are shared between cpus.
Thus if a sched group is overutilized the overutilized flag is set at the first sched group of the parent sched domain. This ensures load balancing at the overutilized sched domain level. For example, consider a big.LITTLE system with two little CPUs (CPU A and CPU B) and two big CPUs (CPU C and CPU D). In this system, the hierarchy will be as follows CPU A SD level 1 - SG1 (CPUA), SG2 (CPUB) SD level 2 - SG5(CPUA, CPUB), SG6(CPU C, CPU D) RD
CPU B SD level 1 - SG2(CPUB), SG1 (CPUA) SD level 2 - SG5(CPU A, CPU B), SG6(CPU C, CPUD) RD
CPU C SD level 1 - SG3(CPU C), SG4 (CPUD) SD level 2 - SG6(CPUC, CPUD), SG5(CPUA, CPU B) RD
CPU D SD level 1 - SG4(CPU D), SG3(CPU C) SD level2 - SG6(CPUC, CPU D), SG5(CPU A, CPU B) RD
In the above system if CPUA is overutilized, the overutilized flag is set at SG5(parent sched domain first sched group). Similarly if CPUB is overutilized, the flag is set at SG5. During load balancing, at SD level 1, the overutilized flag is checked at the parent sched domain, first sched group level(SG5). If there is no parent sched domain, then the flag is set/checked at the root domain. This ensures that load balancing happens irrespective of which cpu is over utilized in a sched domain.
So for SD level 1, if any CPU is overutilized then it will set the overutilized flag for SD level 1.
SD is cpu specific. If CPU A or CPU B is overutilized, the flag is set at the SG5 level. This way, irrespective of which cpu the load balancing is happening from, load is balanced between CPU A and CPU B.
What's the criteria to set the root domain flag? I read the code; if the system has only one overutilized CPU, it is still possible to set the root domain's overutilized flag, right?
In the above example if both CPUA and CPU B are over-utilized(which means for both CPU A and CPU B, SD level 1 is over-utilized which in turn means SG5 is overutilized), the load balancing has to happen at the next level between SG5 and SG6. In this case the flag is set at the RD level.
Signed-off-by: Thara Gopinath thara.gopinath@linaro.org
kernel/sched/fair.c | 108 ++++++++++++++++++++++++++++++++++++++++++--------- kernel/sched/sched.h | 1 + 2 files changed, 90 insertions(+), 19 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 01fa969..0c97e0a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4559,6 +4559,36 @@ static inline void hrtick_update(struct rq *rq)
static bool cpu_overutilized(int cpu);
+static bool +is_sd_overutilized(struct sched_domain *sd, struct root_domain *rd) +{
- if (sd && sd->parent)
return sd->parent->groups->overutilized;
- if (!rd)
return false;
- return rd->overutilized;
+}
+static void +set_sd_overutilized(struct sched_domain *sd, struct root_domain *rd) +{
- if (sd && sd->parent)
sd->parent->groups->overutilized = true;
- else if (rd)
rd->overutilized = true;
+}
+static void +clear_sd_overutilized(struct sched_domain *sd, struct root_domain *rd) +{
- if (sd && sd->parent)
sd->parent->groups->overutilized = false;
- else if (rd)
rd->overutilized = false;
+}
/*
- The enqueue_task method is called before nr_running is
- increased. Here we update the fair scheduling stats and
@@ -4568,6 +4598,7 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq;
- struct sched_domain *sd; struct sched_entity *se = &p->se; int task_new = !(flags & ENQUEUE_WAKEUP);
@@ -4603,9 +4634,12 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (!se) { add_nr_running(rq, 1);
if (!task_new && !rq->rd->overutilized &&
cpu_overutilized(rq->cpu))
rq->rd->overutilized = true;
rcu_read_lock();
sd = rcu_dereference(rq->sd);
if (!task_new && !is_sd_overutilized(sd, rq->rd) &&
cpu_overutilized(rq->cpu))
set_sd_overutilized(sd, rq->rd);
} hrtick_update(rq);rcu_read_unlock();
} @@ -5989,8 +6023,6 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu) unsigned long max_spare = 0; struct sched_domain *sd;
rcu_read_lock();
sd = rcu_dereference(per_cpu(sd_ea, prev_cpu));
if (!sd)
@@ -6028,7 +6060,6 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu) }
unlock:
rcu_read_unlock();
if (energy_cpu == prev_cpu && !cpu_overutilized(prev_cpu)) return prev_cpu;
@@ -6063,10 +6094,16 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); }
- if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized))
return select_energy_cpu_brute(p, prev_cpu);
- rcu_read_lock();
- sd = rcu_dereference(cpu_rq(prev_cpu)->sd);
- if (energy_aware() &&
!is_sd_overutilized(sd,
cpu_rq(cpu)->rd)) {
new_cpu = select_energy_cpu_brute(p, prev_cpu);
goto unlock;
- }
- sd = NULL;
Is it better to place function select_energy_cpu_brute() out of rcu locking, like below?
I don't understand how this will change anything. In the above code, select_energy_cpu_brute() is rcu protected.
rcu_read_lock(); sd = rcu_dereference(cpu_rq(prev_cpu)->sd); is_overutilized = is_sd_overutilized(sd, cpu_rq(cpu)->rd)); rcu_read_unlock(); if (energy_aware() && !is_overutilized) return select_energy_cpu_brute(p, prev_cpu);
for_each_domain(cpu, tmp) { if (!(tmp->flags & SD_LOAD_BALANCE)) break; @@ -6131,6 +6168,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f } /* while loop will break here if sd == NULL */ }
+unlock: rcu_read_unlock();
return new_cpu; @@ -7178,6 +7217,7 @@ struct sd_lb_stats { struct sched_group *local; /* Local group in this sd */ unsigned long total_load; /* Total load of all groups in sd */ unsigned long total_capacity; /* Total capacity of all groups in sd */
unsigned long total_util; /* Total util of all groups in sd */ unsigned long avg_load; /* Average load across all groups in sd */
struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
@@ -7197,6 +7237,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) .local = NULL, .total_load = 0UL, .total_capacity = 0UL,
.busiest_stat = { .avg_load = 0UL, .sum_nr_running = 0,.total_util = 0UL,
@@ -7692,6 +7733,7 @@ next_group: /* Now, start updating sd_lb_stats */ sds->total_load += sgs->group_load; sds->total_capacity += sgs->group_capacity;
sds->total_util += sgs->group_util;
sg = sg->next; } while (sg != env->sd->groups);
@@ -7701,17 +7743,26 @@ next_group:
env->src_grp_nr_running = sds->busiest_stat.sum_nr_running;
- /* Setting overutilized flag might not be necessary here
* Revisit
if (!lb_sd_parent(env->sd)) { /* update overload indicator if we are at root domain */ if (env->dst_rq->rd->overload != overload) env->dst_rq->rd->overload = overload;*/
- }
/* Update over-utilization (tipping point, U >= 0) indicator */
if (env->dst_rq->rd->overutilized != overutilized)
env->dst_rq->rd->overutilized = overutilized;
- } else {
if (!env->dst_rq->rd->overutilized && overutilized)
env->dst_rq->rd->overutilized = true;
- if (overutilized)
set_sd_overutilized(env->sd, env->dst_rq->rd);
If it's not overutilized, here should call function clear_sd_overutilized()? The old code clears root domain flag.
Yes. Maybe we should. I am not sure about it, because we clear the flag at a different place in this implementation.
- /* If the domain util is greater than domain capacity, load balancing
* needs to be done at the next sched domain level as well
*/
- if (sds->total_capacity * 1024 < sds->total_util * capacity_margin) {
/* If already at the highest domain nothing can be done */
if (env->sd->parent)
set_sd_overutilized(env->sd->parent,
env->dst_rq->rd);
So usually this will set the root domain's flag after the whole sched domain's util is greater than the domain capacity. If a CPU has one "misfit" task then the scheduler will not reach this condition, so this will not set the root domain's flag and will introduce delay in migrating the task.
Hmm, yes. You are correct. We may have to handle misfit tasks separately.
} }
@@ -7932,8 +7983,11 @@ static struct sched_group *find_busiest_group(struct lb_env *env) */ update_sd_lb_stats(env, &sds);
- if (energy_aware() && !env->dst_rq->rd->overutilized)
goto out_balanced;
/* Is this check really required here?? Revisit */
if (energy_aware()) {
if (!is_sd_overutilized(env->sd, env->dst_rq->rd))
goto out_balanced;
}
local = &sds.local_stat; busiest = &sds.busiest_stat;
@@ -8000,6 +8054,12 @@ static struct sched_group *find_busiest_group(struct lb_env *env) force_balance: /* Looks like there is an imbalance. Compute it */ calculate_imbalance(env, &sds);
- /* Is this the correct place to clear this flag? Should access
* to flag be locked? Revisit.
*/
- clear_sd_overutilized(env->sd, env->dst_rq->rd);
Is there a specific sequence to clear the overutilized flag for the root domain and SD level 2? For example, first clear the root domain flag and then clear the SD level 2 flag.
We don't need a specific sequence. As and when each sched domain gets balanced, the corresponding flag will get cleared.
return sds.busiest;
out_balanced: @@ -8790,6 +8850,11 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
rcu_read_lock(); for_each_domain(cpu, sd) {
if (energy_aware()) {
if (!is_sd_overutilized(sd, rq->rd))
continue;
}
- /*
- Decay the newidle max times here because this is a regular
- visit to all the domains. Decay ~1% per second.
@@ -9083,6 +9148,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) { struct cfs_rq *cfs_rq; struct sched_entity *se = &curr->se;
struct sched_domain *sd;
for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se);
@@ -9092,8 +9158,12 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) if (static_branch_unlikely(&sched_numa_balancing)) task_tick_numa(rq, curr);
- if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr)))
rq->rd->overutilized = true;
- rcu_read_lock();
- sd = rcu_dereference(rq->sd);
- if (!is_sd_overutilized(sd, rq->rd) &&
cpu_overutilized(task_cpu(curr)))
set_sd_overutilized(sd, rq->rd);
- rcu_read_unlock();
}
/* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f99391d..90c48ac 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -913,6 +913,7 @@ struct sched_group { unsigned int group_weight; struct sched_group_capacity *sgc; const struct sched_group_energy const *sge;
bool overutilized;
/*
- The CPUs this group covers.
-- 2.1.4
-- Regards Thara