From: Thara Gopinath <thara.gopinath@linaro.org>
The current implementation of overutilization aborts energy aware scheduling if any CPU in the system is overutilized. This patch introduces an overutilized flag per sched group instead of a single system-wide flag. Load balancing is done at a sched domain where any of its sched groups is overutilized. If energy aware scheduling is enabled and no sched group in a sched domain is overutilized, load balancing is skipped for that sched domain and energy aware scheduling continues at that level.
The implementation is based on two points: 1. For every CPU, in every sched domain, the first sched group is the group that contains the CPU itself. 2. Sched groups are shared between CPUs.
Thus, if a sched group finds it needs to spread tasks, it should set the corresponding overutilized flag. There are three kinds of overutilized flags to consider (a rough illustrative sketch follows the list):
- Inner overutilized: if a CPU wants to spread tasks within its own cluster, the overutilized flag is set on the first sched group of the lowest sched domain. This flag indicates that task spreading is required from the CPU and asks other CPUs in the lowest sched domain to take over tasks from it;
- Outer overutilized: if a CPU wants to spread tasks to another cluster, the overutilized flag is set on the first sched group of the parent sched domain. This ensures load balancing at the overutilized sched domain level, meaning the CPU is seeking help from another cluster so that CPUs in the other cluster can migrate tasks to it and improve performance;
- Global overutilized: if the whole system is busy, the root domain flag is set to bypass energy aware scheduling and fall back entirely to traditional load balancing. This favours overall performance by spreading tasks as much as possible.
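To make the three levels concrete, here is a minimal userspace sketch of which flag level a single CPU would request. This is illustration only, not the patch code: the toy_cpu struct, the helper names and the 80%/50% constants are assumptions chosen to mirror the description above.

#include <stdbool.h>
#include <stdio.h>

/* Toy model of one CPU; field names are illustrative, not kernel structures. */
struct toy_cpu {
	unsigned long util;		/* current utilization */
	unsigned long capacity;		/* max capacity of this CPU */
	unsigned long min_opp_capacity;	/* capacity at the lowest OPP */
	bool has_misfit_task;		/* carries a task too big for it */
};

/* ~80% margin check: util * 1280 > capacity * 1024 means util > ~80% of capacity. */
static bool toy_cpu_overutilized(const struct toy_cpu *cpu)
{
	return cpu->util * 1280 > cpu->capacity * 1024;
}

/*
 * Which flag level would this CPU request?
 *  - inner:  running above the lowest OPP, so spread within the cluster;
 *  - outer:  overutilized or carrying a misfit task, so ask the other cluster;
 *  - global: system utilization above half of system capacity, so fall back
 *            to regular load balancing everywhere.
 */
static void toy_flag_levels(const struct toy_cpu *cpu,
			    unsigned long total_util,
			    unsigned long total_capacity,
			    bool *inner, bool *outer, bool *global)
{
	*inner  = cpu->util > cpu->min_opp_capacity;
	*outer  = toy_cpu_overutilized(cpu) || cpu->has_misfit_task;
	*global = total_util * 2 > total_capacity;
}

int main(void)
{
	/* One little CPU (capacity 512) in a 2 little + 2 big system (total 3072). */
	struct toy_cpu little = {
		.util = 400, .capacity = 512,
		.min_opp_capacity = 128, .has_misfit_task = true,
	};
	bool inner, outer, global;

	toy_flag_levels(&little, 1200, 3072, &inner, &outer, &global);
	printf("inner=%d outer=%d global=%d\n", inner, outer, global);
	return 0;
}

In this toy run the little CPU asks for help from the big cluster (outer) because of the misfit task, but the system-wide flag stays clear because total utilization is still below half of total capacity.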
For example, consider a big.LITTLE system with two little CPUs (CPU A and CPU B) and two big CPUs (CPU C and CPU D). In this system the hierarchy is as follows:
CPU A: SD level 1 - SG1(CPUA), SG2(CPUB); SD level 2 - SG5(CPUA, CPUB), SG6(CPUC, CPUD); RD
CPU B: SD level 1 - SG2(CPUB), SG1(CPUA); SD level 2 - SG5(CPUA, CPUB), SG6(CPUC, CPUD); RD
CPU C: SD level 1 - SG3(CPUC), SG4(CPUD); SD level 2 - SG6(CPUC, CPUD), SG5(CPUA, CPUB); RD
CPU D: SD level 1 - SG4(CPUD), SG3(CPUC); SD level 2 - SG6(CPUC, CPUD), SG5(CPUA, CPUB); RD
In the above system, if CPUA is not running at the lowest OPP, the overutilized flag is set at SG1 so the scheduler can load balance between CPUA and CPUB.
If CPUA is overutilized or has a misfit task, the overutilized flag is set at SG5 (the first sched group of the parent sched domain). During load balancing at SD level 2, the scheduler iterates over every sched group's overutilized flag and, if any flag is set, performs load balancing in this sched domain.
If the overall system utilization is bigger than 50% of the overall CPU capacity, the flag is set and checked at the root domain; this means the overall utilization has crossed at least one cluster's worth of capacity.
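As a worked example of that threshold (the capacities here are illustrative): with two little CPUs of capacity 512 and two big CPUs of capacity 1024, the total capacity is 2 * 512 + 2 * 1024 = 3072, so the root domain flag is only set once the total utilization exceeds 1536. The check added in update_sd_lb_stats() expresses exactly this: total_capacity < total_util * 2. A total utilization of 1536 no longer fits in the little cluster (2 * 512 = 1024), so by the time the flag is set at least one cluster's worth of capacity has been crossed.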
[ Changed by Leo to support discrete flags for inner/outer/global over-utilization ]
Signed-off-by: Thara Gopinath <thara.gopinath@linaro.org>
Signed-off-by: Leo Yan <leo.yan@linaro.org>
---
 kernel/sched/fair.c  | 145 ++++++++++++++++++++++++++++++++++++++++++---------
 kernel/sched/sched.h |   1 +
 2 files changed, 122 insertions(+), 24 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6833524..2a263f7 100755
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4658,6 +4658,68 @@ static inline void hrtick_update(struct rq *rq)
 #ifdef CONFIG_SMP
 static bool cpu_overutilized(int cpu);
 unsigned long boosted_cpu_util(int cpu);
+
+/*
+ * 1. Inner overutilized:
+ *
+ * The load balance will happen only in SD level 1, so this means
+ * it only takes effect within the cluster.
+ *
+ * 2. Outer overutilized:
+ *
+ * If the CPU has a misfit task on it, there is no doubt it should
+ * be migrated to a higher capacity CPU.
+ *
+ * Or, if one CPU is overutilized, we assume the scheduler has
+ * already done good enough work to explore the cluster's internal
+ * capacity, so an overutilized CPU finally needs to seek another
+ * cluster to provide more computing capacity.
+ *
+ * 3. Global overutilized:
+ *
+ * Setting the root domain flag means going for performance by
+ * spreading out tasks as much as possible.
+ *
+ */
+static void set_sd_overutilized(struct sched_domain *sd)
+{
+	if (sd)
+		sd->groups->overutilized = true;
+}
+
+static void clear_sd_overutilized(struct sched_domain *sd)
+{
+	if (sd)
+		sd->groups->overutilized = false;
+}
+
+static void set_rd_overutilized(struct root_domain *rd)
+{
+	rd->overutilized = true;
+}
+
+static void clear_rd_overutilized(struct root_domain *rd)
+{
+	rd->overutilized = false;
+}
+
+static bool is_sd_overutilized(struct sched_domain *sd)
+{
+	struct sched_group *group = sd->groups;
+	int cpu = smp_processor_id();
+
+	if (cpu_rq(cpu)->rd->overutilized)
+		return true;
+
+	do {
+		if (group->overutilized)
+			return true;
+
+	} while (group = group->next, group != sd->groups);
+
+	return false;
+}
+
 #else
 #define boosted_cpu_util(cpu) cpu_util(cpu)
 #endif
@@ -4686,6 +4748,7 @@ static void
 enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 {
 	struct cfs_rq *cfs_rq;
+	struct sched_domain *sd;
 	struct sched_entity *se = &p->se;
 #ifdef CONFIG_SMP
 	int task_new = flags & ENQUEUE_WAKEUP_NEW;
@@ -4758,11 +4821,17 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	if (!se) {
 		walt_inc_cumulative_runnable_avg(rq, p);
-		if (!task_new && !rq->rd->overutilized &&
-		    cpu_overutilized(rq->cpu)) {
-			rq->rd->overutilized = true;
-			trace_sched_overutilized(true);
+
+		rcu_read_lock();
+		sd = rcu_dereference(rq->sd);
+		if (!task_new) {
+			if (cpu_overutilized(rq->cpu) && sd)
+				set_sd_overutilized(sd);
+
+			if (rq->misfit_task && sd && sd->parent)
+				set_sd_overutilized(sd->parent);
 		}
+		rcu_read_unlock();
 		/*
 		 * We want to potentially trigger a freq switch
@@ -7754,6 +7823,7 @@ struct sd_lb_stats {
 	struct sched_group *local;	/* Local group in this sd */
 	unsigned long total_load;	/* Total load of all groups in sd */
 	unsigned long total_capacity;	/* Total capacity of all groups in sd */
+	unsigned long total_util;	/* Total util of all groups in sd */
 	unsigned long avg_load;	/* Average load across all groups in sd */
 	struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
@@ -7773,6 +7843,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
 		.local = NULL,
 		.total_load = 0UL,
 		.total_capacity = 0UL,
+		.total_util = 0UL,
 		.busiest_stat = {
 			.avg_load = 0UL,
 			.sum_nr_running = 0,
@@ -8100,10 +8171,11 @@ group_type group_classify(struct sched_group *group,
 static inline void update_sg_lb_stats(struct lb_env *env,
 			struct sched_group *group, int load_idx,
 			int local_group, struct sg_lb_stats *sgs,
-			bool *overload, bool *overutilized)
+			bool *overload, bool *misfit)
 {
 	unsigned long load;
 	int i, nr_running;
+	bool overutilized = false;

 	memset(sgs, 0, sizeof(*sgs));
@@ -8136,7 +8208,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 			sgs->idle_cpus++;
 		if (cpu_overutilized(i)) {
-			*overutilized = true;
+			overutilized = true;
 			if (!sgs->group_misfit_task && rq->misfit_task)
 				sgs->group_misfit_task = capacity_of(i);
 		}
@@ -8153,6 +8225,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 	sgs->group_no_capacity = group_is_overloaded(env, sgs);
 	sgs->group_type = group_classify(group, sgs);
+
+	if (sgs->group_weight == 1)
+		group->overutilized = overutilized;
 }
 /**
@@ -8270,7 +8345,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 	struct sched_group *sg = env->sd->groups;
 	struct sg_lb_stats tmp_sgs;
 	int load_idx, prefer_sibling = 0;
-	bool overload = false, overutilized = false;
+	bool overload = false, misfit = false;
 	if (child && child->flags & SD_PREFER_SIBLING)
 		prefer_sibling = 1;
@@ -8292,7 +8367,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 		}
 		update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
-						&overload, &overutilized);
+						&overload, &misfit);
 		if (local_group)
 			goto next_group;
@@ -8332,6 +8407,7 @@ next_group:
 		/* Now, start updating sd_lb_stats */
 		sds->total_load += sgs->group_load;
 		sds->total_capacity += sgs->group_capacity;
+		sds->total_util += sgs->group_util;
 		sg = sg->next;
 	} while (sg != env->sd->groups);
@@ -8346,18 +8422,28 @@ next_group:
 		if (env->dst_rq->rd->overload != overload)
 			env->dst_rq->rd->overload = overload;
-		/* Update over-utilization (tipping point, U >= 0) indicator */
-		if (env->dst_rq->rd->overutilized != overutilized) {
-			env->dst_rq->rd->overutilized = overutilized;
-			trace_sched_overutilized(overutilized);
-		}
+		/*
+		 * If total utilization is more than half of capacity,
+		 * this means at least the average CPU utilization is
+		 * crossing half of max capacity CPU; so this is a quite
+		 * high bar to set the root domain's overutilized flag.
+		 */
+		if (sds->total_capacity < sds->total_util * 2)
+			set_rd_overutilized(env->dst_rq->rd);
+		else
+			clear_rd_overutilized(env->dst_rq->rd);
 	} else {
-		if (!env->dst_rq->rd->overutilized && overutilized) {
-			env->dst_rq->rd->overutilized = true;
-			trace_sched_overutilized(true);
-		}
+		/*
+		 * If the domain util is greater than the domain capacity,
+		 * load balancing needs to be done at the next sched
+		 * domain level as well.
+		 */
+		if ((sds->total_capacity * 1024 <
+		     sds->total_util * capacity_margin) || misfit)
+			set_sd_overutilized(env->sd->parent);
+		else
+			clear_sd_overutilized(env->sd->parent);
 	}
-
 }
 /**
@@ -8600,7 +8686,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
 	 */
 	update_sd_lb_stats(env, &sds);
-	if (energy_aware() && !env->dst_rq->rd->overutilized)
+	if (energy_aware() && !is_sd_overutilized(env->sd))
 		goto out_balanced;
 	local = &sds.local_stat;
@@ -9514,6 +9600,10 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
 	rcu_read_lock();
 	for_each_domain(cpu, sd) {
+
+		if (energy_aware() && !is_sd_overutilized(sd))
+			continue;
+
 		/*
 		 * Decay the newidle max times here because this is a regular
 		 * visit to all the domains. Decay ~1% per second.
@@ -9805,6 +9895,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 {
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &curr->se;
+	struct sched_domain *sd;
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
@@ -9815,12 +9906,18 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 		task_tick_numa(rq, curr);
 #ifdef CONFIG_SMP
-	if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) {
-		rq->rd->overutilized = true;
-		trace_sched_overutilized(true);
-	}
-
 	rq->misfit_task = !task_fits_max(curr, rq->cpu);
+
+	rcu_read_lock();
+	sd = rcu_dereference(rq->sd);
+
+	if (cpu_overutilized(task_cpu(curr)) && sd)
+		set_sd_overutilized(sd);
+
+	if (rq->misfit_task && sd && sd->parent)
+		set_sd_overutilized(sd->parent);
+
+	rcu_read_unlock();
 #endif
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ce364dd..c1b03a6 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -925,6 +925,7 @@ struct sched_group {
 	unsigned int group_weight;
 	struct sched_group_capacity *sgc;
 	const struct sched_group_energy *sge;
+	bool overutilized;
 	/*
 	 * The CPUs this group covers.
-- 
1.9.1