On Wed, Jan 18, 2017 at 02:26:53PM -0800, Joonwoo Park wrote:
Let me join the party here. Hope it's not too late ;)
Not at all :)
On 12/07/2016 02:22 PM, Thara Gopinath wrote:
@@ -6063,10 +6094,16 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 			      && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
 	}
 
-	if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized))
-		return select_energy_cpu_brute(p, prev_cpu);
-
 	rcu_read_lock();
+	sd = rcu_dereference(cpu_rq(prev_cpu)->sd);
+	if (energy_aware() &&
+	    !is_sd_overutilized(sd, cpu_rq(cpu)->rd)) {
IMHO, skipping EA task placement based on the inner overutilized flag can still produce suboptimal placement when there is unavoidable load imbalance within an SD. For example, say there is a 2+2 big.LITTLE system where CPU0 and CPU1 are the little CPUs. All other CPUs are idle, but little CPU1 has a CPU-bound task which gets the little cluster's SD marked as overutilized. Under such a condition, any new wakeup whose task's prev_cpu is either CPU0 or CPU1 will take non-EA placement. As a result, small tasks can land on a big CPU, and a big task can land on a little CPU (probably the idle little CPU).
An overutilized big-CPU SD will have the same issue for tasks that happened to be on a big CPU previously but would be better placed on a little CPU later.
Good point, and agreed. In the wakeup path, deferring the tipping point is important for the power savings EA task placement gives us.
So I think we should introduce a 'global' overutilized flag. For the 'inner' and 'outer' overutilized cases the scheduler will always use EA task placement, but for 'global' overutilized we can skip it.
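To make it concrete, here is a rough and untested sketch of the wakeup-path policy I have in mind, reusing the existing rd->overutilized field as the 'global' flag:

	/*
	 * Hypothetical wakeup-path check: the SD-level 'inner'/'outer'
	 * flags are left to the load balancer; only the root domain
	 * ('global') overutilized flag makes us skip EA placement.
	 */
	if (energy_aware() && !cpu_rq(prev_cpu)->rd->overutilized)
		return select_energy_cpu_brute(p, prev_cpu);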
What do you think? It can be tricky, but it might be better to mark an SD as overutilized only when all the CPUs in the SD are actually overutilized, for both task placement and load balancing.
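Roughly something like this hypothetical helper (the name is made up and it is untested):

	/*
	 * Hypothetical helper, not from any posted patch: treat a
	 * sched domain as overutilized only when every CPU it spans
	 * is overutilized.
	 */
	static bool sd_all_cpus_overutilized(struct sched_domain *sd)
	{
		int cpu;

		for_each_cpu(cpu, sched_domain_span(sd)) {
			if (!cpu_overutilized(cpu))
				return false;
		}

		return true;
	}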
That criterion is a bit rigid.
IMHO, the one obvious benefit we can get from this patch is power optimization, but we cannot hurt performance; at the very least this patch must not introduce too much performance degradation.
If we wait until all CPUs in the SD are marked overutilized before load balancing, that may introduce a serious performance regression on benchmarks, e.g. Vellamo. But there is some minor tuning we can try, like skipping task migration when the source sched group is not overutilized but the destination sched group is overutilized; that way we implement a one-way migration for some specific cases, as sketched below.
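As an untested sketch, in find_busiest_group() it could look like the below, using the per-group overutilized flag from the patch further down (sds.busiest and sds.local stand in for the source and destination groups):

	/*
	 * One-way migration sketch: pulling from an overutilized
	 * source group is fine, but skip the migration when only the
	 * destination group is overutilized, so we never make an
	 * already overutilized destination worse.
	 */
	if (energy_aware() && sds.busiest &&
	    !sds.busiest->overutilized && sds.local->overutilized)
		goto out_balanced;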
+		new_cpu = select_energy_cpu_brute(p, prev_cpu);
+		goto unlock;
+	}
 
-	sd = NULL;
Leo, BTW, where is the EAS repo I can publicly access to sync with your tip? I'm on Google's lsk-v4.4-16.09-android tree and hope I can post my patches before too long, but it looks like there are quite a few dependencies.
This is Thara's patch :) I think Thara is working on the ARM development branch, since I saw that Thara's patch includes some new functions introduced by that branch: git://www.linux-arm.org/linux-power.git, branch: origin/eas/next/integration_20161206_1147
Thara, please correct as needed.
For the production code line, per our previous alignment it's better to work on the Android common kernel 3.18 or 4.4; they have essentially the same code base for EAS:
https://android.googlesource.com/kernel/common/+log/android-4.4
https://android.googlesource.com/kernel/common/+log/android-3.18
I'm also posting my tweaked version of Thara's patch in case you are interested in it; I did some verification on Juno. Feedback and suggestions will be very helpful :)
---8<---
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f98e434..7371e56 100755
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4200,6 +4200,68 @@ static inline void hrtick_update(struct rq *rq)
 #ifdef CONFIG_SMP
 static bool cpu_overutilized(int cpu);
 static inline unsigned long boosted_cpu_util(int cpu);
+
+/*
+ * 1. Inner overutilized:
+ *
+ *    The load balance will happen only at SD level 1, so this flag
+ *    only takes effect inside the cluster.
+ *
+ * 2. Outer overutilized:
+ *
+ *    If a CPU has a misfit task on it, there is no doubt we should
+ *    migrate the task to another, higher-capacity CPU.
+ *
+ *    Alternatively, if one CPU is overutilized, we assume the
+ *    scheduler has already done good enough work to explore the
+ *    cluster's internal capacity, so an overutilized CPU means we
+ *    finally need to seek another cluster to provide more computing
+ *    capacity.
+ *
+ * 3. Global overutilized:
+ *
+ *    Setting the root domain flag means: go for performance as far
+ *    as possible and spread out tasks.
+ */
+static void set_sd_overutilized(struct sched_domain *sd)
+{
+	if (sd)
+		sd->groups->overutilized = true;
+}
+
+static void clear_sd_overutilized(struct sched_domain *sd)
+{
+	if (sd)
+		sd->groups->overutilized = false;
+}
+
+static void set_rd_overutilized(struct root_domain *rd)
+{
+	rd->overutilized = true;
+}
+
+static void clear_rd_overutilized(struct root_domain *rd)
+{
+	rd->overutilized = false;
+}
+
+static bool is_sd_overutilized(struct sched_domain *sd)
+{
+	struct sched_group *group = sd->groups;
+	int cpu = smp_processor_id();
+
+	if (cpu_rq(cpu)->rd->overutilized)
+		return true;
+
+	do {
+		if (group->overutilized)
+			return true;
+
+		group = group->next;
+	} while (group != sd->groups);
+
+	return false;
+}
+
 #else
 #define boosted_cpu_util(cpu) cpu_util(cpu)
 #endif
@@ -4228,6 +4290,7 @@ static void
 enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 {
 	struct cfs_rq *cfs_rq;
+	struct sched_domain *sd;
 	struct sched_entity *se = &p->se;
 #ifdef CONFIG_SMP
 	int task_new = flags & ENQUEUE_WAKEUP_NEW;
@@ -4292,11 +4355,17 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 	if (!se) {
 		walt_inc_cumulative_runnable_avg(rq, p);
-		if (!task_new && !rq->rd->overutilized &&
-		    cpu_overutilized(rq->cpu)) {
-			rq->rd->overutilized = true;
-			trace_sched_overutilized(true);
+
+		rcu_read_lock();
+		sd = rcu_dereference(rq->sd);
+		if (!task_new) {
+			if (cpu_overutilized(rq->cpu) && sd)
+				set_sd_overutilized(sd);
+
+			if (rq->misfit_task && sd && sd->parent)
+				set_sd_overutilized(sd->parent);
 		}
+		rcu_read_unlock();
 
 		/*
 		 * We want to potentially trigger a freq switch
@@ -7188,6 +7257,7 @@ struct sd_lb_stats {
 	struct sched_group *local;	/* Local group in this sd */
 	unsigned long total_load;	/* Total load of all groups in sd */
 	unsigned long total_capacity;	/* Total capacity of all groups in sd */
+	unsigned long total_util;	/* Total util of all groups in sd */
 	unsigned long avg_load;	/* Average load across all groups in sd */
 
 	struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
@@ -7207,6 +7277,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
 		.local = NULL,
 		.total_load = 0UL,
 		.total_capacity = 0UL,
+		.total_util = 0UL,
 		.busiest_stat = {
 			.avg_load = 0UL,
 			.sum_nr_running = 0,
@@ -7528,10 +7599,11 @@ group_type group_classify(struct sched_group *group,
 static inline void update_sg_lb_stats(struct lb_env *env,
 			struct sched_group *group, int load_idx,
 			int local_group, struct sg_lb_stats *sgs,
-			bool *overload, bool *overutilized)
+			bool *overload, bool *misfit)
 {
 	unsigned long load;
 	int i, nr_running;
+	bool overutilized = false;
 
 	memset(sgs, 0, sizeof(*sgs));
 
@@ -7564,10 +7636,13 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 			sgs->idle_cpus++;
 
 		if (cpu_overutilized(i))
-			*overutilized = true;
+			overutilized = true;
 
 		if (!sgs->group_misfit_task && rq->misfit_task)
 			sgs->group_misfit_task = capacity_of(i);
+
+		if (rq->misfit_task)
+			*misfit = true;
 	}
 
 	/* Adjust by relative CPU capacity of the group */
@@ -7581,6 +7656,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 
 	sgs->group_no_capacity = group_is_overloaded(env, sgs);
 	sgs->group_type = group_classify(group, sgs);
+
+	if (sgs->group_weight == 1)
+		group->overutilized = overutilized;
 }
 
 /**
@@ -7689,7 +7767,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 	struct sched_group *sg = env->sd->groups;
 	struct sg_lb_stats tmp_sgs;
 	int load_idx, prefer_sibling = 0;
-	bool overload = false, overutilized = false;
+	bool overload = false, misfit = false;
 
 	if (child && child->flags & SD_PREFER_SIBLING)
 		prefer_sibling = 1;
@@ -7711,7 +7789,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 		}
 
 		update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
-						&overload, &overutilized);
+						&overload, &misfit);
 
 		if (local_group)
 			goto next_group;
@@ -7751,6 +7829,7 @@ next_group:
 		/* Now, start updating sd_lb_stats */
 		sds->total_load += sgs->group_load;
 		sds->total_capacity += sgs->group_capacity;
+		sds->total_util += sgs->group_util;
 
 		sg = sg->next;
 	} while (sg != env->sd->groups);
@@ -7765,18 +7844,28 @@ next_group:
 		if (env->dst_rq->rd->overload != overload)
 			env->dst_rq->rd->overload = overload;
 
-		/* Update over-utilization (tipping point, U >= 0) indicator */
-		if (env->dst_rq->rd->overutilized != overutilized) {
-			env->dst_rq->rd->overutilized = overutilized;
-			trace_sched_overutilized(overutilized);
-		}
+		/*
+		 * If the total utilization is more than half of the total
+		 * capacity, then on average CPU utilization is crossing
+		 * half of the max-capacity CPU; so this is a quite high
+		 * bar for setting the root domain's overutilized flag.
+		 */
+		if (sds->total_capacity < sds->total_util * 2)
+			set_rd_overutilized(env->dst_rq->rd);
+		else
+			clear_rd_overutilized(env->dst_rq->rd);
 	} else {
-		if (!env->dst_rq->rd->overutilized && overutilized) {
-			env->dst_rq->rd->overutilized = true;
-			trace_sched_overutilized(true);
-		}
+		/*
+		 * If the domain utilization is greater than the domain
+		 * capacity, load balancing needs to be done at the next
+		 * sched domain level as well.
+		 */
+		if ((sds->total_capacity * 1024 <
+		     sds->total_util * capacity_margin) || misfit)
+			set_sd_overutilized(env->sd->parent);
+		else
+			clear_sd_overutilized(env->sd->parent);
 	}
-}
 
 /**
@@ -8019,7 +8108,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
 	 */
 	update_sd_lb_stats(env, &sds);
 
-	if (energy_aware() && !env->dst_rq->rd->overutilized)
+	if (energy_aware() && !is_sd_overutilized(env->sd))
 		goto out_balanced;
 
 	local = &sds.local_stat;
@@ -8960,6 +9049,10 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
 
 	rcu_read_lock();
 	for_each_domain(cpu, sd) {
+
+		if (energy_aware() && !is_sd_overutilized(sd))
+			continue;
+
 		/*
 		 * Decay the newidle max times here because this is a regular
 		 * visit to all the domains. Decay ~1% per second.
@@ -9255,6 +9348,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 {
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &curr->se;
+	struct sched_domain *sd;
 
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
@@ -9265,12 +9359,18 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 		task_tick_numa(rq, curr);
 
 #ifdef CONFIG_SMP
-	if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) {
-		rq->rd->overutilized = true;
-		trace_sched_overutilized(true);
-	}
-
 	rq->misfit_task = !task_fits_max(curr, rq->cpu);
+
+	rcu_read_lock();
+	sd = rcu_dereference(rq->sd);
+
+	if (cpu_overutilized(task_cpu(curr)) && sd)
+		set_sd_overutilized(sd);
+
+	if (rq->misfit_task && sd && sd->parent)
+		set_sd_overutilized(sd->parent);
+
+	rcu_read_unlock();
 #endif
 }
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 2f2b959..ca2cedb 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -916,6 +916,7 @@ struct sched_group {
 	unsigned int group_weight;
 	struct sched_group_capacity *sgc;
 	const struct sched_group_energy const *sge;
+	bool overutilized;
 
 	/*
 	 * The CPUs this group covers.
--
1.9.1