Hi Thara,
On 23/06/17 15:37, Thara Gopinath wrote:
The current implementation of overutilization aborts energy aware scheduling if any cpu in the system is overutilized. This patch introduces an overutilization flag per sched domain level instead of a single system-wide flag. Load balancing is done at the sched domain where any of the cpus is overutilized. If energy aware scheduling is enabled and no cpu in a sched domain is overutilized, load balancing is skipped for that sched domain and energy aware scheduling continues at that level.
The implementation takes advantage of the shared sched_domain structure that is common across all the sched domains at a given level. The new flag is placed in this structure so that all the sched domains at the same level share the flag (a short sketch follows the list below). When a cpu is overutilized, the flag gets set at the level 1 sched_domain. The flag at the parent sched_domain level gets set in either of the two following scenarios:
- There is a misfit task on one of the cpus in this sched_domain.
- The total utilization of the domain is greater than the domain capacity.

The flag is cleared if no cpu in a sched domain is overutilized.
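In other words, the flag lives in the per-level sched_domain_shared instance and is manipulated through trivial helpers. A minimal sketch (using the helper names from the tweaked patch further below; the remaining members of sched_domain_shared are elided):

struct sched_domain_shared {
	atomic_t	ref;
	/* ... other shared members elided ... */
	bool		overutilized;	/* one flag per topology level */
};

static bool sd_overutilized(struct sched_domain *sd)
{
	/* All sched domains at this level see the same instance. */
	return sd->shared->overutilized;
}

static void set_sd_overutilized(struct sched_domain *sd)
{
	sd->shared->overutilized = true;
}

static void clear_sd_overutilized(struct sched_domain *sd)
{
	sd->shared->overutilized = false;
}

Since every sched_domain at a level points at the same sched_domain_shared instance, setting the flag through any one of them marks the whole level.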
This implementation can still have corner scenarios with respect to misfit tasks. For example, consider a sched group with n cpus and n+1 tasks that are each 70% utilized. Ideally this is a case for load balancing to happen in a parent sched domain. But neither is the total group utilization high enough to trigger load balancing in the parent domain, nor is there a cpu with a single overutilized task that would trigger a load balance in a parent domain. Then again, this could be a purely academic scenario, as during task wakeup these tasks will be placed more appropriately.
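To put rough numbers on that corner case, here is a standalone sketch of the parent-level check from update_sd_lb_stats(), assuming SCHED_CAPACITY_SCALE = 1024 and capacity_margin = 1280 (the ~20% margin used by the overutilization checks in EAS kernels of this vintage):

#include <stdio.h>

/* Assumed constants; capacity_margin = 1280 corresponds to the
 * ~80% utilization threshold. */
#define SCHED_CAPACITY_SCALE	1024UL
#define CAPACITY_MARGIN		1280UL

int main(void)
{
	for (unsigned long n = 2; n <= 8; n++) {
		unsigned long total_capacity = n * SCHED_CAPACITY_SCALE;
		/* n + 1 tasks, each ~70% utilized */
		unsigned long total_util =
			(n + 1) * (70 * SCHED_CAPACITY_SCALE / 100);

		/* The condition from update_sd_lb_stats() that marks
		 * the parent sched domain overutilized. */
		int fires = total_capacity * SCHED_CAPACITY_SCALE <
			    total_util * CAPACITY_MARGIN;

		printf("n=%lu: parent check %s\n", n,
		       fires ? "fires" : "does not fire");
	}
	return 0;
}

With these numbers the check actually fires for groups of up to 6 cpus and only stops firing at n >= 7, so the scenario bites only in larger groups; and, as noted, wake-up placement will normally spread such tasks anyway.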
Signed-off-by: Thara Gopinath <thara.gopinath@linaro.org>
V2->V3:
- Rebased on latest kernel.
- The previous check for misfit task is replaced with the newly introduced rq->misfit_task flag.
V1->V2:
- Removed overutilized flag from sched_group structure.
- In case of misfit task, it is ensured that a load balance is triggered in a parent sched domain with asymmetric cpu capacities.
 include/linux/sched/topology.h |   1 +
 kernel/sched/fair.c            | 137 +++++++++++++++++++++++++++++++++--------
 kernel/sched/sched.h           |   3 -
 kernel/sched/topology.c        |   8 +--
 4 files changed, 117 insertions(+), 32 deletions(-)
[...]
This is what I had to do to apply this patch to the next EAS integration. Have a look and tell me if you spot any issues.
Thanks,
-- Dietmar
-- >8 --
From 1500a85a733af8590c1545928eb589e73a67de57 Mon Sep 17 00:00:00 2001
From: Dietmar Eggemann <dietmar.eggemann@arm.com>
Date: Thu, 5 Oct 2017 16:52:24 +0100
Subject: [PATCH] Tweaking [PATCH V3] Per Sched domain over utilization into
 the next eas_int
These are the changes I made to integrate this into eas_int:
(1) Incorporating my v3 review comments.
(2) Adaptations in update_sd_lb_stats() due to new mainline base.
(3) Move overutilized check behind 'decay the newidle max times' in
    rebalance_domains().
Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
---
 kernel/sched/fair.c     | 61 ++++++++++++++++++-------------------------------
 kernel/sched/topology.c |  4 ----
 2 files changed, 22 insertions(+), 43 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1cedda74f1f7..aa1388cc673d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4882,27 +4882,19 @@ static inline void hrtick_update(struct rq *rq)
 
 static bool cpu_overutilized(int cpu);
 
-static bool
-is_sd_overutilized(struct sched_domain *sd)
+static bool sd_overutilized(struct sched_domain *sd)
 {
-	if (sd)
-		return sd->shared->overutilized;
-	else
-		return false;
+	return sd->shared->overutilized;
 }
 
-static void
-set_sd_overutilized(struct sched_domain *sd)
+static void set_sd_overutilized(struct sched_domain *sd)
 {
-	if (sd)
-		sd->shared->overutilized = true;
+	sd->shared->overutilized = true;
 }
 
-static void
-clear_sd_overutilized(struct sched_domain *sd)
+static void clear_sd_overutilized(struct sched_domain *sd)
 {
-	if (sd)
-		sd->shared->overutilized = false;
+	sd->shared->overutilized = false;
 }
 
 /*
@@ -4960,8 +4952,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		add_nr_running(rq, 1);
 		rcu_read_lock();
 		sd = rcu_dereference(rq->sd);
-		if (!task_new && !is_sd_overutilized(sd) &&
-		    cpu_overutilized(rq->cpu))
+		if (!task_new && sd && !sd_overutilized(sd) &&
+		    cpu_overutilized(rq->cpu))
 			set_sd_overutilized(sd);
 		rcu_read_unlock();
 	}
@@ -6301,7 +6293,6 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu)
 	unsigned long max_spare = 0;
 	struct sched_domain *sd;
 
-	/* The rcu lock is/should be held in the caller function */
 	sd = rcu_dereference(per_cpu(sd_ea, prev_cpu));
 
 	if (!sd)
@@ -6374,8 +6365,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 
 	rcu_read_lock();
 	sd = rcu_dereference(cpu_rq(prev_cpu)->sd);
-	if (energy_aware() &&
-	    !is_sd_overutilized(sd)) {
+	if (energy_aware() && sd && !sd_overutilized(sd)) {
 		new_cpu = select_energy_cpu_brute(p, prev_cpu);
 		goto unlock;
 	}
@@ -7908,11 +7898,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 
 		if (cpu_overutilized(i)) {
 			*overutilized = true;
-			/*
-			 * If the cpu is overutilized and if there is only one
-			 * current task in cfs runqueue, it is potentially a misfit
-			 * task.
-			 */
+
 			if (rq->misfit_task)
 				*misfit_task = true;
 		}
@@ -8165,13 +8151,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 		}
 	}
 
-	/* If the domain util is greater that domain capacity, load balancing
-	 * needs to be done at the next sched domain level as well
+	/*
+	 * If the domain util is greater that domain capacity, load balancing
+	 * needs to be done at the next sched domain level as well.
 	 */
-	if (sds->total_capacity * 1024 < sds->total_util * capacity_margin)
+	if (lb_sd_parent(env->sd) &&
+	    sds->total_capacity * 1024 < sds->total_util * capacity_margin)
 		set_sd_overutilized(env->sd->parent);
 
-	if (!shared)
+	if (!(env->sd->flags & SD_SHARE_PKG_RESOURCES))
 		return;
 
 	/*
@@ -8412,10 +8400,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
 	 */
 	update_sd_lb_stats(env, &sds);
 
-	if (energy_aware()) {
-		if (!is_sd_overutilized(env->sd))
-			goto out_balanced;
-	}
+	if (energy_aware() && !sd_overutilized(env->sd))
+		goto out_balanced;
 
 	local = &sds.local_stat;
 	busiest = &sds.busiest_stat;
@@ -9309,11 +9295,6 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
 
 	rcu_read_lock();
 	for_each_domain(cpu, sd) {
-		if (energy_aware()) {
-			if (!is_sd_overutilized(sd))
-				continue;
-		}
-
 		/*
 		 * Decay the newidle max times here because this is a regular
 		 * visit to all the domains. Decay ~1% per second.
@@ -9326,6 +9307,9 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
 		}
 		max_cost += sd->max_newidle_lb_cost;
 
+		if (energy_aware() && !sd_overutilized(sd))
+			continue;
+
 		if (!(sd->flags & SD_LOAD_BALANCE))
 			continue;
 
@@ -9630,8 +9614,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 
 	rcu_read_lock();
 	sd = rcu_dereference(rq->sd);
-	if (!is_sd_overutilized(sd) &&
-	    cpu_overutilized(task_cpu(curr)))
+	if (sd && !sd_overutilized(sd) && cpu_overutilized(task_cpu(curr)))
 		set_sd_overutilized(sd);
 	rcu_read_unlock();
 }
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 42a3ede19a5c..7fe59856a34f 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1274,10 +1274,6 @@ sd_init(struct sched_domain_topology_level *tl,
 		sd->idle_idx = 1;
 	}
 
-	/*
-	 * For all levels sharing cache; connect a sched_domain_shared
-	 * instance.
-	 */
 	sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
 	atomic_inc(&sd->shared->ref);
 
-- 
2.11.0