The current implementation of overutilization aborts energy aware
scheduling if any cpu in the system is overutilized. This patch
introduces an overutilization flag per sched domain level instead of a
single system-wide flag. Load balancing is done at the sched domain
level where any of the cpus is overutilized. If energy aware scheduling
is enabled and no cpu in a sched domain is overutilized, load balancing
is skipped for that sched domain and energy aware scheduling continues
at that level.
The implementation takes advantage of the shared sched_domain structure
that is common across all the sched domains at a level. The new flag is
placed in this structure so that all the sched domains at the same
level share the flag. In case of an overutilized cpu, the flag gets set
at the level 1 sched_domain. The flag at the parent sched_domain level
gets set in either of the two following scenarios:
1. There is a misfit task on one of the cpus in this sched_domain.
2. The total utilization of the domain is greater than the domain
   capacity.
The flag is cleared if no cpu in a sched domain is overutilized.
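As a rough illustration of scenario 2 (a sketch only, not part of this
patch; the helper name is made up for illustration, while
capacity_margin is the existing ~20% headroom factor in fair.c):
/*
 * Sketch: the domain is considered overutilized once the summed
 * utilization of its groups no longer fits the summed capacity with
 * ~20% headroom (capacity_margin = 1280). For example, a domain with
 * a total capacity of 4096 is flagged once total_util exceeds ~3276.
 */
static inline bool sd_util_exceeds_capacity(unsigned long total_util,
					    unsigned long total_capacity)
{
	return total_capacity * 1024 < total_util * capacity_margin;
}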
This implementation can still have corner scenarios with respect to
misfit tasks. For example, consider a sched group with n cpus and n+1
tasks that are each 70% utilized. Ideally this is a case for a load
balance to happen in a parent sched domain. But neither is the total
group utilization high enough for a load balance to be triggered in the
parent domain, nor is there a cpu with a single overutilized task so
that a load balance is triggered in a parent domain. But again this
could be a purely academic scenario, as during task wake-up these tasks
will be placed more appropriately.
Signed-off-by: Thara Gopinath <thara.gopinath(a)linaro.org>
---
V1->V2:
- Removed overutilized flag from sched_group structure.
- In case of a misfit task, it is ensured that a load balance is
  triggered in a parent sched domain with asymmetric cpu capacities.
include/linux/sched.h | 1 +
kernel/sched/core.c | 7 ++-
kernel/sched/fair.c | 138 +++++++++++++++++++++++++++++++++++++++++---------
kernel/sched/sched.h | 3 --
4 files changed, 117 insertions(+), 32 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1c5122e..971842a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1112,6 +1112,7 @@ struct sched_domain_shared {
atomic_t ref;
atomic_t nr_busy_cpus;
int has_idle_cores;
+ bool overutilized;
};
struct sched_domain {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 31a466f..e0a8758 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6659,11 +6659,10 @@ sd_init(struct sched_domain_topology_level *tl,
* For all levels sharing cache; connect a sched_domain_shared
* instance.
*/
- if (sd->flags & SD_SHARE_PKG_RESOURCES) {
- sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
- atomic_inc(&sd->shared->ref);
+ sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
+ atomic_inc(&sd->shared->ref);
+ if (sd->flags & SD_SHARE_PKG_RESOURCES)
atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
- }
sd->private = sdd;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 489f6d3..9d2bb07 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4735,6 +4735,30 @@ static inline void hrtick_update(struct rq *rq)
static bool cpu_overutilized(int cpu);
+static bool
+is_sd_overutilized(struct sched_domain *sd)
+{
+ if (sd)
+ return sd->shared->overutilized;
+ else
+ return false;
+}
+
+static void
+set_sd_overutilized(struct sched_domain *sd)
+{
+ if (sd)
+ sd->shared->overutilized = true;
+}
+
+static void
+clear_sd_overutilized(struct sched_domain *sd)
+{
+ if (sd)
+ sd->shared->overutilized = false;
+}
+
+
/*
* The enqueue_task method is called before nr_running is
* increased. Here we update the fair scheduling stats and
@@ -4744,6 +4768,7 @@ static void
enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
struct cfs_rq *cfs_rq;
+ struct sched_domain *sd;
struct sched_entity *se = &p->se;
int task_new = !(flags & ENQUEUE_WAKEUP);
@@ -4787,9 +4812,12 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (!se) {
add_nr_running(rq, 1);
- if (!task_new && !rq->rd->overutilized &&
- cpu_overutilized(rq->cpu))
- rq->rd->overutilized = true;
+ rcu_read_lock();
+ sd = rcu_dereference(rq->sd);
+ if (!task_new && !is_sd_overutilized(sd) &&
+ cpu_overutilized(rq->cpu))
+ set_sd_overutilized(sd);
+ rcu_read_unlock();
}
hrtick_update(rq);
}
@@ -6173,8 +6201,7 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu)
unsigned long max_spare = 0;
struct sched_domain *sd;
- rcu_read_lock();
-
+	/* The rcu read lock must be held by the caller of this function */
sd = rcu_dereference(per_cpu(sd_ea, prev_cpu));
if (!sd)
@@ -6212,8 +6239,6 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu)
}
unlock:
- rcu_read_unlock();
-
if (energy_cpu == prev_cpu && !cpu_overutilized(prev_cpu))
return prev_cpu;
@@ -6247,10 +6272,16 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
&& cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
}
- if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized))
- return select_energy_cpu_brute(p, prev_cpu);
-
rcu_read_lock();
+ sd = rcu_dereference(cpu_rq(prev_cpu)->sd);
+ if (energy_aware() &&
+ !is_sd_overutilized(sd)) {
+ new_cpu = select_energy_cpu_brute(p, prev_cpu);
+ goto unlock;
+ }
+
+ sd = NULL;
+
for_each_domain(cpu, tmp) {
if (!(tmp->flags & SD_LOAD_BALANCE))
break;
@@ -6315,6 +6346,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
}
/* while loop will break here if sd == NULL */
}
+
+unlock:
rcu_read_unlock();
return new_cpu;
@@ -7366,6 +7399,7 @@ struct sd_lb_stats {
struct sched_group *local; /* Local group in this sd */
unsigned long total_load; /* Total load of all groups in sd */
unsigned long total_capacity; /* Total capacity of all groups in sd */
+ unsigned long total_util; /* Total util of all groups in sd */
unsigned long avg_load; /* Average load across all groups in sd */
struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
@@ -7385,6 +7419,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
.local = NULL,
.total_load = 0UL,
.total_capacity = 0UL,
+ .total_util = 0UL,
.busiest_stat = {
.avg_load = 0UL,
.sum_nr_running = 0,
@@ -7664,7 +7699,7 @@ group_type group_classify(struct sched_group *group,
static inline void update_sg_lb_stats(struct lb_env *env,
struct sched_group *group, int load_idx,
int local_group, struct sg_lb_stats *sgs,
- bool *overload, bool *overutilized)
+ bool *overload, bool *overutilized, bool *misfit_task)
{
unsigned long load;
int i, nr_running;
@@ -7699,8 +7734,16 @@ static inline void update_sg_lb_stats(struct lb_env *env,
if (!nr_running && idle_cpu(i))
sgs->idle_cpus++;
- if (cpu_overutilized(i))
+ if (cpu_overutilized(i)) {
*overutilized = true;
+ /*
+ * If the cpu is overutilized and if there is only one
+ * current task in cfs runqueue, it is potentially a misfit
+ * task.
+ */
+ if (rq->cfs.h_nr_running == 1)
+ *misfit_task = true;
+ }
}
/* Adjust by relative CPU capacity of the group */
@@ -7825,11 +7868,11 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
*/
static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
{
- struct sched_domain *child = env->sd->child;
+ struct sched_domain *child = env->sd->child, *sd;
struct sched_group *sg = env->sd->groups;
struct sg_lb_stats tmp_sgs;
int load_idx, prefer_sibling = 0;
- bool overload = false, overutilized = false;
+ bool overload = false, overutilized = false, misfit_task = false;
if (child && child->flags & SD_PREFER_SIBLING)
prefer_sibling = 1;
@@ -7851,7 +7894,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
}
update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
- &overload, &overutilized);
+ &overload, &overutilized,
+ &misfit_task);
if (local_group)
goto next_group;
@@ -7882,6 +7926,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
/* Now, start updating sd_lb_stats */
sds->total_load += sgs->group_load;
sds->total_capacity += sgs->group_capacity;
+ sds->total_util += sgs->group_util;
sg = sg->next;
} while (sg != env->sd->groups);
@@ -7895,14 +7940,45 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
/* update overload indicator if we are at root domain */
if (env->dst_rq->rd->overload != overload)
env->dst_rq->rd->overload = overload;
+ }
- /* Update over-utilization (tipping point, U >= 0) indicator */
- if (env->dst_rq->rd->overutilized != overutilized)
- env->dst_rq->rd->overutilized = overutilized;
- } else {
- if (!env->dst_rq->rd->overutilized && overutilized)
- env->dst_rq->rd->overutilized = true;
+ if (overutilized)
+ set_sd_overutilized(env->sd);
+ else
+ clear_sd_overutilized(env->sd);
+
+ /*
+ * If there is a misfit task in one cpu in this sched_domain
+ * it is likely that the imbalance cannot be sorted out among
+ * the cpu's in this sched_domain. In this case set the
+ * overutilized flag at the parent sched_domain.
+ */
+ if (misfit_task) {
+
+ sd = env->sd->parent;
+
+ /*
+ * In case of a misfit task, load balance at the parent
+	 * sched domain level will make sense only if the cpus
+	 * have a different capacity. If cpus at a domain level have
+	 * the same capacity, the misfit task cannot be well
+	 * accommodated in any of the cpus and there is no point in
+	 * trying a load balance at this level.
+ */
+ while (sd) {
+ if (sd->flags & SD_ASYM_CPUCAPACITY) {
+ set_sd_overutilized(sd);
+ break;
+ }
+ sd = sd->parent;
+ }
}
+
+	/* If the domain util is greater than the domain capacity, load balancing
+ * needs to be done at the next sched domain level as well
+ */
+ if (sds->total_capacity * 1024 < sds->total_util * capacity_margin)
+ set_sd_overutilized(env->sd->parent);
}
/**
@@ -8122,8 +8198,10 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
*/
update_sd_lb_stats(env, &sds);
- if (energy_aware() && !env->dst_rq->rd->overutilized)
- goto out_balanced;
+ if (energy_aware()) {
+ if (!is_sd_overutilized(env->sd))
+ goto out_balanced;
+ }
local = &sds.local_stat;
busiest = &sds.busiest_stat;
@@ -8981,6 +9059,11 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
rcu_read_lock();
for_each_domain(cpu, sd) {
+ if (energy_aware()) {
+ if (!is_sd_overutilized(sd))
+ continue;
+ }
+
/*
* Decay the newidle max times here because this is a regular
* visit to all the domains. Decay ~1% per second.
@@ -9280,6 +9363,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &curr->se;
+ struct sched_domain *sd;
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
@@ -9289,8 +9373,12 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
if (static_branch_unlikely(&sched_numa_balancing))
task_tick_numa(rq, curr);
- if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr)))
- rq->rd->overutilized = true;
+ rcu_read_lock();
+ sd = rcu_dereference(rq->sd);
+ if (!is_sd_overutilized(sd) &&
+ cpu_overutilized(task_cpu(curr)))
+ set_sd_overutilized(sd);
+ rcu_read_unlock();
}
/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fa98ab3..b24cefa 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -563,9 +563,6 @@ struct root_domain {
/* Indicate more than one runnable task for any CPU */
bool overload;
- /* Indicate one or more cpus over-utilized (tipping point) */
- bool overutilized;
-
/*
* The bit corresponding to a CPU gets set here if such CPU has more
* than one runnable -deadline task (as it is below for RT tasks).
--
2.1.4
The rate_limit_us tunable is intended to reduce the possible overhead
from running the schedutil governor. However, that overhead can be
divided into two separate parts: the governor computations and the
invocation of the scaling driver to set the CPU frequency. The latter
is where the real overhead comes from. The former is much less
expensive in terms of execution time, and running it every time the
governor callback is invoked by the scheduler (after the rate_limit_us
interval has passed since the last frequency update) would not be a
problem.
For this reason, redefine the rate_limit_us tunable so that it means the
minimum time that has to pass between two consecutive invocations of the
scaling driver by the schedutil governor (to set the CPU frequency).
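For reference, the gate that consumes last_freq_update_time is
sugov_should_update_freq(); a rough, paraphrased sketch (not verbatim
kernel code) looks like this:
static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
{
	s64 delta_ns;
	/* A slow-path frequency change is still in flight. */
	if (sg_policy->work_in_progress)
		return false;
	/* Policy limits changed; re-evaluate regardless of the rate limit. */
	if (unlikely(sg_policy->need_freq_update))
		return true;
	/*
	 * With this patch, last_freq_update_time only advances when the
	 * scaling driver is actually invoked, so this check rate-limits
	 * frequency changes rather than frequency evaluations.
	 */
	delta_ns = time - sg_policy->last_freq_update_time;
	return delta_ns >= sg_policy->freq_update_delay_ns;
}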
Signed-off-by: Viresh Kumar <viresh.kumar(a)linaro.org>
---
V1->V2: Update $subject and commit log (Rafael)
kernel/sched/cpufreq_schedutil.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index fd4659313640..306d97e7b57c 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -92,14 +92,13 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
{
struct cpufreq_policy *policy = sg_policy->policy;
- sg_policy->last_freq_update_time = time;
-
if (policy->fast_switch_enabled) {
if (sg_policy->next_freq == next_freq) {
trace_cpu_frequency(policy->cur, smp_processor_id());
return;
}
sg_policy->next_freq = next_freq;
+ sg_policy->last_freq_update_time = time;
next_freq = cpufreq_driver_fast_switch(policy, next_freq);
if (next_freq == CPUFREQ_ENTRY_INVALID)
return;
@@ -108,6 +107,7 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
trace_cpu_frequency(next_freq, smp_processor_id());
} else if (sg_policy->next_freq != next_freq) {
sg_policy->next_freq = next_freq;
+ sg_policy->last_freq_update_time = time;
sg_policy->work_in_progress = true;
irq_work_queue(&sg_policy->irq_work);
}
--
2.7.1.410.g6faf27b
Sorry that I forgot to cc eas-dev list for this patch.
----- Forwarded message from Viresh Kumar <viresh.kumar(a)linaro.org> -----
Date: Wed, 15 Feb 2017 22:45:47 +0530
From: Viresh Kumar <viresh.kumar(a)linaro.org>
To: Rafael Wysocki <rjw(a)rjwysocki.net>, Ingo Molnar <mingo(a)redhat.com>, Peter Zijlstra <peterz(a)infradead.org>
Cc: linaro-kernel(a)lists.linaro.org, linux-pm(a)vger.kernel.org, linux-kernel(a)vger.kernel.org, Vincent Guittot <vincent.guittot(a)linaro.org>, Viresh Kumar <viresh.kumar(a)linaro.org>
Subject: [PATCH] cpufreq: schedutil: govern how frequently we change frequency with rate_limit
X-Mailer: git-send-email 2.7.1.410.g6faf27b
For an ideal system (where frequency change doesn't incur any penalty)
we would like to change the frequency as soon as the load changes for a
CPU. But the systems we have to work with are far from ideal and it
takes time to change the frequency of a CPU. For many ARM platforms
especially, it is at least 1 ms. In order not to spend too much time
changing frequencies, we earlier introduced a sysfs-controlled
tunable for the schedutil governor: rate_limit_us.
Currently, rate_limit_us controls how frequently we reevaluate frequency
for a set of CPUs controlled by a cpufreq policy. But that may not be
the ideal behavior we want.
Consider for example the following scenario. The rate_limit_us tunable
is set to 10 ms. The CPU has a constant load X and that requires the
frequency to be set to Y. The schedutil governor changes the frequency
to Y, updates last_freq_update_time and we wait for 10 ms to reevaluate
the frequency again. After 10 ms, the schedutil governor reevaluates the
load and finds it to be the same. And so it doesn't update the
frequency, but updates last_freq_update_time before returning. Right
after this point, the scheduler puts more load on the CPU and the CPU
needs to go to a higher frequency Z. Because last_freq_update_time was
updated just now, the schedutil governor waits for an additional 10 ms
before reevaluating the load again.
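Spelling the same scenario out as a timeline (illustrative numbers
only):
  t =  0 ms : load X  -> frequency set to Y, last_freq_update_time updated
  t = 10 ms : load X  -> no frequency change, last_freq_update_time updated anyway
  t = 10+ ms: load rises, frequency Z is needed
  t = 20 ms : earliest point at which the current code moves to Z
              (with this patch, Z can be selected right after t = 10 ms)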
Normally, the time it takes to reevaluate the frequency is negligible
compared to the time it takes to change the frequency. And considering
that in the above scenario we haven't updated the frequency for over
10 ms, we should have changed the frequency as soon as the load changed.
This patch changes the way rate_limit_us is used, i.e. It now governs
"How frequently we change the frequency" instead of "How frequently we
reevaluate the frequency".
One may think that this change increases the number of times we
reevaluate the frequency after a period of rate_limit_us has expired
since the last change, even if the load isn't changing. But that is
bounded by the scheduler itself, as it normally doesn't call into the
schedutil governor less than 1 ms after the last call (hint: "decayed"
in update_cfs_rq_load_avg()).
Tests were performed with this patch on a dual-cluster (same frequency
domain), octa-core ARM64 platform (Hikey). Hackbench (Debian) and
Vellamo/Galleryfling (Android) didn't show much difference in
performance with or without this patch.
It is difficult to create a test case (rt-app was tried as well) where
this patch shows a lot of improvement, as the target of this patch is a
real corner case: the current load is X (resulting in a frequency
change), the load after rate_limit_us is also X, but right after that
the load becomes Y. This patch would undoubtedly improve the
responsiveness in such cases.
Signed-off-by: Viresh Kumar <viresh.kumar(a)linaro.org>
---
kernel/sched/cpufreq_schedutil.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index fd4659313640..306d97e7b57c 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -92,14 +92,13 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
{
struct cpufreq_policy *policy = sg_policy->policy;
- sg_policy->last_freq_update_time = time;
-
if (policy->fast_switch_enabled) {
if (sg_policy->next_freq == next_freq) {
trace_cpu_frequency(policy->cur, smp_processor_id());
return;
}
sg_policy->next_freq = next_freq;
+ sg_policy->last_freq_update_time = time;
next_freq = cpufreq_driver_fast_switch(policy, next_freq);
if (next_freq == CPUFREQ_ENTRY_INVALID)
return;
@@ -108,6 +107,7 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
trace_cpu_frequency(next_freq, smp_processor_id());
} else if (sg_policy->next_freq != next_freq) {
sg_policy->next_freq = next_freq;
+ sg_policy->last_freq_update_time = time;
sg_policy->work_in_progress = true;
irq_work_queue(&sg_policy->irq_work);
}
--
2.7.1.410.g6faf27b
----- End forwarded message -----
--
viresh
The current implementation of overutilization aborts energy aware
scheduling if any cpu in the system is overutilized. This patch
introduces an overutilization flag per sched domain level instead of a
single system-wide flag. Load balancing is done at the sched domain
level where any of the cpus is overutilized. If energy aware scheduling
is enabled and no cpu in a sched domain is overutilized, load balancing
is skipped for that sched domain and energy aware scheduling continues
at that level.
The implementation takes advantage of the shared sched_domain structure
that is common across all the sched domains at a level. The new flag is
placed in this structure so that all the sched domains at the same
level share the flag. In case of an overutilized cpu, the flag gets set
at the level 1 sched_domain. The flag at the parent sched_domain level
gets set in either of the two following scenarios:
1. There is a misfit task on one of the cpus in this sched_domain.
2. The total utilization of the domain is greater than the domain
   capacity.
The flag is cleared if no cpu in a sched domain is overutilized.
Signed-off-by: Thara Gopinath <thara.gopinath(a)linaro.org>
---
include/linux/sched.h | 1 +
kernel/sched/core.c | 7 ++-
kernel/sched/fair.c | 120 +++++++++++++++++++++++++++++++++++++++-----------
3 files changed, 99 insertions(+), 29 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1c5122e..971842a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1112,6 +1112,7 @@ struct sched_domain_shared {
atomic_t ref;
atomic_t nr_busy_cpus;
int has_idle_cores;
+ bool overutilized;
};
struct sched_domain {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 31a466f..e0a8758 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6659,11 +6659,10 @@ sd_init(struct sched_domain_topology_level *tl,
* For all levels sharing cache; connect a sched_domain_shared
* instance.
*/
- if (sd->flags & SD_SHARE_PKG_RESOURCES) {
- sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
- atomic_inc(&sd->shared->ref);
+ sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
+ atomic_inc(&sd->shared->ref);
+ if (sd->flags & SD_SHARE_PKG_RESOURCES)
atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
- }
sd->private = sdd;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 489f6d3..485f597 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4735,6 +4735,30 @@ static inline void hrtick_update(struct rq *rq)
static bool cpu_overutilized(int cpu);
+static bool
+is_sd_overutilized(struct sched_domain *sd)
+{
+ if (sd)
+ return sd->shared->overutilized;
+ else
+ return false;
+}
+
+static void
+set_sd_overutilized(struct sched_domain *sd)
+{
+ if (sd)
+ sd->shared->overutilized = true;
+}
+
+static void
+clear_sd_overutilized(struct sched_domain *sd)
+{
+ if (sd)
+ sd->shared->overutilized = false;
+}
+
+
/*
* The enqueue_task method is called before nr_running is
* increased. Here we update the fair scheduling stats and
@@ -4744,6 +4768,7 @@ static void
enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
struct cfs_rq *cfs_rq;
+ struct sched_domain *sd;
struct sched_entity *se = &p->se;
int task_new = !(flags & ENQUEUE_WAKEUP);
@@ -4787,9 +4812,12 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (!se) {
add_nr_running(rq, 1);
- if (!task_new && !rq->rd->overutilized &&
- cpu_overutilized(rq->cpu))
- rq->rd->overutilized = true;
+ rcu_read_lock();
+ sd = rcu_dereference(rq->sd);
+ if (!task_new && !is_sd_overutilized(sd) &&
+ cpu_overutilized(rq->cpu))
+ set_sd_overutilized(sd);
+ rcu_read_unlock();
}
hrtick_update(rq);
}
@@ -6173,8 +6201,7 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu)
unsigned long max_spare = 0;
struct sched_domain *sd;
- rcu_read_lock();
-
+	/* The rcu read lock must be held by the caller of this function */
sd = rcu_dereference(per_cpu(sd_ea, prev_cpu));
if (!sd)
@@ -6212,8 +6239,6 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu)
}
unlock:
- rcu_read_unlock();
-
if (energy_cpu == prev_cpu && !cpu_overutilized(prev_cpu))
return prev_cpu;
@@ -6247,10 +6272,16 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
&& cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
}
- if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized))
- return select_energy_cpu_brute(p, prev_cpu);
-
rcu_read_lock();
+ sd = rcu_dereference(cpu_rq(prev_cpu)->sd);
+ if (energy_aware() &&
+ !is_sd_overutilized(sd)) {
+ new_cpu = select_energy_cpu_brute(p, prev_cpu);
+ goto unlock;
+ }
+
+ sd = NULL;
+
for_each_domain(cpu, tmp) {
if (!(tmp->flags & SD_LOAD_BALANCE))
break;
@@ -6315,6 +6346,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
}
/* while loop will break here if sd == NULL */
}
+
+unlock:
rcu_read_unlock();
return new_cpu;
@@ -7366,6 +7399,7 @@ struct sd_lb_stats {
struct sched_group *local; /* Local group in this sd */
unsigned long total_load; /* Total load of all groups in sd */
unsigned long total_capacity; /* Total capacity of all groups in sd */
+ unsigned long total_util; /* Total util of all groups in sd */
unsigned long avg_load; /* Average load across all groups in sd */
struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
@@ -7385,6 +7419,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
.local = NULL,
.total_load = 0UL,
.total_capacity = 0UL,
+ .total_util = 0UL,
.busiest_stat = {
.avg_load = 0UL,
.sum_nr_running = 0,
@@ -7664,7 +7699,7 @@ group_type group_classify(struct sched_group *group,
static inline void update_sg_lb_stats(struct lb_env *env,
struct sched_group *group, int load_idx,
int local_group, struct sg_lb_stats *sgs,
- bool *overload, bool *overutilized)
+ bool *overload, bool *overutilized, bool *misfit_task)
{
unsigned long load;
int i, nr_running;
@@ -7699,8 +7734,16 @@ static inline void update_sg_lb_stats(struct lb_env *env,
if (!nr_running && idle_cpu(i))
sgs->idle_cpus++;
- if (cpu_overutilized(i))
+ if (cpu_overutilized(i)) {
*overutilized = true;
+ /*
+ * If the cpu is overutilized and if there is only one
+ * current task in cfs runqueue, it is potentially a misfit
+ * task.
+ */
+ if (rq->cfs.h_nr_running == 1)
+ *misfit_task = true;
+ }
}
/* Adjust by relative CPU capacity of the group */
@@ -7829,7 +7872,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
struct sched_group *sg = env->sd->groups;
struct sg_lb_stats tmp_sgs;
int load_idx, prefer_sibling = 0;
- bool overload = false, overutilized = false;
+ bool overload = false, overutilized = false, misfit_task = false;
if (child && child->flags & SD_PREFER_SIBLING)
prefer_sibling = 1;
@@ -7851,7 +7894,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
}
update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
- &overload, &overutilized);
+ &overload, &overutilized,
+ &misfit_task);
if (local_group)
goto next_group;
@@ -7882,6 +7926,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
/* Now, start updating sd_lb_stats */
sds->total_load += sgs->group_load;
sds->total_capacity += sgs->group_capacity;
+ sds->total_util += sgs->group_util;
sg = sg->next;
} while (sg != env->sd->groups);
@@ -7895,14 +7940,27 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
/* update overload indicator if we are at root domain */
if (env->dst_rq->rd->overload != overload)
env->dst_rq->rd->overload = overload;
-
- /* Update over-utilization (tipping point, U >= 0) indicator */
- if (env->dst_rq->rd->overutilized != overutilized)
- env->dst_rq->rd->overutilized = overutilized;
- } else {
- if (!env->dst_rq->rd->overutilized && overutilized)
- env->dst_rq->rd->overutilized = true;
}
+
+ if (overutilized)
+ set_sd_overutilized(env->sd);
+ else
+ clear_sd_overutilized(env->sd);
+
+ /*
+ * If there is a misfit task in one cpu in this sched_domain
+ * it is likely that the imbalance cannot be sorted out among
+ * the cpu's in this sched_domain. In this case set the
+ * overutilized flag at the parent sched_domain.
+ */
+ if (misfit_task)
+ set_sd_overutilized(env->sd->parent);
+
+	/* If the domain util is greater than the domain capacity, load balancing
+ * needs to be done at the next sched domain level as well
+ */
+ if (sds->total_capacity * 1024 < sds->total_util * capacity_margin)
+ set_sd_overutilized(env->sd->parent);
}
/**
@@ -8122,8 +8180,10 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
*/
update_sd_lb_stats(env, &sds);
- if (energy_aware() && !env->dst_rq->rd->overutilized)
- goto out_balanced;
+ if (energy_aware()) {
+ if (!is_sd_overutilized(env->sd))
+ goto out_balanced;
+ }
local = &sds.local_stat;
busiest = &sds.busiest_stat;
@@ -8981,6 +9041,11 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
rcu_read_lock();
for_each_domain(cpu, sd) {
+ if (energy_aware()) {
+ if (!is_sd_overutilized(sd))
+ continue;
+ }
+
/*
* Decay the newidle max times here because this is a regular
* visit to all the domains. Decay ~1% per second.
@@ -9280,6 +9345,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &curr->se;
+ struct sched_domain *sd;
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
@@ -9289,8 +9355,12 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
if (static_branch_unlikely(&sched_numa_balancing))
task_tick_numa(rq, curr);
- if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr)))
- rq->rd->overutilized = true;
+ rcu_read_lock();
+ sd = rcu_dereference(rq->sd);
+ if (!is_sd_overutilized(sd) &&
+ cpu_overutilized(task_cpu(curr)))
+ set_sd_overutilized(sd);
+ rcu_read_unlock();
}
/*
--
2.1.4
Power Management and Scheduling in the Linux Kernel (OSPM-summit)
April 3-4, 2017
Scuola Superiore Sant'Anna (SSSA)
Pisa, Italy
http://retis.sssup.it/ospm-summit/
---
.:: FOCUS
Power management and scheduling techniques to reduce energy consumption while
meeting performance and latency requirements are receiving considerable
attention from the Linux Kernel development community.
The Power Management and Scheduling in the Linux Kernel (OSPM-summit) summit
aims at fostering further interest and discussion.
.:: FORMAT
The summit is organized to cover two days of discussions and talks.
The first day is mainly focused on discussion and hacking sessions about
topics/patches that are already under review on the Linux kernel mailing lists,
and on debating and planning development tasks for more forward-looking work
items centred around power management in the Linux kernel. The list of topics
includes (but is not limited to):
* Energy Aware Scheduling: next steps and energy model expression;
* SCHED_DEADLINE reclaiming of unused bandwidth, coupling with schedutil
cpufreq governor and group scheduling support;
* fix the load metric exposed to cpuidle;
* IRQ prediction;
* ACPI power management: kernel/firmware bindings and development model;
The second day instead welcomes presentations from both end users and
developers on topics about power management and scheduling in Linux covering,
but not limited to:
* Power management techniques
* Real-time and non real-time scheduling techniques
* Energy awareness
* Mobile/Server power management real-world use cases (successes and
failures)
* Power management and scheduling tooling (tracing, configuration,
integration testing, etc.)
Presentations can cover recently developed technologies, ongoing work and new
ideas. Please understand that this workshop is not intended for presenting
sales and marketing pitches.
.:: ATTENDING
Attending the OSPM-summit is free of charge, but registration to the event is
mandatory. The event can accommodate a maximum of 50 people (so be sure to
register early!). To register, send an email to
ospm-registration(a)retis.sssup.it. While
it is not strictly required to submit a topic/presentation, registrations with
a topic/presentation proposal will take precedence.
.:: VENUE
The workshop will take place at ReTiS Lab*, Scuola Superiore Sant'Anna, Pisa,
Italy. Pisa is a small town: the walk from the city center to the venue is
20 minutes, and the walk from the airport to the city center is 30 minutes.
More details are available from the summit web page:
http://retis.sssup.it/ospm-summit/
* https://goo.gl/maps/2pPXG2v7Lfp
.:: SUBMIT A TOPIC/PRESENTATION
To submit a topic/presentation send an email to
ospm-registration(a)retis.sssup.it specifying:
subject
- [TOPIC] or [PRESENTATION]
- short title
body
- first name, family name
- abstract/topic of interest
- affiliation (if any)
- short biography
- expected duration (only for topics, presentations get 30min slots)
Deadline for submitting topics/presentations is 26th of February 2017.
Notifications for accepted topics/presentations will be sent out on 5th of
March 2017.
.:: ORGANIZERS (in alphabetical order)
Luca Abeni (SSSA)
Patrick Bellasi (ARM)
Tommaso Cucinotta (SSSA)
Dietmar Eggemann (ARM)
Sudeep Holla (ARM)
Juri Lelli (ARM)
Lorenzo Pieralisi (ARM)
Morten Rasmussen (ARM)
This patch series improves load balancing behaviour for misfit tasks.
Current code introduces the type 'group_misfit_task' to indicate that a
sched group has a misfit task, but before the misfit task can actually
be migrated onto a higher capacity CPU there are still some barriers we
need to clear up.
The first patch corrects task_fits_max() so it can properly filter out
misfit tasks on low capacity CPUs. Without this patch it is possible
that this function always returns true, so the misfit task mechanism is
never triggered (see the sketch of the fit criterion below).
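As background (a sketch only, not code from this series; the helper
name is made up for illustration and capacity_margin is the ~20%
headroom factor already used in fair.c):
/*
 * Sketch: a task "fits" a cpu if the cpu's capacity leaves ~20%
 * headroom over the task's utilization. A misfit task is one for
 * which this is false on its current cpu, so it should be moved to a
 * higher capacity cpu.
 */
static inline bool task_fits_cpu_sketch(unsigned long task_util,
					unsigned long cpu_capacity)
{
	return cpu_capacity * 1024 >= task_util * capacity_margin;
}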
The second patch fixes group_smaller_cpu_capacity(), so we can make
sure a sched group with type 'group_misfit_task' will not wrongly be
rolled back to type 'group_other', which would cause all misfit related
info to be abandoned.
The third patch fixes nr_running accounting. Without this patch the
scheduler wrongly considers the destination CPU to have a running task
and skips migrating a task onto it. This patch reports the correct
information, i.e. that the destination CPU has no running task when it
is going into the idle state, so the misfit task can be migrated during
this balance.
The fourth patch is a temporary patch for the case where we have not
backported Vincent's patches "sched: reflect sched_entity move into
task_group's load" [1]. Without that series, it is possible that a CPU
is not overutilized but has a misfit task enqueued on it. So we set
sgs->group_misfit_task by checking rq->misfit_task rather than relying
on whether the cpu is overutilized.
The fifth patch selects the busiest rq if the rq has a misfit task;
this kind of rq is given higher priority than the rq with the highest
weighted load. This criterion is only enabled for energy aware
scheduling.
The sixth patch aggressively kicks active load balancing for a misfit
task, so there is a good chance that a higher capacity CPU will
immediately pull the misfit task.
[1] https://lkml.org/lkml/2016/10/17/223
Leo Yan (6):
sched/fair: correct task_fits_max() for misfit task
sched/fair: fix for group_smaller_cpu_capacity()
sched/fair: fix nr_running accounting for new idle CPU
sched/fair: fix to set sgs->group_misfit_task
sched/fair: select busiest rq with misfit task
sched/fair: kick active load balance for misfit task
kernel/sched/fair.c | 59 +++++++++++++++++++++++++++++++++++++++++------------
1 file changed, 46 insertions(+), 13 deletions(-)
--
2.7.4
Hello all,
I have an x86 based platform which is running Android. I wanted to
play around with the EAS patches to see if they would improve power
numbers on it.
I had a few basic questions regarding this:
1) Can EAS be used with x86 based platforms? I see some arm/arm64
energy model related patches in the eas integration tree
(git://linux-arm.org/linux-power.git). However, there aren't any x86
specific changes present. Is that because no x86 specific changes are
required, or just that it is untested there?
2) Is it expected that EAS would show significant power savings on SMP
systems, or just on HMP systems?
3) Would any cpufreq/cpuidle integration be required for x86
specifically? If so, would I need to base it on the ARM code, or is
there any other reference code?
4) Are there other in-flight patches that need to be applied on top of
the patches in the eas integration tree for best results?
If indeed the EAS patches can be used on x86, then I would be
interested in integrating them and providing results on my platform.
Please advise.
Regards,
Darren
Hello,
I'm pleased to announce that we have pushed a very early version of
some of the key features we intend to make available as EAS 1.2 this
year to Google's msm repository
( https://android.googlesource.com/kernel/msm.git/ ) as
android-msm-marlin-3.18-nougat-mr1-eas-experimental.
EAS 1.2 is intended to be the next iteration of EAS for AOSP,
including improvements to the wakeup path to better support
big.LITTLE and trialling other upstream scheduler enhancements such
as schedutil along with some important load/util tracking enhancements
to PELT.
Although EAS 1.2 will be primarily focused on a 4.4-based kernel, we
are making this experimental branch available on the 3.18-based Pixel
kernel (marlin_defconfig) in order that we have a readily-available
real platform with an optimised userspace for experimentation.
There are some differences in the scheduler task wake-up path between
this release and that shipping in the Pixel kernel which should be
taken into account when using this kernel.
The most visible change in the wake-up path is the removal of the
is_big_little sysctl. Wake-up now uses a single cpu selection
algorithm (the same one used previously for !isBigLittle) but modified
to remove the assumption that the highest capacity cpus have the
highest logical cpu number. We now allow cpu topology independent
selection of max capacity cpus for tasks which belong to a schedtune
group which has some boost applied irrespective of the cpu numbering.
This changes the iteration order of cpus when looking for a place to
run these tasks from [3,2],[1,0] to [2,3], [0,1]. This has an impact
on runtime configuration. Not changing this configuration is likely to
have only a small impact on lightly-loaded systems, where there will
usually be two idle high-capacity cpus, but we should nevertheless
match the cpuset configuration to the selection ordering to restore the
expectations used when tuning.
In Pixel, cpusets are arranged such that one of the highest capacity
cpus is available only to tasks belonging to the ‘top-app’ cpuset. In
combination with the cpu iteration order used for schedtune boosted
tasks, we hope to find an empty cpu more often for these tasks to wake
on. As a result of the changed iteration order, the cpu reserved for
top-app should now be the lowest numbered high capacity cpu (in this
case #2 for Pixel). The impact is likely to be small for most light use
cases if this is not changed. This is done in the init rc scripts:
The usual group setup for Pixel is in init.sailfish.rc - the part
which configures the CPUSets for the tuning groups is normally as follows:
on property:sys.boot_completed=1
write /proc/sys/kernel/sched_boost 0
# update cpusets now that boot is complete and we want better load balancing
write /dev/cpuset/top-app/cpus 0-3
write /dev/cpuset/foreground/boost/cpus 0-2
write /dev/cpuset/foreground/cpus 0-2
write /dev/cpuset/background/cpus 0
write /dev/cpuset/system-background/cpus 0-2
As we wish to make cpu 2 the one which is only available for tasks in
the top-app group, we should exclude cpu 2 from the other groups.
on property:sys.boot_completed=1
write /proc/sys/kernel/sched_boost 0
# update cpusets now that boot is complete and we want better load balancing
write /dev/cpuset/top-app/cpus 0-3
write /dev/cpuset/foreground/boost/cpus 0-1,3
write /dev/cpuset/foreground/cpus 0-1,3
write /dev/cpuset/background/cpus 0
write /dev/cpuset/system-background/cpus 0-1,3
We normally do this at run time in a root shell rather than modifying
the init scripts.
The schedutil governor is present but not selected as the default
cpufreq governor.
It is important to note that there is a slight difference in the
meaning of the up & down frequency select throttling for the 'sched'
governor (sched-dvfs) and 'schedutil'. The 'sched' governor considers
time to be measured since the last *frequency change* whilst the
'schedutil' governor considers the time to be measured since the last
*utilisation request*. This means that we need to shorten the throttle
periods used for schedutil when comparing it to sched-dvfs to avoid
staying at the maximum frequency for long periods in UI-driven
workloads.
We have been experimenting with up_rate_limit_usec set to 500 and
down_rate_limit_usec set to 2000 or 5000, which appears to give
results comparable with those of the 'sched' governor.
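For example, with schedutil selected, those values can be applied from
a root shell or an init rc fragment along the following lines (the
sysfs path shown is an assumption and depends on whether the governor
tunables end up global or per-policy on the kernel in use; tunable
names as quoted above):
    write /sys/devices/system/cpu/cpufreq/schedutil/up_rate_limit_usec 500
    write /sys/devices/system/cpu/cpufreq/schedutil/down_rate_limit_usec 2000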
The branch is based upon the mr1 kernel release, and contains the
patches shown at the end of this mail.
They comprise 6 main areas of functionality.
* ec114ba...d2238c2 and 8646350...35ea67a
patches to reduce the delta between the msm kernel and the common kernel
* b055eba...d2e2970
introduce a backport of the upstream schedutil governor (but it is not the default
governor in marlin_defconfig)
* 7f7e79e...14531d4e
bring the energy-aware-scheduling calculations into line with our
mainline-focused implementation and backport capacity-based-scheduling to 3.18
* b75b728...407d2a7
integrate the current EAS 1.1 wakeup path with the mainline-focused
wakeup path and introduce a way to provide a common algorithm implementing
the alternate CPU search algorithm for schedtune boosted tasks
* f966249...1ad6d08
Backport some important upstream CFS fixes to 3.18. This fixes some critical
group accounting issues which had a negative impact on the suitability of PELT
utilisation signals for Android
* 6ae4707
Allows EAS to continue to calculate energy for systems which end up with
a single CPU in a sched domain
Best Regards,
Chris
Amit Pundir (3):
sched/walt: use do_div instead of division operator
ANDROID: sched/walt: fix build failure if FAIR_GROUP_SCHED=n
Revert "cgroup: Fix issues in allow_attach callback"
Brendan Jackman (2):
DEBUG: sched/fair: Fix missing sched_load_avg_cpu events
DEBUG: sched/fair: Fix sched_load_avg_cpu events for task_groups
Chris Redpath (17):
Revert "WIP: UTIL_EST: use estimated utilization on load balancing paths"
Revert "WIP: UTIL_EST: use estimated utilization on energy aware wakeup path"
Revert "WIP: UTIL_EST: sched/fair: use estimated utilization to drive CPUFreq"
Revert "WIP: UTIL_EST: switch to usage of tasks's estimated utilization"
sched: revert UTIL_EST usage from commit 6bf72ca7f1
Revert "WIP: UTIL_EST: sched/{core,fair}: add support to use estimated utilization"
Revert "WIP: UTIL_EST: sched/fair: add support for estimated utilization"
sched/fair: missing parts of 'optimize idle cpu selection for boosted tasks'
sched/fair: Fix uninitialised variable in idle_balance
Revert: UTIL_EST code from 'fix set_cfs_cpu_capacity when WALT is in use"
Unify whitespace layout with android-3.18
schedtune: Guarding against compile errors
sched/walt: Drop arch-specific timer access
Revert "DEBUG: UTIL_EST: sched: update tracepoint to report estimated CPU utilzation"
sched: This kernel expects sched_cfs_boost to be signed
schedutil: Fix linkage of schedutil and walt
config: Update marlin_defconfig to include schedutil governor
Dietmar Eggemann (20):
Revert "WIP: sched: Consider spare cpu capacity at task wake-up"
Partial Revert: "WIP: sched: Add cpu capacity awareness to wakeup balancing"
Experimental! arm64: Set SD_SHARE_CAP_STATES sched_domain flag on DIE level
Experimental!: sched/fair: Do not force want_affine eq. true if EAS is enabled
Experimental!: sched/fair: Decommission energy_aware_wake_cpu()
Fixup!: sched/fair.c: Set SchedTune specific struct energy_env.task
Experimental!: EAS: sched/fair: Re-integrate 'honor sync wakeups' into wakeup path
Experimental!: sched/fair: Code !is_big_little path into select_energy_cpu_brute()
Experimental!: sched: Remove sysctl_sched_is_big_little
sched/core: Remove remnants of commit fd5c98da1a42
Experimental!: sched/core: Add first cpu w/ max/min orig capacity to root domain
Experimental!: sched/fair: Change cpu iteration order in find_best_target()
sched/fair: Simplify backup_capacity handling in find_best_target()
Fixup!: sched/fair: Simplify target_util handling in find_best_target()
Fixup!: sched/fair: Simplify idle_idx handling in find_best_target()
Fixup!: sched/fair: Refactor min_util, new_util in find_best_target()
Fixup!: sched/fair: Simplify idle_idx handling in select_idle_sibling()
Fixup!: Return first idle cpu for prefer_idle task immediately
Fixup!: sched/fair: No need to 'and' current cpu w/ online mask in wakeup
sched: EAS & 'single cpu per cluster'/cpu hotplug interoperability
Dmitry Shmidt (1):
sched: Fix sysctl_sched_cfs_boost type to be int
Juri Lelli (3):
sched/cpufreq: make schedutil use WALT signal
trace/sched: add rq utilization signal for WALT
sched/walt: kill {min,max}_capacity
Ke Wang (1):
sched: tune: Fix lacking spinlock initialization
Morten Rasmussen (15):
sched/core: Fix power to capacity renaming in comment
sched/fair: Make the use of prev_cpu consistent in the wakeup path
sched/fair: Optimize find_idlest_cpu() when there is no choice
sched/core: Remove unnecessary NULL-pointer check
sched/core: Introduce SD_ASYM_CPUCAPACITY sched_domain topology flag
sched/core: Pass child domain into sd_init()
sched/core: Enable SD_BALANCE_WAKE for asymmetric capacity systems
sched/fair: Let asymmetric CPU configurations balance at wake-up
sched/fair: Compute task/cpu utilization at wake-up correctly
sched/fair: Consider spare capacity in find_idlest_group()
sched/fair: Add per-CPU min capacity to sched_group_capacity
sched/fair: Avoid pulling tasks from non-overloaded higher capacity groups
sched/fair: Fix incorrect comment for capacity_margin
Experimental!: sched/fair: Add energy_diff dead-zone margin
Experimental!: sched/fair: Energy-aware wake-up task placement
Patrick Bellasi (3):
FIXUP: sched/tune: update accouting before CPU capacity
FIX: sched/tune: move schedtune_nornalize_energy into fair.c
sched/tune: backport 'fix accounting for runnable tasks'
Peter Zijlstra (Intel) (3):
sched/fair: Apply more PELT fixes
sched/fair: Improve PELT stuff some more
sched/fair: Fix effective_load() to consistently use smoothed load
Petr Mladek (1):
kthread: allow to cancel kthread work
Srinath Sridharan (1):
eas/sched/fair: Fixing comments in find_best_target.
Steve Muckle (5):
sched/cpufreq: fix tunables for schedfreq governor
sched: backport cpufreq hooks from 4.9-rc4
sched: backport schedutil governor from 4.9-rc4
sched: cpufreq: use rt_avg as estimate of required RT CPU capacity
cpufreq: schedutil: add up/down frequency transition rate limits
Vincent Guittot (6):
sched: factorize attach entity
sched: factorize PELT update
sched: fix hierarchical order in rq->leaf_cfs_rq_list
sched: propagate load during synchronous attach/detach
sched: propagate asynchrous detach
sched: Multiple upstream load tracking changes
Viresh Kumar (1):
cpufreq: schedutil: move slow path from workqueue to SCHED_FIFO task
Yuyang Du (1):
sched/fair: Initiate a new task's util avg to a bounded value
kbuild test robot (2):
ANDROID: sched/tune: __pcpu_scope_cpu_boost_groups can be static
ANDROID: sched/tune: schedtune_allow_attach() can be static
arch/arm64/configs/marlin_defconfig | 2 +-
arch/arm64/kernel/topology.c | 7 +-
drivers/cpufreq/Kconfig | 27 +
drivers/cpufreq/Makefile | 2 +-
drivers/cpufreq/cpufreq.c | 32 +
drivers/cpufreq/cpufreq_governor_attr_set.c | 84 ++
include/linux/cgroup.h | 2 +-
include/linux/cpufreq.h | 49 ++
include/linux/kthread.h | 4 +
include/linux/sched.h | 20 +-
include/linux/sched/sysctl.h | 7 +-
include/trace/events/sched.h | 22 +-
init/Kconfig | 1 +
kernel/kthread.c | 96 +-
kernel/sched/Makefile | 2 +
kernel/sched/core.c | 84 +-
kernel/sched/cpufreq.c | 63 ++
kernel/sched/cpufreq_sched.c | 220 ++---
kernel/sched/cpufreq_schedutil.c | 762 ++++++++++++++++
kernel/sched/deadline.c | 3 +
kernel/sched/debug.c | 4 -
kernel/sched/fair.c | 1254 ++++++++++++++++++---------
kernel/sched/features.h | 5 -
kernel/sched/rt.c | 3 +
kernel/sched/sched.h | 84 +-
kernel/sched/tune.c | 5 +-
kernel/sched/tune.h | 3 +
kernel/sched/walt.c | 52 +-
kernel/sysctl.c | 7 -
29 files changed, 2261 insertions(+), 645 deletions(-)
create mode 100644 drivers/cpufreq/cpufreq_governor_attr_set.c
create mode 100644 kernel/sched/cpufreq.c
create mode 100644 kernel/sched/cpufreq_schedutil.c
--
1.9.1