The current implementation of overutilization aborts energy aware
scheduling if any cpu in the system is over-utilized. This patch
introduces an over-utilization flag per sched domain level instead of a
single system-wide flag. Load balancing is done at any sched domain in
which a cpu is over-utilized. If energy aware scheduling is enabled and
no cpu in a sched domain is over-utilized, load balancing is skipped for
that sched domain and energy aware scheduling continues at that level.
The implementation takes advantage of the shared sched_domain structure
that is common across all the sched domains at a level. The new flag is
placed in this structure so that all the sched domains at the same level
share the flag. In case of an over-utilized cpu, the flag gets set at the
level 1 sched_domain. The flag at a parent sched_domain level gets set in
either of the two following scenarios:
1. There is a misfit task on one of the cpus in this sched_domain.
2. The total utilization of the domain is greater than the domain
   capacity.
The flag is cleared if no cpu in a sched domain is over-utilized.
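In condensed form, the propagation logic behaves as sketched below. The
helper names set_sd_overutilized()/clear_sd_overutilized() and the checks
match the patch; the standalone function wrapper is only illustrative,
the real code lives in update_sd_lb_stats() in the diff:

	/* Illustrative skeleton; see update_sd_lb_stats() below. */
	static void update_sd_overutilized(struct lb_env *env,
					   struct sd_lb_stats *sds,
					   bool overutilized, bool misfit_task)
	{
		struct sched_domain *sd;

		/* Reflect the per-cpu state at this level. */
		if (overutilized)
			set_sd_overutilized(env->sd);
		else
			clear_sd_overutilized(env->sd);

		/*
		 * Scenario 1: a misfit task can only be helped by a parent
		 * domain spanning cpus of different capacity.
		 */
		if (misfit_task) {
			for (sd = env->sd->parent; sd; sd = sd->parent) {
				if (sd->flags & SD_ASYM_CPUCAPACITY) {
					set_sd_overutilized(sd);
					break;
				}
			}
		}

		/* Scenario 2: the whole domain is over its capacity margin. */
		if (sds->total_capacity * 1024 < sds->total_util * capacity_margin)
			set_sd_overutilized(env->sd->parent);
	}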
This implementation can still have corner scenarios with respect to
misfit tasks. For example, consider a sched group with n cpus and
n + 1 tasks that are each 70% utilized. Ideally this is a case for a load
balance to happen in a parent sched domain. But neither is the total
group utilization high enough for a load balance to be triggered in the
parent domain, nor is there a cpu with a single over-utilized task that
would trigger a load balance in a parent domain. Then again, this could
be a purely academic scenario, as during task wakeup these tasks will be
placed more appropriately.
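To put rough numbers on this corner case (assuming
SCHED_CAPACITY_SCALE = 1024 and capacity_margin = 1280, i.e. a ~20%
margin, as in current mainline): take n = 10 cpus of capacity 1024 and
11 tasks of utilization ~717 each. The domain check compares
total_capacity * 1024 = 10485760 against total_util * capacity_margin =
11 * 717 * 1280 = 10095360, so the parent flag is not set; and since each
task individually fits any cpu, rq->misfit_task never fires either.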
Signed-off-by: Thara Gopinath <thara.gopinath(a)linaro.org>
---
V2->V3:
- Rebased on latest kernel.
- The previous check for misfit task is replaced with the
newly introduced rq->misfit_task flag.
V1->V2:
- Removed overutilized flag from sched_group structure.
- In case of a misfit task, it is ensured that a load balance is
triggered in a parent sched domain with asymmetric cpu capacities.
include/linux/sched/topology.h | 1 +
kernel/sched/fair.c | 137 +++++++++++++++++++++++++++++++++--------
kernel/sched/sched.h | 3 -
kernel/sched/topology.c | 8 +--
4 files changed, 117 insertions(+), 32 deletions(-)
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 3137750..ae44044 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -88,6 +88,7 @@ struct sched_domain_shared {
atomic_t ref;
atomic_t nr_busy_cpus;
int has_idle_cores;
+ bool overutilized;
};
struct sched_domain {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a9ac67c..34bdfeb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4791,6 +4791,29 @@ static inline void hrtick_update(struct rq *rq)
static bool cpu_overutilized(int cpu);
+static bool
+is_sd_overutilized(struct sched_domain *sd)
+{
+ if (sd)
+ return sd->shared->overutilized;
+ else
+ return false;
+}
+
+static void
+set_sd_overutilized(struct sched_domain *sd)
+{
+ if (sd)
+ sd->shared->overutilized = true;
+}
+
+static void
+clear_sd_overutilized(struct sched_domain *sd)
+{
+ if (sd)
+ sd->shared->overutilized = false;
+}
+
/*
* The enqueue_task method is called before nr_running is
* increased. Here we update the fair scheduling stats and
@@ -4800,6 +4823,7 @@ static void
enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
struct cfs_rq *cfs_rq;
+ struct sched_domain *sd;
struct sched_entity *se = &p->se;
int task_new = !(flags & ENQUEUE_WAKEUP);
@@ -4843,9 +4867,12 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (!se) {
add_nr_running(rq, 1);
- if (!task_new && !rq->rd->overutilized &&
- cpu_overutilized(rq->cpu))
- rq->rd->overutilized = true;
+ rcu_read_lock();
+ sd = rcu_dereference(rq->sd);
+ if (!task_new && !is_sd_overutilized(sd) &&
+ cpu_overutilized(rq->cpu))
+ set_sd_overutilized(sd);
+ rcu_read_unlock();
}
hrtick_update(rq);
}
@@ -6276,8 +6303,7 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu)
unsigned long max_spare = 0;
struct sched_domain *sd;
- rcu_read_lock();
-
+ /* The caller must hold the rcu read lock */
sd = rcu_dereference(per_cpu(sd_ea, prev_cpu));
if (!sd)
@@ -6315,8 +6341,6 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu)
}
unlock:
- rcu_read_unlock();
-
if (energy_cpu == prev_cpu && !cpu_overutilized(prev_cpu))
return prev_cpu;
@@ -6350,10 +6374,16 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
&& cpumask_test_cpu(cpu, &p->cpus_allowed);
}
- if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized))
- return select_energy_cpu_brute(p, prev_cpu);
-
rcu_read_lock();
+ sd = rcu_dereference(cpu_rq(prev_cpu)->sd);
+ if (energy_aware() &&
+ !is_sd_overutilized(sd)) {
+ new_cpu = select_energy_cpu_brute(p, prev_cpu);
+ goto unlock;
+ }
+
+ sd = NULL;
+
for_each_domain(cpu, tmp) {
if (!(tmp->flags & SD_LOAD_BALANCE))
break;
@@ -6418,6 +6448,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
}
/* while loop will break here if sd == NULL */
}
+
+unlock:
rcu_read_unlock();
return new_cpu;
@@ -7478,6 +7510,7 @@ struct sd_lb_stats {
struct sched_group *local; /* Local group in this sd */
unsigned long total_load; /* Total load of all groups in sd */
unsigned long total_capacity; /* Total capacity of all groups in sd */
+ unsigned long total_util; /* Total util of all groups in sd */
unsigned long avg_load; /* Average load across all groups in sd */
struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
@@ -7497,6 +7530,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
.local = NULL,
.total_load = 0UL,
.total_capacity = 0UL,
+ .total_util = 0UL,
.busiest_stat = {
.avg_load = 0UL,
.sum_nr_running = 0,
@@ -7792,7 +7826,7 @@ group_type group_classify(struct sched_group *group,
static inline void update_sg_lb_stats(struct lb_env *env,
struct sched_group *group, int load_idx,
int local_group, struct sg_lb_stats *sgs,
- bool *overload, bool *overutilized)
+ bool *overload, bool *overutilized, bool *misfit_task)
{
unsigned long load;
int i, nr_running;
@@ -7831,8 +7865,16 @@ static inline void update_sg_lb_stats(struct lb_env *env,
!sgs->group_misfit_task && rq->misfit_task)
sgs->group_misfit_task = capacity_of(i);
- if (cpu_overutilized(i))
+ if (cpu_overutilized(i)) {
*overutilized = true;
+ /*
+ * If the cpu is overutilized and there is only one
+ * task on the cfs runqueue, it is potentially a
+ * misfit task.
+ */
+ if (rq->misfit_task)
+ *misfit_task = true;
+ }
}
/* Adjust by relative CPU capacity of the group */
@@ -7974,12 +8016,12 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
*/
static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
{
- struct sched_domain *child = env->sd->child;
+ struct sched_domain *child = env->sd->child, *sd;
struct sched_group *sg = env->sd->groups;
struct sg_lb_stats *local = &sds->local_stat;
struct sg_lb_stats tmp_sgs;
int load_idx, prefer_sibling = 0;
- bool overload = false, overutilized = false;
+ bool overload = false, overutilized = false, misfit_task = false;
if (child && child->flags & SD_PREFER_SIBLING)
prefer_sibling = 1;
@@ -8001,7 +8043,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
}
update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
- &overload, &overutilized);
+ &overload, &overutilized,
+ &misfit_task);
if (local_group)
goto next_group;
@@ -8032,6 +8075,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
/* Now, start updating sd_lb_stats */
sds->total_load += sgs->group_load;
sds->total_capacity += sgs->group_capacity;
+ sds->total_util += sgs->group_util;
sg = sg->next;
} while (sg != env->sd->groups);
@@ -8045,14 +8089,45 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
/* update overload indicator if we are at root domain */
if (env->dst_rq->rd->overload != overload)
env->dst_rq->rd->overload = overload;
+ }
- /* Update over-utilization (tipping point, U >= 0) indicator */
- if (env->dst_rq->rd->overutilized != overutilized)
- env->dst_rq->rd->overutilized = overutilized;
- } else {
- if (!env->dst_rq->rd->overutilized && overutilized)
- env->dst_rq->rd->overutilized = true;
+ if (overutilized)
+ set_sd_overutilized(env->sd);
+ else
+ clear_sd_overutilized(env->sd);
+
+ /*
+ * If there is a misfit task on one cpu in this sched_domain,
+ * it is likely that the imbalance cannot be sorted out among
+ * the cpus in this sched_domain. In this case set the
+ * overutilized flag at the parent sched_domain.
+ */
+ if (misfit_task) {
+
+ sd = env->sd->parent;
+
+ /*
+ * In case of a misfit task, load balance at the parent
+ * sched domain level will make sense only if the cpus
+ * have different capacities. If cpus at a domain level have
+ * the same capacity, the misfit task cannot be better
+ * accommodated on any of the cpus and there is no point in
+ * trying a load balance at this level.
+ */
+ while (sd) {
+ if (sd->flags & SD_ASYM_CPUCAPACITY) {
+ set_sd_overutilized(sd);
+ break;
+ }
+ sd = sd->parent;
+ }
}
+
+ /* If the domain util is greater than the domain capacity, load
+ * balancing needs to be done at the next sched domain level as well.
+ */
+ if (sds->total_capacity * 1024 < sds->total_util * capacity_margin)
+ set_sd_overutilized(env->sd->parent);
}
/**
@@ -8279,8 +8354,10 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
*/
update_sd_lb_stats(env, &sds);
- if (energy_aware() && !env->dst_rq->rd->overutilized)
- goto out_balanced;
+ if (energy_aware()) {
+ if (!is_sd_overutilized(env->sd))
+ goto out_balanced;
+ }
local = &sds.local_stat;
busiest = &sds.busiest_stat;
@@ -9164,6 +9241,11 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
rcu_read_lock();
for_each_domain(cpu, sd) {
+ if (energy_aware()) {
+ if (!is_sd_overutilized(sd))
+ continue;
+ }
+
/*
* Decay the newidle max times here because this is a regular
* visit to all the domains. Decay ~1% per second.
@@ -9466,6 +9548,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &curr->se;
+ struct sched_domain *sd;
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
@@ -9477,8 +9560,12 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
rq->misfit_task = !task_fits_capacity(curr, capacity_of(rq->cpu));
- if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr)))
- rq->rd->overutilized = true;
+ rcu_read_lock();
+ sd = rcu_dereference(rq->sd);
+ if (!is_sd_overutilized(sd) &&
+ cpu_overutilized(task_cpu(curr)))
+ set_sd_overutilized(sd);
+ rcu_read_unlock();
}
/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 8d27d5b..1604ef2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -585,9 +585,6 @@ struct root_domain {
/* Indicate more than one runnable task for any CPU */
bool overload;
- /* Indicate one or more cpus over-utilized (tipping point) */
- bool overutilized;
-
/*
* The bit corresponding to a CPU gets set here if such CPU has more
* than one runnable -deadline task (as it is below for RT tasks).
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 263e549..e5ba6fc 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1040,11 +1040,11 @@ sd_init(struct sched_domain_topology_level *tl,
* For all levels sharing cache; connect a sched_domain_shared
* instance.
*/
- if (sd->flags & SD_SHARE_PKG_RESOURCES) {
- sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
- atomic_inc(&sd->shared->ref);
+ sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
+ atomic_inc(&sd->shared->ref);
+
+ if (sd->flags & SD_SHARE_PKG_RESOURCES)
atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
- }
sd->private = sdd;
--
2.1.4
With Android UI and benchmarks the latency of cpufreq response to
certain scheduling events can become very critical. Currently, callbacks
into cpufreq governors are only made from the scheduler if the target
CPU of the event is the same as the current CPU. This means there are
certain situations where a target CPU may not run the cpufreq governor
for some time.
One testcase [1] to show this behavior is where a task starts running on
CPU0, then a new task is also spawned on CPU0 by a task on CPU1. If the
system is configured such that the new tasks should receive maximum
demand initially, this should result in CPU0 increasing frequency
immediately. Because of the above-mentioned limitation, though, this
does not occur.
This series updates the scheduler core to call the cpufreq callbacks for
remote CPUs as well and updates the registered hooks to handle that.
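At its core, the scheduler-side change drops the "target CPU == current
CPU" filter and lets each governor decide whether it can act on a remote
update. For schedutil in the shared-policy case, the guard looks roughly
like the sketch below (the same check is visible in the V3->V4 diff
included with the V4 posting):

	static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
	{
		/* Allow remote callbacks only on the CPUs sharing cpufreq policy */
		if (!cpumask_test_cpu(smp_processor_id(), sg_policy->policy->cpus))
			return false;

		/* ... existing work_in_progress and rate-limit checks ... */
		return true;
	}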
This is tested with a couple of use cases (Android: hackbench,
recentfling, galleryfling, vellamo; Ubuntu: hackbench) on the ARM Hikey
board (64 bit, octa-core, single policy). Only galleryfling showed minor
improvements, while the others didn't show much deviation.
The reason is that this patch only targets a corner case; all of the
following must be true for performance to improve, and that doesn't
happen too often with these tests:
- Task is migrated to another CPU.
- The task has high demand, and should take the target CPU to higher
OPPs.
- And the target CPU doesn't call into the cpufreq governor until the
next tick.
Rebased over: pm/linux-next
V4->V5:
- Drop cpu field from "struct update_util_data" and add it in "struct
sugov_cpu" instead.
- Can't have separate patches now because of the above change and so
merged all the patches from V4 into a single patch.
- Add a comment suggested by PeterZ.
- Commit log of 1/2 is improved to contain more details.
- A new patch (which was posted during V1) is also added to take care of
  platforms where any CPU can do DVFS on behalf of any other CPU, even
  if they are part of different cpufreq policies. This has been
  requested by Saravana several times already, and as the series is
  quite straightforward now, I decided to include it (a sketch of this
  opt-in follows the changelog below).
V3->V4:
- Respect the iowait boost flag and util updates for all remote
  callbacks.
- Minor updates in commit log of 2/3.
V2->V3:
- Rearranged/merged patches as suggested by Rafael (looks much better
now)
- Also handle new hook added to intel-pstate driver.
- The final code remains the same as V2, except for the above hook.
V1->V2:
- Don't support remote callbacks for unshared cpufreq policies.
- Don't support remote callbacks where local CPU isn't part of the
target CPU's cpufreq policy.
- Dropped dvfs_possible_from_any_cpu flag.
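For completeness, the platform opt-in re-added in V5 (patch 2/2) works
roughly as sketched below; treat this as an approximation rather than
the authoritative final API:

	/* Driver side (e.g. cpufreq-dt): any CPU may do DVFS for this policy. */
	policy->dvfs_possible_from_any_cpu = true;

	/* Consumer side: accept a remote callback if the policy permits it. */
	static inline bool cpufreq_can_do_remote_dvfs(struct cpufreq_policy *policy)
	{
		/*
		 * Remote callbacks are OK if the platform allows any CPU to
		 * drive DVFS, or if the local CPU shares the policy anyway.
		 */
		return policy->dvfs_possible_from_any_cpu ||
		       cpumask_test_cpu(smp_processor_id(), policy->cpus);
	}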
--
viresh
[1] http://pastebin.com/7LkMSRxE
Viresh Kumar (2):
sched: cpufreq: Allow remote cpufreq callbacks
cpufreq: Process remote callbacks from any CPU if the platform permits
drivers/cpufreq/cpufreq-dt.c | 1 +
drivers/cpufreq/cpufreq_governor.c | 3 +++
drivers/cpufreq/intel_pstate.c | 8 ++++++++
include/linux/cpufreq.h | 23 +++++++++++++++++++++++
kernel/sched/cpufreq_schedutil.c | 31 ++++++++++++++++++++++++++-----
kernel/sched/deadline.c | 2 +-
kernel/sched/fair.c | 8 +++++---
kernel/sched/rt.c | 2 +-
kernel/sched/sched.h | 10 ++--------
9 files changed, 70 insertions(+), 18 deletions(-)
--
2.13.0.71.gd7076ec9c9cb
Hello EAS-dev!
ARM is pleased to announce the EAS r1.3 release.
This is the next tick in our regular updates to EAS in AOSP, including documentation and testing updates.
In particular, this release is the first major update to EAS in Android Common Kernel 4.9.
Changes in EAS 1.3
* Validation on real devices and additional development boards (Hikey960)
* Increased test coverage
* Upstream schedutil backporting
* Schedutil is now the recommended CPUFreq governor
* General EAS refactoring improvements (find_best_target changes)
* Android common kernel 4.9 brought to EAS equivalence with 4.4
Android Common Kernel 4.4:
https://android.googlesource.com/kernel/common/+/android-4.4
Android Common Kernel 4.9:
https://android-review.googlesource.com/#/c/444387/
Once merged into android-4.9, the gerrit web interface will tell you that the patches have
been merged; the changeset link should stay active, however.
Documentation:
https://developer.arm.com/-/media/developer/developers/open-source/energy-a…
Specifically about schedutil:
We have backported schedutil patches up to commit 38d4ea229d, which was included in v4.12.
(https://github.com/torvalds/linux/commit/38d4ea229d25d30be6bf41bcd6cd663a58…)
"cpufreq: schedutil: Trace frequency only if it has changed".
The version included in android-4.9 includes backported patches to the same level. This brings
schedutil in both versions of Android up to v4.12.
We have satisfied ourselves in testing that this version of schedutil works well enough to be used
in place of schedfreq both for performance and energy usage.
EAS Updates:
We have done a large refactoring of find_best_target, as it was becoming difficult
to make further improvements without impacting other behaviours. The refactored version
behaves exactly the same in the refactor commit itself, and it has allowed us to further
refine the task of selecting a CPU during wakeup. We added the ability to return a second
target CPU from find_best_target, which is chosen using a different strategy. When the first
target is not acceptable because the energy/performance trade-off is not good enough,
we now fall back to the alternative candidate (but only when the primary strategy fails).
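Schematically, the wakeup path now does something like the following;
the control flow is accurate at a high level, but the helper
energy_diff_ok() is a made-up name standing in for the actual
energy-diff evaluation:

	/* Illustrative only: two-candidate CPU selection at wakeup. */
	int backup_cpu = -1;
	int target_cpu = find_best_target(p, &backup_cpu, boosted, prefer_idle);

	/* Try the primary candidate; accept it only if the trade-off is good. */
	if (target_cpu >= 0 && energy_diff_ok(p, prev_cpu, target_cpu))
		return target_cpu;

	/* Otherwise fall back to the second candidate, chosen differently. */
	if (backup_cpu >= 0 && energy_diff_ok(p, prev_cpu, backup_cpu))
		return backup_cpu;

	return prev_cpu;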
A new tracepoint was added to help in understanding EAS task placement decisions -
sched_find_best_target - which traces the task, schedtune flags and CPUs which were
selected by find_best_target for energy evaluation.
More patches were added to improve system behavior with idle CPUs. We now prevent
an idle CPU from holding the system in overutilized mode (if it was overutilized just
before going into nohz mode), allowing EAS to handle task placements again sooner.
In addition, when misfit tasks are present, we bypass some of the normal nohz balance
rate-limiting to reduce the time needed for those tasks to be redistributed.
Finally, we added the ability for EAS to forecast the idle state which could potentially be
selected under the utilization conditions when calculating the energy for a particular
sched group. The forecast is intentionally simple as it is done during wakeup - we
reserve the deepest idle state for completely idle groups and otherwise linearly
map the group utilization to idle states. In previous versions, EAS used the current
idle state when estimating energy. This change allows EAS to see the potential
impact of moving the last task from one group to another and move tasks if appropriate.
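A minimal sketch of that forecast, under the assumptions stated above
(completely idle groups get the deepest state, otherwise utilization
maps linearly onto shallower states); the function name and exact
rounding are illustrative only:

	static int forecast_group_idle_state(unsigned long group_util,
					     unsigned long group_cap,
					     int nr_idle_states)
	{
		/* A fully idle group is assumed to reach the deepest state. */
		if (!group_util)
			return nr_idle_states - 1;

		/* Linearly map utilization onto states: full util -> state 0. */
		if (group_util >= group_cap)
			return 0;
		return ((group_cap - group_util) * (nr_idle_states - 1)) / group_cap;
	}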
Android-4.9:
android-4.9 has not yet had the same level of testing as android-4.4, because we
have had a limited set of platforms which can run a 4.9 kernel. For most of
this dev cycle we have only had access to Juno, and we have confirmed that our tests
behave the same on 4.9 as they do on 4.4. In the last week or two, the Hikey960 board
has gained a usable BSP for running android-4.9, so we have also been testing that
but it is too early to share those results.
We continue to develop EAS on AOSP in public. Please feel free to participate in
testing patches, reviewing code and generally being a good open-source citizen.
Best Regards,
Chris Redpath
Hi,
With Android UI and benchmarks the latency of cpufreq response to
certain scheduling events can become very critical. Currently, callbacks
into schedutil are only made from the scheduler if the target CPU of the
event is the same as the current CPU. This means there are certain
situations where a target CPU may not run schedutil for some time.
One testcase to show this behavior is where a task starts running on
CPU0, then a new task is also spawned on CPU0 by a task on CPU1. If the
system is configured such that new tasks should receive maximum demand
initially, this should result in CPU0 increasing frequency immediately.
Because of the above-mentioned limitation, though, this does not occur.
This is verified using ftrace with the sample [1] application.
Maybe the ideal solution is to always allow remote callbacks but that
has its own challenges:
o There is no protection required for the single-CPU-per-policy case
today, and adding any kind of locking there just to support remote
callbacks isn't really a good idea.
o If the local CPU isn't part of the same cpufreq policy as the target
CPU, then we wouldn't be able to do fast switching at all and would
have to use some kind of bottom half to schedule work on the target
CPU to do the real switching. That may be overkill as well.
And so this series only allows remote callbacks for target CPUs that
share the cpufreq policy with the local CPU.
This series is tested with a couple of use cases (Android: hackbench,
recentfling, galleryfling, vellamo; Ubuntu: hackbench) on the ARM Hikey
board (64 bit, octa-core, single policy). Only galleryfling showed minor
improvements, while the others didn't show much deviation.
The reason is that this patchset only targets a corner case; all of the
following must be true for performance to improve, and that doesn't
happen too often with these tests:
- Task is migrated to another CPU.
- The task has maximum demand initially, and should take the CPU to
higher OPPs.
- And the target CPU doesn't call into schedutil until the next tick.
V2->V3:
- Rearranged/merged patches as suggested by Rafael (looks much better
now)
- Also handle new hook added to intel-pstate driver.
- The final code remains the same as V2, except for the above hook.
V1->V2:
- Don't support remote callbacks for unshared cpufreq policies.
- Don't support remote callbacks where local CPU isn't part of the
target CPU's cpufreq policy.
- Dropped dvfs_possible_from_any_cpu flag.
--
viresh
[1] http://pastebin.com/7LkMSRxE
Viresh Kumar (3):
sched: cpufreq: Allow remote cpufreq callbacks
cpufreq: schedutil: Process remote callback for shared policies
cpufreq: governor: Process remote callback for shared policies
drivers/cpufreq/cpufreq_governor.c | 4 ++++
drivers/cpufreq/intel_pstate.c | 8 ++++++++
include/linux/sched/cpufreq.h | 1 +
kernel/sched/cpufreq.c | 1 +
kernel/sched/cpufreq_schedutil.c | 19 ++++++++++++++-----
kernel/sched/deadline.c | 2 +-
kernel/sched/fair.c | 8 +++++---
kernel/sched/rt.c | 2 +-
kernel/sched/sched.h | 10 ++--------
9 files changed, 37 insertions(+), 18 deletions(-)
--
2.13.0.71.gd7076ec9c9cb
Hi,
I had some IRC discussions with Peter and V4 is based on his feedback.
Here is the diff between V3 and V4:
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index d64754fb912e..df9aa1ee53ff 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -79,6 +79,10 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
s64 delta_ns;
bool update;
+ /* Allow remote callbacks only on the CPUs sharing cpufreq policy */
+ if (!cpumask_test_cpu(smp_processor_id(), sg_policy->policy->cpus))
+ return false;
+
if (sg_policy->work_in_progress)
return false;
@@ -225,10 +229,6 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
unsigned int next_f;
bool busy;
- /* Remote callbacks aren't allowed for policies which aren't shared */
- if (smp_processor_id() != hook->cpu)
- return;
-
sugov_set_iowait_boost(sg_cpu, time, flags);
sg_cpu->last_update = time;
@@ -298,14 +298,9 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,
{
struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
- struct cpufreq_policy *policy = sg_policy->policy;
unsigned long util, max;
unsigned int next_f;
- /* Allow remote callbacks only on the CPUs sharing cpufreq policy */
- if (!cpumask_test_cpu(smp_processor_id(), policy->cpus))
- return;
-
sugov_get_util(&util, &max, hook->cpu);
raw_spin_lock(&sg_policy->update_lock);
-------------------------8<-------------------------
With Android UI and benchmarks the latency of cpufreq response to
certain scheduling events can become very critical. Currently, callbacks
into schedutil are only made from the scheduler if the target CPU of the
event is the same as the current CPU. This means there are certain
situations where a target CPU may not run schedutil for some time.
One testcase to show this behavior is where a task starts running on
CPU0, then a new task is also spawned on CPU0 by a task on CPU1. If the
system is configured such that new tasks should receive maximum demand
initially, this should result in CPU0 increasing frequency immediately.
Because of the above-mentioned limitation, though, this does not occur.
This is verified using ftrace with the sample [1] application.
Maybe the ideal solution is to always allow remote callbacks but that
has its own challenges:
o There is no protection required for the single-CPU-per-policy case
today, and adding any kind of locking there just to support remote
callbacks isn't really a good idea.
o If the local CPU isn't part of the same cpufreq policy as the target
CPU, then we wouldn't be able to do fast switching at all and would
have to use some kind of bottom half to schedule work on the target
CPU to do the real switching. That may be overkill as well.
And so this series only allows remote callbacks for target CPUs that
share the cpufreq policy with the local CPU.
This series is tested with a couple of use cases (Android: hackbench,
recentfling, galleryfling, vellamo; Ubuntu: hackbench) on the ARM Hikey
board (64 bit, octa-core, single policy). Only galleryfling showed minor
improvements, while the others didn't show much deviation.
The reason is that this patchset only targets a corner case; all of the
following must be true for performance to improve, and that doesn't
happen too often with these tests:
- Task is migrated to another CPU.
- The task has maximum demand initially, and should take the CPU to
higher OPPs.
- And the target CPU doesn't call into schedutil until the next tick.
V3->V4:
- Respect the iowait boost flag and util updates for all remote
  callbacks.
- Minor updates in commit log of 2/3.
V2->V3:
- Rearranged/merged patches as suggested by Rafael (looks much better
now)
- Also handle new hook added to intel-pstate driver.
- The final code remains the same as V2, except for the above hook.
V1->V2:
- Don't support remote callbacks for unshared cpufreq policies.
- Don't support remote callbacks where local CPU isn't part of the
target CPU's cpufreq policy.
- Dropped dvfs_possible_from_any_cpu flag.
--
viresh
Viresh Kumar (3):
sched: cpufreq: Allow remote cpufreq callbacks
cpufreq: schedutil: Process remote callback for shared policies
cpufreq: governor: Process remote callback for shared policies
drivers/cpufreq/cpufreq_governor.c | 4 ++++
drivers/cpufreq/intel_pstate.c | 8 ++++++++
include/linux/sched/cpufreq.h | 1 +
kernel/sched/cpufreq.c | 1 +
kernel/sched/cpufreq_schedutil.c | 14 +++++++++-----
kernel/sched/deadline.c | 2 +-
kernel/sched/fair.c | 8 +++++---
kernel/sched/rt.c | 2 +-
kernel/sched/sched.h | 10 ++--------
9 files changed, 32 insertions(+), 18 deletions(-)
--
2.13.0.71.gd7076ec9c9cb
Thanks guys for all the great info! I will take another look and see what I
can do now that I have a better idea of how to go about it. Once again,
it's appreciated that you guys are working out in the open. I know many
others that are also keeping up with this mailing list. It has been a great
learning experience.
Kind Regards,
Zachariah Kennedy
Good day!
I have been following EAS development for some time now. Currently, I have
implemented EAS in my own personal kernel for the Oneplus 3. It was largely
based on the work done for the Pixel, and I am happy to say that I currently
get better performance and battery life compared to stock CAF with HMP.
These questions will be based on the ACK android-4.4 branch
My first question is regarding tunings for EAS. I have seen many different
values thrown around for a while, but I was curious what everyone close
to the project is using for the schedutil up/down_rate_limit. Currently the
stock values are 1000 (for both up and down). Is this still the case for
those testing the newest EAS changes with schedutil?
Also, what about stune? I know the stock Pixel uses 50 for
top-app/schedtune.boost for interactions, but that turns out to be overkill
with schedutil.
Lastly, I purchased the Oneplus 5 with the SD835 just so I can port EAS
to it as well. I am looking forward to testing how EAS scales with the
extra cores when compared to the SD820/821. One main question regarding
the SD835: has anyone on the EAS-dev list developed an energy model for
it (MSM8998)? Even if it is just preliminary, I would appreciate any
help with this. I do not have a proper energy meter yet.
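For anyone else starting an SD835 model from scratch: in android-4.4 an
energy model is, per cluster/cpu level, just tables of capacity/power
pairs for the OPPs plus per-idle-state power. A rough shape is sketched
below (all numbers are placeholders, not MSM8998 measurements):

	/* Placeholder numbers only; a real model needs measured power data. */
	static struct capacity_state cap_states_cluster_big[] = {
		/* cap, power, lowest OPP first */
		{ .cap =  418, .power = 130, },
		{ .cap =  581, .power = 220, },
		{ .cap = 1024, .power = 670, },
	};

	static struct idle_state idle_states_cluster_big[] = {
		{ .power = 70, },	/* active idle / WFI */
		{ .power = 25, },	/* cluster retention */
		{ .power =  0, },	/* cluster off */
	};

	static struct sched_group_energy energy_cluster_big = {
		.nr_idle_states	= ARRAY_SIZE(idle_states_cluster_big),
		.idle_states	= idle_states_cluster_big,
		.nr_cap_states	= ARRAY_SIZE(cap_states_cluster_big),
		.cap_states	= cap_states_cluster_big,
	};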
This is something I am truly interested in. I love the openness of all the
devs close to this project. I have become a better developer by
participating and watching from the sidelines. Thanks guys for your hard
work.
Kind Regards,
Zachariah Kennedy
Hi,
Here is the second version of this series. The first [1] version was
sent several months back.
With Android UI and benchmarks the latency of cpufreq response to
certain scheduling events can become very critical. Currently, callbacks
into schedutil are only made from the scheduler if the target CPU of the
event is the same as the current CPU. This means there are certain
situations where a target CPU may not run schedutil for some time.
One testcase to show this behavior is where a task starts running on
CPU0, then a new task is also spawned on CPU0 by a task on CPU1. If the
system is configured such that new tasks should receive maximum demand
initially, this should result in CPU0 increasing frequency immediately.
Because of the above-mentioned limitation, though, this does not occur.
This is verified using ftrace with the sample [2] application.
Maybe the ideal solution is to always allow remote callbacks but that
has its own challenges:
o There is no protection required for the single-CPU-per-policy case
today, and adding any kind of locking there just to support remote
callbacks isn't really a good idea.
o If the local CPU isn't part of the same cpufreq policy as the target
CPU, then we wouldn't be able to do fast switching at all and would
have to use some kind of bottom half to schedule work on the target
CPU to do the real switching. That may be overkill as well.
Taking the above challenges into consideration, this version proposes a much
simpler diff compared to the first version.
This series only allows remote callbacks for target CPUs that share the
cpufreq policy with the local CPU. Locking is mostly in place everywhere,
so we aren't required to change a lot of things.
This series is tested with a couple of use cases (Android: hackbench,
recentfling, galleryfling, vellamo; Ubuntu: hackbench) on the ARM Hikey
board (64 bit, octa-core, single policy). Only galleryfling showed minor
improvements, while the others didn't show much deviation.
The reason is that this patchset only targets a corner case; all of the
following must be true for performance to improve, and that doesn't
happen too often with these tests:
- Task is migrated to another CPU.
- The task has maximum demand initially, and should take the CPU to
higher OPPs.
- And the target CPU doesn't call into schedutil until the next tick.
V1->V2:
- Don't support remote callbacks for unshared cpufreq policies.
- Don't support remote callbacks where local CPU isn't part of the
target CPU's cpufreq policy.
- Dropped dvfs_possible_from_any_cpu flag.
--
viresh
[1] https://marc.info/?l=linux-pm&m=148906015927796&w=2
[2] http://pastebin.com/7LkMSRxE
Steve Muckle (1):
intel_pstate: Ignore scheduler cpufreq callbacks on remote CPUs
Viresh Kumar (3):
cpufreq: schedutil: Process remote callback for shared policies
cpufreq: governor: Process remote callback for shared policies
sched: cpufreq: Enable remote sched cpufreq callbacks
drivers/cpufreq/cpufreq_governor.c | 4 ++++
drivers/cpufreq/intel_pstate.c | 3 +++
include/linux/sched/cpufreq.h | 1 +
kernel/sched/cpufreq.c | 1 +
kernel/sched/cpufreq_schedutil.c | 19 ++++++++++++++-----
kernel/sched/deadline.c | 2 +-
kernel/sched/fair.c | 8 +++++---
kernel/sched/rt.c | 2 +-
kernel/sched/sched.h | 10 ++--------
9 files changed, 32 insertions(+), 18 deletions(-)
--
2.13.0.71.gd7076ec9c9cb