Hi all,
On Thu, Dec 10, 2015 at 10:53:06AM +0800, Leo Yan wrote:
On Wed, Nov 25, 2015 at 10:41:32AM -0800, Steve Muckle wrote:
On 11/24/2015 07:55 PM, Leo Yan wrote:
[...]
Let's say we are placing a small task on a big.Little system, and that small task could fit on both the big and Little cluster.
Does the above statement imply that we would not evaluate the best CPU in the big cluster? I'd think we should, in addition to the best CPU in the little cluster, and decide between those two options. This is because we can have cases where the big cluster is actually the most efficient place to run a task due to current task loads and the OPP of the little cluster.
- Select CPUs with lowest OPP to meet capacity requirement;
- Select CPUs with highest utilization (as you said, here we need to try to use the least number of CPUs, and I think it's more suitable for rt-app cases; even rt-app-6 will take 35% of a CPU's utilization when the CPU runs at the lowest OPP);
- Select CPUs with least CPU ID;
If you think there is no obvious logic error here, I will try it in the next 1~2 weeks and post results after finishing the related testing.
Could you post your draft changes here prior to testing? It'll help ensure I'm following your proposal correctly.
Below are the code with our discussion, please help review; I also enclosed the patch in case you want to check with diff format.
---<8---
/*
 * find_cpu_new_capacity - find the index of the lowest capacity state (OPP)
 * of @cpu's energy-aware sched domain that can accommodate @util.
 *
 * Walks the group's capacity states in ascending order and returns the index
 * of the first state whose capacity is >= @util.  If even the highest state
 * cannot satisfy @util, the highest index is returned as a best-effort
 * fallback.
 *
 * NOTE(review): must be called with rcu_read_lock() held (sd_ea is
 * RCU-protected) - confirm all callers hold it.
 */
static int find_cpu_new_capacity(int cpu, unsigned long util)
{
	struct sched_domain *sd;
	struct sched_group_energy *sge;
	int idx;

	sd = rcu_dereference(per_cpu(sd_ea, cpu));

	/*
	 * Fix: the original dereferenced sd->groups->sge unconditionally;
	 * sd_ea may be NULL for a CPU (the caller only checks task_cpu(p)'s
	 * domain, not @cpu's).  Report the lowest OPP as a safe fallback.
	 */
	if (!sd || !sd->groups->sge)
		return 0;

	sge = sd->groups->sge;

	for (idx = 0; idx < sge->nr_cap_states; idx++)
		if (sge->cap_states[idx].cap >= util)
			break;

	/* @util exceeds the highest capacity state: clamp to the last index */
	if (idx == sge->nr_cap_states)
		idx = idx - 1;

	return idx;
}
static void find_best_cpu_in_sg(struct cpumask *mask, struct sched_group *sg, struct task_struct *p) { int min_opp = INT_MAX, max_usage = 0, new_usage; int target_cpu = -1, i;
for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
int opp; /* * p's blocked utilization is still accounted for on prev_cpu * so prev_cpu will receive a negative bias due the double * accouting. However, the blocked utilization may be zero. */ new_usage = get_cpu_usage(i) + task_utilization(p);
When I continue to profile with this patch, I found I cannot get the expected result; the tasks will be migrated in a mess after applying this patch.
The target CPU's selection is quite dependent on the CPU's utilization, but from the ftrace data file, the cfs_rq::utilization_load_avg will be increased sharply. Then finally it will impact the CPU migration.
So in [2], we can see the task has even been migrated onto CPU2 with the energy calculation, but it will finally be migrated onto CPU3 because CPU2's utilization value increases sharply and meets the condition of cpu_overutilized().
It makes sense for the CPU's utilization to decay, but it should increase step-wise when the CPU is running tasks. So I want to confirm whether this is the expected behavior for CPU utilization, which increases sharply when enqueuing one task on the CPU's rq?
I saw there has been a lot of polishing of CPU and task load tracking recently, so do you think this issue has been fixed in a newer kernel (I'm using 4.2-rc6)?
Welcome any comment and suggestion.
Thanks, Leo Yan
[1] http://people.linaro.org/~leo.yan/eas_profiling/eas_cpu_utilization_increase... [2] http://people.linaro.org/~leo.yan/eas_profiling/eas_task_migrate_with_high_c...
opp = find_cpu_new_capacity(i, new_usage); /* If need higher OPP, then skip */ if (min_opp < opp) continue; /* If CPU with lowwer OPP, just use it */ if (min_opp > opp) { min_opp = opp; max_usage = new_usage; target_cpu = i; continue; } if (max_usage < new_usage) { max_usage = new_usage; target_cpu = i; continue; } if (i < target_cpu) { target_cpu = i; continue; }
}
BUG_ON(target_cpu == -1);
cpumask_set_cpu(target_cpu, mask); return; }
/*
 * find_power_efficient_cpu - pick the CPU from @mask to which moving @p
 * yields the largest energy saving according to energy_diff().
 *
 * The task's current CPU is the default answer; a candidate replaces it only
 * when its energy delta is strictly negative (a saving) and beats the best
 * delta seen so far.  Returns the selected CPU id.
 */
static int find_power_efficient_cpu(struct cpumask *mask, struct task_struct *p)
{
	int best_cpu = task_cpu(p);
	int best_delta = 0;
	int cpu;

	for_each_cpu(cpu, mask) {
		struct energy_env eenv;
		int delta;

		/* Moving the task onto its own CPU is a no-op: skip it */
		if (cpu == task_cpu(p))
			continue;

		memset(&eenv, 0, sizeof(eenv));
		eenv.usage_delta = task_utilization(p);
		eenv.src_cpu = task_cpu(p);
		eenv.dst_cpu = cpu;
		eenv.task = p;

		delta = energy_diff(&eenv);
		if (delta < best_delta) {
			best_delta = delta;
			best_cpu = cpu;
		}
	}

	return best_cpu;
}
static int energy_aware_wake_cpu(struct task_struct *p, int target) { struct sched_domain *sd; struct sched_group *sg, *sg_target; int target_cpu; struct cpumask target_cpus;
sd = rcu_dereference(per_cpu(sd_ea, task_cpu(p)));
if (!sd) return target;
sg = sd->groups; sg_target = sg;
cpumask_clear(&target_cpus);
do { find_best_cpu_in_sg(&target_cpus, sg, p);
} while (sg = sg->next, sg != sd->groups);
if (cpumask_empty(&target_cpus)) cpumask_set_cpu(task_cpu(p), &target_cpus);
target_cpu = find_power_efficient_cpu(&target_cpus, p);
return target_cpu; }
--->8---
Thanks, Leo Yan
From c9dfdeb5b9f38e94eca3c489091314a4e82f4864 Mon Sep 17 00:00:00 2001 From: Leo Yan leo.yan@linaro.org Date: Thu, 10 Dec 2015 10:41:39 +0800 Subject: [PATCH] sched/fair: EASv5: Spread Tasks With Lower OPP
With this patch, we will select best CPU from every sched group with below priority:
- Select CPUs with lowest OPP to meet capacity requirement
- Select CPUs with highest utilization
- Select CPUs with least CPU ID
After the selections, we then need to compare these candidate CPUs and select the best CPU from the energy data.
Signed-off-by: Leo Yan leo.yan@linaro.org
kernel/sched/fair.c | 157 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 104 insertions(+), 53 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ce293ff..127a354 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5038,6 +5038,9 @@ static int find_new_capacity(struct energy_env *eenv, } }
- if (idx == sge->nr_cap_states)
idx = idx - 1;
- eenv->cap_idx = idx; return idx;
} @@ -5557,87 +5560,135 @@ done: return target; }
-static int energy_aware_wake_cpu(struct task_struct *p, int target) +static int find_cpu_new_capacity(int cpu, unsigned long util) { struct sched_domain *sd;
- struct sched_group *sg, *sg_target;
- int target_max_cap = INT_MAX;
- int target_cpu = task_cpu(p);
- int i;
- struct sched_group_energy *sge;
- int idx;
- sd = rcu_dereference(per_cpu(sd_ea, task_cpu(p)));
- sd = rcu_dereference(per_cpu(sd_ea, cpu));
- sge = sd->groups->sge;
- if (!sd)
return target;
- for (idx = 0; idx < sge->nr_cap_states; idx++)
if (sge->cap_states[idx].cap >= util)
break;
- sg = sd->groups;
- sg_target = sg;
- if (idx == sge->nr_cap_states)
idx = idx - 1;
- /*
* Find group with sufficient capacity. We only get here if no cpu is
* overutilized. We may end up overutilizing a cpu by adding the task,
* but that should not be any worse than select_idle_sibling().
* load_balance() should sort it out later as we get above the tipping
* point.
*/
- do {
/* Assuming all cpus are the same in group */
int max_cap_cpu = group_first_cpu(sg);
- return idx;
+}
/*
* Assume smaller max capacity means more energy-efficient.
* Ideally we should query the energy model for the right
* answer but it easily ends up in an exhaustive search.
*/
if (capacity_of(max_cap_cpu) < target_max_cap &&
task_fits_capacity(p, max_cap_cpu)) {
sg_target = sg;
target_max_cap = capacity_of(max_cap_cpu);
}
- } while (sg = sg->next, sg != sd->groups);
+static void find_best_cpu_in_sg(struct cpumask *mask, struct sched_group *sg,
struct task_struct *p)
+{
- int min_opp = INT_MAX, max_usage = 0, new_usage;
- int target_cpu = -1, i;
- for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
int opp;
- /* Find cpu with sufficient capacity */
- for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) { /*
*/
- p's blocked utilization is still accounted for on prev_cpu
- so prev_cpu will receive a negative bias due the double
- accouting. However, the blocked utilization may be zero.
int new_usage = get_cpu_usage(i) + task_utilization(p);
new_usage = get_cpu_usage(i) + task_utilization(p);
if (new_usage > capacity_orig_of(i))
opp = find_cpu_new_capacity(i, new_usage);
/* If need higher OPP, then skip */
if (min_opp < opp) continue;
if (new_usage < capacity_curr_of(i)) {
/* If CPU with lowwer OPP, just use it */
if (min_opp > opp) {
min_opp = opp;
max_usage = new_usage; target_cpu = i;
if (cpu_rq(i)->nr_running)
break;
}continue;
/* cpu has capacity at higher OPP, keep it as fallback */
if (target_cpu == task_cpu(p))
if (max_usage < new_usage) {
max_usage = new_usage; target_cpu = i;
- }
continue;
}
- if (target_cpu != task_cpu(p)) {
struct energy_env eenv = {
.usage_delta = task_utilization(p),
.src_cpu = task_cpu(p),
.dst_cpu = target_cpu,
.task = p,
};
if (i < target_cpu) {
target_cpu = i;
continue;
}
- }
/* Not enough spare capacity on previous cpu */
if (cpu_overutilized(task_cpu(p)))
return target_cpu;
- BUG_ON(target_cpu == -1);
if (energy_diff(&eenv) >= 0)
return task_cpu(p);
- cpumask_set_cpu(target_cpu, mask);
- return;
+}
+static int find_power_efficient_cpu(struct cpumask *mask, struct task_struct *p) +{
int i, target_cpu;
int min_energy = 0, diff;
struct energy_env eenv;
target_cpu = task_cpu(p);
for_each_cpu(i, mask) {
if (i == task_cpu(p))
continue;
memset(&eenv, 0, sizeof(eenv));
eenv.usage_delta = task_utilization(p),
eenv.src_cpu = task_cpu(p),
eenv.dst_cpu = i,
eenv.task = p,
diff = energy_diff(&eenv);
if (diff < min_energy) {
target_cpu = i;
min_energy = diff;
}
}
return target_cpu;
}
+static int energy_aware_wake_cpu(struct task_struct *p, int target) +{
- struct sched_domain *sd;
- struct sched_group *sg, *sg_target;
- int target_cpu;
- struct cpumask target_cpus;
- sd = rcu_dereference(per_cpu(sd_ea, task_cpu(p)));
- if (!sd)
return target;
- sg = sd->groups;
- sg_target = sg;
- cpumask_clear(&target_cpus);
- do {
find_best_cpu_in_sg(&target_cpus, sg, p);
- } while (sg = sg->next, sg != sd->groups);
- if (cpumask_empty(&target_cpus))
cpumask_set_cpu(task_cpu(p), &target_cpus);
- target_cpu = find_power_efficient_cpu(&target_cpus, p);
- return target_cpu;
+}
/*
- select_task_rq_fair: Select target runqueue for the waking task in domains
- that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
-- 1.9.1