Hi Quentin and EAS team,
As Quentin and I discussed offline my issues with EAS (a slightly older version) and task migration, here is the patch set I have with some fixes. The versions I have been working on are:
- kernel: 4.14.24 with some driver fixes for the Exynos SoC (close to eas/next/integration_base_20171211_0913)
- EAS: eas/next/integration_20171211_0913
  http://linux-arm.org/git?p=linux-power.git;a=shortlog;h=refs/heads/eas/n...
This patch set tries to solve:
- an issue with one LITTLE core not being used in the first phase,
- issues with LITTLE cores not being used in the second phase.
The test script uses sysbench in two phases. First it starts two task sets: 4 tasks pinned to the big cores and 4 that should land on the LITTLEs. The big tasks finish earlier, and the LITTLE tasks should then be migrated to the big cores. In the second phase, a set of 4 new tasks is started (after a 25 s delay) and should be placed on the LITTLE cores.

------------8<-------------------
REQUESTS=20000
BIGS="4 5 6 7"
LITTLES="0 1 2 3"
PATH="$PATH:/root/devlib-target/bin"

# Don't care about the score for those, just keep the bigs busy
for i in $BIGS; do
	taskset -c $i sysbench --max-requests=$((REQUESTS / 4)) \
		--test=cpu run &>/dev/null &
done

for i in $LITTLES; do
	$(sleep 25 && sysbench --max-requests=$((REQUESTS / 8)) \
		--test=cpu run &>/dev/null &) &
done

for i in $LITTLES; do
	sysbench --max-requests=$REQUESTS --test=cpu run \
		| grep "total time:" &
done

wait
----------->8--------------------
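In case it helps to reproduce, the script can be run under trace-cmd to capture the scheduler events across both phases. A rough sketch (trace-cmd availability on the target and the script name are my assumptions):

  # record all sched events while the two-phase script runs
  trace-cmd record -e sched ./sysbench_2phase.sh
  # then filter for the migration events added in this series
  trace-cmd report | grep -E 'sched_(can_migrate_task|need_active_balance)'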
It is based on a somewhat old EAS, but maybe you can find something useful in these fixes or in the test scenario.
Regards,
Lukasz Luba
Lukasz Luba (5):
  arm64: exynos: Add support of exynos5433 to EAS with energy model v1
  trace: sched: add new trace events for tracking migrations
  sched/fair: drop aggressive migration
  sched/fair: change migration destination based on CPU utilization
  sched/fair: change finding idle group
 arch/arm64/boot/dts/exynos/exynos5433.dtsi |   8 ++
 arch/arm64/kernel/energy_model.h           | 119 +++++++++++++++++++
 include/trace/events/sched.h               | 130 +++++++++++++++++++++
 kernel/sched/fair.c                        | 112 +++++++++++++++---
 4 files changed, 351 insertions(+), 18 deletions(-)
This patch adds basic support for the Exynos5433 SoC to EAS.
Signed-off-by: Lukasz Luba <l.luba@partner.samsung.com>
---
 arch/arm64/boot/dts/exynos/exynos5433.dtsi |   8 ++
 arch/arm64/kernel/energy_model.h           | 119 +++++++++++++++++++++
 2 files changed, 127 insertions(+)
diff --git a/arch/arm64/boot/dts/exynos/exynos5433.dtsi b/arch/arm64/boot/dts/exynos/exynos5433.dtsi
index bee00ead0a7b..4bbd6ae1ab49 100644
--- a/arch/arm64/boot/dts/exynos/exynos5433.dtsi
+++ b/arch/arm64/boot/dts/exynos/exynos5433.dtsi
@@ -72,6 +72,7 @@
 			clock-names = "apolloclk";
 			operating-points-v2 = <&cluster_a53_opp_table>;
 			#cooling-cells = <2>;
+			capacity-dmips-mhz = <512>;
 		};
 
 		cpu1: cpu@101 {
@@ -82,6 +83,7 @@
 			clock-frequency = <1300000000>;
 			operating-points-v2 = <&cluster_a53_opp_table>;
 			#cooling-cells = <2>;
+			capacity-dmips-mhz = <512>;
 		};
 
 		cpu2: cpu@102 {
@@ -92,6 +94,7 @@
 			clock-frequency = <1300000000>;
 			operating-points-v2 = <&cluster_a53_opp_table>;
 			#cooling-cells = <2>;
+			capacity-dmips-mhz = <512>;
 		};
 
 		cpu3: cpu@103 {
@@ -102,6 +105,7 @@
 			clock-frequency = <1300000000>;
 			operating-points-v2 = <&cluster_a53_opp_table>;
 			#cooling-cells = <2>;
+			capacity-dmips-mhz = <512>;
 		};
 
 		cpu4: cpu@0 {
@@ -114,6 +118,7 @@
 			clock-names = "atlasclk";
 			operating-points-v2 = <&cluster_a57_opp_table>;
 			#cooling-cells = <2>;
+			capacity-dmips-mhz = <1024>;
 		};
 
 		cpu5: cpu@1 {
@@ -124,6 +129,7 @@
 			clock-frequency = <1900000000>;
 			operating-points-v2 = <&cluster_a57_opp_table>;
 			#cooling-cells = <2>;
+			capacity-dmips-mhz = <1024>;
 		};
 
 		cpu6: cpu@2 {
@@ -134,6 +140,7 @@
 			clock-frequency = <1900000000>;
 			operating-points-v2 = <&cluster_a57_opp_table>;
 			#cooling-cells = <2>;
+			capacity-dmips-mhz = <1024>;
 		};
 
 		cpu7: cpu@3 {
@@ -144,6 +151,7 @@
 			clock-frequency = <1900000000>;
 			operating-points-v2 = <&cluster_a57_opp_table>;
 			#cooling-cells = <2>;
+			capacity-dmips-mhz = <1024>;
 		};
 	};
diff --git a/arch/arm64/kernel/energy_model.h b/arch/arm64/kernel/energy_model.h
index 0623bd01905c..349ea307b3a0 100644
--- a/arch/arm64/kernel/energy_model.h
+++ b/arch/arm64/kernel/energy_model.h
@@ -263,6 +263,118 @@ static struct sched_group_energy energy_core_hikey960_a72 = {
 	.cap_states = cap_states_core_hikey960_a72,
 };
 
+/* exynos5433 */
+
+static struct idle_state idle_states_cluster_exynos5433_a53[] = {
+	{ .power = 16 }, /* arch_cpu_idle() (active idle) = WFI */
+	{ .power = 0 }, /* cluster-sleep */
+};
+
+static struct idle_state idle_states_cluster_exynos5433_a57[] = {
+	{ .power = 150 }, /* arch_cpu_idle() (active idle) = WFI */
+	{ .power = 0 }, /* cluster-sleep */
+};
+
+static struct capacity_state cap_states_cluster_exynos5433_a53[] = {
+	{ .cap = 194, .power = 13, }, /* 400 MHz */
+	{ .cap = 242, .power = 17, }, /* 500 MHz */
+	{ .cap = 292, .power = 22, }, /* 600 MHz */
+	{ .cap = 339, .power = 27, }, /* 700 MHz */
+	{ .cap = 388, .power = 32, }, /* 800 MHz */
+	{ .cap = 436, .power = 38, }, /* 900 MHz */
+	{ .cap = 485, .power = 45, }, /* 1000 MHz */
+	{ .cap = 533, .power = 52, }, /* 1100 MHz */
+	{ .cap = 581, .power = 59, }, /* 1200 MHz */
+	{ .cap = 630, .power = 67, }, /* 1300 MHz */
+};
+
+static struct capacity_state cap_states_cluster_exynos5433_a57[] = {
+	{ .cap = 269, .power = 22, }, /* 500MHz */
+	{ .cap = 323, .power = 26, }, /* 600MHz */
+	{ .cap = 377, .power = 32, }, /* 700MHz */
+	{ .cap = 431, .power = 36, }, /* 800MHz */
+	{ .cap = 485, .power = 43, }, /* 900MHz */
+	{ .cap = 538, .power = 51, }, /* 1000MHz */
+	{ .cap = 592, .power = 61, }, /* 1100MHz */
+	{ .cap = 646, .power = 70, }, /* 1200MHz */
+	{ .cap = 700, .power = 79, }, /* 1300MHz */
+	{ .cap = 754, .power = 90, }, /* 1400MHz */
+	{ .cap = 808, .power = 103, }, /* 1500MHz */
+	{ .cap = 861, .power = 112, }, /* 1600MHz */
+	{ .cap = 915, .power = 127, }, /* 1700MHz */
+	{ .cap = 969, .power = 143, }, /* 1800MHz */
+	{ .cap = 1023, .power = 164, }, /* 1900MHz */
+};
+
+static struct sched_group_energy energy_cluster_exynos5433_a53 = {
+	.nr_idle_states = ARRAY_SIZE(idle_states_cluster_exynos5433_a53),
+	.idle_states = idle_states_cluster_exynos5433_a53,
+	.nr_cap_states = ARRAY_SIZE(cap_states_cluster_exynos5433_a53),
+	.cap_states = cap_states_cluster_exynos5433_a53,
+};
+
+static struct sched_group_energy energy_cluster_exynos5433_a57 = {
+	.nr_idle_states = ARRAY_SIZE(idle_states_cluster_exynos5433_a57),
+	.idle_states = idle_states_cluster_exynos5433_a57,
+	.nr_cap_states = ARRAY_SIZE(cap_states_cluster_exynos5433_a57),
+	.cap_states = cap_states_cluster_exynos5433_a57,
+};
+
+static struct idle_state idle_states_core_exynos5433_a53[] = {
+	{ .power = 14 }, /* arch_cpu_idle() (active idle) = WFI */
+	{ .power = 0 }, /* cluster-sleep */
+};
+
+static struct idle_state idle_states_core_exynos5433_a57[] = {
+	{ .power = 50 }, /* arch_cpu_idle() (active idle) = WFI */
+	{ .power = 0 }, /* cluster-sleep */
+};
+
+static struct capacity_state cap_states_core_exynos5433_a53[] = {
+	{ .cap = 194, .power = 82, }, /* 400 MHz */
+	{ .cap = 242, .power = 108, }, /* 500 MHz */
+	{ .cap = 292, .power = 137, }, /* 600 MHz */
+	{ .cap = 339, .power = 169, }, /* 700 MHz */
+	{ .cap = 388, .power = 203, }, /* 800 MHz */
+	{ .cap = 436, .power = 240, }, /* 900 MHz */
+	{ .cap = 485, .power = 280, }, /* 1000 MHz */
+	{ .cap = 533, .power = 322, }, /* 1100 MHz */
+	{ .cap = 581, .power = 368, }, /* 1200 MHz */
+	{ .cap = 630, .power = 417, }, /* 1300 MHz */
+};
+
+static struct capacity_state cap_states_core_exynos5433_a57[] = {
+	{ .cap = 269, .power = 137, }, /* 500MHz */
+	{ .cap = 323, .power = 164, }, /* 600MHz */
+	{ .cap = 377, .power = 197, }, /* 700MHz */
+	{ .cap = 431, .power = 225, }, /* 800MHz */
+	{ .cap = 485, .power = 268, }, /* 900MHz */
+	{ .cap = 538, .power = 322, }, /* 1000MHz */
+	{ .cap = 592, .power = 381, }, /* 1100MHz */
+	{ .cap = 646, .power = 437, }, /* 1200MHz */
+	{ .cap = 700, .power = 496, }, /* 1300MHz */
+	{ .cap = 754, .power = 560, }, /* 1400MHz */
+	{ .cap = 808, .power = 642, }, /* 1500MHz */
+	{ .cap = 861, .power = 700, }, /* 1600MHz */
+	{ .cap = 915, .power = 793, }, /* 1700MHz */
+	{ .cap = 969, .power = 894, }, /* 1800MHz */
+	{ .cap = 1023, .power = 1023, }, /* 1900MHz */
+};
+
+static struct sched_group_energy energy_core_exynos5433_a53 = {
+	.nr_idle_states = ARRAY_SIZE(idle_states_core_exynos5433_a53),
+	.idle_states = idle_states_core_exynos5433_a53,
+	.nr_cap_states = ARRAY_SIZE(cap_states_core_exynos5433_a53),
+	.cap_states = cap_states_core_exynos5433_a53,
+};
+
+static struct sched_group_energy energy_core_exynos5433_a57 = {
+	.nr_idle_states = ARRAY_SIZE(idle_states_core_exynos5433_a57),
+	.idle_states = idle_states_core_exynos5433_a57,
+	.nr_cap_states = ARRAY_SIZE(cap_states_core_exynos5433_a57),
+	.cap_states = cap_states_core_exynos5433_a57,
+};
+
 /* An energy model contains core, cluster and system sched group energy
  * for 2 clusters (cluster id 0 and 1). set_energy_model() relies on
  * this feature. It is enforced by a BUG_ON in energy().
@@ -292,10 +404,17 @@ static struct energy_model hikey960_model = {
 	{},
 };
 
+static struct energy_model exynos5433_model = {
+	{ &energy_core_exynos5433_a53, &energy_core_exynos5433_a57, },
+	{ &energy_cluster_exynos5433_a53, &energy_cluster_exynos5433_a57, },
+	{},
+};
+
 static struct of_device_id model_matches[] = {
 	{ .compatible = "arm,juno", .data = &juno_model },
 	{ .compatible = "hisilicon,hi6220-hikey", .data = &hikey_model },
 	{ .compatible = "hisilicon,hi3660-hikey960", .data = &hikey960_model },
+	{ .compatible = "samsung,exynos5433", .data = &exynos5433_model },
 	{},
 };
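As a quick sanity check (not part of the patch), the capacities derived from capacity-dmips-mhz can be read back from sysfs after boot; the A57 cores should report a larger value than the A53s:

  for c in /sys/devices/system/cpu/cpu[0-7]; do
          echo "$c: $(cat $c/cpu_capacity)"
  done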
This patch adds new trace events which can help during load-balance and/or migration investigations.
Signed-off-by: Lukasz Luba <l.luba@partner.samsung.com>
---
 include/trace/events/sched.h | 130 +++++++++++++++++++++++++++++++++++
 1 file changed, 130 insertions(+)
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 79d3af7e627e..36aef01a8c00 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -777,6 +777,136 @@ TRACE_EVENT(sched_util_est_cpu,
 		  __entry->util_avg, __entry->util_est_runnable)
 );
 
+
+TRACE_EVENT(sched_active_lb_stop_cpu,
+
+	TP_PROTO(int busiest_cpu, int target_cpu),
+
+	TP_ARGS(busiest_cpu, target_cpu),
+
+	TP_STRUCT__entry(
+		__field( int, busiest_cpu )
+		__field( int, target_cpu )
+	),
+
+	TP_fast_assign(
+		__entry->busiest_cpu = busiest_cpu;
+		__entry->target_cpu = target_cpu;
+	),
+
+	TP_printk("busiest_cpu=%d target_cpu=%d",
+		  __entry->busiest_cpu,
+		  __entry->target_cpu)
+);
+
+TRACE_EVENT(sched_can_migrate_task,
+
+	TP_PROTO(int pid, int migrate, int task_hot, int busiest_cpu, int target_cpu),
+
+	TP_ARGS(pid, migrate, task_hot, busiest_cpu, target_cpu),
+
+	TP_STRUCT__entry(
+		__field( int, pid )
+		__field( int, migrate )
+		__field( int, task_hot )
+		__field( int, busiest_cpu )
+		__field( int, target_cpu )
+	),
+
+	TP_fast_assign(
+		__entry->pid = pid;
+		__entry->migrate = migrate;
+		__entry->task_hot = task_hot;
+		__entry->busiest_cpu = busiest_cpu;
+		__entry->target_cpu = target_cpu;
+	),
+
+	TP_printk("pid=%d migrate=%d task_hot=%d src_cpu=%d dst_cpu=%d",
+		  __entry->pid,
+		  __entry->migrate,
+		  __entry->task_hot,
+		  __entry->busiest_cpu,
+		  __entry->target_cpu)
+);
+
+TRACE_EVENT(sched_migrate_capacity_comparison,
+
+	TP_PROTO(int src_cpu, int dst_cpu, int src_cpu_cap, int dst_cpu_cap,
+		 int src_cpu_util, int dst_cpu_util, int needed),
+
+	TP_ARGS(src_cpu, dst_cpu, src_cpu_cap, dst_cpu_cap, src_cpu_util,
+		dst_cpu_util, needed),
+
+	TP_STRUCT__entry(
+		__field( int, src_cpu )
+		__field( int, dst_cpu )
+		__field( int, src_cpu_cap )
+		__field( int, dst_cpu_cap )
+		__field( int, src_cpu_util )
+		__field( int, dst_cpu_util )
+		__field( int, needed )
+	),
+
+	TP_fast_assign(
+		__entry->src_cpu = src_cpu;
+		__entry->dst_cpu = dst_cpu;
+		__entry->src_cpu_cap = src_cpu_cap;
+		__entry->dst_cpu_cap = dst_cpu_cap;
+		__entry->src_cpu_util = src_cpu_util;
+		__entry->dst_cpu_util = dst_cpu_util;
+		__entry->needed = needed;
+	),
+
+	TP_printk("src_cpu=%d dst_cpu=%d src_cpu_cap=%d dst_cpu_cap=%d src_cpu_util=%d dst_cpu_util=%d needed=%d",
+		  __entry->src_cpu,
+		  __entry->dst_cpu,
+		  __entry->src_cpu_cap,
+		  __entry->dst_cpu_cap,
+		  __entry->src_cpu_util,
+		  __entry->dst_cpu_util,
+		  __entry->needed)
+);
+
+TRACE_EVENT(sched_need_active_balance,
+
+	TP_PROTO(int needed),
+
+	TP_ARGS(needed),
+
+	TP_STRUCT__entry(
+		__field( int, needed )
+	),
+
+	TP_fast_assign(
+		__entry->needed = needed;
+	),
+
+	TP_printk("needed=%d", __entry->needed)
+);
+
+TRACE_EVENT(sched_find_idlest_cpu,
+
+	TP_PROTO(const struct cpumask *group_cpus, int cpu, int new_cpu),
+
+	TP_ARGS(group_cpus, cpu, new_cpu),
+
+	TP_STRUCT__entry(
+		__bitmask(cpumask, num_possible_cpus())
+		__field( int, cpu )
+		__field( int, new_cpu )
+	),
+
+	TP_fast_assign(
+		__assign_bitmask(cpumask, cpumask_bits(group_cpus),
+				 num_possible_cpus());
+		__entry->cpu = cpu;
+		__entry->new_cpu = new_cpu;
+	),
+
+	TP_printk("group_cpus=%s cpu=%d new_cpu=%d",
+		  __get_bitmask(cpumask), __entry->cpu, __entry->new_cpu)
+);
+
 #endif /* CONFIG_SMP */
 #endif /* _TRACE_SCHED_H */
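The new events can be enabled at runtime like any other sched tracepoint, e.g. (assuming tracefs is mounted in the usual debugfs location):

  cd /sys/kernel/debug/tracing
  echo 1 > events/sched/sched_can_migrate_task/enable
  echo 1 > events/sched/sched_need_active_balance/enable
  cat trace_pipe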
This patch drops aggressive migration in load balancing by no longer acting on the statistics which count too many failed balance attempts.
On architectures built around asymmetric CPUs, i.e. ARM big.LITTLE, a task could be force-migrated even when there was no need. This caused the task to jump around and prevented all available CPUs in the system from being utilized.
Signed-off-by: Lukasz Luba <l.luba@partner.samsung.com>
---
 kernel/sched/fair.c | 79 ++++++++++++++++++++++++++++++++++++---------
 1 file changed, 63 insertions(+), 16 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index cea6df0949a8..8ac2ad9a5b8d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7612,13 +7612,22 @@ static inline int migrate_degrades_locality(struct task_struct *p,
 }
 #endif
 
+static inline bool check_cpu_spare_capacity(int cpu,
+				unsigned int needed_spare_capacity)
+{
+	return (capacity_of(cpu) >
+		(cpu_util(cpu) + (needed_spare_capacity / 2)));
+
+}
+
 /*
  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
  */
 static int can_migrate_task(struct task_struct *p, struct lb_env *env)
 {
-	int tsk_cache_hot;
+	int tsk_cache_hot = 0;
+	int ret = 0;
 
 	lockdep_assert_held(&env->src_rq->lock);
 
@@ -7629,8 +7638,10 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 	 * 3) running (obviously), or
 	 * 4) are cache-hot on their current CPU.
 	 */
-	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
-		return 0;
+	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) {
+		ret = 0;
+		goto out;
+	}
 
 	if (!cpumask_test_cpu(env->dst_cpu, &p->cpus_allowed)) {
 		int cpu;
@@ -7647,8 +7658,11 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 		 * Avoid computing new_dst_cpu for NEWLY_IDLE or if we have
 		 * already computed one in current iteration.
 		 */
-		if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED))
-			return 0;
+		if (env->idle == CPU_NEWLY_IDLE || (env->flags &
+				LBF_DST_PINNED)) {
+			ret = 0;
+			goto out;
+		}
 
 		/* Prevent to re-select dst_cpu via env's cpus */
 		for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
@@ -7659,7 +7673,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 			}
 		}
 
-		return 0;
+		ret = 0;
+		goto out;
 	}
 
 	/* Record that we found atleast one task that could run on dst_cpu */
@@ -7667,7 +7682,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 
 	if (task_running(env->src_rq, p)) {
 		schedstat_inc(p->se.statistics.nr_failed_migrations_running);
-		return 0;
+		ret = 0;
+		goto out;
 	}
 
 	/*
@@ -7686,11 +7702,15 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 			schedstat_inc(env->sd->lb_hot_gained[env->idle]);
 			schedstat_inc(p->se.statistics.nr_forced_migrations);
 		}
-		return 1;
+		ret = 1;
+		goto out;
 	}
 
 	schedstat_inc(p->se.statistics.nr_failed_migrations_hot);
-	return 0;
+out:
+	trace_sched_can_migrate_task(p->pid, ret, tsk_cache_hot, env->src_cpu,
+				     env->dst_cpu);
+	return ret;
 }
 
 /*
@@ -9085,9 +9105,36 @@ static struct rq *find_busiest_queue(struct lb_env *env,
  */
 #define MAX_PINNED_INTERVAL	512
 
+static inline int need_park_into_spare_capacity(struct lb_env *env)
+{
+	bool fits_in = check_cpu_spare_capacity(env->dst_cpu,
+				cpu_util(env->src_cpu));
+	int ret;
+
+	if ((capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) &&
+	    env->src_rq->cfs.h_nr_running == 1 &&
+	    cpu_overutilized(env->src_cpu) &&
+	    !cpu_overutilized(env->dst_cpu) &&
+	    fits_in) {
+		ret = 1;
+	} else {
+		ret = 0;
+	}
+
+	trace_sched_migrate_capacity_comparison(env->src_cpu, env->dst_cpu,
+				capacity_of(env->src_cpu),
+				capacity_of(env->dst_cpu),
+				cpu_util(env->src_cpu),
+				cpu_util(env->dst_cpu), ret);
+
+	return ret;
+
+}
+
 static int need_active_balance(struct lb_env *env)
 {
 	struct sched_domain *sd = env->sd;
+	int need_balance = sd->nr_balance_failed > sd->cache_nice_tries + 2;
 
 	if (env->idle == CPU_NEWLY_IDLE) {
 
@@ -9114,14 +9161,11 @@ static int need_active_balance(struct lb_env *env)
 		return 1;
 	}
 
-	if ((capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) &&
-	    env->src_rq->cfs.h_nr_running == 1 &&
-	    cpu_overutilized(env->src_cpu) &&
-	    !cpu_overutilized(env->dst_cpu)) {
-		return 1;
-	}
+	if (need_park_into_spare_capacity(env))
+		return 1;
 
-	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
+	trace_sched_need_active_balance(need_balance);
+	return 0;
 }
 
 static int active_load_balance_cpu_stop(void *data);
@@ -9345,6 +9389,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	if (need_active_balance(&env)) {
 		unsigned long flags;
 
+
 		raw_spin_lock_irqsave(&busiest->lock, flags);
 
 		/* don't kick the active_load_balance_cpu_stop,
@@ -9590,6 +9635,8 @@ static int active_load_balance_cpu_stop(void *data)
 	struct task_struct *p = NULL;
 	struct rq_flags rf;
 
+	trace_sched_active_lb_stop_cpu(busiest_cpu, target_cpu);
+
 	rq_lock_irq(busiest_rq, &rf);
 	/*
 	 * Between queueing the stop-work and running it is a hole in which
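For verification, the per-task migration counters from schedstats can be compared before and after the change; a rough example (assumes CONFIG_SCHED_DEBUG and schedstats support; pidof -s just picks one sysbench instance):

  sysctl -w kernel.sched_schedstats=1
  grep migrations /proc/$(pidof -s sysbench)/sched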
This patch changes the 'fitness' check of the destination CPU during migration. If the destination CPU has light ('lite') utilization of around ~6% or less, the task utilization is neglected (since the src CPU is already overutilized).
Signed-off-by: Lukasz Luba <l.luba@partner.samsung.com>
---
 kernel/sched/fair.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8ac2ad9a5b8d..d6c9e4b41330 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7616,7 +7616,7 @@ static inline bool check_cpu_spare_capacity(int cpu,
 				unsigned int needed_spare_capacity)
 {
 	return (capacity_of(cpu) >
-		(cpu_util(cpu) + (needed_spare_capacity / 2)));
+		(cpu_util(cpu) + needed_spare_capacity));
 
 }
 
@@ -9105,6 +9105,12 @@ static struct rq *find_busiest_queue(struct lb_env *env,
  */
 #define MAX_PINNED_INTERVAL	512
 
+static inline bool check_cpu_lite_util(int cpu)
+{
+	/* Lite utilization is defined as less than ~6% */
+	return (capacity_of(cpu) >> 4 >= cpu_util(cpu));
+}
+
 static inline int need_park_into_spare_capacity(struct lb_env *env)
 {
 	bool fits_in = check_cpu_spare_capacity(env->dst_cpu,
 				cpu_util(env->src_cpu));
@@ -9115,7 +9121,7 @@ static inline int need_park_into_spare_capacity(struct lb_env *env)
 	    env->src_rq->cfs.h_nr_running == 1 &&
 	    cpu_overutilized(env->src_cpu) &&
 	    !cpu_overutilized(env->dst_cpu) &&
-	    fits_in) {
+	    (fits_in || check_cpu_lite_util(env->dst_cpu))) {
 		ret = 1;
 	} else {
 		ret = 0;
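To make the ~6% threshold concrete: capacity_of(cpu) >> 4 is 1/16 of the CPU capacity, so with the exynos5433 capacities from the first patch the cut-offs would roughly be (shell arithmetic just for illustration):

  echo $((1024 >> 4))   # A57: util <= 64 counts as lite (~6.25%)
  echo $((630 >> 4))    # A53: util <= 39 counts as lite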
Check whether some of the CPUs in the group are actually idle. If the balancing was triggered by a fork and all CPUs are loaded, try to stay in the current group.
Signed-off-by: Lukasz Luba <l.luba@partner.samsung.com>
---
 kernel/sched/fair.c | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d6c9e4b41330..0ceebc12bb4d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6214,6 +6214,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 {
 	struct sched_group *idlest = NULL, *group = sd->groups;
 	struct sched_group *most_spare_sg = NULL;
+	struct sched_group *group_with_idle = NULL;
 	unsigned long min_runnable_load = ULONG_MAX;
 	unsigned long this_runnable_load = ULONG_MAX;
 	unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
@@ -6222,6 +6223,8 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 	int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
 	unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
 				(sd->imbalance_pct-100) / 100;
+	bool found_local_idle = false;
+	int found_idle_cpu = -1;
 
 	if (sd_flag & SD_BALANCE_WAKE)
 		load_idx = sd->wake_idx;
@@ -6263,6 +6266,17 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 
 			if (spare_cap > max_spare_cap)
 				max_spare_cap = spare_cap;
+
+			/* If there is an idle CPU, try it */
+			if (idle_cpu(i)) {
+				if (found_local_idle)
+					continue;
+				if (local_group)
+					found_local_idle = true;
+
+				found_idle_cpu = i;
+				group_with_idle = group;
+			}
 		}
 
 		/* Adjust by relative CPU capacity of the group */
@@ -6313,7 +6327,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 	 * utilization.
 	 */
 	if (sd_flag & SD_BALANCE_FORK)
-		goto skip_spare;
+		goto try_skip_packing;
 
 	if (this_spare > task_util(p) / 2 &&
 	    imbalance_scale*this_spare > 100*most_spare)
		return NULL;
 
 	if (most_spare > task_util(p) / 2)
 		return most_spare_sg;
 
-skip_spare:
+try_skip_packing:
+	if (found_idle_cpu != -1)
+		return group_with_idle;
+
 	if (!idlest)
 		return NULL;
 
@@ -6333,6 +6350,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 	    (100*this_avg_load < imbalance_scale*min_avg_load))
 		return NULL;
 
+	/* Last try: all CPUs are loaded, so continue on the current one */
+	if (found_idle_cpu == -1 && (sd_flag & SD_BALANCE_FORK))
+		return NULL;
+
 	return idlest;
 }
 
@@ -6414,6 +6435,8 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
 	}
 
 	new_cpu = find_idlest_group_cpu(group, p, cpu);
+	trace_sched_find_idlest_cpu(sched_group_span(group), cpu,
+				    new_cpu);
 	if (new_cpu == cpu) {
 		/* Now try balancing at a lower domain level of cpu */
 		sd = sd->child;
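To exercise the SD_BALANCE_FORK path and see where forked tasks land, something along these lines can be used together with the new sched_find_idlest_cpu event (a sketch only; the task count is arbitrary):

  echo 1 > /sys/kernel/debug/tracing/events/sched/sched_find_idlest_cpu/enable
  for i in $(seq 1 32); do
          sysbench --max-requests=100 --test=cpu run >/dev/null &
  done
  cat /sys/kernel/debug/tracing/trace_pipe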