From: Alex Shi <alex.shi@intel.com>
Sleeping tasks have no utilization, so when they are woken up in a burst, their zero utilization throws the scheduler out of balance, as seen in the aim7 benchmark.

rq->avg_idle is 'used to accommodate bursty loads in a dirt simple, dirt cheap manner' -- Mike Galbraith.

With this cheap and smart burst indicator, we can detect a wakeup burst and use nr_running as the instant utilization in that scenario.

In other scenarios, we still use the precise CPU utilization to judge whether a domain is eligible for power-aware scheduling.

Thanks to Mike Galbraith for the idea!
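
Concretely, a wakeup is treated as bursty when the waking CPU's avg_idle drops below the new sysctl_sched_burst_threshold knob. The per-CPU estimate then boils down to the following minimal sketch (illustration only, not the literal diff; rq_util_estimate() is a hypothetical helper, while cpu_rq(), nr_running and max_rq_util() are the names used in the patch below):

/*
 * Hypothetical helper, for illustration only: pick the utilization
 * estimate for one runqueue. During a bursty wakeup the sleepers'
 * tracked utilization is still zero, so fall back to nr_running as
 * an instant estimate; otherwise use the precise tracked value.
 */
static unsigned int rq_util_estimate(int cpu, int burst)
{
	struct rq *rq = cpu_rq(cpu);

	if (burst && rq->nr_running > 1)
		return rq->nr_running;	/* instant utilization: task count */

	return max_rq_util(cpu);	/* precise tracked utilization */
}

Note that the units differ between the two paths: in the burst case group_util counts tasks, so it is compared against group_weight and span_weight directly, rather than against the FULL_UTIL-scaled capacity as in the normal path.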
Signed-off-by: Alex Shi <alex.shi@intel.com>
[Added CONFIG_SCHED_POWER switch to enable this patch]
Signed-off-by: Preeti U Murthy <preeti@linux.vnet.ibm.com>
---
 kernel/sched/fair.c | 33 ++++++++++++++++++++++++++-------
 1 file changed, 26 insertions(+), 7 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e993f1c..3db77e8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4605,12 +4605,19 @@ struct sd_lb_stats {
  * Try to collect the task running number and capacity of the group.
  */
 static void get_sg_power_stats(struct sched_group *group,
-	struct sched_domain *sd, struct sg_lb_stats *sgs)
+	struct sched_domain *sd, struct sg_lb_stats *sgs, int burst)
 {
 	int i;
 
-	for_each_cpu(i, sched_group_cpus(group))
-		sgs->group_util += max_rq_util(i);
+	for_each_cpu(i, sched_group_cpus(group)) {
+		struct rq *rq = cpu_rq(i);
+
+		if (burst && rq->nr_running > 1)
+			/* use nr_running as instant utilization */
+			sgs->group_util += rq->nr_running;
+		else
+			sgs->group_util += max_rq_util(i);
+	}
 
 	sgs->group_weight = group->group_weight;
 }
@@ -4624,6 +4631,8 @@ static int is_sd_full(struct sched_domain *sd,
 	struct sched_group *group;
 	struct sg_lb_stats sgs;
 	long sd_min_delta = LONG_MAX;
+	int cpu = task_cpu(p);
+	int burst = 0;
 	unsigned int putil;
 
 	if (p->se.load.weight == p->se.avg.load_avg_contrib)
@@ -4633,15 +4642,21 @@ static int is_sd_full(struct sched_domain *sd,
 	putil = (u64)(p->se.avg.runnable_avg_sum << SCHED_CAPACITY_SHIFT)
 				/ (p->se.avg.runnable_avg_period + 1);
 
+	if (cpu_rq(cpu)->avg_idle < sysctl_sched_burst_threshold)
+		burst = 1;
+
 	/* Try to collect the domain's utilization */
 	group = sd->groups;
 	do {
 		long g_delta;
 
 		memset(&sgs, 0, sizeof(sgs));
-		get_sg_power_stats(group, sd, &sgs);
+		get_sg_power_stats(group, sd, &sgs, burst);
 
-		g_delta = sgs.group_weight * FULL_UTIL - sgs.group_util;
+		if (burst)
+			g_delta = sgs.group_weight - sgs.group_util;
+		else
+			g_delta = sgs.group_weight * FULL_UTIL - sgs.group_util;
 
 		if (g_delta > 0 && g_delta < sd_min_delta) {
 			sd_min_delta = g_delta;
@@ -4651,8 +4666,12 @@ static int is_sd_full(struct sched_domain *sd,
 		sds->sd_util += sgs.group_util;
 	} while (group = group->next, group != sd->groups);
 
-	if (sds->sd_util + putil < sd->span_weight * FULL_UTIL)
-		return 0;
+	if (burst) {
+		if (sds->sd_util < sd->span_weight)
+			return 0;
+	} else
+		if (sds->sd_util + putil < sd->span_weight * FULL_UTIL)
+			return 0;
 
 	/* can not hold one more task in this domain */
 	return 1;