Hi Yuyang,
On 05/04/16 01:15, Yuyang Du wrote:
On Tue, Apr 05, 2016 at 08:51:13AM +0100, Morten Rasmussen wrote:
On Tue, Apr 05, 2016 at 02:30:03AM +0800, Yuyang Du wrote:
On Mon, Apr 04, 2016 at 09:48:23AM +0100, Morten Rasmussen wrote:
On Sat, Apr 02, 2016 at 03:11:54PM +0800, Leo Yan wrote:
On Fri, Apr 01, 2016 at 03:28:49PM -0700, Steve Muckle wrote:
I think I follow - Leo please correct me if I mangle your intentions. It's an issue that Morten and Dietmar had mentioned to me as well.
Yes. We have been working on this issue for a while without getting to a nice solution yet.
So do you want a "flat hierarchy" for util_avg - just do util_avg for the rq and for tasks respectively? That seems to be what you want, and it is even easier?
Pretty much, yes. I can't think of a good reason why we need the utilization of groups as long as we have the task utilization and the sum of those for the root cfs_rq.
Sounds good to me too.
I'm not saying it can't be implemented, just saying that it will make utilization tracking for groups redundant and possibly duplicate or hack some of the existing code to implement the new root utilization sum.
An initial evaluation of the implementation: it looks much easier to do (at least) than the current one. Let's wait for a day or two; if there is no objection, then let's do it.
I have been playing with this patch to achieve this 'flat hierarchy for util_avg' after I gave up on implementing the propagation down the cfs_rq/se hierarchy for task groups. The patch has been created w/o your 'sched/fair: Initiate a new task's util avg to a bounded value', which recently went into tip/sched/core.
-- >8 --
Subject: [PATCH] sched/fair: Aggregate task utilization only on the root cfs_rq
CPU utilization is defined as the sched_avg.util_avg signal of the root cfs_rq of a CPU, capped by that CPU's original capacity.
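For reference, cpu_util() then essentially boils down to the following (only a sketch, the exact code in the tree may differ slightly):

static int cpu_util(int cpu)
{
	/* util_avg of the root cfs_rq of this CPU ... */
	unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
	/* ... capped by the CPU's original capacity */
	unsigned long capacity = capacity_orig_of(cpu);

	return (util >= capacity) ? capacity : util;
}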
With the current PELT implementation, the utilization of a task enqueued on/dequeued from a cfs_rq representing a task group other than the root task group of a CPU is not immediately propagated down to the root cfs_rq of that CPU.
This makes decisions based on cpu_util(), for scheduling or CPU frequency selection, less accurate when tasks are running in task groups.
This patch aggregates the task utilization only on the root cfs_rq, essentially bypassing the cfs_rqs and ses representing task groups (i.e. those with &rq_of(cfs_rq)->cfs != cfs_rq resp. !entity_is_task(se)).
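Spelled out as a single helper (hypothetical, not part of the patch), the rule for when a sched_avg carries utilization becomes:

/*
 * Hypothetical helper, only to spell out the rule used in the patch:
 * utilization is tracked for task entities and for the per-CPU root
 * cfs_rq, but not for group entities or group cfs_rqs.
 */
static inline int sched_avg_tracks_util(struct cfs_rq *cfs_rq,
					struct sched_entity *se)
{
	if (cfs_rq)					/* updating a cfs_rq */
		return &rq_of(cfs_rq)->cfs == cfs_rq;	/* root cfs_rq only */

	return entity_is_task(se);			/* task se only */
}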
Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
---
 kernel/sched/fair.c | 55 ++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 42 insertions(+), 13 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 33130529e9b5..51d675715776 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -682,8 +682,10 @@ void init_entity_runnable_average(struct sched_entity *se)
 	sa->period_contrib = 1023;
 	sa->load_avg = scale_load_down(se->load.weight);
 	sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
-	sa->util_avg = scale_load_down(SCHED_LOAD_SCALE);
-	sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
+	if (entity_is_task(se)) {
+		sa->util_avg = scale_load_down(SCHED_LOAD_SCALE);
+		sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
+	}
 	/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
 }
 
@@ -2651,6 +2653,15 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
 	u32 contrib;
 	unsigned int delta_w, scaled_delta_w, decayed = 0;
 	unsigned long scale_freq, scale_cpu;
+	int update_util = 0;
+
+	if (cfs_rq) {
+		if (&rq_of(cfs_rq)->cfs == cfs_rq)
+			update_util = 1;
+	} else {
+		if (entity_is_task(container_of(sa, struct sched_entity, avg)))
+			update_util = 1;
+	}
 
 	delta = now - sa->last_update_time;
 	/*
@@ -2696,7 +2707,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
 					weight * scaled_delta_w;
 			}
 		}
-		if (running)
+		if (update_util && running)
 			sa->util_sum += scaled_delta_w * scale_cpu;
 
 		delta -= delta_w;
@@ -2720,7 +2731,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
 			if (cfs_rq)
 				cfs_rq->runnable_load_sum += weight * contrib;
 		}
-		if (running)
+		if (update_util && running)
 			sa->util_sum += contrib * scale_cpu;
 	}
 
@@ -2731,7 +2742,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
 		if (cfs_rq)
 			cfs_rq->runnable_load_sum += weight * scaled_delta;
 	}
-	if (running)
+	if (update_util && running)
 		sa->util_sum += scaled_delta * scale_cpu;
 
 	sa->period_contrib += delta;
@@ -2742,7 +2753,8 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
 			cfs_rq->runnable_load_avg =
 				div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
 		}
-		sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
+		if (update_util)
+			sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
 	}
 
 	return decayed;
@@ -2834,7 +2846,8 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 		removed = 1;
 	}
 
-	if (atomic_long_read(&cfs_rq->removed_util_avg)) {
+	if ((&rq_of(cfs_rq)->cfs == cfs_rq) &&
+	    atomic_long_read(&cfs_rq->removed_util_avg)) {
 		long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
 		sa->util_avg = max_t(long, sa->util_avg - r, 0);
 		sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
@@ -2893,8 +2906,12 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	se->avg.last_update_time = cfs_rq->avg.last_update_time;
 	cfs_rq->avg.load_avg += se->avg.load_avg;
 	cfs_rq->avg.load_sum += se->avg.load_sum;
-	cfs_rq->avg.util_avg += se->avg.util_avg;
-	cfs_rq->avg.util_sum += se->avg.util_sum;
+
+	if (!entity_is_task(se))
+		return;
+
+	rq_of(cfs_rq)->cfs.avg.util_avg += se->avg.util_avg;
+	rq_of(cfs_rq)->cfs.avg.util_sum += se->avg.util_sum;
 }
 
 static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -2905,8 +2922,14 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 	cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
 	cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0);
-	cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
-	cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0);
+
+	if (!entity_is_task(se))
+		return;
+
+	rq_of(cfs_rq)->cfs.avg.util_avg =
+		max_t(long, rq_of(cfs_rq)->cfs.avg.util_avg - se->avg.util_avg, 0);
+	rq_of(cfs_rq)->cfs.avg.util_sum =
+		max_t(s32, rq_of(cfs_rq)->cfs.avg.util_sum - se->avg.util_sum, 0);
 }
 
 /* Add the load generated by se into cfs_rq's load average */
@@ -2989,7 +3012,11 @@ void remove_entity_load_avg(struct sched_entity *se)
 
 	__update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
 	atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
-	atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
+
+	if (!entity_is_task(se))
+		return;
+
+	atomic_long_add(se->avg.util_avg, &rq_of(cfs_rq)->cfs.removed_util_avg);
 }
 
 static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
@@ -8268,7 +8295,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
 #endif
 #ifdef CONFIG_SMP
 	atomic_long_set(&cfs_rq->removed_load_avg, 0);
-	atomic_long_set(&cfs_rq->removed_util_avg, 0);
+
+	if (&rq_of(cfs_rq)->cfs == cfs_rq)
+		atomic_long_set(&cfs_rq->removed_util_avg, 0);
 #endif
 }
-- 
1.9.1
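FWIW, to make the numbers in the commit message more concrete, here is a tiny user-space toy model of the PELT utilization signal (clearly not kernel code; the constants and decay factor are just taken from the PELT implementation): a task running 50% of the time converges to a util_avg of roughly 512 out of 1024, and with the patch that value shows up directly in the root cfs_rq's util_avg even when the task runs in a nested task group.

/* Toy model of the PELT utilization signal - user space, not kernel code. */
#include <stdio.h>

#define LOAD_AVG_MAX	47742	/* kernel constant: sum of 1024*y^n, y^32 = 0.5 */
#define SCALE_CPU	1024	/* assume full original CPU capacity */

int main(void)
{
	const double y = 0.97857206;	/* per-1024us decay factor, y^32 ~= 0.5 */
	double util_sum = 0.0;
	int period;

	/* task runs every other 1024us period, i.e. 50% of the time */
	for (period = 0; period < 2000; period++) {
		util_sum *= y;
		if (period & 1)
			util_sum += 1024.0 * SCALE_CPU;
	}

	/* prints ~518, i.e. roughly half of the 1024 maximum (the few
	 * extra counts are period-level ripple of the signal) */
	printf("util_avg ~= %.0f\n", util_sum / LOAD_AVG_MAX);
	return 0;
}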