Ensure that the move of a sched_entity will be reflected in load and utilization of the task_group hierarchy.
When a sched_entity moves between groups or CPUs, the load and utilization of the cfs_rq don't reflect the change immediately but only converge to the new values over time. As a result, the metrics are no longer aligned with the actual balance of load in the system, and the next scheduling decisions are made with a biased view.
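To give a rough idea of how slow that convergence is, here is a small user-space sketch (illustration only, not kernel code) that models the geometric decay PELT applies to a stale contribution, assuming the usual ~1 ms period and 32 ms half-life (y^32 = 0.5):

#include <math.h>
#include <stdio.h>

int main(void)
{
	double y = pow(0.5, 1.0 / 32.0);	/* per-period decay factor */
	double stale = 1024.0;			/* contribution left behind by a moved task */

	/* The stale load only halves every ~32 ms after the task has gone. */
	for (int ms = 0; ms <= 128; ms += 32)
		printf("after %3d ms: %6.1f\n", ms, stale * pow(y, ms));

	return 0;
}

Even ~100 ms after a migration, around 10% of the old load is still accounted to the previous cfs_rq unless it is explicitly synced.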
This patchset synchronizes the load/utilization of a sched_entity with its child cfs_rq (se->my_q) only when tasks move to/from the child cfs_rq:
-move between task groups
-migration between CPUs
Otherwise, PELT is updated as usual.
This version doesn't include any changes related to the discussions that started during the review of the previous version about:
- encapsulating the sequence for changing the property of a task
- removing a cfs_rq from the list during update_blocked_averages
These topics don't gain anything from being added to this patchset as they are fairly independent and deserve separate patches.
Changes since v3:
- Replaced the 2 arguments of update_load_avg() with 1 flags argument
- Propagated the move in runnable_load_avg when the sched_entity is already on_rq
- Ensured that intermediate values never reach memory when updating load and utilization
- Optimized the calculation of the load_avg of the sched_entity
- Fixed some typos
Changes since v2:
- Propagate both utilization and load
- Synced sched_entity and se->my_q instead of adding the delta
Changes since v1:
- This patch needs the patch that fixes an issue with rq->leaf_cfs_rq_list, "sched: fix hierarchical order in rq->leaf_cfs_rq_list", in order to work correctly. I haven't sent them as a single patchset because the fix is independent of this one
- Merged some functions that are always used together
- During the update of blocked load, ensure that the sched_entity is synced with the cfs_rq before applying changes
- Fixed an issue when a task changes its cpu affinity

Vincent Guittot (7):
  sched: factorize attach entity
  sched: fix hierarchical order in rq->leaf_cfs_rq_list
  sched: factorize PELT update
  sched: propagate load during synchronous attach/detach
  sched: propagate asynchronous detach
  sched: fix task group initialization
  sched: fix wrong utilization accounting when switching to fair class
kernel/sched/core.c | 21 +-- kernel/sched/fair.c | 354 +++++++++++++++++++++++++++++++++++++++++---------- kernel/sched/sched.h | 2 + 3 files changed, 300 insertions(+), 77 deletions(-)
Factorize post_init_entity_util_avg and part of attach_task_cfs_rq in one function attach_entity_cfs_rq
Signed-off-by: Vincent Guittot vincent.guittot@linaro.org --- kernel/sched/fair.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 986c10c..e8ed8d1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -697,9 +697,7 @@ void init_entity_runnable_average(struct sched_entity *se) }
static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); -static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq); -static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force); -static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se); +static void attach_entity_cfs_rq(struct sched_entity *se);
/* * With new tasks being created, their initial util_avgs are extrapolated @@ -764,9 +762,7 @@ void post_init_entity_util_avg(struct sched_entity *se) } }
- update_cfs_rq_load_avg(now, cfs_rq, false); - attach_entity_load_avg(cfs_rq, se); - update_tg_load_avg(cfs_rq, false); + attach_entity_cfs_rq(se); }
#else /* !CONFIG_SMP */ @@ -8501,9 +8497,8 @@ static void detach_task_cfs_rq(struct task_struct *p) update_tg_load_avg(cfs_rq, false); }
-static void attach_task_cfs_rq(struct task_struct *p) +static void attach_entity_cfs_rq(struct sched_entity *se) { - struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq);
@@ -8519,6 +8514,14 @@ static void attach_task_cfs_rq(struct task_struct *p) update_cfs_rq_load_avg(now, cfs_rq, false); attach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false); +} + +static void attach_task_cfs_rq(struct task_struct *p) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + attach_entity_cfs_rq(se);
if (!vruntime_normalized(p)) se->vruntime += cfs_rq->min_vruntime;
On 09/26/2016 01:19 PM, Vincent Guittot wrote:
Factorize post_init_entity_util_avg and part of attach_task_cfs_rq in one function attach_entity_cfs_rq
Signed-off-by: Vincent Guittot vincent.guittot@linaro.org
kernel/sched/fair.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 986c10c..e8ed8d1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -697,9 +697,7 @@ void init_entity_runnable_average(struct sched_entity *se) }
static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); -static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq); -static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force); -static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se); +static void attach_entity_cfs_rq(struct sched_entity *se);
/*
- With new tasks being created, their initial util_avgs are extrapolated
@@ -764,9 +762,7 @@ void post_init_entity_util_avg(struct sched_entity *se) } }
You now could move the 'u64 now = cfs_rq_clock_task(cfs_rq);' into the if condition to handle !fair_sched_class tasks.
update_cfs_rq_load_avg(now, cfs_rq, false);
attach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq, false);
attach_entity_cfs_rq(se);
}
#else /* !CONFIG_SMP */ @@ -8501,9 +8497,8 @@ static void detach_task_cfs_rq(struct task_struct *p) update_tg_load_avg(cfs_rq, false); }
-static void attach_task_cfs_rq(struct task_struct *p) +static void attach_entity_cfs_rq(struct sched_entity *se) {
struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se);
Both callers of attach_entity_cfs_rq() already use cfs_rq_of(se). You could pass it into attach_entity_cfs_rq().
u64 now = cfs_rq_clock_task(cfs_rq);
@@ -8519,6 +8514,14 @@ static void attach_task_cfs_rq(struct task_struct *p)
The old comment /* Synchronize task ... */ should be changed to /* Synchronize entity ... */
update_cfs_rq_load_avg(now, cfs_rq, false); attach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false);
+}
+static void attach_task_cfs_rq(struct task_struct *p) +{
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
attach_entity_cfs_rq(se); if (!vruntime_normalized(p)) se->vruntime += cfs_rq->min_vruntime;
On 5 October 2016 at 11:38, Dietmar Eggemann dietmar.eggemann@arm.com wrote:
On 09/26/2016 01:19 PM, Vincent Guittot wrote:
Factorize post_init_entity_util_avg and part of attach_task_cfs_rq in one function attach_entity_cfs_rq
Signed-off-by: Vincent Guittot vincent.guittot@linaro.org
kernel/sched/fair.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 986c10c..e8ed8d1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -697,9 +697,7 @@ void init_entity_runnable_average(struct sched_entity *se) }
static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); -static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq); -static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force); -static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se); +static void attach_entity_cfs_rq(struct sched_entity *se);
/*
- With new tasks being created, their initial util_avgs are extrapolated
@@ -764,9 +762,7 @@ void post_init_entity_util_avg(struct sched_entity *se) } }
You now could move the 'u64 now = cfs_rq_clock_task(cfs_rq);' into the if condition to handle !fair_sched_class tasks.
yes
update_cfs_rq_load_avg(now, cfs_rq, false);
attach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq, false);
attach_entity_cfs_rq(se);
}
#else /* !CONFIG_SMP */ @@ -8501,9 +8497,8 @@ static void detach_task_cfs_rq(struct task_struct *p) update_tg_load_avg(cfs_rq, false); }
-static void attach_task_cfs_rq(struct task_struct *p) +static void attach_entity_cfs_rq(struct sched_entity *se) {
struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se);
Both callers of attach_entity_cfs_rq() already use cfs_rq_of(se). You could pass it into attach_entity_cfs_rq().
Yes that would make sense
u64 now = cfs_rq_clock_task(cfs_rq);
@@ -8519,6 +8514,14 @@ static void attach_task_cfs_rq(struct task_struct *p)
The old comment /* Synchronize task ... */ should be changed to /* Synchronize entity ... */
yes
update_cfs_rq_load_avg(now, cfs_rq, false); attach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false);
+}
+static void attach_task_cfs_rq(struct task_struct *p) +{
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
attach_entity_cfs_rq(se); if (!vruntime_normalized(p)) se->vruntime += cfs_rq->min_vruntime;
On 7 October 2016 at 01:11, Vincent Guittot vincent.guittot@linaro.org wrote:
On 5 October 2016 at 11:38, Dietmar Eggemann dietmar.eggemann@arm.com wrote:
On 09/26/2016 01:19 PM, Vincent Guittot wrote:
Factorize post_init_entity_util_avg and part of attach_task_cfs_rq in one function attach_entity_cfs_rq
Signed-off-by: Vincent Guittot vincent.guittot@linaro.org
kernel/sched/fair.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 986c10c..e8ed8d1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -697,9 +697,7 @@ void init_entity_runnable_average(struct sched_entity *se) }
static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); -static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq); -static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force); -static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se); +static void attach_entity_cfs_rq(struct sched_entity *se);
/*
- With new tasks being created, their initial util_avgs are extrapolated
@@ -764,9 +762,7 @@ void post_init_entity_util_avg(struct sched_entity *se) } }
You now could move the 'u64 now = cfs_rq_clock_task(cfs_rq);' into the if condition to handle !fair_sched_class tasks.
yes
update_cfs_rq_load_avg(now, cfs_rq, false);
attach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq, false);
attach_entity_cfs_rq(se);
}
#else /* !CONFIG_SMP */ @@ -8501,9 +8497,8 @@ static void detach_task_cfs_rq(struct task_struct *p) update_tg_load_avg(cfs_rq, false); }
-static void attach_task_cfs_rq(struct task_struct *p) +static void attach_entity_cfs_rq(struct sched_entity *se) {
struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se);
Both callers of attach_entity_cfs_rq() already use cfs_rq_of(se). You could pass it into attach_entity_cfs_rq().
Yes that would make sense
In fact there is a 3rd caller, online_fair_sched_group(), which calls attach_entity_cfs_rq() and doesn't already use cfs_rq_of(se), so I wonder if it's worth doing the interface change.
u64 now = cfs_rq_clock_task(cfs_rq);
@@ -8519,6 +8514,14 @@ static void attach_task_cfs_rq(struct task_struct *p)
The old comment /* Synchronize task ... */ should be changed to /* Synchronize entity ... */
yes
update_cfs_rq_load_avg(now, cfs_rq, false); attach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false);
+}
+static void attach_task_cfs_rq(struct task_struct *p) +{
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
attach_entity_cfs_rq(se); if (!vruntime_normalized(p)) se->vruntime += cfs_rq->min_vruntime;
On 12/10/16 11:59, Vincent Guittot wrote:
On 7 October 2016 at 01:11, Vincent Guittot vincent.guittot@linaro.org wrote:
On 5 October 2016 at 11:38, Dietmar Eggemann dietmar.eggemann@arm.com wrote:
On 09/26/2016 01:19 PM, Vincent Guittot wrote:
[...]
-static void attach_task_cfs_rq(struct task_struct *p) +static void attach_entity_cfs_rq(struct sched_entity *se) {
struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se);
Both callers of attach_entity_cfs_rq() already use cfs_rq_of(se). You could pass it into attach_entity_cfs_rq().
Yes that would make sense
In fact there is a 3rd caller, online_fair_sched_group(), which calls attach_entity_cfs_rq() and doesn't already use cfs_rq_of(se), so I wonder if it's worth doing the interface change.
OK, this change gets in w/ patch 6/7. Yeah, skip it, it's not so important.
Fix the insertion of cfs_rq in rq->leaf_cfs_rq_list to ensure that a child will always be called before its parent.
The hierarchical order in shares update list has been introduced by commit 67e86250f8ea ("sched: Introduce hierarchal order on shares update list")
With the current implementation, a child can still be put after its parent.
Let's take the example of
      root
       \
        b
        /\
       c  d*
          |
          e*
with root -> b -> c already enqueued but not d -> e so the leaf_cfs_rq_list looks like: head -> c -> b -> root -> tail
The branch d -> e will be added the first time that they are enqueued, starting with e then d.
When e is added, its parent is not already on the list, so e is put at the tail: head -> c -> b -> root -> e -> tail
Then, d is added at the head because its parent is already on the list: head -> d -> c -> b -> root -> e -> tail
e is not placed at the right position and will be called last, whereas it should be called at the beginning.
Because enqueueing follows the bottom-up sequence, we are sure that we will finish by adding either a cfs_rq without a parent or a cfs_rq whose parent is already on the list. We can use this event to detect when a new branch has been fully added. For the others, whose parents have not been added yet, we have to ensure that they are placed after the children that were inserted in the previous steps, and after any potential parents that are already in the list. The easiest way is to put the cfs_rq just after the last inserted one and to keep track of it until the branch is fully added.
Signed-off-by: Vincent Guittot vincent.guittot@linaro.org --- kernel/sched/core.c | 1 + kernel/sched/fair.c | 54 +++++++++++++++++++++++++++++++++++++++++++++------- kernel/sched/sched.h | 1 + 3 files changed, 49 insertions(+), 7 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c :confirm b3 index 860070f..3e52d08 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7489,6 +7489,7 @@ void __init sched_init(void) #ifdef CONFIG_FAIR_GROUP_SCHED root_task_group.shares = ROOT_TASK_GROUP_LOAD; INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; /* * How much cpu bandwidth does root_task_group get? * diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e8ed8d1..3d29492 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -292,19 +292,59 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { if (!cfs_rq->on_list) { + struct rq *rq = rq_of(cfs_rq); + int cpu = cpu_of(rq); /* * Ensure we either appear before our parent (if already * enqueued) or force our parent to appear after us when it is - * enqueued. The fact that we always enqueue bottom-up - * reduces this to two cases. + * enqueued. The fact that we always enqueue bottom-up + * reduces this to two cases and a special case for the root + * cfs_rq. Furthermore, it also means that we will always reset + * tmp_alone_branch either when the branch is connected + * to a tree or when we reach the beg of the tree */ if (cfs_rq->tg->parent && - cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) { - list_add_rcu(&cfs_rq->leaf_cfs_rq_list, - &rq_of(cfs_rq)->leaf_cfs_rq_list); - } else { + cfs_rq->tg->parent->cfs_rq[cpu]->on_list) { + /* + * If parent is already on the list, we add the child + * just before. Thanks to circular linked property of + * the list, this means to put the child at the tail + * of the list that starts by parent. + */ + list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, + &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list)); + /* + * The branch is now connected to its tree so we can + * reset tmp_alone_branch to the beginning of the + * list. + */ + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; + } else if (!cfs_rq->tg->parent) { + /* + * cfs rq without parent should be put + * at the tail of the list. + */ list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, - &rq_of(cfs_rq)->leaf_cfs_rq_list); + &rq->leaf_cfs_rq_list); + /* + * We have reach the beg of a tree so we can reset + * tmp_alone_branch to the beginning of the list. + */ + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; + } else { + /* + * The parent has not already been added so we want to + * make sure that it will be put after us. + * tmp_alone_branch points to the beg of the branch + * where we will add parent. + */ + list_add_rcu(&cfs_rq->leaf_cfs_rq_list, + rq->tmp_alone_branch); + /* + * update tmp_alone_branch to points to the new beg + * of the branch + */ + rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list; }
cfs_rq->on_list = 1; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 420c05d..483616a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -616,6 +616,7 @@ struct rq { #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ struct list_head leaf_cfs_rq_list; + struct list_head *tmp_alone_branch; #endif /* CONFIG_FAIR_GROUP_SCHED */
/*
Every time we modify the load/utilization of a sched_entity, we start by syncing it with its cfs_rq. This update is currently done in different ways:
-when attaching/detaching a sched_entity, we update the cfs_rq and then we sync the entity with the cfs_rq.
-when enqueueing/dequeuing the sched_entity, we update both the sched_entity and the cfs_rq metrics to now.
Use update_load_avg() every time we have to update and sync the cfs_rq and the sched_entity before changing the state of a sched_entity.
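For reference, a condensed view of the call patterns this patch ends up with (taken from the hunks below; UPDATE_TG and SKIP_AGE_LOAD are the new flags):

	/* enqueue/dequeue/tick paths: age the entity and update the task group */
	update_load_avg(se, UPDATE_TG);

	/* detach path: sync with the cfs_rq but don't force a tg update */
	update_load_avg(se, 0);
	detach_entity_load_avg(cfs_rq, se);

	/* attach path: skip ageing when ATTACH_AGE_LOAD is disabled */
	update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
	attach_entity_load_avg(cfs_rq, se);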
Signed-off-by: Vincent Guittot vincent.guittot@linaro.org --- kernel/sched/fair.c | 75 +++++++++++++++++------------------------------------ 1 file changed, 24 insertions(+), 51 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3d29492..625e7f7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3088,8 +3088,14 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) return decayed || removed_load; }
+/* + * Optional action to be done while updating the load average + */ +#define UPDATE_TG 0x1 +#define SKIP_AGE_LOAD 0x2 + /* Update task and its cfs_rq load average */ -static inline void update_load_avg(struct sched_entity *se, int update_tg) +static inline void update_load_avg(struct sched_entity *se, int flags) { struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); @@ -3100,11 +3106,12 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) * Track task load average for carrying it to new CPU after migrated, and * track group sched_entity load average for task_h_load calc in migration */ - __update_load_avg(now, cpu, &se->avg, + if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) + __update_load_avg(now, cpu, &se->avg, se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL);
- if (update_cfs_rq_load_avg(now, cfs_rq, true) && update_tg) + if (update_cfs_rq_load_avg(now, cfs_rq, true) && (flags & UPDATE_TG)) update_tg_load_avg(cfs_rq, 0); }
@@ -3118,26 +3125,6 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) */ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (!sched_feat(ATTACH_AGE_LOAD)) - goto skip_aging; - - /* - * If we got migrated (either between CPUs or between cgroups) we'll - * have aged the average right before clearing @last_update_time. - * - * Or we're fresh through post_init_entity_util_avg(). - */ - if (se->avg.last_update_time) { - __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), - &se->avg, 0, 0, NULL); - - /* - * XXX: we could have just aged the entire load away if we've been - * absent from the fair class for too long. - */ - } - -skip_aging: se->avg.last_update_time = cfs_rq->avg.last_update_time; cfs_rq->avg.load_avg += se->avg.load_avg; cfs_rq->avg.load_sum += se->avg.load_sum; @@ -3157,9 +3144,6 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s */ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), - &se->avg, se->on_rq * scale_load_down(se->load.weight), - cfs_rq->curr == se, NULL);
sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum); @@ -3174,34 +3158,20 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct sched_avg *sa = &se->avg; - u64 now = cfs_rq_clock_task(cfs_rq); - int migrated, decayed; - - migrated = !sa->last_update_time; - if (!migrated) { - __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, - se->on_rq * scale_load_down(se->load.weight), - cfs_rq->curr == se, NULL); - } - - decayed = update_cfs_rq_load_avg(now, cfs_rq, !migrated);
cfs_rq->runnable_load_avg += sa->load_avg; cfs_rq->runnable_load_sum += sa->load_sum;
- if (migrated) + if (!sa->last_update_time) { attach_entity_load_avg(cfs_rq, se); - - if (decayed || migrated) update_tg_load_avg(cfs_rq, 0); + } }
/* Remove the runnable load generated by se from cfs_rq's runnable load average */ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - update_load_avg(se, 1); - cfs_rq->runnable_load_avg = max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0); cfs_rq->runnable_load_sum = @@ -3275,7 +3245,10 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) return 0; }
-static inline void update_load_avg(struct sched_entity *se, int not_used) +#define UPDATE_TG 0x0 +#define SKIP_AGE_LOAD 0x0 + +static inline void update_load_avg(struct sched_entity *se, int not_used1) { struct cfs_rq *cfs_rq = cfs_rq_of(se); struct rq *rq = rq_of(cfs_rq); @@ -3423,6 +3396,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (renorm && !curr) se->vruntime += cfs_rq->min_vruntime;
+ update_load_avg(se, UPDATE_TG); enqueue_entity_load_avg(cfs_rq, se); account_entity_enqueue(cfs_rq, se); update_cfs_shares(cfs_rq); @@ -3497,6 +3471,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); + update_load_avg(se, UPDATE_TG); dequeue_entity_load_avg(cfs_rq, se);
update_stats_dequeue(cfs_rq, se, flags); @@ -3575,7 +3550,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) */ update_stats_wait_end(cfs_rq, se); __dequeue_entity(cfs_rq, se); - update_load_avg(se, 1); + update_load_avg(se, UPDATE_TG); }
update_stats_curr_start(cfs_rq, se); @@ -3693,7 +3668,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) /* * Ensure that runnable average is periodically updated. */ - update_load_avg(curr, 1); + update_load_avg(curr, UPDATE_TG); update_cfs_shares(cfs_rq);
#ifdef CONFIG_SCHED_HRTICK @@ -4582,7 +4557,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break;
- update_load_avg(se, 1); + update_load_avg(se, UPDATE_TG); update_cfs_shares(cfs_rq); }
@@ -4641,7 +4616,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break;
- update_load_avg(se, 1); + update_load_avg(se, UPDATE_TG); update_cfs_shares(cfs_rq); }
@@ -8520,7 +8495,6 @@ static void detach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 now = cfs_rq_clock_task(cfs_rq);
if (!vruntime_normalized(p)) { /* @@ -8532,7 +8506,7 @@ static void detach_task_cfs_rq(struct task_struct *p) }
/* Catch up with the cfs_rq and remove our load when we leave */ - update_cfs_rq_load_avg(now, cfs_rq, false); + update_load_avg(se, 0); detach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false); } @@ -8540,7 +8514,6 @@ static void detach_task_cfs_rq(struct task_struct *p) static void attach_entity_cfs_rq(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 now = cfs_rq_clock_task(cfs_rq);
#ifdef CONFIG_FAIR_GROUP_SCHED /* @@ -8551,7 +8524,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se) #endif
/* Synchronize task with its cfs_rq */ - update_cfs_rq_load_avg(now, cfs_rq, false); + update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); attach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false); }
When a task moves from/to a cfs_rq, we set a flag which is then used to propagate the change at the parent level (sched_entity and cfs_rq) during the next update. If the cfs_rq is throttled, the flag will stay pending until the cfs_rq is unthrottled.
For propagating the utilization, we copy the utilization of group cfs_rq to the sched_entity.
For propagating the load, we have to take into account the load of the whole task group in order to evaluate the load of the sched_entity. Similarly to what was done before the rewrite of PELT, we add a correction factor in case the task group's load is greater than its shares, so that it contributes the same load as a task of equal weight.
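As an illustrative walk-through of that correction factor (numbers made up, mirroring update_tg_cfs_load() in the hunk below):

	long load    = 800;	/* gcfs_rq->avg.load_avg */
	long shares  = 1024;	/* scale_load_down(gcfs_rq->tg->shares) */
	long tg_load = 1600;	/* tg load, updated with the current contribution */

	if (tg_load > shares) {
		load *= shares;		/* 800 * 1024 */
		load /= tg_load;	/* -> 512: no more than a task of weight tg->shares */
	}

The group entity's load is therefore capped so that the sum of the group's sched_entities cannot exceed scale_load_down(tg->shares).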
Signed-off-by: Vincent Guittot vincent.guittot@linaro.org --- kernel/sched/fair.c | 200 ++++++++++++++++++++++++++++++++++++++++++++++++++- kernel/sched/sched.h | 1 + 2 files changed, 200 insertions(+), 1 deletion(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 625e7f7..8ba500f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3021,6 +3021,162 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) }
/* + * Signed add and clamp on underflow. + * + * Explicitly do a load-store to ensure the intermediate value never hits + * memory. This allows lockless observations without ever seeing the negative + * values. + */ +#define add_positive(_ptr, _val) do { \ + typeof(_ptr) ptr = (_ptr); \ + typeof(_val) res, val = (_val); \ + typeof(*ptr) var = READ_ONCE(*ptr); \ + res = var + val; \ + if (res < 0) \ + res = 0; \ + WRITE_ONCE(*ptr, res); \ +} while (0) + +#ifdef CONFIG_FAIR_GROUP_SCHED +/* Take into account change of utilization of a child task group */ +static inline void +update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct cfs_rq *gcfs_rq = group_cfs_rq(se); + long delta = gcfs_rq->avg.util_avg - se->avg.util_avg; + + /* Nothing to update */ + if (!delta) + return; + + /* Set new sched_entity's utilization */ + se->avg.util_avg = gcfs_rq->avg.util_avg; + se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX; + + /* Update parent cfs_rq utilization */ + add_positive(&cfs_rq->avg.util_avg, delta); + cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX; +} + +/* Take into account change of load of a child task group */ +static inline void +update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct cfs_rq *gcfs_rq = group_cfs_rq(se); + long delta, load = gcfs_rq->avg.load_avg; + + /* + * If the load of group cfs_rq is null, the load of the + * sched_entity will also be null so we can skip the formula + */ + if (load) { + long tg_load; + + /* Get tg's load and ensure tg_load > 0 */ + tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1; + + /* Ensure tg_load >= load and updated with current load*/ + tg_load -= gcfs_rq->tg_load_avg_contrib; + tg_load += load; + + /* + * We need to compute a correction term in the case that the + * task group is consuming more cpu than a task of equal + * weight. A task with a weight equals to tg->shares will have + * a load less or equal to scale_load_down(tg->shares). + * Similarly, the sched_entities that represent the task group + * at parent level, can't have a load higher than + * scale_load_down(tg->shares). And the Sum of sched_entities' + * load must be <= scale_load_down(tg->shares). + */ + if (tg_load > scale_load_down(gcfs_rq->tg->shares)) { + /* scale gcfs_rq's load into tg's shares*/ + load *= scale_load_down(gcfs_rq->tg->shares); + load /= tg_load; + } + } + + delta = load - se->avg.load_avg; + + /* Nothing to update */ + if (!delta) + return; + + /* Set new sched_entity's load */ + se->avg.load_avg = load; + se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX; + + /* Update parent cfs_rq load */ + add_positive(&cfs_rq->avg.load_avg, delta); + cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX; + + /* + * If the sched_entity is already enqueued, we also have to update the + * runnable load avg. 
+ */ + if (se->on_rq) { + /* Update parent cfs_rq runnable_load_avg */ + add_positive(&cfs_rq->runnable_load_avg, delta); + cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX; + } +} + +static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) +{ + /* set cfs_rq's flag */ + cfs_rq->propagate_avg = 1; +} + +static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se) +{ + /* Get my cfs_rq */ + struct cfs_rq *cfs_rq = group_cfs_rq(se); + + /* Nothing to propagate */ + if (!cfs_rq->propagate_avg) + return 0; + + /* Clear my cfs_rq's flag */ + cfs_rq->propagate_avg = 0; + + return 1; +} + +/* Update task and its cfs_rq load average */ +static inline int propagate_entity_load_avg(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq; + + if (entity_is_task(se)) + return 0; + + if (!test_and_clear_tg_cfs_propagate(se)) + return 0; + + /* Get parent cfs_rq */ + cfs_rq = cfs_rq_of(se); + + /* Propagate to parent */ + set_tg_cfs_propagate(cfs_rq); + + /* Update utilization */ + update_tg_cfs_util(cfs_rq, se); + + /* Update load */ + update_tg_cfs_load(cfs_rq, se); + + return 1; +} +#else +static inline int propagate_entity_load_avg(struct sched_entity *se) +{ + return 0; +} + +static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {} +#endif + +/* * Unsigned subtract and clamp on underflow. * * Explicitly do a load-store to ensure the intermediate value never hits @@ -3101,6 +3257,7 @@ static inline void update_load_avg(struct sched_entity *se, int flags) u64 now = cfs_rq_clock_task(cfs_rq); struct rq *rq = rq_of(cfs_rq); int cpu = cpu_of(rq); + int decayed;
/* * Track task load average for carrying it to new CPU after migrated, and @@ -3111,7 +3268,11 @@ static inline void update_load_avg(struct sched_entity *se, int flags) se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL);
- if (update_cfs_rq_load_avg(now, cfs_rq, true) && (flags & UPDATE_TG)) + decayed = update_cfs_rq_load_avg(now, cfs_rq, true); + + decayed |= propagate_entity_load_avg(se); + + if (decayed && (flags & UPDATE_TG)) update_tg_load_avg(cfs_rq, 0); }
@@ -3130,6 +3291,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s cfs_rq->avg.load_sum += se->avg.load_sum; cfs_rq->avg.util_avg += se->avg.util_avg; cfs_rq->avg.util_sum += se->avg.util_sum; + set_tg_cfs_propagate(cfs_rq);
cfs_rq_util_change(cfs_rq); } @@ -3149,6 +3311,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum); sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); + set_tg_cfs_propagate(cfs_rq);
cfs_rq_util_change(cfs_rq); } @@ -8509,6 +8672,22 @@ static void detach_task_cfs_rq(struct task_struct *p) update_load_avg(se, 0); detach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false); + +#ifdef CONFIG_FAIR_GROUP_SCHED + /* + * Propagate the detach across the tg tree to make it visible to the + * root + */ + se = se->parent; + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + + if (cfs_rq_throttled(cfs_rq)) + break; + + update_load_avg(se, UPDATE_TG); + } +#endif }
static void attach_entity_cfs_rq(struct sched_entity *se) @@ -8527,6 +8706,22 @@ static void attach_entity_cfs_rq(struct sched_entity *se) update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); attach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false); + +#ifdef CONFIG_FAIR_GROUP_SCHED + /* + * Propagate the attach across the tg tree to make it visible to the + * root + */ + se = se->parent; + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + + if (cfs_rq_throttled(cfs_rq)) + break; + + update_load_avg(se, UPDATE_TG); + } +#endif }
static void attach_task_cfs_rq(struct task_struct *p) @@ -8588,6 +8783,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; #endif #ifdef CONFIG_SMP +#ifdef CONFIG_FAIR_GROUP_SCHED + cfs_rq->propagate_avg = 0; +#endif atomic_long_set(&cfs_rq->removed_load_avg, 0); atomic_long_set(&cfs_rq->removed_util_avg, 0); #endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 483616a..0517a9e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -397,6 +397,7 @@ struct cfs_rq { unsigned long runnable_load_avg; #ifdef CONFIG_FAIR_GROUP_SCHED unsigned long tg_load_avg_contrib; + unsigned long propagate_avg; #endif atomic_long_t removed_load_avg, removed_util_avg; #ifndef CONFIG_64BIT
A task can be asynchronously detached from a cfs_rq when migrating between CPUs. The load of the migrated task is then removed from the source cfs_rq during its next update. We use this event to set the propagation flag.
During load balance, we take advantage of the update of blocked load to propagate any pending changes.
Signed-off-by: Vincent Guittot vincent.guittot@linaro.org --- kernel/sched/fair.c | 6 ++++++ 1 file changed, 6 insertions(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8ba500f..bd3b6b9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3221,6 +3221,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) sub_positive(&sa->load_avg, r); sub_positive(&sa->load_sum, r * LOAD_AVG_MAX); removed_load = 1; + set_tg_cfs_propagate(cfs_rq); }
if (atomic_long_read(&cfs_rq->removed_util_avg)) { @@ -3228,6 +3229,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) sub_positive(&sa->util_avg, r); sub_positive(&sa->util_sum, r * LOAD_AVG_MAX); removed_util = 1; + set_tg_cfs_propagate(cfs_rq); }
decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, @@ -6607,6 +6609,10 @@ static void update_blocked_averages(int cpu)
if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true)) update_tg_load_avg(cfs_rq, 0); + + /* Propagate pending load changes to the parent */ + if (cfs_rq->tg->se[cpu]) + update_load_avg(cfs_rq->tg->se[cpu], 0); } raw_spin_unlock_irqrestore(&rq->lock, flags); }
On 26/09/16 13:19, Vincent Guittot wrote:
A task can be asynchronously detached from a cfs_rq when migrating between CPUs. The load of the migrated task is then removed from the source cfs_rq during its next update. We use this event to set the propagation flag.
During load balance, we take advantage of the update of blocked load to propagate any pending changes.
IMHO, it would be a good idea to mention that '2/7 sched: fix hierarchical order in rq->leaf_cfs_rq_list' is a hard requirement for this to work. The functionality relies on the order of cfs_rq's (top to root) in the rq->leaf_cfs_rq_list list.
Signed-off-by: Vincent Guittot vincent.guittot@linaro.org
kernel/sched/fair.c | 6 ++++++ 1 file changed, 6 insertions(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8ba500f..bd3b6b9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3221,6 +3221,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) sub_positive(&sa->load_avg, r); sub_positive(&sa->load_sum, r * LOAD_AVG_MAX); removed_load = 1;
set_tg_cfs_propagate(cfs_rq); }
if (atomic_long_read(&cfs_rq->removed_util_avg)) { @@ -3228,6 +3229,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) sub_positive(&sa->util_avg, r); sub_positive(&sa->util_sum, r * LOAD_AVG_MAX); removed_util = 1;
set_tg_cfs_propagate(cfs_rq); }
decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, @@ -6607,6 +6609,10 @@ static void update_blocked_averages(int cpu) if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true)) update_tg_load_avg(cfs_rq, 0);
/* Propagate pending load changes to the parent */
if (cfs_rq->tg->se[cpu])
update_load_avg(cfs_rq->tg->se[cpu], 0);
In my test (1 task (run/period: 8ms/16ms) in tg_root->tg_x->tg_y->*tg_z* and oscillating between cpu1 and cpu2) the cfs_rq related signals are nicely going down to 0 after the task has left the cpu but it doesn't seem to be the case for the corresponding se (cfs_rq->tg->se[cpu])?
It should actually work correctly because of the update_tg_cfs_util/load() calls in update_load_avg(cfs_rq->tg->se[cpu], 0)->propagate_entity_load_avg()
[...]
On 12 October 2016 at 17:03, Dietmar Eggemann dietmar.eggemann@arm.com wrote:
On 26/09/16 13:19, Vincent Guittot wrote:
A task can be asynchronously detached from a cfs_rq when migrating between CPUs. The load of the migrated task is then removed from the source cfs_rq during its next update. We use this event to set the propagation flag.
During load balance, we take advantage of the update of blocked load to propagate any pending changes.
IMHO, it would be a good idea to mention that '2/7 sched: fix hierarchical order in rq->leaf_cfs_rq_list' is a hard requirement for this to work. The functionality relies on the order of cfs_rq's (top to root) in the rq->leaf_cfs_rq_list list.
Yes, I will add a comment.
Signed-off-by: Vincent Guittot vincent.guittot@linaro.org
kernel/sched/fair.c | 6 ++++++ 1 file changed, 6 insertions(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8ba500f..bd3b6b9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3221,6 +3221,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) sub_positive(&sa->load_avg, r); sub_positive(&sa->load_sum, r * LOAD_AVG_MAX); removed_load = 1;
set_tg_cfs_propagate(cfs_rq); } if (atomic_long_read(&cfs_rq->removed_util_avg)) {
@@ -3228,6 +3229,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) sub_positive(&sa->util_avg, r); sub_positive(&sa->util_sum, r * LOAD_AVG_MAX); removed_util = 1;
set_tg_cfs_propagate(cfs_rq); } decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
@@ -6607,6 +6609,10 @@ static void update_blocked_averages(int cpu)
if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true)) update_tg_load_avg(cfs_rq, 0);
/* Propagate pending load changes to the parent */
if (cfs_rq->tg->se[cpu])
update_load_avg(cfs_rq->tg->se[cpu], 0);
In my test (1 task (run/period: 8ms/16ms) in tg_root->tg_x->tg_y->*tg_z* and oscillating between cpu1 and cpu2) the cfs_rq related signals are nicely going down to 0 after the task has left the cpu but it doesn't seem to be the case for the corresponding se (cfs_rq->tg->se[cpu])?
Strange, because such a use case is part of the functional tests that I run, and it was working fine according to the last test that I did.
It should actually work correctly because of the update_tg_cfs_util/load() calls in update_load_avg(cfs_rq->tg->se[cpu], 0)->propagate_entity_load_avg()
Furthermore, the update of the parent cfs_rq tg_x->cfs_rq[cpu] uses the delta between previous and new value for the child tg_y->se[cpu]. So it means that if tg_x->cfs_rq[cpu]->avg.load_avg goes down to 0, tg_y->se[cpu]->avg.load_avg has at least changed and most probably goes down to 0 too
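To make that concrete, a minimal illustration of the delta propagation described above (made-up numbers, reusing the add_positive() helper introduced in patch 4):

	/* tg_y's group entity after the task has left: 256 -> 0 */
	long new_load = gcfs_rq->avg.load_avg;		/* e.g. 0 */
	long delta    = new_load - se->avg.load_avg;	/* e.g. 0 - 256 = -256 */

	se->avg.load_avg = new_load;			/* tg_y->se[cpu] follows its cfs_rq */
	add_positive(&cfs_rq->avg.load_avg, delta);	/* tg_x->cfs_rq[cpu] drops by 256 too */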
Can't it be a misplaced tracepoint?
[...]
On 12/10/16 16:45, Vincent Guittot wrote:
On 12 October 2016 at 17:03, Dietmar Eggemann dietmar.eggemann@arm.com wrote:
On 26/09/16 13:19, Vincent Guittot wrote:
[...]
@@ -6607,6 +6609,10 @@ static void update_blocked_averages(int cpu)
if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true)) update_tg_load_avg(cfs_rq, 0);
/* Propagate pending load changes to the parent */
if (cfs_rq->tg->se[cpu])
update_load_avg(cfs_rq->tg->se[cpu], 0);
In my test (1 task (run/period: 8ms/16ms) in tg_root->tg_x->tg_y->*tg_z* and oscillating between cpu1 and cpu2) the cfs_rq related signals are nicely going down to 0 after the task has left the cpu but it doesn't seem to be the case for the corresponding se (cfs_rq->tg->se[cpu])?
strange because such use case is part of the functional tests that I run and it was working fine according to last test that I did
It should actually work correctly because of the update_tg_cfs_util/load() calls in update_load_avg(cfs_rq->tg->se[cpu], 0)->propagate_entity_load_avg()
Furthermore, the update of the parent cfs_rq tg_x->cfs_rq[cpu] uses the delta between previous and new value for the child tg_y->se[cpu]. So it means that if tg_x->cfs_rq[cpu]->avg.load_avg goes down to 0, tg_y->se[cpu]->avg.load_avg has at least changed and most probably goes down to 0 too
Makes sense.
Can't it be a misplaced trace point ?
Yes, you're right, it was a missing tracepoint. I only had se and cfs_rq pelt tracepoints in __update_load_avg() and attach/detach_entity_load_avg(). I've added them as well to propagate_entity_load_avg() after the update_tg_cfs_load() call and now it makes sense. Thanks!
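For anyone who wants to reproduce this, a minimal sketch of such a debug hook; trace_printk() is used purely for illustration, as the actual pelt tracepoints mentioned above are out-of-tree:

	/* tail of propagate_entity_load_avg(), with a debug print added */
	/* Update load */
	update_tg_cfs_load(cfs_rq, se);

	/* Debug only: dump the freshly propagated group entity signals. */
	trace_printk("se load_avg=%lu util_avg=%lu\n",
		     se->avg.load_avg, se->avg.util_avg);

	return 1;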
[...]
Task moves are now propagated down to the root, and the utilization of the cfs_rq reflects reality, so it no longer needs to be estimated at init.
Signed-off-by: Vincent Guittot vincent.guittot@linaro.org --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bd3b6b9..7acb77fb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8903,7 +8903,7 @@ void online_fair_sched_group(struct task_group *tg) se = tg->se[i];
raw_spin_lock_irq(&rq->lock); - post_init_entity_util_avg(se); + attach_entity_cfs_rq(se); sync_throttle(tg, i); raw_spin_unlock_irq(&rq->lock); }
When a task switches to the fair scheduling class, the period between now and the last update of its utilization is accounted as running time, regardless of what actually happened during this period. This wrong accounting applies to the task and also to the task group branch.
When changing the property of a running task, like its list of allowed CPUs or its scheduling class, we follow the sequence:
-dequeue task
-put task
-change the property
-set task as current task
-enqueue task
The end of this sequence doesn't follow the normal order, which is:
-enqueue the task
-then set the task as current task
This wrong ordering is the root cause of the wrong utilization accounting. Update the sequence to follow the right one (sketched in code just below the list):
-dequeue task
-put task
-change the property
-enqueue task
-set task as current task
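In code form, every affected call site in the patch below ends up following this shape (a sketch only; the concrete queue/dequeue flags differ per site):

	queued  = task_on_rq_queued(p);
	running = task_current(rq, p);

	if (queued)
		dequeue_task(rq, p, DEQUEUE_SAVE);
	if (running)
		put_prev_task(rq, p);

	/* change the property: affinity, priority, class, group, ... */

	if (queued)
		enqueue_task(rq, p, ENQUEUE_RESTORE);
	if (running)
		p->sched_class->set_curr_task(rq);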
Signed-off-by: Vincent Guittot vincent.guittot@linaro.org --- kernel/sched/core.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3e52d08..7a9c9b9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1105,10 +1105,10 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
p->sched_class->set_cpus_allowed(p, new_mask);
- if (running) - p->sched_class->set_curr_task(rq); if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE); + if (running) + p->sched_class->set_curr_task(rq); }
/* @@ -3687,10 +3687,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
p->prio = prio;
- if (running) - p->sched_class->set_curr_task(rq); if (queued) enqueue_task(rq, p, queue_flag); + if (running) + p->sched_class->set_curr_task(rq);
check_class_changed(rq, p, prev_class, oldprio); out_unlock: @@ -4243,8 +4243,6 @@ static int __sched_setscheduler(struct task_struct *p, prev_class = p->sched_class; __setscheduler(rq, p, attr, pi);
- if (running) - p->sched_class->set_curr_task(rq); if (queued) { /* * We enqueue to tail when the priority of a task is @@ -4255,6 +4253,8 @@ static int __sched_setscheduler(struct task_struct *p,
enqueue_task(rq, p, queue_flags); } + if (running) + p->sched_class->set_curr_task(rq);
check_class_changed(rq, p, prev_class, oldprio); preempt_disable(); /* avoid rq from going away on us */ @@ -5417,10 +5417,10 @@ void sched_setnuma(struct task_struct *p, int nid)
p->numa_preferred_nid = nid;
- if (running) - p->sched_class->set_curr_task(rq); if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE); + if (running) + p->sched_class->set_curr_task(rq); task_rq_unlock(rq, p, &rf); } #endif /* CONFIG_NUMA_BALANCING */ @@ -7868,10 +7868,10 @@ void sched_move_task(struct task_struct *tsk)
sched_change_group(tsk, TASK_MOVE_GROUP);
- if (unlikely(running)) - tsk->sched_class->set_curr_task(rq); if (queued) enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); + if (unlikely(running)) + tsk->sched_class->set_curr_task(rq);
task_rq_unlock(rq, tsk, &rf); }