The CFS scheduler uses rq::cfs_tasks as a list to track the tasks enqueued on a runqueue; during load balance the scheduler iterates this list to migrate tasks. Tasks are added to the list in arbitrary order, so task migration ends up being effectively random, without considering task load or utilization.
This patch introduces an extra rb tree in struct rq and uses it to order tasks by their load or utilization metrics. With this rb tree the scheduler can easily find the task with the biggest load, which makes it straightforward to distinguish heavily loaded tasks.
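As an illustration (not part of this patch): since task_sequence_before() links larger keys to the left, rb_first() returns the entity with the biggest key. A consumer on the load-balance path could therefore look like the sketch below; the helper name heaviest_task() is made up for this example:

	/*
	 * Illustrative sketch only: return the task with the biggest
	 * load/util key on this rq, or NULL if the tree is empty.
	 * Larger keys sort leftwards, so rb_first() is the maximum and
	 * an rb_first()/rb_next() walk visits tasks in descending order.
	 */
	static struct task_struct *heaviest_task(struct rq *rq)
	{
		struct rb_node *node = rb_first(&rq->seq_node);

		if (!node)
			return NULL;

		return task_of(rb_entry(node, struct sched_entity, seq_node));
	}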
Traditional SMP load balancing should consider smp nice, so this patch introduces the flag sysctl_sched_task_seq_use_load, which can be set from procfs. If sysctl_sched_task_seq_use_load = 1, the rb tree uses the load metric as its key value; otherwise it uses the util_avg metric as the key value, which can be used by energy aware scheduling to filter out high utilization tasks.
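For example, assuming procfs is mounted at /proc, the key can be switched from util_avg to load at runtime with:

	# echo 1 > /proc/sys/kernel/sched_task_seq_use_load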
Signed-off-by: Leo Yan <leo.yan@linaro.org>
---
 include/linux/sched.h        |  1 +
 include/linux/sched/sysctl.h |  1 +
 kernel/sched/fair.c          | 66 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h         |  3 ++
 kernel/sysctl.c              |  7 +++++
 5 files changed, 78 insertions(+)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ede29e8..6e9a2f4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1323,6 +1323,7 @@ struct ravg {
 struct sched_entity {
 	struct load_weight	load;		/* for load-balancing */
 	struct rb_node		run_node;
+	struct rb_node		seq_node;
 	struct list_head	group_node;
 	unsigned int		on_rq;
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index d68e88c..3d28019 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -43,6 +43,7 @@ extern unsigned int sysctl_sched_is_big_little;
 extern unsigned int sysctl_sched_sync_hint_enable;
 extern unsigned int sysctl_sched_initial_task_util;
 extern unsigned int sysctl_sched_cstate_aware;
+extern unsigned int sysctl_sched_task_seq_use_load;
 #ifdef CONFIG_SCHED_WALT
 extern unsigned int sysctl_sched_use_walt_cpu_util;
 extern unsigned int sysctl_sched_use_walt_task_util;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3dfaf21..f2148d9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -57,6 +57,7 @@ unsigned int sysctl_sched_is_big_little = 0;
 unsigned int sysctl_sched_sync_hint_enable = 1;
 unsigned int sysctl_sched_initial_task_util = 0;
 unsigned int sysctl_sched_cstate_aware = 1;
+unsigned int sysctl_sched_task_seq_use_load = 0;
 
 #ifdef CONFIG_SCHED_WALT
 unsigned int sysctl_sched_use_walt_cpu_util = 1;
@@ -2357,6 +2358,60 @@ static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
+/*
+ * Enqueue/dequeue a task into the rb tree based on its load or util
+ * metrics, so the biggest task can easily be fetched from the RQ.
+ */
+static inline int task_sequence_before(struct sched_entity *a,
+				       struct sched_entity *b)
+{
+	s64 seq;
+
+	if (sysctl_sched_task_seq_use_load)
+		seq = (s64)(task_h_load(task_of(a)) - task_h_load(task_of(b)));
+	else
+		seq = (s64)(a->avg.util_avg - b->avg.util_avg);
+
+	return seq > 0;
+}
+
+static void __enqueue_sequence(struct rq *rq, struct sched_entity *se)
+{
+	struct rb_node **link = &rq->seq_node.rb_node;
+	struct rb_node *parent = NULL;
+	struct sched_entity *entry;
+
+	/*
+	 * Find the right place in the rbtree:
+	 */
+	while (*link) {
+		parent = *link;
+		entry = rb_entry(parent, struct sched_entity, seq_node);
+		/*
+		 * We don't care about collisions. Nodes with
+		 * the same key stay together.
+		 */
+		if (task_sequence_before(se, entry))
+			link = &parent->rb_left;
+		else
+			link = &parent->rb_right;
+	}
+
+	rb_link_node(&se->seq_node, parent, link);
+	rb_insert_color(&se->seq_node, &rq->seq_node);
+}
+
+static void __dequeue_sequence(struct rq *rq, struct sched_entity *se)
+{
+	rb_erase(&se->seq_node, &rq->seq_node);
+}
+
+static void __update_sequence(struct rq *rq, struct sched_entity *se)
+{
+	__dequeue_sequence(rq, se);
+	__enqueue_sequence(rq, se);
+}
+
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
@@ -2369,6 +2424,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		account_numa_enqueue(rq, task_of(se));
 		list_add(&se->group_node, &rq->cfs_tasks);
+		__enqueue_sequence(rq, se);
 	}
 #endif
 	cfs_rq->nr_running++;
 }
@@ -2383,6 +2439,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	if (entity_is_task(se)) {
 		account_numa_dequeue(rq_of(cfs_rq), task_of(se));
 		list_del_init(&se->group_node);
+		__dequeue_sequence(rq_of(cfs_rq), se);
 	}
 	cfs_rq->nr_running--;
 }
@@ -3246,6 +3303,9 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		update_stats_wait_end(cfs_rq, se);
 		__dequeue_entity(cfs_rq, se);
 		update_load_avg(se, 1);
+
+		if (entity_is_task(se))
+			__update_sequence(rq_of(cfs_rq), se);
 	}
 
 	update_stats_curr_start(cfs_rq, se);
@@ -3346,6 +3406,9 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 		__enqueue_entity(cfs_rq, prev);
 		/* in !on_rq case, update occurred at dequeue */
 		update_load_avg(prev, 0);
+
+		if (entity_is_task(prev))
+			__update_sequence(rq_of(cfs_rq), prev);
 	}
 	cfs_rq->curr = NULL;
 }
@@ -3364,6 +3427,9 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	update_load_avg(curr, 1);
 	update_cfs_shares(cfs_rq);
 
+	if (entity_is_task(curr))
+		__update_sequence(rq_of(cfs_rq), curr);
+
 #ifdef CONFIG_SCHED_HRTICK
 	/*
 	 * queued ticks are scheduled to match the slice, so don't bother
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 2f2b959..7284a45 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -606,6 +606,9 @@ struct rq {
 	seqcount_t ave_seqcnt;
 #endif
 
+	/* track highest load task */
+	struct rb_root seq_node;
+
 	/* capture load from *all* tasks on this cpu: */
 	struct load_weight load;
 	unsigned long nr_load_updates;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d964422..14a3cfe 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -311,6 +311,13 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+	{
+		.procname	= "sched_task_seq_use_load",
+		.data		= &sysctl_sched_task_seq_use_load,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 #ifdef CONFIG_SCHED_WALT
 	{
 		.procname	= "sched_use_walt_cpu_util",
-- 
1.9.1