The CFS scheduler uses rq->cfs_tasks to track every task that has been enqueued on a runqueue, and this list is what gets iterated when walking the source rq's tasks during load balancing. Tasks are inserted at the head of the list in wakeup order, so the list carries no information about task utilization.
This patch introduces a per-rq rb tree that tracks all runnable and running tasks on the rq, keyed by utilization, with the highest-utilization task as the leftmost node. This makes it easy to know which task has the highest utilization on an rq.
Signed-off-by: Leo Yan <leo.yan@linaro.org>
---
 include/linux/sched.h |  1 +
 kernel/sched/fair.c   | 95 +++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h  |  4 +++
 3 files changed, 100 insertions(+)
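[ Note for reviewers, not part of the patch: the new helpers are meant to
let a consumer walk tasks in descending utilization order. As a rough
sketch only (find_biggest_task and its calling context are hypothetical,
and locking plus the usual can_migrate_task() checks are omitted), a
load-balance path could pick the biggest migratable task on a busy rq
like this:

	static struct task_struct *find_biggest_task(struct rq *busiest)
	{
		/* start from the leftmost node, i.e. the highest util_avg */
		struct sched_entity *se = __pick_first_queue_util(busiest);

		while (se) {
			struct task_struct *p = task_of(se);

			/* the currently running task cannot be detached */
			if (p != busiest->curr)
				return p;

			/* walk to the next-highest utilization task */
			se = __pick_next_queue_util(se);
		}

		return NULL;
	}
]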
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ede29e8..4fa171d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1323,6 +1323,7 @@ struct ravg {
 struct sched_entity {
 	struct load_weight	load;		/* for load-balancing */
 	struct rb_node		run_node;
+	struct rb_node		util_node;
 	struct list_head	group_node;
 	unsigned int		on_rq;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 713b031..7183000 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -605,6 +605,90 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
 #endif
 /*
+ * Enqueue/dequeue task into rb-tree based on task util metrics,
+ * so that the biggest task on the RQ can be fetched easily.
+ */
+static inline int task_util_before(struct sched_entity *a,
+				   struct sched_entity *b)
+{
+	return (s64)(a->avg.util_avg - b->avg.util_avg) > 0;
+}
+
+static void __enqueue_util(struct rq *rq, struct sched_entity *se)
+{
+	struct rb_node **link = &rq->util_node.rb_node;
+	struct rb_node *parent = NULL;
+	struct sched_entity *entry;
+	int leftmost = 1;
+
+	/*
+	 * Find the right place in the rbtree:
+	 */
+	while (*link) {
+		parent = *link;
+		entry = rb_entry(parent, struct sched_entity, util_node);
+		/*
+		 * We don't care about collisions. Nodes with
+		 * the same key stay together.
+		 */
+		if (task_util_before(se, entry)) {
+			link = &parent->rb_left;
+		} else {
+			link = &parent->rb_right;
+			leftmost = 0;
+		}
+	}
+
+	/*
+	 * Maintain a cache of leftmost tree entries (it is frequently
+	 * used):
+	 */
+	if (leftmost)
+		rq->util_node_leftmost = &se->util_node;
+
+	rb_link_node(&se->util_node, parent, link);
+	rb_insert_color(&se->util_node, &rq->util_node);
+}
+
+static void __dequeue_util(struct rq *rq, struct sched_entity *se)
+{
+	if (rq->util_node_leftmost == &se->util_node) {
+		struct rb_node *next_node;
+
+		next_node = rb_next(&se->util_node);
+		rq->util_node_leftmost = next_node;
+	}
+
+	rb_erase(&se->util_node, &rq->util_node);
+}
+
+static void __update_queue_util(struct rq *rq, struct sched_entity *se)
+{
+	__dequeue_util(rq, se);
+	__enqueue_util(rq, se);
+}
+
+struct sched_entity *__pick_first_queue_util(struct rq *rq)
+{
+	struct rb_node *left = rq->util_node_leftmost;
+
+	if (!left)
+		return NULL;
+
+	return rb_entry(left, struct sched_entity, util_node);
+}
+
+static struct sched_entity *__pick_next_queue_util(struct sched_entity *se)
+{
+	struct rb_node *next = rb_next(&se->util_node);
+
+	if (!next)
+		return NULL;
+
+	return rb_entry(next, struct sched_entity, util_node);
+}
+
+/*
  * delta /= w
  */
 static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
@@ -2369,6 +2453,7 @@ account_entity_enqueue(cfs_rq, se)
 		account_numa_enqueue(rq, task_of(se));
 		list_add(&se->group_node, &rq->cfs_tasks);
+		__enqueue_util(rq, se);
 	}
 #endif
 	cfs_rq->nr_running++;
@@ -2383,6 +2468,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	if (entity_is_task(se)) {
 		account_numa_dequeue(rq_of(cfs_rq), task_of(se));
 		list_del_init(&se->group_node);
+		__dequeue_util(rq_of(cfs_rq), se);
 	}
 	cfs_rq->nr_running--;
 }
@@ -3246,6 +3332,9 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		update_stats_wait_end(cfs_rq, se);
 		__dequeue_entity(cfs_rq, se);
 		update_load_avg(se, 1);
+
+		if (entity_is_task(se))
+			__update_queue_util(rq_of(cfs_rq), se);
 	}
 	update_stats_curr_start(cfs_rq, se);
@@ -3346,6 +3435,9 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 		__enqueue_entity(cfs_rq, prev);
 		/* in !on_rq case, update occurred at dequeue */
 		update_load_avg(prev, 0);
+
+		if (entity_is_task(prev))
+			__update_queue_util(rq_of(cfs_rq), prev);
 	}
 	cfs_rq->curr = NULL;
 }
@@ -3364,6 +3456,9 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	update_load_avg(curr, 1);
 	update_cfs_shares(cfs_rq);
+	if (entity_is_task(curr))
+		__update_queue_util(rq_of(cfs_rq), curr);
+
 #ifdef CONFIG_SCHED_HRTICK
 	/*
 	 * queued ticks are scheduled to match the slice, so don't bother
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 2f2b959..b77c17a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -606,6 +606,10 @@ struct rq {
 	seqcount_t ave_seqcnt;
 #endif
+	/* track the highest utilization tasks */
+	struct rb_root util_node;
+	struct rb_node *util_node_leftmost;
+
 	/* capture load from *all* tasks on this cpu: */
 	struct load_weight load;
 	unsigned long nr_load_updates;
--
1.9.1