This approach aims to retain the current behavior of the load balancer, with the only change being the metric consumed during load balancing, without unnecessary introduction of new variables. This RFC has been posted to evaluate its design; to see if this is the right way to go about introducing the per-entity-load-tracking metric for its consumers — in this specific case, the load balancer. Once the design has been approved, I can go ahead and test it.
The patch has been based on tip-master HEAD at commit 8a1d31c703d, Subject: Merge branch 'x86/urgent'
Grateful to Peter Zijlstra and Ingo Molnar for their valuable feedback on v1 of the RFC which was the foundation for this version.
PATCH[1/2] Aims at enabling usage of Per-Entity-Load-Tracking for load balancing PATCH[2/2] The crux of the patchset lies here. ---
Preeti U Murthy (2): sched: Revert temporary FAIR_GROUP_SCHED dependency for load-tracking sched: Use Per-Entity-Load-Tracking metric for load balancing
include/linux/sched.h | 9 +----- kernel/sched/core.c | 19 +++++------- kernel/sched/fair.c | 76 +++++++++++++++++++++---------------------------- kernel/sched/sched.h | 9 +----- 4 files changed, 43 insertions(+), 70 deletions(-)
Now that we need the per-entity load tracking for load balancing, trivially revert the patch which introduced the FAIR_GROUP_SCHED dependence for load tracking.
Signed-off-by: Preeti U Murthy <preeti@linux.vnet.ibm.com> --- include/linux/sched.h | 7 +------ kernel/sched/core.c | 7 +------ kernel/sched/fair.c | 12 +----------- kernel/sched/sched.h | 7 ------- 4 files changed, 3 insertions(+), 30 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h index 03be150..087dd20 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1169,12 +1169,7 @@ struct sched_entity { /* rq "owned" by this entity/group: */ struct cfs_rq *my_q; #endif -/* - * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be - * removed when useful for applications beyond shares distribution (e.g. - * load-balance). - */ -#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) +#if defined(CONFIG_SMP) /* Per-entity load-tracking */ struct sched_avg avg; #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c2e077c..24d8b9b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1526,12 +1526,7 @@ static void __sched_fork(struct task_struct *p) p->se.vruntime = 0; INIT_LIST_HEAD(&p->se.group_node);
-/* - * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be - * removed when useful for applications beyond shares distribution (e.g. - * load-balance). - */ -#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) +#if defined(CONFIG_SMP) p->se.avg.runnable_avg_period = 0; p->se.avg.runnable_avg_sum = 0; #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2cebc81..a9cdc8f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1149,8 +1149,7 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq) } #endif /* CONFIG_FAIR_GROUP_SCHED */
-/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */ -#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) +#if defined(CONFIG_SMP) /* * We choose a half-life close to 1 scheduling period. * Note: The tables below are dependent on this value. @@ -3503,12 +3502,6 @@ unlock: }
/* - * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be - * removed when useful for applications beyond shares distribution (e.g. - * load-balance). - */ -#ifdef CONFIG_FAIR_GROUP_SCHED -/* * Called immediately before a task is migrated to a new cpu; task_cpu(p) and * cfs_rq_of(p) references at time of call are still valid and identify the * previous cpu. However, the caller only guarantees p->pi_lock is held; no @@ -3531,7 +3524,6 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu) atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load); } } -#endif #endif /* CONFIG_SMP */
static unsigned long @@ -6416,9 +6408,7 @@ const struct sched_class fair_sched_class = {
#ifdef CONFIG_SMP .select_task_rq = select_task_rq_fair, -#ifdef CONFIG_FAIR_GROUP_SCHED .migrate_task_rq = migrate_task_rq_fair, -#endif .rq_online = rq_online_fair, .rq_offline = rq_offline_fair,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 508e77e..bfd004a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -226,12 +226,6 @@ struct cfs_rq { #endif
#ifdef CONFIG_SMP -/* - * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be - * removed when useful for applications beyond shares distribution (e.g. - * load-balance). - */ -#ifdef CONFIG_FAIR_GROUP_SCHED /* * CFS Load tracking * Under CFS, load is tracked on a per-entity basis and aggregated up. @@ -241,7 +235,6 @@ struct cfs_rq { u64 runnable_load_avg, blocked_load_avg; atomic64_t decay_counter, removed_load; u64 last_decay; -#endif /* CONFIG_FAIR_GROUP_SCHED */ /* These always depend on CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_FAIR_GROUP_SCHED u32 tg_runnable_contrib;
Currently the load balancer weighs a task based upon its priority, and this weight consequently gets added up to the weight of the run queue that it is on. It is this weight of the runqueue that sums up to a sched group's load, which is used to decide the busiest or the idlest group and the runqueue thereof.
The Per-Entity-Load-Tracking metric however measures how long a task has been runnable over the duration of its lifetime. This gives us a hint of the amount of CPU time that the task can demand. This metric takes care of the task priority as well. Therefore, apart from the priority of a task, we also have an idea of the live behavior of the task. This seems to be a more realistic metric to use to compute task weight, which adds up to the run queue weight and the weight of the sched group. Consequently they can be used for load balancing.
The semantics of load balancing are left untouched. The two functions load_balance() and select_task_rq_fair() perform the task of load balancing. These two paths have been browsed through in this patch to make the necessary changes.
weighted_cpuload() and task_h_load() provide the run queue weight and the weight of the task respectively. They have been modified to provide the Per-Entity-Load-Tracking metric as relevant for each. The rest of the modifications had to be made to suit these two changes.
The Completely Fair Scheduler class is the only sched_class which contributes to the run queue load. Therefore rq->load.weight == cfs_rq->load.weight when the cfs_rq is the root cfs_rq (rq->cfs) of the hierarchy. When replacing this with the Per-Entity-Load-Tracking metric, cfs_rq->runnable_load_avg needs to be used, as this is the right reflection of the run queue load when the cfs_rq is the root cfs_rq (rq->cfs) of the hierarchy. This metric reflects the percentage uptime of the tasks that are queued on it and hence that contribute to the load. Thus cfs_rq->runnable_load_avg replaces the metric earlier used in weighted_cpuload().
The task load is aptly captured by se.avg.load_avg_contrib, which captures the runnable time vs the alive time of the task against its priority. This metric replaces the earlier metric used in task_h_load().
The consequent changes appear as data type changes for the helper variables; they abound in number, because cfs_rq->runnable_load_avg needs to be big enough to capture the tasks' load often and accurately.
The following patch does not consider CONFIG_FAIR_GROUP_SCHED and CONFIG_SCHED_NUMA. This is done so as to evaluate this approach starting from the simplest scenario. Earlier discussions can be found in the link below.
Link: https://lkml.org/lkml/2012/10/25/162 Signed-off-by: Preeti U Murthy <preeti@linux.vnet.ibm.com> --- include/linux/sched.h | 2 +- kernel/sched/core.c | 12 +++++---- kernel/sched/fair.c | 64 +++++++++++++++++++++++++------------------------ kernel/sched/sched.h | 2 +- 4 files changed, 40 insertions(+), 40 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h index 087dd20..302756e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -924,7 +924,7 @@ struct sched_domain { unsigned int lb_count[CPU_MAX_IDLE_TYPES]; unsigned int lb_failed[CPU_MAX_IDLE_TYPES]; unsigned int lb_balanced[CPU_MAX_IDLE_TYPES]; - unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES]; + u64 lb_imbalance[CPU_MAX_IDLE_TYPES]; unsigned int lb_gained[CPU_MAX_IDLE_TYPES]; unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES]; unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES]; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 24d8b9b..4dea057 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2415,8 +2415,8 @@ static const unsigned char * would be when CPU is idle and so we just decay the old load without * adding any new load. */ -static unsigned long -decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) +static u64 +decay_load_missed(u64 load, unsigned long missed_updates, int idx) { int j = 0;
@@ -2444,7 +2444,7 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) * scheduler tick (TICK_NSEC). With tickless idle this will not be called * every tick. We fix it up based on jiffies. */ -static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, +static void __update_cpu_load(struct rq *this_rq, u64 this_load, unsigned long pending_updates) { int i, scale; @@ -2454,7 +2454,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, /* Update our load: */ this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { - unsigned long old_load, new_load; + u64 old_load, new_load;
/* scale is effectively 1 << i now, and >> i divides by scale */
@@ -2496,7 +2496,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, void update_idle_cpu_load(struct rq *this_rq) { unsigned long curr_jiffies = ACCESS_ONCE(jiffies); - unsigned long load = this_rq->load.weight; + u64 load = this_rq->cfs.runnable_load_avg; unsigned long pending_updates;
/* @@ -2546,7 +2546,7 @@ static void update_cpu_load_active(struct rq *this_rq) * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). */ this_rq->last_load_update_tick = jiffies; - __update_cpu_load(this_rq, this_rq->load.weight, 1); + __update_cpu_load(this_rq, this_rq->cfs.runnable_load_avg, 1);
calc_load_account_active(this_rq); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a9cdc8f..f8f3a29 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2935,9 +2935,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
#ifdef CONFIG_SMP /* Used instead of source_load when we know the type == 0 */ -static unsigned long weighted_cpuload(const int cpu) +static u64 weighted_cpuload(const int cpu) { - return cpu_rq(cpu)->load.weight; + return cpu_rq(cpu)->cfs.runnable_load_avg; }
/* @@ -2947,10 +2947,10 @@ static unsigned long weighted_cpuload(const int cpu) * We want to under-estimate the load of migration sources, to * balance conservatively. */ -static unsigned long source_load(int cpu, int type) +static u64 source_load(int cpu, int type) { struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); + u64 total = weighted_cpuload(cpu);
if (type == 0 || !sched_feat(LB_BIAS)) return total; @@ -2962,10 +2962,10 @@ static unsigned long source_load(int cpu, int type) * Return a high guess at the load of a migration-target cpu weighted * according to the scheduling class and "nice" value. */ -static unsigned long target_load(int cpu, int type) +static u64 target_load(int cpu, int type) { struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); + u64 total = weighted_cpuload(cpu);
if (type == 0 || !sched_feat(LB_BIAS)) return total; @@ -2978,13 +2978,13 @@ static unsigned long power_of(int cpu) return cpu_rq(cpu)->cpu_power; }
-static unsigned long cpu_avg_load_per_task(int cpu) +static u64 cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
if (nr_running) - return rq->load.weight / nr_running; + return rq->cfs.runnable_load_avg / nr_running;
return 0; } @@ -3131,7 +3131,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) { s64 this_load, load; int idx, this_cpu, prev_cpu; - unsigned long tl_per_task; + u64 tl_per_task; struct task_group *tg; unsigned long weight; int balanced; @@ -3149,14 +3149,14 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) */ if (sync) { tg = task_group(current); - weight = current->se.load.weight; + weight = current->se.avg.load_avg_contrib;
this_load += effective_load(tg, this_cpu, -weight, -weight); load += effective_load(tg, prev_cpu, 0, -weight); }
tg = task_group(p); - weight = p->se.load.weight; + weight = p->se.avg.load_avg_contrib;
/* * In low-load situations, where prev_cpu is idle and this_cpu is idle @@ -3219,11 +3219,11 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu, int load_idx) { struct sched_group *idlest = NULL, *group = sd->groups; - unsigned long min_load = ULONG_MAX, this_load = 0; + u64 min_load = ~0ULL, this_load = 0; int imbalance = 100 + (sd->imbalance_pct-100)/2;
do { - unsigned long load, avg_load; + u64 load, avg_load; int local_group; int i;
@@ -3270,7 +3270,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, static int find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) { - unsigned long load, min_load = ULONG_MAX; + u64 load, min_load = ~0ULL; int idlest = -1; int i;
@@ -3911,7 +3911,7 @@ struct lb_env { struct cpumask *dst_grpmask; int new_dst_cpu; enum cpu_idle_type idle; - long imbalance; + long long imbalance; /* The set of CPUs under consideration for load-balancing */ struct cpumask *cpus;
@@ -4314,7 +4314,7 @@ static inline void update_h_load(long cpu) #ifdef CONFIG_SMP static unsigned long task_h_load(struct task_struct *p) { - return p->se.load.weight; + return p->se.avg.load_avg_contrib; } #endif #endif @@ -4327,21 +4327,21 @@ static unsigned long task_h_load(struct task_struct *p) struct sd_lb_stats { struct sched_group *busiest; /* Busiest group in this sd */ struct sched_group *this; /* Local group in this sd */ - unsigned long total_load; /* Total load of all groups in sd */ + u64 total_load; /* Total load of all groups in sd */ unsigned long total_pwr; /* Total power of all groups in sd */ - unsigned long avg_load; /* Average load across all groups in sd */ + u64 avg_load; /* Average load across all groups in sd */
/** Statistics of this group */ - unsigned long this_load; - unsigned long this_load_per_task; + u64 this_load; + u64 this_load_per_task; unsigned long this_nr_running; unsigned long this_has_capacity; unsigned int this_idle_cpus;
/* Statistics of the busiest group */ unsigned int busiest_idle_cpus; - unsigned long max_load; - unsigned long busiest_load_per_task; + u64 max_load; + u64 busiest_load_per_task; unsigned long busiest_nr_running; unsigned long busiest_group_capacity; unsigned long busiest_has_capacity; @@ -4363,9 +4363,9 @@ struct sd_lb_stats { */ struct sg_lb_stats { unsigned long avg_load; /*Avg load across the CPUs of the group */ - unsigned long group_load; /* Total load over the CPUs of the group */ + u64 group_load; /* Total load over the CPUs of the group */ unsigned long sum_nr_running; /* Nr tasks running in the group */ - unsigned long sum_weighted_load; /* Weighted load of group's tasks */ + u64 sum_weighted_load; /* Weighted load of group's tasks */ unsigned long group_capacity; unsigned long idle_cpus; unsigned long group_weight; @@ -4688,9 +4688,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, int local_group, int *balance, struct sg_lb_stats *sgs) { unsigned long nr_running, max_nr_running, min_nr_running; - unsigned long load, max_cpu_load, min_cpu_load; + u64 load, max_cpu_load, min_cpu_load; unsigned int balance_cpu = -1, first_idle_cpu = 0; - unsigned long avg_load_per_task = 0; + u64 avg_load_per_task = 0; int i;
if (local_group) @@ -4698,7 +4698,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
/* Tally up the load of all CPUs in the group */ max_cpu_load = 0; - min_cpu_load = ~0UL; + min_cpu_load = ~0ULL; max_nr_running = 0; min_nr_running = ~0UL;
@@ -4948,9 +4948,9 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) static inline void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) { - unsigned long tmp, pwr_now = 0, pwr_move = 0; + u64 tmp, pwr_now = 0, pwr_move = 0; unsigned int imbn = 2; - unsigned long scaled_busy_load_per_task; + u64 scaled_busy_load_per_task;
if (sds->this_nr_running) { sds->this_load_per_task /= sds->this_nr_running; @@ -5016,7 +5016,7 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) */ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds) { - unsigned long max_pull, load_above_capacity = ~0UL; + u64 max_pull, load_above_capacity = ~0ULL;
sds->busiest_load_per_task /= sds->busiest_nr_running; if (sds->group_imb) { @@ -5192,14 +5192,14 @@ static struct rq *find_busiest_queue(struct lb_env *env, struct sched_group *group) { struct rq *busiest = NULL, *rq; - unsigned long max_load = 0; + u64 max_load = 0; int i;
for_each_cpu(i, sched_group_cpus(group)) { unsigned long power = power_of(i); unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); - unsigned long wl; + u64 wl;
if (!capacity) capacity = fix_small_capacity(env->sd, group); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index bfd004a..cff1926 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -362,7 +362,7 @@ struct rq { */ unsigned int nr_running; #define CPU_LOAD_IDX_MAX 5 - unsigned long cpu_load[CPU_LOAD_IDX_MAX]; + u64 cpu_load[CPU_LOAD_IDX_MAX]; unsigned long last_load_update_tick; #ifdef CONFIG_NO_HZ u64 nohz_stamp;
Hi Preeti,
On 15 November 2012 17:54, Preeti U Murthy preeti@linux.vnet.ibm.com wrote:
Currently the load balancer weighs a task based upon its priority,and this weight consequently gets added up to the weight of the run queue that it is on.It is this weight of the runqueue that sums up to a sched group's load which is used to decide the busiest or the idlest group and the runqueue thereof.
The Per-Entity-Load-Tracking metric however measures how long a task has been runnable over the duration of its lifetime.This gives us a hint of the amount of CPU time that the task can demand.This metric takes care of the task priority as well.Therefore apart from the priority of a task we also have an idea of the live behavior of the task.This seems to be a more realistic metric to use to compute task weight which adds upto the run queue weight and the weight of the sched group.Consequently they can be used for load balancing.
The semantics of load balancing is left untouched.The two functions load_balance() and select_task_rq_fair() perform the task of load balancing.These two paths have been browsed through in this patch to make necessary changes.
weighted_cpuload() and task_h_load() provide the run queue weight and the weight of the task respectively.They have been modified to provide the Per-Entity-Load-Tracking metric as relevant for each. The rest of the modifications had to be made to suit these two changes.
Completely Fair Scheduler class is the only sched_class which contributes to the run queue load.Therefore the rq->load.weight==cfs_rq->load.weight when the cfs_rq is the root cfs_rq (rq->cfs) of the hierarchy.When replacing this with Per-Entity-Load-Tracking metric,cfs_rq->runnable_load_avg needs to be used as this is the right reflection of the run queue load when the cfs_rq is the root cfs_rq (rq->cfs) of the hierarchy.This metric reflects the percentage uptime of the tasks that are queued on it and hence that contribute to the load.Thus cfs_rq->runnable_load_avg replaces the metric earlier used in weighted_cpuload().
The task load is aptly captured by se.avg.load_avg_contrib which captures the runnable time vs the alive time of the task against its priority.This metric replaces the earlier metric used in task_h_load().
The consequent changes appear as data type changes for the helper variables; they abound in number.Because cfs_rq->runnable_load_avg needs to be big enough to capture the tasks' load often and accurately.
You are now using cfs_rq->runnable_load_avg instead of cfs_rq->load.weight for the calculation of cpu_load, but cfs_rq->runnable_load_avg is smaller than or equal to the cfs_rq->load.weight value. This implies that the new value is smaller than or equal to the old statistic, so you should be able to keep the same variable width for the computation of cpu_load.
The following patch does not consider CONFIG_FAIR_GROUP_SCHED AND CONFIG_SCHED_NUMA.This is done so as to evaluate this approach starting from the simplest scenario.Earlier discussions can be found in the link below.
Link: https://lkml.org/lkml/2012/10/25/162 Signed-off-by: Preeti U Murthy <preeti@linux.vnet.ibm.com>
include/linux/sched.h | 2 +- kernel/sched/core.c | 12 +++++---- kernel/sched/fair.c | 64 +++++++++++++++++++++++++------------------------ kernel/sched/sched.h | 2 +- 4 files changed, 40 insertions(+), 40 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h index 087dd20..302756e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -924,7 +924,7 @@ struct sched_domain { unsigned int lb_count[CPU_MAX_IDLE_TYPES]; unsigned int lb_failed[CPU_MAX_IDLE_TYPES]; unsigned int lb_balanced[CPU_MAX_IDLE_TYPES];
unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES];
u64 lb_imbalance[CPU_MAX_IDLE_TYPES]; unsigned int lb_gained[CPU_MAX_IDLE_TYPES]; unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES]; unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES];
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 24d8b9b..4dea057 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2415,8 +2415,8 @@ static const unsigned char
- would be when CPU is idle and so we just decay the old load without
- adding any new load.
*/ -static unsigned long -decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) +static u64 +decay_load_missed(u64 load, unsigned long missed_updates, int idx) { int j = 0;
@@ -2444,7 +2444,7 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
- scheduler tick (TICK_NSEC). With tickless idle this will not be called
- every tick. We fix it up based on jiffies.
*/ -static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, +static void __update_cpu_load(struct rq *this_rq, u64 this_load, unsigned long pending_updates) { int i, scale; @@ -2454,7 +2454,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, /* Update our load: */ this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
unsigned long old_load, new_load;
u64 old_load, new_load; /* scale is effectively 1 << i now, and >> i divides by scale */
@@ -2496,7 +2496,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, void update_idle_cpu_load(struct rq *this_rq) { unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
unsigned long load = this_rq->load.weight;
u64 load = this_rq->cfs.runnable_load_avg; unsigned long pending_updates; /*
@@ -2546,7 +2546,7 @@ static void update_cpu_load_active(struct rq *this_rq) * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). */ this_rq->last_load_update_tick = jiffies;
__update_cpu_load(this_rq, this_rq->load.weight, 1);
__update_cpu_load(this_rq, this_rq->cfs.runnable_load_avg, 1); calc_load_account_active(this_rq);
} diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a9cdc8f..f8f3a29 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2935,9 +2935,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
#ifdef CONFIG_SMP /* Used instead of source_load when we know the type == 0 */ -static unsigned long weighted_cpuload(const int cpu) +static u64 weighted_cpuload(const int cpu) {
return cpu_rq(cpu)->load.weight;
return cpu_rq(cpu)->cfs.runnable_load_avg;
}
/* @@ -2947,10 +2947,10 @@ static unsigned long weighted_cpuload(const int cpu)
- We want to under-estimate the load of migration sources, to
- balance conservatively.
*/ -static unsigned long source_load(int cpu, int type) +static u64 source_load(int cpu, int type) { struct rq *rq = cpu_rq(cpu);
unsigned long total = weighted_cpuload(cpu);
u64 total = weighted_cpuload(cpu); if (type == 0 || !sched_feat(LB_BIAS)) return total;
@@ -2962,10 +2962,10 @@ static unsigned long source_load(int cpu, int type)
- Return a high guess at the load of a migration-target cpu weighted
- according to the scheduling class and "nice" value.
*/ -static unsigned long target_load(int cpu, int type) +static u64 target_load(int cpu, int type) { struct rq *rq = cpu_rq(cpu);
unsigned long total = weighted_cpuload(cpu);
u64 total = weighted_cpuload(cpu); if (type == 0 || !sched_feat(LB_BIAS)) return total;
@@ -2978,13 +2978,13 @@ static unsigned long power_of(int cpu) return cpu_rq(cpu)->cpu_power; }
-static unsigned long cpu_avg_load_per_task(int cpu) +static u64 cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
if (nr_running)
return rq->load.weight / nr_running;
return rq->cfs.runnable_load_avg / nr_running;
You now need to use div_u64 for all divisions of a 64-bit variable like runnable_load_avg, otherwise it can't compile on 32-bit platforms like ARM. This one is obvious because it appears in your patch, but other divisions could now be 64-bit divisions.
Regards, Vincent
return 0;
} @@ -3131,7 +3131,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) { s64 this_load, load; int idx, this_cpu, prev_cpu;
unsigned long tl_per_task;
u64 tl_per_task; struct task_group *tg; unsigned long weight; int balanced;
@@ -3149,14 +3149,14 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) */ if (sync) { tg = task_group(current);
weight = current->se.load.weight;
weight = current->se.avg.load_avg_contrib; this_load += effective_load(tg, this_cpu, -weight, -weight); load += effective_load(tg, prev_cpu, 0, -weight); } tg = task_group(p);
weight = p->se.load.weight;
weight = p->se.avg.load_avg_contrib; /* * In low-load situations, where prev_cpu is idle and this_cpu is idle
@@ -3219,11 +3219,11 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu, int load_idx) { struct sched_group *idlest = NULL, *group = sd->groups;
unsigned long min_load = ULONG_MAX, this_load = 0;
u64 min_load = ~0ULL, this_load = 0; int imbalance = 100 + (sd->imbalance_pct-100)/2; do {
unsigned long load, avg_load;
u64 load, avg_load; int local_group; int i;
@@ -3270,7 +3270,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, static int find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) {
unsigned long load, min_load = ULONG_MAX;
u64 load, min_load = ~0ULL; int idlest = -1; int i;
@@ -3911,7 +3911,7 @@ struct lb_env { struct cpumask *dst_grpmask; int new_dst_cpu; enum cpu_idle_type idle;
long imbalance;
long long imbalance; /* The set of CPUs under consideration for load-balancing */ struct cpumask *cpus;
@@ -4314,7 +4314,7 @@ static inline void update_h_load(long cpu) #ifdef CONFIG_SMP static unsigned long task_h_load(struct task_struct *p) {
return p->se.load.weight;
return p->se.avg.load_avg_contrib;
} #endif #endif @@ -4327,21 +4327,21 @@ static unsigned long task_h_load(struct task_struct *p) struct sd_lb_stats { struct sched_group *busiest; /* Busiest group in this sd */ struct sched_group *this; /* Local group in this sd */
unsigned long total_load; /* Total load of all groups in sd */
u64 total_load; /* Total load of all groups in sd */ unsigned long total_pwr; /* Total power of all groups in sd */
unsigned long avg_load; /* Average load across all groups in sd */
u64 avg_load; /* Average load across all groups in sd */ /** Statistics of this group */
unsigned long this_load;
unsigned long this_load_per_task;
u64 this_load;
u64 this_load_per_task; unsigned long this_nr_running; unsigned long this_has_capacity; unsigned int this_idle_cpus; /* Statistics of the busiest group */ unsigned int busiest_idle_cpus;
unsigned long max_load;
unsigned long busiest_load_per_task;
u64 max_load;
u64 busiest_load_per_task; unsigned long busiest_nr_running; unsigned long busiest_group_capacity; unsigned long busiest_has_capacity;
@@ -4363,9 +4363,9 @@ struct sd_lb_stats { */ struct sg_lb_stats { unsigned long avg_load; /*Avg load across the CPUs of the group */
unsigned long group_load; /* Total load over the CPUs of the group */
u64 group_load; /* Total load over the CPUs of the group */ unsigned long sum_nr_running; /* Nr tasks running in the group */
unsigned long sum_weighted_load; /* Weighted load of group's tasks */
u64 sum_weighted_load; /* Weighted load of group's tasks */ unsigned long group_capacity; unsigned long idle_cpus; unsigned long group_weight;
@@ -4688,9 +4688,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, int local_group, int *balance, struct sg_lb_stats *sgs) { unsigned long nr_running, max_nr_running, min_nr_running;
unsigned long load, max_cpu_load, min_cpu_load;
u64 load, max_cpu_load, min_cpu_load; unsigned int balance_cpu = -1, first_idle_cpu = 0;
unsigned long avg_load_per_task = 0;
u64 avg_load_per_task = 0; int i; if (local_group)
@@ -4698,7 +4698,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
/* Tally up the load of all CPUs in the group */ max_cpu_load = 0;
min_cpu_load = ~0UL;
min_cpu_load = ~0ULL; max_nr_running = 0; min_nr_running = ~0UL;
@@ -4948,9 +4948,9 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) static inline void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) {
unsigned long tmp, pwr_now = 0, pwr_move = 0;
u64 tmp, pwr_now = 0, pwr_move = 0; unsigned int imbn = 2;
unsigned long scaled_busy_load_per_task;
u64 scaled_busy_load_per_task; if (sds->this_nr_running) { sds->this_load_per_task /= sds->this_nr_running;
@@ -5016,7 +5016,7 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) */ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds) {
unsigned long max_pull, load_above_capacity = ~0UL;
u64 max_pull, load_above_capacity = ~0ULL; sds->busiest_load_per_task /= sds->busiest_nr_running; if (sds->group_imb) {
@@ -5192,14 +5192,14 @@ static struct rq *find_busiest_queue(struct lb_env *env, struct sched_group *group) { struct rq *busiest = NULL, *rq;
unsigned long max_load = 0;
u64 max_load = 0; int i; for_each_cpu(i, sched_group_cpus(group)) { unsigned long power = power_of(i); unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
unsigned long wl;
u64 wl; if (!capacity) capacity = fix_small_capacity(env->sd, group);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index bfd004a..cff1926 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -362,7 +362,7 @@ struct rq { */ unsigned int nr_running; #define CPU_LOAD_IDX_MAX 5
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
u64 cpu_load[CPU_LOAD_IDX_MAX]; unsigned long last_load_update_tick;
#ifdef CONFIG_NO_HZ u64 nohz_stamp;
Hi Vincent, Thank you for your review.
On 11/15/2012 11:43 PM, Vincent Guittot wrote:
Hi Preeti,
On 15 November 2012 17:54, Preeti U Murthy preeti@linux.vnet.ibm.com wrote:
Currently the load balancer weighs a task based upon its priority,and this weight consequently gets added up to the weight of the run queue that it is on.It is this weight of the runqueue that sums up to a sched group's load which is used to decide the busiest or the idlest group and the runqueue thereof.
The Per-Entity-Load-Tracking metric however measures how long a task has been runnable over the duration of its lifetime.This gives us a hint of the amount of CPU time that the task can demand.This metric takes care of the task priority as well.Therefore apart from the priority of a task we also have an idea of the live behavior of the task.This seems to be a more realistic metric to use to compute task weight which adds upto the run queue weight and the weight of the sched group.Consequently they can be used for load balancing.
The semantics of load balancing is left untouched.The two functions load_balance() and select_task_rq_fair() perform the task of load balancing.These two paths have been browsed through in this patch to make necessary changes.
weighted_cpuload() and task_h_load() provide the run queue weight and the weight of the task respectively.They have been modified to provide the Per-Entity-Load-Tracking metric as relevant for each. The rest of the modifications had to be made to suit these two changes.
Completely Fair Scheduler class is the only sched_class which contributes to the run queue load.Therefore the rq->load.weight==cfs_rq->load.weight when the cfs_rq is the root cfs_rq (rq->cfs) of the hierarchy.When replacing this with Per-Entity-Load-Tracking metric,cfs_rq->runnable_load_avg needs to be used as this is the right reflection of the run queue load when the cfs_rq is the root cfs_rq (rq->cfs) of the hierarchy.This metric reflects the percentage uptime of the tasks that are queued on it and hence that contribute to the load.Thus cfs_rq->runnable_load_avg replaces the metric earlier used in weighted_cpuload().
The task load is aptly captured by se.avg.load_avg_contrib which captures the runnable time vs the alive time of the task against its priority.This metric replaces the earlier metric used in task_h_load().
The consequent changes appear as data type changes for the helper variables; they abound in number.Because cfs_rq->runnable_load_avg needs to be big enough to capture the tasks' load often and accurately.
You are now using cfs_rq->runnable_load_avg instead of cfs_rq->load.weight for calculation of cpu_load but cfs_rq->runnable_load_avg is smaller or equal to cfs_rq->load.weight value. This implies that the new value is smaller or equal to the old statistic so you should be able to keep the same variable width for the computation of cpu_load
Right.But cfs_rq->runnable_load_avg is a 64 bit unsigned integer as per the Per-entity-load-tracking patchset.I could not figure out why this is the case although as you mention, its value will not exceed cfs_rq->load.weight.In order to retain the data type of cfs_rq->runnable_load_avg as it is,these changes had to be made to suit it.It would be good if someone would clarify why it is a 64 bit integer,will save a lot of trouble if we could consider this the same length as cfs_rq->load.weight.Ben,Paul? can you clarify this point?
The following patch does not consider CONFIG_FAIR_GROUP_SCHED AND CONFIG_SCHED_NUMA.This is done so as to evaluate this approach starting from the simplest scenario.Earlier discussions can be found in the link below.
Link: https://lkml.org/lkml/2012/10/25/162 Signed-off-by: Preeti U Murthy <preeti@linux.vnet.ibm.com>
include/linux/sched.h | 2 +- kernel/sched/core.c | 12 +++++---- kernel/sched/fair.c | 64 +++++++++++++++++++++++++------------------------ kernel/sched/sched.h | 2 +- 4 files changed, 40 insertions(+), 40 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h index 087dd20..302756e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -924,7 +924,7 @@ struct sched_domain { unsigned int lb_count[CPU_MAX_IDLE_TYPES]; unsigned int lb_failed[CPU_MAX_IDLE_TYPES]; unsigned int lb_balanced[CPU_MAX_IDLE_TYPES];
unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES];
u64 lb_imbalance[CPU_MAX_IDLE_TYPES]; unsigned int lb_gained[CPU_MAX_IDLE_TYPES]; unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES]; unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES];
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 24d8b9b..4dea057 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2415,8 +2415,8 @@ static const unsigned char
- would be when CPU is idle and so we just decay the old load without
- adding any new load.
*/ -static unsigned long -decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) +static u64 +decay_load_missed(u64 load, unsigned long missed_updates, int idx) { int j = 0;
@@ -2444,7 +2444,7 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
- scheduler tick (TICK_NSEC). With tickless idle this will not be called
- every tick. We fix it up based on jiffies.
*/ -static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, +static void __update_cpu_load(struct rq *this_rq, u64 this_load, unsigned long pending_updates) { int i, scale; @@ -2454,7 +2454,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, /* Update our load: */ this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
unsigned long old_load, new_load;
u64 old_load, new_load; /* scale is effectively 1 << i now, and >> i divides by scale */
@@ -2496,7 +2496,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, void update_idle_cpu_load(struct rq *this_rq) { unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
unsigned long load = this_rq->load.weight;
u64 load = this_rq->cfs.runnable_load_avg; unsigned long pending_updates; /*
@@ -2546,7 +2546,7 @@ static void update_cpu_load_active(struct rq *this_rq) * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). */ this_rq->last_load_update_tick = jiffies;
__update_cpu_load(this_rq, this_rq->load.weight, 1);
__update_cpu_load(this_rq, this_rq->cfs.runnable_load_avg, 1); calc_load_account_active(this_rq);
} diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a9cdc8f..f8f3a29 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2935,9 +2935,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
#ifdef CONFIG_SMP /* Used instead of source_load when we know the type == 0 */ -static unsigned long weighted_cpuload(const int cpu) +static u64 weighted_cpuload(const int cpu) {
return cpu_rq(cpu)->load.weight;
return cpu_rq(cpu)->cfs.runnable_load_avg;
}
/* @@ -2947,10 +2947,10 @@ static unsigned long weighted_cpuload(const int cpu)
- We want to under-estimate the load of migration sources, to
- balance conservatively.
*/ -static unsigned long source_load(int cpu, int type) +static u64 source_load(int cpu, int type) { struct rq *rq = cpu_rq(cpu);
unsigned long total = weighted_cpuload(cpu);
u64 total = weighted_cpuload(cpu); if (type == 0 || !sched_feat(LB_BIAS)) return total;
@@ -2962,10 +2962,10 @@ static unsigned long source_load(int cpu, int type)
- Return a high guess at the load of a migration-target cpu weighted
- according to the scheduling class and "nice" value.
*/ -static unsigned long target_load(int cpu, int type) +static u64 target_load(int cpu, int type) { struct rq *rq = cpu_rq(cpu);
unsigned long total = weighted_cpuload(cpu);
u64 total = weighted_cpuload(cpu); if (type == 0 || !sched_feat(LB_BIAS)) return total;
@@ -2978,13 +2978,13 @@ static unsigned long power_of(int cpu) return cpu_rq(cpu)->cpu_power; }
-static unsigned long cpu_avg_load_per_task(int cpu) +static u64 cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
if (nr_running)
return rq->load.weight / nr_running;
return rq->cfs.runnable_load_avg / nr_running;
You now need to use div_u64 for all division of a 64bits variable like runnable_load_avg otherwise it can't compile on 32bits platform like ARM. This one is obvious because it appears in your patch but other division could be now 64bits division
Ah yes, there will be some trouble here. Explicit do_div() calls need to be inserted, and there will be plenty of such cases. But as I mentioned above, once we are clear about why the width of cfs_rq->runnable_load_avg is 64 bit, we can sort this out. We will need someone to clarify this.
I am at a loss to see a solution around making the above changes if, for some reason, the width of cfs_rq->runnable_load_avg has to be maintained as is. Any thoughts on this?
Regards, Vincent
Regards Preeti U Murthy
sched: Use Per-Entity-Load-Tracking metric for load balancing
From: Preeti U Murthy preeti@linux.vnet.ibm.com
Currently the load balancer weighs a task based upon its priority,and this weight consequently gets added up to the weight of the run queue that it is on.It is this weight of the runqueue that sums up to a sched group's load which is used to decide the busiest or the idlest group and the runqueue thereof.
The Per-Entity-Load-Tracking metric however measures how long a task has been runnable over the duration of its lifetime.This gives us a hint of the amount of CPU time that the task can demand.This metric takes care of the task priority as well.Therefore apart from the priority of a task we also have an idea of the live behavior of the task.This seems to be a more realistic metric to use to compute task weight which adds upto the run queue weight and the weight of the sched group.Consequently they can be used for load balancing.
The semantics of load balancing is left untouched.The two functions load_balance() and select_task_rq_fair() perform the task of load balancing.These two paths have been browsed through in this patch to make necessary changes.
weighted_cpuload() and task_h_load() provide the run queue weight and the weight of the task respectively.They have been modified to provide the Per-Entity-Load-Tracking metric as relevant for each. The rest of the modifications had to be made to suit these two changes.
Completely Fair Scheduler class is the only sched_class which contributes to the run queue load.Therefore the rq->load.weight==cfs_rq->load.weight when the cfs_rq is the root cfs_rq (rq->cfs) of the hierarchy.When replacing this with Per-Entity-Load-Tracking metric,cfs_rq->runnable_load_avg needs to be used as this is the right reflection of the run queue load when the cfs_rq is the root cfs_rq (rq->cfs) of the hierarchy.This metric reflects the percentage uptime of the tasks that are queued on it and hence that contribute to the load.Thus cfs_rq->runnable_load_avg replaces the metric earlier used in weighted_cpuload().
The task load is aptly captured by se.avg.load_avg_contrib which captures the runnable time vs the alive time of the task against its priority.This metric replaces the earlier metric used in task_h_load().
The consequent changes appear as data type changes for the helper variables; they abound in number.Because cfs_rq->runnable_load_avg needs to be big enough to capture the tasks' load often and accurately.
The following patch does not consider CONFIG_FAIR_GROUP_SCHED AND CONFIG_SCHED_NUMA.This is done so as to evaluate this approach starting from the simplest scenario.Earlier discussions can be found in the link below.
Link: https://lkml.org/lkml/2012/10/25/162 Signed-off-by: Preeti U Murthy preeti@linux.vnet.ibm.com --- I apologise about having overlooked this one change in the patchset.This needs to be applied on top of patch2 of this patchset.The experiment results that have been posted in reply to this thread are done after having applied this patch.
kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f8f3a29..19094eb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4362,7 +4362,7 @@ struct sd_lb_stats { * sg_lb_stats - stats of a sched_group required for load_balancing */ struct sg_lb_stats { - unsigned long avg_load; /*Avg load across the CPUs of the group */ + u64 avg_load; /*Avg load across the CPUs of the group */ u64 group_load; /* Total load over the CPUs of the group */ unsigned long sum_nr_running; /* Nr tasks running in the group */ u64 sum_weighted_load; /* Weighted load of group's tasks */
Certain gcc tool chains convert the division on a 64-bit dividend into a __aeabi_uldivmod call, which does unnecessary 64-bit by 64-bit divides although the divisor is 32-bit. This 64-by-64-bit division is not implemented in the kernel for reasons of efficiency, which results in undefined reference errors at link time. Hence perform the division on 64-bit dividends using the do_div() function. The use case below is the integration of the Per-entity-Load-Tracking metric with the load balancer, where cfs_rq->runnable_load_avg, a 64-bit unsigned integer, is used as the base metric for load balancing.
Signed-off-by: Preeti U Murthy <preeti@linux.vnet.ibm.com> --- kernel/sched/fair.c | 51 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 20 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f8f3a29..7cd3096 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2982,9 +2982,13 @@ static u64 cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long nr_running = ACCESS_ONCE(rq->nr_running); + u64 cfs_avg_load_per_task;
- if (nr_running) - return rq->cfs.runnable_load_avg / nr_running; + if (nr_running) { + cfs_avg_load_per_task = rq->cfs.runnable_load_avg; + do_div(cfs_avg_load_per_task, nr_running); + return cfs_avg_load_per_task; + }
return 0; } @@ -3249,7 +3253,8 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, }
/* Adjust by relative CPU power of the group */ - avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power; + avg_load = (avg_load * SCHED_POWER_SCALE); + do_div(avg_load, group->sgp->power);
if (local_group) { this_load = avg_load; @@ -4756,7 +4761,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, }
/* Adjust by relative CPU power of the group */ - sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power; + sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE); + do_div(sgs->avg_load, group->sgp->power);
/* * Consider the group unbalanced when the imbalance is larger @@ -4767,8 +4773,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, * normalized nr_running number somewhere that negates * the hierarchy? */ - if (sgs->sum_nr_running) - avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; + if (sgs->sum_nr_running) { + avg_load_per_task = sgs->sum_weighted_load; + do_div(avg_load_per_task, sgs->sum_nr_running); + }
if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && (max_nr_running - min_nr_running) > 1) @@ -4953,7 +4961,7 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) u64 scaled_busy_load_per_task;
if (sds->this_nr_running) { - sds->this_load_per_task /= sds->this_nr_running; + do_div(sds->this_load_per_task, sds->this_nr_running); if (sds->busiest_load_per_task > sds->this_load_per_task) imbn = 1; @@ -4964,7 +4972,7 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
scaled_busy_load_per_task = sds->busiest_load_per_task * SCHED_POWER_SCALE; - scaled_busy_load_per_task /= sds->busiest->sgp->power; + do_div(scaled_busy_load_per_task, sds->busiest->sgp->power);
if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= (scaled_busy_load_per_task * imbn)) { @@ -4985,20 +4993,21 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) pwr_now /= SCHED_POWER_SCALE;
/* Amount of load we'd subtract */ - tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / - sds->busiest->sgp->power; + tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE); + do_div(tmp, sds->busiest->sgp->power); if (sds->max_load > tmp) pwr_move += sds->busiest->sgp->power * min(sds->busiest_load_per_task, sds->max_load - tmp);
/* Amount of load we'd add */ if (sds->max_load * sds->busiest->sgp->power < - sds->busiest_load_per_task * SCHED_POWER_SCALE) - tmp = (sds->max_load * sds->busiest->sgp->power) / - sds->this->sgp->power; - else - tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / - sds->this->sgp->power; + sds->busiest_load_per_task * SCHED_POWER_SCALE) { + tmp = (sds->max_load * sds->busiest->sgp->power); + do_div(tmp, sds->this->sgp->power); + } else { + tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE); + do_div(tmp, sds->this->sgp->power); + } pwr_move += sds->this->sgp->power * min(sds->this_load_per_task, sds->this_load + tmp); pwr_move /= SCHED_POWER_SCALE; @@ -5018,7 +5027,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s { u64 max_pull, load_above_capacity = ~0ULL;
- sds->busiest_load_per_task /= sds->busiest_nr_running; + do_div(sds->busiest_load_per_task, sds->busiest_nr_running); if (sds->group_imb) { sds->busiest_load_per_task = min(sds->busiest_load_per_task, sds->avg_load); @@ -5043,7 +5052,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
- load_above_capacity /= sds->busiest->sgp->power; + do_div(load_above_capacity, sds->busiest->sgp->power); }
/* @@ -5123,7 +5132,8 @@ find_busiest_group(struct lb_env *env, int *balance) if (!sds.busiest || sds.busiest_nr_running == 0) goto ret;
- sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr; + sds.avg_load = (SCHED_POWER_SCALE * sds.total_load); + do_div(sds.avg_load, sds.total_pwr);
/* * If the busiest group is imbalanced the below checks don't @@ -5223,7 +5233,8 @@ static struct rq *find_busiest_queue(struct lb_env *env, * the load can be moved away from the cpu that is potentially * running at a lower capacity. */ - wl = (wl * SCHED_POWER_SCALE) / power; + wl = (wl * SCHED_POWER_SCALE); + do_div(wl, power);
if (wl > max_load) { max_load = wl;
Hi everyone
I conducted a few experiments with a workload to compare the following parameters with this patchset and without this patchset: 1.The performance of the workload 2.The sum of the waitime to run of the processes queued on each cpu-the cumulative latency. 3.The number of migrations of tasks between cpus.
The observations and inferences are given below:
_Experimental setup:_
1. The workload is at the end of the mail. Every run of the workload was for 10s. 2. Different numbers of long running and short running threads were run each time. 3. The setup was on a two-socket Pre-Nehalem machine, but one socket had all its cpus offlined. Thus only one socket was active throughout the experiment. The socket consisted of 4 cores. 4. The statistics below have been collected from /proc/schedstats except throughput, which is output by the workload. -Latency has been observed from the eighth field in the cpu statistics in /proc/schedstat cpu<N> 1 2 3 4 5 6 7 "8" 9 -Number of migrations has been calculated by summing up the #pulls during the idle, busy and newly_idle states of all the cpus. This is also given by /proc/schedstats
5.FieldA->#short-running-tasks [For every 10ms passed sleep for 9ms,work for 1ms] a 10% task. FieldB->#long-running-tasks Field1->Throughput with patch (records/s read) Field2->Throughput without patch (records/s read) Field3->#Migrations with patch Field4->#Migrations without patch Field5->Latency with patch Field6->Latency without patch
A B 1 2 3 4 5 6 ------------------------------------------------------------------------------------- 5 5 49,93,368 48,68,351 108 28 22s 18.3s 4 2 34,37,669 34,37,547 58 50 0.6s 0.17s 16 0 38,66,597 38,74,580 1151 1014 1.88s 1.65s
_Inferences_
1.Clearly an increase in the number of pulls can be seen with this patch,this has resulted in an increase in the latency.This *should have* resulted in a decrease in throughput but in the first two cases this is not reflected.This could be due to some error in the benchmark itself or the way I am calculating the throughput.Keeping this issue aside,I focus on the #pulls and latency effect.
2.On integrating PJT's metric with the load balancer,#Migrations increase due to the following reason, which I figured out by going through the traces.
Task1 Task3 Task2 Task4 ------ ------ Group1 Group2
Case1:Load_as_per_pjt 1028 1121 Case2:Load_without_pjt 2048 2048
Fig1.
During load balancing Case1: Group2 is overloaded,one of the tasks is moved to Group1 Case2: Group1 and Group2 are equally loaded,hence no migrations
This is observed so many times,that it is no wonder that the #migrations have increased with this patch.Here Group refers to sched_group.
3. The next obvious step was to see whether so many migrations with my patch are prudent or not. The latency numbers reflect that they are not.
4.As I said earlier,I keep throughput out of these inferences because it distracts us from something that is stark clear *Migrations incurred due to PJT's metric is not affecting the tasks positively.*
5. The above is my first observation. This does not, however, say that using PJT's metric with the load balancer might be a bad idea. This could mean many things, out of which the correct one has to be figured out. Among them I list out a few.
a)Simply replacing the existing metric used by Load Balancer with PJT's metric might not really derive the benefit that PJT's metric has to offer. b)I have not been able to figure out what kind of workloads actually benefit from the way we have applied the PJT's metric.Maybe we are using a workload which is adversely getting affected.
6.My next step in my opinion will be to resolve the following issues in the decreasing order of priority:
a)Run some other benchmark like kernbench and find out if the throughput reflects increase in latency correctly.If it does,then I will need to find out why the current benchmark was behaving weird,else I will need to go through the traces to figure out this issue. b)If I find out that the throughput is consistent with the latency,then we need to modify the strictness(the granularity of time at which the load is getting updated) with which PJT's metric is calculating load,or use it in some other way in load balancing.
Looking forward to your feedback on this :)
--------------------------BEGIN WORKLOAD--------------------------------- /* * test.c - Two instances of this program is run.One instance where sleep * time is 0 and another instance which sleeps between regular instances * of time.This is done to create both long running and short running tasks * on the cpu. * * Multiple threads are created of each instance.The threads request for a * memory chunk,write into it and then free it.This is done throughout the * period of the run. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; version 2 of the License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 * USA */
#include <stdio.h> #include <unistd.h> #include <stdlib.h> #include <sys/mman.h> #include <pthread.h> #include <string.h> #include <time.h> #include <sys/time.h> #include <sys/resource.h> #include "malloc.h"
/* Variable entities */
static unsigned int seconds;        /* test duration in seconds (set to 10 in start_threads()) */
static unsigned int threads;        /* number of worker threads (set to 8 in start_threads()) */
static unsigned int mem_chunk_size; /* bytes malloc'd per record (set in write_to_mem()) */
static unsigned int sleep_at;       /* sleep after every 'sleep_at' records; 0 = never sleep */
static unsigned int sleep_interval; /* usleep() duration in microseconds */

/* Fixed entities */
typedef size_t mem_slot_t;/* 8 bytes */
static unsigned int slot_size = sizeof(mem_slot_t);

/* Other parameters */
static volatile int start;          /* run flag polled by the workers: 1 = run, 0 = stop */
static time_t start_time;           /* wall-clock time at which 'start' was raised */
static unsigned int records_read;   /* total records across all threads; guarded by records_count_lock */
pthread_mutex_t records_count_lock = PTHREAD_MUTEX_INITIALIZER;
/*
 * Worker loop: repeatedly malloc a chunk, write an index value into every
 * slot of it, and free it, until the global 'start' flag is cleared by
 * start_threads().
 *
 * Also sets up the duty cycle: after every 'sleep_at' records the thread
 * sleeps for 'sleep_interval' microseconds, which makes this a short-running
 * (~10% duty cycle) task; with sleep_at == 0 it would run continuously.
 *
 * Returns the number of records (chunks) processed.
 */
static unsigned int write_to_mem(void)
{
	/*
	 * Counters are unsigned to match the return type and avoid
	 * signed-overflow UB on very long runs.
	 */
	unsigned int i, j;
	mem_slot_t *scratch_pad, *temp, *end;

	mem_chunk_size = slot_size * 256;
	sleep_at = 2800; /* sleep for every 2800 records-short runs,else sleep_at=0 */
	sleep_interval = 9000; /* sleep for 9 ms */

	for (i = 0; start == 1; i++) {
		/* ask for a memory chunk (no cast on malloc in C) */
		scratch_pad = malloc(mem_chunk_size);
		if (scratch_pad == NULL) {
			fprintf(stderr, "Could not allocate memory\n");
			exit(1);
		}
		end = scratch_pad + (mem_chunk_size / slot_size);

		/* write into this chunk */
		for (temp = scratch_pad, j = 0; temp < end; temp++, j++)
			*temp = (mem_slot_t)j;

		/* Free this chunk */
		free(scratch_pad);

		/* Decide the duty cycle; currently 10 ms */
		if (sleep_at && !(i % sleep_at))
			usleep(sleep_interval);
	}
	return i;
}
static void * thread_run(void *arg) {
unsigned int records_local;
/* Wait for the start signal */
while (start == 0);
records_local = write_to_mem();
pthread_mutex_lock(&records_count_lock); records_read += records_local; pthread_mutex_unlock(&records_count_lock);
return NULL; }
/*
 * Create the worker threads, let them run for 'seconds' seconds by toggling
 * the global 'start' flag, join them, and print the aggregate throughput
 * in records per second.
 */
static void start_threads()
{
	double diff_time;
	unsigned int i;
	int err;
	threads = 8;
	seconds = 10;

	pthread_t thread_array[threads];
	for (i = 0; i < threads; i++) {
		err = pthread_create(&thread_array[i], NULL, thread_run, NULL);
		if (err) {
			fprintf(stderr, "Error creating thread %d\n", i);
			exit(1);
		}
	}
	/* Workers spin on 'start'; raise it only after all are created. */
	start_time = time(NULL);
	start = 1;
	sleep(seconds);
	start = 0; /* tells write_to_mem() loops to finish */
	diff_time = difftime(time(NULL), start_time);

	for (i = 0; i < threads; i++) {
		err = pthread_join(thread_array[i], NULL);
		if (err) {
			fprintf(stderr, "Error joining thread %d\n", i);
			exit(1);
		}
	}
	/* records_read is stable here: all workers have been joined. */
	printf("%u records/s\n", (unsigned int) (((double) records_read)/diff_time));
}

int main()
{
	start_threads();
	return 0;
}
------------------------END WORKLOAD------------------------------------ Regards Preeti U Murthy