The rq's cpu_load[] decays over time based on the rq's past CPU load. The per-entity sched_avg also decays each task's load over time. So we now have two kinds of decay for CPU load, which is redundant and adds overhead from the extra decay calculations. This patchset tries to remove the cpu_load decay.
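For reference, a minimal sketch of the per-index cpu_load decay being removed, using the formula from the existing comment in kernel/sched/proc.c (the real code additionally handles missed ticks and rounds up when load is increasing):

  /*
   * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
   * Illustration only, not the kernel implementation.
   */
  static unsigned long cpu_load_decay(unsigned long old_load,
                                      unsigned long cur_load, int idx)
  {
          unsigned long scale = 1UL << idx;       /* 2^idx */

          return (old_load * (scale - 1) + cur_load) >> idx;
  }

With idx == 0 this degenerates to cur_load, i.e. a zero index carries no history at all.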
There are 5 load_idx values used for cpu_load in sched_domain. busy_idx and idle_idx are usually not zero, but newidle_idx, wake_idx and forkexec_idx are all zero on every arch. The first patch is a shortcut that removes the cpu_load[load_idx] effect, with just a one-line change.
V3: 1, correct the wake_affine bias, thanks to Morten's reminder! 2, replace source_load with weighted_cpuload for a clearer function name.
V2: 1, this version does some tuning on the load bias of target_load. 2, goes further and removes cpu_load from the rq. 3, reverts the patch 'Limit sd->*_idx range on sysctl' since it is no longer needed.
Any testing/comments are appreciated.
This patchset is rebased on the latest tip/master. The git tree for this patchset is at: git@github.com:alexshi/power-scheduling.git noload
Shortcut to remove the rq->cpu_load[load_idx] effect in the scheduler. Of the five load_idx values, only busy_idx and idle_idx are non-zero; newidle_idx, wake_idx and forkexec_idx are all zero on all archs.
So changing the idx to zero here fully removes the load_idx effect.
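For context, the pre-change helper in kernel/sched/fair.c (quoted from the fair.c diff later in this series) already short-circuits on an index of zero, which is why forcing load_idx to 0 is enough to take rq->cpu_load[] out of the balance path:

  static unsigned long source_load(int cpu, int type)
  {
          struct rq *rq = cpu_rq(cpu);
          unsigned long total = weighted_cpuload(cpu);

          if (type == 0 || !sched_feat(LB_BIAS))
                  return total;

          return min(rq->cpu_load[type-1], total);
  }

target_load() is the same except that it takes the max() of the two values.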
Signed-off-by: Alex Shi <alex.shi@linaro.org>
---
 kernel/sched/fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 235cfa7..4fcc3a3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5908,7 +5908,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd if (child && child->flags & SD_PREFER_SIBLING) prefer_sibling = 1;
- load_idx = get_sd_load_idx(env->sd, env->idle); + load_idx = 0;
do { struct sg_lb_stats *sgs = &tmp_sgs;
Since the load_idx effect has been removed from load balancing, we no longer need the load_idx decays in the scheduler. This saves some processing in scheduler_tick() and other places.
Signed-off-by: Alex Shi <alex.shi@linaro.org>
---
 arch/ia64/include/asm/topology.h  |  5 ---
 arch/metag/include/asm/topology.h |  5 ---
 arch/tile/include/asm/topology.h  |  6 ---
 include/linux/sched.h             |  5 ---
 include/linux/topology.h          |  8 ----
 kernel/sched/core.c               | 58 +++++++-----------------
 kernel/sched/debug.c              |  6 +--
 kernel/sched/fair.c               | 79 +++++++++-------------------
 kernel/sched/proc.c               | 92 ++-------------------------------------
 kernel/sched/sched.h              |  3 +-
 10 files changed, 42 insertions(+), 225 deletions(-)
diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h index a2496e4..54e5b17 100644 --- a/arch/ia64/include/asm/topology.h +++ b/arch/ia64/include/asm/topology.h @@ -55,11 +55,6 @@ void build_cpu_to_node_map(void); .busy_factor = 64, \ .imbalance_pct = 125, \ .cache_nice_tries = 2, \ - .busy_idx = 2, \ - .idle_idx = 1, \ - .newidle_idx = 0, \ - .wake_idx = 0, \ - .forkexec_idx = 0, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ diff --git a/arch/metag/include/asm/topology.h b/arch/metag/include/asm/topology.h index 8e9c0b3..d1d15cd 100644 --- a/arch/metag/include/asm/topology.h +++ b/arch/metag/include/asm/topology.h @@ -13,11 +13,6 @@ .busy_factor = 32, \ .imbalance_pct = 125, \ .cache_nice_tries = 2, \ - .busy_idx = 3, \ - .idle_idx = 2, \ - .newidle_idx = 0, \ - .wake_idx = 0, \ - .forkexec_idx = 0, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_FORK \ | SD_BALANCE_EXEC \ diff --git a/arch/tile/include/asm/topology.h b/arch/tile/include/asm/topology.h index d15c0d8..05f6ffe 100644 --- a/arch/tile/include/asm/topology.h +++ b/arch/tile/include/asm/topology.h @@ -57,12 +57,6 @@ static inline const struct cpumask *cpumask_of_node(int node) .busy_factor = 64, \ .imbalance_pct = 125, \ .cache_nice_tries = 1, \ - .busy_idx = 2, \ - .idle_idx = 1, \ - .newidle_idx = 0, \ - .wake_idx = 0, \ - .forkexec_idx = 0, \ - \ .flags = 1*SD_LOAD_BALANCE \ | 1*SD_BALANCE_NEWIDLE \ | 1*SD_BALANCE_EXEC \ diff --git a/include/linux/sched.h b/include/linux/sched.h index c49a258..6c416c8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -892,11 +892,6 @@ struct sched_domain { unsigned int busy_factor; /* less balancing by factor if busy */ unsigned int imbalance_pct; /* No balance until over watermark */ unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ - unsigned int busy_idx; - unsigned int idle_idx; - unsigned int newidle_idx; - unsigned int wake_idx; - unsigned int forkexec_idx; unsigned int smt_gain;
int nohz_idle; /* NOHZ IDLE status */ diff --git a/include/linux/topology.h b/include/linux/topology.h index 12ae6ce..863fad3 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -121,9 +121,6 @@ int arch_update_cpu_topology(void); .busy_factor = 64, \ .imbalance_pct = 125, \ .cache_nice_tries = 1, \ - .busy_idx = 2, \ - .wake_idx = 0, \ - .forkexec_idx = 0, \ \ .flags = 1*SD_LOAD_BALANCE \ | 1*SD_BALANCE_NEWIDLE \ @@ -151,11 +148,6 @@ int arch_update_cpu_topology(void); .busy_factor = 64, \ .imbalance_pct = 125, \ .cache_nice_tries = 1, \ - .busy_idx = 2, \ - .idle_idx = 1, \ - .newidle_idx = 0, \ - .wake_idx = 0, \ - .forkexec_idx = 0, \ \ .flags = 1*SD_LOAD_BALANCE \ | 1*SD_BALANCE_NEWIDLE \ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fb9764f..ac2f10c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4787,64 +4787,45 @@ static void sd_free_ctl_entry(struct ctl_table **tablep) *tablep = NULL; }
-static int min_load_idx = 0; -static int max_load_idx = CPU_LOAD_IDX_MAX-1; - static void set_table_entry(struct ctl_table *entry, const char *procname, void *data, int maxlen, - umode_t mode, proc_handler *proc_handler, - bool load_idx) + umode_t mode, proc_handler *proc_handler) { entry->procname = procname; entry->data = data; entry->maxlen = maxlen; entry->mode = mode; entry->proc_handler = proc_handler; - - if (load_idx) { - entry->extra1 = &min_load_idx; - entry->extra2 = &max_load_idx; - } }
static struct ctl_table * sd_alloc_ctl_domain_table(struct sched_domain *sd) { - struct ctl_table *table = sd_alloc_ctl_entry(14); + struct ctl_table *table = sd_alloc_ctl_entry(9);
if (table == NULL) return NULL;
set_table_entry(&table[0], "min_interval", &sd->min_interval, - sizeof(long), 0644, proc_doulongvec_minmax, false); + sizeof(long), 0644, proc_doulongvec_minmax); set_table_entry(&table[1], "max_interval", &sd->max_interval, - sizeof(long), 0644, proc_doulongvec_minmax, false); - set_table_entry(&table[2], "busy_idx", &sd->busy_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[3], "idle_idx", &sd->idle_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[5], "wake_idx", &sd->wake_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[7], "busy_factor", &sd->busy_factor, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[9], "cache_nice_tries", + sizeof(long), 0644, proc_doulongvec_minmax); + set_table_entry(&table[2], "busy_factor", &sd->busy_factor, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, - sizeof(int), 0644, proc_dointvec_minmax, false); + sizeof(int), 0644, proc_dointvec_minmax); set_table_entry(&table[10], "flags", &sd->flags, - sizeof(int), 0644, proc_dointvec_minmax, false); + sizeof(int), 0644, proc_dointvec_minmax); set_table_entry(&table[11], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, - sizeof(long), 0644, proc_doulongvec_minmax, false); + sizeof(long), 0644, proc_doulongvec_minmax); set_table_entry(&table[12], "name", sd->name, - CORENAME_MAX_SIZE, 0444, proc_dostring, false); - /* &table[13] is terminator */ + CORENAME_MAX_SIZE, 0444, proc_dostring); + /* &table[8] is terminator */
return table; } @@ -5967,11 +5948,6 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu) .busy_factor = 32, .imbalance_pct = 125, .cache_nice_tries = 2, - .busy_idx = 3, - .idle_idx = 2, - .newidle_idx = 0, - .wake_idx = 0, - .forkexec_idx = 0,
.flags = 1*SD_LOAD_BALANCE | 1*SD_BALANCE_NEWIDLE @@ -6721,7 +6697,7 @@ DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
void __init sched_init(void) { - int i, j; + int i; unsigned long alloc_size = 0, ptr;
#ifdef CONFIG_FAIR_GROUP_SCHED @@ -6825,9 +6801,7 @@ void __init sched_init(void) init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); #endif
- for (j = 0; j < CPU_LOAD_IDX_MAX; j++) - rq->cpu_load[j] = 0; - + rq->cpu_load = 0; rq->last_load_update_tick = jiffies;
#ifdef CONFIG_SMP diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index f3344c3..a24d549 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -303,11 +303,7 @@ do { \ PN(next_balance); SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr))); PN(clock); - P(cpu_load[0]); - P(cpu_load[1]); - P(cpu_load[2]); - P(cpu_load[3]); - P(cpu_load[4]); + P(cpu_load); #undef P #undef PN
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4fcc3a3..eeffe75 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1015,8 +1015,8 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, }
static unsigned long weighted_cpuload(const int cpu); -static unsigned long source_load(int cpu, int type); -static unsigned long target_load(int cpu, int type); +static unsigned long source_load(int cpu); +static unsigned long target_load(int cpu); static unsigned long power_of(int cpu); static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
@@ -3952,30 +3952,30 @@ static unsigned long weighted_cpuload(const int cpu) * We want to under-estimate the load of migration sources, to * balance conservatively. */ -static unsigned long source_load(int cpu, int type) +static unsigned long source_load(int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long total = weighted_cpuload(cpu);
- if (type == 0 || !sched_feat(LB_BIAS)) + if (!sched_feat(LB_BIAS)) return total;
- return min(rq->cpu_load[type-1], total); + return min(rq->cpu_load, total); }
/* * Return a high guess at the load of a migration-target cpu weighted * according to the scheduling class and "nice" value. */ -static unsigned long target_load(int cpu, int type) +static unsigned long target_load(int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long total = weighted_cpuload(cpu);
- if (type == 0 || !sched_feat(LB_BIAS)) + if (!sched_feat(LB_BIAS)) return total;
- return max(rq->cpu_load[type-1], total); + return max(rq->cpu_load, total); }
static unsigned long power_of(int cpu) @@ -4175,7 +4175,7 @@ static int wake_wide(struct task_struct *p) static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) { s64 this_load, load; - int idx, this_cpu, prev_cpu; + int this_cpu, prev_cpu; unsigned long tl_per_task; struct task_group *tg; unsigned long weight; @@ -4188,11 +4188,10 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) if (wake_wide(p)) return 0;
- idx = sd->wake_idx; this_cpu = smp_processor_id(); prev_cpu = task_cpu(p); - load = source_load(prev_cpu, idx); - this_load = target_load(this_cpu, idx); + load = source_load(prev_cpu); + this_load = target_load(this_cpu);
/* * If sync wakeup then subtract the (maximum possible) @@ -4248,7 +4247,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
if (balanced || (this_load <= load && - this_load + target_load(prev_cpu, idx) <= tl_per_task)) { + this_load + target_load(prev_cpu) <= tl_per_task)) { /* * This domain has SD_WAKE_AFFINE and * p is cache cold in this domain, and @@ -4267,17 +4266,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) * domain. */ static struct sched_group * -find_idlest_group(struct sched_domain *sd, struct task_struct *p, - int this_cpu, int sd_flag) +find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) { struct sched_group *idlest = NULL, *group = sd->groups; unsigned long min_load = ULONG_MAX, this_load = 0; - int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2;
- if (sd_flag & SD_BALANCE_WAKE) - load_idx = sd->wake_idx; - do { unsigned long load, avg_load; int local_group; @@ -4297,9 +4291,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, for_each_cpu(i, sched_group_cpus(group)) { /* Bias balancing toward cpus of our domain */ if (local_group) - load = source_load(i, load_idx); + load = source_load(i); else - load = target_load(i, load_idx); + load = target_load(i);
avg_load += load; } @@ -4453,7 +4447,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f continue; }
- group = find_idlest_group(sd, p, cpu, sd_flag); + group = find_idlest_group(sd, p, cpu); if (!group) { sd = sd->child; continue; @@ -5495,34 +5489,6 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) }; }
-/** - * get_sd_load_idx - Obtain the load index for a given sched domain. - * @sd: The sched_domain whose load_idx is to be obtained. - * @idle: The idle status of the CPU for whose sd load_idx is obtained. - * - * Return: The load index. - */ -static inline int get_sd_load_idx(struct sched_domain *sd, - enum cpu_idle_type idle) -{ - int load_idx; - - switch (idle) { - case CPU_NOT_IDLE: - load_idx = sd->busy_idx; - break; - - case CPU_NEWLY_IDLE: - load_idx = sd->newidle_idx; - break; - default: - load_idx = sd->idle_idx; - break; - } - - return load_idx; -} - static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) { return SCHED_POWER_SCALE; @@ -5770,12 +5736,11 @@ static inline int sg_capacity(struct lb_env *env, struct sched_group *group) * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @env: The load balancing environment. * @group: sched_group whose statistics are to be updated. - * @load_idx: Load index of sched_domain of this_cpu for load calc. * @local_group: Does group contain this_cpu. * @sgs: variable to hold the statistics for this group. */ static inline void update_sg_lb_stats(struct lb_env *env, - struct sched_group *group, int load_idx, + struct sched_group *group, int local_group, struct sg_lb_stats *sgs) { unsigned long load; @@ -5788,9 +5753,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
/* Bias balancing toward cpus of our domain */ if (local_group) - load = target_load(i, load_idx); + load = target_load(i); else - load = source_load(i, load_idx); + load = source_load(i);
sgs->group_load += load; sgs->sum_nr_running += rq->nr_running; @@ -5903,13 +5868,11 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd struct sched_domain *child = env->sd->child; struct sched_group *sg = env->sd->groups; struct sg_lb_stats tmp_sgs; - int load_idx, prefer_sibling = 0; + int prefer_sibling = 0;
if (child && child->flags & SD_PREFER_SIBLING) prefer_sibling = 1;
- load_idx = 0; - do { struct sg_lb_stats *sgs = &tmp_sgs; int local_group; @@ -5924,7 +5887,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd update_group_power(env->sd, env->dst_cpu); }
- update_sg_lb_stats(env, sg, load_idx, local_group, sgs); + update_sg_lb_stats(env, sg, local_group, sgs);
if (local_group) goto next_group; diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c index 16f5a30..a2435c5 100644 --- a/kernel/sched/proc.c +++ b/kernel/sched/proc.c @@ -11,7 +11,7 @@ unsigned long this_cpu_load(void) { struct rq *this = this_rq(); - return this->cpu_load[0]; + return this->cpu_load; }
@@ -398,105 +398,19 @@ static void calc_load_account_active(struct rq *this_rq) * End of global load-average stuff */
-/* - * The exact cpuload at various idx values, calculated at every tick would be - * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load - * - * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called - * on nth tick when cpu may be busy, then we have: - * load = ((2^idx - 1) / 2^idx)^(n-1) * load - * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load - * - * decay_load_missed() below does efficient calculation of - * load = ((2^idx - 1) / 2^idx)^(n-1) * load - * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load - * - * The calculation is approximated on a 128 point scale. - * degrade_zero_ticks is the number of ticks after which load at any - * particular idx is approximated to be zero. - * degrade_factor is a precomputed table, a row for each load idx. - * Each column corresponds to degradation factor for a power of two ticks, - * based on 128 point scale. - * Example: - * row 2, col 3 (=12) says that the degradation at load idx 2 after - * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). - * - * With this power of 2 load factors, we can degrade the load n times - * by looking at 1 bits in n and doing as many mult/shift instead of - * n mult/shifts needed by the exact degradation. - */ -#define DEGRADE_SHIFT 7 -static const unsigned char - degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; -static const unsigned char - degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { - {0, 0, 0, 0, 0, 0, 0, 0}, - {64, 32, 8, 0, 0, 0, 0, 0}, - {96, 72, 40, 12, 1, 0, 0}, - {112, 98, 75, 43, 15, 1, 0}, - {120, 112, 98, 76, 45, 16, 2} };
/* - * Update cpu_load for any missed ticks, due to tickless idle. The backlog - * would be when CPU is idle and so we just decay the old load without - * adding any new load. - */ -static unsigned long -decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) -{ - int j = 0; - - if (!missed_updates) - return load; - - if (missed_updates >= degrade_zero_ticks[idx]) - return 0; - - if (idx == 1) - return load >> missed_updates; - - while (missed_updates) { - if (missed_updates % 2) - load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; - - missed_updates >>= 1; - j++; - } - return load; -} - -/* - * Update rq->cpu_load[] statistics. This function is usually called every + * Update rq->cpu_load statistics. This function is usually called every * scheduler tick (TICK_NSEC). With tickless idle this will not be called * every tick. We fix it up based on jiffies. */ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, unsigned long pending_updates) { - int i, scale; - this_rq->nr_load_updates++;
/* Update our load: */ - this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ - for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { - unsigned long old_load, new_load; - - /* scale is effectively 1 << i now, and >> i divides by scale */ - - old_load = this_rq->cpu_load[i]; - old_load = decay_load_missed(old_load, pending_updates - 1, i); - new_load = this_load; - /* - * Round up the averaging division if load is increasing. This - * prevents us from getting stuck on 9 if the load is 10, for - * example. - */ - if (new_load > old_load) - new_load += scale - 1; - - this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; - } + this_rq->cpu_load = this_load; /* Fasttrack for idx 0 */
sched_avg_update(this_rq); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1bf34c2..5b2d4a1 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -517,8 +517,7 @@ struct rq { unsigned int nr_numa_running; unsigned int nr_preferred_running; #endif - #define CPU_LOAD_IDX_MAX 5 - unsigned long cpu_load[CPU_LOAD_IDX_MAX]; + unsigned long cpu_load; unsigned long last_load_update_tick; #ifdef CONFIG_NO_HZ_COMMON u64 nohz_stamp;
Since we no longer decay rq->cpu_load, we don't need pending_updates. But we still want to update rq->rt_avg, so keep rq->last_load_update_tick and the __update_cpu_load() function.
Signed-off-by: Alex Shi <alex.shi@linaro.org>
---
 kernel/sched/proc.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)
diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c index a2435c5..057bb9b 100644 --- a/kernel/sched/proc.c +++ b/kernel/sched/proc.c @@ -404,8 +404,7 @@ static void calc_load_account_active(struct rq *this_rq) * scheduler tick (TICK_NSEC). With tickless idle this will not be called * every tick. We fix it up based on jiffies. */ -static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, - unsigned long pending_updates) +static void __update_cpu_load(struct rq *this_rq, unsigned long this_load) { this_rq->nr_load_updates++;
@@ -449,7 +448,6 @@ void update_idle_cpu_load(struct rq *this_rq) { unsigned long curr_jiffies = ACCESS_ONCE(jiffies); unsigned long load = get_rq_runnable_load(this_rq); - unsigned long pending_updates;
/* * bail if there's load or we're actually up-to-date. @@ -457,10 +455,9 @@ void update_idle_cpu_load(struct rq *this_rq) if (load || curr_jiffies == this_rq->last_load_update_tick) return;
- pending_updates = curr_jiffies - this_rq->last_load_update_tick; this_rq->last_load_update_tick = curr_jiffies;
- __update_cpu_load(this_rq, load, pending_updates); + __update_cpu_load(this_rq, load); }
/* @@ -483,7 +480,7 @@ void update_cpu_load_nohz(void) * We were idle, this means load 0, the current load might be * !0 due to remote wakeups and the sort. */ - __update_cpu_load(this_rq, 0, pending_updates); + __update_cpu_load(this_rq, 0); } raw_spin_unlock(&this_rq->lock); } @@ -499,7 +496,7 @@ void update_cpu_load_active(struct rq *this_rq) * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). */ this_rq->last_load_update_tick = jiffies; - __update_cpu_load(this_rq, load, 1); + __update_cpu_load(this_rq, load);
calc_load_account_active(this_rq); }
The old code already considers the bias in source_load()/target_load(), yet still uses imbalance_pct as a final check when finding the idlest/busiest group. That is another piece of redundancy: if we bias the load in source_load()/target_load(), we should not apply imbalance_pct again.
With the cpu_load array removed, it is a good time to unify the target-load bias handling. So remove imbalance_pct from the final check and apply the bias directly when reading the target load.
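As a rough sketch of where the bias now lives (the helper name and the imbalance_pct value below are only for illustration), the scaling that used to sit in a final '100 * this_load < imbalance * min_load' style check is applied when the target load is read instead:

  /* Illustration only: with imbalance_pct = 125 a raw load of 100 is
   * reported as 125, so the remote side must be clearly lighter than
   * the local one before it is preferred -- the same conservatism the
   * old final imbalance_pct check provided. */
  static inline unsigned long biased_target_load(unsigned long weighted_load,
                                                 int imbalance_pct)
  {
          return weighted_load * imbalance_pct / 100;
  }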
For wake_affine, since wake_idx is 0 on all archs, the current logic just prefers the current cpu, so we keep that behaviour and simply replace target_load/source_load with weighted_cpuload for a more exact meaning. Thanks to Morten for the reminder!
Signed-off-by: Alex Shi <alex.shi@linaro.org>
---
 kernel/sched/fair.c | 32 +++++++++++++++-----------------
 1 file changed, 15 insertions(+), 17 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index eeffe75..5a3ea72 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1016,7 +1016,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
static unsigned long weighted_cpuload(const int cpu); static unsigned long source_load(int cpu); -static unsigned long target_load(int cpu); +static unsigned long target_load(int cpu, int imbalance_pct); static unsigned long power_of(int cpu); static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
@@ -3967,7 +3967,7 @@ static unsigned long source_load(int cpu) * Return a high guess at the load of a migration-target cpu weighted * according to the scheduling class and "nice" value. */ -static unsigned long target_load(int cpu) +static unsigned long target_load(int cpu, int imbalance_pct) { struct rq *rq = cpu_rq(cpu); unsigned long total = weighted_cpuload(cpu); @@ -3975,6 +3975,11 @@ static unsigned long target_load(int cpu) if (!sched_feat(LB_BIAS)) return total;
+ /* + * Bias target load with imbalance_pct. + */ + total = total * imbalance_pct / 100; + return max(rq->cpu_load, total); }
@@ -4190,8 +4195,8 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
this_cpu = smp_processor_id(); prev_cpu = task_cpu(p); - load = source_load(prev_cpu); - this_load = target_load(this_cpu); + load = weighted_cpuload(prev_cpu); + this_load = weighted_cpuload(this_cpu);
/* * If sync wakeup then subtract the (maximum possible) @@ -4247,7 +4252,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
if (balanced || (this_load <= load && - this_load + target_load(prev_cpu) <= tl_per_task)) { + this_load + weighted_cpuload(prev_cpu) <= tl_per_task)) { /* * This domain has SD_WAKE_AFFINE and * p is cache cold in this domain, and @@ -4293,7 +4298,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) if (local_group) load = source_load(i); else - load = target_load(i); + load = target_load(i, imbalance);
avg_load += load; } @@ -4309,7 +4314,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) } } while (group = group->next, group != sd->groups);
- if (!idlest || 100*this_load < imbalance*min_load) + if (!idlest || this_load < min_load) return NULL; return idlest; } @@ -5745,6 +5750,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, { unsigned long load; int i; + int bias = 100 + (env->sd->imbalance_pct - 100) / 2;
memset(sgs, 0, sizeof(*sgs));
@@ -5752,8 +5758,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, struct rq *rq = cpu_rq(i);
/* Bias balancing toward cpus of our domain */ - if (local_group) - load = target_load(i); + if (local_group && env->idle != CPU_IDLE) + load = target_load(i, bias); else load = source_load(i);
@@ -6193,14 +6199,6 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if ((local->idle_cpus < busiest->idle_cpus) && busiest->sum_nr_running <= busiest->group_weight) goto out_balanced; - } else { - /* - * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use - * imbalance_pct to be conservative. - */ - if (100 * busiest->avg_load <= - env->sd->imbalance_pct * local->avg_load) - goto out_balanced; }
force_balance:
After switching to sched_avg, the cpu load at idle exit is already decayed: it may be near zero when waking a task that slept for a long time, or a full, undecayed load when waking a newly forked task. So we can use it to reflect the cpu load directly and don't need to pretend it is 0.
Signed-off-by: Alex Shi <alex.shi@linaro.org>
---
 kernel/sched/proc.c | 19 ++-----------------
 1 file changed, 2 insertions(+), 17 deletions(-)
diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c index 057bb9b..383c4ba 100644 --- a/kernel/sched/proc.c +++ b/kernel/sched/proc.c @@ -461,28 +461,13 @@ void update_idle_cpu_load(struct rq *this_rq) }
/* - * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. + * Called from tick_nohz_idle_exit() */ void update_cpu_load_nohz(void) { struct rq *this_rq = this_rq(); - unsigned long curr_jiffies = ACCESS_ONCE(jiffies); - unsigned long pending_updates; - - if (curr_jiffies == this_rq->last_load_update_tick) - return;
- raw_spin_lock(&this_rq->lock); - pending_updates = curr_jiffies - this_rq->last_load_update_tick; - if (pending_updates) { - this_rq->last_load_update_tick = curr_jiffies; - /* - * We were idle, this means load 0, the current load might be - * !0 due to remote wakeups and the sort. - */ - __update_cpu_load(this_rq, 0); - } - raw_spin_unlock(&this_rq->lock); + update_idle_cpu_load(this_rq); } #endif /* CONFIG_NO_HZ */
The 'rq' local variable in source_load()/target_load() is no longer needed, so remove it.
Signed-off-by: Alex Shi <alex.shi@linaro.org>
---
 kernel/sched/fair.c | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5a3ea72..d91d925 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3954,13 +3954,7 @@ static unsigned long weighted_cpuload(const int cpu) */ static unsigned long source_load(int cpu) { - struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); - - if (!sched_feat(LB_BIAS)) - return total; - - return min(rq->cpu_load, total); + return weighted_cpuload(cpu); }
/* @@ -3969,7 +3963,6 @@ static unsigned long source_load(int cpu) */ static unsigned long target_load(int cpu, int imbalance_pct) { - struct rq *rq = cpu_rq(cpu); unsigned long total = weighted_cpuload(cpu);
if (!sched_feat(LB_BIAS)) @@ -3978,9 +3971,7 @@ static unsigned long target_load(int cpu, int imbalance_pct) /* * Bias target load with imbalance_pct. */ - total = total * imbalance_pct / 100; - - return max(rq->cpu_load, total); + return total * imbalance_pct / 100; }
static unsigned long power_of(int cpu)
Now, without load_idx, source_load() is simply weighted_cpuload(), so replace it and drop one more cpu-load concept.
Signed-off-by: Alex Shi <alex.shi@linaro.org>
---
 kernel/sched/fair.c | 21 ++++-----------------
 1 file changed, 4 insertions(+), 17 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d91d925..66fa69b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1015,7 +1015,6 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, }
static unsigned long weighted_cpuload(const int cpu); -static unsigned long source_load(int cpu); static unsigned long target_load(int cpu, int imbalance_pct); static unsigned long power_of(int cpu); static long effective_load(struct task_group *tg, int cpu, long wl, long wg); @@ -3939,27 +3938,15 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) }
#ifdef CONFIG_SMP -/* Used instead of source_load when we know the type == 0 */ +/* weighted cpu load with runnable time, 'nice' value on CFS class */ static unsigned long weighted_cpuload(const int cpu) { return cpu_rq(cpu)->cfs.runnable_load_avg; }
/* - * Return a low guess at the load of a migration-source cpu weighted - * according to the scheduling class and "nice" value. - * - * We want to under-estimate the load of migration sources, to - * balance conservatively. - */ -static unsigned long source_load(int cpu) -{ - return weighted_cpuload(cpu); -} - -/* * Return a high guess at the load of a migration-target cpu weighted - * according to the scheduling class and "nice" value. + * according to the runnable time and "nice" value. */ static unsigned long target_load(int cpu, int imbalance_pct) { @@ -4287,7 +4274,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) for_each_cpu(i, sched_group_cpus(group)) { /* Bias balancing toward cpus of our domain */ if (local_group) - load = source_load(i); + load = weighted_cpuload(i); else load = target_load(i, imbalance);
@@ -5752,7 +5739,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, if (local_group && env->idle != CPU_IDLE) load = target_load(i, bias); else - load = source_load(i); + load = weighted_cpuload(i);
sgs->group_load += load; sgs->sum_nr_running += rq->nr_running;
The cpu_load is just a copy of rq->cfs.runnable_load_avg, which is already kept up to date, so we can use the latter directly. That saves two rq members: cpu_load and nr_load_updates. We then no longer need __update_cpu_load() and just keep sched_avg_update(). get_rq_runnable_load(), which was only used for the cpu_load update, is removed as well.
Signed-off-by: Alex Shi <alex.shi@linaro.org>
---
 kernel/sched/core.c  |  2 --
 kernel/sched/debug.c |  2 --
 kernel/sched/proc.c  | 55 +++++++++++++---------------------------------------
 kernel/sched/sched.h |  2 --
 4 files changed, 13 insertions(+), 48 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ac2f10c..32602595 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6800,8 +6800,6 @@ void __init sched_init(void) INIT_LIST_HEAD(&rq->leaf_rt_rq_list); init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); #endif - - rq->cpu_load = 0; rq->last_load_update_tick = jiffies;
#ifdef CONFIG_SMP diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index a24d549..83737ce 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -298,12 +298,10 @@ do { \ SEQ_printf(m, " .%-30s: %lu\n", "load", rq->load.weight); P(nr_switches); - P(nr_load_updates); P(nr_uninterruptible); PN(next_balance); SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr))); PN(clock); - P(cpu_load); #undef P #undef PN
diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c index 383c4ba..dd3c2d9 100644 --- a/kernel/sched/proc.c +++ b/kernel/sched/proc.c @@ -8,12 +8,19 @@
#include "sched.h"
+#ifdef CONFIG_SMP unsigned long this_cpu_load(void) { - struct rq *this = this_rq(); - return this->cpu_load; + struct rq *rq = this_rq(); + return rq->cfs.runnable_load_avg; } - +#else +unsigned long this_cpu_load(void) +{ + struct rq *rq = this_rq(); + return rq->load.weight; +} +#endif
/* * Global load-average calculations @@ -398,34 +405,6 @@ static void calc_load_account_active(struct rq *this_rq) * End of global load-average stuff */
- -/* - * Update rq->cpu_load statistics. This function is usually called every - * scheduler tick (TICK_NSEC). With tickless idle this will not be called - * every tick. We fix it up based on jiffies. - */ -static void __update_cpu_load(struct rq *this_rq, unsigned long this_load) -{ - this_rq->nr_load_updates++; - - /* Update our load: */ - this_rq->cpu_load = this_load; /* Fasttrack for idx 0 */ - - sched_avg_update(this_rq); -} - -#ifdef CONFIG_SMP -static inline unsigned long get_rq_runnable_load(struct rq *rq) -{ - return rq->cfs.runnable_load_avg; -} -#else -static inline unsigned long get_rq_runnable_load(struct rq *rq) -{ - return rq->load.weight; -} -#endif - #ifdef CONFIG_NO_HZ_COMMON /* * There is no sane way to deal with nohz on smp when using jiffies because the @@ -447,17 +426,15 @@ static inline unsigned long get_rq_runnable_load(struct rq *rq) void update_idle_cpu_load(struct rq *this_rq) { unsigned long curr_jiffies = ACCESS_ONCE(jiffies); - unsigned long load = get_rq_runnable_load(this_rq);
/* * bail if there's load or we're actually up-to-date. */ - if (load || curr_jiffies == this_rq->last_load_update_tick) + if (curr_jiffies == this_rq->last_load_update_tick) return;
this_rq->last_load_update_tick = curr_jiffies; - - __update_cpu_load(this_rq, load); + sched_avg_update(this_rq); }
/* @@ -466,7 +443,6 @@ void update_idle_cpu_load(struct rq *this_rq) void update_cpu_load_nohz(void) { struct rq *this_rq = this_rq(); - update_idle_cpu_load(this_rq); } #endif /* CONFIG_NO_HZ */ @@ -476,12 +452,7 @@ void update_cpu_load_nohz(void) */ void update_cpu_load_active(struct rq *this_rq) { - unsigned long load = get_rq_runnable_load(this_rq); - /* - * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). - */ this_rq->last_load_update_tick = jiffies; - __update_cpu_load(this_rq, load); - + sched_avg_update(this_rq); calc_load_account_active(this_rq); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5b2d4a1..c623131 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -517,7 +517,6 @@ struct rq { unsigned int nr_numa_running; unsigned int nr_preferred_running; #endif - unsigned long cpu_load; unsigned long last_load_update_tick; #ifdef CONFIG_NO_HZ_COMMON u64 nohz_stamp; @@ -530,7 +529,6 @@ struct rq {
/* capture load from *all* tasks on this cpu: */ struct load_weight load; - unsigned long nr_load_updates; u64 nr_switches;
struct cfs_rq cfs;
Since there is no cpu_load update any more, rename the related functions:
  s/update_idle_cpu_load/update_idle_rt_avg/
  s/update_cpu_load_nohz/update_rt_avg_nohz/
  s/update_cpu_load_active/update_avg_load_active/
No functional change.
Signed-off-by: Alex Shi <alex.shi@linaro.org>
---
 include/linux/sched.h    | 2 +-
 kernel/sched/core.c      | 2 +-
 kernel/sched/fair.c      | 2 +-
 kernel/sched/proc.c      | 8 ++++----
 kernel/sched/sched.h     | 4 ++--
 kernel/time/tick-sched.c | 2 +-
 6 files changed, 10 insertions(+), 10 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h index 6c416c8..f6afcb3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -174,7 +174,7 @@ extern unsigned long this_cpu_load(void);
extern void calc_global_load(unsigned long ticks); -extern void update_cpu_load_nohz(void); +extern void update_rt_avg_nohz(void);
extern unsigned long get_parent_ip(unsigned long addr);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 32602595..74dae0e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2431,7 +2431,7 @@ void scheduler_tick(void) raw_spin_lock(&rq->lock); update_rq_clock(rq); curr->sched_class->task_tick(rq, curr, 0); - update_cpu_load_active(rq); + update_avg_load_active(rq); raw_spin_unlock(&rq->lock);
perf_event_task_tick(); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 66fa69b..6469c7a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6986,7 +6986,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
raw_spin_lock_irq(&rq->lock); update_rq_clock(rq); - update_idle_cpu_load(rq); + update_idle_rt_avg(rq); raw_spin_unlock_irq(&rq->lock);
rebalance_domains(rq, CPU_IDLE); diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c index dd3c2d9..42b7706 100644 --- a/kernel/sched/proc.c +++ b/kernel/sched/proc.c @@ -423,7 +423,7 @@ static void calc_load_account_active(struct rq *this_rq) * Called from nohz_idle_balance() to update the load ratings before doing the * idle balance. */ -void update_idle_cpu_load(struct rq *this_rq) +void update_idle_rt_avg(struct rq *this_rq) { unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
@@ -440,17 +440,17 @@ void update_idle_cpu_load(struct rq *this_rq) /* * Called from tick_nohz_idle_exit() */ -void update_cpu_load_nohz(void) +void update_rt_avg_nohz(void) { struct rq *this_rq = this_rq(); - update_idle_cpu_load(this_rq); + update_idle_rt_avg(this_rq); } #endif /* CONFIG_NO_HZ */
/* * Called from scheduler_tick() */ -void update_cpu_load_active(struct rq *this_rq) +void update_avg_load_active(struct rq *this_rq) { this_rq->last_load_update_tick = jiffies; sched_avg_update(this_rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c623131..ab310c2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -21,7 +21,7 @@ extern unsigned long calc_load_update; extern atomic_long_t calc_load_tasks;
extern long calc_load_fold_active(struct rq *this_rq); -extern void update_cpu_load_active(struct rq *this_rq); +extern void update_avg_load_active(struct rq *this_rq);
/* * Helpers for converting nanosecond timing to jiffy resolution @@ -1194,7 +1194,7 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
unsigned long to_ratio(u64 period, u64 runtime);
-extern void update_idle_cpu_load(struct rq *this_rq); +extern void update_idle_rt_avg(struct rq *this_rq);
extern void init_task_runnable_average(struct task_struct *p);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 9f8af69..b1a400a 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -866,7 +866,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) { /* Update jiffies first */ tick_do_update_jiffies64(now); - update_cpu_load_nohz(); + update_rt_avg_nohz();
calc_load_exit_idle(); touch_softlockup_watchdog();
task_hot doesn't need the 'sched_domain' parameter, so remove it.
Signed-off-by: Alex Shi <alex.shi@linaro.org>
---
 kernel/sched/fair.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6469c7a..b0c189d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5003,7 +5003,7 @@ static void move_task(struct task_struct *p, struct lb_env *env) * Is this task likely cache-hot: */ static int -task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) +task_hot(struct task_struct *p, u64 now) { s64 delta;
@@ -5164,7 +5164,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * 2) task is cache cold, or * 3) too many balance attempts have failed. */ - tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd); + tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq)); if (!tsk_cache_hot) tsk_cache_hot = migrate_degrades_locality(p, env);