To save power, it would be useful to schedule work on CPUs that are already non-idle instead of waking up an idle one.
To achieve this, we need the scheduler to guide kernel frameworks (such as timers and workqueues) towards the most preferred CPU to use for this work.
This patch adds a new routine, sched_select_cpu(), which returns the preferred non-idle CPU. It takes the maximum sched-domain level up to which a CPU may be chosen, which must be one of the following options: SD_SIBLING, SD_MC, SD_BOOK, SD_CPU or SD_NUMA.
If the caller passes SD_MC, a CPU may be returned from either the SD_SIBLING or the SD_MC level. If none of the levels at or below the requested one is present in the current kernel configuration, the current CPU is returned.
If SD_NUMA is passed, NUMA levels may have to be traversed as well, and the second parameter, numa_level, comes into play. Its minimum value is zero, in which case only the first NUMA level is searched. To go through all NUMA levels, pass -1; since numa_level is a u32, this covers every NUMA level.
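For illustration, a rough sketch of how callers might use this interface (these call sites are hypothetical; in this patch only get_nohz_timer_target() is converted). A fuller walk-through of the level-mask arithmetic follows the diff.

	int cpu;

	/* prefer a non-idle CPU, searching no further than the MC level */
	cpu = sched_select_cpu(SD_MC, 0);

	/* search every available level, including all NUMA levels; since
	 * numa_level is a u32, passing -1 wraps to UINT_MAX */
	cpu = sched_select_cpu(SD_NUMA, -1);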
The implementation reuses the code from get_nohz_timer_target(), which did something similar, and get_nohz_timer_target() is modified to use sched_select_cpu() now.
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 include/linux/sched.h |   11 +++++++
 kernel/sched/core.c   |   88 +++++++++++++++++++++++++++++++++++++++------------
 2 files changed, 79 insertions(+), 20 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0059212..4b660ee 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -281,6 +281,10 @@ static inline void select_nohz_load_balancer(int stop_tick) { }
 static inline void set_cpu_sd_state_idle(void) { }
 #endif
 
+#ifdef CONFIG_SMP
+extern int sched_select_cpu(int sd_max_level, u32 numa_level);
+#endif
+
 /*
  * Only dump TASK_* tasks. (0 for all tasks)
  */
@@ -868,6 +872,13 @@ enum cpu_idle_type {
 #define SD_PREFER_SIBLING	0x1000	/* Prefer to place tasks in a sibling domain */
 #define SD_OVERLAP		0x2000	/* sched_domains of this level overlap */
 
+/* sched-domain levels */
+#define SD_SIBLING	0x01	/* Only for CONFIG_SCHED_SMT */
+#define SD_MC		0x02	/* Only for CONFIG_SCHED_MC */
+#define SD_BOOK		0x04	/* Only for CONFIG_SCHED_BOOK */
+#define SD_CPU		0x08	/* Always enabled */
+#define SD_NUMA		0x10	/* Only for CONFIG_NUMA */
+
 extern int __weak arch_sd_sibiling_asym_packing(void);
 
 struct sched_group_power {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index de97083..a14014c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -551,22 +551,7 @@ void resched_cpu(int cpu)
  */
 int get_nohz_timer_target(void)
 {
-	int cpu = smp_processor_id();
-	int i;
-	struct sched_domain *sd;
-
-	rcu_read_lock();
-	for_each_domain(cpu, sd) {
-		for_each_cpu(i, sched_domain_span(sd)) {
-			if (!idle_cpu(i)) {
-				cpu = i;
-				goto unlock;
-			}
-		}
-	}
-unlock:
-	rcu_read_unlock();
-	return cpu;
+	return sched_select_cpu(SD_NUMA, -1);
 }
 /*
  * When add_timer_on() enqueues a timer into the timer wheel of an
@@ -639,6 +624,66 @@ void sched_avg_update(struct rq *rq)
 	}
 }
 
+/* Mask of all the SD levels present in current configuration */
+static int sd_present_levels;
+
+/*
+ * This routine returns the preferred cpu which is non-idle. It accepts max
+ * level of sched domain, upto which we can choose a CPU from. It can accept
+ * following options: SD_SIBLING, SD_MC, SD_BOOK, SD_CPU or SD_NUMA.
+ *
+ * If user passed SD_MC, then we can return a CPU from SD_SIBLING or SD_MC.
+ * If the level requested by user is not available for the current kernel
+ * configuration, then current CPU will be returned.
+ *
+ * If user has passed NUMA level, then we may need to go through numa_levels
+ * too. Second parameter to this routine will now come into play. Its minimum
+ * value is zero, in which case there is only one NUMA level to go through. If
+ * you want to go through all NUMA levels, pass -1 here. This should cover all
+ * NUMA levels.
+ */
+int sched_select_cpu(int sd_max_level, u32 numa_level)
+{
+	struct sched_domain *sd;
+	int cpu = smp_processor_id();
+	int i, sd_target_levels;
+
+	sd_target_levels = (sd_max_level | (sd_max_level - 1))
+				& sd_present_levels;
+
+	/* return current cpu if no sd_present_levels <= sd_max_level */
+	if (!sd_target_levels)
+		return cpu;
+
+	rcu_read_lock();
+	for_each_domain(cpu, sd) {
+		for_each_cpu(i, sched_domain_span(sd)) {
+			if (!idle_cpu(i)) {
+				cpu = i;
+				goto unlock;
+			}
+		}
+
+		/* Do we need to go through NUMA levels now */
+		if (sd_target_levels == SD_NUMA) {
+			/* Go through NUMA levels until numa_level is zero */
+			if (numa_level--)
+				continue;
+		}
+
+		/*
+		 * clear first bit set in sd_target_levels, and return if no
+		 * more sd levels must be checked
+		 */
+		sd_target_levels &= sd_target_levels - 1;
+		if (!sd_target_levels)
+			goto unlock;
+	}
+unlock:
+	rcu_read_unlock();
+	return cpu;
+}
+
 #else	/* !CONFIG_SMP */
 void resched_task(struct task_struct *p)
 {
@@ -6188,6 +6233,7 @@ typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
 struct sched_domain_topology_level {
 	sched_domain_init_f init;
 	sched_domain_mask_f mask;
+	int		    level_mask;
 	int		    flags;
 	int		    numa_level;
 	struct sd_data      data;
@@ -6434,6 +6480,7 @@ sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
 	*sd = SD_##type##_INIT;					\
 	SD_INIT_NAME(sd, type);					\
 	sd->private = &tl->data;				\
+	sd_present_levels |= tl->level_mask;			\
 	return sd;						\
 }
 
@@ -6547,15 +6594,15 @@ static const struct cpumask *cpu_smt_mask(int cpu)
  */
 static struct sched_domain_topology_level default_topology[] = {
 #ifdef CONFIG_SCHED_SMT
-	{ sd_init_SIBLING, cpu_smt_mask, },
+	{ sd_init_SIBLING, cpu_smt_mask, SD_SIBLING, },
 #endif
 #ifdef CONFIG_SCHED_MC
-	{ sd_init_MC, cpu_coregroup_mask, },
+	{ sd_init_MC, cpu_coregroup_mask, SD_MC, },
 #endif
 #ifdef CONFIG_SCHED_BOOK
-	{ sd_init_BOOK, cpu_book_mask, },
+	{ sd_init_BOOK, cpu_book_mask, SD_BOOK, },
 #endif
-	{ sd_init_CPU, cpu_cpu_mask, },
+	{ sd_init_CPU, cpu_cpu_mask, SD_CPU, },
 	{ NULL, },
 };
 
@@ -6778,6 +6825,7 @@ static void sched_init_numa(void)
 		};
 	}
 
+	sd_present_levels |= SD_NUMA;
 	sched_domain_topology = tl;
 }
 #else
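To make the level-mask arithmetic concrete, here is a small userspace sketch (an illustration only, reusing the SD_* values from the patch; the chosen configuration is an assumption) showing how sd_target_levels is derived and then consumed one level at a time:

	#include <stdio.h>

	#define SD_SIBLING	0x01
	#define SD_MC		0x02
	#define SD_BOOK		0x04
	#define SD_CPU		0x08
	#define SD_NUMA		0x10

	int main(void)
	{
		/* example config: SMT, MC and CPU levels present, no BOOK/NUMA */
		int sd_present_levels = SD_SIBLING | SD_MC | SD_CPU;	/* 0x0b */
		int sd_max_level = SD_CPU;	/* caller's upper limit */

		/*
		 * (x | (x - 1)) also sets every bit below the requested
		 * level, so all present levels up to and including
		 * sd_max_level are kept.
		 */
		int sd_target_levels = (sd_max_level | (sd_max_level - 1))
					& sd_present_levels;

		printf("sd_target_levels = %#04x\n", sd_target_levels); /* 0x0b */

		/* one bit is consumed per visited domain level, lowest first */
		while (sd_target_levels) {
			printf("search level %#04x\n",
			       sd_target_levels & -sd_target_levels);
			sd_target_levels &= sd_target_levels - 1;
		}
		return 0;
	}

This mirrors the loop in sched_select_cpu(): for_each_domain() walks the domains bottom-up, and sd_target_levels loses its lowest set bit after each level until there is nothing left to search.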