On 5 November 2013 23:27, Peter Zijlstra <peterz@infradead.org> wrote:
On Tue, Nov 05, 2013 at 03:57:23PM +0100, Vincent Guittot wrote:
Your proposal looks fine to me. It's clearly better to move the configuration of the sched_domain fields into one place. Have you already got an idea about how to let the architecture override the topology?
Maybe something like the below -- completely untested (my s390 compiler is on a machine that's currently powered off).
My primary need comes from the fact that the topology configuration is not the same for all cores
Do expand.. the various cpu masks used in the topology list are per cpu, is that sufficient room to wriggle or do you need more?
My current implementation sets a flag in each level (SMT, MC and CPU) to describe the power gating capabilities for the groups of cpus, but the capabilities can be different within the same level; I mean that we can have a group of cpus that can power gate at MC level in the system whereas another group of CPUs can only power gate at CPU level. With the current implementation I can't tell the difference, so I have added the cpu parameter when setting the flags. The other solution is to add new topology levels with cpu masks that can give the power dependency with other cpus (currently power gating, but we could have more levels, for frequency dependency as an example). In this case the current implementation is enough, and the main difficulty will be where to insert these new levels relative to the current ones.
A typical example with one cluster that can power gate at core level whereas the other cluster can power gate at cluster level, will give the following domain topology:
If we set a flag in the current topology levels we should have something like below
CPU0: domain 0: span 0-1 level: SMT flags: SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN groups: 0 1 domain 1: span 0-7 level: MC flags: SD_SHARE_PKG_RESOURCES groups: 0-1 2-3 4-5 6-7 domain 2: span 0-15 level: CPU flags: groups: 0-7 8-15
CPU8 domain 0: span 8-9 level: SMT flags: SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN groups: 8 9 domain 1: span 8-15 level: MC flags: SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN groups: 8-9 10-11 12-13 14-15 domain 2: span 0-15 level CPU flags: groups: 8-15 0-7
If we create new levels, we could have something like below
CPU0 domain 0: span 0-1 level: SMT flags: SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES groups: 0 1 domain 1: span 0-7 level: MC flags: SD_SHARE_PKG_RESOURCES groups: 0-1 2-3 4-5 6-7 domain 2: span 0-15 level PWR flags SD_NOT_SHARE_POWERDOMAIN groups: 0-1 2-3 4-5 6-7 8-15 domain 3: span 0-15 level: CPU flags: groups: 0-7 8-15
CPU8 domain 0: span 8-9 level: SMT flags: SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES groups: 8 9 domain 1: span 8-15 level: MC flags: SD_SHARE_PKG_RESOURCES groups: 8-9 10-11 12-13 14-15 domain 2: span 0-15 level PWR flags SD_NOT_SHARE_POWERDOMAIN groups: 0-1 2-3 4-5 6-7 8-15 domain 3: span 0-15 level CPU flags: groups: 8-15 0-7
Vincent
--- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -1070,3 +1070,23 @@ static int __init s390_smp_init(void) return 0; } subsys_initcall(s390_smp_init);
+/*
+ * s390 scheduling-domain hierarchy, listed bottom-up:
+ * SMT siblings -> core group -> book -> whole machine.
+ * NULL ->mask terminates the list (see for_each_sd_topology()).
+ */
+static struct sched_domain_topology_level s390_topology[] = { +#ifdef CONFIG_SCHED_SMT
{ cpu_smt_mask, SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES }, /* HW threads sharing a core */
+#endif +#ifdef CONFIG_SCHED_MC
{ cpu_coregroup_mask, SD_SHARE_PKG_RESOURCES }, /* cores sharing package resources */
+#endif +#ifdef CONFIG_SCHED_BOOK
{ cpu_book_mask, }, /* s390-specific "book" level, no extra SD flags */
+#endif
{ cpu_cpu_mask, }, /* all CPUs */
{ NULL, }, /* terminator */
+};
+/*
+ * Install the s390-specific topology table before the scheduler builds
+ * its domains.  Must return 0: initcall return values are checked, and
+ * falling off the end of a non-void function is undefined behaviour.
+ */
+static int __init s390_sched_topology(void) +{
sched_domain_topology = s390_topology;
return 0;
+} +early_initcall(s390_sched_topology); --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -889,6 +889,20 @@ void free_sched_domains(cpumask_var_t do
bool cpus_share_cache(int this_cpu, int that_cpu);
+typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
+#define SDTL_OVERLAP 0x01
+/*
+ * One level of the scheduling-domain hierarchy.  An architecture may
+ * provide its own array of these (terminated by a NULL ->mask) and
+ * point sched_domain_topology at it to override the default topology.
+ */
+struct sched_domain_topology_level {
sched_domain_mask_f mask; /* cpumask spanned by this level for a given cpu */
int sd_flags; /* SD_* flags applied to domains built from this level */
int flags; /* SDTL_* flags (e.g. SDTL_OVERLAP) */
int numa_level; /* NOTE(review): presumably the NUMA distance index -- not set in this patch */
struct sd_data data; /* per-level allocation state used by domain construction */
+};
+extern struct sched_domain_topology_level *sched_domain_topology;
#else /* CONFIG_SMP */
struct sched_domain_attr; --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5377,20 +5377,6 @@ enum s_alloc { sa_none, };
-struct sched_domain_topology_level;
-typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
-#define SDTL_OVERLAP 0x01
-struct sched_domain_topology_level {
sched_domain_mask_f mask;
int sd_flags;
int flags;
int numa_level;
struct sd_data data;
-};
/*
- Build an iteration mask that can exclude certain CPUs from the upwards
- domain traversal.
@@ -5841,6 +5827,7 @@ sd_init(struct sched_domain_topology_lev
return sd;
}
/*
- Topology list, bottom-up.
*/ @@ -5851,14 +5838,11 @@ static struct sched_domain_topology_leve #ifdef CONFIG_SCHED_MC { cpu_coregroup_mask, SD_SHARE_PKG_RESOURCES }, #endif -#ifdef CONFIG_SCHED_BOOK
{ cpu_book_mask, },
-#endif { cpu_cpu_mask, }, { NULL, }, };
-static struct sched_domain_topology_level *sched_domain_topology = default_topology; +struct sched_domain_topology_level *sched_domain_topology = default_topology;
#define for_each_sd_topology(tl) \ for (tl = sched_domain_topology; tl->mask; tl++)