From: Mark Brown broonie@linaro.org
This reposting of the series rolls together the work from myself and Zi Shen, it includes the revisions Lorenzo provided on the last round of review. It is also available at:
git://git.kernel.org/pub/scm/linux/kernel/git/broonie/misc.git tags/arm64-topology-dt-mpidr
Mark Brown (4): arm64: topology: Initialise default topology state immediately arm64: topology: Add support for topology DT bindings arm64: topology: Tell the scheduler about the relative power of cores arm64: topology: Provide relative power numbers for cores
Zi Shen Lim (2): arm64: sched: Remove unused mc_capable() and smt_capable() arm64: topology: add MPIDR-based detection
arch/arm64/include/asm/cputype.h | 5 + arch/arm64/include/asm/topology.h | 3 - arch/arm64/kernel/topology.c | 411 ++++++++++++++++++++++++++++++++++++-- 3 files changed, 401 insertions(+), 18 deletions(-)
From: Zi Shen Lim zlim@broadcom.com
Remove unused and deprecated mc_capable() and smt_capable().
Both were added recently by f6e763b93a6c ("arm64: topology: Implement basic CPU topology support"). Uses of both were removed by 8e7fbcbc22c1 ("sched: Remove stale power aware scheduling remnants and dysfunctional knobs").
Signed-off-by: Zi Shen Lim zlim@broadcom.com Signed-off-by: Mark Brown broonie@linaro.org --- arch/arm64/include/asm/topology.h | 3 --- 1 file changed, 3 deletions(-)
diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index 0172e6d..7ebcd31 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h @@ -20,9 +20,6 @@ extern struct cpu_topology cpu_topology[NR_CPUS]; #define topology_core_cpumask(cpu) (&cpu_topology[cpu].core_sibling) #define topology_thread_cpumask(cpu) (&cpu_topology[cpu].thread_sibling)
-#define mc_capable() (cpu_topology[0].cluster_id != -1) -#define smt_capable() (cpu_topology[0].thread_id != -1) - void init_cpu_topology(void); void store_cpu_topology(unsigned int cpuid); const struct cpumask *cpu_coregroup_mask(int cpu);
From: Mark Brown broonie@linaro.org
As a legacy of the way 32 bit ARM did things the topology code uses a null topology map by default and then overwrites it by mapping cores with no information to a cluster by themselves later. In order to make it simpler to reset things as part of recovering from parse failures in firmware information directly set this configuration on init. A core will always be its own sibling so there should be no risk of confusion with firmware provided information.
Signed-off-by: Mark Brown broonie@linaro.org Reviewed-by: Lorenzo Pieralisi lorenzo.pieralisi@arm.com --- arch/arm64/kernel/topology.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 3e06b0b..ff662b2 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -43,9 +43,6 @@ static void update_siblings_masks(unsigned int cpuid) * reset it to default behaviour */ pr_debug("CPU%u: No topology information configured\n", cpuid); - cpuid_topo->core_id = 0; - cpumask_set_cpu(cpuid, &cpuid_topo->core_sibling); - cpumask_set_cpu(cpuid, &cpuid_topo->thread_sibling); return; }
@@ -87,9 +84,12 @@ void __init init_cpu_topology(void) struct cpu_topology *cpu_topo = &cpu_topology[cpu];
cpu_topo->thread_id = -1; - cpu_topo->core_id = -1; + cpu_topo->core_id = 0; cpu_topo->cluster_id = -1; + cpumask_clear(&cpu_topo->core_sibling); + cpumask_set_cpu(cpu, &cpu_topo->core_sibling); cpumask_clear(&cpu_topo->thread_sibling); + cpumask_set_cpu(cpu, &cpu_topo->thread_sibling); } }
From: Mark Brown broonie@linaro.org
Add support for parsing the explicit topology bindings to discover the topology of the system.
Since it is not currently clear how to map multi-level clusters for the scheduler all leaf clusters are presented to the scheduler at the same level. This should be enough to provide good support for current systems.
Signed-off-by: Mark Brown broonie@linaro.org Reviewed-by: Lorenzo Pieralisi lorenzo.pieralisi@arm.com --- arch/arm64/kernel/topology.c | 204 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 196 insertions(+), 8 deletions(-)
diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index ff662b2..43514f9 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -17,10 +17,192 @@ #include <linux/percpu.h> #include <linux/node.h> #include <linux/nodemask.h> +#include <linux/of.h> #include <linux/sched.h>
#include <asm/topology.h>
+static int __init get_cpu_for_node(struct device_node *node) +{ + struct device_node *cpu_node; + int cpu; + + cpu_node = of_parse_phandle(node, "cpu", 0); + if (!cpu_node) + return -1; + + for_each_possible_cpu(cpu) { + if (of_get_cpu_node(cpu, NULL) == cpu_node) { + of_node_put(cpu_node); + return cpu; + } + } + + pr_crit("Unable to find CPU node for %s\n", cpu_node->full_name); + + of_node_put(cpu_node); + return -1; +} + +static int __init parse_core(struct device_node *core, int cluster_id, + int core_id) +{ + char name[10]; + bool leaf = true; + int i = 0; + int cpu; + struct device_node *t; + + do { + snprintf(name, sizeof(name), "thread%d", i); + t = of_get_child_by_name(core, name); + if (t) { + leaf = false; + cpu = get_cpu_for_node(t); + if (cpu >= 0) { + cpu_topology[cpu].cluster_id = cluster_id; + cpu_topology[cpu].core_id = core_id; + cpu_topology[cpu].thread_id = i; + } else { + pr_err("%s: Can't get CPU for thread\n", + t->full_name); + of_node_put(t); + return -EINVAL; + } + of_node_put(t); + } + i++; + } while (t); + + cpu = get_cpu_for_node(core); + if (cpu >= 0) { + if (!leaf) { + pr_err("%s: Core has both threads and CPU\n", + core->full_name); + return -EINVAL; + } + + cpu_topology[cpu].cluster_id = cluster_id; + cpu_topology[cpu].core_id = core_id; + } else if (leaf) { + pr_err("%s: Can't get CPU for leaf core\n", core->full_name); + return -EINVAL; + } + + return 0; +} + +static int __init parse_cluster(struct device_node *cluster, int depth) +{ + char name[10]; + bool leaf = true; + bool has_cores = false; + struct device_node *c; + static int cluster_id __initdata; + int core_id = 0; + int i, ret; + + /* + * First check for child clusters; we currently ignore any + * information about the nesting of clusters and present the + * scheduler with a flat list of them. 
+ */ + i = 0; + do { + snprintf(name, sizeof(name), "cluster%d", i); + c = of_get_child_by_name(cluster, name); + if (c) { + leaf = false; + ret = parse_cluster(c, depth + 1); + of_node_put(c); + if (ret != 0) + return ret; + } + i++; + } while (c); + + /* Now check for cores */ + i = 0; + do { + snprintf(name, sizeof(name), "core%d", i); + c = of_get_child_by_name(cluster, name); + if (c) { + has_cores = true; + + if (depth == 0) { + pr_err("%s: cpu-map children should be clusters\n", + c->full_name); + of_node_put(c); + return -EINVAL; + } + + if (leaf) { + ret = parse_core(c, cluster_id, core_id++); + } else { + pr_err("%s: Non-leaf cluster with core %s\n", + cluster->full_name, name); + ret = -EINVAL; + } + + of_node_put(c); + if (ret != 0) + return ret; + } + i++; + } while (c); + + if (leaf && !has_cores) + pr_warn("%s: empty cluster\n", cluster->full_name); + + if (leaf) + cluster_id++; + + return 0; +} + +static int __init parse_dt_topology(void) +{ + struct device_node *cn, *map; + int ret = 0; + int cpu; + + cn = of_find_node_by_path("/cpus"); + if (!cn) { + pr_err("No CPU information found in DT\n"); + return 0; + } + + /* + * When topology is provided cpu-map is essentially a root + * cluster with restricted subnodes. + */ + map = of_get_child_by_name(cn, "cpu-map"); + if (!map) + goto out; + + ret = parse_cluster(map, 0); + if (ret != 0) + goto out_map; + + /* + * Check that all cores are in the topology; the SMP code will + * only mark cores described in the DT as possible. + */ + for_each_possible_cpu(cpu) { + if (cpu_topology[cpu].cluster_id == -1) { + pr_err("CPU%d: No topology information specified\n", + cpu); + ret = -EINVAL; + } + } + +out_map: + of_node_put(map); +out: + of_node_put(cn); + return ret; +} + /* * cpu topology table */ @@ -39,8 +221,7 @@ static void update_siblings_masks(unsigned int cpuid)
if (cpuid_topo->cluster_id == -1) { /* - * DT does not contain topology information for this cpu - * reset it to default behaviour + * DT does not contain topology information for this cpu. */ pr_debug("CPU%u: No topology information configured\n", cpuid); return; @@ -71,15 +252,10 @@ void store_cpu_topology(unsigned int cpuid) update_siblings_masks(cpuid); }
-/* - * init_cpu_topology is called at boot when only one cpu is running - * which prevent simultaneous write access to cpu_topology array - */ -void __init init_cpu_topology(void) +static void __init reset_cpu_topology(void) { unsigned int cpu;
- /* init core mask and power*/ for_each_possible_cpu(cpu) { struct cpu_topology *cpu_topo = &cpu_topology[cpu];
@@ -93,3 +269,15 @@ void __init init_cpu_topology(void) cpumask_set_cpu(cpu, &cpu_topo->thread_sibling); } } + +void __init init_cpu_topology(void) +{ + reset_cpu_topology(); + + /* + * Discard anything that was parsed if we hit an error so we + * don't use partial information. + */ + if (parse_dt_topology()) + reset_cpu_topology(); +}
From: Zi Shen Lim zlim@broadcom.com
Create cpu topology based on MPIDR. When hardware sets MPIDR to sane values, this method will always work. Therefore it should also work well as the fallback method. [1]
When we have multiple processing elements in the system, we create the cpu topology by mapping each affinity level (from lowest to highest) to threads (if they exist), cores, and clusters.
We combine data from all higher affinity levels into cluster_id so we don't lose any information from MPIDR. [2]
[1] http://www.spinics.net/lists/arm-kernel/msg317445.html [2] https://lkml.org/lkml/2014/4/23/703
[Raise the priority of the error message if we don't discover topology now that we can read it from MPIDR -- broonie]
Signed-off-by: Zi Shen Lim zlim@broadcom.com Signed-off-by: Mark Brown broonie@linaro.org --- arch/arm64/include/asm/cputype.h | 5 +++++ arch/arm64/kernel/topology.c | 46 ++++++++++++++++++++++++++++++++++++---- 2 files changed, 47 insertions(+), 4 deletions(-)
diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index c404fb0..b3b3287 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -18,6 +18,8 @@
#define INVALID_HWID ULONG_MAX
+#define MPIDR_UP_BITMASK (0x1 << 30) +#define MPIDR_MT_BITMASK (0x1 << 24) #define MPIDR_HWID_BITMASK 0xff00ffffff
#define MPIDR_LEVEL_BITS_SHIFT 3 @@ -30,6 +32,9 @@ #define MPIDR_AFFINITY_LEVEL(mpidr, level) \ ((mpidr >> MPIDR_LEVEL_SHIFT(level)) & MPIDR_LEVEL_MASK)
+#define MPIDR_AFF_MASK(level) \ + ((u64)MPIDR_LEVEL_MASK << MPIDR_LEVEL_SHIFT(level)) + #define read_cpuid(reg) ({ \ u64 __val; \ asm("mrs %0, " #reg : "=r" (__val)); \ diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 43514f9..3b2479c 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -20,6 +20,8 @@ #include <linux/of.h> #include <linux/sched.h>
+#include <asm/cputype.h> +#include <asm/smp_plat.h> #include <asm/topology.h>
static int __init get_cpu_for_node(struct device_node *node) @@ -220,10 +222,8 @@ static void update_siblings_masks(unsigned int cpuid) int cpu;
if (cpuid_topo->cluster_id == -1) { - /* - * DT does not contain topology information for this cpu. - */ - pr_debug("CPU%u: No topology information configured\n", cpuid); + /* No topology information for this cpu ?! */ + pr_err("CPU%u: No topology information configured\n", cpuid); return; }
@@ -249,6 +249,44 @@ static void update_siblings_masks(unsigned int cpuid)
void store_cpu_topology(unsigned int cpuid) { + struct cpu_topology *cpuid_topo = &cpu_topology[cpuid]; + u64 mpidr; + + if (cpuid_topo->cluster_id != -1) + goto topology_populated; + + mpidr = read_cpuid_mpidr(); + + /* Create cpu topology mapping based on MPIDR. */ + if (mpidr & MPIDR_UP_BITMASK) { + /* Uniprocessor system */ + cpuid_topo->thread_id = -1; + cpuid_topo->core_id = MPIDR_AFFINITY_LEVEL(mpidr, 0); + cpuid_topo->cluster_id = 0; + } else if (mpidr & MPIDR_MT_BITMASK) { + /* Multiprocessor system : Multi-threads per core */ + cpuid_topo->thread_id = MPIDR_AFFINITY_LEVEL(mpidr, 0); + cpuid_topo->core_id = MPIDR_AFFINITY_LEVEL(mpidr, 1); + cpuid_topo->cluster_id = + ((mpidr & MPIDR_AFF_MASK(2)) >> mpidr_hash.shift_aff[2] | + (mpidr & MPIDR_AFF_MASK(3)) >> mpidr_hash.shift_aff[3]) + >> mpidr_hash.shift_aff[1] >> mpidr_hash.shift_aff[0]; + } else { + /* Multiprocessor system : Single-thread per core */ + cpuid_topo->thread_id = -1; + cpuid_topo->core_id = MPIDR_AFFINITY_LEVEL(mpidr, 0); + cpuid_topo->cluster_id = + ((mpidr & MPIDR_AFF_MASK(1)) >> mpidr_hash.shift_aff[1] | + (mpidr & MPIDR_AFF_MASK(2)) >> mpidr_hash.shift_aff[2] | + (mpidr & MPIDR_AFF_MASK(3)) >> mpidr_hash.shift_aff[3]) + >> mpidr_hash.shift_aff[0]; + } + + pr_debug("CPU%u: cluster %d core %d thread %d mpidr %llx\n", + cpuid, cpuid_topo->cluster_id, cpuid_topo->core_id, + cpuid_topo->thread_id, mpidr); + +topology_populated: update_siblings_masks(cpuid); }
On 02/05/14 21:38, Mark Brown wrote:
From: Zi Shen Lim zlim@broadcom.com
Create cpu topology based on MPIDR. When hardware sets MPIDR to sane values, this method will always work. Therefore it should also work well as the fallback method. [1]
When we have multiple processing elements in the system, we create the cpu topology by mapping each affinity level (from lowest to highest) to threads (if they exist), cores, and clusters.
We combine data from all higher affinity levels into cluster_id so we don't lose any information from MPIDR. [2]
[1] http://www.spinics.net/lists/arm-kernel/msg317445.html [2] https://lkml.org/lkml/2014/4/23/703
[Raise the priority of the error message if we don't discover topology now that we can read it from MPIDR -- broonie]
Signed-off-by: Zi Shen Lim zlim@broadcom.com Signed-off-by: Mark Brown broonie@linaro.org
arch/arm64/include/asm/cputype.h | 5 +++++ arch/arm64/kernel/topology.c | 46 ++++++++++++++++++++++++++++++++++++---- 2 files changed, 47 insertions(+), 4 deletions(-)
diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index c404fb0..b3b3287 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -18,6 +18,8 @@
#define INVALID_HWID ULONG_MAX
+#define MPIDR_UP_BITMASK (0x1 << 30) +#define MPIDR_MT_BITMASK (0x1 << 24) #define MPIDR_HWID_BITMASK 0xff00ffffff
#define MPIDR_LEVEL_BITS_SHIFT 3 @@ -30,6 +32,9 @@ #define MPIDR_AFFINITY_LEVEL(mpidr, level) \ ((mpidr >> MPIDR_LEVEL_SHIFT(level)) & MPIDR_LEVEL_MASK)
+#define MPIDR_AFF_MASK(level) \
- ((u64)MPIDR_LEVEL_MASK << MPIDR_LEVEL_SHIFT(level))
- #define read_cpuid(reg) ({ \ u64 __val; \ asm("mrs %0, " #reg : "=r" (__val)); \
diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 43514f9..3b2479c 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -20,6 +20,8 @@ #include <linux/of.h> #include <linux/sched.h>
+#include <asm/cputype.h> +#include <asm/smp_plat.h> #include <asm/topology.h>
static int __init get_cpu_for_node(struct device_node *node) @@ -220,10 +222,8 @@ static void update_siblings_masks(unsigned int cpuid) int cpu;
if (cpuid_topo->cluster_id == -1) {
/*
* DT does not contain topology information for this cpu.
*/
pr_debug("CPU%u: No topology information configured\n", cpuid);
/* No topology information for this cpu ?! */
return; }pr_err("CPU%u: No topology information configured\n", cpuid);
@@ -249,6 +249,44 @@ static void update_siblings_masks(unsigned int cpuid)
void store_cpu_topology(unsigned int cpuid) {
- struct cpu_topology *cpuid_topo = &cpu_topology[cpuid];
- u64 mpidr;
- if (cpuid_topo->cluster_id != -1)
goto topology_populated;
- mpidr = read_cpuid_mpidr();
- /* Create cpu topology mapping based on MPIDR. */
- if (mpidr & MPIDR_UP_BITMASK) {
/* Uniprocessor system */
cpuid_topo->thread_id = -1;
cpuid_topo->core_id = MPIDR_AFFINITY_LEVEL(mpidr, 0);
cpuid_topo->cluster_id = 0;
- } else if (mpidr & MPIDR_MT_BITMASK) {
/* Multiprocessor system : Multi-threads per core */
cpuid_topo->thread_id = MPIDR_AFFINITY_LEVEL(mpidr, 0);
cpuid_topo->core_id = MPIDR_AFFINITY_LEVEL(mpidr, 1);
cpuid_topo->cluster_id =
((mpidr & MPIDR_AFF_MASK(2)) >> mpidr_hash.shift_aff[2] |
(mpidr & MPIDR_AFF_MASK(3)) >> mpidr_hash.shift_aff[3])
>> mpidr_hash.shift_aff[1] >> mpidr_hash.shift_aff[0];
This is broken, IIRC Lorenzo commented on this in the previous version of the patch.
- } else {
/* Multiprocessor system : Single-thread per core */
cpuid_topo->thread_id = -1;
cpuid_topo->core_id = MPIDR_AFFINITY_LEVEL(mpidr, 0);
cpuid_topo->cluster_id =
((mpidr & MPIDR_AFF_MASK(1)) >> mpidr_hash.shift_aff[1] |
(mpidr & MPIDR_AFF_MASK(2)) >> mpidr_hash.shift_aff[2] |
(mpidr & MPIDR_AFF_MASK(3)) >> mpidr_hash.shift_aff[3])
>> mpidr_hash.shift_aff[0];
Ditto here.
Take a simple example of system with 2 Quad core clusters. The mpidr_hash.shift_aff[1] will be 6 as you need minimum 2 bits to represent aff[0]. So you will end up with second cluster with id = 4 instead of 1.
I am not sure if we need this serialization, but even if we need it you can't simply use the hash bits generated for MPIDR.Aff{3..0} serialization directly as is for serializing parts of it.
Regards, Sudeep
On Fri, May 16, 2014 at 05:34:04PM +0100, Sudeep Holla wrote:
This is broken, IIRC Lorenzo commented on this in the previous version of the patch.
Could you please be specific? Lorenzo was concerned about overflow but that ought to be addressed here.
Take a simple example of system with 2 Quad core clusters. The mpidr_hash.shift_aff[1] will be 6 as you need minimum 2 bits to represent aff[0]. So you will end up with second cluster with id = 4 instead of 1.
This isn't a problem, the clusters can have any numbers so long as they are distinct. There is no other requirement.
I am not sure if we need this serialization, but even if we need it you can't simply use the hash bits generated for MPIDR.Aff{3..0} serialization directly as is for serializing parts of it.
Ah, now I look at what the hash is doing that is indeed directly useful - we can mask or shift out the bits we don't want. Equally well it just looks like a preference?
This does seem like something that could be dealt with incrementally.
On 16/05/14 19:39, Mark Brown wrote:
On Fri, May 16, 2014 at 05:34:04PM +0100, Sudeep Holla wrote:
This is broken, IIRC Lorenzo commented on this in the previous version of the patch.
Could you please be specific? Lorenzo was concerned about overflow but that ought to be addressed here.
Ah, my bad. You are right, I took his comment on the shift to be different from overflow which is not the case.
Take a simple example of system with 2 Quad core clusters. The mpidr_hash.shift_aff[1] will be 6 as you need minimum 2 bits to represent aff[0]. So you will end up with second cluster with id = 4 instead of 1.
This isn't a problem, the clusters can have any numbers so long as they are distinct. There is no other requirement.
IIUC this topology information is exposed via sysfs. So it's good to have uniformity even though they can have any number. As mentioned in the example, if the linearisation depends on aff[0], then this factor will not be uniform.
I am not sure if we need this serialization, but even if we need it you can't simply use the hash bits generated for MPIDR.Aff{3..0} serialization directly as is for serializing parts of it.
Ah, now I look at what the hash is doing that is indeed directly useful
- we can mask or shift out the bits we don't want. Equally well it just
looks like a preference?
Yes we can use the hash bits, but the way it's done in this patch needs fixing so that we can be more uniform (as it's exposed via sysfs)
This does seem like something that could be dealt with incrementally.
Sorry, I didn't mean to block this patch, I am just mentioning the possible issue with this patch.
Regards, Sudeep
On Mon, May 19, 2014 at 10:57:40AM +0100, Sudeep Holla wrote:
On 16/05/14 19:39, Mark Brown wrote:
On Fri, May 16, 2014 at 05:34:04PM +0100, Sudeep Holla wrote:
This is broken, IIRC Lorenzo commented on this in the previous version of the patch.
Could you please be specific? Lorenzo was concerned about overflow but that ought to be addressed here.
Ah, my bad. You are right, I took his comment on the shift to be different from overflow which is not the case.
Take a simple example of system with 2 Quad core clusters. The mpidr_hash.shift_aff[1] will be 6 as you need minimum 2 bits to represent aff[0]. So you will end up with second cluster with id = 4 instead of 1.
This isn't a problem, the clusters can have any numbers so long as they are distinct. There is no other requirement.
IIUC this topology information is exposed via sysfs. So it's good to have uniformity even though they can have any number. As mentioned in the example, if the linearisation depends on aff[0], then this factor will not be uniform.
I am not sure if we need this serialization, but even if we need it you can't simply use the hash bits generated for MPIDR.Aff{3..0} serialization directly as is for serializing parts of it.
Ah, now I look at what the hash is doing that is indeed directly useful
- we can mask or shift out the bits we don't want. Equally well it just
looks like a preference?
Yes we can use the hash bits, but the way it's done in this patch needs fixing so that we can be more uniform (as it's exposed via sysfs)
This does seem like something that could be dealt with incrementally.
Sorry, I didn't mean to block this patch, I am just mentioning the possible issue with this patch.
Hash is used to save memory, if all we need is a unique identifier we do not need the hash at all.
As to what should we use as cluster id, honestly I do not know.
I am quite tempted to use just three affinity levels for the moment and fix it when need arises, after all on ARM we have aff2 completely unused at the moment for the non-SMT systems (ie all ARM SMP systems in the mainline) and we are not coalescing affinity 2 into affinity 1 in any way.
So either you ignore aff3, or can do something like that (non-SMT):
cluster_id = MPIDR_EL1[39:32] << 16 | MPIDR_EL1[23:16] << 8 | MPIDR_EL1[15:8]
Also please remove the warning on the missing topology information, if we fall back to MPIDR_EL1 we will always have a topology of sorts, as borked as it might be, so that warning becomes useless, ie it is never triggered.
Lorenzo
On Mon, May 19, 2014 at 11:54:05AM +0100, Lorenzo Pieralisi wrote:
As to what should we use as cluster id, honestly I do not know.
I am quite tempted to use just three affinity levels for the moment and fix it when need arises, after all on ARM we have aff2 completely unused at the moment for the non-SMT systems (ie all ARM SMP systems in the mainline) and we are not coalescing affinity 2 into affinity 1 in any way.
So either you ignore aff3, or can do something like that (non-SMT):
cluster_id = MPIDR_EL1[39:32] << 16 | MPIDR_EL1[23:16] << 8 | MPIDR_EL1[15:8]
Which is roughly what the original code that you were worried about did IIRC? It seems silly to ignore the higher affinity levels since it's trivial to look at them and means the kernel will at least split everything into clusters if it does encounter some hardware that uses them as opposed to merging those distinguished only by the higher affinity levels.
Also please remove the warning on the missing topology information, if we fall back to MPIDR_EL1 we will always have a topology of sorts, as borked as it might be, so that warning becomes useless, ie it is never triggered.
I'll add something to this patch - the warning is needed if the DT code gets merged without this and it seems this one is still going round the loop. :/
On Mon, May 19, 2014 at 01:33:14PM +0100, Mark Brown wrote:
On Mon, May 19, 2014 at 11:54:05AM +0100, Lorenzo Pieralisi wrote:
As to what should we use as cluster id, honestly I do not know.
I am quite tempted to use just three affinity levels for the moment and fix it when need arises, after all on ARM we have aff2 completely unused at the moment for the non-SMT systems (ie all ARM SMP systems in the mainline) and we are not coalescing affinity 2 into affinity 1 in any way.
So either you ignore aff3, or can do something like that (non-SMT):
cluster_id = MPIDR_EL1[39:32] << 16 | MPIDR_EL1[23:16] << 8 | MPIDR_EL1[15:8]
Which is roughly what the original code that you were worried about did IIRC? It seems silly to ignore the higher affinity levels since it's trivial to look at them and means the kernel will at least split everything into clusters if it does encounter some hardware that uses them as opposed to merging those distinguished only by the higher affinity levels.
No, it is not, at all, you do not remember correctly I am afraid.
Using the pseudo code above is as useful as using the hashing algorithm, they both provide you with a unique id and that's all we need for the topology.
The original code was using the hash shifts the wrong way, which could lead to incorrect behaviour.
If ignoring affinity levels is silly, then we have to fix ARM 32 bit code too, since there on ALL SMP ARM systems in the mainline, affinity level 2 *is* ignored (since they are not SMT systems).
Having said that, I really think that for the time being we should use three affinity levels (for topology purposes) as ARM 32 does and be done with this stuff.
To be 100% precise, I think the best way to do it is to add another topology level (ie "book" in the kernel or whatever you want to call it for ARM, which I think is unused) and update the parsing code and data structure accordingly so that if two clusters have the same id but belong to different "books" (aff2 or aff3 - depending on SMT) the sibling masks are still correct.
Documentation/cputopology.txt
We will cross that bridge when we come to it instead of lumping affinity levels together, that's my opinion.
Also please remove the warning on the missing topology information, if we fall back to MPIDR_EL1 we will always have a topology of sorts, as borked as it might be, so that warning becomes useless, ie it is never triggered.
I'll add something to this patch - the warning is needed if the DT code gets merged without this and it seems this one is still going round the loop. :/
Yes but *this* patch is bumping the log level not the other ones and what I am saying is that when this code patch gets merged that warning is useless (ie never triggered - cluster id can't be == -1) unless I am missing something here.
Do not bother, use three affinity levels and be done with that, we will deal with aff3 (and aff2) when time comes.
Thanks, Lorenzo
On Mon, May 19, 2014 at 03:13:24PM +0100, Lorenzo Pieralisi wrote:
Using the pseudo code above is as useful as using the hashing algorithm, they both provide you with a unique id and that's all we need for the topology.
The original code was using the hash shifts the wrong way, which could lead to incorrect behaviour.
Ah, OK. I misremembered.
If ignoring affinity levels is silly, then we have to fix ARM 32 bit code too, since there on ALL SMP ARM systems in the mainline, affinity level 2 *is* ignored (since they are not SMT systems).
I think someone should, yes, though I'd be a bit surprised if anyone ever actually makes 32 bit ARM hardware where it makes any difference.
Having said that, I really think that for the time being we should use three affinity levels (for topology purposes) as ARM 32 does and be done with this stuff.
To be 100% precise, I think the best way to do it is to add another topology level (ie "book" in the kernel or whatever you want to call it for ARM, which I think is unused) and update the parsing code and data structure accordingly so that if two clusters have the same id but belong to different "books" (aff2 or aff3 - depending on SMT) the sibling masks are still correct.
Documentation/cputopology.txt
We will cross that bridge when we come to it instead of lumping affinity levels together, that's my opinion.
Right, that's about what I'm saying - I don't think we should actually do anything to describe a multi-level topology to the scheduler since it's not clear to me if the physical realities of such systems will map on to what the scheduler thinks it's modelling well, or for that matter if the scheduler won't have a better model or be capable of automatically tuning this stuff without explicit information from the architecture by the time anyone gets around to making such hardware.
The only reason for paying attention at all at present is to ensure that we don't do something actively broken and squash clusters together should we encounter such a system - we do the same thing as the DT code: we just ignore the hierarchy and report everything as a flat topology.
I'll add something to this patch - the warning is needed if the DT code gets merged without this and it seems this one still going round the loop. :/
Yes but *this* patch is bumping the log level not the other ones and what I am saying is that when this code patch gets merged that warning is useless (ie never triggered - cluster id can't be == -1) unless I am missing something here.
Right, if we have the MPIDR support then it should be removed. It's useful if we get the DT without MPIDR but not otherwise - once we can parse MPIDRs for most SoCs there should be very little reason to ever bother with the DT binding.
Do not bother, use three affinity levels and be done with that, we will deal with aff3 (and aff2) when time comes.
This would leave the MPIDR support worse than DT which seems retrograde especially given that it's so simple to differentiate the clusters. The only issue with that appears to be about precisely how to make up the cluster numbers which is a cosmetic one.
On Mon, May 19, 2014 at 05:12:17PM +0100, Mark Brown wrote:
[...]
Do not bother, use three affinity levels and be done with that, we will deal with aff3 (and aff2) when time comes.
This would leave the MPIDR support worse than DT which seems retrograde especially given that it's so simple to differentiate the clusters. The only issue with that appears to be about precisely how to make up the cluster numbers which is a cosmetic one.
That's the reason why I said you should not bother. If you want to use 4 affinity levels, I do not see the point in using the hash for that though, since all we need is a unique id which can be easily created without resorting to hashing.
Hashing compresses the cluster index, but that index is not representative of HW anyway. If you go for simple shifting we might end up with huge cluster ids, which is fine but a bit weird.
So either (1) you use three affinity levels or (2) the simplest way to combine the affinity levels.
I personally do not like squashing affinity levels, but I can live with that as long as we are done with this, functionality is ready and it is time we get that in.
Thanks, Lorenzo
On Mon, May 19, 2014 at 06:15:11PM +0100, Lorenzo Pieralisi wrote:
On Mon, May 19, 2014 at 05:12:17PM +0100, Mark Brown wrote:
This would leave the MPIDR support worse than DT which seems retrograde especially given that it's so simple to differentiate the clusters. The only issue with that appears to be about precisely how to make up the cluster numbers which is a cosmetic one.
That's the reason why I said you should not bother. If you want to use 4 affinity levels, I do not see the point in using the hash for that though, since all we need is a unique id which can be easily created without resorting to hashing.
Right, OK. I personally wouldn't have used anything non-trivial either but equally well I didn't see any strong reason not to do it either and it's what Zi Shen sent. I suspect this is based on the review comments the first time around.
On Mon, May 19, 2014 at 06:15:11PM +0100, Lorenzo Pieralisi wrote: [...]
Hashing compresses the cluster index, but that index is not representative of HW anyway. If you go for simple shifting we might end up with huge cluster ids, which is fine but a bit weird.
So either (1) you use three affinity levels or (2) the simplest way to combine the affinity levels.
Sorry for jumping in late. The original patch packs the cluster_id, in hope of providing linear mapping when the affinity tree is balanced. I'm fine with the simplest way of shifting/oring, if that's the preferred method :)
The patch below applies on top of the series Mark sent out. 1. Dropped mpidr_hash-related bits, in favor of simpler shift/or using MPIDR. 2. Also addressed Lorenzo's comment about redundant cluster_id==-1 check.
Mark, can you please apply/squash this patch?
Thanks, z
--- diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index b3b3287..7639e8b 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -32,9 +32,6 @@ #define MPIDR_AFFINITY_LEVEL(mpidr, level) \ ((mpidr >> MPIDR_LEVEL_SHIFT(level)) & MPIDR_LEVEL_MASK)
-#define MPIDR_AFF_MASK(level) \ - ((u64)MPIDR_LEVEL_MASK << MPIDR_LEVEL_SHIFT(level)) - #define read_cpuid(reg) ({ \ u64 __val; \ asm("mrs %0, " #reg : "=r" (__val)); \ diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index f7f3478..26fc5b0 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -22,7 +22,6 @@ #include <linux/slab.h>
#include <asm/cputype.h> -#include <asm/smp_plat.h> #include <asm/topology.h>
/* @@ -364,12 +363,6 @@ static void update_siblings_masks(unsigned int cpuid) struct cpu_topology *cpu_topo, *cpuid_topo = &cpu_topology[cpuid]; int cpu;
- if (cpuid_topo->cluster_id == -1) { - /* No topology information for this cpu ?! */ - pr_err("CPU%u: No topology information configured\n", cpuid); - return; - } - /* update core and thread sibling masks */ for_each_possible_cpu(cpu) { cpu_topo = &cpu_topology[cpu]; @@ -410,19 +403,15 @@ void store_cpu_topology(unsigned int cpuid) /* Multiprocessor system : Multi-threads per core */ cpuid_topo->thread_id = MPIDR_AFFINITY_LEVEL(mpidr, 0); cpuid_topo->core_id = MPIDR_AFFINITY_LEVEL(mpidr, 1); - cpuid_topo->cluster_id = - ((mpidr & MPIDR_AFF_MASK(2)) >> mpidr_hash.shift_aff[2] | - (mpidr & MPIDR_AFF_MASK(3)) >> mpidr_hash.shift_aff[3]) - >> mpidr_hash.shift_aff[1] >> mpidr_hash.shift_aff[0]; + cpuid_topo->cluster_id = MPIDR_AFFINITY_LEVEL(mpidr, 2) | + MPIDR_AFFINITY_LEVEL(mpidr, 3) << 8; } else { /* Multiprocessor system : Single-thread per core */ cpuid_topo->thread_id = -1; cpuid_topo->core_id = MPIDR_AFFINITY_LEVEL(mpidr, 0); - cpuid_topo->cluster_id = - ((mpidr & MPIDR_AFF_MASK(1)) >> mpidr_hash.shift_aff[1] | - (mpidr & MPIDR_AFF_MASK(2)) >> mpidr_hash.shift_aff[2] | - (mpidr & MPIDR_AFF_MASK(3)) >> mpidr_hash.shift_aff[3]) - >> mpidr_hash.shift_aff[0]; + cpuid_topo->cluster_id = MPIDR_AFFINITY_LEVEL(mpidr, 1) | + MPIDR_AFFINITY_LEVEL(mpidr, 2) << 8 | + MPIDR_AFFINITY_LEVEL(mpidr, 3) << 16; }
pr_debug("CPU%u: cluster %d core %d thread %d mpidr %llx\n",
On Mon, May 19, 2014 at 10:57:40AM +0100, Sudeep Holla wrote:
On 16/05/14 19:39, Mark Brown wrote:
On Fri, May 16, 2014 at 05:34:04PM +0100, Sudeep Holla wrote:
Take a simple example of system with 2 Quad core clusters. The mpidr_hash.shift_aff[1] will be 6 as you need minimum 2 bits to represent aff[0]. So you will end up with second cluster with id = 4 instead of 1.
This isn't a problem, the clusters can have any numbers so long as they are distinct. There is no other requirement.
IIUC this topology information is exposed via sysfs. So it's good to have uniformity even though they can have any number. As mentioned in the example, if the linearisation depends on aff[0], then this factor will not be uniform.
So your concern is that the width of aff[0] will vary? That's reasonable.
From: Mark Brown broonie@linaro.org
In heterogeneous systems like big.LITTLE systems the scheduler will be able to make better use of the available cores if we provide power numbers to it indicating their relative performance. Do this by parsing the CPU nodes in the DT.
This code currently has no effect as no information on the relative performance of the cores is provided.
Signed-off-by: Mark Brown broonie@linaro.org --- arch/arm64/kernel/topology.c | 153 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 153 insertions(+)
diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 3b2479c..ed0b443 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -19,11 +19,35 @@ #include <linux/nodemask.h> #include <linux/of.h> #include <linux/sched.h> +#include <linux/slab.h>
#include <asm/cputype.h> #include <asm/smp_plat.h> #include <asm/topology.h>
+/* + * cpu power table + * This per cpu data structure describes the relative capacity of each core. + * On a heteregenous system, cores don't have the same computation capacity + * and we reflect that difference in the cpu_power field so the scheduler can + * take this difference into account during load balance. A per cpu structure + * is preferred because each CPU updates its own cpu_power field during the + * load balance except for idle cores. One idle core is selected to run the + * rebalance_domains for all idle cores and the cpu_power can be updated + * during this sequence. + */ +static DEFINE_PER_CPU(unsigned long, cpu_scale); + +unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu) +{ + return per_cpu(cpu_scale, cpu); +} + +static void set_power_scale(unsigned int cpu, unsigned long power) +{ + per_cpu(cpu_scale, cpu) = power; +} + static int __init get_cpu_for_node(struct device_node *node) { struct device_node *cpu_node; @@ -162,6 +186,38 @@ static int __init parse_cluster(struct device_node *cluster, int depth) return 0; }
+struct cpu_efficiency { + const char *compatible; + unsigned long efficiency; +}; + +/* + * Table of relative efficiency of each processors + * The efficiency value must fit in 20bit and the final + * cpu_scale value must be in the range + * 0 < cpu_scale < 3*SCHED_POWER_SCALE/2 + * in order to return at most 1 when DIV_ROUND_CLOSEST + * is used to compute the capacity of a CPU. + * Processors that are not defined in the table, + * use the default SCHED_POWER_SCALE value for cpu_scale. + */ +static const struct cpu_efficiency table_efficiency[] = { + { NULL, }, +}; + +static unsigned long *__cpu_capacity; +#define cpu_capacity(cpu) __cpu_capacity[cpu] + +static unsigned long middle_capacity = 1; + +/* + * Iterate all CPUs' descriptor in DT and compute the efficiency + * (as per table_efficiency). Also calculate a middle efficiency + * as close as possible to (max{eff_i} - min{eff_i}) / 2 + * This is later used to scale the cpu_power field such that an + * 'average' CPU is of middle power. Also see the comments near + * table_efficiency[] and update_cpu_power(). + */ static int __init parse_dt_topology(void) { struct device_node *cn, *map; @@ -205,6 +261,91 @@ out: return ret; }
+static void __init parse_dt_cpu_power(void) +{ + const struct cpu_efficiency *cpu_eff; + struct device_node *cn; + unsigned long min_capacity = ULONG_MAX; + unsigned long max_capacity = 0; + unsigned long capacity = 0; + int cpu; + + __cpu_capacity = kcalloc(nr_cpu_ids, sizeof(*__cpu_capacity), + GFP_NOWAIT); + + for_each_possible_cpu(cpu) { + const u32 *rate; + int len; + + /* Too early to use cpu->of_node */ + cn = of_get_cpu_node(cpu, NULL); + if (!cn) { + pr_err("Missing device node for CPU %d\n", cpu); + continue; + } + + for (cpu_eff = table_efficiency; cpu_eff->compatible; cpu_eff++) + if (of_device_is_compatible(cn, cpu_eff->compatible)) + break; + + if (cpu_eff->compatible == NULL) { + pr_warn("%s: Unknown CPU type\n", cn->full_name); + continue; + } + + rate = of_get_property(cn, "clock-frequency", &len); + if (!rate || len != 4) { + pr_err("%s: Missing clock-frequency property\n", + cn->full_name); + continue; + } + + capacity = ((be32_to_cpup(rate)) >> 20) * cpu_eff->efficiency; + + /* Save min capacity of the system */ + if (capacity < min_capacity) + min_capacity = capacity; + + /* Save max capacity of the system */ + if (capacity > max_capacity) + max_capacity = capacity; + + cpu_capacity(cpu) = capacity; + } + + /* If min and max capacities are equal we bypass the update of the + * cpu_scale because all CPUs have the same capacity. Otherwise, we + * compute a middle_capacity factor that will ensure that the capacity + * of an 'average' CPU of the system will be as close as possible to + * SCHED_POWER_SCALE, which is the default value, but with the + * constraint explained near table_efficiency[]. 
+ */ + if (min_capacity == max_capacity) + return; + else if (4 * max_capacity < (3 * (max_capacity + min_capacity))) + middle_capacity = (min_capacity + max_capacity) + >> (SCHED_POWER_SHIFT+1); + else + middle_capacity = ((max_capacity / 3) + >> (SCHED_POWER_SHIFT-1)) + 1; +} + +/* + * Look for a customed capacity of a CPU in the cpu_topo_data table during the + * boot. The update of all CPUs is in O(n^2) for heteregeneous system but the + * function returns directly for SMP system. + */ +static void update_cpu_power(unsigned int cpu) +{ + if (!cpu_capacity(cpu)) + return; + + set_power_scale(cpu, cpu_capacity(cpu) / middle_capacity); + + pr_info("CPU%u: update cpu_power %lu\n", + cpu, arch_scale_freq_power(NULL, cpu)); +} + /* * cpu topology table */ @@ -288,6 +429,7 @@ void store_cpu_topology(unsigned int cpuid)
topology_populated: update_siblings_masks(cpuid); + update_cpu_power(cpuid); }
static void __init reset_cpu_topology(void) @@ -308,6 +450,14 @@ static void __init reset_cpu_topology(void) } }
+static void __init reset_cpu_power(void) +{ + unsigned int cpu; + + for_each_possible_cpu(cpu) + set_power_scale(cpu, SCHED_POWER_SCALE); +} + void __init init_cpu_topology(void) { reset_cpu_topology(); @@ -318,4 +468,7 @@ void __init init_cpu_topology(void) */ if (parse_dt_topology()) reset_cpu_topology(); + + reset_cpu_power(); + parse_dt_cpu_power(); }
From: Mark Brown broonie@linaro.org
Provide performance numbers to the scheduler to help it fill the cores in the system on big.LITTLE systems. With the current scheduler this may perform poorly for applications that try to do OpenMP style work over all cores but should help for more common workloads. The current 32 bit ARM implementation provides a similar estimate so this helps ensure that work to improve big.LITTLE systems on ARMv7 systems performs similarly on ARMv8 systems.
The power numbers are the same as for ARMv7 since it seems that the expected differential between the big and little cores is very similar on both ARMv7 and ARMv8. In both ARMv7 and ARMv8 cases the numbers were based on the published DMIPS numbers.
These numbers are just an initial and basic approximation for use with the current scheduler, it is likely that both experience with silicon and ongoing work on improving the scheduler will lead to further tuning or will tune automatically at runtime and so make the specific choice of numbers here less critical.
Signed-off-by: Mark Brown broonie@linaro.org --- arch/arm64/kernel/topology.c | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index ed0b443..f7f3478 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -202,6 +202,8 @@ struct cpu_efficiency { * use the default SCHED_POWER_SCALE value for cpu_scale. */ static const struct cpu_efficiency table_efficiency[] = { + { "arm,cortex-a57", 3891 }, + { "arm,cortex-a53", 2048 }, { NULL, }, };
linaro-kernel@lists.linaro.org