From: Mark Brown broonie@linaro.org
Add support for parsing the explicit topology bindings to discover the topology of the system.
Since it is not currently clear how to map multi-level clusters for the scheduler all leaf clusters are presented to the scheduler at the same level. This should be enough to provide good support for current systems.
Signed-off-by: Mark Brown broonie@linaro.org ---
Stylistic updates requested by Lorenzo and a fix for a missing error check in DT parsing.
arch/arm64/kernel/topology.c | 172 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 166 insertions(+), 6 deletions(-)
diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 3e06b0be4ec8..548f04572e26 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -17,10 +17,163 @@ #include <linux/percpu.h> #include <linux/node.h> #include <linux/nodemask.h> +#include <linux/of.h> #include <linux/sched.h>
#include <asm/topology.h>
+#ifdef CONFIG_OF +static int __init get_cpu_for_node(struct device_node *node) +{ + struct device_node *cpu_node; + int cpu; + + cpu_node = of_parse_phandle(node, "cpu", 0); + if (!cpu_node) + return -1; + + for_each_possible_cpu(cpu) { + if (of_get_cpu_node(cpu, NULL) == cpu_node) + return cpu; + } + + pr_crit("Unable to find CPU node for %s\n", cpu_node->full_name); + return -1; +} + +static int __init parse_core(struct device_node *core, int cluster_id, + int core_id) +{ + char name[10]; + bool leaf = true; + int i = 0; + int cpu; + struct device_node *t; + + do { + snprintf(name, sizeof(name), "thread%d", i); + t = of_get_child_by_name(core, name); + if (t) { + leaf = false; + cpu = get_cpu_for_node(t); + if (cpu >= 0) { + cpu_topology[cpu].cluster_id = cluster_id; + cpu_topology[cpu].core_id = core_id; + cpu_topology[cpu].thread_id = i; + } else { + pr_err("%s: Can't get CPU for thread\n", + t->full_name); + return -EINVAL; + } + } + i++; + } while (t); + + cpu = get_cpu_for_node(core); + if (cpu >= 0) { + if (!leaf) { + pr_err("%s: Core has both threads and CPU\n", + core->full_name); + return -EINVAL; + } + + cpu_topology[cpu].cluster_id = cluster_id; + cpu_topology[cpu].core_id = core_id; + } else if (leaf) { + pr_err("%s: Can't get CPU for leaf core\n", core->full_name); + return -EINVAL; + } + + return 0; +} + +static int __init parse_cluster(struct device_node *cluster, int depth) +{ + char name[10]; + bool leaf = true; + bool has_cores = false; + struct device_node *c; + static int __initdata cluster_id; + int core_id = 0; + int i, ret; + + /* + * First check for child clusters; we currently ignore any + * information about the nesting of clusters and present the + * scheduler with a flat list of them. + */ + i = 0; + do { + snprintf(name, sizeof(name), "cluster%d", i); + c = of_get_child_by_name(cluster, name); + if (c) { + ret = parse_cluster(c, depth + 1); + if (ret != 0) + return ret; + leaf = false; + } + i++; + } while (c); + + /* Now check for cores */ + i = 0; + do { + snprintf(name, sizeof(name), "core%d", i); + c = of_get_child_by_name(cluster, name); + if (c) { + has_cores = true; + + if (depth == 0) + pr_err("%s: cpu-map children should be clusters\n", + c->full_name); + + if (leaf) { + ret = parse_core(c, cluster_id, core_id++); + if (ret != 0) { + return ret; + } + } else { + pr_err("%s: Non-leaf cluster with core %s\n", + cluster->full_name, name); + return -EINVAL; + } + } + i++; + } while (c); + + if (leaf && !has_cores) + pr_warn("%s: empty cluster\n", cluster->full_name); + + if (leaf) + cluster_id++; + + return 0; +} + +static int __init parse_dt_topology(void) +{ + struct device_node *cn; + + cn = of_find_node_by_path("/cpus"); + if (!cn) { + pr_err("No CPU information found in DT\n"); + return 0; + } + + /* + * When topology is provided cpu-map is essentially a root + * cluster with restricted subnodes. + */ + cn = of_get_child_by_name(cn, "cpu-map"); + if (!cn) + return 0; + return parse_cluster(cn, 0); +} + +#else +static inline int parse_dt_topology(void) { return 0; } +#endif + /* * cpu topology table */ @@ -74,15 +227,10 @@ void store_cpu_topology(unsigned int cpuid) update_siblings_masks(cpuid); }
-/* - * init_cpu_topology is called at boot when only one cpu is running - * which prevent simultaneous write access to cpu_topology array - */ -void __init init_cpu_topology(void) +static void __init reset_cpu_topology(void) { unsigned int cpu;
- /* init core mask and power*/ for_each_possible_cpu(cpu) { struct cpu_topology *cpu_topo = &cpu_topology[cpu];
@@ -93,3 +241,15 @@ void __init init_cpu_topology(void) cpumask_clear(&cpu_topo->thread_sibling); } } + +void __init init_cpu_topology(void) +{ + reset_cpu_topology(); + + /* + * Discard anything that was parsed if we hit an error so we + * don't use partial information. + */ + if (parse_dt_topology()) + reset_cpu_topology(); +}
From: Mark Brown broonie@linaro.org
In heterogeneous systems like big.LITTLE systems the scheduler will be able to make better use of the available cores if we provide power numbers to it indicating their relative performance. Do this by parsing the CPU nodes in the DT.
This code currently has no effect as no information on the relative performance of the cores is provided.
Signed-off-by: Mark Brown broonie@linaro.org --- arch/arm64/kernel/topology.c | 146 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 145 insertions(+), 1 deletion(-)
diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 548f04572e26..6713c7de4be3 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -19,9 +19,33 @@ #include <linux/nodemask.h> #include <linux/of.h> #include <linux/sched.h> +#include <linux/slab.h>
#include <asm/topology.h>
+/* + * cpu power table + * This per cpu data structure describes the relative capacity of each core. + * On a heteregenous system, cores don't have the same computation capacity + * and we reflect that difference in the cpu_power field so the scheduler can + * take this difference into account during load balance. A per cpu structure + * is preferred because each CPU updates its own cpu_power field during the + * load balance except for idle cores. One idle core is selected to run the + * rebalance_domains for all idle cores and the cpu_power can be updated + * during this sequence. + */ +static DEFINE_PER_CPU(unsigned long, cpu_scale); + +unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu) +{ + return per_cpu(cpu_scale, cpu); +} + +static void set_power_scale(unsigned int cpu, unsigned long power) +{ + per_cpu(cpu_scale, cpu) = power; +} + #ifdef CONFIG_OF static int __init get_cpu_for_node(struct device_node *node) { @@ -150,9 +174,49 @@ static int __init parse_cluster(struct device_node *cluster, int depth) return 0; }
+struct cpu_efficiency { + const char *compatible; + unsigned long efficiency; +}; + +/* + * Table of relative efficiency of each processors + * The efficiency value must fit in 20bit and the final + * cpu_scale value must be in the range + * 0 < cpu_scale < 3*SCHED_POWER_SCALE/2 + * in order to return at most 1 when DIV_ROUND_CLOSEST + * is used to compute the capacity of a CPU. + * Processors that are not defined in the table, + * use the default SCHED_POWER_SCALE value for cpu_scale. + */ +static const struct cpu_efficiency table_efficiency[] = { + { NULL, }, +}; + +static unsigned long *__cpu_capacity; +#define cpu_capacity(cpu) __cpu_capacity[cpu] + +static unsigned long middle_capacity = 1; + +/* + * Iterate all CPUs' descriptor in DT and compute the efficiency + * (as per table_efficiency). Also calculate a middle efficiency + * as close as possible to (max{eff_i} - min{eff_i}) / 2 + * This is later used to scale the cpu_power field such that an + * 'average' CPU is of middle power. Also see the comments near + * table_efficiency[] and update_cpu_power(). + */ static int __init parse_dt_topology(void) { + const struct cpu_efficiency *cpu_eff; struct device_node *cn; + unsigned long min_capacity = ULONG_MAX; + unsigned long max_capacity = 0; + unsigned long capacity = 0; + int cpu, ret; + + __cpu_capacity = kcalloc(nr_cpu_ids, sizeof(*__cpu_capacity), + GFP_NOWAIT);
cn = of_find_node_by_path("/cpus"); if (!cn) { @@ -167,11 +231,88 @@ static int __init parse_dt_topology(void) cn = of_get_child_by_name(cn, "cpu-map"); if (!cn) return 0; - return parse_cluster(cn, 0); + + ret = parse_cluster(cn, 0); + if (ret != 0) + return ret; + + for_each_possible_cpu(cpu) { + const u32 *rate; + int len; + + /* Too early to use cpu->of_node */ + cn = of_get_cpu_node(cpu, NULL); + if (!cn) { + pr_err("Missing device node for CPU %d\n", cpu); + continue; + } + + for (cpu_eff = table_efficiency; cpu_eff->compatible; cpu_eff++) + if (of_device_is_compatible(cn, cpu_eff->compatible)) + break; + + if (cpu_eff->compatible == NULL) { + pr_warn("%s: Unknown CPU type\n", cn->full_name); + continue; + } + + rate = of_get_property(cn, "clock-frequency", &len); + if (!rate || len != 4) { + pr_err("%s: Missing clock-frequency property\n", + cn->full_name); + continue; + } + + capacity = ((be32_to_cpup(rate)) >> 20) * cpu_eff->efficiency; + + /* Save min capacity of the system */ + if (capacity < min_capacity) + min_capacity = capacity; + + /* Save max capacity of the system */ + if (capacity > max_capacity) + max_capacity = capacity; + + cpu_capacity(cpu) = capacity; + } + + /* If min and max capacities are equal we bypass the update of the + * cpu_scale because all CPUs have the same capacity. Otherwise, we + * compute a middle_capacity factor that will ensure that the capacity + * of an 'average' CPU of the system will be as close as possible to + * SCHED_POWER_SCALE, which is the default value, but with the + * constraint explained near table_efficiency[]. + */ + if (min_capacity == max_capacity) + return 0; + else if (4 * max_capacity < (3 * (max_capacity + min_capacity))) + middle_capacity = (min_capacity + max_capacity) + >> (SCHED_POWER_SHIFT+1); + else + middle_capacity = ((max_capacity / 3) + >> (SCHED_POWER_SHIFT-1)) + 1; + +} + +/* + * Look for a customed capacity of a CPU in the cpu_topo_data table during the + * boot. The update of all CPUs is in O(n^2) for heteregeneous system but the + * function returns directly for SMP system. + */ +static void update_cpu_power(unsigned int cpu) +{ + if (!cpu_capacity(cpu)) + return; + + set_power_scale(cpu, cpu_capacity(cpu) / middle_capacity); + + pr_info("CPU%u: update cpu_power %lu\n", + cpu, arch_scale_freq_power(NULL, cpu)); }
#else static inline int parse_dt_topology(void) { return 0; } +static inline void update_cpu_power(unsigned int cpuid) {} #endif
/* @@ -225,6 +366,7 @@ static void update_siblings_masks(unsigned int cpuid) void store_cpu_topology(unsigned int cpuid) { update_siblings_masks(cpuid); + update_cpu_power(cpuid); }
static void __init reset_cpu_topology(void) @@ -239,6 +381,8 @@ static void __init reset_cpu_topology(void) cpu_topo->cluster_id = -1; cpumask_clear(&cpu_topo->core_sibling); cpumask_clear(&cpu_topo->thread_sibling); + + set_power_scale(cpu, SCHED_POWER_SCALE); } }
From: Mark Brown broonie@linaro.org
Provide performance numbers to the scheduler to help it fill the cores in the system on big.LITTLE systems. With the current scheduler this may perform poorly for applications that try to do OpenMP style work over all cores but should help for more common workloads. The current 32 bit ARM implementation provides a similar estimate so this helps ensure that work to improve big.LITTLE systems on ARMv7 systems performs similarly on ARMv8 systems.
The power numbers are the same as for ARMv7 since it seems that the expected differential between the big and little cores is very similar on both ARMv7 and ARMv8. In both ARMv7 and ARMv8 cases the numbers were based on the published DMIPS numbers.
These numbers are just an initial and basic approximation for use with the current scheduler, it is likely that both experience with silicon and ongoing work on improving the scheduler will lead to further tuning or will tune automatically at runtime and so make the specific choice of numbers here less critical.
Signed-off-by: Mark Brown broonie@linaro.org --- arch/arm64/kernel/topology.c | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 6713c7de4be3..01ab52be2764 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -190,6 +190,8 @@ struct cpu_efficiency { * use the default SCHED_POWER_SCALE value for cpu_scale. */ static const struct cpu_efficiency table_efficiency[] = { + { "arm,cortex-a57", 3891 }, + { "arm,cortex-a53", 2048 }, { NULL, }, };
On Wed, Mar 19, 2014 at 06:02:17PM +0000, Mark Brown wrote:
+#ifdef CONFIG_OF
This ifdef can be removed, CONFIG_OF is always selected for arm64 and the !CONFIG_OF path
+#else +static inline int parse_dt_topology(void) { return 0; } +#endif
is wrong, it should return failure. You should remove the CONFIG_OF ifdeffery.
+static int __init get_cpu_for_node(struct device_node *node) +{
- struct device_node *cpu_node;
- int cpu;
- cpu_node = of_parse_phandle(node, "cpu", 0);
- if (!cpu_node)
return -1;
- for_each_possible_cpu(cpu) {
if (of_get_cpu_node(cpu, NULL) == cpu_node)
return cpu;
- }
- pr_crit("Unable to find CPU node for %s\n", cpu_node->full_name);
- return -1;
+}
+static int __init parse_core(struct device_node *core, int cluster_id,
int core_id)
+{
- char name[10];
- bool leaf = true;
- int i = 0;
- int cpu;
- struct device_node *t;
- do {
snprintf(name, sizeof(name), "thread%d", i);
t = of_get_child_by_name(core, name);
if (t) {
leaf = false;
cpu = get_cpu_for_node(t);
if (cpu >= 0) {
cpu_topology[cpu].cluster_id = cluster_id;
cpu_topology[cpu].core_id = core_id;
cpu_topology[cpu].thread_id = i;
} else {
pr_err("%s: Can't get CPU for thread\n",
t->full_name);
return -EINVAL;
}
}
i++;
- } while (t);
- cpu = get_cpu_for_node(core);
- if (cpu >= 0) {
if (!leaf) {
pr_err("%s: Core has both threads and CPU\n",
core->full_name);
return -EINVAL;
}
cpu_topology[cpu].cluster_id = cluster_id;
cpu_topology[cpu].core_id = core_id;
- } else if (leaf) {
pr_err("%s: Can't get CPU for leaf core\n", core->full_name);
return -EINVAL;
- }
- return 0;
+}
+static int __init parse_cluster(struct device_node *cluster, int depth) +{
- char name[10];
- bool leaf = true;
- bool has_cores = false;
- struct device_node *c;
- static int __initdata cluster_id;
WARNING: __initdata should be placed after cluster_id #103: FILE: arch/arm64/kernel/topology.c:96: + static int __initdata cluster_id;
- int core_id = 0;
- int i, ret;
- /*
* First check for child clusters; we currently ignore any
* information about the nesting of clusters and present the
* scheduler with a flat list of them.
*/
- i = 0;
- do {
snprintf(name, sizeof(name), "cluster%d", i);
c = of_get_child_by_name(cluster, name);
if (c) {
ret = parse_cluster(c, depth + 1);
if (ret != 0)
return ret;
leaf = false;
}
i++;
- } while (c);
- /* Now check for cores */
- i = 0;
- do {
snprintf(name, sizeof(name), "core%d", i);
c = of_get_child_by_name(cluster, name);
if (c) {
has_cores = true;
if (depth == 0)
pr_err("%s: cpu-map children should be clusters\n",
c->full_name);
if (leaf) {
ret = parse_core(c, cluster_id, core_id++);
if (ret != 0) {
return ret;
}
WARNING: braces {} are not necessary for single statement blocks #139: FILE: arch/arm64/kernel/topology.c:132: + if (ret != 0) { + return ret; + }
} else {
pr_err("%s: Non-leaf cluster with core %s\n",
cluster->full_name, name);
return -EINVAL;
}
}
i++;
- } while (c);
- if (leaf && !has_cores)
pr_warn("%s: empty cluster\n", cluster->full_name);
- if (leaf)
cluster_id++;
- return 0;
+}
+static int __init parse_dt_topology(void) +{
- struct device_node *cn;
- cn = of_find_node_by_path("/cpus");
- if (!cn) {
pr_err("No CPU information found in DT\n");
return 0;
- }
- /*
* When topology is provided cpu-map is essentially a root
* cluster with restricted subnodes.
*/
- cn = of_get_child_by_name(cn, "cpu-map");
- if (!cn)
return 0;
- return parse_cluster(cn, 0);
We still have a problem here. If the topology does not contain bindings for some cpu nodes, parse_cluster() does not fail and we end up with an incomplete topology. We have two choices: either we check the topology info for all possible cpus here and reset if there is missing information or we do the lazy version and reset the topology (for all possible cpus) in update_siblings_masks(). I'd rather do it here, in preparation for MPIDR_EL1 fallback solution (where there will always be topology information configured and the register will always be there in all its glory).
This also means that update_sibling_masks() should just pr_debug on missing information since by the time a cpu get there either the topology has been parsed correctly or it has been reset for all CPUs, no need to reset it again.
Lorenzo
On Thu, Mar 20, 2014 at 11:26:50AM +0000, Lorenzo Pieralisi wrote:
On Wed, Mar 19, 2014 at 06:02:17PM +0000, Mark Brown wrote:
+#ifdef CONFIG_OF
This ifdef can be removed, CONFIG_OF is always selected for arm64 and the !CONFIG_OF path
This has been present since the very first time these patches were posted but hasn't been mentioned as being a problem previously.
+#else +static inline int parse_dt_topology(void) { return 0; } +#endif
is wrong, it should return failure. You should remove the CONFIG_OF ifdeffery.
Yup. It actually won't affect the behaviour at present though - since it won't do anything the result will be just the same as if we return an error and reset.
Given ACPI (which really looks like it's going to happen at some point and presumably make OF optional) I'm not sure removing the handling of OF is actually constructive but whatever, it's done now...
if (leaf) {
ret = parse_core(c, cluster_id, core_id++);
if (ret != 0) {
return ret;
}
WARNING: braces {} are not necessary for single statement blocks #139: FILE: arch/arm64/kernel/topology.c:132:
Like I say I don't think checkpatch is being helpful on this one, the code looks worse. Again, whatever.
We still have a problem here. If the topology does not contain bindings for some cpu nodes, parse_cluster() does not fail and we end up with an incomplete topology. We have two choices: either we check the topology
Hrm, looking at the topology binding it doesn't specificially require that the topology be complete. I can see why you would want that.
I'd rather do it here, in preparation for MPIDR_EL1 fallback solution (where there will always be topology information configured and the register will always be there in all its glory).
To be honest at this point I think what I want to do is go back to the original approach of layering DT on top of MPIDR. MPIDR is smaller and simpler code so seems more likely to make progress. I really do expect that for a very large proportion of systems it'll be sufficient.
On Thu, Mar 20, 2014 at 01:43:57PM +0000, Mark Brown wrote:
On Thu, Mar 20, 2014 at 11:26:50AM +0000, Lorenzo Pieralisi wrote:
On Wed, Mar 19, 2014 at 06:02:17PM +0000, Mark Brown wrote:
+#ifdef CONFIG_OF
This ifdef can be removed, CONFIG_OF is always selected for arm64 and the !CONFIG_OF path
This has been present since the very first time these patches were posted but hasn't been mentioned as being a problem previously.
I recall I mentioned the unneeded CONFIG_OF in the past but I can't really tell whether it was for this set of patches or not.
+#else +static inline int parse_dt_topology(void) { return 0; } +#endif
is wrong, it should return failure. You should remove the CONFIG_OF ifdeffery.
Yup. It actually won't affect the behaviour at present though - since it won't do anything the result will be just the same as if we return an error and reset.
Given ACPI (which really looks like it's going to happen at some point and presumably make OF optional) I'm not sure removing the handling of OF is actually constructive but whatever, it's done now...
CONFIG_OF will always be enabled in the kernel even when we get ACPI. We still use the chosen DT node to tell the kernel about ACPI.
We still have a problem here. If the topology does not contain bindings for some cpu nodes, parse_cluster() does not fail and we end up with an incomplete topology. We have two choices: either we check the topology
Hrm, looking at the topology binding it doesn't specificially require that the topology be complete. I can see why you would want that.
I'd rather do it here, in preparation for MPIDR_EL1 fallback solution (where there will always be topology information configured and the register will always be there in all its glory).
To be honest at this point I think what I want to do is go back to the original approach of layering DT on top of MPIDR. MPIDR is smaller and simpler code so seems more likely to make progress. I really do expect that for a very large proportion of systems it'll be sufficient.
Do you mean the physical MPIDR_EL1 or the DT representation of MPIDR_EL1?
On Thu, Mar 20, 2014 at 05:19:32PM +0000, Catalin Marinas wrote:
On Thu, Mar 20, 2014 at 01:43:57PM +0000, Mark Brown wrote:
To be honest at this point I think what I want to do is go back to the original approach of layering DT on top of MPIDR. MPIDR is smaller and simpler code so seems more likely to make progress. I really do expect that for a very large proportion of systems it'll be sufficient.
Do you mean the physical MPIDR_EL1 or the DT representation of MPIDR_EL1?
Well, the affinities need to be the same anyway (so we can tie the hardware to the description in DT) though we need to use the physical register to get the MT bit since the binding requires that this be omitted from the value stored in DT. Lorenzo was keen on paying attention to the MT bit which does seem like a reasonable thing to do.
On Thu, Mar 20, 2014 at 05:52:53PM +0000, Mark Brown wrote:
On Thu, Mar 20, 2014 at 05:19:32PM +0000, Catalin Marinas wrote:
On Thu, Mar 20, 2014 at 01:43:57PM +0000, Mark Brown wrote:
To be honest at this point I think what I want to do is go back to the original approach of layering DT on top of MPIDR. MPIDR is smaller and simpler code so seems more likely to make progress. I really do expect that for a very large proportion of systems it'll be sufficient.
Do you mean the physical MPIDR_EL1 or the DT representation of MPIDR_EL1?
Well, the affinities need to be the same anyway (so we can tie the hardware to the description in DT) though we need to use the physical register to get the MT bit since the binding requires that this be omitted from the value stored in DT. Lorenzo was keen on paying attention to the MT bit which does seem like a reasonable thing to do.
OK, as long as topology in DT takes priority (in case the hardware got it wrong).
On Thu, Mar 20, 2014 at 05:19:32PM +0000, Catalin Marinas wrote:
On Thu, Mar 20, 2014 at 01:43:57PM +0000, Mark Brown wrote:
Given ACPI (which really looks like it's going to happen at some point and presumably make OF optional) I'm not sure removing the handling of OF is actually constructive but whatever, it's done now...
CONFIG_OF will always be enabled in the kernel even when we get ACPI. We still use the chosen DT node to tell the kernel about ACPI.
One thing that occurs to me with this - if we've always got a DT even if we are booting with ACPI that might confuse code that implements handling for firmware idioms. The regulator code does this sort of thing, though in that case we make exactly the same assumptions for ACPI and DT so there won't be any confusion. There's a few other examples though and we might acquire more by the time ACPI gets merged. Probably not a big deal but something that needs attention paying.
I was aware that there was a stub DT on ACPI systems but had expected that if we were booting with real ACPI support that'd get masked from the running system. I can see why at least initially we'd want to just pull in the infrastructure to use when identifying that this is an ACPI system.
On Fri, Mar 21, 2014 at 11:13:53AM +0000, Mark Brown wrote:
On Thu, Mar 20, 2014 at 05:19:32PM +0000, Catalin Marinas wrote:
On Thu, Mar 20, 2014 at 01:43:57PM +0000, Mark Brown wrote:
Given ACPI (which really looks like it's going to happen at some point and presumably make OF optional) I'm not sure removing the handling of OF is actually constructive but whatever, it's done now...
CONFIG_OF will always be enabled in the kernel even when we get ACPI. We still use the chosen DT node to tell the kernel about ACPI.
One thing that occurs to me with this - if we've always got a DT even if we are booting with ACPI that might confuse code that implements handling for firmware idioms.
The DT presented on an ACPI-capable system only contains the chosen node (I guess the DT will not even be unflattened) .So the topology code would check for DT, if not it would check for ACPI (or the other way around) and only after that fall back to hardware MPIDR. I'm not sure whether current ACPI gives us rich enough information about topology like DT, in which case it could simply use MPIDR.
I was aware that there was a stub DT on ACPI systems but had expected that if we were booting with real ACPI support that'd get masked from the running system.
That's the plan. The original point was that CONFIG_OF is going to stay even if CONFIG_ACPI is enabled. But we shouldn't mix the two, so most of the DT information will not be available to the kernel (not even CPU topology) if ACPI tables are provided.
On Fri, Mar 21, 2014 at 03:01:18PM +0000, Catalin Marinas wrote:
On Fri, Mar 21, 2014 at 11:13:53AM +0000, Mark Brown wrote:
One thing that occurs to me with this - if we've always got a DT even if we are booting with ACPI that might confuse code that implements handling for firmware idioms.
The DT presented on an ACPI-capable system only contains the chosen node (I guess the DT will not even be unflattened) .So the topology code would check for DT, if not it would check for ACPI (or the other way around) and only after that fall back to hardware MPIDR. I'm not sure whether current ACPI gives us rich enough information about topology like DT, in which case it could simply use MPIDR.
Sorry, this isn't related to topology - it's to do with other code that checks if a DT was present and makes decisions based on that. So long as of_have_populated_dt() doesn't report true things should be fine but it's something to watch out for.
On Thu, Mar 20, 2014 at 01:43:57PM +0000, Mark Brown wrote:
On Thu, Mar 20, 2014 at 11:26:50AM +0000, Lorenzo Pieralisi wrote:
On Wed, Mar 19, 2014 at 06:02:17PM +0000, Mark Brown wrote:
+#ifdef CONFIG_OF
This ifdef can be removed, CONFIG_OF is always selected for arm64 and the !CONFIG_OF path
This has been present since the very first time these patches were posted but hasn't been mentioned as being a problem previously.
I am sorry, I missed it, doing my best to help you get it through.
+#else +static inline int parse_dt_topology(void) { return 0; } +#endif
is wrong, it should return failure. You should remove the CONFIG_OF ifdeffery.
Yup. It actually won't affect the behaviour at present though - since it won't do anything the result will be just the same as if we return an error and reset.
Given ACPI (which really looks like it's going to happen at some point and presumably make OF optional) I'm not sure removing the handling of OF is actually constructive but whatever, it's done now...
DT is there to stay, regardless of ACPI. However, given the function call logic, returning 0 on !CONFIG_OF was correct since it meant "no cpu-map". Anyway, CONFIG_OF ifdef should be removed.
if (leaf) {
ret = parse_core(c, cluster_id, core_id++);
if (ret != 0) {
return ret;
}
WARNING: braces {} are not necessary for single statement blocks #139: FILE: arch/arm64/kernel/topology.c:132:
Like I say I don't think checkpatch is being helpful on this one, the code looks worse. Again, whatever.
Worse or better, it has to be consistent. Either you leave them everywhere (but there is a coding style, it is for a reason) or you remove them everywhere (there are other nested paths where it is removed in the patch). Do not take it as a nitpick please, I just want the code to be consistent.
We still have a problem here. If the topology does not contain bindings for some cpu nodes, parse_cluster() does not fail and we end up with an incomplete topology. We have two choices: either we check the topology
Hrm, looking at the topology binding it doesn't specificially require that the topology be complete. I can see why you would want that.
I'd rather do it here, in preparation for MPIDR_EL1 fallback solution (where there will always be topology information configured and the register will always be there in all its glory).
To be honest at this point I think what I want to do is go back to the original approach of layering DT on top of MPIDR. MPIDR is smaller and simpler code so seems more likely to make progress. I really do expect that for a very large proportion of systems it'll be sufficient.
DT (cpu-map) takes precedence though. Yes, instead of resetting the topology, falling back to MPIDR_EL1 is acceptable if either there are broken bindings or cpus with missing topology information.
Honestly, it is not up to the kernel to validate DT, since this adds complexity, but I think that a big fat WARN_ON on missing or broken topology information would help fix firmware at early stages.
You should fall back to HW MPIDR_EL1.
I know, it is complex, there is little we can do about that and it is code run just at cold boot and freed later so I deem that acceptable.
Thanks, Lorenzo
On Thu, Mar 20, 2014 at 06:08:36PM +0000, Lorenzo Pieralisi wrote:
This ifdef can be removed, CONFIG_OF is always selected for arm64 and the !CONFIG_OF path
This has been present since the very first time these patches were posted but hasn't been mentioned as being a problem previously.
I am sorry, I missed it, doing my best to help you get it through.
Please try to consider this from a submitter point of view; it is quite frustrating every time something that has been around for a while and fairly obvious gets identified as a new issue, it makes it hard to have confidence that addressing review comments is moving closer to getting things accepted.
Worse or better, it has to be consistent. Either you leave them everywhere (but there is a coding style, it is for a reason) or you remove them everywhere (there are other nested paths where it is removed in the patch). Do not take it as a nitpick please, I just want the code to be consistent.
Hrm, I couldn't actually find any other examples where the if is an edge statement in a block.
DT (cpu-map) takes precedence though. Yes, instead of resetting the topology, falling back to MPIDR_EL1 is acceptable if either there are broken bindings or cpus with missing topology information.
Quite; and hopefully in most cases the hardware designers will set MPIDR sensibly and so the DT could just omit the topology information entirely since it's redundant.
Honestly, it is not up to the kernel to validate DT, since this adds complexity, but I think that a big fat WARN_ON on missing or broken topology information would help fix firmware at early stages.
I dunno, we're now doing active things like insisting on at least one level of cluster node which definitely seem to push us into actively looking for problems where there's no real ambiguity about what's meant. Though we're not currently doing any WARN_ON()s...
I know, it is complex, there is little we can do about that and it is code run just at cold boot and freed later so I deem that acceptable.
It's not that anything is really complex (the requirements keep on evolving but nothing is particularly hard), it's that we could most likely have had something useful for people already.
On Fri, Mar 21, 2014 at 11:32:02AM +0000, Mark Brown wrote:
On Thu, Mar 20, 2014 at 06:08:36PM +0000, Lorenzo Pieralisi wrote:
This ifdef can be removed, CONFIG_OF is always selected for arm64 and the !CONFIG_OF path
This has been present since the very first time these patches were posted but hasn't been mentioned as being a problem previously.
I am sorry, I missed it, doing my best to help you get it through.
Please try to consider this from a submitter point of view; it is quite frustrating every time something that has been around for a while and fairly obvious gets identified as a new issue, it makes it hard to have confidence that addressing review comments is moving closer to getting things accepted.
I understand that and I apologize, but I will still flag issues up as long as I find some. For instance, I noticed patch is missing of_node_put() calls almost everywhere DT nodes are parsed and we have to fix that before merging it. I can do that to save another respin.
[...]
I know, it is complex, there is little we can do about that and it is code run just at cold boot and freed later so I deem that acceptable.
It's not that anything is really complex (the requirements keep on evolving but nothing is particularly hard), it's that we could most likely have had something useful for people already.
I agree and you are right.
In order to add all these checks this code is getting ways too complex for my taste, it is not your problem and it is not mine either, I just can't help notice though.
Patch is complete and I think it is almost ready to get merged. Instead of bothering you with another respin with minor changes I suggest I will pick the last version up, diff what's needed and repost it here for final look so that we are done with it.
Please let me know if that's plausible, thank you.
Lorenzo
On Fri, Mar 21, 2014 at 03:16:17PM +0000, Lorenzo Pieralisi wrote:
I understand that and I apologize, but I will still flag issues up as long as I find some. For instance, I noticed patch is missing
Thanks. I do agree that it's important to find and fix issues, it was more about the mechanics of how that gets done.
Instead of bothering you with another respin with minor changes I suggest I will pick the last version up, diff what's needed and repost it here for final look so that we are done with it.
Please let me know if that's plausible, thank you.
I've actually already got a mostly complete respin so I may as well send that, just doing a bit of testing though I'll wrap in the of_node_put()s as well now you mention that. Should be out this afternoon.
linaro-kernel@lists.linaro.org