This is the RFC version of ACPI-based NUMA support for ARM64. It is based on 3.18-rc4 and v5 of the ARM64 ACPI core patches; it is also based on the DT-based NUMA patch from Ganapatrao [1].
I have not tested it yet; I will do that after posting this patch set for RFC. Any comments are welcome.
[1] http://www.spinics.net/lists/arm-kernel/msg380197.html
Ganapatrao Kulkarni (1): arm64:numa: adding numa support for arm64 platforms.
Hanjun Guo (4): ACPI / NUMA: Use pr_fmt() instead of printk ACPI / NUMA: Remove redundant ACPI_DEBUG_OUTPUT ARM64 / ACPI: NUMA support based on SRAT and SLIT ACPI / NUMA: Enable ACPI based NUMA on ARM64
arch/arm64/Kconfig | 33 ++ arch/arm64/include/asm/acpi.h | 3 + arch/arm64/include/asm/mmzone.h | 32 ++ arch/arm64/include/asm/numa.h | 46 +++ arch/arm64/kernel/Makefile | 1 + arch/arm64/kernel/acpi_numa.c | 176 +++++++++++ arch/arm64/kernel/setup.c | 10 + arch/arm64/kernel/smp.c | 2 + arch/arm64/mm/Makefile | 1 + arch/arm64/mm/init.c | 34 +- arch/arm64/mm/numa.c | 675 ++++++++++++++++++++++++++++++++++++++++ drivers/acpi/Kconfig | 2 +- drivers/acpi/numa.c | 64 ++-- drivers/acpi/processor_core.c | 5 + include/acpi/processor.h | 1 + include/linux/acpi.h | 15 + 16 files changed, 1074 insertions(+), 26 deletions(-) create mode 100644 arch/arm64/include/asm/mmzone.h create mode 100644 arch/arm64/include/asm/numa.h create mode 100644 arch/arm64/kernel/acpi_numa.c create mode 100644 arch/arm64/mm/numa.c
From: Ganapatrao Kulkarni ganapatrao.kulkarni@caviumnetworks.com
Adding NUMA support for arm64-based platforms, creating the NUMA mapping by parsing the DT node "numa-map".
Signed-off-by: Ganapatrao Kulkarni ganapatrao.kulkarni@caviumnetworks.com --- arch/arm64/Kconfig | 33 +++ arch/arm64/include/asm/mmzone.h | 32 ++ arch/arm64/include/asm/numa.h | 35 +++ arch/arm64/kernel/setup.c | 8 + arch/arm64/kernel/smp.c | 2 + arch/arm64/mm/Makefile | 1 + arch/arm64/mm/init.c | 34 ++- arch/arm64/mm/numa.c | 631 ++++++++++++++++++++++++++++++++++++++++ 8 files changed, 770 insertions(+), 6 deletions(-) create mode 100644 arch/arm64/include/asm/mmzone.h create mode 100644 arch/arm64/include/asm/numa.h create mode 100644 arch/arm64/mm/numa.c
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 46d3c38..3dd6d87 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -62,6 +62,7 @@ config ARM64 select HAVE_PERF_USER_STACK_DUMP select HAVE_RCU_TABLE_FREE select HAVE_SYSCALL_TRACEPOINTS + select HAVE_MEMBLOCK_NODE_MAP if NUMA select IRQ_DOMAIN select MODULES_USE_ELF_RELA select NO_BOOTMEM @@ -303,6 +304,38 @@ config HOTPLUG_CPU Say Y here to experiment with turning CPUs off and on. CPUs can be controlled through /sys/devices/system/cpu.
+# Common NUMA Features +config NUMA + bool "Numa Memory Allocation and Scheduler Support" + depends on SMP + ---help--- + Enable NUMA (Non Uniform Memory Access) support. + + The kernel will try to allocate memory used by a CPU on the + local memory controller of the CPU and add some more + NUMA awareness to the kernel. + +config ARM64_DT_NUMA + def_bool y + prompt "DT NUMA detection" + default n + ---help--- + Enable DT based numa. + +config NODES_SHIFT + int "Maximum NUMA Nodes (as a power of 2)" + range 1 10 + default "2" + depends on NEED_MULTIPLE_NODES + ---help--- + Specify the maximum number of NUMA Nodes available on the target + system. Increases memory reserved to accommodate various tables. + +config USE_PERCPU_NUMA_NODE_ID + def_bool y + depends on NUMA + + source kernel/Kconfig.preempt
config HZ diff --git a/arch/arm64/include/asm/mmzone.h b/arch/arm64/include/asm/mmzone.h new file mode 100644 index 0000000..d27ee66 --- /dev/null +++ b/arch/arm64/include/asm/mmzone.h @@ -0,0 +1,32 @@ +#ifndef __ASM_ARM64_MMZONE_H_ +#define __ASM_ARM64_MMZONE_H_ + +#ifdef CONFIG_NUMA + +#include <linux/mmdebug.h> +#include <asm/smp.h> +#include <linux/types.h> +#include <asm/numa.h> + +extern struct pglist_data *node_data[]; + +#define NODE_DATA(nid) (node_data[nid]) + + +struct numa_memblk { + u64 start; + u64 end; + int nid; +}; + +struct numa_meminfo { + int nr_blks; + struct numa_memblk blk[NR_NODE_MEMBLKS]; +}; + +void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi); +int __init numa_cleanup_meminfo(struct numa_meminfo *mi); +void __init numa_reset_distance(void); + +#endif /* CONFIG_NUMA */ +#endif /* __ASM_ARM64_MMZONE_H_ */ diff --git a/arch/arm64/include/asm/numa.h b/arch/arm64/include/asm/numa.h new file mode 100644 index 0000000..e4c2ed0 --- /dev/null +++ b/arch/arm64/include/asm/numa.h @@ -0,0 +1,35 @@ +#ifndef _ASM_ARM64_NUMA_H +#define _ASM_ARM64_NUMA_H + +#include <linux/nodemask.h> +#include <asm/topology.h> + +#ifdef CONFIG_NUMA + +#define NR_NODE_MEMBLKS (MAX_NUMNODES * 2) +#define ZONE_ALIGN (1UL << (MAX_ORDER + PAGE_SHIFT)) + +/* currently, arm64 implements flat NUMA topology */ +#define parent_node(node) (node) + +/* dummy definitions for pci functions */ +#define pcibus_to_node(node) 0 +#define cpumask_of_pcibus(bus) 0 + +const struct cpumask *cpumask_of_node(int node); +/* Mappings between node number and cpus on that node. 
*/ +extern cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; + +void __init arm64_numa_init(void); +int __init numa_add_memblk(u32 nodeid, u64 start, u64 end); +void numa_store_cpu_info(int cpu); +void numa_set_node(int cpu, int node); +void numa_clear_node(int cpu); +void numa_add_cpu(int cpu); +void numa_remove_cpu(int cpu); +#else /* CONFIG_NUMA */ +static inline void arm64_numa_init(void); +static inline void numa_store_cpu_info(int cpu) { } +static inline void arm64_numa_init() { } +#endif /* CONFIG_NUMA */ +#endif /* _ASM_ARM64_NUMA_H */ diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 070d99a..2ba96b2 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -438,6 +438,9 @@ static int __init topology_init(void) { int i;
+ for_each_online_node(i) + register_one_node(i); + for_each_possible_cpu(i) { struct cpu *cpu = &per_cpu(cpu_data.cpu, i); cpu->hotpluggable = 1; @@ -474,7 +477,12 @@ static int c_show(struct seq_file *m, void *v) * "processor". Give glibc what it expects. */ #ifdef CONFIG_SMP + if (IS_ENABLED(CONFIG_NUMA)) { + seq_printf(m, "processor\t: %d", i); + seq_printf(m, " [nid: %d]\n", cpu_to_node(i)); + } else { seq_printf(m, "processor\t: %d\n", i); + } #endif }
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 2988829..8c0acc7 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -50,6 +50,7 @@ #include <asm/sections.h> #include <asm/tlbflush.h> #include <asm/ptrace.h> +#include <asm/numa.h>
#define CREATE_TRACE_POINTS #include <trace/events/ipi.h> @@ -123,6 +124,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) static void smp_store_cpu_info(unsigned int cpuid) { store_cpu_topology(cpuid); + numa_store_cpu_info(cpuid); }
/* diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile index c56179e..c86e6de 100644 --- a/arch/arm64/mm/Makefile +++ b/arch/arm64/mm/Makefile @@ -3,3 +3,4 @@ obj-y := dma-mapping.o extable.o fault.o init.o \ ioremap.o mmap.o pgd.o mmu.o \ context.o proc.o pageattr.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o +obj-$(CONFIG_NUMA) += numa.o diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 494297c..6fd6802 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -39,6 +39,7 @@ #include <asm/setup.h> #include <asm/sizes.h> #include <asm/tlb.h> +#include <asm/numa.h>
#include "mm.h"
@@ -73,6 +74,20 @@ static phys_addr_t max_zone_dma_phys(void) return min(offset + (1ULL << 32), memblock_end_of_DRAM()); }
+#ifdef CONFIG_NUMA +static void __init zone_sizes_init(unsigned long min, unsigned long max) +{ + unsigned long max_zone_pfns[MAX_NR_ZONES]; + + memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); + if (IS_ENABLED(CONFIG_ZONE_DMA)) + max_zone_pfns[ZONE_DMA] = PFN_DOWN(max_zone_dma_phys()); + max_zone_pfns[ZONE_NORMAL] = max; + + free_area_init_nodes(max_zone_pfns); +} + +#else static void __init zone_sizes_init(unsigned long min, unsigned long max) { struct memblock_region *reg; @@ -111,6 +126,7 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max)
free_area_init_node(0, zone_size, min, zhole_size); } +#endif /* CONFIG_NUMA */
#ifdef CONFIG_HAVE_ARCH_PFN_VALID int pfn_valid(unsigned long pfn) @@ -128,10 +144,15 @@ static void arm64_memory_present(void) static void arm64_memory_present(void) { struct memblock_region *reg; + int nid = 0;
- for_each_memblock(memory, reg) - memory_present(0, memblock_region_memory_base_pfn(reg), - memblock_region_memory_end_pfn(reg)); + for_each_memblock(memory, reg) { +#ifdef CONFIG_NUMA + nid = reg->nid; +#endif + memory_present(nid, memblock_region_memory_base_pfn(reg), + memblock_region_memory_end_pfn(reg)); + } } #endif
@@ -167,6 +188,10 @@ void __init bootmem_init(void) min = PFN_UP(memblock_start_of_DRAM()); max = PFN_DOWN(memblock_end_of_DRAM());
+ high_memory = __va((max << PAGE_SHIFT) - 1) + 1; + max_pfn = max_low_pfn = max; + + arm64_numa_init(); /* * Sparsemem tries to allocate bootmem in memory_present(), so must be * done after the fixed reservations. @@ -175,9 +200,6 @@ void __init bootmem_init(void)
sparse_init(); zone_sizes_init(min, max); - - high_memory = __va((max << PAGE_SHIFT) - 1) + 1; - max_pfn = max_low_pfn = max; }
#ifndef CONFIG_SPARSEMEM_VMEMMAP diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c new file mode 100644 index 0000000..dbe76a3 --- /dev/null +++ b/arch/arm64/mm/numa.c @@ -0,0 +1,631 @@ +/* + * NUMA support, based on the x86 implementation. + * + * Copyright (C) 2014 Cavium Inc. + * Author: Ganapatrao Kulkarni gkulkarni@cavium.com + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/. + */ + +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/init.h> +#include <linux/bootmem.h> +#include <linux/memblock.h> +#include <linux/mmzone.h> +#include <linux/ctype.h> +#include <linux/module.h> +#include <linux/nodemask.h> +#include <linux/sched.h> +#include <linux/topology.h> +#include <linux/of.h> +#include <linux/of_fdt.h> +#include <asm/smp_plat.h> + +int __initdata numa_off; +nodemask_t numa_nodes_parsed __initdata; +static int numa_distance_cnt; +static u8 *numa_distance; + +struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; + +static struct { + u32 node_id; + u64 cpu_hwid; +}cpu_info[NR_CPUS]; + +EXPORT_SYMBOL(node_data); + +static struct numa_meminfo numa_meminfo; + +static __init int numa_setup(char *opt) +{ + if (!opt) + return -EINVAL; + if (!strncmp(opt, "off", 3)) { + pr_info("%s\n", "NUMA turned off"); + numa_off = 1; + } + return 0; +} +early_param("numa", numa_setup); + +cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; +EXPORT_SYMBOL(node_to_cpumask_map); + +/* + * Returns a 
pointer to the bitmask of CPUs on Node 'node'. + */ +const struct cpumask *cpumask_of_node(int node) +{ + if (node >= nr_node_ids) { + pr_warn("cpumask_of_node(%d): node > nr_node_ids(%d)\n", + node, nr_node_ids); + dump_stack(); + return cpu_none_mask; + } + if (node_to_cpumask_map[node] == NULL) { + pr_warn("cpumask_of_node(%d): no node_to_cpumask_map!\n", + node); + dump_stack(); + return cpu_online_mask; + } + return node_to_cpumask_map[node]; +} +EXPORT_SYMBOL(cpumask_of_node); + + +int cpu_to_node_map[NR_CPUS]; +EXPORT_SYMBOL(cpu_to_node_map); + +void numa_clear_node(int cpu) +{ + cpu_to_node_map[cpu] = NUMA_NO_NODE; +} + +/* + * Allocate node_to_cpumask_map based on number of available nodes + * Requires node_possible_map to be valid. + * + * Note: cpumask_of_node() is not valid until after this is done. + * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.) + */ +void __init setup_node_to_cpumask_map(void) +{ + unsigned int node; + + /* setup nr_node_ids if not done yet */ + if (nr_node_ids == MAX_NUMNODES) + setup_nr_node_ids(); + + /* allocate the map */ + for (node = 0; node < nr_node_ids; node++) + alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]); + + /* cpumask_of_node() will now work */ + pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids); +} + +/* + * Set the cpu to node and mem mapping + */ +void numa_store_cpu_info(cpu) +{ + cpu_to_node_map[cpu] = cpu_info[cpu].node_id; + /* mapping of MPIDR/hwid, node and logical id */ + cpu_info[cpu].cpu_hwid = cpu_logical_map(cpu); + cpumask_set_cpu(cpu, node_to_cpumask_map[cpu_to_node_map[cpu]]); + set_numa_node(cpu_to_node_map[cpu]); + set_numa_mem(local_memory_node(cpu_to_node_map[cpu])); +} + +/** + * numa_add_memblk_to - Add one numa_memblk to a numa_meminfo + */ + +static int __init numa_add_memblk_to(int nid, u64 start, u64 end, + struct numa_meminfo *mi) +{ + /* ignore zero length blks */ + if (start == end) + return 0; + + /* whine about and ignore invalid blks */ + if (start > end || nid < 0 
|| nid >= MAX_NUMNODES) { + pr_warn("NUMA: Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n", + nid, start, end - 1); + return 0; + } + + if (mi->nr_blks >= NR_NODE_MEMBLKS) { + pr_err("NUMA: too many memblk ranges\n"); + return -EINVAL; + } + + pr_info("NUMA: Adding memblock %d [0x%llx - 0x%llx] on node %d\n", + mi->nr_blks, start, end, nid); + mi->blk[mi->nr_blks].start = start; + mi->blk[mi->nr_blks].end = end; + mi->blk[mi->nr_blks].nid = nid; + mi->nr_blks++; + return 0; +} + +/** + * numa_add_memblk - Add one numa_memblk to numa_meminfo + * @nid: NUMA node ID of the new memblk + * @start: Start address of the new memblk + * @end: End address of the new memblk + * + * Add a new memblk to the default numa_meminfo. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +#define MAX_PHYS_ADDR ((phys_addr_t)~0) + +int __init numa_add_memblk(u32 nid, u64 base, u64 size) +{ + const u64 phys_offset = __pa(PAGE_OFFSET); + + base &= PAGE_MASK; + size &= PAGE_MASK; + + if (base > MAX_PHYS_ADDR) { + pr_warn("NUMA: Ignoring memory block 0x%llx - 0x%llx\n", + base, base + size); + return -ENOMEM; + } + + if (base + size > MAX_PHYS_ADDR) { + pr_info("NUMA: Ignoring memory range 0x%lx - 0x%llx\n", + ULONG_MAX, base + size); + size = MAX_PHYS_ADDR - base; + } + + if (base + size < phys_offset) { + pr_warn("NUMA: Ignoring memory block 0x%llx - 0x%llx\n", + base, base + size); + return -ENOMEM; + } + if (base < phys_offset) { + pr_info("NUMA: Ignoring memory range 0x%llx - 0x%llx\n", + base, phys_offset); + size -= phys_offset - base; + base = phys_offset; + } + + return numa_add_memblk_to(nid, base, base+size, &numa_meminfo); +} +EXPORT_SYMBOL(numa_add_memblk); + +/* Initialize NODE_DATA for a node on the local memory */ +static void __init setup_node_data(int nid, u64 start, u64 end) +{ + const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE); + u64 nd_pa; + void *nd; + int tnid; + + start = roundup(start, ZONE_ALIGN); + + pr_info("Initmem setup node %d [mem 
%#010Lx-%#010Lx]\n", + nid, start, end - 1); + + /* + * Allocate node data. Try node-local memory and then any node. + */ + nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid); + if (!nd_pa) { + nd_pa = __memblock_alloc_base(nd_size, SMP_CACHE_BYTES, + MEMBLOCK_ALLOC_ACCESSIBLE); + if (!nd_pa) { + pr_err("Cannot find %zu bytes in node %d\n", + nd_size, nid); + return; + } + } + nd = __va(nd_pa); + + /* report and initialize */ + pr_info(" NODE_DATA [mem %#010Lx-%#010Lx]\n", + nd_pa, nd_pa + nd_size - 1); + tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); + if (tnid != nid) + pr_info(" NODE_DATA(%d) on node %d\n", nid, tnid); + + node_data[nid] = nd; + memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); + NODE_DATA(nid)->node_id = nid; + NODE_DATA(nid)->node_start_pfn = start >> PAGE_SHIFT; + NODE_DATA(nid)->node_spanned_pages = (end - start) >> PAGE_SHIFT; + + node_set_online(nid); +} + +/* + * Set nodes, which have memory in @mi, in *@nodemask. + */ +static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask, + const struct numa_meminfo *mi) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(mi->blk); i++) + if (mi->blk[i].start != mi->blk[i].end && + mi->blk[i].nid != NUMA_NO_NODE) + node_set(mi->blk[i].nid, *nodemask); +} + +/* + * Sanity check to catch more bad NUMA configurations (they are amazingly + * common). Make sure the nodes cover all memory. + */ +static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi) +{ + u64 numaram, totalram; + int i; + + numaram = 0; + for (i = 0; i < mi->nr_blks; i++) { + u64 s = mi->blk[i].start >> PAGE_SHIFT; + u64 e = mi->blk[i].end >> PAGE_SHIFT; + + numaram += e - s; + numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e); + if ((s64)numaram < 0) + numaram = 0; + } + + totalram = max_pfn - absent_pages_in_range(0, max_pfn); + + /* We seem to lose 3 pages somewhere. Allow 1M of slack. 
*/ + if ((s64)(totalram - numaram) >= (1 << (20 - PAGE_SHIFT))) { + pr_err("NUMA: nodes only cover %lluMB of your %lluMB Total RAM. Not used.\n", + (numaram << PAGE_SHIFT) >> 20, + (totalram << PAGE_SHIFT) >> 20); + return false; + } + return true; +} + +/** + * numa_reset_distance - Reset NUMA distance table + * + * The current table is freed. The next numa_set_distance() call will + * create a new one. + */ +void __init numa_reset_distance(void) +{ + size_t size = numa_distance_cnt * numa_distance_cnt * + sizeof(numa_distance[0]); + + /* numa_distance could be 1LU marking allocation failure, test cnt */ + if (numa_distance_cnt) + memblock_free(__pa(numa_distance), size); + numa_distance_cnt = 0; + numa_distance = NULL; /* enable table creation */ +} + +static int __init numa_alloc_distance(void) +{ + nodemask_t nodes_parsed; + size_t size; + int i, j, cnt = 0; + u64 phys; + + /* size the new table and allocate it */ + nodes_parsed = numa_nodes_parsed; + numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo); + + for_each_node_mask(i, nodes_parsed) + cnt = i; + cnt++; + size = cnt * cnt * sizeof(numa_distance[0]); + + phys = memblock_find_in_range(0, PFN_PHYS(max_pfn), + size, PAGE_SIZE); + if (!phys) { + pr_warning("NUMA: Warning: can't allocate distance table!\n"); + /* don't retry until explicitly reset */ + numa_distance = (void *)1LU; + return -ENOMEM; + } + memblock_reserve(phys, size); + + numa_distance = __va(phys); + numa_distance_cnt = cnt; + + /* fill with the default distances */ + for (i = 0; i < cnt; i++) + for (j = 0; j < cnt; j++) + numa_distance[i * cnt + j] = i == j ? + LOCAL_DISTANCE : REMOTE_DISTANCE; + pr_debug("NUMA: Initialized distance table, cnt=%d\n", cnt); + + return 0; +} + +/** + * numa_set_distance - Set NUMA distance from one NUMA to another + * @from: the 'from' node to set distance + * @to: the 'to' node to set distance + * @distance: NUMA distance + * + * Set the distance from node @from to @to to @distance. 
If distance table + * doesn't exist, one which is large enough to accommodate all the currently + * known nodes will be created. + * + * If such table cannot be allocated, a warning is printed and further + * calls are ignored until the distance table is reset with + * numa_reset_distance(). + * + * If @from or @to is higher than the highest known node or lower than zero + * at the time of table creation or @distance doesn't make sense, the call + * is ignored. + * This is to allow simplification of specific NUMA config implementations. + */ +void __init numa_set_distance(int from, int to, int distance) +{ + if (!numa_distance && numa_alloc_distance() < 0) + return; + + if (from >= numa_distance_cnt || to >= numa_distance_cnt || + from < 0 || to < 0) { + pr_warn_once("NUMA: Warning: node ids are out of bound, from=%d to=%d distance=%d\n", + from, to, distance); + return; + } + + if ((u8)distance != distance || + (from == to && distance != LOCAL_DISTANCE)) { + pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n", + from, to, distance); + return; + } + + numa_distance[from * numa_distance_cnt + to] = distance; +} + +int __node_distance(int from, int to) +{ + if (from >= numa_distance_cnt || to >= numa_distance_cnt) + return from == to ? 
LOCAL_DISTANCE : REMOTE_DISTANCE; + return numa_distance[from * numa_distance_cnt + to]; +} +EXPORT_SYMBOL(__node_distance); + +static int __init numa_register_memblks(struct numa_meminfo *mi) +{ + unsigned long uninitialized_var(pfn_align); + int i, nid; + + /* Account for nodes with cpus and no memory */ + node_possible_map = numa_nodes_parsed; + numa_nodemask_from_meminfo(&node_possible_map, mi); + if (WARN_ON(nodes_empty(node_possible_map))) + return -EINVAL; + + for (i = 0; i < mi->nr_blks; i++) { + struct numa_memblk *mb = &mi->blk[i]; + + memblock_set_node(mb->start, mb->end - mb->start, + &memblock.memory, mb->nid); + } + + /* + * If sections array is gonna be used for pfn -> nid mapping, check + * whether its granularity is fine enough. + */ +#ifdef NODE_NOT_IN_PAGE_FLAGS + pfn_align = node_map_pfn_alignment(); + if (pfn_align && pfn_align < PAGES_PER_SECTION) { + pr_warn("Node alignment %lluMB < min %lluMB, rejecting NUMA config\n", + PFN_PHYS(pfn_align) >> 20, + PFN_PHYS(PAGES_PER_SECTION) >> 20); + return -EINVAL; + } +#endif + if (!numa_meminfo_cover_memory(mi)) + return -EINVAL; + + /* Finally register nodes. */ + for_each_node_mask(nid, node_possible_map) { + u64 start = PFN_PHYS(max_pfn); + u64 end = 0; + + for (i = 0; i < mi->nr_blks; i++) { + if (nid != mi->blk[i].nid) + continue; + start = min(mi->blk[i].start, start); + end = max(mi->blk[i].end, end); + } + + if (start < end) + setup_node_data(nid, start, end); + } + + /* Dump memblock with node info and return. 
*/ + memblock_dump_all(); + return 0; +} + +static int __init numa_init(int (*init_func)(void)) +{ + int ret, i; + + nodes_clear(node_possible_map); + nodes_clear(node_online_map); + + ret = init_func(); + if (ret < 0) + return ret; + + ret = numa_register_memblks(&numa_meminfo); + if (ret < 0) + return ret; + + for (i = 0; i < nr_cpu_ids; i++) + numa_clear_node(i); + + setup_node_to_cpumask_map(); + return 0; +} + +/** + * dummy_numa_init - Fallback dummy NUMA init + * + * Used if there's no underlying NUMA architecture, NUMA initialization + * fails, or NUMA is disabled on the command line. + * + * Must online at least one node and add memory blocks that cover all + * allowed memory. This function must not fail. + */ +static int __init dummy_numa_init(void) +{ + pr_info("%s\n","No NUMA configuration found"); + pr_info("Faking a node at [mem %#018Lx-%#018Lx]\n", + 0LLU, PFN_PHYS(max_pfn) - 1); + node_set(0, numa_nodes_parsed); + numa_add_memblk(0, 0, PFN_PHYS(max_pfn)); + + return 0; +} + +/** + * early_init_dt_scan_numa_map - parse memory node and map nid to memory range. 
+ */ +int __init early_init_dt_scan_numa_map(unsigned long node, const char *uname, + int depth, void *data) +{ + const __be32 *numa_prop; + int nr_address_cells = OF_ROOT_NODE_ADDR_CELLS_DEFAULT; + int nr_size_cells = OF_ROOT_NODE_SIZE_CELLS_DEFAULT; + int node_count = MAX_NUMNODES; + int mem_ranges, cpu_ranges, matrix_count, i, length; + + /* We are scanning "numa-map" nodes only */ + if (strcmp(uname, "numa-map") != 0) + return 0; + + numa_prop = of_get_flat_dt_prop(node, "#address-cells", &length); + if (numa_prop) + nr_address_cells = dt_mem_next_cell( + OF_ROOT_NODE_ADDR_CELLS_DEFAULT, &numa_prop); + pr_debug("NUMA-DT: #nr_address_cells = %u\n",nr_address_cells); + + numa_prop = of_get_flat_dt_prop(node, "#size-cells", &length); + if (numa_prop) + nr_size_cells = dt_mem_next_cell( + OF_ROOT_NODE_ADDR_CELLS_DEFAULT, &numa_prop); + pr_debug("NUMA-DT: #nr_size_cells = %d\n",nr_size_cells); + + numa_prop = of_get_flat_dt_prop(node, "#node-count", &length); + if (numa_prop == NULL) + return -EINVAL; + node_count = dt_mem_next_cell(nr_size_cells, &numa_prop); + pr_debug("NUMA-DT: #node-count = %d\n",node_count); + + if (node_count > MAX_NUMNODES) + BUG(); + + for(i = 0; i <node_count; i++) + node_set(i, numa_nodes_parsed); + + numa_prop = of_get_flat_dt_prop(node, "mem-map", &length); + if (numa_prop == NULL) + return -EINVAL; + mem_ranges = (length / + sizeof(__be32))/(nr_address_cells + nr_size_cells); + for (i = 0; i < mem_ranges; i++) { + u64 base; + u32 node; + struct memblock_region *reg; + + base = dt_mem_next_cell(nr_address_cells, &numa_prop); + node = dt_mem_next_cell(nr_size_cells, &numa_prop); + pr_debug("NUMA-DT: mem-address = %llx , node = %u\n", + base, node); + for_each_memblock(memory, reg) { + if (reg->base == base) { + numa_add_memblk(node, reg->base,reg->size); + break; + } + } + } + + numa_prop = of_get_flat_dt_prop(node, "cpu-map", &length); + if (numa_prop == NULL) + return -EINVAL; + cpu_ranges = ((length / sizeof(__be32)) / + 
(nr_address_cells + nr_size_cells)); + for (i = 0; i < cpu_ranges; i++) { + u32 cpus, cpue, node_id, j; + cpus = dt_mem_next_cell(nr_size_cells, &numa_prop); + cpue = dt_mem_next_cell(nr_size_cells, &numa_prop); + node_id = dt_mem_next_cell(nr_size_cells, &numa_prop); + for (j = cpus; j <= cpue; j++) + cpu_info[j].node_id = node_id; + pr_debug("NUMA-DT: start cpu = %d end cpu = %d node-id %d\n", + cpus, cpue, node_id); + } + + + numa_prop = of_get_flat_dt_prop(node, "node-matrix", &length); + if (numa_prop == NULL) + return -EINVAL; + + matrix_count = ((length / sizeof(__be32)) / (3 * nr_size_cells)); + for (i = 0; i < matrix_count; i++) { + u32 nodea, nodeb, distance; + + nodea = dt_mem_next_cell(nr_size_cells, &numa_prop); + nodeb = dt_mem_next_cell(nr_size_cells, &numa_prop); + distance = dt_mem_next_cell(nr_size_cells, &numa_prop); + + numa_set_distance(nodea, nodeb, distance); + pr_debug("NUMA-DT: distance[node%d -> node%d] = %d\n", + nodea, nodeb, distance); + /* Set default distance of node B->A same as A->B */ + if (nodeb > nodea) + numa_set_distance(nodeb, nodea, distance); + } + + return 0; +} + +/* DT node mapping is done already early_init_dt_scan_memory */ +static inline int __init arm64_dt_numa_init(void) +{ + return of_scan_flat_dt(early_init_dt_scan_numa_map, NULL); +} + +/** + * arm64_numa_init - Initialize NUMA + * + * Try each configured NUMA initialization method until one succeeds. The + * last fallback is dummy single node config encomapssing whole memory and + * never fails. + */ +void __init arm64_numa_init(void) +{ + if (!numa_off) { +#ifdef CONFIG_ARM64_DT_NUMA + if (!numa_init(arm64_dt_numa_init)) + return; +#endif + } + + numa_init(dummy_numa_init); +}
On Tuesday 09 December 2014 00:53:03 Hanjun Guo wrote:
From: Ganapatrao Kulkarni ganapatrao.kulkarni@caviumnetworks.com
Adding numa support for arm64 based platforms. creating numa mapping by parsing the dt node numa-map.
Signed-off-by: Ganapatrao Kulkarni ganapatrao.kulkarni@caviumnetworks.com
This patch is currently under review and I assume it will change a lot to match the powerpc way of encoding associativity.
Arnd
Hi Arnd,
On 2014年12月09日 01:08, Arnd Bergmann wrote:
On Tuesday 09 December 2014 00:53:03 Hanjun Guo wrote:
From: Ganapatrao Kulkarni ganapatrao.kulkarni@caviumnetworks.com
Adding numa support for arm64 based platforms. creating numa mapping by parsing the dt node numa-map.
Signed-off-by: Ganapatrao Kulkarni ganapatrao.kulkarni@caviumnetworks.com
This patch is currently under review and I assume it will change a lot to match the powerpc way of encoding associativity.
Yes, it definitely will; I will rebase on top of that patch once it is accepted.
thanks Hanjun
________________________________________ From: Arnd Bergmann arnd@arndb.de Sent: Monday, December 8, 2014 10:38 PM To: linaro-acpi@lists.linaro.org Cc: Hanjun Guo; tomasz.nowicki@linaro.org; Kulkarni, Ganapatrao; ashoks@broadcom.com Subject: Re: [Linaro-acpi] [RFC PATCH 1/5] arm64:numa: adding numa support for arm64 platforms.
Hi Arnd,
On Tuesday 09 December 2014 00:53:03 Hanjun Guo wrote:
From: Ganapatrao Kulkarni ganapatrao.kulkarni@caviumnetworks.com
Adding numa support for arm64 based platforms. creating numa mapping by parsing the dt node numa-map.
Signed-off-by: Ganapatrao Kulkarni ganapatrao.kulkarni@caviumnetworks.com
This patch is currently under review and I assume it will change a lot to match the powerpc way of encoding associativity.
There is one function (arm64_dt_numa_init) in this patch which does the DT parsing; only that function changes. The rest of the code, which is common to ACPI and DT, should remain the same.
Arnd
thanks Ganapat
Just do some cleanups to replace the printk() calls with the pr_*() macros, using pr_fmt() to provide the message prefix.
Signed-off-by: Hanjun Guo hanjun.guo@linaro.org --- drivers/acpi/numa.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-)
diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c index 24b5476..8b407fc 100644 --- a/drivers/acpi/numa.c +++ b/drivers/acpi/numa.c @@ -22,6 +22,9 @@ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * */ + +#define pr_fmt(fmt) "ACPI: " fmt + #include <linux/module.h> #include <linux/init.h> #include <linux/kernel.h> @@ -30,8 +33,6 @@ #include <linux/acpi.h> #include <linux/numa.h>
-#define PREFIX "ACPI: " - #define ACPI_NUMA 0x80000000 #define _COMPONENT ACPI_NUMA ACPI_MODULE_NAME("numa"); @@ -145,9 +146,8 @@ acpi_table_print_srat_entry(struct acpi_subtable_header *header) #endif /* ACPI_DEBUG_OUTPUT */ break; default: - printk(KERN_WARNING PREFIX - "Found unsupported SRAT entry (type = 0x%x)\n", - header->type); + pr_warn("Found unsupported SRAT entry (type = 0x%x)\n", + header->type); break; } } @@ -185,7 +185,7 @@ static int __init acpi_parse_slit(struct acpi_table_header *table) slit = (struct acpi_table_slit *)table;
if (!slit_valid(slit)) { - printk(KERN_INFO "ACPI: SLIT table looks invalid. Not used.\n"); + pr_info("SLIT table looks invalid. Not used.\n"); return -EINVAL; } acpi_numa_slit_init(slit); @@ -196,12 +196,9 @@ static int __init acpi_parse_slit(struct acpi_table_header *table) void __init __weak acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) { - printk(KERN_WARNING PREFIX - "Found unsupported x2apic [0x%08x] SRAT entry\n", pa->apic_id); - return; + pr_warn("Found unsupported x2apic [0x%08x] SRAT entry\n", pa->apic_id); }
- static int __init acpi_parse_x2apic_affinity(struct acpi_subtable_header *header, const unsigned long end)
ACPI_DEBUG_PRINT() will be replaced with an empty statement when ACPI_DEBUG_OUTPUT is not defined, so the #ifdef ACPI_DEBUG_OUTPUT guards are redundant and can be removed.
Signed-off-by: Hanjun Guo hanjun.guo@linaro.org --- drivers/acpi/numa.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-)
diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c index 8b407fc..b217333 100644 --- a/drivers/acpi/numa.c +++ b/drivers/acpi/numa.c @@ -87,16 +87,12 @@ int acpi_map_pxm_to_node(int pxm) static void __init acpi_table_print_srat_entry(struct acpi_subtable_header *header) { - - ACPI_FUNCTION_NAME("acpi_table_print_srat_entry"); - if (!header) return;
switch (header->type) {
case ACPI_SRAT_TYPE_CPU_AFFINITY: -#ifdef ACPI_DEBUG_OUTPUT { struct acpi_srat_cpu_affinity *p = (struct acpi_srat_cpu_affinity *)header; @@ -107,11 +103,9 @@ acpi_table_print_srat_entry(struct acpi_subtable_header *header) (p->flags & ACPI_SRAT_CPU_ENABLED)? "enabled" : "disabled")); } -#endif /* ACPI_DEBUG_OUTPUT */ break;
case ACPI_SRAT_TYPE_MEMORY_AFFINITY: -#ifdef ACPI_DEBUG_OUTPUT { struct acpi_srat_mem_affinity *p = (struct acpi_srat_mem_affinity *)header; @@ -127,11 +121,9 @@ acpi_table_print_srat_entry(struct acpi_subtable_header *header) (p->flags & ACPI_SRAT_MEM_NON_VOLATILE)? " non-volatile" : "")); } -#endif /* ACPI_DEBUG_OUTPUT */ break;
case ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY: -#ifdef ACPI_DEBUG_OUTPUT { struct acpi_srat_x2apic_cpu_affinity *p = (struct acpi_srat_x2apic_cpu_affinity *)header; @@ -143,8 +135,8 @@ acpi_table_print_srat_entry(struct acpi_subtable_header *header) (p->flags & ACPI_SRAT_CPU_ENABLED) ? "enabled" : "disabled")); } -#endif /* ACPI_DEBUG_OUTPUT */ break; + default: pr_warn("Found unsupported SRAT entry (type = 0x%x)\n", header->type);
Introduce a new file to hold the ACPI-based NUMA information parsed from SRAT and SLIT.
Signed-off-by: Hanjun Guo hanjun.guo@linaro.org --- arch/arm64/include/asm/acpi.h | 3 + arch/arm64/include/asm/numa.h | 27 +++++-- arch/arm64/kernel/Makefile | 1 + arch/arm64/kernel/acpi_numa.c | 176 ++++++++++++++++++++++++++++++++++++++++++ arch/arm64/kernel/setup.c | 2 + arch/arm64/mm/numa.c | 70 +++++++++++++---- drivers/acpi/processor_core.c | 5 ++ include/acpi/processor.h | 1 + 8 files changed, 264 insertions(+), 21 deletions(-) create mode 100644 arch/arm64/kernel/acpi_numa.c
diff --git a/arch/arm64/include/asm/acpi.h b/arch/arm64/include/asm/acpi.h index 483ff45..4ccfb89 100644 --- a/arch/arm64/include/asm/acpi.h +++ b/arch/arm64/include/asm/acpi.h @@ -89,11 +89,14 @@ static inline bool acpi_has_cpu_in_madt(void) static inline void arch_fix_phys_package_id(int num, u32 slot) { } void __init acpi_smp_init_cpus(void);
+int __init arm64_acpi_numa_init(void); + #else static inline void disable_acpi(void) { } static inline bool acpi_psci_present(void) { return false; } static inline bool acpi_psci_use_hvc(void) { return false; } static inline void acpi_smp_init_cpus(void) { } +static inline int __init arm64_acpi_numa_init(void) { return -ENODEV; } #endif /* CONFIG_ACPI */
#endif /*_ASM_ACPI_H*/ diff --git a/arch/arm64/include/asm/numa.h b/arch/arm64/include/asm/numa.h index e4c2ed0..8165bc9 100644 --- a/arch/arm64/include/asm/numa.h +++ b/arch/arm64/include/asm/numa.h @@ -1,5 +1,5 @@ -#ifndef _ASM_ARM64_NUMA_H -#define _ASM_ARM64_NUMA_H +#ifndef _ASM_NUMA_H +#define _ASM_NUMA_H
#include <linux/nodemask.h> #include <asm/topology.h> @@ -16,6 +16,14 @@ #define pcibus_to_node(node) 0 #define cpumask_of_pcibus(bus) 0
+struct node_cpu_hwid { + u32 node_id; /* logical node containing this CPU */ + u64 cpu_hwid; /* MPIDR for this CPU */ +}; + +extern struct node_cpu_hwid node_cpuid[NR_CPUS]; +extern nodemask_t numa_nodes_parsed __initdata; + const struct cpumask *cpumask_of_node(int node); /* Mappings between node number and cpus on that node. */ extern cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; @@ -23,13 +31,16 @@ extern cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; void __init arm64_numa_init(void); int __init numa_add_memblk(u32 nodeid, u64 start, u64 end); void numa_store_cpu_info(int cpu); -void numa_set_node(int cpu, int node); void numa_clear_node(int cpu); -void numa_add_cpu(int cpu); -void numa_remove_cpu(int cpu); +void __init build_cpu_to_node_map(void); +void __init numa_set_distance(int from, int to, int distance); #else /* CONFIG_NUMA */ -static inline void arm64_numa_init(void); +static inline void arm64_numa_init(void); { } static inline void numa_store_cpu_info(int cpu) { } -static inline void arm64_numa_init() { } +static inline void __init build_cpu_to_node_map(void) { } +static inline void __init numa_set_distance(int from, int to, int distance) +{ + return; +} #endif /* CONFIG_NUMA */ -#endif /* _ASM_ARM64_NUMA_H */ +#endif /* _ASM_NUMA_H */ diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index ec576e6..c23c872 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -32,6 +32,7 @@ arm64-obj-$(CONFIG_KGDB) += kgdb.o arm64-obj-$(CONFIG_EFI) += efi.o efi-stub.o efi-entry.o arm64-obj-$(CONFIG_PCI) += pci.o arm64-obj-$(CONFIG_ACPI) += acpi.o +arm64-obj-$(CONFIG_ACPI_NUMA) += acpi_numa.o arm64-obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o
obj-y += $(arm64-obj-y) vdso/ diff --git a/arch/arm64/kernel/acpi_numa.c b/arch/arm64/kernel/acpi_numa.c new file mode 100644 index 0000000..d0245de --- /dev/null +++ b/arch/arm64/kernel/acpi_numa.c @@ -0,0 +1,176 @@ +/* + * ACPI 5.1 based NUMA setup for ARM64 + * Lots of code was borrowed from arch/x86/mm/srat.c + * + * Copyright 2004 Andi Kleen, SuSE Labs. + * Copyright (C) 2013-2014, Linaro Ltd. + * Author: Hanjun Guo hanjun.guo@linaro.org + * + * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs. + * + * Called from acpi_numa_init while reading the SRAT and SLIT tables. + * Assumes all memory regions belonging to a single proximity domain + * are in one chunk. Holes between them will be included in the node. + */ + +#define pr_fmt(fmt) "ACPI: SRAT: " fmt + +#include <linux/acpi.h> +#include <linux/bitmap.h> +#include <linux/bootmem.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/memblock.h> +#include <linux/mmzone.h> +#include <linux/module.h> +#include <linux/topology.h> + +#include <acpi/processor.h> +#include <asm/numa.h> + +int acpi_numa __initdata; + +static __init int setup_node(int pxm) +{ + return acpi_map_pxm_to_node(pxm); +} + +static __init void bad_srat(void) +{ + pr_err("SRAT not used.\n"); + acpi_numa = -1; +} + +static __init inline int srat_disabled(void) +{ + return acpi_numa < 0; +} + +/* + * Callback for SLIT parsing. 
+ * It will get the distance information presented by SLIT + * and init the distance matrix of numa nodes + */ +void __init acpi_numa_slit_init(struct acpi_table_slit *slit) +{ + int i, j; + + for (i = 0; i < slit->locality_count; i++) { + const int from_node = pxm_to_node(i); + + if (from_node == NUMA_NO_NODE) + continue; + + for (j = 0; j < slit->locality_count; j++) { + const int to_node = pxm_to_node(j); + + if (to_node == NUMA_NO_NODE) + continue; + + numa_set_distance(from_node, to_node, + slit->entry[slit->locality_count * i + j]); + } + } +} + +/* Callback for Proximity Domain -> ACPI processor UID mapping */ +void __init +acpi_numa_gicc_affinity_init(struct acpi_srat_gicc_affinity *pa) +{ + int pxm, node; + int acpi_id; + int acpi_phys_id; + u64 mpidr; + static int cpus_in_srat; + + if (srat_disabled()) + return; + if (pa->header.length < sizeof(struct acpi_srat_gicc_affinity)) { + bad_srat(); + return; + } + if (!(pa->flags & ACPI_SRAT_GICC_ENABLED)) + return; + + if (cpus_in_srat >= ARRAY_SIZE(node_cpuid)) { + printk_once(KERN_WARNING + "node_cpuid[%ld] is too small, may not be able to use all cpus\n", + ARRAY_SIZE(node_cpuid)); + return; + } + + pxm = pa->proximity_domain; + node = setup_node(pxm); + if (node < 0) { + pr_err("Too many proximity domains %x\n", pxm); + bad_srat(); + return; + } + + acpi_id = pa->acpi_processor_uid; + acpi_phys_id = acpi_get_phys_id_in_madt(ACPI_MADT_TYPE_GENERIC_INTERRUPT, + acpi_id); + if (acpi_phys_id) { + pr_info("PXM %d with ACPI ID %d has no valid MPIDR in MADT\n", pxm, acpi_id); + return; + } + + /* MPIDR was packed into 32 bits */ + mpidr = ((acpi_phys_id & 0xff000000) << 8) | (acpi_phys_id & 0x00ffffff); + node_cpuid[cpus_in_srat].node_id = node; + node_cpuid[cpus_in_srat].cpu_hwid = mpidr; + node_set(node, numa_nodes_parsed); + acpi_numa = 1; + cpus_in_srat++; + pr_info("PXM %u -> MPIDR 0x%08x -> Node %u\n", pxm, mpidr, node); +} + +/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ +int 
__init +acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) +{ + u64 start, end; + int node, pxm; + + if (srat_disabled()) + goto out_err; + if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) + goto out_err_bad_srat; + + start = ma->base_address; + end = start + ma->length; + pxm = ma->proximity_domain; + + node = setup_node(pxm); + if (node < 0) { + printk(KERN_ERR "SRAT: Too many proximity domains.\n"); + goto out_err_bad_srat; + } + + if (numa_add_memblk(node, start, end) < 0) + goto out_err_bad_srat; + + node_set(node, numa_nodes_parsed); + + pr_info("SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]\n", + node, pxm, + (unsigned long long) start, (unsigned long long) end - 1); + + return 0; +out_err_bad_srat: + bad_srat(); +out_err: + return -EINVAL; +} + +void __init acpi_numa_arch_fixup(void) {} + +int __init arm64_acpi_numa_init(void) +{ + int ret; + + ret = acpi_numa_init(); + if (ret < 0) + return ret; + return srat_disabled() ? -EINVAL : 0; +} diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 2ba96b2..d5bd782 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -414,6 +414,8 @@ void __init setup_arch(char **cmdline_p) acpi_smp_init_cpus(); }
+ build_cpu_to_node_map(); + #ifdef CONFIG_SMP smp_build_mpidr_hash(); #endif diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c index dbe76a3..441f75a 100644 --- a/arch/arm64/mm/numa.c +++ b/arch/arm64/mm/numa.c @@ -17,6 +17,7 @@ * along with this program. If not, see http://www.gnu.org/licenses/. */
+#include <linux/acpi.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/string.h> @@ -31,7 +32,9 @@ #include <linux/topology.h> #include <linux/of.h> #include <linux/of_fdt.h> + #include <asm/smp_plat.h> +#include <asm/acpi.h>
int __initdata numa_off; nodemask_t numa_nodes_parsed __initdata; @@ -39,14 +42,10 @@ static int numa_distance_cnt; static u8 *numa_distance;
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; - -static struct { - u32 node_id; - u64 cpu_hwid; -}cpu_info[NR_CPUS]; - EXPORT_SYMBOL(node_data);
+struct node_cpu_hwid node_cpuid[NR_CPUS]; + static struct numa_meminfo numa_meminfo;
static __init int numa_setup(char *opt) @@ -85,7 +84,6 @@ const struct cpumask *cpumask_of_node(int node) } EXPORT_SYMBOL(cpumask_of_node);
- int cpu_to_node_map[NR_CPUS]; EXPORT_SYMBOL(cpu_to_node_map);
@@ -94,6 +92,46 @@ void numa_clear_node(int cpu) cpu_to_node_map[cpu] = NUMA_NO_NODE; }
+void map_cpu_to_node(int cpu, int nid) +{ + if (nid < 0) { /* just initialize by zero */ + cpu_to_node_map[cpu] = 0; + return; + } + + if (!node_online(nid)) + nid = first_online_node; /* FIXME: find nearest node instead */ + + cpu_to_node_map[cpu] = nid; + cpumask_set_cpu(cpu, node_to_cpumask_map[nid]); + set_numa_node(nid); +} + +/** + * build_cpu_to_node_map - setup cpu to node and node to cpumask arrays + * + * Build cpu to node mapping and initialize the per node cpu masks using + * info from the node_cpuid array handed to us by ACPI or DT. + */ +void __init build_cpu_to_node_map(void) +{ + int cpu, i, node; + + for (node = 0; node < MAX_NUMNODES; node++) + cpumask_clear(node_to_cpumask_map[node]); + + for_each_possible_cpu(cpu) { + node = NUMA_NO_NODE; + for (i = 0; i < NR_CPUS; i++) { + if (cpu_physical_id(cpu) == node_cpuid[i].cpu_hwid) { + node = node_cpuid[i].node_id; + break; + } + } + map_cpu_to_node(cpu, node); + } +} + /* * Allocate node_to_cpumask_map based on number of available nodes * Requires node_possible_map to be valid. @@ -122,9 +160,11 @@ void __init setup_node_to_cpumask_map(void) */ void numa_store_cpu_info(cpu) { - cpu_to_node_map[cpu] = cpu_info[cpu].node_id; + if (!acpi_disabled) /* TODO: should be updated with new patches */ + return; + cpu_to_node_map[cpu] = node_cpuid[cpu].node_id; /* mapping of MPIDR/hwid, node and logical id */ - cpu_info[cpu].cpu_hwid = cpu_logical_map(cpu); + node_cpuid[cpu].cpu_hwid = cpu_logical_map(cpu); cpumask_set_cpu(cpu, node_to_cpumask_map[cpu_to_node_map[cpu]]); set_numa_node(cpu_to_node_map[cpu]); set_numa_mem(local_memory_node(cpu_to_node_map[cpu])); @@ -503,6 +543,7 @@ static int __init dummy_numa_init(void) return 0; }
+#ifdef CONFIG_ARM64_DT_NUMA /** * early_init_dt_scan_numa_map - parse memory node and map nid to memory range. */ @@ -576,7 +617,7 @@ int __init early_init_dt_scan_numa_map(unsigned long node, const char *uname, cpue = dt_mem_next_cell(nr_size_cells, &numa_prop); node_id = dt_mem_next_cell(nr_size_cells, &numa_prop); for (j = cpus; j <= cpue; j++) - cpu_info[j].node_id = node_id; + node_cpuid[j].node_id = node_id; pr_debug("NUMA-DT: start cpu = %d end cpu = %d node-id %d\n", cpus, cpue, node_id); } @@ -610,6 +651,9 @@ static inline int __init arm64_dt_numa_init(void) { return of_scan_flat_dt(early_init_dt_scan_numa_map, NULL); } +#else +static inline int __init arm64_dt_numa_init(void) { return -EINVAL; } +#endif
/** * arm64_numa_init - Initialize NUMA @@ -621,10 +665,10 @@ static inline int __init arm64_dt_numa_init(void) void __init arm64_numa_init(void) { if (!numa_off) { -#ifdef CONFIG_ARM64_DT_NUMA - if (!numa_init(arm64_dt_numa_init)) + if (acpi_disabled && !numa_init(arm64_dt_numa_init)) + return; + else if (!numa_init(arm64_acpi_numa_init)) return; -#endif }
numa_init(dummy_numa_init); diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c index 5c84e0d..b12fc7a 100644 --- a/drivers/acpi/processor_core.c +++ b/drivers/acpi/processor_core.c @@ -175,6 +175,11 @@ exit: return apic_id; }
+int acpi_get_phys_id_in_madt(int type, u32 acpi_id) +{ + return map_madt_entry(type, acpi_id); +} + int acpi_get_apicid(acpi_handle handle, int type, u32 acpi_id) { int apic_id; diff --git a/include/acpi/processor.h b/include/acpi/processor.h index 9b9b6f2..d88c985 100644 --- a/include/acpi/processor.h +++ b/include/acpi/processor.h @@ -317,6 +317,7 @@ void acpi_processor_set_pdc(acpi_handle handle); int acpi_get_apicid(acpi_handle, int type, u32 acpi_id); int acpi_map_cpuid(int apic_id, u32 acpi_id); int acpi_get_cpuid(acpi_handle, int type, u32 acpi_id); +int acpi_get_phys_id_in_madt(int type, u32 acpi_id);
/* in processor_throttling.c */ int acpi_processor_tstate_has_changed(struct acpi_processor *pr);
On Tuesday 09 December 2014 00:53:06 Hanjun Guo wrote:
diff --git a/arch/arm64/kernel/acpi_numa.c b/arch/arm64/kernel/acpi_numa.c new file mode 100644 index 0000000..d0245de --- /dev/null +++ b/arch/arm64/kernel/acpi_numa.c @@ -0,0 +1,176 @@ +/*
- ACPI 5.1 based NUMA setup for ARM64
- Lots of code was borrowed from arch/x86/mm/srat.c
I think you need to return what you borrowed ;-)
Can the ACPI table parsing code just be moved to drivers/acpi for sharing? Is this different between x86 and ia64?
+#ifdef CONFIG_ARM64_DT_NUMA /**
- early_init_dt_scan_numa_map - parse memory node and map nid to memory range.
*/ @@ -576,7 +617,7 @@ int __init early_init_dt_scan_numa_map(unsigned long node, const char *uname, cpue = dt_mem_next_cell(nr_size_cells, &numa_prop); node_id = dt_mem_next_cell(nr_size_cells, &numa_prop); for (j = cpus; j <= cpue; j++)
cpu_info[j].node_id = node_id;
node_cpuid[j].node_id = node_id;
pr_debug("NUMA-DT: start cpu = %d end cpu = %d node-id %d\n", cpus, cpue, node_id); }
@@ -610,6 +651,9 @@ static inline int __init arm64_dt_numa_init(void) { return of_scan_flat_dt(early_init_dt_scan_numa_map, NULL); } +#else +static inline int __init arm64_dt_numa_init(void) { return -EINVAL; } +#endif
Please try to avoid adding #ifdef sequences like this and instead use something like
if (!IS_ENABLED(CONFIG_ARM64_DT_NUMA)) return -ENXIO;
or change the caller to only call this function like this.
Arnd
On 2014年12月09日 01:07, Arnd Bergmann wrote:
On Tuesday 09 December 2014 00:53:06 Hanjun Guo wrote:
diff --git a/arch/arm64/kernel/acpi_numa.c b/arch/arm64/kernel/acpi_numa.c new file mode 100644 index 0000000..d0245de --- /dev/null +++ b/arch/arm64/kernel/acpi_numa.c @@ -0,0 +1,176 @@ +/*
- ACPI 5.1 based NUMA setup for ARM64
- Lots of code was borrowed from arch/x86/mm/srat.c
I think you need to return what you borrowed ;-)
Can the ACPI table parsing code just be moved to drivers/acpi for sharing? Is this different between x86 and ia64?
yes, x86 and ia64 are quite different and hard to keep them as one driver, because they have different mapping mechanism between cpu hardware id and NUMA node ID.
+#ifdef CONFIG_ARM64_DT_NUMA /**
- early_init_dt_scan_numa_map - parse memory node and map nid to memory range.
*/ @@ -576,7 +617,7 @@ int __init early_init_dt_scan_numa_map(unsigned long node, const char *uname, cpue = dt_mem_next_cell(nr_size_cells, &numa_prop); node_id = dt_mem_next_cell(nr_size_cells, &numa_prop); for (j = cpus; j <= cpue; j++)
cpu_info[j].node_id = node_id;
node_cpuid[j].node_id = node_id;
pr_debug("NUMA-DT: start cpu = %d end cpu = %d node-id %d\n", cpus, cpue, node_id); }
@@ -610,6 +651,9 @@ static inline int __init arm64_dt_numa_init(void) { return of_scan_flat_dt(early_init_dt_scan_numa_map, NULL); } +#else +static inline int __init arm64_dt_numa_init(void) { return -EINVAL; } +#endif
Please try to avoid adding #ifdef sequences like this and instead use something like
if (!IS_ENABLED(CONFIG_ARM64_DT_NUMA)) return -ENXIO;
or change the caller to only call this function like this.
That's good to me too, I will update this patch.
Thanks Hanjun
Add function needed for CPU to node mapping, and enable ACPI based NUMA for ARM64 in Kconfig
Signed-off-by: Hanjun Guo hanjun.guo@linaro.org --- drivers/acpi/Kconfig | 2 +- drivers/acpi/numa.c | 37 +++++++++++++++++++++++++++++++++++++ include/linux/acpi.h | 15 +++++++++++++++ 3 files changed, 53 insertions(+), 1 deletion(-)
diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index 812596d..c1d4796 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -222,7 +222,7 @@ config ACPI_THERMAL config ACPI_NUMA bool "NUMA support" depends on NUMA - depends on (X86 || IA64) + depends on (X86 || IA64 || ARM64) default y if IA64_GENERIC || IA64_SGI_SN2
config ACPI_CUSTOM_DSDT_FILE diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c index b217333..76ec90a 100644 --- a/drivers/acpi/numa.c +++ b/drivers/acpi/numa.c @@ -137,6 +137,20 @@ acpi_table_print_srat_entry(struct acpi_subtable_header *header) } break;
+ case ACPI_SRAT_TYPE_GICC_AFFINITY: + { + struct acpi_srat_gicc_affinity *p = + (struct acpi_srat_gicc_affinity *)header; + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "SRAT Processor (acpi id[0x%04x]) in" + " proximity domain %d %s\n", + p->acpi_id, + p->proximity_domain, + (p->flags & ACPI_SRAT_GICC_ENABLED) ? + "enabled" : "disabled")); + } + break; + default: pr_warn("Found unsupported SRAT entry (type = 0x%x)\n", header->type); @@ -227,6 +241,24 @@ acpi_parse_processor_affinity(struct acpi_subtable_header *header, return 0; }
+static int __init +acpi_parse_gicc_affinity(struct acpi_subtable_header *header, + const unsigned long end) +{ + struct acpi_srat_gicc_affinity *processor_affinity; + + processor_affinity = (struct acpi_srat_gicc_affinity *)header; + if (!processor_affinity) + return -EINVAL; + + acpi_table_print_srat_entry(header); + + /* let architecture-dependent part to do it */ + acpi_numa_gicc_affinity_init(processor_affinity); + + return 0; +} + static int __initdata parsed_numa_memblks;
static int __init @@ -274,6 +306,9 @@ int __init acpi_numa_init(void) { int cnt = 0;
+ if (acpi_disabled) + return -EINVAL; + /* * Should not limit number with cpu num that is from NR_CPUS or nr_cpus= * SRAT cpu entries could have different order with that in MADT. @@ -286,6 +321,8 @@ int __init acpi_numa_init(void) acpi_parse_x2apic_affinity, 0); acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY, acpi_parse_processor_affinity, 0); + acpi_table_parse_srat(ACPI_SRAT_TYPE_GICC_AFFINITY, + acpi_parse_gicc_affinity, 0); cnt = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY, acpi_parse_memory_affinity, NR_NODE_MEMBLKS); diff --git a/include/linux/acpi.h b/include/linux/acpi.h index de81de3..da2ec1e 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -141,8 +141,23 @@ void acpi_table_print_madt_entry (struct acpi_subtable_header *madt);
/* the following four functions are architecture-dependent */ void acpi_numa_slit_init (struct acpi_table_slit *slit); + +#if defined(CONFIG_X86) || defined(CONFIG_IA64) void acpi_numa_processor_affinity_init (struct acpi_srat_cpu_affinity *pa); +#else +static inline void +acpi_numa_processor_affinity_init (struct acpi_srat_cpu_affinity *pa) { } +#endif + void acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa); + +#ifdef CONFIG_ARM64 +void acpi_numa_gicc_affinity_init(struct acpi_srat_gicc_affinity *pa); +#else +static inline void +acpi_numa_gicc_affinity_init(struct acpi_srat_gicc_affinity *pa) { } +#endif + int acpi_numa_memory_affinity_init (struct acpi_srat_mem_affinity *ma); void acpi_numa_arch_fixup(void);