Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

s390/numa: add core infrastructure

Enable core NUMA support for s390 and add one simple default mode "plain"
that creates one single NUMA node.

This patch contains several changes from Michael Holzheu.

Signed-off-by: Philipp Hachtmann <phacht@linux.vnet.ibm.com>
Signed-off-by: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>

authored by

Philipp Hachtmann and committed by
Martin Schwidefsky
3a368f74 199071f1

+375 -26
+1
arch/s390/Kbuild
··· 6 6 obj-$(CONFIG_APPLDATA_BASE) += appldata/ 7 7 obj-y += net/ 8 8 obj-$(CONFIG_PCI) += pci/ 9 + obj-$(CONFIG_NUMA) += numa/
+37
arch/s390/Kconfig
··· 153 153 select TTY 154 154 select VIRT_CPU_ACCOUNTING 155 155 select VIRT_TO_BUS 156 + select ARCH_SUPPORTS_NUMA_BALANCING 157 + select ARCH_WANTS_PROT_NUMA_PROT_NONE 158 + select HAVE_ARCH_EARLY_PFN_TO_NID 159 + 156 160 157 161 config SCHED_OMIT_FRAME_POINTER 158 162 def_bool y ··· 389 385 390 386 config SCHED_SMT 391 387 def_bool n 388 + 389 + # Some NUMA nodes have memory ranges that span 390 + # other nodes. Even though a pfn is valid and 391 + # between a node's start and end pfns, it may not 392 + # reside on that node. See memmap_init_zone() 393 + # for details. <- They meant memory holes! 394 + config NODES_SPAN_OTHER_NODES 395 + def_bool NUMA 396 + 397 + config NUMA 398 + bool "NUMA support" 399 + depends on SMP && 64BIT && SCHED_TOPOLOGY 400 + default n 401 + help 402 + Enable NUMA support 403 + 404 + This option adds NUMA support to the kernel. 405 + 406 + An operation mode can be selected by appending 407 + numa=<method> to the kernel command line. 408 + 409 + The default behaviour is identical to appending numa=plain to 410 + the command line. This will create just one node with all 411 + available memory and all CPUs in it. 412 + 413 + config NODES_SHIFT 414 + int "Maximum NUMA nodes (as a power of 2)" 415 + range 1 10 416 + depends on NUMA 417 + default "4" 418 + help 419 + Specify the maximum number of NUMA nodes available on the target 420 + system. Increases memory reserved to accommodate various tables. 392 421 393 422 config SCHED_MC 394 423 def_bool n
+16
arch/s390/include/asm/mmzone.h
/*
 * NUMA support for s390
 *
 * Copyright IBM Corp. 2015
 */

#ifndef _ASM_S390_MMZONE_H
#define _ASM_S390_MMZONE_H

#ifdef CONFIG_NUMA

/* Table of per-node data pointers, indexed by node ID. */
extern struct pglist_data *node_data[];

/*
 * Access the node_data[] slot of node @nid. The expansion is a plain
 * array element, so it can be read as well as assigned to.
 */
#define NODE_DATA(nid)	(node_data[(nid)])

#endif /* CONFIG_NUMA */
#endif /* _ASM_S390_MMZONE_H */
+31
arch/s390/include/asm/numa.h
/*
 * NUMA support for s390
 *
 * Declare the NUMA core code structures and functions.
 *
 * Copyright IBM Corp. 2015
 */

#ifndef _ASM_S390_NUMA_H
#define _ASM_S390_NUMA_H

#ifdef CONFIG_NUMA

#include <linux/numa.h>
#include <linux/cpumask.h>

/* One CPU mask per node, indexed by node ID. */
extern cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];

/* Non-zero once the "numa_debug" kernel parameter has been seen. */
extern int numa_debug_enabled;

/* Earliest NUMA initialization; announces the mode and sets up memory. */
void numa_setup(void);

/* Hook for the CPU topology code; forwarded to the active NUMA mode. */
void numa_update_cpu_topology(void);

/* Map a page frame number to the ID of the node it is assigned to. */
int numa_pfn_to_nid(unsigned long pfn);

/* Distance between nodes @a and @b as reported by the active mode. */
int __node_distance(int a, int b);

#else /* !CONFIG_NUMA */

/* Without NUMA support the setup and topology hooks are empty stubs. */
static inline void numa_setup(void)
{
}

static inline void numa_update_cpu_topology(void)
{
}

#endif /* CONFIG_NUMA */
#endif /* _ASM_S390_NUMA_H */
+16
arch/s390/include/asm/pci.h
··· 192 192 void zpci_debug_exit_device(struct zpci_dev *); 193 193 void zpci_debug_info(struct zpci_dev *, struct seq_file *); 194 194 195 + #ifdef CONFIG_NUMA 196 + 197 + /* Returns the node based on PCI bus */ 198 + static inline int __pcibus_to_node(const struct pci_bus *bus) 199 + { 200 + return NUMA_NO_NODE; 201 + } 202 + 203 + static inline const struct cpumask * 204 + cpumask_of_pcibus(const struct pci_bus *bus) 205 + { 206 + return cpu_online_mask; 207 + } 208 + 209 + #endif /* CONFIG_NUMA */ 210 + 195 211 #endif
+39
arch/s390/include/asm/topology.h
··· 2 2 #define _ASM_S390_TOPOLOGY_H 3 3 4 4 #include <linux/cpumask.h> 5 + #include <asm/numa.h> 5 6 6 7 struct sysinfo_15_1_x; 7 8 struct cpu; ··· 14 13 unsigned short core_id; 15 14 unsigned short socket_id; 16 15 unsigned short book_id; 16 + unsigned short node_id; 17 17 cpumask_t thread_mask; 18 18 cpumask_t core_mask; 19 19 cpumask_t book_mask; ··· 53 51 #define POLARIZATION_VL (1) 54 52 #define POLARIZATION_VM (2) 55 53 #define POLARIZATION_VH (3) 54 + 55 + #define SD_BOOK_INIT SD_CPU_INIT 56 + 57 + #ifdef CONFIG_NUMA 58 + 59 + #define cpu_to_node cpu_to_node 60 + static inline int cpu_to_node(int cpu) 61 + { 62 + return per_cpu(cpu_topology, cpu).node_id; 63 + } 64 + 65 + /* Returns a pointer to the cpumask of CPUs on node 'node'. */ 66 + #define cpumask_of_node cpumask_of_node 67 + static inline const struct cpumask *cpumask_of_node(int node) 68 + { 69 + return node_to_cpumask_map[node]; 70 + } 71 + 72 + /* 73 + * Returns the number of the node containing node 'node'. This 74 + * architecture is flat, so it is a pretty simple function! 75 + */ 76 + #define parent_node(node) (node) 77 + 78 + #define pcibus_to_node(bus) __pcibus_to_node(bus) 79 + 80 + #define node_distance(a, b) __node_distance(a, b) 81 + 82 + #else /* !CONFIG_NUMA */ 83 + 84 + #define numa_node_id numa_node_id 85 + static inline int numa_node_id(void) 86 + { 87 + return 0; 88 + } 89 + 90 + #endif /* CONFIG_NUMA */ 56 91 57 92 #include <asm-generic/topology.h> 58 93
+6 -6
arch/s390/include/asm/unistd.h
··· 11 11 12 12 #define __IGNORE_time 13 13 14 - /* Ignore NUMA system calls. Not wired up on s390. */ 15 - #define __IGNORE_mbind 16 - #define __IGNORE_get_mempolicy 17 - #define __IGNORE_set_mempolicy 18 - #define __IGNORE_migrate_pages 19 - #define __IGNORE_move_pages 14 + /* NUMA system calls */ 15 + #define __ARCH_WANT_mbind 16 + #define __ARCH_WANT_get_mempolicy 17 + #define __ARCH_WANT_set_mempolicy 18 + #define __ARCH_WANT_migrate_pages 19 + #define __ARCH_WANT_move_pages 20 20 21 21 /* Ignore system calls that are also reachable via sys_socket */ 22 22 #define __IGNORE_recvmmsg
+5 -5
arch/s390/include/uapi/asm/unistd.h
··· 204 204 #define __NR_statfs64 265 205 205 #define __NR_fstatfs64 266 206 206 #define __NR_remap_file_pages 267 207 - /* Number 268 is reserved for new sys_mbind */ 208 - /* Number 269 is reserved for new sys_get_mempolicy */ 209 - /* Number 270 is reserved for new sys_set_mempolicy */ 207 + #define __NR_mbind 268 208 + #define __NR_get_mempolicy 269 209 + #define __NR_set_mempolicy 270 210 210 #define __NR_mq_open 271 211 211 #define __NR_mq_unlink 272 212 212 #define __NR_mq_timedsend 273 ··· 223 223 #define __NR_inotify_init 284 224 224 #define __NR_inotify_add_watch 285 225 225 #define __NR_inotify_rm_watch 286 226 - /* Number 287 is reserved for new sys_migrate_pages */ 226 + #define __NR_migrate_pages 287 227 227 #define __NR_openat 288 228 228 #define __NR_mkdirat 289 229 229 #define __NR_mknodat 290 ··· 245 245 #define __NR_sync_file_range 307 246 246 #define __NR_tee 308 247 247 #define __NR_vmsplice 309 248 - /* Number 310 is reserved for new sys_move_pages */ 248 + #define __NR_move_pages 310 249 249 #define __NR_getcpu 311 250 250 #define __NR_epoll_pwait 312 251 251 #define __NR_utimes 313
+2
arch/s390/kernel/setup.c
··· 62 62 #include <asm/os_info.h> 63 63 #include <asm/sclp.h> 64 64 #include <asm/sysinfo.h> 65 + #include <asm/numa.h> 65 66 #include "entry.h" 66 67 67 68 /* ··· 880 879 setup_lowcore(); 881 880 smp_fill_possible_mask(); 882 881 cpu_init(); 882 + numa_setup(); 883 883 884 884 /* 885 885 * Setup capabilities (ELF_HWCAP & ELF_PLATFORM).
+5 -5
arch/s390/kernel/syscalls.S
··· 276 276 SYSCALL(sys_statfs64,compat_sys_statfs64) 277 277 SYSCALL(sys_fstatfs64,compat_sys_fstatfs64) 278 278 SYSCALL(sys_remap_file_pages,compat_sys_remap_file_pages) 279 - NI_SYSCALL /* 268 sys_mbind */ 280 - NI_SYSCALL /* 269 sys_get_mempolicy */ 281 - NI_SYSCALL /* 270 sys_set_mempolicy */ 279 + SYSCALL(sys_mbind,compat_sys_mbind) 280 + SYSCALL(sys_get_mempolicy,compat_sys_get_mempolicy) 281 + SYSCALL(sys_set_mempolicy,compat_sys_set_mempolicy) 282 282 SYSCALL(sys_mq_open,compat_sys_mq_open) 283 283 SYSCALL(sys_mq_unlink,compat_sys_mq_unlink) 284 284 SYSCALL(sys_mq_timedsend,compat_sys_mq_timedsend) ··· 295 295 SYSCALL(sys_inotify_init,sys_inotify_init) 296 296 SYSCALL(sys_inotify_add_watch,compat_sys_inotify_add_watch) /* 285 */ 297 297 SYSCALL(sys_inotify_rm_watch,compat_sys_inotify_rm_watch) 298 - NI_SYSCALL /* 287 sys_migrate_pages */ 298 + SYSCALL(sys_migrate_pages,compat_sys_migrate_pages) 299 299 SYSCALL(sys_openat,compat_sys_openat) 300 300 SYSCALL(sys_mkdirat,compat_sys_mkdirat) 301 301 SYSCALL(sys_mknodat,compat_sys_mknodat) /* 290 */ ··· 318 318 SYSCALL(sys_sync_file_range,compat_sys_s390_sync_file_range) 319 319 SYSCALL(sys_tee,compat_sys_tee) 320 320 SYSCALL(sys_vmsplice,compat_sys_vmsplice) 321 - NI_SYSCALL /* 310 sys_move_pages */ 321 + SYSCALL(sys_move_pages,compat_sys_move_pages) 322 322 SYSCALL(sys_getcpu,compat_sys_getcpu) 323 323 SYSCALL(sys_epoll_pwait,compat_sys_epoll_pwait) 324 324 SYSCALL(sys_utimes,compat_sys_utimes)
+12 -9
arch/s390/kernel/topology.c
··· 18 18 #include <linux/cpu.h> 19 19 #include <linux/smp.h> 20 20 #include <linux/mm.h> 21 + #include <linux/nodemask.h> 22 + #include <linux/node.h> 21 23 #include <asm/sysinfo.h> 24 + #include <asm/numa.h> 22 25 23 26 #define PTF_HORIZONTAL (0UL) 24 27 #define PTF_VERTICAL (1UL) ··· 263 260 } 264 261 } 265 262 spin_unlock_irqrestore(&topology_lock, flags); 263 + numa_update_cpu_topology(); 266 264 } 267 265 268 266 void store_topology(struct sysinfo_15_1_x *info) ··· 278 274 { 279 275 struct sysinfo_15_1_x *info = tl_info; 280 276 struct device *dev; 281 - int cpu; 277 + int cpu, rc = 0; 282 278 283 - if (!MACHINE_HAS_TOPOLOGY) { 284 - update_cpu_masks(); 285 - topology_update_polarization_simple(); 286 - return 0; 279 + if (MACHINE_HAS_TOPOLOGY) { 280 + rc = 1; 281 + store_topology(info); 282 + tl_to_masks(info); 287 283 } 288 - store_topology(info); 289 - tl_to_masks(info); 290 284 update_cpu_masks(); 285 + if (!MACHINE_HAS_TOPOLOGY) 286 + topology_update_polarization_simple(); 291 287 for_each_online_cpu(cpu) { 292 288 dev = get_cpu_device(cpu); 293 289 kobject_uevent(&dev->kobj, KOBJ_CHANGE); 294 290 } 295 - return 1; 291 + return rc; 296 292 } 297 293 298 294 static void topology_work_fn(struct work_struct *work) ··· 454 450 { cpu_thread_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, 455 451 { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, 456 452 { cpu_book_mask, SD_INIT_NAME(BOOK) }, 457 - { cpu_cpu_mask, SD_INIT_NAME(DIE) }, 458 453 { NULL, }, 459 454 }; 460 455
+1 -1
arch/s390/mm/init.c
··· 139 139 cpumask_set_cpu(0, mm_cpumask(&init_mm)); 140 140 atomic_set(&init_mm.context.attach_count, 1); 141 141 142 - max_mapnr = max_low_pfn; 142 + set_max_mapnr(max_low_pfn); 143 143 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); 144 144 145 145 /* Setup guest page hinting */
+1
arch/s390/numa/Makefile
# Build the s390 NUMA core code (numa.c) into the kernel image.
obj-y += numa.o
+180
arch/s390/numa/numa.c
··· 1 + /* 2 + * NUMA support for s390 3 + * 4 + * Implement NUMA core code. 5 + * 6 + * Copyright IBM Corp. 2015 7 + */ 8 + 9 + #define KMSG_COMPONENT "numa" 10 + #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt 11 + 12 + #include <linux/kernel.h> 13 + #include <linux/mmzone.h> 14 + #include <linux/cpumask.h> 15 + #include <linux/bootmem.h> 16 + #include <linux/memblock.h> 17 + #include <linux/slab.h> 18 + #include <linux/node.h> 19 + 20 + #include <asm/numa.h> 21 + #include "numa_mode.h" 22 + 23 + pg_data_t *node_data[MAX_NUMNODES]; 24 + EXPORT_SYMBOL(node_data); 25 + 26 + cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; 27 + EXPORT_SYMBOL(node_to_cpumask_map); 28 + 29 + const struct numa_mode numa_mode_plain = { 30 + .name = "plain", 31 + }; 32 + 33 + static const struct numa_mode *mode = &numa_mode_plain; 34 + 35 + int numa_pfn_to_nid(unsigned long pfn) 36 + { 37 + return mode->__pfn_to_nid ? mode->__pfn_to_nid(pfn) : 0; 38 + } 39 + 40 + void numa_update_cpu_topology(void) 41 + { 42 + if (mode->update_cpu_topology) 43 + mode->update_cpu_topology(); 44 + } 45 + 46 + int __node_distance(int a, int b) 47 + { 48 + return mode->distance ? mode->distance(a, b) : 0; 49 + } 50 + 51 + int numa_debug_enabled; 52 + 53 + /* 54 + * alloc_node_data() - Allocate node data 55 + */ 56 + static __init pg_data_t *alloc_node_data(void) 57 + { 58 + pg_data_t *res; 59 + 60 + res = (pg_data_t *) memblock_alloc(sizeof(pg_data_t), 1); 61 + if (!res) 62 + panic("Could not allocate memory for node data!\n"); 63 + memset(res, 0, sizeof(pg_data_t)); 64 + return res; 65 + } 66 + 67 + /* 68 + * numa_setup_memory() - Assign bootmem to nodes 69 + * 70 + * The memory is first added to memblock without any respect to nodes. 71 + * This is fixed before remaining memblock memory is handed over to the 72 + * buddy allocator. 
73 + * An important side effect is that large bootmem allocations might easily 74 + * cross node boundaries, which can be needed for large allocations with 75 + * smaller memory stripes in each node (i.e. when using NUMA emulation). 76 + * 77 + * Memory defines nodes: 78 + * Therefore this routine also sets the nodes online with memory. 79 + */ 80 + static void __init numa_setup_memory(void) 81 + { 82 + unsigned long cur_base, align, end_of_dram; 83 + int nid = 0; 84 + 85 + end_of_dram = memblock_end_of_DRAM(); 86 + align = mode->align ? mode->align() : ULONG_MAX; 87 + 88 + /* 89 + * Step through all available memory and assign it to the nodes 90 + * indicated by the mode implementation. 91 + * All nodes which are seen here will be set online. 92 + */ 93 + cur_base = 0; 94 + do { 95 + nid = numa_pfn_to_nid(PFN_DOWN(cur_base)); 96 + node_set_online(nid); 97 + memblock_set_node(cur_base, align, &memblock.memory, nid); 98 + cur_base += align; 99 + } while (cur_base < end_of_dram); 100 + 101 + /* Allocate and fill out node_data */ 102 + for (nid = 0; nid < MAX_NUMNODES; nid++) 103 + NODE_DATA(nid) = alloc_node_data(); 104 + 105 + for_each_online_node(nid) { 106 + unsigned long start_pfn, end_pfn; 107 + unsigned long t_start, t_end; 108 + int i; 109 + 110 + start_pfn = ULONG_MAX; 111 + end_pfn = 0; 112 + for_each_mem_pfn_range(i, nid, &t_start, &t_end, NULL) { 113 + if (t_start < start_pfn) 114 + start_pfn = t_start; 115 + if (t_end > end_pfn) 116 + end_pfn = t_end; 117 + } 118 + NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn; 119 + NODE_DATA(nid)->node_id = nid; 120 + } 121 + } 122 + 123 + /* 124 + * numa_setup() - Earliest initialization 125 + * 126 + * Assign the mode and call the mode's setup routine. 
127 + */ 128 + void __init numa_setup(void) 129 + { 130 + pr_info("NUMA mode: %s\n", mode->name); 131 + if (mode->setup) 132 + mode->setup(); 133 + numa_setup_memory(); 134 + memblock_dump_all(); 135 + } 136 + 137 + 138 + /* 139 + * numa_init_early() - Initialization initcall 140 + * 141 + * This runs when only one CPU is online and before the first 142 + * topology update is called for by the scheduler. 143 + */ 144 + static int __init numa_init_early(void) 145 + { 146 + /* Attach all possible CPUs to node 0 for now. */ 147 + cpumask_copy(node_to_cpumask_map[0], cpu_possible_mask); 148 + return 0; 149 + } 150 + early_initcall(numa_init_early); 151 + 152 + /* 153 + * numa_init_late() - Initialization initcall 154 + * 155 + * Register NUMA nodes. 156 + */ 157 + static int __init numa_init_late(void) 158 + { 159 + int nid; 160 + 161 + for_each_online_node(nid) 162 + register_one_node(nid); 163 + return 0; 164 + } 165 + device_initcall(numa_init_late); 166 + 167 + static int __init parse_debug(char *parm) 168 + { 169 + numa_debug_enabled = 1; 170 + return 0; 171 + } 172 + early_param("numa_debug", parse_debug); 173 + 174 + static int __init parse_numa(char *parm) 175 + { 176 + if (strcmp(parm, numa_mode_plain.name) == 0) 177 + mode = &numa_mode_plain; 178 + return 0; 179 + } 180 + early_param("numa", parse_numa);
+23
arch/s390/numa/numa_mode.h
/*
 * NUMA support for s390
 *
 * Define declarations used for communication between NUMA mode
 * implementations and NUMA core functionality.
 *
 * Copyright IBM Corp. 2015
 */
#ifndef __S390_NUMA_MODE_H
#define __S390_NUMA_MODE_H

/*
 * Description of one NUMA operation mode. Every callback is optional:
 * a mode may leave any of them unset (NULL).
 */
struct numa_mode {
	const char *name;			/* Name of mode */
	void (*setup)(void);			/* Initialize mode */
	void (*update_cpu_topology)(void);	/* Called by topology code */
	int (*__pfn_to_nid)(unsigned long pfn);	/* PFN to node ID */
	unsigned long (*align)(void);		/* Minimum node alignment */
	int (*distance)(int a, int b);		/* Distance between two nodes */
};

/* The default mode "plain". */
extern const struct numa_mode numa_mode_plain;

#endif /* __S390_NUMA_MODE_H */