Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

x86, NUMA: Move NUMA init logic from numa_64.c to numa.c

Move the generic 64bit NUMA init machinery from numa_64.c to numa.c.

* node_data[], numa_meminfo and numa_distance
* numa_add_memblk[_to](), numa_remove_memblk[_from]()
* numa_set_distance() and friends
* numa_init() and all the numa_meminfo handling helpers called from it
* dummy_numa_init()
* memory_add_physaddr_to_nid()

A new function x86_numa_init() is added and the content of
numa_64.c::initmem_init() is moved into it. initmem_init() now simply
calls x86_numa_init().

Constants and numa_off declaration are moved from numa_{32|64}.h to
numa.h.

This is code reorganization and doesn't involve any functional change.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>

Tejun Heo a4106eae 299a180a

+539 -526
+16
arch/x86/include/asm/numa.h
··· 9 9 #ifdef CONFIG_NUMA 10 10 11 11 #define NR_NODE_MEMBLKS (MAX_NUMNODES*2) 12 + #define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT)) 13 + 14 + /* 15 + * Too small node sizes may confuse the VM badly. Usually they 16 + * result from BIOS bugs. So dont recognize nodes as standalone 17 + * NUMA entities that have less than this amount of RAM listed: 18 + */ 19 + #define NODE_MIN_SIZE (4*1024*1024) 20 + 21 + extern int numa_off; 12 22 13 23 /* 14 24 * __apicid_to_node[] stores the raw mapping between physical apicid and ··· 77 67 #ifdef CONFIG_DEBUG_PER_CPU_MAPS 78 68 void debug_cpumask_set_cpu(int cpu, int node, bool enable); 79 69 #endif 70 + 71 + #ifdef CONFIG_NUMA_EMU 72 + #define FAKE_NODE_MIN_SIZE ((u64)32 << 20) 73 + #define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) 74 + void numa_emu_cmdline(char *); 75 + #endif /* CONFIG_NUMA_EMU */ 80 76 81 77 #endif /* _ASM_X86_NUMA_H */
-2
arch/x86/include/asm/numa_32.h
··· 1 1 #ifndef _ASM_X86_NUMA_32_H 2 2 #define _ASM_X86_NUMA_32_H 3 3 4 - extern int numa_off; 5 - 6 4 #ifdef CONFIG_HIGHMEM 7 5 extern void set_highmem_pages_init(void); 8 6 #else
-19
arch/x86/include/asm/numa_64.h
··· 1 1 #ifndef _ASM_X86_NUMA_64_H 2 2 #define _ASM_X86_NUMA_64_H 3 3 4 - #define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT)) 5 - 6 - extern int numa_off; 7 - 8 4 extern unsigned long numa_free_all_bootmem(void); 9 - 10 - #ifdef CONFIG_NUMA 11 - /* 12 - * Too small node sizes may confuse the VM badly. Usually they 13 - * result from BIOS bugs. So dont recognize nodes as standalone 14 - * NUMA entities that have less than this amount of RAM listed: 15 - */ 16 - #define NODE_MIN_SIZE (4*1024*1024) 17 - 18 - #ifdef CONFIG_NUMA_EMU 19 - #define FAKE_NODE_MIN_SIZE ((u64)32 << 20) 20 - #define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) 21 - void numa_emu_cmdline(char *); 22 - #endif /* CONFIG_NUMA_EMU */ 23 - #endif 24 5 25 6 #endif /* _ASM_X86_NUMA_64_H */
+520 -3
arch/x86/mm/numa.c
··· 1 1 /* Common code for 32 and 64-bit NUMA */ 2 - #include <linux/topology.h> 3 - #include <linux/module.h> 2 + #include <linux/kernel.h> 3 + #include <linux/mm.h> 4 + #include <linux/string.h> 5 + #include <linux/init.h> 4 6 #include <linux/bootmem.h> 5 - #include <asm/numa.h> 7 + #include <linux/memblock.h> 8 + #include <linux/mmzone.h> 9 + #include <linux/ctype.h> 10 + #include <linux/module.h> 11 + #include <linux/nodemask.h> 12 + #include <linux/sched.h> 13 + #include <linux/topology.h> 14 + 15 + #include <asm/e820.h> 16 + #include <asm/proto.h> 17 + #include <asm/dma.h> 6 18 #include <asm/acpi.h> 19 + #include <asm/amd_nb.h> 20 + 21 + #include "numa_internal.h" 7 22 8 23 int __initdata numa_off; 9 24 nodemask_t numa_nodes_parsed __initdata; 25 + 26 + #ifdef CONFIG_X86_64 27 + struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 28 + EXPORT_SYMBOL(node_data); 29 + 30 + static struct numa_meminfo numa_meminfo 31 + #ifndef CONFIG_MEMORY_HOTPLUG 32 + __initdata 33 + #endif 34 + ; 35 + 36 + static int numa_distance_cnt; 37 + static u8 *numa_distance; 38 + #endif 10 39 11 40 static __init int numa_setup(char *opt) 12 41 { ··· 134 105 pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids); 135 106 } 136 107 108 + #ifdef CONFIG_X86_64 109 + static int __init numa_add_memblk_to(int nid, u64 start, u64 end, 110 + struct numa_meminfo *mi) 111 + { 112 + /* ignore zero length blks */ 113 + if (start == end) 114 + return 0; 115 + 116 + /* whine about and ignore invalid blks */ 117 + if (start > end || nid < 0 || nid >= MAX_NUMNODES) { 118 + pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n", 119 + nid, start, end); 120 + return 0; 121 + } 122 + 123 + if (mi->nr_blks >= NR_NODE_MEMBLKS) { 124 + pr_err("NUMA: too many memblk ranges\n"); 125 + return -EINVAL; 126 + } 127 + 128 + mi->blk[mi->nr_blks].start = start; 129 + mi->blk[mi->nr_blks].end = end; 130 + mi->blk[mi->nr_blks].nid = nid; 131 + mi->nr_blks++; 132 + return 0; 133 + } 134 + 135 + /** 
136 + * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo 137 + * @idx: Index of memblk to remove 138 + * @mi: numa_meminfo to remove memblk from 139 + * 140 + * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and 141 + * decrementing @mi->nr_blks. 142 + */ 143 + void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi) 144 + { 145 + mi->nr_blks--; 146 + memmove(&mi->blk[idx], &mi->blk[idx + 1], 147 + (mi->nr_blks - idx) * sizeof(mi->blk[0])); 148 + } 149 + 150 + /** 151 + * numa_add_memblk - Add one numa_memblk to numa_meminfo 152 + * @nid: NUMA node ID of the new memblk 153 + * @start: Start address of the new memblk 154 + * @end: End address of the new memblk 155 + * 156 + * Add a new memblk to the default numa_meminfo. 157 + * 158 + * RETURNS: 159 + * 0 on success, -errno on failure. 160 + */ 161 + int __init numa_add_memblk(int nid, u64 start, u64 end) 162 + { 163 + return numa_add_memblk_to(nid, start, end, &numa_meminfo); 164 + } 165 + 166 + /* Initialize bootmem allocator for a node */ 167 + static void __init 168 + setup_node_bootmem(int nid, unsigned long start, unsigned long end) 169 + { 170 + const u64 nd_low = (u64)MAX_DMA_PFN << PAGE_SHIFT; 171 + const u64 nd_high = (u64)max_pfn_mapped << PAGE_SHIFT; 172 + const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE); 173 + unsigned long nd_pa; 174 + int tnid; 175 + 176 + /* 177 + * Don't confuse VM with a node that doesn't have the 178 + * minimum amount of memory: 179 + */ 180 + if (end && (end - start) < NODE_MIN_SIZE) 181 + return; 182 + 183 + start = roundup(start, ZONE_ALIGN); 184 + 185 + printk(KERN_INFO "Initmem setup node %d %016lx-%016lx\n", 186 + nid, start, end); 187 + 188 + /* 189 + * Try to allocate node data on local node and then fall back to 190 + * all nodes. Never allocate in DMA zone. 
191 + */ 192 + nd_pa = memblock_x86_find_in_range_node(nid, nd_low, nd_high, 193 + nd_size, SMP_CACHE_BYTES); 194 + if (nd_pa == MEMBLOCK_ERROR) 195 + nd_pa = memblock_find_in_range(nd_low, nd_high, 196 + nd_size, SMP_CACHE_BYTES); 197 + if (nd_pa == MEMBLOCK_ERROR) { 198 + pr_err("Cannot find %lu bytes in node %d\n", nd_size, nid); 199 + return; 200 + } 201 + memblock_x86_reserve_range(nd_pa, nd_pa + nd_size, "NODE_DATA"); 202 + 203 + /* report and initialize */ 204 + printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", 205 + nd_pa, nd_pa + nd_size - 1); 206 + tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); 207 + if (tnid != nid) 208 + printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid); 209 + 210 + node_data[nid] = __va(nd_pa); 211 + memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); 212 + NODE_DATA(nid)->node_id = nid; 213 + NODE_DATA(nid)->node_start_pfn = start >> PAGE_SHIFT; 214 + NODE_DATA(nid)->node_spanned_pages = (end - start) >> PAGE_SHIFT; 215 + 216 + node_set_online(nid); 217 + } 218 + 219 + /** 220 + * numa_cleanup_meminfo - Cleanup a numa_meminfo 221 + * @mi: numa_meminfo to clean up 222 + * 223 + * Sanitize @mi by merging and removing unncessary memblks. Also check for 224 + * conflicts and clear unused memblks. 225 + * 226 + * RETURNS: 227 + * 0 on success, -errno on failure. 
228 + */ 229 + int __init numa_cleanup_meminfo(struct numa_meminfo *mi) 230 + { 231 + const u64 low = 0; 232 + const u64 high = (u64)max_pfn << PAGE_SHIFT; 233 + int i, j, k; 234 + 235 + for (i = 0; i < mi->nr_blks; i++) { 236 + struct numa_memblk *bi = &mi->blk[i]; 237 + 238 + /* make sure all blocks are inside the limits */ 239 + bi->start = max(bi->start, low); 240 + bi->end = min(bi->end, high); 241 + 242 + /* and there's no empty block */ 243 + if (bi->start >= bi->end) { 244 + numa_remove_memblk_from(i--, mi); 245 + continue; 246 + } 247 + 248 + for (j = i + 1; j < mi->nr_blks; j++) { 249 + struct numa_memblk *bj = &mi->blk[j]; 250 + unsigned long start, end; 251 + 252 + /* 253 + * See whether there are overlapping blocks. Whine 254 + * about but allow overlaps of the same nid. They 255 + * will be merged below. 256 + */ 257 + if (bi->end > bj->start && bi->start < bj->end) { 258 + if (bi->nid != bj->nid) { 259 + pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n", 260 + bi->nid, bi->start, bi->end, 261 + bj->nid, bj->start, bj->end); 262 + return -EINVAL; 263 + } 264 + pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n", 265 + bi->nid, bi->start, bi->end, 266 + bj->start, bj->end); 267 + } 268 + 269 + /* 270 + * Join together blocks on the same node, holes 271 + * between which don't overlap with memory on other 272 + * nodes. 
273 + */ 274 + if (bi->nid != bj->nid) 275 + continue; 276 + start = max(min(bi->start, bj->start), low); 277 + end = min(max(bi->end, bj->end), high); 278 + for (k = 0; k < mi->nr_blks; k++) { 279 + struct numa_memblk *bk = &mi->blk[k]; 280 + 281 + if (bi->nid == bk->nid) 282 + continue; 283 + if (start < bk->end && end > bk->start) 284 + break; 285 + } 286 + if (k < mi->nr_blks) 287 + continue; 288 + printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n", 289 + bi->nid, bi->start, bi->end, bj->start, bj->end, 290 + start, end); 291 + bi->start = start; 292 + bi->end = end; 293 + numa_remove_memblk_from(j--, mi); 294 + } 295 + } 296 + 297 + for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) { 298 + mi->blk[i].start = mi->blk[i].end = 0; 299 + mi->blk[i].nid = NUMA_NO_NODE; 300 + } 301 + 302 + return 0; 303 + } 304 + 305 + /* 306 + * Set nodes, which have memory in @mi, in *@nodemask. 307 + */ 308 + static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask, 309 + const struct numa_meminfo *mi) 310 + { 311 + int i; 312 + 313 + for (i = 0; i < ARRAY_SIZE(mi->blk); i++) 314 + if (mi->blk[i].start != mi->blk[i].end && 315 + mi->blk[i].nid != NUMA_NO_NODE) 316 + node_set(mi->blk[i].nid, *nodemask); 317 + } 318 + 319 + /** 320 + * numa_reset_distance - Reset NUMA distance table 321 + * 322 + * The current table is freed. The next numa_set_distance() call will 323 + * create a new one. 
324 + */ 325 + void __init numa_reset_distance(void) 326 + { 327 + size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]); 328 + 329 + /* numa_distance could be 1LU marking allocation failure, test cnt */ 330 + if (numa_distance_cnt) 331 + memblock_x86_free_range(__pa(numa_distance), 332 + __pa(numa_distance) + size); 333 + numa_distance_cnt = 0; 334 + numa_distance = NULL; /* enable table creation */ 335 + } 336 + 337 + static int __init numa_alloc_distance(void) 338 + { 339 + nodemask_t nodes_parsed; 340 + size_t size; 341 + int i, j, cnt = 0; 342 + u64 phys; 343 + 344 + /* size the new table and allocate it */ 345 + nodes_parsed = numa_nodes_parsed; 346 + numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo); 347 + 348 + for_each_node_mask(i, nodes_parsed) 349 + cnt = i; 350 + cnt++; 351 + size = cnt * cnt * sizeof(numa_distance[0]); 352 + 353 + phys = memblock_find_in_range(0, (u64)max_pfn_mapped << PAGE_SHIFT, 354 + size, PAGE_SIZE); 355 + if (phys == MEMBLOCK_ERROR) { 356 + pr_warning("NUMA: Warning: can't allocate distance table!\n"); 357 + /* don't retry until explicitly reset */ 358 + numa_distance = (void *)1LU; 359 + return -ENOMEM; 360 + } 361 + memblock_x86_reserve_range(phys, phys + size, "NUMA DIST"); 362 + 363 + numa_distance = __va(phys); 364 + numa_distance_cnt = cnt; 365 + 366 + /* fill with the default distances */ 367 + for (i = 0; i < cnt; i++) 368 + for (j = 0; j < cnt; j++) 369 + numa_distance[i * cnt + j] = i == j ? 370 + LOCAL_DISTANCE : REMOTE_DISTANCE; 371 + printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt); 372 + 373 + return 0; 374 + } 375 + 376 + /** 377 + * numa_set_distance - Set NUMA distance from one NUMA to another 378 + * @from: the 'from' node to set distance 379 + * @to: the 'to' node to set distance 380 + * @distance: NUMA distance 381 + * 382 + * Set the distance from node @from to @to to @distance. 
If distance table 383 + * doesn't exist, one which is large enough to accommodate all the currently 384 + * known nodes will be created. 385 + * 386 + * If such table cannot be allocated, a warning is printed and further 387 + * calls are ignored until the distance table is reset with 388 + * numa_reset_distance(). 389 + * 390 + * If @from or @to is higher than the highest known node at the time of 391 + * table creation or @distance doesn't make sense, the call is ignored. 392 + * This is to allow simplification of specific NUMA config implementations. 393 + */ 394 + void __init numa_set_distance(int from, int to, int distance) 395 + { 396 + if (!numa_distance && numa_alloc_distance() < 0) 397 + return; 398 + 399 + if (from >= numa_distance_cnt || to >= numa_distance_cnt) { 400 + printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n", 401 + from, to, distance); 402 + return; 403 + } 404 + 405 + if ((u8)distance != distance || 406 + (from == to && distance != LOCAL_DISTANCE)) { 407 + pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n", 408 + from, to, distance); 409 + return; 410 + } 411 + 412 + numa_distance[from * numa_distance_cnt + to] = distance; 413 + } 414 + 415 + int __node_distance(int from, int to) 416 + { 417 + if (from >= numa_distance_cnt || to >= numa_distance_cnt) 418 + return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE; 419 + return numa_distance[from * numa_distance_cnt + to]; 420 + } 421 + EXPORT_SYMBOL(__node_distance); 422 + 423 + /* 424 + * Sanity check to catch more bad NUMA configurations (they are amazingly 425 + * common). Make sure the nodes cover all memory. 
426 + */ 427 + static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi) 428 + { 429 + unsigned long numaram, e820ram; 430 + int i; 431 + 432 + numaram = 0; 433 + for (i = 0; i < mi->nr_blks; i++) { 434 + unsigned long s = mi->blk[i].start >> PAGE_SHIFT; 435 + unsigned long e = mi->blk[i].end >> PAGE_SHIFT; 436 + numaram += e - s; 437 + numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e); 438 + if ((long)numaram < 0) 439 + numaram = 0; 440 + } 441 + 442 + e820ram = max_pfn - (memblock_x86_hole_size(0, 443 + max_pfn << PAGE_SHIFT) >> PAGE_SHIFT); 444 + /* We seem to lose 3 pages somewhere. Allow 1M of slack. */ 445 + if ((long)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) { 446 + printk(KERN_ERR "NUMA: nodes only cover %luMB of your %luMB e820 RAM. Not used.\n", 447 + (numaram << PAGE_SHIFT) >> 20, 448 + (e820ram << PAGE_SHIFT) >> 20); 449 + return false; 450 + } 451 + return true; 452 + } 453 + 454 + static int __init numa_register_memblks(struct numa_meminfo *mi) 455 + { 456 + int i, nid; 457 + 458 + /* Account for nodes with cpus and no memory */ 459 + node_possible_map = numa_nodes_parsed; 460 + numa_nodemask_from_meminfo(&node_possible_map, mi); 461 + if (WARN_ON(nodes_empty(node_possible_map))) 462 + return -EINVAL; 463 + 464 + for (i = 0; i < mi->nr_blks; i++) 465 + memblock_x86_register_active_regions(mi->blk[i].nid, 466 + mi->blk[i].start >> PAGE_SHIFT, 467 + mi->blk[i].end >> PAGE_SHIFT); 468 + 469 + /* for out of order entries */ 470 + sort_node_map(); 471 + if (!numa_meminfo_cover_memory(mi)) 472 + return -EINVAL; 473 + 474 + /* Finally register nodes. 
*/ 475 + for_each_node_mask(nid, node_possible_map) { 476 + u64 start = (u64)max_pfn << PAGE_SHIFT; 477 + u64 end = 0; 478 + 479 + for (i = 0; i < mi->nr_blks; i++) { 480 + if (nid != mi->blk[i].nid) 481 + continue; 482 + start = min(mi->blk[i].start, start); 483 + end = max(mi->blk[i].end, end); 484 + } 485 + 486 + if (start < end) 487 + setup_node_bootmem(nid, start, end); 488 + } 489 + 490 + return 0; 491 + } 492 + #endif 493 + 137 494 /* 138 495 * There are unfortunately some poorly designed mainboards around that 139 496 * only connect memory to a single CPU. This breaks the 1:1 cpu->node ··· 541 126 rr = first_node(node_online_map); 542 127 } 543 128 } 129 + 130 + #ifdef CONFIG_X86_64 131 + static int __init numa_init(int (*init_func)(void)) 132 + { 133 + int i; 134 + int ret; 135 + 136 + for (i = 0; i < MAX_LOCAL_APIC; i++) 137 + set_apicid_to_node(i, NUMA_NO_NODE); 138 + 139 + nodes_clear(numa_nodes_parsed); 140 + nodes_clear(node_possible_map); 141 + nodes_clear(node_online_map); 142 + memset(&numa_meminfo, 0, sizeof(numa_meminfo)); 143 + remove_all_active_ranges(); 144 + numa_reset_distance(); 145 + 146 + ret = init_func(); 147 + if (ret < 0) 148 + return ret; 149 + ret = numa_cleanup_meminfo(&numa_meminfo); 150 + if (ret < 0) 151 + return ret; 152 + 153 + numa_emulation(&numa_meminfo, numa_distance_cnt); 154 + 155 + ret = numa_register_memblks(&numa_meminfo); 156 + if (ret < 0) 157 + return ret; 158 + 159 + for (i = 0; i < nr_cpu_ids; i++) { 160 + int nid = early_cpu_to_node(i); 161 + 162 + if (nid == NUMA_NO_NODE) 163 + continue; 164 + if (!node_online(nid)) 165 + numa_clear_node(i); 166 + } 167 + numa_init_array(); 168 + return 0; 169 + } 170 + 171 + /** 172 + * dummy_numa_init - Fallback dummy NUMA init 173 + * 174 + * Used if there's no underlying NUMA architecture, NUMA initialization 175 + * fails, or NUMA is disabled on the command line. 176 + * 177 + * Must online at least one node and add memory blocks that cover all 178 + * allowed memory. 
This function must not fail. 179 + */ 180 + static int __init dummy_numa_init(void) 181 + { 182 + printk(KERN_INFO "%s\n", 183 + numa_off ? "NUMA turned off" : "No NUMA configuration found"); 184 + printk(KERN_INFO "Faking a node at %016lx-%016lx\n", 185 + 0LU, max_pfn << PAGE_SHIFT); 186 + 187 + node_set(0, numa_nodes_parsed); 188 + numa_add_memblk(0, 0, (u64)max_pfn << PAGE_SHIFT); 189 + 190 + return 0; 191 + } 192 + 193 + /** 194 + * x86_numa_init - Initialize NUMA 195 + * 196 + * Try each configured NUMA initialization method until one succeeds. The 197 + * last fallback is dummy single node config encomapssing whole memory and 198 + * never fails. 199 + */ 200 + void __init x86_numa_init(void) 201 + { 202 + if (!numa_off) { 203 + #ifdef CONFIG_ACPI_NUMA 204 + if (!numa_init(x86_acpi_numa_init)) 205 + return; 206 + #endif 207 + #ifdef CONFIG_AMD_NUMA 208 + if (!numa_init(amd_numa_init)) 209 + return; 210 + #endif 211 + } 212 + 213 + numa_init(dummy_numa_init); 214 + } 215 + #endif 544 216 545 217 static __init int find_near_online_node(int node) 546 218 { ··· 794 292 EXPORT_SYMBOL(cpumask_of_node); 795 293 796 294 #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ 295 + 296 + #if defined(CONFIG_X86_64) && defined(CONFIG_MEMORY_HOTPLUG) 297 + int memory_add_physaddr_to_nid(u64 start) 298 + { 299 + struct numa_meminfo *mi = &numa_meminfo; 300 + int nid = mi->blk[0].nid; 301 + int i; 302 + 303 + for (i = 0; i < mi->nr_blks; i++) 304 + if (mi->blk[i].start <= start && mi->blk[i].end > start) 305 + nid = mi->blk[i].nid; 306 + return nid; 307 + } 308 + EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); 309 + #endif
+1 -502
arch/x86/mm/numa_64.c
··· 2 2 * Generic VM initialization for x86-64 NUMA setups. 3 3 * Copyright 2002,2003 Andi Kleen, SuSE Labs. 4 4 */ 5 - #include <linux/kernel.h> 6 - #include <linux/mm.h> 7 - #include <linux/string.h> 8 - #include <linux/init.h> 9 5 #include <linux/bootmem.h> 10 - #include <linux/memblock.h> 11 - #include <linux/mmzone.h> 12 - #include <linux/ctype.h> 13 - #include <linux/module.h> 14 - #include <linux/nodemask.h> 15 - #include <linux/sched.h> 16 - #include <linux/acpi.h> 17 - 18 - #include <asm/e820.h> 19 - #include <asm/proto.h> 20 - #include <asm/dma.h> 21 - #include <asm/acpi.h> 22 - #include <asm/amd_nb.h> 23 6 24 7 #include "numa_internal.h" 25 8 26 - struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 27 - EXPORT_SYMBOL(node_data); 28 - 29 - static struct numa_meminfo numa_meminfo 30 - #ifndef CONFIG_MEMORY_HOTPLUG 31 - __initdata 32 - #endif 33 - ; 34 - 35 - static int numa_distance_cnt; 36 - static u8 *numa_distance; 37 - 38 - static int __init numa_add_memblk_to(int nid, u64 start, u64 end, 39 - struct numa_meminfo *mi) 40 - { 41 - /* ignore zero length blks */ 42 - if (start == end) 43 - return 0; 44 - 45 - /* whine about and ignore invalid blks */ 46 - if (start > end || nid < 0 || nid >= MAX_NUMNODES) { 47 - pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n", 48 - nid, start, end); 49 - return 0; 50 - } 51 - 52 - if (mi->nr_blks >= NR_NODE_MEMBLKS) { 53 - pr_err("NUMA: too many memblk ranges\n"); 54 - return -EINVAL; 55 - } 56 - 57 - mi->blk[mi->nr_blks].start = start; 58 - mi->blk[mi->nr_blks].end = end; 59 - mi->blk[mi->nr_blks].nid = nid; 60 - mi->nr_blks++; 61 - return 0; 62 - } 63 - 64 - /** 65 - * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo 66 - * @idx: Index of memblk to remove 67 - * @mi: numa_meminfo to remove memblk from 68 - * 69 - * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and 70 - * decrementing @mi->nr_blks. 
71 - */ 72 - void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi) 73 - { 74 - mi->nr_blks--; 75 - memmove(&mi->blk[idx], &mi->blk[idx + 1], 76 - (mi->nr_blks - idx) * sizeof(mi->blk[0])); 77 - } 78 - 79 - /** 80 - * numa_add_memblk - Add one numa_memblk to numa_meminfo 81 - * @nid: NUMA node ID of the new memblk 82 - * @start: Start address of the new memblk 83 - * @end: End address of the new memblk 84 - * 85 - * Add a new memblk to the default numa_meminfo. 86 - * 87 - * RETURNS: 88 - * 0 on success, -errno on failure. 89 - */ 90 - int __init numa_add_memblk(int nid, u64 start, u64 end) 91 - { 92 - return numa_add_memblk_to(nid, start, end, &numa_meminfo); 93 - } 94 - 95 - /* Initialize bootmem allocator for a node */ 96 - static void __init 97 - setup_node_bootmem(int nid, unsigned long start, unsigned long end) 98 - { 99 - const u64 nd_low = (u64)MAX_DMA_PFN << PAGE_SHIFT; 100 - const u64 nd_high = (u64)max_pfn_mapped << PAGE_SHIFT; 101 - const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE); 102 - unsigned long nd_pa; 103 - int tnid; 104 - 105 - /* 106 - * Don't confuse VM with a node that doesn't have the 107 - * minimum amount of memory: 108 - */ 109 - if (end && (end - start) < NODE_MIN_SIZE) 110 - return; 111 - 112 - start = roundup(start, ZONE_ALIGN); 113 - 114 - printk(KERN_INFO "Initmem setup node %d %016lx-%016lx\n", 115 - nid, start, end); 116 - 117 - /* 118 - * Try to allocate node data on local node and then fall back to 119 - * all nodes. Never allocate in DMA zone. 
120 - */ 121 - nd_pa = memblock_x86_find_in_range_node(nid, nd_low, nd_high, 122 - nd_size, SMP_CACHE_BYTES); 123 - if (nd_pa == MEMBLOCK_ERROR) 124 - nd_pa = memblock_find_in_range(nd_low, nd_high, 125 - nd_size, SMP_CACHE_BYTES); 126 - if (nd_pa == MEMBLOCK_ERROR) { 127 - pr_err("Cannot find %lu bytes in node %d\n", nd_size, nid); 128 - return; 129 - } 130 - memblock_x86_reserve_range(nd_pa, nd_pa + nd_size, "NODE_DATA"); 131 - 132 - /* report and initialize */ 133 - printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", 134 - nd_pa, nd_pa + nd_size - 1); 135 - tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); 136 - if (tnid != nid) 137 - printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid); 138 - 139 - node_data[nid] = __va(nd_pa); 140 - memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); 141 - NODE_DATA(nid)->node_id = nid; 142 - NODE_DATA(nid)->node_start_pfn = start >> PAGE_SHIFT; 143 - NODE_DATA(nid)->node_spanned_pages = (end - start) >> PAGE_SHIFT; 144 - 145 - node_set_online(nid); 146 - } 147 - 148 - /** 149 - * numa_cleanup_meminfo - Cleanup a numa_meminfo 150 - * @mi: numa_meminfo to clean up 151 - * 152 - * Sanitize @mi by merging and removing unncessary memblks. Also check for 153 - * conflicts and clear unused memblks. 154 - * 155 - * RETURNS: 156 - * 0 on success, -errno on failure. 
157 - */ 158 - int __init numa_cleanup_meminfo(struct numa_meminfo *mi) 159 - { 160 - const u64 low = 0; 161 - const u64 high = (u64)max_pfn << PAGE_SHIFT; 162 - int i, j, k; 163 - 164 - for (i = 0; i < mi->nr_blks; i++) { 165 - struct numa_memblk *bi = &mi->blk[i]; 166 - 167 - /* make sure all blocks are inside the limits */ 168 - bi->start = max(bi->start, low); 169 - bi->end = min(bi->end, high); 170 - 171 - /* and there's no empty block */ 172 - if (bi->start >= bi->end) { 173 - numa_remove_memblk_from(i--, mi); 174 - continue; 175 - } 176 - 177 - for (j = i + 1; j < mi->nr_blks; j++) { 178 - struct numa_memblk *bj = &mi->blk[j]; 179 - unsigned long start, end; 180 - 181 - /* 182 - * See whether there are overlapping blocks. Whine 183 - * about but allow overlaps of the same nid. They 184 - * will be merged below. 185 - */ 186 - if (bi->end > bj->start && bi->start < bj->end) { 187 - if (bi->nid != bj->nid) { 188 - pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n", 189 - bi->nid, bi->start, bi->end, 190 - bj->nid, bj->start, bj->end); 191 - return -EINVAL; 192 - } 193 - pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n", 194 - bi->nid, bi->start, bi->end, 195 - bj->start, bj->end); 196 - } 197 - 198 - /* 199 - * Join together blocks on the same node, holes 200 - * between which don't overlap with memory on other 201 - * nodes. 
202 - */ 203 - if (bi->nid != bj->nid) 204 - continue; 205 - start = max(min(bi->start, bj->start), low); 206 - end = min(max(bi->end, bj->end), high); 207 - for (k = 0; k < mi->nr_blks; k++) { 208 - struct numa_memblk *bk = &mi->blk[k]; 209 - 210 - if (bi->nid == bk->nid) 211 - continue; 212 - if (start < bk->end && end > bk->start) 213 - break; 214 - } 215 - if (k < mi->nr_blks) 216 - continue; 217 - printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n", 218 - bi->nid, bi->start, bi->end, bj->start, bj->end, 219 - start, end); 220 - bi->start = start; 221 - bi->end = end; 222 - numa_remove_memblk_from(j--, mi); 223 - } 224 - } 225 - 226 - for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) { 227 - mi->blk[i].start = mi->blk[i].end = 0; 228 - mi->blk[i].nid = NUMA_NO_NODE; 229 - } 230 - 231 - return 0; 232 - } 233 - 234 - /* 235 - * Set nodes, which have memory in @mi, in *@nodemask. 236 - */ 237 - static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask, 238 - const struct numa_meminfo *mi) 239 - { 240 - int i; 241 - 242 - for (i = 0; i < ARRAY_SIZE(mi->blk); i++) 243 - if (mi->blk[i].start != mi->blk[i].end && 244 - mi->blk[i].nid != NUMA_NO_NODE) 245 - node_set(mi->blk[i].nid, *nodemask); 246 - } 247 - 248 - /** 249 - * numa_reset_distance - Reset NUMA distance table 250 - * 251 - * The current table is freed. The next numa_set_distance() call will 252 - * create a new one. 
253 - */ 254 - void __init numa_reset_distance(void) 255 - { 256 - size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]); 257 - 258 - /* numa_distance could be 1LU marking allocation failure, test cnt */ 259 - if (numa_distance_cnt) 260 - memblock_x86_free_range(__pa(numa_distance), 261 - __pa(numa_distance) + size); 262 - numa_distance_cnt = 0; 263 - numa_distance = NULL; /* enable table creation */ 264 - } 265 - 266 - static int __init numa_alloc_distance(void) 267 - { 268 - nodemask_t nodes_parsed; 269 - size_t size; 270 - int i, j, cnt = 0; 271 - u64 phys; 272 - 273 - /* size the new table and allocate it */ 274 - nodes_parsed = numa_nodes_parsed; 275 - numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo); 276 - 277 - for_each_node_mask(i, nodes_parsed) 278 - cnt = i; 279 - cnt++; 280 - size = cnt * cnt * sizeof(numa_distance[0]); 281 - 282 - phys = memblock_find_in_range(0, (u64)max_pfn_mapped << PAGE_SHIFT, 283 - size, PAGE_SIZE); 284 - if (phys == MEMBLOCK_ERROR) { 285 - pr_warning("NUMA: Warning: can't allocate distance table!\n"); 286 - /* don't retry until explicitly reset */ 287 - numa_distance = (void *)1LU; 288 - return -ENOMEM; 289 - } 290 - memblock_x86_reserve_range(phys, phys + size, "NUMA DIST"); 291 - 292 - numa_distance = __va(phys); 293 - numa_distance_cnt = cnt; 294 - 295 - /* fill with the default distances */ 296 - for (i = 0; i < cnt; i++) 297 - for (j = 0; j < cnt; j++) 298 - numa_distance[i * cnt + j] = i == j ? 299 - LOCAL_DISTANCE : REMOTE_DISTANCE; 300 - printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt); 301 - 302 - return 0; 303 - } 304 - 305 - /** 306 - * numa_set_distance - Set NUMA distance from one NUMA to another 307 - * @from: the 'from' node to set distance 308 - * @to: the 'to' node to set distance 309 - * @distance: NUMA distance 310 - * 311 - * Set the distance from node @from to @to to @distance. 
If distance table 312 - * doesn't exist, one which is large enough to accommodate all the currently 313 - * known nodes will be created. 314 - * 315 - * If such table cannot be allocated, a warning is printed and further 316 - * calls are ignored until the distance table is reset with 317 - * numa_reset_distance(). 318 - * 319 - * If @from or @to is higher than the highest known node at the time of 320 - * table creation or @distance doesn't make sense, the call is ignored. 321 - * This is to allow simplification of specific NUMA config implementations. 322 - */ 323 - void __init numa_set_distance(int from, int to, int distance) 324 - { 325 - if (!numa_distance && numa_alloc_distance() < 0) 326 - return; 327 - 328 - if (from >= numa_distance_cnt || to >= numa_distance_cnt) { 329 - printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n", 330 - from, to, distance); 331 - return; 332 - } 333 - 334 - if ((u8)distance != distance || 335 - (from == to && distance != LOCAL_DISTANCE)) { 336 - pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n", 337 - from, to, distance); 338 - return; 339 - } 340 - 341 - numa_distance[from * numa_distance_cnt + to] = distance; 342 - } 343 - 344 - int __node_distance(int from, int to) 345 - { 346 - if (from >= numa_distance_cnt || to >= numa_distance_cnt) 347 - return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE; 348 - return numa_distance[from * numa_distance_cnt + to]; 349 - } 350 - EXPORT_SYMBOL(__node_distance); 351 - 352 - /* 353 - * Sanity check to catch more bad NUMA configurations (they are amazingly 354 - * common). Make sure the nodes cover all memory. 
355 - */ 356 - static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi) 357 - { 358 - unsigned long numaram, e820ram; 359 - int i; 360 - 361 - numaram = 0; 362 - for (i = 0; i < mi->nr_blks; i++) { 363 - unsigned long s = mi->blk[i].start >> PAGE_SHIFT; 364 - unsigned long e = mi->blk[i].end >> PAGE_SHIFT; 365 - numaram += e - s; 366 - numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e); 367 - if ((long)numaram < 0) 368 - numaram = 0; 369 - } 370 - 371 - e820ram = max_pfn - (memblock_x86_hole_size(0, 372 - max_pfn << PAGE_SHIFT) >> PAGE_SHIFT); 373 - /* We seem to lose 3 pages somewhere. Allow 1M of slack. */ 374 - if ((long)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) { 375 - printk(KERN_ERR "NUMA: nodes only cover %luMB of your %luMB e820 RAM. Not used.\n", 376 - (numaram << PAGE_SHIFT) >> 20, 377 - (e820ram << PAGE_SHIFT) >> 20); 378 - return false; 379 - } 380 - return true; 381 - } 382 - 383 - static int __init numa_register_memblks(struct numa_meminfo *mi) 384 - { 385 - int i, nid; 386 - 387 - /* Account for nodes with cpus and no memory */ 388 - node_possible_map = numa_nodes_parsed; 389 - numa_nodemask_from_meminfo(&node_possible_map, mi); 390 - if (WARN_ON(nodes_empty(node_possible_map))) 391 - return -EINVAL; 392 - 393 - for (i = 0; i < mi->nr_blks; i++) 394 - memblock_x86_register_active_regions(mi->blk[i].nid, 395 - mi->blk[i].start >> PAGE_SHIFT, 396 - mi->blk[i].end >> PAGE_SHIFT); 397 - 398 - /* for out of order entries */ 399 - sort_node_map(); 400 - if (!numa_meminfo_cover_memory(mi)) 401 - return -EINVAL; 402 - 403 - /* Finally register nodes. 
*/ 404 - 	for_each_node_mask(nid, node_possible_map) { 405 - 		u64 start = (u64)max_pfn << PAGE_SHIFT; 406 - 		u64 end = 0; 407 - 408 - 		for (i = 0; i < mi->nr_blks; i++) { 409 - 			if (nid != mi->blk[i].nid) 410 - 				continue; 411 - 			start = min(mi->blk[i].start, start); 412 - 			end = max(mi->blk[i].end, end); 413 - 		} 414 - 415 - 		if (start < end) 416 - 			setup_node_bootmem(nid, start, end); 417 - 	} 418 - 419 - 	return 0; 420 - } 421 - 422 - /** 423 - * dummy_numa_init - Fallback dummy NUMA init 424 - * 425 - * Used if there's no underlying NUMA architecture, NUMA initialization 426 - * fails, or NUMA is disabled on the command line. 427 - * 428 - * Must online at least one node and add memory blocks that cover all 429 - * allowed memory. This function must not fail. 430 - */ 431 - static int __init dummy_numa_init(void) 432 - { 433 - 	printk(KERN_INFO "%s\n", 434 - 	       numa_off ? "NUMA turned off" : "No NUMA configuration found"); 435 - 	printk(KERN_INFO "Faking a node at %016lx-%016lx\n", 436 - 	       0LU, max_pfn << PAGE_SHIFT); 437 - 438 - 	node_set(0, numa_nodes_parsed); 439 - 	numa_add_memblk(0, 0, (u64)max_pfn << PAGE_SHIFT); 440 - 441 - 	return 0; 442 - } 443 - 444 - static int __init numa_init(int (*init_func)(void)) 445 - { 446 - 	int i; 447 - 	int ret; 448 - 449 - 	for (i = 0; i < MAX_LOCAL_APIC; i++) 450 - 		set_apicid_to_node(i, NUMA_NO_NODE); 451 - 452 - 	nodes_clear(numa_nodes_parsed); 453 - 	nodes_clear(node_possible_map); 454 - 	nodes_clear(node_online_map); 455 - 	memset(&numa_meminfo, 0, sizeof(numa_meminfo)); 456 - 	remove_all_active_ranges(); 457 - 	numa_reset_distance(); 458 - 459 - 	ret = init_func(); 460 - 	if (ret < 0) 461 - 		return ret; 462 - 	ret = numa_cleanup_meminfo(&numa_meminfo); 463 - 	if (ret < 0) 464 - 		return ret; 465 - 466 - 	numa_emulation(&numa_meminfo, numa_distance_cnt); 467 - 468 - 	ret = numa_register_memblks(&numa_meminfo); 469 - 	if (ret < 0) 470 - 		return ret; 471 - 472 - 	for (i = 0; i < nr_cpu_ids; i++) { 473 - 		int nid = early_cpu_to_node(i); 474 - 475 - 		if (nid == 
NUMA_NO_NODE) 476 - continue; 477 - if (!node_online(nid)) 478 - numa_clear_node(i); 479 - } 480 - numa_init_array(); 481 - return 0; 482 - } 483 - 484 9 void __init initmem_init(void) 485 10 { 486 - if (!numa_off) { 487 - #ifdef CONFIG_ACPI_NUMA 488 - if (!numa_init(x86_acpi_numa_init)) 489 - return; 490 - #endif 491 - #ifdef CONFIG_AMD_NUMA 492 - if (!numa_init(amd_numa_init)) 493 - return; 494 - #endif 495 - } 496 - 497 - numa_init(dummy_numa_init); 11 + x86_numa_init(); 498 12 } 499 13 500 14 unsigned long __init numa_free_all_bootmem(void) ··· 23 509 24 510 return pages; 25 511 } 26 - 27 - #ifdef CONFIG_MEMORY_HOTPLUG 28 - int memory_add_physaddr_to_nid(u64 start) 29 - { 30 - struct numa_meminfo *mi = &numa_meminfo; 31 - int nid = mi->blk[0].nid; 32 - int i; 33 - 34 - for (i = 0; i < mi->nr_blks; i++) 35 - if (mi->blk[i].start <= start && mi->blk[i].end > start) 36 - nid = mi->blk[i].nid; 37 - return nid; 38 - } 39 - EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); 40 - #endif
+2
arch/x86/mm/numa_internal.h
··· 19 19 int __init numa_cleanup_meminfo(struct numa_meminfo *mi); 20 20 void __init numa_reset_distance(void); 21 21 22 + void __init x86_numa_init(void); 23 + 22 24 #ifdef CONFIG_NUMA_EMU 23 25 void __init numa_emulation(struct numa_meminfo *numa_meminfo, 24 26 int numa_dist_cnt);