linux/mm/memory_hotplug.c at v4.11-rc1
1/* 2 * linux/mm/memory_hotplug.c 3 * 4 * Copyright (C) 5 */ 6 7#include <linux/stddef.h> 8#include <linux/mm.h> 9#include <linux/sched/signal.h> 10#include <linux/swap.h> 11#include <linux/interrupt.h> 12#include <linux/pagemap.h> 13#include <linux/compiler.h> 14#include <linux/export.h> 15#include <linux/pagevec.h> 16#include <linux/writeback.h> 17#include <linux/slab.h> 18#include <linux/sysctl.h> 19#include <linux/cpu.h> 20#include <linux/memory.h> 21#include <linux/memremap.h> 22#include <linux/memory_hotplug.h> 23#include <linux/highmem.h> 24#include <linux/vmalloc.h> 25#include <linux/ioport.h> 26#include <linux/delay.h> 27#include <linux/migrate.h> 28#include <linux/page-isolation.h> 29#include <linux/pfn.h> 30#include <linux/suspend.h> 31#include <linux/mm_inline.h> 32#include <linux/firmware-map.h> 33#include <linux/stop_machine.h> 34#include <linux/hugetlb.h> 35#include <linux/memblock.h> 36#include <linux/bootmem.h> 37#include <linux/compaction.h> 38 39#include <asm/tlbflush.h> 40 41#include "internal.h" 42 43/* 44 * online_page_callback contains pointer to current page onlining function. 45 * Initially it is generic_online_page(). If it is required it could be 46 * changed by calling set_online_page_callback() for callback registration 47 * and restore_online_page_callback() for generic callback restore. 48 */ 49 50static void generic_online_page(struct page *page); 51 52static online_page_callback_t online_page_callback = generic_online_page; 53static DEFINE_MUTEX(online_page_callback_lock); 54 55/* The same as the cpu_hotplug lock, but for memory hotplug. */ 56static struct { 57 struct task_struct *active_writer; 58 struct mutex lock; /* Synchronizes accesses to refcount, */ 59 /* 60 * Also blocks the new readers during 61 * an ongoing mem hotplug operation. 
62 */ 63 int refcount; 64 65#ifdef CONFIG_DEBUG_LOCK_ALLOC 66 struct lockdep_map dep_map; 67#endif 68} mem_hotplug = { 69 .active_writer = NULL, 70 .lock = __MUTEX_INITIALIZER(mem_hotplug.lock), 71 .refcount = 0, 72#ifdef CONFIG_DEBUG_LOCK_ALLOC 73 .dep_map = {.name = "mem_hotplug.lock" }, 74#endif 75}; 76 77/* Lockdep annotations for get/put_online_mems() and mem_hotplug_begin/end() */ 78#define memhp_lock_acquire_read() lock_map_acquire_read(&mem_hotplug.dep_map) 79#define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map) 80#define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map) 81 82#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE 83bool memhp_auto_online; 84#else 85bool memhp_auto_online = true; 86#endif 87EXPORT_SYMBOL_GPL(memhp_auto_online); 88 89static int __init setup_memhp_default_state(char *str) 90{ 91 if (!strcmp(str, "online")) 92 memhp_auto_online = true; 93 else if (!strcmp(str, "offline")) 94 memhp_auto_online = false; 95 96 return 1; 97} 98__setup("memhp_default_state=", setup_memhp_default_state); 99 100void get_online_mems(void) 101{ 102 might_sleep(); 103 if (mem_hotplug.active_writer == current) 104 return; 105 memhp_lock_acquire_read(); 106 mutex_lock(&mem_hotplug.lock); 107 mem_hotplug.refcount++; 108 mutex_unlock(&mem_hotplug.lock); 109 110} 111 112void put_online_mems(void) 113{ 114 if (mem_hotplug.active_writer == current) 115 return; 116 mutex_lock(&mem_hotplug.lock); 117 118 if (WARN_ON(!mem_hotplug.refcount)) 119 mem_hotplug.refcount++; /* try to fix things up */ 120 121 if (!--mem_hotplug.refcount && unlikely(mem_hotplug.active_writer)) 122 wake_up_process(mem_hotplug.active_writer); 123 mutex_unlock(&mem_hotplug.lock); 124 memhp_lock_release(); 125 126} 127 128void mem_hotplug_begin(void) 129{ 130 assert_held_device_hotplug(); 131 132 mem_hotplug.active_writer = current; 133 134 memhp_lock_acquire(); 135 for (;;) { 136 mutex_lock(&mem_hotplug.lock); 137 if (likely(!mem_hotplug.refcount)) 138 break; 139 __set_current_state(TASK_UNINTERRUPTIBLE); 140 mutex_unlock(&mem_hotplug.lock); 141 schedule(); 142 } 143} 144 145void mem_hotplug_done(void) 146{ 147 mem_hotplug.active_writer = NULL; 148 mutex_unlock(&mem_hotplug.lock); 149 memhp_lock_release(); 150} 151 152/* add this memory to iomem resource */ 153static struct resource *register_memory_resource(u64 start, u64 size) 154{ 155 struct resource *res; 156 res = kzalloc(sizeof(struct resource), GFP_KERNEL); 157 if (!res) 158 return ERR_PTR(-ENOMEM); 159 160 res->name = "System RAM"; 161 res->start = start; 162 res->end = start + size - 1; 163 res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; 164 if (request_resource(&iomem_resource, res) < 0) { 165 pr_debug("System RAM resource %pR cannot be added\n", res); 166 kfree(res); 167 return ERR_PTR(-EEXIST); 168 } 169 return res; 170} 171 172static void release_memory_resource(struct resource *res) 173{ 174 if (!res) 175 return; 176 release_resource(res); 177 kfree(res); 178 return; 179} 180 181#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE 182void get_page_bootmem(unsigned long info, struct page *page, 183 unsigned long type) 184{ 185 page->freelist = (void *)type; 186 SetPagePrivate(page); 187 set_page_private(page, info); 188 page_ref_inc(page); 189} 190 191void put_page_bootmem(struct page *page) 192{ 193 unsigned long type; 194 195 type = (unsigned long) page->freelist; 196 BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || 197 type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE); 198 199 if (page_ref_dec_return(page) == 1) { 200 page->freelist = NULL; 201 
ClearPagePrivate(page); 202 set_page_private(page, 0); 203 INIT_LIST_HEAD(&page->lru); 204 free_reserved_page(page); 205 } 206} 207 208#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE 209#ifndef CONFIG_SPARSEMEM_VMEMMAP 210static void register_page_bootmem_info_section(unsigned long start_pfn) 211{ 212 unsigned long *usemap, mapsize, section_nr, i; 213 struct mem_section *ms; 214 struct page *page, *memmap; 215 216 section_nr = pfn_to_section_nr(start_pfn); 217 ms = __nr_to_section(section_nr); 218 219 /* Get section's memmap address */ 220 memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); 221 222 /* 223 * Get page for the memmap's phys address 224 * XXX: need more consideration for sparse_vmemmap... 225 */ 226 page = virt_to_page(memmap); 227 mapsize = sizeof(struct page) * PAGES_PER_SECTION; 228 mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT; 229 230 /* remember memmap's page */ 231 for (i = 0; i < mapsize; i++, page++) 232 get_page_bootmem(section_nr, page, SECTION_INFO); 233 234 usemap = __nr_to_section(section_nr)->pageblock_flags; 235 page = virt_to_page(usemap); 236 237 mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; 238 239 for (i = 0; i < mapsize; i++, page++) 240 get_page_bootmem(section_nr, page, MIX_SECTION_INFO); 241 242} 243#else /* CONFIG_SPARSEMEM_VMEMMAP */ 244static void register_page_bootmem_info_section(unsigned long start_pfn) 245{ 246 unsigned long *usemap, mapsize, section_nr, i; 247 struct mem_section *ms; 248 struct page *page, *memmap; 249 250 if (!pfn_valid(start_pfn)) 251 return; 252 253 section_nr = pfn_to_section_nr(start_pfn); 254 ms = __nr_to_section(section_nr); 255 256 memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); 257 258 register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION); 259 260 usemap = __nr_to_section(section_nr)->pageblock_flags; 261 page = virt_to_page(usemap); 262 263 mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; 264 265 for (i = 0; i < mapsize; i++, page++) 266 get_page_bootmem(section_nr, page, MIX_SECTION_INFO); 267} 268#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ 269 270void __init register_page_bootmem_info_node(struct pglist_data *pgdat) 271{ 272 unsigned long i, pfn, end_pfn, nr_pages; 273 int node = pgdat->node_id; 274 struct page *page; 275 276 nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT; 277 page = virt_to_page(pgdat); 278 279 for (i = 0; i < nr_pages; i++, page++) 280 get_page_bootmem(node, page, NODE_INFO); 281 282 pfn = pgdat->node_start_pfn; 283 end_pfn = pgdat_end_pfn(pgdat); 284 285 /* register section info */ 286 for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 287 /* 288 * Some platforms can assign the same pfn to multiple nodes - on 289 * node0 as well as nodeN. To avoid registering a pfn against 290 * multiple nodes we check that this pfn does not already 291 * reside in some other nodes. 
292 */ 293 if (pfn_valid(pfn) && (early_pfn_to_nid(pfn) == node)) 294 register_page_bootmem_info_section(pfn); 295 } 296} 297#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */ 298 299static void __meminit grow_zone_span(struct zone *zone, unsigned long start_pfn, 300 unsigned long end_pfn) 301{ 302 unsigned long old_zone_end_pfn; 303 304 zone_span_writelock(zone); 305 306 old_zone_end_pfn = zone_end_pfn(zone); 307 if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn) 308 zone->zone_start_pfn = start_pfn; 309 310 zone->spanned_pages = max(old_zone_end_pfn, end_pfn) - 311 zone->zone_start_pfn; 312 313 zone_span_writeunlock(zone); 314} 315 316static void resize_zone(struct zone *zone, unsigned long start_pfn, 317 unsigned long end_pfn) 318{ 319 zone_span_writelock(zone); 320 321 if (end_pfn - start_pfn) { 322 zone->zone_start_pfn = start_pfn; 323 zone->spanned_pages = end_pfn - start_pfn; 324 } else { 325 /* 326 * make it consist as free_area_init_core(), 327 * if spanned_pages = 0, then keep start_pfn = 0 328 */ 329 zone->zone_start_pfn = 0; 330 zone->spanned_pages = 0; 331 } 332 333 zone_span_writeunlock(zone); 334} 335 336static void fix_zone_id(struct zone *zone, unsigned long start_pfn, 337 unsigned long end_pfn) 338{ 339 enum zone_type zid = zone_idx(zone); 340 int nid = zone->zone_pgdat->node_id; 341 unsigned long pfn; 342 343 for (pfn = start_pfn; pfn < end_pfn; pfn++) 344 set_page_links(pfn_to_page(pfn), zid, nid, pfn); 345} 346 347/* Can fail with -ENOMEM from allocating a wait table with vmalloc() or 348 * alloc_bootmem_node_nopanic()/memblock_virt_alloc_node_nopanic() */ 349static int __ref ensure_zone_is_initialized(struct zone *zone, 350 unsigned long start_pfn, unsigned long num_pages) 351{ 352 if (!zone_is_initialized(zone)) 353 return init_currently_empty_zone(zone, start_pfn, num_pages); 354 355 return 0; 356} 357 358static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, 359 unsigned long start_pfn, unsigned long end_pfn) 360{ 361 int ret; 362 unsigned long flags; 363 unsigned long z1_start_pfn; 364 365 ret = ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn); 366 if (ret) 367 return ret; 368 369 pgdat_resize_lock(z1->zone_pgdat, &flags); 370 371 /* can't move pfns which are higher than @z2 */ 372 if (end_pfn > zone_end_pfn(z2)) 373 goto out_fail; 374 /* the move out part must be at the left most of @z2 */ 375 if (start_pfn > z2->zone_start_pfn) 376 goto out_fail; 377 /* must included/overlap */ 378 if (end_pfn <= z2->zone_start_pfn) 379 goto out_fail; 380 381 /* use start_pfn for z1's start_pfn if z1 is empty */ 382 if (!zone_is_empty(z1)) 383 z1_start_pfn = z1->zone_start_pfn; 384 else 385 z1_start_pfn = start_pfn; 386 387 resize_zone(z1, z1_start_pfn, end_pfn); 388 resize_zone(z2, end_pfn, zone_end_pfn(z2)); 389 390 pgdat_resize_unlock(z1->zone_pgdat, &flags); 391 392 fix_zone_id(z1, start_pfn, end_pfn); 393 394 return 0; 395out_fail: 396 pgdat_resize_unlock(z1->zone_pgdat, &flags); 397 return -1; 398} 399 400static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2, 401 unsigned long start_pfn, unsigned long end_pfn) 402{ 403 int ret; 404 unsigned long flags; 405 unsigned long z2_end_pfn; 406 407 ret = ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn); 408 if (ret) 409 return ret; 410 411 pgdat_resize_lock(z1->zone_pgdat, &flags); 412 413 /* can't move pfns which are lower than @z1 */ 414 if (z1->zone_start_pfn > start_pfn) 415 goto out_fail; 416 /* the move out part mast at the right most of @z1 */ 417 if 
(zone_end_pfn(z1) > end_pfn) 418 goto out_fail; 419 /* must included/overlap */ 420 if (start_pfn >= zone_end_pfn(z1)) 421 goto out_fail; 422 423 /* use end_pfn for z2's end_pfn if z2 is empty */ 424 if (!zone_is_empty(z2)) 425 z2_end_pfn = zone_end_pfn(z2); 426 else 427 z2_end_pfn = end_pfn; 428 429 resize_zone(z1, z1->zone_start_pfn, start_pfn); 430 resize_zone(z2, start_pfn, z2_end_pfn); 431 432 pgdat_resize_unlock(z1->zone_pgdat, &flags); 433 434 fix_zone_id(z2, start_pfn, end_pfn); 435 436 return 0; 437out_fail: 438 pgdat_resize_unlock(z1->zone_pgdat, &flags); 439 return -1; 440} 441 442static struct zone * __meminit move_pfn_range(int zone_shift, 443 unsigned long start_pfn, unsigned long end_pfn) 444{ 445 struct zone *zone = page_zone(pfn_to_page(start_pfn)); 446 int ret = 0; 447 448 if (zone_shift < 0) 449 ret = move_pfn_range_left(zone + zone_shift, zone, 450 start_pfn, end_pfn); 451 else if (zone_shift) 452 ret = move_pfn_range_right(zone, zone + zone_shift, 453 start_pfn, end_pfn); 454 455 if (ret) 456 return NULL; 457 458 return zone + zone_shift; 459} 460 461static void __meminit grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, 462 unsigned long end_pfn) 463{ 464 unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat); 465 466 if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn) 467 pgdat->node_start_pfn = start_pfn; 468 469 pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) - 470 pgdat->node_start_pfn; 471} 472 473static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) 474{ 475 struct pglist_data *pgdat = zone->zone_pgdat; 476 int nr_pages = PAGES_PER_SECTION; 477 int nid = pgdat->node_id; 478 int zone_type; 479 unsigned long flags, pfn; 480 int ret; 481 482 zone_type = zone - pgdat->node_zones; 483 ret = ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages); 484 if (ret) 485 return ret; 486 487 pgdat_resize_lock(zone->zone_pgdat, &flags); 488 grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages); 489 grow_pgdat_span(zone->zone_pgdat, phys_start_pfn, 490 phys_start_pfn + nr_pages); 491 pgdat_resize_unlock(zone->zone_pgdat, &flags); 492 memmap_init_zone(nr_pages, nid, zone_type, 493 phys_start_pfn, MEMMAP_HOTPLUG); 494 495 /* online_page_range is called later and expects pages reserved */ 496 for (pfn = phys_start_pfn; pfn < phys_start_pfn + nr_pages; pfn++) { 497 if (!pfn_valid(pfn)) 498 continue; 499 500 SetPageReserved(pfn_to_page(pfn)); 501 } 502 return 0; 503} 504 505static int __meminit __add_section(int nid, struct zone *zone, 506 unsigned long phys_start_pfn) 507{ 508 int ret; 509 510 if (pfn_valid(phys_start_pfn)) 511 return -EEXIST; 512 513 ret = sparse_add_one_section(zone, phys_start_pfn); 514 515 if (ret < 0) 516 return ret; 517 518 ret = __add_zone(zone, phys_start_pfn); 519 520 if (ret < 0) 521 return ret; 522 523 return register_new_memory(nid, __pfn_to_section(phys_start_pfn)); 524} 525 526/* 527 * Reasonably generic function for adding memory. It is 528 * expected that archs that support memory hotplug will 529 * call this function after deciding the zone to which to 530 * add the new pages. 
531 */ 532int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, 533 unsigned long nr_pages) 534{ 535 unsigned long i; 536 int err = 0; 537 int start_sec, end_sec; 538 struct vmem_altmap *altmap; 539 540 clear_zone_contiguous(zone); 541 542 /* during initialize mem_map, align hot-added range to section */ 543 start_sec = pfn_to_section_nr(phys_start_pfn); 544 end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); 545 546 altmap = to_vmem_altmap((unsigned long) pfn_to_page(phys_start_pfn)); 547 if (altmap) { 548 /* 549 * Validate altmap is within bounds of the total request 550 */ 551 if (altmap->base_pfn != phys_start_pfn 552 || vmem_altmap_offset(altmap) > nr_pages) { 553 pr_warn_once("memory add fail, invalid altmap\n"); 554 err = -EINVAL; 555 goto out; 556 } 557 altmap->alloc = 0; 558 } 559 560 for (i = start_sec; i <= end_sec; i++) { 561 err = __add_section(nid, zone, section_nr_to_pfn(i)); 562 563 /* 564 * EEXIST is finally dealt with by ioresource collision 565 * check. see add_memory() => register_memory_resource() 566 * Warning will be printed if there is collision. 567 */ 568 if (err && (err != -EEXIST)) 569 break; 570 err = 0; 571 } 572 vmemmap_populate_print_last(); 573out: 574 set_zone_contiguous(zone); 575 return err; 576} 577EXPORT_SYMBOL_GPL(__add_pages); 578 579#ifdef CONFIG_MEMORY_HOTREMOVE 580/* find the smallest valid pfn in the range [start_pfn, end_pfn) */ 581static int find_smallest_section_pfn(int nid, struct zone *zone, 582 unsigned long start_pfn, 583 unsigned long end_pfn) 584{ 585 struct mem_section *ms; 586 587 for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) { 588 ms = __pfn_to_section(start_pfn); 589 590 if (unlikely(!valid_section(ms))) 591 continue; 592 593 if (unlikely(pfn_to_nid(start_pfn) != nid)) 594 continue; 595 596 if (zone && zone != page_zone(pfn_to_page(start_pfn))) 597 continue; 598 599 return start_pfn; 600 } 601 602 return 0; 603} 604 605/* find the biggest valid pfn in the range [start_pfn, end_pfn). */ 606static int find_biggest_section_pfn(int nid, struct zone *zone, 607 unsigned long start_pfn, 608 unsigned long end_pfn) 609{ 610 struct mem_section *ms; 611 unsigned long pfn; 612 613 /* pfn is the end pfn of a memory section. */ 614 pfn = end_pfn - 1; 615 for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) { 616 ms = __pfn_to_section(pfn); 617 618 if (unlikely(!valid_section(ms))) 619 continue; 620 621 if (unlikely(pfn_to_nid(pfn) != nid)) 622 continue; 623 624 if (zone && zone != page_zone(pfn_to_page(pfn))) 625 continue; 626 627 return pfn; 628 } 629 630 return 0; 631} 632 633static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, 634 unsigned long end_pfn) 635{ 636 unsigned long zone_start_pfn = zone->zone_start_pfn; 637 unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */ 638 unsigned long zone_end_pfn = z; 639 unsigned long pfn; 640 struct mem_section *ms; 641 int nid = zone_to_nid(zone); 642 643 zone_span_writelock(zone); 644 if (zone_start_pfn == start_pfn) { 645 /* 646 * If the section is smallest section in the zone, it need 647 * shrink zone->zone_start_pfn and zone->zone_spanned_pages. 648 * In this case, we find second smallest valid mem_section 649 * for shrinking zone. 
650 */ 651 pfn = find_smallest_section_pfn(nid, zone, end_pfn, 652 zone_end_pfn); 653 if (pfn) { 654 zone->zone_start_pfn = pfn; 655 zone->spanned_pages = zone_end_pfn - pfn; 656 } 657 } else if (zone_end_pfn == end_pfn) { 658 /* 659 * If the section is biggest section in the zone, it need 660 * shrink zone->spanned_pages. 661 * In this case, we find second biggest valid mem_section for 662 * shrinking zone. 663 */ 664 pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn, 665 start_pfn); 666 if (pfn) 667 zone->spanned_pages = pfn - zone_start_pfn + 1; 668 } 669 670 /* 671 * The section is not biggest or smallest mem_section in the zone, it 672 * only creates a hole in the zone. So in this case, we need not 673 * change the zone. But perhaps, the zone has only hole data. Thus 674 * it check the zone has only hole or not. 675 */ 676 pfn = zone_start_pfn; 677 for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) { 678 ms = __pfn_to_section(pfn); 679 680 if (unlikely(!valid_section(ms))) 681 continue; 682 683 if (page_zone(pfn_to_page(pfn)) != zone) 684 continue; 685 686 /* If the section is current section, it continues the loop */ 687 if (start_pfn == pfn) 688 continue; 689 690 /* If we find valid section, we have nothing to do */ 691 zone_span_writeunlock(zone); 692 return; 693 } 694 695 /* The zone has no valid section */ 696 zone->zone_start_pfn = 0; 697 zone->spanned_pages = 0; 698 zone_span_writeunlock(zone); 699} 700 701static void shrink_pgdat_span(struct pglist_data *pgdat, 702 unsigned long start_pfn, unsigned long end_pfn) 703{ 704 unsigned long pgdat_start_pfn = pgdat->node_start_pfn; 705 unsigned long p = pgdat_end_pfn(pgdat); /* pgdat_end_pfn namespace clash */ 706 unsigned long pgdat_end_pfn = p; 707 unsigned long pfn; 708 struct mem_section *ms; 709 int nid = pgdat->node_id; 710 711 if (pgdat_start_pfn == start_pfn) { 712 /* 713 * If the section is smallest section in the pgdat, it need 714 * shrink pgdat->node_start_pfn and pgdat->node_spanned_pages. 715 * In this case, we find second smallest valid mem_section 716 * for shrinking zone. 717 */ 718 pfn = find_smallest_section_pfn(nid, NULL, end_pfn, 719 pgdat_end_pfn); 720 if (pfn) { 721 pgdat->node_start_pfn = pfn; 722 pgdat->node_spanned_pages = pgdat_end_pfn - pfn; 723 } 724 } else if (pgdat_end_pfn == end_pfn) { 725 /* 726 * If the section is biggest section in the pgdat, it need 727 * shrink pgdat->node_spanned_pages. 728 * In this case, we find second biggest valid mem_section for 729 * shrinking zone. 730 */ 731 pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn, 732 start_pfn); 733 if (pfn) 734 pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1; 735 } 736 737 /* 738 * If the section is not biggest or smallest mem_section in the pgdat, 739 * it only creates a hole in the pgdat. So in this case, we need not 740 * change the pgdat. 741 * But perhaps, the pgdat has only hole data. Thus it check the pgdat 742 * has only hole or not. 
743 */ 744 pfn = pgdat_start_pfn; 745 for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) { 746 ms = __pfn_to_section(pfn); 747 748 if (unlikely(!valid_section(ms))) 749 continue; 750 751 if (pfn_to_nid(pfn) != nid) 752 continue; 753 754 /* If the section is current section, it continues the loop */ 755 if (start_pfn == pfn) 756 continue; 757 758 /* If we find valid section, we have nothing to do */ 759 return; 760 } 761 762 /* The pgdat has no valid section */ 763 pgdat->node_start_pfn = 0; 764 pgdat->node_spanned_pages = 0; 765} 766 767static void __remove_zone(struct zone *zone, unsigned long start_pfn) 768{ 769 struct pglist_data *pgdat = zone->zone_pgdat; 770 int nr_pages = PAGES_PER_SECTION; 771 int zone_type; 772 unsigned long flags; 773 774 zone_type = zone - pgdat->node_zones; 775 776 pgdat_resize_lock(zone->zone_pgdat, &flags); 777 shrink_zone_span(zone, start_pfn, start_pfn + nr_pages); 778 shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages); 779 pgdat_resize_unlock(zone->zone_pgdat, &flags); 780} 781 782static int __remove_section(struct zone *zone, struct mem_section *ms, 783 unsigned long map_offset) 784{ 785 unsigned long start_pfn; 786 int scn_nr; 787 int ret = -EINVAL; 788 789 if (!valid_section(ms)) 790 return ret; 791 792 ret = unregister_memory_section(ms); 793 if (ret) 794 return ret; 795 796 scn_nr = __section_nr(ms); 797 start_pfn = section_nr_to_pfn(scn_nr); 798 __remove_zone(zone, start_pfn); 799 800 sparse_remove_one_section(zone, ms, map_offset); 801 return 0; 802} 803 804/** 805 * __remove_pages() - remove sections of pages from a zone 806 * @zone: zone from which pages need to be removed 807 * @phys_start_pfn: starting pageframe (must be aligned to start of a section) 808 * @nr_pages: number of pages to remove (must be multiple of section size) 809 * 810 * Generic helper function to remove section mappings and sysfs entries 811 * for the section of the memory we are removing. Caller needs to make 812 * sure that pages are marked reserved and zones are adjust properly by 813 * calling offline_pages(). 
814 */ 815int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, 816 unsigned long nr_pages) 817{ 818 unsigned long i; 819 unsigned long map_offset = 0; 820 int sections_to_remove, ret = 0; 821 822 /* In the ZONE_DEVICE case device driver owns the memory region */ 823 if (is_dev_zone(zone)) { 824 struct page *page = pfn_to_page(phys_start_pfn); 825 struct vmem_altmap *altmap; 826 827 altmap = to_vmem_altmap((unsigned long) page); 828 if (altmap) 829 map_offset = vmem_altmap_offset(altmap); 830 } else { 831 resource_size_t start, size; 832 833 start = phys_start_pfn << PAGE_SHIFT; 834 size = nr_pages * PAGE_SIZE; 835 836 ret = release_mem_region_adjustable(&iomem_resource, start, 837 size); 838 if (ret) { 839 resource_size_t endres = start + size - 1; 840 841 pr_warn("Unable to release resource <%pa-%pa> (%d)\n", 842 &start, &endres, ret); 843 } 844 } 845 846 clear_zone_contiguous(zone); 847 848 /* 849 * We can only remove entire sections 850 */ 851 BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); 852 BUG_ON(nr_pages % PAGES_PER_SECTION); 853 854 sections_to_remove = nr_pages / PAGES_PER_SECTION; 855 for (i = 0; i < sections_to_remove; i++) { 856 unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; 857 858 ret = __remove_section(zone, __pfn_to_section(pfn), map_offset); 859 map_offset = 0; 860 if (ret) 861 break; 862 } 863 864 set_zone_contiguous(zone); 865 866 return ret; 867} 868#endif /* CONFIG_MEMORY_HOTREMOVE */ 869 870int set_online_page_callback(online_page_callback_t callback) 871{ 872 int rc = -EINVAL; 873 874 get_online_mems(); 875 mutex_lock(&online_page_callback_lock); 876 877 if (online_page_callback == generic_online_page) { 878 online_page_callback = callback; 879 rc = 0; 880 } 881 882 mutex_unlock(&online_page_callback_lock); 883 put_online_mems(); 884 885 return rc; 886} 887EXPORT_SYMBOL_GPL(set_online_page_callback); 888 889int restore_online_page_callback(online_page_callback_t callback) 890{ 891 int rc = -EINVAL; 892 893 get_online_mems(); 894 mutex_lock(&online_page_callback_lock); 895 896 if (online_page_callback == callback) { 897 online_page_callback = generic_online_page; 898 rc = 0; 899 } 900 901 mutex_unlock(&online_page_callback_lock); 902 put_online_mems(); 903 904 return rc; 905} 906EXPORT_SYMBOL_GPL(restore_online_page_callback); 907 908void __online_page_set_limits(struct page *page) 909{ 910} 911EXPORT_SYMBOL_GPL(__online_page_set_limits); 912 913void __online_page_increment_counters(struct page *page) 914{ 915 adjust_managed_page_count(page, 1); 916} 917EXPORT_SYMBOL_GPL(__online_page_increment_counters); 918 919void __online_page_free(struct page *page) 920{ 921 __free_reserved_page(page); 922} 923EXPORT_SYMBOL_GPL(__online_page_free); 924 925static void generic_online_page(struct page *page) 926{ 927 __online_page_set_limits(page); 928 __online_page_increment_counters(page); 929 __online_page_free(page); 930} 931 932static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, 933 void *arg) 934{ 935 unsigned long i; 936 unsigned long onlined_pages = *(unsigned long *)arg; 937 struct page *page; 938 if (PageReserved(pfn_to_page(start_pfn))) 939 for (i = 0; i < nr_pages; i++) { 940 page = pfn_to_page(start_pfn + i); 941 (*online_page_callback)(page); 942 onlined_pages++; 943 } 944 *(unsigned long *)arg = onlined_pages; 945 return 0; 946} 947 948#ifdef CONFIG_MOVABLE_NODE 949/* 950 * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have 951 * normal memory. 
952 */ 953static bool can_online_high_movable(struct zone *zone) 954{ 955 return true; 956} 957#else /* CONFIG_MOVABLE_NODE */ 958/* ensure every online node has NORMAL memory */ 959static bool can_online_high_movable(struct zone *zone) 960{ 961 return node_state(zone_to_nid(zone), N_NORMAL_MEMORY); 962} 963#endif /* CONFIG_MOVABLE_NODE */ 964 965/* check which state of node_states will be changed when online memory */ 966static void node_states_check_changes_online(unsigned long nr_pages, 967 struct zone *zone, struct memory_notify *arg) 968{ 969 int nid = zone_to_nid(zone); 970 enum zone_type zone_last = ZONE_NORMAL; 971 972 /* 973 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY] 974 * contains nodes which have zones of 0...ZONE_NORMAL, 975 * set zone_last to ZONE_NORMAL. 976 * 977 * If we don't have HIGHMEM nor movable node, 978 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of 979 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. 980 */ 981 if (N_MEMORY == N_NORMAL_MEMORY) 982 zone_last = ZONE_MOVABLE; 983 984 /* 985 * if the memory to be online is in a zone of 0...zone_last, and 986 * the zones of 0...zone_last don't have memory before online, we will 987 * need to set the node to node_states[N_NORMAL_MEMORY] after 988 * the memory is online. 989 */ 990 if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY)) 991 arg->status_change_nid_normal = nid; 992 else 993 arg->status_change_nid_normal = -1; 994 995#ifdef CONFIG_HIGHMEM 996 /* 997 * If we have movable node, node_states[N_HIGH_MEMORY] 998 * contains nodes which have zones of 0...ZONE_HIGHMEM, 999 * set zone_last to ZONE_HIGHMEM. 1000 * 1001 * If we don't have movable node, node_states[N_NORMAL_MEMORY] 1002 * contains nodes which have zones of 0...ZONE_MOVABLE, 1003 * set zone_last to ZONE_MOVABLE. 1004 */ 1005 zone_last = ZONE_HIGHMEM; 1006 if (N_MEMORY == N_HIGH_MEMORY) 1007 zone_last = ZONE_MOVABLE; 1008 1009 if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY)) 1010 arg->status_change_nid_high = nid; 1011 else 1012 arg->status_change_nid_high = -1; 1013#else 1014 arg->status_change_nid_high = arg->status_change_nid_normal; 1015#endif 1016 1017 /* 1018 * if the node don't have memory befor online, we will need to 1019 * set the node to node_states[N_MEMORY] after the memory 1020 * is online. 
1021 */ 1022 if (!node_state(nid, N_MEMORY)) 1023 arg->status_change_nid = nid; 1024 else 1025 arg->status_change_nid = -1; 1026} 1027 1028static void node_states_set_node(int node, struct memory_notify *arg) 1029{ 1030 if (arg->status_change_nid_normal >= 0) 1031 node_set_state(node, N_NORMAL_MEMORY); 1032 1033 if (arg->status_change_nid_high >= 0) 1034 node_set_state(node, N_HIGH_MEMORY); 1035 1036 node_set_state(node, N_MEMORY); 1037} 1038 1039bool zone_can_shift(unsigned long pfn, unsigned long nr_pages, 1040 enum zone_type target, int *zone_shift) 1041{ 1042 struct zone *zone = page_zone(pfn_to_page(pfn)); 1043 enum zone_type idx = zone_idx(zone); 1044 int i; 1045 1046 *zone_shift = 0; 1047 1048 if (idx < target) { 1049 /* pages must be at end of current zone */ 1050 if (pfn + nr_pages != zone_end_pfn(zone)) 1051 return false; 1052 1053 /* no zones in use between current zone and target */ 1054 for (i = idx + 1; i < target; i++) 1055 if (zone_is_initialized(zone - idx + i)) 1056 return false; 1057 } 1058 1059 if (target < idx) { 1060 /* pages must be at beginning of current zone */ 1061 if (pfn != zone->zone_start_pfn) 1062 return false; 1063 1064 /* no zones in use between current zone and target */ 1065 for (i = target + 1; i < idx; i++) 1066 if (zone_is_initialized(zone - idx + i)) 1067 return false; 1068 } 1069 1070 *zone_shift = target - idx; 1071 return true; 1072} 1073 1074/* Must be protected by mem_hotplug_begin() */ 1075int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) 1076{ 1077 unsigned long flags; 1078 unsigned long onlined_pages = 0; 1079 struct zone *zone; 1080 int need_zonelists_rebuild = 0; 1081 int nid; 1082 int ret; 1083 struct memory_notify arg; 1084 int zone_shift = 0; 1085 1086 /* 1087 * This doesn't need a lock to do pfn_to_page(). 1088 * The section can't be removed here because of the 1089 * memory_block->state_mutex. 1090 */ 1091 zone = page_zone(pfn_to_page(pfn)); 1092 1093 if ((zone_idx(zone) > ZONE_NORMAL || 1094 online_type == MMOP_ONLINE_MOVABLE) && 1095 !can_online_high_movable(zone)) 1096 return -EINVAL; 1097 1098 if (online_type == MMOP_ONLINE_KERNEL) { 1099 if (!zone_can_shift(pfn, nr_pages, ZONE_NORMAL, &zone_shift)) 1100 return -EINVAL; 1101 } else if (online_type == MMOP_ONLINE_MOVABLE) { 1102 if (!zone_can_shift(pfn, nr_pages, ZONE_MOVABLE, &zone_shift)) 1103 return -EINVAL; 1104 } 1105 1106 zone = move_pfn_range(zone_shift, pfn, pfn + nr_pages); 1107 if (!zone) 1108 return -EINVAL; 1109 1110 arg.start_pfn = pfn; 1111 arg.nr_pages = nr_pages; 1112 node_states_check_changes_online(nr_pages, zone, &arg); 1113 1114 nid = zone_to_nid(zone); 1115 1116 ret = memory_notify(MEM_GOING_ONLINE, &arg); 1117 ret = notifier_to_errno(ret); 1118 if (ret) 1119 goto failed_addition; 1120 1121 /* 1122 * If this zone is not populated, then it is not in zonelist. 1123 * This means the page allocator ignores this zone. 1124 * So, zonelist must be updated after online. 
1125 */ 1126 mutex_lock(&zonelists_mutex); 1127 if (!populated_zone(zone)) { 1128 need_zonelists_rebuild = 1; 1129 build_all_zonelists(NULL, zone); 1130 } 1131 1132 ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, 1133 online_pages_range); 1134 if (ret) { 1135 if (need_zonelists_rebuild) 1136 zone_pcp_reset(zone); 1137 mutex_unlock(&zonelists_mutex); 1138 goto failed_addition; 1139 } 1140 1141 zone->present_pages += onlined_pages; 1142 1143 pgdat_resize_lock(zone->zone_pgdat, &flags); 1144 zone->zone_pgdat->node_present_pages += onlined_pages; 1145 pgdat_resize_unlock(zone->zone_pgdat, &flags); 1146 1147 if (onlined_pages) { 1148 node_states_set_node(nid, &arg); 1149 if (need_zonelists_rebuild) 1150 build_all_zonelists(NULL, NULL); 1151 else 1152 zone_pcp_update(zone); 1153 } 1154 1155 mutex_unlock(&zonelists_mutex); 1156 1157 init_per_zone_wmark_min(); 1158 1159 if (onlined_pages) { 1160 kswapd_run(nid); 1161 kcompactd_run(nid); 1162 } 1163 1164 vm_total_pages = nr_free_pagecache_pages(); 1165 1166 writeback_set_ratelimit(); 1167 1168 if (onlined_pages) 1169 memory_notify(MEM_ONLINE, &arg); 1170 return 0; 1171 1172failed_addition: 1173 pr_debug("online_pages [mem %#010llx-%#010llx] failed\n", 1174 (unsigned long long) pfn << PAGE_SHIFT, 1175 (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1); 1176 memory_notify(MEM_CANCEL_ONLINE, &arg); 1177 return ret; 1178} 1179#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ 1180 1181static void reset_node_present_pages(pg_data_t *pgdat) 1182{ 1183 struct zone *z; 1184 1185 for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) 1186 z->present_pages = 0; 1187 1188 pgdat->node_present_pages = 0; 1189} 1190 1191/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ 1192static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) 1193{ 1194 struct pglist_data *pgdat; 1195 unsigned long zones_size[MAX_NR_ZONES] = {0}; 1196 unsigned long zholes_size[MAX_NR_ZONES] = {0}; 1197 unsigned long start_pfn = PFN_DOWN(start); 1198 1199 pgdat = NODE_DATA(nid); 1200 if (!pgdat) { 1201 pgdat = arch_alloc_nodedata(nid); 1202 if (!pgdat) 1203 return NULL; 1204 1205 arch_refresh_nodedata(nid, pgdat); 1206 } else { 1207 /* Reset the nr_zones, order and classzone_idx before reuse */ 1208 pgdat->nr_zones = 0; 1209 pgdat->kswapd_order = 0; 1210 pgdat->kswapd_classzone_idx = 0; 1211 } 1212 1213 /* we can use NODE_DATA(nid) from here */ 1214 1215 /* init node's zones as empty zones, we don't have any present pages.*/ 1216 free_area_init_node(nid, zones_size, start_pfn, zholes_size); 1217 pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat); 1218 1219 /* 1220 * The node we allocated has no zone fallback lists. For avoiding 1221 * to access not-initialized zonelist, build here. 1222 */ 1223 mutex_lock(&zonelists_mutex); 1224 build_all_zonelists(pgdat, NULL); 1225 mutex_unlock(&zonelists_mutex); 1226 1227 /* 1228 * zone->managed_pages is set to an approximate value in 1229 * free_area_init_core(), which will cause 1230 * /sys/device/system/node/nodeX/meminfo has wrong data. 1231 * So reset it to 0 before any memory is onlined. 1232 */ 1233 reset_node_managed_pages(pgdat); 1234 1235 /* 1236 * When memory is hot-added, all the memory is in offline state. So 1237 * clear all zones' present_pages because they will be updated in 1238 * online_pages() and offline_pages(). 
1239 */ 1240 reset_node_present_pages(pgdat); 1241 1242 return pgdat; 1243} 1244 1245static void rollback_node_hotadd(int nid, pg_data_t *pgdat) 1246{ 1247 arch_refresh_nodedata(nid, NULL); 1248 free_percpu(pgdat->per_cpu_nodestats); 1249 arch_free_nodedata(pgdat); 1250 return; 1251} 1252 1253 1254/** 1255 * try_online_node - online a node if offlined 1256 * 1257 * called by cpu_up() to online a node without onlined memory. 1258 */ 1259int try_online_node(int nid) 1260{ 1261 pg_data_t *pgdat; 1262 int ret; 1263 1264 if (node_online(nid)) 1265 return 0; 1266 1267 mem_hotplug_begin(); 1268 pgdat = hotadd_new_pgdat(nid, 0); 1269 if (!pgdat) { 1270 pr_err("Cannot online node %d due to NULL pgdat\n", nid); 1271 ret = -ENOMEM; 1272 goto out; 1273 } 1274 node_set_online(nid); 1275 ret = register_one_node(nid); 1276 BUG_ON(ret); 1277 1278 if (pgdat->node_zonelists->_zonerefs->zone == NULL) { 1279 mutex_lock(&zonelists_mutex); 1280 build_all_zonelists(NULL, NULL); 1281 mutex_unlock(&zonelists_mutex); 1282 } 1283 1284out: 1285 mem_hotplug_done(); 1286 return ret; 1287} 1288 1289static int check_hotplug_memory_range(u64 start, u64 size) 1290{ 1291 u64 start_pfn = PFN_DOWN(start); 1292 u64 nr_pages = size >> PAGE_SHIFT; 1293 1294 /* Memory range must be aligned with section */ 1295 if ((start_pfn & ~PAGE_SECTION_MASK) || 1296 (nr_pages % PAGES_PER_SECTION) || (!nr_pages)) { 1297 pr_err("Section-unaligned hotplug range: start 0x%llx, size 0x%llx\n", 1298 (unsigned long long)start, 1299 (unsigned long long)size); 1300 return -EINVAL; 1301 } 1302 1303 return 0; 1304} 1305 1306/* 1307 * If movable zone has already been setup, newly added memory should be check. 1308 * If its address is higher than movable zone, it should be added as movable. 1309 * Without this check, movable zone may overlap with other zone. 1310 */ 1311static int should_add_memory_movable(int nid, u64 start, u64 size) 1312{ 1313 unsigned long start_pfn = start >> PAGE_SHIFT; 1314 pg_data_t *pgdat = NODE_DATA(nid); 1315 struct zone *movable_zone = pgdat->node_zones + ZONE_MOVABLE; 1316 1317 if (zone_is_empty(movable_zone)) 1318 return 0; 1319 1320 if (movable_zone->zone_start_pfn <= start_pfn) 1321 return 1; 1322 1323 return 0; 1324} 1325 1326int zone_for_memory(int nid, u64 start, u64 size, int zone_default, 1327 bool for_device) 1328{ 1329#ifdef CONFIG_ZONE_DEVICE 1330 if (for_device) 1331 return ZONE_DEVICE; 1332#endif 1333 if (should_add_memory_movable(nid, start, size)) 1334 return ZONE_MOVABLE; 1335 1336 return zone_default; 1337} 1338 1339static int online_memory_block(struct memory_block *mem, void *arg) 1340{ 1341 return device_online(&mem->dev); 1342} 1343 1344/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ 1345int __ref add_memory_resource(int nid, struct resource *res, bool online) 1346{ 1347 u64 start, size; 1348 pg_data_t *pgdat = NULL; 1349 bool new_pgdat; 1350 bool new_node; 1351 int ret; 1352 1353 start = res->start; 1354 size = resource_size(res); 1355 1356 ret = check_hotplug_memory_range(start, size); 1357 if (ret) 1358 return ret; 1359 1360 { /* Stupid hack to suppress address-never-null warning */ 1361 void *p = NODE_DATA(nid); 1362 new_pgdat = !p; 1363 } 1364 1365 mem_hotplug_begin(); 1366 1367 /* 1368 * Add new range to memblock so that when hotadd_new_pgdat() is called 1369 * to allocate new pgdat, get_pfn_range_for_nid() will be able to find 1370 * this new range and calculate total pages correctly. The range will 1371 * be removed at hot-remove time. 
1372 */ 1373 memblock_add_node(start, size, nid); 1374 1375 new_node = !node_online(nid); 1376 if (new_node) { 1377 pgdat = hotadd_new_pgdat(nid, start); 1378 ret = -ENOMEM; 1379 if (!pgdat) 1380 goto error; 1381 } 1382 1383 /* call arch's memory hotadd */ 1384 ret = arch_add_memory(nid, start, size, false); 1385 1386 if (ret < 0) 1387 goto error; 1388 1389 /* we online node here. we can't roll back from here. */ 1390 node_set_online(nid); 1391 1392 if (new_node) { 1393 ret = register_one_node(nid); 1394 /* 1395 * If sysfs file of new node can't create, cpu on the node 1396 * can't be hot-added. There is no rollback way now. 1397 * So, check by BUG_ON() to catch it reluctantly.. 1398 */ 1399 BUG_ON(ret); 1400 } 1401 1402 /* create new memmap entry */ 1403 firmware_map_add_hotplug(start, start + size, "System RAM"); 1404 1405 /* online pages if requested */ 1406 if (online) 1407 walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), 1408 NULL, online_memory_block); 1409 1410 goto out; 1411 1412error: 1413 /* rollback pgdat allocation and others */ 1414 if (new_pgdat) 1415 rollback_node_hotadd(nid, pgdat); 1416 memblock_remove(start, size); 1417 1418out: 1419 mem_hotplug_done(); 1420 return ret; 1421} 1422EXPORT_SYMBOL_GPL(add_memory_resource); 1423 1424int __ref add_memory(int nid, u64 start, u64 size) 1425{ 1426 struct resource *res; 1427 int ret; 1428 1429 res = register_memory_resource(start, size); 1430 if (IS_ERR(res)) 1431 return PTR_ERR(res); 1432 1433 ret = add_memory_resource(nid, res, memhp_auto_online); 1434 if (ret < 0) 1435 release_memory_resource(res); 1436 return ret; 1437} 1438EXPORT_SYMBOL_GPL(add_memory); 1439 1440#ifdef CONFIG_MEMORY_HOTREMOVE 1441/* 1442 * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy 1443 * set and the size of the free page is given by page_order(). Using this, 1444 * the function determines if the pageblock contains only free pages. 1445 * Due to buddy contraints, a free page at least the size of a pageblock will 1446 * be located at the start of the pageblock 1447 */ 1448static inline int pageblock_free(struct page *page) 1449{ 1450 return PageBuddy(page) && page_order(page) >= pageblock_order; 1451} 1452 1453/* Return the start of the next active pageblock after a given page */ 1454static struct page *next_active_pageblock(struct page *page) 1455{ 1456 /* Ensure the starting page is pageblock-aligned */ 1457 BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1)); 1458 1459 /* If the entire pageblock is free, move to the end of free page */ 1460 if (pageblock_free(page)) { 1461 int order; 1462 /* be careful. we don't have locks, page_order can be changed.*/ 1463 order = page_order(page); 1464 if ((order < MAX_ORDER) && (order >= pageblock_order)) 1465 return page + (1 << order); 1466 } 1467 1468 return page + pageblock_nr_pages; 1469} 1470 1471/* Checks if this range of memory is likely to be hot-removable. */ 1472bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) 1473{ 1474 struct page *page = pfn_to_page(start_pfn); 1475 struct page *end_page = page + nr_pages; 1476 1477 /* Check the starting page of each pageblock within the range */ 1478 for (; page < end_page; page = next_active_pageblock(page)) { 1479 if (!is_pageblock_removable_nolock(page)) 1480 return false; 1481 cond_resched(); 1482 } 1483 1484 /* All pageblocks in the memory block are likely to be hot-removable */ 1485 return true; 1486} 1487 1488/* 1489 * Confirm all pages in a range [start, end) belong to the same zone. 
1490 * When true, return its valid [start, end). 1491 */ 1492int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn, 1493 unsigned long *valid_start, unsigned long *valid_end) 1494{ 1495 unsigned long pfn, sec_end_pfn; 1496 unsigned long start, end; 1497 struct zone *zone = NULL; 1498 struct page *page; 1499 int i; 1500 for (pfn = start_pfn, sec_end_pfn = SECTION_ALIGN_UP(start_pfn + 1); 1501 pfn < end_pfn; 1502 pfn = sec_end_pfn, sec_end_pfn += PAGES_PER_SECTION) { 1503 /* Make sure the memory section is present first */ 1504 if (!present_section_nr(pfn_to_section_nr(pfn))) 1505 continue; 1506 for (; pfn < sec_end_pfn && pfn < end_pfn; 1507 pfn += MAX_ORDER_NR_PAGES) { 1508 i = 0; 1509 /* This is just a CONFIG_HOLES_IN_ZONE check.*/ 1510 while ((i < MAX_ORDER_NR_PAGES) && 1511 !pfn_valid_within(pfn + i)) 1512 i++; 1513 if (i == MAX_ORDER_NR_PAGES || pfn + i >= end_pfn) 1514 continue; 1515 page = pfn_to_page(pfn + i); 1516 if (zone && page_zone(page) != zone) 1517 return 0; 1518 if (!zone) 1519 start = pfn + i; 1520 zone = page_zone(page); 1521 end = pfn + MAX_ORDER_NR_PAGES; 1522 } 1523 } 1524 1525 if (zone) { 1526 *valid_start = start; 1527 *valid_end = min(end, end_pfn); 1528 return 1; 1529 } else { 1530 return 0; 1531 } 1532} 1533 1534/* 1535 * Scan pfn range [start,end) to find movable/migratable pages (LRU pages, 1536 * non-lru movable pages and hugepages). We scan pfn because it's much 1537 * easier than scanning over linked list. This function returns the pfn 1538 * of the first found movable page if it's found, otherwise 0. 1539 */ 1540static unsigned long scan_movable_pages(unsigned long start, unsigned long end) 1541{ 1542 unsigned long pfn; 1543 struct page *page; 1544 for (pfn = start; pfn < end; pfn++) { 1545 if (pfn_valid(pfn)) { 1546 page = pfn_to_page(pfn); 1547 if (PageLRU(page)) 1548 return pfn; 1549 if (__PageMovable(page)) 1550 return pfn; 1551 if (PageHuge(page)) { 1552 if (page_huge_active(page)) 1553 return pfn; 1554 else 1555 pfn = round_up(pfn + 1, 1556 1 << compound_order(page)) - 1; 1557 } 1558 } 1559 } 1560 return 0; 1561} 1562 1563static struct page *new_node_page(struct page *page, unsigned long private, 1564 int **result) 1565{ 1566 gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; 1567 int nid = page_to_nid(page); 1568 nodemask_t nmask = node_states[N_MEMORY]; 1569 struct page *new_page = NULL; 1570 1571 /* 1572 * TODO: allocate a destination hugepage from a nearest neighbor node, 1573 * accordance with memory policy of the user process if possible. For 1574 * now as a simple work-around, we use the next node for destination. 
1575 */ 1576 if (PageHuge(page)) 1577 return alloc_huge_page_node(page_hstate(compound_head(page)), 1578 next_node_in(nid, nmask)); 1579 1580 node_clear(nid, nmask); 1581 1582 if (PageHighMem(page) 1583 || (zone_idx(page_zone(page)) == ZONE_MOVABLE)) 1584 gfp_mask |= __GFP_HIGHMEM; 1585 1586 if (!nodes_empty(nmask)) 1587 new_page = __alloc_pages_nodemask(gfp_mask, 0, 1588 node_zonelist(nid, gfp_mask), &nmask); 1589 if (!new_page) 1590 new_page = __alloc_pages(gfp_mask, 0, 1591 node_zonelist(nid, gfp_mask)); 1592 1593 return new_page; 1594} 1595 1596#define NR_OFFLINE_AT_ONCE_PAGES (256) 1597static int 1598do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) 1599{ 1600 unsigned long pfn; 1601 struct page *page; 1602 int move_pages = NR_OFFLINE_AT_ONCE_PAGES; 1603 int not_managed = 0; 1604 int ret = 0; 1605 LIST_HEAD(source); 1606 1607 for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) { 1608 if (!pfn_valid(pfn)) 1609 continue; 1610 page = pfn_to_page(pfn); 1611 1612 if (PageHuge(page)) { 1613 struct page *head = compound_head(page); 1614 pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1; 1615 if (compound_order(head) > PFN_SECTION_SHIFT) { 1616 ret = -EBUSY; 1617 break; 1618 } 1619 if (isolate_huge_page(page, &source)) 1620 move_pages -= 1 << compound_order(head); 1621 continue; 1622 } 1623 1624 if (!get_page_unless_zero(page)) 1625 continue; 1626 /* 1627 * We can skip free pages. And we can deal with pages on 1628 * LRU and non-lru movable pages. 1629 */ 1630 if (PageLRU(page)) 1631 ret = isolate_lru_page(page); 1632 else 1633 ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE); 1634 if (!ret) { /* Success */ 1635 put_page(page); 1636 list_add_tail(&page->lru, &source); 1637 move_pages--; 1638 if (!__PageMovable(page)) 1639 inc_node_page_state(page, NR_ISOLATED_ANON + 1640 page_is_file_cache(page)); 1641 1642 } else { 1643#ifdef CONFIG_DEBUG_VM 1644 pr_alert("failed to isolate pfn %lx\n", pfn); 1645 dump_page(page, "isolation failed"); 1646#endif 1647 put_page(page); 1648 /* Because we don't have big zone->lock. we should 1649 check this again here. */ 1650 if (page_count(page)) { 1651 not_managed++; 1652 ret = -EBUSY; 1653 break; 1654 } 1655 } 1656 } 1657 if (!list_empty(&source)) { 1658 if (not_managed) { 1659 putback_movable_pages(&source); 1660 goto out; 1661 } 1662 1663 /* Allocate a new page from the nearest neighbor node */ 1664 ret = migrate_pages(&source, new_node_page, NULL, 0, 1665 MIGRATE_SYNC, MR_MEMORY_HOTPLUG); 1666 if (ret) 1667 putback_movable_pages(&source); 1668 } 1669out: 1670 return ret; 1671} 1672 1673/* 1674 * remove from free_area[] and mark all as Reserved. 1675 */ 1676static int 1677offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages, 1678 void *data) 1679{ 1680 __offline_isolated_pages(start, start + nr_pages); 1681 return 0; 1682} 1683 1684static void 1685offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) 1686{ 1687 walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL, 1688 offline_isolated_pages_cb); 1689} 1690 1691/* 1692 * Check all pages in range, recoreded as memory resource, are isolated. 
1693 */ 1694static int 1695check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages, 1696 void *data) 1697{ 1698 int ret; 1699 long offlined = *(long *)data; 1700 ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true); 1701 offlined = nr_pages; 1702 if (!ret) 1703 *(long *)data += offlined; 1704 return ret; 1705} 1706 1707static long 1708check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) 1709{ 1710 long offlined = 0; 1711 int ret; 1712 1713 ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined, 1714 check_pages_isolated_cb); 1715 if (ret < 0) 1716 offlined = (long)ret; 1717 return offlined; 1718} 1719 1720#ifdef CONFIG_MOVABLE_NODE 1721/* 1722 * When CONFIG_MOVABLE_NODE, we permit offlining of a node which doesn't have 1723 * normal memory. 1724 */ 1725static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) 1726{ 1727 return true; 1728} 1729#else /* CONFIG_MOVABLE_NODE */ 1730/* ensure the node has NORMAL memory if it is still online */ 1731static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) 1732{ 1733 struct pglist_data *pgdat = zone->zone_pgdat; 1734 unsigned long present_pages = 0; 1735 enum zone_type zt; 1736 1737 for (zt = 0; zt <= ZONE_NORMAL; zt++) 1738 present_pages += pgdat->node_zones[zt].present_pages; 1739 1740 if (present_pages > nr_pages) 1741 return true; 1742 1743 present_pages = 0; 1744 for (; zt <= ZONE_MOVABLE; zt++) 1745 present_pages += pgdat->node_zones[zt].present_pages; 1746 1747 /* 1748 * we can't offline the last normal memory until all 1749 * higher memory is offlined. 1750 */ 1751 return present_pages == 0; 1752} 1753#endif /* CONFIG_MOVABLE_NODE */ 1754 1755static int __init cmdline_parse_movable_node(char *p) 1756{ 1757#ifdef CONFIG_MOVABLE_NODE 1758 movable_node_enabled = true; 1759#else 1760 pr_warn("movable_node option not supported\n"); 1761#endif 1762 return 0; 1763} 1764early_param("movable_node", cmdline_parse_movable_node); 1765 1766/* check which state of node_states will be changed when offline memory */ 1767static void node_states_check_changes_offline(unsigned long nr_pages, 1768 struct zone *zone, struct memory_notify *arg) 1769{ 1770 struct pglist_data *pgdat = zone->zone_pgdat; 1771 unsigned long present_pages = 0; 1772 enum zone_type zt, zone_last = ZONE_NORMAL; 1773 1774 /* 1775 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY] 1776 * contains nodes which have zones of 0...ZONE_NORMAL, 1777 * set zone_last to ZONE_NORMAL. 1778 * 1779 * If we don't have HIGHMEM nor movable node, 1780 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of 1781 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. 1782 */ 1783 if (N_MEMORY == N_NORMAL_MEMORY) 1784 zone_last = ZONE_MOVABLE; 1785 1786 /* 1787 * check whether node_states[N_NORMAL_MEMORY] will be changed. 1788 * If the memory to be offline is in a zone of 0...zone_last, 1789 * and it is the last present memory, 0...zone_last will 1790 * become empty after offline , thus we can determind we will 1791 * need to clear the node from node_states[N_NORMAL_MEMORY]. 
1792 */ 1793 for (zt = 0; zt <= zone_last; zt++) 1794 present_pages += pgdat->node_zones[zt].present_pages; 1795 if (zone_idx(zone) <= zone_last && nr_pages >= present_pages) 1796 arg->status_change_nid_normal = zone_to_nid(zone); 1797 else 1798 arg->status_change_nid_normal = -1; 1799 1800#ifdef CONFIG_HIGHMEM 1801 /* 1802 * If we have movable node, node_states[N_HIGH_MEMORY] 1803 * contains nodes which have zones of 0...ZONE_HIGHMEM, 1804 * set zone_last to ZONE_HIGHMEM. 1805 * 1806 * If we don't have movable node, node_states[N_NORMAL_MEMORY] 1807 * contains nodes which have zones of 0...ZONE_MOVABLE, 1808 * set zone_last to ZONE_MOVABLE. 1809 */ 1810 zone_last = ZONE_HIGHMEM; 1811 if (N_MEMORY == N_HIGH_MEMORY) 1812 zone_last = ZONE_MOVABLE; 1813 1814 for (; zt <= zone_last; zt++) 1815 present_pages += pgdat->node_zones[zt].present_pages; 1816 if (zone_idx(zone) <= zone_last && nr_pages >= present_pages) 1817 arg->status_change_nid_high = zone_to_nid(zone); 1818 else 1819 arg->status_change_nid_high = -1; 1820#else 1821 arg->status_change_nid_high = arg->status_change_nid_normal; 1822#endif 1823 1824 /* 1825 * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE 1826 */ 1827 zone_last = ZONE_MOVABLE; 1828 1829 /* 1830 * check whether node_states[N_HIGH_MEMORY] will be changed 1831 * If we try to offline the last present @nr_pages from the node, 1832 * we can determind we will need to clear the node from 1833 * node_states[N_HIGH_MEMORY]. 1834 */ 1835 for (; zt <= zone_last; zt++) 1836 present_pages += pgdat->node_zones[zt].present_pages; 1837 if (nr_pages >= present_pages) 1838 arg->status_change_nid = zone_to_nid(zone); 1839 else 1840 arg->status_change_nid = -1; 1841} 1842 1843static void node_states_clear_node(int node, struct memory_notify *arg) 1844{ 1845 if (arg->status_change_nid_normal >= 0) 1846 node_clear_state(node, N_NORMAL_MEMORY); 1847 1848 if ((N_MEMORY != N_NORMAL_MEMORY) && 1849 (arg->status_change_nid_high >= 0)) 1850 node_clear_state(node, N_HIGH_MEMORY); 1851 1852 if ((N_MEMORY != N_HIGH_MEMORY) && 1853 (arg->status_change_nid >= 0)) 1854 node_clear_state(node, N_MEMORY); 1855} 1856 1857static int __ref __offline_pages(unsigned long start_pfn, 1858 unsigned long end_pfn, unsigned long timeout) 1859{ 1860 unsigned long pfn, nr_pages, expire; 1861 long offlined_pages; 1862 int ret, drain, retry_max, node; 1863 unsigned long flags; 1864 unsigned long valid_start, valid_end; 1865 struct zone *zone; 1866 struct memory_notify arg; 1867 1868 /* at least, alignment against pageblock is necessary */ 1869 if (!IS_ALIGNED(start_pfn, pageblock_nr_pages)) 1870 return -EINVAL; 1871 if (!IS_ALIGNED(end_pfn, pageblock_nr_pages)) 1872 return -EINVAL; 1873 /* This makes hotplug much easier...and readable. 1874 we assume this for now. 
.*/ 1875 if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, &valid_end)) 1876 return -EINVAL; 1877 1878 zone = page_zone(pfn_to_page(valid_start)); 1879 node = zone_to_nid(zone); 1880 nr_pages = end_pfn - start_pfn; 1881 1882 if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages)) 1883 return -EINVAL; 1884 1885 /* set above range as isolated */ 1886 ret = start_isolate_page_range(start_pfn, end_pfn, 1887 MIGRATE_MOVABLE, true); 1888 if (ret) 1889 return ret; 1890 1891 arg.start_pfn = start_pfn; 1892 arg.nr_pages = nr_pages; 1893 node_states_check_changes_offline(nr_pages, zone, &arg); 1894 1895 ret = memory_notify(MEM_GOING_OFFLINE, &arg); 1896 ret = notifier_to_errno(ret); 1897 if (ret) 1898 goto failed_removal; 1899 1900 pfn = start_pfn; 1901 expire = jiffies + timeout; 1902 drain = 0; 1903 retry_max = 5; 1904repeat: 1905 /* start memory hot removal */ 1906 ret = -EAGAIN; 1907 if (time_after(jiffies, expire)) 1908 goto failed_removal; 1909 ret = -EINTR; 1910 if (signal_pending(current)) 1911 goto failed_removal; 1912 ret = 0; 1913 if (drain) { 1914 lru_add_drain_all(); 1915 cond_resched(); 1916 drain_all_pages(zone); 1917 } 1918 1919 pfn = scan_movable_pages(start_pfn, end_pfn); 1920 if (pfn) { /* We have movable pages */ 1921 ret = do_migrate_range(pfn, end_pfn); 1922 if (!ret) { 1923 drain = 1; 1924 goto repeat; 1925 } else { 1926 if (ret < 0) 1927 if (--retry_max == 0) 1928 goto failed_removal; 1929 yield(); 1930 drain = 1; 1931 goto repeat; 1932 } 1933 } 1934 /* drain all zone's lru pagevec, this is asynchronous... */ 1935 lru_add_drain_all(); 1936 yield(); 1937 /* drain pcp pages, this is synchronous. */ 1938 drain_all_pages(zone); 1939 /* 1940 * dissolve free hugepages in the memory block before doing offlining 1941 * actually in order to make hugetlbfs's object counting consistent. 1942 */ 1943 ret = dissolve_free_huge_pages(start_pfn, end_pfn); 1944 if (ret) 1945 goto failed_removal; 1946 /* check again */ 1947 offlined_pages = check_pages_isolated(start_pfn, end_pfn); 1948 if (offlined_pages < 0) { 1949 ret = -EBUSY; 1950 goto failed_removal; 1951 } 1952 pr_info("Offlined Pages %ld\n", offlined_pages); 1953 /* Ok, all of our target is isolated. 1954 We cannot do rollback at this point. 
*/ 1955 offline_isolated_pages(start_pfn, end_pfn); 1956 /* reset pagetype flags and makes migrate type to be MOVABLE */ 1957 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); 1958 /* removal success */ 1959 adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages); 1960 zone->present_pages -= offlined_pages; 1961 1962 pgdat_resize_lock(zone->zone_pgdat, &flags); 1963 zone->zone_pgdat->node_present_pages -= offlined_pages; 1964 pgdat_resize_unlock(zone->zone_pgdat, &flags); 1965 1966 init_per_zone_wmark_min(); 1967 1968 if (!populated_zone(zone)) { 1969 zone_pcp_reset(zone); 1970 mutex_lock(&zonelists_mutex); 1971 build_all_zonelists(NULL, NULL); 1972 mutex_unlock(&zonelists_mutex); 1973 } else 1974 zone_pcp_update(zone); 1975 1976 node_states_clear_node(node, &arg); 1977 if (arg.status_change_nid >= 0) { 1978 kswapd_stop(node); 1979 kcompactd_stop(node); 1980 } 1981 1982 vm_total_pages = nr_free_pagecache_pages(); 1983 writeback_set_ratelimit(); 1984 1985 memory_notify(MEM_OFFLINE, &arg); 1986 return 0; 1987 1988failed_removal: 1989 pr_debug("memory offlining [mem %#010llx-%#010llx] failed\n", 1990 (unsigned long long) start_pfn << PAGE_SHIFT, 1991 ((unsigned long long) end_pfn << PAGE_SHIFT) - 1); 1992 memory_notify(MEM_CANCEL_OFFLINE, &arg); 1993 /* pushback to free area */ 1994 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); 1995 return ret; 1996} 1997 1998/* Must be protected by mem_hotplug_begin() */ 1999int offline_pages(unsigned long start_pfn, unsigned long nr_pages) 2000{ 2001 return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); 2002} 2003#endif /* CONFIG_MEMORY_HOTREMOVE */ 2004 2005/** 2006 * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn) 2007 * @start_pfn: start pfn of the memory range 2008 * @end_pfn: end pfn of the memory range 2009 * @arg: argument passed to func 2010 * @func: callback for each memory section walked 2011 * 2012 * This function walks through all present mem sections in range 2013 * [start_pfn, end_pfn) and call func on each mem section. 2014 * 2015 * Returns the return value of func. 2016 */ 2017int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, 2018 void *arg, int (*func)(struct memory_block *, void *)) 2019{ 2020 struct memory_block *mem = NULL; 2021 struct mem_section *section; 2022 unsigned long pfn, section_nr; 2023 int ret; 2024 2025 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 2026 section_nr = pfn_to_section_nr(pfn); 2027 if (!present_section_nr(section_nr)) 2028 continue; 2029 2030 section = __nr_to_section(section_nr); 2031 /* same memblock? 
*/ 2032 if (mem) 2033 if ((section_nr >= mem->start_section_nr) && 2034 (section_nr <= mem->end_section_nr)) 2035 continue; 2036 2037 mem = find_memory_block_hinted(section, mem); 2038 if (!mem) 2039 continue; 2040 2041 ret = func(mem, arg); 2042 if (ret) { 2043 kobject_put(&mem->dev.kobj); 2044 return ret; 2045 } 2046 } 2047 2048 if (mem) 2049 kobject_put(&mem->dev.kobj); 2050 2051 return 0; 2052} 2053 2054#ifdef CONFIG_MEMORY_HOTREMOVE 2055static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) 2056{ 2057 int ret = !is_memblock_offlined(mem); 2058 2059 if (unlikely(ret)) { 2060 phys_addr_t beginpa, endpa; 2061 2062 beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)); 2063 endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1; 2064 pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n", 2065 &beginpa, &endpa); 2066 } 2067 2068 return ret; 2069} 2070 2071static int check_cpu_on_node(pg_data_t *pgdat) 2072{ 2073 int cpu; 2074 2075 for_each_present_cpu(cpu) { 2076 if (cpu_to_node(cpu) == pgdat->node_id) 2077 /* 2078 * the cpu on this node isn't removed, and we can't 2079 * offline this node. 2080 */ 2081 return -EBUSY; 2082 } 2083 2084 return 0; 2085} 2086 2087static void unmap_cpu_on_node(pg_data_t *pgdat) 2088{ 2089#ifdef CONFIG_ACPI_NUMA 2090 int cpu; 2091 2092 for_each_possible_cpu(cpu) 2093 if (cpu_to_node(cpu) == pgdat->node_id) 2094 numa_clear_node(cpu); 2095#endif 2096} 2097 2098static int check_and_unmap_cpu_on_node(pg_data_t *pgdat) 2099{ 2100 int ret; 2101 2102 ret = check_cpu_on_node(pgdat); 2103 if (ret) 2104 return ret; 2105 2106 /* 2107 * the node will be offlined when we come here, so we can clear 2108 * the cpu_to_node() now. 2109 */ 2110 2111 unmap_cpu_on_node(pgdat); 2112 return 0; 2113} 2114 2115/** 2116 * try_offline_node 2117 * 2118 * Offline a node if all memory sections and cpus of the node are removed. 2119 * 2120 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug 2121 * and online/offline operations before this call. 2122 */ 2123void try_offline_node(int nid) 2124{ 2125 pg_data_t *pgdat = NODE_DATA(nid); 2126 unsigned long start_pfn = pgdat->node_start_pfn; 2127 unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages; 2128 unsigned long pfn; 2129 2130 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 2131 unsigned long section_nr = pfn_to_section_nr(pfn); 2132 2133 if (!present_section_nr(section_nr)) 2134 continue; 2135 2136 if (pfn_to_nid(pfn) != nid) 2137 continue; 2138 2139 /* 2140 * some memory sections of this node are not removed, and we 2141 * can't offline node now. 2142 */ 2143 return; 2144 } 2145 2146 if (check_and_unmap_cpu_on_node(pgdat)) 2147 return; 2148 2149 /* 2150 * all memory/cpu of this node are removed, we can offline this 2151 * node now. 2152 */ 2153 node_set_offline(nid); 2154 unregister_one_node(nid); 2155} 2156EXPORT_SYMBOL(try_offline_node); 2157 2158/** 2159 * remove_memory 2160 * 2161 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug 2162 * and online/offline operations before this call, as required by 2163 * try_offline_node(). 2164 */ 2165void __ref remove_memory(int nid, u64 start, u64 size) 2166{ 2167 int ret; 2168 2169 BUG_ON(check_hotplug_memory_range(start, size)); 2170 2171 mem_hotplug_begin(); 2172 2173 /* 2174 * All memory blocks must be offlined before removing memory. Check 2175 * whether all memory blocks in question are offline and trigger a BUG() 2176 * if this is not the case. 
2177 */ 2178 ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL, 2179 check_memblock_offlined_cb); 2180 if (ret) 2181 BUG(); 2182 2183 /* remove memmap entry */ 2184 firmware_map_remove(start, start + size, "System RAM"); 2185 memblock_free(start, size); 2186 memblock_remove(start, size); 2187 2188 arch_remove_memory(start, size); 2189 2190 try_offline_node(nid); 2191 2192 mem_hotplug_done(); 2193} 2194EXPORT_SYMBOL_GPL(remove_memory); 2195#endif /* CONFIG_MEMORY_HOTREMOVE */
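
The listing above exports set_online_page_callback()/restore_online_page_callback() so that a driver (for example a memory-balloon driver) can take over per-page onlining, and it exports the __online_page_* helpers that generic_online_page() is built from. Below is a minimal sketch of an out-of-tree module using that interface; the module and its example_* names are hypothetical illustrations, only the functions it calls are real exports from this file, and the callback simply mirrors what generic_online_page() already does.

/*
 * Illustrative sketch only -- not part of mm/memory_hotplug.c.
 * A hypothetical module that overrides the page-onlining path,
 * in the style of the Xen/Hyper-V balloon drivers.
 */
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/memory_hotplug.h>

/* Called for every page of a memory block as it is onlined. */
static void example_online_page(struct page *page)
{
        /*
         * A real driver could defer or account the page here; this
         * sketch just reproduces generic_online_page() using the
         * exported helpers.
         */
        __online_page_set_limits(page);
        __online_page_increment_counters(page);
        __online_page_free(page);
}

static int __init example_init(void)
{
        /* Fails with -EINVAL if another callback is already registered. */
        return set_online_page_callback(example_online_page);
}

static void __exit example_exit(void)
{
        /* Put generic_online_page() back as the onlining function. */
        restore_online_page_callback(example_online_page);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");

The callback registration only succeeds while generic_online_page() is the current callback, and restore_online_page_callback() only succeeds for the callback that was registered, so paired init/exit use as above keeps the global state consistent; both paths take get_online_mems() internally, so no extra locking is needed in the caller.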