linux/mm/memory_hotplug.c at v4.11-rc1
1/* 2 * linux/mm/memory_hotplug.c 3 * 4 * Copyright (C) 5 */ 6 7#include <linux/stddef.h> 8#include <linux/mm.h> 9#include <linux/sched/signal.h> 10#include <linux/swap.h> 11#include <linux/interrupt.h> 12#include <linux/pagemap.h> 13#include <linux/compiler.h> 14#include <linux/export.h> 15#include <linux/pagevec.h> 16#include <linux/writeback.h> 17#include <linux/slab.h> 18#include <linux/sysctl.h> 19#include <linux/cpu.h> 20#include <linux/memory.h> 21#include <linux/memremap.h> 22#include <linux/memory_hotplug.h> 23#include <linux/highmem.h> 24#include <linux/vmalloc.h> 25#include <linux/ioport.h> 26#include <linux/delay.h> 27#include <linux/migrate.h> 28#include <linux/page-isolation.h> 29#include <linux/pfn.h> 30#include <linux/suspend.h> 31#include <linux/mm_inline.h> 32#include <linux/firmware-map.h> 33#include <linux/stop_machine.h> 34#include <linux/hugetlb.h> 35#include <linux/memblock.h> 36#include <linux/bootmem.h> 37#include <linux/compaction.h> 38 39#include <asm/tlbflush.h> 40 41#include "internal.h" 42 43/* 44 * online_page_callback contains pointer to current page onlining function. 45 * Initially it is generic_online_page(). If it is required it could be 46 * changed by calling set_online_page_callback() for callback registration 47 * and restore_online_page_callback() for generic callback restore. 48 */ 49 50static void generic_online_page(struct page *page); 51 52static online_page_callback_t online_page_callback = generic_online_page; 53static DEFINE_MUTEX(online_page_callback_lock); 54 55/* The same as the cpu_hotplug lock, but for memory hotplug. */ 56static struct { 57 struct task_struct *active_writer; 58 struct mutex lock; /* Synchronizes accesses to refcount, */ 59 /* 60 * Also blocks the new readers during 61 * an ongoing mem hotplug operation. 
62 */ 63 int refcount; 64 65#ifdef CONFIG_DEBUG_LOCK_ALLOC 66 struct lockdep_map dep_map; 67#endif 68} mem_hotplug = { 69 .active_writer = NULL, 70 .lock = __MUTEX_INITIALIZER(mem_hotplug.lock), 71 .refcount = 0, 72#ifdef CONFIG_DEBUG_LOCK_ALLOC 73 .dep_map = {.name = "mem_hotplug.lock" }, 74#endif 75}; 76 77/* Lockdep annotations for get/put_online_mems() and mem_hotplug_begin/end() */ 78#define memhp_lock_acquire_read() lock_map_acquire_read(&mem_hotplug.dep_map) 79#define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map) 80#define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map) 81 82#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE 83bool memhp_auto_online; 84#else 85bool memhp_auto_online = true; 86#endif 87EXPORT_SYMBOL_GPL(memhp_auto_online); 88 89static int __init setup_memhp_default_state(char *str) 90{ 91 if (!strcmp(str, "online")) 92 memhp_auto_online = true; 93 else if (!strcmp(str, "offline")) 94 memhp_auto_online = false; 95 96 return 1; 97} 98__setup("memhp_default_state=", setup_memhp_default_state); 99 100void get_online_mems(void) 101{ 102 might_sleep(); 103 if (mem_hotplug.active_writer == current) 104 return; 105 memhp_lock_acquire_read(); 106 mutex_lock(&mem_hotplug.lock); 107 mem_hotplug.refcount++; 108 mutex_unlock(&mem_hotplug.lock); 109 110} 111 112void put_online_mems(void) 113{ 114 if (mem_hotplug.active_writer == current) 115 return; 116 mutex_lock(&mem_hotplug.lock); 117 118 if (WARN_ON(!mem_hotplug.refcount)) 119 mem_hotplug.refcount++; /* try to fix things up */ 120 121 if (!--mem_hotplug.refcount && unlikely(mem_hotplug.active_writer)) 122 wake_up_process(mem_hotplug.active_writer); 123 mutex_unlock(&mem_hotplug.lock); 124 memhp_lock_release(); 125 126} 127 128void mem_hotplug_begin(void) 129{ 130 assert_held_device_hotplug(); 131 132 mem_hotplug.active_writer = current; 133 134 memhp_lock_acquire(); 135 for (;;) { 136 mutex_lock(&mem_hotplug.lock); 137 if (likely(!mem_hotplug.refcount)) 138 break; 139 __set_current_state(TASK_UNINTERRUPTIBLE); 140 mutex_unlock(&mem_hotplug.lock); 141 schedule(); 142 } 143} 144 145void mem_hotplug_done(void) 146{ 147 mem_hotplug.active_writer = NULL; 148 mutex_unlock(&mem_hotplug.lock); 149 memhp_lock_release(); 150} 151 152/* add this memory to iomem resource */ 153static struct resource *register_memory_resource(u64 start, u64 size) 154{ 155 struct resource *res; 156 res = kzalloc(sizeof(struct resource), GFP_KERNEL); 157 if (!res) 158 return ERR_PTR(-ENOMEM); 159 160 res->name = "System RAM"; 161 res->start = start; 162 res->end = start + size - 1; 163 res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; 164 if (request_resource(&iomem_resource, res) < 0) { 165 pr_debug("System RAM resource %pR cannot be added\n", res); 166 kfree(res); 167 return ERR_PTR(-EEXIST); 168 } 169 return res; 170} 171 172static void release_memory_resource(struct resource *res) 173{ 174 if (!res) 175 return; 176 release_resource(res); 177 kfree(res); 178 return; 179} 180 181#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE 182void get_page_bootmem(unsigned long info, struct page *page, 183 unsigned long type) 184{ 185 page->freelist = (void *)type; 186 SetPagePrivate(page); 187 set_page_private(page, info); 188 page_ref_inc(page); 189} 190 191void put_page_bootmem(struct page *page) 192{ 193 unsigned long type; 194 195 type = (unsigned long) page->freelist; 196 BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || 197 type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE); 198 199 if (page_ref_dec_return(page) == 1) { 200 page->freelist = NULL; 201 
ClearPagePrivate(page); 202 set_page_private(page, 0); 203 INIT_LIST_HEAD(&page->lru); 204 free_reserved_page(page); 205 } 206} 207 208#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE 209#ifndef CONFIG_SPARSEMEM_VMEMMAP 210static void register_page_bootmem_info_section(unsigned long start_pfn) 211{ 212 unsigned long *usemap, mapsize, section_nr, i; 213 struct mem_section *ms; 214 struct page *page, *memmap; 215 216 section_nr = pfn_to_section_nr(start_pfn); 217 ms = __nr_to_section(section_nr); 218 219 /* Get section's memmap address */ 220 memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); 221 222 /* 223 * Get page for the memmap's phys address 224 * XXX: need more consideration for sparse_vmemmap... 225 */ 226 page = virt_to_page(memmap); 227 mapsize = sizeof(struct page) * PAGES_PER_SECTION; 228 mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT; 229 230 /* remember memmap's page */ 231 for (i = 0; i < mapsize; i++, page++) 232 get_page_bootmem(section_nr, page, SECTION_INFO); 233 234 usemap = __nr_to_section(section_nr)->pageblock_flags; 235 page = virt_to_page(usemap); 236 237 mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; 238 239 for (i = 0; i < mapsize; i++, page++) 240 get_page_bootmem(section_nr, page, MIX_SECTION_INFO); 241 242} 243#else /* CONFIG_SPARSEMEM_VMEMMAP */ 244static void register_page_bootmem_info_section(unsigned long start_pfn) 245{ 246 unsigned long *usemap, mapsize, section_nr, i; 247 struct mem_section *ms; 248 struct page *page, *memmap; 249 250 if (!pfn_valid(start_pfn)) 251 return; 252 253 section_nr = pfn_to_section_nr(start_pfn); 254 ms = __nr_to_section(section_nr); 255 256 memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); 257 258 register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION); 259 260 usemap = __nr_to_section(section_nr)->pageblock_flags; 261 page = virt_to_page(usemap); 262 263 mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; 264 265 for (i = 0; i < mapsize; i++, page++) 266 get_page_bootmem(section_nr, page, MIX_SECTION_INFO); 267} 268#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ 269 270void __init register_page_bootmem_info_node(struct pglist_data *pgdat) 271{ 272 unsigned long i, pfn, end_pfn, nr_pages; 273 int node = pgdat->node_id; 274 struct page *page; 275 276 nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT; 277 page = virt_to_page(pgdat); 278 279 for (i = 0; i < nr_pages; i++, page++) 280 get_page_bootmem(node, page, NODE_INFO); 281 282 pfn = pgdat->node_start_pfn; 283 end_pfn = pgdat_end_pfn(pgdat); 284 285 /* register section info */ 286 for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 287 /* 288 * Some platforms can assign the same pfn to multiple nodes - on 289 * node0 as well as nodeN. To avoid registering a pfn against 290 * multiple nodes we check that this pfn does not already 291 * reside in some other nodes. 
292 */ 293 if (pfn_valid(pfn) && (early_pfn_to_nid(pfn) == node)) 294 register_page_bootmem_info_section(pfn); 295 } 296} 297#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */ 298 299static void __meminit grow_zone_span(struct zone *zone, unsigned long start_pfn, 300 unsigned long end_pfn) 301{ 302 unsigned long old_zone_end_pfn; 303 304 zone_span_writelock(zone); 305 306 old_zone_end_pfn = zone_end_pfn(zone); 307 if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn) 308 zone->zone_start_pfn = start_pfn; 309 310 zone->spanned_pages = max(old_zone_end_pfn, end_pfn) - 311 zone->zone_start_pfn; 312 313 zone_span_writeunlock(zone); 314} 315 316static void resize_zone(struct zone *zone, unsigned long start_pfn, 317 unsigned long end_pfn) 318{ 319 zone_span_writelock(zone); 320 321 if (end_pfn - start_pfn) { 322 zone->zone_start_pfn = start_pfn; 323 zone->spanned_pages = end_pfn - start_pfn; 324 } else { 325 /* 326 * make it consist as free_area_init_core(), 327 * if spanned_pages = 0, then keep start_pfn = 0 328 */ 329 zone->zone_start_pfn = 0; 330 zone->spanned_pages = 0; 331 } 332 333 zone_span_writeunlock(zone); 334} 335 336static void fix_zone_id(struct zone *zone, unsigned long start_pfn, 337 unsigned long end_pfn) 338{ 339 enum zone_type zid = zone_idx(zone); 340 int nid = zone->zone_pgdat->node_id; 341 unsigned long pfn; 342 343 for (pfn = start_pfn; pfn < end_pfn; pfn++) 344 set_page_links(pfn_to_page(pfn), zid, nid, pfn); 345} 346 347/* Can fail with -ENOMEM from allocating a wait table with vmalloc() or 348 * alloc_bootmem_node_nopanic()/memblock_virt_alloc_node_nopanic() */ 349static int __ref ensure_zone_is_initialized(struct zone *zone, 350 unsigned long start_pfn, unsigned long num_pages) 351{ 352 if (!zone_is_initialized(zone)) 353 return init_currently_empty_zone(zone, start_pfn, num_pages); 354 355 return 0; 356} 357 358static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, 359 unsigned long start_pfn, unsigned long end_pfn) 360{ 361 int ret; 362 unsigned long flags; 363 unsigned long z1_start_pfn; 364 365 ret = ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn); 366 if (ret) 367 return ret; 368 369 pgdat_resize_lock(z1->zone_pgdat, &flags); 370 371 /* can't move pfns which are higher than @z2 */ 372 if (end_pfn > zone_end_pfn(z2)) 373 goto out_fail; 374 /* the move out part must be at the left most of @z2 */ 375 if (start_pfn > z2->zone_start_pfn) 376 goto out_fail; 377 /* must included/overlap */ 378 if (end_pfn <= z2->zone_start_pfn) 379 goto out_fail; 380 381 /* use start_pfn for z1's start_pfn if z1 is empty */ 382 if (!zone_is_empty(z1)) 383 z1_start_pfn = z1->zone_start_pfn; 384 else 385 z1_start_pfn = start_pfn; 386 387 resize_zone(z1, z1_start_pfn, end_pfn); 388 resize_zone(z2, end_pfn, zone_end_pfn(z2)); 389 390 pgdat_resize_unlock(z1->zone_pgdat, &flags); 391 392 fix_zone_id(z1, start_pfn, end_pfn); 393 394 return 0; 395out_fail: 396 pgdat_resize_unlock(z1->zone_pgdat, &flags); 397 return -1; 398} 399 400static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2, 401 unsigned long start_pfn, unsigned long end_pfn) 402{ 403 int ret; 404 unsigned long flags; 405 unsigned long z2_end_pfn; 406 407 ret = ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn); 408 if (ret) 409 return ret; 410 411 pgdat_resize_lock(z1->zone_pgdat, &flags); 412 413 /* can't move pfns which are lower than @z1 */ 414 if (z1->zone_start_pfn > start_pfn) 415 goto out_fail; 416 /* the move out part mast at the right most of @z1 */ 417 if 
(zone_end_pfn(z1) > end_pfn) 418 goto out_fail; 419 /* must included/overlap */ 420 if (start_pfn >= zone_end_pfn(z1)) 421 goto out_fail; 422 423 /* use end_pfn for z2's end_pfn if z2 is empty */ 424 if (!zone_is_empty(z2)) 425 z2_end_pfn = zone_end_pfn(z2); 426 else 427 z2_end_pfn = end_pfn; 428 429 resize_zone(z1, z1->zone_start_pfn, start_pfn); 430 resize_zone(z2, start_pfn, z2_end_pfn); 431 432 pgdat_resize_unlock(z1->zone_pgdat, &flags); 433 434 fix_zone_id(z2, start_pfn, end_pfn); 435 436 return 0; 437out_fail: 438 pgdat_resize_unlock(z1->zone_pgdat, &flags); 439 return -1; 440} 441 442static struct zone * __meminit move_pfn_range(int zone_shift, 443 unsigned long start_pfn, unsigned long end_pfn) 444{ 445 struct zone *zone = page_zone(pfn_to_page(start_pfn)); 446 int ret = 0; 447 448 if (zone_shift < 0) 449 ret = move_pfn_range_left(zone + zone_shift, zone, 450 start_pfn, end_pfn); 451 else if (zone_shift) 452 ret = move_pfn_range_right(zone, zone + zone_shift, 453 start_pfn, end_pfn); 454 455 if (ret) 456 return NULL; 457 458 return zone + zone_shift; 459} 460 461static void __meminit grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, 462 unsigned long end_pfn) 463{ 464 unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat); 465 466 if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn) 467 pgdat->node_start_pfn = start_pfn; 468 469 pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) - 470 pgdat->node_start_pfn; 471} 472 473static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) 474{ 475 struct pglist_data *pgdat = zone->zone_pgdat; 476 int nr_pages = PAGES_PER_SECTION; 477 int nid = pgdat->node_id; 478 int zone_type; 479 unsigned long flags, pfn; 480 int ret; 481 482 zone_type = zone - pgdat->node_zones; 483 ret = ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages); 484 if (ret) 485 return ret; 486 487 pgdat_resize_lock(zone->zone_pgdat, &flags); 488 grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages); 489 grow_pgdat_span(zone->zone_pgdat, phys_start_pfn, 490 phys_start_pfn + nr_pages); 491 pgdat_resize_unlock(zone->zone_pgdat, &flags); 492 memmap_init_zone(nr_pages, nid, zone_type, 493 phys_start_pfn, MEMMAP_HOTPLUG); 494 495 /* online_page_range is called later and expects pages reserved */ 496 for (pfn = phys_start_pfn; pfn < phys_start_pfn + nr_pages; pfn++) { 497 if (!pfn_valid(pfn)) 498 continue; 499 500 SetPageReserved(pfn_to_page(pfn)); 501 } 502 return 0; 503} 504 505static int __meminit __add_section(int nid, struct zone *zone, 506 unsigned long phys_start_pfn) 507{ 508 int ret; 509 510 if (pfn_valid(phys_start_pfn)) 511 return -EEXIST; 512 513 ret = sparse_add_one_section(zone, phys_start_pfn); 514 515 if (ret < 0) 516 return ret; 517 518 ret = __add_zone(zone, phys_start_pfn); 519 520 if (ret < 0) 521 return ret; 522 523 return register_new_memory(nid, __pfn_to_section(phys_start_pfn)); 524} 525 526/* 527 * Reasonably generic function for adding memory. It is 528 * expected that archs that support memory hotplug will 529 * call this function after deciding the zone to which to 530 * add the new pages. 
531 */ 532int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, 533 unsigned long nr_pages) 534{ 535 unsigned long i; 536 int err = 0; 537 int start_sec, end_sec; 538 struct vmem_altmap *altmap; 539 540 clear_zone_contiguous(zone); 541 542 /* during initialize mem_map, align hot-added range to section */ 543 start_sec = pfn_to_section_nr(phys_start_pfn); 544 end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); 545 546 altmap = to_vmem_altmap((unsigned long) pfn_to_page(phys_start_pfn)); 547 if (altmap) { 548 /* 549 * Validate altmap is within bounds of the total request 550 */ 551 if (altmap->base_pfn != phys_start_pfn 552 || vmem_altmap_offset(altmap) > nr_pages) { 553 pr_warn_once("memory add fail, invalid altmap\n"); 554 err = -EINVAL; 555 goto out; 556 } 557 altmap->alloc = 0; 558 } 559 560 for (i = start_sec; i <= end_sec; i++) { 561 err = __add_section(nid, zone, section_nr_to_pfn(i)); 562 563 /* 564 * EEXIST is finally dealt with by ioresource collision 565 * check. see add_memory() => register_memory_resource() 566 * Warning will be printed if there is collision. 567 */ 568 if (err && (err != -EEXIST)) 569 break; 570 err = 0; 571 } 572 vmemmap_populate_print_last(); 573out: 574 set_zone_contiguous(zone); 575 return err; 576} 577EXPORT_SYMBOL_GPL(__add_pages); 578 579#ifdef CONFIG_MEMORY_HOTREMOVE 580/* find the smallest valid pfn in the range [start_pfn, end_pfn) */ 581static int find_smallest_section_pfn(int nid, struct zone *zone, 582 unsigned long start_pfn, 583 unsigned long end_pfn) 584{ 585 struct mem_section *ms; 586 587 for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) { 588 ms = __pfn_to_section(start_pfn); 589 590 if (unlikely(!valid_section(ms))) 591 continue; 592 593 if (unlikely(pfn_to_nid(start_pfn) != nid)) 594 continue; 595 596 if (zone && zone != page_zone(pfn_to_page(start_pfn))) 597 continue; 598 599 return start_pfn; 600 } 601 602 return 0; 603} 604 605/* find the biggest valid pfn in the range [start_pfn, end_pfn). */ 606static int find_biggest_section_pfn(int nid, struct zone *zone, 607 unsigned long start_pfn, 608 unsigned long end_pfn) 609{ 610 struct mem_section *ms; 611 unsigned long pfn; 612 613 /* pfn is the end pfn of a memory section. */ 614 pfn = end_pfn - 1; 615 for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) { 616 ms = __pfn_to_section(pfn); 617 618 if (unlikely(!valid_section(ms))) 619 continue; 620 621 if (unlikely(pfn_to_nid(pfn) != nid)) 622 continue; 623 624 if (zone && zone != page_zone(pfn_to_page(pfn))) 625 continue; 626 627 return pfn; 628 } 629 630 return 0; 631} 632 633static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, 634 unsigned long end_pfn) 635{ 636 unsigned long zone_start_pfn = zone->zone_start_pfn; 637 unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */ 638 unsigned long zone_end_pfn = z; 639 unsigned long pfn; 640 struct mem_section *ms; 641 int nid = zone_to_nid(zone); 642 643 zone_span_writelock(zone); 644 if (zone_start_pfn == start_pfn) { 645 /* 646 * If the section is smallest section in the zone, it need 647 * shrink zone->zone_start_pfn and zone->zone_spanned_pages. 648 * In this case, we find second smallest valid mem_section 649 * for shrinking zone. 
650 */ 651 pfn = find_smallest_section_pfn(nid, zone, end_pfn, 652 zone_end_pfn); 653 if (pfn) { 654 zone->zone_start_pfn = pfn; 655 zone->spanned_pages = zone_end_pfn - pfn; 656 } 657 } else if (zone_end_pfn == end_pfn) { 658 /* 659 * If the section is biggest section in the zone, it need 660 * shrink zone->spanned_pages. 661 * In this case, we find second biggest valid mem_section for 662 * shrinking zone. 663 */ 664 pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn, 665 start_pfn); 666 if (pfn) 667 zone->spanned_pages = pfn - zone_start_pfn + 1; 668 } 669 670 /* 671 * The section is not biggest or smallest mem_section in the zone, it 672 * only creates a hole in the zone. So in this case, we need not 673 * change the zone. But perhaps, the zone has only hole data. Thus 674 * it check the zone has only hole or not. 675 */ 676 pfn = zone_start_pfn; 677 for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) { 678 ms = __pfn_to_section(pfn); 679 680 if (unlikely(!valid_section(ms))) 681 continue; 682 683 if (page_zone(pfn_to_page(pfn)) != zone) 684 continue; 685 686 /* If the section is current section, it continues the loop */ 687 if (start_pfn == pfn) 688 continue; 689 690 /* If we find valid section, we have nothing to do */ 691 zone_span_writeunlock(zone); 692 return; 693 } 694 695 /* The zone has no valid section */ 696 zone->zone_start_pfn = 0; 697 zone->spanned_pages = 0; 698 zone_span_writeunlock(zone); 699} 700 701static void shrink_pgdat_span(struct pglist_data *pgdat, 702 unsigned long start_pfn, unsigned long end_pfn) 703{ 704 unsigned long pgdat_start_pfn = pgdat->node_start_pfn; 705 unsigned long p = pgdat_end_pfn(pgdat); /* pgdat_end_pfn namespace clash */ 706 unsigned long pgdat_end_pfn = p; 707 unsigned long pfn; 708 struct mem_section *ms; 709 int nid = pgdat->node_id; 710 711 if (pgdat_start_pfn == start_pfn) { 712 /* 713 * If the section is smallest section in the pgdat, it need 714 * shrink pgdat->node_start_pfn and pgdat->node_spanned_pages. 715 * In this case, we find second smallest valid mem_section 716 * for shrinking zone. 717 */ 718 pfn = find_smallest_section_pfn(nid, NULL, end_pfn, 719 pgdat_end_pfn); 720 if (pfn) { 721 pgdat->node_start_pfn = pfn; 722 pgdat->node_spanned_pages = pgdat_end_pfn - pfn; 723 } 724 } else if (pgdat_end_pfn == end_pfn) { 725 /* 726 * If the section is biggest section in the pgdat, it need 727 * shrink pgdat->node_spanned_pages. 728 * In this case, we find second biggest valid mem_section for 729 * shrinking zone. 730 */ 731 pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn, 732 start_pfn); 733 if (pfn) 734 pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1; 735 } 736 737 /* 738 * If the section is not biggest or smallest mem_section in the pgdat, 739 * it only creates a hole in the pgdat. So in this case, we need not 740 * change the pgdat. 741 * But perhaps, the pgdat has only hole data. Thus it check the pgdat 742 * has only hole or not. 
743 */ 744 pfn = pgdat_start_pfn; 745 for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) { 746 ms = __pfn_to_section(pfn); 747 748 if (unlikely(!valid_section(ms))) 749 continue; 750 751 if (pfn_to_nid(pfn) != nid) 752 continue; 753 754 /* If the section is current section, it continues the loop */ 755 if (start_pfn == pfn) 756 continue; 757 758 /* If we find valid section, we have nothing to do */ 759 return; 760 } 761 762 /* The pgdat has no valid section */ 763 pgdat->node_start_pfn = 0; 764 pgdat->node_spanned_pages = 0; 765} 766 767static void __remove_zone(struct zone *zone, unsigned long start_pfn) 768{ 769 struct pglist_data *pgdat = zone->zone_pgdat; 770 int nr_pages = PAGES_PER_SECTION; 771 int zone_type; 772 unsigned long flags; 773 774 zone_type = zone - pgdat->node_zones; 775 776 pgdat_resize_lock(zone->zone_pgdat, &flags); 777 shrink_zone_span(zone, start_pfn, start_pfn + nr_pages); 778 shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages); 779 pgdat_resize_unlock(zone->zone_pgdat, &flags); 780} 781 782static int __remove_section(struct zone *zone, struct mem_section *ms, 783 unsigned long map_offset) 784{ 785 unsigned long start_pfn; 786 int scn_nr; 787 int ret = -EINVAL; 788 789 if (!valid_section(ms)) 790 return ret; 791 792 ret = unregister_memory_section(ms); 793 if (ret) 794 return ret; 795 796 scn_nr = __section_nr(ms); 797 start_pfn = section_nr_to_pfn(scn_nr); 798 __remove_zone(zone, start_pfn); 799 800 sparse_remove_one_section(zone, ms, map_offset); 801 return 0; 802} 803 804/** 805 * __remove_pages() - remove sections of pages from a zone 806 * @zone: zone from which pages need to be removed 807 * @phys_start_pfn: starting pageframe (must be aligned to start of a section) 808 * @nr_pages: number of pages to remove (must be multiple of section size) 809 * 810 * Generic helper function to remove section mappings and sysfs entries 811 * for the section of the memory we are removing. Caller needs to make 812 * sure that pages are marked reserved and zones are adjust properly by 813 * calling offline_pages(). 
814 */ 815int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, 816 unsigned long nr_pages) 817{ 818 unsigned long i; 819 unsigned long map_offset = 0; 820 int sections_to_remove, ret = 0; 821 822 /* In the ZONE_DEVICE case device driver owns the memory region */ 823 if (is_dev_zone(zone)) { 824 struct page *page = pfn_to_page(phys_start_pfn); 825 struct vmem_altmap *altmap; 826 827 altmap = to_vmem_altmap((unsigned long) page); 828 if (altmap) 829 map_offset = vmem_altmap_offset(altmap); 830 } else { 831 resource_size_t start, size; 832 833 start = phys_start_pfn << PAGE_SHIFT; 834 size = nr_pages * PAGE_SIZE; 835 836 ret = release_mem_region_adjustable(&iomem_resource, start, 837 size); 838 if (ret) { 839 resource_size_t endres = start + size - 1; 840 841 pr_warn("Unable to release resource <%pa-%pa> (%d)\n", 842 &start, &endres, ret); 843 } 844 } 845 846 clear_zone_contiguous(zone); 847 848 /* 849 * We can only remove entire sections 850 */ 851 BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); 852 BUG_ON(nr_pages % PAGES_PER_SECTION); 853 854 sections_to_remove = nr_pages / PAGES_PER_SECTION; 855 for (i = 0; i < sections_to_remove; i++) { 856 unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; 857 858 ret = __remove_section(zone, __pfn_to_section(pfn), map_offset); 859 map_offset = 0; 860 if (ret) 861 break; 862 } 863 864 set_zone_contiguous(zone); 865 866 return ret; 867} 868#endif /* CONFIG_MEMORY_HOTREMOVE */ 869 870int set_online_page_callback(online_page_callback_t callback) 871{ 872 int rc = -EINVAL; 873 874 get_online_mems(); 875 mutex_lock(&online_page_callback_lock); 876 877 if (online_page_callback == generic_online_page) { 878 online_page_callback = callback; 879 rc = 0; 880 } 881 882 mutex_unlock(&online_page_callback_lock); 883 put_online_mems(); 884 885 return rc; 886} 887EXPORT_SYMBOL_GPL(set_online_page_callback); 888 889int restore_online_page_callback(online_page_callback_t callback) 890{ 891 int rc = -EINVAL; 892 893 get_online_mems(); 894 mutex_lock(&online_page_callback_lock); 895 896 if (online_page_callback == callback) { 897 online_page_callback = generic_online_page; 898 rc = 0; 899 } 900 901 mutex_unlock(&online_page_callback_lock); 902 put_online_mems(); 903 904 return rc; 905} 906EXPORT_SYMBOL_GPL(restore_online_page_callback); 907 908void __online_page_set_limits(struct page *page) 909{ 910} 911EXPORT_SYMBOL_GPL(__online_page_set_limits); 912 913void __online_page_increment_counters(struct page *page) 914{ 915 adjust_managed_page_count(page, 1); 916} 917EXPORT_SYMBOL_GPL(__online_page_increment_counters); 918 919void __online_page_free(struct page *page) 920{ 921 __free_reserved_page(page); 922} 923EXPORT_SYMBOL_GPL(__online_page_free); 924 925static void generic_online_page(struct page *page) 926{ 927 __online_page_set_limits(page); 928 __online_page_increment_counters(page); 929 __online_page_free(page); 930} 931 932static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, 933 void *arg) 934{ 935 unsigned long i; 936 unsigned long onlined_pages = *(unsigned long *)arg; 937 struct page *page; 938 if (PageReserved(pfn_to_page(start_pfn))) 939 for (i = 0; i < nr_pages; i++) { 940 page = pfn_to_page(start_pfn + i); 941 (*online_page_callback)(page); 942 onlined_pages++; 943 } 944 *(unsigned long *)arg = onlined_pages; 945 return 0; 946} 947 948#ifdef CONFIG_MOVABLE_NODE 949/* 950 * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have 951 * normal memory. 
952 */ 953static bool can_online_high_movable(struct zone *zone) 954{ 955 return true; 956} 957#else /* CONFIG_MOVABLE_NODE */ 958/* ensure every online node has NORMAL memory */ 959static bool can_online_high_movable(struct zone *zone) 960{ 961 return node_state(zone_to_nid(zone), N_NORMAL_MEMORY); 962} 963#endif /* CONFIG_MOVABLE_NODE */ 964 965/* check which state of node_states will be changed when online memory */ 966static void node_states_check_changes_online(unsigned long nr_pages, 967 struct zone *zone, struct memory_notify *arg) 968{ 969 int nid = zone_to_nid(zone); 970 enum zone_type zone_last = ZONE_NORMAL; 971 972 /* 973 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY] 974 * contains nodes which have zones of 0...ZONE_NORMAL, 975 * set zone_last to ZONE_NORMAL. 976 * 977 * If we don't have HIGHMEM nor movable node, 978 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of 979 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. 980 */ 981 if (N_MEMORY == N_NORMAL_MEMORY) 982 zone_last = ZONE_MOVABLE; 983 984 /* 985 * if the memory to be online is in a zone of 0...zone_last, and 986 * the zones of 0...zone_last don't have memory before online, we will 987 * need to set the node to node_states[N_NORMAL_MEMORY] after 988 * the memory is online. 989 */ 990 if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY)) 991 arg->status_change_nid_normal = nid; 992 else 993 arg->status_change_nid_normal = -1; 994 995#ifdef CONFIG_HIGHMEM 996 /* 997 * If we have movable node, node_states[N_HIGH_MEMORY] 998 * contains nodes which have zones of 0...ZONE_HIGHMEM, 999 * set zone_last to ZONE_HIGHMEM. 1000 * 1001 * If we don't have movable node, node_states[N_NORMAL_MEMORY] 1002 * contains nodes which have zones of 0...ZONE_MOVABLE, 1003 * set zone_last to ZONE_MOVABLE. 1004 */ 1005 zone_last = ZONE_HIGHMEM; 1006 if (N_MEMORY == N_HIGH_MEMORY) 1007 zone_last = ZONE_MOVABLE; 1008 1009 if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY)) 1010 arg->status_change_nid_high = nid; 1011 else 1012 arg->status_change_nid_high = -1; 1013#else 1014 arg->status_change_nid_high = arg->status_change_nid_normal; 1015#endif 1016 1017 /* 1018 * if the node don't have memory befor online, we will need to 1019 * set the node to node_states[N_MEMORY] after the memory 1020 * is online. 
1021 */ 1022 if (!node_state(nid, N_MEMORY)) 1023 arg->status_change_nid = nid; 1024 else 1025 arg->status_change_nid = -1; 1026} 1027 1028static void node_states_set_node(int node, struct memory_notify *arg) 1029{ 1030 if (arg->status_change_nid_normal >= 0) 1031 node_set_state(node, N_NORMAL_MEMORY); 1032 1033 if (arg->status_change_nid_high >= 0) 1034 node_set_state(node, N_HIGH_MEMORY); 1035 1036 node_set_state(node, N_MEMORY); 1037} 1038 1039bool zone_can_shift(unsigned long pfn, unsigned long nr_pages, 1040 enum zone_type target, int *zone_shift) 1041{ 1042 struct zone *zone = page_zone(pfn_to_page(pfn)); 1043 enum zone_type idx = zone_idx(zone); 1044 int i; 1045 1046 *zone_shift = 0; 1047 1048 if (idx < target) { 1049 /* pages must be at end of current zone */ 1050 if (pfn + nr_pages != zone_end_pfn(zone)) 1051 return false; 1052 1053 /* no zones in use between current zone and target */ 1054 for (i = idx + 1; i < target; i++) 1055 if (zone_is_initialized(zone - idx + i)) 1056 return false; 1057 } 1058 1059 if (target < idx) { 1060 /* pages must be at beginning of current zone */ 1061 if (pfn != zone->zone_start_pfn) 1062 return false; 1063 1064 /* no zones in use between current zone and target */ 1065 for (i = target + 1; i < idx; i++) 1066 if (zone_is_initialized(zone - idx + i)) 1067 return false; 1068 } 1069 1070 *zone_shift = target - idx; 1071 return true; 1072} 1073 1074/* Must be protected by mem_hotplug_begin() */ 1075int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) 1076{ 1077 unsigned long flags; 1078 unsigned long onlined_pages = 0; 1079 struct zone *zone; 1080 int need_zonelists_rebuild = 0; 1081 int nid; 1082 int ret; 1083 struct memory_notify arg; 1084 int zone_shift = 0; 1085 1086 /* 1087 * This doesn't need a lock to do pfn_to_page(). 1088 * The section can't be removed here because of the 1089 * memory_block->state_mutex. 1090 */ 1091 zone = page_zone(pfn_to_page(pfn)); 1092 1093 if ((zone_idx(zone) > ZONE_NORMAL || 1094 online_type == MMOP_ONLINE_MOVABLE) && 1095 !can_online_high_movable(zone)) 1096 return -EINVAL; 1097 1098 if (online_type == MMOP_ONLINE_KERNEL) { 1099 if (!zone_can_shift(pfn, nr_pages, ZONE_NORMAL, &zone_shift)) 1100 return -EINVAL; 1101 } else if (online_type == MMOP_ONLINE_MOVABLE) { 1102 if (!zone_can_shift(pfn, nr_pages, ZONE_MOVABLE, &zone_shift)) 1103 return -EINVAL; 1104 } 1105 1106 zone = move_pfn_range(zone_shift, pfn, pfn + nr_pages); 1107 if (!zone) 1108 return -EINVAL; 1109 1110 arg.start_pfn = pfn; 1111 arg.nr_pages = nr_pages; 1112 node_states_check_changes_online(nr_pages, zone, &arg); 1113 1114 nid = zone_to_nid(zone); 1115 1116 ret = memory_notify(MEM_GOING_ONLINE, &arg); 1117 ret = notifier_to_errno(ret); 1118 if (ret) 1119 goto failed_addition; 1120 1121 /* 1122 * If this zone is not populated, then it is not in zonelist. 1123 * This means the page allocator ignores this zone. 1124 * So, zonelist must be updated after online. 
1125 */ 1126 mutex_lock(&zonelists_mutex); 1127 if (!populated_zone(zone)) { 1128 need_zonelists_rebuild = 1; 1129 build_all_zonelists(NULL, zone); 1130 } 1131 1132 ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, 1133 online_pages_range); 1134 if (ret) { 1135 if (need_zonelists_rebuild) 1136 zone_pcp_reset(zone); 1137 mutex_unlock(&zonelists_mutex); 1138 goto failed_addition; 1139 } 1140 1141 zone->present_pages += onlined_pages; 1142 1143 pgdat_resize_lock(zone->zone_pgdat, &flags); 1144 zone->zone_pgdat->node_present_pages += onlined_pages; 1145 pgdat_resize_unlock(zone->zone_pgdat, &flags); 1146 1147 if (onlined_pages) { 1148 node_states_set_node(nid, &arg); 1149 if (need_zonelists_rebuild) 1150 build_all_zonelists(NULL, NULL); 1151 else 1152 zone_pcp_update(zone); 1153 } 1154 1155 mutex_unlock(&zonelists_mutex); 1156 1157 init_per_zone_wmark_min(); 1158 1159 if (onlined_pages) { 1160 kswapd_run(nid); 1161 kcompactd_run(nid); 1162 } 1163 1164 vm_total_pages = nr_free_pagecache_pages(); 1165 1166 writeback_set_ratelimit(); 1167 1168 if (onlined_pages) 1169 memory_notify(MEM_ONLINE, &arg); 1170 return 0; 1171 1172failed_addition: 1173 pr_debug("online_pages [mem %#010llx-%#010llx] failed\n", 1174 (unsigned long long) pfn << PAGE_SHIFT, 1175 (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1); 1176 memory_notify(MEM_CANCEL_ONLINE, &arg); 1177 return ret; 1178} 1179#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ 1180 1181static void reset_node_present_pages(pg_data_t *pgdat) 1182{ 1183 struct zone *z; 1184 1185 for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) 1186 z->present_pages = 0; 1187 1188 pgdat->node_present_pages = 0; 1189} 1190 1191/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ 1192static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) 1193{ 1194 struct pglist_data *pgdat; 1195 unsigned long zones_size[MAX_NR_ZONES] = {0}; 1196 unsigned long zholes_size[MAX_NR_ZONES] = {0}; 1197 unsigned long start_pfn = PFN_DOWN(start); 1198 1199 pgdat = NODE_DATA(nid); 1200 if (!pgdat) { 1201 pgdat = arch_alloc_nodedata(nid); 1202 if (!pgdat) 1203 return NULL; 1204 1205 arch_refresh_nodedata(nid, pgdat); 1206 } else { 1207 /* Reset the nr_zones, order and classzone_idx before reuse */ 1208 pgdat->nr_zones = 0; 1209 pgdat->kswapd_order = 0; 1210 pgdat->kswapd_classzone_idx = 0; 1211 } 1212 1213 /* we can use NODE_DATA(nid) from here */ 1214 1215 /* init node's zones as empty zones, we don't have any present pages.*/ 1216 free_area_init_node(nid, zones_size, start_pfn, zholes_size); 1217 pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat); 1218 1219 /* 1220 * The node we allocated has no zone fallback lists. For avoiding 1221 * to access not-initialized zonelist, build here. 1222 */ 1223 mutex_lock(&zonelists_mutex); 1224 build_all_zonelists(pgdat, NULL); 1225 mutex_unlock(&zonelists_mutex); 1226 1227 /* 1228 * zone->managed_pages is set to an approximate value in 1229 * free_area_init_core(), which will cause 1230 * /sys/device/system/node/nodeX/meminfo has wrong data. 1231 * So reset it to 0 before any memory is onlined. 1232 */ 1233 reset_node_managed_pages(pgdat); 1234 1235 /* 1236 * When memory is hot-added, all the memory is in offline state. So 1237 * clear all zones' present_pages because they will be updated in 1238 * online_pages() and offline_pages(). 
1239 */ 1240 reset_node_present_pages(pgdat); 1241 1242 return pgdat; 1243} 1244 1245static void rollback_node_hotadd(int nid, pg_data_t *pgdat) 1246{ 1247 arch_refresh_nodedata(nid, NULL); 1248 free_percpu(pgdat->per_cpu_nodestats); 1249 arch_free_nodedata(pgdat); 1250 return; 1251} 1252 1253 1254/** 1255 * try_online_node - online a node if offlined 1256 * 1257 * called by cpu_up() to online a node without onlined memory. 1258 */ 1259int try_online_node(int nid) 1260{ 1261 pg_data_t *pgdat; 1262 int ret; 1263 1264 if (node_online(nid)) 1265 return 0; 1266 1267 mem_hotplug_begin(); 1268 pgdat = hotadd_new_pgdat(nid, 0); 1269 if (!pgdat) { 1270 pr_err("Cannot online node %d due to NULL pgdat\n", nid); 1271 ret = -ENOMEM; 1272 goto out; 1273 } 1274 node_set_online(nid); 1275 ret = register_one_node(nid); 1276 BUG_ON(ret); 1277 1278 if (pgdat->node_zonelists->_zonerefs->zone == NULL) { 1279 mutex_lock(&zonelists_mutex); 1280 build_all_zonelists(NULL, NULL); 1281 mutex_unlock(&zonelists_mutex); 1282 } 1283 1284out: 1285 mem_hotplug_done(); 1286 return ret; 1287} 1288 1289static int check_hotplug_memory_range(u64 start, u64 size) 1290{ 1291 u64 start_pfn = PFN_DOWN(start); 1292 u64 nr_pages = size >> PAGE_SHIFT; 1293 1294 /* Memory range must be aligned with section */ 1295 if ((start_pfn & ~PAGE_SECTION_MASK) || 1296 (nr_pages % PAGES_PER_SECTION) || (!nr_pages)) { 1297 pr_err("Section-unaligned hotplug range: start 0x%llx, size 0x%llx\n", 1298 (unsigned long long)start, 1299 (unsigned long long)size); 1300 return -EINVAL; 1301 } 1302 1303 return 0; 1304} 1305 1306/* 1307 * If movable zone has already been setup, newly added memory should be check. 1308 * If its address is higher than movable zone, it should be added as movable. 1309 * Without this check, movable zone may overlap with other zone. 1310 */ 1311static int should_add_memory_movable(int nid, u64 start, u64 size) 1312{ 1313 unsigned long start_pfn = start >> PAGE_SHIFT; 1314 pg_data_t *pgdat = NODE_DATA(nid); 1315 struct zone *movable_zone = pgdat->node_zones + ZONE_MOVABLE; 1316 1317 if (zone_is_empty(movable_zone)) 1318 return 0; 1319 1320 if (movable_zone->zone_start_pfn <= start_pfn) 1321 return 1; 1322 1323 return 0; 1324} 1325 1326int zone_for_memory(int nid, u64 start, u64 size, int zone_default, 1327 bool for_device) 1328{ 1329#ifdef CONFIG_ZONE_DEVICE 1330 if (for_device) 1331 return ZONE_DEVICE; 1332#endif 1333 if (should_add_memory_movable(nid, start, size)) 1334 return ZONE_MOVABLE; 1335 1336 return zone_default; 1337} 1338 1339static int online_memory_block(struct memory_block *mem, void *arg) 1340{ 1341 return device_online(&mem->dev); 1342} 1343 1344/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ 1345int __ref add_memory_resource(int nid, struct resource *res, bool online) 1346{ 1347 u64 start, size; 1348 pg_data_t *pgdat = NULL; 1349 bool new_pgdat; 1350 bool new_node; 1351 int ret; 1352 1353 start = res->start; 1354 size = resource_size(res); 1355 1356 ret = check_hotplug_memory_range(start, size); 1357 if (ret) 1358 return ret; 1359 1360 { /* Stupid hack to suppress address-never-null warning */ 1361 void *p = NODE_DATA(nid); 1362 new_pgdat = !p; 1363 } 1364 1365 mem_hotplug_begin(); 1366 1367 /* 1368 * Add new range to memblock so that when hotadd_new_pgdat() is called 1369 * to allocate new pgdat, get_pfn_range_for_nid() will be able to find 1370 * this new range and calculate total pages correctly. The range will 1371 * be removed at hot-remove time. 
1372 */ 1373 memblock_add_node(start, size, nid); 1374 1375 new_node = !node_online(nid); 1376 if (new_node) { 1377 pgdat = hotadd_new_pgdat(nid, start); 1378 ret = -ENOMEM; 1379 if (!pgdat) 1380 goto error; 1381 } 1382 1383 /* call arch's memory hotadd */ 1384 ret = arch_add_memory(nid, start, size, false); 1385 1386 if (ret < 0) 1387 goto error; 1388 1389 /* we online node here. we can't roll back from here. */ 1390 node_set_online(nid); 1391 1392 if (new_node) { 1393 ret = register_one_node(nid); 1394 /* 1395 * If sysfs file of new node can't create, cpu on the node 1396 * can't be hot-added. There is no rollback way now. 1397 * So, check by BUG_ON() to catch it reluctantly.. 1398 */ 1399 BUG_ON(ret); 1400 } 1401 1402 /* create new memmap entry */ 1403 firmware_map_add_hotplug(start, start + size, "System RAM"); 1404 1405 /* online pages if requested */ 1406 if (online) 1407 walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), 1408 NULL, online_memory_block); 1409 1410 goto out; 1411 1412error: 1413 /* rollback pgdat allocation and others */ 1414 if (new_pgdat) 1415 rollback_node_hotadd(nid, pgdat); 1416 memblock_remove(start, size); 1417 1418out: 1419 mem_hotplug_done(); 1420 return ret; 1421} 1422EXPORT_SYMBOL_GPL(add_memory_resource); 1423 1424int __ref add_memory(int nid, u64 start, u64 size) 1425{ 1426 struct resource *res; 1427 int ret; 1428 1429 res = register_memory_resource(start, size); 1430 if (IS_ERR(res)) 1431 return PTR_ERR(res); 1432 1433 ret = add_memory_resource(nid, res, memhp_auto_online); 1434 if (ret < 0) 1435 release_memory_resource(res); 1436 return ret; 1437} 1438EXPORT_SYMBOL_GPL(add_memory); 1439 1440#ifdef CONFIG_MEMORY_HOTREMOVE 1441/* 1442 * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy 1443 * set and the size of the free page is given by page_order(). Using this, 1444 * the function determines if the pageblock contains only free pages. 1445 * Due to buddy contraints, a free page at least the size of a pageblock will 1446 * be located at the start of the pageblock 1447 */ 1448static inline int pageblock_free(struct page *page) 1449{ 1450 return PageBuddy(page) && page_order(page) >= pageblock_order; 1451} 1452 1453/* Return the start of the next active pageblock after a given page */ 1454static struct page *next_active_pageblock(struct page *page) 1455{ 1456 /* Ensure the starting page is pageblock-aligned */ 1457 BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1)); 1458 1459 /* If the entire pageblock is free, move to the end of free page */ 1460 if (pageblock_free(page)) { 1461 int order; 1462 /* be careful. we don't have locks, page_order can be changed.*/ 1463 order = page_order(page); 1464 if ((order < MAX_ORDER) && (order >= pageblock_order)) 1465 return page + (1 << order); 1466 } 1467 1468 return page + pageblock_nr_pages; 1469} 1470 1471/* Checks if this range of memory is likely to be hot-removable. */ 1472bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) 1473{ 1474 struct page *page = pfn_to_page(start_pfn); 1475 struct page *end_page = page + nr_pages; 1476 1477 /* Check the starting page of each pageblock within the range */ 1478 for (; page < end_page; page = next_active_pageblock(page)) { 1479 if (!is_pageblock_removable_nolock(page)) 1480 return false; 1481 cond_resched(); 1482 } 1483 1484 /* All pageblocks in the memory block are likely to be hot-removable */ 1485 return true; 1486} 1487 1488/* 1489 * Confirm all pages in a range [start, end) belong to the same zone. 
1490 * When true, return its valid [start, end). 1491 */ 1492int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn, 1493 unsigned long *valid_start, unsigned long *valid_end) 1494{ 1495 unsigned long pfn, sec_end_pfn; 1496 unsigned long start, end; 1497 struct zone *zone = NULL; 1498 struct page *page; 1499 int i; 1500 for (pfn = start_pfn, sec_end_pfn = SECTION_ALIGN_UP(start_pfn + 1); 1501 pfn < end_pfn; 1502 pfn = sec_end_pfn, sec_end_pfn += PAGES_PER_SECTION) { 1503 /* Make sure the memory section is present first */ 1504 if (!present_section_nr(pfn_to_section_nr(pfn))) 1505 continue; 1506 for (; pfn < sec_end_pfn && pfn < end_pfn; 1507 pfn += MAX_ORDER_NR_PAGES) { 1508 i = 0; 1509 /* This is just a CONFIG_HOLES_IN_ZONE check.*/ 1510 while ((i < MAX_ORDER_NR_PAGES) && 1511 !pfn_valid_within(pfn + i)) 1512 i++; 1513 if (i == MAX_ORDER_NR_PAGES || pfn + i >= end_pfn) 1514 continue; 1515 page = pfn_to_page(pfn + i); 1516 if (zone && page_zone(page) != zone) 1517 return 0; 1518 if (!zone) 1519 start = pfn + i; 1520 zone = page_zone(page); 1521 end = pfn + MAX_ORDER_NR_PAGES; 1522 } 1523 } 1524 1525 if (zone) { 1526 *valid_start = start; 1527 *valid_end = min(end, end_pfn); 1528 return 1; 1529 } else { 1530 return 0; 1531 } 1532} 1533 1534/* 1535 * Scan pfn range [start,end) to find movable/migratable pages (LRU pages, 1536 * non-lru movable pages and hugepages). We scan pfn because it's much 1537 * easier than scanning over linked list. This function returns the pfn 1538 * of the first found movable page if it's found, otherwise 0. 1539 */ 1540static unsigned long scan_movable_pages(unsigned long start, unsigned long end) 1541{ 1542 unsigned long pfn; 1543 struct page *page; 1544 for (pfn = start; pfn < end; pfn++) { 1545 if (pfn_valid(pfn)) { 1546 page = pfn_to_page(pfn); 1547 if (PageLRU(page)) 1548 return pfn; 1549 if (__PageMovable(page)) 1550 return pfn; 1551 if (PageHuge(page)) { 1552 if (page_huge_active(page)) 1553 return pfn; 1554 else 1555 pfn = round_up(pfn + 1, 1556 1 << compound_order(page)) - 1; 1557 } 1558 } 1559 } 1560 return 0; 1561} 1562 1563static struct page *new_node_page(struct page *page, unsigned long private, 1564 int **result) 1565{ 1566 gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; 1567 int nid = page_to_nid(page); 1568 nodemask_t nmask = node_states[N_MEMORY]; 1569 struct page *new_page = NULL; 1570 1571 /* 1572 * TODO: allocate a destination hugepage from a nearest neighbor node, 1573 * accordance with memory policy of the user process if possible. For 1574 * now as a simple work-around, we use the next node for destination. 
1575 */ 1576 if (PageHuge(page)) 1577 return alloc_huge_page_node(page_hstate(compound_head(page)), 1578 next_node_in(nid, nmask)); 1579 1580 node_clear(nid, nmask); 1581 1582 if (PageHighMem(page) 1583 || (zone_idx(page_zone(page)) == ZONE_MOVABLE)) 1584 gfp_mask |= __GFP_HIGHMEM; 1585 1586 if (!nodes_empty(nmask)) 1587 new_page = __alloc_pages_nodemask(gfp_mask, 0, 1588 node_zonelist(nid, gfp_mask), &nmask); 1589 if (!new_page) 1590 new_page = __alloc_pages(gfp_mask, 0, 1591 node_zonelist(nid, gfp_mask)); 1592 1593 return new_page; 1594} 1595 1596#define NR_OFFLINE_AT_ONCE_PAGES (256) 1597static int 1598do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) 1599{ 1600 unsigned long pfn; 1601 struct page *page; 1602 int move_pages = NR_OFFLINE_AT_ONCE_PAGES; 1603 int not_managed = 0; 1604 int ret = 0; 1605 LIST_HEAD(source); 1606 1607 for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) { 1608 if (!pfn_valid(pfn)) 1609 continue; 1610 page = pfn_to_page(pfn); 1611 1612 if (PageHuge(page)) { 1613 struct page *head = compound_head(page); 1614 pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1; 1615 if (compound_order(head) > PFN_SECTION_SHIFT) { 1616 ret = -EBUSY; 1617 break; 1618 } 1619 if (isolate_huge_page(page, &source)) 1620 move_pages -= 1 << compound_order(head); 1621 continue; 1622 } 1623 1624 if (!get_page_unless_zero(page)) 1625 continue; 1626 /* 1627 * We can skip free pages. And we can deal with pages on 1628 * LRU and non-lru movable pages. 1629 */ 1630 if (PageLRU(page)) 1631 ret = isolate_lru_page(page); 1632 else 1633 ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE); 1634 if (!ret) { /* Success */ 1635 put_page(page); 1636 list_add_tail(&page->lru, &source); 1637 move_pages--; 1638 if (!__PageMovable(page)) 1639 inc_node_page_state(page, NR_ISOLATED_ANON + 1640 page_is_file_cache(page)); 1641 1642 } else { 1643#ifdef CONFIG_DEBUG_VM 1644 pr_alert("failed to isolate pfn %lx\n", pfn); 1645 dump_page(page, "isolation failed"); 1646#endif 1647 put_page(page); 1648 /* Because we don't have big zone->lock. we should 1649 check this again here. */ 1650 if (page_count(page)) { 1651 not_managed++; 1652 ret = -EBUSY; 1653 break; 1654 } 1655 } 1656 } 1657 if (!list_empty(&source)) { 1658 if (not_managed) { 1659 putback_movable_pages(&source); 1660 goto out; 1661 } 1662 1663 /* Allocate a new page from the nearest neighbor node */ 1664 ret = migrate_pages(&source, new_node_page, NULL, 0, 1665 MIGRATE_SYNC, MR_MEMORY_HOTPLUG); 1666 if (ret) 1667 putback_movable_pages(&source); 1668 } 1669out: 1670 return ret; 1671} 1672 1673/* 1674 * remove from free_area[] and mark all as Reserved. 1675 */ 1676static int 1677offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages, 1678 void *data) 1679{ 1680 __offline_isolated_pages(start, start + nr_pages); 1681 return 0; 1682} 1683 1684static void 1685offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) 1686{ 1687 walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL, 1688 offline_isolated_pages_cb); 1689} 1690 1691/* 1692 * Check all pages in range, recoreded as memory resource, are isolated. 
1693 */ 1694static int 1695check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages, 1696 void *data) 1697{ 1698 int ret; 1699 long offlined = *(long *)data; 1700 ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true); 1701 offlined = nr_pages; 1702 if (!ret) 1703 *(long *)data += offlined; 1704 return ret; 1705} 1706 1707static long 1708check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) 1709{ 1710 long offlined = 0; 1711 int ret; 1712 1713 ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined, 1714 check_pages_isolated_cb); 1715 if (ret < 0) 1716 offlined = (long)ret; 1717 return offlined; 1718} 1719 1720#ifdef CONFIG_MOVABLE_NODE 1721/* 1722 * When CONFIG_MOVABLE_NODE, we permit offlining of a node which doesn't have 1723 * normal memory. 1724 */ 1725static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) 1726{ 1727 return true; 1728} 1729#else /* CONFIG_MOVABLE_NODE */ 1730/* ensure the node has NORMAL memory if it is still online */ 1731static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) 1732{ 1733 struct pglist_data *pgdat = zone->zone_pgdat; 1734 unsigned long present_pages = 0; 1735 enum zone_type zt; 1736 1737 for (zt = 0; zt <= ZONE_NORMAL; zt++) 1738 present_pages += pgdat->node_zones[zt].present_pages; 1739 1740 if (present_pages > nr_pages) 1741 return true; 1742 1743 present_pages = 0; 1744 for (; zt <= ZONE_MOVABLE; zt++) 1745 present_pages += pgdat->node_zones[zt].present_pages; 1746 1747 /* 1748 * we can't offline the last normal memory until all 1749 * higher memory is offlined. 1750 */ 1751 return present_pages == 0; 1752} 1753#endif /* CONFIG_MOVABLE_NODE */ 1754 1755static int __init cmdline_parse_movable_node(char *p) 1756{ 1757#ifdef CONFIG_MOVABLE_NODE 1758 movable_node_enabled = true; 1759#else 1760 pr_warn("movable_node option not supported\n"); 1761#endif 1762 return 0; 1763} 1764early_param("movable_node", cmdline_parse_movable_node); 1765 1766/* check which state of node_states will be changed when offline memory */ 1767static void node_states_check_changes_offline(unsigned long nr_pages, 1768 struct zone *zone, struct memory_notify *arg) 1769{ 1770 struct pglist_data *pgdat = zone->zone_pgdat; 1771 unsigned long present_pages = 0; 1772 enum zone_type zt, zone_last = ZONE_NORMAL; 1773 1774 /* 1775 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY] 1776 * contains nodes which have zones of 0...ZONE_NORMAL, 1777 * set zone_last to ZONE_NORMAL. 1778 * 1779 * If we don't have HIGHMEM nor movable node, 1780 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of 1781 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. 1782 */ 1783 if (N_MEMORY == N_NORMAL_MEMORY) 1784 zone_last = ZONE_MOVABLE; 1785 1786 /* 1787 * check whether node_states[N_NORMAL_MEMORY] will be changed. 1788 * If the memory to be offline is in a zone of 0...zone_last, 1789 * and it is the last present memory, 0...zone_last will 1790 * become empty after offline , thus we can determind we will 1791 * need to clear the node from node_states[N_NORMAL_MEMORY]. 
1792 */ 1793 for (zt = 0; zt <= zone_last; zt++) 1794 present_pages += pgdat->node_zones[zt].present_pages; 1795 if (zone_idx(zone) <= zone_last && nr_pages >= present_pages) 1796 arg->status_change_nid_normal = zone_to_nid(zone); 1797 else 1798 arg->status_change_nid_normal = -1; 1799 1800#ifdef CONFIG_HIGHMEM 1801 /* 1802 * If we have movable node, node_states[N_HIGH_MEMORY] 1803 * contains nodes which have zones of 0...ZONE_HIGHMEM, 1804 * set zone_last to ZONE_HIGHMEM. 1805 * 1806 * If we don't have movable node, node_states[N_NORMAL_MEMORY] 1807 * contains nodes which have zones of 0...ZONE_MOVABLE, 1808 * set zone_last to ZONE_MOVABLE. 1809 */ 1810 zone_last = ZONE_HIGHMEM; 1811 if (N_MEMORY == N_HIGH_MEMORY) 1812 zone_last = ZONE_MOVABLE; 1813 1814 for (; zt <= zone_last; zt++) 1815 present_pages += pgdat->node_zones[zt].present_pages; 1816 if (zone_idx(zone) <= zone_last && nr_pages >= present_pages) 1817 arg->status_change_nid_high = zone_to_nid(zone); 1818 else 1819 arg->status_change_nid_high = -1; 1820#else 1821 arg->status_change_nid_high = arg->status_change_nid_normal; 1822#endif 1823 1824 /* 1825 * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE 1826 */ 1827 zone_last = ZONE_MOVABLE; 1828 1829 /* 1830 * check whether node_states[N_HIGH_MEMORY] will be changed 1831 * If we try to offline the last present @nr_pages from the node, 1832 * we can determind we will need to clear the node from 1833 * node_states[N_HIGH_MEMORY]. 1834 */ 1835 for (; zt <= zone_last; zt++) 1836 present_pages += pgdat->node_zones[zt].present_pages; 1837 if (nr_pages >= present_pages) 1838 arg->status_change_nid = zone_to_nid(zone); 1839 else 1840 arg->status_change_nid = -1; 1841} 1842 1843static void node_states_clear_node(int node, struct memory_notify *arg) 1844{ 1845 if (arg->status_change_nid_normal >= 0) 1846 node_clear_state(node, N_NORMAL_MEMORY); 1847 1848 if ((N_MEMORY != N_NORMAL_MEMORY) && 1849 (arg->status_change_nid_high >= 0)) 1850 node_clear_state(node, N_HIGH_MEMORY); 1851 1852 if ((N_MEMORY != N_HIGH_MEMORY) && 1853 (arg->status_change_nid >= 0)) 1854 node_clear_state(node, N_MEMORY); 1855} 1856 1857static int __ref __offline_pages(unsigned long start_pfn, 1858 unsigned long end_pfn, unsigned long timeout) 1859{ 1860 unsigned long pfn, nr_pages, expire; 1861 long offlined_pages; 1862 int ret, drain, retry_max, node; 1863 unsigned long flags; 1864 unsigned long valid_start, valid_end; 1865 struct zone *zone; 1866 struct memory_notify arg; 1867 1868 /* at least, alignment against pageblock is necessary */ 1869 if (!IS_ALIGNED(start_pfn, pageblock_nr_pages)) 1870 return -EINVAL; 1871 if (!IS_ALIGNED(end_pfn, pageblock_nr_pages)) 1872 return -EINVAL; 1873 /* This makes hotplug much easier...and readable. 1874 we assume this for now. 
.*/ 1875 if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, &valid_end)) 1876 return -EINVAL; 1877 1878 zone = page_zone(pfn_to_page(valid_start)); 1879 node = zone_to_nid(zone); 1880 nr_pages = end_pfn - start_pfn; 1881 1882 if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages)) 1883 return -EINVAL; 1884 1885 /* set above range as isolated */ 1886 ret = start_isolate_page_range(start_pfn, end_pfn, 1887 MIGRATE_MOVABLE, true); 1888 if (ret) 1889 return ret; 1890 1891 arg.start_pfn = start_pfn; 1892 arg.nr_pages = nr_pages; 1893 node_states_check_changes_offline(nr_pages, zone, &arg); 1894 1895 ret = memory_notify(MEM_GOING_OFFLINE, &arg); 1896 ret = notifier_to_errno(ret); 1897 if (ret) 1898 goto failed_removal; 1899 1900 pfn = start_pfn; 1901 expire = jiffies + timeout; 1902 drain = 0; 1903 retry_max = 5; 1904repeat: 1905 /* start memory hot removal */ 1906 ret = -EAGAIN; 1907 if (time_after(jiffies, expire)) 1908 goto failed_removal; 1909 ret = -EINTR; 1910 if (signal_pending(current)) 1911 goto failed_removal; 1912 ret = 0; 1913 if (drain) { 1914 lru_add_drain_all(); 1915 cond_resched(); 1916 drain_all_pages(zone); 1917 } 1918 1919 pfn = scan_movable_pages(start_pfn, end_pfn); 1920 if (pfn) { /* We have movable pages */ 1921 ret = do_migrate_range(pfn, end_pfn); 1922 if (!ret) { 1923 drain = 1; 1924 goto repeat; 1925 } else { 1926 if (ret < 0) 1927 if (--retry_max == 0) 1928 goto failed_removal; 1929 yield(); 1930 drain = 1; 1931 goto repeat; 1932 } 1933 } 1934 /* drain all zone's lru pagevec, this is asynchronous... */ 1935 lru_add_drain_all(); 1936 yield(); 1937 /* drain pcp pages, this is synchronous. */ 1938 drain_all_pages(zone); 1939 /* 1940 * dissolve free hugepages in the memory block before doing offlining 1941 * actually in order to make hugetlbfs's object counting consistent. 1942 */ 1943 ret = dissolve_free_huge_pages(start_pfn, end_pfn); 1944 if (ret) 1945 goto failed_removal; 1946 /* check again */ 1947 offlined_pages = check_pages_isolated(start_pfn, end_pfn); 1948 if (offlined_pages < 0) { 1949 ret = -EBUSY; 1950 goto failed_removal; 1951 } 1952 pr_info("Offlined Pages %ld\n", offlined_pages); 1953 /* Ok, all of our target is isolated. 1954 We cannot do rollback at this point. 
*/ 1955 offline_isolated_pages(start_pfn, end_pfn); 1956 /* reset pagetype flags and makes migrate type to be MOVABLE */ 1957 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); 1958 /* removal success */ 1959 adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages); 1960 zone->present_pages -= offlined_pages; 1961 1962 pgdat_resize_lock(zone->zone_pgdat, &flags); 1963 zone->zone_pgdat->node_present_pages -= offlined_pages; 1964 pgdat_resize_unlock(zone->zone_pgdat, &flags); 1965 1966 init_per_zone_wmark_min(); 1967 1968 if (!populated_zone(zone)) { 1969 zone_pcp_reset(zone); 1970 mutex_lock(&zonelists_mutex); 1971 build_all_zonelists(NULL, NULL); 1972 mutex_unlock(&zonelists_mutex); 1973 } else 1974 zone_pcp_update(zone); 1975 1976 node_states_clear_node(node, &arg); 1977 if (arg.status_change_nid >= 0) { 1978 kswapd_stop(node); 1979 kcompactd_stop(node); 1980 } 1981 1982 vm_total_pages = nr_free_pagecache_pages(); 1983 writeback_set_ratelimit(); 1984 1985 memory_notify(MEM_OFFLINE, &arg); 1986 return 0; 1987 1988failed_removal: 1989 pr_debug("memory offlining [mem %#010llx-%#010llx] failed\n", 1990 (unsigned long long) start_pfn << PAGE_SHIFT, 1991 ((unsigned long long) end_pfn << PAGE_SHIFT) - 1); 1992 memory_notify(MEM_CANCEL_OFFLINE, &arg); 1993 /* pushback to free area */ 1994 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); 1995 return ret; 1996} 1997 1998/* Must be protected by mem_hotplug_begin() */ 1999int offline_pages(unsigned long start_pfn, unsigned long nr_pages) 2000{ 2001 return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); 2002} 2003#endif /* CONFIG_MEMORY_HOTREMOVE */ 2004 2005/** 2006 * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn) 2007 * @start_pfn: start pfn of the memory range 2008 * @end_pfn: end pfn of the memory range 2009 * @arg: argument passed to func 2010 * @func: callback for each memory section walked 2011 * 2012 * This function walks through all present mem sections in range 2013 * [start_pfn, end_pfn) and call func on each mem section. 2014 * 2015 * Returns the return value of func. 2016 */ 2017int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, 2018 void *arg, int (*func)(struct memory_block *, void *)) 2019{ 2020 struct memory_block *mem = NULL; 2021 struct mem_section *section; 2022 unsigned long pfn, section_nr; 2023 int ret; 2024 2025 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 2026 section_nr = pfn_to_section_nr(pfn); 2027 if (!present_section_nr(section_nr)) 2028 continue; 2029 2030 section = __nr_to_section(section_nr); 2031 /* same memblock? 
*/ 2032 if (mem) 2033 if ((section_nr >= mem->start_section_nr) && 2034 (section_nr <= mem->end_section_nr)) 2035 continue; 2036 2037 mem = find_memory_block_hinted(section, mem); 2038 if (!mem) 2039 continue; 2040 2041 ret = func(mem, arg); 2042 if (ret) { 2043 kobject_put(&mem->dev.kobj); 2044 return ret; 2045 } 2046 } 2047 2048 if (mem) 2049 kobject_put(&mem->dev.kobj); 2050 2051 return 0; 2052} 2053 2054#ifdef CONFIG_MEMORY_HOTREMOVE 2055static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) 2056{ 2057 int ret = !is_memblock_offlined(mem); 2058 2059 if (unlikely(ret)) { 2060 phys_addr_t beginpa, endpa; 2061 2062 beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)); 2063 endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1; 2064 pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n", 2065 &beginpa, &endpa); 2066 } 2067 2068 return ret; 2069} 2070 2071static int check_cpu_on_node(pg_data_t *pgdat) 2072{ 2073 int cpu; 2074 2075 for_each_present_cpu(cpu) { 2076 if (cpu_to_node(cpu) == pgdat->node_id) 2077 /* 2078 * the cpu on this node isn't removed, and we can't 2079 * offline this node. 2080 */ 2081 return -EBUSY; 2082 } 2083 2084 return 0; 2085} 2086 2087static void unmap_cpu_on_node(pg_data_t *pgdat) 2088{ 2089#ifdef CONFIG_ACPI_NUMA 2090 int cpu; 2091 2092 for_each_possible_cpu(cpu) 2093 if (cpu_to_node(cpu) == pgdat->node_id) 2094 numa_clear_node(cpu); 2095#endif 2096} 2097 2098static int check_and_unmap_cpu_on_node(pg_data_t *pgdat) 2099{ 2100 int ret; 2101 2102 ret = check_cpu_on_node(pgdat); 2103 if (ret) 2104 return ret; 2105 2106 /* 2107 * the node will be offlined when we come here, so we can clear 2108 * the cpu_to_node() now. 2109 */ 2110 2111 unmap_cpu_on_node(pgdat); 2112 return 0; 2113} 2114 2115/** 2116 * try_offline_node 2117 * 2118 * Offline a node if all memory sections and cpus of the node are removed. 2119 * 2120 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug 2121 * and online/offline operations before this call. 2122 */ 2123void try_offline_node(int nid) 2124{ 2125 pg_data_t *pgdat = NODE_DATA(nid); 2126 unsigned long start_pfn = pgdat->node_start_pfn; 2127 unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages; 2128 unsigned long pfn; 2129 2130 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 2131 unsigned long section_nr = pfn_to_section_nr(pfn); 2132 2133 if (!present_section_nr(section_nr)) 2134 continue; 2135 2136 if (pfn_to_nid(pfn) != nid) 2137 continue; 2138 2139 /* 2140 * some memory sections of this node are not removed, and we 2141 * can't offline node now. 2142 */ 2143 return; 2144 } 2145 2146 if (check_and_unmap_cpu_on_node(pgdat)) 2147 return; 2148 2149 /* 2150 * all memory/cpu of this node are removed, we can offline this 2151 * node now. 2152 */ 2153 node_set_offline(nid); 2154 unregister_one_node(nid); 2155} 2156EXPORT_SYMBOL(try_offline_node); 2157 2158/** 2159 * remove_memory 2160 * 2161 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug 2162 * and online/offline operations before this call, as required by 2163 * try_offline_node(). 2164 */ 2165void __ref remove_memory(int nid, u64 start, u64 size) 2166{ 2167 int ret; 2168 2169 BUG_ON(check_hotplug_memory_range(start, size)); 2170 2171 mem_hotplug_begin(); 2172 2173 /* 2174 * All memory blocks must be offlined before removing memory. Check 2175 * whether all memory blocks in question are offline and trigger a BUG() 2176 * if this is not the case. 
2177 */ 2178 ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL, 2179 check_memblock_offlined_cb); 2180 if (ret) 2181 BUG(); 2182 2183 /* remove memmap entry */ 2184 firmware_map_remove(start, start + size, "System RAM"); 2185 memblock_free(start, size); 2186 memblock_remove(start, size); 2187 2188 arch_remove_memory(start, size); 2189 2190 try_offline_node(nid); 2191 2192 mem_hotplug_done(); 2193} 2194EXPORT_SYMBOL_GPL(remove_memory); 2195#endif /* CONFIG_MEMORY_HOTREMOVE */
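
The listing above exports set_online_page_callback()/restore_online_page_callback() so that a driver (for example a memory-balloon driver) can take over per-page onlining, and it exports the __online_page_* helpers that generic_online_page() is built from. Below is a minimal sketch of an out-of-tree module using that interface; the module and its example_* names are hypothetical illustrations, only the functions it calls are real exports from this file, and the callback simply mirrors what generic_online_page() already does.

/*
 * Illustrative sketch only -- not part of mm/memory_hotplug.c.
 * A hypothetical module that overrides the page-onlining path,
 * in the style of the Xen/Hyper-V balloon drivers.
 */
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/memory_hotplug.h>

/* Called for every page of a memory block as it is onlined. */
static void example_online_page(struct page *page)
{
        /*
         * A real driver could defer or account the page here; this
         * sketch just reproduces generic_online_page() using the
         * exported helpers.
         */
        __online_page_set_limits(page);
        __online_page_increment_counters(page);
        __online_page_free(page);
}

static int __init example_init(void)
{
        /* Fails with -EINVAL if another callback is already registered. */
        return set_online_page_callback(example_online_page);
}

static void __exit example_exit(void)
{
        /* Put generic_online_page() back as the onlining function. */
        restore_online_page_callback(example_online_page);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");

The callback registration only succeeds while generic_online_page() is the current callback, and restore_online_page_callback() only succeeds for the callback that was registered, so paired init/exit use as above keeps the global state consistent; both paths take get_online_mems() internally, so no extra locking is needed in the caller.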