drivers/base/memory.c at v5.18-rc6 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / drivers / base / memory.c
at v5.18-rc6 31 kB view raw
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Memory subsystem support
   4 *
   5 * Written by Matt Tolentino <matthew.e.tolentino@intel.com>
   6 *            Dave Hansen <haveblue@us.ibm.com>
   7 *
   8 * This file provides the necessary infrastructure to represent
   9 * a SPARSEMEM-memory-model system's physical memory in /sysfs.
  10 * All arch-independent code that assumes MEMORY_HOTPLUG requires
  11 * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
  12 */
  13
  14#include <linux/module.h>
  15#include <linux/init.h>
  16#include <linux/topology.h>
  17#include <linux/capability.h>
  18#include <linux/device.h>
  19#include <linux/memory.h>
  20#include <linux/memory_hotplug.h>
  21#include <linux/mm.h>
  22#include <linux/stat.h>
  23#include <linux/slab.h>
  24#include <linux/xarray.h>
  25
  26#include <linux/atomic.h>
  27#include <linux/uaccess.h>
  28
  29#define MEMORY_CLASS_NAME	"memory"
  30
  31static const char *const online_type_to_str[] = {
  32	[MMOP_OFFLINE] = "offline",
  33	[MMOP_ONLINE] = "online",
  34	[MMOP_ONLINE_KERNEL] = "online_kernel",
  35	[MMOP_ONLINE_MOVABLE] = "online_movable",
  36};
  37
  38int mhp_online_type_from_str(const char *str)
  39{
  40	int i;
  41
  42	for (i = 0; i < ARRAY_SIZE(online_type_to_str); i++) {
  43		if (sysfs_streq(str, online_type_to_str[i]))
  44			return i;
  45	}
  46	return -EINVAL;
  47}
  48
  49#define to_memory_block(dev) container_of(dev, struct memory_block, dev)
  50
  51static int sections_per_block;
  52
  53static inline unsigned long memory_block_id(unsigned long section_nr)
  54{
  55	return section_nr / sections_per_block;
  56}
  57
  58static inline unsigned long pfn_to_block_id(unsigned long pfn)
  59{
  60	return memory_block_id(pfn_to_section_nr(pfn));
  61}
  62
  63static inline unsigned long phys_to_block_id(unsigned long phys)
  64{
  65	return pfn_to_block_id(PFN_DOWN(phys));
  66}
  67
  68static int memory_subsys_online(struct device *dev);
  69static int memory_subsys_offline(struct device *dev);
  70
  71static struct bus_type memory_subsys = {
  72	.name = MEMORY_CLASS_NAME,
  73	.dev_name = MEMORY_CLASS_NAME,
  74	.online = memory_subsys_online,
  75	.offline = memory_subsys_offline,
  76};
  77
  78/*
  79 * Memory blocks are cached in a local radix tree to avoid
  80 * a costly linear search for the corresponding device on
  81 * the subsystem bus.
  82 */
  83static DEFINE_XARRAY(memory_blocks);
  84
  85/*
  86 * Memory groups, indexed by memory group id (mgid).
  87 */
  88static DEFINE_XARRAY_FLAGS(memory_groups, XA_FLAGS_ALLOC);
  89#define MEMORY_GROUP_MARK_DYNAMIC	XA_MARK_1
  90
  91static BLOCKING_NOTIFIER_HEAD(memory_chain);
  92
  93int register_memory_notifier(struct notifier_block *nb)
  94{
  95	return blocking_notifier_chain_register(&memory_chain, nb);
  96}
  97EXPORT_SYMBOL(register_memory_notifier);
  98
  99void unregister_memory_notifier(struct notifier_block *nb)
 100{
 101	blocking_notifier_chain_unregister(&memory_chain, nb);
 102}
 103EXPORT_SYMBOL(unregister_memory_notifier);
 104
 105static void memory_block_release(struct device *dev)
 106{
 107	struct memory_block *mem = to_memory_block(dev);
 108
 109	kfree(mem);
 110}
 111
 112unsigned long __weak memory_block_size_bytes(void)
 113{
 114	return MIN_MEMORY_BLOCK_SIZE;
 115}
 116EXPORT_SYMBOL_GPL(memory_block_size_bytes);
 117
 118/*
 119 * Show the first physical section index (number) of this memory block.
 120 */
 121static ssize_t phys_index_show(struct device *dev,
 122			       struct device_attribute *attr, char *buf)
 123{
 124	struct memory_block *mem = to_memory_block(dev);
 125	unsigned long phys_index;
 126
 127	phys_index = mem->start_section_nr / sections_per_block;
 128
 129	return sysfs_emit(buf, "%08lx\n", phys_index);
 130}
 131
 132/*
 133 * Legacy interface that we cannot remove. Always indicate "removable"
 134 * with CONFIG_MEMORY_HOTREMOVE - bad heuristic.
 135 */
 136static ssize_t removable_show(struct device *dev, struct device_attribute *attr,
 137			      char *buf)
 138{
 139	return sysfs_emit(buf, "%d\n", (int)IS_ENABLED(CONFIG_MEMORY_HOTREMOVE));
 140}
 141
 142/*
 143 * online, offline, going offline, etc.
 144 */
 145static ssize_t state_show(struct device *dev, struct device_attribute *attr,
 146			  char *buf)
 147{
 148	struct memory_block *mem = to_memory_block(dev);
 149	const char *output;
 150
 151	/*
 152	 * We can probably put these states in a nice little array
 153	 * so that they're not open-coded
 154	 */
 155	switch (mem->state) {
 156	case MEM_ONLINE:
 157		output = "online";
 158		break;
 159	case MEM_OFFLINE:
 160		output = "offline";
 161		break;
 162	case MEM_GOING_OFFLINE:
 163		output = "going-offline";
 164		break;
 165	default:
 166		WARN_ON(1);
 167		return sysfs_emit(buf, "ERROR-UNKNOWN-%ld\n", mem->state);
 168	}
 169
 170	return sysfs_emit(buf, "%s\n", output);
 171}
 172
 173int memory_notify(unsigned long val, void *v)
 174{
 175	return blocking_notifier_call_chain(&memory_chain, val, v);
 176}
 177
 178static int memory_block_online(struct memory_block *mem)
 179{
 180	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
 181	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
 182	unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
 183	struct zone *zone;
 184	int ret;
 185
 186	zone = zone_for_pfn_range(mem->online_type, mem->nid, mem->group,
 187				  start_pfn, nr_pages);
 188
 189	/*
 190	 * Although vmemmap pages have a different lifecycle than the pages
 191	 * they describe (they remain until the memory is unplugged), doing
 192	 * their initialization and accounting at memory onlining/offlining
 193	 * stage helps to keep accounting easier to follow - e.g vmemmaps
 194	 * belong to the same zone as the memory they backed.
 195	 */
 196	if (nr_vmemmap_pages) {
 197		ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, zone);
 198		if (ret)
 199			return ret;
 200	}
 201
 202	ret = online_pages(start_pfn + nr_vmemmap_pages,
 203			   nr_pages - nr_vmemmap_pages, zone, mem->group);
 204	if (ret) {
 205		if (nr_vmemmap_pages)
 206			mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
 207		return ret;
 208	}
 209
 210	/*
 211	 * Account once onlining succeeded. If the zone was unpopulated, it is
 212	 * now already properly populated.
 213	 */
 214	if (nr_vmemmap_pages)
 215		adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
 216					  nr_vmemmap_pages);
 217
 218	mem->zone = zone;
 219	return ret;
 220}
 221
 222static int memory_block_offline(struct memory_block *mem)
 223{
 224	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
 225	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
 226	unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
 227	int ret;
 228
 229	if (!mem->zone)
 230		return -EINVAL;
 231
 232	/*
 233	 * Unaccount before offlining, such that unpopulated zone and kthreads
 234	 * can properly be torn down in offline_pages().
 235	 */
 236	if (nr_vmemmap_pages)
 237		adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
 238					  -nr_vmemmap_pages);
 239
 240	ret = offline_pages(start_pfn + nr_vmemmap_pages,
 241			    nr_pages - nr_vmemmap_pages, mem->zone, mem->group);
 242	if (ret) {
 243		/* offline_pages() failed. Account back. */
 244		if (nr_vmemmap_pages)
 245			adjust_present_page_count(pfn_to_page(start_pfn),
 246						  mem->group, nr_vmemmap_pages);
 247		return ret;
 248	}
 249
 250	if (nr_vmemmap_pages)
 251		mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
 252
 253	mem->zone = NULL;
 254	return ret;
 255}
 256
 257/*
 258 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
 259 * OK to have direct references to sparsemem variables in here.
 260 */
 261static int
 262memory_block_action(struct memory_block *mem, unsigned long action)
 263{
 264	int ret;
 265
 266	switch (action) {
 267	case MEM_ONLINE:
 268		ret = memory_block_online(mem);
 269		break;
 270	case MEM_OFFLINE:
 271		ret = memory_block_offline(mem);
 272		break;
 273	default:
 274		WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
 275		     "%ld\n", __func__, mem->start_section_nr, action, action);
 276		ret = -EINVAL;
 277	}
 278
 279	return ret;
 280}
 281
 282static int memory_block_change_state(struct memory_block *mem,
 283		unsigned long to_state, unsigned long from_state_req)
 284{
 285	int ret = 0;
 286
 287	if (mem->state != from_state_req)
 288		return -EINVAL;
 289
 290	if (to_state == MEM_OFFLINE)
 291		mem->state = MEM_GOING_OFFLINE;
 292
 293	ret = memory_block_action(mem, to_state);
 294	mem->state = ret ? from_state_req : to_state;
 295
 296	return ret;
 297}
 298
 299/* The device lock serializes operations on memory_subsys_[online|offline] */
 300static int memory_subsys_online(struct device *dev)
 301{
 302	struct memory_block *mem = to_memory_block(dev);
 303	int ret;
 304
 305	if (mem->state == MEM_ONLINE)
 306		return 0;
 307
 308	/*
 309	 * When called via device_online() without configuring the online_type,
 310	 * we want to default to MMOP_ONLINE.
 311	 */
 312	if (mem->online_type == MMOP_OFFLINE)
 313		mem->online_type = MMOP_ONLINE;
 314
 315	ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
 316	mem->online_type = MMOP_OFFLINE;
 317
 318	return ret;
 319}
 320
 321static int memory_subsys_offline(struct device *dev)
 322{
 323	struct memory_block *mem = to_memory_block(dev);
 324
 325	if (mem->state == MEM_OFFLINE)
 326		return 0;
 327
 328	return memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
 329}
 330
 331static ssize_t state_store(struct device *dev, struct device_attribute *attr,
 332			   const char *buf, size_t count)
 333{
 334	const int online_type = mhp_online_type_from_str(buf);
 335	struct memory_block *mem = to_memory_block(dev);
 336	int ret;
 337
 338	if (online_type < 0)
 339		return -EINVAL;
 340
 341	ret = lock_device_hotplug_sysfs();
 342	if (ret)
 343		return ret;
 344
 345	switch (online_type) {
 346	case MMOP_ONLINE_KERNEL:
 347	case MMOP_ONLINE_MOVABLE:
 348	case MMOP_ONLINE:
 349		/* mem->online_type is protected by device_hotplug_lock */
 350		mem->online_type = online_type;
 351		ret = device_online(&mem->dev);
 352		break;
 353	case MMOP_OFFLINE:
 354		ret = device_offline(&mem->dev);
 355		break;
 356	default:
 357		ret = -EINVAL; /* should never happen */
 358	}
 359
 360	unlock_device_hotplug();
 361
 362	if (ret < 0)
 363		return ret;
 364	if (ret)
 365		return -EINVAL;
 366
 367	return count;
 368}
 369
 370/*
 371 * Legacy interface that we cannot remove: s390x exposes the storage increment
 372 * covered by a memory block, allowing for identifying which memory blocks
 373 * comprise a storage increment. Since a memory block spans complete
 374 * storage increments nowadays, this interface is basically unused. Other
 375 * archs never exposed != 0.
 376 */
 377static ssize_t phys_device_show(struct device *dev,
 378				struct device_attribute *attr, char *buf)
 379{
 380	struct memory_block *mem = to_memory_block(dev);
 381	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
 382
 383	return sysfs_emit(buf, "%d\n",
 384			  arch_get_memory_phys_device(start_pfn));
 385}
 386
 387#ifdef CONFIG_MEMORY_HOTREMOVE
 388static int print_allowed_zone(char *buf, int len, int nid,
 389			      struct memory_group *group,
 390			      unsigned long start_pfn, unsigned long nr_pages,
 391			      int online_type, struct zone *default_zone)
 392{
 393	struct zone *zone;
 394
 395	zone = zone_for_pfn_range(online_type, nid, group, start_pfn, nr_pages);
 396	if (zone == default_zone)
 397		return 0;
 398
 399	return sysfs_emit_at(buf, len, " %s", zone->name);
 400}
 401
 402static ssize_t valid_zones_show(struct device *dev,
 403				struct device_attribute *attr, char *buf)
 404{
 405	struct memory_block *mem = to_memory_block(dev);
 406	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
 407	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
 408	struct memory_group *group = mem->group;
 409	struct zone *default_zone;
 410	int nid = mem->nid;
 411	int len = 0;
 412
 413	/*
 414	 * Check the existing zone. Make sure that we do that only on the
 415	 * online nodes otherwise the page_zone is not reliable
 416	 */
 417	if (mem->state == MEM_ONLINE) {
 418		/*
 419		 * If !mem->zone, the memory block spans multiple zones and
 420		 * cannot get offlined.
 421		 */
 422		default_zone = mem->zone;
 423		if (!default_zone)
 424			return sysfs_emit(buf, "%s\n", "none");
 425		len += sysfs_emit_at(buf, len, "%s", default_zone->name);
 426		goto out;
 427	}
 428
 429	default_zone = zone_for_pfn_range(MMOP_ONLINE, nid, group,
 430					  start_pfn, nr_pages);
 431
 432	len += sysfs_emit_at(buf, len, "%s", default_zone->name);
 433	len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages,
 434				  MMOP_ONLINE_KERNEL, default_zone);
 435	len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages,
 436				  MMOP_ONLINE_MOVABLE, default_zone);
 437out:
 438	len += sysfs_emit_at(buf, len, "\n");
 439	return len;
 440}
 441static DEVICE_ATTR_RO(valid_zones);
 442#endif
 443
 444static DEVICE_ATTR_RO(phys_index);
 445static DEVICE_ATTR_RW(state);
 446static DEVICE_ATTR_RO(phys_device);
 447static DEVICE_ATTR_RO(removable);
 448
 449/*
 450 * Show the memory block size (shared by all memory blocks).
 451 */
 452static ssize_t block_size_bytes_show(struct device *dev,
 453				     struct device_attribute *attr, char *buf)
 454{
 455	return sysfs_emit(buf, "%lx\n", memory_block_size_bytes());
 456}
 457
 458static DEVICE_ATTR_RO(block_size_bytes);
 459
 460/*
 461 * Memory auto online policy.
 462 */
 463
 464static ssize_t auto_online_blocks_show(struct device *dev,
 465				       struct device_attribute *attr, char *buf)
 466{
 467	return sysfs_emit(buf, "%s\n",
 468			  online_type_to_str[mhp_default_online_type]);
 469}
 470
 471static ssize_t auto_online_blocks_store(struct device *dev,
 472					struct device_attribute *attr,
 473					const char *buf, size_t count)
 474{
 475	const int online_type = mhp_online_type_from_str(buf);
 476
 477	if (online_type < 0)
 478		return -EINVAL;
 479
 480	mhp_default_online_type = online_type;
 481	return count;
 482}
 483
 484static DEVICE_ATTR_RW(auto_online_blocks);
 485
 486/*
 487 * Some architectures will have custom drivers to do this, and
 488 * will not need to do it from userspace.  The fake hot-add code
 489 * as well as ppc64 will do all of their discovery in userspace
 490 * and will require this interface.
 491 */
 492#ifdef CONFIG_ARCH_MEMORY_PROBE
 493static ssize_t probe_store(struct device *dev, struct device_attribute *attr,
 494			   const char *buf, size_t count)
 495{
 496	u64 phys_addr;
 497	int nid, ret;
 498	unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block;
 499
 500	ret = kstrtoull(buf, 0, &phys_addr);
 501	if (ret)
 502		return ret;
 503
 504	if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
 505		return -EINVAL;
 506
 507	ret = lock_device_hotplug_sysfs();
 508	if (ret)
 509		return ret;
 510
 511	nid = memory_add_physaddr_to_nid(phys_addr);
 512	ret = __add_memory(nid, phys_addr,
 513			   MIN_MEMORY_BLOCK_SIZE * sections_per_block,
 514			   MHP_NONE);
 515
 516	if (ret)
 517		goto out;
 518
 519	ret = count;
 520out:
 521	unlock_device_hotplug();
 522	return ret;
 523}
 524
 525static DEVICE_ATTR_WO(probe);
 526#endif
 527
 528#ifdef CONFIG_MEMORY_FAILURE
 529/*
 530 * Support for offlining pages of memory
 531 */
 532
 533/* Soft offline a page */
 534static ssize_t soft_offline_page_store(struct device *dev,
 535				       struct device_attribute *attr,
 536				       const char *buf, size_t count)
 537{
 538	int ret;
 539	u64 pfn;
 540	if (!capable(CAP_SYS_ADMIN))
 541		return -EPERM;
 542	if (kstrtoull(buf, 0, &pfn) < 0)
 543		return -EINVAL;
 544	pfn >>= PAGE_SHIFT;
 545	ret = soft_offline_page(pfn, 0);
 546	return ret == 0 ? count : ret;
 547}
 548
 549/* Forcibly offline a page, including killing processes. */
 550static ssize_t hard_offline_page_store(struct device *dev,
 551				       struct device_attribute *attr,
 552				       const char *buf, size_t count)
 553{
 554	int ret;
 555	u64 pfn;
 556	if (!capable(CAP_SYS_ADMIN))
 557		return -EPERM;
 558	if (kstrtoull(buf, 0, &pfn) < 0)
 559		return -EINVAL;
 560	pfn >>= PAGE_SHIFT;
 561	ret = memory_failure(pfn, 0);
 562	if (ret == -EOPNOTSUPP)
 563		ret = 0;
 564	return ret ? ret : count;
 565}
 566
 567static DEVICE_ATTR_WO(soft_offline_page);
 568static DEVICE_ATTR_WO(hard_offline_page);
 569#endif
 570
 571/* See phys_device_show(). */
 572int __weak arch_get_memory_phys_device(unsigned long start_pfn)
 573{
 574	return 0;
 575}
 576
 577/*
 578 * A reference for the returned memory block device is acquired.
 579 *
 580 * Called under device_hotplug_lock.
 581 */
 582static struct memory_block *find_memory_block_by_id(unsigned long block_id)
 583{
 584	struct memory_block *mem;
 585
 586	mem = xa_load(&memory_blocks, block_id);
 587	if (mem)
 588		get_device(&mem->dev);
 589	return mem;
 590}
 591
 592/*
 593 * Called under device_hotplug_lock.
 594 */
 595struct memory_block *find_memory_block(unsigned long section_nr)
 596{
 597	unsigned long block_id = memory_block_id(section_nr);
 598
 599	return find_memory_block_by_id(block_id);
 600}
 601
 602static struct attribute *memory_memblk_attrs[] = {
 603	&dev_attr_phys_index.attr,
 604	&dev_attr_state.attr,
 605	&dev_attr_phys_device.attr,
 606	&dev_attr_removable.attr,
 607#ifdef CONFIG_MEMORY_HOTREMOVE
 608	&dev_attr_valid_zones.attr,
 609#endif
 610	NULL
 611};
 612
 613static const struct attribute_group memory_memblk_attr_group = {
 614	.attrs = memory_memblk_attrs,
 615};
 616
 617static const struct attribute_group *memory_memblk_attr_groups[] = {
 618	&memory_memblk_attr_group,
 619	NULL,
 620};
 621
 622static int __add_memory_block(struct memory_block *memory)
 623{
 624	int ret;
 625
 626	memory->dev.bus = &memory_subsys;
 627	memory->dev.id = memory->start_section_nr / sections_per_block;
 628	memory->dev.release = memory_block_release;
 629	memory->dev.groups = memory_memblk_attr_groups;
 630	memory->dev.offline = memory->state == MEM_OFFLINE;
 631
 632	ret = device_register(&memory->dev);
 633	if (ret) {
 634		put_device(&memory->dev);
 635		return ret;
 636	}
 637	ret = xa_err(xa_store(&memory_blocks, memory->dev.id, memory,
 638			      GFP_KERNEL));
 639	if (ret) {
 640		put_device(&memory->dev);
 641		device_unregister(&memory->dev);
 642	}
 643	return ret;
 644}
 645
 646static struct zone *early_node_zone_for_memory_block(struct memory_block *mem,
 647						     int nid)
 648{
 649	const unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
 650	const unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
 651	struct zone *zone, *matching_zone = NULL;
 652	pg_data_t *pgdat = NODE_DATA(nid);
 653	int i;
 654
 655	/*
 656	 * This logic only works for early memory, when the applicable zones
 657	 * already span the memory block. We don't expect overlapping zones on
 658	 * a single node for early memory. So if we're told that some PFNs
 659	 * of a node fall into this memory block, we can assume that all node
 660	 * zones that intersect with the memory block are actually applicable.
 661	 * No need to look at the memmap.
 662	 */
 663	for (i = 0; i < MAX_NR_ZONES; i++) {
 664		zone = pgdat->node_zones + i;
 665		if (!populated_zone(zone))
 666			continue;
 667		if (!zone_intersects(zone, start_pfn, nr_pages))
 668			continue;
 669		if (!matching_zone) {
 670			matching_zone = zone;
 671			continue;
 672		}
 673		/* Spans multiple zones ... */
 674		matching_zone = NULL;
 675		break;
 676	}
 677	return matching_zone;
 678}
 679
 680#ifdef CONFIG_NUMA
 681/**
 682 * memory_block_add_nid() - Indicate that system RAM falling into this memory
 683 *			    block device (partially) belongs to the given node.
 684 * @mem: The memory block device.
 685 * @nid: The node id.
 686 * @context: The memory initialization context.
 687 *
 688 * Indicate that system RAM falling into this memory block (partially) belongs
 689 * to the given node. If the context indicates ("early") that we are adding the
 690 * node during node device subsystem initialization, this will also properly
 691 * set/adjust mem->zone based on the zone ranges of the given node.
 692 */
 693void memory_block_add_nid(struct memory_block *mem, int nid,
 694			  enum meminit_context context)
 695{
 696	if (context == MEMINIT_EARLY && mem->nid != nid) {
 697		/*
 698		 * For early memory we have to determine the zone when setting
 699		 * the node id and handle multiple nodes spanning a single
 700		 * memory block by indicate via zone == NULL that we're not
 701		 * dealing with a single zone. So if we're setting the node id
 702		 * the first time, determine if there is a single zone. If we're
 703		 * setting the node id a second time to a different node,
 704		 * invalidate the single detected zone.
 705		 */
 706		if (mem->nid == NUMA_NO_NODE)
 707			mem->zone = early_node_zone_for_memory_block(mem, nid);
 708		else
 709			mem->zone = NULL;
 710	}
 711
 712	/*
 713	 * If this memory block spans multiple nodes, we only indicate
 714	 * the last processed node. If we span multiple nodes (not applicable
 715	 * to hotplugged memory), zone == NULL will prohibit memory offlining
 716	 * and consequently unplug.
 717	 */
 718	mem->nid = nid;
 719}
 720#endif
 721
 722static int add_memory_block(unsigned long block_id, unsigned long state,
 723			    unsigned long nr_vmemmap_pages,
 724			    struct memory_group *group)
 725{
 726	struct memory_block *mem;
 727	int ret = 0;
 728
 729	mem = find_memory_block_by_id(block_id);
 730	if (mem) {
 731		put_device(&mem->dev);
 732		return -EEXIST;
 733	}
 734	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
 735	if (!mem)
 736		return -ENOMEM;
 737
 738	mem->start_section_nr = block_id * sections_per_block;
 739	mem->state = state;
 740	mem->nid = NUMA_NO_NODE;
 741	mem->nr_vmemmap_pages = nr_vmemmap_pages;
 742	INIT_LIST_HEAD(&mem->group_next);
 743
 744#ifndef CONFIG_NUMA
 745	if (state == MEM_ONLINE)
 746		/*
 747		 * MEM_ONLINE at this point implies early memory. With NUMA,
 748		 * we'll determine the zone when setting the node id via
 749		 * memory_block_add_nid(). Memory hotplug updated the zone
 750		 * manually when memory onlining/offlining succeeds.
 751		 */
 752		mem->zone = early_node_zone_for_memory_block(mem, NUMA_NO_NODE);
 753#endif /* CONFIG_NUMA */
 754
 755	ret = __add_memory_block(mem);
 756	if (ret)
 757		return ret;
 758
 759	if (group) {
 760		mem->group = group;
 761		list_add(&mem->group_next, &group->memory_blocks);
 762	}
 763
 764	return 0;
 765}
 766
 767static int __init add_boot_memory_block(unsigned long base_section_nr)
 768{
 769	int section_count = 0;
 770	unsigned long nr;
 771
 772	for (nr = base_section_nr; nr < base_section_nr + sections_per_block;
 773	     nr++)
 774		if (present_section_nr(nr))
 775			section_count++;
 776
 777	if (section_count == 0)
 778		return 0;
 779	return add_memory_block(memory_block_id(base_section_nr),
 780				MEM_ONLINE, 0,  NULL);
 781}
 782
 783static int add_hotplug_memory_block(unsigned long block_id,
 784				    unsigned long nr_vmemmap_pages,
 785				    struct memory_group *group)
 786{
 787	return add_memory_block(block_id, MEM_OFFLINE, nr_vmemmap_pages, group);
 788}
 789
 790static void remove_memory_block(struct memory_block *memory)
 791{
 792	if (WARN_ON_ONCE(memory->dev.bus != &memory_subsys))
 793		return;
 794
 795	WARN_ON(xa_erase(&memory_blocks, memory->dev.id) == NULL);
 796
 797	if (memory->group) {
 798		list_del(&memory->group_next);
 799		memory->group = NULL;
 800	}
 801
 802	/* drop the ref. we got via find_memory_block() */
 803	put_device(&memory->dev);
 804	device_unregister(&memory->dev);
 805}
 806
 807/*
 808 * Create memory block devices for the given memory area. Start and size
 809 * have to be aligned to memory block granularity. Memory block devices
 810 * will be initialized as offline.
 811 *
 812 * Called under device_hotplug_lock.
 813 */
 814int create_memory_block_devices(unsigned long start, unsigned long size,
 815				unsigned long vmemmap_pages,
 816				struct memory_group *group)
 817{
 818	const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
 819	unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
 820	struct memory_block *mem;
 821	unsigned long block_id;
 822	int ret = 0;
 823
 824	if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
 825			 !IS_ALIGNED(size, memory_block_size_bytes())))
 826		return -EINVAL;
 827
 828	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
 829		ret = add_hotplug_memory_block(block_id, vmemmap_pages, group);
 830		if (ret)
 831			break;
 832	}
 833	if (ret) {
 834		end_block_id = block_id;
 835		for (block_id = start_block_id; block_id != end_block_id;
 836		     block_id++) {
 837			mem = find_memory_block_by_id(block_id);
 838			if (WARN_ON_ONCE(!mem))
 839				continue;
 840			remove_memory_block(mem);
 841		}
 842	}
 843	return ret;
 844}
 845
 846/*
 847 * Remove memory block devices for the given memory area. Start and size
 848 * have to be aligned to memory block granularity. Memory block devices
 849 * have to be offline.
 850 *
 851 * Called under device_hotplug_lock.
 852 */
 853void remove_memory_block_devices(unsigned long start, unsigned long size)
 854{
 855	const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
 856	const unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
 857	struct memory_block *mem;
 858	unsigned long block_id;
 859
 860	if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
 861			 !IS_ALIGNED(size, memory_block_size_bytes())))
 862		return;
 863
 864	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
 865		mem = find_memory_block_by_id(block_id);
 866		if (WARN_ON_ONCE(!mem))
 867			continue;
 868		unregister_memory_block_under_nodes(mem);
 869		remove_memory_block(mem);
 870	}
 871}
 872
 873/* return true if the memory block is offlined, otherwise, return false */
 874bool is_memblock_offlined(struct memory_block *mem)
 875{
 876	return mem->state == MEM_OFFLINE;
 877}
 878
 879static struct attribute *memory_root_attrs[] = {
 880#ifdef CONFIG_ARCH_MEMORY_PROBE
 881	&dev_attr_probe.attr,
 882#endif
 883
 884#ifdef CONFIG_MEMORY_FAILURE
 885	&dev_attr_soft_offline_page.attr,
 886	&dev_attr_hard_offline_page.attr,
 887#endif
 888
 889	&dev_attr_block_size_bytes.attr,
 890	&dev_attr_auto_online_blocks.attr,
 891	NULL
 892};
 893
 894static const struct attribute_group memory_root_attr_group = {
 895	.attrs = memory_root_attrs,
 896};
 897
 898static const struct attribute_group *memory_root_attr_groups[] = {
 899	&memory_root_attr_group,
 900	NULL,
 901};
 902
 903/*
 904 * Initialize the sysfs support for memory devices. At the time this function
 905 * is called, we cannot have concurrent creation/deletion of memory block
 906 * devices, the device_hotplug_lock is not needed.
 907 */
 908void __init memory_dev_init(void)
 909{
 910	int ret;
 911	unsigned long block_sz, nr;
 912
 913	/* Validate the configured memory block size */
 914	block_sz = memory_block_size_bytes();
 915	if (!is_power_of_2(block_sz) || block_sz < MIN_MEMORY_BLOCK_SIZE)
 916		panic("Memory block size not suitable: 0x%lx\n", block_sz);
 917	sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;
 918
 919	ret = subsys_system_register(&memory_subsys, memory_root_attr_groups);
 920	if (ret)
 921		panic("%s() failed to register subsystem: %d\n", __func__, ret);
 922
 923	/*
 924	 * Create entries for memory sections that were found
 925	 * during boot and have been initialized
 926	 */
 927	for (nr = 0; nr <= __highest_present_section_nr;
 928	     nr += sections_per_block) {
 929		ret = add_boot_memory_block(nr);
 930		if (ret)
 931			panic("%s() failed to add memory block: %d\n", __func__,
 932			      ret);
 933	}
 934}
 935
 936/**
 937 * walk_memory_blocks - walk through all present memory blocks overlapped
 938 *			by the range [start, start + size)
 939 *
 940 * @start: start address of the memory range
 941 * @size: size of the memory range
 942 * @arg: argument passed to func
 943 * @func: callback for each memory section walked
 944 *
 945 * This function walks through all present memory blocks overlapped by the
 946 * range [start, start + size), calling func on each memory block.
 947 *
 948 * In case func() returns an error, walking is aborted and the error is
 949 * returned.
 950 *
 951 * Called under device_hotplug_lock.
 952 */
 953int walk_memory_blocks(unsigned long start, unsigned long size,
 954		       void *arg, walk_memory_blocks_func_t func)
 955{
 956	const unsigned long start_block_id = phys_to_block_id(start);
 957	const unsigned long end_block_id = phys_to_block_id(start + size - 1);
 958	struct memory_block *mem;
 959	unsigned long block_id;
 960	int ret = 0;
 961
 962	if (!size)
 963		return 0;
 964
 965	for (block_id = start_block_id; block_id <= end_block_id; block_id++) {
 966		mem = find_memory_block_by_id(block_id);
 967		if (!mem)
 968			continue;
 969
 970		ret = func(mem, arg);
 971		put_device(&mem->dev);
 972		if (ret)
 973			break;
 974	}
 975	return ret;
 976}
 977
 978struct for_each_memory_block_cb_data {
 979	walk_memory_blocks_func_t func;
 980	void *arg;
 981};
 982
 983static int for_each_memory_block_cb(struct device *dev, void *data)
 984{
 985	struct memory_block *mem = to_memory_block(dev);
 986	struct for_each_memory_block_cb_data *cb_data = data;
 987
 988	return cb_data->func(mem, cb_data->arg);
 989}
 990
 991/**
 992 * for_each_memory_block - walk through all present memory blocks
 993 *
 994 * @arg: argument passed to func
 995 * @func: callback for each memory block walked
 996 *
 997 * This function walks through all present memory blocks, calling func on
 998 * each memory block.
 999 *
1000 * In case func() returns an error, walking is aborted and the error is
1001 * returned.
1002 */
1003int for_each_memory_block(void *arg, walk_memory_blocks_func_t func)
1004{
1005	struct for_each_memory_block_cb_data cb_data = {
1006		.func = func,
1007		.arg = arg,
1008	};
1009
1010	return bus_for_each_dev(&memory_subsys, NULL, &cb_data,
1011				for_each_memory_block_cb);
1012}
1013
1014/*
1015 * This is an internal helper to unify allocation and initialization of
1016 * memory groups. Note that the passed memory group will be copied to a
1017 * dynamically allocated memory group. After this call, the passed
1018 * memory group should no longer be used.
1019 */
1020static int memory_group_register(struct memory_group group)
1021{
1022	struct memory_group *new_group;
1023	uint32_t mgid;
1024	int ret;
1025
1026	if (!node_possible(group.nid))
1027		return -EINVAL;
1028
1029	new_group = kzalloc(sizeof(group), GFP_KERNEL);
1030	if (!new_group)
1031		return -ENOMEM;
1032	*new_group = group;
1033	INIT_LIST_HEAD(&new_group->memory_blocks);
1034
1035	ret = xa_alloc(&memory_groups, &mgid, new_group, xa_limit_31b,
1036		       GFP_KERNEL);
1037	if (ret) {
1038		kfree(new_group);
1039		return ret;
1040	} else if (group.is_dynamic) {
1041		xa_set_mark(&memory_groups, mgid, MEMORY_GROUP_MARK_DYNAMIC);
1042	}
1043	return mgid;
1044}
1045
1046/**
1047 * memory_group_register_static() - Register a static memory group.
1048 * @nid: The node id.
1049 * @max_pages: The maximum number of pages we'll have in this static memory
1050 *	       group.
1051 *
1052 * Register a new static memory group and return the memory group id.
1053 * All memory in the group belongs to a single unit, such as a DIMM. All
1054 * memory belonging to a static memory group is added in one go to be removed
1055 * in one go -- it's static.
1056 *
1057 * Returns an error if out of memory, if the node id is invalid, if no new
1058 * memory groups can be registered, or if max_pages is invalid (0). Otherwise,
1059 * returns the new memory group id.
1060 */
1061int memory_group_register_static(int nid, unsigned long max_pages)
1062{
1063	struct memory_group group = {
1064		.nid = nid,
1065		.s = {
1066			.max_pages = max_pages,
1067		},
1068	};
1069
1070	if (!max_pages)
1071		return -EINVAL;
1072	return memory_group_register(group);
1073}
1074EXPORT_SYMBOL_GPL(memory_group_register_static);
1075
1076/**
1077 * memory_group_register_dynamic() - Register a dynamic memory group.
1078 * @nid: The node id.
1079 * @unit_pages: Unit in pages in which is memory added/removed in this dynamic
1080 *		memory group.
1081 *
1082 * Register a new dynamic memory group and return the memory group id.
1083 * Memory within a dynamic memory group is added/removed dynamically
1084 * in unit_pages.
1085 *
1086 * Returns an error if out of memory, if the node id is invalid, if no new
1087 * memory groups can be registered, or if unit_pages is invalid (0, not a
1088 * power of two, smaller than a single memory block). Otherwise, returns the
1089 * new memory group id.
1090 */
1091int memory_group_register_dynamic(int nid, unsigned long unit_pages)
1092{
1093	struct memory_group group = {
1094		.nid = nid,
1095		.is_dynamic = true,
1096		.d = {
1097			.unit_pages = unit_pages,
1098		},
1099	};
1100
1101	if (!unit_pages || !is_power_of_2(unit_pages) ||
1102	    unit_pages < PHYS_PFN(memory_block_size_bytes()))
1103		return -EINVAL;
1104	return memory_group_register(group);
1105}
1106EXPORT_SYMBOL_GPL(memory_group_register_dynamic);
1107
1108/**
1109 * memory_group_unregister() - Unregister a memory group.
1110 * @mgid: the memory group id
1111 *
1112 * Unregister a memory group. If any memory block still belongs to this
1113 * memory group, unregistering will fail.
1114 *
1115 * Returns -EINVAL if the memory group id is invalid, returns -EBUSY if some
1116 * memory blocks still belong to this memory group and returns 0 if
1117 * unregistering succeeded.
1118 */
1119int memory_group_unregister(int mgid)
1120{
1121	struct memory_group *group;
1122
1123	if (mgid < 0)
1124		return -EINVAL;
1125
1126	group = xa_load(&memory_groups, mgid);
1127	if (!group)
1128		return -EINVAL;
1129	if (!list_empty(&group->memory_blocks))
1130		return -EBUSY;
1131	xa_erase(&memory_groups, mgid);
1132	kfree(group);
1133	return 0;
1134}
1135EXPORT_SYMBOL_GPL(memory_group_unregister);
1136
1137/*
1138 * This is an internal helper only to be used in core memory hotplug code to
1139 * lookup a memory group. We don't care about locking, as we don't expect a
1140 * memory group to get unregistered while adding memory to it -- because
1141 * the group and the memory is managed by the same driver.
1142 */
1143struct memory_group *memory_group_find_by_id(int mgid)
1144{
1145	return xa_load(&memory_groups, mgid);
1146}
1147
1148/*
1149 * This is an internal helper only to be used in core memory hotplug code to
1150 * walk all dynamic memory groups excluding a given memory group, either
1151 * belonging to a specific node, or belonging to any node.
1152 */
1153int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func,
1154			       struct memory_group *excluded, void *arg)
1155{
1156	struct memory_group *group;
1157	unsigned long index;
1158	int ret = 0;
1159
1160	xa_for_each_marked(&memory_groups, index, group,
1161			   MEMORY_GROUP_MARK_DYNAMIC) {
1162		if (group == excluded)
1163			continue;
1164#ifdef CONFIG_NUMA
1165		if (nid != NUMA_NO_NODE && group->nid != nid)
1166			continue;
1167#endif /* CONFIG_NUMA */
1168		ret = func(group, arg);
1169		if (ret)
1170			break;
1171	}
1172	return ret;
1173}