drivers/base/memory.c at v6.17-rc3 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / drivers / base / memory.c
at v6.17-rc3 34 kB view raw
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Memory subsystem support
   4 *
   5 * Written by Matt Tolentino <matthew.e.tolentino@intel.com>
   6 *            Dave Hansen <haveblue@us.ibm.com>
   7 *
   8 * This file provides the necessary infrastructure to represent
   9 * a SPARSEMEM-memory-model system's physical memory in /sysfs.
  10 * All arch-independent code that assumes MEMORY_HOTPLUG requires
  11 * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
  12 */
  13
  14#include <linux/module.h>
  15#include <linux/init.h>
  16#include <linux/topology.h>
  17#include <linux/capability.h>
  18#include <linux/device.h>
  19#include <linux/memory.h>
  20#include <linux/memory_hotplug.h>
  21#include <linux/mm.h>
  22#include <linux/stat.h>
  23#include <linux/slab.h>
  24#include <linux/xarray.h>
  25#include <linux/export.h>
  26
  27#include <linux/atomic.h>
  28#include <linux/uaccess.h>
  29
  30#define MEMORY_CLASS_NAME	"memory"
  31
  /*
   * User-visible names of the MMOP_* online types. The array is indexed by
   * the MMOP_* value, so a lookup by index yields the matching string and
   * mhp_online_type_from_str() can return the index as the online type.
   */
  32static const char *const online_type_to_str[] = {
  33	[MMOP_OFFLINE] = "offline",
  34	[MMOP_ONLINE] = "online",
  35	[MMOP_ONLINE_KERNEL] = "online_kernel",
  36	[MMOP_ONLINE_MOVABLE] = "online_movable",
  37};
  38
  39int mhp_online_type_from_str(const char *str)
  40{
  41	int i;
  42
  43	for (i = 0; i < ARRAY_SIZE(online_type_to_str); i++) {
  44		if (sysfs_streq(str, online_type_to_str[i]))
  45			return i;
  46	}
  47	return -EINVAL;
  48}
  49
  /* Map a pointer to the struct device embedded in a memory_block back to it. */
  50#define to_memory_block(dev) container_of(dev, struct memory_block, dev)
  51
  /*
   * Number of memory sections covered by one memory block. Set up in
   * memory_dev_init() as memory_block_size_bytes() / MIN_MEMORY_BLOCK_SIZE.
   */
  52int sections_per_block;
  53EXPORT_SYMBOL(sections_per_block);
  54
  /* Bus online/offline callbacks; defined after the helpers they rely on. */
  55static int memory_subsys_online(struct device *dev);
  56static int memory_subsys_offline(struct device *dev);
  57
  /*
   * The "memory" subsystem bus. Every memory block device is attached to it
   * in __add_memory_block(), and it is registered in memory_dev_init().
   */
  58static const struct bus_type memory_subsys = {
  59	.name = MEMORY_CLASS_NAME,
  60	.dev_name = MEMORY_CLASS_NAME,
  61	.online = memory_subsys_online,
  62	.offline = memory_subsys_offline,
  63};
  64
  65/*
  66 * Memory blocks are cached in a local XArray, indexed by memory block
  67 * id, to avoid a costly linear search for the corresponding device on
  68 * the subsystem bus.
  69 */
  70static DEFINE_XARRAY(memory_blocks);
  71
  72/*
  73 * Memory groups, indexed by memory group id (mgid).
  74 */
  75static DEFINE_XARRAY_FLAGS(memory_groups, XA_FLAGS_ALLOC);
  76#define MEMORY_GROUP_MARK_DYNAMIC	XA_MARK_1
  77
  /* Notifier chain that memory_notify() runs; see register_memory_notifier(). */
  78static BLOCKING_NOTIFIER_HEAD(memory_chain);
  79
  /*
   * Add @nb to the memory notifier chain (memory_chain).
   *
   * Returns whatever blocking_notifier_chain_register() returns.
   */
  80int register_memory_notifier(struct notifier_block *nb)
  81{
  82	return blocking_notifier_chain_register(&memory_chain, nb);
  83}
  84EXPORT_SYMBOL(register_memory_notifier);
  85
  /*
   * Remove @nb from the memory notifier chain (memory_chain). The result of
   * blocking_notifier_chain_unregister() is not reported to the caller.
   */
  86void unregister_memory_notifier(struct notifier_block *nb)
  87{
  88	blocking_notifier_chain_unregister(&memory_chain, nb);
  89}
  90EXPORT_SYMBOL(unregister_memory_notifier);
  91
  92static void memory_block_release(struct device *dev)
  93{
  94	struct memory_block *mem = to_memory_block(dev);
  95	/* Verify that the altmap is freed */
  96	WARN_ON(mem->altmap);
  97	kfree(mem);
  98}
  99
 100
 101/* Max block size to be set by memory_block_advise_max_size */
 102static unsigned long memory_block_advised_size;
 103static bool memory_block_advised_size_queried;
 104
 105/**
 106 * memory_block_advise_max_size() - advise memory hotplug on the max suggested
 107 *				    block size, usually for alignment.
 108 * @size: suggestion for maximum block size. must be aligned on power of 2.
 109 *
 110 * Early boot software (pre-allocator init) may advise archs on the max block
 111 * size. This value can only decrease after initialization, as the intent is
 112 * to identify the largest supported alignment for all sources.
 113 *
 114 * Use of this value is arch-defined, as is min/max block size.
 115 *
 116 * Return: 0 on success
 117 *	   -EINVAL if size is 0 or not pow2 aligned
 118 *	   -EBUSY if value has already been probed
 119 */
 120int __init memory_block_advise_max_size(unsigned long size)
 121{
 122	if (!size || !is_power_of_2(size))
 123		return -EINVAL;
 124
 125	if (memory_block_advised_size_queried)
 126		return -EBUSY;
 127
 128	if (memory_block_advised_size)
 129		memory_block_advised_size = min(memory_block_advised_size, size);
 130	else
 131		memory_block_advised_size = size;
 132
 133	return 0;
 134}
 135
 136/**
 137 * memory_block_advised_max_size() - query advised max hotplug block size.
 138 *
 139 * After the first call, the value can never change. Callers looking for the
 140 * actual block size should use memory_block_size_bytes. This interface is
 141 * intended for use by arch-init when initializing the hotplug block size.
 142 *
 143 * Return: advised size in bytes, or 0 if never set.
 144 */
 145unsigned long memory_block_advised_max_size(void)
 146{
 147	memory_block_advised_size_queried = true;
 148	return memory_block_advised_size;
 149}
 150
 /*
  * Size of a memory block in bytes. This default returns the minimum,
  * MIN_MEMORY_BLOCK_SIZE; it is declared __weak, so a non-weak definition
  * elsewhere takes precedence over it. memory_dev_init() validates the
  * resulting value.
  */
 151unsigned long __weak memory_block_size_bytes(void)
 152{
 153	return MIN_MEMORY_BLOCK_SIZE;
 154}
 155EXPORT_SYMBOL_GPL(memory_block_size_bytes);
 156
 157/* Show the memory block ID, relative to the memory block size */
 158static ssize_t phys_index_show(struct device *dev,
 159			       struct device_attribute *attr, char *buf)
 160{
 161	struct memory_block *mem = to_memory_block(dev);
 162
 163	return sysfs_emit(buf, "%08lx\n", memory_block_id(mem->start_section_nr));
 164}
 165
 166/*
 167 * Legacy interface that we cannot remove. Always indicate "removable"
 168 * with CONFIG_MEMORY_HOTREMOVE - bad heuristic.
 169 */
 170static ssize_t removable_show(struct device *dev, struct device_attribute *attr,
 171			      char *buf)
 172{
 173	return sysfs_emit(buf, "%d\n", (int)IS_ENABLED(CONFIG_MEMORY_HOTREMOVE));
 174}
 175
 176/*
 177 * online, offline, going offline, etc.
 178 */
 179static ssize_t state_show(struct device *dev, struct device_attribute *attr,
 180			  char *buf)
 181{
 182	struct memory_block *mem = to_memory_block(dev);
 183	const char *output;
 184
 185	/*
 186	 * We can probably put these states in a nice little array
 187	 * so that they're not open-coded
 188	 */
 189	switch (mem->state) {
 190	case MEM_ONLINE:
 191		output = "online";
 192		break;
 193	case MEM_OFFLINE:
 194		output = "offline";
 195		break;
 196	case MEM_GOING_OFFLINE:
 197		output = "going-offline";
 198		break;
 199	default:
 200		WARN_ON(1);
 201		return sysfs_emit(buf, "ERROR-UNKNOWN-%ld\n", mem->state);
 202	}
 203
 204	return sysfs_emit(buf, "%s\n", output);
 205}
 206
 /*
  * Run the memory notifier chain for event @val, handing @v to each
  * callback. Returns the raw result of blocking_notifier_call_chain();
  * memory_block_online() converts it with notifier_to_errno().
  */
 207int memory_notify(unsigned long val, void *v)
 208{
 209	return blocking_notifier_call_chain(&memory_chain, val, v);
 210}
 211
 /*
  * memblk_nr_poison(): with both CONFIG_MEMORY_FAILURE and
  * CONFIG_MEMORY_HOTPLUG enabled this is only a forward declaration (the
  * definition is not in this part of the file); otherwise the stub below
  * always returns 0. Judging by its callers, a non-zero value means the
  * block has poisoned pages: memory_block_online() then fails with
  * -EHWPOISON.
  */
 212#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
 213static unsigned long memblk_nr_poison(struct memory_block *mem);
 214#else
 215static inline unsigned long memblk_nr_poison(struct memory_block *mem)
 216{
 217	return 0;
 218}
 219#endif
 220
 221/*
 222 * Must acquire mem_hotplug_lock in write mode.
 223 */
 224static int memory_block_online(struct memory_block *mem)
 225{
 226	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
 227	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
 228	unsigned long nr_vmemmap_pages = 0;
 229	struct memory_notify arg;
 230	struct zone *zone;
 231	int ret;
 232
 233	if (memblk_nr_poison(mem))
 234		return -EHWPOISON;
 235
 236	zone = zone_for_pfn_range(mem->online_type, mem->nid, mem->group,
 237				  start_pfn, nr_pages);
 238
 239	/*
 240	 * Although vmemmap pages have a different lifecycle than the pages
 241	 * they describe (they remain until the memory is unplugged), doing
 242	 * their initialization and accounting at memory onlining/offlining
 243	 * stage helps to keep accounting easier to follow - e.g vmemmaps
 244	 * belong to the same zone as the memory they backed.
 245	 */
 246	if (mem->altmap)
 247		nr_vmemmap_pages = mem->altmap->free;
 248
 249	arg.altmap_start_pfn = start_pfn;
 250	arg.altmap_nr_pages = nr_vmemmap_pages;
 251	arg.start_pfn = start_pfn + nr_vmemmap_pages;
 252	arg.nr_pages = nr_pages - nr_vmemmap_pages;
 253	mem_hotplug_begin();
 254	ret = memory_notify(MEM_PREPARE_ONLINE, &arg);
 255	ret = notifier_to_errno(ret);
 256	if (ret)
 257		goto out_notifier;
 258
 259	if (nr_vmemmap_pages) {
 260		ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages,
 261						zone, mem->altmap->inaccessible);
 262		if (ret)
 263			goto out;
 264	}
 265
 266	ret = online_pages(start_pfn + nr_vmemmap_pages,
 267			   nr_pages - nr_vmemmap_pages, zone, mem->group);
 268	if (ret) {
 269		if (nr_vmemmap_pages)
 270			mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
 271		goto out;
 272	}
 273
 274	/*
 275	 * Account once onlining succeeded. If the zone was unpopulated, it is
 276	 * now already properly populated.
 277	 */
 278	if (nr_vmemmap_pages)
 279		adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
 280					  nr_vmemmap_pages);
 281
 282	mem->zone = zone;
 283	mem_hotplug_done();
 284	return ret;
 285out:
 286	memory_notify(MEM_FINISH_OFFLINE, &arg);
 287out_notifier:
 288	mem_hotplug_done();
 289	return ret;
 290}
 291
 292/*
 293 * Must acquire mem_hotplug_lock in write mode.
 294 */
 295static int memory_block_offline(struct memory_block *mem)
 296{
 297	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
 298	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
 299	unsigned long nr_vmemmap_pages = 0;
 300	struct memory_notify arg;
 301	int ret;
 302
 303	if (!mem->zone)
 304		return -EINVAL;
 305
 306	/*
 307	 * Unaccount before offlining, such that unpopulated zone and kthreads
 308	 * can properly be torn down in offline_pages().
 309	 */
 310	if (mem->altmap)
 311		nr_vmemmap_pages = mem->altmap->free;
 312
 313	mem_hotplug_begin();
 314	if (nr_vmemmap_pages)
 315		adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
 316					  -nr_vmemmap_pages);
 317
 318	ret = offline_pages(start_pfn + nr_vmemmap_pages,
 319			    nr_pages - nr_vmemmap_pages, mem->zone, mem->group);
 320	if (ret) {
 321		/* offline_pages() failed. Account back. */
 322		if (nr_vmemmap_pages)
 323			adjust_present_page_count(pfn_to_page(start_pfn),
 324						  mem->group, nr_vmemmap_pages);
 325		goto out;
 326	}
 327
 328	if (nr_vmemmap_pages)
 329		mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
 330
 331	mem->zone = NULL;
 332	arg.altmap_start_pfn = start_pfn;
 333	arg.altmap_nr_pages = nr_vmemmap_pages;
 334	arg.start_pfn = start_pfn + nr_vmemmap_pages;
 335	arg.nr_pages = nr_pages - nr_vmemmap_pages;
 336	memory_notify(MEM_FINISH_OFFLINE, &arg);
 337out:
 338	mem_hotplug_done();
 339	return ret;
 340}
 341
 342/*
 343 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
 344 * OK to have direct references to sparsemem variables in here.
 345 */
 346static int
 347memory_block_action(struct memory_block *mem, unsigned long action)
 348{
 349	int ret;
 350
 351	switch (action) {
 352	case MEM_ONLINE:
 353		ret = memory_block_online(mem);
 354		break;
 355	case MEM_OFFLINE:
 356		ret = memory_block_offline(mem);
 357		break;
 358	default:
 359		WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
 360		     "%ld\n", __func__, mem->start_section_nr, action, action);
 361		ret = -EINVAL;
 362	}
 363
 364	return ret;
 365}
 366
 367static int memory_block_change_state(struct memory_block *mem,
 368		unsigned long to_state, unsigned long from_state_req)
 369{
 370	int ret = 0;
 371
 372	if (mem->state != from_state_req)
 373		return -EINVAL;
 374
 375	if (to_state == MEM_OFFLINE)
 376		mem->state = MEM_GOING_OFFLINE;
 377
 378	ret = memory_block_action(mem, to_state);
 379	mem->state = ret ? from_state_req : to_state;
 380
 381	return ret;
 382}
 383
 384/* The device lock serializes operations on memory_subsys_[online|offline] */
 385static int memory_subsys_online(struct device *dev)
 386{
 387	struct memory_block *mem = to_memory_block(dev);
 388	int ret;
 389
 390	if (mem->state == MEM_ONLINE)
 391		return 0;
 392
 393	/*
 394	 * When called via device_online() without configuring the online_type,
 395	 * we want to default to MMOP_ONLINE.
 396	 */
 397	if (mem->online_type == MMOP_OFFLINE)
 398		mem->online_type = MMOP_ONLINE;
 399
 400	ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
 401	mem->online_type = MMOP_OFFLINE;
 402
 403	return ret;
 404}
 405
 406static int memory_subsys_offline(struct device *dev)
 407{
 408	struct memory_block *mem = to_memory_block(dev);
 409
 410	if (mem->state == MEM_OFFLINE)
 411		return 0;
 412
 413	return memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
 414}
 415
 416static ssize_t state_store(struct device *dev, struct device_attribute *attr,
 417			   const char *buf, size_t count)
 418{
 419	const int online_type = mhp_online_type_from_str(buf);
 420	struct memory_block *mem = to_memory_block(dev);
 421	int ret;
 422
 423	if (online_type < 0)
 424		return -EINVAL;
 425
 426	ret = lock_device_hotplug_sysfs();
 427	if (ret)
 428		return ret;
 429
 430	switch (online_type) {
 431	case MMOP_ONLINE_KERNEL:
 432	case MMOP_ONLINE_MOVABLE:
 433	case MMOP_ONLINE:
 434		/* mem->online_type is protected by device_hotplug_lock */
 435		mem->online_type = online_type;
 436		ret = device_online(&mem->dev);
 437		break;
 438	case MMOP_OFFLINE:
 439		ret = device_offline(&mem->dev);
 440		break;
 441	default:
 442		ret = -EINVAL; /* should never happen */
 443	}
 444
 445	unlock_device_hotplug();
 446
 447	if (ret < 0)
 448		return ret;
 449	if (ret)
 450		return -EINVAL;
 451
 452	return count;
 453}
 454
 455/*
 456 * Legacy interface that we cannot remove: s390x exposes the storage increment
 457 * covered by a memory block, allowing for identifying which memory blocks
 458 * comprise a storage increment. Since a memory block spans complete
 459 * storage increments nowadays, this interface is basically unused. Other
 460 * archs never exposed != 0.
 461 */
 462static ssize_t phys_device_show(struct device *dev,
 463				struct device_attribute *attr, char *buf)
 464{
 465	struct memory_block *mem = to_memory_block(dev);
 466	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
 467
 468	return sysfs_emit(buf, "%d\n",
 469			  arch_get_memory_phys_device(start_pfn));
 470}
 471
 472#ifdef CONFIG_MEMORY_HOTREMOVE
 473static int print_allowed_zone(char *buf, int len, int nid,
 474			      struct memory_group *group,
 475			      unsigned long start_pfn, unsigned long nr_pages,
 476			      int online_type, struct zone *default_zone)
 477{
 478	struct zone *zone;
 479
 480	zone = zone_for_pfn_range(online_type, nid, group, start_pfn, nr_pages);
 481	if (zone == default_zone)
 482		return 0;
 483
 484	return sysfs_emit_at(buf, len, " %s", zone->name);
 485}
 486
 487static ssize_t valid_zones_show(struct device *dev,
 488				struct device_attribute *attr, char *buf)
 489{
 490	struct memory_block *mem = to_memory_block(dev);
 491	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
 492	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
 493	struct memory_group *group = mem->group;
 494	struct zone *default_zone;
 495	int nid = mem->nid;
 496	int len;
 497
 498	/*
 499	 * Check the existing zone. Make sure that we do that only on the
 500	 * online nodes otherwise the page_zone is not reliable
 501	 */
 502	if (mem->state == MEM_ONLINE) {
 503		/*
 504		 * If !mem->zone, the memory block spans multiple zones and
 505		 * cannot get offlined.
 506		 */
 507		return sysfs_emit(buf, "%s\n",
 508				  mem->zone ? mem->zone->name : "none");
 509	}
 510
 511	default_zone = zone_for_pfn_range(MMOP_ONLINE, nid, group,
 512					  start_pfn, nr_pages);
 513
 514	len = sysfs_emit(buf, "%s", default_zone->name);
 515	len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages,
 516				  MMOP_ONLINE_KERNEL, default_zone);
 517	len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages,
 518				  MMOP_ONLINE_MOVABLE, default_zone);
 519	len += sysfs_emit_at(buf, len, "\n");
 520	return len;
 521}
 522static DEVICE_ATTR_RO(valid_zones);
 523#endif
 524
 /*
  * Per-memory-block device attributes, collected in memory_memblk_attrs[].
  * Only "state" is writable (it has both a show and a store handler).
  */
 525static DEVICE_ATTR_RO(phys_index);
 526static DEVICE_ATTR_RW(state);
 527static DEVICE_ATTR_RO(phys_device);
 528static DEVICE_ATTR_RO(removable);
 529
 530/*
 531 * Show the memory block size (shared by all memory blocks).
 532 */
 533static ssize_t block_size_bytes_show(struct device *dev,
 534				     struct device_attribute *attr, char *buf)
 535{
 536	return sysfs_emit(buf, "%lx\n", memory_block_size_bytes());
 537}
 538
 539static DEVICE_ATTR_RO(block_size_bytes);
 540
 541/*
 542 * Memory auto online policy.
 543 */
 544
 545static ssize_t auto_online_blocks_show(struct device *dev,
 546				       struct device_attribute *attr, char *buf)
 547{
 548	return sysfs_emit(buf, "%s\n",
 549			  online_type_to_str[mhp_get_default_online_type()]);
 550}
 551
 552static ssize_t auto_online_blocks_store(struct device *dev,
 553					struct device_attribute *attr,
 554					const char *buf, size_t count)
 555{
 556	const int online_type = mhp_online_type_from_str(buf);
 557
 558	if (online_type < 0)
 559		return -EINVAL;
 560
 561	mhp_set_default_online_type(online_type);
 562	return count;
 563}
 564
 565static DEVICE_ATTR_RW(auto_online_blocks);
 566
 567#ifdef CONFIG_CRASH_HOTPLUG
 568#include <linux/kexec.h>
 /*
  * sysfs "crash_hotplug": print the value returned by
  * crash_check_hotplug_support() as a decimal integer.
  */
 569static ssize_t crash_hotplug_show(struct device *dev,
 570				       struct device_attribute *attr, char *buf)
 571{
 572	return sysfs_emit(buf, "%d\n", crash_check_hotplug_support());
 573}
 574static DEVICE_ATTR_RO(crash_hotplug);
 575#endif
 576
 577/*
 578 * Some architectures will have custom drivers to do this, and
 579 * will not need to do it from userspace.  The fake hot-add code
 580 * as well as ppc64 will do all of their discovery in userspace
 581 * and will require this interface.
 582 */
 583#ifdef CONFIG_ARCH_MEMORY_PROBE
 584static ssize_t probe_store(struct device *dev, struct device_attribute *attr,
 585			   const char *buf, size_t count)
 586{
 587	u64 phys_addr;
 588	int nid, ret;
 589	unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block;
 590
 591	ret = kstrtoull(buf, 0, &phys_addr);
 592	if (ret)
 593		return ret;
 594
 595	if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
 596		return -EINVAL;
 597
 598	ret = lock_device_hotplug_sysfs();
 599	if (ret)
 600		return ret;
 601
 602	nid = memory_add_physaddr_to_nid(phys_addr);
 603	ret = __add_memory(nid, phys_addr,
 604			   MIN_MEMORY_BLOCK_SIZE * sections_per_block,
 605			   MHP_NONE);
 606
 607	if (ret)
 608		goto out;
 609
 610	ret = count;
 611out:
 612	unlock_device_hotplug();
 613	return ret;
 614}
 615
 616static DEVICE_ATTR_WO(probe);
 617#endif
 618
 619#ifdef CONFIG_MEMORY_FAILURE
 620/*
 621 * Support for offlining pages of memory
 622 */
 623
 624/* Soft offline a page */
 625static ssize_t soft_offline_page_store(struct device *dev,
 626				       struct device_attribute *attr,
 627				       const char *buf, size_t count)
 628{
 629	int ret;
 630	u64 pfn;
 631	if (!capable(CAP_SYS_ADMIN))
 632		return -EPERM;
 633	if (kstrtoull(buf, 0, &pfn) < 0)
 634		return -EINVAL;
 635	pfn >>= PAGE_SHIFT;
 636	ret = soft_offline_page(pfn, 0);
 637	return ret == 0 ? count : ret;
 638}
 639
 640/* Forcibly offline a page, including killing processes. */
 641static ssize_t hard_offline_page_store(struct device *dev,
 642				       struct device_attribute *attr,
 643				       const char *buf, size_t count)
 644{
 645	int ret;
 646	u64 pfn;
 647	if (!capable(CAP_SYS_ADMIN))
 648		return -EPERM;
 649	if (kstrtoull(buf, 0, &pfn) < 0)
 650		return -EINVAL;
 651	pfn >>= PAGE_SHIFT;
 652	ret = memory_failure(pfn, MF_SW_SIMULATED);
 653	if (ret == -EOPNOTSUPP)
 654		ret = 0;
 655	return ret ? ret : count;
 656}
 657
 /*
  * Write-only attributes backed by soft_offline_page_store() and
  * hard_offline_page_store(); listed in memory_root_attrs[].
  */
 658static DEVICE_ATTR_WO(soft_offline_page);
 659static DEVICE_ATTR_WO(hard_offline_page);
 660#endif
 661
 662/*
  * Default value for the legacy "phys_device" attribute: always 0,
  * regardless of @start_pfn. Declared __weak, so a non-weak definition
  * elsewhere takes precedence. See phys_device_show().
  */
 663int __weak arch_get_memory_phys_device(unsigned long start_pfn)
 664{
 665	return 0;
 666}
 667
 668/*
 669 * A reference for the returned memory block device is acquired.
 670 *
 671 * Called under device_hotplug_lock.
 672 */
 673struct memory_block *find_memory_block_by_id(unsigned long block_id)
 674{
 675	struct memory_block *mem;
 676
 677	mem = xa_load(&memory_blocks, block_id);
 678	if (mem)
 679		get_device(&mem->dev);
 680	return mem;
 681}
 682
 683/*
 684 * Called under device_hotplug_lock.
 685 */
 686struct memory_block *find_memory_block(unsigned long section_nr)
 687{
 688	unsigned long block_id = memory_block_id(section_nr);
 689
 690	return find_memory_block_by_id(block_id);
 691}
 692
 /*
  * Attributes of each memory block device. "valid_zones" only exists with
  * CONFIG_MEMORY_HOTREMOVE.
  */
 693static struct attribute *memory_memblk_attrs[] = {
 694	&dev_attr_phys_index.attr,
 695	&dev_attr_state.attr,
 696	&dev_attr_phys_device.attr,
 697	&dev_attr_removable.attr,
 698#ifdef CONFIG_MEMORY_HOTREMOVE
 699	&dev_attr_valid_zones.attr,
 700#endif
 701	NULL
 702};
 703
 704static const struct attribute_group memory_memblk_attr_group = {
 705	.attrs = memory_memblk_attrs,
 706};
 707
 /* NULL-terminated group list assigned to dev.groups in __add_memory_block(). */
 708static const struct attribute_group *memory_memblk_attr_groups[] = {
 709	&memory_memblk_attr_group,
 710	NULL,
 711};
 712
 713static int __add_memory_block(struct memory_block *memory)
 714{
 715	int ret;
 716
 717	memory->dev.bus = &memory_subsys;
 718	memory->dev.id = memory->start_section_nr / sections_per_block;
 719	memory->dev.release = memory_block_release;
 720	memory->dev.groups = memory_memblk_attr_groups;
 721	memory->dev.offline = memory->state == MEM_OFFLINE;
 722
 723	ret = device_register(&memory->dev);
 724	if (ret) {
 725		put_device(&memory->dev);
 726		return ret;
 727	}
 728	ret = xa_err(xa_store(&memory_blocks, memory->dev.id, memory,
 729			      GFP_KERNEL));
 730	if (ret)
 731		device_unregister(&memory->dev);
 732
 733	return ret;
 734}
 735
 736static struct zone *early_node_zone_for_memory_block(struct memory_block *mem,
 737						     int nid)
 738{
 739	const unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
 740	const unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
 741	struct zone *zone, *matching_zone = NULL;
 742	pg_data_t *pgdat = NODE_DATA(nid);
 743	int i;
 744
 745	/*
 746	 * This logic only works for early memory, when the applicable zones
 747	 * already span the memory block. We don't expect overlapping zones on
 748	 * a single node for early memory. So if we're told that some PFNs
 749	 * of a node fall into this memory block, we can assume that all node
 750	 * zones that intersect with the memory block are actually applicable.
 751	 * No need to look at the memmap.
 752	 */
 753	for (i = 0; i < MAX_NR_ZONES; i++) {
 754		zone = pgdat->node_zones + i;
 755		if (!populated_zone(zone))
 756			continue;
 757		if (!zone_intersects(zone, start_pfn, nr_pages))
 758			continue;
 759		if (!matching_zone) {
 760			matching_zone = zone;
 761			continue;
 762		}
 763		/* Spans multiple zones ... */
 764		matching_zone = NULL;
 765		break;
 766	}
 767	return matching_zone;
 768}
 769
 770#ifdef CONFIG_NUMA
 771/**
 772 * memory_block_add_nid() - Indicate that system RAM falling into this memory
 773 *			    block device (partially) belongs to the given node.
 774 * @mem: The memory block device.
 775 * @nid: The node id.
 776 * @context: The memory initialization context.
 777 *
 778 * Indicate that system RAM falling into this memory block (partially) belongs
 779 * to the given node. If the context indicates ("early") that we are adding the
 780 * node during node device subsystem initialization, this will also properly
 781 * set/adjust mem->zone based on the zone ranges of the given node.
 782 */
 783void memory_block_add_nid(struct memory_block *mem, int nid,
 784			  enum meminit_context context)
 785{
 786	if (context == MEMINIT_EARLY && mem->nid != nid) {
 787		/*
 788		 * For early memory we have to determine the zone when setting
 789		 * the node id and handle multiple nodes spanning a single
 790		 * memory block by indicate via zone == NULL that we're not
 791		 * dealing with a single zone. So if we're setting the node id
 792		 * the first time, determine if there is a single zone. If we're
 793		 * setting the node id a second time to a different node,
 794		 * invalidate the single detected zone.
 795		 */
 796		if (mem->nid == NUMA_NO_NODE)
 797			mem->zone = early_node_zone_for_memory_block(mem, nid);
 798		else
 799			mem->zone = NULL;
 800	}
 801
 802	/*
 803	 * If this memory block spans multiple nodes, we only indicate
 804	 * the last processed node. If we span multiple nodes (not applicable
 805	 * to hotplugged memory), zone == NULL will prohibit memory offlining
 806	 * and consequently unplug.
 807	 */
 808	mem->nid = nid;
 809}
 810#endif
 811
 812static int add_memory_block(unsigned long block_id, unsigned long state,
 813			    struct vmem_altmap *altmap,
 814			    struct memory_group *group)
 815{
 816	struct memory_block *mem;
 817	int ret = 0;
 818
 819	mem = find_memory_block_by_id(block_id);
 820	if (mem) {
 821		put_device(&mem->dev);
 822		return -EEXIST;
 823	}
 824	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
 825	if (!mem)
 826		return -ENOMEM;
 827
 828	mem->start_section_nr = block_id * sections_per_block;
 829	mem->state = state;
 830	mem->nid = NUMA_NO_NODE;
 831	mem->altmap = altmap;
 832	INIT_LIST_HEAD(&mem->group_next);
 833
 834#ifndef CONFIG_NUMA
 835	if (state == MEM_ONLINE)
 836		/*
 837		 * MEM_ONLINE at this point implies early memory. With NUMA,
 838		 * we'll determine the zone when setting the node id via
 839		 * memory_block_add_nid(). Memory hotplug updated the zone
 840		 * manually when memory onlining/offlining succeeds.
 841		 */
 842		mem->zone = early_node_zone_for_memory_block(mem, NUMA_NO_NODE);
 843#endif /* CONFIG_NUMA */
 844
 845	ret = __add_memory_block(mem);
 846	if (ret)
 847		return ret;
 848
 849	if (group) {
 850		mem->group = group;
 851		list_add(&mem->group_next, &group->memory_blocks);
 852	}
 853
 854	return 0;
 855}
 856
 /*
  * Add a memory block for hotplugged memory: same as add_memory_block(),
  * with the initial state fixed to MEM_OFFLINE.
  */
 857static int add_hotplug_memory_block(unsigned long block_id,
 858				    struct vmem_altmap *altmap,
 859				    struct memory_group *group)
 860{
 861	return add_memory_block(block_id, MEM_OFFLINE, altmap, group);
 862}
 863
 864static void remove_memory_block(struct memory_block *memory)
 865{
 866	if (WARN_ON_ONCE(memory->dev.bus != &memory_subsys))
 867		return;
 868
 869	WARN_ON(xa_erase(&memory_blocks, memory->dev.id) == NULL);
 870
 871	if (memory->group) {
 872		list_del(&memory->group_next);
 873		memory->group = NULL;
 874	}
 875
 876	/* drop the ref. we got via find_memory_block() */
 877	put_device(&memory->dev);
 878	device_unregister(&memory->dev);
 879}
 880
 881/*
 882 * Create memory block devices for the given memory area. Start and size
 883 * have to be aligned to memory block granularity. Memory block devices
 884 * will be initialized as offline.
 885 *
 886 * Called under device_hotplug_lock.
 887 */
 888int create_memory_block_devices(unsigned long start, unsigned long size,
 889				struct vmem_altmap *altmap,
 890				struct memory_group *group)
 891{
 892	const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
 893	unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
 894	struct memory_block *mem;
 895	unsigned long block_id;
 896	int ret = 0;
 897
 898	if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
 899			 !IS_ALIGNED(size, memory_block_size_bytes())))
 900		return -EINVAL;
 901
 902	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
 903		ret = add_hotplug_memory_block(block_id, altmap, group);
 904		if (ret)
 905			break;
 906	}
 907	if (ret) {
 908		end_block_id = block_id;
 909		for (block_id = start_block_id; block_id != end_block_id;
 910		     block_id++) {
 911			mem = find_memory_block_by_id(block_id);
 912			if (WARN_ON_ONCE(!mem))
 913				continue;
 914			remove_memory_block(mem);
 915		}
 916	}
 917	return ret;
 918}
 919
 920/*
 921 * Remove memory block devices for the given memory area. Start and size
 922 * have to be aligned to memory block granularity. Memory block devices
 923 * have to be offline.
 924 *
 925 * Called under device_hotplug_lock.
 926 */
 927void remove_memory_block_devices(unsigned long start, unsigned long size)
 928{
 929	const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
 930	const unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
 931	struct memory_block *mem;
 932	unsigned long block_id;
 933
 934	if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
 935			 !IS_ALIGNED(size, memory_block_size_bytes())))
 936		return;
 937
 938	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
 939		mem = find_memory_block_by_id(block_id);
 940		if (WARN_ON_ONCE(!mem))
 941			continue;
 942		num_poisoned_pages_sub(-1UL, memblk_nr_poison(mem));
 943		unregister_memory_block_under_nodes(mem);
 944		remove_memory_block(mem);
 945	}
 946}
 947
 /*
  * Attributes of the memory subsystem root. "probe", the two offline-page
  * files and "crash_hotplug" only exist with the respective config option.
  */
 948static struct attribute *memory_root_attrs[] = {
 949#ifdef CONFIG_ARCH_MEMORY_PROBE
 950	&dev_attr_probe.attr,
 951#endif
 952
 953#ifdef CONFIG_MEMORY_FAILURE
 954	&dev_attr_soft_offline_page.attr,
 955	&dev_attr_hard_offline_page.attr,
 956#endif
 957
 958	&dev_attr_block_size_bytes.attr,
 959	&dev_attr_auto_online_blocks.attr,
 960#ifdef CONFIG_CRASH_HOTPLUG
 961	&dev_attr_crash_hotplug.attr,
 962#endif
 963	NULL
 964};
 965
 966static const struct attribute_group memory_root_attr_group = {
 967	.attrs = memory_root_attrs,
 968};
 969
 /* NULL-terminated group list passed to subsys_system_register(). */
 970static const struct attribute_group *memory_root_attr_groups[] = {
 971	&memory_root_attr_group,
 972	NULL,
 973};
 974
 975/*
 976 * Initialize the sysfs support for memory devices. At the time this function
 977 * is called, we cannot have concurrent creation/deletion of memory block
 978 * devices, the device_hotplug_lock is not needed.
 979 */
 980void __init memory_dev_init(void)
 981{
 982	int ret;
 983	unsigned long block_sz, block_id, nr;
 984
 985	/* Validate the configured memory block size */
 986	block_sz = memory_block_size_bytes();
 987	if (!is_power_of_2(block_sz) || block_sz < MIN_MEMORY_BLOCK_SIZE)
 988		panic("Memory block size not suitable: 0x%lx\n", block_sz);
 989	sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;
 990
 991	ret = subsys_system_register(&memory_subsys, memory_root_attr_groups);
 992	if (ret)
 993		panic("%s() failed to register subsystem: %d\n", __func__, ret);
 994
 995	/*
 996	 * Create entries for memory sections that were found during boot
 997	 * and have been initialized. Use @block_id to track the last
 998	 * handled block and initialize it to an invalid value (ULONG_MAX)
 999	 * to bypass the block ID matching check for the first present
1000	 * block so that it can be covered.
1001	 */
1002	block_id = ULONG_MAX;
1003	for_each_present_section_nr(0, nr) {
1004		if (block_id != ULONG_MAX && memory_block_id(nr) == block_id)
1005			continue;
1006
1007		block_id = memory_block_id(nr);
1008		ret = add_memory_block(block_id, MEM_ONLINE, NULL, NULL);
1009		if (ret) {
1010			panic("%s() failed to add memory block: %d\n",
1011			      __func__, ret);
1012		}
1013	}
1014}
1015
1016/**
1017 * walk_memory_blocks - walk through all present memory blocks overlapped
1018 *			by the range [start, start + size)
1019 *
1020 * @start: start address of the memory range
1021 * @size: size of the memory range
1022 * @arg: argument passed to func
1023 * @func: callback for each memory section walked
1024 *
1025 * This function walks through all present memory blocks overlapped by the
1026 * range [start, start + size), calling func on each memory block.
1027 *
1028 * In case func() returns an error, walking is aborted and the error is
1029 * returned.
1030 *
1031 * Called under device_hotplug_lock.
1032 */
1033int walk_memory_blocks(unsigned long start, unsigned long size,
1034		       void *arg, walk_memory_blocks_func_t func)
1035{
1036	const unsigned long start_block_id = phys_to_block_id(start);
1037	const unsigned long end_block_id = phys_to_block_id(start + size - 1);
1038	struct memory_block *mem;
1039	unsigned long block_id;
1040	int ret = 0;
1041
1042	if (!size)
1043		return 0;
1044
1045	for (block_id = start_block_id; block_id <= end_block_id; block_id++) {
1046		mem = find_memory_block_by_id(block_id);
1047		if (!mem)
1048			continue;
1049
1050		ret = func(mem, arg);
1051		put_device(&mem->dev);
1052		if (ret)
1053			break;
1054	}
1055	return ret;
1056}
1057
/*
 * Context handed to for_each_memory_block_cb() through the opaque data
 * pointer of bus_for_each_dev(): it bundles the caller's callback with the
 * argument that callback expects.
 */
struct for_each_memory_block_cb_data {
	/* callback to invoke for each memory block */
	walk_memory_blocks_func_t func;
	/* opaque argument forwarded unchanged to func */
	void *arg;
};
1062
1063static int for_each_memory_block_cb(struct device *dev, void *data)
1064{
1065	struct memory_block *mem = to_memory_block(dev);
1066	struct for_each_memory_block_cb_data *cb_data = data;
1067
1068	return cb_data->func(mem, cb_data->arg);
1069}
1070
1071/**
1072 * for_each_memory_block - walk through all present memory blocks
1073 *
1074 * @arg: argument passed to func
1075 * @func: callback for each memory block walked
1076 *
1077 * This function walks through all present memory blocks, calling func on
1078 * each memory block.
1079 *
1080 * In case func() returns an error, walking is aborted and the error is
1081 * returned.
1082 */
1083int for_each_memory_block(void *arg, walk_memory_blocks_func_t func)
1084{
1085	struct for_each_memory_block_cb_data cb_data = {
1086		.func = func,
1087		.arg = arg,
1088	};
1089
1090	return bus_for_each_dev(&memory_subsys, NULL, &cb_data,
1091				for_each_memory_block_cb);
1092}
1093
1094/*
1095 * This is an internal helper to unify allocation and initialization of
1096 * memory groups. Note that the passed memory group will be copied to a
1097 * dynamically allocated memory group. After this call, the passed
1098 * memory group should no longer be used.
1099 */
1100static int memory_group_register(struct memory_group group)
1101{
1102	struct memory_group *new_group;
1103	uint32_t mgid;
1104	int ret;
1105
1106	if (!node_possible(group.nid))
1107		return -EINVAL;
1108
1109	new_group = kzalloc(sizeof(group), GFP_KERNEL);
1110	if (!new_group)
1111		return -ENOMEM;
1112	*new_group = group;
1113	INIT_LIST_HEAD(&new_group->memory_blocks);
1114
1115	ret = xa_alloc(&memory_groups, &mgid, new_group, xa_limit_31b,
1116		       GFP_KERNEL);
1117	if (ret) {
1118		kfree(new_group);
1119		return ret;
1120	} else if (group.is_dynamic) {
1121		xa_set_mark(&memory_groups, mgid, MEMORY_GROUP_MARK_DYNAMIC);
1122	}
1123	return mgid;
1124}
1125
1126/**
1127 * memory_group_register_static() - Register a static memory group.
1128 * @nid: The node id.
1129 * @max_pages: The maximum number of pages we'll have in this static memory
1130 *	       group.
1131 *
1132 * Register a new static memory group and return the memory group id.
1133 * All memory in the group belongs to a single unit, such as a DIMM. All
1134 * memory belonging to a static memory group is added in one go to be removed
1135 * in one go -- it's static.
1136 *
1137 * Returns an error if out of memory, if the node id is invalid, if no new
1138 * memory groups can be registered, or if max_pages is invalid (0). Otherwise,
1139 * returns the new memory group id.
1140 */
1141int memory_group_register_static(int nid, unsigned long max_pages)
1142{
1143	struct memory_group group = {
1144		.nid = nid,
1145		.s = {
1146			.max_pages = max_pages,
1147		},
1148	};
1149
1150	if (!max_pages)
1151		return -EINVAL;
1152	return memory_group_register(group);
1153}
1154EXPORT_SYMBOL_GPL(memory_group_register_static);
1155
1156/**
1157 * memory_group_register_dynamic() - Register a dynamic memory group.
1158 * @nid: The node id.
1159 * @unit_pages: Unit in pages in which is memory added/removed in this dynamic
1160 *		memory group.
1161 *
1162 * Register a new dynamic memory group and return the memory group id.
1163 * Memory within a dynamic memory group is added/removed dynamically
1164 * in unit_pages.
1165 *
1166 * Returns an error if out of memory, if the node id is invalid, if no new
1167 * memory groups can be registered, or if unit_pages is invalid (0, not a
1168 * power of two, smaller than a single memory block). Otherwise, returns the
1169 * new memory group id.
1170 */
1171int memory_group_register_dynamic(int nid, unsigned long unit_pages)
1172{
1173	struct memory_group group = {
1174		.nid = nid,
1175		.is_dynamic = true,
1176		.d = {
1177			.unit_pages = unit_pages,
1178		},
1179	};
1180
1181	if (!unit_pages || !is_power_of_2(unit_pages) ||
1182	    unit_pages < PHYS_PFN(memory_block_size_bytes()))
1183		return -EINVAL;
1184	return memory_group_register(group);
1185}
1186EXPORT_SYMBOL_GPL(memory_group_register_dynamic);
1187
1188/**
1189 * memory_group_unregister() - Unregister a memory group.
1190 * @mgid: the memory group id
1191 *
1192 * Unregister a memory group. If any memory block still belongs to this
1193 * memory group, unregistering will fail.
1194 *
1195 * Returns -EINVAL if the memory group id is invalid, returns -EBUSY if some
1196 * memory blocks still belong to this memory group and returns 0 if
1197 * unregistering succeeded.
1198 */
1199int memory_group_unregister(int mgid)
1200{
1201	struct memory_group *group;
1202
1203	if (mgid < 0)
1204		return -EINVAL;
1205
1206	group = xa_load(&memory_groups, mgid);
1207	if (!group)
1208		return -EINVAL;
1209	if (!list_empty(&group->memory_blocks))
1210		return -EBUSY;
1211	xa_erase(&memory_groups, mgid);
1212	kfree(group);
1213	return 0;
1214}
1215EXPORT_SYMBOL_GPL(memory_group_unregister);
1216
1217/*
1218 * This is an internal helper only to be used in core memory hotplug code to
1219 * lookup a memory group. We don't care about locking, as we don't expect a
1220 * memory group to get unregistered while adding memory to it -- because
1221 * the group and the memory is managed by the same driver.
1222 */
1223struct memory_group *memory_group_find_by_id(int mgid)
1224{
1225	return xa_load(&memory_groups, mgid);
1226}
1227
1228/*
1229 * This is an internal helper only to be used in core memory hotplug code to
1230 * walk all dynamic memory groups excluding a given memory group, either
1231 * belonging to a specific node, or belonging to any node.
1232 */
1233int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func,
1234			       struct memory_group *excluded, void *arg)
1235{
1236	struct memory_group *group;
1237	unsigned long index;
1238	int ret = 0;
1239
1240	xa_for_each_marked(&memory_groups, index, group,
1241			   MEMORY_GROUP_MARK_DYNAMIC) {
1242		if (group == excluded)
1243			continue;
1244#ifdef CONFIG_NUMA
1245		if (nid != NUMA_NO_NODE && group->nid != nid)
1246			continue;
1247#endif /* CONFIG_NUMA */
1248		ret = func(group, arg);
1249		if (ret)
1250			break;
1251	}
1252	return ret;
1253}
1254
1255#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
1256void memblk_nr_poison_inc(unsigned long pfn)
1257{
1258	const unsigned long block_id = pfn_to_block_id(pfn);
1259	struct memory_block *mem = find_memory_block_by_id(block_id);
1260
1261	if (mem)
1262		atomic_long_inc(&mem->nr_hwpoison);
1263}
1264
1265void memblk_nr_poison_sub(unsigned long pfn, long i)
1266{
1267	const unsigned long block_id = pfn_to_block_id(pfn);
1268	struct memory_block *mem = find_memory_block_by_id(block_id);
1269
1270	if (mem)
1271		atomic_long_sub(i, &mem->nr_hwpoison);
1272}
1273
1274static unsigned long memblk_nr_poison(struct memory_block *mem)
1275{
1276	return atomic_long_read(&mem->nr_hwpoison);
1277}
1278#endif