/* drivers/base/memory.c at v3.2 */
/*
 * drivers/base/memory.c - basic Memory class support
 *
 * Written by Matt Tolentino <matthew.e.tolentino@intel.com>
 *            Dave Hansen <haveblue@us.ibm.com>
 *
 * This file provides the necessary infrastructure to represent
 * a SPARSEMEM-memory-model system's physical memory in /sysfs.
 * All arch-independent code that assumes MEMORY_HOTPLUG requires
 * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
 */

#include <linux/sysdev.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/topology.h>
#include <linux/capability.h>
#include <linux/device.h>
#include <linux/memory.h>
#include <linux/kobject.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/stat.h>
#include <linux/slab.h>

#include <linux/atomic.h>
#include <asm/uaccess.h>

static DEFINE_MUTEX(mem_sysfs_mutex);

#define MEMORY_CLASS_NAME	"memory"

static int sections_per_block;

static inline int base_memory_block_id(int section_nr)
{
	return section_nr / sections_per_block;
}

static struct sysdev_class memory_sysdev_class = {
	.name = MEMORY_CLASS_NAME,
};

static const char *memory_uevent_name(struct kset *kset, struct kobject *kobj)
{
	return MEMORY_CLASS_NAME;
}

static int memory_uevent(struct kset *kset, struct kobject *obj,
			struct kobj_uevent_env *env)
{
	int retval = 0;

	return retval;
}

static const struct kset_uevent_ops memory_uevent_ops = {
	.name		= memory_uevent_name,
	.uevent		= memory_uevent,
};

static BLOCKING_NOTIFIER_HEAD(memory_chain);

int register_memory_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&memory_chain, nb);
}
EXPORT_SYMBOL(register_memory_notifier);

void unregister_memory_notifier(struct notifier_block *nb)
{
	blocking_notifier_chain_unregister(&memory_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_notifier);

static ATOMIC_NOTIFIER_HEAD(memory_isolate_chain);

int register_memory_isolate_notifier(struct notifier_block *nb)
{
	return atomic_notifier_chain_register(&memory_isolate_chain, nb);
}
EXPORT_SYMBOL(register_memory_isolate_notifier);

void unregister_memory_isolate_notifier(struct notifier_block *nb)
{
	atomic_notifier_chain_unregister(&memory_isolate_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_isolate_notifier);
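/*
 * Example (illustrative sketch, not part of this file): a client of the
 * memory notifier chain registers a callback that is invoked around
 * online/offline transitions.  The callback name and its body below are
 * hypothetical; only register_memory_notifier(), struct memory_notify
 * and the MEM_* event values come from this interface.
 *
 *	static int example_mem_callback(struct notifier_block *nb,
 *					unsigned long action, void *arg)
 *	{
 *		struct memory_notify *mn = arg;
 *
 *		switch (action) {
 *		case MEM_GOING_OFFLINE:
 *			pr_info("pfns %lx+%lx going offline\n",
 *				mn->start_pfn, mn->nr_pages);
 *			break;
 *		case MEM_ONLINE:
 *		case MEM_OFFLINE:
 *			break;
 *		}
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block example_mem_nb = {
 *		.notifier_call = example_mem_callback,
 *	};
 *
 *	register_memory_notifier(&example_mem_nb);
 */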
/*
 * register_memory - Setup a sysfs device for a memory block
 */
static
int register_memory(struct memory_block *memory)
{
	int error;

	memory->sysdev.cls = &memory_sysdev_class;
	memory->sysdev.id = memory->start_section_nr / sections_per_block;

	error = sysdev_register(&memory->sysdev);
	return error;
}

static void
unregister_memory(struct memory_block *memory)
{
	BUG_ON(memory->sysdev.cls != &memory_sysdev_class);

	/* drop the ref. we got in remove_memory_block() */
	kobject_put(&memory->sysdev.kobj);
	sysdev_unregister(&memory->sysdev);
}

unsigned long __weak memory_block_size_bytes(void)
{
	return MIN_MEMORY_BLOCK_SIZE;
}

static unsigned long get_memory_block_size(void)
{
	unsigned long block_sz;

	block_sz = memory_block_size_bytes();

	/* Validate blk_sz is a power of 2 and not less than section size */
	if ((block_sz & (block_sz - 1)) || (block_sz < MIN_MEMORY_BLOCK_SIZE)) {
		WARN_ON(1);
		block_sz = MIN_MEMORY_BLOCK_SIZE;
	}

	return block_sz;
}
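/*
 * Example (hedged sketch): an architecture can replace the __weak
 * memory_block_size_bytes() above with a strong definition, provided the
 * value is a power of two and at least MIN_MEMORY_BLOCK_SIZE, or
 * get_memory_block_size() falls back to the section size.  The 2 GB
 * figure below is made up purely for illustration.
 *
 *	unsigned long memory_block_size_bytes(void)
 *	{
 *		return 2UL << 30;
 *	}
 */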
/*
 * use this as the physical section index that this memsection
 * uses.
 */

static ssize_t show_mem_start_phys_index(struct sys_device *dev,
			struct sysdev_attribute *attr, char *buf)
{
	struct memory_block *mem =
		container_of(dev, struct memory_block, sysdev);
	unsigned long phys_index;

	phys_index = mem->start_section_nr / sections_per_block;
	return sprintf(buf, "%08lx\n", phys_index);
}

static ssize_t show_mem_end_phys_index(struct sys_device *dev,
			struct sysdev_attribute *attr, char *buf)
{
	struct memory_block *mem =
		container_of(dev, struct memory_block, sysdev);
	unsigned long phys_index;

	phys_index = mem->end_section_nr / sections_per_block;
	return sprintf(buf, "%08lx\n", phys_index);
}

/*
 * Show whether the section of memory is likely to be hot-removable
 */
static ssize_t show_mem_removable(struct sys_device *dev,
			struct sysdev_attribute *attr, char *buf)
{
	unsigned long i, pfn;
	int ret = 1;
	struct memory_block *mem =
		container_of(dev, struct memory_block, sysdev);

	for (i = 0; i < sections_per_block; i++) {
		pfn = section_nr_to_pfn(mem->start_section_nr + i);
		ret &= is_mem_section_removable(pfn, PAGES_PER_SECTION);
	}

	return sprintf(buf, "%d\n", ret);
}

/*
 * online, offline, going offline, etc.
 */
static ssize_t show_mem_state(struct sys_device *dev,
			struct sysdev_attribute *attr, char *buf)
{
	struct memory_block *mem =
		container_of(dev, struct memory_block, sysdev);
	ssize_t len = 0;

	/*
	 * We can probably put these states in a nice little array
	 * so that they're not open-coded
	 */
	switch (mem->state) {
	case MEM_ONLINE:
		len = sprintf(buf, "online\n");
		break;
	case MEM_OFFLINE:
		len = sprintf(buf, "offline\n");
		break;
	case MEM_GOING_OFFLINE:
		len = sprintf(buf, "going-offline\n");
		break;
	default:
		len = sprintf(buf, "ERROR-UNKNOWN-%ld\n",
				mem->state);
		WARN_ON(1);
		break;
	}

	return len;
}

int memory_notify(unsigned long val, void *v)
{
	return blocking_notifier_call_chain(&memory_chain, val, v);
}

int memory_isolate_notify(unsigned long val, void *v)
{
	return atomic_notifier_call_chain(&memory_isolate_chain, val, v);
}

/*
 * The probe routines leave the pages reserved, just as the bootmem code does.
 * Make sure they're still that way.
 */
static bool pages_correctly_reserved(unsigned long start_pfn,
					unsigned long nr_pages)
{
	int i, j;
	struct page *page;
	unsigned long pfn = start_pfn;

	/*
	 * memmap between sections is not contiguous except with
	 * SPARSEMEM_VMEMMAP. We lookup the page once per section
	 * and assume memmap is contiguous within each section
	 */
	for (i = 0; i < sections_per_block; i++, pfn += PAGES_PER_SECTION) {
		if (WARN_ON_ONCE(!pfn_valid(pfn)))
			return false;
		page = pfn_to_page(pfn);

		for (j = 0; j < PAGES_PER_SECTION; j++) {
			if (PageReserved(page + j))
				continue;

			printk(KERN_WARNING "section number %ld page number %d "
				"not reserved, was it already online?\n",
				pfn_to_section_nr(pfn), j);

			return false;
		}
	}

	return true;
}

/*
 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
 * OK to have direct references to sparsemem variables in here.
 */
static int
memory_block_action(unsigned long phys_index, unsigned long action)
{
	unsigned long start_pfn, start_paddr;
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	struct page *first_page;
	int ret;

	first_page = pfn_to_page(phys_index << PFN_SECTION_SHIFT);

	switch (action) {
	case MEM_ONLINE:
		start_pfn = page_to_pfn(first_page);

		if (!pages_correctly_reserved(start_pfn, nr_pages))
			return -EBUSY;

		ret = online_pages(start_pfn, nr_pages);
		break;
	case MEM_OFFLINE:
		start_paddr = page_to_pfn(first_page) << PAGE_SHIFT;
		ret = remove_memory(start_paddr,
				    nr_pages << PAGE_SHIFT);
		break;
	default:
		WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
		     "%ld\n", __func__, phys_index, action, action);
		ret = -EINVAL;
	}

	return ret;
}

static int memory_block_change_state(struct memory_block *mem,
		unsigned long to_state, unsigned long from_state_req)
{
	int ret = 0;

	mutex_lock(&mem->state_mutex);

	if (mem->state != from_state_req) {
		ret = -EINVAL;
		goto out;
	}

	if (to_state == MEM_OFFLINE)
		mem->state = MEM_GOING_OFFLINE;

	ret = memory_block_action(mem->start_section_nr, to_state);

	if (ret)
		mem->state = from_state_req;
	else
		mem->state = to_state;

out:
	mutex_unlock(&mem->state_mutex);
	return ret;
}

static ssize_t
store_mem_state(struct sys_device *dev,
		struct sysdev_attribute *attr, const char *buf, size_t count)
{
	struct memory_block *mem;
	int ret = -EINVAL;

	mem = container_of(dev, struct memory_block, sysdev);

	if (!strncmp(buf, "online", min((int)count, 6)))
		ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
	else if (!strncmp(buf, "offline", min((int)count, 7)))
		ret = memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);

	if (ret)
		return ret;
	return count;
}
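/*
 * Example (from the sysfs side, illustrative): the "state" file above is
 * what userspace reads and writes to online or offline a whole block;
 * the block number 16 is hypothetical.
 *
 *	# cat /sys/devices/system/memory/memory16/state
 *	online
 *	# echo offline > /sys/devices/system/memory/memory16/state
 */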
/*
 * phys_device is a bad name for this.  What I really want
 * is a way to differentiate between memory ranges that
 * are part of physical devices that constitute
 * a complete removable unit or fru.
 * i.e. do these ranges belong to the same physical device,
 * s.t. if I offline all of these sections I can then
 * remove the physical device?
 */
static ssize_t show_phys_device(struct sys_device *dev,
				struct sysdev_attribute *attr, char *buf)
{
	struct memory_block *mem =
		container_of(dev, struct memory_block, sysdev);
	return sprintf(buf, "%d\n", mem->phys_device);
}

static SYSDEV_ATTR(phys_index, 0444, show_mem_start_phys_index, NULL);
static SYSDEV_ATTR(end_phys_index, 0444, show_mem_end_phys_index, NULL);
static SYSDEV_ATTR(state, 0644, show_mem_state, store_mem_state);
static SYSDEV_ATTR(phys_device, 0444, show_phys_device, NULL);
static SYSDEV_ATTR(removable, 0444, show_mem_removable, NULL);

#define mem_create_simple_file(mem, attr_name)	\
	sysdev_create_file(&mem->sysdev, &attr_##attr_name)
#define mem_remove_simple_file(mem, attr_name)	\
	sysdev_remove_file(&mem->sysdev, &attr_##attr_name)

/*
 * Block size attribute stuff
 */
static ssize_t
print_block_size(struct sysdev_class *class, struct sysdev_class_attribute *attr,
		 char *buf)
{
	return sprintf(buf, "%lx\n", get_memory_block_size());
}

static SYSDEV_CLASS_ATTR(block_size_bytes, 0444, print_block_size, NULL);

static int block_size_init(void)
{
	return sysfs_create_file(&memory_sysdev_class.kset.kobj,
				 &attr_block_size_bytes.attr);
}

/*
 * Some architectures will have custom drivers to do this, and
 * will not need to do it from userspace.  The fake hot-add code
 * as well as ppc64 will do all of their discovery in userspace
 * and will require this interface.
 */
#ifdef CONFIG_ARCH_MEMORY_PROBE
static ssize_t
memory_probe_store(struct class *class, struct class_attribute *attr,
		   const char *buf, size_t count)
{
	u64 phys_addr;
	int nid;
	int i, ret;
	unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block;

	phys_addr = simple_strtoull(buf, NULL, 0);

	if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
		return -EINVAL;

	for (i = 0; i < sections_per_block; i++) {
		nid = memory_add_physaddr_to_nid(phys_addr);
		ret = add_memory(nid, phys_addr,
				 PAGES_PER_SECTION << PAGE_SHIFT);
		if (ret)
			goto out;

		phys_addr += MIN_MEMORY_BLOCK_SIZE;
	}

	ret = count;
out:
	return ret;
}
static CLASS_ATTR(probe, S_IWUSR, NULL, memory_probe_store);

static int memory_probe_init(void)
{
	return sysfs_create_file(&memory_sysdev_class.kset.kobj,
				 &class_attr_probe.attr);
}
#else
static inline int memory_probe_init(void)
{
	return 0;
}
#endif
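/*
 * Example (illustrative): with CONFIG_ARCH_MEMORY_PROBE, userspace adds a
 * block by writing its starting physical address to the probe file; the
 * alignment check above rejects addresses not aligned to the memory block
 * size.  The address below is made up.
 *
 *	# echo 0x40000000 > /sys/devices/system/memory/probe
 */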
#ifdef CONFIG_MEMORY_FAILURE
/*
 * Support for offlining pages of memory
 */

/* Soft offline a page */
static ssize_t
store_soft_offline_page(struct class *class,
			struct class_attribute *attr,
			const char *buf, size_t count)
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (strict_strtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	if (!pfn_valid(pfn))
		return -ENXIO;
	ret = soft_offline_page(pfn_to_page(pfn), 0);
	return ret == 0 ? count : ret;
}

/* Forcibly offline a page, including killing processes. */
static ssize_t
store_hard_offline_page(struct class *class,
			struct class_attribute *attr,
			const char *buf, size_t count)
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (strict_strtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = __memory_failure(pfn, 0, 0);
	return ret ? ret : count;
}

static CLASS_ATTR(soft_offline_page, 0644, NULL, store_soft_offline_page);
static CLASS_ATTR(hard_offline_page, 0644, NULL, store_hard_offline_page);

static __init int memory_fail_init(void)
{
	int err;

	err = sysfs_create_file(&memory_sysdev_class.kset.kobj,
				&class_attr_soft_offline_page.attr);
	if (!err)
		err = sysfs_create_file(&memory_sysdev_class.kset.kobj,
					&class_attr_hard_offline_page.attr);
	return err;
}
#else
static inline int memory_fail_init(void)
{
	return 0;
}
#endif
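/*
 * Example (illustrative): both files above take a physical address, not a
 * pfn -- the store routines shift the parsed value down by PAGE_SHIFT
 * before using it.  The address below is made up.
 *
 *	# echo 0x7f9e2000 > /sys/devices/system/memory/soft_offline_page
 *	# echo 0x7f9e2000 > /sys/devices/system/memory/hard_offline_page
 */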
/*
 * Note that phys_device is optional.  It is here to allow for
 * differentiation between which *physical* devices each
 * section belongs to...
 */
int __weak arch_get_memory_phys_device(unsigned long start_pfn)
{
	return 0;
}

struct memory_block *find_memory_block_hinted(struct mem_section *section,
					      struct memory_block *hint)
{
	struct kobject *kobj;
	struct sys_device *sysdev;
	struct memory_block *mem;
	char name[sizeof(MEMORY_CLASS_NAME) + 9 + 1];
	int block_id = base_memory_block_id(__section_nr(section));

	kobj = hint ? &hint->sysdev.kobj : NULL;

	/*
	 * This only works because we know that section == sysdev->id
	 * slightly redundant with sysdev_register()
	 */
	sprintf(&name[0], "%s%d", MEMORY_CLASS_NAME, block_id);

	kobj = kset_find_obj_hinted(&memory_sysdev_class.kset, name, kobj);
	if (!kobj)
		return NULL;

	sysdev = container_of(kobj, struct sys_device, kobj);
	mem = container_of(sysdev, struct memory_block, sysdev);

	return mem;
}

/*
 * For now, we have a linear search to go find the appropriate
 * memory_block corresponding to a particular phys_index. If
 * this gets to be a real problem, we can always use a radix
 * tree or something here.
 *
 * This could be made generic for all sysdev classes.
 */
struct memory_block *find_memory_block(struct mem_section *section)
{
	return find_memory_block_hinted(section, NULL);
}

static int init_memory_block(struct memory_block **memory,
			     struct mem_section *section, unsigned long state)
{
	struct memory_block *mem;
	unsigned long start_pfn;
	int scn_nr;
	int ret = 0;

	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
	if (!mem)
		return -ENOMEM;

	scn_nr = __section_nr(section);
	mem->start_section_nr =
			base_memory_block_id(scn_nr) * sections_per_block;
	mem->end_section_nr = mem->start_section_nr + sections_per_block - 1;
	mem->state = state;
	mem->section_count++;
	mutex_init(&mem->state_mutex);
	start_pfn = section_nr_to_pfn(mem->start_section_nr);
	mem->phys_device = arch_get_memory_phys_device(start_pfn);

	ret = register_memory(mem);
	if (!ret)
		ret = mem_create_simple_file(mem, phys_index);
	if (!ret)
		ret = mem_create_simple_file(mem, end_phys_index);
	if (!ret)
		ret = mem_create_simple_file(mem, state);
	if (!ret)
		ret = mem_create_simple_file(mem, phys_device);
	if (!ret)
		ret = mem_create_simple_file(mem, removable);

	*memory = mem;
	return ret;
}

static int add_memory_section(int nid, struct mem_section *section,
			unsigned long state, enum mem_add_context context)
{
	struct memory_block *mem;
	int ret = 0;

	mutex_lock(&mem_sysfs_mutex);

	mem = find_memory_block(section);
	if (mem) {
		mem->section_count++;
		kobject_put(&mem->sysdev.kobj);
	} else
		ret = init_memory_block(&mem, section, state);

	if (!ret) {
		if (context == HOTPLUG &&
		    mem->section_count == sections_per_block)
			ret = register_mem_sect_under_node(mem, nid);
	}

	mutex_unlock(&mem_sysfs_mutex);
	return ret;
}

int remove_memory_block(unsigned long node_id, struct mem_section *section,
		int phys_device)
{
	struct memory_block *mem;

	mutex_lock(&mem_sysfs_mutex);
	mem = find_memory_block(section);
	unregister_mem_sect_under_nodes(mem, __section_nr(section));

	mem->section_count--;
	if (mem->section_count == 0) {
		mem_remove_simple_file(mem, phys_index);
		mem_remove_simple_file(mem, end_phys_index);
		mem_remove_simple_file(mem, state);
		mem_remove_simple_file(mem, phys_device);
		mem_remove_simple_file(mem, removable);
		unregister_memory(mem);
		kfree(mem);
	} else
		kobject_put(&mem->sysdev.kobj);

	mutex_unlock(&mem_sysfs_mutex);
	return 0;
}

/*
 * need an interface for the VM to add new memory regions,
 * but without onlining it.
 */
int register_new_memory(int nid, struct mem_section *section)
{
	return add_memory_section(nid, section, MEM_OFFLINE, HOTPLUG);
}

int unregister_memory_section(struct mem_section *section)
{
	if (!present_section(section))
		return -EINVAL;

	return remove_memory_block(0, section, 0);
}
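/*
 * Worked example (numbers for illustration only): on a configuration with
 * 16 MB sections (MIN_MEMORY_BLOCK_SIZE) and a 128 MB block size,
 * memory_dev_init() below computes sections_per_block = 128 MB / 16 MB = 8,
 * so sections 0..7 form "memory0", sections 8..15 form "memory1", and
 * base_memory_block_id(20) is 20 / 8 = 2, i.e. the sysfs name "memory2"
 * built by find_memory_block_hinted() above.
 */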
/*
 * Initialize the sysfs support for memory devices...
 */
int __init memory_dev_init(void)
{
	unsigned int i;
	int ret;
	int err;
	unsigned long block_sz;

	memory_sysdev_class.kset.uevent_ops = &memory_uevent_ops;
	ret = sysdev_class_register(&memory_sysdev_class);
	if (ret)
		goto out;

	block_sz = get_memory_block_size();
	sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;

	/*
	 * Create entries for memory sections that were found
	 * during boot and have been initialized
	 */
	for (i = 0; i < NR_MEM_SECTIONS; i++) {
		if (!present_section_nr(i))
			continue;
		err = add_memory_section(0, __nr_to_section(i), MEM_ONLINE,
					 BOOT);
		if (!ret)
			ret = err;
	}

	err = memory_probe_init();
	if (!ret)
		ret = err;
	err = memory_fail_init();
	if (!ret)
		ret = err;
	err = block_size_init();
	if (!ret)
		ret = err;
out:
	if (ret)
		printk(KERN_ERR "%s() failed: %d\n", __func__, ret);
	return ret;
}