Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

memory hotplug: Allow memory blocks to span multiple memory sections

Update the memory sysfs code so that each sysfs memory directory now
represents a memory block, and a memory block can span multiple memory
sections. The default size of a memory block is the size of one memory
section (1 << SECTION_SIZE_BITS bytes), which maintains the current behavior
of having a single memory section per memory block (i.e. one sysfs
directory per memory section).

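Architectures that want memory blocks to span multiple memory sections
need only define their own memory_block_size_bytes() routine. The size it
returns must be a power of 2 and no smaller than the section size;
otherwise a warning is issued and the default block size is used.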

Update the memory hotplug documentation to describe the new behavior of
memory blocks as exposed in sysfs.

Signed-off-by: Nathan Fontenot <nfont@austin.ibm.com>
Reviewed-by: Robin Holt <holt@sgi.com>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>

Authored by Nathan Fontenot and committed by Greg Kroah-Hartman
0c2c99b1 e8d9792a

+139 -63
+31 -16
Documentation/memory-hotplug.txt
··· 126 126 -------------------------------- 127 127 4 sysfs files for memory hotplug 128 128 -------------------------------- 129 - All sections have their device information under /sys/devices/system/memory as 129 + All sections have their device information in sysfs. Each section is part of 130 + a memory block under /sys/devices/system/memory as 130 131 131 132 /sys/devices/system/memory/memoryXXX 132 - (XXX is section id.) 133 + (XXX is the section id.) 133 134 134 - Now, XXX is defined as start_address_of_section / section_size. 135 + Now, XXX is defined as (start_address_of_section / section_size) of the first 136 + section contained in the memory block. The files 'phys_index' and 137 + 'end_phys_index' under each directory report the beginning and end section id's 138 + for the memory block covered by the sysfs directory. It is expected that all 139 + memory sections in this range are present and no memory holes exist in the 140 + range. Currently there is no way to determine if there is a memory hole, but 141 + the existence of one should not affect the hotplug capabilities of the memory 142 + block. 135 143 136 144 For example, assume 1GiB section size. A device for a memory starting at 137 145 0x100000000 is /sys/device/system/memory/memory4 138 146 (0x100000000 / 1Gib = 4) 139 147 This device covers address range [0x100000000 ... 0x140000000) 140 148 141 - Under each section, you can see 4 files. 149 + Under each section, you can see 4 or 5 files, the end_phys_index file being 150 + a recent addition and not present on older kernels. 142 151 143 - /sys/devices/system/memory/memoryXXX/phys_index 152 + /sys/devices/system/memory/memoryXXX/start_phys_index 153 + /sys/devices/system/memory/memoryXXX/end_phys_index 144 154 /sys/devices/system/memory/memoryXXX/phys_device 145 155 /sys/devices/system/memory/memoryXXX/state 146 156 /sys/devices/system/memory/memoryXXX/removable 147 157 148 - 'phys_index' : read-only and contains section id, same as XXX. 
149 - 'state' : read-write 150 - at read: contains online/offline state of memory. 151 - at write: user can specify "online", "offline" command 152 - 'phys_device': read-only: designed to show the name of physical memory device. 153 - This is not well implemented now. 154 - 'removable' : read-only: contains an integer value indicating 155 - whether the memory section is removable or not 156 - removable. A value of 1 indicates that the memory 157 - section is removable and a value of 0 indicates that 158 - it is not removable. 158 + 'phys_index' : read-only and contains section id of the first section 159 + in the memory block, same as XXX. 160 + 'end_phys_index' : read-only and contains section id of the last section 161 + in the memory block. 162 + 'state' : read-write 163 + at read: contains online/offline state of memory. 164 + at write: user can specify "online", "offline" command 165 + which will be performed on all sections in the block. 166 + 'phys_device' : read-only: designed to show the name of physical memory 167 + device. This is not well implemented now. 168 + 'removable' : read-only: contains an integer value indicating 169 + whether the memory block is removable or not 170 + removable. A value of 1 indicates that the memory 171 + block is removable and a value of 0 indicates that 172 + it is not removable. A memory block is removable only if 173 + every section in the block is removable. 159 174 160 175 NOTE: 161 176 These directories/files appear after physical memory hotplug phase.
+108 -47
drivers/base/memory.c
··· 30 30 static DEFINE_MUTEX(mem_sysfs_mutex); 31 31 32 32 #define MEMORY_CLASS_NAME "memory" 33 + #define MIN_MEMORY_BLOCK_SIZE (1 << SECTION_SIZE_BITS) 34 + 35 + static int sections_per_block; 36 + 37 + static inline int base_memory_block_id(int section_nr) 38 + { 39 + return section_nr / sections_per_block; 40 + } 33 41 34 42 static struct sysdev_class memory_sysdev_class = { 35 43 .name = MEMORY_CLASS_NAME, ··· 92 84 * register_memory - Setup a sysfs device for a memory block 93 85 */ 94 86 static 95 - int register_memory(struct memory_block *memory, struct mem_section *section) 87 + int register_memory(struct memory_block *memory) 96 88 { 97 89 int error; 98 90 99 91 memory->sysdev.cls = &memory_sysdev_class; 100 - memory->sysdev.id = __section_nr(section); 92 + memory->sysdev.id = memory->phys_index / sections_per_block; 101 93 102 94 error = sysdev_register(&memory->sysdev); 103 95 return error; 104 96 } 105 97 106 98 static void 107 - unregister_memory(struct memory_block *memory, struct mem_section *section) 99 + unregister_memory(struct memory_block *memory) 108 100 { 109 101 BUG_ON(memory->sysdev.cls != &memory_sysdev_class); 110 - BUG_ON(memory->sysdev.id != __section_nr(section)); 111 102 112 103 /* drop the ref. 
we got in remove_memory_block() */ 113 104 kobject_put(&memory->sysdev.kobj); 114 105 sysdev_unregister(&memory->sysdev); 106 + } 107 + 108 + unsigned long __weak memory_block_size_bytes(void) 109 + { 110 + return MIN_MEMORY_BLOCK_SIZE; 111 + } 112 + 113 + static unsigned long get_memory_block_size(void) 114 + { 115 + unsigned long block_sz; 116 + 117 + block_sz = memory_block_size_bytes(); 118 + 119 + /* Validate blk_sz is a power of 2 and not less than section size */ 120 + if ((block_sz & (block_sz - 1)) || (block_sz < MIN_MEMORY_BLOCK_SIZE)) { 121 + WARN_ON(1); 122 + block_sz = MIN_MEMORY_BLOCK_SIZE; 123 + } 124 + 125 + return block_sz; 115 126 } 116 127 117 128 /* ··· 143 116 { 144 117 struct memory_block *mem = 145 118 container_of(dev, struct memory_block, sysdev); 146 - return sprintf(buf, "%08lx\n", mem->phys_index); 119 + return sprintf(buf, "%08lx\n", mem->phys_index / sections_per_block); 147 120 } 148 121 149 122 /* ··· 152 125 static ssize_t show_mem_removable(struct sys_device *dev, 153 126 struct sysdev_attribute *attr, char *buf) 154 127 { 155 - unsigned long start_pfn; 156 - int ret; 128 + unsigned long i, pfn; 129 + int ret = 1; 157 130 struct memory_block *mem = 158 131 container_of(dev, struct memory_block, sysdev); 159 132 160 - start_pfn = section_nr_to_pfn(mem->phys_index); 161 - ret = is_mem_section_removable(start_pfn, PAGES_PER_SECTION); 133 + for (i = 0; i < sections_per_block; i++) { 134 + pfn = section_nr_to_pfn(mem->phys_index + i); 135 + ret &= is_mem_section_removable(pfn, PAGES_PER_SECTION); 136 + } 137 + 162 138 return sprintf(buf, "%d\n", ret); 163 139 } 164 140 ··· 214 184 * OK to have direct references to sparsemem variables in here. 
215 185 */ 216 186 static int 217 - memory_block_action(struct memory_block *mem, unsigned long action) 187 + memory_section_action(unsigned long phys_index, unsigned long action) 218 188 { 219 189 int i; 220 - unsigned long psection; 221 190 unsigned long start_pfn, start_paddr; 222 191 struct page *first_page; 223 192 int ret; 224 - int old_state = mem->state; 225 193 226 - psection = mem->phys_index; 227 - first_page = pfn_to_page(psection << PFN_SECTION_SHIFT); 194 + first_page = pfn_to_page(phys_index << PFN_SECTION_SHIFT); 228 195 229 196 /* 230 197 * The probe routines leave the pages reserved, just ··· 234 207 continue; 235 208 236 209 printk(KERN_WARNING "section number %ld page number %d " 237 - "not reserved, was it already online? \n", 238 - psection, i); 210 + "not reserved, was it already online?\n", 211 + phys_index, i); 239 212 return -EBUSY; 240 213 } 241 214 } ··· 246 219 ret = online_pages(start_pfn, PAGES_PER_SECTION); 247 220 break; 248 221 case MEM_OFFLINE: 249 - mem->state = MEM_GOING_OFFLINE; 250 222 start_paddr = page_to_pfn(first_page) << PAGE_SHIFT; 251 223 ret = remove_memory(start_paddr, 252 224 PAGES_PER_SECTION << PAGE_SHIFT); 253 - if (ret) { 254 - mem->state = old_state; 255 - break; 256 - } 257 225 break; 258 226 default: 259 - WARN(1, KERN_WARNING "%s(%p, %ld) unknown action: %ld\n", 260 - __func__, mem, action, action); 227 + WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: " 228 + "%ld\n", __func__, phys_index, action, action); 261 229 ret = -EINVAL; 262 230 } 263 231 ··· 262 240 static int memory_block_change_state(struct memory_block *mem, 263 241 unsigned long to_state, unsigned long from_state_req) 264 242 { 265 - int ret = 0; 243 + int i, ret = 0; 244 + 266 245 mutex_lock(&mem->state_mutex); 267 246 268 247 if (mem->state != from_state_req) { ··· 271 248 goto out; 272 249 } 273 250 274 - ret = memory_block_action(mem, to_state); 275 - if (!ret) 251 + if (to_state == MEM_OFFLINE) 252 + mem->state = MEM_GOING_OFFLINE; 253 + 
254 + for (i = 0; i < sections_per_block; i++) { 255 + ret = memory_section_action(mem->phys_index + i, to_state); 256 + if (ret) 257 + break; 258 + } 259 + 260 + if (ret) { 261 + for (i = 0; i < sections_per_block; i++) 262 + memory_section_action(mem->phys_index + i, 263 + from_state_req); 264 + 265 + mem->state = from_state_req; 266 + } else 276 267 mem->state = to_state; 277 268 278 269 out: ··· 299 262 struct sysdev_attribute *attr, const char *buf, size_t count) 300 263 { 301 264 struct memory_block *mem; 302 - unsigned int phys_section_nr; 303 265 int ret = -EINVAL; 304 266 305 267 mem = container_of(dev, struct memory_block, sysdev); 306 - phys_section_nr = mem->phys_index; 307 - 308 - if (!present_section_nr(phys_section_nr)) 309 - goto out; 310 268 311 269 if (!strncmp(buf, "online", min((int)count, 6))) 312 270 ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); 313 271 else if(!strncmp(buf, "offline", min((int)count, 7))) 314 272 ret = memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE); 315 - out: 273 + 316 274 if (ret) 317 275 return ret; 318 276 return count; ··· 347 315 print_block_size(struct sysdev_class *class, struct sysdev_class_attribute *attr, 348 316 char *buf) 349 317 { 350 - return sprintf(buf, "%lx\n", (unsigned long)PAGES_PER_SECTION * PAGE_SIZE); 318 + return sprintf(buf, "%lx\n", get_memory_block_size()); 351 319 } 352 320 353 321 static SYSDEV_CLASS_ATTR(block_size_bytes, 0444, print_block_size, NULL); ··· 476 444 struct sys_device *sysdev; 477 445 struct memory_block *mem; 478 446 char name[sizeof(MEMORY_CLASS_NAME) + 9 + 1]; 447 + int block_id = base_memory_block_id(__section_nr(section)); 479 448 480 449 kobj = hint ? 
&hint->sysdev.kobj : NULL; 481 450 ··· 484 451 * This only works because we know that section == sysdev->id 485 452 * slightly redundant with sysdev_register() 486 453 */ 487 - sprintf(&name[0], "%s%d", MEMORY_CLASS_NAME, __section_nr(section)); 454 + sprintf(&name[0], "%s%d", MEMORY_CLASS_NAME, block_id); 488 455 489 456 kobj = kset_find_obj_hinted(&memory_sysdev_class.kset, name, kobj); 490 457 if (!kobj) ··· 509 476 return find_memory_block_hinted(section, NULL); 510 477 } 511 478 512 - static int add_memory_block(int nid, struct mem_section *section, 513 - unsigned long state, enum mem_add_context context) 479 + static int init_memory_block(struct memory_block **memory, 480 + struct mem_section *section, unsigned long state) 514 481 { 515 - struct memory_block *mem = kzalloc(sizeof(*mem), GFP_KERNEL); 482 + struct memory_block *mem; 516 483 unsigned long start_pfn; 484 + int scn_nr; 517 485 int ret = 0; 518 486 487 + mem = kzalloc(sizeof(*mem), GFP_KERNEL); 519 488 if (!mem) 520 489 return -ENOMEM; 521 490 522 - mutex_lock(&mem_sysfs_mutex); 523 - 524 - mem->phys_index = __section_nr(section); 491 + scn_nr = __section_nr(section); 492 + mem->phys_index = base_memory_block_id(scn_nr) * sections_per_block; 525 493 mem->state = state; 526 494 mem->section_count++; 527 495 mutex_init(&mem->state_mutex); 528 496 start_pfn = section_nr_to_pfn(mem->phys_index); 529 497 mem->phys_device = arch_get_memory_phys_device(start_pfn); 530 498 531 - ret = register_memory(mem, section); 499 + ret = register_memory(mem); 532 500 if (!ret) 533 501 ret = mem_create_simple_file(mem, phys_index); 534 502 if (!ret) ··· 538 504 ret = mem_create_simple_file(mem, phys_device); 539 505 if (!ret) 540 506 ret = mem_create_simple_file(mem, removable); 507 + 508 + *memory = mem; 509 + return ret; 510 + } 511 + 512 + static int add_memory_section(int nid, struct mem_section *section, 513 + unsigned long state, enum mem_add_context context) 514 + { 515 + struct memory_block *mem; 516 + int ret 
= 0; 517 + 518 + mutex_lock(&mem_sysfs_mutex); 519 + 520 + mem = find_memory_block(section); 521 + if (mem) { 522 + mem->section_count++; 523 + kobject_put(&mem->sysdev.kobj); 524 + } else 525 + ret = init_memory_block(&mem, section, state); 526 + 541 527 if (!ret) { 542 - if (context == HOTPLUG) 528 + if (context == HOTPLUG && 529 + mem->section_count == sections_per_block) 543 530 ret = register_mem_sect_under_node(mem, nid); 544 531 } 545 532 ··· 583 528 mem_remove_simple_file(mem, state); 584 529 mem_remove_simple_file(mem, phys_device); 585 530 mem_remove_simple_file(mem, removable); 586 - unregister_memory(mem, section); 587 - } 531 + unregister_memory(mem); 532 + kfree(mem); 533 + } else 534 + kobject_put(&mem->sysdev.kobj); 588 535 589 536 mutex_unlock(&mem_sysfs_mutex); 590 537 return 0; ··· 598 541 */ 599 542 int register_new_memory(int nid, struct mem_section *section) 600 543 { 601 - return add_memory_block(nid, section, MEM_OFFLINE, HOTPLUG); 544 + return add_memory_section(nid, section, MEM_OFFLINE, HOTPLUG); 602 545 } 603 546 604 547 int unregister_memory_section(struct mem_section *section) ··· 617 560 unsigned int i; 618 561 int ret; 619 562 int err; 563 + unsigned long block_sz; 620 564 621 565 memory_sysdev_class.kset.uevent_ops = &memory_uevent_ops; 622 566 ret = sysdev_class_register(&memory_sysdev_class); 623 567 if (ret) 624 568 goto out; 569 + 570 + block_sz = get_memory_block_size(); 571 + sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE; 625 572 626 573 /* 627 574 * Create entries for memory sections that were found ··· 634 573 for (i = 0; i < NR_MEM_SECTIONS; i++) { 635 574 if (!present_section_nr(i)) 636 575 continue; 637 - err = add_memory_block(0, __nr_to_section(i), MEM_ONLINE, 638 - BOOT); 576 + err = add_memory_section(0, __nr_to_section(i), MEM_ONLINE, 577 + BOOT); 639 578 if (!ret) 640 579 ret = err; 641 580 }