Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

[PATCH] memory hotplug: sysfs and add/remove functions

This adds generic memory add/remove and supporting functions for memory
hotplug into a new file as well as a memory hotplug kernel config option.

Individual architecture patches will follow.

For now, disable memory hotplug when swsusp is enabled. There's a lot of
churn there right now. We'll fix it up properly once it calms down.

Signed-off-by: Matt Tolentino <matthew.e.tolentino@intel.com>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Authored by Dave Hansen and committed by Linus Torvalds.
3947be19 bdc8cb98

+777 -3
+1
drivers/base/Makefile
··· 7 7 obj-y += power/ 8 8 obj-$(CONFIG_FW_LOADER) += firmware_class.o 9 9 obj-$(CONFIG_NUMA) += node.o 10 + obj-$(CONFIG_MEMORY_HOTPLUG) += memory.o 10 11 11 12 ifeq ($(CONFIG_DEBUG_DRIVER),y) 12 13 EXTRA_CFLAGS += -DDEBUG
+2
drivers/base/init.c
··· 9 9 10 10 #include <linux/device.h> 11 11 #include <linux/init.h> 12 + #include <linux/memory.h> 12 13 13 14 #include "base.h" 14 15 ··· 34 33 platform_bus_init(); 35 34 system_bus_init(); 36 35 cpu_dev_init(); 36 + memory_dev_init(); 37 37 attribute_container_init(); 38 38 }
+455
drivers/base/memory.c
··· 1 + /* 2 + * drivers/base/memory.c - basic Memory class support 3 + * 4 + * Written by Matt Tolentino <matthew.e.tolentino@intel.com> 5 + * Dave Hansen <haveblue@us.ibm.com> 6 + * 7 + * This file provides the necessary infrastructure to represent 8 + * a SPARSEMEM-memory-model system's physical memory in /sysfs. 9 + * All arch-independent code that assumes MEMORY_HOTPLUG requires 10 + * SPARSEMEM should be contained here, or in mm/memory_hotplug.c. 11 + */ 12 + 13 + #include <linux/sysdev.h> 14 + #include <linux/module.h> 15 + #include <linux/init.h> 16 + #include <linux/sched.h> /* capable() */ 17 + #include <linux/topology.h> 18 + #include <linux/device.h> 19 + #include <linux/memory.h> 20 + #include <linux/kobject.h> 21 + #include <linux/memory_hotplug.h> 22 + #include <linux/mm.h> 23 + #include <asm/atomic.h> 24 + #include <asm/uaccess.h> 25 + 26 + #define MEMORY_CLASS_NAME "memory" 27 + 28 + static struct sysdev_class memory_sysdev_class = { 29 + set_kset_name(MEMORY_CLASS_NAME), 30 + }; 31 + EXPORT_SYMBOL(memory_sysdev_class); 32 + 33 + static char *memory_hotplug_name(struct kset *kset, struct kobject *kobj) 34 + { 35 + return MEMORY_CLASS_NAME; 36 + } 37 + 38 + static int memory_hotplug(struct kset *kset, struct kobject *kobj, char **envp, 39 + int num_envp, char *buffer, int buffer_size) 40 + { 41 + int retval = 0; 42 + 43 + return retval; 44 + } 45 + 46 + static struct kset_hotplug_ops memory_hotplug_ops = { 47 + .name = memory_hotplug_name, 48 + .hotplug = memory_hotplug, 49 + }; 50 + 51 + static struct notifier_block *memory_chain; 52 + 53 + static int register_memory_notifier(struct notifier_block *nb) 54 + { 55 + return notifier_chain_register(&memory_chain, nb); 56 + } 57 + 58 + static void unregister_memory_notifier(struct notifier_block *nb) 59 + { 60 + notifier_chain_unregister(&memory_chain, nb); 61 + } 62 + 63 + /* 64 + * register_memory - Setup a sysfs device for a memory block 65 + */ 66 + static int 67 + register_memory(struct 
memory_block *memory, struct mem_section *section, 68 + struct node *root) 69 + { 70 + int error; 71 + 72 + memory->sysdev.cls = &memory_sysdev_class; 73 + memory->sysdev.id = __section_nr(section); 74 + 75 + error = sysdev_register(&memory->sysdev); 76 + 77 + if (root && !error) 78 + error = sysfs_create_link(&root->sysdev.kobj, 79 + &memory->sysdev.kobj, 80 + kobject_name(&memory->sysdev.kobj)); 81 + 82 + return error; 83 + } 84 + 85 + static void 86 + unregister_memory(struct memory_block *memory, struct mem_section *section, 87 + struct node *root) 88 + { 89 + BUG_ON(memory->sysdev.cls != &memory_sysdev_class); 90 + BUG_ON(memory->sysdev.id != __section_nr(section)); 91 + 92 + sysdev_unregister(&memory->sysdev); 93 + if (root) 94 + sysfs_remove_link(&root->sysdev.kobj, 95 + kobject_name(&memory->sysdev.kobj)); 96 + } 97 + 98 + /* 99 + * use this as the physical section index that this memsection 100 + * uses. 101 + */ 102 + 103 + static ssize_t show_mem_phys_index(struct sys_device *dev, char *buf) 104 + { 105 + struct memory_block *mem = 106 + container_of(dev, struct memory_block, sysdev); 107 + return sprintf(buf, "%08lx\n", mem->phys_index); 108 + } 109 + 110 + /* 111 + * online, offline, going offline, etc. 
112 + */ 113 + static ssize_t show_mem_state(struct sys_device *dev, char *buf) 114 + { 115 + struct memory_block *mem = 116 + container_of(dev, struct memory_block, sysdev); 117 + ssize_t len = 0; 118 + 119 + /* 120 + * We can probably put these states in a nice little array 121 + * so that they're not open-coded 122 + */ 123 + switch (mem->state) { 124 + case MEM_ONLINE: 125 + len = sprintf(buf, "online\n"); 126 + break; 127 + case MEM_OFFLINE: 128 + len = sprintf(buf, "offline\n"); 129 + break; 130 + case MEM_GOING_OFFLINE: 131 + len = sprintf(buf, "going-offline\n"); 132 + break; 133 + default: 134 + len = sprintf(buf, "ERROR-UNKNOWN-%ld\n", 135 + mem->state); 136 + WARN_ON(1); 137 + break; 138 + } 139 + 140 + return len; 141 + } 142 + 143 + static inline int memory_notify(unsigned long val, void *v) 144 + { 145 + return notifier_call_chain(&memory_chain, val, v); 146 + } 147 + 148 + /* 149 + * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is 150 + * OK to have direct references to sparsemem variables in here. 151 + */ 152 + static int 153 + memory_block_action(struct memory_block *mem, unsigned long action) 154 + { 155 + int i; 156 + unsigned long psection; 157 + unsigned long start_pfn, start_paddr; 158 + struct page *first_page; 159 + int ret; 160 + int old_state = mem->state; 161 + 162 + psection = mem->phys_index; 163 + first_page = pfn_to_page(psection << PFN_SECTION_SHIFT); 164 + 165 + /* 166 + * The probe routines leave the pages reserved, just 167 + * as the bootmem code does. Make sure they're still 168 + * that way. 169 + */ 170 + if (action == MEM_ONLINE) { 171 + for (i = 0; i < PAGES_PER_SECTION; i++) { 172 + if (PageReserved(first_page+i)) 173 + continue; 174 + 175 + printk(KERN_WARNING "section number %ld page number %d " 176 + "not reserved, was it already online? 
\n", 177 + psection, i); 178 + return -EBUSY; 179 + } 180 + } 181 + 182 + switch (action) { 183 + case MEM_ONLINE: 184 + start_pfn = page_to_pfn(first_page); 185 + ret = online_pages(start_pfn, PAGES_PER_SECTION); 186 + break; 187 + case MEM_OFFLINE: 188 + mem->state = MEM_GOING_OFFLINE; 189 + memory_notify(MEM_GOING_OFFLINE, NULL); 190 + start_paddr = page_to_pfn(first_page) << PAGE_SHIFT; 191 + ret = remove_memory(start_paddr, 192 + PAGES_PER_SECTION << PAGE_SHIFT); 193 + if (ret) { 194 + mem->state = old_state; 195 + break; 196 + } 197 + memory_notify(MEM_MAPPING_INVALID, NULL); 198 + break; 199 + default: 200 + printk(KERN_WARNING "%s(%p, %ld) unknown action: %ld\n", 201 + __FUNCTION__, mem, action, action); 202 + WARN_ON(1); 203 + ret = -EINVAL; 204 + } 205 + /* 206 + * For now, only notify on successful memory operations 207 + */ 208 + if (!ret) 209 + memory_notify(action, NULL); 210 + 211 + return ret; 212 + } 213 + 214 + static int memory_block_change_state(struct memory_block *mem, 215 + unsigned long to_state, unsigned long from_state_req) 216 + { 217 + int ret = 0; 218 + down(&mem->state_sem); 219 + 220 + if (mem->state != from_state_req) { 221 + ret = -EINVAL; 222 + goto out; 223 + } 224 + 225 + ret = memory_block_action(mem, to_state); 226 + if (!ret) 227 + mem->state = to_state; 228 + 229 + out: 230 + up(&mem->state_sem); 231 + return ret; 232 + } 233 + 234 + static ssize_t 235 + store_mem_state(struct sys_device *dev, const char *buf, size_t count) 236 + { 237 + struct memory_block *mem; 238 + unsigned int phys_section_nr; 239 + int ret = -EINVAL; 240 + 241 + mem = container_of(dev, struct memory_block, sysdev); 242 + phys_section_nr = mem->phys_index; 243 + 244 + if (!valid_section_nr(phys_section_nr)) 245 + goto out; 246 + 247 + if (!strncmp(buf, "online", min((int)count, 6))) 248 + ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); 249 + else if(!strncmp(buf, "offline", min((int)count, 7))) 250 + ret = memory_block_change_state(mem, 
MEM_OFFLINE, MEM_ONLINE); 251 + out: 252 + if (ret) 253 + return ret; 254 + return count; 255 + } 256 + 257 + /* 258 + * phys_device is a bad name for this. What I really want 259 + * is a way to differentiate between memory ranges that 260 + * are part of physical devices that constitute 261 + * a complete removable unit or fru. 262 + * i.e. do these ranges belong to the same physical device, 263 + * s.t. if I offline all of these sections I can then 264 + * remove the physical device? 265 + */ 266 + static ssize_t show_phys_device(struct sys_device *dev, char *buf) 267 + { 268 + struct memory_block *mem = 269 + container_of(dev, struct memory_block, sysdev); 270 + return sprintf(buf, "%d\n", mem->phys_device); 271 + } 272 + 273 + static SYSDEV_ATTR(phys_index, 0444, show_mem_phys_index, NULL); 274 + static SYSDEV_ATTR(state, 0644, show_mem_state, store_mem_state); 275 + static SYSDEV_ATTR(phys_device, 0444, show_phys_device, NULL); 276 + 277 + #define mem_create_simple_file(mem, attr_name) \ 278 + sysdev_create_file(&mem->sysdev, &attr_##attr_name) 279 + #define mem_remove_simple_file(mem, attr_name) \ 280 + sysdev_remove_file(&mem->sysdev, &attr_##attr_name) 281 + 282 + /* 283 + * Block size attribute stuff 284 + */ 285 + static ssize_t 286 + print_block_size(struct class *class, char *buf) 287 + { 288 + return sprintf(buf, "%lx\n", (unsigned long)PAGES_PER_SECTION * PAGE_SIZE); 289 + } 290 + 291 + static CLASS_ATTR(block_size_bytes, 0444, print_block_size, NULL); 292 + 293 + static int block_size_init(void) 294 + { 295 + sysfs_create_file(&memory_sysdev_class.kset.kobj, 296 + &class_attr_block_size_bytes.attr); 297 + return 0; 298 + } 299 + 300 + /* 301 + * Some architectures will have custom drivers to do this, and 302 + * will not need to do it from userspace. The fake hot-add code 303 + * as well as ppc64 will do all of their discovery in userspace 304 + * and will require this interface. 
305 + */ 306 + #ifdef CONFIG_ARCH_MEMORY_PROBE 307 + static ssize_t 308 + memory_probe_store(struct class *class, const char __user *buf, size_t count) 309 + { 310 + u64 phys_addr; 311 + int ret; 312 + 313 + phys_addr = simple_strtoull(buf, NULL, 0); 314 + 315 + ret = add_memory(phys_addr, PAGES_PER_SECTION << PAGE_SHIFT); 316 + 317 + if (ret) 318 + count = ret; 319 + 320 + return count; 321 + } 322 + static CLASS_ATTR(probe, 0700, NULL, memory_probe_store); 323 + 324 + static int memory_probe_init(void) 325 + { 326 + sysfs_create_file(&memory_sysdev_class.kset.kobj, 327 + &class_attr_probe.attr); 328 + return 0; 329 + } 330 + #else 331 + #define memory_probe_init(...) do {} while (0) 332 + #endif 333 + 334 + /* 335 + * Note that phys_device is optional. It is here to allow for 336 + * differentiation between which *physical* devices each 337 + * section belongs to... 338 + */ 339 + 340 + static int add_memory_block(unsigned long node_id, struct mem_section *section, 341 + unsigned long state, int phys_device) 342 + { 343 + size_t size = sizeof(struct memory_block); 344 + struct memory_block *mem = kmalloc(size, GFP_KERNEL); 345 + int ret = 0; 346 + 347 + if (!mem) 348 + return -ENOMEM; 349 + 350 + memset(mem, 0, size); 351 + 352 + mem->phys_index = __section_nr(section); 353 + mem->state = state; 354 + init_MUTEX(&mem->state_sem); 355 + mem->phys_device = phys_device; 356 + 357 + ret = register_memory(mem, section, NULL); 358 + if (!ret) 359 + ret = mem_create_simple_file(mem, phys_index); 360 + if (!ret) 361 + ret = mem_create_simple_file(mem, state); 362 + if (!ret) 363 + ret = mem_create_simple_file(mem, phys_device); 364 + 365 + return ret; 366 + } 367 + 368 + /* 369 + * For now, we have a linear search to go find the appropriate 370 + * memory_block corresponding to a particular phys_index. If 371 + * this gets to be a real problem, we can always use a radix 372 + * tree or something here. 373 + * 374 + * This could be made generic for all sysdev classes. 
375 + */ 376 + static struct memory_block *find_memory_block(struct mem_section *section) 377 + { 378 + struct kobject *kobj; 379 + struct sys_device *sysdev; 380 + struct memory_block *mem; 381 + char name[sizeof(MEMORY_CLASS_NAME) + 9 + 1]; 382 + 383 + /* 384 + * This only works because we know that section == sysdev->id 385 + * slightly redundant with sysdev_register() 386 + */ 387 + sprintf(&name[0], "%s%d", MEMORY_CLASS_NAME, __section_nr(section)); 388 + 389 + kobj = kset_find_obj(&memory_sysdev_class.kset, name); 390 + if (!kobj) 391 + return NULL; 392 + 393 + sysdev = container_of(kobj, struct sys_device, kobj); 394 + mem = container_of(sysdev, struct memory_block, sysdev); 395 + 396 + return mem; 397 + } 398 + 399 + int remove_memory_block(unsigned long node_id, struct mem_section *section, 400 + int phys_device) 401 + { 402 + struct memory_block *mem; 403 + 404 + mem = find_memory_block(section); 405 + mem_remove_simple_file(mem, phys_index); 406 + mem_remove_simple_file(mem, state); 407 + mem_remove_simple_file(mem, phys_device); 408 + unregister_memory(mem, section, NULL); 409 + 410 + return 0; 411 + } 412 + 413 + /* 414 + * need an interface for the VM to add new memory regions, 415 + * but without onlining it. 416 + */ 417 + int register_new_memory(struct mem_section *section) 418 + { 419 + return add_memory_block(0, section, MEM_OFFLINE, 0); 420 + } 421 + 422 + int unregister_memory_section(struct mem_section *section) 423 + { 424 + if (!valid_section(section)) 425 + return -EINVAL; 426 + 427 + return remove_memory_block(0, section, 0); 428 + } 429 + 430 + /* 431 + * Initialize the sysfs support for memory devices... 
432 + */ 433 + int __init memory_dev_init(void) 434 + { 435 + unsigned int i; 436 + int ret; 437 + 438 + memory_sysdev_class.kset.hotplug_ops = &memory_hotplug_ops; 439 + ret = sysdev_class_register(&memory_sysdev_class); 440 + 441 + /* 442 + * Create entries for memory sections that were found 443 + * during boot and have been initialized 444 + */ 445 + for (i = 0; i < NR_MEM_SECTIONS; i++) { 446 + if (!valid_section_nr(i)) 447 + continue; 448 + add_memory_block(0, __nr_to_section(i), MEM_ONLINE, 0); 449 + } 450 + 451 + memory_probe_init(); 452 + block_size_init(); 453 + 454 + return ret; 455 + }
+94
include/linux/memory.h
··· 1 + /* 2 + * include/linux/memory.h - generic memory definition 3 + * 4 + * This is mainly for topological representation. We define the 5 + * basic "struct memory_block" here, which can be embedded in per-arch 6 + * definitions or NUMA information. 7 + * 8 + * Basic handling of the devices is done in drivers/base/memory.c 9 + * and system devices are handled in drivers/base/sys.c. 10 + * 11 + * Memory block are exported via sysfs in the class/memory/devices/ 12 + * directory. 13 + * 14 + */ 15 + #ifndef _LINUX_MEMORY_H_ 16 + #define _LINUX_MEMORY_H_ 17 + 18 + #include <linux/sysdev.h> 19 + #include <linux/node.h> 20 + #include <linux/compiler.h> 21 + 22 + #include <asm/semaphore.h> 23 + 24 + struct memory_block { 25 + unsigned long phys_index; 26 + unsigned long state; 27 + /* 28 + * This serializes all state change requests. It isn't 29 + * held during creation because the control files are 30 + * created long after the critical areas during 31 + * initialization. 32 + */ 33 + struct semaphore state_sem; 34 + int phys_device; /* to which fru does this belong? */ 35 + void *hw; /* optional pointer to fw/hw data */ 36 + int (*phys_callback)(struct memory_block *); 37 + struct sys_device sysdev; 38 + }; 39 + 40 + /* These states are exposed to userspace as text strings in sysfs */ 41 + #define MEM_ONLINE (1<<0) /* exposed to userspace */ 42 + #define MEM_GOING_OFFLINE (1<<1) /* exposed to userspace */ 43 + #define MEM_OFFLINE (1<<2) /* exposed to userspace */ 44 + 45 + /* 46 + * All of these states are currently kernel-internal for notifying 47 + * kernel components and architectures. 48 + * 49 + * For MEM_MAPPING_INVALID, all notifier chains with priority >0 50 + * are called before pfn_to_page() becomes invalid. The priority=0 51 + * entry is reserved for the function that actually makes 52 + * pfn_to_page() stop working. Any notifiers that want to be called 53 + * after that should have priority <0. 
54 + */ 55 + #define MEM_MAPPING_INVALID (1<<3) 56 + 57 + #ifndef CONFIG_MEMORY_HOTPLUG 58 + static inline int memory_dev_init(void) 59 + { 60 + return 0; 61 + } 62 + static inline int register_memory_notifier(struct notifier_block *nb) 63 + { 64 + return 0; 65 + } 66 + static inline void unregister_memory_notifier(struct notifier_block *nb) 67 + { 68 + } 69 + #else 70 + extern int register_memory(struct memory_block *, struct mem_section *section, struct node *); 71 + extern int register_new_memory(struct mem_section *); 72 + extern int unregister_memory_section(struct mem_section *); 73 + extern int memory_dev_init(void); 74 + extern int register_memory_notifier(struct notifier_block *nb); 75 + extern void unregister_memory_notifier(struct notifier_block *nb); 76 + 77 + #define CONFIG_MEM_BLOCK_SIZE (PAGES_PER_SECTION<<PAGE_SHIFT) 78 + 79 + extern int invalidate_phys_mapping(unsigned long, unsigned long); 80 + struct notifier_block; 81 + 82 + extern int register_memory_notifier(struct notifier_block *nb); 83 + extern void unregister_memory_notifier(struct notifier_block *nb); 84 + 85 + extern struct sysdev_class memory_sysdev_class; 86 + #endif /* CONFIG_MEMORY_HOTPLUG */ 87 + 88 + #define hotplug_memory_notifier(fn, pri) { \ 89 + static struct notifier_block fn##_mem_nb = \ 90 + { .notifier_call = fn, .priority = pri }; \ 91 + register_memory_notifier(&fn##_mem_nb); \ 92 + } 93 + 94 + #endif /* _LINUX_MEMORY_H_ */
+35
include/linux/memory_hotplug.h
··· 3 3 4 4 #include <linux/mmzone.h> 5 5 #include <linux/spinlock.h> 6 + #include <linux/mmzone.h> 7 + #include <linux/notifier.h> 6 8 7 9 #ifdef CONFIG_MEMORY_HOTPLUG 8 10 /* ··· 48 46 { 49 47 seqlock_init(&zone->span_seqlock); 50 48 } 49 + extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages); 50 + extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages); 51 + extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); 52 + /* need some defines for these for archs that don't support it */ 53 + extern void online_page(struct page *page); 54 + /* VM interface that may be used by firmware interface */ 55 + extern int add_memory(u64 start, u64 size); 56 + extern int remove_memory(u64 start, u64 size); 57 + extern int online_pages(unsigned long, unsigned long); 58 + 59 + /* reasonably generic interface to expand the physical pages in a zone */ 60 + extern int __add_pages(struct zone *zone, unsigned long start_pfn, 61 + unsigned long nr_pages); 51 62 #else /* ! CONFIG_MEMORY_HOTPLUG */ 52 63 /* 53 64 * Stub functions for when hotplug is off ··· 80 65 static inline void zone_span_writelock(struct zone *zone) {} 81 66 static inline void zone_span_writeunlock(struct zone *zone) {} 82 67 static inline void zone_seqlock_init(struct zone *zone) {} 68 + 69 + static inline int mhp_notimplemented(const char *func) 70 + { 71 + printk(KERN_WARNING "%s() called, with CONFIG_MEMORY_HOTPLUG disabled\n", func); 72 + dump_stack(); 73 + return -ENOSYS; 74 + } 75 + 76 + static inline int __add_pages(struct zone *zone, unsigned long start_pfn, 77 + unsigned long nr_pages) 78 + { 79 + return mhp_notimplemented(__FUNCTION__); 80 + } 83 81 #endif /* ! 
CONFIG_MEMORY_HOTPLUG */ 82 + static inline int __remove_pages(struct zone *zone, unsigned long start_pfn, 83 + unsigned long nr_pages) 84 + { 85 + printk(KERN_WARNING "%s() called, not yet supported\n", __FUNCTION__); 86 + dump_stack(); 87 + return -ENOSYS; 88 + } 84 89 #endif /* __LINUX_MEMORY_HOTPLUG_H */
+1
include/linux/mm.h
··· 840 840 unsigned long * zones_size, unsigned long zone_start_pfn, 841 841 unsigned long *zholes_size); 842 842 extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long); 843 + extern void setup_per_zone_pages_min(void); 843 844 extern void mem_init(void); 844 845 extern void show_mem(void); 845 846 extern void si_meminfo(struct sysinfo * val);
+8
mm/Kconfig
··· 112 112 def_bool y 113 113 depends on SPARSEMEM && !SPARSEMEM_STATIC 114 114 115 + # eventually, we can have this option just 'select SPARSEMEM' 116 + config MEMORY_HOTPLUG 117 + bool "Allow for memory hot-add" 118 + depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND 119 + 120 + comment "Memory hotplug is currently incompatible with Software Suspend" 121 + depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND 122 + 115 123 # Heavily threaded applications may benefit from splitting the mm-wide 116 124 # page_table_lock, so that faults on different parts of the user address 117 125 # space can be handled with less contention: split it at this NR_CPUS.
+1 -1
mm/Makefile
··· 18 18 obj-$(CONFIG_SPARSEMEM) += sparse.o 19 19 obj-$(CONFIG_SHMEM) += shmem.o 20 20 obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o 21 - 21 + obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o 22 22 obj-$(CONFIG_FS_XIP) += filemap_xip.o
+178
mm/memory_hotplug.c
··· 1 + /* 2 + * linux/mm/memory_hotplug.c 3 + * 4 + * Copyright (C) 5 + */ 6 + 7 + #include <linux/config.h> 8 + #include <linux/stddef.h> 9 + #include <linux/mm.h> 10 + #include <linux/swap.h> 11 + #include <linux/interrupt.h> 12 + #include <linux/pagemap.h> 13 + #include <linux/bootmem.h> 14 + #include <linux/compiler.h> 15 + #include <linux/module.h> 16 + #include <linux/pagevec.h> 17 + #include <linux/slab.h> 18 + #include <linux/sysctl.h> 19 + #include <linux/cpu.h> 20 + #include <linux/memory.h> 21 + #include <linux/memory_hotplug.h> 22 + #include <linux/highmem.h> 23 + #include <linux/vmalloc.h> 24 + 25 + #include <asm/tlbflush.h> 26 + 27 + static struct page *__kmalloc_section_memmap(unsigned long nr_pages) 28 + { 29 + struct page *page, *ret; 30 + unsigned long memmap_size = sizeof(struct page) * nr_pages; 31 + 32 + page = alloc_pages(GFP_KERNEL, get_order(memmap_size)); 33 + if (page) 34 + goto got_map_page; 35 + 36 + ret = vmalloc(memmap_size); 37 + if (ret) 38 + goto got_map_ptr; 39 + 40 + return NULL; 41 + got_map_page: 42 + ret = (struct page *)pfn_to_kaddr(page_to_pfn(page)); 43 + got_map_ptr: 44 + memset(ret, 0, memmap_size); 45 + 46 + return ret; 47 + } 48 + 49 + extern void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn, 50 + unsigned long size); 51 + static void __add_zone(struct zone *zone, unsigned long phys_start_pfn) 52 + { 53 + struct pglist_data *pgdat = zone->zone_pgdat; 54 + int nr_pages = PAGES_PER_SECTION; 55 + int nid = pgdat->node_id; 56 + int zone_type; 57 + 58 + zone_type = zone - pgdat->node_zones; 59 + memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn); 60 + zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages); 61 + } 62 + 63 + extern int sparse_add_one_section(struct zone *, unsigned long, 64 + struct page *mem_map); 65 + static int __add_section(struct zone *zone, unsigned long phys_start_pfn) 66 + { 67 + struct pglist_data *pgdat = zone->zone_pgdat; 68 + int nr_pages = PAGES_PER_SECTION; 
69 + struct page *memmap; 70 + int ret; 71 + 72 + /* 73 + * This can potentially allocate memory, and does its own 74 + * internal locking. 75 + */ 76 + sparse_index_init(pfn_to_section_nr(phys_start_pfn), pgdat->node_id); 77 + 78 + pgdat_resize_lock(pgdat, &flags); 79 + memmap = __kmalloc_section_memmap(nr_pages); 80 + ret = sparse_add_one_section(zone, phys_start_pfn, memmap); 81 + pgdat_resize_unlock(pgdat, &flags); 82 + 83 + if (ret <= 0) { 84 + /* the mem_map didn't get used */ 85 + if (memmap >= (struct page *)VMALLOC_START && 86 + memmap < (struct page *)VMALLOC_END) 87 + vfree(memmap); 88 + else 89 + free_pages((unsigned long)memmap, 90 + get_order(sizeof(struct page) * nr_pages)); 91 + } 92 + 93 + if (ret < 0) 94 + return ret; 95 + 96 + __add_zone(zone, phys_start_pfn); 97 + return register_new_memory(__pfn_to_section(phys_start_pfn)); 98 + } 99 + 100 + /* 101 + * Reasonably generic function for adding memory. It is 102 + * expected that archs that support memory hotplug will 103 + * call this function after deciding the zone to which to 104 + * add the new pages. 
105 + */ 106 + int __add_pages(struct zone *zone, unsigned long phys_start_pfn, 107 + unsigned long nr_pages) 108 + { 109 + unsigned long i; 110 + int err = 0; 111 + 112 + for (i = 0; i < nr_pages; i += PAGES_PER_SECTION) { 113 + err = __add_section(zone, phys_start_pfn + i); 114 + 115 + if (err) 116 + break; 117 + } 118 + 119 + return err; 120 + } 121 + 122 + static void grow_zone_span(struct zone *zone, 123 + unsigned long start_pfn, unsigned long end_pfn) 124 + { 125 + unsigned long old_zone_end_pfn; 126 + 127 + zone_span_writelock(zone); 128 + 129 + old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; 130 + if (start_pfn < zone->zone_start_pfn) 131 + zone->zone_start_pfn = start_pfn; 132 + 133 + if (end_pfn > old_zone_end_pfn) 134 + zone->spanned_pages = end_pfn - zone->zone_start_pfn; 135 + 136 + zone_span_writeunlock(zone); 137 + } 138 + 139 + static void grow_pgdat_span(struct pglist_data *pgdat, 140 + unsigned long start_pfn, unsigned long end_pfn) 141 + { 142 + unsigned long old_pgdat_end_pfn = 143 + pgdat->node_start_pfn + pgdat->node_spanned_pages; 144 + 145 + if (start_pfn < pgdat->node_start_pfn) 146 + pgdat->node_start_pfn = start_pfn; 147 + 148 + if (end_pfn > old_pgdat_end_pfn) 149 + pgdat->node_spanned_pages = end_pfn - pgdat->node_spanned_pages; 150 + } 151 + 152 + int online_pages(unsigned long pfn, unsigned long nr_pages) 153 + { 154 + unsigned long i; 155 + unsigned long flags; 156 + unsigned long onlined_pages = 0; 157 + struct zone *zone; 158 + 159 + /* 160 + * This doesn't need a lock to do pfn_to_page(). 161 + * The section can't be removed here because of the 162 + * memory_block->state_sem. 
163 + */ 164 + zone = page_zone(pfn_to_page(pfn)); 165 + pgdat_resize_lock(zone->zone_pgdat, &flags); 166 + grow_zone_span(zone, pfn, pfn + nr_pages); 167 + grow_pgdat_span(zone->zone_pgdat, pfn, pfn + nr_pages); 168 + pgdat_resize_unlock(zone->zone_pgdat, &flags); 169 + 170 + for (i = 0; i < nr_pages; i++) { 171 + struct page *page = pfn_to_page(pfn + i); 172 + online_page(page); 173 + onlined_pages++; 174 + } 175 + zone->present_pages += onlined_pages; 176 + 177 + return 0; 178 + }
+2 -2
mm/page_alloc.c
··· 1686 1686 * up by free_all_bootmem() once the early boot process is 1687 1687 * done. Non-atomic initialization, single-pass. 1688 1688 */ 1689 - void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone, 1689 + void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone, 1690 1690 unsigned long start_pfn) 1691 1691 { 1692 1692 struct page *page; ··· 2407 2407 * that the pages_{min,low,high} values for each zone are set correctly 2408 2408 * with respect to min_free_kbytes. 2409 2409 */ 2410 - static void setup_per_zone_pages_min(void) 2410 + void setup_per_zone_pages_min(void) 2411 2411 { 2412 2412 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 2413 2413 unsigned long lowmem_pages = 0;