/*
 * Memory subsystem support
 *
 * Written by Matt Tolentino <matthew.e.tolentino@intel.com>
 *            Dave Hansen <haveblue@us.ibm.com>
 *
 * This file provides the necessary infrastructure to represent
 * a SPARSEMEM-memory-model system's physical memory in /sysfs.
 * All arch-independent code that assumes MEMORY_HOTPLUG requires
 * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/topology.h>
#include <linux/capability.h>
#include <linux/device.h>
#include <linux/memory.h>
#include <linux/kobject.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/stat.h>
#include <linux/slab.h>

#include <linux/atomic.h>
#include <asm/uaccess.h>

static DEFINE_MUTEX(mem_sysfs_mutex);

#define MEMORY_CLASS_NAME	"memory"

static int sections_per_block;

static inline int base_memory_block_id(int section_nr)
{
        return section_nr / sections_per_block;
}
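
/*
 * Illustrative example (sizes hypothetical; they vary by architecture):
 * with 128MB sections and a 1GB block size, sections_per_block is 8,
 * so sections 0-7 map to block 0, sections 8-15 to block 1, and
 * base_memory_block_id(35) returns 4.
 */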

static int memory_subsys_online(struct device *dev);
static int memory_subsys_offline(struct device *dev);

static struct bus_type memory_subsys = {
        .name = MEMORY_CLASS_NAME,
        .dev_name = MEMORY_CLASS_NAME,
        .online = memory_subsys_online,
        .offline = memory_subsys_offline,
};

static BLOCKING_NOTIFIER_HEAD(memory_chain);

int register_memory_notifier(struct notifier_block *nb)
{
        return blocking_notifier_chain_register(&memory_chain, nb);
}
EXPORT_SYMBOL(register_memory_notifier);

void unregister_memory_notifier(struct notifier_block *nb)
{
        blocking_notifier_chain_unregister(&memory_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_notifier);

static ATOMIC_NOTIFIER_HEAD(memory_isolate_chain);

int register_memory_isolate_notifier(struct notifier_block *nb)
{
        return atomic_notifier_chain_register(&memory_isolate_chain, nb);
}
EXPORT_SYMBOL(register_memory_isolate_notifier);

void unregister_memory_isolate_notifier(struct notifier_block *nb)
{
        atomic_notifier_chain_unregister(&memory_isolate_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_isolate_notifier);
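
/*
 * Illustrative sketch of a hypothetical client of the notifier API above
 * (example_mem_callback/example_mem_nb are not part of this file):
 *
 *      static int example_mem_callback(struct notifier_block *nb,
 *                                      unsigned long action, void *arg)
 *      {
 *              struct memory_notify *mn = arg;
 *
 *              if (action == MEM_GOING_OFFLINE)
 *                      pr_info("pfns %lx+%lx going offline\n",
 *                              mn->start_pfn, mn->nr_pages);
 *              return NOTIFY_OK;
 *      }
 *
 *      static struct notifier_block example_mem_nb = {
 *              .notifier_call = example_mem_callback,
 *      };
 *
 *      register_memory_notifier(&example_mem_nb);
 */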

static void memory_block_release(struct device *dev)
{
        struct memory_block *mem = container_of(dev, struct memory_block, dev);

        kfree(mem);
}

unsigned long __weak memory_block_size_bytes(void)
{
        return MIN_MEMORY_BLOCK_SIZE;
}

static unsigned long get_memory_block_size(void)
{
        unsigned long block_sz;

        block_sz = memory_block_size_bytes();

        /* Validate block_sz is a power of 2 and not less than section size */
        if ((block_sz & (block_sz - 1)) || (block_sz < MIN_MEMORY_BLOCK_SIZE)) {
                WARN_ON(1);
                block_sz = MIN_MEMORY_BLOCK_SIZE;
        }

        return block_sz;
}

/*
 * Show the memory block index, i.e. the block's first section number
 * scaled down by sections_per_block.
 */

static ssize_t show_mem_start_phys_index(struct device *dev,
                        struct device_attribute *attr, char *buf)
{
        struct memory_block *mem =
                container_of(dev, struct memory_block, dev);
        unsigned long phys_index;

        phys_index = mem->start_section_nr / sections_per_block;
        return sprintf(buf, "%08lx\n", phys_index);
}

static ssize_t show_mem_end_phys_index(struct device *dev,
                        struct device_attribute *attr, char *buf)
{
        struct memory_block *mem =
                container_of(dev, struct memory_block, dev);
        unsigned long phys_index;

        phys_index = mem->end_section_nr / sections_per_block;
        return sprintf(buf, "%08lx\n", phys_index);
}

/*
 * Show whether the section of memory is likely to be hot-removable
 */
static ssize_t show_mem_removable(struct device *dev,
                        struct device_attribute *attr, char *buf)
{
        unsigned long i, pfn;
        int ret = 1;
        struct memory_block *mem =
                container_of(dev, struct memory_block, dev);

        for (i = 0; i < sections_per_block; i++) {
                if (!present_section_nr(mem->start_section_nr + i))
                        continue;
                pfn = section_nr_to_pfn(mem->start_section_nr + i);
                ret &= is_mem_section_removable(pfn, PAGES_PER_SECTION);
        }

        return sprintf(buf, "%d\n", ret);
}
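
/*
 * For illustration, these attributes surface in sysfs roughly as follows
 * (block number and values hypothetical):
 *
 *      $ cat /sys/devices/system/memory/memory32/phys_index
 *      00000020
 *      $ cat /sys/devices/system/memory/memory32/removable
 *      1
 */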

/*
 * online, offline, going offline, etc.
 */
static ssize_t show_mem_state(struct device *dev,
                        struct device_attribute *attr, char *buf)
{
        struct memory_block *mem =
                container_of(dev, struct memory_block, dev);
        ssize_t len = 0;

        /*
         * We can probably put these states in a nice little array
         * so that they're not open-coded
         */
        switch (mem->state) {
        case MEM_ONLINE:
                len = sprintf(buf, "online\n");
                break;
        case MEM_OFFLINE:
                len = sprintf(buf, "offline\n");
                break;
        case MEM_GOING_OFFLINE:
                len = sprintf(buf, "going-offline\n");
                break;
        default:
                len = sprintf(buf, "ERROR-UNKNOWN-%ld\n",
                                mem->state);
                WARN_ON(1);
                break;
        }

        return len;
}

int memory_notify(unsigned long val, void *v)
{
        return blocking_notifier_call_chain(&memory_chain, val, v);
}

int memory_isolate_notify(unsigned long val, void *v)
{
        return atomic_notifier_call_chain(&memory_isolate_chain, val, v);
}

/*
 * The probe routines leave the pages reserved, just as the bootmem code does.
 * Make sure they're still that way.
 */
static bool pages_correctly_reserved(unsigned long start_pfn)
{
        int i, j;
        struct page *page;
        unsigned long pfn = start_pfn;

        /*
         * memmap between sections is not contiguous except with
         * SPARSEMEM_VMEMMAP. We look up the page once per section
         * and assume memmap is contiguous within each section.
         */
        for (i = 0; i < sections_per_block; i++, pfn += PAGES_PER_SECTION) {
                if (WARN_ON_ONCE(!pfn_valid(pfn)))
                        return false;
                page = pfn_to_page(pfn);

                for (j = 0; j < PAGES_PER_SECTION; j++) {
                        if (PageReserved(page + j))
                                continue;

                        printk(KERN_WARNING "section number %ld page number %d "
                                "not reserved, was it already online?\n",
                                pfn_to_section_nr(pfn), j);

                        return false;
                }
        }

        return true;
}

/*
 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
 * OK to have direct references to sparsemem variables in here.
 */
static int
memory_block_action(unsigned long phys_index, unsigned long action, int online_type)
{
        unsigned long start_pfn;
        unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
        struct page *first_page;
        int ret;

        first_page = pfn_to_page(phys_index << PFN_SECTION_SHIFT);
        start_pfn = page_to_pfn(first_page);

        switch (action) {
        case MEM_ONLINE:
                if (!pages_correctly_reserved(start_pfn))
                        return -EBUSY;

                ret = online_pages(start_pfn, nr_pages, online_type);
                break;
        case MEM_OFFLINE:
                ret = offline_pages(start_pfn, nr_pages);
                break;
        default:
                WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
                        "%ld\n", __func__, phys_index, action, action);
                ret = -EINVAL;
        }

        return ret;
}

static int __memory_block_change_state(struct memory_block *mem,
                unsigned long to_state, unsigned long from_state_req,
                int online_type)
{
        int ret = 0;

        if (mem->state != from_state_req)
                return -EINVAL;

        if (to_state == MEM_OFFLINE)
                mem->state = MEM_GOING_OFFLINE;

        ret = memory_block_action(mem->start_section_nr, to_state, online_type);
        mem->state = ret ? from_state_req : to_state;
        return ret;
}

static int memory_subsys_online(struct device *dev)
{
        struct memory_block *mem = container_of(dev, struct memory_block, dev);
        int ret;

        mutex_lock(&mem->state_mutex);

        ret = mem->state == MEM_ONLINE ? 0 :
                __memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE,
                                            ONLINE_KEEP);

        mutex_unlock(&mem->state_mutex);
        return ret;
}

static int memory_subsys_offline(struct device *dev)
{
        struct memory_block *mem = container_of(dev, struct memory_block, dev);
        int ret;

        mutex_lock(&mem->state_mutex);

        ret = mem->state == MEM_OFFLINE ? 0 :
                __memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE, -1);

        mutex_unlock(&mem->state_mutex);
        return ret;
}
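
/*
 * These two callbacks are not called directly from this file; the driver
 * core invokes them through the bus_type .online/.offline hooks above,
 * e.g. from device_online()/device_offline() when userspace toggles the
 * generic "online" attribute.
 */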

static int __memory_block_change_state_uevent(struct memory_block *mem,
                unsigned long to_state, unsigned long from_state_req,
                int online_type)
{
        int ret = __memory_block_change_state(mem, to_state, from_state_req,
                                              online_type);
        if (!ret) {
                switch (mem->state) {
                case MEM_OFFLINE:
                        kobject_uevent(&mem->dev.kobj, KOBJ_OFFLINE);
                        break;
                case MEM_ONLINE:
                        kobject_uevent(&mem->dev.kobj, KOBJ_ONLINE);
                        break;
                default:
                        break;
                }
        }
        return ret;
}

static int memory_block_change_state(struct memory_block *mem,
                unsigned long to_state, unsigned long from_state_req,
                int online_type)
{
        int ret;

        mutex_lock(&mem->state_mutex);
        ret = __memory_block_change_state_uevent(mem, to_state, from_state_req,
                                                 online_type);
        mutex_unlock(&mem->state_mutex);

        return ret;
}

static ssize_t
store_mem_state(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t count)
{
        struct memory_block *mem;
        bool offline;
        int ret = -EINVAL;

        mem = container_of(dev, struct memory_block, dev);

        lock_device_hotplug();

        if (!strncmp(buf, "online_kernel", min_t(int, count, 13))) {
                offline = false;
                ret = memory_block_change_state(mem, MEM_ONLINE,
                                                MEM_OFFLINE, ONLINE_KERNEL);
        } else if (!strncmp(buf, "online_movable", min_t(int, count, 14))) {
                offline = false;
                ret = memory_block_change_state(mem, MEM_ONLINE,
                                                MEM_OFFLINE, ONLINE_MOVABLE);
        } else if (!strncmp(buf, "online", min_t(int, count, 6))) {
                offline = false;
                ret = memory_block_change_state(mem, MEM_ONLINE,
                                                MEM_OFFLINE, ONLINE_KEEP);
        } else if (!strncmp(buf, "offline", min_t(int, count, 7))) {
                offline = true;
                ret = memory_block_change_state(mem, MEM_OFFLINE,
                                                MEM_ONLINE, -1);
        }
        if (!ret)
                dev->offline = offline;

        unlock_device_hotplug();

        if (ret)
                return ret;
        return count;
}
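
/*
 * For illustration, onlining and offlining a block from userspace
 * (block number hypothetical):
 *
 *      # echo online_movable > /sys/devices/system/memory/memory32/state
 *      # echo offline > /sys/devices/system/memory/memory32/state
 *
 * Note the longer "online_*" strings are tested before plain "online",
 * so the shorter prefix match cannot shadow them.
 */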

/*
 * phys_device is a bad name for this. What I really want
 * is a way to differentiate between memory ranges that
 * are part of physical devices that constitute
 * a complete removable unit or FRU (field replaceable unit),
 * i.e. do these ranges belong to the same physical device,
 * such that if I offline all of these sections I can then
 * remove the physical device?
 */
static ssize_t show_phys_device(struct device *dev,
                                struct device_attribute *attr, char *buf)
{
        struct memory_block *mem =
                container_of(dev, struct memory_block, dev);
        return sprintf(buf, "%d\n", mem->phys_device);
}

static DEVICE_ATTR(phys_index, 0444, show_mem_start_phys_index, NULL);
static DEVICE_ATTR(end_phys_index, 0444, show_mem_end_phys_index, NULL);
static DEVICE_ATTR(state, 0644, show_mem_state, store_mem_state);
static DEVICE_ATTR(phys_device, 0444, show_phys_device, NULL);
static DEVICE_ATTR(removable, 0444, show_mem_removable, NULL);

/*
 * Block size attribute stuff
 */
static ssize_t
print_block_size(struct device *dev, struct device_attribute *attr,
                 char *buf)
{
        return sprintf(buf, "%lx\n", get_memory_block_size());
}

static DEVICE_ATTR(block_size_bytes, 0444, print_block_size, NULL);

/*
 * Some architectures will have custom drivers to do this, and
 * will not need to do it from userspace. The fake hot-add code
 * as well as ppc64 will do all of their discovery in userspace
 * and will require this interface.
 */
#ifdef CONFIG_ARCH_MEMORY_PROBE
static ssize_t
memory_probe_store(struct device *dev, struct device_attribute *attr,
                   const char *buf, size_t count)
{
        u64 phys_addr;
        int nid;
        int i, ret;
        unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block;

        phys_addr = simple_strtoull(buf, NULL, 0);

        if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
                return -EINVAL;

        for (i = 0; i < sections_per_block; i++) {
                nid = memory_add_physaddr_to_nid(phys_addr);
                ret = add_memory(nid, phys_addr,
                                 PAGES_PER_SECTION << PAGE_SHIFT);
                if (ret)
                        goto out;

                phys_addr += MIN_MEMORY_BLOCK_SIZE;
        }

        ret = count;
out:
        return ret;
}

static DEVICE_ATTR(probe, S_IWUSR, NULL, memory_probe_store);
#endif
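
/*
 * Illustrative probe usage (address hypothetical; it must be aligned to
 * the memory block size):
 *
 *      # echo 0x40000000 > /sys/devices/system/memory/probe
 */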

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Support for offlining pages of memory
 */

/* Soft offline a page */
static ssize_t
store_soft_offline_page(struct device *dev,
                        struct device_attribute *attr,
                        const char *buf, size_t count)
{
        int ret;
        u64 pfn;
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
        if (strict_strtoull(buf, 0, &pfn) < 0)
                return -EINVAL;
        pfn >>= PAGE_SHIFT;
        if (!pfn_valid(pfn))
                return -ENXIO;
        ret = soft_offline_page(pfn_to_page(pfn), 0);
        return ret == 0 ? count : ret;
}

/* Forcibly offline a page, including killing processes. */
static ssize_t
store_hard_offline_page(struct device *dev,
                        struct device_attribute *attr,
                        const char *buf, size_t count)
{
        int ret;
        u64 pfn;
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
        if (strict_strtoull(buf, 0, &pfn) < 0)
                return -EINVAL;
        pfn >>= PAGE_SHIFT;
        ret = memory_failure(pfn, 0, 0);
        return ret ? ret : count;
}

static DEVICE_ATTR(soft_offline_page, S_IWUSR, NULL, store_soft_offline_page);
static DEVICE_ATTR(hard_offline_page, S_IWUSR, NULL, store_hard_offline_page);
#endif
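
/*
 * Despite the variable name, both stores above parse a physical *byte*
 * address and shift it down to a pfn, so usage is e.g.
 * (address hypothetical):
 *
 *      # echo 0x200000 > /sys/devices/system/memory/soft_offline_page
 */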

/*
 * Note that phys_device is optional. It is here to allow for
 * differentiation between which *physical* devices each
 * section belongs to...
 */
int __weak arch_get_memory_phys_device(unsigned long start_pfn)
{
        return 0;
}

/*
 * A reference for the returned object is held and the reference for the
 * hinted object is released.
 */
struct memory_block *find_memory_block_hinted(struct mem_section *section,
                                              struct memory_block *hint)
{
        int block_id = base_memory_block_id(__section_nr(section));
        struct device *hintdev = hint ? &hint->dev : NULL;
        struct device *dev;

        dev = subsys_find_device_by_id(&memory_subsys, block_id, hintdev);
        if (hint)
                put_device(&hint->dev);
        if (!dev)
                return NULL;
        return container_of(dev, struct memory_block, dev);
}

/*
 * For now, we have a linear search to go find the appropriate
 * memory_block corresponding to a particular phys_index. If
 * this gets to be a real problem, we can always use a radix
 * tree or something here.
 *
 * This could be made generic for all device subsystems.
 */
struct memory_block *find_memory_block(struct mem_section *section)
{
        return find_memory_block_hinted(section, NULL);
}

static struct attribute *memory_memblk_attrs[] = {
        &dev_attr_phys_index.attr,
        &dev_attr_end_phys_index.attr,
        &dev_attr_state.attr,
        &dev_attr_phys_device.attr,
        &dev_attr_removable.attr,
        NULL
};

static struct attribute_group memory_memblk_attr_group = {
        .attrs = memory_memblk_attrs,
};

static const struct attribute_group *memory_memblk_attr_groups[] = {
        &memory_memblk_attr_group,
        NULL,
};

/*
 * register_memory - Setup a sysfs device for a memory block
 */
static
int register_memory(struct memory_block *memory)
{
        int error;

        memory->dev.bus = &memory_subsys;
        memory->dev.id = memory->start_section_nr / sections_per_block;
        memory->dev.release = memory_block_release;
        memory->dev.groups = memory_memblk_attr_groups;
        memory->dev.offline = memory->state == MEM_OFFLINE;

        error = device_register(&memory->dev);
        return error;
}
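
/*
 * Since .dev_name is "memory" and .id is the block number, a registered
 * block appears in sysfs as e.g. (block number hypothetical):
 *
 *      /sys/devices/system/memory/memory32/
 */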

static int init_memory_block(struct memory_block **memory,
                             struct mem_section *section, unsigned long state)
{
        struct memory_block *mem;
        unsigned long start_pfn;
        int scn_nr;
        int ret = 0;

        mem = kzalloc(sizeof(*mem), GFP_KERNEL);
        if (!mem)
                return -ENOMEM;

        scn_nr = __section_nr(section);
        mem->start_section_nr =
                        base_memory_block_id(scn_nr) * sections_per_block;
        mem->end_section_nr = mem->start_section_nr + sections_per_block - 1;
        mem->state = state;
        mem->section_count++;
        mutex_init(&mem->state_mutex);
        start_pfn = section_nr_to_pfn(mem->start_section_nr);
        mem->phys_device = arch_get_memory_phys_device(start_pfn);

        ret = register_memory(mem);

        *memory = mem;
        return ret;
}

static int add_memory_section(int nid, struct mem_section *section,
                        struct memory_block **mem_p,
                        unsigned long state, enum mem_add_context context)
{
        struct memory_block *mem = NULL;
        int scn_nr = __section_nr(section);
        int ret = 0;

        mutex_lock(&mem_sysfs_mutex);

        if (context == BOOT) {
                /* same memory block ? */
                if (mem_p && *mem_p)
                        if (scn_nr >= (*mem_p)->start_section_nr &&
                            scn_nr <= (*mem_p)->end_section_nr) {
                                mem = *mem_p;
                                kobject_get(&mem->dev.kobj);
                        }
        } else
                mem = find_memory_block(section);

        if (mem) {
                mem->section_count++;
                kobject_put(&mem->dev.kobj);
        } else {
                ret = init_memory_block(&mem, section, state);
                /* store memory_block pointer for next loop */
                if (!ret && context == BOOT)
                        if (mem_p)
                                *mem_p = mem;
        }

        if (!ret) {
                if (context == HOTPLUG &&
                    mem->section_count == sections_per_block)
                        ret = register_mem_sect_under_node(mem, nid);
        }

        mutex_unlock(&mem_sysfs_mutex);
        return ret;
}

/*
 * We need an interface for the VM to add new memory regions,
 * but without onlining them.
 */
int register_new_memory(int nid, struct mem_section *section)
{
        return add_memory_section(nid, section, NULL, MEM_OFFLINE, HOTPLUG);
}

#ifdef CONFIG_MEMORY_HOTREMOVE
static void
unregister_memory(struct memory_block *memory)
{
        BUG_ON(memory->dev.bus != &memory_subsys);

        /* drop the ref. we got in remove_memory_block() */
        kobject_put(&memory->dev.kobj);
        device_unregister(&memory->dev);
}

static int remove_memory_block(unsigned long node_id,
                               struct mem_section *section, int phys_device)
{
        struct memory_block *mem;

        mutex_lock(&mem_sysfs_mutex);
        mem = find_memory_block(section);
        unregister_mem_sect_under_nodes(mem, __section_nr(section));

        mem->section_count--;
        if (mem->section_count == 0)
                unregister_memory(mem);
        else
                kobject_put(&mem->dev.kobj);

        mutex_unlock(&mem_sysfs_mutex);
        return 0;
}

int unregister_memory_section(struct mem_section *section)
{
        if (!present_section(section))
                return -EINVAL;

        return remove_memory_block(0, section, 0);
}
#endif /* CONFIG_MEMORY_HOTREMOVE */

/* return true if the memory block is offlined, otherwise, return false */
bool is_memblock_offlined(struct memory_block *mem)
{
        return mem->state == MEM_OFFLINE;
}

static struct attribute *memory_root_attrs[] = {
#ifdef CONFIG_ARCH_MEMORY_PROBE
        &dev_attr_probe.attr,
#endif

#ifdef CONFIG_MEMORY_FAILURE
        &dev_attr_soft_offline_page.attr,
        &dev_attr_hard_offline_page.attr,
#endif

        &dev_attr_block_size_bytes.attr,
        NULL
};

static struct attribute_group memory_root_attr_group = {
        .attrs = memory_root_attrs,
};

static const struct attribute_group *memory_root_attr_groups[] = {
        &memory_root_attr_group,
        NULL,
};

/*
 * Initialize the sysfs support for memory devices...
 */
int __init memory_dev_init(void)
{
        unsigned int i;
        int ret;
        int err;
        unsigned long block_sz;
        struct memory_block *mem = NULL;

        ret = subsys_system_register(&memory_subsys, memory_root_attr_groups);
        if (ret)
                goto out;

        block_sz = get_memory_block_size();
        sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;

        /*
         * Create entries for memory sections that were found
         * during boot and have been initialized
         */
        for (i = 0; i < NR_MEM_SECTIONS; i++) {
                if (!present_section_nr(i))
                        continue;
                /* don't need to reuse memory_block if only one per block */
                err = add_memory_section(0, __nr_to_section(i),
                                         (sections_per_block == 1) ? NULL : &mem,
                                         MEM_ONLINE,
                                         BOOT);
                if (!ret)
                        ret = err;
        }

out:
        if (ret)
                printk(KERN_ERR "%s() failed: %d\n", __func__, ret);
        return ret;
}