mm/page_cgroup.c at v3.1 · tjh.dev/kernel

tjh.dev / kernel
fork
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork
kernel / mm / page_cgroup.c
at v3.1 571 lines 13 kB view raw
wrap content
  1#include <linux/mm.h>
  2#include <linux/mmzone.h>
  3#include <linux/bootmem.h>
  4#include <linux/bit_spinlock.h>
  5#include <linux/page_cgroup.h>
  6#include <linux/hash.h>
  7#include <linux/slab.h>
  8#include <linux/memory.h>
  9#include <linux/vmalloc.h>
 10#include <linux/cgroup.h>
 11#include <linux/swapops.h>
 12#include <linux/kmemleak.h>
 13
 14static void __meminit init_page_cgroup(struct page_cgroup *pc, unsigned long id)
 15{
 16	pc->flags = 0;
 17	set_page_cgroup_array_id(pc, id);
 18	pc->mem_cgroup = NULL;
 19	INIT_LIST_HEAD(&pc->lru);
 20}
 21static unsigned long total_usage;
 22
 23#if !defined(CONFIG_SPARSEMEM)
 24
 25
 26void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
 27{
 28	pgdat->node_page_cgroup = NULL;
 29}
 30
 31struct page_cgroup *lookup_page_cgroup(struct page *page)
 32{
 33	unsigned long pfn = page_to_pfn(page);
 34	unsigned long offset;
 35	struct page_cgroup *base;
 36
 37	base = NODE_DATA(page_to_nid(page))->node_page_cgroup;
 38	if (unlikely(!base))
 39		return NULL;
 40
 41	offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
 42	return base + offset;
 43}
 44
 45struct page *lookup_cgroup_page(struct page_cgroup *pc)
 46{
 47	unsigned long pfn;
 48	struct page *page;
 49	pg_data_t *pgdat;
 50
 51	pgdat = NODE_DATA(page_cgroup_array_id(pc));
 52	pfn = pc - pgdat->node_page_cgroup + pgdat->node_start_pfn;
 53	page = pfn_to_page(pfn);
 54	VM_BUG_ON(pc != lookup_page_cgroup(page));
 55	return page;
 56}
 57
 58static int __init alloc_node_page_cgroup(int nid)
 59{
 60	struct page_cgroup *base, *pc;
 61	unsigned long table_size;
 62	unsigned long start_pfn, nr_pages, index;
 63
 64	start_pfn = NODE_DATA(nid)->node_start_pfn;
 65	nr_pages = NODE_DATA(nid)->node_spanned_pages;
 66
 67	if (!nr_pages)
 68		return 0;
 69
 70	table_size = sizeof(struct page_cgroup) * nr_pages;
 71
 72	base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
 73			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
 74	if (!base)
 75		return -ENOMEM;
 76	for (index = 0; index < nr_pages; index++) {
 77		pc = base + index;
 78		init_page_cgroup(pc, nid);
 79	}
 80	NODE_DATA(nid)->node_page_cgroup = base;
 81	total_usage += table_size;
 82	return 0;
 83}
 84
 85void __init page_cgroup_init_flatmem(void)
 86{
 87
 88	int nid, fail;
 89
 90	if (mem_cgroup_disabled())
 91		return;
 92
 93	for_each_online_node(nid)  {
 94		fail = alloc_node_page_cgroup(nid);
 95		if (fail)
 96			goto fail;
 97	}
 98	printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
 99	printk(KERN_INFO "please try 'cgroup_disable=memory' option if you"
100	" don't want memory cgroups\n");
101	return;
102fail:
103	printk(KERN_CRIT "allocation of page_cgroup failed.\n");
104	printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n");
105	panic("Out of memory");
106}
107
108#else /* CONFIG_FLAT_NODE_MEM_MAP */
109
110struct page_cgroup *lookup_page_cgroup(struct page *page)
111{
112	unsigned long pfn = page_to_pfn(page);
113	struct mem_section *section = __pfn_to_section(pfn);
114
115	if (!section->page_cgroup)
116		return NULL;
117	return section->page_cgroup + pfn;
118}
119
120struct page *lookup_cgroup_page(struct page_cgroup *pc)
121{
122	struct mem_section *section;
123	struct page *page;
124	unsigned long nr;
125
126	nr = page_cgroup_array_id(pc);
127	section = __nr_to_section(nr);
128	page = pfn_to_page(pc - section->page_cgroup);
129	VM_BUG_ON(pc != lookup_page_cgroup(page));
130	return page;
131}
132
133static void *__meminit alloc_page_cgroup(size_t size, int nid)
134{
135	void *addr = NULL;
136
137	addr = alloc_pages_exact_nid(nid, size, GFP_KERNEL | __GFP_NOWARN);
138	if (addr)
139		return addr;
140
141	if (node_state(nid, N_HIGH_MEMORY))
142		addr = vmalloc_node(size, nid);
143	else
144		addr = vmalloc(size);
145
146	return addr;
147}
148
149#ifdef CONFIG_MEMORY_HOTPLUG
150static void free_page_cgroup(void *addr)
151{
152	if (is_vmalloc_addr(addr)) {
153		vfree(addr);
154	} else {
155		struct page *page = virt_to_page(addr);
156		size_t table_size =
157			sizeof(struct page_cgroup) * PAGES_PER_SECTION;
158
159		BUG_ON(PageReserved(page));
160		free_pages_exact(addr, table_size);
161	}
162}
163#endif
164
165static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
166{
167	struct page_cgroup *base, *pc;
168	struct mem_section *section;
169	unsigned long table_size;
170	unsigned long nr;
171	int index;
172
173	nr = pfn_to_section_nr(pfn);
174	section = __nr_to_section(nr);
175
176	if (section->page_cgroup)
177		return 0;
178
179	table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
180	base = alloc_page_cgroup(table_size, nid);
181
182	/*
183	 * The value stored in section->page_cgroup is (base - pfn)
184	 * and it does not point to the memory block allocated above,
185	 * causing kmemleak false positives.
186	 */
187	kmemleak_not_leak(base);
188
189	if (!base) {
190		printk(KERN_ERR "page cgroup allocation failure\n");
191		return -ENOMEM;
192	}
193
194	for (index = 0; index < PAGES_PER_SECTION; index++) {
195		pc = base + index;
196		init_page_cgroup(pc, nr);
197	}
198	/*
199	 * The passed "pfn" may not be aligned to SECTION.  For the calculation
200	 * we need to apply a mask.
201	 */
202	pfn &= PAGE_SECTION_MASK;
203	section->page_cgroup = base - pfn;
204	total_usage += table_size;
205	return 0;
206}
207#ifdef CONFIG_MEMORY_HOTPLUG
208void __free_page_cgroup(unsigned long pfn)
209{
210	struct mem_section *ms;
211	struct page_cgroup *base;
212
213	ms = __pfn_to_section(pfn);
214	if (!ms || !ms->page_cgroup)
215		return;
216	base = ms->page_cgroup + pfn;
217	free_page_cgroup(base);
218	ms->page_cgroup = NULL;
219}
220
221int __meminit online_page_cgroup(unsigned long start_pfn,
222			unsigned long nr_pages,
223			int nid)
224{
225	unsigned long start, end, pfn;
226	int fail = 0;
227
228	start = SECTION_ALIGN_DOWN(start_pfn);
229	end = SECTION_ALIGN_UP(start_pfn + nr_pages);
230
231	if (nid == -1) {
232		/*
233		 * In this case, "nid" already exists and contains valid memory.
234		 * "start_pfn" passed to us is a pfn which is an arg for
235		 * online__pages(), and start_pfn should exist.
236		 */
237		nid = pfn_to_nid(start_pfn);
238		VM_BUG_ON(!node_state(nid, N_ONLINE));
239	}
240
241	for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
242		if (!pfn_present(pfn))
243			continue;
244		fail = init_section_page_cgroup(pfn, nid);
245	}
246	if (!fail)
247		return 0;
248
249	/* rollback */
250	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
251		__free_page_cgroup(pfn);
252
253	return -ENOMEM;
254}
255
256int __meminit offline_page_cgroup(unsigned long start_pfn,
257		unsigned long nr_pages, int nid)
258{
259	unsigned long start, end, pfn;
260
261	start = SECTION_ALIGN_DOWN(start_pfn);
262	end = SECTION_ALIGN_UP(start_pfn + nr_pages);
263
264	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
265		__free_page_cgroup(pfn);
266	return 0;
267
268}
269
270static int __meminit page_cgroup_callback(struct notifier_block *self,
271			       unsigned long action, void *arg)
272{
273	struct memory_notify *mn = arg;
274	int ret = 0;
275	switch (action) {
276	case MEM_GOING_ONLINE:
277		ret = online_page_cgroup(mn->start_pfn,
278				   mn->nr_pages, mn->status_change_nid);
279		break;
280	case MEM_OFFLINE:
281		offline_page_cgroup(mn->start_pfn,
282				mn->nr_pages, mn->status_change_nid);
283		break;
284	case MEM_CANCEL_ONLINE:
285	case MEM_GOING_OFFLINE:
286		break;
287	case MEM_ONLINE:
288	case MEM_CANCEL_OFFLINE:
289		break;
290	}
291
292	return notifier_from_errno(ret);
293}
294
295#endif
296
297void __init page_cgroup_init(void)
298{
299	unsigned long pfn;
300	int nid;
301
302	if (mem_cgroup_disabled())
303		return;
304
305	for_each_node_state(nid, N_HIGH_MEMORY) {
306		unsigned long start_pfn, end_pfn;
307
308		start_pfn = node_start_pfn(nid);
309		end_pfn = node_end_pfn(nid);
310		/*
311		 * start_pfn and end_pfn may not be aligned to SECTION and the
312		 * page->flags of out of node pages are not initialized.  So we
313		 * scan [start_pfn, the biggest section's pfn < end_pfn) here.
314		 */
315		for (pfn = start_pfn;
316		     pfn < end_pfn;
317                     pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {
318
319			if (!pfn_valid(pfn))
320				continue;
321			/*
322			 * Nodes's pfns can be overlapping.
323			 * We know some arch can have a nodes layout such as
324			 * -------------pfn-------------->
325			 * N0 | N1 | N2 | N0 | N1 | N2|....
326			 */
327			if (pfn_to_nid(pfn) != nid)
328				continue;
329			if (init_section_page_cgroup(pfn, nid))
330				goto oom;
331		}
332	}
333	hotplug_memory_notifier(page_cgroup_callback, 0);
334	printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
335	printk(KERN_INFO "please try 'cgroup_disable=memory' option if you "
336			 "don't want memory cgroups\n");
337	return;
338oom:
339	printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
340	panic("Out of memory");
341}
342
343void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
344{
345	return;
346}
347
348#endif
349
350
351#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
352
353static DEFINE_MUTEX(swap_cgroup_mutex);
354struct swap_cgroup_ctrl {
355	struct page **map;
356	unsigned long length;
357	spinlock_t	lock;
358};
359
360struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
361
362struct swap_cgroup {
363	unsigned short		id;
364};
365#define SC_PER_PAGE	(PAGE_SIZE/sizeof(struct swap_cgroup))
366#define SC_POS_MASK	(SC_PER_PAGE - 1)
367
368/*
369 * SwapCgroup implements "lookup" and "exchange" operations.
370 * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge
371 * against SwapCache. At swap_free(), this is accessed directly from swap.
372 *
373 * This means,
374 *  - we have no race in "exchange" when we're accessed via SwapCache because
375 *    SwapCache(and its swp_entry) is under lock.
376 *  - When called via swap_free(), there is no user of this entry and no race.
377 * Then, we don't need lock around "exchange".
378 *
379 * TODO: we can push these buffers out to HIGHMEM.
380 */
381
382/*
383 * allocate buffer for swap_cgroup.
384 */
385static int swap_cgroup_prepare(int type)
386{
387	struct page *page;
388	struct swap_cgroup_ctrl *ctrl;
389	unsigned long idx, max;
390
391	ctrl = &swap_cgroup_ctrl[type];
392
393	for (idx = 0; idx < ctrl->length; idx++) {
394		page = alloc_page(GFP_KERNEL | __GFP_ZERO);
395		if (!page)
396			goto not_enough_page;
397		ctrl->map[idx] = page;
398	}
399	return 0;
400not_enough_page:
401	max = idx;
402	for (idx = 0; idx < max; idx++)
403		__free_page(ctrl->map[idx]);
404
405	return -ENOMEM;
406}
407
408/**
409 * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
410 * @end: swap entry to be cmpxchged
411 * @old: old id
412 * @new: new id
413 *
414 * Returns old id at success, 0 at failure.
415 * (There is no mem_cgroup using 0 as its id)
416 */
417unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
418					unsigned short old, unsigned short new)
419{
420	int type = swp_type(ent);
421	unsigned long offset = swp_offset(ent);
422	unsigned long idx = offset / SC_PER_PAGE;
423	unsigned long pos = offset & SC_POS_MASK;
424	struct swap_cgroup_ctrl *ctrl;
425	struct page *mappage;
426	struct swap_cgroup *sc;
427	unsigned long flags;
428	unsigned short retval;
429
430	ctrl = &swap_cgroup_ctrl[type];
431
432	mappage = ctrl->map[idx];
433	sc = page_address(mappage);
434	sc += pos;
435	spin_lock_irqsave(&ctrl->lock, flags);
436	retval = sc->id;
437	if (retval == old)
438		sc->id = new;
439	else
440		retval = 0;
441	spin_unlock_irqrestore(&ctrl->lock, flags);
442	return retval;
443}
444
445/**
446 * swap_cgroup_record - record mem_cgroup for this swp_entry.
447 * @ent: swap entry to be recorded into
448 * @mem: mem_cgroup to be recorded
449 *
450 * Returns old value at success, 0 at failure.
451 * (Of course, old value can be 0.)
452 */
453unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
454{
455	int type = swp_type(ent);
456	unsigned long offset = swp_offset(ent);
457	unsigned long idx = offset / SC_PER_PAGE;
458	unsigned long pos = offset & SC_POS_MASK;
459	struct swap_cgroup_ctrl *ctrl;
460	struct page *mappage;
461	struct swap_cgroup *sc;
462	unsigned short old;
463	unsigned long flags;
464
465	ctrl = &swap_cgroup_ctrl[type];
466
467	mappage = ctrl->map[idx];
468	sc = page_address(mappage);
469	sc += pos;
470	spin_lock_irqsave(&ctrl->lock, flags);
471	old = sc->id;
472	sc->id = id;
473	spin_unlock_irqrestore(&ctrl->lock, flags);
474
475	return old;
476}
477
478/**
479 * lookup_swap_cgroup - lookup mem_cgroup tied to swap entry
480 * @ent: swap entry to be looked up.
481 *
482 * Returns CSS ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
483 */
484unsigned short lookup_swap_cgroup(swp_entry_t ent)
485{
486	int type = swp_type(ent);
487	unsigned long offset = swp_offset(ent);
488	unsigned long idx = offset / SC_PER_PAGE;
489	unsigned long pos = offset & SC_POS_MASK;
490	struct swap_cgroup_ctrl *ctrl;
491	struct page *mappage;
492	struct swap_cgroup *sc;
493	unsigned short ret;
494
495	ctrl = &swap_cgroup_ctrl[type];
496	mappage = ctrl->map[idx];
497	sc = page_address(mappage);
498	sc += pos;
499	ret = sc->id;
500	return ret;
501}
502
503int swap_cgroup_swapon(int type, unsigned long max_pages)
504{
505	void *array;
506	unsigned long array_size;
507	unsigned long length;
508	struct swap_cgroup_ctrl *ctrl;
509
510	if (!do_swap_account)
511		return 0;
512
513	length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
514	array_size = length * sizeof(void *);
515
516	array = vmalloc(array_size);
517	if (!array)
518		goto nomem;
519
520	memset(array, 0, array_size);
521	ctrl = &swap_cgroup_ctrl[type];
522	mutex_lock(&swap_cgroup_mutex);
523	ctrl->length = length;
524	ctrl->map = array;
525	spin_lock_init(&ctrl->lock);
526	if (swap_cgroup_prepare(type)) {
527		/* memory shortage */
528		ctrl->map = NULL;
529		ctrl->length = 0;
530		mutex_unlock(&swap_cgroup_mutex);
531		vfree(array);
532		goto nomem;
533	}
534	mutex_unlock(&swap_cgroup_mutex);
535
536	return 0;
537nomem:
538	printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
539	printk(KERN_INFO
540		"swap_cgroup can be disabled by swapaccount=0 boot option\n");
541	return -ENOMEM;
542}
543
544void swap_cgroup_swapoff(int type)
545{
546	struct page **map;
547	unsigned long i, length;
548	struct swap_cgroup_ctrl *ctrl;
549
550	if (!do_swap_account)
551		return;
552
553	mutex_lock(&swap_cgroup_mutex);
554	ctrl = &swap_cgroup_ctrl[type];
555	map = ctrl->map;
556	length = ctrl->length;
557	ctrl->map = NULL;
558	ctrl->length = 0;
559	mutex_unlock(&swap_cgroup_mutex);
560
561	if (map) {
562		for (i = 0; i < length; i++) {
563			struct page *page = map[i];
564			if (page)
565				__free_page(page);
566		}
567		vfree(map);
568	}
569}
570
571#endif
Configure Feed

Configure Feed