Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
mm/page_cgroup.c at v3.0-rc3 (536 lines, 12 kB)

#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/bootmem.h>
#include <linux/bit_spinlock.h>
#include <linux/page_cgroup.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/memory.h>
#include <linux/vmalloc.h>
#include <linux/cgroup.h>
#include <linux/swapops.h>
#include <linux/kmemleak.h>

static void __meminit init_page_cgroup(struct page_cgroup *pc, unsigned long id)
{
	pc->flags = 0;
	set_page_cgroup_array_id(pc, id);
	pc->mem_cgroup = NULL;
	INIT_LIST_HEAD(&pc->lru);
}
static unsigned long total_usage;

#if !defined(CONFIG_SPARSEMEM)


void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
{
	pgdat->node_page_cgroup = NULL;
}

struct page_cgroup *lookup_page_cgroup(struct page *page)
{
	unsigned long pfn = page_to_pfn(page);
	unsigned long offset;
	struct page_cgroup *base;

	base = NODE_DATA(page_to_nid(page))->node_page_cgroup;
	if (unlikely(!base))
		return NULL;

	offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
	return base + offset;
}

struct page *lookup_cgroup_page(struct page_cgroup *pc)
{
	unsigned long pfn;
	struct page *page;
	pg_data_t *pgdat;

	pgdat = NODE_DATA(page_cgroup_array_id(pc));
	pfn = pc - pgdat->node_page_cgroup + pgdat->node_start_pfn;
	page = pfn_to_page(pfn);
	VM_BUG_ON(pc != lookup_page_cgroup(page));
	return page;
}

static int __init alloc_node_page_cgroup(int nid)
{
	struct page_cgroup *base, *pc;
	unsigned long table_size;
	unsigned long start_pfn, nr_pages, index;

	start_pfn = NODE_DATA(nid)->node_start_pfn;
	nr_pages = NODE_DATA(nid)->node_spanned_pages;

	if (!nr_pages)
		return 0;

	table_size = sizeof(struct page_cgroup) * nr_pages;

	base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
	if (!base)
		return -ENOMEM;
	for (index = 0; index < nr_pages; index++) {
		pc = base + index;
		init_page_cgroup(pc, nid);
	}
	NODE_DATA(nid)->node_page_cgroup = base;
	total_usage += table_size;
	return 0;
}

void __init page_cgroup_init_flatmem(void)
{

	int nid, fail;

	if (mem_cgroup_disabled())
		return;

	for_each_online_node(nid) {
		fail = alloc_node_page_cgroup(nid);
		if (fail)
			goto fail;
	}
	printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
	printk(KERN_INFO "please try 'cgroup_disable=memory' option if you"
	" don't want memory cgroups\n");
	return;
fail:
	printk(KERN_CRIT "allocation of page_cgroup failed.\n");
	printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n");
	panic("Out of memory");
}

#else /* CONFIG_FLAT_NODE_MEM_MAP */

struct page_cgroup *lookup_page_cgroup(struct page *page)
{
	unsigned long pfn = page_to_pfn(page);
	struct mem_section *section = __pfn_to_section(pfn);

	if (!section->page_cgroup)
		return NULL;
	return section->page_cgroup + pfn;
}

struct page *lookup_cgroup_page(struct page_cgroup *pc)
{
	struct mem_section *section;
	struct page *page;
	unsigned long nr;

	nr = page_cgroup_array_id(pc);
	section = __nr_to_section(nr);
	page = pfn_to_page(pc - section->page_cgroup);
	VM_BUG_ON(pc != lookup_page_cgroup(page));
	return page;
}

static void *__meminit alloc_page_cgroup(size_t size, int nid)
{
	void *addr = NULL;

	addr = alloc_pages_exact_nid(nid, size, GFP_KERNEL | __GFP_NOWARN);
	if (addr)
		return addr;

	if (node_state(nid, N_HIGH_MEMORY))
		addr = vmalloc_node(size, nid);
	else
		addr = vmalloc(size);

	return addr;
}

#ifdef CONFIG_MEMORY_HOTPLUG
static void free_page_cgroup(void *addr)
{
	if (is_vmalloc_addr(addr)) {
		vfree(addr);
	} else {
		struct page *page = virt_to_page(addr);
		size_t table_size =
			sizeof(struct page_cgroup) * PAGES_PER_SECTION;

		BUG_ON(PageReserved(page));
		free_pages_exact(addr, table_size);
	}
}
#endif

static int __meminit init_section_page_cgroup(unsigned long pfn)
{
	struct page_cgroup *base, *pc;
	struct mem_section *section;
	unsigned long table_size;
	unsigned long nr;
	int nid, index;

	nr = pfn_to_section_nr(pfn);
	section = __nr_to_section(nr);

	if (section->page_cgroup)
		return 0;

	nid = page_to_nid(pfn_to_page(pfn));
	table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
	base = alloc_page_cgroup(table_size, nid);

	/*
	 * The value stored in section->page_cgroup is (base - pfn)
	 * and it does not point to the memory block allocated above,
	 * causing kmemleak false positives.
	 */
	kmemleak_not_leak(base);

	if (!base) {
		printk(KERN_ERR "page cgroup allocation failure\n");
		return -ENOMEM;
	}

	for (index = 0; index < PAGES_PER_SECTION; index++) {
		pc = base + index;
		init_page_cgroup(pc, nr);
	}

	section->page_cgroup = base - pfn;
	total_usage += table_size;
	return 0;
}
#ifdef CONFIG_MEMORY_HOTPLUG
void __free_page_cgroup(unsigned long pfn)
{
	struct mem_section *ms;
	struct page_cgroup *base;

	ms = __pfn_to_section(pfn);
	if (!ms || !ms->page_cgroup)
		return;
	base = ms->page_cgroup + pfn;
	free_page_cgroup(base);
	ms->page_cgroup = NULL;
}

int __meminit online_page_cgroup(unsigned long start_pfn,
			unsigned long nr_pages,
			int nid)
{
	unsigned long start, end, pfn;
	int fail = 0;

	start = start_pfn & ~(PAGES_PER_SECTION - 1);
	end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);

	for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
		if (!pfn_present(pfn))
			continue;
		fail = init_section_page_cgroup(pfn);
	}
	if (!fail)
		return 0;

	/* rollback */
	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
		__free_page_cgroup(pfn);

	return -ENOMEM;
}

int __meminit offline_page_cgroup(unsigned long start_pfn,
		unsigned long nr_pages, int nid)
{
	unsigned long start, end, pfn;

	start = start_pfn & ~(PAGES_PER_SECTION - 1);
	end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);

	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
		__free_page_cgroup(pfn);
	return 0;

}

static int __meminit page_cgroup_callback(struct notifier_block *self,
			unsigned long action, void *arg)
{
	struct memory_notify *mn = arg;
	int ret = 0;
	switch (action) {
	case MEM_GOING_ONLINE:
		ret = online_page_cgroup(mn->start_pfn,
				mn->nr_pages, mn->status_change_nid);
		break;
	case MEM_OFFLINE:
		offline_page_cgroup(mn->start_pfn,
				mn->nr_pages, mn->status_change_nid);
		break;
	case MEM_CANCEL_ONLINE:
	case MEM_GOING_OFFLINE:
		break;
	case MEM_ONLINE:
	case MEM_CANCEL_OFFLINE:
		break;
	}

	return notifier_from_errno(ret);
}

#endif

void __init page_cgroup_init(void)
{
	unsigned long pfn;
	int fail = 0;

	if (mem_cgroup_disabled())
		return;

	for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) {
		if (!pfn_present(pfn))
			continue;
		fail = init_section_page_cgroup(pfn);
	}
	if (fail) {
		printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
		panic("Out of memory");
	} else {
		hotplug_memory_notifier(page_cgroup_callback, 0);
	}
	printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
	printk(KERN_INFO "please try 'cgroup_disable=memory' option if you don't"
	" want memory cgroups\n");
}

void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
{
	return;
}

#endif


#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP

static DEFINE_MUTEX(swap_cgroup_mutex);
struct swap_cgroup_ctrl {
	struct page **map;
	unsigned long length;
	spinlock_t lock;
};

struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];

struct swap_cgroup {
	unsigned short id;
};
#define SC_PER_PAGE	(PAGE_SIZE/sizeof(struct swap_cgroup))
#define SC_POS_MASK	(SC_PER_PAGE - 1)

/*
 * SwapCgroup implements "lookup" and "exchange" operations.
 * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge
 * against SwapCache. At swap_free(), this is accessed directly from swap.
 *
 * This means,
 *  - we have no race in "exchange" when we're accessed via SwapCache because
 *    SwapCache(and its swp_entry) is under lock.
 *  - When called via swap_free(), there is no user of this entry and no race.
 * Then, we don't need lock around "exchange".
 *
 * TODO: we can push these buffers out to HIGHMEM.
 */

/*
 * allocate buffer for swap_cgroup.
 */
static int swap_cgroup_prepare(int type)
{
	struct page *page;
	struct swap_cgroup_ctrl *ctrl;
	unsigned long idx, max;

	ctrl = &swap_cgroup_ctrl[type];

	for (idx = 0; idx < ctrl->length; idx++) {
		page = alloc_page(GFP_KERNEL | __GFP_ZERO);
		if (!page)
			goto not_enough_page;
		ctrl->map[idx] = page;
	}
	return 0;
not_enough_page:
	max = idx;
	for (idx = 0; idx < max; idx++)
		__free_page(ctrl->map[idx]);

	return -ENOMEM;
}

/**
 * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
 * @ent: swap entry to be cmpxchged
 * @old: old id
 * @new: new id
 *
 * Returns old id at success, 0 at failure.
 * (There is no mem_cgroup using 0 as its id)
 */
unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
					unsigned short old, unsigned short new)
{
	int type = swp_type(ent);
	unsigned long offset = swp_offset(ent);
	unsigned long idx = offset / SC_PER_PAGE;
	unsigned long pos = offset & SC_POS_MASK;
	struct swap_cgroup_ctrl *ctrl;
	struct page *mappage;
	struct swap_cgroup *sc;
	unsigned long flags;
	unsigned short retval;

	ctrl = &swap_cgroup_ctrl[type];

	mappage = ctrl->map[idx];
	sc = page_address(mappage);
	sc += pos;
	spin_lock_irqsave(&ctrl->lock, flags);
	retval = sc->id;
	if (retval == old)
		sc->id = new;
	else
		retval = 0;
	spin_unlock_irqrestore(&ctrl->lock, flags);
	return retval;
}

/**
 * swap_cgroup_record - record mem_cgroup for this swp_entry.
 * @ent: swap entry to be recorded into
 * @id: css ID of the mem_cgroup to be recorded
 *
 * Returns old value at success, 0 at failure.
 * (Of course, old value can be 0.)
 */
unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
{
	int type = swp_type(ent);
	unsigned long offset = swp_offset(ent);
	unsigned long idx = offset / SC_PER_PAGE;
	unsigned long pos = offset & SC_POS_MASK;
	struct swap_cgroup_ctrl *ctrl;
	struct page *mappage;
	struct swap_cgroup *sc;
	unsigned short old;
	unsigned long flags;

	ctrl = &swap_cgroup_ctrl[type];

	mappage = ctrl->map[idx];
	sc = page_address(mappage);
	sc += pos;
	spin_lock_irqsave(&ctrl->lock, flags);
	old = sc->id;
	sc->id = id;
	spin_unlock_irqrestore(&ctrl->lock, flags);

	return old;
}

/**
 * lookup_swap_cgroup - lookup mem_cgroup tied to swap entry
 * @ent: swap entry to be looked up.
 *
 * Returns CSS ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
 */
unsigned short lookup_swap_cgroup(swp_entry_t ent)
{
	int type = swp_type(ent);
	unsigned long offset = swp_offset(ent);
	unsigned long idx = offset / SC_PER_PAGE;
	unsigned long pos = offset & SC_POS_MASK;
	struct swap_cgroup_ctrl *ctrl;
	struct page *mappage;
	struct swap_cgroup *sc;
	unsigned short ret;

	ctrl = &swap_cgroup_ctrl[type];
	mappage = ctrl->map[idx];
	sc = page_address(mappage);
	sc += pos;
	ret = sc->id;
	return ret;
}

int swap_cgroup_swapon(int type, unsigned long max_pages)
{
	void *array;
	unsigned long array_size;
	unsigned long length;
	struct swap_cgroup_ctrl *ctrl;

	if (!do_swap_account)
		return 0;

	length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
	array_size = length * sizeof(void *);

	array = vmalloc(array_size);
	if (!array)
		goto nomem;

	memset(array, 0, array_size);
	ctrl = &swap_cgroup_ctrl[type];
	mutex_lock(&swap_cgroup_mutex);
	ctrl->length = length;
	ctrl->map = array;
	spin_lock_init(&ctrl->lock);
	if (swap_cgroup_prepare(type)) {
		/* memory shortage */
		ctrl->map = NULL;
		ctrl->length = 0;
		mutex_unlock(&swap_cgroup_mutex);
		vfree(array);
		goto nomem;
	}
	mutex_unlock(&swap_cgroup_mutex);

	return 0;
nomem:
	printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
	printk(KERN_INFO
		"swap_cgroup can be disabled by noswapaccount boot option\n");
	return -ENOMEM;
}

void swap_cgroup_swapoff(int type)
{
	struct page **map;
	unsigned long i, length;
	struct swap_cgroup_ctrl *ctrl;

	if (!do_swap_account)
		return;

	mutex_lock(&swap_cgroup_mutex);
	ctrl = &swap_cgroup_ctrl[type];
	map = ctrl->map;
	length = ctrl->length;
	ctrl->map = NULL;
	ctrl->length = 0;
	mutex_unlock(&swap_cgroup_mutex);

	if (map) {
		for (i = 0; i < length; i++) {
			struct page *page = map[i];
			if (page)
				__free_page(page);
		}
		vfree(map);
	}
}

#endif
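
The SPARSEMEM variant of lookup_page_cgroup() above indexes section->page_cgroup with the raw pfn because init_section_page_cgroup() stores base - pfn rather than the allocation's address; that biased value is also why the kmemleak_not_leak() hint is needed, since nothing appears to reference the allocated block anymore. A minimal standalone sketch of that pointer-biasing trick follows; it is not kernel code, and all names and constants in it are hypothetical.

/*
 * Standalone illustration (hypothetical names/constants): store
 * base - start_pfn so that lookups can index with the raw pfn,
 * the way the SPARSEMEM lookup_page_cgroup() above does.
 * Build: cc -o bias bias.c
 */
#include <assert.h>
#include <stdio.h>

struct page_cgroup_demo { unsigned long flags; };

#define DEMO_PAGES_PER_SECTION 8UL

int main(void)
{
	static struct page_cgroup_demo table[DEMO_PAGES_PER_SECTION];
	unsigned long start_pfn = 4096;	/* hypothetical section start */

	/*
	 * Bias the base pointer by the section's first pfn. The biased
	 * pointer is never dereferenced directly; only biased + pfn is.
	 * (Strictly, this arithmetic is outside what ISO C guarantees;
	 * the kernel relies on its flat address space here.)
	 */
	struct page_cgroup_demo *biased = table - start_pfn;

	for (unsigned long pfn = start_pfn;
	     pfn < start_pfn + DEMO_PAGES_PER_SECTION; pfn++)
		/* one addition per lookup, no per-section subtraction */
		assert(biased + pfn == &table[pfn - start_pfn]);

	printf("biased-pointer lookup verified\n");
	return 0;
}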
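
online_page_cgroup() and offline_page_cgroup() first widen the hot-plugged pfn range to whole sections: start rounds down with a mask, end rounds up with ALIGN(), both valid because PAGES_PER_SECTION is a power of two. A standalone sketch of that arithmetic, using a hypothetical section size and reproducing the kernel's ALIGN() definition:

/* Standalone sketch; DEMO_PAGES_PER_SECTION is hypothetical. */
#include <assert.h>
#include <stdio.h>

#define DEMO_PAGES_PER_SECTION	32768UL	/* must be a power of two */
#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned long start_pfn = 100000, nr_pages = 70000;
	/* round down to a section boundary */
	unsigned long start = start_pfn & ~(DEMO_PAGES_PER_SECTION - 1);
	/* round up, so the last partially covered section is included */
	unsigned long end = ALIGN(start_pfn + nr_pages, DEMO_PAGES_PER_SECTION);

	assert(start == 98304);		/* 3 * 32768 */
	assert(end == 196608);		/* 6 * 32768 */
	printf("[%lu, %lu) spans sections [%lu, %lu)\n",
	       start_pfn, start_pfn + nr_pages, start, end);
	return 0;
}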
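
swap_cgroup keeps one unsigned short per swap slot, packed into order-0 pages, so a swap offset splits into a map[] index (offset / SC_PER_PAGE) and a slot within that page (offset & SC_POS_MASK); the mask form works only because SC_PER_PAGE is a power of two (4096 / 2 = 2048 on common configurations). A standalone sketch of the split used by swap_cgroup_record() and lookup_swap_cgroup(), with hypothetical values:

/* Standalone sketch; assumes 4 kB pages and 2-byte unsigned short. */
#include <assert.h>
#include <stdio.h>

#define DEMO_PAGE_SIZE	4096UL
struct swap_cgroup { unsigned short id; };
#define SC_PER_PAGE	(DEMO_PAGE_SIZE / sizeof(struct swap_cgroup))
#define SC_POS_MASK	(SC_PER_PAGE - 1)

int main(void)
{
	unsigned long offset = 5000;	/* hypothetical swap offset */
	unsigned long idx = offset / SC_PER_PAGE;	/* which map[] page */
	unsigned long pos = offset & SC_POS_MASK;	/* slot in that page */

	assert(SC_PER_PAGE == 2048);	/* holds when sizeof(short) == 2 */
	assert(idx == 2 && pos == offset - 2 * SC_PER_PAGE);
	printf("offset %lu -> map[%lu], slot %lu\n", offset, idx, pos);
	return 0;
}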