Merge branch 'akpm' (Fixes from Andrew)

Merge misc fixes from Andrew Morton.

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (12 patches)
revert "mm: fix-up zone present pages"
tmpfs: change final i_blocks BUG to WARNING
tmpfs: fix shmem_getpage_gfp() VM_BUG_ON
mm: highmem: don't treat PKMAP_ADDR(LAST_PKMAP) as a highmem address
mm: revert "mm: vmscan: scale number of pages reclaimed by reclaim/compaction based on failures"
rapidio: fix kernel-doc warnings
swapfile: fix name leak in swapoff
memcg: fix hotplugged memory zone oops
mips, arc: fix build failure
memcg: oom: fix totalpages calculation for memory.swappiness==0
mm: fix build warning for uninitialized value
mm: add anon_vma_lock to validate_mm()

Changed files: 19 files, +86 -120
+4
Documentation/cgroups/memory.txt
@@ -466,6 +466,10 @@
 5.3 swappiness
 
 Similar to /proc/sys/vm/swappiness, but affecting a hierarchy of groups only.
+Please note that unlike the global swappiness, memcg knob set to 0
+really prevents from any swapping even if there is a swap storage
+available. This might lead to memcg OOM killer if there are no file
+pages to reclaim.
 
 Following cgroups' swappiness can't be changed.
 - root cgroup (uses /proc/sys/vm/swappiness).
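The knob documented above is a per-group file in the memory cgroup hierarchy. As a rough user-space illustration of what "memcg knob set to 0" means in practice, the sketch below writes 0 into a group's memory.swappiness; the cgroup mount point and group name are assumptions made for this example, not taken from the patch.

/* Illustrative only: the mount point and group name below are assumptions. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        const char *knob = "/sys/fs/cgroup/memory/example/memory.swappiness";
        int fd = open(knob, O_WRONLY);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        /*
         * With the knob at 0, reclaim inside this memcg will not swap anon
         * pages at all, so if no file pages are left to reclaim the memcg
         * OOM killer can fire, as the documentation hunk above notes.
         */
        if (write(fd, "0", 1) != 1)
                perror("write");
        close(fd);
        return 0;
}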
-1
arch/ia64/mm/init.c
@@ -637,7 +637,6 @@
 
        high_memory = __va(max_low_pfn * PAGE_SIZE);
 
-       reset_zone_present_pages();
        for_each_online_pgdat(pgdat)
                if (pgdat->bdata->node_bootmem_map)
                        totalram_pages += free_all_bootmem_node(pgdat);
+1
arch/mips/fw/arc/misc.c
@@ -11,6 +11,7 @@
  */
 #include <linux/init.h>
 #include <linux/kernel.h>
+#include <linux/irqflags.h>
 
 #include <asm/bcache.h>
 
+1 -1
drivers/rapidio/rio.c
@@ -401,7 +401,7 @@
 /**
  * rio_map_inb_region -- Map inbound memory region.
  * @mport: Master port.
- * @lstart: physical address of memory region to be mapped
+ * @local: physical address of memory region to be mapped
  * @rbase: RIO base address assigned to this window
  * @size: Size of the memory region
  * @rflags: Flags for mapping.
-4
include/linux/mm.h
@@ -1684,9 +1684,5 @@
 static inline bool page_is_guard(struct page *page) { return false; }
 #endif /* CONFIG_DEBUG_PAGEALLOC */
 
-extern void reset_zone_present_pages(void);
-extern void fixup_zone_present_pages(int nid, unsigned long start_pfn,
-                                     unsigned long end_pfn);
-
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
+1 -1
include/linux/mmzone.h
@@ -752,7 +752,7 @@
                                     unsigned long size,
                                     enum memmap_context context);
 
-extern void lruvec_init(struct lruvec *lruvec, struct zone *zone);
+extern void lruvec_init(struct lruvec *lruvec);
 
 static inline struct zone *lruvec_zone(struct lruvec *lruvec)
 {
+2
include/linux/rio.h
@@ -275,9 +275,11 @@
  * struct rio_net - RIO network info
  * @node: Node in global list of RIO networks
  * @devices: List of devices in this network
+ * @switches: List of switches in this netowrk
  * @mports: List of master ports accessing this network
  * @hport: Default port for accessing this network
  * @id: RIO network ID
+ * @destid_table: destID allocation table
  */
 struct rio_net {
        struct list_head node;  /* node in list of networks */
+1 -9
mm/bootmem.c
@@ -198,8 +198,6 @@
                        int order = ilog2(BITS_PER_LONG);
 
                        __free_pages_bootmem(pfn_to_page(start), order);
-                       fixup_zone_present_pages(page_to_nid(pfn_to_page(start)),
-                                       start, start + BITS_PER_LONG);
                        count += BITS_PER_LONG;
                        start += BITS_PER_LONG;
                } else {
@@ -210,9 +208,6 @@
                                if (vec & 1) {
                                        page = pfn_to_page(start + off);
                                        __free_pages_bootmem(page, 0);
-                                       fixup_zone_present_pages(
-                                               page_to_nid(page),
-                                               start + off, start + off + 1);
                                        count++;
                                }
                                vec >>= 1;
@@ -226,10 +221,7 @@
        pages = bdata->node_low_pfn - bdata->node_min_pfn;
        pages = bootmem_bootmap_pages(pages);
        count += pages;
-       while (pages--) {
-               fixup_zone_present_pages(page_to_nid(page),
-                               page_to_pfn(page), page_to_pfn(page) + 1);
+       while (pages--)
                __free_pages_bootmem(page++, 0);
-       }
 
        bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count);
+1 -1
mm/highmem.c
@@ -98,7 +98,7 @@
 {
        unsigned long addr = (unsigned long)vaddr;
 
-       if (addr >= PKMAP_ADDR(0) && addr <= PKMAP_ADDR(LAST_PKMAP)) {
+       if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) {
                int i = (addr - PKMAP_ADDR(0)) >> PAGE_SHIFT;
                return pte_page(pkmap_page_table[i]);
        }
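The fix above is a boundary condition: PKMAP_ADDR(LAST_PKMAP) is the first address past the last pkmap slot, not a highmem address itself. A small standalone sketch of that arithmetic follows; the constants are invented for illustration and are not any architecture's real pkmap layout.

/* Illustrative constants only; not a real architecture's pkmap layout. */
#define DEMO_PAGE_SHIFT 12
#define DEMO_PKMAP_BASE 0xfe000000UL
#define DEMO_LAST_PKMAP 1024
#define DEMO_PKMAP_ADDR(nr) \
        (DEMO_PKMAP_BASE + ((unsigned long)(nr) << DEMO_PAGE_SHIFT))

/*
 * Slot n covers [PKMAP_ADDR(n), PKMAP_ADDR(n + 1)), so PKMAP_ADDR(LAST_PKMAP)
 * is the first address past the pkmap window and must be excluded, which is
 * why the test in the hunk above becomes '<' rather than '<='.
 */
int demo_is_pkmap_address(unsigned long addr)
{
        return addr >= DEMO_PKMAP_ADDR(0) &&
               addr <  DEMO_PKMAP_ADDR(DEMO_LAST_PKMAP);
}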
+50 -17
mm/memcontrol.c
@@ -1055,12 +1055,24 @@
                                      struct mem_cgroup *memcg)
 {
        struct mem_cgroup_per_zone *mz;
+       struct lruvec *lruvec;
 
-       if (mem_cgroup_disabled())
-               return &zone->lruvec;
+       if (mem_cgroup_disabled()) {
+               lruvec = &zone->lruvec;
+               goto out;
+       }
 
        mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
-       return &mz->lruvec;
+       lruvec = &mz->lruvec;
+out:
+       /*
+        * Since a node can be onlined after the mem_cgroup was created,
+        * we have to be prepared to initialize lruvec->zone here;
+        * and if offlined then reonlined, we need to reinitialize it.
+        */
+       if (unlikely(lruvec->zone != zone))
+               lruvec->zone = zone;
+       return lruvec;
 }
 
 /*
@@ -1087,9 +1099,12 @@
        struct mem_cgroup_per_zone *mz;
        struct mem_cgroup *memcg;
        struct page_cgroup *pc;
+       struct lruvec *lruvec;
 
-       if (mem_cgroup_disabled())
-               return &zone->lruvec;
+       if (mem_cgroup_disabled()) {
+               lruvec = &zone->lruvec;
+               goto out;
+       }
 
        pc = lookup_page_cgroup(page);
        memcg = pc->mem_cgroup;
@@ -1107,7 +1122,16 @@
                pc->mem_cgroup = memcg = root_mem_cgroup;
 
        mz = page_cgroup_zoneinfo(memcg, page);
-       return &mz->lruvec;
+       lruvec = &mz->lruvec;
+out:
+       /*
+        * Since a node can be onlined after the mem_cgroup was created,
+        * we have to be prepared to initialize lruvec->zone here;
+        * and if offlined then reonlined, we need to reinitialize it.
+        */
+       if (unlikely(lruvec->zone != zone))
+               lruvec->zone = zone;
+       return lruvec;
 }
 
 /**
@@ -1452,17 +1476,26 @@
 static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
 {
        u64 limit;
-       u64 memsw;
 
        limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
-       limit += total_swap_pages << PAGE_SHIFT;
 
-       memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
        /*
-        * If memsw is finite and limits the amount of swap space available
-        * to this memcg, return that limit.
+        * Do not consider swap space if we cannot swap due to swappiness
         */
-       return min(limit, memsw);
+       if (mem_cgroup_swappiness(memcg)) {
+               u64 memsw;
+
+               limit += total_swap_pages << PAGE_SHIFT;
+               memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
+
+               /*
+                * If memsw is finite and limits the amount of swap space
+                * available to this memcg, return that limit.
+                */
+               limit = min(limit, memsw);
+       }
+
+       return limit;
 }
 
 void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
@@ -3688,17 +3721,17 @@
 static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
                                int node, int zid, enum lru_list lru)
 {
-       struct mem_cgroup_per_zone *mz;
+       struct lruvec *lruvec;
        unsigned long flags, loop;
        struct list_head *list;
        struct page *busy;
        struct zone *zone;
 
        zone = &NODE_DATA(node)->node_zones[zid];
-       mz = mem_cgroup_zoneinfo(memcg, node, zid);
-       list = &mz->lruvec.lists[lru];
+       lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+       list = &lruvec->lists[lru];
 
-       loop = mz->lru_size[lru];
+       loop = mem_cgroup_get_lru_size(lruvec, lru);
        /* give some margin against EBUSY etc...*/
        loop += 256;
        busy = NULL;
@@ -4736,7 +4769,7 @@
 
        for (zone = 0; zone < MAX_NR_ZONES; zone++) {
                mz = &pn->zoneinfo[zone];
-               lruvec_init(&mz->lruvec, &NODE_DATA(node)->node_zones[zone]);
+               lruvec_init(&mz->lruvec);
                mz->usage_in_excess = 0;
                mz->on_tree = false;
                mz->memcg = memcg;
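Distilled from the mem_cgroup_get_limit() hunk above, the swappiness==0 fix amounts to the following arithmetic; this is a simplified standalone sketch with illustrative names and parameters, not the kernel's actual API.

/*
 * Simplified sketch of the fixed limit calculation: swap only contributes to
 * the OOM "totalpages" base when the memcg is actually allowed to swap.
 * Names and types here are illustrative, not the kernel's.
 */
unsigned long long demo_oom_limit(unsigned long long mem_limit,
                                  unsigned long long memsw_limit,
                                  unsigned long long total_swap_bytes,
                                  int swappiness)
{
        unsigned long long limit = mem_limit;

        if (swappiness) {
                limit += total_swap_bytes;
                /* memsw caps memory + swap, so it bounds the total too. */
                if (memsw_limit < limit)
                        limit = memsw_limit;
        }
        return limit;
}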
+4 -6
mm/memory.c
@@ -2527,9 +2527,8 @@
        int ret = 0;
        int page_mkwrite = 0;
        struct page *dirty_page = NULL;
-       unsigned long mmun_start;       /* For mmu_notifiers */
-       unsigned long mmun_end;         /* For mmu_notifiers */
-       bool mmun_called = false;       /* For mmu_notifiers */
+       unsigned long mmun_start = 0;   /* For mmu_notifiers */
+       unsigned long mmun_end = 0;     /* For mmu_notifiers */
 
        old_page = vm_normal_page(vma, address, orig_pte);
        if (!old_page) {
@@ -2708,7 +2707,6 @@
                goto oom_free_new;
 
        mmun_start = address & PAGE_MASK;
-       mmun_end = (address & PAGE_MASK) + PAGE_SIZE;
-       mmun_called = true;
+       mmun_end = mmun_start + PAGE_SIZE;
        mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 
@@ -2778,7 +2776,7 @@
        page_cache_release(new_page);
 unlock:
        pte_unmap_unlock(page_table, ptl);
-       if (mmun_called)
+       if (mmun_end > mmun_start)
                mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
        if (old_page) {
                /*
-7
mm/memory_hotplug.c
@@ -106,7 +106,6 @@
 void __ref put_page_bootmem(struct page *page)
 {
        unsigned long type;
-       struct zone *zone;
 
        type = (unsigned long) page->lru.next;
        BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
@@ -117,12 +116,6 @@
                set_page_private(page, 0);
                INIT_LIST_HEAD(&page->lru);
                __free_pages_bootmem(page, 0);
-
-               zone = page_zone(page);
-               zone_span_writelock(zone);
-               zone->present_pages++;
-               zone_span_writeunlock(zone);
-               totalram_pages++;
        }
 
 }
+2
mm/mmap.c
@@ -334,8 +334,10 @@
        struct vm_area_struct *vma = mm->mmap;
        while (vma) {
                struct anon_vma_chain *avc;
+               vma_lock_anon_vma(vma);
                list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
                        anon_vma_interval_tree_verify(avc);
+               vma_unlock_anon_vma(vma);
                vma = vma->vm_next;
                i++;
        }
+1 -5
mm/mmzone.c
@@ -87,7 +87,7 @@
 }
 #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
 
-void lruvec_init(struct lruvec *lruvec, struct zone *zone)
+void lruvec_init(struct lruvec *lruvec)
 {
        enum lru_list lru;
 
@@ -95,8 +95,4 @@
 
        for_each_lru(lru)
                INIT_LIST_HEAD(&lruvec->lists[lru]);
-
-#ifdef CONFIG_MEMCG
-       lruvec->zone = zone;
-#endif
 }
-3
mm/nobootmem.c
@@ -116,8 +116,6 @@
                return 0;
 
        __free_pages_memory(start_pfn, end_pfn);
-       fixup_zone_present_pages(pfn_to_nid(start >> PAGE_SHIFT),
-                       start_pfn, end_pfn);
 
        return end_pfn - start_pfn;
 }
@@ -128,7 +126,6 @@
        phys_addr_t start, end, size;
        u64 i;
 
-       reset_zone_present_pages();
        for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL)
                count += __free_memory_core(start, end);
 
+1 -35
mm/page_alloc.c
@@ -4505,7 +4505,7 @@
                zone->zone_pgdat = pgdat;
 
                zone_pcp_init(zone);
-               lruvec_init(&zone->lruvec, zone);
+               lruvec_init(&zone->lruvec);
                if (!size)
                        continue;
 
@@ -6097,38 +6097,4 @@
                page->mapping, page->index);
        dump_page_flags(page->flags);
        mem_cgroup_print_bad_page(page);
-}
-
-/* reset zone->present_pages */
-void reset_zone_present_pages(void)
-{
-       struct zone *z;
-       int i, nid;
-
-       for_each_node_state(nid, N_HIGH_MEMORY) {
-               for (i = 0; i < MAX_NR_ZONES; i++) {
-                       z = NODE_DATA(nid)->node_zones + i;
-                       z->present_pages = 0;
-               }
-       }
-}
-
-/* calculate zone's present pages in buddy system */
-void fixup_zone_present_pages(int nid, unsigned long start_pfn,
-                             unsigned long end_pfn)
-{
-       struct zone *z;
-       unsigned long zone_start_pfn, zone_end_pfn;
-       int i;
-
-       for (i = 0; i < MAX_NR_ZONES; i++) {
-               z = NODE_DATA(nid)->node_zones + i;
-               zone_start_pfn = z->zone_start_pfn;
-               zone_end_pfn = zone_start_pfn + z->spanned_pages;
-
-               /* if the two regions intersect */
-               if (!(zone_start_pfn >= end_pfn || zone_end_pfn <= start_pfn))
-                       z->present_pages += min(end_pfn, zone_end_pfn) -
-                                           max(start_pfn, zone_start_pfn);
-       }
 }
+15 -3
mm/shmem.c
@@ -643,7 +643,7 @@
                kfree(info->symlink);
 
        simple_xattrs_free(&info->xattrs);
-       BUG_ON(inode->i_blocks);
+       WARN_ON(inode->i_blocks);
        shmem_free_inode(inode->i_sb);
        clear_inode(inode);
 }
@@ -1145,8 +1145,20 @@
                if (!error) {
                        error = shmem_add_to_page_cache(page, mapping, index,
                                                gfp, swp_to_radix_entry(swap));
-                       /* We already confirmed swap, and make no allocation */
-                       VM_BUG_ON(error);
+                       /*
+                        * We already confirmed swap under page lock, and make
+                        * no memory allocation here, so usually no possibility
+                        * of error; but free_swap_and_cache() only trylocks a
+                        * page, so it is just possible that the entry has been
+                        * truncated or holepunched since swap was confirmed.
+                        * shmem_undo_range() will have done some of the
+                        * unaccounting, now delete_from_swap_cache() will do
+                        * the rest (including mem_cgroup_uncharge_swapcache).
+                        * Reset swap.val? No, leave it so "failed" goes back to
+                        * "repeat": reading a hole and writing should succeed.
+                        */
+                       if (error)
+                               delete_from_swap_cache(page);
                }
                if (error)
                        goto failed;
+2 -2
mm/swapfile.c
@@ -1494,9 +1494,8 @@
        BUG_ON(!current->mm);
 
        pathname = getname(specialfile);
-       err = PTR_ERR(pathname);
        if (IS_ERR(pathname))
-               goto out;
+               return PTR_ERR(pathname);
 
        victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0);
        err = PTR_ERR(victim);
@@ -1608,6 +1607,7 @@
 out_dput:
        filp_close(victim, NULL);
 out:
+       putname(pathname);
        return err;
 }
 
-25
mm/vmscan.c
@@ -1760,28 +1760,6 @@
        return false;
 }
 
-#ifdef CONFIG_COMPACTION
-/*
- * If compaction is deferred for sc->order then scale the number of pages
- * reclaimed based on the number of consecutive allocation failures
- */
-static unsigned long scale_for_compaction(unsigned long pages_for_compaction,
-                       struct lruvec *lruvec, struct scan_control *sc)
-{
-       struct zone *zone = lruvec_zone(lruvec);
-
-       if (zone->compact_order_failed <= sc->order)
-               pages_for_compaction <<= zone->compact_defer_shift;
-       return pages_for_compaction;
-}
-#else
-static unsigned long scale_for_compaction(unsigned long pages_for_compaction,
-                       struct lruvec *lruvec, struct scan_control *sc)
-{
-       return pages_for_compaction;
-}
-#endif
-
 /*
  * Reclaim/compaction is used for high-order allocation requests. It reclaims
  * order-0 pages before compacting the zone. should_continue_reclaim() returns
@@ -1829,9 +1807,6 @@
         * inactive lists are large enough, continue reclaiming
         */
        pages_for_compaction = (2UL << sc->order);
-
-       pages_for_compaction = scale_for_compaction(pages_for_compaction,
-                                                   lruvec, sc);
        inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE);
        if (nr_swap_pages > 0)
                inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON);