Merge branch 'akpm' (patches from Andrew)

tjh.dev / kernel

fork atom

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

fork atom

Merge branch 'akpm' (patches from Andrew)

Merge misc fixes from Andrew Morton:
"12 fixes"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
mm, vmscan: prevent kswapd livelock due to pfmemalloc-throttled process being killed
memcg: fix destination cgroup leak on task charges migration
mm: memcontrol: switch soft limit default back to infinity
mm/debug_pagealloc: remove obsolete Kconfig options
vfs: renumber FMODE_NONOTIFY and add to uniqueness check
arch/blackfin/mach-bf533/boards/stamp.c: add linux/delay.h
ocfs2: fix the wrong directory passed to ocfs2_lookup_ino_from_name() when link file
MAINTAINERS: update rydberg's addresses
mm: protect set_page_dirty() from ongoing truncation
mm: prevent endless growth of anon_vma hierarchy
exit: fix race between wait_consider_task() and wait_task_zombie()
ocfs2: remove bogus check in dlm_process_recovery_data

Linus Torvalds 11 years ago b3d574ae 11c8f01b

+155 -101

17 changed files

expand all collapse all

.mailmap

MAINTAINERS

arch

blackfin

mach-bf533

boards

stamp.c

fcntl.c

ocfs2

dlm

dlmrecovery.c

namei.c

include

linux

fs.h

rmap.h

writeback.h

uapi

asm-generic

fcntl.h

kernel

exit.c

Kconfig.debug

memcontrol.c

memory.c

page-writeback.c

rmap.c

vmscan.c

.mailmap

reviewed

··· 51 51 Greg Kroah-Hartman <greg@kroah.com> 52 52 Henk Vergonet <Henk.Vergonet@gmail.com> 53 53 Henrik Kretzschmar <henne@nachtwindheim.de> 54 54 + Henrik Rydberg <rydberg@bitmath.org> 54 55 Herbert Xu <herbert@gondor.apana.org.au> 55 56 Jacob Shin <Jacob.Shin@amd.com> 56 57 James Bottomley <jejb@mulgrave.(none)>

+6 -6

MAINTAINERS

reviewed

··· 724 724 F: drivers/char/apm-emulation.c 725 725 726 726 APPLE BCM5974 MULTITOUCH DRIVER 727 727 - M: Henrik Rydberg <rydberg@euromail.se> 727 727 + M: Henrik Rydberg <rydberg@bitmath.org> 728 728 L: linux-input@vger.kernel.org 729 729 - S: Maintained 729 729 + S: Odd fixes 730 730 F: drivers/input/mouse/bcm5974.c 731 731 732 732 APPLE SMC DRIVER 733 733 - M: Henrik Rydberg <rydberg@euromail.se> 733 733 + M: Henrik Rydberg <rydberg@bitmath.org> 734 734 L: lm-sensors@lm-sensors.org 735 735 - S: Maintained 735 735 + S: Odd fixes 736 736 F: drivers/hwmon/applesmc.c 737 737 738 738 APPLETALK NETWORK LAYER ··· 4940 4940 F: include/linux/input/ 4941 4941 4942 4942 INPUT MULTITOUCH (MT) PROTOCOL 4943 4943 - M: Henrik Rydberg <rydberg@euromail.se> 4943 4943 + M: Henrik Rydberg <rydberg@bitmath.org> 4944 4944 L: linux-input@vger.kernel.org 4945 4945 T: git git://git.kernel.org/pub/scm/linux/kernel/git/rydberg/input-mt.git 4946 4946 - S: Maintained 4946 4946 + S: Odd fixes 4947 4947 F: Documentation/input/multi-touch-protocol.txt 4948 4948 F: drivers/input/input-mt.c 4949 4949 K: \b(ABS|SYN)_MT_

arch/blackfin/mach-bf533/boards/stamp.c

reviewed

··· 7 7 */ 8 8 9 9 #include <linux/device.h> 10 10 + #include <linux/delay.h> 10 11 #include <linux/platform_device.h> 11 12 #include <linux/mtd/mtd.h> 12 13 #include <linux/mtd/partitions.h>

+3 -2

fs/fcntl.c

reviewed

··· 740 740 * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY 741 741 * is defined as O_NONBLOCK on some platforms and not on others. 742 742 */ 743 743 - BUILD_BUG_ON(20 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( 743 743 + BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( 744 744 O_RDONLY | O_WRONLY | O_RDWR | 745 745 O_CREAT | O_EXCL | O_NOCTTY | 746 746 O_TRUNC | O_APPEND | /* O_NONBLOCK | */ 747 747 __O_SYNC | O_DSYNC | FASYNC | 748 748 O_DIRECT | O_LARGEFILE | O_DIRECTORY | 749 749 O_NOFOLLOW | O_NOATIME | O_CLOEXEC | 750 750 - __FMODE_EXEC | O_PATH | __O_TMPFILE 750 750 + __FMODE_EXEC | O_PATH | __O_TMPFILE | 751 751 + __FMODE_NONOTIFY 751 752 )); 752 753 753 754 fasync_cache = kmem_cache_create("fasync_cache",

+1 -4

fs/ocfs2/dlm/dlmrecovery.c

reviewed

··· 2023 2023 dlm_lockres_drop_inflight_ref(dlm, res); 2024 2024 spin_unlock(&res->spinlock); 2025 2025 2026 2026 - if (ret < 0) { 2026 2026 + if (ret < 0) 2027 2027 mlog_errno(ret); 2028 2028 - if (newlock) 2029 2029 - dlm_lock_put(newlock); 2030 2030 - } 2031 2028 2032 2029 return ret; 2033 2030 }

+35 -8

fs/ocfs2/namei.c

reviewed

··· 94 94 struct inode *inode, 95 95 const char *symname); 96 96 97 97 + static int ocfs2_double_lock(struct ocfs2_super *osb, 98 98 + struct buffer_head **bh1, 99 99 + struct inode *inode1, 100 100 + struct buffer_head **bh2, 101 101 + struct inode *inode2, 102 102 + int rename); 103 103 + 104 104 + static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2); 97 105 /* An orphan dir name is an 8 byte value, printed as a hex string */ 98 106 #define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64))) 99 107 ··· 686 678 { 687 679 handle_t *handle; 688 680 struct inode *inode = old_dentry->d_inode; 681 681 + struct inode *old_dir = old_dentry->d_parent->d_inode; 689 682 int err; 690 683 struct buffer_head *fe_bh = NULL; 684 684 + struct buffer_head *old_dir_bh = NULL; 691 685 struct buffer_head *parent_fe_bh = NULL; 692 686 struct ocfs2_dinode *fe = NULL; 693 687 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); ··· 706 696 707 697 dquot_initialize(dir); 708 698 709 709 - err = ocfs2_inode_lock_nested(dir, &parent_fe_bh, 1, OI_LS_PARENT); 699 699 + err = ocfs2_double_lock(osb, &old_dir_bh, old_dir, 700 700 + &parent_fe_bh, dir, 0); 710 701 if (err < 0) { 711 702 if (err != -ENOENT) 712 703 mlog_errno(err); 713 704 return err; 705 705 + } 706 706 + 707 707 + /* make sure both dirs have bhs 708 708 + * get an extra ref on old_dir_bh if old==new */ 709 709 + if (!parent_fe_bh) { 710 710 + if (old_dir_bh) { 711 711 + parent_fe_bh = old_dir_bh; 712 712 + get_bh(parent_fe_bh); 713 713 + } else { 714 714 + mlog(ML_ERROR, "%s: no old_dir_bh!\n", osb->uuid_str); 715 715 + err = -EIO; 716 716 + goto out; 717 717 + } 714 718 } 715 719 716 720 if (!dir->i_nlink) { ··· 732 708 goto out; 733 709 } 734 710 735 735 - err = ocfs2_lookup_ino_from_name(dir, old_dentry->d_name.name, 711 711 + err = ocfs2_lookup_ino_from_name(old_dir, old_dentry->d_name.name, 736 712 old_dentry->d_name.len, &old_de_ino); 737 713 if (err) { 738 714 err = -ENOENT; ··· 825 801 ocfs2_inode_unlock(inode, 1); 826 802 827 803 out: 828 828 - ocfs2_inode_unlock(dir, 1); 804 804 + ocfs2_double_unlock(old_dir, dir); 829 805 830 806 brelse(fe_bh); 831 807 brelse(parent_fe_bh); 808 808 + brelse(old_dir_bh); 832 809 833 810 ocfs2_free_dir_lookup_result(&lookup); 834 811 ··· 1097 1072 } 1098 1073 1099 1074 /* 1100 1100 - * The only place this should be used is rename! 1075 1075 + * The only place this should be used is rename and link! 1101 1076 * if they have the same id, then the 1st one is the only one locked. 1102 1077 */ 1103 1078 static int ocfs2_double_lock(struct ocfs2_super *osb, 1104 1079 struct buffer_head **bh1, 1105 1080 struct inode *inode1, 1106 1081 struct buffer_head **bh2, 1107 1107 - struct inode *inode2) 1082 1082 + struct inode *inode2, 1083 1083 + int rename) 1108 1084 { 1109 1085 int status; 1110 1086 int inode1_is_ancestor, inode2_is_ancestor; ··· 1153 1127 } 1154 1128 /* lock id2 */ 1155 1129 status = ocfs2_inode_lock_nested(inode2, bh2, 1, 1156 1156 - OI_LS_RENAME1); 1130 1130 + rename == 1 ? OI_LS_RENAME1 : OI_LS_PARENT); 1157 1131 if (status < 0) { 1158 1132 if (status != -ENOENT) 1159 1133 mlog_errno(status); ··· 1162 1136 } 1163 1137 1164 1138 /* lock id1 */ 1165 1165 - status = ocfs2_inode_lock_nested(inode1, bh1, 1, OI_LS_RENAME2); 1139 1139 + status = ocfs2_inode_lock_nested(inode1, bh1, 1, 1140 1140 + rename == 1 ? OI_LS_RENAME2 : OI_LS_PARENT); 1166 1141 if (status < 0) { 1167 1142 /* 1168 1143 * An error return must mean that no cluster locks ··· 1279 1252 1280 1253 /* if old and new are the same, this'll just do one lock. */ 1281 1254 status = ocfs2_double_lock(osb, &old_dir_bh, old_dir, 1282 1282 - &new_dir_bh, new_dir); 1255 1255 + &new_dir_bh, new_dir, 1); 1283 1256 if (status < 0) { 1284 1257 mlog_errno(status); 1285 1258 goto bail;

+1 -1

include/linux/fs.h

reviewed

··· 135 135 #define FMODE_CAN_WRITE ((__force fmode_t)0x40000) 136 136 137 137 /* File was opened by fanotify and shouldn't generate fanotify events */ 138 138 - #define FMODE_NONOTIFY ((__force fmode_t)0x1000000) 138 138 + #define FMODE_NONOTIFY ((__force fmode_t)0x4000000) 139 139 140 140 /* 141 141 * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector

+10

include/linux/rmap.h

reviewed

··· 37 37 atomic_t refcount; 38 38 39 39 /* 40 40 + * Count of child anon_vmas and VMAs which points to this anon_vma. 41 41 + * 42 42 + * This counter is used for making decision about reusing anon_vma 43 43 + * instead of forking new one. See comments in function anon_vma_clone. 44 44 + */ 45 45 + unsigned degree; 46 46 + 47 47 + struct anon_vma *parent; /* Parent of this anon_vma */ 48 48 + 49 49 + /* 40 50 * NOTE: the LSB of the rb_root.rb_node is set by 41 51 * mm_take_all_locks() _after_ taking the above lock. So the 42 52 * rb_root must only be read/written after taking the above lock

-1

include/linux/writeback.h

reviewed

··· 177 177 struct writeback_control *wbc, writepage_t writepage, 178 178 void *data); 179 179 int do_writepages(struct address_space *mapping, struct writeback_control *wbc); 180 180 - void set_page_dirty_balance(struct page *page); 181 180 void writeback_set_ratelimit(void); 182 181 void tag_pages_for_writeback(struct address_space *mapping, 183 182 pgoff_t start, pgoff_t end);

+1 -1

include/uapi/asm-generic/fcntl.h

reviewed

··· 5 5 6 6 /* 7 7 * FMODE_EXEC is 0x20 8 8 - * FMODE_NONOTIFY is 0x1000000 8 8 + * FMODE_NONOTIFY is 0x4000000 9 9 * These cannot be used by userspace O_* until internal and external open 10 10 * flags are split. 11 11 * -Eric Paris

+9 -3

kernel/exit.c

reviewed

··· 1287 1287 static int wait_consider_task(struct wait_opts *wo, int ptrace, 1288 1288 struct task_struct *p) 1289 1289 { 1290 1290 + /* 1291 1291 + * We can race with wait_task_zombie() from another thread. 1292 1292 + * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition 1293 1293 + * can't confuse the checks below. 1294 1294 + */ 1295 1295 + int exit_state = ACCESS_ONCE(p->exit_state); 1290 1296 int ret; 1291 1297 1292 1292 - if (unlikely(p->exit_state == EXIT_DEAD)) 1298 1298 + if (unlikely(exit_state == EXIT_DEAD)) 1293 1299 return 0; 1294 1300 1295 1301 ret = eligible_child(wo, p); ··· 1316 1310 return 0; 1317 1311 } 1318 1312 1319 1319 - if (unlikely(p->exit_state == EXIT_TRACE)) { 1313 1313 + if (unlikely(exit_state == EXIT_TRACE)) { 1320 1314 /* 1321 1315 * ptrace == 0 means we are the natural parent. In this case 1322 1316 * we should clear notask_error, debugger will notify us. ··· 1343 1337 } 1344 1338 1345 1339 /* slay zombie? */ 1346 1346 - if (p->exit_state == EXIT_ZOMBIE) { 1340 1340 + if (exit_state == EXIT_ZOMBIE) { 1347 1341 /* we don't reap group leaders with subthreads */ 1348 1342 if (!delay_group_leader(p)) { 1349 1343 /*

-9

mm/Kconfig.debug

reviewed

··· 14 14 depends on !KMEMCHECK 15 15 select PAGE_EXTENSION 16 16 select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC 17 17 - select PAGE_GUARD if ARCH_SUPPORTS_DEBUG_PAGEALLOC 18 17 ---help--- 19 18 Unmap pages from the kernel linear mapping after free_pages(). 20 19 This results in a large slowdown, but helps to find certain types ··· 26 27 that would result in incorrect warnings of memory corruption after 27 28 a resume because free pages are not saved to the suspend image. 28 29 29 29 - config WANT_PAGE_DEBUG_FLAGS 30 30 - bool 31 31 - 32 30 config PAGE_POISONING 33 31 bool 34 34 - select WANT_PAGE_DEBUG_FLAGS 35 35 - 36 36 - config PAGE_GUARD 37 37 - bool 38 38 - select WANT_PAGE_DEBUG_FLAGS

+4 -13

mm/memcontrol.c

reviewed

··· 3043 3043 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { 3044 3044 mem_cgroup_swap_statistics(from, false); 3045 3045 mem_cgroup_swap_statistics(to, true); 3046 3046 - /* 3047 3047 - * This function is only called from task migration context now. 3048 3048 - * It postpones page_counter and refcount handling till the end 3049 3049 - * of task migration(mem_cgroup_clear_mc()) for performance 3050 3050 - * improvement. But we cannot postpone css_get(to) because if 3051 3051 - * the process that has been moved to @to does swap-in, the 3052 3052 - * refcount of @to might be decreased to 0. 3053 3053 - * 3054 3054 - * We are in attach() phase, so the cgroup is guaranteed to be 3055 3055 - * alive, so we can just call css_get(). 3056 3056 - */ 3057 3057 - css_get(&to->css); 3058 3046 return 0; 3059 3047 } 3060 3048 return -EINVAL; ··· 4667 4679 if (parent_css == NULL) { 4668 4680 root_mem_cgroup = memcg; 4669 4681 page_counter_init(&memcg->memory, NULL); 4682 4682 + memcg->soft_limit = PAGE_COUNTER_MAX; 4670 4683 page_counter_init(&memcg->memsw, NULL); 4671 4684 page_counter_init(&memcg->kmem, NULL); 4672 4685 } ··· 4713 4724 4714 4725 if (parent->use_hierarchy) { 4715 4726 page_counter_init(&memcg->memory, &parent->memory); 4727 4727 + memcg->soft_limit = PAGE_COUNTER_MAX; 4716 4728 page_counter_init(&memcg->memsw, &parent->memsw); 4717 4729 page_counter_init(&memcg->kmem, &parent->kmem); 4718 4730 ··· 4723 4733 */ 4724 4734 } else { 4725 4735 page_counter_init(&memcg->memory, NULL); 4736 4736 + memcg->soft_limit = PAGE_COUNTER_MAX; 4726 4737 page_counter_init(&memcg->memsw, NULL); 4727 4738 page_counter_init(&memcg->kmem, NULL); 4728 4739 /* ··· 4798 4807 mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX); 4799 4808 mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX); 4800 4809 memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX); 4801 4801 - memcg->soft_limit = 0; 4810 4810 + memcg->soft_limit = PAGE_COUNTER_MAX; 4802 4811 } 4803 4812 4804 4813 #ifdef CONFIG_MMU

+17 -10

mm/memory.c

reviewed

··· 2137 2137 if (!dirty_page) 2138 2138 return ret; 2139 2139 2140 2140 - /* 2141 2141 - * Yes, Virginia, this is actually required to prevent a race 2142 2142 - * with clear_page_dirty_for_io() from clearing the page dirty 2143 2143 - * bit after it clear all dirty ptes, but before a racing 2144 2144 - * do_wp_page installs a dirty pte. 2145 2145 - * 2146 2146 - * do_shared_fault is protected similarly. 2147 2147 - */ 2148 2140 if (!page_mkwrite) { 2149 2149 - wait_on_page_locked(dirty_page); 2150 2150 - set_page_dirty_balance(dirty_page); 2141 2141 + struct address_space *mapping; 2142 2142 + int dirtied; 2143 2143 + 2144 2144 + lock_page(dirty_page); 2145 2145 + dirtied = set_page_dirty(dirty_page); 2146 2146 + VM_BUG_ON_PAGE(PageAnon(dirty_page), dirty_page); 2147 2147 + mapping = dirty_page->mapping; 2148 2148 + unlock_page(dirty_page); 2149 2149 + 2150 2150 + if (dirtied && mapping) { 2151 2151 + /* 2152 2152 + * Some device drivers do not set page.mapping 2153 2153 + * but still dirty their pages 2154 2154 + */ 2155 2155 + balance_dirty_pages_ratelimited(mapping); 2156 2156 + } 2157 2157 + 2151 2158 /* file_update_time outside page_lock */ 2152 2159 if (vma->vm_file) 2153 2160 file_update_time(vma->vm_file);

+12 -31

mm/page-writeback.c

reviewed

··· 1541 1541 bdi_start_background_writeback(bdi); 1542 1542 } 1543 1543 1544 1544 - void set_page_dirty_balance(struct page *page) 1545 1545 - { 1546 1546 - if (set_page_dirty(page)) { 1547 1547 - struct address_space *mapping = page_mapping(page); 1548 1548 - 1549 1549 - if (mapping) 1550 1550 - balance_dirty_pages_ratelimited(mapping); 1551 1551 - } 1552 1552 - } 1553 1553 - 1554 1544 static DEFINE_PER_CPU(int, bdp_ratelimits); 1555 1545 1556 1546 /* ··· 2113 2123 * page dirty in that case, but not all the buffers. This is a "bottom-up" 2114 2124 * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying. 2115 2125 * 2116 2116 - * Most callers have locked the page, which pins the address_space in memory. 2117 2117 - * But zap_pte_range() does not lock the page, however in that case the 2118 2118 - * mapping is pinned by the vma's ->vm_file reference. 2119 2119 - * 2120 2120 - * We take care to handle the case where the page was truncated from the 2121 2121 - * mapping by re-checking page_mapping() inside tree_lock. 2126 2126 + * The caller must ensure this doesn't race with truncation. Most will simply 2127 2127 + * hold the page lock, but e.g. zap_pte_range() calls with the page mapped and 2128 2128 + * the pte lock held, which also locks out truncation. 2122 2129 */ 2123 2130 int __set_page_dirty_nobuffers(struct page *page) 2124 2131 { 2125 2132 if (!TestSetPageDirty(page)) { 2126 2133 struct address_space *mapping = page_mapping(page); 2127 2127 - struct address_space *mapping2; 2128 2134 unsigned long flags; 2129 2135 2130 2136 if (!mapping) 2131 2137 return 1; 2132 2138 2133 2139 spin_lock_irqsave(&mapping->tree_lock, flags); 2134 2134 - mapping2 = page_mapping(page); 2135 2135 - if (mapping2) { /* Race with truncate? */ 2136 2136 - BUG_ON(mapping2 != mapping); 2137 2137 - WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); 2138 2138 - account_page_dirtied(page, mapping); 2139 2139 - radix_tree_tag_set(&mapping->page_tree, 2140 2140 - page_index(page), PAGECACHE_TAG_DIRTY); 2141 2141 - } 2140 2140 + BUG_ON(page_mapping(page) != mapping); 2141 2141 + WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); 2142 2142 + account_page_dirtied(page, mapping); 2143 2143 + radix_tree_tag_set(&mapping->page_tree, page_index(page), 2144 2144 + PAGECACHE_TAG_DIRTY); 2142 2145 spin_unlock_irqrestore(&mapping->tree_lock, flags); 2143 2146 if (mapping->host) { 2144 2147 /* !PageAnon && !swapper_space */ ··· 2288 2305 /* 2289 2306 * We carefully synchronise fault handlers against 2290 2307 * installing a dirty pte and marking the page dirty 2291 2291 - * at this point. We do this by having them hold the 2292 2292 - * page lock at some point after installing their 2293 2293 - * pte, but before marking the page dirty. 2294 2294 - * Pages are always locked coming in here, so we get 2295 2295 - * the desired exclusion. See mm/memory.c:do_wp_page() 2296 2296 - * for more comments. 2308 2308 + * at this point. We do this by having them hold the 2309 2309 + * page lock while dirtying the page, and pages are 2310 2310 + * always locked coming in here, so we get the desired 2311 2311 + * exclusion. 2297 2312 */ 2298 2313 if (TestClearPageDirty(page)) { 2299 2314 dec_zone_page_state(page, NR_FILE_DIRTY);

+41 -1

mm/rmap.c

reviewed

··· 72 72 anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); 73 73 if (anon_vma) { 74 74 atomic_set(&anon_vma->refcount, 1); 75 75 + anon_vma->degree = 1; /* Reference for first vma */ 76 76 + anon_vma->parent = anon_vma; 75 77 /* 76 78 * Initialise the anon_vma root to point to itself. If called 77 79 * from fork, the root will be reset to the parents anon_vma. ··· 190 188 if (likely(!vma->anon_vma)) { 191 189 vma->anon_vma = anon_vma; 192 190 anon_vma_chain_link(vma, avc, anon_vma); 191 191 + /* vma reference or self-parent link for new root */ 192 192 + anon_vma->degree++; 193 193 allocated = NULL; 194 194 avc = NULL; 195 195 } ··· 240 236 /* 241 237 * Attach the anon_vmas from src to dst. 242 238 * Returns 0 on success, -ENOMEM on failure. 239 239 + * 240 240 + * If dst->anon_vma is NULL this function tries to find and reuse existing 241 241 + * anon_vma which has no vmas and only one child anon_vma. This prevents 242 242 + * degradation of anon_vma hierarchy to endless linear chain in case of 243 243 + * constantly forking task. On the other hand, an anon_vma with more than one 244 244 + * child isn't reused even if there was no alive vma, thus rmap walker has a 245 245 + * good chance of avoiding scanning the whole hierarchy when it searches where 246 246 + * page is mapped. 243 247 */ 244 248 int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) 245 249 { ··· 268 256 anon_vma = pavc->anon_vma; 269 257 root = lock_anon_vma_root(root, anon_vma); 270 258 anon_vma_chain_link(dst, avc, anon_vma); 259 259 + 260 260 + /* 261 261 + * Reuse existing anon_vma if its degree lower than two, 262 262 + * that means it has no vma and only one anon_vma child. 263 263 + * 264 264 + * Do not chose parent anon_vma, otherwise first child 265 265 + * will always reuse it. Root anon_vma is never reused: 266 266 + * it has self-parent reference and at least one child. 267 267 + */ 268 268 + if (!dst->anon_vma && anon_vma != src->anon_vma && 269 269 + anon_vma->degree < 2) 270 270 + dst->anon_vma = anon_vma; 271 271 } 272 272 + if (dst->anon_vma) 273 273 + dst->anon_vma->degree++; 272 274 unlock_anon_vma_root(root); 273 275 return 0; 274 276 ··· 306 280 if (!pvma->anon_vma) 307 281 return 0; 308 282 283 283 + /* Drop inherited anon_vma, we'll reuse existing or allocate new. */ 284 284 + vma->anon_vma = NULL; 285 285 + 309 286 /* 310 287 * First, attach the new VMA to the parent VMA's anon_vmas, 311 288 * so rmap can find non-COWed pages in child processes. ··· 316 287 error = anon_vma_clone(vma, pvma); 317 288 if (error) 318 289 return error; 290 290 + 291 291 + /* An existing anon_vma has been reused, all done then. */ 292 292 + if (vma->anon_vma) 293 293 + return 0; 319 294 320 295 /* Then add our own anon_vma. */ 321 296 anon_vma = anon_vma_alloc(); ··· 334 301 * lock any of the anon_vmas in this anon_vma tree. 335 302 */ 336 303 anon_vma->root = pvma->anon_vma->root; 304 304 + anon_vma->parent = pvma->anon_vma; 337 305 /* 338 306 * With refcounts, an anon_vma can stay around longer than the 339 307 * process it belongs to. The root anon_vma needs to be pinned until ··· 345 311 vma->anon_vma = anon_vma; 346 312 anon_vma_lock_write(anon_vma); 347 313 anon_vma_chain_link(vma, avc, anon_vma); 314 314 + anon_vma->parent->degree++; 348 315 anon_vma_unlock_write(anon_vma); 349 316 350 317 return 0; ··· 376 341 * Leave empty anon_vmas on the list - we'll need 377 342 * to free them outside the lock. 378 343 */ 379 379 - if (RB_EMPTY_ROOT(&anon_vma->rb_root)) 344 344 + if (RB_EMPTY_ROOT(&anon_vma->rb_root)) { 345 345 + anon_vma->parent->degree--; 380 346 continue; 347 347 + } 381 348 382 349 list_del(&avc->same_vma); 383 350 anon_vma_chain_free(avc); 384 351 } 352 352 + if (vma->anon_vma) 353 353 + vma->anon_vma->degree--; 385 354 unlock_anon_vma_root(root); 386 355 387 356 /* ··· 396 357 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { 397 358 struct anon_vma *anon_vma = avc->anon_vma; 398 359 360 360 + BUG_ON(anon_vma->degree); 399 361 put_anon_vma(anon_vma); 400 362 401 363 list_del(&avc->same_vma);

+13 -11

mm/vmscan.c

reviewed

··· 2921 2921 return false; 2922 2922 2923 2923 /* 2924 2924 - * There is a potential race between when kswapd checks its watermarks 2925 2925 - * and a process gets throttled. There is also a potential race if 2926 2926 - * processes get throttled, kswapd wakes, a large process exits therby 2927 2927 - * balancing the zones that causes kswapd to miss a wakeup. If kswapd 2928 2928 - * is going to sleep, no process should be sleeping on pfmemalloc_wait 2929 2929 - * so wake them now if necessary. If necessary, processes will wake 2930 2930 - * kswapd and get throttled again 2924 2924 + * The throttled processes are normally woken up in balance_pgdat() as 2925 2925 + * soon as pfmemalloc_watermark_ok() is true. But there is a potential 2926 2926 + * race between when kswapd checks the watermarks and a process gets 2927 2927 + * throttled. There is also a potential race if processes get 2928 2928 + * throttled, kswapd wakes, a large process exits thereby balancing the 2929 2929 + * zones, which causes kswapd to exit balance_pgdat() before reaching 2930 2930 + * the wake up checks. If kswapd is going to sleep, no process should 2931 2931 + * be sleeping on pfmemalloc_wait, so wake them now if necessary. If 2932 2932 + * the wake up is premature, processes will wake kswapd and get 2933 2933 + * throttled again. The difference from wake ups in balance_pgdat() is 2934 2934 + * that here we are under prepare_to_wait(). 2931 2935 */ 2932 2932 - if (waitqueue_active(&pgdat->pfmemalloc_wait)) { 2933 2933 - wake_up(&pgdat->pfmemalloc_wait); 2934 2934 - return false; 2935 2935 - } 2936 2936 + if (waitqueue_active(&pgdat->pfmemalloc_wait)) 2937 2937 + wake_up_all(&pgdat->pfmemalloc_wait); 2936 2938 2937 2939 return pgdat_balanced(pgdat, order, classzone_idx); 2938 2940 }