Merge branch 'akpm' (patches from Andrew)

+5

MAINTAINERS

··· 10519 10519 F: Documentation/core-api/boot-time-mm.rst 10520 10520 10521 10521 MEMORY MANAGEMENT 10522 + M: Andrew Morton <akpm@linux-foundation.org> 10522 10523 L: linux-mm@kvack.org 10523 10524 W: http://www.linux-mm.org 10525 + T: quilt https://ozlabs.org/~akpm/mmotm/ 10526 + T: quilt https://ozlabs.org/~akpm/mmots/ 10527 + T: git git://github.com/hnaz/linux-mm.git 10524 10528 S: Maintained 10525 10529 F: include/linux/mm.h 10526 10530 F: include/linux/gfp.h ··· 18038 18034 ZSWAP COMPRESSED SWAP CACHING 18039 18035 M: Seth Jennings <sjenning@redhat.com> 18040 18036 M: Dan Streetman <ddstreet@ieee.org> 18037 + M: Vitaly Wool <vitaly.wool@konsulko.com> 18041 18038 L: linux-mm@kvack.org 18042 18039 S: Maintained 18043 18040 F: mm/zswap.c

+91 -45

fs/ocfs2/file.c

··· 2098 2098 return 0; 2099 2099 } 2100 2100 2101 - static int ocfs2_prepare_inode_for_refcount(struct inode *inode, 2102 - struct file *file, 2103 - loff_t pos, size_t count, 2104 - int *meta_level) 2101 + static int ocfs2_inode_lock_for_extent_tree(struct inode *inode, 2102 + struct buffer_head **di_bh, 2103 + int meta_level, 2104 + int overwrite_io, 2105 + int write_sem, 2106 + int wait) 2105 2107 { 2106 - int ret; 2107 - struct buffer_head *di_bh = NULL; 2108 - u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; 2109 - u32 clusters = 2110 - ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos; 2108 + int ret = 0; 2111 2109 2112 - ret = ocfs2_inode_lock(inode, &di_bh, 1); 2113 - if (ret) { 2114 - mlog_errno(ret); 2110 + if (wait) 2111 + ret = ocfs2_inode_lock(inode, NULL, meta_level); 2112 + else 2113 + ret = ocfs2_try_inode_lock(inode, 2114 + overwrite_io ? NULL : di_bh, meta_level); 2115 + if (ret < 0) 2115 2116 goto out; 2117 + 2118 + if (wait) { 2119 + if (write_sem) 2120 + down_write(&OCFS2_I(inode)->ip_alloc_sem); 2121 + else 2122 + down_read(&OCFS2_I(inode)->ip_alloc_sem); 2123 + } else { 2124 + if (write_sem) 2125 + ret = down_write_trylock(&OCFS2_I(inode)->ip_alloc_sem); 2126 + else 2127 + ret = down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem); 2128 + 2129 + if (!ret) { 2130 + ret = -EAGAIN; 2131 + goto out_unlock; 2132 + } 2116 2133 } 2117 2134 2118 - *meta_level = 1; 2119 - 2120 - ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX); 2121 - if (ret) 2122 - mlog_errno(ret); 2123 - out: 2124 - brelse(di_bh); 2125 2135 return ret; 2136 + 2137 + out_unlock: 2138 + brelse(*di_bh); 2139 + ocfs2_inode_unlock(inode, meta_level); 2140 + out: 2141 + return ret; 2142 + } 2143 + 2144 + static void ocfs2_inode_unlock_for_extent_tree(struct inode *inode, 2145 + struct buffer_head **di_bh, 2146 + int meta_level, 2147 + int write_sem) 2148 + { 2149 + if (write_sem) 2150 + up_write(&OCFS2_I(inode)->ip_alloc_sem); 2151 + else 2152 + up_read(&OCFS2_I(inode)->ip_alloc_sem); 2153 + 2154 + brelse(*di_bh); 2155 + *di_bh = NULL; 2156 + 2157 + if (meta_level >= 0) 2158 + ocfs2_inode_unlock(inode, meta_level); 2126 2159 } 2127 2160 2128 2161 static int ocfs2_prepare_inode_for_write(struct file *file, 2129 2162 loff_t pos, size_t count, int wait) 2130 2163 { 2131 2164 int ret = 0, meta_level = 0, overwrite_io = 0; 2165 + int write_sem = 0; 2132 2166 struct dentry *dentry = file->f_path.dentry; 2133 2167 struct inode *inode = d_inode(dentry); 2134 2168 struct buffer_head *di_bh = NULL; 2169 + u32 cpos; 2170 + u32 clusters; 2135 2171 2136 2172 /* 2137 2173 * We start with a read level meta lock and only jump to an ex 2138 2174 * if we need to make modifications here. 2139 2175 */ 2140 2176 for(;;) { 2141 - if (wait) 2142 - ret = ocfs2_inode_lock(inode, NULL, meta_level); 2143 - else 2144 - ret = ocfs2_try_inode_lock(inode, 2145 - overwrite_io ? NULL : &di_bh, meta_level); 2177 + ret = ocfs2_inode_lock_for_extent_tree(inode, 2178 + &di_bh, 2179 + meta_level, 2180 + overwrite_io, 2181 + write_sem, 2182 + wait); 2146 2183 if (ret < 0) { 2147 - meta_level = -1; 2148 2184 if (ret != -EAGAIN) 2149 2185 mlog_errno(ret); 2150 2186 goto out; ··· 2192 2156 */ 2193 2157 if (!wait && !overwrite_io) { 2194 2158 overwrite_io = 1; 2195 - if (!down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem)) { 2196 - ret = -EAGAIN; 2197 - goto out_unlock; 2198 - } 2199 2159 2200 2160 ret = ocfs2_overwrite_io(inode, di_bh, pos, count); 2201 - brelse(di_bh); 2202 - di_bh = NULL; 2203 - up_read(&OCFS2_I(inode)->ip_alloc_sem); 2204 2161 if (ret < 0) { 2205 2162 if (ret != -EAGAIN) 2206 2163 mlog_errno(ret); ··· 2212 2183 * set inode->i_size at the end of a write. */ 2213 2184 if (should_remove_suid(dentry)) { 2214 2185 if (meta_level == 0) { 2215 - ocfs2_inode_unlock(inode, meta_level); 2186 + ocfs2_inode_unlock_for_extent_tree(inode, 2187 + &di_bh, 2188 + meta_level, 2189 + write_sem); 2216 2190 meta_level = 1; 2217 2191 continue; 2218 2192 } ··· 2229 2197 2230 2198 ret = ocfs2_check_range_for_refcount(inode, pos, count); 2231 2199 if (ret == 1) { 2232 - ocfs2_inode_unlock(inode, meta_level); 2233 - meta_level = -1; 2200 + ocfs2_inode_unlock_for_extent_tree(inode, 2201 + &di_bh, 2202 + meta_level, 2203 + write_sem); 2204 + ret = ocfs2_inode_lock_for_extent_tree(inode, 2205 + &di_bh, 2206 + meta_level, 2207 + overwrite_io, 2208 + 1, 2209 + wait); 2210 + write_sem = 1; 2211 + if (ret < 0) { 2212 + if (ret != -EAGAIN) 2213 + mlog_errno(ret); 2214 + goto out; 2215 + } 2234 2216 2235 - ret = ocfs2_prepare_inode_for_refcount(inode, 2236 - file, 2237 - pos, 2238 - count, 2239 - &meta_level); 2217 + cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; 2218 + clusters = 2219 + ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos; 2220 + ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX); 2240 2221 } 2241 2222 2242 2223 if (ret < 0) { 2243 - mlog_errno(ret); 2224 + if (ret != -EAGAIN) 2225 + mlog_errno(ret); 2244 2226 goto out_unlock; 2245 2227 } 2246 2228 ··· 2265 2219 trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno, 2266 2220 pos, count, wait); 2267 2221 2268 - brelse(di_bh); 2269 - 2270 - if (meta_level >= 0) 2271 - ocfs2_inode_unlock(inode, meta_level); 2222 + ocfs2_inode_unlock_for_extent_tree(inode, 2223 + &di_bh, 2224 + meta_level, 2225 + write_sem); 2272 2226 2273 2227 out: 2274 2228 return ret;

-5

include/linux/mm.h

··· 695 695 696 696 extern void kvfree(const void *addr); 697 697 698 - static inline atomic_t *compound_mapcount_ptr(struct page *page) 699 - { 700 - return &page[1].compound_mapcount; 701 - } 702 - 703 698 static inline int compound_mapcount(struct page *page) 704 699 { 705 700 VM_BUG_ON_PAGE(!PageCompound(page), page);

+5

include/linux/mm_types.h

··· 221 221 #endif 222 222 } _struct_page_alignment; 223 223 224 + static inline atomic_t *compound_mapcount_ptr(struct page *page) 225 + { 226 + return &page[1].compound_mapcount; 227 + } 228 + 224 229 /* 225 230 * Used for sizing the vmemmap region on some architectures 226 231 */

+18 -2

include/linux/page-flags.h

··· 622 622 * 623 623 * Unlike PageTransCompound, this is safe to be called only while 624 624 * split_huge_pmd() cannot run from under us, like if protected by the 625 - * MMU notifier, otherwise it may result in page->_mapcount < 0 false 625 + * MMU notifier, otherwise it may result in page->_mapcount check false 626 626 * positives. 627 + * 628 + * We have to treat page cache THP differently since every subpage of it 629 + * would get _mapcount inc'ed once it is PMD mapped. But, it may be PTE 630 + * mapped in the current process so comparing subpage's _mapcount to 631 + * compound_mapcount to filter out PTE mapped case. 627 632 */ 628 633 static inline int PageTransCompoundMap(struct page *page) 629 634 { 630 - return PageTransCompound(page) && atomic_read(&page->_mapcount) < 0; 635 + struct page *head; 636 + 637 + if (!PageTransCompound(page)) 638 + return 0; 639 + 640 + if (PageAnon(page)) 641 + return atomic_read(&page->_mapcount) < 0; 642 + 643 + head = compound_head(page); 644 + /* File THP is PMD mapped and not PTE mapped */ 645 + return atomic_read(&page->_mapcount) == 646 + atomic_read(compound_mapcount_ptr(head)); 631 647 } 632 648 633 649 /*

+6 -1

lib/dump_stack.c

··· 106 106 was_locked = 1; 107 107 } else { 108 108 local_irq_restore(flags); 109 - cpu_relax(); 109 + /* 110 + * Wait for the lock to release before jumping to 111 + * atomic_cmpxchg() in order to mitigate the thundering herd 112 + * problem. 113 + */ 114 + do { cpu_relax(); } while (atomic_read(&dump_lock) != -1); 110 115 goto retry; 111 116 } 112 117

+4 -3

mm/khugepaged.c

··· 1028 1028 1029 1029 anon_vma_lock_write(vma->anon_vma); 1030 1030 1031 - pte = pte_offset_map(pmd, address); 1032 - pte_ptl = pte_lockptr(mm, pmd); 1033 - 1034 1031 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm, 1035 1032 address, address + HPAGE_PMD_SIZE); 1036 1033 mmu_notifier_invalidate_range_start(&range); 1034 + 1035 + pte = pte_offset_map(pmd, address); 1036 + pte_ptl = pte_lockptr(mm, pmd); 1037 + 1037 1038 pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */ 1038 1039 /* 1039 1040 * After this gup_fast can't run anymore. This also removes

+16 -7

mm/memcontrol.c

··· 484 484 unsigned long ino = 0; 485 485 486 486 rcu_read_lock(); 487 - if (PageHead(page) && PageSlab(page)) 487 + if (PageSlab(page) && !PageTail(page)) 488 488 memcg = memcg_from_slab_page(page); 489 489 else 490 490 memcg = READ_ONCE(page->mem_cgroup); ··· 2533 2533 batch = nr_pages; 2534 2534 goto retry; 2535 2535 } 2536 + 2537 + /* 2538 + * Memcg doesn't have a dedicated reserve for atomic 2539 + * allocations. But like the global atomic pool, we need to 2540 + * put the burden of reclaim on regular allocation requests 2541 + * and let these go through as privileged allocations. 2542 + */ 2543 + if (gfp_mask & __GFP_ATOMIC) 2544 + goto force; 2536 2545 2537 2546 /* 2538 2547 * Unlike in global OOM situations, memcg is not in a physical ··· 5023 5014 { 5024 5015 int node; 5025 5016 5026 - /* 5027 - * Flush percpu vmstats and vmevents to guarantee the value correctness 5028 - * on parent's and all ancestor levels. 5029 - */ 5030 - memcg_flush_percpu_vmstats(memcg, false); 5031 - memcg_flush_percpu_vmevents(memcg); 5032 5017 for_each_node(node) 5033 5018 free_mem_cgroup_per_node_info(memcg, node); 5034 5019 free_percpu(memcg->vmstats_percpu); ··· 5033 5030 static void mem_cgroup_free(struct mem_cgroup *memcg) 5034 5031 { 5035 5032 memcg_wb_domain_exit(memcg); 5033 + /* 5034 + * Flush percpu vmstats and vmevents to guarantee the value correctness 5035 + * on parent's and all ancestor levels. 5036 + */ 5037 + memcg_flush_percpu_vmstats(memcg, false); 5038 + memcg_flush_percpu_vmevents(memcg); 5036 5039 __mem_cgroup_free(memcg); 5037 5040 } 5038 5041

+8

mm/memory_hotplug.c

··· 447 447 zone->spanned_pages; 448 448 449 449 /* No need to lock the zones, they can't change. */ 450 + if (!zone->spanned_pages) 451 + continue; 452 + if (!node_end_pfn) { 453 + node_start_pfn = zone->zone_start_pfn; 454 + node_end_pfn = zone_end_pfn; 455 + continue; 456 + } 457 + 450 458 if (zone_end_pfn > node_end_pfn) 451 459 node_end_pfn = zone_end_pfn; 452 460 if (zone->zone_start_pfn < node_start_pfn)

+1 -1

mm/mmu_notifier.c

··· 180 180 mn->ops->invalidate_range_start, _ret, 181 181 !mmu_notifier_range_blockable(range) ? "non-" : ""); 182 182 WARN_ON(mmu_notifier_range_blockable(range) || 183 - ret != -EAGAIN); 183 + _ret != -EAGAIN); 184 184 ret = _ret; 185 185 } 186 186 }

+9 -8

mm/page_alloc.c

··· 1948 1948 wait_for_completion(&pgdat_init_all_done_comp); 1949 1949 1950 1950 /* 1951 + * The number of managed pages has changed due to the initialisation 1952 + * so the pcpu batch and high limits needs to be updated or the limits 1953 + * will be artificially small. 1954 + */ 1955 + for_each_populated_zone(zone) 1956 + zone_pcp_update(zone); 1957 + 1958 + /* 1951 1959 * We initialized the rest of the deferred pages. Permanently disable 1952 1960 * on-demand struct page initialization. 1953 1961 */ ··· 3728 3720 static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask) 3729 3721 { 3730 3722 unsigned int filter = SHOW_MEM_FILTER_NODES; 3731 - static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1); 3732 - 3733 - if (!__ratelimit(&show_mem_rs)) 3734 - return; 3735 3723 3736 3724 /* 3737 3725 * This documents exceptions given to allocations in certain ··· 3748 3744 { 3749 3745 struct va_format vaf; 3750 3746 va_list args; 3751 - static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL, 3752 - DEFAULT_RATELIMIT_BURST); 3747 + static DEFINE_RATELIMIT_STATE(nopage_rs, 10*HZ, 1); 3753 3748 3754 3749 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) 3755 3750 return; ··· 8517 8514 WARN(count != 0, "%d pages are still in use!\n", count); 8518 8515 } 8519 8516 8520 - #ifdef CONFIG_MEMORY_HOTPLUG 8521 8517 /* 8522 8518 * The zone indicated has a new number of managed_pages; batch sizes and percpu 8523 8519 * page high values need to be recalulated. ··· 8530 8528 per_cpu_ptr(zone->pageset, cpu)); 8531 8529 mutex_unlock(&pcp_batch_high_lock); 8532 8530 } 8533 - #endif 8534 8531 8535 8532 void zone_pcp_reset(struct zone *zone) 8536 8533 {

+2 -2

mm/slab.h

··· 323 323 * Expects a pointer to a slab page. Please note, that PageSlab() check 324 324 * isn't sufficient, as it returns true also for tail compound slab pages, 325 325 * which do not have slab_cache pointer set. 326 - * So this function assumes that the page can pass PageHead() and PageSlab() 327 - * checks. 326 + * So this function assumes that the page can pass PageSlab() && !PageTail() 327 + * check. 328 328 * 329 329 * The kmem_cache can be reparented asynchronously. The caller must ensure 330 330 * the memcg lifetime, e.g. by taking rcu_read_lock() or cgroup_mutex.

+21 -4

mm/vmstat.c

··· 1383 1383 unsigned long freecount = 0; 1384 1384 struct free_area *area; 1385 1385 struct list_head *curr; 1386 + bool overflow = false; 1386 1387 1387 1388 area = &(zone->free_area[order]); 1388 1389 1389 - list_for_each(curr, &area->free_list[mtype]) 1390 - freecount++; 1391 - seq_printf(m, "%6lu ", freecount); 1390 + list_for_each(curr, &area->free_list[mtype]) { 1391 + /* 1392 + * Cap the free_list iteration because it might 1393 + * be really large and we are under a spinlock 1394 + * so a long time spent here could trigger a 1395 + * hard lockup detector. Anyway this is a 1396 + * debugging tool so knowing there is a handful 1397 + * of pages of this order should be more than 1398 + * sufficient. 1399 + */ 1400 + if (++freecount >= 100000) { 1401 + overflow = true; 1402 + break; 1403 + } 1404 + } 1405 + seq_printf(m, "%s%6lu ", overflow ? ">" : "", freecount); 1406 + spin_unlock_irq(&zone->lock); 1407 + cond_resched(); 1408 + spin_lock_irq(&zone->lock); 1392 1409 } 1393 1410 seq_putc(m, '\n'); 1394 1411 } ··· 1989 1972 #endif 1990 1973 #ifdef CONFIG_PROC_FS 1991 1974 proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op); 1992 - proc_create_seq("pagetypeinfo", 0444, NULL, &pagetypeinfo_op); 1975 + proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op); 1993 1976 proc_create_seq("vmstat", 0444, NULL, &vmstat_op); 1994 1977 proc_create_seq("zoneinfo", 0444, NULL, &zoneinfo_op); 1995 1978 #endif

+2 -1

scripts/gdb/linux/symbols.py

··· 99 99 attrs[n]['name'].string(): attrs[n]['address'] 100 100 for n in range(int(sect_attrs['nsections']))} 101 101 args = [] 102 - for section_name in [".data", ".data..read_mostly", ".rodata", ".bss"]: 102 + for section_name in [".data", ".data..read_mostly", ".rodata", ".bss", 103 + ".text", ".text.hot", ".text.unlikely"]: 103 104 address = section_name_to_address.get(section_name) 104 105 if address: 105 106 args.append(" -s {name} {addr}".format(

+1 -1

tools/testing/selftests/vm/gup_benchmark.c

··· 71 71 flags |= MAP_SHARED; 72 72 break; 73 73 case 'H': 74 - flags |= MAP_HUGETLB; 74 + flags |= (MAP_HUGETLB | MAP_ANONYMOUS); 75 75 break; 76 76 default: 77 77 return -1;