Merge branch 'akpm' (patches from Andrew)

Merge misc fixes from Andrew Morton:
"11 fixes"

Mostly VM fixes, one psi polling fix, and one parisc build fix.

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
mm/kasan: fix false positive invalid-free reports with CONFIG_KASAN_SW_TAGS=y
mm/zsmalloc.c: fix race condition in zs_destroy_pool
mm/zsmalloc.c: migration can leave pages in ZS_EMPTY indefinitely
mm, page_owner: handle THP splits correctly
userfaultfd_release: always remove uffd flags and clear vm_userfaultfd_ctx
psi: get poll_work to run when calling poll syscall next time
mm: memcontrol: flush percpu vmevents before releasing memcg
mm: memcontrol: flush percpu vmstats before releasing memcg
parisc: fix compilation errrors
mm, page_alloc: move_freepages should not examine struct page of reserved memory
mm/z3fold.c: fix race between migration and destruction

Changed files
+260 -36
+1 -2
arch/parisc/include/asm/pgtable.h
···
  #ifndef _PARISC_PGTABLE_H
  #define _PARISC_PGTABLE_H

+ #include <asm/page.h>
  #include <asm-generic/4level-fixup.h>

  #include <asm/fixmap.h>
···
  } while (0)

  #endif /* !__ASSEMBLY__ */
-
- #include <asm/page.h>

  #define pte_ERROR(e) \
          printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, pte_val(e))
+13 -12
fs/userfaultfd.c
···
          /* len == 0 means wake all */
          struct userfaultfd_wake_range range = { .len = 0, };
          unsigned long new_flags;
+         bool still_valid;

          WRITE_ONCE(ctx->released, true);

···
           * taking the mmap_sem for writing.
           */
          down_write(&mm->mmap_sem);
-         if (!mmget_still_valid(mm))
-                 goto skip_mm;
+         still_valid = mmget_still_valid(mm);
          prev = NULL;
          for (vma = mm->mmap; vma; vma = vma->vm_next) {
                  cond_resched();
···
                          continue;
                  }
                  new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
-                 prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end,
-                                  new_flags, vma->anon_vma,
-                                  vma->vm_file, vma->vm_pgoff,
-                                  vma_policy(vma),
-                                  NULL_VM_UFFD_CTX);
-                 if (prev)
-                         vma = prev;
-                 else
-                         prev = vma;
+                 if (still_valid) {
+                         prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end,
+                                          new_flags, vma->anon_vma,
+                                          vma->vm_file, vma->vm_pgoff,
+                                          vma_policy(vma),
+                                          NULL_VM_UFFD_CTX);
+                         if (prev)
+                                 vma = prev;
+                         else
+                                 prev = vma;
+                 }
                  vma->vm_flags = new_flags;
                  vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
          }
- skip_mm:
          up_write(&mm->mmap_sem);
          mmput(mm);
  wakeup:
+8
kernel/sched/psi.c
···
           * deadlock while waiting for psi_poll_work to acquire trigger_lock
           */
          if (kworker_to_destroy) {
+                 /*
+                  * After the RCU grace period has expired, the worker
+                  * can no longer be found through group->poll_kworker.
+                  * But it might have been already scheduled before
+                  * that - deschedule it cleanly before destroying it.
+                  */
                  kthread_cancel_delayed_work_sync(&group->poll_work);
+                 atomic_set(&group->poll_scheduled, 0);
+
                  kthread_destroy_worker(kworker_to_destroy);
          }
          kfree(t);
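
Note: the psi change revolves around the poll_scheduled latch. Work is only queued by the caller that wins the 0->1 transition, and the latch is normally cleared by the poll work itself, so cancelling pending work without also clearing the latch leaves the trigger unable to ever queue polling again. A minimal userspace sketch of that latch behaviour (poll_scheduled, schedule_poll_work and cancel_poll_work here are illustrative names, not the kernel API):

/* Sketch of the "scheduled" latch behind the psi fix.
 * Build with: cc -std=c11 psi_latch.c */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int poll_scheduled;       /* analogue of group->poll_scheduled */

static bool schedule_poll_work(void)
{
        int expected = 0;

        /* Only the caller that flips 0 -> 1 actually queues work. */
        if (!atomic_compare_exchange_strong(&poll_scheduled, &expected, 1))
                return false;
        printf("poll work queued\n");
        return true;
}

static void cancel_poll_work(void)
{
        /* The fix: clear the latch when cancelling pending work, otherwise
         * every later schedule_poll_work() call is silently dropped. */
        atomic_store(&poll_scheduled, 0);
}

int main(void)
{
        schedule_poll_work();   /* queued */
        cancel_poll_work();     /* cancelled, latch cleared */
        schedule_poll_work();   /* queued again; fails without the clear */
        return 0;
}

Dropping the atomic_store() models the pre-fix behaviour: after a cancel, the next schedule attempt returns false and polling never runs again.
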
+4
mm/huge_memory.c
···
  #include <linux/shmem_fs.h>
  #include <linux/oom.h>
  #include <linux/numa.h>
+ #include <linux/page_owner.h>

  #include <asm/tlb.h>
  #include <asm/pgalloc.h>
···
          }

          ClearPageCompound(head);
+
+         split_page_owner(head, HPAGE_PMD_ORDER);
+
          /* See comment in __split_huge_page_tail() */
          if (PageAnon(head)) {
                  /* Additional pin to swap cache */
+8 -2
mm/kasan/common.c
···
          if (IS_ENABLED(CONFIG_KASAN_GENERIC))
                  return shadow_byte < 0 ||
                          shadow_byte >= KASAN_SHADOW_SCALE_SIZE;
-         else
-                 return tag != (u8)shadow_byte;
+
+         /* else CONFIG_KASAN_SW_TAGS: */
+         if ((u8)shadow_byte == KASAN_TAG_INVALID)
+                 return true;
+         if ((tag != KASAN_TAG_KERNEL) && (tag != (u8)shadow_byte))
+                 return true;
+
+         return false;
  }

  static bool __kasan_slab_free(struct kmem_cache *cache, void *object,
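
Note: for CONFIG_KASAN_SW_TAGS the rewritten check no longer reports a free through an untagged (match-all) pointer as a mismatch, while still flagging memory whose shadow carries the invalid tag. A standalone sketch of that decision, with illustrative constants standing in for KASAN_TAG_KERNEL and KASAN_TAG_INVALID:

/* Illustrative model of the software-tags invalid-free check; the constants
 * are placeholders, not the kernel's definitions. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TAG_KERNEL      0xffu   /* match-all tag of an untagged pointer */
#define TAG_INVALID     0xfeu   /* shadow value for never-valid memory */

static bool report_invalid_free(uint8_t ptr_tag, uint8_t shadow_byte)
{
        if (shadow_byte == TAG_INVALID)
                return true;
        /* A match-all pointer tag never counts as a mismatch. */
        if (ptr_tag != TAG_KERNEL && ptr_tag != shadow_byte)
                return true;
        return false;
}

int main(void)
{
        /* Untagged pointer freeing tagged memory: no longer a false positive. */
        printf("%d\n", report_invalid_free(TAG_KERNEL, 0xab));  /* 0 */
        /* Genuinely mismatched tags are still reported. */
        printf("%d\n", report_invalid_free(0x12, 0xab));        /* 1 */
        return 0;
}
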
+60
mm/memcontrol.c
···
          }
  }

+ static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg)
+ {
+         unsigned long stat[MEMCG_NR_STAT];
+         struct mem_cgroup *mi;
+         int node, cpu, i;
+
+         for (i = 0; i < MEMCG_NR_STAT; i++)
+                 stat[i] = 0;
+
+         for_each_online_cpu(cpu)
+                 for (i = 0; i < MEMCG_NR_STAT; i++)
+                         stat[i] += raw_cpu_read(memcg->vmstats_percpu->stat[i]);
+
+         for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+                 for (i = 0; i < MEMCG_NR_STAT; i++)
+                         atomic_long_add(stat[i], &mi->vmstats[i]);
+
+         for_each_node(node) {
+                 struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
+                 struct mem_cgroup_per_node *pi;
+
+                 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+                         stat[i] = 0;
+
+                 for_each_online_cpu(cpu)
+                         for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+                                 stat[i] += raw_cpu_read(
+                                         pn->lruvec_stat_cpu->count[i]);
+
+                 for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
+                         for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+                                 atomic_long_add(stat[i], &pi->lruvec_stat[i]);
+         }
+ }
+
+ static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
+ {
+         unsigned long events[NR_VM_EVENT_ITEMS];
+         struct mem_cgroup *mi;
+         int cpu, i;
+
+         for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+                 events[i] = 0;
+
+         for_each_online_cpu(cpu)
+                 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+                         events[i] += raw_cpu_read(
+                                 memcg->vmstats_percpu->events[i]);
+
+         for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+                 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+                         atomic_long_add(events[i], &mi->vmevents[i]);
+ }
+
  #ifdef CONFIG_MEMCG_KMEM
  static int memcg_online_kmem(struct mem_cgroup *memcg)
  {
···
  {
          int node;

+         /*
+          * Flush percpu vmstats and vmevents to guarantee the value correctness
+          * on parent's and all ancestor levels.
+          */
+         memcg_flush_percpu_vmstats(memcg);
+         memcg_flush_percpu_vmevents(memcg);
          for_each_node(node)
                  free_mem_cgroup_per_node_info(memcg, node);
          free_percpu(memcg->vmstats_percpu);
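
Note: both flush helpers have the same shape: sum the per-cpu deltas into a local array, then add the result to the atomic counters of the cgroup and of every ancestor, so nothing is lost once the percpu area is freed. A self-contained sketch of that fold-and-propagate step (simplified types and field names, not the actual memcg structures):

/* Sketch of flushing per-cpu counters up a parent chain before teardown.
 * Build with: cc -std=c11 flush.c */
#include <stdatomic.h>
#include <stdio.h>

#define NR_CPUS         4
#define NR_STATS        2

struct group {
        struct group *parent;
        long percpu[NR_CPUS][NR_STATS]; /* stand-in for vmstats_percpu */
        atomic_long vmstats[NR_STATS];  /* stand-in for memcg->vmstats */
};

static void flush_percpu(struct group *g)
{
        long sum[NR_STATS] = { 0 };
        int cpu, i;

        /* Fold the per-cpu deltas into one local array ... */
        for (cpu = 0; cpu < NR_CPUS; cpu++)
                for (i = 0; i < NR_STATS; i++)
                        sum[i] += g->percpu[cpu][i];

        /* ... then charge it to this group and every ancestor. */
        for (struct group *p = g; p; p = p->parent)
                for (i = 0; i < NR_STATS; i++)
                        atomic_fetch_add(&p->vmstats[i], sum[i]);
}

int main(void)
{
        struct group root = { 0 };
        struct group child = { .parent = &root };

        child.percpu[0][0] = 3;
        child.percpu[1][0] = 4;
        flush_percpu(&child);

        printf("child=%ld root=%ld\n",
               atomic_load(&child.vmstats[0]), atomic_load(&root.vmstats[0]));
        return 0;
}
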
+4 -15
mm/page_alloc.c
···
          unsigned int order;
          int pages_moved = 0;

- #ifndef CONFIG_HOLES_IN_ZONE
-         /*
-          * page_zone is not safe to call in this context when
-          * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
-          * anyway as we check zone boundaries in move_freepages_block().
-          * Remove at a later date when no bug reports exist related to
-          * grouping pages by mobility
-          */
-         VM_BUG_ON(pfn_valid(page_to_pfn(start_page)) &&
-                   pfn_valid(page_to_pfn(end_page)) &&
-                   page_zone(start_page) != page_zone(end_page));
- #endif
          for (page = start_page; page <= end_page;) {
                  if (!pfn_valid_within(page_to_pfn(page))) {
                          page++;
                          continue;
                  }
-
-                 /* Make sure we are not inadvertently changing nodes */
-                 VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);

                  if (!PageBuddy(page)) {
                          /*
···
                          page++;
                          continue;
                  }
+
+                 /* Make sure we are not inadvertently changing nodes */
+                 VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
+                 VM_BUG_ON_PAGE(page_zone(page) != zone, page);

                  order = page_order(page);
                  move_to_free_area(page, &zone->free_area[order], migratetype);
+89
mm/z3fold.c
···
  #include <linux/workqueue.h>
  #include <linux/slab.h>
  #include <linux/spinlock.h>
+ #include <linux/wait.h>
  #include <linux/zpool.h>
  #include <linux/magic.h>

···
   * @release_wq: workqueue for safe page release
   * @work:       work_struct for safe page release
   * @inode:      inode for z3fold pseudo filesystem
+  * @destroying: bool to stop migration once we start destruction
+  * @isolated:   int to count the number of pages currently in isolation
   *
   * This structure is allocated at pool creation time and maintains metadata
   * pertaining to a particular z3fold pool.
···
          const struct zpool_ops *zpool_ops;
          struct workqueue_struct *compact_wq;
          struct workqueue_struct *release_wq;
+         struct wait_queue_head isolate_wait;
          struct work_struct work;
          struct inode *inode;
+         bool destroying;
+         int isolated;
  };

  /*
···
                  goto out_c;
          spin_lock_init(&pool->lock);
          spin_lock_init(&pool->stale_lock);
+         init_waitqueue_head(&pool->isolate_wait);
          pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2);
          if (!pool->unbuddied)
                  goto out_pool;
···
          return NULL;
  }

+ static bool pool_isolated_are_drained(struct z3fold_pool *pool)
+ {
+         bool ret;
+
+         spin_lock(&pool->lock);
+         ret = pool->isolated == 0;
+         spin_unlock(&pool->lock);
+         return ret;
+ }
  /**
   * z3fold_destroy_pool() - destroys an existing z3fold pool
   * @pool:       the z3fold pool to be destroyed
···
  static void z3fold_destroy_pool(struct z3fold_pool *pool)
  {
          kmem_cache_destroy(pool->c_handle);
+         /*
+          * We set pool->destroying under lock to ensure that
+          * z3fold_page_isolate() sees any changes to destroying. This way we
+          * avoid the need for any memory barriers.
+          */
+
+         spin_lock(&pool->lock);
+         pool->destroying = true;
+         spin_unlock(&pool->lock);
+
+         /*
+          * We need to ensure that no pages are being migrated while we destroy
+          * these workqueues, as migration can queue work on either of the
+          * workqueues.
+          */
+         wait_event(pool->isolate_wait, !pool_isolated_are_drained(pool));

          /*
           * We need to destroy pool->compact_wq before pool->release_wq,
···
          return atomic64_read(&pool->pages_nr);
  }

+ /*
+  * z3fold_dec_isolated() expects to be called while pool->lock is held.
+  */
+ static void z3fold_dec_isolated(struct z3fold_pool *pool)
+ {
+         assert_spin_locked(&pool->lock);
+         VM_BUG_ON(pool->isolated <= 0);
+         pool->isolated--;
+
+         /*
+          * If we have no more isolated pages, we have to see if
+          * z3fold_destroy_pool() is waiting for a signal.
+          */
+         if (pool->isolated == 0 && waitqueue_active(&pool->isolate_wait))
+                 wake_up_all(&pool->isolate_wait);
+ }
+
+ static void z3fold_inc_isolated(struct z3fold_pool *pool)
+ {
+         pool->isolated++;
+ }
+
  static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode)
  {
          struct z3fold_header *zhdr;
···
                  spin_lock(&pool->lock);
                  if (!list_empty(&page->lru))
                          list_del(&page->lru);
+                 /*
+                  * We need to check for destruction while holding pool->lock, as
+                  * otherwise destruction could see 0 isolated pages, and
+                  * proceed.
+                  */
+                 if (unlikely(pool->destroying)) {
+                         spin_unlock(&pool->lock);
+                         /*
+                          * If this page isn't stale, somebody else holds a
+                          * reference to it. Let's drop our refcount so that they
+                          * can call the release logic.
+                          */
+                         if (unlikely(kref_put(&zhdr->refcount,
+                                               release_z3fold_page_locked))) {
+                                 /*
+                                  * If we get here we have kref problems, so we
+                                  * should freak out.
+                                  */
+                                 WARN(1, "Z3fold is experiencing kref problems\n");
+                                 return false;
+                         }
+                         z3fold_page_unlock(zhdr);
+                         return false;
+                 }
+
+
+                 z3fold_inc_isolated(pool);
                  spin_unlock(&pool->lock);
                  z3fold_page_unlock(zhdr);
                  return true;
···

          queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work);

+         spin_lock(&pool->lock);
+         z3fold_dec_isolated(pool);
+         spin_unlock(&pool->lock);
+
          page_mapcount_reset(page);
          put_page(page);
          return 0;
···
          INIT_LIST_HEAD(&page->lru);
          if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
                  atomic64_dec(&pool->pages_nr);
+                 spin_lock(&pool->lock);
+                 z3fold_dec_isolated(pool);
+                 spin_unlock(&pool->lock);
                  return;
          }
          spin_lock(&pool->lock);
          list_add(&page->lru, &pool->lru);
+         z3fold_dec_isolated(pool);
          spin_unlock(&pool->lock);
          z3fold_page_unlock(zhdr);
  }
+73 -5
mm/zsmalloc.c
···
  #include <linux/mount.h>
  #include <linux/pseudo_fs.h>
  #include <linux/migrate.h>
+ #include <linux/wait.h>
  #include <linux/pagemap.h>
  #include <linux/fs.h>

···
  #ifdef CONFIG_COMPACTION
          struct inode *inode;
          struct work_struct free_work;
+         /* A wait queue for when migration races with async_free_zspage() */
+         struct wait_queue_head migration_wait;
+         atomic_long_t isolated_pages;
+         bool destroying;
  #endif
  };

···
          zspage->isolated--;
  }

+ static void putback_zspage_deferred(struct zs_pool *pool,
+                                     struct size_class *class,
+                                     struct zspage *zspage)
+ {
+         enum fullness_group fg;
+
+         fg = putback_zspage(class, zspage);
+         if (fg == ZS_EMPTY)
+                 schedule_work(&pool->free_work);
+
+ }
+
+ static inline void zs_pool_dec_isolated(struct zs_pool *pool)
+ {
+         VM_BUG_ON(atomic_long_read(&pool->isolated_pages) <= 0);
+         atomic_long_dec(&pool->isolated_pages);
+         /*
+          * There's no possibility of racing, since wait_for_isolated_drain()
+          * checks the isolated count under &class->lock after enqueuing
+          * on migration_wait.
+          */
+         if (atomic_long_read(&pool->isolated_pages) == 0 && pool->destroying)
+                 wake_up_all(&pool->migration_wait);
+ }
+
  static void replace_sub_page(struct size_class *class, struct zspage *zspage,
                                  struct page *newpage, struct page *oldpage)
  {
···
           */
          if (!list_empty(&zspage->list) && !is_zspage_isolated(zspage)) {
                  get_zspage_mapping(zspage, &class_idx, &fullness);
+                 atomic_long_inc(&pool->isolated_pages);
                  remove_zspage(class, zspage, fullness);
          }

···
           * Page migration is done so let's putback isolated zspage to
           * the list if @page is final isolated subpage in the zspage.
           */
-         if (!is_zspage_isolated(zspage))
-                 putback_zspage(class, zspage);
+         if (!is_zspage_isolated(zspage)) {
+                 /*
+                  * We cannot race with zs_destroy_pool() here because we wait
+                  * for isolation to hit zero before we start destroying.
+                  * Also, we ensure that everyone can see pool->destroying before
+                  * we start waiting.
+                  */
+                 putback_zspage_deferred(pool, class, zspage);
+                 zs_pool_dec_isolated(pool);
+         }

          reset_page(page);
          put_page(page);
···
          spin_lock(&class->lock);
          dec_zspage_isolation(zspage);
          if (!is_zspage_isolated(zspage)) {
-                 fg = putback_zspage(class, zspage);
                  /*
                   * Due to page_lock, we cannot free zspage immediately
                   * so let's defer.
                   */
-                 if (fg == ZS_EMPTY)
-                         schedule_work(&pool->free_work);
+                 putback_zspage_deferred(pool, class, zspage);
+                 zs_pool_dec_isolated(pool);
          }
          spin_unlock(&class->lock);
  }
···
          return 0;
  }

+ static bool pool_isolated_are_drained(struct zs_pool *pool)
+ {
+         return atomic_long_read(&pool->isolated_pages) == 0;
+ }
+
+ /* Function for resolving migration */
+ static void wait_for_isolated_drain(struct zs_pool *pool)
+ {
+
+         /*
+          * We're in the process of destroying the pool, so there are no
+          * active allocations. zs_page_isolate() fails for completely free
+          * zspages, so we need only wait for the zs_pool's isolated
+          * count to hit zero.
+          */
+         wait_event(pool->migration_wait,
+                    pool_isolated_are_drained(pool));
+ }
+
  static void zs_unregister_migration(struct zs_pool *pool)
  {
+         pool->destroying = true;
+         /*
+          * We need a memory barrier here to ensure global visibility of
+          * pool->destroying. Thus pool->isolated pages will either be 0 in which
+          * case we don't care, or it will be > 0 and pool->destroying will
+          * ensure that we wake up once isolation hits 0.
+          */
+         smp_mb();
+         wait_for_isolated_drain(pool); /* This can block */
          flush_work(&pool->free_work);
          iput(pool->inode);
  }
···
          pool->name = kstrdup(name, GFP_KERNEL);
          if (!pool->name)
                  goto err;
+
+         init_waitqueue_head(&pool->migration_wait);

          if (create_cache(pool))
                  goto err;
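
Note: the z3fold and zsmalloc fixes share one shutdown handshake: the isolation paths count themselves in, pool destruction publishes a destroying flag (under pool->lock in z3fold, with an smp_mb() plus lockless atomics in zsmalloc) and then sleeps until the isolated count drains to zero before the workqueues are torn down; z3fold additionally refuses new isolations once destroying is set, while zsmalloc relies on there being no active allocations at destroy time. A compact userspace model of that handshake, with pthreads standing in for the kernel wait queue and locks (all names illustrative):

/* Userspace model of the isolate/destroy handshake used by both fixes.
 * Build with: cc -std=c11 -pthread drain.c */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t drained = PTHREAD_COND_INITIALIZER;
static int isolated;            /* pages currently isolated for migration */
static bool destroying;         /* set once pool destruction has started */

static bool try_isolate(void)
{
        bool ok;

        pthread_mutex_lock(&lock);
        /* z3fold-style: refuse new isolations once teardown has begun. */
        ok = !destroying;
        if (ok)
                isolated++;
        pthread_mutex_unlock(&lock);
        return ok;
}

static void put_back(void)
{
        pthread_mutex_lock(&lock);
        /* Last one out wakes the destroyer (wake_up_all in the kernel). */
        if (--isolated == 0)
                pthread_cond_broadcast(&drained);
        pthread_mutex_unlock(&lock);
}

static void destroy_pool(void)
{
        pthread_mutex_lock(&lock);
        destroying = true;
        /* The wait_event() analogue: sleep until every page is put back. */
        while (isolated > 0)
                pthread_cond_wait(&drained, &lock);
        pthread_mutex_unlock(&lock);
        printf("drained; safe to destroy workqueues\n");
}

int main(void)
{
        if (try_isolate())
                put_back();
        destroy_pool();
        return 0;
}

The kernel versions split the counter updates across the isolate, putback and migrate paths, but the ordering guarantee is the same: destruction does not proceed while any page is still isolated.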