mm: thrash detection-based file cache sizing

The VM maintains cached filesystem pages on two types of lists. One
list holds the pages recently faulted into the cache, the other list
holds pages that have been referenced repeatedly on that first list.
The idea is to prefer reclaiming young pages over those that have
shown to benefit from caching in the past. We call the recently used
list "inactive list" and the frequently used list "active list".

Currently, the VM aims for a 1:1 ratio between the lists, which is
the "perfect" trade-off between the ability to *protect* frequently
used pages and the ability to *detect* frequently used pages. This
means that working set changes bigger than half of cache memory go
undetected and thrash indefinitely, whereas working sets bigger than
half of cache memory are unprotected against used-once streams that
don't even need caching.

Historically, every reclaim scan of the inactive list also took a
smaller number of pages from the tail of the active list and moved
them to the head of the inactive list. This model gave established
working sets more grace time in the face of temporary use-once
streams, but ultimately was not significantly better than a FIFO
policy and still thrashed cache based on eviction speed, rather than
actual demand for cache.

This patch solves one half of the problem by decoupling the ability to
detect working set changes from the inactive list size. By maintaining
a history of recently evicted file pages it can detect frequently used
pages with an arbitrarily small inactive list size, and subsequently
apply pressure on the active list based on actual demand for cache, not
just overall eviction speed.

Every zone maintains a counter that tracks inactive list aging speed.
When a page is evicted, a snapshot of this counter is stored in the
now-empty page cache radix tree slot. On refault, the minimum access
distance of the page can be assessed, to evaluate whether the page
should be part of the active list or not.
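
In userspace terms, the bookkeeping amounts to a counter snapshot and
an unsigned subtraction. The following is a minimal standalone sketch
of that logic, not kernel code; struct zone_model, model_evict() and
model_refault() are illustrative names, and the real implementation
in mm/workingset.c below additionally packs zone-identifying bits
into the shadow entry and masks the distance against counter
wraparound:

#include <stdbool.h>
#include <stdio.h>

struct zone_model {
	unsigned long inactive_age;	/* sum of evictions + activations */
	unsigned long nr_active;	/* current size of the active list */
};

/* Page is evicted: advance the counter, return the snapshot (E). */
static unsigned long model_evict(struct zone_model *z)
{
	return ++z->inactive_age;
}

/*
 * Page faults back in: the current reading (R) minus the snapshot (E)
 * is the minimum number of inactive-list accesses the page missed.
 */
static bool model_refault(struct zone_model *z, unsigned long shadow)
{
	unsigned long refault_distance = z->inactive_age - shadow;

	return refault_distance <= z->nr_active;
}

int main(void)
{
	struct zone_model z = { .inactive_age = 0, .nr_active = 4 };
	unsigned long shadow = model_evict(&z);		/* E = 1 */

	z.inactive_age += 3;	/* three more evictions/activations */

	/* R - E = 4 - 1 = 3 <= NR_active = 4: activate on refault */
	printf("activate: %s\n", model_refault(&z, shadow) ? "yes" : "no");
	return 0;
}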

This fixes the VM's blindness towards working set changes in excess
of the inactive list. And it's the foundation for further improving
the protection ability and for reducing the minimum inactive list
size from the current 50%.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Rik van Riel <riel@redhat.com>
Reviewed-by: Minchan Kim <minchan@kernel.org>
Reviewed-by: Bob Liu <bob.liu@oracle.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Luigi Semenzato <semenzato@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Metin Doslu <metin@citusdata.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Ozgun Erdogan <ozgun@citusdata.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Roman Gushchin <klamm@yandex-team.ru>
Cc: Ryan Mallon <rmallon@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Authored by Johannes Weiner, committed by Linus Torvalds
a528910e 91b0abe3

+331 -23
include/linux/mmzone.h (+5)

···
 	NUMA_LOCAL,		/* allocation from local node */
 	NUMA_OTHER,		/* allocation from other node */
 #endif
+	WORKINGSET_REFAULT,
+	WORKINGSET_ACTIVATE,
 	NR_ANON_TRANSPARENT_HUGEPAGES,
 	NR_FREE_CMA_PAGES,
 	NR_VM_ZONE_STAT_ITEMS };
···
 	/* Fields commonly accessed by the page reclaim scanner */
 	spinlock_t		lru_lock;
 	struct lruvec		lruvec;
+
+	/* Evictions & activations on the inactive file list */
+	atomic_long_t		inactive_age;

 	unsigned long		pages_scanned;	   /* since last reclaim */
 	unsigned long		flags;		   /* zone flags, see below */
include/linux/swap.h (+5)

···
 	int next;			/* swapfile to be used next */
 };

+/* linux/mm/workingset.c */
+void *workingset_eviction(struct address_space *mapping, struct page *page);
+bool workingset_refault(void *shadow);
+void workingset_activation(struct page *page);
+
 /* linux/mm/page_alloc.c */
 extern unsigned long totalram_pages;
 extern unsigned long totalreserve_pages;
mm/Makefile (+1 -1)

···
 	   util.o mmzone.o vmstat.o backing-dev.o \
 	   mm_init.o mmu_context.o percpu.o slab_common.o \
 	   compaction.o balloon_compaction.o \
-	   interval_tree.o list_lru.o $(mmu-y)
+	   interval_tree.o list_lru.o workingset.o $(mmu-y)

 obj-y += init-mm.o
mm/filemap.c (+44 -17)

···
 EXPORT_SYMBOL_GPL(replace_page_cache_page);

 static int page_cache_tree_insert(struct address_space *mapping,
-				  struct page *page)
+				  struct page *page, void **shadowp)
 {
 	void **slot;
 	int error;
···
 		radix_tree_replace_slot(slot, page);
 		mapping->nrshadows--;
 		mapping->nrpages++;
+		if (shadowp)
+			*shadowp = p;
 		return 0;
 	}
 	error = radix_tree_insert(&mapping->page_tree, page->index, page);
···
 	return error;
 }

-/**
- * add_to_page_cache_locked - add a locked page to the pagecache
- * @page: page to add
- * @mapping: the page's address_space
- * @offset: page index
- * @gfp_mask: page allocation mode
- *
- * This function is used to add a page to the pagecache. It must be locked.
- * This function does not add the page to the LRU. The caller must do that.
- */
-int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
-			     pgoff_t offset, gfp_t gfp_mask)
+static int __add_to_page_cache_locked(struct page *page,
+				      struct address_space *mapping,
+				      pgoff_t offset, gfp_t gfp_mask,
+				      void **shadowp)
 {
 	int error;
···
 	page->index = offset;

 	spin_lock_irq(&mapping->tree_lock);
-	error = page_cache_tree_insert(mapping, page);
+	error = page_cache_tree_insert(mapping, page, shadowp);
 	radix_tree_preload_end();
 	if (unlikely(error))
 		goto err_insert;
···
 	page_cache_release(page);
 	return error;
 }
+
+/**
+ * add_to_page_cache_locked - add a locked page to the pagecache
+ * @page: page to add
+ * @mapping: the page's address_space
+ * @offset: page index
+ * @gfp_mask: page allocation mode
+ *
+ * This function is used to add a page to the pagecache. It must be locked.
+ * This function does not add the page to the LRU. The caller must do that.
+ */
+int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
+			     pgoff_t offset, gfp_t gfp_mask)
+{
+	return __add_to_page_cache_locked(page, mapping, offset,
+					  gfp_mask, NULL);
+}
 EXPORT_SYMBOL(add_to_page_cache_locked);

 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 			  pgoff_t offset, gfp_t gfp_mask)
 {
+	void *shadow = NULL;
 	int ret;

-	ret = add_to_page_cache(page, mapping, offset, gfp_mask);
-	if (ret == 0)
-		lru_cache_add_file(page);
+	__set_page_locked(page);
+	ret = __add_to_page_cache_locked(page, mapping, offset,
+					 gfp_mask, &shadow);
+	if (unlikely(ret))
+		__clear_page_locked(page);
+	else {
+		/*
+		 * The page might have been evicted from cache only
+		 * recently, in which case it should be activated like
+		 * any other repeatedly accessed page.
+		 */
+		if (shadow && workingset_refault(shadow)) {
+			SetPageActive(page);
+			workingset_activation(page);
+		} else
+			ClearPageActive(page);
+		lru_cache_add(page);
+	}
 	return ret;
 }
 EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
mm/swap.c (+2)

···
 		else
 			__lru_cache_activate_page(page);
 		ClearPageReferenced(page);
+		if (page_is_file_cache(page))
+			workingset_activation(page);
 	} else if (!PageReferenced(page)) {
 		SetPageReferenced(page);
 	}
mm/vmscan.c (+19 -5)

···
  * Same as remove_mapping, but if the page is removed from the mapping, it
  * gets returned with a refcount of 0.
  */
-static int __remove_mapping(struct address_space *mapping, struct page *page)
+static int __remove_mapping(struct address_space *mapping, struct page *page,
+			    bool reclaimed)
 {
 	BUG_ON(!PageLocked(page));
 	BUG_ON(mapping != page_mapping(page));
···
 		swapcache_free(swap, page);
 	} else {
 		void (*freepage)(struct page *);
+		void *shadow = NULL;

 		freepage = mapping->a_ops->freepage;
-
-		__delete_from_page_cache(page, NULL);
+		/*
+		 * Remember a shadow entry for reclaimed file cache in
+		 * order to detect refaults, thus thrashing, later on.
+		 *
+		 * But don't store shadows in an address space that is
+		 * already exiting.  This is not just an optimization,
+		 * inode reclaim needs to empty out the radix tree or
+		 * the nodes are lost.  Don't plant shadows behind its
+		 * back.
+		 */
+		if (reclaimed && page_is_file_cache(page) &&
+		    !mapping_exiting(mapping))
+			shadow = workingset_eviction(mapping, page);
+		__delete_from_page_cache(page, shadow);
 		spin_unlock_irq(&mapping->tree_lock);
 		mem_cgroup_uncharge_cache_page(page);
···
  */
 int remove_mapping(struct address_space *mapping, struct page *page)
 {
-	if (__remove_mapping(mapping, page)) {
+	if (__remove_mapping(mapping, page, false)) {
 		/*
 		 * Unfreezing the refcount with 1 rather than 2 effectively
 		 * drops the pagecache ref for us without requiring another
···
 		}
 	}

-	if (!mapping || !__remove_mapping(mapping, page))
+	if (!mapping || !__remove_mapping(mapping, page, true))
 		goto keep_locked;

 	/*
mm/vmstat.c (+2)

···
 	"numa_local",
 	"numa_other",
 #endif
+	"workingset_refault",
+	"workingset_activate",
 	"nr_anon_transparent_hugepages",
 	"nr_free_cma",
 	"nr_dirty_threshold",
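
On a kernel carrying this patch, the two new counters appear in
/proc/vmstat under the names added above. A standalone reader like
the following, which is just an illustration and not part of the
patch, can be used to watch them:

/* Print the workingset_* counters from /proc/vmstat, if present. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[128];
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f) {
		perror("/proc/vmstat");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		if (!strncmp(line, "workingset_", 11))
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}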
mm/workingset.c (+253, new file)

/*
 * Workingset detection
 *
 * Copyright (C) 2013 Red Hat, Inc., Johannes Weiner
 */

#include <linux/memcontrol.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>
#include <linux/atomic.h>
#include <linux/module.h>
#include <linux/swap.h>
#include <linux/fs.h>
#include <linux/mm.h>

/*
 *		Double CLOCK lists
 *
 * Per zone, two clock lists are maintained for file pages: the
 * inactive and the active list.  Freshly faulted pages start out at
 * the head of the inactive list and page reclaim scans pages from the
 * tail.  Pages that are accessed multiple times on the inactive list
 * are promoted to the active list, to protect them from reclaim,
 * whereas active pages are demoted to the inactive list when the
 * active list grows too big.
 *
 *   fault ------------------------+
 *                                 |
 *              +--------------+   |            +-------------+
 *   reclaim <- |   inactive   | <-+-- demotion |    active   | <--+
 *              +--------------+                +-------------+    |
 *                     |                                           |
 *                     +-------------- promotion ------------------+
 *
 *
 *		Access frequency and refault distance
 *
 * A workload is thrashing when its pages are frequently used but they
 * are evicted from the inactive list every time before another access
 * would have promoted them to the active list.
 *
 * In cases where the average access distance between thrashing pages
 * is bigger than the size of memory there is nothing that can be
 * done - the thrashing set could never fit into memory under any
 * circumstance.
 *
 * However, the average access distance could be bigger than the
 * inactive list, yet smaller than the size of memory.  In this case,
 * the set could fit into memory if it weren't for the currently
 * active pages - which may be used more, hopefully less frequently:
 *
 *      +-memory available to cache-+
 *      |                           |
 *      +-inactive------+-active----+
 *  a b | c d e f g h i | J K L M N |
 *      +---------------+-----------+
 *
 * It is prohibitively expensive to accurately track access frequency
 * of pages.  But a reasonable approximation can be made to measure
 * thrashing on the inactive list, after which refaulting pages can be
 * activated optimistically to compete with the existing active pages.
 *
 * Approximating inactive page access frequency - Observations:
 *
 * 1. When a page is accessed for the first time, it is added to the
 *    head of the inactive list, slides every existing inactive page
 *    towards the tail by one slot, and pushes the current tail page
 *    out of memory.
 *
 * 2. When a page is accessed for the second time, it is promoted to
 *    the active list, shrinking the inactive list by one slot.  This
 *    also slides all inactive pages that were faulted into the cache
 *    more recently than the activated page towards the tail of the
 *    inactive list.
 *
 * Thus:
 *
 * 1. The sum of evictions and activations between any two points in
 *    time indicates the minimum number of inactive pages accessed in
 *    between.
 *
 * 2. Moving one inactive page N page slots towards the tail of the
 *    list requires at least N inactive page accesses.
 *
 * Combining these:
 *
 * 1. When a page is finally evicted from memory, the number of
 *    inactive pages accessed while the page was in cache is at least
 *    the number of page slots on the inactive list.
 *
 * 2. In addition, measuring the sum of evictions and activations (E)
 *    at the time of a page's eviction, and comparing it to another
 *    reading (R) at the time the page faults back into memory tells
 *    the minimum number of accesses while the page was not cached.
 *    This is called the refault distance.
 *
 * Because the first access of the page was the fault and the second
 * access the refault, we combine the in-cache distance with the
 * out-of-cache distance to get the complete minimum access distance
 * of this page:
 *
 *      NR_inactive + (R - E)
 *
 * And knowing the minimum access distance of a page, we can easily
 * tell if the page would be able to stay in cache assuming all page
 * slots in the cache were available:
 *
 *   NR_inactive + (R - E) <= NR_inactive + NR_active
 *
 * which can be further simplified to
 *
 *   (R - E) <= NR_active
 *
 * Put into words, the refault distance (out-of-cache) can be seen as
 * a deficit in inactive list space (in-cache).  If the inactive list
 * had (R - E) more page slots, the page would not have been evicted
 * in between accesses, but activated instead.  And on a full system,
 * the only thing eating into inactive list space is active pages.
 *
 *
 *		Activating refaulting pages
 *
 * All that is known about the active list is that the pages have been
 * accessed more than once in the past.  This means that at any given
 * time there is actually a good chance that pages on the active list
 * are no longer in active use.
 *
 * So when a refault distance of (R - E) is observed and there are at
 * least (R - E) active pages, the refaulting page is activated
 * optimistically in the hope that (R - E) active pages are actually
 * used less frequently than the refaulting page - or even not used at
 * all anymore.
 *
 * If this is wrong and demotion kicks in, the pages which are truly
 * used more frequently will be reactivated while the less frequently
 * used ones will be evicted from memory.
 *
 * But if this is right, the stale pages will be pushed out of memory
 * and the used pages get to stay in cache.
 *
 *
 *		Implementation
 *
 * For each zone's file LRU lists, a counter for inactive evictions
 * and activations is maintained (zone->inactive_age).
 *
 * On eviction, a snapshot of this counter (along with some bits to
 * identify the zone) is stored in the now empty page cache radix tree
 * slot of the evicted page.  This is called a shadow entry.
 *
 * On cache misses for which there are shadow entries, an eligible
 * refault distance will immediately activate the refaulting page.
 */

static void *pack_shadow(unsigned long eviction, struct zone *zone)
{
	eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone);
	eviction = (eviction << ZONES_SHIFT) | zone_idx(zone);
	eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);

	return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY);
}

static void unpack_shadow(void *shadow,
			  struct zone **zone,
			  unsigned long *distance)
{
	unsigned long entry = (unsigned long)shadow;
	unsigned long eviction;
	unsigned long refault;
	unsigned long mask;
	int zid, nid;

	entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
	zid = entry & ((1UL << ZONES_SHIFT) - 1);
	entry >>= ZONES_SHIFT;
	nid = entry & ((1UL << NODES_SHIFT) - 1);
	entry >>= NODES_SHIFT;
	eviction = entry;

	*zone = NODE_DATA(nid)->node_zones + zid;

	refault = atomic_long_read(&(*zone)->inactive_age);
	mask = ~0UL >> (NODES_SHIFT + ZONES_SHIFT +
			RADIX_TREE_EXCEPTIONAL_SHIFT);
	/*
	 * The unsigned subtraction here gives an accurate distance
	 * across inactive_age overflows in most cases.
	 *
	 * There is a special case: usually, shadow entries have a
	 * short lifetime and are either refaulted or reclaimed along
	 * with the inode before they get too old.  But it is not
	 * impossible for the inactive_age to lap a shadow entry in
	 * the field, which can then result in a false small refault
	 * distance, leading to a false activation should this old
	 * entry actually refault again.  However, earlier kernels
	 * used to deactivate unconditionally with *every* reclaim
	 * invocation for the longest time, so the occasional
	 * inappropriate activation leading to pressure on the active
	 * list is not a problem.
	 */
	*distance = (refault - eviction) & mask;
}

/**
 * workingset_eviction - note the eviction of a page from memory
 * @mapping: address space the page was backing
 * @page: the page being evicted
 *
 * Returns a shadow entry to be stored in @mapping->page_tree in place
 * of the evicted @page so that a later refault can be detected.
 */
void *workingset_eviction(struct address_space *mapping, struct page *page)
{
	struct zone *zone = page_zone(page);
	unsigned long eviction;

	eviction = atomic_long_inc_return(&zone->inactive_age);
	return pack_shadow(eviction, zone);
}

/**
 * workingset_refault - evaluate the refault of a previously evicted page
 * @shadow: shadow entry of the evicted page
 *
 * Calculates and evaluates the refault distance of the previously
 * evicted page in the context of the zone it was allocated in.
 *
 * Returns %true if the page should be activated, %false otherwise.
 */
bool workingset_refault(void *shadow)
{
	unsigned long refault_distance;
	struct zone *zone;

	unpack_shadow(shadow, &zone, &refault_distance);
	inc_zone_state(zone, WORKINGSET_REFAULT);

	if (refault_distance <= zone_page_state(zone, NR_ACTIVE_FILE)) {
		inc_zone_state(zone, WORKINGSET_ACTIVATE);
		return true;
	}
	return false;
}

/**
 * workingset_activation - note a page activation
 * @page: page that is being activated
 */
void workingset_activation(struct page *page)
{
	atomic_long_inc(&page_zone(page)->inactive_age);
}
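
To make the wraparound arithmetic in unpack_shadow() concrete, here
is a standalone demonstration of the same masked subtraction, in a
simplified form where both readings live in the truncated counter
space. The shift widths are stand-ins (the DEMO_* constants are
hypothetical; real NODES_SHIFT and ZONES_SHIFT values depend on the
kernel configuration):

/*
 * Demonstrates that (refault - eviction) & mask stays correct when
 * the truncated counter field wraps around past zero.
 */
#include <stdio.h>

#define DEMO_NODES_SHIFT	10
#define DEMO_ZONES_SHIFT	2
#define DEMO_EXCEPTIONAL_SHIFT	2

int main(void)
{
	unsigned long mask = ~0UL >> (DEMO_NODES_SHIFT + DEMO_ZONES_SHIFT +
				      DEMO_EXCEPTIONAL_SHIFT);
	/* Snapshot taken just before the stored counter field wraps... */
	unsigned long eviction = mask - 1;
	/* ...and the counter read again after it has wrapped past zero. */
	unsigned long refault = 2;
	unsigned long distance = (refault - eviction) & mask;

	/* mask-1 -> mask -> 0 -> 1 -> 2 is four ticks: prints 4 */
	printf("mask=%#lx distance=%lu\n", mask, distance);
	return 0;
}

Even though the counter has lapped zero between the two readings, the
masked unsigned subtraction recovers the true distance of 4 ticks,
which is the "accurate distance across inactive_age overflows" that
the comment in unpack_shadow() describes.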