Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
mm/frontswap.c at v5.13 (501 lines, 15 kB)

// SPDX-License-Identifier: GPL-2.0-only
/*
 * Frontswap frontend
 *
 * This code provides the generic "frontend" layer to call a matching
 * "backend" driver implementation of frontswap. See
 * Documentation/vm/frontswap.rst for more information.
 *
 * Copyright (C) 2009-2012 Oracle Corp. All rights reserved.
 * Author: Dan Magenheimer
 */

#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/security.h>
#include <linux/module.h>
#include <linux/debugfs.h>
#include <linux/frontswap.h>
#include <linux/swapfile.h>

DEFINE_STATIC_KEY_FALSE(frontswap_enabled_key);

/*
 * frontswap_ops are added by frontswap_register_ops, and provide the
 * frontswap "backend" implementation functions. Multiple implementations
 * may be registered, but implementations can never deregister. This
 * is a simple singly-linked list of all registered implementations.
 */
static struct frontswap_ops *frontswap_ops __read_mostly;

#define for_each_frontswap_ops(ops)		\
	for ((ops) = frontswap_ops; (ops); (ops) = (ops)->next)

/*
 * If enabled, frontswap_store will return failure even on success. As
 * a result, the swap subsystem will always write the page to swap, in
 * effect converting frontswap into a writethrough cache. In this mode,
 * there is no direct reduction in swap writes, but a frontswap backend
 * can unilaterally "reclaim" any pages in use with no data loss, thus
 * providing increased control over maximum memory usage due to frontswap.
 */
static bool frontswap_writethrough_enabled __read_mostly;

/*
 * If enabled, the underlying tmem implementation is capable of doing
 * exclusive gets, so frontswap_load, on a successful tmem_get must
 * mark the page as no longer in frontswap AND mark it dirty.
 */
static bool frontswap_tmem_exclusive_gets_enabled __read_mostly;

#ifdef CONFIG_DEBUG_FS
/*
 * Counters available via /sys/kernel/debug/frontswap (if debugfs is
 * properly configured). These are for information only so are not protected
 * against increment races.
 */
static u64 frontswap_loads;
static u64 frontswap_succ_stores;
static u64 frontswap_failed_stores;
static u64 frontswap_invalidates;

static inline void inc_frontswap_loads(void)
{
	data_race(frontswap_loads++);
}
static inline void inc_frontswap_succ_stores(void)
{
	data_race(frontswap_succ_stores++);
}
static inline void inc_frontswap_failed_stores(void)
{
	data_race(frontswap_failed_stores++);
}
static inline void inc_frontswap_invalidates(void)
{
	data_race(frontswap_invalidates++);
}
#else
static inline void inc_frontswap_loads(void) { }
static inline void inc_frontswap_succ_stores(void) { }
static inline void inc_frontswap_failed_stores(void) { }
static inline void inc_frontswap_invalidates(void) { }
#endif

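/*
 * With CONFIG_DEBUG_FS enabled, the four counters above are exposed as
 * read-only files by init_frontswap() at the bottom of this file:
 *
 *	/sys/kernel/debug/frontswap/loads
 *	/sys/kernel/debug/frontswap/succ_stores
 *	/sys/kernel/debug/frontswap/failed_stores
 *	/sys/kernel/debug/frontswap/invalidates
 */
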
/*
 * Due to the asynchronous nature of the backends loading potentially
 * _after_ the swap system has been activated, we have chokepoints
 * on all frontswap functions to not call the backend until the backend
 * has registered.
 *
 * This does not guard us against the user deciding to call swapoff right as
 * we are calling the backend to initialize (so swapon is in action).
 * Fortunately for us, the swapon_mutex has been taken by the callee so we are
 * OK. The other scenario where calls to frontswap_store (called via
 * swap_writepage) race with frontswap_invalidate_area (called via
 * swapoff) is again guarded by the swap subsystem.
 *
 * While no backend is registered, all calls to frontswap_[store|load|
 * invalidate_area|invalidate_page] are ignored or fail.
 *
 * The time between the backend being registered and the swap file system
 * calling the backend (via the frontswap_* functions) is indeterminate as
 * frontswap_ops is not atomic_t (or a value guarded by a spinlock).
 * That is OK as we are comfortable missing some of these calls to the newly
 * registered backend.
 *
 * Obviously the opposite (unloading the backend) must be done after all
 * the frontswap_[store|load|invalidate_area|invalidate_page] start
 * ignoring or failing the requests. However, there is currently no way
 * to unload a backend once it is registered.
 */

/*
 * Register operations for frontswap
 */
void frontswap_register_ops(struct frontswap_ops *ops)
{
	DECLARE_BITMAP(a, MAX_SWAPFILES);
	DECLARE_BITMAP(b, MAX_SWAPFILES);
	struct swap_info_struct *si;
	unsigned int i;

	bitmap_zero(a, MAX_SWAPFILES);
	bitmap_zero(b, MAX_SWAPFILES);

	spin_lock(&swap_lock);
	plist_for_each_entry(si, &swap_active_head, list) {
		if (!WARN_ON(!si->frontswap_map))
			set_bit(si->type, a);
	}
	spin_unlock(&swap_lock);

	/* the new ops needs to know the currently active swap devices */
	for_each_set_bit(i, a, MAX_SWAPFILES)
		ops->init(i);

	/*
	 * Setting frontswap_ops must happen after the ops->init() calls
	 * above; cmpxchg implies smp_mb() which will ensure the init is
	 * complete at this point.
	 */
	do {
		ops->next = frontswap_ops;
	} while (cmpxchg(&frontswap_ops, ops->next, ops) != ops->next);

	static_branch_inc(&frontswap_enabled_key);

	spin_lock(&swap_lock);
	plist_for_each_entry(si, &swap_active_head, list) {
		if (si->frontswap_map)
			set_bit(si->type, b);
	}
	spin_unlock(&swap_lock);

	/*
	 * On the very unlikely chance that a swap device was added or
	 * removed between setting the "a" list bits and the ops init
	 * calls, we re-check and do init or invalidate for any changed
	 * bits.
	 */
	if (unlikely(!bitmap_equal(a, b, MAX_SWAPFILES))) {
		for (i = 0; i < MAX_SWAPFILES; i++) {
			if (!test_bit(i, a) && test_bit(i, b))
				ops->init(i);
			else if (test_bit(i, a) && !test_bit(i, b))
				ops->invalidate_area(i);
		}
	}
}
EXPORT_SYMBOL(frontswap_register_ops);

/*
 * Enable/disable frontswap writethrough (see above).
 */
void frontswap_writethrough(bool enable)
{
	frontswap_writethrough_enabled = enable;
}
EXPORT_SYMBOL(frontswap_writethrough);

/*
 * Enable/disable frontswap exclusive gets (see above).
 */
void frontswap_tmem_exclusive_gets(bool enable)
{
	frontswap_tmem_exclusive_gets_enabled = enable;
}
EXPORT_SYMBOL(frontswap_tmem_exclusive_gets);

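/*
 * Illustrative sketch (not part of this file): a minimal backend fills in
 * a struct frontswap_ops (declared in <linux/frontswap.h>) and hands it to
 * frontswap_register_ops(). The example_* names below are invented for
 * illustration; only the ops fields and the registration call are taken
 * from this file and its header.
 */
#if 0	/* example only, never compiled */
static void example_init(unsigned type)
{
	/* allocate per-swap-device state for this swap type */
}

static int example_store(unsigned type, pgoff_t offset, struct page *page)
{
	return -1;	/* reject; the page falls through to the swap device */
}

static int example_load(unsigned type, pgoff_t offset, struct page *page)
{
	return -1;	/* miss */
}

static void example_invalidate_page(unsigned type, pgoff_t offset)
{
}

static void example_invalidate_area(unsigned type)
{
}

static struct frontswap_ops example_ops = {
	.init			= example_init,
	.store			= example_store,
	.load			= example_load,
	.invalidate_page	= example_invalidate_page,
	.invalidate_area	= example_invalidate_area,
};

static int __init example_backend_init(void)
{
	/* registration is one-way: ops can never be unregistered */
	frontswap_register_ops(&example_ops);
	return 0;
}
module_init(example_backend_init);
#endif
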
/*
 * Called when a swap device is swapon'd.
 */
void __frontswap_init(unsigned type, unsigned long *map)
{
	struct swap_info_struct *sis = swap_info[type];
	struct frontswap_ops *ops;

	VM_BUG_ON(sis == NULL);

	/*
	 * p->frontswap is a bitmap that we MUST have to figure out which page
	 * has gone into frontswap. Without it there is no point in continuing.
	 */
	if (WARN_ON(!map))
		return;
	/*
	 * Regardless of whether the frontswap backend has been loaded
	 * before this function or will be later, we _MUST_ have the
	 * p->frontswap set to something valid to work properly.
	 */
	frontswap_map_set(sis, map);

	for_each_frontswap_ops(ops)
		ops->init(type);
}
EXPORT_SYMBOL(__frontswap_init);

bool __frontswap_test(struct swap_info_struct *sis,
				pgoff_t offset)
{
	if (sis->frontswap_map)
		return test_bit(offset, sis->frontswap_map);
	return false;
}
EXPORT_SYMBOL(__frontswap_test);

static inline void __frontswap_set(struct swap_info_struct *sis,
				   pgoff_t offset)
{
	set_bit(offset, sis->frontswap_map);
	atomic_inc(&sis->frontswap_pages);
}

static inline void __frontswap_clear(struct swap_info_struct *sis,
				     pgoff_t offset)
{
	clear_bit(offset, sis->frontswap_map);
	atomic_dec(&sis->frontswap_pages);
}

/*
 * "Store" data from a page to frontswap and associate it with the page's
 * swaptype and offset. Page must be locked and in the swap cache.
 * If frontswap already contains a page with matching swaptype and
 * offset, the frontswap implementation may either overwrite the data and
 * return success or invalidate the page from frontswap and return failure.
 */
int __frontswap_store(struct page *page)
{
	int ret = -1;
	swp_entry_t entry = { .val = page_private(page), };
	int type = swp_type(entry);
	struct swap_info_struct *sis = swap_info[type];
	pgoff_t offset = swp_offset(entry);
	struct frontswap_ops *ops;

	VM_BUG_ON(!frontswap_ops);
	VM_BUG_ON(!PageLocked(page));
	VM_BUG_ON(sis == NULL);

	/*
	 * If a dup, we must remove the old page first; we can't leave the
	 * old page in frontswap whether the store of the new page succeeds
	 * or fails, and we can't rely on the new page replacing the old
	 * page as we may not store to the same implementation that contains
	 * the old page.
	 */
	if (__frontswap_test(sis, offset)) {
		__frontswap_clear(sis, offset);
		for_each_frontswap_ops(ops)
			ops->invalidate_page(type, offset);
	}

	/* Try to store in each implementation, until one succeeds. */
	for_each_frontswap_ops(ops) {
		ret = ops->store(type, offset, page);
		if (!ret) /* successful store */
			break;
	}
	if (ret == 0) {
		__frontswap_set(sis, offset);
		inc_frontswap_succ_stores();
	} else {
		inc_frontswap_failed_stores();
	}
	if (frontswap_writethrough_enabled)
		/* report failure so swap also writes to swap device */
		ret = -1;
	return ret;
}
EXPORT_SYMBOL(__frontswap_store);

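/*
 * Rough sketch of the store contract at the call site, paraphrased from
 * swap_writepage() in mm/page_io.c (not a verbatim copy): a zero return
 * from the frontswap_store() wrapper means the backend now holds the
 * page, so writeback is completed without issuing any block I/O.
 */
#if 0	/* paraphrase, never compiled */
	if (frontswap_store(page) == 0) {
		set_page_writeback(page);
		unlock_page(page);
		end_page_writeback(page);
		goto out;		/* no block I/O issued */
	}
	ret = __swap_writepage(page, wbc, end_swap_bio_write);
#endif
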
/*
 * "Get" data from frontswap associated with swaptype and offset that were
 * specified when the data was put to frontswap and use it to fill the
 * specified page with data. Page must be locked and in the swap cache.
 */
int __frontswap_load(struct page *page)
{
	int ret = -1;
	swp_entry_t entry = { .val = page_private(page), };
	int type = swp_type(entry);
	struct swap_info_struct *sis = swap_info[type];
	pgoff_t offset = swp_offset(entry);
	struct frontswap_ops *ops;

	VM_BUG_ON(!frontswap_ops);
	VM_BUG_ON(!PageLocked(page));
	VM_BUG_ON(sis == NULL);

	if (!__frontswap_test(sis, offset))
		return -1;

	/* Try loading from each implementation, until one succeeds. */
	for_each_frontswap_ops(ops) {
		ret = ops->load(type, offset, page);
		if (!ret) /* successful load */
			break;
	}
	if (ret == 0) {
		inc_frontswap_loads();
		if (frontswap_tmem_exclusive_gets_enabled) {
			SetPageDirty(page);
			__frontswap_clear(sis, offset);
		}
	}
	return ret;
}
EXPORT_SYMBOL(__frontswap_load);

/*
 * Invalidate any data from frontswap associated with the specified swaptype
 * and offset so that a subsequent "get" will fail.
 */
void __frontswap_invalidate_page(unsigned type, pgoff_t offset)
{
	struct swap_info_struct *sis = swap_info[type];
	struct frontswap_ops *ops;

	VM_BUG_ON(!frontswap_ops);
	VM_BUG_ON(sis == NULL);

	if (!__frontswap_test(sis, offset))
		return;

	for_each_frontswap_ops(ops)
		ops->invalidate_page(type, offset);
	__frontswap_clear(sis, offset);
	inc_frontswap_invalidates();
}
EXPORT_SYMBOL(__frontswap_invalidate_page);

/*
 * Invalidate all data from frontswap associated with all offsets for the
 * specified swaptype.
 */
void __frontswap_invalidate_area(unsigned type)
{
	struct swap_info_struct *sis = swap_info[type];
	struct frontswap_ops *ops;

	VM_BUG_ON(!frontswap_ops);
	VM_BUG_ON(sis == NULL);

	if (sis->frontswap_map == NULL)
		return;

	for_each_frontswap_ops(ops)
		ops->invalidate_area(type);
	atomic_set(&sis->frontswap_pages, 0);
	bitmap_zero(sis->frontswap_map, sis->max);
}
EXPORT_SYMBOL(__frontswap_invalidate_area);

static unsigned long __frontswap_curr_pages(void)
{
	unsigned long totalpages = 0;
	struct swap_info_struct *si = NULL;

	assert_spin_locked(&swap_lock);
	plist_for_each_entry(si, &swap_active_head, list)
		totalpages += atomic_read(&si->frontswap_pages);
	return totalpages;
}

static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
					int *swapid)
{
	int ret = -EINVAL;
	struct swap_info_struct *si = NULL;
	int si_frontswap_pages;
	unsigned long total_pages_to_unuse = total;
	unsigned long pages = 0, pages_to_unuse = 0;

	assert_spin_locked(&swap_lock);
	plist_for_each_entry(si, &swap_active_head, list) {
		si_frontswap_pages = atomic_read(&si->frontswap_pages);
		if (total_pages_to_unuse < si_frontswap_pages) {
			pages = pages_to_unuse = total_pages_to_unuse;
		} else {
			pages = si_frontswap_pages;
			pages_to_unuse = 0; /* unuse all */
		}
		/* ensure there is enough RAM to fetch pages from frontswap */
		if (security_vm_enough_memory_mm(current->mm, pages)) {
			ret = -ENOMEM;
			continue;
		}
		vm_unacct_memory(pages);
		*unused = pages_to_unuse;
		*swapid = si->type;
		ret = 0;
		break;
	}

	return ret;
}

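/*
 * Worked example for __frontswap_unuse_pages() above, with hypothetical
 * numbers: suppose the first device on swap_active_head holds 500
 * frontswap pages. A request to unuse 400 pages takes the first branch
 * (400 < 500), so *unused is set to 400 and that device is unused only
 * partially. A request to unuse 600 pages takes the second branch:
 * *unused is set to 0, which try_to_unuse() interprets as "unuse
 * everything" on that device. Either way the loop stops at the first
 * device it can account memory for.
 */
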
/*
 * Used to check if it's necessary and feasible to unuse pages.
 * Returns 1 when there is nothing to do, 0 when pages need to be
 * unused, or an error code when there is an error.
 */
static int __frontswap_shrink(unsigned long target_pages,
				unsigned long *pages_to_unuse,
				int *type)
{
	unsigned long total_pages = 0, total_pages_to_unuse;

	assert_spin_locked(&swap_lock);

	total_pages = __frontswap_curr_pages();
	if (total_pages <= target_pages) {
		/* Nothing to do */
		*pages_to_unuse = 0;
		return 1;
	}
	total_pages_to_unuse = total_pages - target_pages;
	return __frontswap_unuse_pages(total_pages_to_unuse, pages_to_unuse, type);
}

/*
 * Frontswap, like a true swap device, may unnecessarily retain pages
 * under certain circumstances; "shrinking" frontswap is essentially a
 * "partial swapoff" and works by calling try_to_unuse to unuse enough
 * frontswap pages, subject to memory constraints, to reduce the number
 * of pages in frontswap to the number given in the parameter
 * target_pages.
 */
void frontswap_shrink(unsigned long target_pages)
{
	unsigned long pages_to_unuse = 0;
	int type, ret;

	/*
	 * we don't want to hold swap_lock while doing a very
	 * lengthy try_to_unuse, but swap_list may change
	 * so restart scan from swap_active_head each time
	 */
	spin_lock(&swap_lock);
	ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type);
	spin_unlock(&swap_lock);
	if (ret == 0)
		try_to_unuse(type, true, pages_to_unuse);
	return;
}
EXPORT_SYMBOL(frontswap_shrink);

/*
 * Count and return the number of frontswap pages across all
 * swap devices. This is exported so that backend drivers can
 * determine current usage without reading debugfs.
 */
unsigned long frontswap_curr_pages(void)
{
	unsigned long totalpages = 0;

	spin_lock(&swap_lock);
	totalpages = __frontswap_curr_pages();
	spin_unlock(&swap_lock);

	return totalpages;
}
EXPORT_SYMBOL(frontswap_curr_pages);

static int __init init_frontswap(void)
{
#ifdef CONFIG_DEBUG_FS
	struct dentry *root = debugfs_create_dir("frontswap", NULL);

	if (root == NULL)
		return -ENXIO;
	debugfs_create_u64("loads", 0444, root, &frontswap_loads);
	debugfs_create_u64("succ_stores", 0444, root, &frontswap_succ_stores);
	debugfs_create_u64("failed_stores", 0444, root,
			   &frontswap_failed_stores);
	debugfs_create_u64("invalidates", 0444, root, &frontswap_invalidates);
#endif
	return 0;
}

module_init(init_frontswap);
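
/*
 * Illustrative sketch (hypothetical, not part of this file): a policy
 * module could combine the two exported helpers above to cap frontswap
 * usage. example_cap_frontswap() is an invented name.
 */
#if 0	/* example only, never compiled */
static void example_cap_frontswap(unsigned long max_pages)
{
	if (frontswap_curr_pages() > max_pages)
		frontswap_shrink(max_pages);	/* partial swapoff */
}
#endif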