Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at v4.17-rc2 1356 lines 35 kB view raw
1/* 2 * zswap.c - zswap driver file 3 * 4 * zswap is a backend for frontswap that takes pages that are in the process 5 * of being swapped out and attempts to compress and store them in a 6 * RAM-based memory pool. This can result in a significant I/O reduction on 7 * the swap device and, in the case where decompressing from RAM is faster 8 * than reading from the swap device, can also improve workload performance. 9 * 10 * Copyright (C) 2012 Seth Jennings <sjenning@linux.vnet.ibm.com> 11 * 12 * This program is free software; you can redistribute it and/or 13 * modify it under the terms of the GNU General Public License 14 * as published by the Free Software Foundation; either version 2 15 * of the License, or (at your option) any later version. 16 * 17 * This program is distributed in the hope that it will be useful, 18 * but WITHOUT ANY WARRANTY; without even the implied warranty of 19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20 * GNU General Public License for more details. 21*/ 22 23#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 24 25#include <linux/module.h> 26#include <linux/cpu.h> 27#include <linux/highmem.h> 28#include <linux/slab.h> 29#include <linux/spinlock.h> 30#include <linux/types.h> 31#include <linux/atomic.h> 32#include <linux/frontswap.h> 33#include <linux/rbtree.h> 34#include <linux/swap.h> 35#include <linux/crypto.h> 36#include <linux/mempool.h> 37#include <linux/zpool.h> 38 39#include <linux/mm_types.h> 40#include <linux/page-flags.h> 41#include <linux/swapops.h> 42#include <linux/writeback.h> 43#include <linux/pagemap.h> 44 45/********************************* 46* statistics 47**********************************/ 48/* Total bytes used by the compressed storage */ 49static u64 zswap_pool_total_size; 50/* The number of compressed pages currently stored in zswap */ 51static atomic_t zswap_stored_pages = ATOMIC_INIT(0); 52/* The number of same-value filled pages currently stored in zswap */ 53static atomic_t zswap_same_filled_pages = ATOMIC_INIT(0); 54 55/* 56 * The statistics below are not protected from concurrent access for 57 * performance reasons so they may not be a 100% accurate. However, 58 * they do provide useful information on roughly how many times a 59 * certain event is occurring. 60*/ 61 62/* Pool limit was hit (see zswap_max_pool_percent) */ 63static u64 zswap_pool_limit_hit; 64/* Pages written back when pool limit was reached */ 65static u64 zswap_written_back_pages; 66/* Store failed due to a reclaim failure after pool limit was reached */ 67static u64 zswap_reject_reclaim_fail; 68/* Compressed page was too big for the allocator to (optimally) store */ 69static u64 zswap_reject_compress_poor; 70/* Store failed because underlying allocator could not get memory */ 71static u64 zswap_reject_alloc_fail; 72/* Store failed because the entry metadata could not be allocated (rare) */ 73static u64 zswap_reject_kmemcache_fail; 74/* Duplicate store was encountered (rare) */ 75static u64 zswap_duplicate_entry; 76 77/********************************* 78* tunables 79**********************************/ 80 81#define ZSWAP_PARAM_UNSET "" 82 83/* Enable/disable zswap (disabled by default) */ 84static bool zswap_enabled; 85static int zswap_enabled_param_set(const char *, 86 const struct kernel_param *); 87static struct kernel_param_ops zswap_enabled_param_ops = { 88 .set = zswap_enabled_param_set, 89 .get = param_get_bool, 90}; 91module_param_cb(enabled, &zswap_enabled_param_ops, &zswap_enabled, 0644); 92 93/* Crypto compressor to use */ 94#define ZSWAP_COMPRESSOR_DEFAULT "lzo" 95static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; 96static int zswap_compressor_param_set(const char *, 97 const struct kernel_param *); 98static struct kernel_param_ops zswap_compressor_param_ops = { 99 .set = zswap_compressor_param_set, 100 .get = param_get_charp, 101 .free = param_free_charp, 102}; 103module_param_cb(compressor, &zswap_compressor_param_ops, 104 &zswap_compressor, 0644); 105 106/* Compressed storage zpool to use */ 107#define ZSWAP_ZPOOL_DEFAULT "zbud" 108static char *zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; 109static int zswap_zpool_param_set(const char *, const struct kernel_param *); 110static struct kernel_param_ops zswap_zpool_param_ops = { 111 .set = zswap_zpool_param_set, 112 .get = param_get_charp, 113 .free = param_free_charp, 114}; 115module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644); 116 117/* The maximum percentage of memory that the compressed pool can occupy */ 118static unsigned int zswap_max_pool_percent = 20; 119module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644); 120 121/* Enable/disable handling same-value filled pages (enabled by default) */ 122static bool zswap_same_filled_pages_enabled = true; 123module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled, 124 bool, 0644); 125 126/********************************* 127* data structures 128**********************************/ 129 130struct zswap_pool { 131 struct zpool *zpool; 132 struct crypto_comp * __percpu *tfm; 133 struct kref kref; 134 struct list_head list; 135 struct work_struct work; 136 struct hlist_node node; 137 char tfm_name[CRYPTO_MAX_ALG_NAME]; 138}; 139 140/* 141 * struct zswap_entry 142 * 143 * This structure contains the metadata for tracking a single compressed 144 * page within zswap. 145 * 146 * rbnode - links the entry into red-black tree for the appropriate swap type 147 * offset - the swap offset for the entry. Index into the red-black tree. 148 * refcount - the number of outstanding reference to the entry. This is needed 149 * to protect against premature freeing of the entry by code 150 * concurrent calls to load, invalidate, and writeback. The lock 151 * for the zswap_tree structure that contains the entry must 152 * be held while changing the refcount. Since the lock must 153 * be held, there is no reason to also make refcount atomic. 154 * length - the length in bytes of the compressed page data. Needed during 155 * decompression. For a same value filled page length is 0. 156 * pool - the zswap_pool the entry's data is in 157 * handle - zpool allocation handle that stores the compressed page data 158 * value - value of the same-value filled pages which have same content 159 */ 160struct zswap_entry { 161 struct rb_node rbnode; 162 pgoff_t offset; 163 int refcount; 164 unsigned int length; 165 struct zswap_pool *pool; 166 union { 167 unsigned long handle; 168 unsigned long value; 169 }; 170}; 171 172struct zswap_header { 173 swp_entry_t swpentry; 174}; 175 176/* 177 * The tree lock in the zswap_tree struct protects a few things: 178 * - the rbtree 179 * - the refcount field of each entry in the tree 180 */ 181struct zswap_tree { 182 struct rb_root rbroot; 183 spinlock_t lock; 184}; 185 186static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; 187 188/* RCU-protected iteration */ 189static LIST_HEAD(zswap_pools); 190/* protects zswap_pools list modification */ 191static DEFINE_SPINLOCK(zswap_pools_lock); 192/* pool counter to provide unique names to zpool */ 193static atomic_t zswap_pools_count = ATOMIC_INIT(0); 194 195/* used by param callback function */ 196static bool zswap_init_started; 197 198/* fatal error during init */ 199static bool zswap_init_failed; 200 201/* init completed, but couldn't create the initial pool */ 202static bool zswap_has_pool; 203 204/********************************* 205* helpers and fwd declarations 206**********************************/ 207 208#define zswap_pool_debug(msg, p) \ 209 pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \ 210 zpool_get_type((p)->zpool)) 211 212static int zswap_writeback_entry(struct zpool *pool, unsigned long handle); 213static int zswap_pool_get(struct zswap_pool *pool); 214static void zswap_pool_put(struct zswap_pool *pool); 215 216static const struct zpool_ops zswap_zpool_ops = { 217 .evict = zswap_writeback_entry 218}; 219 220static bool zswap_is_full(void) 221{ 222 return totalram_pages * zswap_max_pool_percent / 100 < 223 DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); 224} 225 226static void zswap_update_total_size(void) 227{ 228 struct zswap_pool *pool; 229 u64 total = 0; 230 231 rcu_read_lock(); 232 233 list_for_each_entry_rcu(pool, &zswap_pools, list) 234 total += zpool_get_total_size(pool->zpool); 235 236 rcu_read_unlock(); 237 238 zswap_pool_total_size = total; 239} 240 241/********************************* 242* zswap entry functions 243**********************************/ 244static struct kmem_cache *zswap_entry_cache; 245 246static int __init zswap_entry_cache_create(void) 247{ 248 zswap_entry_cache = KMEM_CACHE(zswap_entry, 0); 249 return zswap_entry_cache == NULL; 250} 251 252static void __init zswap_entry_cache_destroy(void) 253{ 254 kmem_cache_destroy(zswap_entry_cache); 255} 256 257static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp) 258{ 259 struct zswap_entry *entry; 260 entry = kmem_cache_alloc(zswap_entry_cache, gfp); 261 if (!entry) 262 return NULL; 263 entry->refcount = 1; 264 RB_CLEAR_NODE(&entry->rbnode); 265 return entry; 266} 267 268static void zswap_entry_cache_free(struct zswap_entry *entry) 269{ 270 kmem_cache_free(zswap_entry_cache, entry); 271} 272 273/********************************* 274* rbtree functions 275**********************************/ 276static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset) 277{ 278 struct rb_node *node = root->rb_node; 279 struct zswap_entry *entry; 280 281 while (node) { 282 entry = rb_entry(node, struct zswap_entry, rbnode); 283 if (entry->offset > offset) 284 node = node->rb_left; 285 else if (entry->offset < offset) 286 node = node->rb_right; 287 else 288 return entry; 289 } 290 return NULL; 291} 292 293/* 294 * In the case that a entry with the same offset is found, a pointer to 295 * the existing entry is stored in dupentry and the function returns -EEXIST 296 */ 297static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry, 298 struct zswap_entry **dupentry) 299{ 300 struct rb_node **link = &root->rb_node, *parent = NULL; 301 struct zswap_entry *myentry; 302 303 while (*link) { 304 parent = *link; 305 myentry = rb_entry(parent, struct zswap_entry, rbnode); 306 if (myentry->offset > entry->offset) 307 link = &(*link)->rb_left; 308 else if (myentry->offset < entry->offset) 309 link = &(*link)->rb_right; 310 else { 311 *dupentry = myentry; 312 return -EEXIST; 313 } 314 } 315 rb_link_node(&entry->rbnode, parent, link); 316 rb_insert_color(&entry->rbnode, root); 317 return 0; 318} 319 320static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) 321{ 322 if (!RB_EMPTY_NODE(&entry->rbnode)) { 323 rb_erase(&entry->rbnode, root); 324 RB_CLEAR_NODE(&entry->rbnode); 325 } 326} 327 328/* 329 * Carries out the common pattern of freeing and entry's zpool allocation, 330 * freeing the entry itself, and decrementing the number of stored pages. 331 */ 332static void zswap_free_entry(struct zswap_entry *entry) 333{ 334 if (!entry->length) 335 atomic_dec(&zswap_same_filled_pages); 336 else { 337 zpool_free(entry->pool->zpool, entry->handle); 338 zswap_pool_put(entry->pool); 339 } 340 zswap_entry_cache_free(entry); 341 atomic_dec(&zswap_stored_pages); 342 zswap_update_total_size(); 343} 344 345/* caller must hold the tree lock */ 346static void zswap_entry_get(struct zswap_entry *entry) 347{ 348 entry->refcount++; 349} 350 351/* caller must hold the tree lock 352* remove from the tree and free it, if nobody reference the entry 353*/ 354static void zswap_entry_put(struct zswap_tree *tree, 355 struct zswap_entry *entry) 356{ 357 int refcount = --entry->refcount; 358 359 BUG_ON(refcount < 0); 360 if (refcount == 0) { 361 zswap_rb_erase(&tree->rbroot, entry); 362 zswap_free_entry(entry); 363 } 364} 365 366/* caller must hold the tree lock */ 367static struct zswap_entry *zswap_entry_find_get(struct rb_root *root, 368 pgoff_t offset) 369{ 370 struct zswap_entry *entry; 371 372 entry = zswap_rb_search(root, offset); 373 if (entry) 374 zswap_entry_get(entry); 375 376 return entry; 377} 378 379/********************************* 380* per-cpu code 381**********************************/ 382static DEFINE_PER_CPU(u8 *, zswap_dstmem); 383 384static int zswap_dstmem_prepare(unsigned int cpu) 385{ 386 u8 *dst; 387 388 dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); 389 if (!dst) 390 return -ENOMEM; 391 392 per_cpu(zswap_dstmem, cpu) = dst; 393 return 0; 394} 395 396static int zswap_dstmem_dead(unsigned int cpu) 397{ 398 u8 *dst; 399 400 dst = per_cpu(zswap_dstmem, cpu); 401 kfree(dst); 402 per_cpu(zswap_dstmem, cpu) = NULL; 403 404 return 0; 405} 406 407static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) 408{ 409 struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); 410 struct crypto_comp *tfm; 411 412 if (WARN_ON(*per_cpu_ptr(pool->tfm, cpu))) 413 return 0; 414 415 tfm = crypto_alloc_comp(pool->tfm_name, 0, 0); 416 if (IS_ERR_OR_NULL(tfm)) { 417 pr_err("could not alloc crypto comp %s : %ld\n", 418 pool->tfm_name, PTR_ERR(tfm)); 419 return -ENOMEM; 420 } 421 *per_cpu_ptr(pool->tfm, cpu) = tfm; 422 return 0; 423} 424 425static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) 426{ 427 struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); 428 struct crypto_comp *tfm; 429 430 tfm = *per_cpu_ptr(pool->tfm, cpu); 431 if (!IS_ERR_OR_NULL(tfm)) 432 crypto_free_comp(tfm); 433 *per_cpu_ptr(pool->tfm, cpu) = NULL; 434 return 0; 435} 436 437/********************************* 438* pool functions 439**********************************/ 440 441static struct zswap_pool *__zswap_pool_current(void) 442{ 443 struct zswap_pool *pool; 444 445 pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list); 446 WARN_ONCE(!pool && zswap_has_pool, 447 "%s: no page storage pool!\n", __func__); 448 449 return pool; 450} 451 452static struct zswap_pool *zswap_pool_current(void) 453{ 454 assert_spin_locked(&zswap_pools_lock); 455 456 return __zswap_pool_current(); 457} 458 459static struct zswap_pool *zswap_pool_current_get(void) 460{ 461 struct zswap_pool *pool; 462 463 rcu_read_lock(); 464 465 pool = __zswap_pool_current(); 466 if (!zswap_pool_get(pool)) 467 pool = NULL; 468 469 rcu_read_unlock(); 470 471 return pool; 472} 473 474static struct zswap_pool *zswap_pool_last_get(void) 475{ 476 struct zswap_pool *pool, *last = NULL; 477 478 rcu_read_lock(); 479 480 list_for_each_entry_rcu(pool, &zswap_pools, list) 481 last = pool; 482 WARN_ONCE(!last && zswap_has_pool, 483 "%s: no page storage pool!\n", __func__); 484 if (!zswap_pool_get(last)) 485 last = NULL; 486 487 rcu_read_unlock(); 488 489 return last; 490} 491 492/* type and compressor must be null-terminated */ 493static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) 494{ 495 struct zswap_pool *pool; 496 497 assert_spin_locked(&zswap_pools_lock); 498 499 list_for_each_entry_rcu(pool, &zswap_pools, list) { 500 if (strcmp(pool->tfm_name, compressor)) 501 continue; 502 if (strcmp(zpool_get_type(pool->zpool), type)) 503 continue; 504 /* if we can't get it, it's about to be destroyed */ 505 if (!zswap_pool_get(pool)) 506 continue; 507 return pool; 508 } 509 510 return NULL; 511} 512 513static struct zswap_pool *zswap_pool_create(char *type, char *compressor) 514{ 515 struct zswap_pool *pool; 516 char name[38]; /* 'zswap' + 32 char (max) num + \0 */ 517 gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; 518 int ret; 519 520 if (!zswap_has_pool) { 521 /* if either are unset, pool initialization failed, and we 522 * need both params to be set correctly before trying to 523 * create a pool. 524 */ 525 if (!strcmp(type, ZSWAP_PARAM_UNSET)) 526 return NULL; 527 if (!strcmp(compressor, ZSWAP_PARAM_UNSET)) 528 return NULL; 529 } 530 531 pool = kzalloc(sizeof(*pool), GFP_KERNEL); 532 if (!pool) 533 return NULL; 534 535 /* unique name for each pool specifically required by zsmalloc */ 536 snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count)); 537 538 pool->zpool = zpool_create_pool(type, name, gfp, &zswap_zpool_ops); 539 if (!pool->zpool) { 540 pr_err("%s zpool not available\n", type); 541 goto error; 542 } 543 pr_debug("using %s zpool\n", zpool_get_type(pool->zpool)); 544 545 strlcpy(pool->tfm_name, compressor, sizeof(pool->tfm_name)); 546 pool->tfm = alloc_percpu(struct crypto_comp *); 547 if (!pool->tfm) { 548 pr_err("percpu alloc failed\n"); 549 goto error; 550 } 551 552 ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE, 553 &pool->node); 554 if (ret) 555 goto error; 556 pr_debug("using %s compressor\n", pool->tfm_name); 557 558 /* being the current pool takes 1 ref; this func expects the 559 * caller to always add the new pool as the current pool 560 */ 561 kref_init(&pool->kref); 562 INIT_LIST_HEAD(&pool->list); 563 564 zswap_pool_debug("created", pool); 565 566 return pool; 567 568error: 569 free_percpu(pool->tfm); 570 if (pool->zpool) 571 zpool_destroy_pool(pool->zpool); 572 kfree(pool); 573 return NULL; 574} 575 576static __init struct zswap_pool *__zswap_pool_create_fallback(void) 577{ 578 bool has_comp, has_zpool; 579 580 has_comp = crypto_has_comp(zswap_compressor, 0, 0); 581 if (!has_comp && strcmp(zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT)) { 582 pr_err("compressor %s not available, using default %s\n", 583 zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT); 584 param_free_charp(&zswap_compressor); 585 zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; 586 has_comp = crypto_has_comp(zswap_compressor, 0, 0); 587 } 588 if (!has_comp) { 589 pr_err("default compressor %s not available\n", 590 zswap_compressor); 591 param_free_charp(&zswap_compressor); 592 zswap_compressor = ZSWAP_PARAM_UNSET; 593 } 594 595 has_zpool = zpool_has_pool(zswap_zpool_type); 596 if (!has_zpool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) { 597 pr_err("zpool %s not available, using default %s\n", 598 zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT); 599 param_free_charp(&zswap_zpool_type); 600 zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; 601 has_zpool = zpool_has_pool(zswap_zpool_type); 602 } 603 if (!has_zpool) { 604 pr_err("default zpool %s not available\n", 605 zswap_zpool_type); 606 param_free_charp(&zswap_zpool_type); 607 zswap_zpool_type = ZSWAP_PARAM_UNSET; 608 } 609 610 if (!has_comp || !has_zpool) 611 return NULL; 612 613 return zswap_pool_create(zswap_zpool_type, zswap_compressor); 614} 615 616static void zswap_pool_destroy(struct zswap_pool *pool) 617{ 618 zswap_pool_debug("destroying", pool); 619 620 cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); 621 free_percpu(pool->tfm); 622 zpool_destroy_pool(pool->zpool); 623 kfree(pool); 624} 625 626static int __must_check zswap_pool_get(struct zswap_pool *pool) 627{ 628 if (!pool) 629 return 0; 630 631 return kref_get_unless_zero(&pool->kref); 632} 633 634static void __zswap_pool_release(struct work_struct *work) 635{ 636 struct zswap_pool *pool = container_of(work, typeof(*pool), work); 637 638 synchronize_rcu(); 639 640 /* nobody should have been able to get a kref... */ 641 WARN_ON(kref_get_unless_zero(&pool->kref)); 642 643 /* pool is now off zswap_pools list and has no references. */ 644 zswap_pool_destroy(pool); 645} 646 647static void __zswap_pool_empty(struct kref *kref) 648{ 649 struct zswap_pool *pool; 650 651 pool = container_of(kref, typeof(*pool), kref); 652 653 spin_lock(&zswap_pools_lock); 654 655 WARN_ON(pool == zswap_pool_current()); 656 657 list_del_rcu(&pool->list); 658 659 INIT_WORK(&pool->work, __zswap_pool_release); 660 schedule_work(&pool->work); 661 662 spin_unlock(&zswap_pools_lock); 663} 664 665static void zswap_pool_put(struct zswap_pool *pool) 666{ 667 kref_put(&pool->kref, __zswap_pool_empty); 668} 669 670/********************************* 671* param callbacks 672**********************************/ 673 674/* val must be a null-terminated string */ 675static int __zswap_param_set(const char *val, const struct kernel_param *kp, 676 char *type, char *compressor) 677{ 678 struct zswap_pool *pool, *put_pool = NULL; 679 char *s = strstrip((char *)val); 680 int ret; 681 682 if (zswap_init_failed) { 683 pr_err("can't set param, initialization failed\n"); 684 return -ENODEV; 685 } 686 687 /* no change required */ 688 if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool) 689 return 0; 690 691 /* if this is load-time (pre-init) param setting, 692 * don't create a pool; that's done during init. 693 */ 694 if (!zswap_init_started) 695 return param_set_charp(s, kp); 696 697 if (!type) { 698 if (!zpool_has_pool(s)) { 699 pr_err("zpool %s not available\n", s); 700 return -ENOENT; 701 } 702 type = s; 703 } else if (!compressor) { 704 if (!crypto_has_comp(s, 0, 0)) { 705 pr_err("compressor %s not available\n", s); 706 return -ENOENT; 707 } 708 compressor = s; 709 } else { 710 WARN_ON(1); 711 return -EINVAL; 712 } 713 714 spin_lock(&zswap_pools_lock); 715 716 pool = zswap_pool_find_get(type, compressor); 717 if (pool) { 718 zswap_pool_debug("using existing", pool); 719 WARN_ON(pool == zswap_pool_current()); 720 list_del_rcu(&pool->list); 721 } 722 723 spin_unlock(&zswap_pools_lock); 724 725 if (!pool) 726 pool = zswap_pool_create(type, compressor); 727 728 if (pool) 729 ret = param_set_charp(s, kp); 730 else 731 ret = -EINVAL; 732 733 spin_lock(&zswap_pools_lock); 734 735 if (!ret) { 736 put_pool = zswap_pool_current(); 737 list_add_rcu(&pool->list, &zswap_pools); 738 zswap_has_pool = true; 739 } else if (pool) { 740 /* add the possibly pre-existing pool to the end of the pools 741 * list; if it's new (and empty) then it'll be removed and 742 * destroyed by the put after we drop the lock 743 */ 744 list_add_tail_rcu(&pool->list, &zswap_pools); 745 put_pool = pool; 746 } 747 748 spin_unlock(&zswap_pools_lock); 749 750 if (!zswap_has_pool && !pool) { 751 /* if initial pool creation failed, and this pool creation also 752 * failed, maybe both compressor and zpool params were bad. 753 * Allow changing this param, so pool creation will succeed 754 * when the other param is changed. We already verified this 755 * param is ok in the zpool_has_pool() or crypto_has_comp() 756 * checks above. 757 */ 758 ret = param_set_charp(s, kp); 759 } 760 761 /* drop the ref from either the old current pool, 762 * or the new pool we failed to add 763 */ 764 if (put_pool) 765 zswap_pool_put(put_pool); 766 767 return ret; 768} 769 770static int zswap_compressor_param_set(const char *val, 771 const struct kernel_param *kp) 772{ 773 return __zswap_param_set(val, kp, zswap_zpool_type, NULL); 774} 775 776static int zswap_zpool_param_set(const char *val, 777 const struct kernel_param *kp) 778{ 779 return __zswap_param_set(val, kp, NULL, zswap_compressor); 780} 781 782static int zswap_enabled_param_set(const char *val, 783 const struct kernel_param *kp) 784{ 785 if (zswap_init_failed) { 786 pr_err("can't enable, initialization failed\n"); 787 return -ENODEV; 788 } 789 if (!zswap_has_pool && zswap_init_started) { 790 pr_err("can't enable, no pool configured\n"); 791 return -ENODEV; 792 } 793 794 return param_set_bool(val, kp); 795} 796 797/********************************* 798* writeback code 799**********************************/ 800/* return enum for zswap_get_swap_cache_page */ 801enum zswap_get_swap_ret { 802 ZSWAP_SWAPCACHE_NEW, 803 ZSWAP_SWAPCACHE_EXIST, 804 ZSWAP_SWAPCACHE_FAIL, 805}; 806 807/* 808 * zswap_get_swap_cache_page 809 * 810 * This is an adaption of read_swap_cache_async() 811 * 812 * This function tries to find a page with the given swap entry 813 * in the swapper_space address space (the swap cache). If the page 814 * is found, it is returned in retpage. Otherwise, a page is allocated, 815 * added to the swap cache, and returned in retpage. 816 * 817 * If success, the swap cache page is returned in retpage 818 * Returns ZSWAP_SWAPCACHE_EXIST if page was already in the swap cache 819 * Returns ZSWAP_SWAPCACHE_NEW if the new page needs to be populated, 820 * the new page is added to swapcache and locked 821 * Returns ZSWAP_SWAPCACHE_FAIL on error 822 */ 823static int zswap_get_swap_cache_page(swp_entry_t entry, 824 struct page **retpage) 825{ 826 bool page_was_allocated; 827 828 *retpage = __read_swap_cache_async(entry, GFP_KERNEL, 829 NULL, 0, &page_was_allocated); 830 if (page_was_allocated) 831 return ZSWAP_SWAPCACHE_NEW; 832 if (!*retpage) 833 return ZSWAP_SWAPCACHE_FAIL; 834 return ZSWAP_SWAPCACHE_EXIST; 835} 836 837/* 838 * Attempts to free an entry by adding a page to the swap cache, 839 * decompressing the entry data into the page, and issuing a 840 * bio write to write the page back to the swap device. 841 * 842 * This can be thought of as a "resumed writeback" of the page 843 * to the swap device. We are basically resuming the same swap 844 * writeback path that was intercepted with the frontswap_store() 845 * in the first place. After the page has been decompressed into 846 * the swap cache, the compressed version stored by zswap can be 847 * freed. 848 */ 849static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) 850{ 851 struct zswap_header *zhdr; 852 swp_entry_t swpentry; 853 struct zswap_tree *tree; 854 pgoff_t offset; 855 struct zswap_entry *entry; 856 struct page *page; 857 struct crypto_comp *tfm; 858 u8 *src, *dst; 859 unsigned int dlen; 860 int ret; 861 struct writeback_control wbc = { 862 .sync_mode = WB_SYNC_NONE, 863 }; 864 865 /* extract swpentry from data */ 866 zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO); 867 swpentry = zhdr->swpentry; /* here */ 868 zpool_unmap_handle(pool, handle); 869 tree = zswap_trees[swp_type(swpentry)]; 870 offset = swp_offset(swpentry); 871 872 /* find and ref zswap entry */ 873 spin_lock(&tree->lock); 874 entry = zswap_entry_find_get(&tree->rbroot, offset); 875 if (!entry) { 876 /* entry was invalidated */ 877 spin_unlock(&tree->lock); 878 return 0; 879 } 880 spin_unlock(&tree->lock); 881 BUG_ON(offset != entry->offset); 882 883 /* try to allocate swap cache page */ 884 switch (zswap_get_swap_cache_page(swpentry, &page)) { 885 case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */ 886 ret = -ENOMEM; 887 goto fail; 888 889 case ZSWAP_SWAPCACHE_EXIST: 890 /* page is already in the swap cache, ignore for now */ 891 put_page(page); 892 ret = -EEXIST; 893 goto fail; 894 895 case ZSWAP_SWAPCACHE_NEW: /* page is locked */ 896 /* decompress */ 897 dlen = PAGE_SIZE; 898 src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle, 899 ZPOOL_MM_RO) + sizeof(struct zswap_header); 900 dst = kmap_atomic(page); 901 tfm = *get_cpu_ptr(entry->pool->tfm); 902 ret = crypto_comp_decompress(tfm, src, entry->length, 903 dst, &dlen); 904 put_cpu_ptr(entry->pool->tfm); 905 kunmap_atomic(dst); 906 zpool_unmap_handle(entry->pool->zpool, entry->handle); 907 BUG_ON(ret); 908 BUG_ON(dlen != PAGE_SIZE); 909 910 /* page is up to date */ 911 SetPageUptodate(page); 912 } 913 914 /* move it to the tail of the inactive list after end_writeback */ 915 SetPageReclaim(page); 916 917 /* start writeback */ 918 __swap_writepage(page, &wbc, end_swap_bio_write); 919 put_page(page); 920 zswap_written_back_pages++; 921 922 spin_lock(&tree->lock); 923 /* drop local reference */ 924 zswap_entry_put(tree, entry); 925 926 /* 927 * There are two possible situations for entry here: 928 * (1) refcount is 1(normal case), entry is valid and on the tree 929 * (2) refcount is 0, entry is freed and not on the tree 930 * because invalidate happened during writeback 931 * search the tree and free the entry if find entry 932 */ 933 if (entry == zswap_rb_search(&tree->rbroot, offset)) 934 zswap_entry_put(tree, entry); 935 spin_unlock(&tree->lock); 936 937 goto end; 938 939 /* 940 * if we get here due to ZSWAP_SWAPCACHE_EXIST 941 * a load may happening concurrently 942 * it is safe and okay to not free the entry 943 * if we free the entry in the following put 944 * it it either okay to return !0 945 */ 946fail: 947 spin_lock(&tree->lock); 948 zswap_entry_put(tree, entry); 949 spin_unlock(&tree->lock); 950 951end: 952 return ret; 953} 954 955static int zswap_shrink(void) 956{ 957 struct zswap_pool *pool; 958 int ret; 959 960 pool = zswap_pool_last_get(); 961 if (!pool) 962 return -ENOENT; 963 964 ret = zpool_shrink(pool->zpool, 1, NULL); 965 966 zswap_pool_put(pool); 967 968 return ret; 969} 970 971static int zswap_is_page_same_filled(void *ptr, unsigned long *value) 972{ 973 unsigned int pos; 974 unsigned long *page; 975 976 page = (unsigned long *)ptr; 977 for (pos = 1; pos < PAGE_SIZE / sizeof(*page); pos++) { 978 if (page[pos] != page[0]) 979 return 0; 980 } 981 *value = page[0]; 982 return 1; 983} 984 985static void zswap_fill_page(void *ptr, unsigned long value) 986{ 987 unsigned long *page; 988 989 page = (unsigned long *)ptr; 990 memset_l(page, value, PAGE_SIZE / sizeof(unsigned long)); 991} 992 993/********************************* 994* frontswap hooks 995**********************************/ 996/* attempts to compress and store an single page */ 997static int zswap_frontswap_store(unsigned type, pgoff_t offset, 998 struct page *page) 999{ 1000 struct zswap_tree *tree = zswap_trees[type]; 1001 struct zswap_entry *entry, *dupentry; 1002 struct crypto_comp *tfm; 1003 int ret; 1004 unsigned int hlen, dlen = PAGE_SIZE; 1005 unsigned long handle, value; 1006 char *buf; 1007 u8 *src, *dst; 1008 struct zswap_header zhdr = { .swpentry = swp_entry(type, offset) }; 1009 1010 /* THP isn't supported */ 1011 if (PageTransHuge(page)) { 1012 ret = -EINVAL; 1013 goto reject; 1014 } 1015 1016 if (!zswap_enabled || !tree) { 1017 ret = -ENODEV; 1018 goto reject; 1019 } 1020 1021 /* reclaim space if needed */ 1022 if (zswap_is_full()) { 1023 zswap_pool_limit_hit++; 1024 if (zswap_shrink()) { 1025 zswap_reject_reclaim_fail++; 1026 ret = -ENOMEM; 1027 goto reject; 1028 } 1029 } 1030 1031 /* allocate entry */ 1032 entry = zswap_entry_cache_alloc(GFP_KERNEL); 1033 if (!entry) { 1034 zswap_reject_kmemcache_fail++; 1035 ret = -ENOMEM; 1036 goto reject; 1037 } 1038 1039 if (zswap_same_filled_pages_enabled) { 1040 src = kmap_atomic(page); 1041 if (zswap_is_page_same_filled(src, &value)) { 1042 kunmap_atomic(src); 1043 entry->offset = offset; 1044 entry->length = 0; 1045 entry->value = value; 1046 atomic_inc(&zswap_same_filled_pages); 1047 goto insert_entry; 1048 } 1049 kunmap_atomic(src); 1050 } 1051 1052 /* if entry is successfully added, it keeps the reference */ 1053 entry->pool = zswap_pool_current_get(); 1054 if (!entry->pool) { 1055 ret = -EINVAL; 1056 goto freepage; 1057 } 1058 1059 /* compress */ 1060 dst = get_cpu_var(zswap_dstmem); 1061 tfm = *get_cpu_ptr(entry->pool->tfm); 1062 src = kmap_atomic(page); 1063 ret = crypto_comp_compress(tfm, src, PAGE_SIZE, dst, &dlen); 1064 kunmap_atomic(src); 1065 put_cpu_ptr(entry->pool->tfm); 1066 if (ret) { 1067 ret = -EINVAL; 1068 goto put_dstmem; 1069 } 1070 1071 /* store */ 1072 hlen = zpool_evictable(entry->pool->zpool) ? sizeof(zhdr) : 0; 1073 ret = zpool_malloc(entry->pool->zpool, hlen + dlen, 1074 __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM, 1075 &handle); 1076 if (ret == -ENOSPC) { 1077 zswap_reject_compress_poor++; 1078 goto put_dstmem; 1079 } 1080 if (ret) { 1081 zswap_reject_alloc_fail++; 1082 goto put_dstmem; 1083 } 1084 buf = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_RW); 1085 memcpy(buf, &zhdr, hlen); 1086 memcpy(buf + hlen, dst, dlen); 1087 zpool_unmap_handle(entry->pool->zpool, handle); 1088 put_cpu_var(zswap_dstmem); 1089 1090 /* populate entry */ 1091 entry->offset = offset; 1092 entry->handle = handle; 1093 entry->length = dlen; 1094 1095insert_entry: 1096 /* map */ 1097 spin_lock(&tree->lock); 1098 do { 1099 ret = zswap_rb_insert(&tree->rbroot, entry, &dupentry); 1100 if (ret == -EEXIST) { 1101 zswap_duplicate_entry++; 1102 /* remove from rbtree */ 1103 zswap_rb_erase(&tree->rbroot, dupentry); 1104 zswap_entry_put(tree, dupentry); 1105 } 1106 } while (ret == -EEXIST); 1107 spin_unlock(&tree->lock); 1108 1109 /* update stats */ 1110 atomic_inc(&zswap_stored_pages); 1111 zswap_update_total_size(); 1112 1113 return 0; 1114 1115put_dstmem: 1116 put_cpu_var(zswap_dstmem); 1117 zswap_pool_put(entry->pool); 1118freepage: 1119 zswap_entry_cache_free(entry); 1120reject: 1121 return ret; 1122} 1123 1124/* 1125 * returns 0 if the page was successfully decompressed 1126 * return -1 on entry not found or error 1127*/ 1128static int zswap_frontswap_load(unsigned type, pgoff_t offset, 1129 struct page *page) 1130{ 1131 struct zswap_tree *tree = zswap_trees[type]; 1132 struct zswap_entry *entry; 1133 struct crypto_comp *tfm; 1134 u8 *src, *dst; 1135 unsigned int dlen; 1136 int ret; 1137 1138 /* find */ 1139 spin_lock(&tree->lock); 1140 entry = zswap_entry_find_get(&tree->rbroot, offset); 1141 if (!entry) { 1142 /* entry was written back */ 1143 spin_unlock(&tree->lock); 1144 return -1; 1145 } 1146 spin_unlock(&tree->lock); 1147 1148 if (!entry->length) { 1149 dst = kmap_atomic(page); 1150 zswap_fill_page(dst, entry->value); 1151 kunmap_atomic(dst); 1152 goto freeentry; 1153 } 1154 1155 /* decompress */ 1156 dlen = PAGE_SIZE; 1157 src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO); 1158 if (zpool_evictable(entry->pool->zpool)) 1159 src += sizeof(struct zswap_header); 1160 dst = kmap_atomic(page); 1161 tfm = *get_cpu_ptr(entry->pool->tfm); 1162 ret = crypto_comp_decompress(tfm, src, entry->length, dst, &dlen); 1163 put_cpu_ptr(entry->pool->tfm); 1164 kunmap_atomic(dst); 1165 zpool_unmap_handle(entry->pool->zpool, entry->handle); 1166 BUG_ON(ret); 1167 1168freeentry: 1169 spin_lock(&tree->lock); 1170 zswap_entry_put(tree, entry); 1171 spin_unlock(&tree->lock); 1172 1173 return 0; 1174} 1175 1176/* frees an entry in zswap */ 1177static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset) 1178{ 1179 struct zswap_tree *tree = zswap_trees[type]; 1180 struct zswap_entry *entry; 1181 1182 /* find */ 1183 spin_lock(&tree->lock); 1184 entry = zswap_rb_search(&tree->rbroot, offset); 1185 if (!entry) { 1186 /* entry was written back */ 1187 spin_unlock(&tree->lock); 1188 return; 1189 } 1190 1191 /* remove from rbtree */ 1192 zswap_rb_erase(&tree->rbroot, entry); 1193 1194 /* drop the initial reference from entry creation */ 1195 zswap_entry_put(tree, entry); 1196 1197 spin_unlock(&tree->lock); 1198} 1199 1200/* frees all zswap entries for the given swap type */ 1201static void zswap_frontswap_invalidate_area(unsigned type) 1202{ 1203 struct zswap_tree *tree = zswap_trees[type]; 1204 struct zswap_entry *entry, *n; 1205 1206 if (!tree) 1207 return; 1208 1209 /* walk the tree and free everything */ 1210 spin_lock(&tree->lock); 1211 rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) 1212 zswap_free_entry(entry); 1213 tree->rbroot = RB_ROOT; 1214 spin_unlock(&tree->lock); 1215 kfree(tree); 1216 zswap_trees[type] = NULL; 1217} 1218 1219static void zswap_frontswap_init(unsigned type) 1220{ 1221 struct zswap_tree *tree; 1222 1223 tree = kzalloc(sizeof(*tree), GFP_KERNEL); 1224 if (!tree) { 1225 pr_err("alloc failed, zswap disabled for swap type %d\n", type); 1226 return; 1227 } 1228 1229 tree->rbroot = RB_ROOT; 1230 spin_lock_init(&tree->lock); 1231 zswap_trees[type] = tree; 1232} 1233 1234static struct frontswap_ops zswap_frontswap_ops = { 1235 .store = zswap_frontswap_store, 1236 .load = zswap_frontswap_load, 1237 .invalidate_page = zswap_frontswap_invalidate_page, 1238 .invalidate_area = zswap_frontswap_invalidate_area, 1239 .init = zswap_frontswap_init 1240}; 1241 1242/********************************* 1243* debugfs functions 1244**********************************/ 1245#ifdef CONFIG_DEBUG_FS 1246#include <linux/debugfs.h> 1247 1248static struct dentry *zswap_debugfs_root; 1249 1250static int __init zswap_debugfs_init(void) 1251{ 1252 if (!debugfs_initialized()) 1253 return -ENODEV; 1254 1255 zswap_debugfs_root = debugfs_create_dir("zswap", NULL); 1256 if (!zswap_debugfs_root) 1257 return -ENOMEM; 1258 1259 debugfs_create_u64("pool_limit_hit", S_IRUGO, 1260 zswap_debugfs_root, &zswap_pool_limit_hit); 1261 debugfs_create_u64("reject_reclaim_fail", S_IRUGO, 1262 zswap_debugfs_root, &zswap_reject_reclaim_fail); 1263 debugfs_create_u64("reject_alloc_fail", S_IRUGO, 1264 zswap_debugfs_root, &zswap_reject_alloc_fail); 1265 debugfs_create_u64("reject_kmemcache_fail", S_IRUGO, 1266 zswap_debugfs_root, &zswap_reject_kmemcache_fail); 1267 debugfs_create_u64("reject_compress_poor", S_IRUGO, 1268 zswap_debugfs_root, &zswap_reject_compress_poor); 1269 debugfs_create_u64("written_back_pages", S_IRUGO, 1270 zswap_debugfs_root, &zswap_written_back_pages); 1271 debugfs_create_u64("duplicate_entry", S_IRUGO, 1272 zswap_debugfs_root, &zswap_duplicate_entry); 1273 debugfs_create_u64("pool_total_size", S_IRUGO, 1274 zswap_debugfs_root, &zswap_pool_total_size); 1275 debugfs_create_atomic_t("stored_pages", S_IRUGO, 1276 zswap_debugfs_root, &zswap_stored_pages); 1277 debugfs_create_atomic_t("same_filled_pages", 0444, 1278 zswap_debugfs_root, &zswap_same_filled_pages); 1279 1280 return 0; 1281} 1282 1283static void __exit zswap_debugfs_exit(void) 1284{ 1285 debugfs_remove_recursive(zswap_debugfs_root); 1286} 1287#else 1288static int __init zswap_debugfs_init(void) 1289{ 1290 return 0; 1291} 1292 1293static void __exit zswap_debugfs_exit(void) { } 1294#endif 1295 1296/********************************* 1297* module init and exit 1298**********************************/ 1299static int __init init_zswap(void) 1300{ 1301 struct zswap_pool *pool; 1302 int ret; 1303 1304 zswap_init_started = true; 1305 1306 if (zswap_entry_cache_create()) { 1307 pr_err("entry cache creation failed\n"); 1308 goto cache_fail; 1309 } 1310 1311 ret = cpuhp_setup_state(CPUHP_MM_ZSWP_MEM_PREPARE, "mm/zswap:prepare", 1312 zswap_dstmem_prepare, zswap_dstmem_dead); 1313 if (ret) { 1314 pr_err("dstmem alloc failed\n"); 1315 goto dstmem_fail; 1316 } 1317 1318 ret = cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE, 1319 "mm/zswap_pool:prepare", 1320 zswap_cpu_comp_prepare, 1321 zswap_cpu_comp_dead); 1322 if (ret) 1323 goto hp_fail; 1324 1325 pool = __zswap_pool_create_fallback(); 1326 if (pool) { 1327 pr_info("loaded using pool %s/%s\n", pool->tfm_name, 1328 zpool_get_type(pool->zpool)); 1329 list_add(&pool->list, &zswap_pools); 1330 zswap_has_pool = true; 1331 } else { 1332 pr_err("pool creation failed\n"); 1333 zswap_enabled = false; 1334 } 1335 1336 frontswap_register_ops(&zswap_frontswap_ops); 1337 if (zswap_debugfs_init()) 1338 pr_warn("debugfs initialization failed\n"); 1339 return 0; 1340 1341hp_fail: 1342 cpuhp_remove_state(CPUHP_MM_ZSWP_MEM_PREPARE); 1343dstmem_fail: 1344 zswap_entry_cache_destroy(); 1345cache_fail: 1346 /* if built-in, we aren't unloaded on failure; don't allow use */ 1347 zswap_init_failed = true; 1348 zswap_enabled = false; 1349 return -ENOMEM; 1350} 1351/* must be late so crypto has time to come up */ 1352late_initcall(init_zswap); 1353 1354MODULE_LICENSE("GPL"); 1355MODULE_AUTHOR("Seth Jennings <sjennings@variantweb.net>"); 1356MODULE_DESCRIPTION("Compressed cache for swap pages");