Linux kernel mirror, git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
drivers/staging/zcache/zcache.c at v3.1-rc10 (2003 lines, 55 kB)
1/* 2 * zcache.c 3 * 4 * Copyright (c) 2010,2011, Dan Magenheimer, Oracle Corp. 5 * Copyright (c) 2010,2011, Nitin Gupta 6 * 7 * Zcache provides an in-kernel "host implementation" for transcendent memory 8 * and, thus indirectly, for cleancache and frontswap. Zcache includes two 9 * page-accessible memory [1] interfaces, both utilizing lzo1x compression: 10 * 1) "compression buddies" ("zbud") is used for ephemeral pages 11 * 2) xvmalloc is used for persistent pages. 12 * Xvmalloc (based on the TLSF allocator) has very low fragmentation 13 * so maximizes space efficiency, while zbud allows pairs (and potentially, 14 * in the future, more than a pair of) compressed pages to be closely linked 15 * so that reclaiming can be done via the kernel's physical-page-oriented 16 * "shrinker" interface. 17 * 18 * [1] For a definition of page-accessible memory (aka PAM), see: 19 * http://marc.info/?l=linux-mm&m=127811271605009 20 */ 21 22#include <linux/module.h> 23#include <linux/cpu.h> 24#include <linux/highmem.h> 25#include <linux/list.h> 26#include <linux/lzo.h> 27#include <linux/slab.h> 28#include <linux/spinlock.h> 29#include <linux/types.h> 30#include <linux/atomic.h> 31#include <linux/math64.h> 32#include "tmem.h" 33 34#include "../zram/xvmalloc.h" /* if built in drivers/staging */ 35 36#if (!defined(CONFIG_CLEANCACHE) && !defined(CONFIG_FRONTSWAP)) 37#error "zcache is useless without CONFIG_CLEANCACHE or CONFIG_FRONTSWAP" 38#endif 39#ifdef CONFIG_CLEANCACHE 40#include <linux/cleancache.h> 41#endif 42#ifdef CONFIG_FRONTSWAP 43#include <linux/frontswap.h> 44#endif 45 46#if 0 47/* this is more aggressive but may cause other problems? */ 48#define ZCACHE_GFP_MASK (GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN) 49#else 50#define ZCACHE_GFP_MASK \ 51 (__GFP_FS | __GFP_NORETRY | __GFP_NOWARN | __GFP_NOMEMALLOC) 52#endif 53 54#define MAX_POOLS_PER_CLIENT 16 55 56#define MAX_CLIENTS 16 57#define LOCAL_CLIENT ((uint16_t)-1) 58 59MODULE_LICENSE("GPL"); 60 61struct zcache_client { 62 struct tmem_pool *tmem_pools[MAX_POOLS_PER_CLIENT]; 63 struct xv_pool *xvpool; 64 bool allocated; 65 atomic_t refcount; 66}; 67 68static struct zcache_client zcache_host; 69static struct zcache_client zcache_clients[MAX_CLIENTS]; 70 71static inline uint16_t get_client_id_from_client(struct zcache_client *cli) 72{ 73 BUG_ON(cli == NULL); 74 if (cli == &zcache_host) 75 return LOCAL_CLIENT; 76 return cli - &zcache_clients[0]; 77} 78 79static inline bool is_local_client(struct zcache_client *cli) 80{ 81 return cli == &zcache_host; 82} 83 84/********** 85 * Compression buddies ("zbud") provides for packing two (or, possibly 86 * in the future, more) compressed ephemeral pages into a single "raw" 87 * (physical) page and tracking them with data structures so that 88 * the raw pages can be easily reclaimed. 89 * 90 * A zbud page ("zbpg") is an aligned page containing a list_head, 91 * a lock, and two "zbud headers". The remainder of the physical 92 * page is divided up into aligned 64-byte "chunks" which contain 93 * the compressed data for zero, one, or two zbuds. Each zbpg 94 * resides on: (1) an "unused list" if it has no zbuds; (2) a 95 * "buddied" list if it is fully populated with two zbuds; or 96 * (3) one of PAGE_SIZE/64 "unbuddied" lists indexed by how many chunks 97 * the one unbuddied zbud uses. The data inside a zbpg cannot be 98 * read or written unless the zbpg's lock is held. 
99 */ 100 101#define ZBH_SENTINEL 0x43214321 102#define ZBPG_SENTINEL 0xdeadbeef 103 104#define ZBUD_MAX_BUDS 2 105 106struct zbud_hdr { 107 uint16_t client_id; 108 uint16_t pool_id; 109 struct tmem_oid oid; 110 uint32_t index; 111 uint16_t size; /* compressed size in bytes, zero means unused */ 112 DECL_SENTINEL 113}; 114 115struct zbud_page { 116 struct list_head bud_list; 117 spinlock_t lock; 118 struct zbud_hdr buddy[ZBUD_MAX_BUDS]; 119 DECL_SENTINEL 120 /* followed by NUM_CHUNK aligned CHUNK_SIZE-byte chunks */ 121}; 122 123#define CHUNK_SHIFT 6 124#define CHUNK_SIZE (1 << CHUNK_SHIFT) 125#define CHUNK_MASK (~(CHUNK_SIZE-1)) 126#define NCHUNKS (((PAGE_SIZE - sizeof(struct zbud_page)) & \ 127 CHUNK_MASK) >> CHUNK_SHIFT) 128#define MAX_CHUNK (NCHUNKS-1) 129 130static struct { 131 struct list_head list; 132 unsigned count; 133} zbud_unbuddied[NCHUNKS]; 134/* list N contains pages with N chunks USED and NCHUNKS-N unused */ 135/* element 0 is never used but optimizing that isn't worth it */ 136static unsigned long zbud_cumul_chunk_counts[NCHUNKS]; 137 138struct list_head zbud_buddied_list; 139static unsigned long zcache_zbud_buddied_count; 140 141/* protects the buddied list and all unbuddied lists */ 142static DEFINE_SPINLOCK(zbud_budlists_spinlock); 143 144static LIST_HEAD(zbpg_unused_list); 145static unsigned long zcache_zbpg_unused_list_count; 146 147/* protects the unused page list */ 148static DEFINE_SPINLOCK(zbpg_unused_list_spinlock); 149 150static atomic_t zcache_zbud_curr_raw_pages; 151static atomic_t zcache_zbud_curr_zpages; 152static unsigned long zcache_zbud_curr_zbytes; 153static unsigned long zcache_zbud_cumul_zpages; 154static unsigned long zcache_zbud_cumul_zbytes; 155static unsigned long zcache_compress_poor; 156static unsigned long zcache_mean_compress_poor; 157 158/* forward references */ 159static void *zcache_get_free_page(void); 160static void zcache_free_page(void *p); 161 162/* 163 * zbud helper functions 164 */ 165 166static inline unsigned zbud_max_buddy_size(void) 167{ 168 return MAX_CHUNK << CHUNK_SHIFT; 169} 170 171static inline unsigned zbud_size_to_chunks(unsigned size) 172{ 173 BUG_ON(size == 0 || size > zbud_max_buddy_size()); 174 return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT; 175} 176 177static inline int zbud_budnum(struct zbud_hdr *zh) 178{ 179 unsigned offset = (unsigned long)zh & (PAGE_SIZE - 1); 180 struct zbud_page *zbpg = NULL; 181 unsigned budnum = -1U; 182 int i; 183 184 for (i = 0; i < ZBUD_MAX_BUDS; i++) 185 if (offset == offsetof(typeof(*zbpg), buddy[i])) { 186 budnum = i; 187 break; 188 } 189 BUG_ON(budnum == -1U); 190 return budnum; 191} 192 193static char *zbud_data(struct zbud_hdr *zh, unsigned size) 194{ 195 struct zbud_page *zbpg; 196 char *p; 197 unsigned budnum; 198 199 ASSERT_SENTINEL(zh, ZBH); 200 budnum = zbud_budnum(zh); 201 BUG_ON(size == 0 || size > zbud_max_buddy_size()); 202 zbpg = container_of(zh, struct zbud_page, buddy[budnum]); 203 ASSERT_SPINLOCK(&zbpg->lock); 204 p = (char *)zbpg; 205 if (budnum == 0) 206 p += ((sizeof(struct zbud_page) + CHUNK_SIZE - 1) & 207 CHUNK_MASK); 208 else if (budnum == 1) 209 p += PAGE_SIZE - ((size + CHUNK_SIZE - 1) & CHUNK_MASK); 210 return p; 211} 212 213/* 214 * zbud raw page management 215 */ 216 217static struct zbud_page *zbud_alloc_raw_page(void) 218{ 219 struct zbud_page *zbpg = NULL; 220 struct zbud_hdr *zh0, *zh1; 221 bool recycled = 0; 222 223 /* if any pages on the zbpg list, use one */ 224 spin_lock(&zbpg_unused_list_spinlock); 225 if (!list_empty(&zbpg_unused_list)) { 226 zbpg = 
list_first_entry(&zbpg_unused_list, 227 struct zbud_page, bud_list); 228 list_del_init(&zbpg->bud_list); 229 zcache_zbpg_unused_list_count--; 230 recycled = 1; 231 } 232 spin_unlock(&zbpg_unused_list_spinlock); 233 if (zbpg == NULL) 234 /* none on zbpg list, try to get a kernel page */ 235 zbpg = zcache_get_free_page(); 236 if (likely(zbpg != NULL)) { 237 INIT_LIST_HEAD(&zbpg->bud_list); 238 zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1]; 239 spin_lock_init(&zbpg->lock); 240 if (recycled) { 241 ASSERT_INVERTED_SENTINEL(zbpg, ZBPG); 242 SET_SENTINEL(zbpg, ZBPG); 243 BUG_ON(zh0->size != 0 || tmem_oid_valid(&zh0->oid)); 244 BUG_ON(zh1->size != 0 || tmem_oid_valid(&zh1->oid)); 245 } else { 246 atomic_inc(&zcache_zbud_curr_raw_pages); 247 INIT_LIST_HEAD(&zbpg->bud_list); 248 SET_SENTINEL(zbpg, ZBPG); 249 zh0->size = 0; zh1->size = 0; 250 tmem_oid_set_invalid(&zh0->oid); 251 tmem_oid_set_invalid(&zh1->oid); 252 } 253 } 254 return zbpg; 255} 256 257static void zbud_free_raw_page(struct zbud_page *zbpg) 258{ 259 struct zbud_hdr *zh0 = &zbpg->buddy[0], *zh1 = &zbpg->buddy[1]; 260 261 ASSERT_SENTINEL(zbpg, ZBPG); 262 BUG_ON(!list_empty(&zbpg->bud_list)); 263 ASSERT_SPINLOCK(&zbpg->lock); 264 BUG_ON(zh0->size != 0 || tmem_oid_valid(&zh0->oid)); 265 BUG_ON(zh1->size != 0 || tmem_oid_valid(&zh1->oid)); 266 INVERT_SENTINEL(zbpg, ZBPG); 267 spin_unlock(&zbpg->lock); 268 spin_lock(&zbpg_unused_list_spinlock); 269 list_add(&zbpg->bud_list, &zbpg_unused_list); 270 zcache_zbpg_unused_list_count++; 271 spin_unlock(&zbpg_unused_list_spinlock); 272} 273 274/* 275 * core zbud handling routines 276 */ 277 278static unsigned zbud_free(struct zbud_hdr *zh) 279{ 280 unsigned size; 281 282 ASSERT_SENTINEL(zh, ZBH); 283 BUG_ON(!tmem_oid_valid(&zh->oid)); 284 size = zh->size; 285 BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size()); 286 zh->size = 0; 287 tmem_oid_set_invalid(&zh->oid); 288 INVERT_SENTINEL(zh, ZBH); 289 zcache_zbud_curr_zbytes -= size; 290 atomic_dec(&zcache_zbud_curr_zpages); 291 return size; 292} 293 294static void zbud_free_and_delist(struct zbud_hdr *zh) 295{ 296 unsigned chunks; 297 struct zbud_hdr *zh_other; 298 unsigned budnum = zbud_budnum(zh), size; 299 struct zbud_page *zbpg = 300 container_of(zh, struct zbud_page, buddy[budnum]); 301 302 spin_lock(&zbpg->lock); 303 if (list_empty(&zbpg->bud_list)) { 304 /* ignore zombie page... see zbud_evict_pages() */ 305 spin_unlock(&zbpg->lock); 306 return; 307 } 308 size = zbud_free(zh); 309 ASSERT_SPINLOCK(&zbpg->lock); 310 zh_other = &zbpg->buddy[(budnum == 0) ? 
1 : 0]; 311 if (zh_other->size == 0) { /* was unbuddied: unlist and free */ 312 chunks = zbud_size_to_chunks(size) ; 313 spin_lock(&zbud_budlists_spinlock); 314 BUG_ON(list_empty(&zbud_unbuddied[chunks].list)); 315 list_del_init(&zbpg->bud_list); 316 zbud_unbuddied[chunks].count--; 317 spin_unlock(&zbud_budlists_spinlock); 318 zbud_free_raw_page(zbpg); 319 } else { /* was buddied: move remaining buddy to unbuddied list */ 320 chunks = zbud_size_to_chunks(zh_other->size) ; 321 spin_lock(&zbud_budlists_spinlock); 322 list_del_init(&zbpg->bud_list); 323 zcache_zbud_buddied_count--; 324 list_add_tail(&zbpg->bud_list, &zbud_unbuddied[chunks].list); 325 zbud_unbuddied[chunks].count++; 326 spin_unlock(&zbud_budlists_spinlock); 327 spin_unlock(&zbpg->lock); 328 } 329} 330 331static struct zbud_hdr *zbud_create(uint16_t client_id, uint16_t pool_id, 332 struct tmem_oid *oid, 333 uint32_t index, struct page *page, 334 void *cdata, unsigned size) 335{ 336 struct zbud_hdr *zh0, *zh1, *zh = NULL; 337 struct zbud_page *zbpg = NULL, *ztmp; 338 unsigned nchunks; 339 char *to; 340 int i, found_good_buddy = 0; 341 342 nchunks = zbud_size_to_chunks(size) ; 343 for (i = MAX_CHUNK - nchunks + 1; i > 0; i--) { 344 spin_lock(&zbud_budlists_spinlock); 345 if (!list_empty(&zbud_unbuddied[i].list)) { 346 list_for_each_entry_safe(zbpg, ztmp, 347 &zbud_unbuddied[i].list, bud_list) { 348 if (spin_trylock(&zbpg->lock)) { 349 found_good_buddy = i; 350 goto found_unbuddied; 351 } 352 } 353 } 354 spin_unlock(&zbud_budlists_spinlock); 355 } 356 /* didn't find a good buddy, try allocating a new page */ 357 zbpg = zbud_alloc_raw_page(); 358 if (unlikely(zbpg == NULL)) 359 goto out; 360 /* ok, have a page, now compress the data before taking locks */ 361 spin_lock(&zbpg->lock); 362 spin_lock(&zbud_budlists_spinlock); 363 list_add_tail(&zbpg->bud_list, &zbud_unbuddied[nchunks].list); 364 zbud_unbuddied[nchunks].count++; 365 zh = &zbpg->buddy[0]; 366 goto init_zh; 367 368found_unbuddied: 369 ASSERT_SPINLOCK(&zbpg->lock); 370 zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1]; 371 BUG_ON(!((zh0->size == 0) ^ (zh1->size == 0))); 372 if (zh0->size != 0) { /* buddy0 in use, buddy1 is vacant */ 373 ASSERT_SENTINEL(zh0, ZBH); 374 zh = zh1; 375 } else if (zh1->size != 0) { /* buddy1 in use, buddy0 is vacant */ 376 ASSERT_SENTINEL(zh1, ZBH); 377 zh = zh0; 378 } else 379 BUG(); 380 list_del_init(&zbpg->bud_list); 381 zbud_unbuddied[found_good_buddy].count--; 382 list_add_tail(&zbpg->bud_list, &zbud_buddied_list); 383 zcache_zbud_buddied_count++; 384 385init_zh: 386 SET_SENTINEL(zh, ZBH); 387 zh->size = size; 388 zh->index = index; 389 zh->oid = *oid; 390 zh->pool_id = pool_id; 391 zh->client_id = client_id; 392 /* can wait to copy the data until the list locks are dropped */ 393 spin_unlock(&zbud_budlists_spinlock); 394 395 to = zbud_data(zh, size); 396 memcpy(to, cdata, size); 397 spin_unlock(&zbpg->lock); 398 zbud_cumul_chunk_counts[nchunks]++; 399 atomic_inc(&zcache_zbud_curr_zpages); 400 zcache_zbud_cumul_zpages++; 401 zcache_zbud_curr_zbytes += size; 402 zcache_zbud_cumul_zbytes += size; 403out: 404 return zh; 405} 406 407static int zbud_decompress(struct page *page, struct zbud_hdr *zh) 408{ 409 struct zbud_page *zbpg; 410 unsigned budnum = zbud_budnum(zh); 411 size_t out_len = PAGE_SIZE; 412 char *to_va, *from_va; 413 unsigned size; 414 int ret = 0; 415 416 zbpg = container_of(zh, struct zbud_page, buddy[budnum]); 417 spin_lock(&zbpg->lock); 418 if (list_empty(&zbpg->bud_list)) { 419 /* ignore zombie page... 
see zbud_evict_pages() */ 420 ret = -EINVAL; 421 goto out; 422 } 423 ASSERT_SENTINEL(zh, ZBH); 424 BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size()); 425 to_va = kmap_atomic(page, KM_USER0); 426 size = zh->size; 427 from_va = zbud_data(zh, size); 428 ret = lzo1x_decompress_safe(from_va, size, to_va, &out_len); 429 BUG_ON(ret != LZO_E_OK); 430 BUG_ON(out_len != PAGE_SIZE); 431 kunmap_atomic(to_va, KM_USER0); 432out: 433 spin_unlock(&zbpg->lock); 434 return ret; 435} 436 437/* 438 * The following routines handle shrinking of ephemeral pages by evicting 439 * pages "least valuable" first. 440 */ 441 442static unsigned long zcache_evicted_raw_pages; 443static unsigned long zcache_evicted_buddied_pages; 444static unsigned long zcache_evicted_unbuddied_pages; 445 446static struct tmem_pool *zcache_get_pool_by_id(uint16_t cli_id, 447 uint16_t poolid); 448static void zcache_put_pool(struct tmem_pool *pool); 449 450/* 451 * Flush and free all zbuds in a zbpg, then free the pageframe 452 */ 453static void zbud_evict_zbpg(struct zbud_page *zbpg) 454{ 455 struct zbud_hdr *zh; 456 int i, j; 457 uint32_t pool_id[ZBUD_MAX_BUDS], client_id[ZBUD_MAX_BUDS]; 458 uint32_t index[ZBUD_MAX_BUDS]; 459 struct tmem_oid oid[ZBUD_MAX_BUDS]; 460 struct tmem_pool *pool; 461 462 ASSERT_SPINLOCK(&zbpg->lock); 463 BUG_ON(!list_empty(&zbpg->bud_list)); 464 for (i = 0, j = 0; i < ZBUD_MAX_BUDS; i++) { 465 zh = &zbpg->buddy[i]; 466 if (zh->size) { 467 client_id[j] = zh->client_id; 468 pool_id[j] = zh->pool_id; 469 oid[j] = zh->oid; 470 index[j] = zh->index; 471 j++; 472 zbud_free(zh); 473 } 474 } 475 spin_unlock(&zbpg->lock); 476 for (i = 0; i < j; i++) { 477 pool = zcache_get_pool_by_id(client_id[i], pool_id[i]); 478 if (pool != NULL) { 479 tmem_flush_page(pool, &oid[i], index[i]); 480 zcache_put_pool(pool); 481 } 482 } 483 ASSERT_SENTINEL(zbpg, ZBPG); 484 spin_lock(&zbpg->lock); 485 zbud_free_raw_page(zbpg); 486} 487 488/* 489 * Free nr pages. This code is funky because we want to hold the locks 490 * protecting various lists for as short a time as possible, and in some 491 * circumstances the list may change asynchronously when the list lock is 492 * not held. In some cases we also trylock not only to avoid waiting on a 493 * page in use by another cpu, but also to avoid potential deadlock due to 494 * lock inversion. 
495 */ 496static void zbud_evict_pages(int nr) 497{ 498 struct zbud_page *zbpg; 499 int i; 500 501 /* first try freeing any pages on unused list */ 502retry_unused_list: 503 spin_lock_bh(&zbpg_unused_list_spinlock); 504 if (!list_empty(&zbpg_unused_list)) { 505 /* can't walk list here, since it may change when unlocked */ 506 zbpg = list_first_entry(&zbpg_unused_list, 507 struct zbud_page, bud_list); 508 list_del_init(&zbpg->bud_list); 509 zcache_zbpg_unused_list_count--; 510 atomic_dec(&zcache_zbud_curr_raw_pages); 511 spin_unlock_bh(&zbpg_unused_list_spinlock); 512 zcache_free_page(zbpg); 513 zcache_evicted_raw_pages++; 514 if (--nr <= 0) 515 goto out; 516 goto retry_unused_list; 517 } 518 spin_unlock_bh(&zbpg_unused_list_spinlock); 519 520 /* now try freeing unbuddied pages, starting with least space avail */ 521 for (i = 0; i < MAX_CHUNK; i++) { 522retry_unbud_list_i: 523 spin_lock_bh(&zbud_budlists_spinlock); 524 if (list_empty(&zbud_unbuddied[i].list)) { 525 spin_unlock_bh(&zbud_budlists_spinlock); 526 continue; 527 } 528 list_for_each_entry(zbpg, &zbud_unbuddied[i].list, bud_list) { 529 if (unlikely(!spin_trylock(&zbpg->lock))) 530 continue; 531 list_del_init(&zbpg->bud_list); 532 zbud_unbuddied[i].count--; 533 spin_unlock(&zbud_budlists_spinlock); 534 zcache_evicted_unbuddied_pages++; 535 /* want budlists unlocked when doing zbpg eviction */ 536 zbud_evict_zbpg(zbpg); 537 local_bh_enable(); 538 if (--nr <= 0) 539 goto out; 540 goto retry_unbud_list_i; 541 } 542 spin_unlock_bh(&zbud_budlists_spinlock); 543 } 544 545 /* as a last resort, free buddied pages */ 546retry_bud_list: 547 spin_lock_bh(&zbud_budlists_spinlock); 548 if (list_empty(&zbud_buddied_list)) { 549 spin_unlock_bh(&zbud_budlists_spinlock); 550 goto out; 551 } 552 list_for_each_entry(zbpg, &zbud_buddied_list, bud_list) { 553 if (unlikely(!spin_trylock(&zbpg->lock))) 554 continue; 555 list_del_init(&zbpg->bud_list); 556 zcache_zbud_buddied_count--; 557 spin_unlock(&zbud_budlists_spinlock); 558 zcache_evicted_buddied_pages++; 559 /* want budlists unlocked when doing zbpg eviction */ 560 zbud_evict_zbpg(zbpg); 561 local_bh_enable(); 562 if (--nr <= 0) 563 goto out; 564 goto retry_bud_list; 565 } 566 spin_unlock_bh(&zbud_budlists_spinlock); 567out: 568 return; 569} 570 571static void zbud_init(void) 572{ 573 int i; 574 575 INIT_LIST_HEAD(&zbud_buddied_list); 576 zcache_zbud_buddied_count = 0; 577 for (i = 0; i < NCHUNKS; i++) { 578 INIT_LIST_HEAD(&zbud_unbuddied[i].list); 579 zbud_unbuddied[i].count = 0; 580 } 581} 582 583#ifdef CONFIG_SYSFS 584/* 585 * These sysfs routines show a nice distribution of how many zbpg's are 586 * currently (and have ever been placed) in each unbuddied list. It's fun 587 * to watch but can probably go away before final merge. 
588 */ 589static int zbud_show_unbuddied_list_counts(char *buf) 590{ 591 int i; 592 char *p = buf; 593 594 for (i = 0; i < NCHUNKS; i++) 595 p += sprintf(p, "%u ", zbud_unbuddied[i].count); 596 return p - buf; 597} 598 599static int zbud_show_cumul_chunk_counts(char *buf) 600{ 601 unsigned long i, chunks = 0, total_chunks = 0, sum_total_chunks = 0; 602 unsigned long total_chunks_lte_21 = 0, total_chunks_lte_32 = 0; 603 unsigned long total_chunks_lte_42 = 0; 604 char *p = buf; 605 606 for (i = 0; i < NCHUNKS; i++) { 607 p += sprintf(p, "%lu ", zbud_cumul_chunk_counts[i]); 608 chunks += zbud_cumul_chunk_counts[i]; 609 total_chunks += zbud_cumul_chunk_counts[i]; 610 sum_total_chunks += i * zbud_cumul_chunk_counts[i]; 611 if (i == 21) 612 total_chunks_lte_21 = total_chunks; 613 if (i == 32) 614 total_chunks_lte_32 = total_chunks; 615 if (i == 42) 616 total_chunks_lte_42 = total_chunks; 617 } 618 p += sprintf(p, "<=21:%lu <=32:%lu <=42:%lu, mean:%lu\n", 619 total_chunks_lte_21, total_chunks_lte_32, total_chunks_lte_42, 620 chunks == 0 ? 0 : sum_total_chunks / chunks); 621 return p - buf; 622} 623#endif 624 625/********** 626 * This "zv" PAM implementation combines the TLSF-based xvMalloc 627 * with lzo1x compression to maximize the amount of data that can 628 * be packed into a physical page. 629 * 630 * Zv represents a PAM page with the index and object (plus a "size" value 631 * necessary for decompression) immediately preceding the compressed data. 632 */ 633 634#define ZVH_SENTINEL 0x43214321 635 636struct zv_hdr { 637 uint32_t pool_id; 638 struct tmem_oid oid; 639 uint32_t index; 640 DECL_SENTINEL 641}; 642 643/* rudimentary policy limits */ 644/* total number of persistent pages may not exceed this percentage */ 645static unsigned int zv_page_count_policy_percent = 75; 646/* 647 * byte count defining poor compression; pages with greater zsize will be 648 * rejected 649 */ 650static unsigned int zv_max_zsize = (PAGE_SIZE / 8) * 7; 651/* 652 * byte count defining poor *mean* compression; pages with greater zsize 653 * will be rejected until sufficient better-compressed pages are accepted 654 * driving the man below this threshold 655 */ 656static unsigned int zv_max_mean_zsize = (PAGE_SIZE / 8) * 5; 657 658static unsigned long zv_curr_dist_counts[NCHUNKS]; 659static unsigned long zv_cumul_dist_counts[NCHUNKS]; 660 661static struct zv_hdr *zv_create(struct xv_pool *xvpool, uint32_t pool_id, 662 struct tmem_oid *oid, uint32_t index, 663 void *cdata, unsigned clen) 664{ 665 struct page *page; 666 struct zv_hdr *zv = NULL; 667 uint32_t offset; 668 int alloc_size = clen + sizeof(struct zv_hdr); 669 int chunks = (alloc_size + (CHUNK_SIZE - 1)) >> CHUNK_SHIFT; 670 int ret; 671 672 BUG_ON(!irqs_disabled()); 673 BUG_ON(chunks >= NCHUNKS); 674 ret = xv_malloc(xvpool, alloc_size, 675 &page, &offset, ZCACHE_GFP_MASK); 676 if (unlikely(ret)) 677 goto out; 678 zv_curr_dist_counts[chunks]++; 679 zv_cumul_dist_counts[chunks]++; 680 zv = kmap_atomic(page, KM_USER0) + offset; 681 zv->index = index; 682 zv->oid = *oid; 683 zv->pool_id = pool_id; 684 SET_SENTINEL(zv, ZVH); 685 memcpy((char *)zv + sizeof(struct zv_hdr), cdata, clen); 686 kunmap_atomic(zv, KM_USER0); 687out: 688 return zv; 689} 690 691static void zv_free(struct xv_pool *xvpool, struct zv_hdr *zv) 692{ 693 unsigned long flags; 694 struct page *page; 695 uint32_t offset; 696 uint16_t size = xv_get_object_size(zv); 697 int chunks = (size + (CHUNK_SIZE - 1)) >> CHUNK_SHIFT; 698 699 ASSERT_SENTINEL(zv, ZVH); 700 BUG_ON(chunks >= NCHUNKS); 701 
zv_curr_dist_counts[chunks]--; 702 size -= sizeof(*zv); 703 BUG_ON(size == 0); 704 INVERT_SENTINEL(zv, ZVH); 705 page = virt_to_page(zv); 706 offset = (unsigned long)zv & ~PAGE_MASK; 707 local_irq_save(flags); 708 xv_free(xvpool, page, offset); 709 local_irq_restore(flags); 710} 711 712static void zv_decompress(struct page *page, struct zv_hdr *zv) 713{ 714 size_t clen = PAGE_SIZE; 715 char *to_va; 716 unsigned size; 717 int ret; 718 719 ASSERT_SENTINEL(zv, ZVH); 720 size = xv_get_object_size(zv) - sizeof(*zv); 721 BUG_ON(size == 0); 722 to_va = kmap_atomic(page, KM_USER0); 723 ret = lzo1x_decompress_safe((char *)zv + sizeof(*zv), 724 size, to_va, &clen); 725 kunmap_atomic(to_va, KM_USER0); 726 BUG_ON(ret != LZO_E_OK); 727 BUG_ON(clen != PAGE_SIZE); 728} 729 730#ifdef CONFIG_SYSFS 731/* 732 * show a distribution of compression stats for zv pages. 733 */ 734 735static int zv_curr_dist_counts_show(char *buf) 736{ 737 unsigned long i, n, chunks = 0, sum_total_chunks = 0; 738 char *p = buf; 739 740 for (i = 0; i < NCHUNKS; i++) { 741 n = zv_curr_dist_counts[i]; 742 p += sprintf(p, "%lu ", n); 743 chunks += n; 744 sum_total_chunks += i * n; 745 } 746 p += sprintf(p, "mean:%lu\n", 747 chunks == 0 ? 0 : sum_total_chunks / chunks); 748 return p - buf; 749} 750 751static int zv_cumul_dist_counts_show(char *buf) 752{ 753 unsigned long i, n, chunks = 0, sum_total_chunks = 0; 754 char *p = buf; 755 756 for (i = 0; i < NCHUNKS; i++) { 757 n = zv_cumul_dist_counts[i]; 758 p += sprintf(p, "%lu ", n); 759 chunks += n; 760 sum_total_chunks += i * n; 761 } 762 p += sprintf(p, "mean:%lu\n", 763 chunks == 0 ? 0 : sum_total_chunks / chunks); 764 return p - buf; 765} 766 767/* 768 * setting zv_max_zsize via sysfs causes all persistent (e.g. swap) 769 * pages that don't compress to less than this value (including metadata 770 * overhead) to be rejected. We don't allow the value to get too close 771 * to PAGE_SIZE. 772 */ 773static ssize_t zv_max_zsize_show(struct kobject *kobj, 774 struct kobj_attribute *attr, 775 char *buf) 776{ 777 return sprintf(buf, "%u\n", zv_max_zsize); 778} 779 780static ssize_t zv_max_zsize_store(struct kobject *kobj, 781 struct kobj_attribute *attr, 782 const char *buf, size_t count) 783{ 784 unsigned long val; 785 int err; 786 787 if (!capable(CAP_SYS_ADMIN)) 788 return -EPERM; 789 790 err = strict_strtoul(buf, 10, &val); 791 if (err || (val == 0) || (val > (PAGE_SIZE / 8) * 7)) 792 return -EINVAL; 793 zv_max_zsize = val; 794 return count; 795} 796 797/* 798 * setting zv_max_mean_zsize via sysfs causes all persistent (e.g. swap) 799 * pages that don't compress to less than this value (including metadata 800 * overhead) to be rejected UNLESS the mean compression is also smaller 801 * than this value. In other words, we are load-balancing-by-zsize the 802 * accepted pages. Again, we don't allow the value to get too close 803 * to PAGE_SIZE. 
804 */ 805static ssize_t zv_max_mean_zsize_show(struct kobject *kobj, 806 struct kobj_attribute *attr, 807 char *buf) 808{ 809 return sprintf(buf, "%u\n", zv_max_mean_zsize); 810} 811 812static ssize_t zv_max_mean_zsize_store(struct kobject *kobj, 813 struct kobj_attribute *attr, 814 const char *buf, size_t count) 815{ 816 unsigned long val; 817 int err; 818 819 if (!capable(CAP_SYS_ADMIN)) 820 return -EPERM; 821 822 err = strict_strtoul(buf, 10, &val); 823 if (err || (val == 0) || (val > (PAGE_SIZE / 8) * 7)) 824 return -EINVAL; 825 zv_max_mean_zsize = val; 826 return count; 827} 828 829/* 830 * setting zv_page_count_policy_percent via sysfs sets an upper bound of 831 * persistent (e.g. swap) pages that will be retained according to: 832 * (zv_page_count_policy_percent * totalram_pages) / 100) 833 * when that limit is reached, further puts will be rejected (until 834 * some pages have been flushed). Note that, due to compression, 835 * this number may exceed 100; it defaults to 75 and we set an 836 * arbitary limit of 150. A poor choice will almost certainly result 837 * in OOM's, so this value should only be changed prudently. 838 */ 839static ssize_t zv_page_count_policy_percent_show(struct kobject *kobj, 840 struct kobj_attribute *attr, 841 char *buf) 842{ 843 return sprintf(buf, "%u\n", zv_page_count_policy_percent); 844} 845 846static ssize_t zv_page_count_policy_percent_store(struct kobject *kobj, 847 struct kobj_attribute *attr, 848 const char *buf, size_t count) 849{ 850 unsigned long val; 851 int err; 852 853 if (!capable(CAP_SYS_ADMIN)) 854 return -EPERM; 855 856 err = strict_strtoul(buf, 10, &val); 857 if (err || (val == 0) || (val > 150)) 858 return -EINVAL; 859 zv_page_count_policy_percent = val; 860 return count; 861} 862 863static struct kobj_attribute zcache_zv_max_zsize_attr = { 864 .attr = { .name = "zv_max_zsize", .mode = 0644 }, 865 .show = zv_max_zsize_show, 866 .store = zv_max_zsize_store, 867}; 868 869static struct kobj_attribute zcache_zv_max_mean_zsize_attr = { 870 .attr = { .name = "zv_max_mean_zsize", .mode = 0644 }, 871 .show = zv_max_mean_zsize_show, 872 .store = zv_max_mean_zsize_store, 873}; 874 875static struct kobj_attribute zcache_zv_page_count_policy_percent_attr = { 876 .attr = { .name = "zv_page_count_policy_percent", 877 .mode = 0644 }, 878 .show = zv_page_count_policy_percent_show, 879 .store = zv_page_count_policy_percent_store, 880}; 881#endif 882 883/* 884 * zcache core code starts here 885 */ 886 887/* useful stats not collected by cleancache or frontswap */ 888static unsigned long zcache_flush_total; 889static unsigned long zcache_flush_found; 890static unsigned long zcache_flobj_total; 891static unsigned long zcache_flobj_found; 892static unsigned long zcache_failed_eph_puts; 893static unsigned long zcache_failed_pers_puts; 894 895/* 896 * Tmem operations assume the poolid implies the invoking client. 897 * Zcache only has one client (the kernel itself): LOCAL_CLIENT. 898 * RAMster has each client numbered by cluster node, and a KVM version 899 * of zcache would have one client per guest and each client might 900 * have a poolid==N. 
901 */ 902static struct tmem_pool *zcache_get_pool_by_id(uint16_t cli_id, uint16_t poolid) 903{ 904 struct tmem_pool *pool = NULL; 905 struct zcache_client *cli = NULL; 906 907 if (cli_id == LOCAL_CLIENT) 908 cli = &zcache_host; 909 else { 910 if (cli_id >= MAX_CLIENTS) 911 goto out; 912 cli = &zcache_clients[cli_id]; 913 if (cli == NULL) 914 goto out; 915 atomic_inc(&cli->refcount); 916 } 917 if (poolid < MAX_POOLS_PER_CLIENT) { 918 pool = cli->tmem_pools[poolid]; 919 if (pool != NULL) 920 atomic_inc(&pool->refcount); 921 } 922out: 923 return pool; 924} 925 926static void zcache_put_pool(struct tmem_pool *pool) 927{ 928 struct zcache_client *cli = NULL; 929 930 if (pool == NULL) 931 BUG(); 932 cli = pool->client; 933 atomic_dec(&pool->refcount); 934 atomic_dec(&cli->refcount); 935} 936 937int zcache_new_client(uint16_t cli_id) 938{ 939 struct zcache_client *cli = NULL; 940 int ret = -1; 941 942 if (cli_id == LOCAL_CLIENT) 943 cli = &zcache_host; 944 else if ((unsigned int)cli_id < MAX_CLIENTS) 945 cli = &zcache_clients[cli_id]; 946 if (cli == NULL) 947 goto out; 948 if (cli->allocated) 949 goto out; 950 cli->allocated = 1; 951#ifdef CONFIG_FRONTSWAP 952 cli->xvpool = xv_create_pool(); 953 if (cli->xvpool == NULL) 954 goto out; 955#endif 956 ret = 0; 957out: 958 return ret; 959} 960 961/* counters for debugging */ 962static unsigned long zcache_failed_get_free_pages; 963static unsigned long zcache_failed_alloc; 964static unsigned long zcache_put_to_flush; 965static unsigned long zcache_aborted_preload; 966static unsigned long zcache_aborted_shrink; 967 968/* 969 * Ensure that memory allocation requests in zcache don't result 970 * in direct reclaim requests via the shrinker, which would cause 971 * an infinite loop. Maybe a GFP flag would be better? 972 */ 973static DEFINE_SPINLOCK(zcache_direct_reclaim_lock); 974 975/* 976 * for now, used named slabs so can easily track usage; later can 977 * either just use kmalloc, or perhaps add a slab-like allocator 978 * to more carefully manage total memory utilization 979 */ 980static struct kmem_cache *zcache_objnode_cache; 981static struct kmem_cache *zcache_obj_cache; 982static atomic_t zcache_curr_obj_count = ATOMIC_INIT(0); 983static unsigned long zcache_curr_obj_count_max; 984static atomic_t zcache_curr_objnode_count = ATOMIC_INIT(0); 985static unsigned long zcache_curr_objnode_count_max; 986 987/* 988 * to avoid memory allocation recursion (e.g. 
due to direct reclaim), we 989 * preload all necessary data structures so the hostops callbacks never 990 * actually do a malloc 991 */ 992struct zcache_preload { 993 void *page; 994 struct tmem_obj *obj; 995 int nr; 996 struct tmem_objnode *objnodes[OBJNODE_TREE_MAX_PATH]; 997}; 998static DEFINE_PER_CPU(struct zcache_preload, zcache_preloads) = { 0, }; 999 1000static int zcache_do_preload(struct tmem_pool *pool) 1001{ 1002 struct zcache_preload *kp; 1003 struct tmem_objnode *objnode; 1004 struct tmem_obj *obj; 1005 void *page; 1006 int ret = -ENOMEM; 1007 1008 if (unlikely(zcache_objnode_cache == NULL)) 1009 goto out; 1010 if (unlikely(zcache_obj_cache == NULL)) 1011 goto out; 1012 if (!spin_trylock(&zcache_direct_reclaim_lock)) { 1013 zcache_aborted_preload++; 1014 goto out; 1015 } 1016 preempt_disable(); 1017 kp = &__get_cpu_var(zcache_preloads); 1018 while (kp->nr < ARRAY_SIZE(kp->objnodes)) { 1019 preempt_enable_no_resched(); 1020 objnode = kmem_cache_alloc(zcache_objnode_cache, 1021 ZCACHE_GFP_MASK); 1022 if (unlikely(objnode == NULL)) { 1023 zcache_failed_alloc++; 1024 goto unlock_out; 1025 } 1026 preempt_disable(); 1027 kp = &__get_cpu_var(zcache_preloads); 1028 if (kp->nr < ARRAY_SIZE(kp->objnodes)) 1029 kp->objnodes[kp->nr++] = objnode; 1030 else 1031 kmem_cache_free(zcache_objnode_cache, objnode); 1032 } 1033 preempt_enable_no_resched(); 1034 obj = kmem_cache_alloc(zcache_obj_cache, ZCACHE_GFP_MASK); 1035 if (unlikely(obj == NULL)) { 1036 zcache_failed_alloc++; 1037 goto unlock_out; 1038 } 1039 page = (void *)__get_free_page(ZCACHE_GFP_MASK); 1040 if (unlikely(page == NULL)) { 1041 zcache_failed_get_free_pages++; 1042 kmem_cache_free(zcache_obj_cache, obj); 1043 goto unlock_out; 1044 } 1045 preempt_disable(); 1046 kp = &__get_cpu_var(zcache_preloads); 1047 if (kp->obj == NULL) 1048 kp->obj = obj; 1049 else 1050 kmem_cache_free(zcache_obj_cache, obj); 1051 if (kp->page == NULL) 1052 kp->page = page; 1053 else 1054 free_page((unsigned long)page); 1055 ret = 0; 1056unlock_out: 1057 spin_unlock(&zcache_direct_reclaim_lock); 1058out: 1059 return ret; 1060} 1061 1062static void *zcache_get_free_page(void) 1063{ 1064 struct zcache_preload *kp; 1065 void *page; 1066 1067 kp = &__get_cpu_var(zcache_preloads); 1068 page = kp->page; 1069 BUG_ON(page == NULL); 1070 kp->page = NULL; 1071 return page; 1072} 1073 1074static void zcache_free_page(void *p) 1075{ 1076 free_page((unsigned long)p); 1077} 1078 1079/* 1080 * zcache implementation for tmem host ops 1081 */ 1082 1083static struct tmem_objnode *zcache_objnode_alloc(struct tmem_pool *pool) 1084{ 1085 struct tmem_objnode *objnode = NULL; 1086 unsigned long count; 1087 struct zcache_preload *kp; 1088 1089 kp = &__get_cpu_var(zcache_preloads); 1090 if (kp->nr <= 0) 1091 goto out; 1092 objnode = kp->objnodes[kp->nr - 1]; 1093 BUG_ON(objnode == NULL); 1094 kp->objnodes[kp->nr - 1] = NULL; 1095 kp->nr--; 1096 count = atomic_inc_return(&zcache_curr_objnode_count); 1097 if (count > zcache_curr_objnode_count_max) 1098 zcache_curr_objnode_count_max = count; 1099out: 1100 return objnode; 1101} 1102 1103static void zcache_objnode_free(struct tmem_objnode *objnode, 1104 struct tmem_pool *pool) 1105{ 1106 atomic_dec(&zcache_curr_objnode_count); 1107 BUG_ON(atomic_read(&zcache_curr_objnode_count) < 0); 1108 kmem_cache_free(zcache_objnode_cache, objnode); 1109} 1110 1111static struct tmem_obj *zcache_obj_alloc(struct tmem_pool *pool) 1112{ 1113 struct tmem_obj *obj = NULL; 1114 unsigned long count; 1115 struct zcache_preload *kp; 1116 1117 kp = 
&__get_cpu_var(zcache_preloads); 1118 obj = kp->obj; 1119 BUG_ON(obj == NULL); 1120 kp->obj = NULL; 1121 count = atomic_inc_return(&zcache_curr_obj_count); 1122 if (count > zcache_curr_obj_count_max) 1123 zcache_curr_obj_count_max = count; 1124 return obj; 1125} 1126 1127static void zcache_obj_free(struct tmem_obj *obj, struct tmem_pool *pool) 1128{ 1129 atomic_dec(&zcache_curr_obj_count); 1130 BUG_ON(atomic_read(&zcache_curr_obj_count) < 0); 1131 kmem_cache_free(zcache_obj_cache, obj); 1132} 1133 1134static struct tmem_hostops zcache_hostops = { 1135 .obj_alloc = zcache_obj_alloc, 1136 .obj_free = zcache_obj_free, 1137 .objnode_alloc = zcache_objnode_alloc, 1138 .objnode_free = zcache_objnode_free, 1139}; 1140 1141/* 1142 * zcache implementations for PAM page descriptor ops 1143 */ 1144 1145static atomic_t zcache_curr_eph_pampd_count = ATOMIC_INIT(0); 1146static unsigned long zcache_curr_eph_pampd_count_max; 1147static atomic_t zcache_curr_pers_pampd_count = ATOMIC_INIT(0); 1148static unsigned long zcache_curr_pers_pampd_count_max; 1149 1150/* forward reference */ 1151static int zcache_compress(struct page *from, void **out_va, size_t *out_len); 1152 1153static void *zcache_pampd_create(char *data, size_t size, bool raw, int eph, 1154 struct tmem_pool *pool, struct tmem_oid *oid, 1155 uint32_t index) 1156{ 1157 void *pampd = NULL, *cdata; 1158 size_t clen; 1159 int ret; 1160 unsigned long count; 1161 struct page *page = (struct page *)(data); 1162 struct zcache_client *cli = pool->client; 1163 uint16_t client_id = get_client_id_from_client(cli); 1164 unsigned long zv_mean_zsize; 1165 unsigned long curr_pers_pampd_count; 1166 u64 total_zsize; 1167 1168 if (eph) { 1169 ret = zcache_compress(page, &cdata, &clen); 1170 if (ret == 0) 1171 goto out; 1172 if (clen == 0 || clen > zbud_max_buddy_size()) { 1173 zcache_compress_poor++; 1174 goto out; 1175 } 1176 pampd = (void *)zbud_create(client_id, pool->pool_id, oid, 1177 index, page, cdata, clen); 1178 if (pampd != NULL) { 1179 count = atomic_inc_return(&zcache_curr_eph_pampd_count); 1180 if (count > zcache_curr_eph_pampd_count_max) 1181 zcache_curr_eph_pampd_count_max = count; 1182 } 1183 } else { 1184 curr_pers_pampd_count = 1185 atomic_read(&zcache_curr_pers_pampd_count); 1186 if (curr_pers_pampd_count > 1187 (zv_page_count_policy_percent * totalram_pages) / 100) 1188 goto out; 1189 ret = zcache_compress(page, &cdata, &clen); 1190 if (ret == 0) 1191 goto out; 1192 /* reject if compression is too poor */ 1193 if (clen > zv_max_zsize) { 1194 zcache_compress_poor++; 1195 goto out; 1196 } 1197 /* reject if mean compression is too poor */ 1198 if ((clen > zv_max_mean_zsize) && (curr_pers_pampd_count > 0)) { 1199 total_zsize = xv_get_total_size_bytes(cli->xvpool); 1200 zv_mean_zsize = div_u64(total_zsize, 1201 curr_pers_pampd_count); 1202 if (zv_mean_zsize > zv_max_mean_zsize) { 1203 zcache_mean_compress_poor++; 1204 goto out; 1205 } 1206 } 1207 pampd = (void *)zv_create(cli->xvpool, pool->pool_id, 1208 oid, index, cdata, clen); 1209 if (pampd == NULL) 1210 goto out; 1211 count = atomic_inc_return(&zcache_curr_pers_pampd_count); 1212 if (count > zcache_curr_pers_pampd_count_max) 1213 zcache_curr_pers_pampd_count_max = count; 1214 } 1215out: 1216 return pampd; 1217} 1218 1219/* 1220 * fill the pageframe corresponding to the struct page with the data 1221 * from the passed pampd 1222 */ 1223static int zcache_pampd_get_data(char *data, size_t *bufsize, bool raw, 1224 void *pampd, struct tmem_pool *pool, 1225 struct tmem_oid *oid, uint32_t index) 1226{ 
1227 int ret = 0; 1228 1229 BUG_ON(is_ephemeral(pool)); 1230 zv_decompress((struct page *)(data), pampd); 1231 return ret; 1232} 1233 1234/* 1235 * fill the pageframe corresponding to the struct page with the data 1236 * from the passed pampd 1237 */ 1238static int zcache_pampd_get_data_and_free(char *data, size_t *bufsize, bool raw, 1239 void *pampd, struct tmem_pool *pool, 1240 struct tmem_oid *oid, uint32_t index) 1241{ 1242 int ret = 0; 1243 1244 BUG_ON(!is_ephemeral(pool)); 1245 zbud_decompress((struct page *)(data), pampd); 1246 zbud_free_and_delist((struct zbud_hdr *)pampd); 1247 atomic_dec(&zcache_curr_eph_pampd_count); 1248 return ret; 1249} 1250 1251/* 1252 * free the pampd and remove it from any zcache lists 1253 * pampd must no longer be pointed to from any tmem data structures! 1254 */ 1255static void zcache_pampd_free(void *pampd, struct tmem_pool *pool, 1256 struct tmem_oid *oid, uint32_t index) 1257{ 1258 struct zcache_client *cli = pool->client; 1259 1260 if (is_ephemeral(pool)) { 1261 zbud_free_and_delist((struct zbud_hdr *)pampd); 1262 atomic_dec(&zcache_curr_eph_pampd_count); 1263 BUG_ON(atomic_read(&zcache_curr_eph_pampd_count) < 0); 1264 } else { 1265 zv_free(cli->xvpool, (struct zv_hdr *)pampd); 1266 atomic_dec(&zcache_curr_pers_pampd_count); 1267 BUG_ON(atomic_read(&zcache_curr_pers_pampd_count) < 0); 1268 } 1269} 1270 1271static void zcache_pampd_free_obj(struct tmem_pool *pool, struct tmem_obj *obj) 1272{ 1273} 1274 1275static void zcache_pampd_new_obj(struct tmem_obj *obj) 1276{ 1277} 1278 1279static int zcache_pampd_replace_in_obj(void *pampd, struct tmem_obj *obj) 1280{ 1281 return -1; 1282} 1283 1284static bool zcache_pampd_is_remote(void *pampd) 1285{ 1286 return 0; 1287} 1288 1289static struct tmem_pamops zcache_pamops = { 1290 .create = zcache_pampd_create, 1291 .get_data = zcache_pampd_get_data, 1292 .get_data_and_free = zcache_pampd_get_data_and_free, 1293 .free = zcache_pampd_free, 1294 .free_obj = zcache_pampd_free_obj, 1295 .new_obj = zcache_pampd_new_obj, 1296 .replace_in_obj = zcache_pampd_replace_in_obj, 1297 .is_remote = zcache_pampd_is_remote, 1298}; 1299 1300/* 1301 * zcache compression/decompression and related per-cpu stuff 1302 */ 1303 1304#define LZO_WORKMEM_BYTES LZO1X_1_MEM_COMPRESS 1305#define LZO_DSTMEM_PAGE_ORDER 1 1306static DEFINE_PER_CPU(unsigned char *, zcache_workmem); 1307static DEFINE_PER_CPU(unsigned char *, zcache_dstmem); 1308 1309static int zcache_compress(struct page *from, void **out_va, size_t *out_len) 1310{ 1311 int ret = 0; 1312 unsigned char *dmem = __get_cpu_var(zcache_dstmem); 1313 unsigned char *wmem = __get_cpu_var(zcache_workmem); 1314 char *from_va; 1315 1316 BUG_ON(!irqs_disabled()); 1317 if (unlikely(dmem == NULL || wmem == NULL)) 1318 goto out; /* no buffer, so can't compress */ 1319 from_va = kmap_atomic(from, KM_USER0); 1320 mb(); 1321 ret = lzo1x_1_compress(from_va, PAGE_SIZE, dmem, out_len, wmem); 1322 BUG_ON(ret != LZO_E_OK); 1323 *out_va = dmem; 1324 kunmap_atomic(from_va, KM_USER0); 1325 ret = 1; 1326out: 1327 return ret; 1328} 1329 1330 1331static int zcache_cpu_notifier(struct notifier_block *nb, 1332 unsigned long action, void *pcpu) 1333{ 1334 int cpu = (long)pcpu; 1335 struct zcache_preload *kp; 1336 1337 switch (action) { 1338 case CPU_UP_PREPARE: 1339 per_cpu(zcache_dstmem, cpu) = (void *)__get_free_pages( 1340 GFP_KERNEL | __GFP_REPEAT, 1341 LZO_DSTMEM_PAGE_ORDER), 1342 per_cpu(zcache_workmem, cpu) = 1343 kzalloc(LZO1X_MEM_COMPRESS, 1344 GFP_KERNEL | __GFP_REPEAT); 1345 break; 1346 case CPU_DEAD: 
1347 case CPU_UP_CANCELED: 1348 free_pages((unsigned long)per_cpu(zcache_dstmem, cpu), 1349 LZO_DSTMEM_PAGE_ORDER); 1350 per_cpu(zcache_dstmem, cpu) = NULL; 1351 kfree(per_cpu(zcache_workmem, cpu)); 1352 per_cpu(zcache_workmem, cpu) = NULL; 1353 kp = &per_cpu(zcache_preloads, cpu); 1354 while (kp->nr) { 1355 kmem_cache_free(zcache_objnode_cache, 1356 kp->objnodes[kp->nr - 1]); 1357 kp->objnodes[kp->nr - 1] = NULL; 1358 kp->nr--; 1359 } 1360 kmem_cache_free(zcache_obj_cache, kp->obj); 1361 free_page((unsigned long)kp->page); 1362 break; 1363 default: 1364 break; 1365 } 1366 return NOTIFY_OK; 1367} 1368 1369static struct notifier_block zcache_cpu_notifier_block = { 1370 .notifier_call = zcache_cpu_notifier 1371}; 1372 1373#ifdef CONFIG_SYSFS 1374#define ZCACHE_SYSFS_RO(_name) \ 1375 static ssize_t zcache_##_name##_show(struct kobject *kobj, \ 1376 struct kobj_attribute *attr, char *buf) \ 1377 { \ 1378 return sprintf(buf, "%lu\n", zcache_##_name); \ 1379 } \ 1380 static struct kobj_attribute zcache_##_name##_attr = { \ 1381 .attr = { .name = __stringify(_name), .mode = 0444 }, \ 1382 .show = zcache_##_name##_show, \ 1383 } 1384 1385#define ZCACHE_SYSFS_RO_ATOMIC(_name) \ 1386 static ssize_t zcache_##_name##_show(struct kobject *kobj, \ 1387 struct kobj_attribute *attr, char *buf) \ 1388 { \ 1389 return sprintf(buf, "%d\n", atomic_read(&zcache_##_name)); \ 1390 } \ 1391 static struct kobj_attribute zcache_##_name##_attr = { \ 1392 .attr = { .name = __stringify(_name), .mode = 0444 }, \ 1393 .show = zcache_##_name##_show, \ 1394 } 1395 1396#define ZCACHE_SYSFS_RO_CUSTOM(_name, _func) \ 1397 static ssize_t zcache_##_name##_show(struct kobject *kobj, \ 1398 struct kobj_attribute *attr, char *buf) \ 1399 { \ 1400 return _func(buf); \ 1401 } \ 1402 static struct kobj_attribute zcache_##_name##_attr = { \ 1403 .attr = { .name = __stringify(_name), .mode = 0444 }, \ 1404 .show = zcache_##_name##_show, \ 1405 } 1406 1407ZCACHE_SYSFS_RO(curr_obj_count_max); 1408ZCACHE_SYSFS_RO(curr_objnode_count_max); 1409ZCACHE_SYSFS_RO(flush_total); 1410ZCACHE_SYSFS_RO(flush_found); 1411ZCACHE_SYSFS_RO(flobj_total); 1412ZCACHE_SYSFS_RO(flobj_found); 1413ZCACHE_SYSFS_RO(failed_eph_puts); 1414ZCACHE_SYSFS_RO(failed_pers_puts); 1415ZCACHE_SYSFS_RO(zbud_curr_zbytes); 1416ZCACHE_SYSFS_RO(zbud_cumul_zpages); 1417ZCACHE_SYSFS_RO(zbud_cumul_zbytes); 1418ZCACHE_SYSFS_RO(zbud_buddied_count); 1419ZCACHE_SYSFS_RO(zbpg_unused_list_count); 1420ZCACHE_SYSFS_RO(evicted_raw_pages); 1421ZCACHE_SYSFS_RO(evicted_unbuddied_pages); 1422ZCACHE_SYSFS_RO(evicted_buddied_pages); 1423ZCACHE_SYSFS_RO(failed_get_free_pages); 1424ZCACHE_SYSFS_RO(failed_alloc); 1425ZCACHE_SYSFS_RO(put_to_flush); 1426ZCACHE_SYSFS_RO(aborted_preload); 1427ZCACHE_SYSFS_RO(aborted_shrink); 1428ZCACHE_SYSFS_RO(compress_poor); 1429ZCACHE_SYSFS_RO(mean_compress_poor); 1430ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_raw_pages); 1431ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_zpages); 1432ZCACHE_SYSFS_RO_ATOMIC(curr_obj_count); 1433ZCACHE_SYSFS_RO_ATOMIC(curr_objnode_count); 1434ZCACHE_SYSFS_RO_CUSTOM(zbud_unbuddied_list_counts, 1435 zbud_show_unbuddied_list_counts); 1436ZCACHE_SYSFS_RO_CUSTOM(zbud_cumul_chunk_counts, 1437 zbud_show_cumul_chunk_counts); 1438ZCACHE_SYSFS_RO_CUSTOM(zv_curr_dist_counts, 1439 zv_curr_dist_counts_show); 1440ZCACHE_SYSFS_RO_CUSTOM(zv_cumul_dist_counts, 1441 zv_cumul_dist_counts_show); 1442 1443static struct attribute *zcache_attrs[] = { 1444 &zcache_curr_obj_count_attr.attr, 1445 &zcache_curr_obj_count_max_attr.attr, 1446 &zcache_curr_objnode_count_attr.attr, 1447 
&zcache_curr_objnode_count_max_attr.attr, 1448 &zcache_flush_total_attr.attr, 1449 &zcache_flobj_total_attr.attr, 1450 &zcache_flush_found_attr.attr, 1451 &zcache_flobj_found_attr.attr, 1452 &zcache_failed_eph_puts_attr.attr, 1453 &zcache_failed_pers_puts_attr.attr, 1454 &zcache_compress_poor_attr.attr, 1455 &zcache_mean_compress_poor_attr.attr, 1456 &zcache_zbud_curr_raw_pages_attr.attr, 1457 &zcache_zbud_curr_zpages_attr.attr, 1458 &zcache_zbud_curr_zbytes_attr.attr, 1459 &zcache_zbud_cumul_zpages_attr.attr, 1460 &zcache_zbud_cumul_zbytes_attr.attr, 1461 &zcache_zbud_buddied_count_attr.attr, 1462 &zcache_zbpg_unused_list_count_attr.attr, 1463 &zcache_evicted_raw_pages_attr.attr, 1464 &zcache_evicted_unbuddied_pages_attr.attr, 1465 &zcache_evicted_buddied_pages_attr.attr, 1466 &zcache_failed_get_free_pages_attr.attr, 1467 &zcache_failed_alloc_attr.attr, 1468 &zcache_put_to_flush_attr.attr, 1469 &zcache_aborted_preload_attr.attr, 1470 &zcache_aborted_shrink_attr.attr, 1471 &zcache_zbud_unbuddied_list_counts_attr.attr, 1472 &zcache_zbud_cumul_chunk_counts_attr.attr, 1473 &zcache_zv_curr_dist_counts_attr.attr, 1474 &zcache_zv_cumul_dist_counts_attr.attr, 1475 &zcache_zv_max_zsize_attr.attr, 1476 &zcache_zv_max_mean_zsize_attr.attr, 1477 &zcache_zv_page_count_policy_percent_attr.attr, 1478 NULL, 1479}; 1480 1481static struct attribute_group zcache_attr_group = { 1482 .attrs = zcache_attrs, 1483 .name = "zcache", 1484}; 1485 1486#endif /* CONFIG_SYSFS */ 1487/* 1488 * When zcache is disabled ("frozen"), pools can be created and destroyed, 1489 * but all puts (and thus all other operations that require memory allocation) 1490 * must fail. If zcache is unfrozen, accepts puts, then frozen again, 1491 * data consistency requires all puts while frozen to be converted into 1492 * flushes. 1493 */ 1494static bool zcache_freeze; 1495 1496/* 1497 * zcache shrinker interface (only useful for ephemeral pages, so zbud only) 1498 */ 1499static int shrink_zcache_memory(struct shrinker *shrink, 1500 struct shrink_control *sc) 1501{ 1502 int ret = -1; 1503 int nr = sc->nr_to_scan; 1504 gfp_t gfp_mask = sc->gfp_mask; 1505 1506 if (nr >= 0) { 1507 if (!(gfp_mask & __GFP_FS)) 1508 /* does this case really need to be skipped? 
*/ 1509 goto out; 1510 if (spin_trylock(&zcache_direct_reclaim_lock)) { 1511 zbud_evict_pages(nr); 1512 spin_unlock(&zcache_direct_reclaim_lock); 1513 } else 1514 zcache_aborted_shrink++; 1515 } 1516 ret = (int)atomic_read(&zcache_zbud_curr_raw_pages); 1517out: 1518 return ret; 1519} 1520 1521static struct shrinker zcache_shrinker = { 1522 .shrink = shrink_zcache_memory, 1523 .seeks = DEFAULT_SEEKS, 1524}; 1525 1526/* 1527 * zcache shims between cleancache/frontswap ops and tmem 1528 */ 1529 1530static int zcache_put_page(int cli_id, int pool_id, struct tmem_oid *oidp, 1531 uint32_t index, struct page *page) 1532{ 1533 struct tmem_pool *pool; 1534 int ret = -1; 1535 1536 BUG_ON(!irqs_disabled()); 1537 pool = zcache_get_pool_by_id(cli_id, pool_id); 1538 if (unlikely(pool == NULL)) 1539 goto out; 1540 if (!zcache_freeze && zcache_do_preload(pool) == 0) { 1541 /* preload does preempt_disable on success */ 1542 ret = tmem_put(pool, oidp, index, (char *)(page), 1543 PAGE_SIZE, 0, is_ephemeral(pool)); 1544 if (ret < 0) { 1545 if (is_ephemeral(pool)) 1546 zcache_failed_eph_puts++; 1547 else 1548 zcache_failed_pers_puts++; 1549 } 1550 zcache_put_pool(pool); 1551 preempt_enable_no_resched(); 1552 } else { 1553 zcache_put_to_flush++; 1554 if (atomic_read(&pool->obj_count) > 0) 1555 /* the put fails whether the flush succeeds or not */ 1556 (void)tmem_flush_page(pool, oidp, index); 1557 zcache_put_pool(pool); 1558 } 1559out: 1560 return ret; 1561} 1562 1563static int zcache_get_page(int cli_id, int pool_id, struct tmem_oid *oidp, 1564 uint32_t index, struct page *page) 1565{ 1566 struct tmem_pool *pool; 1567 int ret = -1; 1568 unsigned long flags; 1569 size_t size = PAGE_SIZE; 1570 1571 local_irq_save(flags); 1572 pool = zcache_get_pool_by_id(cli_id, pool_id); 1573 if (likely(pool != NULL)) { 1574 if (atomic_read(&pool->obj_count) > 0) 1575 ret = tmem_get(pool, oidp, index, (char *)(page), 1576 &size, 0, is_ephemeral(pool)); 1577 zcache_put_pool(pool); 1578 } 1579 local_irq_restore(flags); 1580 return ret; 1581} 1582 1583static int zcache_flush_page(int cli_id, int pool_id, 1584 struct tmem_oid *oidp, uint32_t index) 1585{ 1586 struct tmem_pool *pool; 1587 int ret = -1; 1588 unsigned long flags; 1589 1590 local_irq_save(flags); 1591 zcache_flush_total++; 1592 pool = zcache_get_pool_by_id(cli_id, pool_id); 1593 if (likely(pool != NULL)) { 1594 if (atomic_read(&pool->obj_count) > 0) 1595 ret = tmem_flush_page(pool, oidp, index); 1596 zcache_put_pool(pool); 1597 } 1598 if (ret >= 0) 1599 zcache_flush_found++; 1600 local_irq_restore(flags); 1601 return ret; 1602} 1603 1604static int zcache_flush_object(int cli_id, int pool_id, 1605 struct tmem_oid *oidp) 1606{ 1607 struct tmem_pool *pool; 1608 int ret = -1; 1609 unsigned long flags; 1610 1611 local_irq_save(flags); 1612 zcache_flobj_total++; 1613 pool = zcache_get_pool_by_id(cli_id, pool_id); 1614 if (likely(pool != NULL)) { 1615 if (atomic_read(&pool->obj_count) > 0) 1616 ret = tmem_flush_object(pool, oidp); 1617 zcache_put_pool(pool); 1618 } 1619 if (ret >= 0) 1620 zcache_flobj_found++; 1621 local_irq_restore(flags); 1622 return ret; 1623} 1624 1625static int zcache_destroy_pool(int cli_id, int pool_id) 1626{ 1627 struct tmem_pool *pool = NULL; 1628 struct zcache_client *cli = NULL; 1629 int ret = -1; 1630 1631 if (pool_id < 0) 1632 goto out; 1633 if (cli_id == LOCAL_CLIENT) 1634 cli = &zcache_host; 1635 else if ((unsigned int)cli_id < MAX_CLIENTS) 1636 cli = &zcache_clients[cli_id]; 1637 if (cli == NULL) 1638 goto out; 1639 atomic_inc(&cli->refcount); 
1640 pool = cli->tmem_pools[pool_id]; 1641 if (pool == NULL) 1642 goto out; 1643 cli->tmem_pools[pool_id] = NULL; 1644 /* wait for pool activity on other cpus to quiesce */ 1645 while (atomic_read(&pool->refcount) != 0) 1646 ; 1647 atomic_dec(&cli->refcount); 1648 local_bh_disable(); 1649 ret = tmem_destroy_pool(pool); 1650 local_bh_enable(); 1651 kfree(pool); 1652 pr_info("zcache: destroyed pool id=%d, cli_id=%d\n", 1653 pool_id, cli_id); 1654out: 1655 return ret; 1656} 1657 1658static int zcache_new_pool(uint16_t cli_id, uint32_t flags) 1659{ 1660 int poolid = -1; 1661 struct tmem_pool *pool; 1662 struct zcache_client *cli = NULL; 1663 1664 if (cli_id == LOCAL_CLIENT) 1665 cli = &zcache_host; 1666 else if ((unsigned int)cli_id < MAX_CLIENTS) 1667 cli = &zcache_clients[cli_id]; 1668 if (cli == NULL) 1669 goto out; 1670 atomic_inc(&cli->refcount); 1671 pool = kmalloc(sizeof(struct tmem_pool), GFP_KERNEL); 1672 if (pool == NULL) { 1673 pr_info("zcache: pool creation failed: out of memory\n"); 1674 goto out; 1675 } 1676 1677 for (poolid = 0; poolid < MAX_POOLS_PER_CLIENT; poolid++) 1678 if (cli->tmem_pools[poolid] == NULL) 1679 break; 1680 if (poolid >= MAX_POOLS_PER_CLIENT) { 1681 pr_info("zcache: pool creation failed: max exceeded\n"); 1682 kfree(pool); 1683 poolid = -1; 1684 goto out; 1685 } 1686 atomic_set(&pool->refcount, 0); 1687 pool->client = cli; 1688 pool->pool_id = poolid; 1689 tmem_new_pool(pool, flags); 1690 cli->tmem_pools[poolid] = pool; 1691 pr_info("zcache: created %s tmem pool, id=%d, client=%d\n", 1692 flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral", 1693 poolid, cli_id); 1694out: 1695 if (cli != NULL) 1696 atomic_dec(&cli->refcount); 1697 return poolid; 1698} 1699 1700/********** 1701 * Two kernel functionalities currently can be layered on top of tmem. 1702 * These are "cleancache" which is used as a second-chance cache for clean 1703 * page cache pages; and "frontswap" which is used for swap pages 1704 * to avoid writes to disk. A generic "shim" is provided here for each 1705 * to translate in-kernel semantics to zcache semantics. 
1706 */ 1707 1708#ifdef CONFIG_CLEANCACHE 1709static void zcache_cleancache_put_page(int pool_id, 1710 struct cleancache_filekey key, 1711 pgoff_t index, struct page *page) 1712{ 1713 u32 ind = (u32) index; 1714 struct tmem_oid oid = *(struct tmem_oid *)&key; 1715 1716 if (likely(ind == index)) 1717 (void)zcache_put_page(LOCAL_CLIENT, pool_id, &oid, index, page); 1718} 1719 1720static int zcache_cleancache_get_page(int pool_id, 1721 struct cleancache_filekey key, 1722 pgoff_t index, struct page *page) 1723{ 1724 u32 ind = (u32) index; 1725 struct tmem_oid oid = *(struct tmem_oid *)&key; 1726 int ret = -1; 1727 1728 if (likely(ind == index)) 1729 ret = zcache_get_page(LOCAL_CLIENT, pool_id, &oid, index, page); 1730 return ret; 1731} 1732 1733static void zcache_cleancache_flush_page(int pool_id, 1734 struct cleancache_filekey key, 1735 pgoff_t index) 1736{ 1737 u32 ind = (u32) index; 1738 struct tmem_oid oid = *(struct tmem_oid *)&key; 1739 1740 if (likely(ind == index)) 1741 (void)zcache_flush_page(LOCAL_CLIENT, pool_id, &oid, ind); 1742} 1743 1744static void zcache_cleancache_flush_inode(int pool_id, 1745 struct cleancache_filekey key) 1746{ 1747 struct tmem_oid oid = *(struct tmem_oid *)&key; 1748 1749 (void)zcache_flush_object(LOCAL_CLIENT, pool_id, &oid); 1750} 1751 1752static void zcache_cleancache_flush_fs(int pool_id) 1753{ 1754 if (pool_id >= 0) 1755 (void)zcache_destroy_pool(LOCAL_CLIENT, pool_id); 1756} 1757 1758static int zcache_cleancache_init_fs(size_t pagesize) 1759{ 1760 BUG_ON(sizeof(struct cleancache_filekey) != 1761 sizeof(struct tmem_oid)); 1762 BUG_ON(pagesize != PAGE_SIZE); 1763 return zcache_new_pool(LOCAL_CLIENT, 0); 1764} 1765 1766static int zcache_cleancache_init_shared_fs(char *uuid, size_t pagesize) 1767{ 1768 /* shared pools are unsupported and map to private */ 1769 BUG_ON(sizeof(struct cleancache_filekey) != 1770 sizeof(struct tmem_oid)); 1771 BUG_ON(pagesize != PAGE_SIZE); 1772 return zcache_new_pool(LOCAL_CLIENT, 0); 1773} 1774 1775static struct cleancache_ops zcache_cleancache_ops = { 1776 .put_page = zcache_cleancache_put_page, 1777 .get_page = zcache_cleancache_get_page, 1778 .flush_page = zcache_cleancache_flush_page, 1779 .flush_inode = zcache_cleancache_flush_inode, 1780 .flush_fs = zcache_cleancache_flush_fs, 1781 .init_shared_fs = zcache_cleancache_init_shared_fs, 1782 .init_fs = zcache_cleancache_init_fs 1783}; 1784 1785struct cleancache_ops zcache_cleancache_register_ops(void) 1786{ 1787 struct cleancache_ops old_ops = 1788 cleancache_register_ops(&zcache_cleancache_ops); 1789 1790 return old_ops; 1791} 1792#endif 1793 1794#ifdef CONFIG_FRONTSWAP 1795/* a single tmem poolid is used for all frontswap "types" (swapfiles) */ 1796static int zcache_frontswap_poolid = -1; 1797 1798/* 1799 * Swizzling increases objects per swaptype, increasing tmem concurrency 1800 * for heavy swaploads. 
Later, larger nr_cpus -> larger SWIZ_BITS 1801 */ 1802#define SWIZ_BITS 4 1803#define SWIZ_MASK ((1 << SWIZ_BITS) - 1) 1804#define _oswiz(_type, _ind) ((_type << SWIZ_BITS) | (_ind & SWIZ_MASK)) 1805#define iswiz(_ind) (_ind >> SWIZ_BITS) 1806 1807static inline struct tmem_oid oswiz(unsigned type, u32 ind) 1808{ 1809 struct tmem_oid oid = { .oid = { 0 } }; 1810 oid.oid[0] = _oswiz(type, ind); 1811 return oid; 1812} 1813 1814static int zcache_frontswap_put_page(unsigned type, pgoff_t offset, 1815 struct page *page) 1816{ 1817 u64 ind64 = (u64)offset; 1818 u32 ind = (u32)offset; 1819 struct tmem_oid oid = oswiz(type, ind); 1820 int ret = -1; 1821 unsigned long flags; 1822 1823 BUG_ON(!PageLocked(page)); 1824 if (likely(ind64 == ind)) { 1825 local_irq_save(flags); 1826 ret = zcache_put_page(LOCAL_CLIENT, zcache_frontswap_poolid, 1827 &oid, iswiz(ind), page); 1828 local_irq_restore(flags); 1829 } 1830 return ret; 1831} 1832 1833/* returns 0 if the page was successfully gotten from frontswap, -1 if 1834 * was not present (should never happen!) */ 1835static int zcache_frontswap_get_page(unsigned type, pgoff_t offset, 1836 struct page *page) 1837{ 1838 u64 ind64 = (u64)offset; 1839 u32 ind = (u32)offset; 1840 struct tmem_oid oid = oswiz(type, ind); 1841 int ret = -1; 1842 1843 BUG_ON(!PageLocked(page)); 1844 if (likely(ind64 == ind)) 1845 ret = zcache_get_page(LOCAL_CLIENT, zcache_frontswap_poolid, 1846 &oid, iswiz(ind), page); 1847 return ret; 1848} 1849 1850/* flush a single page from frontswap */ 1851static void zcache_frontswap_flush_page(unsigned type, pgoff_t offset) 1852{ 1853 u64 ind64 = (u64)offset; 1854 u32 ind = (u32)offset; 1855 struct tmem_oid oid = oswiz(type, ind); 1856 1857 if (likely(ind64 == ind)) 1858 (void)zcache_flush_page(LOCAL_CLIENT, zcache_frontswap_poolid, 1859 &oid, iswiz(ind)); 1860} 1861 1862/* flush all pages from the passed swaptype */ 1863static void zcache_frontswap_flush_area(unsigned type) 1864{ 1865 struct tmem_oid oid; 1866 int ind; 1867 1868 for (ind = SWIZ_MASK; ind >= 0; ind--) { 1869 oid = oswiz(type, ind); 1870 (void)zcache_flush_object(LOCAL_CLIENT, 1871 zcache_frontswap_poolid, &oid); 1872 } 1873} 1874 1875static void zcache_frontswap_init(unsigned ignored) 1876{ 1877 /* a single tmem poolid is used for all frontswap "types" (swapfiles) */ 1878 if (zcache_frontswap_poolid < 0) 1879 zcache_frontswap_poolid = 1880 zcache_new_pool(LOCAL_CLIENT, TMEM_POOL_PERSIST); 1881} 1882 1883static struct frontswap_ops zcache_frontswap_ops = { 1884 .put_page = zcache_frontswap_put_page, 1885 .get_page = zcache_frontswap_get_page, 1886 .flush_page = zcache_frontswap_flush_page, 1887 .flush_area = zcache_frontswap_flush_area, 1888 .init = zcache_frontswap_init 1889}; 1890 1891struct frontswap_ops zcache_frontswap_register_ops(void) 1892{ 1893 struct frontswap_ops old_ops = 1894 frontswap_register_ops(&zcache_frontswap_ops); 1895 1896 return old_ops; 1897} 1898#endif 1899 1900/* 1901 * zcache initialization 1902 * NOTE FOR NOW zcache MUST BE PROVIDED AS A KERNEL BOOT PARAMETER OR 1903 * NOTHING HAPPENS! 
1904 */ 1905 1906static int zcache_enabled; 1907 1908static int __init enable_zcache(char *s) 1909{ 1910 zcache_enabled = 1; 1911 return 1; 1912} 1913__setup("zcache", enable_zcache); 1914 1915/* allow independent dynamic disabling of cleancache and frontswap */ 1916 1917static int use_cleancache = 1; 1918 1919static int __init no_cleancache(char *s) 1920{ 1921 use_cleancache = 0; 1922 return 1; 1923} 1924 1925__setup("nocleancache", no_cleancache); 1926 1927static int use_frontswap = 1; 1928 1929static int __init no_frontswap(char *s) 1930{ 1931 use_frontswap = 0; 1932 return 1; 1933} 1934 1935__setup("nofrontswap", no_frontswap); 1936 1937static int __init zcache_init(void) 1938{ 1939 int ret = 0; 1940 1941#ifdef CONFIG_SYSFS 1942 ret = sysfs_create_group(mm_kobj, &zcache_attr_group); 1943 if (ret) { 1944 pr_err("zcache: can't create sysfs\n"); 1945 goto out; 1946 } 1947#endif /* CONFIG_SYSFS */ 1948#if defined(CONFIG_CLEANCACHE) || defined(CONFIG_FRONTSWAP) 1949 if (zcache_enabled) { 1950 unsigned int cpu; 1951 1952 tmem_register_hostops(&zcache_hostops); 1953 tmem_register_pamops(&zcache_pamops); 1954 ret = register_cpu_notifier(&zcache_cpu_notifier_block); 1955 if (ret) { 1956 pr_err("zcache: can't register cpu notifier\n"); 1957 goto out; 1958 } 1959 for_each_online_cpu(cpu) { 1960 void *pcpu = (void *)(long)cpu; 1961 zcache_cpu_notifier(&zcache_cpu_notifier_block, 1962 CPU_UP_PREPARE, pcpu); 1963 } 1964 } 1965 zcache_objnode_cache = kmem_cache_create("zcache_objnode", 1966 sizeof(struct tmem_objnode), 0, 0, NULL); 1967 zcache_obj_cache = kmem_cache_create("zcache_obj", 1968 sizeof(struct tmem_obj), 0, 0, NULL); 1969 ret = zcache_new_client(LOCAL_CLIENT); 1970 if (ret) { 1971 pr_err("zcache: can't create client\n"); 1972 goto out; 1973 } 1974#endif 1975#ifdef CONFIG_CLEANCACHE 1976 if (zcache_enabled && use_cleancache) { 1977 struct cleancache_ops old_ops; 1978 1979 zbud_init(); 1980 register_shrinker(&zcache_shrinker); 1981 old_ops = zcache_cleancache_register_ops(); 1982 pr_info("zcache: cleancache enabled using kernel " 1983 "transcendent memory and compression buddies\n"); 1984 if (old_ops.init_fs != NULL) 1985 pr_warning("zcache: cleancache_ops overridden"); 1986 } 1987#endif 1988#ifdef CONFIG_FRONTSWAP 1989 if (zcache_enabled && use_frontswap) { 1990 struct frontswap_ops old_ops; 1991 1992 old_ops = zcache_frontswap_register_ops(); 1993 pr_info("zcache: frontswap enabled using kernel " 1994 "transcendent memory and xvmalloc\n"); 1995 if (old_ops.init != NULL) 1996 pr_warning("ktmem: frontswap_ops overridden"); 1997 } 1998#endif 1999out: 2000 return ret; 2001} 2002 2003module_init(zcache_init)
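A few practical notes on the initialization glue above: zcache is compiled in but stays inert unless the kernel is booted with the "zcache" parameter (the __setup("zcache", enable_zcache) handler sets zcache_enabled), and the two frontends can be disabled independently with the "nocleancache" and "nofrontswap" boot parameters. With CONFIG_SYSFS, the statistics and tunables collected in zcache_attr_group (group name "zcache", registered on mm_kobj) appear under /sys/kernel/mm/zcache/, including the zv_max_zsize, zv_max_mean_zsize and zv_page_count_policy_percent policy knobs described earlier in the file.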
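To make the zbud packing arithmetic concrete, here is a small user-space sketch (not part of the kernel file above) that recomputes NCHUNKS and the two buddy data offsets and checks the pairing rule used by zbud_create(): a new item of nchunks chunks is only matched with a page whose existing occupant uses at most MAX_CHUNK - nchunks + 1 chunks, so the two compressed buffers can never overlap. PAGE_SIZE and HDR_SIZE are assumptions here, standing in for the real page size and sizeof(struct zbud_page).

/*
 * Sketch of the zbud layout: 64-byte chunks, buddy 0 packed upward from
 * the (rounded) page header, buddy 1 packed downward from the end of the
 * page.  Build with: cc -o zbud_sketch zbud_sketch.c
 */
#include <assert.h>
#include <stdio.h>

#define PAGE_SIZE   4096u		/* assumed 4 KiB pages */
#define CHUNK_SHIFT 6
#define CHUNK_SIZE  (1u << CHUNK_SHIFT)
#define CHUNK_MASK  (~(CHUNK_SIZE - 1))
#define HDR_SIZE    104u		/* stand-in for sizeof(struct zbud_page) */

#define NCHUNKS   (((PAGE_SIZE - HDR_SIZE) & CHUNK_MASK) >> CHUNK_SHIFT)
#define MAX_CHUNK (NCHUNKS - 1)

int main(void)
{
	/* buddy 0 data starts at the header rounded up to a chunk boundary */
	unsigned bud0_start = (HDR_SIZE + CHUNK_SIZE - 1) & CHUNK_MASK;
	unsigned nchunks, i;

	printf("NCHUNKS=%u, max buddy size=%u bytes\n",
	       (unsigned)NCHUNKS, (unsigned)(MAX_CHUNK << CHUNK_SHIFT));

	/*
	 * zbud_create() searches zbud_unbuddied[i] only for
	 * i <= MAX_CHUNK - nchunks + 1; under that bound the upward-growing
	 * buddy 0 region and the downward-growing buddy 1 region never meet.
	 */
	for (nchunks = 1; nchunks <= MAX_CHUNK; nchunks++)
		for (i = 1; i <= MAX_CHUNK - nchunks + 1; i++) {
			unsigned bud1_start = PAGE_SIZE - nchunks * CHUNK_SIZE;

			assert(bud0_start + i * CHUNK_SIZE <= bud1_start);
		}
	printf("no pairing allowed by the search loop can overlap\n");
	return 0;
}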
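The persistent-side admission policy in zcache_pampd_create() also reduces to a few lines of arithmetic. The sketch below is a simplified stand-in rather than kernel code: totalram_pages_demo and the two running counters replace the real totalram_pages and the xv_get_total_size_bytes() bookkeeping, but the three rejection tests (page-count cap, per-page zsize cap, mean-zsize cap) mirror the ones in the listing, with the default 75%, 7/8-page and 5/8-page thresholds.

/*
 * Sketch of the zv admission policy for persistent (frontswap) pages.
 * A page of compressed size clen is accepted only if the store is under
 * the page-count cap, clen itself is not too large, and pages this big
 * would not push the mean compressed size over the limit.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096u				/* assumed 4 KiB pages */

static unsigned zv_page_count_policy_percent = 75;
static unsigned zv_max_zsize = (PAGE_SIZE / 8) * 7;		/* 3584 */
static unsigned zv_max_mean_zsize = (PAGE_SIZE / 8) * 5;	/* 2560 */

static unsigned long totalram_pages_demo = 1000000;	/* assumed RAM size */
static unsigned long curr_pers_pages;			/* pages stored so far */
static uint64_t total_zsize;				/* compressed bytes stored */

static bool zv_admit(unsigned clen)
{
	if (curr_pers_pages >
	    (zv_page_count_policy_percent * totalram_pages_demo) / 100)
		return false;		/* policy page-count cap */
	if (clen > zv_max_zsize)
		return false;		/* this page compresses too poorly */
	if (clen > zv_max_mean_zsize && curr_pers_pages > 0 &&
	    total_zsize / curr_pers_pages > zv_max_mean_zsize)
		return false;		/* would keep the mean too high */
	curr_pers_pages++;
	total_zsize += clen;
	return true;
}

int main(void)
{
	unsigned samples[] = { 1200, 3000, 3900, 2600, 2600, 800 };
	unsigned i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("zsize=%u -> %s\n", samples[i],
		       zv_admit(samples[i]) ? "accepted" : "rejected");
	return 0;
}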
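Finally, the frontswap "swizzling" is just a bit-split of the swap offset: the low SWIZ_BITS bits select one of 1 << SWIZ_BITS (here 16) tmem objects per swap type, which per the comment in the listing increases tmem concurrency under heavy swap load, and the remaining bits become the index within that object. The user-space sketch below mirrors _oswiz()/iswiz(), with the tmem_oid reduced to the single word that zcache actually fills in, and checks that no two offsets of the same type map to the same (object, index) pair.

/*
 * Sketch of the frontswap offset swizzling: (type, offset) -> (oid, index).
 * SWIZ_BITS matches the value in the listing; larger values spread a
 * swap type across more tmem objects.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define SWIZ_BITS 4
#define SWIZ_MASK ((1 << SWIZ_BITS) - 1)

struct swizzled {
	uint64_t oid0;		/* what oswiz() puts in oid.oid[0] */
	uint32_t index;		/* what iswiz() passes as the tmem index */
};

static struct swizzled swizzle(unsigned type, uint32_t ind)
{
	struct swizzled s;

	s.oid0 = ((uint64_t)type << SWIZ_BITS) | (ind & SWIZ_MASK);	/* _oswiz() */
	s.index = ind >> SWIZ_BITS;					/* iswiz()  */
	return s;
}

int main(void)
{
	uint32_t a, b;

	/* the first 16 offsets of swap type 2 land in 16 different objects */
	for (a = 0; a < 16; a++) {
		struct swizzled s = swizzle(2, a);

		printf("type=2 offset=%u -> oid[0]=%llu index=%u\n",
		       (unsigned)a, (unsigned long long)s.oid0,
		       (unsigned)s.index);
	}
	/* the split is reversible, so distinct offsets never collide */
	for (a = 0; a < 256; a++)
		for (b = a + 1; b < 256; b++) {
			struct swizzled sa = swizzle(1, a), sb = swizzle(1, b);

			assert(sa.oid0 != sb.oid0 || sa.index != sb.index);
		}
	return 0;
}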