Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
zbud.c at v3.7-rc5 (1060 lines, 33 kB)
/*
 * zbud.c - Compression buddies allocator
 *
 * Copyright (c) 2010-2012, Dan Magenheimer, Oracle Corp.
 *
 * Compression buddies ("zbud") provides for efficiently packing two
 * (or, possibly in the future, more) compressed pages ("zpages") into
 * a single "raw" pageframe and for tracking both zpages and pageframes
 * so that whole pageframes can be easily reclaimed in LRU-like order.
 * It is designed to be used in conjunction with transcendent memory
 * ("tmem"); for example, separate LRU lists are maintained for persistent
 * vs. ephemeral pages.
 *
 * A zbudpage is an overlay for a struct page and thus each zbudpage
 * refers to a physical pageframe of RAM.  When the caller passes a
 * struct page from the kernel's page allocator, zbud "transforms" it
 * to a zbudpage, which sets/uses a different set of fields than the
 * struct page and thus must be "untransformed" back by reinitializing
 * certain fields before the struct page can be freed.  The fields
 * of a zbudpage include a page lock for controlling access to the
 * corresponding pageframe, and there is a size field for each zpage.
 * Each zbudpage also lives on two linked lists: a "budlist", which is
 * used to support efficient buddying of zpages, and an "lru", which
 * is used for reclaiming pageframes in approximately least-recently-used
 * order.
 *
 * A zbudpageframe is a pageframe divided up into aligned 64-byte "chunks"
 * which contain the compressed data for zero, one, or two zbuds.  Contained
 * within the compressed data is a tmem_handle, a key that allows
 * the same data to be found via the tmem interface so that the zpage can
 * be invalidated (for ephemeral pages) or repatriated to the swap cache
 * (for persistent pages).  The contents of a zbudpageframe must never
 * be accessed without holding the page lock for the corresponding
 * zbudpage and, to accommodate highmem machines, the contents may
 * only be examined or changed when kmapped.  Thus, when in use, a
 * kmapped zbudpageframe is referred to in the zbud code as "void *zbpg".
 *
 * Note that the term "zbud" refers to the combination of a zpage and
 * a tmem_handle that is stored as one of possibly two "buddied" zpages;
 * it also generically refers to this allocator... sorry for any confusion.
 *
 * A zbudref is a pointer to a struct zbudpage (which can be cast to a
 * struct page), with the LSB either cleared or set to indicate, respectively,
 * the first or second zpage in the zbudpageframe.  Since a zbudref can be
 * cast to a pointer, it is used as the tmem "pampd" pointer and uniquely
 * references a stored tmem page and so is the only zbud data structure
 * externally visible to zbud.c/zbud.h.
 *
 * Since we wish to reclaim entire pageframes but zpages may be randomly
 * added to and deleted from any given pageframe, we approximate LRU by
 * promoting a pageframe to MRU when a zpage is added to it, but
 * leaving it at the current place in the list when a zpage is deleted
 * from it.  As a side effect, zpages that are difficult to buddy (e.g.
 * very large pages) will be reclaimed faster than average, which seems
 * reasonable.
 *
 * In the current implementation, no more than two zpages may be stored in
 * any pageframe and no zpage ever crosses a pageframe boundary.
 * While other zpage allocation mechanisms may allow greater density, this
 * two-zpage-per-pageframe limit both ensures simple reclaim of pageframes
 * (including garbage collection of references to the contents of those
 * pageframes from tmem data structures) AND avoids the need for compaction.
 * With additional complexity, zbud could be modified to support storing
 * up to three zpages per pageframe or, to handle larger average zpages,
 * up to three zpages per pair of pageframes, but it is not clear if the
 * additional complexity would be worth it.  So consider it an exercise
 * for future developers.
 *
 * Note also that zbud does no page allocation or freeing.  This is so
 * that the caller has complete control over, and visibility (for
 * accounting) into, if/when pages are allocated and freed.
 *
 * Finally, note that zbud limits the size of zpages it can store; the
 * caller must check the zpage size with zbud_max_buddy_size before
 * storing it, else BUGs will result.  User beware.
 */

#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/pagemap.h>
#include <linux/atomic.h>
#include <linux/bug.h>
#include "tmem.h"
#include "zcache.h"
#include "zbud.h"

/*
 * We need to ensure that a struct zbudpage is never larger than a
 * struct page.  This is checked with a BUG_ON in zbud_init.
 *
 * The unevictable field indicates that a zbud is being added to the
 * zbudpage.  Since this is a two-phase process (due to tmem locking),
 * this field locks the zbudpage against eviction when a zbud match
 * or creation is in process.  Since this addition process may occur
 * in parallel for two zbuds in one zbudpage, the field is a counter
 * that must not exceed two.
 */
struct zbudpage {
        union {
                struct page page;
                struct {
                        unsigned long space_for_flags;
                        struct {
                                unsigned zbud0_size:12;
                                unsigned zbud1_size:12;
                                unsigned unevictable:2;
                        };
                        struct list_head budlist;
                        struct list_head lru;
                };
        };
};

struct zbudref {
        union {
                struct zbudpage *zbudpage;
                unsigned long zbudref;
        };
};

#define CHUNK_SHIFT     6
#define CHUNK_SIZE      (1 << CHUNK_SHIFT)
#define CHUNK_MASK      (~(CHUNK_SIZE-1))
#define NCHUNKS         (PAGE_SIZE >> CHUNK_SHIFT)
#define MAX_CHUNK       (NCHUNKS-1)
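/*
 * Illustrative sketch (not from the original file): with the usual
 * PAGE_SIZE of 4096 and CHUNK_SHIFT of 6, CHUNK_SIZE is 64, NCHUNKS is
 * 64, and MAX_CHUNK is 63, so the largest zbud is 63 * 64 = 4032 bytes,
 * including the embedded tmem_handle (see zbud_max_buddy_size() below).
 */
#if 0
static inline void zbud_chunk_arithmetic_example(void)
{
        /* sizes are rounded up to whole 64-byte chunks */
        BUILD_BUG_ON(CHUNK_SIZE != 64);
        /* e.g. a 100-byte zbud occupies 2 chunks: (100 + 63) >> 6 == 2 */
        BUILD_BUG_ON(((100 + CHUNK_SIZE - 1) >> CHUNK_SHIFT) != 2);
}
#endif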
/*
 * The following functions deal with the difference between struct
 * page and struct zbudpage.  Note the hack of using the pageflags
 * from struct page; this is to avoid duplicating all the complex
 * pageflag macros.
 */
static inline void zbudpage_spin_lock(struct zbudpage *zbudpage)
{
        struct page *page = (struct page *)zbudpage;

        while (unlikely(test_and_set_bit_lock(PG_locked, &page->flags))) {
                do {
                        cpu_relax();
                } while (test_bit(PG_locked, &page->flags));
        }
}

static inline void zbudpage_spin_unlock(struct zbudpage *zbudpage)
{
        struct page *page = (struct page *)zbudpage;

        clear_bit(PG_locked, &page->flags);
}

static inline int zbudpage_spin_trylock(struct zbudpage *zbudpage)
{
        return trylock_page((struct page *)zbudpage);
}

static inline int zbudpage_is_locked(struct zbudpage *zbudpage)
{
        return PageLocked((struct page *)zbudpage);
}

static inline void *kmap_zbudpage_atomic(struct zbudpage *zbudpage)
{
        return kmap_atomic((struct page *)zbudpage);
}

/*
 * A dying zbudpage is an ephemeral page in the process of being evicted.
 * Any data contained in the zbudpage is invalid and we are just waiting for
 * the tmem pampds to be invalidated before freeing the page.
 */
static inline int zbudpage_is_dying(struct zbudpage *zbudpage)
{
        struct page *page = (struct page *)zbudpage;

        return test_bit(PG_reclaim, &page->flags);
}

static inline void zbudpage_set_dying(struct zbudpage *zbudpage)
{
        struct page *page = (struct page *)zbudpage;

        set_bit(PG_reclaim, &page->flags);
}

static inline void zbudpage_clear_dying(struct zbudpage *zbudpage)
{
        struct page *page = (struct page *)zbudpage;

        clear_bit(PG_reclaim, &page->flags);
}

/*
 * A zombie zbudpage is a persistent page in the process of being evicted.
 * The data contained in the zbudpage is valid and we are just waiting for
 * the tmem pampds to be invalidated before freeing the page.
 */
static inline int zbudpage_is_zombie(struct zbudpage *zbudpage)
{
        struct page *page = (struct page *)zbudpage;

        return test_bit(PG_dirty, &page->flags);
}

static inline void zbudpage_set_zombie(struct zbudpage *zbudpage)
{
        struct page *page = (struct page *)zbudpage;

        set_bit(PG_dirty, &page->flags);
}

static inline void zbudpage_clear_zombie(struct zbudpage *zbudpage)
{
        struct page *page = (struct page *)zbudpage;

        clear_bit(PG_dirty, &page->flags);
}

static inline void kunmap_zbudpage_atomic(void *zbpg)
{
        kunmap_atomic(zbpg);
}

/*
 * zbud "translation" and helper functions
 */

static inline struct zbudpage *zbudref_to_zbudpage(struct zbudref *zref)
{
        unsigned long zbud = (unsigned long)zref;
        zbud &= ~1UL;
        return (struct zbudpage *)zbud;
}

static inline struct zbudref *zbudpage_to_zbudref(struct zbudpage *zbudpage,
                                                  unsigned budnum)
{
        unsigned long zbud = (unsigned long)zbudpage;
        BUG_ON(budnum > 1);
        zbud |= budnum;
        return (struct zbudref *)zbud;
}

static inline int zbudref_budnum(struct zbudref *zbudref)
{
        unsigned long zbud = (unsigned long)zbudref;
        return zbud & 1UL;
}

static inline unsigned zbud_max_size(void)
{
        return MAX_CHUNK << CHUNK_SHIFT;
}

static inline unsigned zbud_size_to_chunks(unsigned size)
{
        BUG_ON(size == 0 || size > zbud_max_size());
        return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
}
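/*
 * Illustrative sketch (not from the original file): the LSB encoding in
 * a zbudref works because a struct page, and hence a struct zbudpage,
 * is always at least word-aligned, so bit 0 of the pointer is free to
 * carry the buddy number.
 */
#if 0
static void zbudref_encoding_example(struct zbudpage *zbudpage)
{
        /* pointer is aligned, so bit 0 is free to hold budnum */
        struct zbudref *zref = zbudpage_to_zbudref(zbudpage, 1);

        BUG_ON(zbudref_budnum(zref) != 1);
        BUG_ON(zbudref_to_zbudpage(zref) != zbudpage);
}
#endif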
/* can only be used between kmap_zbudpage_atomic/kunmap_zbudpage_atomic! */
static inline char *zbud_data(void *zbpg,
                              unsigned budnum, unsigned size)
{
        char *p;

        BUG_ON(size == 0 || size > zbud_max_size());
        p = (char *)zbpg;
        if (budnum == 1)
                p += PAGE_SIZE - ((size + CHUNK_SIZE - 1) & CHUNK_MASK);
        return p;
}

/*
 * These are all informative and exposed through debugfs... except for
 * the arrays... anyone know how to do that?  To avoid confusion for
 * debugfs viewers, some of these should also be atomic_long_t, but
 * I don't know how to expose atomics via debugfs either...
 */
static unsigned long zbud_eph_pageframes;
static unsigned long zbud_pers_pageframes;
static unsigned long zbud_eph_zpages;
static unsigned long zbud_pers_zpages;
static u64 zbud_eph_zbytes;
static u64 zbud_pers_zbytes;
static unsigned long zbud_eph_evicted_pageframes;
static unsigned long zbud_pers_evicted_pageframes;
static unsigned long zbud_eph_cumul_zpages;
static unsigned long zbud_pers_cumul_zpages;
static u64 zbud_eph_cumul_zbytes;
static u64 zbud_pers_cumul_zbytes;
static unsigned long zbud_eph_cumul_chunk_counts[NCHUNKS];
static unsigned long zbud_pers_cumul_chunk_counts[NCHUNKS];
static unsigned long zbud_eph_buddied_count;
static unsigned long zbud_pers_buddied_count;
static unsigned long zbud_eph_unbuddied_count;
static unsigned long zbud_pers_unbuddied_count;
static unsigned long zbud_eph_zombie_count;
static unsigned long zbud_pers_zombie_count;
static atomic_t zbud_eph_zombie_atomic;
static atomic_t zbud_pers_zombie_atomic;

#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#define zdfs    debugfs_create_size_t
#define zdfs64  debugfs_create_u64
static int zbud_debugfs_init(void)
{
        struct dentry *root = debugfs_create_dir("zbud", NULL);
        if (root == NULL)
                return -ENXIO;

        /*
         * would be nice to dump the sizes of the unbuddied
         * arrays, like was done with sysfs, but it doesn't
         * look like debugfs is flexible enough to do that
         */
        zdfs64("eph_zbytes", S_IRUGO, root, &zbud_eph_zbytes);
        zdfs64("eph_cumul_zbytes", S_IRUGO, root, &zbud_eph_cumul_zbytes);
        zdfs64("pers_zbytes", S_IRUGO, root, &zbud_pers_zbytes);
        zdfs64("pers_cumul_zbytes", S_IRUGO, root, &zbud_pers_cumul_zbytes);
        zdfs("eph_cumul_zpages", S_IRUGO, root, &zbud_eph_cumul_zpages);
        zdfs("eph_evicted_pageframes", S_IRUGO, root,
             &zbud_eph_evicted_pageframes);
        zdfs("eph_zpages", S_IRUGO, root, &zbud_eph_zpages);
        zdfs("eph_pageframes", S_IRUGO, root, &zbud_eph_pageframes);
        zdfs("eph_buddied_count", S_IRUGO, root, &zbud_eph_buddied_count);
        zdfs("eph_unbuddied_count", S_IRUGO, root, &zbud_eph_unbuddied_count);
        zdfs("pers_cumul_zpages", S_IRUGO, root, &zbud_pers_cumul_zpages);
        zdfs("pers_evicted_pageframes", S_IRUGO, root,
             &zbud_pers_evicted_pageframes);
        zdfs("pers_zpages", S_IRUGO, root, &zbud_pers_zpages);
        zdfs("pers_pageframes", S_IRUGO, root, &zbud_pers_pageframes);
        zdfs("pers_buddied_count", S_IRUGO, root, &zbud_pers_buddied_count);
        zdfs("pers_unbuddied_count", S_IRUGO, root, &zbud_pers_unbuddied_count);
        zdfs("pers_zombie_count", S_IRUGO, root, &zbud_pers_zombie_count);
        return 0;
}
#undef zdfs
#undef zdfs64
#endif

/* protects the buddied list and all unbuddied lists */
static DEFINE_SPINLOCK(zbud_eph_lists_lock);
static DEFINE_SPINLOCK(zbud_pers_lists_lock);

struct zbud_unbuddied {
        struct list_head list;
        unsigned count;
};
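/*
 * Illustrative sketch (not from the original file): buddy 0 grows from
 * the start of the pageframe and buddy 1 sits flush against the end,
 * rounded up to whole chunks.  Assuming PAGE_SIZE = 4096:
 */
#if 0
static void zbud_data_layout_example(void *zbpg)
{
        /* buddy 0 starts at the beginning of the kmapped pageframe */
        char *b0 = zbud_data(zbpg, 0, 100);     /* == (char *)zbpg */
        /* buddy 1: 100 bytes round up to 2 chunks (128 bytes), so the
         * data starts at offset 4096 - 128 = 3968 */
        char *b1 = zbud_data(zbpg, 1, 100);
}
#endif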
/* list N contains pages with N chunks USED and NCHUNKS-N unused */
/* element 0 is never used but optimizing that isn't worth it */
static struct zbud_unbuddied zbud_eph_unbuddied[NCHUNKS];
static struct zbud_unbuddied zbud_pers_unbuddied[NCHUNKS];
static LIST_HEAD(zbud_eph_lru_list);
static LIST_HEAD(zbud_pers_lru_list);
static LIST_HEAD(zbud_eph_buddied_list);
static LIST_HEAD(zbud_pers_buddied_list);
static LIST_HEAD(zbud_eph_zombie_list);
static LIST_HEAD(zbud_pers_zombie_list);

/*
 * Given a struct page, transform it to a zbudpage so that it can be
 * used by zbud and initialize fields as necessary.
 */
static inline struct zbudpage *zbud_init_zbudpage(struct page *page, bool eph)
{
        struct zbudpage *zbudpage = (struct zbudpage *)page;

        BUG_ON(page == NULL);
        INIT_LIST_HEAD(&zbudpage->budlist);
        INIT_LIST_HEAD(&zbudpage->lru);
        zbudpage->zbud0_size = 0;
        zbudpage->zbud1_size = 0;
        zbudpage->unevictable = 0;
        if (eph)
                zbud_eph_pageframes++;
        else
                zbud_pers_pageframes++;
        return zbudpage;
}

/* "Transform" a zbudpage back to a struct page suitable to free. */
static inline struct page *zbud_unuse_zbudpage(struct zbudpage *zbudpage,
                                               bool eph)
{
        struct page *page = (struct page *)zbudpage;

        BUG_ON(!list_empty(&zbudpage->budlist));
        BUG_ON(!list_empty(&zbudpage->lru));
        BUG_ON(zbudpage->zbud0_size != 0);
        BUG_ON(zbudpage->zbud1_size != 0);
        BUG_ON(!PageLocked(page));
        BUG_ON(zbudpage->unevictable != 0);
        BUG_ON(zbudpage_is_dying(zbudpage));
        BUG_ON(zbudpage_is_zombie(zbudpage));
        if (eph)
                zbud_eph_pageframes--;
        else
                zbud_pers_pageframes--;
        zbudpage_spin_unlock(zbudpage);
        reset_page_mapcount(page);
        init_page_count(page);
        page->index = 0;
        return page;
}

/* Mark a zbud as unused and do accounting */
static inline void zbud_unuse_zbud(struct zbudpage *zbudpage,
                                   int budnum, bool eph)
{
        unsigned size;

        BUG_ON(!zbudpage_is_locked(zbudpage));
        if (budnum == 0) {
                size = zbudpage->zbud0_size;
                zbudpage->zbud0_size = 0;
        } else {
                size = zbudpage->zbud1_size;
                zbudpage->zbud1_size = 0;
        }
        if (eph) {
                zbud_eph_zbytes -= size;
                zbud_eph_zpages--;
        } else {
                zbud_pers_zbytes -= size;
                zbud_pers_zpages--;
        }
}
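/*
 * Illustrative sketch (not from the original file): as the header
 * comment says, zbud never calls the page allocator itself.  A caller
 * might drive the transform/untransform lifecycle roughly like this
 * (hypothetical helper, error handling and actual zbud stores elided):
 */
#if 0
static void zbudpage_lifecycle_example(bool eph)
{
        struct page *page = alloc_page(GFP_KERNEL);     /* caller allocates */
        struct zbudpage *zbudpage = zbud_init_zbudpage(page, eph);

        /* ... store zbuds; later remove them so both lists are empty,
         * both sizes are zero, and unevictable is zero ... */
        zbudpage_spin_lock(zbudpage);
        __free_page(zbud_unuse_zbudpage(zbudpage, eph)); /* caller frees */
}
#endif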
/*
 * Given a zbudpage/budnum/size, a tmem handle, and a kmapped pointer
 * to some data, set up the zbud appropriately including data copying
 * and accounting.  Note that if cdata is NULL, the data copying is
 * skipped.  (This is useful for lazy writes such as for RAMster.)
 */
static void zbud_init_zbud(struct zbudpage *zbudpage, struct tmem_handle *th,
                           bool eph, void *cdata,
                           unsigned budnum, unsigned size)
{
        char *to;
        void *zbpg;
        struct tmem_handle *to_th;
        unsigned nchunks = zbud_size_to_chunks(size);

        BUG_ON(!zbudpage_is_locked(zbudpage));
        zbpg = kmap_zbudpage_atomic(zbudpage);
        to = zbud_data(zbpg, budnum, size);
        to_th = (struct tmem_handle *)to;
        to_th->index = th->index;
        to_th->oid = th->oid;
        to_th->pool_id = th->pool_id;
        to_th->client_id = th->client_id;
        to += sizeof(struct tmem_handle);
        if (cdata != NULL)
                memcpy(to, cdata, size - sizeof(struct tmem_handle));
        kunmap_zbudpage_atomic(zbpg);
        if (budnum == 0)
                zbudpage->zbud0_size = size;
        else
                zbudpage->zbud1_size = size;
        if (eph) {
                zbud_eph_cumul_chunk_counts[nchunks]++;
                zbud_eph_zpages++;
                zbud_eph_cumul_zpages++;
                zbud_eph_zbytes += size;
                zbud_eph_cumul_zbytes += size;
        } else {
                zbud_pers_cumul_chunk_counts[nchunks]++;
                zbud_pers_zpages++;
                zbud_pers_cumul_zpages++;
                zbud_pers_zbytes += size;
                zbud_pers_cumul_zbytes += size;
        }
}

/*
 * Given a locked dying zbudpage, read out the tmem handles from the data,
 * unlock the page, then use the handles to tell tmem to flush out its
 * references.
 */
static void zbud_evict_tmem(struct zbudpage *zbudpage)
{
        int i, j;
        uint32_t pool_id[2], client_id[2];
        uint32_t index[2];
        struct tmem_oid oid[2];
        struct tmem_pool *pool;
        void *zbpg;
        struct tmem_handle *th;
        unsigned size;

        /* read out the tmem handles from the data and set aside */
        zbpg = kmap_zbudpage_atomic(zbudpage);
        for (i = 0, j = 0; i < 2; i++) {
                size = (i == 0) ? zbudpage->zbud0_size : zbudpage->zbud1_size;
                if (size) {
                        th = (struct tmem_handle *)zbud_data(zbpg, i, size);
                        client_id[j] = th->client_id;
                        pool_id[j] = th->pool_id;
                        oid[j] = th->oid;
                        index[j] = th->index;
                        j++;
                        zbud_unuse_zbud(zbudpage, i, true);
                }
        }
        kunmap_zbudpage_atomic(zbpg);
        zbudpage_spin_unlock(zbudpage);
        /* zbudpage is now an unlocked dying... tell tmem to flush pointers */
        for (i = 0; i < j; i++) {
                pool = zcache_get_pool_by_id(client_id[i], pool_id[i]);
                if (pool != NULL) {
                        tmem_flush_page(pool, &oid[i], index[i]);
                        zcache_put_pool(pool);
                }
        }
}

/*
 * Externally callable zbud handling routines.
 */

/*
 * Return the maximum size compressed page that can be stored (secretly
 * setting aside space for the tmem handle).
 */
unsigned int zbud_max_buddy_size(void)
{
        return zbud_max_size() - sizeof(struct tmem_handle);
}
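/*
 * Illustrative sketch (not from the original file): as the header
 * comment warns, callers must bound-check against zbud_max_buddy_size()
 * before storing.  A hypothetical caller-side guard:
 */
#if 0
        /* in a caller, before attempting to store compressed data: */
        if (clen == 0 || clen > zbud_max_buddy_size())
                return -EINVAL; /* reject rather than trip zbud's BUG_ONs */
#endif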
/*
 * Given a zbud reference, free the corresponding zbud from all lists,
 * mark it as unused, do accounting, and if the freeing of the zbud
 * frees up an entire pageframe, return it to the caller (else NULL).
 */
struct page *zbud_free_and_delist(struct zbudref *zref, bool eph,
                                  unsigned int *zsize, unsigned int *zpages)
{
        unsigned long budnum = zbudref_budnum(zref);
        struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
        struct page *page = NULL;
        unsigned chunks, bud_size, other_bud_size;
        spinlock_t *lists_lock =
                eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
        struct zbud_unbuddied *unbud =
                eph ? zbud_eph_unbuddied : zbud_pers_unbuddied;

        spin_lock(lists_lock);
        zbudpage_spin_lock(zbudpage);
        if (zbudpage_is_dying(zbudpage)) {
                /* ignore dying zbudpage... see zbud_evict_pageframe_lru() */
                zbudpage_spin_unlock(zbudpage);
                spin_unlock(lists_lock);
                *zpages = 0;
                *zsize = 0;
                goto out;
        }
        if (budnum == 0) {
                bud_size = zbudpage->zbud0_size;
                other_bud_size = zbudpage->zbud1_size;
        } else {
                bud_size = zbudpage->zbud1_size;
                other_bud_size = zbudpage->zbud0_size;
        }
        *zsize = bud_size - sizeof(struct tmem_handle);
        *zpages = 1;
        zbud_unuse_zbud(zbudpage, budnum, eph);
        if (other_bud_size == 0) { /* was unbuddied: unlist and free */
                chunks = zbud_size_to_chunks(bud_size);
                if (zbudpage_is_zombie(zbudpage)) {
                        if (eph)
                                zbud_eph_zombie_count =
                                  atomic_dec_return(&zbud_eph_zombie_atomic);
                        else
                                zbud_pers_zombie_count =
                                  atomic_dec_return(&zbud_pers_zombie_atomic);
                        zbudpage_clear_zombie(zbudpage);
                } else {
                        BUG_ON(list_empty(&unbud[chunks].list));
                        list_del_init(&zbudpage->budlist);
                        unbud[chunks].count--;
                }
                list_del_init(&zbudpage->lru);
                spin_unlock(lists_lock);
                if (eph)
                        zbud_eph_unbuddied_count--;
                else
                        zbud_pers_unbuddied_count--;
                page = zbud_unuse_zbudpage(zbudpage, eph);
        } else { /* was buddied: move remaining buddy to unbuddied list */
                chunks = zbud_size_to_chunks(other_bud_size);
                if (!zbudpage_is_zombie(zbudpage)) {
                        list_del_init(&zbudpage->budlist);
                        list_add_tail(&zbudpage->budlist, &unbud[chunks].list);
                        unbud[chunks].count++;
                }
                if (eph) {
                        zbud_eph_buddied_count--;
                        zbud_eph_unbuddied_count++;
                } else {
                        zbud_pers_unbuddied_count++;
                        zbud_pers_buddied_count--;
                }
                /* don't mess with lru, no need to move it */
                zbudpage_spin_unlock(zbudpage);
                spin_unlock(lists_lock);
        }
out:
        return page;
}
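/*
 * Illustrative sketch (not from the original file): a returned page, if
 * any, belongs to the caller again.  A hypothetical caller of the free
 * path:
 */
#if 0
        unsigned int zsize, zpages;
        struct page *page = zbud_free_and_delist(zref, true, &zsize, &zpages);

        if (page != NULL)
                __free_page(page);      /* pageframe was fully emptied */
#endif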
/*
 * Given a tmem handle, and a kmapped pointer to compressed data of
 * the given size, try to find an unbuddied zbudpage in which to
 * create a zbud.  If found, put it there, mark the zbudpage unevictable,
 * and return a zbudref to it.  Else return NULL.
 */
struct zbudref *zbud_match_prep(struct tmem_handle *th, bool eph,
                                void *cdata, unsigned size)
{
        struct zbudpage *zbudpage = NULL, *zbudpage2;
        unsigned long budnum = 0UL;
        unsigned nchunks;
        int i, found_good_buddy = 0;
        spinlock_t *lists_lock =
                eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
        struct zbud_unbuddied *unbud =
                eph ? zbud_eph_unbuddied : zbud_pers_unbuddied;

        size += sizeof(struct tmem_handle);
        nchunks = zbud_size_to_chunks(size);
        for (i = MAX_CHUNK - nchunks + 1; i > 0; i--) {
                spin_lock(lists_lock);
                if (!list_empty(&unbud[i].list)) {
                        list_for_each_entry_safe(zbudpage, zbudpage2,
                                                 &unbud[i].list, budlist) {
                                if (zbudpage_spin_trylock(zbudpage)) {
                                        found_good_buddy = i;
                                        goto found_unbuddied;
                                }
                        }
                }
                spin_unlock(lists_lock);
        }
        zbudpage = NULL;
        goto out;

found_unbuddied:
        BUG_ON(!zbudpage_is_locked(zbudpage));
        BUG_ON(!((zbudpage->zbud0_size == 0) ^ (zbudpage->zbud1_size == 0)));
        if (zbudpage->zbud0_size == 0)
                budnum = 0UL;
        else if (zbudpage->zbud1_size == 0)
                budnum = 1UL;
        list_del_init(&zbudpage->budlist);
        if (eph) {
                list_add_tail(&zbudpage->budlist, &zbud_eph_buddied_list);
                unbud[found_good_buddy].count--;
                zbud_eph_unbuddied_count--;
                zbud_eph_buddied_count++;
                /* "promote" raw zbudpage to most-recently-used */
                list_del_init(&zbudpage->lru);
                list_add_tail(&zbudpage->lru, &zbud_eph_lru_list);
        } else {
                list_add_tail(&zbudpage->budlist, &zbud_pers_buddied_list);
                unbud[found_good_buddy].count--;
                zbud_pers_unbuddied_count--;
                zbud_pers_buddied_count++;
                /* "promote" raw zbudpage to most-recently-used */
                list_del_init(&zbudpage->lru);
                list_add_tail(&zbudpage->lru, &zbud_pers_lru_list);
        }
        zbud_init_zbud(zbudpage, th, eph, cdata, budnum, size);
        zbudpage->unevictable++;
        BUG_ON(zbudpage->unevictable == 3);
        zbudpage_spin_unlock(zbudpage);
        spin_unlock(lists_lock);
out:
        return zbudpage_to_zbudref(zbudpage, budnum);
}

/*
 * Given a tmem handle, and a kmapped pointer to compressed data of
 * the given size, and a newly allocated struct page, create an unevictable
 * zbud in that new page and return a zbudref to it.
 */
struct zbudref *zbud_create_prep(struct tmem_handle *th, bool eph,
                                 void *cdata, unsigned size,
                                 struct page *newpage)
{
        struct zbudpage *zbudpage;
        unsigned long budnum = 0;
        unsigned nchunks;
        spinlock_t *lists_lock =
                eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
        struct zbud_unbuddied *unbud =
                eph ? zbud_eph_unbuddied : zbud_pers_unbuddied;

#if 0
        /* this may be worth it later to support decompress-in-place? */
        static unsigned long counter;
        budnum = counter++ & 1; /* alternate using zbud0 and zbud1 */
#endif

        if (size > zbud_max_buddy_size())
                return NULL;
        if (newpage == NULL)
                return NULL;

        size += sizeof(struct tmem_handle);
        nchunks = zbud_size_to_chunks(size);
        spin_lock(lists_lock);
        zbudpage = zbud_init_zbudpage(newpage, eph);
        zbudpage_spin_lock(zbudpage);
        list_add_tail(&zbudpage->budlist, &unbud[nchunks].list);
        if (eph) {
                list_add_tail(&zbudpage->lru, &zbud_eph_lru_list);
                zbud_eph_unbuddied_count++;
        } else {
                list_add_tail(&zbudpage->lru, &zbud_pers_lru_list);
                zbud_pers_unbuddied_count++;
        }
        unbud[nchunks].count++;
        zbud_init_zbud(zbudpage, th, eph, cdata, budnum, size);
        zbudpage->unevictable++;
        BUG_ON(zbudpage->unevictable == 3);
        zbudpage_spin_unlock(zbudpage);
        spin_unlock(lists_lock);
        return zbudpage_to_zbudref(zbudpage, budnum);
}
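/*
 * Illustrative sketch (not from the original file): a typical put path
 * tries to buddy into an existing pageframe first and only then spends
 * a fresh page (hypothetical caller, error handling elided):
 */
#if 0
        struct zbudref *zref = zbud_match_prep(th, eph, cdata, clen);

        if (zref == NULL) {
                struct page *newpage = alloc_page(GFP_ATOMIC);

                zref = zbud_create_prep(th, eph, cdata, clen, newpage);
        }
        /* ... record zref as the tmem pampd pointer, then ... */
        zbud_create_finish(zref, eph);  /* drop the unevictable hold */
#endif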
/*
 * Finish creation of a zbud by, assuming another zbud isn't being created
 * in parallel, marking it evictable.
 */
void zbud_create_finish(struct zbudref *zref, bool eph)
{
        struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
        spinlock_t *lists_lock =
                eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;

        spin_lock(lists_lock);
        zbudpage_spin_lock(zbudpage);
        BUG_ON(zbudpage_is_dying(zbudpage));
        zbudpage->unevictable--;
        BUG_ON((int)zbudpage->unevictable < 0);
        zbudpage_spin_unlock(zbudpage);
        spin_unlock(lists_lock);
}

/*
 * Given a zbudref and a struct page, decompress the data from
 * the zbud into the physical page represented by the struct page
 * by upcalling to zcache_decompress.
 */
int zbud_decompress(struct page *data_page, struct zbudref *zref, bool eph,
                    void (*decompress)(char *, unsigned int, char *))
{
        struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
        unsigned long budnum = zbudref_budnum(zref);
        void *zbpg;
        char *to_va, *from_va;
        unsigned size;
        int ret = -1;
        spinlock_t *lists_lock =
                eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;

        spin_lock(lists_lock);
        zbudpage_spin_lock(zbudpage);
        if (zbudpage_is_dying(zbudpage)) {
                /* ignore dying zbudpage... see zbud_evict_pageframe_lru() */
                goto out;
        }
        zbpg = kmap_zbudpage_atomic(zbudpage);
        to_va = kmap_atomic(data_page);
        if (budnum == 0)
                size = zbudpage->zbud0_size;
        else
                size = zbudpage->zbud1_size;
        BUG_ON(size == 0 || size > zbud_max_size());
        from_va = zbud_data(zbpg, budnum, size);
        from_va += sizeof(struct tmem_handle);
        size -= sizeof(struct tmem_handle);
        decompress(from_va, size, to_va);
        kunmap_atomic(to_va);
        kunmap_zbudpage_atomic(zbpg);
        ret = 0;
out:
        zbudpage_spin_unlock(zbudpage);
        spin_unlock(lists_lock);
        return ret;
}

/*
 * Given a zbudref and a kernel pointer, copy the data from
 * the zbud to the kernel pointer.
 */
int zbud_copy_from_zbud(char *to_va, struct zbudref *zref,
                        size_t *sizep, bool eph)
{
        struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
        unsigned long budnum = zbudref_budnum(zref);
        void *zbpg;
        char *from_va;
        unsigned size;
        int ret = -1;
        spinlock_t *lists_lock =
                eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;

        spin_lock(lists_lock);
        zbudpage_spin_lock(zbudpage);
        if (zbudpage_is_dying(zbudpage)) {
                /* ignore dying zbudpage... see zbud_evict_pageframe_lru() */
                goto out;
        }
        zbpg = kmap_zbudpage_atomic(zbudpage);
        if (budnum == 0)
                size = zbudpage->zbud0_size;
        else
                size = zbudpage->zbud1_size;
        BUG_ON(size == 0 || size > zbud_max_size());
        from_va = zbud_data(zbpg, budnum, size);
        from_va += sizeof(struct tmem_handle);
        size -= sizeof(struct tmem_handle);
        *sizep = size;
        memcpy(to_va, from_va, size);

        kunmap_zbudpage_atomic(zbpg);
        ret = 0;
out:
        zbudpage_spin_unlock(zbudpage);
        spin_unlock(lists_lock);
        return ret;
}
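/*
 * Illustrative sketch (not from the original file): the get path hands
 * zbud_decompress() the destination page and the caller's decompressor
 * callback (a hypothetical wrapper with the expected signature):
 */
#if 0
static int zbud_get_example(struct page *data_page, struct zbudref *zref,
                            void (*my_decompress)(char *, unsigned int,
                                                  char *))
{
        /* returns -1 if the zbudpage is dying; treat that as a miss */
        return zbud_decompress(data_page, zref, true, my_decompress);
}
#endif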
/*
 * Given a zbudref and a kernel pointer, copy the data from
 * the kernel pointer to the zbud.
 */
int zbud_copy_to_zbud(struct zbudref *zref, char *from_va, bool eph)
{
        struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
        unsigned long budnum = zbudref_budnum(zref);
        void *zbpg;
        char *to_va;
        unsigned size;
        int ret = -1;
        spinlock_t *lists_lock =
                eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;

        spin_lock(lists_lock);
        zbudpage_spin_lock(zbudpage);
        if (zbudpage_is_dying(zbudpage)) {
                /* ignore dying zbudpage... see zbud_evict_pageframe_lru() */
                goto out;
        }
        zbpg = kmap_zbudpage_atomic(zbudpage);
        if (budnum == 0)
                size = zbudpage->zbud0_size;
        else
                size = zbudpage->zbud1_size;
        BUG_ON(size == 0 || size > zbud_max_size());
        to_va = zbud_data(zbpg, budnum, size);
        to_va += sizeof(struct tmem_handle);
        size -= sizeof(struct tmem_handle);
        memcpy(to_va, from_va, size);

        kunmap_zbudpage_atomic(zbpg);
        ret = 0;
out:
        zbudpage_spin_unlock(zbudpage);
        spin_unlock(lists_lock);
        return ret;
}

/*
 * Choose an ephemeral LRU zbudpage that is evictable (not locked), ensure
 * there are no references to it remaining, and return the now unused
 * (and re-init'ed) struct page and the total amount of compressed
 * data that was evicted.
 */
struct page *zbud_evict_pageframe_lru(unsigned int *zsize, unsigned int *zpages)
{
        struct zbudpage *zbudpage = NULL, *zbudpage2;
        struct zbud_unbuddied *unbud = zbud_eph_unbuddied;
        struct page *page = NULL;
        bool irqs_disabled = irqs_disabled();

        /*
         * Since this can be called indirectly from cleancache_put, which
         * has interrupts disabled, as well as frontswap_put, which does not,
         * we need to be able to handle both cases, even though it is ugly.
         */
        if (irqs_disabled)
                spin_lock(&zbud_eph_lists_lock);
        else
                spin_lock_bh(&zbud_eph_lists_lock);
        *zsize = 0;
        if (list_empty(&zbud_eph_lru_list))
                goto unlock_out;
        list_for_each_entry_safe(zbudpage, zbudpage2, &zbud_eph_lru_list, lru) {
                /* skip a locked zbudpage */
                if (unlikely(!zbudpage_spin_trylock(zbudpage)))
                        continue;
                /* skip an unevictable zbudpage */
                if (unlikely(zbudpage->unevictable != 0)) {
                        zbudpage_spin_unlock(zbudpage);
                        continue;
                }
                /* got a locked evictable page */
                goto evict_page;
        }
unlock_out:
        /* no unlocked evictable pages, give up */
        if (irqs_disabled)
                spin_unlock(&zbud_eph_lists_lock);
        else
                spin_unlock_bh(&zbud_eph_lists_lock);
        goto out;

evict_page:
        list_del_init(&zbudpage->budlist);
        list_del_init(&zbudpage->lru);
        zbudpage_set_dying(zbudpage);
        /*
         * the zbudpage is now "dying" and attempts to read, write,
         * or delete data from it will be ignored
         */
        if (zbudpage->zbud0_size != 0 && zbudpage->zbud1_size != 0) {
                *zsize = zbudpage->zbud0_size + zbudpage->zbud1_size -
                                (2 * sizeof(struct tmem_handle));
                *zpages = 2;
        } else if (zbudpage->zbud0_size != 0) {
                unbud[zbud_size_to_chunks(zbudpage->zbud0_size)].count--;
                *zsize = zbudpage->zbud0_size - sizeof(struct tmem_handle);
                *zpages = 1;
        } else if (zbudpage->zbud1_size != 0) {
                unbud[zbud_size_to_chunks(zbudpage->zbud1_size)].count--;
                *zsize = zbudpage->zbud1_size - sizeof(struct tmem_handle);
                *zpages = 1;
        } else {
                BUG();
        }
        spin_unlock(&zbud_eph_lists_lock);
        zbud_eph_evicted_pageframes++;
        if (*zpages == 1)
                zbud_eph_unbuddied_count--;
        else
                zbud_eph_buddied_count--;
        zbud_evict_tmem(zbudpage);
        zbudpage_spin_lock(zbudpage);
        zbudpage_clear_dying(zbudpage);
        page = zbud_unuse_zbudpage(zbudpage, true);
        if (!irqs_disabled)
                local_bh_enable();
out:
        return page;
}
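/*
 * Illustrative sketch (not from the original file): under memory
 * pressure a caller can repeatedly evict ephemeral pageframes and
 * return them to the page allocator (hypothetical reclaim loop):
 */
#if 0
        unsigned int zsize, zpages;
        struct page *page;

        while (nr_to_reclaim-- > 0) {
                page = zbud_evict_pageframe_lru(&zsize, &zpages);
                if (page == NULL)
                        break;  /* nothing evictable right now */
                __free_page(page);
        }
#endif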
/*
 * Choose a persistent LRU zbudpage that is evictable (not locked), zombify it,
 * read the tmem_handle(s) out of it into the passed array, and return the
 * number of zbuds.  Caller must perform necessary tmem functions and,
 * indirectly, zbud functions to fetch any valid data and cause the
 * now-zombified zbudpage to eventually be freed.  We track the zombified
 * zbudpage count so it is possible to observe if there is a leak.
 * FIXME: describe (ramster) case where data pointers are passed in for memcpy
 */
unsigned int zbud_make_zombie_lru(struct tmem_handle *th, unsigned char **data,
                                  unsigned int *zsize, bool eph)
{
        struct zbudpage *zbudpage = NULL, *zbudpag2;
        struct tmem_handle *thfrom;
        char *from_va;
        void *zbpg;
        unsigned size;
        int ret = 0, i;
        spinlock_t *lists_lock =
                eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
        struct list_head *lru_list =
                eph ? &zbud_eph_lru_list : &zbud_pers_lru_list;

        spin_lock_bh(lists_lock);
        if (list_empty(lru_list))
                goto out;
        list_for_each_entry_safe(zbudpage, zbudpag2, lru_list, lru) {
                /* skip a locked zbudpage */
                if (unlikely(!zbudpage_spin_trylock(zbudpage)))
                        continue;
                /* skip an unevictable zbudpage */
                if (unlikely(zbudpage->unevictable != 0)) {
                        zbudpage_spin_unlock(zbudpage);
                        continue;
                }
                /* got a locked evictable page */
                goto zombify_page;
        }
        /* no unlocked evictable pages, give up */
        goto out;

zombify_page:
        /* got an unlocked evictable page, zombify it */
        list_del_init(&zbudpage->budlist);
        zbudpage_set_zombie(zbudpage);
        /* FIXME what accounting do I need to do here? */
        list_del_init(&zbudpage->lru);
        if (eph) {
                list_add_tail(&zbudpage->lru, &zbud_eph_zombie_list);
                zbud_eph_zombie_count =
                                atomic_inc_return(&zbud_eph_zombie_atomic);
        } else {
                list_add_tail(&zbudpage->lru, &zbud_pers_zombie_list);
                zbud_pers_zombie_count =
                                atomic_inc_return(&zbud_pers_zombie_atomic);
        }
        /* FIXME what accounting do I need to do here? */
        zbpg = kmap_zbudpage_atomic(zbudpage);
        for (i = 0; i < 2; i++) {
                size = (i == 0) ? zbudpage->zbud0_size : zbudpage->zbud1_size;
                if (size) {
                        from_va = zbud_data(zbpg, i, size);
                        thfrom = (struct tmem_handle *)from_va;
                        from_va += sizeof(struct tmem_handle);
                        size -= sizeof(struct tmem_handle);
                        if (th != NULL)
                                th[ret] = *thfrom;
                        if (data != NULL)
                                memcpy(data[ret], from_va, size);
                        if (zsize != NULL)
                                *zsize++ = size;
                        ret++;
                }
        }
        kunmap_zbudpage_atomic(zbpg);
        zbudpage_spin_unlock(zbudpage);
out:
        spin_unlock_bh(lists_lock);
        return ret;
}

void __init zbud_init(void)
{
        int i;

#ifdef CONFIG_DEBUG_FS
        zbud_debugfs_init();
#endif
        BUG_ON((sizeof(struct tmem_handle) * 2 > CHUNK_SIZE));
        BUG_ON(sizeof(struct zbudpage) > sizeof(struct page));
        for (i = 0; i < NCHUNKS; i++) {
                INIT_LIST_HEAD(&zbud_eph_unbuddied[i].list);
                INIT_LIST_HEAD(&zbud_pers_unbuddied[i].list);
        }
}
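A note on the zombie path, as a minimal sketch (an editor's addition, not part of the file above): after zbud_make_zombie_lru() hands back the tmem handles, the caller is expected to flush each handle through tmem, which indirectly calls back into zbud_free_and_delist() and ultimately frees the zombified pageframe. A hypothetical caller-side loop, mirroring what zbud_evict_tmem() does internally:

        struct tmem_handle th[2];
        unsigned int zsize[2];
        unsigned int nzbuds, i;

        nzbuds = zbud_make_zombie_lru(th, NULL, zsize, false);
        for (i = 0; i < nzbuds; i++) {
                /* flushing the handle invalidates the pampd (the zbudref),
                 * which ends in zbud_free_and_delist() freeing the zombie */
                struct tmem_pool *pool =
                        zcache_get_pool_by_id(th[i].client_id, th[i].pool_id);

                if (pool != NULL) {
                        tmem_flush_page(pool, &th[i].oid, th[i].index);
                        zcache_put_pool(pool);
                }
        }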