Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
zbud.c at v3.7-rc5 (1060 lines, 33 kB)
/*
 * zbud.c - Compression buddies allocator
 *
 * Copyright (c) 2010-2012, Dan Magenheimer, Oracle Corp.
 *
 * Compression buddies ("zbud") provides for efficiently packing two
 * (or, possibly in the future, more) compressed pages ("zpages") into
 * a single "raw" pageframe and for tracking both zpages and pageframes
 * so that whole pageframes can be easily reclaimed in LRU-like order.
 * It is designed to be used in conjunction with transcendent memory
 * ("tmem"); for example, separate LRU lists are maintained for persistent
 * vs. ephemeral pages.
 *
 * A zbudpage is an overlay for a struct page and thus each zbudpage
 * refers to a physical pageframe of RAM.  When the caller passes a
 * struct page from the kernel's page allocator, zbud "transforms" it
 * to a zbudpage, which sets/uses a different set of fields than the
 * struct page and thus must be "untransformed" back by reinitializing
 * certain fields before the struct page can be freed.  The fields
 * of a zbudpage include a page lock for controlling access to the
 * corresponding pageframe, and there is a size field for each zpage.
 * Each zbudpage also lives on two linked lists: a "budlist", which is
 * used to support efficient buddying of zpages, and an "lru", which
 * is used for reclaiming pageframes in approximately least-recently-used
 * order.
 *
 * A zbudpageframe is a pageframe divided up into aligned 64-byte "chunks"
 * which contain the compressed data for zero, one, or two zbuds.  Contained
 * within the compressed data is a tmem_handle, a key that allows
 * the same data to be found via the tmem interface so that the zpage can
 * be invalidated (for ephemeral pages) or repatriated to the swap cache
 * (for persistent pages).  The contents of a zbudpageframe must never
 * be accessed without holding the page lock for the corresponding
 * zbudpage and, to accommodate highmem machines, the contents may
 * only be examined or changed when kmapped.  Thus, when in use, a
 * kmapped zbudpageframe is referred to in the zbud code as "void *zbpg".
 *
 * Note that the term "zbud" refers to the combination of a zpage and
 * a tmem_handle that is stored as one of possibly two "buddied" zpages;
 * it also generically refers to this allocator... sorry for any confusion.
 *
 * A zbudref is a pointer to a struct zbudpage (which can be cast to a
 * struct page), with the LSB either cleared or set to indicate, respectively,
 * the first or second zpage in the zbudpageframe.  Since a zbudref can be
 * cast to a pointer, it is used as the tmem "pampd" pointer and uniquely
 * references a stored tmem page and so is the only zbud data structure
 * externally visible to zbud.c/zbud.h.
 *
 * Since we wish to reclaim entire pageframes but zpages may be randomly
 * added to and deleted from any given pageframe, we approximate LRU by
 * promoting a pageframe to MRU when a zpage is added to it, but
 * leaving it at the current place in the list when a zpage is deleted
 * from it.  As a side effect, zpages that are difficult to buddy (e.g.
 * very large pages) will be reclaimed faster than average, which seems
 * reasonable.
 *
 * In the current implementation, no more than two zpages may be stored in
 * any pageframe and no zpage ever crosses a pageframe boundary.
 * While other zpage allocation mechanisms may allow greater density, this
 * two-zpage-per-pageframe limit both ensures simple reclaim of pageframes
 * (including garbage collection of references to the contents of those
 * pageframes from tmem data structures) AND avoids the need for compaction.
 * With additional complexity, zbud could be modified to support storing
 * up to three zpages per pageframe or, to handle larger average zpages,
 * up to three zpages per pair of pageframes, but it is not clear if the
 * additional complexity would be worth it.  So consider it an exercise
 * for future developers.
 *
 * Note also that zbud does no page allocation or freeing.  This is so
 * that the caller has complete control over, and visibility (for
 * accounting) into, if/when pages are allocated and freed.
 *
 * Finally, note that zbud limits the size of zpages it can store; the
 * caller must check the zpage size with zbud_max_buddy_size before
 * storing it, else BUGs will result.  User beware.
 */

#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/pagemap.h>
#include <linux/atomic.h>
#include <linux/bug.h>
#include "tmem.h"
#include "zcache.h"
#include "zbud.h"

/*
 * We need to ensure that a struct zbudpage is never larger than a
 * struct page.  This is checked with a BUG_ON in zbud_init.
 *
 * The unevictable field indicates that a zbud is being added to the
 * zbudpage.  Since this is a two-phase process (due to tmem locking),
 * this field locks the zbudpage against eviction when a zbud match
 * or creation is in process.  Since this addition process may occur
 * in parallel for two zbuds in one zbudpage, the field is a counter
 * that must not exceed two.
 */
struct zbudpage {
        union {
                struct page page;
                struct {
                        unsigned long space_for_flags;
                        struct {
                                unsigned zbud0_size:12;
                                unsigned zbud1_size:12;
                                unsigned unevictable:2;
                        };
                        struct list_head budlist;
                        struct list_head lru;
                };
        };
};

struct zbudref {
        union {
                struct zbudpage *zbudpage;
                unsigned long zbudref;
        };
};

#define CHUNK_SHIFT     6
#define CHUNK_SIZE      (1 << CHUNK_SHIFT)
#define CHUNK_MASK      (~(CHUNK_SIZE-1))
#define NCHUNKS         (PAGE_SIZE >> CHUNK_SHIFT)
#define MAX_CHUNK       (NCHUNKS-1)
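/*
 * Illustrative sketch (not from the original file): with the usual
 * PAGE_SIZE of 4096 and CHUNK_SHIFT of 6, CHUNK_SIZE is 64, NCHUNKS is
 * 64, and MAX_CHUNK is 63, so the largest zbud is 63 * 64 = 4032 bytes,
 * including the embedded tmem_handle (see zbud_max_buddy_size() below).
 */
#if 0
static inline void zbud_chunk_arithmetic_example(void)
{
        /* sizes are rounded up to whole 64-byte chunks */
        BUILD_BUG_ON(CHUNK_SIZE != 64);
        /* e.g. a 100-byte zbud occupies 2 chunks: (100 + 63) >> 6 == 2 */
        BUILD_BUG_ON(((100 + CHUNK_SIZE - 1) >> CHUNK_SHIFT) != 2);
}
#endif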
/*
 * The following functions deal with the difference between struct
 * page and struct zbudpage.  Note the hack of using the pageflags
 * from struct page; this is to avoid duplicating all the complex
 * pageflag macros.
 */
static inline void zbudpage_spin_lock(struct zbudpage *zbudpage)
{
        struct page *page = (struct page *)zbudpage;

        while (unlikely(test_and_set_bit_lock(PG_locked, &page->flags))) {
                do {
                        cpu_relax();
                } while (test_bit(PG_locked, &page->flags));
        }
}

static inline void zbudpage_spin_unlock(struct zbudpage *zbudpage)
{
        struct page *page = (struct page *)zbudpage;

        clear_bit(PG_locked, &page->flags);
}

static inline int zbudpage_spin_trylock(struct zbudpage *zbudpage)
{
        return trylock_page((struct page *)zbudpage);
}

static inline int zbudpage_is_locked(struct zbudpage *zbudpage)
{
        return PageLocked((struct page *)zbudpage);
}

static inline void *kmap_zbudpage_atomic(struct zbudpage *zbudpage)
{
        return kmap_atomic((struct page *)zbudpage);
}

/*
 * A dying zbudpage is an ephemeral page in the process of being evicted.
 * Any data contained in the zbudpage is invalid and we are just waiting for
 * the tmem pampds to be invalidated before freeing the page.
 */
static inline int zbudpage_is_dying(struct zbudpage *zbudpage)
{
        struct page *page = (struct page *)zbudpage;

        return test_bit(PG_reclaim, &page->flags);
}

static inline void zbudpage_set_dying(struct zbudpage *zbudpage)
{
        struct page *page = (struct page *)zbudpage;

        set_bit(PG_reclaim, &page->flags);
}

static inline void zbudpage_clear_dying(struct zbudpage *zbudpage)
{
        struct page *page = (struct page *)zbudpage;

        clear_bit(PG_reclaim, &page->flags);
}

/*
 * A zombie zbudpage is a persistent page in the process of being evicted.
 * The data contained in the zbudpage is valid and we are just waiting for
 * the tmem pampds to be invalidated before freeing the page.
 */
static inline int zbudpage_is_zombie(struct zbudpage *zbudpage)
{
        struct page *page = (struct page *)zbudpage;

        return test_bit(PG_dirty, &page->flags);
}

static inline void zbudpage_set_zombie(struct zbudpage *zbudpage)
{
        struct page *page = (struct page *)zbudpage;

        set_bit(PG_dirty, &page->flags);
}

static inline void zbudpage_clear_zombie(struct zbudpage *zbudpage)
{
        struct page *page = (struct page *)zbudpage;

        clear_bit(PG_dirty, &page->flags);
}

static inline void kunmap_zbudpage_atomic(void *zbpg)
{
        kunmap_atomic(zbpg);
}

/*
 * zbud "translation" and helper functions
 */

static inline struct zbudpage *zbudref_to_zbudpage(struct zbudref *zref)
{
        unsigned long zbud = (unsigned long)zref;
        zbud &= ~1UL;
        return (struct zbudpage *)zbud;
}

static inline struct zbudref *zbudpage_to_zbudref(struct zbudpage *zbudpage,
                                                  unsigned budnum)
{
        unsigned long zbud = (unsigned long)zbudpage;
        BUG_ON(budnum > 1);
        zbud |= budnum;
        return (struct zbudref *)zbud;
}

static inline int zbudref_budnum(struct zbudref *zbudref)
{
        unsigned long zbud = (unsigned long)zbudref;
        return zbud & 1UL;
}

static inline unsigned zbud_max_size(void)
{
        return MAX_CHUNK << CHUNK_SHIFT;
}

static inline unsigned zbud_size_to_chunks(unsigned size)
{
        BUG_ON(size == 0 || size > zbud_max_size());
        return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
}
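/*
 * Illustrative sketch (not from the original file): the LSB encoding in
 * a zbudref works because a struct page, and hence a struct zbudpage,
 * is always at least word-aligned, so bit 0 of the pointer is free to
 * carry the buddy number.
 */
#if 0
static void zbudref_encoding_example(struct zbudpage *zbudpage)
{
        /* pointer is aligned, so bit 0 is free to hold budnum */
        struct zbudref *zref = zbudpage_to_zbudref(zbudpage, 1);

        BUG_ON(zbudref_budnum(zref) != 1);
        BUG_ON(zbudref_to_zbudpage(zref) != zbudpage);
}
#endif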
/* can only be used between kmap_zbudpage_atomic/kunmap_zbudpage_atomic! */
static inline char *zbud_data(void *zbpg,
                              unsigned budnum, unsigned size)
{
        char *p;

        BUG_ON(size == 0 || size > zbud_max_size());
        p = (char *)zbpg;
        if (budnum == 1)
                p += PAGE_SIZE - ((size + CHUNK_SIZE - 1) & CHUNK_MASK);
        return p;
}

/*
 * These are all informative and exposed through debugfs... except for
 * the arrays... anyone know how to do that?  To avoid confusion for
 * debugfs viewers, some of these should also be atomic_long_t, but
 * I don't know how to expose atomics via debugfs either...
 */
static unsigned long zbud_eph_pageframes;
static unsigned long zbud_pers_pageframes;
static unsigned long zbud_eph_zpages;
static unsigned long zbud_pers_zpages;
static u64 zbud_eph_zbytes;
static u64 zbud_pers_zbytes;
static unsigned long zbud_eph_evicted_pageframes;
static unsigned long zbud_pers_evicted_pageframes;
static unsigned long zbud_eph_cumul_zpages;
static unsigned long zbud_pers_cumul_zpages;
static u64 zbud_eph_cumul_zbytes;
static u64 zbud_pers_cumul_zbytes;
static unsigned long zbud_eph_cumul_chunk_counts[NCHUNKS];
static unsigned long zbud_pers_cumul_chunk_counts[NCHUNKS];
static unsigned long zbud_eph_buddied_count;
static unsigned long zbud_pers_buddied_count;
static unsigned long zbud_eph_unbuddied_count;
static unsigned long zbud_pers_unbuddied_count;
static unsigned long zbud_eph_zombie_count;
static unsigned long zbud_pers_zombie_count;
static atomic_t zbud_eph_zombie_atomic;
static atomic_t zbud_pers_zombie_atomic;

#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#define zdfs    debugfs_create_size_t
#define zdfs64  debugfs_create_u64
static int zbud_debugfs_init(void)
{
        struct dentry *root = debugfs_create_dir("zbud", NULL);
        if (root == NULL)
                return -ENXIO;

        /*
         * would be nice to dump the sizes of the unbuddied
         * arrays, like was done with sysfs, but it doesn't
         * look like debugfs is flexible enough to do that
         */
        zdfs64("eph_zbytes", S_IRUGO, root, &zbud_eph_zbytes);
        zdfs64("eph_cumul_zbytes", S_IRUGO, root, &zbud_eph_cumul_zbytes);
        zdfs64("pers_zbytes", S_IRUGO, root, &zbud_pers_zbytes);
        zdfs64("pers_cumul_zbytes", S_IRUGO, root, &zbud_pers_cumul_zbytes);
        zdfs("eph_cumul_zpages", S_IRUGO, root, &zbud_eph_cumul_zpages);
        zdfs("eph_evicted_pageframes", S_IRUGO, root,
             &zbud_eph_evicted_pageframes);
        zdfs("eph_zpages", S_IRUGO, root, &zbud_eph_zpages);
        zdfs("eph_pageframes", S_IRUGO, root, &zbud_eph_pageframes);
        zdfs("eph_buddied_count", S_IRUGO, root, &zbud_eph_buddied_count);
        zdfs("eph_unbuddied_count", S_IRUGO, root, &zbud_eph_unbuddied_count);
        zdfs("pers_cumul_zpages", S_IRUGO, root, &zbud_pers_cumul_zpages);
        zdfs("pers_evicted_pageframes", S_IRUGO, root,
             &zbud_pers_evicted_pageframes);
        zdfs("pers_zpages", S_IRUGO, root, &zbud_pers_zpages);
        zdfs("pers_pageframes", S_IRUGO, root, &zbud_pers_pageframes);
        zdfs("pers_buddied_count", S_IRUGO, root, &zbud_pers_buddied_count);
        zdfs("pers_unbuddied_count", S_IRUGO, root, &zbud_pers_unbuddied_count);
        zdfs("pers_zombie_count", S_IRUGO, root, &zbud_pers_zombie_count);
        return 0;
}
#undef zdfs
#undef zdfs64
#endif

/* protects the buddied list and all unbuddied lists */
static DEFINE_SPINLOCK(zbud_eph_lists_lock);
static DEFINE_SPINLOCK(zbud_pers_lists_lock);

struct zbud_unbuddied {
        struct list_head list;
        unsigned count;
};
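/*
 * Illustrative sketch (not from the original file): buddy 0 grows from
 * the start of the pageframe and buddy 1 sits flush against the end,
 * rounded up to whole chunks.  Assuming PAGE_SIZE = 4096:
 */
#if 0
static void zbud_data_layout_example(void *zbpg)
{
        /* buddy 0 starts at the beginning of the kmapped pageframe */
        char *b0 = zbud_data(zbpg, 0, 100);     /* == (char *)zbpg */
        /* buddy 1: 100 bytes round up to 2 chunks (128 bytes), so the
         * data starts at offset 4096 - 128 = 3968 */
        char *b1 = zbud_data(zbpg, 1, 100);
}
#endif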
/* list N contains pages with N chunks USED and NCHUNKS-N unused */
/* element 0 is never used but optimizing that isn't worth it */
static struct zbud_unbuddied zbud_eph_unbuddied[NCHUNKS];
static struct zbud_unbuddied zbud_pers_unbuddied[NCHUNKS];
static LIST_HEAD(zbud_eph_lru_list);
static LIST_HEAD(zbud_pers_lru_list);
static LIST_HEAD(zbud_eph_buddied_list);
static LIST_HEAD(zbud_pers_buddied_list);
static LIST_HEAD(zbud_eph_zombie_list);
static LIST_HEAD(zbud_pers_zombie_list);

/*
 * Given a struct page, transform it to a zbudpage so that it can be
 * used by zbud and initialize fields as necessary.
 */
static inline struct zbudpage *zbud_init_zbudpage(struct page *page, bool eph)
{
        struct zbudpage *zbudpage = (struct zbudpage *)page;

        BUG_ON(page == NULL);
        INIT_LIST_HEAD(&zbudpage->budlist);
        INIT_LIST_HEAD(&zbudpage->lru);
        zbudpage->zbud0_size = 0;
        zbudpage->zbud1_size = 0;
        zbudpage->unevictable = 0;
        if (eph)
                zbud_eph_pageframes++;
        else
                zbud_pers_pageframes++;
        return zbudpage;
}

/* "Transform" a zbudpage back to a struct page suitable to free. */
static inline struct page *zbud_unuse_zbudpage(struct zbudpage *zbudpage,
                                               bool eph)
{
        struct page *page = (struct page *)zbudpage;

        BUG_ON(!list_empty(&zbudpage->budlist));
        BUG_ON(!list_empty(&zbudpage->lru));
        BUG_ON(zbudpage->zbud0_size != 0);
        BUG_ON(zbudpage->zbud1_size != 0);
        BUG_ON(!PageLocked(page));
        BUG_ON(zbudpage->unevictable != 0);
        BUG_ON(zbudpage_is_dying(zbudpage));
        BUG_ON(zbudpage_is_zombie(zbudpage));
        if (eph)
                zbud_eph_pageframes--;
        else
                zbud_pers_pageframes--;
        zbudpage_spin_unlock(zbudpage);
        reset_page_mapcount(page);
        init_page_count(page);
        page->index = 0;
        return page;
}

/* Mark a zbud as unused and do accounting */
static inline void zbud_unuse_zbud(struct zbudpage *zbudpage,
                                   int budnum, bool eph)
{
        unsigned size;

        BUG_ON(!zbudpage_is_locked(zbudpage));
        if (budnum == 0) {
                size = zbudpage->zbud0_size;
                zbudpage->zbud0_size = 0;
        } else {
                size = zbudpage->zbud1_size;
                zbudpage->zbud1_size = 0;
        }
        if (eph) {
                zbud_eph_zbytes -= size;
                zbud_eph_zpages--;
        } else {
                zbud_pers_zbytes -= size;
                zbud_pers_zpages--;
        }
}
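/*
 * Illustrative sketch (not from the original file): as the header
 * comment says, zbud never calls the page allocator itself.  A caller
 * might drive the transform/untransform lifecycle roughly like this
 * (hypothetical helper, error handling and actual zbud stores elided):
 */
#if 0
static void zbudpage_lifecycle_example(bool eph)
{
        struct page *page = alloc_page(GFP_KERNEL);     /* caller allocates */
        struct zbudpage *zbudpage = zbud_init_zbudpage(page, eph);

        /* ... store zbuds; later remove them so both lists are empty,
         * both sizes are zero, and unevictable is zero ... */
        zbudpage_spin_lock(zbudpage);
        __free_page(zbud_unuse_zbudpage(zbudpage, eph)); /* caller frees */
}
#endif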
/*
 * Given a zbudpage/budnum/size, a tmem handle, and a kmapped pointer
 * to some data, set up the zbud appropriately including data copying
 * and accounting.  Note that if cdata is NULL, the data copying is
 * skipped.  (This is useful for lazy writes such as for RAMster.)
 */
static void zbud_init_zbud(struct zbudpage *zbudpage, struct tmem_handle *th,
                           bool eph, void *cdata,
                           unsigned budnum, unsigned size)
{
        char *to;
        void *zbpg;
        struct tmem_handle *to_th;
        unsigned nchunks = zbud_size_to_chunks(size);

        BUG_ON(!zbudpage_is_locked(zbudpage));
        zbpg = kmap_zbudpage_atomic(zbudpage);
        to = zbud_data(zbpg, budnum, size);
        to_th = (struct tmem_handle *)to;
        to_th->index = th->index;
        to_th->oid = th->oid;
        to_th->pool_id = th->pool_id;
        to_th->client_id = th->client_id;
        to += sizeof(struct tmem_handle);
        if (cdata != NULL)
                memcpy(to, cdata, size - sizeof(struct tmem_handle));
        kunmap_zbudpage_atomic(zbpg);
        if (budnum == 0)
                zbudpage->zbud0_size = size;
        else
                zbudpage->zbud1_size = size;
        if (eph) {
                zbud_eph_cumul_chunk_counts[nchunks]++;
                zbud_eph_zpages++;
                zbud_eph_cumul_zpages++;
                zbud_eph_zbytes += size;
                zbud_eph_cumul_zbytes += size;
        } else {
                zbud_pers_cumul_chunk_counts[nchunks]++;
                zbud_pers_zpages++;
                zbud_pers_cumul_zpages++;
                zbud_pers_zbytes += size;
                zbud_pers_cumul_zbytes += size;
        }
}

/*
 * Given a locked dying zbudpage, read out the tmem handles from the data,
 * unlock the page, then use the handles to tell tmem to flush out its
 * references.
 */
static void zbud_evict_tmem(struct zbudpage *zbudpage)
{
        int i, j;
        uint32_t pool_id[2], client_id[2];
        uint32_t index[2];
        struct tmem_oid oid[2];
        struct tmem_pool *pool;
        void *zbpg;
        struct tmem_handle *th;
        unsigned size;

        /* read out the tmem handles from the data and set aside */
        zbpg = kmap_zbudpage_atomic(zbudpage);
        for (i = 0, j = 0; i < 2; i++) {
                size = (i == 0) ? zbudpage->zbud0_size : zbudpage->zbud1_size;
                if (size) {
                        th = (struct tmem_handle *)zbud_data(zbpg, i, size);
                        client_id[j] = th->client_id;
                        pool_id[j] = th->pool_id;
                        oid[j] = th->oid;
                        index[j] = th->index;
                        j++;
                        zbud_unuse_zbud(zbudpage, i, true);
                }
        }
        kunmap_zbudpage_atomic(zbpg);
        zbudpage_spin_unlock(zbudpage);
        /* zbudpage is now an unlocked dying... tell tmem to flush pointers */
        for (i = 0; i < j; i++) {
                pool = zcache_get_pool_by_id(client_id[i], pool_id[i]);
                if (pool != NULL) {
                        tmem_flush_page(pool, &oid[i], index[i]);
                        zcache_put_pool(pool);
                }
        }
}

/*
 * Externally callable zbud handling routines.
 */

/*
 * Return the maximum size compressed page that can be stored (secretly
 * setting aside space for the tmem handle).
 */
unsigned int zbud_max_buddy_size(void)
{
        return zbud_max_size() - sizeof(struct tmem_handle);
}
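/*
 * Illustrative sketch (not from the original file): as the header
 * comment warns, callers must bound-check against zbud_max_buddy_size()
 * before storing.  A hypothetical caller-side guard:
 */
#if 0
        /* in a caller, before attempting to store compressed data: */
        if (clen == 0 || clen > zbud_max_buddy_size())
                return -EINVAL; /* reject rather than trip zbud's BUG_ONs */
#endif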
/*
 * Given a zbud reference, free the corresponding zbud from all lists,
 * mark it as unused, do accounting, and if the freeing of the zbud
 * frees up an entire pageframe, return it to the caller (else NULL).
 */
struct page *zbud_free_and_delist(struct zbudref *zref, bool eph,
                                  unsigned int *zsize, unsigned int *zpages)
{
        unsigned long budnum = zbudref_budnum(zref);
        struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
        struct page *page = NULL;
        unsigned chunks, bud_size, other_bud_size;
        spinlock_t *lists_lock =
                eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
        struct zbud_unbuddied *unbud =
                eph ? zbud_eph_unbuddied : zbud_pers_unbuddied;

        spin_lock(lists_lock);
        zbudpage_spin_lock(zbudpage);
        if (zbudpage_is_dying(zbudpage)) {
                /* ignore dying zbudpage... see zbud_evict_pageframe_lru() */
                zbudpage_spin_unlock(zbudpage);
                spin_unlock(lists_lock);
                *zpages = 0;
                *zsize = 0;
                goto out;
        }
        if (budnum == 0) {
                bud_size = zbudpage->zbud0_size;
                other_bud_size = zbudpage->zbud1_size;
        } else {
                bud_size = zbudpage->zbud1_size;
                other_bud_size = zbudpage->zbud0_size;
        }
        *zsize = bud_size - sizeof(struct tmem_handle);
        *zpages = 1;
        zbud_unuse_zbud(zbudpage, budnum, eph);
        if (other_bud_size == 0) { /* was unbuddied: unlist and free */
                chunks = zbud_size_to_chunks(bud_size);
                if (zbudpage_is_zombie(zbudpage)) {
                        if (eph)
                                zbud_eph_zombie_count =
                                  atomic_dec_return(&zbud_eph_zombie_atomic);
                        else
                                zbud_pers_zombie_count =
                                  atomic_dec_return(&zbud_pers_zombie_atomic);
                        zbudpage_clear_zombie(zbudpage);
                } else {
                        BUG_ON(list_empty(&unbud[chunks].list));
                        list_del_init(&zbudpage->budlist);
                        unbud[chunks].count--;
                }
                list_del_init(&zbudpage->lru);
                spin_unlock(lists_lock);
                if (eph)
                        zbud_eph_unbuddied_count--;
                else
                        zbud_pers_unbuddied_count--;
                page = zbud_unuse_zbudpage(zbudpage, eph);
        } else { /* was buddied: move remaining buddy to unbuddied list */
                chunks = zbud_size_to_chunks(other_bud_size);
                if (!zbudpage_is_zombie(zbudpage)) {
                        list_del_init(&zbudpage->budlist);
                        list_add_tail(&zbudpage->budlist, &unbud[chunks].list);
                        unbud[chunks].count++;
                }
                if (eph) {
                        zbud_eph_buddied_count--;
                        zbud_eph_unbuddied_count++;
                } else {
                        zbud_pers_unbuddied_count++;
                        zbud_pers_buddied_count--;
                }
                /* don't mess with lru, no need to move it */
                zbudpage_spin_unlock(zbudpage);
                spin_unlock(lists_lock);
        }
out:
        return page;
}
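/*
 * Illustrative sketch (not from the original file): a returned page, if
 * any, belongs to the caller again.  A hypothetical caller of the free
 * path:
 */
#if 0
        unsigned int zsize, zpages;
        struct page *page = zbud_free_and_delist(zref, true, &zsize, &zpages);

        if (page != NULL)
                __free_page(page);      /* pageframe was fully emptied */
#endif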
/*
 * Given a tmem handle, and a kmapped pointer to compressed data of
 * the given size, try to find an unbuddied zbudpage in which to
 * create a zbud.  If found, put it there, mark the zbudpage unevictable,
 * and return a zbudref to it.  Else return NULL.
 */
struct zbudref *zbud_match_prep(struct tmem_handle *th, bool eph,
                                void *cdata, unsigned size)
{
        struct zbudpage *zbudpage = NULL, *zbudpage2;
        unsigned long budnum = 0UL;
        unsigned nchunks;
        int i, found_good_buddy = 0;
        spinlock_t *lists_lock =
                eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
        struct zbud_unbuddied *unbud =
                eph ? zbud_eph_unbuddied : zbud_pers_unbuddied;

        size += sizeof(struct tmem_handle);
        nchunks = zbud_size_to_chunks(size);
        for (i = MAX_CHUNK - nchunks + 1; i > 0; i--) {
                spin_lock(lists_lock);
                if (!list_empty(&unbud[i].list)) {
                        list_for_each_entry_safe(zbudpage, zbudpage2,
                                                 &unbud[i].list, budlist) {
                                if (zbudpage_spin_trylock(zbudpage)) {
                                        found_good_buddy = i;
                                        goto found_unbuddied;
                                }
                        }
                }
                spin_unlock(lists_lock);
        }
        zbudpage = NULL;
        goto out;

found_unbuddied:
        BUG_ON(!zbudpage_is_locked(zbudpage));
        BUG_ON(!((zbudpage->zbud0_size == 0) ^ (zbudpage->zbud1_size == 0)));
        if (zbudpage->zbud0_size == 0)
                budnum = 0UL;
        else if (zbudpage->zbud1_size == 0)
                budnum = 1UL;
        list_del_init(&zbudpage->budlist);
        if (eph) {
                list_add_tail(&zbudpage->budlist, &zbud_eph_buddied_list);
                unbud[found_good_buddy].count--;
                zbud_eph_unbuddied_count--;
                zbud_eph_buddied_count++;
                /* "promote" raw zbudpage to most-recently-used */
                list_del_init(&zbudpage->lru);
                list_add_tail(&zbudpage->lru, &zbud_eph_lru_list);
        } else {
                list_add_tail(&zbudpage->budlist, &zbud_pers_buddied_list);
                unbud[found_good_buddy].count--;
                zbud_pers_unbuddied_count--;
                zbud_pers_buddied_count++;
                /* "promote" raw zbudpage to most-recently-used */
                list_del_init(&zbudpage->lru);
                list_add_tail(&zbudpage->lru, &zbud_pers_lru_list);
        }
        zbud_init_zbud(zbudpage, th, eph, cdata, budnum, size);
        zbudpage->unevictable++;
        BUG_ON(zbudpage->unevictable == 3);
        zbudpage_spin_unlock(zbudpage);
        spin_unlock(lists_lock);
out:
        return zbudpage_to_zbudref(zbudpage, budnum);
}

/*
 * Given a tmem handle, and a kmapped pointer to compressed data of
 * the given size, and a newly allocated struct page, create an unevictable
 * zbud in that new page and return a zbudref to it.
 */
struct zbudref *zbud_create_prep(struct tmem_handle *th, bool eph,
                                 void *cdata, unsigned size,
                                 struct page *newpage)
{
        struct zbudpage *zbudpage;
        unsigned long budnum = 0;
        unsigned nchunks;
        spinlock_t *lists_lock =
                eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
        struct zbud_unbuddied *unbud =
                eph ? zbud_eph_unbuddied : zbud_pers_unbuddied;

#if 0
        /* this may be worth it later to support decompress-in-place? */
        static unsigned long counter;
        budnum = counter++ & 1; /* alternate using zbud0 and zbud1 */
#endif

        if (size > zbud_max_buddy_size())
                return NULL;
        if (newpage == NULL)
                return NULL;

        size += sizeof(struct tmem_handle);
        nchunks = zbud_size_to_chunks(size);
        spin_lock(lists_lock);
        zbudpage = zbud_init_zbudpage(newpage, eph);
        zbudpage_spin_lock(zbudpage);
        list_add_tail(&zbudpage->budlist, &unbud[nchunks].list);
        if (eph) {
                list_add_tail(&zbudpage->lru, &zbud_eph_lru_list);
                zbud_eph_unbuddied_count++;
        } else {
                list_add_tail(&zbudpage->lru, &zbud_pers_lru_list);
                zbud_pers_unbuddied_count++;
        }
        unbud[nchunks].count++;
        zbud_init_zbud(zbudpage, th, eph, cdata, budnum, size);
        zbudpage->unevictable++;
        BUG_ON(zbudpage->unevictable == 3);
        zbudpage_spin_unlock(zbudpage);
        spin_unlock(lists_lock);
        return zbudpage_to_zbudref(zbudpage, budnum);
}
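/*
 * Illustrative sketch (not from the original file): a typical put path
 * tries to buddy into an existing pageframe first and only then spends
 * a fresh page (hypothetical caller, error handling elided):
 */
#if 0
        struct zbudref *zref = zbud_match_prep(th, eph, cdata, clen);

        if (zref == NULL) {
                struct page *newpage = alloc_page(GFP_ATOMIC);

                zref = zbud_create_prep(th, eph, cdata, clen, newpage);
        }
        /* ... record zref as the tmem pampd pointer, then ... */
        zbud_create_finish(zref, eph);  /* drop the unevictable hold */
#endif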
/*
 * Finish creation of a zbud by, assuming another zbud isn't being created
 * in parallel, marking it evictable.
 */
void zbud_create_finish(struct zbudref *zref, bool eph)
{
        struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
        spinlock_t *lists_lock =
                eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;

        spin_lock(lists_lock);
        zbudpage_spin_lock(zbudpage);
        BUG_ON(zbudpage_is_dying(zbudpage));
        zbudpage->unevictable--;
        BUG_ON((int)zbudpage->unevictable < 0);
        zbudpage_spin_unlock(zbudpage);
        spin_unlock(lists_lock);
}

/*
 * Given a zbudref and a struct page, decompress the data from
 * the zbud into the physical page represented by the struct page
 * by upcalling to zcache_decompress.
 */
int zbud_decompress(struct page *data_page, struct zbudref *zref, bool eph,
                    void (*decompress)(char *, unsigned int, char *))
{
        struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
        unsigned long budnum = zbudref_budnum(zref);
        void *zbpg;
        char *to_va, *from_va;
        unsigned size;
        int ret = -1;
        spinlock_t *lists_lock =
                eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;

        spin_lock(lists_lock);
        zbudpage_spin_lock(zbudpage);
        if (zbudpage_is_dying(zbudpage)) {
                /* ignore dying zbudpage... see zbud_evict_pageframe_lru() */
                goto out;
        }
        zbpg = kmap_zbudpage_atomic(zbudpage);
        to_va = kmap_atomic(data_page);
        if (budnum == 0)
                size = zbudpage->zbud0_size;
        else
                size = zbudpage->zbud1_size;
        BUG_ON(size == 0 || size > zbud_max_size());
        from_va = zbud_data(zbpg, budnum, size);
        from_va += sizeof(struct tmem_handle);
        size -= sizeof(struct tmem_handle);
        decompress(from_va, size, to_va);
        kunmap_atomic(to_va);
        kunmap_zbudpage_atomic(zbpg);
        ret = 0;
out:
        zbudpage_spin_unlock(zbudpage);
        spin_unlock(lists_lock);
        return ret;
}

/*
 * Given a zbudref and a kernel pointer, copy the data from
 * the zbud to the kernel pointer.
 */
int zbud_copy_from_zbud(char *to_va, struct zbudref *zref,
                        size_t *sizep, bool eph)
{
        struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
        unsigned long budnum = zbudref_budnum(zref);
        void *zbpg;
        char *from_va;
        unsigned size;
        int ret = -1;
        spinlock_t *lists_lock =
                eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;

        spin_lock(lists_lock);
        zbudpage_spin_lock(zbudpage);
        if (zbudpage_is_dying(zbudpage)) {
                /* ignore dying zbudpage... see zbud_evict_pageframe_lru() */
                goto out;
        }
        zbpg = kmap_zbudpage_atomic(zbudpage);
        if (budnum == 0)
                size = zbudpage->zbud0_size;
        else
                size = zbudpage->zbud1_size;
        BUG_ON(size == 0 || size > zbud_max_size());
        from_va = zbud_data(zbpg, budnum, size);
        from_va += sizeof(struct tmem_handle);
        size -= sizeof(struct tmem_handle);
        *sizep = size;
        memcpy(to_va, from_va, size);

        kunmap_zbudpage_atomic(zbpg);
        ret = 0;
out:
        zbudpage_spin_unlock(zbudpage);
        spin_unlock(lists_lock);
        return ret;
}
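/*
 * Illustrative sketch (not from the original file): the get path hands
 * zbud_decompress() the destination page and the caller's decompressor
 * callback (a hypothetical wrapper with the expected signature):
 */
#if 0
static int zbud_get_example(struct page *data_page, struct zbudref *zref,
                            void (*my_decompress)(char *, unsigned int,
                                                  char *))
{
        /* returns -1 if the zbudpage is dying; treat that as a miss */
        return zbud_decompress(data_page, zref, true, my_decompress);
}
#endif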
/*
 * Given a zbudref and a kernel pointer, copy the data from
 * the kernel pointer to the zbud.
 */
int zbud_copy_to_zbud(struct zbudref *zref, char *from_va, bool eph)
{
        struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
        unsigned long budnum = zbudref_budnum(zref);
        void *zbpg;
        char *to_va;
        unsigned size;
        int ret = -1;
        spinlock_t *lists_lock =
                eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;

        spin_lock(lists_lock);
        zbudpage_spin_lock(zbudpage);
        if (zbudpage_is_dying(zbudpage)) {
                /* ignore dying zbudpage... see zbud_evict_pageframe_lru() */
                goto out;
        }
        zbpg = kmap_zbudpage_atomic(zbudpage);
        if (budnum == 0)
                size = zbudpage->zbud0_size;
        else
                size = zbudpage->zbud1_size;
        BUG_ON(size == 0 || size > zbud_max_size());
        to_va = zbud_data(zbpg, budnum, size);
        to_va += sizeof(struct tmem_handle);
        size -= sizeof(struct tmem_handle);
        memcpy(to_va, from_va, size);

        kunmap_zbudpage_atomic(zbpg);
        ret = 0;
out:
        zbudpage_spin_unlock(zbudpage);
        spin_unlock(lists_lock);
        return ret;
}

/*
 * Choose an ephemeral LRU zbudpage that is evictable (not locked), ensure
 * there are no references to it remaining, and return the now unused
 * (and re-init'ed) struct page and the total amount of compressed
 * data that was evicted.
 */
struct page *zbud_evict_pageframe_lru(unsigned int *zsize, unsigned int *zpages)
{
        struct zbudpage *zbudpage = NULL, *zbudpage2;
        struct zbud_unbuddied *unbud = zbud_eph_unbuddied;
        struct page *page = NULL;
        bool irqs_disabled = irqs_disabled();

        /*
         * Since this can be called indirectly from cleancache_put, which
         * has interrupts disabled, as well as frontswap_put, which does not,
         * we need to be able to handle both cases, even though it is ugly.
         */
        if (irqs_disabled)
                spin_lock(&zbud_eph_lists_lock);
        else
                spin_lock_bh(&zbud_eph_lists_lock);
        *zsize = 0;
        if (list_empty(&zbud_eph_lru_list))
                goto unlock_out;
        list_for_each_entry_safe(zbudpage, zbudpage2, &zbud_eph_lru_list, lru) {
                /* skip a locked zbudpage */
                if (unlikely(!zbudpage_spin_trylock(zbudpage)))
                        continue;
                /* skip an unevictable zbudpage */
                if (unlikely(zbudpage->unevictable != 0)) {
                        zbudpage_spin_unlock(zbudpage);
                        continue;
                }
                /* got a locked evictable page */
                goto evict_page;
        }
unlock_out:
        /* no unlocked evictable pages, give up */
        if (irqs_disabled)
                spin_unlock(&zbud_eph_lists_lock);
        else
                spin_unlock_bh(&zbud_eph_lists_lock);
        goto out;

evict_page:
        list_del_init(&zbudpage->budlist);
        list_del_init(&zbudpage->lru);
        zbudpage_set_dying(zbudpage);
        /*
         * the zbudpage is now "dying" and attempts to read, write,
         * or delete data from it will be ignored
         */
        if (zbudpage->zbud0_size != 0 && zbudpage->zbud1_size != 0) {
                *zsize = zbudpage->zbud0_size + zbudpage->zbud1_size -
                                (2 * sizeof(struct tmem_handle));
                *zpages = 2;
        } else if (zbudpage->zbud0_size != 0) {
                unbud[zbud_size_to_chunks(zbudpage->zbud0_size)].count--;
                *zsize = zbudpage->zbud0_size - sizeof(struct tmem_handle);
                *zpages = 1;
        } else if (zbudpage->zbud1_size != 0) {
                unbud[zbud_size_to_chunks(zbudpage->zbud1_size)].count--;
                *zsize = zbudpage->zbud1_size - sizeof(struct tmem_handle);
                *zpages = 1;
        } else {
                BUG();
        }
        spin_unlock(&zbud_eph_lists_lock);
        zbud_eph_evicted_pageframes++;
        if (*zpages == 1)
                zbud_eph_unbuddied_count--;
        else
                zbud_eph_buddied_count--;
        zbud_evict_tmem(zbudpage);
        zbudpage_spin_lock(zbudpage);
        zbudpage_clear_dying(zbudpage);
        page = zbud_unuse_zbudpage(zbudpage, true);
        if (!irqs_disabled)
                local_bh_enable();
out:
        return page;
}
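/*
 * Illustrative sketch (not from the original file): under memory
 * pressure a caller can repeatedly evict ephemeral pageframes and
 * return them to the page allocator (hypothetical reclaim loop):
 */
#if 0
        unsigned int zsize, zpages;
        struct page *page;

        while (nr_to_reclaim-- > 0) {
                page = zbud_evict_pageframe_lru(&zsize, &zpages);
                if (page == NULL)
                        break;  /* nothing evictable right now */
                __free_page(page);
        }
#endif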
/*
 * Choose a persistent LRU zbudpage that is evictable (not locked), zombify it,
 * read the tmem_handle(s) out of it into the passed array, and return the
 * number of zbuds.  Caller must perform necessary tmem functions and,
 * indirectly, zbud functions to fetch any valid data and cause the
 * now-zombified zbudpage to eventually be freed.  We track the zombified
 * zbudpage count so it is possible to observe if there is a leak.
 * FIXME: describe (ramster) case where data pointers are passed in for memcpy
 */
unsigned int zbud_make_zombie_lru(struct tmem_handle *th, unsigned char **data,
                                  unsigned int *zsize, bool eph)
{
        struct zbudpage *zbudpage = NULL, *zbudpag2;
        struct tmem_handle *thfrom;
        char *from_va;
        void *zbpg;
        unsigned size;
        int ret = 0, i;
        spinlock_t *lists_lock =
                eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
        struct list_head *lru_list =
                eph ? &zbud_eph_lru_list : &zbud_pers_lru_list;

        spin_lock_bh(lists_lock);
        if (list_empty(lru_list))
                goto out;
        list_for_each_entry_safe(zbudpage, zbudpag2, lru_list, lru) {
                /* skip a locked zbudpage */
                if (unlikely(!zbudpage_spin_trylock(zbudpage)))
                        continue;
                /* skip an unevictable zbudpage */
                if (unlikely(zbudpage->unevictable != 0)) {
                        zbudpage_spin_unlock(zbudpage);
                        continue;
                }
                /* got a locked evictable page */
                goto zombify_page;
        }
        /* no unlocked evictable pages, give up */
        goto out;

zombify_page:
        /* got an unlocked evictable page, zombify it */
        list_del_init(&zbudpage->budlist);
        zbudpage_set_zombie(zbudpage);
        /* FIXME what accounting do I need to do here? */
        list_del_init(&zbudpage->lru);
        if (eph) {
                list_add_tail(&zbudpage->lru, &zbud_eph_zombie_list);
                zbud_eph_zombie_count =
                                atomic_inc_return(&zbud_eph_zombie_atomic);
        } else {
                list_add_tail(&zbudpage->lru, &zbud_pers_zombie_list);
                zbud_pers_zombie_count =
                                atomic_inc_return(&zbud_pers_zombie_atomic);
        }
        /* FIXME what accounting do I need to do here? */
        zbpg = kmap_zbudpage_atomic(zbudpage);
        for (i = 0; i < 2; i++) {
                size = (i == 0) ? zbudpage->zbud0_size : zbudpage->zbud1_size;
                if (size) {
                        from_va = zbud_data(zbpg, i, size);
                        thfrom = (struct tmem_handle *)from_va;
                        from_va += sizeof(struct tmem_handle);
                        size -= sizeof(struct tmem_handle);
                        if (th != NULL)
                                th[ret] = *thfrom;
                        if (data != NULL)
                                memcpy(data[ret], from_va, size);
                        if (zsize != NULL)
                                *zsize++ = size;
                        ret++;
                }
        }
        kunmap_zbudpage_atomic(zbpg);
        zbudpage_spin_unlock(zbudpage);
out:
        spin_unlock_bh(lists_lock);
        return ret;
}

void __init zbud_init(void)
{
        int i;

#ifdef CONFIG_DEBUG_FS
        zbud_debugfs_init();
#endif
        BUG_ON((sizeof(struct tmem_handle) * 2 > CHUNK_SIZE));
        BUG_ON(sizeof(struct zbudpage) > sizeof(struct page));
        for (i = 0; i < NCHUNKS; i++) {
                INIT_LIST_HEAD(&zbud_eph_unbuddied[i].list);
                INIT_LIST_HEAD(&zbud_pers_unbuddied[i].list);
        }
}
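A note on the zombie path, as a minimal sketch (an editor's addition, not part of the file above): after zbud_make_zombie_lru() hands back the tmem handles, the caller is expected to flush each handle through tmem, which indirectly calls back into zbud_free_and_delist() and ultimately frees the zombified pageframe. A hypothetical caller-side loop, mirroring what zbud_evict_tmem() does internally:

        struct tmem_handle th[2];
        unsigned int zsize[2];
        unsigned int nzbuds, i;

        nzbuds = zbud_make_zombie_lru(th, NULL, zsize, false);
        for (i = 0; i < nzbuds; i++) {
                /* flushing the handle invalidates the pampd (the zbudref),
                 * which ends in zbud_free_and_delist() freeing the zombie */
                struct tmem_pool *pool =
                        zcache_get_pool_by_id(th[i].client_id, th[i].pool_id);

                if (pool != NULL) {
                        tmem_flush_page(pool, &th[i].oid, th[i].index);
                        zcache_put_pool(pool);
                }
        }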