Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'akpm' (patches from Andrew Morton)

Merge more patches from Andrew Morton:
"The rest of MM"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
mm: remove free_area_cache
zswap: add documentation
zswap: add to mm/
zbud: add to mm/

+1592 -66
+68
Documentation/vm/zswap.txt
··· 1 + Overview: 2 + 3 + Zswap is a lightweight compressed cache for swap pages. It takes pages that are 4 + in the process of being swapped out and attempts to compress them into a 5 + dynamically allocated RAM-based memory pool. zswap basically trades CPU cycles 6 + for potentially reduced swap I/O.  This trade-off can also result in a 7 + significant performance improvement if reads from the compressed cache are 8 + faster than reads from a swap device. 9 + 10 + NOTE: Zswap is a new feature as of v3.11 and interacts heavily with memory 11 + reclaim. This interaction has not been fully explored on the large set of 12 + potential configurations and workloads that exist. For this reason, zswap 13 + is a work in progress and should be considered experimental. 14 + 15 + Some potential benefits: 16 + * Desktop/laptop users with limited RAM capacities can mitigate the 17 +     performance impact of swapping. 18 + * Overcommitted guests that share a common I/O resource can 19 +     dramatically reduce their swap I/O pressure, avoiding heavy handed I/O 20 + throttling by the hypervisor. This allows more work to get done with less 21 + impact to the guest workload and guests sharing the I/O subsystem 22 + * Users with SSDs as swap devices can extend the life of the device by 23 +     drastically reducing life-shortening writes. 24 + 25 + Zswap evicts pages from compressed cache on an LRU basis to the backing swap 26 + device when the compressed pool reaches its size limit. This requirement had 27 + been identified in prior community discussions. 28 + 29 + To enable zswap, the "enabled" attribute must be set to 1 at boot time. e.g. 30 + zswap.enabled=1 31 + 32 + Design: 33 + 34 + Zswap receives pages for compression through the Frontswap API and is able to 35 + evict pages from its own compressed pool on an LRU basis and write them back to 36 + the backing swap device in the case that the compressed pool is full. 
37 + 38 + Zswap makes use of zbud for managing the compressed memory pool. Each 39 + allocation in zbud is not directly accessible by address. Rather, a handle is 40 + returned by the allocation routine and that handle must be mapped before being 41 + accessed. The compressed memory pool grows on demand and shrinks as compressed 42 + pages are freed. The pool is not preallocated. 43 + 44 + When a swap page is passed from frontswap to zswap, zswap maintains a mapping 45 + of the swap entry, a combination of the swap type and swap offset, to the zbud 46 + handle that references that compressed swap page. This mapping is achieved 47 + with a red-black tree per swap type. The swap offset is the search key for the 48 + tree nodes. 49 + 50 + During a page fault on a PTE that is a swap entry, frontswap calls the zswap 51 + load function to decompress the page into the page allocated by the page fault 52 + handler. 53 + 54 + Once there are no PTEs referencing a swap page stored in zswap (i.e. the count 55 + in the swap_map goes to 0) the swap code calls the zswap invalidate function, 56 + via frontswap, to free the compressed entry. 57 + 58 + Zswap seeks to be simple in its policies. Sysfs attributes allow for one user 59 + controlled policy: 60 + * max_pool_percent - The maximum percentage of memory that the compressed 61 + pool can occupy. 62 + 63 + Zswap allows the compressor to be selected at kernel boot time by setting the 64 + “compressor” attribute. The default compressor is lzo. e.g. 65 + zswap.compressor=deflate 66 + 67 + A debugfs interface is provided for various statistics about pool size, number 68 + of pages stored, and various counters for the reasons pages are rejected.
-2
arch/arm/mm/mmap.c
··· 181 181 if (mmap_is_legacy()) { 182 182 mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; 183 183 mm->get_unmapped_area = arch_get_unmapped_area; 184 - mm->unmap_area = arch_unmap_area; 185 184 } else { 186 185 mm->mmap_base = mmap_base(random_factor); 187 186 mm->get_unmapped_area = arch_get_unmapped_area_topdown; 188 - mm->unmap_area = arch_unmap_area_topdown; 189 187 } 190 188 } 191 189
-2
arch/arm64/mm/mmap.c
··· 90 90 if (mmap_is_legacy()) { 91 91 mm->mmap_base = TASK_UNMAPPED_BASE; 92 92 mm->get_unmapped_area = arch_get_unmapped_area; 93 - mm->unmap_area = arch_unmap_area; 94 93 } else { 95 94 mm->mmap_base = mmap_base(); 96 95 mm->get_unmapped_area = arch_get_unmapped_area_topdown; 97 - mm->unmap_area = arch_unmap_area_topdown; 98 96 } 99 97 } 100 98 EXPORT_SYMBOL_GPL(arch_pick_mmap_layout);
-2
arch/mips/mm/mmap.c
··· 158 158 if (mmap_is_legacy()) { 159 159 mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; 160 160 mm->get_unmapped_area = arch_get_unmapped_area; 161 - mm->unmap_area = arch_unmap_area; 162 161 } else { 163 162 mm->mmap_base = mmap_base(random_factor); 164 163 mm->get_unmapped_area = arch_get_unmapped_area_topdown; 165 - mm->unmap_area = arch_unmap_area_topdown; 166 164 } 167 165 } 168 166
-2
arch/powerpc/mm/mmap.c
··· 92 92 if (mmap_is_legacy()) { 93 93 mm->mmap_base = TASK_UNMAPPED_BASE; 94 94 mm->get_unmapped_area = arch_get_unmapped_area; 95 - mm->unmap_area = arch_unmap_area; 96 95 } else { 97 96 mm->mmap_base = mmap_base(); 98 97 mm->get_unmapped_area = arch_get_unmapped_area_topdown; 99 - mm->unmap_area = arch_unmap_area_topdown; 100 98 } 101 99 }
-4
arch/s390/mm/mmap.c
··· 91 91 if (mmap_is_legacy()) { 92 92 mm->mmap_base = TASK_UNMAPPED_BASE; 93 93 mm->get_unmapped_area = arch_get_unmapped_area; 94 - mm->unmap_area = arch_unmap_area; 95 94 } else { 96 95 mm->mmap_base = mmap_base(); 97 96 mm->get_unmapped_area = arch_get_unmapped_area_topdown; 98 - mm->unmap_area = arch_unmap_area_topdown; 99 97 } 100 98 } 101 99 ··· 174 176 if (mmap_is_legacy()) { 175 177 mm->mmap_base = TASK_UNMAPPED_BASE; 176 178 mm->get_unmapped_area = s390_get_unmapped_area; 177 - mm->unmap_area = arch_unmap_area; 178 179 } else { 179 180 mm->mmap_base = mmap_base(); 180 181 mm->get_unmapped_area = s390_get_unmapped_area_topdown; 181 - mm->unmap_area = arch_unmap_area_topdown; 182 182 } 183 183 } 184 184
-2
arch/sparc/kernel/sys_sparc_64.c
··· 290 290 sysctl_legacy_va_layout) { 291 291 mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; 292 292 mm->get_unmapped_area = arch_get_unmapped_area; 293 - mm->unmap_area = arch_unmap_area; 294 293 } else { 295 294 /* We know it's 32-bit */ 296 295 unsigned long task_size = STACK_TOP32; ··· 301 302 302 303 mm->mmap_base = PAGE_ALIGN(task_size - gap - random_factor); 303 304 mm->get_unmapped_area = arch_get_unmapped_area_topdown; 304 - mm->unmap_area = arch_unmap_area_topdown; 305 305 } 306 306 } 307 307
-2
arch/tile/mm/mmap.c
··· 66 66 if (!is_32bit || rlimit(RLIMIT_STACK) == RLIM_INFINITY) { 67 67 mm->mmap_base = TASK_UNMAPPED_BASE; 68 68 mm->get_unmapped_area = arch_get_unmapped_area; 69 - mm->unmap_area = arch_unmap_area; 70 69 } else { 71 70 mm->mmap_base = mmap_base(mm); 72 71 mm->get_unmapped_area = arch_get_unmapped_area_topdown; 73 - mm->unmap_area = arch_unmap_area_topdown; 74 72 } 75 73 }
-2
arch/x86/ia32/ia32_aout.c
··· 308 308 (current->mm->start_data = N_DATADDR(ex)); 309 309 current->mm->brk = ex.a_bss + 310 310 (current->mm->start_brk = N_BSSADDR(ex)); 311 - current->mm->free_area_cache = TASK_UNMAPPED_BASE; 312 - current->mm->cached_hole_size = 0; 313 311 314 312 retval = setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT); 315 313 if (retval < 0) {
-2
arch/x86/mm/mmap.c
··· 115 115 if (mmap_is_legacy()) { 116 116 mm->mmap_base = mmap_legacy_base(); 117 117 mm->get_unmapped_area = arch_get_unmapped_area; 118 - mm->unmap_area = arch_unmap_area; 119 118 } else { 120 119 mm->mmap_base = mmap_base(); 121 120 mm->get_unmapped_area = arch_get_unmapped_area_topdown; 122 - mm->unmap_area = arch_unmap_area_topdown; 123 121 } 124 122 }
-2
fs/binfmt_aout.c
··· 255 255 (current->mm->start_data = N_DATADDR(ex)); 256 256 current->mm->brk = ex.a_bss + 257 257 (current->mm->start_brk = N_BSSADDR(ex)); 258 - current->mm->free_area_cache = current->mm->mmap_base; 259 - current->mm->cached_hole_size = 0; 260 258 261 259 retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT); 262 260 if (retval < 0) {
-2
fs/binfmt_elf.c
··· 738 738 739 739 /* Do this so that we can load the interpreter, if need be. We will 740 740 change some of these later */ 741 - current->mm->free_area_cache = current->mm->mmap_base; 742 - current->mm->cached_hole_size = 0; 743 741 retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP), 744 742 executable_stack); 745 743 if (retval < 0) {
-3
include/linux/mm_types.h
··· 330 330 unsigned long (*get_unmapped_area) (struct file *filp, 331 331 unsigned long addr, unsigned long len, 332 332 unsigned long pgoff, unsigned long flags); 333 - void (*unmap_area) (struct mm_struct *mm, unsigned long addr); 334 333 #endif 335 334 unsigned long mmap_base; /* base of mmap area */ 336 335 unsigned long task_size; /* size of task vm space */ 337 - unsigned long cached_hole_size; /* if non-zero, the largest hole below free_area_cache */ 338 - unsigned long free_area_cache; /* first hole of size cached_hole_size or larger */ 339 336 unsigned long highest_vm_end; /* highest vma end address */ 340 337 pgd_t * pgd; 341 338 atomic_t mm_users; /* How many users with user space? */
-2
include/linux/sched.h
··· 322 322 arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, 323 323 unsigned long len, unsigned long pgoff, 324 324 unsigned long flags); 325 - extern void arch_unmap_area(struct mm_struct *, unsigned long); 326 - extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); 327 325 #else 328 326 static inline void arch_pick_mmap_layout(struct mm_struct *mm) {} 329 327 #endif
+22
include/linux/zbud.h
··· 1 + #ifndef _ZBUD_H_ 2 + #define _ZBUD_H_ 3 + 4 + #include <linux/types.h> 5 + 6 + struct zbud_pool; 7 + 8 + struct zbud_ops { 9 + int (*evict)(struct zbud_pool *pool, unsigned long handle); 10 + }; 11 + 12 + struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops); 13 + void zbud_destroy_pool(struct zbud_pool *pool); 14 + int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp, 15 + unsigned long *handle); 16 + void zbud_free(struct zbud_pool *pool, unsigned long handle); 17 + int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries); 18 + void *zbud_map(struct zbud_pool *pool, unsigned long handle); 19 + void zbud_unmap(struct zbud_pool *pool, unsigned long handle); 20 + u64 zbud_get_pool_size(struct zbud_pool *pool); 21 + 22 + #endif /* _ZBUD_H_ */
-4
kernel/fork.c
··· 365 365 mm->locked_vm = 0; 366 366 mm->mmap = NULL; 367 367 mm->mmap_cache = NULL; 368 - mm->free_area_cache = oldmm->mmap_base; 369 - mm->cached_hole_size = ~0UL; 370 368 mm->map_count = 0; 371 369 cpumask_clear(mm_cpumask(mm)); 372 370 mm->mm_rb = RB_ROOT; ··· 538 540 mm->nr_ptes = 0; 539 541 memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); 540 542 spin_lock_init(&mm->page_table_lock); 541 - mm->free_area_cache = TASK_UNMAPPED_BASE; 542 - mm->cached_hole_size = ~0UL; 543 543 mm_init_aio(mm); 544 544 mm_init_owner(mm, p); 545 545
+30
mm/Kconfig
··· 478 478 479 479 If unsure, say Y to enable frontswap. 480 480 481 + config ZBUD 482 + tristate 483 + default n 484 + help 485 + A special purpose allocator for storing compressed pages. 486 + It is designed to store up to two compressed pages per physical 487 + page. While this design limits storage density, it has simple and 488 + deterministic reclaim properties that make it preferable to a higher 489 + density approach when reclaim will be used. 490 + 491 + config ZSWAP 492 + bool "Compressed cache for swap pages (EXPERIMENTAL)" 493 + depends on FRONTSWAP && CRYPTO=y 494 + select CRYPTO_LZO 495 + select ZBUD 496 + default n 497 + help 498 + A lightweight compressed cache for swap pages. It takes 499 + pages that are in the process of being swapped out and attempts to 500 + compress them into a dynamically allocated RAM-based memory pool. 501 + This can result in a significant I/O reduction on swap device and, 502 + in the case where decompressing from RAM is faster than swap device 503 + reads, can also improve workload performance. 504 + 505 + This is marked experimental because it is a new feature (as of 506 + v3.11) that interacts heavily with memory reclaim. While these 507 + interactions don't cause any known issues on simple memory setups, 508 + they have not been fully explored on the large set of potential 509 + configurations and workloads that exist. 510 + 481 511 config MEM_SOFT_DIRTY 482 512 bool "Track memory changes" 483 513 depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY
+2
mm/Makefile
··· 32 32 obj-$(CONFIG_BOUNCE) += bounce.o 33 33 obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o 34 34 obj-$(CONFIG_FRONTSWAP) += frontswap.o 35 + obj-$(CONFIG_ZSWAP) += zswap.o 35 36 obj-$(CONFIG_HAS_DMA) += dmapool.o 36 37 obj-$(CONFIG_HUGETLBFS) += hugetlb.o 37 38 obj-$(CONFIG_NUMA) += mempolicy.o ··· 59 58 obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o 60 59 obj-$(CONFIG_CLEANCACHE) += cleancache.o 61 60 obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o 61 + obj-$(CONFIG_ZBUD) += zbud.o
-28
mm/mmap.c
··· 1878 1878 } 1879 1879 #endif 1880 1880 1881 - void arch_unmap_area(struct mm_struct *mm, unsigned long addr) 1882 - { 1883 - /* 1884 - * Is this a new hole at the lowest possible address? 1885 - */ 1886 - if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache) 1887 - mm->free_area_cache = addr; 1888 - } 1889 - 1890 1881 /* 1891 1882 * This mmap-allocator allocates new areas top-down from below the 1892 1883 * stack's low limit (the base): ··· 1933 1942 return addr; 1934 1943 } 1935 1944 #endif 1936 - 1937 - void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr) 1938 - { 1939 - /* 1940 - * Is this a new hole at the highest possible address? 1941 - */ 1942 - if (addr > mm->free_area_cache) 1943 - mm->free_area_cache = addr; 1944 - 1945 - /* dont allow allocations above current base */ 1946 - if (mm->free_area_cache > mm->mmap_base) 1947 - mm->free_area_cache = mm->mmap_base; 1948 - } 1949 1945 1950 1946 unsigned long 1951 1947 get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, ··· 2354 2376 { 2355 2377 struct vm_area_struct **insertion_point; 2356 2378 struct vm_area_struct *tail_vma = NULL; 2357 - unsigned long addr; 2358 2379 2359 2380 insertion_point = (prev ? &prev->vm_next : &mm->mmap); 2360 2381 vma->vm_prev = NULL; ··· 2370 2393 } else 2371 2394 mm->highest_vm_end = prev ? prev->vm_end : 0; 2372 2395 tail_vma->vm_next = NULL; 2373 - if (mm->unmap_area == arch_unmap_area) 2374 - addr = prev ? prev->vm_end : mm->mmap_base; 2375 - else 2376 - addr = vma ? vma->vm_start : mm->mmap_base; 2377 - mm->unmap_area(mm, addr); 2378 2396 mm->mmap_cache = NULL; /* Kill the cache. */ 2379 2397 } 2380 2398
-4
mm/nommu.c
··· 1871 1871 return -ENOMEM; 1872 1872 } 1873 1873 1874 - void arch_unmap_area(struct mm_struct *mm, unsigned long addr) 1875 - { 1876 - } 1877 - 1878 1874 void unmap_mapping_range(struct address_space *mapping, 1879 1875 loff_t const holebegin, loff_t const holelen, 1880 1876 int even_cows)
-1
mm/util.c
··· 295 295 { 296 296 mm->mmap_base = TASK_UNMAPPED_BASE; 297 297 mm->get_unmapped_area = arch_get_unmapped_area; 298 - mm->unmap_area = arch_unmap_area; 299 298 } 300 299 #endif 301 300
+527
mm/zbud.c
··· 1 + /* 2 + * zbud.c 3 + * 4 + * Copyright (C) 2013, Seth Jennings, IBM 5 + * 6 + * Concepts based on zcache internal zbud allocator by Dan Magenheimer. 7 + * 8 + * zbud is a special purpose allocator for storing compressed pages. Contrary 9 + * to what its name may suggest, zbud is not a buddy allocator, but rather an 10 + * allocator that "buddies" two compressed pages together in a single memory 11 + * page. 12 + * 13 + * While this design limits storage density, it has simple and deterministic 14 + * reclaim properties that make it preferable to a higher density approach when 15 + * reclaim will be used. 16 + * 17 + * zbud works by storing compressed pages, or "zpages", together in pairs in a 18 + * single memory page called a "zbud page". The first buddy is "left 19 + * justified" at the beginning of the zbud page, and the last buddy is "right 20 + * justified" at the end of the zbud page. The benefit is that if either 21 + * buddy is freed, the freed buddy space, coalesced with whatever slack space 22 + * that existed between the buddies, results in the largest possible free region 23 + * within the zbud page. 24 + * 25 + * zbud also provides an attractive lower bound on density. The ratio of zpages 26 + * to zbud pages can not be less than 1. This ensures that zbud can never "do 27 + * harm" by using more pages to store zpages than the uncompressed zpages would 28 + * have used on their own. 29 + * 30 + * zbud pages are divided into "chunks". The size of the chunks is fixed at 31 + * compile time and determined by NCHUNKS_ORDER below. Dividing zbud pages 32 + * into chunks allows organizing unbuddied zbud pages into a manageable number 33 + * of unbuddied lists according to the number of free chunks available in the 34 + * zbud page. 35 + * 36 + * The zbud API differs from that of conventional allocators in that the 37 + * allocation function, zbud_alloc(), returns an opaque handle to the user, 38 + * not a dereferenceable pointer. 
The user must map the handle using 39 + * zbud_map() in order to get a usable pointer by which to access the 40 + * allocation data and unmap the handle with zbud_unmap() when operations 41 + * on the allocation data are complete. 42 + */ 43 + 44 + #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 45 + 46 + #include <linux/atomic.h> 47 + #include <linux/list.h> 48 + #include <linux/mm.h> 49 + #include <linux/module.h> 50 + #include <linux/preempt.h> 51 + #include <linux/slab.h> 52 + #include <linux/spinlock.h> 53 + #include <linux/zbud.h> 54 + 55 + /***************** 56 + * Structures 57 + *****************/ 58 + /* 59 + * NCHUNKS_ORDER determines the internal allocation granularity, effectively 60 + * adjusting internal fragmentation. It also determines the number of 61 + * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the 62 + * allocation granularity will be in chunks of size PAGE_SIZE/64, and there 63 + * will be 64 freelists per pool. 64 + */ 65 + #define NCHUNKS_ORDER 6 66 + 67 + #define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER) 68 + #define CHUNK_SIZE (1 << CHUNK_SHIFT) 69 + #define NCHUNKS (PAGE_SIZE >> CHUNK_SHIFT) 70 + #define ZHDR_SIZE_ALIGNED CHUNK_SIZE 71 + 72 + /** 73 + * struct zbud_pool - stores metadata for each zbud pool 74 + * @lock: protects all pool fields and first|last_chunk fields of any 75 + * zbud page in the pool 76 + * @unbuddied: array of lists tracking zbud pages that only contain one buddy; 77 + * the lists each zbud page is added to depends on the size of 78 + * its free region. 79 + * @buddied: list tracking the zbud pages that contain two buddies; 80 + * these zbud pages are full 81 + * @lru: list tracking the zbud pages in LRU order by most recently 82 + * added buddy. 83 + * @pages_nr: number of zbud pages in the pool. 84 + * @ops: pointer to a structure of user defined operations specified at 85 + * pool creation time. 
86 + * 87 + * This structure is allocated at pool creation time and maintains metadata 88 + * pertaining to a particular zbud pool. 89 + */ 90 + struct zbud_pool { 91 + spinlock_t lock; 92 + struct list_head unbuddied[NCHUNKS]; 93 + struct list_head buddied; 94 + struct list_head lru; 95 + u64 pages_nr; 96 + struct zbud_ops *ops; 97 + }; 98 + 99 + /* 100 + * struct zbud_header - zbud page metadata occupying the first chunk of each 101 + * zbud page. 102 + * @buddy: links the zbud page into the unbuddied/buddied lists in the pool 103 + * @lru: links the zbud page into the lru list in the pool 104 + * @first_chunks: the size of the first buddy in chunks, 0 if free 105 + * @last_chunks: the size of the last buddy in chunks, 0 if free 106 + */ 107 + struct zbud_header { 108 + struct list_head buddy; 109 + struct list_head lru; 110 + unsigned int first_chunks; 111 + unsigned int last_chunks; 112 + bool under_reclaim; 113 + }; 114 + 115 + /***************** 116 + * Helpers 117 + *****************/ 118 + /* Just to make the code easier to read */ 119 + enum buddy { 120 + FIRST, 121 + LAST 122 + }; 123 + 124 + /* Converts an allocation size in bytes to size in zbud chunks */ 125 + static int size_to_chunks(int size) 126 + { 127 + return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT; 128 + } 129 + 130 + #define for_each_unbuddied_list(_iter, _begin) \ 131 + for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++) 132 + 133 + /* Initializes the zbud header of a newly allocated zbud page */ 134 + static struct zbud_header *init_zbud_page(struct page *page) 135 + { 136 + struct zbud_header *zhdr = page_address(page); 137 + zhdr->first_chunks = 0; 138 + zhdr->last_chunks = 0; 139 + INIT_LIST_HEAD(&zhdr->buddy); 140 + INIT_LIST_HEAD(&zhdr->lru); 141 + zhdr->under_reclaim = 0; 142 + return zhdr; 143 + } 144 + 145 + /* Resets the struct page fields and frees the page */ 146 + static void free_zbud_page(struct zbud_header *zhdr) 147 + { 148 + __free_page(virt_to_page(zhdr)); 149 + } 150 + 
151 + /* 152 + * Encodes the handle of a particular buddy within a zbud page 153 + * Pool lock should be held as this function accesses first|last_chunks 154 + */ 155 + static unsigned long encode_handle(struct zbud_header *zhdr, enum buddy bud) 156 + { 157 + unsigned long handle; 158 + 159 + /* 160 + * For now, the encoded handle is actually just the pointer to the data 161 + * but this might not always be the case. A little information hiding. 162 + * Add CHUNK_SIZE to the handle if it is the first allocation to jump 163 + * over the zbud header in the first chunk. 164 + */ 165 + handle = (unsigned long)zhdr; 166 + if (bud == FIRST) 167 + /* skip over zbud header */ 168 + handle += ZHDR_SIZE_ALIGNED; 169 + else /* bud == LAST */ 170 + handle += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT); 171 + return handle; 172 + } 173 + 174 + /* Returns the zbud page where a given handle is stored */ 175 + static struct zbud_header *handle_to_zbud_header(unsigned long handle) 176 + { 177 + return (struct zbud_header *)(handle & PAGE_MASK); 178 + } 179 + 180 + /* Returns the number of free chunks in a zbud page */ 181 + static int num_free_chunks(struct zbud_header *zhdr) 182 + { 183 + /* 184 + * Rather than branch for different situations, just use the fact that 185 + * free buddies have a length of zero to simplify everything. -1 at the 186 + * end for the zbud header. 187 + */ 188 + return NCHUNKS - zhdr->first_chunks - zhdr->last_chunks - 1; 189 + } 190 + 191 + /***************** 192 + * API Functions 193 + *****************/ 194 + /** 195 + * zbud_create_pool() - create a new zbud pool 196 + * @gfp: gfp flags when allocating the zbud pool structure 197 + * @ops: user-defined operations for the zbud pool 198 + * 199 + * Return: pointer to the new zbud pool or NULL if the metadata allocation 200 + * failed. 
201 + */ 202 + struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops) 203 + { 204 + struct zbud_pool *pool; 205 + int i; 206 + 207 + pool = kmalloc(sizeof(struct zbud_pool), gfp); 208 + if (!pool) 209 + return NULL; 210 + spin_lock_init(&pool->lock); 211 + for_each_unbuddied_list(i, 0) 212 + INIT_LIST_HEAD(&pool->unbuddied[i]); 213 + INIT_LIST_HEAD(&pool->buddied); 214 + INIT_LIST_HEAD(&pool->lru); 215 + pool->pages_nr = 0; 216 + pool->ops = ops; 217 + return pool; 218 + } 219 + 220 + /** 221 + * zbud_destroy_pool() - destroys an existing zbud pool 222 + * @pool: the zbud pool to be destroyed 223 + * 224 + * The pool should be emptied before this function is called. 225 + */ 226 + void zbud_destroy_pool(struct zbud_pool *pool) 227 + { 228 + kfree(pool); 229 + } 230 + 231 + /** 232 + * zbud_alloc() - allocates a region of a given size 233 + * @pool: zbud pool from which to allocate 234 + * @size: size in bytes of the desired allocation 235 + * @gfp: gfp flags used if the pool needs to grow 236 + * @handle: handle of the new allocation 237 + * 238 + * This function will attempt to find a free region in the pool large enough to 239 + * satisfy the allocation request. A search of the unbuddied lists is 240 + * performed first. If no suitable free region is found, then a new page is 241 + * allocated and added to the pool to satisfy the request. 242 + * 243 + * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used 244 + * as zbud pool pages. 245 + * 246 + * Return: 0 if success and handle is set, otherwise -EINVAL is the size or 247 + * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate 248 + * a new page. 
249 + */ 250 + int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp, 251 + unsigned long *handle) 252 + { 253 + int chunks, i, freechunks; 254 + struct zbud_header *zhdr = NULL; 255 + enum buddy bud; 256 + struct page *page; 257 + 258 + if (size <= 0 || gfp & __GFP_HIGHMEM) 259 + return -EINVAL; 260 + if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED) 261 + return -ENOSPC; 262 + chunks = size_to_chunks(size); 263 + spin_lock(&pool->lock); 264 + 265 + /* First, try to find an unbuddied zbud page. */ 266 + zhdr = NULL; 267 + for_each_unbuddied_list(i, chunks) { 268 + if (!list_empty(&pool->unbuddied[i])) { 269 + zhdr = list_first_entry(&pool->unbuddied[i], 270 + struct zbud_header, buddy); 271 + list_del(&zhdr->buddy); 272 + if (zhdr->first_chunks == 0) 273 + bud = FIRST; 274 + else 275 + bud = LAST; 276 + goto found; 277 + } 278 + } 279 + 280 + /* Couldn't find unbuddied zbud page, create new one */ 281 + spin_unlock(&pool->lock); 282 + page = alloc_page(gfp); 283 + if (!page) 284 + return -ENOMEM; 285 + spin_lock(&pool->lock); 286 + pool->pages_nr++; 287 + zhdr = init_zbud_page(page); 288 + bud = FIRST; 289 + 290 + found: 291 + if (bud == FIRST) 292 + zhdr->first_chunks = chunks; 293 + else 294 + zhdr->last_chunks = chunks; 295 + 296 + if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0) { 297 + /* Add to unbuddied list */ 298 + freechunks = num_free_chunks(zhdr); 299 + list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); 300 + } else { 301 + /* Add to buddied list */ 302 + list_add(&zhdr->buddy, &pool->buddied); 303 + } 304 + 305 + /* Add/move zbud page to beginning of LRU */ 306 + if (!list_empty(&zhdr->lru)) 307 + list_del(&zhdr->lru); 308 + list_add(&zhdr->lru, &pool->lru); 309 + 310 + *handle = encode_handle(zhdr, bud); 311 + spin_unlock(&pool->lock); 312 + 313 + return 0; 314 + } 315 + 316 + /** 317 + * zbud_free() - frees the allocation associated with the given handle 318 + * @pool: pool in which the allocation resided 319 + * @handle: handle associated 
with the allocation returned by zbud_alloc() 320 + * 321 + * In the case that the zbud page in which the allocation resides is under 322 + * reclaim, as indicated by the PG_reclaim flag being set, this function 323 + * only sets the first|last_chunks to 0. The page is actually freed 324 + * once both buddies are evicted (see zbud_reclaim_page() below). 325 + */ 326 + void zbud_free(struct zbud_pool *pool, unsigned long handle) 327 + { 328 + struct zbud_header *zhdr; 329 + int freechunks; 330 + 331 + spin_lock(&pool->lock); 332 + zhdr = handle_to_zbud_header(handle); 333 + 334 + /* If first buddy, handle will be page aligned */ 335 + if ((handle - ZHDR_SIZE_ALIGNED) & ~PAGE_MASK) 336 + zhdr->last_chunks = 0; 337 + else 338 + zhdr->first_chunks = 0; 339 + 340 + if (zhdr->under_reclaim) { 341 + /* zbud page is under reclaim, reclaim will free */ 342 + spin_unlock(&pool->lock); 343 + return; 344 + } 345 + 346 + /* Remove from existing buddy list */ 347 + list_del(&zhdr->buddy); 348 + 349 + if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) { 350 + /* zbud page is empty, free */ 351 + list_del(&zhdr->lru); 352 + free_zbud_page(zhdr); 353 + pool->pages_nr--; 354 + } else { 355 + /* Add to unbuddied list */ 356 + freechunks = num_free_chunks(zhdr); 357 + list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); 358 + } 359 + 360 + spin_unlock(&pool->lock); 361 + } 362 + 363 + #define list_tail_entry(ptr, type, member) \ 364 + list_entry((ptr)->prev, type, member) 365 + 366 + /** 367 + * zbud_reclaim_page() - evicts allocations from a pool page and frees it 368 + * @pool: pool from which a page will attempt to be evicted 369 + * @retires: number of pages on the LRU list for which eviction will 370 + * be attempted before failing 371 + * 372 + * zbud reclaim is different from normal system reclaim in that the reclaim is 373 + * done from the bottom, up. 
This is because only the bottom layer, zbud, has 374 + * information on how the allocations are organized within each zbud page. This 375 + * has the potential to create interesting locking situations between zbud and 376 + * the user, however. 377 + * 378 + * To avoid these, this is how zbud_reclaim_page() should be called: 379 + 380 + * The user detects a page should be reclaimed and calls zbud_reclaim_page(). 381 + * zbud_reclaim_page() will remove a zbud page from the pool LRU list and call 382 + * the user-defined eviction handler with the pool and handle as arguments. 383 + * 384 + * If the handle can not be evicted, the eviction handler should return 385 + * non-zero. zbud_reclaim_page() will add the zbud page back to the 386 + * appropriate list and try the next zbud page on the LRU up to 387 + * a user defined number of retries. 388 + * 389 + * If the handle is successfully evicted, the eviction handler should 390 + * return 0 _and_ should have called zbud_free() on the handle. zbud_free() 391 + * contains logic to delay freeing the page if the page is under reclaim, 392 + * as indicated by the setting of the PG_reclaim flag on the underlying page. 393 + * 394 + * If all buddies in the zbud page are successfully evicted, then the 395 + * zbud page can be freed. 396 + * 397 + * Returns: 0 if page is successfully freed, otherwise -EINVAL if there are 398 + * no pages to evict or an eviction handler is not registered, -EAGAIN if 399 + * the retry limit was hit. 
 */
int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries)
{
	int i, ret, freechunks;
	struct zbud_header *zhdr;
	unsigned long first_handle = 0, last_handle = 0;

	spin_lock(&pool->lock);
	/* Reclaim needs eviction ops, a non-empty LRU, and at least one try */
	if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) ||
			retries == 0) {
		spin_unlock(&pool->lock);
		return -EINVAL;
	}
	for (i = 0; i < retries; i++) {
		/* Take the least recently used zbud page off the LRU */
		zhdr = list_tail_entry(&pool->lru, struct zbud_header, lru);
		list_del(&zhdr->lru);
		list_del(&zhdr->buddy);
		/* Protect zbud page against free */
		zhdr->under_reclaim = true;
		/*
		 * We need encode the handles before unlocking, since we can
		 * race with free that will set (first|last)_chunks to 0
		 */
		first_handle = 0;
		last_handle = 0;
		if (zhdr->first_chunks)
			first_handle = encode_handle(zhdr, FIRST);
		if (zhdr->last_chunks)
			last_handle = encode_handle(zhdr, LAST);
		spin_unlock(&pool->lock);

		/* Issue the eviction callback(s) without holding the lock */
		if (first_handle) {
			ret = pool->ops->evict(pool, first_handle);
			if (ret)
				goto next;
		}
		if (last_handle) {
			ret = pool->ops->evict(pool, last_handle);
			if (ret)
				goto next;
		}
next:
		spin_lock(&pool->lock);
		zhdr->under_reclaim = false;
		if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
			/*
			 * Both buddies are now free, free the zbud page and
			 * return success.
			 */
			free_zbud_page(zhdr);
			pool->pages_nr--;
			spin_unlock(&pool->lock);
			return 0;
		} else if (zhdr->first_chunks == 0 ||
				zhdr->last_chunks == 0) {
			/* add to unbuddied list */
			freechunks = num_free_chunks(zhdr);
			list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
		} else {
			/* add to buddied list */
			list_add(&zhdr->buddy, &pool->buddied);
		}

		/* add to beginning of LRU */
		list_add(&zhdr->lru, &pool->lru);
	}
	spin_unlock(&pool->lock);
	/* all retries failed to fully free a zbud page */
	return -EAGAIN;
}

/**
 * zbud_map() - maps the allocation associated with the given handle
 * @pool:	pool in which the allocation resides
 * @handle:	handle associated with the allocation to be mapped
 *
 * While trivial for zbud, the mapping functions for others allocators
 * implementing this allocation API could have more complex information encoded
 * in the handle and could create temporary mappings to make the data
 * accessible to the user.
 *
 * Returns: a pointer to the mapped allocation
 */
void *zbud_map(struct zbud_pool *pool, unsigned long handle)
{
	/* the handle is simply the kernel address of the buddy's data */
	return (void *)(handle);
}

/**
 * zbud_unmap() - unmaps the allocation associated with the given handle
 * @pool:	pool in which the allocation resides
 * @handle:	handle associated with the allocation to be unmapped
 *
 * No-op for zbud: zbud_map() creates no temporary mapping to tear down.
 */
void zbud_unmap(struct zbud_pool *pool, unsigned long handle)
{
}

/**
 * zbud_get_pool_size() - gets the zbud pool size in pages
 * @pool:	pool whose size is being queried
 *
 * Returns: size in pages of the given pool.  The pool lock need not be
 * taken to access pages_nr.
 */
u64 zbud_get_pool_size(struct zbud_pool *pool)
{
	return pool->pages_nr;
}

static int __init init_zbud(void)
{
	/* Make sure the zbud header will fit in one chunk */
	BUILD_BUG_ON(sizeof(struct zbud_header) > ZHDR_SIZE_ALIGNED);
	pr_info("loaded\n");
	return 0;
}

static void __exit exit_zbud(void)
{
	pr_info("unloaded\n");
}

module_init(init_zbud);
module_exit(exit_zbud);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Seth Jennings <sjenning@linux.vnet.ibm.com>");
MODULE_DESCRIPTION("Buddy Allocator for Compressed Pages");
+943
mm/zswap.c
/*
 * zswap.c - zswap driver file
 *
 * zswap is a backend for frontswap that takes pages that are in the process
 * of being swapped out and attempts to compress and store them in a
 * RAM-based memory pool.  This can result in a significant I/O reduction on
 * the swap device and, in the case where decompressing from RAM is faster
 * than reading from the swap device, can also improve workload performance.
 *
 * Copyright (C) 2012  Seth Jennings <sjenning@linux.vnet.ibm.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
*/

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/cpu.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/frontswap.h>
#include <linux/rbtree.h>
#include <linux/swap.h>
#include <linux/crypto.h>
#include <linux/mempool.h>
#include <linux/zbud.h>

#include <linux/mm_types.h>
#include <linux/page-flags.h>
#include <linux/swapops.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>

/*********************************
* statistics
**********************************/
/* Number of memory pages used by the compressed pool */
static u64 zswap_pool_pages;
/* The number of compressed pages currently stored in zswap */
static atomic_t zswap_stored_pages = ATOMIC_INIT(0);

/*
 * The statistics below are not protected from concurrent access for
 * performance reasons so they may not be a 100% accurate.  However,
 * they do provide useful information on roughly how many times a
 * certain event is occurring.
*/

/* Pool limit was hit (see zswap_max_pool_percent) */
static u64 zswap_pool_limit_hit;
/* Pages written back when pool limit was reached */
static u64 zswap_written_back_pages;
/* Store failed due to a reclaim failure after pool limit was reached */
static u64 zswap_reject_reclaim_fail;
/* Compressed page was too big for the allocator to (optimally) store */
static u64 zswap_reject_compress_poor;
/* Store failed because underlying allocator could not get memory */
static u64 zswap_reject_alloc_fail;
/* Store failed because the entry metadata could not be allocated (rare) */
static u64 zswap_reject_kmemcache_fail;
/* Duplicate store was encountered (rare) */
static u64 zswap_duplicate_entry;

/*********************************
* tunables
**********************************/
/* Enable/disable zswap (disabled by default, fixed at boot for now) */
static bool zswap_enabled __read_mostly;
module_param_named(enabled, zswap_enabled, bool, 0);

/* Compressor to be used by zswap (fixed at boot for now) */
#define ZSWAP_COMPRESSOR_DEFAULT "lzo"
static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
module_param_named(compressor, zswap_compressor, charp, 0);

/* The maximum percentage of memory that the compressed pool can occupy */
static unsigned int zswap_max_pool_percent = 20;
module_param_named(max_pool_percent,
			zswap_max_pool_percent, uint, 0644);

/*********************************
* compression functions
**********************************/
/* per-cpu compression transforms */
static struct crypto_comp * __percpu *zswap_comp_pcpu_tfms;

enum comp_op {
	ZSWAP_COMPOP_COMPRESS,
	ZSWAP_COMPOP_DECOMPRESS
};

/*
 * Run a compression or decompression operation using this CPU's crypto
 * transform.  get_cpu()/put_cpu() pin the task so the per-cpu tfm cannot
 * change under us while the crypto call runs.  Returns 0 on success or a
 * negative errno from the crypto layer (-EINVAL for an unknown op).
 */
static int zswap_comp_op(enum comp_op op, const u8 *src, unsigned int slen,
				u8 *dst, unsigned int *dlen)
{
	struct crypto_comp *tfm;
	int ret;

	tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, get_cpu());
	switch (op) {
	case ZSWAP_COMPOP_COMPRESS:
		ret = crypto_comp_compress(tfm, src, slen, dst, dlen);
		break;
	case ZSWAP_COMPOP_DECOMPRESS:
		ret = crypto_comp_decompress(tfm, src, slen, dst, dlen);
		break;
	default:
		ret = -EINVAL;
	}

	put_cpu();
	return ret;
}

/*
 * Validate the selected compressor (falling back to the default if it is
 * unavailable) and allocate the per-cpu transform pointer array.  The
 * transforms themselves are allocated later by the cpu notifier.
 */
static int __init zswap_comp_init(void)
{
	if (!crypto_has_comp(zswap_compressor, 0, 0)) {
		pr_info("%s compressor not available\n", zswap_compressor);
		/* fall back to default compressor */
		zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
		if (!crypto_has_comp(zswap_compressor, 0, 0))
			/* can't even load the default compressor */
			return -ENODEV;
	}
	pr_info("using %s compressor\n", zswap_compressor);

	/* alloc percpu transforms */
	zswap_comp_pcpu_tfms = alloc_percpu(struct crypto_comp *);
	if (!zswap_comp_pcpu_tfms)
		return -ENOMEM;
	return 0;
}

static void zswap_comp_exit(void)
{
	/* free percpu transforms */
	if (zswap_comp_pcpu_tfms)
		free_percpu(zswap_comp_pcpu_tfms);
}

/*********************************
* data structures
**********************************/
/*
 * struct zswap_entry
 *
 * This structure contains the metadata for tracking a single compressed
 * page within zswap.
 *
 * rbnode - links the entry into red-black tree for the appropriate swap type
 * refcount - the number of outstanding references to the entry. This is needed
 *            to protect against premature freeing of the entry by code
 *            concurrently calling load, invalidate, and writeback.  The lock
 *            for the zswap_tree structure that contains the entry must
 *            be held while changing the refcount.  Since the lock must
 *            be held, there is no reason to also make refcount atomic.
 * offset - the swap offset for the entry.  Index into the red-black tree.
 * handle - zbud allocation handle that stores the compressed page data
 * length - the length in bytes of the compressed page data.  Needed during
 *          decompression
 */
struct zswap_entry {
	struct rb_node rbnode;
	pgoff_t offset;
	int refcount;
	unsigned int length;
	unsigned long handle;
};

/* stored in front of the compressed data so writeback can find the entry */
struct zswap_header {
	swp_entry_t swpentry;
};

/*
 * The tree lock in the zswap_tree struct protects a few things:
 * - the rbtree
 * - the refcount field of each entry in the tree
 */
struct zswap_tree {
	struct rb_root rbroot;
	spinlock_t lock;
	struct zbud_pool *pool;
};

/* one tree per swap type, created lazily by zswap_frontswap_init() */
static struct zswap_tree *zswap_trees[MAX_SWAPFILES];

/*********************************
* zswap entry functions
**********************************/
static struct kmem_cache *zswap_entry_cache;

static int zswap_entry_cache_create(void)
{
	zswap_entry_cache = KMEM_CACHE(zswap_entry, 0);
	/* nonzero (true) means the cache could not be created */
	return (zswap_entry_cache == NULL);
}

/* NOTE(review): "destory" is a typo for "destroy"; kept to match the caller */
static void zswap_entry_cache_destory(void)
{
	kmem_cache_destroy(zswap_entry_cache);
}

static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
{
	struct zswap_entry *entry;
	entry = kmem_cache_alloc(zswap_entry_cache, gfp);
	if (!entry)
		return NULL;
	/* a new entry starts with one reference, held by its creator */
	entry->refcount = 1;
	return entry;
}

static void zswap_entry_cache_free(struct zswap_entry *entry)
{
	kmem_cache_free(zswap_entry_cache, entry);
}

/* caller must hold the tree lock */
static void zswap_entry_get(struct zswap_entry *entry)
{
	entry->refcount++;
}

/* caller must hold the tree lock */
static int zswap_entry_put(struct zswap_entry *entry)
{
	/* returns the new refcount; the caller frees when it drops to zero */
	entry->refcount--;
	return entry->refcount;
}

/*********************************
* rbtree functions
**********************************/
/* search the tree for an entry with the given swap offset; NULL if absent */
static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
{
	struct rb_node *node = root->rb_node;
	struct zswap_entry *entry;

	while (node) {
		entry = rb_entry(node, struct zswap_entry, rbnode);
		if (entry->offset > offset)
			node = node->rb_left;
		else if (entry->offset < offset)
			node = node->rb_right;
		else
			return entry;
	}
	return NULL;
}

/*
 * In the case that a entry with the same offset is found, a pointer to
 * the existing entry is stored in dupentry and the function returns -EEXIST
 */
static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
			struct zswap_entry **dupentry)
{
	struct rb_node **link = &root->rb_node, *parent = NULL;
	struct zswap_entry *myentry;

	while (*link) {
		parent = *link;
		myentry = rb_entry(parent, struct zswap_entry, rbnode);
		if (myentry->offset > entry->offset)
			link = &(*link)->rb_left;
		else if (myentry->offset < entry->offset)
			link = &(*link)->rb_right;
		else {
			*dupentry = myentry;
			return -EEXIST;
		}
	}
	rb_link_node(&entry->rbnode, parent, link);
	rb_insert_color(&entry->rbnode, root);
	return 0;
}

/*********************************
* per-cpu code
**********************************/
/* per-cpu scratch buffer that compression output is written into */
static DEFINE_PER_CPU(u8 *, zswap_dstmem);

/*
 * Allocate (CPU_UP_PREPARE) or free (CPU_DEAD/CPU_UP_CANCELED) the per-cpu
 * crypto transform and destination buffer for @cpu.
 */
static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu)
{
	struct crypto_comp *tfm;
	u8 *dst;

	switch (action) {
	case CPU_UP_PREPARE:
		tfm = crypto_alloc_comp(zswap_compressor, 0, 0);
		if (IS_ERR(tfm)) {
			pr_err("can't allocate compressor transform\n");
			return NOTIFY_BAD;
		}
		*per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = tfm;
		/*
		 * Two pages, presumably so a poorly-compressible page cannot
		 * overrun the buffer -- TODO confirm against the crypto API
		 */
		dst = kmalloc(PAGE_SIZE * 2, GFP_KERNEL);
		if (!dst) {
			pr_err("can't allocate compressor buffer\n");
			crypto_free_comp(tfm);
			*per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL;
			return NOTIFY_BAD;
		}
		per_cpu(zswap_dstmem, cpu) = dst;
		break;
	case CPU_DEAD:
	case CPU_UP_CANCELED:
		tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu);
		if (tfm) {
			crypto_free_comp(tfm);
			*per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL;
		}
		dst = per_cpu(zswap_dstmem, cpu);
		kfree(dst);
		per_cpu(zswap_dstmem, cpu) = NULL;
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}

static int zswap_cpu_notifier(struct notifier_block *nb,
				unsigned long action, void *pcpu)
{
	unsigned long cpu = (unsigned long)pcpu;
	return __zswap_cpu_notifier(action, cpu);
}

static struct notifier_block zswap_cpu_notifier_block = {
	.notifier_call = zswap_cpu_notifier
};

/*
 * Set up per-cpu resources for all online CPUs and register the hotplug
 * notifier.  On failure, tear down whatever was allocated and return -ENOMEM.
 */
static int zswap_cpu_init(void)
{
	unsigned long cpu;

	get_online_cpus();
	for_each_online_cpu(cpu)
		if (__zswap_cpu_notifier(CPU_UP_PREPARE, cpu) != NOTIFY_OK)
			goto cleanup;
	register_cpu_notifier(&zswap_cpu_notifier_block);
	put_online_cpus();
	return 0;

cleanup:
	for_each_online_cpu(cpu)
		__zswap_cpu_notifier(CPU_UP_CANCELED, cpu);
	put_online_cpus();
	return -ENOMEM;
}

/*********************************
* helpers
**********************************/
/* true when the pool exceeds zswap_max_pool_percent of total RAM */
static bool zswap_is_full(void)
{
	return (totalram_pages * zswap_max_pool_percent / 100 <
		zswap_pool_pages);
}

/*
 * Carries out the common pattern of freeing an entry's zbud allocation,
 * freeing the entry itself, and decrementing the number of stored pages.
 */
static void zswap_free_entry(struct zswap_tree *tree, struct zswap_entry *entry)
{
	zbud_free(tree->pool, entry->handle);
	zswap_entry_cache_free(entry);
	atomic_dec(&zswap_stored_pages);
	zswap_pool_pages = zbud_get_pool_size(tree->pool);
}

/*********************************
* writeback code
**********************************/
/* return enum for zswap_get_swap_cache_page */
enum zswap_get_swap_ret {
	ZSWAP_SWAPCACHE_NEW,
	ZSWAP_SWAPCACHE_EXIST,
	ZSWAP_SWAPCACHE_NOMEM
};

/*
 * zswap_get_swap_cache_page
 *
 * This is an adaption of read_swap_cache_async()
 *
 * This function tries to find a page with the given swap entry
 * in the swapper_space address space (the swap cache). If the page
 * is found, it is returned in retpage. Otherwise, a page is allocated,
 * added to the swap cache, and returned in retpage.
 *
 * If success, the swap cache page is returned in retpage
 * Returns ZSWAP_SWAPCACHE_EXIST if the page was already in the swap cache,
 * page is not locked
 * Returns ZSWAP_SWAPCACHE_NEW if the new page needs to be populated,
 * page is locked
 * Returns ZSWAP_SWAPCACHE_NOMEM on allocation failure, *retpage is NULL
 */
static int zswap_get_swap_cache_page(swp_entry_t entry,
				struct page **retpage)
{
	struct page *found_page, *new_page = NULL;
	struct address_space *swapper_space = &swapper_spaces[swp_type(entry)];
	int err;

	*retpage = NULL;
	do {
		/*
		 * First check the swap cache.  Since this is normally
		 * called after lookup_swap_cache() failed, re-calling
		 * that would confuse statistics.
		 */
		found_page = find_get_page(swapper_space, entry.val);
		if (found_page)
			break;

		/*
		 * Get a new page to read into from swap.
		 */
		if (!new_page) {
			new_page = alloc_page(GFP_KERNEL);
			if (!new_page)
				break; /* Out of memory */
		}

		/*
		 * call radix_tree_preload() while we can wait.
		 */
		err = radix_tree_preload(GFP_KERNEL);
		if (err)
			break;

		/*
		 * Swap entry may have been freed since our caller observed it.
		 */
		err = swapcache_prepare(entry);
		if (err == -EEXIST) { /* seems racy */
			radix_tree_preload_end();
			continue;
		}
		if (err) { /* swp entry is obsolete ? */
			radix_tree_preload_end();
			break;
		}

		/* May fail (-ENOMEM) if radix-tree node allocation failed. */
		__set_page_locked(new_page);
		SetPageSwapBacked(new_page);
		err = __add_to_swap_cache(new_page, entry);
		if (likely(!err)) {
			radix_tree_preload_end();
			lru_cache_add_anon(new_page);
			*retpage = new_page;
			return ZSWAP_SWAPCACHE_NEW;
		}
		radix_tree_preload_end();
		ClearPageSwapBacked(new_page);
		__clear_page_locked(new_page);
		/*
		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
		 * clear SWAP_HAS_CACHE flag.
		 */
		swapcache_free(entry, NULL);
	} while (err != -ENOMEM);

	if (new_page)
		page_cache_release(new_page);
	if (!found_page)
		return ZSWAP_SWAPCACHE_NOMEM;
	*retpage = found_page;
	return ZSWAP_SWAPCACHE_EXIST;
}

/*
 * Attempts to free an entry by adding a page to the swap cache,
 * decompressing the entry data into the page, and issuing a
 * bio write to write the page back to the swap device.
 *
 * This can be thought of as a "resumed writeback" of the page
 * to the swap device.  We are basically resuming the same swap
 * writeback path that was intercepted with the frontswap_store()
 * in the first place.
 * After the page has been decompressed into
 * the swap cache, the compressed version stored by zswap can be
 * freed.
 */
static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle)
{
	struct zswap_header *zhdr;
	swp_entry_t swpentry;
	struct zswap_tree *tree;
	pgoff_t offset;
	struct zswap_entry *entry;
	struct page *page;
	u8 *src, *dst;
	unsigned int dlen;
	int ret, refcount;
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_NONE,
	};

	/* extract swpentry from data */
	zhdr = zbud_map(pool, handle);
	swpentry = zhdr->swpentry; /* here */
	zbud_unmap(pool, handle);
	tree = zswap_trees[swp_type(swpentry)];
	offset = swp_offset(swpentry);
	BUG_ON(pool != tree->pool);

	/* find and ref zswap entry */
	spin_lock(&tree->lock);
	entry = zswap_rb_search(&tree->rbroot, offset);
	if (!entry) {
		/* entry was invalidated */
		spin_unlock(&tree->lock);
		return 0;
	}
	zswap_entry_get(entry);
	spin_unlock(&tree->lock);
	BUG_ON(offset != entry->offset);

	/* try to allocate swap cache page */
	switch (zswap_get_swap_cache_page(swpentry, &page)) {
	case ZSWAP_SWAPCACHE_NOMEM: /* no memory */
		ret = -ENOMEM;
		goto fail;

	case ZSWAP_SWAPCACHE_EXIST: /* page is unlocked */
		/* page is already in the swap cache, ignore for now */
		page_cache_release(page);
		ret = -EEXIST;
		goto fail;

	case ZSWAP_SWAPCACHE_NEW: /* page is locked */
		/* decompress */
		dlen = PAGE_SIZE;
		/* compressed data sits just after the zswap_header */
		src = (u8 *)zbud_map(tree->pool, entry->handle) +
			sizeof(struct zswap_header);
		dst = kmap_atomic(page);
		ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src,
				entry->length, dst, &dlen);
		kunmap_atomic(dst);
		zbud_unmap(tree->pool, entry->handle);
		BUG_ON(ret);
		BUG_ON(dlen != PAGE_SIZE);

		/* page is up to date */
		SetPageUptodate(page);
	}

	/* start writeback */
	__swap_writepage(page, &wbc, end_swap_bio_write);
	page_cache_release(page);
	zswap_written_back_pages++;

	spin_lock(&tree->lock);

	/* drop local reference */
	zswap_entry_put(entry);
	/* drop the initial reference from entry creation */
	refcount = zswap_entry_put(entry);

	/*
	 * There are three possible values for refcount here:
	 * (1) refcount is 1, load is in progress, unlink from rbtree,
	 *     load will free
	 * (2) refcount is 0, (normal case) entry is valid,
	 *     remove from rbtree and free entry
	 * (3) refcount is -1, invalidate happened during writeback,
	 *     free entry
	 */
	if (refcount >= 0) {
		/* no invalidate yet, remove from rbtree */
		rb_erase(&entry->rbnode, &tree->rbroot);
	}
	spin_unlock(&tree->lock);
	if (refcount <= 0) {
		/* free the entry */
		zswap_free_entry(tree, entry);
		return 0;
	}
	return -EAGAIN;

fail:
	/* drop the local reference taken above; entry stays in the tree */
	spin_lock(&tree->lock);
	zswap_entry_put(entry);
	spin_unlock(&tree->lock);
	return ret;
}

/*********************************
* frontswap hooks
**********************************/
/* attempts to compress and store an single page */
static int zswap_frontswap_store(unsigned type, pgoff_t offset,
				struct page *page)
{
	struct zswap_tree *tree = zswap_trees[type];
	struct zswap_entry *entry, *dupentry;
	int ret;
	unsigned int dlen = PAGE_SIZE, len;
	unsigned long handle;
	char *buf;
	u8 *src, *dst;
	struct zswap_header *zhdr;

	if (!tree) {
		ret = -ENODEV;
		goto reject;
	}

	/* reclaim space if needed */
	if (zswap_is_full()) {
		zswap_pool_limit_hit++;
		if (zbud_reclaim_page(tree->pool, 8)) {
			zswap_reject_reclaim_fail++;
			ret = -ENOMEM;
			goto reject;
		}
	}

	/* allocate entry */
	entry = zswap_entry_cache_alloc(GFP_KERNEL);
	if (!entry) {
		zswap_reject_kmemcache_fail++;
		ret = -ENOMEM;
		goto reject;
	}

	/* compress into this CPU's scratch buffer */
	dst = get_cpu_var(zswap_dstmem);
	src = kmap_atomic(page);
	ret = zswap_comp_op(ZSWAP_COMPOP_COMPRESS, src, PAGE_SIZE, dst, &dlen);
	kunmap_atomic(src);
	if (ret) {
		ret = -EINVAL;
		goto freepage;
	}

	/* store: header + compressed data go into one zbud allocation */
	len = dlen + sizeof(struct zswap_header);
	ret = zbud_alloc(tree->pool, len, __GFP_NORETRY | __GFP_NOWARN,
		&handle);
	if (ret == -ENOSPC) {
		zswap_reject_compress_poor++;
		goto freepage;
	}
	if (ret) {
		zswap_reject_alloc_fail++;
		goto freepage;
	}
	zhdr = zbud_map(tree->pool, handle);
	zhdr->swpentry = swp_entry(type, offset);
	buf = (u8 *)(zhdr + 1);
	memcpy(buf, dst, dlen);
	zbud_unmap(tree->pool, handle);
	put_cpu_var(zswap_dstmem);

	/* populate entry */
	entry->offset = offset;
	entry->handle = handle;
	entry->length = dlen;

	/* map */
	spin_lock(&tree->lock);
	do {
		ret = zswap_rb_insert(&tree->rbroot, entry, &dupentry);
		if (ret == -EEXIST) {
			zswap_duplicate_entry++;
			/* remove from rbtree */
			rb_erase(&dupentry->rbnode, &tree->rbroot);
			if (!zswap_entry_put(dupentry)) {
				/* free */
				zswap_free_entry(tree, dupentry);
			}
		}
	} while (ret == -EEXIST);
	spin_unlock(&tree->lock);

	/* update stats */
	atomic_inc(&zswap_stored_pages);
	zswap_pool_pages = zbud_get_pool_size(tree->pool);

	return 0;

freepage:
	put_cpu_var(zswap_dstmem);
	zswap_entry_cache_free(entry);
reject:
	return ret;
}

/*
 * returns 0 if the page was successfully
 * decompressed
 * return -1 on entry not found or error
 */
static int zswap_frontswap_load(unsigned type, pgoff_t offset,
				struct page *page)
{
	struct zswap_tree *tree = zswap_trees[type];
	struct zswap_entry *entry;
	u8 *src, *dst;
	unsigned int dlen;
	int refcount, ret;

	/* find */
	spin_lock(&tree->lock);
	entry = zswap_rb_search(&tree->rbroot, offset);
	if (!entry) {
		/* entry was written back */
		spin_unlock(&tree->lock);
		return -1;
	}
	/* hold a reference so writeback/invalidate cannot free it under us */
	zswap_entry_get(entry);
	spin_unlock(&tree->lock);

	/* decompress */
	dlen = PAGE_SIZE;
	src = (u8 *)zbud_map(tree->pool, entry->handle) +
			sizeof(struct zswap_header);
	dst = kmap_atomic(page);
	ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length,
		dst, &dlen);
	kunmap_atomic(dst);
	zbud_unmap(tree->pool, entry->handle);
	BUG_ON(ret);

	spin_lock(&tree->lock);
	refcount = zswap_entry_put(entry);
	if (likely(refcount)) {
		spin_unlock(&tree->lock);
		return 0;
	}
	spin_unlock(&tree->lock);

	/*
	 * We don't have to unlink from the rbtree because
	 * zswap_writeback_entry() or zswap_frontswap_invalidate page()
	 * has already done this for us if we are the last reference.
	 */
	/* free */

	zswap_free_entry(tree, entry);

	return 0;
}

/* frees an entry in zswap */
static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
{
	struct zswap_tree *tree = zswap_trees[type];
	struct zswap_entry *entry;
	int refcount;

	/* find */
	spin_lock(&tree->lock);
	entry = zswap_rb_search(&tree->rbroot, offset);
	if (!entry) {
		/* entry was written back */
		spin_unlock(&tree->lock);
		return;
	}

	/* remove from rbtree */
	rb_erase(&entry->rbnode, &tree->rbroot);

	/* drop the initial reference from entry creation */
	refcount = zswap_entry_put(entry);

	spin_unlock(&tree->lock);

	if (refcount) {
		/* writeback in progress, writeback will free */
		return;
	}

	/* free */
	zswap_free_entry(tree, entry);
}

/* frees all zswap entries for the given swap type */
static void zswap_frontswap_invalidate_area(unsigned type)
{
	struct zswap_tree *tree = zswap_trees[type];
	struct rb_node *node;
	struct zswap_entry *entry;

	if (!tree)
		return;

	/* walk the tree and free everything */
	spin_lock(&tree->lock);
	/*
	 * TODO: Even though this code should not be executed because
	 * the try_to_unuse() in swapoff should have emptied the tree,
	 * it is very wasteful to rebalance the tree after every
	 * removal when we are freeing the whole tree.
	 *
	 * If post-order traversal code is ever added to the rbtree
	 * implementation, it should be used here.
	 */
	while ((node = rb_first(&tree->rbroot))) {
		entry = rb_entry(node, struct zswap_entry, rbnode);
		rb_erase(&entry->rbnode, &tree->rbroot);
		zbud_free(tree->pool, entry->handle);
		zswap_entry_cache_free(entry);
		atomic_dec(&zswap_stored_pages);
	}
	tree->rbroot = RB_ROOT;
	spin_unlock(&tree->lock);
}

static struct zbud_ops zswap_zbud_ops = {
	.evict = zswap_writeback_entry
};

/* called by frontswap when a swap device of @type is enabled */
static void zswap_frontswap_init(unsigned type)
{
	struct zswap_tree *tree;

	tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL);
	if (!tree)
		goto err;
	tree->pool = zbud_create_pool(GFP_KERNEL, &zswap_zbud_ops);
	if (!tree->pool)
		goto freetree;
	tree->rbroot = RB_ROOT;
	spin_lock_init(&tree->lock);
	zswap_trees[type] = tree;
	return;

freetree:
	kfree(tree);
err:
	pr_err("alloc failed, zswap disabled for swap type %d\n", type);
}

static struct frontswap_ops zswap_frontswap_ops = {
	.store = zswap_frontswap_store,
	.load = zswap_frontswap_load,
	.invalidate_page = zswap_frontswap_invalidate_page,
	.invalidate_area = zswap_frontswap_invalidate_area,
	.init = zswap_frontswap_init
};

/*********************************
* debugfs functions
**********************************/
#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>

static struct dentry *zswap_debugfs_root;

/* expose the statistics counters read-only under debugfs "zswap/" */
static int __init zswap_debugfs_init(void)
{
	if (!debugfs_initialized())
		return -ENODEV;

	zswap_debugfs_root = debugfs_create_dir("zswap", NULL);
	if (!zswap_debugfs_root)
		return -ENOMEM;

	debugfs_create_u64("pool_limit_hit", S_IRUGO,
			zswap_debugfs_root, &zswap_pool_limit_hit);
	debugfs_create_u64("reject_reclaim_fail", S_IRUGO,
			zswap_debugfs_root, &zswap_reject_reclaim_fail);
	debugfs_create_u64("reject_alloc_fail", S_IRUGO,
			zswap_debugfs_root, &zswap_reject_alloc_fail);
	debugfs_create_u64("reject_kmemcache_fail", S_IRUGO,
			zswap_debugfs_root, &zswap_reject_kmemcache_fail);
	debugfs_create_u64("reject_compress_poor", S_IRUGO,
			zswap_debugfs_root, &zswap_reject_compress_poor);
	debugfs_create_u64("written_back_pages", S_IRUGO,
			zswap_debugfs_root, &zswap_written_back_pages);
	debugfs_create_u64("duplicate_entry", S_IRUGO,
			zswap_debugfs_root, &zswap_duplicate_entry);
	debugfs_create_u64("pool_pages", S_IRUGO,
			zswap_debugfs_root, &zswap_pool_pages);
	debugfs_create_atomic_t("stored_pages", S_IRUGO,
			zswap_debugfs_root, &zswap_stored_pages);

	return 0;
}

static void __exit zswap_debugfs_exit(void)
{
	debugfs_remove_recursive(zswap_debugfs_root);
}
#else
static int __init zswap_debugfs_init(void)
{
	return 0;
}

static void __exit zswap_debugfs_exit(void) { }
#endif

/*********************************
* module init and exit
**********************************/
static int __init init_zswap(void)
{
	if (!zswap_enabled)
		return 0;

	pr_info("loading zswap\n");
	if (zswap_entry_cache_create()) {
		pr_err("entry cache creation failed\n");
		goto error;
	}
	if (zswap_comp_init()) {
		pr_err("compressor initialization failed\n");
		goto compfail;
	}
	if (zswap_cpu_init()) {
		pr_err("per-cpu initialization failed\n");
		goto pcpufail;
	}
	frontswap_register_ops(&zswap_frontswap_ops);
	/* debugfs failure is non-fatal: zswap works without the stats */
	if (zswap_debugfs_init())
		pr_warn("debugfs initialization failed\n");
	return 0;
pcpufail:
	zswap_comp_exit();
compfail:
	zswap_entry_cache_destory();
error:
	return -ENOMEM;
}
/* must be late so crypto has time to come up */
late_initcall(init_zswap);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Seth Jennings <sjenning@linux.vnet.ibm.com>");
MODULE_DESCRIPTION("Compressed cache for swap pages");