Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

at v3.11 (1063 lines, 27 kB)
/*
 * zsmalloc memory allocator
 *
 * Copyright (C) 2011 Nitin Gupta
 *
 * This code is released using a dual license strategy: BSD/GPL
 * You can choose the license that better fits your requirements.
 *
 * Released under the terms of 3-clause BSD License
 * Released under the terms of GNU General Public License Version 2.0
 */


/*
 * This allocator is designed for use with zcache and zram. Thus, the
 * allocator is supposed to work well under low memory conditions. In
 * particular, it never attempts higher order page allocation which is
 * very likely to fail under memory pressure. On the other hand, if we
 * just use single (0-order) pages, it would suffer from very high
 * fragmentation -- any object of size PAGE_SIZE/2 or larger would occupy
 * an entire page. This was one of the major issues with its predecessor
 * (xvmalloc).
 *
 * To overcome these issues, zsmalloc allocates a bunch of 0-order pages
 * and links them together using various 'struct page' fields. These linked
 * pages act as a single higher-order page i.e. an object can span 0-order
 * page boundaries. The code refers to these linked pages as a single entity
 * called zspage.
 *
 * Following is how we use various fields and flags of underlying
 * struct page(s) to form a zspage.
 *
 * Usage of struct page fields:
 *	page->first_page: points to the first component (0-order) page
 *	page->index (union with page->freelist): offset of the first object
 *		starting in this page. For the first page, this is
 *		always 0, so we use this field (aka freelist) to point
 *		to the first free object in zspage.
 *	page->lru: links together all component pages (except the first page)
 *		of a zspage
 *
 *	For _first_ page only:
 *
 *	page->private (union with page->first_page): refers to the
 *		component page after the first page
 *	page->freelist: points to the first free object in zspage.
 *		Free objects are linked together using in-place
 *		metadata.
 *	page->objects: maximum number of objects we can store in this
 *		zspage (class->zspage_order * PAGE_SIZE / class->size)
 *	page->lru: links together first pages of various zspages.
 *		Basically forming list of zspages in a fullness group.
 *	page->mapping: class index and fullness group of the zspage
 *
 * Usage of struct page flags:
 *	PG_private: identifies the first component page
 *	PG_private2: identifies the last component page
 *
 */

#ifdef CONFIG_ZSMALLOC_DEBUG
#define DEBUG
#endif

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <asm/tlbflush.h>
#include <asm/pgtable.h>
#include <linux/cpumask.h>
#include <linux/cpu.h>
#include <linux/vmalloc.h>
#include <linux/hardirq.h>
#include <linux/spinlock.h>
#include <linux/types.h>

#include "zsmalloc.h"

/*
 * This must be power of 2 and greater than or equal to sizeof(link_free).
 * These two conditions ensure that any 'struct link_free' itself doesn't
 * span more than 1 page which avoids complex case of mapping 2 pages simply
 * to restore link_free pointer values.
 */
#define ZS_ALIGN	8

/*
 * A single 'zspage' is composed of up to 2^N discontiguous 0-order (single)
 * pages. ZS_MAX_ZSPAGE_ORDER defines upper limit on N.
 */
#define ZS_MAX_ZSPAGE_ORDER	2
#define ZS_MAX_PAGES_PER_ZSPAGE	(_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)

/*
 * Object location (<PFN>, <obj_idx>) is encoded as
 * a single (void *) handle value.
 *
 * Note that object index <obj_idx> is relative to system
 * page <PFN> it is stored in, so for each sub-page belonging
 * to a zspage, obj_idx starts with 0.
 *
 * This is made more complicated by various memory models and PAE.
 */

#ifndef MAX_PHYSMEM_BITS
#ifdef CONFIG_HIGHMEM64G
#define MAX_PHYSMEM_BITS	36
#else /* !CONFIG_HIGHMEM64G */
/*
 * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just
 * be PAGE_SHIFT
 */
#define MAX_PHYSMEM_BITS	BITS_PER_LONG
#endif
#endif
#define _PFN_BITS	(MAX_PHYSMEM_BITS - PAGE_SHIFT)
#define OBJ_INDEX_BITS	(BITS_PER_LONG - _PFN_BITS)
#define OBJ_INDEX_MASK	((_AC(1, UL) << OBJ_INDEX_BITS) - 1)

#define MAX(a, b) ((a) >= (b) ? (a) : (b))
/* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */
#define ZS_MIN_ALLOC_SIZE \
	MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
#define ZS_MAX_ALLOC_SIZE	PAGE_SIZE

/*
 * On systems with 4K page size, this gives 254 size classes! There is a
 * trade-off here:
 *  - Large number of size classes is potentially wasteful as free pages are
 *    spread across these classes
 *  - Small number of size classes causes large internal fragmentation
 *  - Probably it's better to use specific size classes (empirically
 *    determined). NOTE: all those class sizes must be set as multiple of
 *    ZS_ALIGN to make sure link_free itself never has to span 2 pages.
 *
 *  ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiple of ZS_ALIGN
 *  (reason above)
 */
#define ZS_SIZE_CLASS_DELTA	(PAGE_SIZE >> 8)
#define ZS_SIZE_CLASSES		((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / \
					ZS_SIZE_CLASS_DELTA + 1)

/*
 * We do not maintain any list for completely empty or full pages
 */
enum fullness_group {
	ZS_ALMOST_FULL,
	ZS_ALMOST_EMPTY,
	_ZS_NR_FULLNESS_GROUPS,

	ZS_EMPTY,
	ZS_FULL
};

/*
 * We assign a zspage to ZS_ALMOST_EMPTY fullness group when:
 *	n <= N / f, where
 * n = number of allocated objects
 * N = total number of objects zspage can store
 * f = 1/fullness_threshold_frac
 *
 * Similarly, we assign zspage to:
 *	ZS_ALMOST_FULL	when n > N / f
 *	ZS_EMPTY	when n == 0
 *	ZS_FULL		when n == N
 *
 * (see: fix_fullness_group())
 */
static const int fullness_threshold_frac = 4;
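
/*
 * Worked example (illustrative numbers, not from the original source): for
 * a hypothetical size class of 512 bytes on a 4K page (pages_per_zspage = 1),
 * a zspage holds N = 4096 / 512 = 8 objects, so with
 * fullness_threshold_frac = 4 it is grouped as:
 *	n == 0		-> ZS_EMPTY
 *	1 <= n <= 2	-> ZS_ALMOST_EMPTY	(n <= N / 4)
 *	3 <= n <= 7	-> ZS_ALMOST_FULL	(n >  N / 4)
 *	n == 8		-> ZS_FULL
 */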

struct size_class {
	/*
	 * Size of objects stored in this class. Must be multiple
	 * of ZS_ALIGN.
	 */
	int size;
	unsigned int index;

	/* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
	int pages_per_zspage;

	spinlock_t lock;

	/* stats */
	u64 pages_allocated;

	struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
};

/*
 * Placed within free objects to form a singly linked list.
 * For every zspage, first_page->freelist gives head of this list.
 *
 * This must be power of 2 and less than or equal to ZS_ALIGN
 */
struct link_free {
	/* Handle of next free chunk (encodes <PFN, obj_idx>) */
	void *next;
};

struct zs_pool {
	struct size_class size_class[ZS_SIZE_CLASSES];

	gfp_t flags;	/* allocation flags used when growing pool */
};

/*
 * A zspage's class index and fullness group
 * are encoded in its (first)page->mapping
 */
#define CLASS_IDX_BITS	28
#define FULLNESS_BITS	4
#define CLASS_IDX_MASK	((1 << CLASS_IDX_BITS) - 1)
#define FULLNESS_MASK	((1 << FULLNESS_BITS) - 1)

/*
 * By default, zsmalloc uses a copy-based object mapping method to access
 * allocations that span two pages. However, if a particular architecture
 * performs VM mapping faster than copying, then it should be added here
 * so that USE_PGTABLE_MAPPING is defined. This causes zsmalloc to use
 * page table mapping rather than copying for object mapping.
 */
#if defined(CONFIG_ARM) && !defined(MODULE)
#define USE_PGTABLE_MAPPING
#endif

struct mapping_area {
#ifdef USE_PGTABLE_MAPPING
	struct vm_struct *vm; /* vm area for mapping object that span pages */
#else
	char *vm_buf; /* copy buffer for objects that span pages */
#endif
	char *vm_addr; /* address of kmap_atomic()'ed pages */
	enum zs_mapmode vm_mm; /* mapping mode */
};


/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
static DEFINE_PER_CPU(struct mapping_area, zs_map_area);

static int is_first_page(struct page *page)
{
	return PagePrivate(page);
}

static int is_last_page(struct page *page)
{
	return PagePrivate2(page);
}

static void get_zspage_mapping(struct page *page, unsigned int *class_idx,
				enum fullness_group *fullness)
{
	unsigned long m;
	BUG_ON(!is_first_page(page));

	m = (unsigned long)page->mapping;
	*fullness = m & FULLNESS_MASK;
	*class_idx = (m >> FULLNESS_BITS) & CLASS_IDX_MASK;
}

static void set_zspage_mapping(struct page *page, unsigned int class_idx,
				enum fullness_group fullness)
{
	unsigned long m;
	BUG_ON(!is_first_page(page));

	m = ((class_idx & CLASS_IDX_MASK) << FULLNESS_BITS) |
			(fullness & FULLNESS_MASK);
	page->mapping = (struct address_space *)m;
}

static int get_size_class_index(int size)
{
	int idx = 0;

	if (likely(size > ZS_MIN_ALLOC_SIZE))
		idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE,
				ZS_SIZE_CLASS_DELTA);

	return idx;
}

static enum fullness_group get_fullness_group(struct page *page)
{
	int inuse, max_objects;
	enum fullness_group fg;
	BUG_ON(!is_first_page(page));

	inuse = page->inuse;
	max_objects = page->objects;

	if (inuse == 0)
		fg = ZS_EMPTY;
	else if (inuse == max_objects)
		fg = ZS_FULL;
	else if (inuse <= max_objects / fullness_threshold_frac)
		fg = ZS_ALMOST_EMPTY;
	else
		fg = ZS_ALMOST_FULL;

	return fg;
}

static void insert_zspage(struct page *page, struct size_class *class,
				enum fullness_group fullness)
{
	struct page **head;

	BUG_ON(!is_first_page(page));

	if (fullness >= _ZS_NR_FULLNESS_GROUPS)
		return;

	head = &class->fullness_list[fullness];
	if (*head)
		list_add_tail(&page->lru, &(*head)->lru);

	*head = page;
}

static void remove_zspage(struct page *page, struct size_class *class,
				enum fullness_group fullness)
{
	struct page **head;

	BUG_ON(!is_first_page(page));

	if (fullness >= _ZS_NR_FULLNESS_GROUPS)
		return;

	head = &class->fullness_list[fullness];
	BUG_ON(!*head);
	if (list_empty(&(*head)->lru))
		*head = NULL;
	else if (*head == page)
		*head = (struct page *)list_entry((*head)->lru.next,
					struct page, lru);

	list_del_init(&page->lru);
}

static enum fullness_group fix_fullness_group(struct zs_pool *pool,
						struct page *page)
{
	int class_idx;
	struct size_class *class;
	enum fullness_group currfg, newfg;

	BUG_ON(!is_first_page(page));

	get_zspage_mapping(page, &class_idx, &currfg);
	newfg = get_fullness_group(page);
	if (newfg == currfg)
		goto out;

	class = &pool->size_class[class_idx];
	remove_zspage(page, class, currfg);
	insert_zspage(page, class, newfg);
	set_zspage_mapping(page, class_idx, newfg);

out:
	return newfg;
}

/*
 * We have to decide on how many pages to link together
 * to form a zspage for each size class. This is important
 * to reduce wastage due to unusable space left at end of
 * each zspage which is given as:
 *	wastage = Zp % size_class
 *	usage = Zp - wastage
 * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ...
 *
 * For example, for size class of 3/8 * PAGE_SIZE, we should
 * link together 3 PAGE_SIZE sized pages to form a zspage
 * since then we can perfectly fit in 8 such objects.
 */
static int get_pages_per_zspage(int class_size)
{
	int i, max_usedpc = 0;
	/* zspage order which gives maximum used size per KB */
	int max_usedpc_order = 1;

	for (i = 1; i <= ZS_MAX_PAGES_PER_ZSPAGE; i++) {
		int zspage_size;
		int waste, usedpc;

		zspage_size = i * PAGE_SIZE;
		waste = zspage_size % class_size;
		usedpc = (zspage_size - waste) * 100 / zspage_size;

		if (usedpc > max_usedpc) {
			max_usedpc = usedpc;
			max_usedpc_order = i;
		}
	}

	return max_usedpc_order;
}

/*
 * A single 'zspage' is composed of many system pages which are
 * linked together using fields in struct page. This function finds
 * the first/head page, given any component page of a zspage.
 */
static struct page *get_first_page(struct page *page)
{
	if (is_first_page(page))
		return page;
	else
		return page->first_page;
}

static struct page *get_next_page(struct page *page)
{
	struct page *next;

	if (is_last_page(page))
		next = NULL;
	else if (is_first_page(page))
		next = (struct page *)page->private;
	else
		next = list_entry(page->lru.next, struct page, lru);

	return next;
}

/* Encode <page, obj_idx> as a single handle value */
static void *obj_location_to_handle(struct page *page, unsigned long obj_idx)
{
	unsigned long handle;

	if (!page) {
		BUG_ON(obj_idx);
		return NULL;
	}

	handle = page_to_pfn(page) << OBJ_INDEX_BITS;
	handle |= (obj_idx & OBJ_INDEX_MASK);

	return (void *)handle;
}

/* Decode <page, obj_idx> pair from the given object handle */
static void obj_handle_to_location(unsigned long handle, struct page **page,
				unsigned long *obj_idx)
{
	*page = pfn_to_page(handle >> OBJ_INDEX_BITS);
	*obj_idx = handle & OBJ_INDEX_MASK;
}

static unsigned long obj_idx_to_offset(struct page *page,
				unsigned long obj_idx, int class_size)
{
	unsigned long off = 0;

	if (!is_first_page(page))
		off = page->index;

	return off + obj_idx * class_size;
}

static void reset_page(struct page *page)
{
	clear_bit(PG_private, &page->flags);
	clear_bit(PG_private_2, &page->flags);
	set_page_private(page, 0);
	page->mapping = NULL;
	page->freelist = NULL;
	page_mapcount_reset(page);
}

static void free_zspage(struct page *first_page)
{
	struct page *nextp, *tmp, *head_extra;

	BUG_ON(!is_first_page(first_page));
	BUG_ON(first_page->inuse);

	head_extra = (struct page *)page_private(first_page);

	reset_page(first_page);
	__free_page(first_page);

	/* zspage with only 1 system page */
	if (!head_extra)
		return;

	list_for_each_entry_safe(nextp, tmp, &head_extra->lru, lru) {
		list_del(&nextp->lru);
		reset_page(nextp);
		__free_page(nextp);
	}
	reset_page(head_extra);
	__free_page(head_extra);
}

/* Initialize a newly allocated zspage */
static void init_zspage(struct page *first_page, struct size_class *class)
{
	unsigned long off = 0;
	struct page *page = first_page;

	BUG_ON(!is_first_page(first_page));
	while (page) {
		struct page *next_page;
		struct link_free *link;
		unsigned int i, objs_on_page;

		/*
		 * page->index stores offset of first object starting
		 * in the page. For the first page, this is always 0,
		 * so we use first_page->index (aka ->freelist) to store
		 * head of corresponding zspage's freelist.
		 */
		if (page != first_page)
			page->index = off;

		link = (struct link_free *)kmap_atomic(page) +
						off / sizeof(*link);
		objs_on_page = (PAGE_SIZE - off) / class->size;

		for (i = 1; i <= objs_on_page; i++) {
			off += class->size;
			if (off < PAGE_SIZE) {
				link->next = obj_location_to_handle(page, i);
				link += class->size / sizeof(*link);
			}
		}

		/*
		 * We now come to the last (full or partial) object on this
		 * page, which must point to the first object on the next
		 * page (if present)
		 */
		next_page = get_next_page(page);
		link->next = obj_location_to_handle(next_page, 0);
		kunmap_atomic(link);
		page = next_page;
		off = (off + class->size) % PAGE_SIZE;
	}
}

/*
 * Allocate a zspage for the given size class
 */
static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
{
	int i, error;
	struct page *first_page = NULL, *uninitialized_var(prev_page);

	/*
	 * Allocate individual pages and link them together as:
	 * 1. first page->private = first sub-page
	 * 2. all sub-pages are linked together using page->lru
	 * 3. each sub-page is linked to the first page using page->first_page
	 *
	 * For each size class, First/Head pages are linked together using
	 * page->lru. Also, we set PG_private to identify the first page
	 * (i.e. no other sub-page has this flag set) and PG_private_2 to
	 * identify the last page.
	 */
	error = -ENOMEM;
	for (i = 0; i < class->pages_per_zspage; i++) {
		struct page *page;

		page = alloc_page(flags);
		if (!page)
			goto cleanup;

		INIT_LIST_HEAD(&page->lru);
		if (i == 0) {	/* first page */
			SetPagePrivate(page);
			set_page_private(page, 0);
			first_page = page;
			first_page->inuse = 0;
		}
		if (i == 1)
			first_page->private = (unsigned long)page;
		if (i >= 1)
			page->first_page = first_page;
		if (i >= 2)
			list_add(&page->lru, &prev_page->lru);
		if (i == class->pages_per_zspage - 1)	/* last page */
			SetPagePrivate2(page);
		prev_page = page;
	}

	init_zspage(first_page, class);

	first_page->freelist = obj_location_to_handle(first_page, 0);
	/* Maximum number of objects we can store in this zspage */
	first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size;

	error = 0; /* Success */

cleanup:
	if (unlikely(error) && first_page) {
		free_zspage(first_page);
		first_page = NULL;
	}

	return first_page;
}

static struct page *find_get_zspage(struct size_class *class)
{
	int i;
	struct page *page;

	for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
		page = class->fullness_list[i];
		if (page)
			break;
	}

	return page;
}

#ifdef USE_PGTABLE_MAPPING
static inline int __zs_cpu_up(struct mapping_area *area)
{
	/*
	 * Make sure we don't leak memory if a cpu UP notification
	 * and zs_init() race and both call zs_cpu_up() on the same cpu
	 */
	if (area->vm)
		return 0;
	area->vm = alloc_vm_area(PAGE_SIZE * 2, NULL);
	if (!area->vm)
		return -ENOMEM;
	return 0;
}

static inline void __zs_cpu_down(struct mapping_area *area)
{
	if (area->vm)
		free_vm_area(area->vm);
	area->vm = NULL;
}

static inline void *__zs_map_object(struct mapping_area *area,
				struct page *pages[2], int off, int size)
{
	BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, &pages));
	area->vm_addr = area->vm->addr;
	return area->vm_addr + off;
}

static inline void __zs_unmap_object(struct mapping_area *area,
				struct page *pages[2], int off, int size)
{
	unsigned long addr = (unsigned long)area->vm_addr;

	unmap_kernel_range(addr, PAGE_SIZE * 2);
}

#else /* USE_PGTABLE_MAPPING */

static inline int __zs_cpu_up(struct mapping_area *area)
{
	/*
	 * Make sure we don't leak memory if a cpu UP notification
	 * and zs_init() race and both call zs_cpu_up() on the same cpu
	 */
	if (area->vm_buf)
		return 0;
	area->vm_buf = (char *)__get_free_page(GFP_KERNEL);
	if (!area->vm_buf)
		return -ENOMEM;
	return 0;
}

static inline void __zs_cpu_down(struct mapping_area *area)
{
	if (area->vm_buf)
		free_page((unsigned long)area->vm_buf);
	area->vm_buf = NULL;
}

static void *__zs_map_object(struct mapping_area *area,
			struct page *pages[2], int off, int size)
{
	int sizes[2];
	void *addr;
	char *buf = area->vm_buf;

	/* disable page faults to match kmap_atomic() return conditions */
	pagefault_disable();

	/* no read fastpath */
	if (area->vm_mm == ZS_MM_WO)
		goto out;

	sizes[0] = PAGE_SIZE - off;
	sizes[1] = size - sizes[0];

	/* copy object to per-cpu buffer */
	addr = kmap_atomic(pages[0]);
	memcpy(buf, addr + off, sizes[0]);
	kunmap_atomic(addr);
	addr = kmap_atomic(pages[1]);
	memcpy(buf + sizes[0], addr, sizes[1]);
	kunmap_atomic(addr);
out:
	return area->vm_buf;
}

static void __zs_unmap_object(struct mapping_area *area,
			struct page *pages[2], int off, int size)
{
	int sizes[2];
	void *addr;
	char *buf = area->vm_buf;

	/* no write fastpath */
	if (area->vm_mm == ZS_MM_RO)
		goto out;

	sizes[0] = PAGE_SIZE - off;
	sizes[1] = size - sizes[0];

	/* copy per-cpu buffer to object */
	addr = kmap_atomic(pages[0]);
	memcpy(addr + off, buf, sizes[0]);
	kunmap_atomic(addr);
	addr = kmap_atomic(pages[1]);
	memcpy(addr, buf + sizes[0], sizes[1]);
	kunmap_atomic(addr);

out:
	/* enable page faults to match kunmap_atomic() return conditions */
	pagefault_enable();
}

#endif /* USE_PGTABLE_MAPPING */

static int zs_cpu_notifier(struct notifier_block *nb, unsigned long action,
				void *pcpu)
{
	int ret, cpu = (long)pcpu;
	struct mapping_area *area;

	switch (action) {
	case CPU_UP_PREPARE:
		area = &per_cpu(zs_map_area, cpu);
		ret = __zs_cpu_up(area);
		if (ret)
			return notifier_from_errno(ret);
		break;
	case CPU_DEAD:
	case CPU_UP_CANCELED:
		area = &per_cpu(zs_map_area, cpu);
		__zs_cpu_down(area);
		break;
	}

	return NOTIFY_OK;
}

static struct notifier_block zs_cpu_nb = {
	.notifier_call = zs_cpu_notifier
};

static void zs_exit(void)
{
	int cpu;

	for_each_online_cpu(cpu)
		zs_cpu_notifier(NULL, CPU_DEAD, (void *)(long)cpu);
	unregister_cpu_notifier(&zs_cpu_nb);
}

static int zs_init(void)
{
	int cpu, ret;

	register_cpu_notifier(&zs_cpu_nb);
	for_each_online_cpu(cpu) {
		ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
		if (notifier_to_errno(ret))
			goto fail;
	}
	return 0;
fail:
	zs_exit();
	return notifier_to_errno(ret);
}

/**
 * zs_create_pool - Creates an allocation pool to work from.
 * @flags: allocation flags used to allocate pool metadata
 *
 * This function must be called before anything when using
 * the zsmalloc allocator.
 *
 * On success, a pointer to the newly created pool is returned,
 * otherwise NULL.
 */
struct zs_pool *zs_create_pool(gfp_t flags)
{
	int i, ovhd_size;
	struct zs_pool *pool;

	ovhd_size = roundup(sizeof(*pool), PAGE_SIZE);
	pool = kzalloc(ovhd_size, GFP_KERNEL);
	if (!pool)
		return NULL;

	for (i = 0; i < ZS_SIZE_CLASSES; i++) {
		int size;
		struct size_class *class;

		size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA;
		if (size > ZS_MAX_ALLOC_SIZE)
			size = ZS_MAX_ALLOC_SIZE;

		class = &pool->size_class[i];
		class->size = size;
		class->index = i;
		spin_lock_init(&class->lock);
		class->pages_per_zspage = get_pages_per_zspage(size);

	}

	pool->flags = flags;

	return pool;
}
EXPORT_SYMBOL_GPL(zs_create_pool);

void zs_destroy_pool(struct zs_pool *pool)
{
	int i;

	for (i = 0; i < ZS_SIZE_CLASSES; i++) {
		int fg;
		struct size_class *class = &pool->size_class[i];

		for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) {
			if (class->fullness_list[fg]) {
				pr_info("Freeing non-empty class with size %db, fullness group %d\n",
					class->size, fg);
			}
		}
	}
	kfree(pool);
}
EXPORT_SYMBOL_GPL(zs_destroy_pool);

/**
 * zs_malloc - Allocate block of given size from pool.
 * @pool: pool to allocate from
 * @size: size of block to allocate
 *
 * On success, handle to the allocated object is returned,
 * otherwise 0.
 * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail.
 */
unsigned long zs_malloc(struct zs_pool *pool, size_t size)
{
	unsigned long obj;
	struct link_free *link;
	int class_idx;
	struct size_class *class;

	struct page *first_page, *m_page;
	unsigned long m_objidx, m_offset;

	if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
		return 0;

	class_idx = get_size_class_index(size);
	class = &pool->size_class[class_idx];
	BUG_ON(class_idx != class->index);

	spin_lock(&class->lock);
	first_page = find_get_zspage(class);

	if (!first_page) {
		spin_unlock(&class->lock);
		first_page = alloc_zspage(class, pool->flags);
		if (unlikely(!first_page))
			return 0;

		set_zspage_mapping(first_page, class->index, ZS_EMPTY);
		spin_lock(&class->lock);
		class->pages_allocated += class->pages_per_zspage;
	}

	obj = (unsigned long)first_page->freelist;
	obj_handle_to_location(obj, &m_page, &m_objidx);
	m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);

	link = (struct link_free *)kmap_atomic(m_page) +
				m_offset / sizeof(*link);
	first_page->freelist = link->next;
	memset(link, POISON_INUSE, sizeof(*link));
	kunmap_atomic(link);

	first_page->inuse++;
	/* Now move the zspage to another fullness group, if required */
	fix_fullness_group(pool, first_page);
	spin_unlock(&class->lock);

	return obj;
}
EXPORT_SYMBOL_GPL(zs_malloc);

void zs_free(struct zs_pool *pool, unsigned long obj)
{
	struct link_free *link;
	struct page *first_page, *f_page;
	unsigned long f_objidx, f_offset;

	int class_idx;
	struct size_class *class;
	enum fullness_group fullness;

	if (unlikely(!obj))
		return;

	obj_handle_to_location(obj, &f_page, &f_objidx);
	first_page = get_first_page(f_page);

	get_zspage_mapping(first_page, &class_idx, &fullness);
	class = &pool->size_class[class_idx];
	f_offset = obj_idx_to_offset(f_page, f_objidx, class->size);

	spin_lock(&class->lock);

	/* Insert this object in containing zspage's freelist */
	link = (struct link_free *)((unsigned char *)kmap_atomic(f_page)
							+ f_offset);
	link->next = first_page->freelist;
	kunmap_atomic(link);
	first_page->freelist = (void *)obj;

	first_page->inuse--;
	fullness = fix_fullness_group(pool, first_page);

	if (fullness == ZS_EMPTY)
		class->pages_allocated -= class->pages_per_zspage;

	spin_unlock(&class->lock);

	if (fullness == ZS_EMPTY)
		free_zspage(first_page);
}
EXPORT_SYMBOL_GPL(zs_free);

/**
 * zs_map_object - get address of allocated object from handle.
 * @pool: pool from which the object was allocated
 * @handle: handle returned from zs_malloc
 *
 * Before using an object allocated from zs_malloc, it must be mapped using
 * this function. When done with the object, it must be unmapped using
 * zs_unmap_object.
 *
 * Only one object can be mapped per cpu at a time. There is no protection
 * against nested mappings.
 *
 * This function returns with preemption and page faults disabled.
 */
void *zs_map_object(struct zs_pool *pool, unsigned long handle,
			enum zs_mapmode mm)
{
	struct page *page;
	unsigned long obj_idx, off;

	unsigned int class_idx;
	enum fullness_group fg;
	struct size_class *class;
	struct mapping_area *area;
	struct page *pages[2];

	BUG_ON(!handle);

	/*
	 * Because we use per-cpu mapping areas shared among the
	 * pools/users, we can't allow mapping in interrupt context
	 * because it can corrupt another user's mappings.
	 */
	BUG_ON(in_interrupt());

	obj_handle_to_location(handle, &page, &obj_idx);
	get_zspage_mapping(get_first_page(page), &class_idx, &fg);
	class = &pool->size_class[class_idx];
	off = obj_idx_to_offset(page, obj_idx, class->size);

	area = &get_cpu_var(zs_map_area);
	area->vm_mm = mm;
	if (off + class->size <= PAGE_SIZE) {
		/* this object is contained entirely within a page */
		area->vm_addr = kmap_atomic(page);
		return area->vm_addr + off;
	}

	/* this object spans two pages */
	pages[0] = page;
	pages[1] = get_next_page(page);
	BUG_ON(!pages[1]);

	return __zs_map_object(area, pages, off, class->size);
}
EXPORT_SYMBOL_GPL(zs_map_object);

void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
{
	struct page *page;
	unsigned long obj_idx, off;

	unsigned int class_idx;
	enum fullness_group fg;
	struct size_class *class;
	struct mapping_area *area;

	BUG_ON(!handle);

	obj_handle_to_location(handle, &page, &obj_idx);
	get_zspage_mapping(get_first_page(page), &class_idx, &fg);
	class = &pool->size_class[class_idx];
	off = obj_idx_to_offset(page, obj_idx, class->size);

	area = &__get_cpu_var(zs_map_area);
	if (off + class->size <= PAGE_SIZE)
		kunmap_atomic(area->vm_addr);
	else {
		struct page *pages[2];

		pages[0] = page;
		pages[1] = get_next_page(page);
		BUG_ON(!pages[1]);

		__zs_unmap_object(area, pages, off, class->size);
	}
	put_cpu_var(zs_map_area);
}
EXPORT_SYMBOL_GPL(zs_unmap_object);

u64 zs_get_total_size_bytes(struct zs_pool *pool)
{
	int i;
	u64 npages = 0;

	for (i = 0; i < ZS_SIZE_CLASSES; i++)
		npages += pool->size_class[i].pages_allocated;

	return npages << PAGE_SHIFT;
}
EXPORT_SYMBOL_GPL(zs_get_total_size_bytes);

module_init(zs_init);
module_exit(zs_exit);

MODULE_LICENSE("Dual BSD/GPL");
MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
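
A minimal usage sketch of the API exported above, in the style of a caller such as zram (illustrative only, not part of the file: clen, src and dst_buf are placeholder names, the GFP flags are an example choice, and ZS_MM_WO / ZS_MM_RO are the mapping modes referenced by __zs_map_object() and __zs_unmap_object()):

	struct zs_pool *pool;
	unsigned long handle;
	void *obj;

	pool = zs_create_pool(GFP_NOIO | __GFP_HIGHMEM);
	if (!pool)
		return -ENOMEM;

	/* allocate an object and copy (e.g. compressed) data into it */
	handle = zs_malloc(pool, clen);	/* clen <= ZS_MAX_ALLOC_SIZE */
	if (!handle)
		return -ENOMEM;
	obj = zs_map_object(pool, handle, ZS_MM_WO);
	memcpy(obj, src, clen);
	zs_unmap_object(pool, handle);

	/* ... later, read it back, then release it */
	obj = zs_map_object(pool, handle, ZS_MM_RO);
	memcpy(dst_buf, obj, clen);
	zs_unmap_object(pool, handle);
	zs_free(pool, handle);

	zs_destroy_pool(pool);

Note that zs_map_object() returns with preemption and page faults disabled, so nothing between the map and the unmap may sleep.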