Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'for-5.2' of git://git.kernel.org/pub/scm/linux/kernel/git/dennis/percpu

Pull percpu updates from Dennis Zhou:

 - scan hint update which helps address performance issues with heavily
   fragmented blocks

 - lockdep fix when freeing an allocation causes balance work to be
   scheduled

* 'for-5.2' of git://git.kernel.org/pub/scm/linux/kernel/git/dennis/percpu:
  percpu: remove spurious lock dependency between percpu and sched
  percpu: use chunk scan_hint to skip some scanning
  percpu: convert chunk hints to be based on pcpu_block_md
  percpu: make pcpu_block_md generic
  percpu: use block scan_hint to only scan forward
  percpu: remember largest area skipped during allocation
  percpu: add block level scan_hint
  percpu: set PCPU_BITMAP_BLOCK_SIZE to PAGE_SIZE
  percpu: relegate chunks unusable when failing small allocations
  percpu: manage chunks based on contig_bits instead of free_bytes
  percpu: introduce helper to determine if two regions overlap
  percpu: do not search past bitmap when allocating an area
  percpu: update free path with correct new free region

+407 -180
+3 -9
include/linux/percpu.h
··· 26 26 #define PCPU_MIN_ALLOC_SHIFT 2 27 27 #define PCPU_MIN_ALLOC_SIZE (1 << PCPU_MIN_ALLOC_SHIFT) 28 28 29 - /* number of bits per page, used to trigger a scan if blocks are > PAGE_SIZE */ 30 - #define PCPU_BITS_PER_PAGE (PAGE_SIZE >> PCPU_MIN_ALLOC_SHIFT) 31 - 32 29 /* 33 - * This determines the size of each metadata block. There are several subtle 34 - * constraints around this constant. The reserved region must be a multiple of 35 - * PCPU_BITMAP_BLOCK_SIZE. Additionally, PCPU_BITMAP_BLOCK_SIZE must be a 36 - * multiple of PAGE_SIZE or PAGE_SIZE must be a multiple of 37 - * PCPU_BITMAP_BLOCK_SIZE to align with the populated page map. The unit_size 38 - * also has to be a multiple of PCPU_BITMAP_BLOCK_SIZE to ensure full blocks. 30 + * The PCPU_BITMAP_BLOCK_SIZE must be the same size as PAGE_SIZE as the 31 + * updating of hints is used to manage the nr_empty_pop_pages in both 32 + * the chunk and globally. 39 33 */ 40 34 #define PCPU_BITMAP_BLOCK_SIZE PAGE_SIZE 41 35 #define PCPU_BITMAP_BLOCK_BITS (PCPU_BITMAP_BLOCK_SIZE >> \
+11 -4
mm/percpu-internal.h
··· 9 9 * pcpu_block_md is the metadata block struct. 10 10 * Each chunk's bitmap is split into a number of full blocks. 11 11 * All units are in terms of bits. 12 + * 13 + * The scan hint is the largest known contiguous area before the contig hint. 14 + * It is not necessarily the actual largest contig hint though. There is an 15 + * invariant that the scan_hint_start > contig_hint_start iff 16 + * scan_hint == contig_hint. This is necessary because when scanning forward, 17 + * we don't know if a new contig hint would be better than the current one. 12 18 */ 13 19 struct pcpu_block_md { 20 + int scan_hint; /* scan hint for block */ 21 + int scan_hint_start; /* block relative starting 22 + position of the scan hint */ 14 23 int contig_hint; /* contig hint for block */ 15 24 int contig_hint_start; /* block relative starting 16 25 position of the contig hint */ ··· 28 19 int right_free; /* size of free space along 29 20 the right side of the block */ 30 21 int first_free; /* block position of first free */ 22 + int nr_bits; /* total bits responsible for */ 31 23 }; 32 24 33 25 struct pcpu_chunk { ··· 39 29 40 30 struct list_head list; /* linked to pcpu_slot lists */ 41 31 int free_bytes; /* free bytes in the chunk */ 42 - int contig_bits; /* max contiguous size hint */ 43 - int contig_bits_start; /* contig_bits starting 44 - offset */ 32 + struct pcpu_block_md chunk_md; 45 33 void *base_addr; /* base address of this chunk */ 46 34 47 35 unsigned long *alloc_map; /* allocation map */ ··· 47 39 struct pcpu_block_md *md_blocks; /* metadata blocks */ 48 40 49 41 void *data; /* chunk data */ 50 - int first_bit; /* no free below this */ 51 42 bool immutable; /* no [de]population allowed */ 52 43 int start_offset; /* the overlap with the previous 53 44 region to have a page aligned
+1 -1
mm/percpu-km.c
··· 70 70 chunk->base_addr = page_address(pages); 71 71 72 72 spin_lock_irqsave(&pcpu_lock, flags); 73 - pcpu_chunk_populated(chunk, 0, nr_pages, false); 73 + pcpu_chunk_populated(chunk, 0, nr_pages); 74 74 spin_unlock_irqrestore(&pcpu_lock, flags); 75 75 76 76 pcpu_stats_chunk_alloc();
+3 -2
mm/percpu-stats.c
··· 53 53 static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk, 54 54 int *buffer) 55 55 { 56 + struct pcpu_block_md *chunk_md = &chunk->chunk_md; 56 57 int i, last_alloc, as_len, start, end; 57 58 int *alloc_sizes, *p; 58 59 /* statistics */ ··· 122 121 P("nr_alloc", chunk->nr_alloc); 123 122 P("max_alloc_size", chunk->max_alloc_size); 124 123 P("empty_pop_pages", chunk->nr_empty_pop_pages); 125 - P("first_bit", chunk->first_bit); 124 + P("first_bit", chunk_md->first_free); 126 125 P("free_bytes", chunk->free_bytes); 127 - P("contig_bytes", chunk->contig_bits * PCPU_MIN_ALLOC_SIZE); 126 + P("contig_bytes", chunk_md->contig_hint * PCPU_MIN_ALLOC_SIZE); 128 127 P("sum_frag", sum_frag); 129 128 P("max_frag", max_frag); 130 129 P("cur_min_alloc", cur_min_alloc);
+389 -164
mm/percpu.c
··· 94 94 95 95 /* the slots are sorted by free bytes left, 1-31 bytes share the same slot */ 96 96 #define PCPU_SLOT_BASE_SHIFT 5 97 + /* chunks in slots below this are subject to being sidelined on failed alloc */ 98 + #define PCPU_SLOT_FAIL_THRESHOLD 3 97 99 98 100 #define PCPU_EMPTY_POP_PAGES_LOW 2 99 101 #define PCPU_EMPTY_POP_PAGES_HIGH 4 ··· 233 231 234 232 static int pcpu_chunk_slot(const struct pcpu_chunk *chunk) 235 233 { 236 - if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE || chunk->contig_bits == 0) 234 + const struct pcpu_block_md *chunk_md = &chunk->chunk_md; 235 + 236 + if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE || 237 + chunk_md->contig_hint == 0) 237 238 return 0; 238 239 239 - return pcpu_size_to_slot(chunk->free_bytes); 240 + return pcpu_size_to_slot(chunk_md->contig_hint * PCPU_MIN_ALLOC_SIZE); 240 241 } 241 242 242 243 /* set the pointer to a chunk in a page struct */ ··· 321 316 static unsigned long pcpu_block_off_to_off(int index, int off) 322 317 { 323 318 return index * PCPU_BITMAP_BLOCK_BITS + off; 319 + } 320 + 321 + /* 322 + * pcpu_next_hint - determine which hint to use 323 + * @block: block of interest 324 + * @alloc_bits: size of allocation 325 + * 326 + * This determines if we should scan based on the scan_hint or first_free. 327 + * In general, we want to scan from first_free to fulfill allocations by 328 + * first fit. However, if we know a scan_hint at position scan_hint_start 329 + * cannot fulfill an allocation, we can begin scanning from there knowing 330 + * the contig_hint will be our fallback. 331 + */ 332 + static int pcpu_next_hint(struct pcpu_block_md *block, int alloc_bits) 333 + { 334 + /* 335 + * The three conditions below determine if we can skip past the 336 + * scan_hint. First, does the scan hint exist. Second, is the 337 + * contig_hint after the scan_hint (possibly not true iff 338 + * contig_hint == scan_hint). Third, is the allocation request 339 + * larger than the scan_hint. 
340 + */ 341 + if (block->scan_hint && 342 + block->contig_hint_start > block->scan_hint_start && 343 + alloc_bits > block->scan_hint) 344 + return block->scan_hint_start + block->scan_hint; 345 + 346 + return block->first_free; 324 347 } 325 348 326 349 /** ··· 446 413 if (block->contig_hint && 447 414 block->contig_hint_start >= block_off && 448 415 block->contig_hint >= *bits + alloc_bits) { 416 + int start = pcpu_next_hint(block, alloc_bits); 417 + 449 418 *bits += alloc_bits + block->contig_hint_start - 450 - block->first_free; 451 - *bit_off = pcpu_block_off_to_off(i, block->first_free); 419 + start; 420 + *bit_off = pcpu_block_off_to_off(i, start); 452 421 return; 453 422 } 454 423 /* reset to satisfy the second predicate above */ ··· 523 488 kvfree(ptr); 524 489 } 525 490 491 + static void __pcpu_chunk_move(struct pcpu_chunk *chunk, int slot, 492 + bool move_front) 493 + { 494 + if (chunk != pcpu_reserved_chunk) { 495 + if (move_front) 496 + list_move(&chunk->list, &pcpu_slot[slot]); 497 + else 498 + list_move_tail(&chunk->list, &pcpu_slot[slot]); 499 + } 500 + } 501 + 502 + static void pcpu_chunk_move(struct pcpu_chunk *chunk, int slot) 503 + { 504 + __pcpu_chunk_move(chunk, slot, true); 505 + } 506 + 526 507 /** 527 508 * pcpu_chunk_relocate - put chunk in the appropriate chunk slot 528 509 * @chunk: chunk of interest ··· 556 505 { 557 506 int nslot = pcpu_chunk_slot(chunk); 558 507 559 - if (chunk != pcpu_reserved_chunk && oslot != nslot) { 560 - if (oslot < nslot) 561 - list_move(&chunk->list, &pcpu_slot[nslot]); 562 - else 563 - list_move_tail(&chunk->list, &pcpu_slot[nslot]); 564 - } 508 + if (oslot != nslot) 509 + __pcpu_chunk_move(chunk, nslot, oslot < nslot); 565 510 } 566 511 567 - /** 568 - * pcpu_cnt_pop_pages- counts populated backing pages in range 512 + /* 513 + * pcpu_update_empty_pages - update empty page counters 569 514 * @chunk: chunk of interest 570 - * @bit_off: start offset 571 - * @bits: size of area to check 515 + * @nr: nr of empty 
pages 572 516 * 573 - * Calculates the number of populated pages in the region 574 - * [page_start, page_end). This keeps track of how many empty populated 575 - * pages are available and decide if async work should be scheduled. 576 - * 577 - * RETURNS: 578 - * The nr of populated pages. 517 + * This is used to keep track of the empty pages now based on the premise 518 + * a md_block covers a page. The hint update functions recognize if a block 519 + * is made full or broken to calculate deltas for keeping track of free pages. 579 520 */ 580 - static inline int pcpu_cnt_pop_pages(struct pcpu_chunk *chunk, int bit_off, 581 - int bits) 521 + static inline void pcpu_update_empty_pages(struct pcpu_chunk *chunk, int nr) 582 522 { 583 - int page_start = PFN_UP(bit_off * PCPU_MIN_ALLOC_SIZE); 584 - int page_end = PFN_DOWN((bit_off + bits) * PCPU_MIN_ALLOC_SIZE); 585 - 586 - if (page_start >= page_end) 587 - return 0; 588 - 589 - /* 590 - * bitmap_weight counts the number of bits set in a bitmap up to 591 - * the specified number of bits. This is counting the populated 592 - * pages up to page_end and then subtracting the populated pages 593 - * up to page_start to count the populated pages in 594 - * [page_start, page_end). 595 - */ 596 - return bitmap_weight(chunk->populated, page_end) - 597 - bitmap_weight(chunk->populated, page_start); 598 - } 599 - 600 - /** 601 - * pcpu_chunk_update - updates the chunk metadata given a free area 602 - * @chunk: chunk of interest 603 - * @bit_off: chunk offset 604 - * @bits: size of free area 605 - * 606 - * This updates the chunk's contig hint and starting offset given a free area. 607 - * Choose the best starting offset if the contig hint is equal. 
608 - */ 609 - static void pcpu_chunk_update(struct pcpu_chunk *chunk, int bit_off, int bits) 610 - { 611 - if (bits > chunk->contig_bits) { 612 - chunk->contig_bits_start = bit_off; 613 - chunk->contig_bits = bits; 614 - } else if (bits == chunk->contig_bits && chunk->contig_bits_start && 615 - (!bit_off || 616 - __ffs(bit_off) > __ffs(chunk->contig_bits_start))) { 617 - /* use the start with the best alignment */ 618 - chunk->contig_bits_start = bit_off; 619 - } 620 - } 621 - 622 - /** 623 - * pcpu_chunk_refresh_hint - updates metadata about a chunk 624 - * @chunk: chunk of interest 625 - * 626 - * Iterates over the metadata blocks to find the largest contig area. 627 - * It also counts the populated pages and uses the delta to update the 628 - * global count. 629 - * 630 - * Updates: 631 - * chunk->contig_bits 632 - * chunk->contig_bits_start 633 - * nr_empty_pop_pages (chunk and global) 634 - */ 635 - static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk) 636 - { 637 - int bit_off, bits, nr_empty_pop_pages; 638 - 639 - /* clear metadata */ 640 - chunk->contig_bits = 0; 641 - 642 - bit_off = chunk->first_bit; 643 - bits = nr_empty_pop_pages = 0; 644 - pcpu_for_each_md_free_region(chunk, bit_off, bits) { 645 - pcpu_chunk_update(chunk, bit_off, bits); 646 - 647 - nr_empty_pop_pages += pcpu_cnt_pop_pages(chunk, bit_off, bits); 648 - } 649 - 650 - /* 651 - * Keep track of nr_empty_pop_pages. 652 - * 653 - * The chunk maintains the previous number of free pages it held, 654 - * so the delta is used to update the global counter. The reserved 655 - * chunk is not part of the free page count as they are populated 656 - * at init and are special to serving reserved allocations. 
657 - */ 523 + chunk->nr_empty_pop_pages += nr; 658 524 if (chunk != pcpu_reserved_chunk) 659 - pcpu_nr_empty_pop_pages += 660 - (nr_empty_pop_pages - chunk->nr_empty_pop_pages); 525 + pcpu_nr_empty_pop_pages += nr; 526 + } 661 527 662 - chunk->nr_empty_pop_pages = nr_empty_pop_pages; 528 + /* 529 + * pcpu_region_overlap - determines if two regions overlap 530 + * @a: start of first region, inclusive 531 + * @b: end of first region, exclusive 532 + * @x: start of second region, inclusive 533 + * @y: end of second region, exclusive 534 + * 535 + * This is used to determine if the hint region [a, b) overlaps with the 536 + * allocated region [x, y). 537 + */ 538 + static inline bool pcpu_region_overlap(int a, int b, int x, int y) 539 + { 540 + return (a < y) && (x < b); 663 541 } 664 542 665 543 /** ··· 609 629 if (start == 0) 610 630 block->left_free = contig; 611 631 612 - if (end == PCPU_BITMAP_BLOCK_BITS) 632 + if (end == block->nr_bits) 613 633 block->right_free = contig; 614 634 615 635 if (contig > block->contig_hint) { 636 + /* promote the old contig_hint to be the new scan_hint */ 637 + if (start > block->contig_hint_start) { 638 + if (block->contig_hint > block->scan_hint) { 639 + block->scan_hint_start = 640 + block->contig_hint_start; 641 + block->scan_hint = block->contig_hint; 642 + } else if (start < block->scan_hint_start) { 643 + /* 644 + * The old contig_hint == scan_hint. But, the 645 + * new contig is larger so hold the invariant 646 + * scan_hint_start < contig_hint_start. 
647 + */ 648 + block->scan_hint = 0; 649 + } 650 + } else { 651 + block->scan_hint = 0; 652 + } 616 653 block->contig_hint_start = start; 617 654 block->contig_hint = contig; 618 - } else if (block->contig_hint_start && contig == block->contig_hint && 619 - (!start || __ffs(start) > __ffs(block->contig_hint_start))) { 620 - /* use the start with the best alignment */ 621 - block->contig_hint_start = start; 655 + } else if (contig == block->contig_hint) { 656 + if (block->contig_hint_start && 657 + (!start || 658 + __ffs(start) > __ffs(block->contig_hint_start))) { 659 + /* start has a better alignment so use it */ 660 + block->contig_hint_start = start; 661 + if (start < block->scan_hint_start && 662 + block->contig_hint > block->scan_hint) 663 + block->scan_hint = 0; 664 + } else if (start > block->scan_hint_start || 665 + block->contig_hint > block->scan_hint) { 666 + /* 667 + * Knowing contig == contig_hint, update the scan_hint 668 + * if it is farther than or larger than the current 669 + * scan_hint. 670 + */ 671 + block->scan_hint_start = start; 672 + block->scan_hint = contig; 673 + } 674 + } else { 675 + /* 676 + * The region is smaller than the contig_hint. So only update 677 + * the scan_hint if it is larger than or equal and farther than 678 + * the current scan_hint. 679 + */ 680 + if ((start < block->contig_hint_start && 681 + (contig > block->scan_hint || 682 + (contig == block->scan_hint && 683 + start > block->scan_hint_start)))) { 684 + block->scan_hint_start = start; 685 + block->scan_hint = contig; 686 + } 687 + } 688 + } 689 + 690 + /* 691 + * pcpu_block_update_scan - update a block given a free area from a scan 692 + * @chunk: chunk of interest 693 + * @bit_off: chunk offset 694 + * @bits: size of free area 695 + * 696 + * Finding the final allocation spot first goes through pcpu_find_block_fit() 697 + * to find a block that can hold the allocation and then pcpu_alloc_area() 698 + * where a scan is used. 
When allocations require specific alignments, 699 + * we can inadvertently create holes which will not be seen in the alloc 700 + * or free paths. 701 + * 702 + * This takes a given free area hole and updates a block as it may change the 703 + * scan_hint. We need to scan backwards to ensure we don't miss free bits 704 + * from alignment. 705 + */ 706 + static void pcpu_block_update_scan(struct pcpu_chunk *chunk, int bit_off, 707 + int bits) 708 + { 709 + int s_off = pcpu_off_to_block_off(bit_off); 710 + int e_off = s_off + bits; 711 + int s_index, l_bit; 712 + struct pcpu_block_md *block; 713 + 714 + if (e_off > PCPU_BITMAP_BLOCK_BITS) 715 + return; 716 + 717 + s_index = pcpu_off_to_block_index(bit_off); 718 + block = chunk->md_blocks + s_index; 719 + 720 + /* scan backwards in case of alignment skipping free bits */ 721 + l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index), s_off); 722 + s_off = (s_off == l_bit) ? 0 : l_bit + 1; 723 + 724 + pcpu_block_update(block, s_off, e_off); 725 + } 726 + 727 + /** 728 + * pcpu_chunk_refresh_hint - updates metadata about a chunk 729 + * @chunk: chunk of interest 730 + * @full_scan: if we should scan from the beginning 731 + * 732 + * Iterates over the metadata blocks to find the largest contig area. 733 + * A full scan can be avoided on the allocation path as this is triggered 734 + * if we broke the contig_hint. In doing so, the scan_hint will be before 735 + * the contig_hint or after if the scan_hint == contig_hint. This cannot 736 + * be prevented on freeing as we want to find the largest area possibly 737 + * spanning blocks. 
738 + */ 739 + static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk, bool full_scan) 740 + { 741 + struct pcpu_block_md *chunk_md = &chunk->chunk_md; 742 + int bit_off, bits; 743 + 744 + /* promote scan_hint to contig_hint */ 745 + if (!full_scan && chunk_md->scan_hint) { 746 + bit_off = chunk_md->scan_hint_start + chunk_md->scan_hint; 747 + chunk_md->contig_hint_start = chunk_md->scan_hint_start; 748 + chunk_md->contig_hint = chunk_md->scan_hint; 749 + chunk_md->scan_hint = 0; 750 + } else { 751 + bit_off = chunk_md->first_free; 752 + chunk_md->contig_hint = 0; 753 + } 754 + 755 + bits = 0; 756 + pcpu_for_each_md_free_region(chunk, bit_off, bits) { 757 + pcpu_block_update(chunk_md, bit_off, bit_off + bits); 622 758 } 623 759 } 624 760 ··· 750 654 { 751 655 struct pcpu_block_md *block = chunk->md_blocks + index; 752 656 unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index); 753 - int rs, re; /* region start, region end */ 657 + int rs, re, start; /* region start, region end */ 754 658 755 - /* clear hints */ 756 - block->contig_hint = 0; 757 - block->left_free = block->right_free = 0; 659 + /* promote scan_hint to contig_hint */ 660 + if (block->scan_hint) { 661 + start = block->scan_hint_start + block->scan_hint; 662 + block->contig_hint_start = block->scan_hint_start; 663 + block->contig_hint = block->scan_hint; 664 + block->scan_hint = 0; 665 + } else { 666 + start = block->first_free; 667 + block->contig_hint = 0; 668 + } 669 + 670 + block->right_free = 0; 758 671 759 672 /* iterate over free areas and update the contig hints */ 760 - pcpu_for_each_unpop_region(alloc_map, rs, re, block->first_free, 673 + pcpu_for_each_unpop_region(alloc_map, rs, re, start, 761 674 PCPU_BITMAP_BLOCK_BITS) { 762 675 pcpu_block_update(block, rs, re); 763 676 } ··· 785 680 static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off, 786 681 int bits) 787 682 { 683 + struct pcpu_block_md *chunk_md = &chunk->chunk_md; 684 + int nr_empty_pages = 0; 
788 685 struct pcpu_block_md *s_block, *e_block, *block; 789 686 int s_index, e_index; /* block indexes of the freed allocation */ 790 687 int s_off, e_off; /* block offsets of the freed allocation */ ··· 811 704 * If the allocation breaks the contig_hint, a scan is required to 812 705 * restore this hint. 813 706 */ 707 + if (s_block->contig_hint == PCPU_BITMAP_BLOCK_BITS) 708 + nr_empty_pages++; 709 + 814 710 if (s_off == s_block->first_free) 815 711 s_block->first_free = find_next_zero_bit( 816 712 pcpu_index_alloc_map(chunk, s_index), 817 713 PCPU_BITMAP_BLOCK_BITS, 818 714 s_off + bits); 819 715 820 - if (s_off >= s_block->contig_hint_start && 821 - s_off < s_block->contig_hint_start + s_block->contig_hint) { 716 + if (pcpu_region_overlap(s_block->scan_hint_start, 717 + s_block->scan_hint_start + s_block->scan_hint, 718 + s_off, 719 + s_off + bits)) 720 + s_block->scan_hint = 0; 721 + 722 + if (pcpu_region_overlap(s_block->contig_hint_start, 723 + s_block->contig_hint_start + 724 + s_block->contig_hint, 725 + s_off, 726 + s_off + bits)) { 822 727 /* block contig hint is broken - scan to fix it */ 728 + if (!s_off) 729 + s_block->left_free = 0; 823 730 pcpu_block_refresh_hint(chunk, s_index); 824 731 } else { 825 732 /* update left and right contig manually */ ··· 849 728 * Update e_block. 850 729 */ 851 730 if (s_index != e_index) { 731 + if (e_block->contig_hint == PCPU_BITMAP_BLOCK_BITS) 732 + nr_empty_pages++; 733 + 852 734 /* 853 735 * When the allocation is across blocks, the end is along 854 736 * the left part of the e_block. 
··· 864 740 /* reset the block */ 865 741 e_block++; 866 742 } else { 743 + if (e_off > e_block->scan_hint_start) 744 + e_block->scan_hint = 0; 745 + 746 + e_block->left_free = 0; 867 747 if (e_off > e_block->contig_hint_start) { 868 748 /* contig hint is broken - scan to fix it */ 869 749 pcpu_block_refresh_hint(chunk, e_index); 870 750 } else { 871 - e_block->left_free = 0; 872 751 e_block->right_free = 873 752 min_t(int, e_block->right_free, 874 753 PCPU_BITMAP_BLOCK_BITS - e_off); ··· 879 752 } 880 753 881 754 /* update in-between md_blocks */ 755 + nr_empty_pages += (e_index - s_index - 1); 882 756 for (block = s_block + 1; block < e_block; block++) { 757 + block->scan_hint = 0; 883 758 block->contig_hint = 0; 884 759 block->left_free = 0; 885 760 block->right_free = 0; 886 761 } 887 762 } 888 763 764 + if (nr_empty_pages) 765 + pcpu_update_empty_pages(chunk, -nr_empty_pages); 766 + 767 + if (pcpu_region_overlap(chunk_md->scan_hint_start, 768 + chunk_md->scan_hint_start + 769 + chunk_md->scan_hint, 770 + bit_off, 771 + bit_off + bits)) 772 + chunk_md->scan_hint = 0; 773 + 889 774 /* 890 775 * The only time a full chunk scan is required is if the chunk 891 776 * contig hint is broken. Otherwise, it means a smaller space 892 777 * was used and therefore the chunk contig hint is still correct. 893 778 */ 894 - if (bit_off >= chunk->contig_bits_start && 895 - bit_off < chunk->contig_bits_start + chunk->contig_bits) 896 - pcpu_chunk_refresh_hint(chunk); 779 + if (pcpu_region_overlap(chunk_md->contig_hint_start, 780 + chunk_md->contig_hint_start + 781 + chunk_md->contig_hint, 782 + bit_off, 783 + bit_off + bits)) 784 + pcpu_chunk_refresh_hint(chunk, false); 897 785 } 898 786 899 787 /** ··· 924 782 * 925 783 * A chunk update is triggered if a page becomes free, a block becomes free, 926 784 * or the free spans across blocks. This tradeoff is to minimize iterating 927 - * over the block metadata to update chunk->contig_bits. 
chunk->contig_bits 928 - * may be off by up to a page, but it will never be more than the available 929 - * space. If the contig hint is contained in one block, it will be accurate. 785 + * over the block metadata to update chunk_md->contig_hint. 786 + * chunk_md->contig_hint may be off by up to a page, but it will never be more 787 + * than the available space. If the contig hint is contained in one block, it 788 + * will be accurate. 930 789 */ 931 790 static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off, 932 791 int bits) 933 792 { 793 + int nr_empty_pages = 0; 934 794 struct pcpu_block_md *s_block, *e_block, *block; 935 795 int s_index, e_index; /* block indexes of the freed allocation */ 936 796 int s_off, e_off; /* block offsets of the freed allocation */ ··· 986 842 987 843 /* update s_block */ 988 844 e_off = (s_index == e_index) ? end : PCPU_BITMAP_BLOCK_BITS; 845 + if (!start && e_off == PCPU_BITMAP_BLOCK_BITS) 846 + nr_empty_pages++; 989 847 pcpu_block_update(s_block, start, e_off); 990 848 991 849 /* freeing in the same block */ 992 850 if (s_index != e_index) { 993 851 /* update e_block */ 852 + if (end == PCPU_BITMAP_BLOCK_BITS) 853 + nr_empty_pages++; 994 854 pcpu_block_update(e_block, 0, end); 995 855 996 856 /* reset md_blocks in the middle */ 857 + nr_empty_pages += (e_index - s_index - 1); 997 858 for (block = s_block + 1; block < e_block; block++) { 998 859 block->first_free = 0; 860 + block->scan_hint = 0; 999 861 block->contig_hint_start = 0; 1000 862 block->contig_hint = PCPU_BITMAP_BLOCK_BITS; 1001 863 block->left_free = PCPU_BITMAP_BLOCK_BITS; ··· 1009 859 } 1010 860 } 1011 861 862 + if (nr_empty_pages) 863 + pcpu_update_empty_pages(chunk, nr_empty_pages); 864 + 1012 865 /* 1013 - * Refresh chunk metadata when the free makes a page free, a block 1014 - * free, or spans across blocks. 
The contig hint may be off by up to 1015 - * a page, but if the hint is contained in a block, it will be accurate 1016 - * with the else condition below. 866 + * Refresh chunk metadata when the free makes a block free or spans 867 + * across blocks. The contig_hint may be off by up to a page, but if 868 + * the contig_hint is contained in a block, it will be accurate with 869 + * the else condition below. 1017 870 */ 1018 - if ((ALIGN_DOWN(end, min(PCPU_BITS_PER_PAGE, PCPU_BITMAP_BLOCK_BITS)) > 1019 - ALIGN(start, min(PCPU_BITS_PER_PAGE, PCPU_BITMAP_BLOCK_BITS))) || 1020 - s_index != e_index) 1021 - pcpu_chunk_refresh_hint(chunk); 871 + if (((end - start) >= PCPU_BITMAP_BLOCK_BITS) || s_index != e_index) 872 + pcpu_chunk_refresh_hint(chunk, true); 1022 873 else 1023 - pcpu_chunk_update(chunk, pcpu_block_off_to_off(s_index, start), 1024 - s_block->contig_hint); 874 + pcpu_block_update(&chunk->chunk_md, 875 + pcpu_block_off_to_off(s_index, start), 876 + end); 1025 877 } 1026 878 1027 879 /** ··· 1078 926 static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits, 1079 927 size_t align, bool pop_only) 1080 928 { 929 + struct pcpu_block_md *chunk_md = &chunk->chunk_md; 1081 930 int bit_off, bits, next_off; 1082 931 1083 932 /* ··· 1087 934 * cannot fit in the global hint, there is memory pressure and creating 1088 935 * a new chunk would happen soon. 
1089 936 */ 1090 - bit_off = ALIGN(chunk->contig_bits_start, align) - 1091 - chunk->contig_bits_start; 1092 - if (bit_off + alloc_bits > chunk->contig_bits) 937 + bit_off = ALIGN(chunk_md->contig_hint_start, align) - 938 + chunk_md->contig_hint_start; 939 + if (bit_off + alloc_bits > chunk_md->contig_hint) 1093 940 return -1; 1094 941 1095 - bit_off = chunk->first_bit; 942 + bit_off = pcpu_next_hint(chunk_md, alloc_bits); 1096 943 bits = 0; 1097 944 pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) { 1098 945 if (!pop_only || pcpu_is_populated(chunk, bit_off, bits, ··· 1107 954 return -1; 1108 955 1109 956 return bit_off; 957 + } 958 + 959 + /* 960 + * pcpu_find_zero_area - modified from bitmap_find_next_zero_area_off() 961 + * @map: the address to base the search on 962 + * @size: the bitmap size in bits 963 + * @start: the bitnumber to start searching at 964 + * @nr: the number of zeroed bits we're looking for 965 + * @align_mask: alignment mask for zero area 966 + * @largest_off: offset of the largest area skipped 967 + * @largest_bits: size of the largest area skipped 968 + * 969 + * The @align_mask should be one less than a power of 2. 970 + * 971 + * This is a modified version of bitmap_find_next_zero_area_off() to remember 972 + * the largest area that was skipped. This is imperfect, but in general is 973 + * good enough. The largest remembered region is the largest failed region 974 + * seen. This does not include anything we possibly skipped due to alignment. 975 + * pcpu_block_update_scan() does scan backwards to try and recover what was 976 + * lost to alignment. While this can cause scanning to miss earlier possible 977 + * free areas, smaller allocations will eventually fill those holes. 
978 + */ 979 + static unsigned long pcpu_find_zero_area(unsigned long *map, 980 + unsigned long size, 981 + unsigned long start, 982 + unsigned long nr, 983 + unsigned long align_mask, 984 + unsigned long *largest_off, 985 + unsigned long *largest_bits) 986 + { 987 + unsigned long index, end, i, area_off, area_bits; 988 + again: 989 + index = find_next_zero_bit(map, size, start); 990 + 991 + /* Align allocation */ 992 + index = __ALIGN_MASK(index, align_mask); 993 + area_off = index; 994 + 995 + end = index + nr; 996 + if (end > size) 997 + return end; 998 + i = find_next_bit(map, end, index); 999 + if (i < end) { 1000 + area_bits = i - area_off; 1001 + /* remember largest unused area with best alignment */ 1002 + if (area_bits > *largest_bits || 1003 + (area_bits == *largest_bits && *largest_off && 1004 + (!area_off || __ffs(area_off) > __ffs(*largest_off)))) { 1005 + *largest_off = area_off; 1006 + *largest_bits = area_bits; 1007 + } 1008 + 1009 + start = i + 1; 1010 + goto again; 1011 + } 1012 + return index; 1110 1013 } 1111 1014 1112 1015 /** ··· 1187 978 static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits, 1188 979 size_t align, int start) 1189 980 { 981 + struct pcpu_block_md *chunk_md = &chunk->chunk_md; 1190 982 size_t align_mask = (align) ? (align - 1) : 0; 983 + unsigned long area_off = 0, area_bits = 0; 1191 984 int bit_off, end, oslot; 1192 985 1193 986 lockdep_assert_held(&pcpu_lock); ··· 1199 988 /* 1200 989 * Search to find a fit. 
1201 990 */ 1202 - end = start + alloc_bits + PCPU_BITMAP_BLOCK_BITS; 1203 - bit_off = bitmap_find_next_zero_area(chunk->alloc_map, end, start, 1204 - alloc_bits, align_mask); 991 + end = min_t(int, start + alloc_bits + PCPU_BITMAP_BLOCK_BITS, 992 + pcpu_chunk_map_bits(chunk)); 993 + bit_off = pcpu_find_zero_area(chunk->alloc_map, end, start, alloc_bits, 994 + align_mask, &area_off, &area_bits); 1205 995 if (bit_off >= end) 1206 996 return -1; 997 + 998 + if (area_bits) 999 + pcpu_block_update_scan(chunk, area_off, area_bits); 1207 1000 1208 1001 /* update alloc map */ 1209 1002 bitmap_set(chunk->alloc_map, bit_off, alloc_bits); ··· 1220 1005 chunk->free_bytes -= alloc_bits * PCPU_MIN_ALLOC_SIZE; 1221 1006 1222 1007 /* update first free bit */ 1223 - if (bit_off == chunk->first_bit) 1224 - chunk->first_bit = find_next_zero_bit( 1008 + if (bit_off == chunk_md->first_free) 1009 + chunk_md->first_free = find_next_zero_bit( 1225 1010 chunk->alloc_map, 1226 1011 pcpu_chunk_map_bits(chunk), 1227 1012 bit_off + alloc_bits); ··· 1243 1028 */ 1244 1029 static void pcpu_free_area(struct pcpu_chunk *chunk, int off) 1245 1030 { 1031 + struct pcpu_block_md *chunk_md = &chunk->chunk_md; 1246 1032 int bit_off, bits, end, oslot; 1247 1033 1248 1034 lockdep_assert_held(&pcpu_lock); ··· 1263 1047 chunk->free_bytes += bits * PCPU_MIN_ALLOC_SIZE; 1264 1048 1265 1049 /* update first free bit */ 1266 - chunk->first_bit = min(chunk->first_bit, bit_off); 1050 + chunk_md->first_free = min(chunk_md->first_free, bit_off); 1267 1051 1268 1052 pcpu_block_update_hint_free(chunk, bit_off, bits); 1269 1053 1270 1054 pcpu_chunk_relocate(chunk, oslot); 1271 1055 } 1272 1056 1057 + static void pcpu_init_md_block(struct pcpu_block_md *block, int nr_bits) 1058 + { 1059 + block->scan_hint = 0; 1060 + block->contig_hint = nr_bits; 1061 + block->left_free = nr_bits; 1062 + block->right_free = nr_bits; 1063 + block->first_free = 0; 1064 + block->nr_bits = nr_bits; 1065 + } 1066 + 1273 1067 static void 
pcpu_init_md_blocks(struct pcpu_chunk *chunk) 1274 1068 { 1275 1069 struct pcpu_block_md *md_block; 1276 1070 1071 + /* init the chunk's block */ 1072 + pcpu_init_md_block(&chunk->chunk_md, pcpu_chunk_map_bits(chunk)); 1073 + 1277 1074 for (md_block = chunk->md_blocks; 1278 1075 md_block != chunk->md_blocks + pcpu_chunk_nr_blocks(chunk); 1279 - md_block++) { 1280 - md_block->contig_hint = PCPU_BITMAP_BLOCK_BITS; 1281 - md_block->left_free = PCPU_BITMAP_BLOCK_BITS; 1282 - md_block->right_free = PCPU_BITMAP_BLOCK_BITS; 1283 - } 1076 + md_block++) 1077 + pcpu_init_md_block(md_block, PCPU_BITMAP_BLOCK_BITS); 1284 1078 } 1285 1079 1286 1080 /** ··· 1369 1143 chunk->immutable = true; 1370 1144 bitmap_fill(chunk->populated, chunk->nr_pages); 1371 1145 chunk->nr_populated = chunk->nr_pages; 1372 - chunk->nr_empty_pop_pages = 1373 - pcpu_cnt_pop_pages(chunk, start_offset / PCPU_MIN_ALLOC_SIZE, 1374 - map_size / PCPU_MIN_ALLOC_SIZE); 1146 + chunk->nr_empty_pop_pages = chunk->nr_pages; 1375 1147 1376 - chunk->contig_bits = map_size / PCPU_MIN_ALLOC_SIZE; 1377 1148 chunk->free_bytes = map_size; 1378 1149 1379 1150 if (chunk->start_offset) { ··· 1380 1157 set_bit(0, chunk->bound_map); 1381 1158 set_bit(offset_bits, chunk->bound_map); 1382 1159 1383 - chunk->first_bit = offset_bits; 1160 + chunk->chunk_md.first_free = offset_bits; 1384 1161 1385 1162 pcpu_block_update_hint_alloc(chunk, 0, offset_bits); 1386 1163 } ··· 1433 1210 pcpu_init_md_blocks(chunk); 1434 1211 1435 1212 /* init metadata */ 1436 - chunk->contig_bits = region_bits; 1437 1213 chunk->free_bytes = chunk->nr_pages * PAGE_SIZE; 1438 1214 1439 1215 return chunk; ··· 1462 1240 * @chunk: pcpu_chunk which got populated 1463 1241 * @page_start: the start page 1464 1242 * @page_end: the end page 1465 - * @for_alloc: if this is to populate for allocation 1466 1243 * 1467 1244 * Pages in [@page_start,@page_end) have been populated to @chunk. Update 1468 1245 * the bookkeeping information accordingly. 
Must be called after each ··· 1471 1250 * is to serve an allocation in that area. 1472 1251 */ 1473 1252 static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start, 1474 - int page_end, bool for_alloc) 1253 + int page_end) 1475 1254 { 1476 1255 int nr = page_end - page_start; 1477 1256 ··· 1481 1260 chunk->nr_populated += nr; 1482 1261 pcpu_nr_populated += nr; 1483 1262 1484 - if (!for_alloc) { 1485 - chunk->nr_empty_pop_pages += nr; 1486 - pcpu_nr_empty_pop_pages += nr; 1487 - } 1263 + pcpu_update_empty_pages(chunk, nr); 1488 1264 } 1489 1265 1490 1266 /** ··· 1503 1285 1504 1286 bitmap_clear(chunk->populated, page_start, nr); 1505 1287 chunk->nr_populated -= nr; 1506 - chunk->nr_empty_pop_pages -= nr; 1507 - pcpu_nr_empty_pop_pages -= nr; 1508 1288 pcpu_nr_populated -= nr; 1289 + 1290 + pcpu_update_empty_pages(chunk, -nr); 1509 1291 } 1510 1292 1511 1293 /* ··· 1592 1374 bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL; 1593 1375 bool do_warn = !(gfp & __GFP_NOWARN); 1594 1376 static int warn_limit = 10; 1595 - struct pcpu_chunk *chunk; 1377 + struct pcpu_chunk *chunk, *next; 1596 1378 const char *err; 1597 1379 int slot, off, cpu, ret; 1598 1380 unsigned long flags; ··· 1654 1436 restart: 1655 1437 /* search through normal chunks */ 1656 1438 for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) { 1657 - list_for_each_entry(chunk, &pcpu_slot[slot], list) { 1439 + list_for_each_entry_safe(chunk, next, &pcpu_slot[slot], list) { 1658 1440 off = pcpu_find_block_fit(chunk, bits, bit_align, 1659 1441 is_atomic); 1660 - if (off < 0) 1442 + if (off < 0) { 1443 + if (slot < PCPU_SLOT_FAIL_THRESHOLD) 1444 + pcpu_chunk_move(chunk, 0); 1661 1445 continue; 1446 + } 1662 1447 1663 1448 off = pcpu_alloc_area(chunk, bits, bit_align, off); 1664 1449 if (off >= 0) ··· 1720 1499 err = "failed to populate"; 1721 1500 goto fail_unlock; 1722 1501 } 1723 - pcpu_chunk_populated(chunk, rs, re, true); 1502 + pcpu_chunk_populated(chunk, rs, re); 1724 1503 
spin_unlock_irqrestore(&pcpu_lock, flags); 1725 1504 } 1726 1505 ··· 1919 1698 if (!ret) { 1920 1699 nr_to_pop -= nr; 1921 1700 spin_lock_irq(&pcpu_lock); 1922 - pcpu_chunk_populated(chunk, rs, rs + nr, false); 1701 + pcpu_chunk_populated(chunk, rs, rs + nr); 1923 1702 spin_unlock_irq(&pcpu_lock); 1924 1703 } else { 1925 1704 nr_to_pop = 0; ··· 1959 1738 struct pcpu_chunk *chunk; 1960 1739 unsigned long flags; 1961 1740 int off; 1741 + bool need_balance = false; 1962 1742 1963 1743 if (!ptr) 1964 1744 return; ··· 1981 1759 1982 1760 list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list) 1983 1761 if (pos != chunk) { 1984 - pcpu_schedule_balance_work(); 1762 + need_balance = true; 1985 1763 break; 1986 1764 } 1987 1765 } ··· 1989 1767 trace_percpu_free_percpu(chunk->base_addr, off, ptr); 1990 1768 1991 1769 spin_unlock_irqrestore(&pcpu_lock, flags); 1770 + 1771 + if (need_balance) 1772 + pcpu_schedule_balance_work(); 1992 1773 } 1993 1774 EXPORT_SYMBOL_GPL(free_percpu); 1994 1775