Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

memblock: Reimplement memblock allocation using reverse free area iterator

Now that all early memory information is in memblock when enabled, we
can implement a reverse free area iterator and use it to implement a
NUMA-aware allocator, which is then wrapped for the simpler variants,
instead of the confusing and inefficient mending of information in a
separate NUMA-aware allocator.

Implement for_each_free_mem_range_reverse() and use it to reimplement
memblock_find_in_range_node(), which in turn is used by all allocators.

The visible allocator interface is inconsistent and can probably use
some cleanup too.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Yinghai Lu <yinghai@kernel.org>

Tejun Heo 7bd0b0f0 0ee332c1

+150 -149
+22 -2
include/linux/memblock.h
··· 46 46 #define memblock_dbg(fmt, ...) \ 47 47 if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) 48 48 49 + phys_addr_t memblock_find_in_range_node(phys_addr_t start, phys_addr_t end, 50 + phys_addr_t size, phys_addr_t align, int nid); 49 51 phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end, 50 52 phys_addr_t size, phys_addr_t align); 51 53 int memblock_free_reserved_regions(void); ··· 100 98 i != (u64)ULLONG_MAX; \ 101 99 __next_free_mem_range(&i, nid, p_start, p_end, p_nid)) 102 100 101 + void __next_free_mem_range_rev(u64 *idx, int nid, phys_addr_t *out_start, 102 + phys_addr_t *out_end, int *out_nid); 103 + 104 + /** 105 + * for_each_free_mem_range_reverse - rev-iterate through free memblock areas 106 + * @i: u64 used as loop variable 107 + * @nid: node selector, %MAX_NUMNODES for all nodes 108 + * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL 109 + * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL 110 + * @p_nid: ptr to int for nid of the range, can be %NULL 111 + * 112 + * Walks over free (memory && !reserved) areas of memblock in reverse 113 + * order. Available as soon as memblock is initialized. 
114 + */ 115 + #define for_each_free_mem_range_reverse(i, nid, p_start, p_end, p_nid) \ 116 + for (i = (u64)ULLONG_MAX, \ 117 + __next_free_mem_range_rev(&i, nid, p_start, p_end, p_nid); \ 118 + i != (u64)ULLONG_MAX; \ 119 + __next_free_mem_range_rev(&i, nid, p_start, p_end, p_nid)) 120 + 103 121 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 104 122 int memblock_set_node(phys_addr_t base, phys_addr_t size, int nid); 105 123 ··· 143 121 } 144 122 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 145 123 146 - phys_addr_t memblock_find_in_range_node(phys_addr_t start, phys_addr_t end, 147 - phys_addr_t size, phys_addr_t align, int nid); 148 124 phys_addr_t memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid); 149 125 phys_addr_t memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid); 150 126
+128 -147
mm/memblock.c
··· 79 79 return (i < type->cnt) ? i : -1; 80 80 } 81 81 82 - /* 83 - * Find, allocate, deallocate or reserve unreserved regions. All allocations 84 - * are top-down. 82 + /** 83 + * memblock_find_in_range_node - find free area in given range and node 84 + * @start: start of candidate range 85 + * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} 86 + * @size: size of free area to find 87 + * @align: alignment of free area to find 88 + * @nid: nid of the free area to find, %MAX_NUMNODES for any node 89 + * 90 + * Find @size free area aligned to @align in the specified range and node. 91 + * 92 + * RETURNS: 93 + * Found address on success, %0 on failure. 85 94 */ 86 - 87 - static phys_addr_t __init_memblock memblock_find_region(phys_addr_t start, phys_addr_t end, 88 - phys_addr_t size, phys_addr_t align) 95 + phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, 96 + phys_addr_t end, phys_addr_t size, 97 + phys_addr_t align, int nid) 89 98 { 90 - phys_addr_t base, res_base; 91 - long j; 99 + phys_addr_t this_start, this_end, cand; 100 + u64 i; 92 101 93 - /* In case, huge size is requested */ 94 - if (end < size) 95 - return 0; 102 + /* align @size to avoid excessive fragmentation on reserved array */ 103 + size = round_up(size, align); 96 104 97 - base = round_down(end - size, align); 98 - 99 - /* Prevent allocations returning 0 as it's also used to 100 - * indicate an allocation failure 101 - */ 102 - if (start == 0) 103 - start = PAGE_SIZE; 104 - 105 - while (start <= base) { 106 - j = memblock_overlaps_region(&memblock.reserved, base, size); 107 - if (j < 0) 108 - return base; 109 - res_base = memblock.reserved.regions[j].base; 110 - if (res_base < size) 111 - break; 112 - base = round_down(res_base - size, align); 113 - } 114 - 115 - return 0; 116 - } 117 - 118 - /* 119 - * Find a free area with specified alignment in a specific range. 
120 - */ 121 - phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, phys_addr_t end, 122 - phys_addr_t size, phys_addr_t align) 123 - { 124 - long i; 125 - 126 - BUG_ON(0 == size); 127 - 128 - /* Pump up max_addr */ 105 + /* pump up @end */ 129 106 if (end == MEMBLOCK_ALLOC_ACCESSIBLE) 130 107 end = memblock.current_limit; 131 108 132 - /* We do a top-down search, this tends to limit memory 133 - * fragmentation by keeping early boot allocs near the 134 - * top of memory 135 - */ 136 - for (i = memblock.memory.cnt - 1; i >= 0; i--) { 137 - phys_addr_t memblockbase = memblock.memory.regions[i].base; 138 - phys_addr_t memblocksize = memblock.memory.regions[i].size; 139 - phys_addr_t bottom, top, found; 109 + /* adjust @start to avoid underflow and allocating the first page */ 110 + start = max3(start, size, (phys_addr_t)PAGE_SIZE); 111 + end = max(start, end); 140 112 141 - if (memblocksize < size) 142 - continue; 143 - if ((memblockbase + memblocksize) <= start) 144 - break; 145 - bottom = max(memblockbase, start); 146 - top = min(memblockbase + memblocksize, end); 147 - if (bottom >= top) 148 - continue; 149 - found = memblock_find_region(bottom, top, size, align); 150 - if (found) 151 - return found; 113 + for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) { 114 + this_start = clamp(this_start, start, end); 115 + this_end = clamp(this_end, start, end); 116 + 117 + cand = round_down(this_end - size, align); 118 + if (cand >= this_start) 119 + return cand; 152 120 } 153 121 return 0; 122 + } 123 + 124 + /** 125 + * memblock_find_in_range - find free area in given range 126 + * @start: start of candidate range 127 + * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} 128 + * @size: size of free area to find 129 + * @align: alignment of free area to find 130 + * 131 + * Find @size free area aligned to @align in the specified range. 132 + * 133 + * RETURNS: 134 + * Found address on success, %0 on failure. 
135 + */ 136 + phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, 137 + phys_addr_t end, phys_addr_t size, 138 + phys_addr_t align) 139 + { 140 + return memblock_find_in_range_node(start, end, size, align, 141 + MAX_NUMNODES); 154 142 } 155 143 156 144 /* ··· 595 607 *idx = ULLONG_MAX; 596 608 } 597 609 610 + /** 611 + * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse() 612 + * @idx: pointer to u64 loop variable 613 + * @nid: nid: node selector, %MAX_NUMNODES for all nodes 614 + * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL 615 + * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL 616 + * @p_nid: ptr to int for nid of the range, can be %NULL 617 + * 618 + * Reverse of __next_free_mem_range(). 619 + */ 620 + void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, 621 + phys_addr_t *out_start, 622 + phys_addr_t *out_end, int *out_nid) 623 + { 624 + struct memblock_type *mem = &memblock.memory; 625 + struct memblock_type *rsv = &memblock.reserved; 626 + int mi = *idx & 0xffffffff; 627 + int ri = *idx >> 32; 628 + 629 + if (*idx == (u64)ULLONG_MAX) { 630 + mi = mem->cnt - 1; 631 + ri = rsv->cnt; 632 + } 633 + 634 + for ( ; mi >= 0; mi--) { 635 + struct memblock_region *m = &mem->regions[mi]; 636 + phys_addr_t m_start = m->base; 637 + phys_addr_t m_end = m->base + m->size; 638 + 639 + /* only memory regions are associated with nodes, check it */ 640 + if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m)) 641 + continue; 642 + 643 + /* scan areas before each reservation for intersection */ 644 + for ( ; ri >= 0; ri--) { 645 + struct memblock_region *r = &rsv->regions[ri]; 646 + phys_addr_t r_start = ri ? r[-1].base + r[-1].size : 0; 647 + phys_addr_t r_end = ri < rsv->cnt ? 
r->base : ULLONG_MAX; 648 + 649 + /* if ri advanced past mi, break out to advance mi */ 650 + if (r_end <= m_start) 651 + break; 652 + /* if the two regions intersect, we're done */ 653 + if (m_end > r_start) { 654 + if (out_start) 655 + *out_start = max(m_start, r_start); 656 + if (out_end) 657 + *out_end = min(m_end, r_end); 658 + if (out_nid) 659 + *out_nid = memblock_get_region_node(m); 660 + 661 + if (m_start >= r_start) 662 + mi--; 663 + else 664 + ri--; 665 + *idx = (u32)mi | (u64)ri << 32; 666 + return; 667 + } 668 + } 669 + } 670 + 671 + *idx = ULLONG_MAX; 672 + } 673 + 598 674 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 599 675 /* 600 676 * Common iterator interface used to define for_each_mem_range(). ··· 722 670 } 723 671 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 724 672 725 - phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) 673 + static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, 674 + phys_addr_t align, phys_addr_t max_addr, 675 + int nid) 726 676 { 727 677 phys_addr_t found; 728 678 729 - /* We align the size to limit fragmentation. 
Without this, a lot of 730 - * small allocs quickly eat up the whole reserve array on sparc 731 - */ 732 - size = round_up(size, align); 733 - 734 - found = memblock_find_in_range(0, max_addr, size, align); 679 + found = memblock_find_in_range_node(0, max_addr, size, align, nid); 735 680 if (found && !memblock_reserve(found, size)) 736 681 return found; 737 682 738 683 return 0; 684 + } 685 + 686 + phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid) 687 + { 688 + return memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, nid); 689 + } 690 + 691 + phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) 692 + { 693 + return memblock_alloc_base_nid(size, align, max_addr, MAX_NUMNODES); 739 694 } 740 695 741 696 phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) ··· 761 702 phys_addr_t __init memblock_alloc(phys_addr_t size, phys_addr_t align) 762 703 { 763 704 return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); 764 - } 765 - 766 - 767 - /* 768 - * Additional node-local top-down allocators. 769 - * 770 - * WARNING: Only available after early_node_map[] has been populated, 771 - * on some architectures, that is after all the calls to add_active_range() 772 - * have been done to populate it. 
773 - */ 774 - 775 - static phys_addr_t __init memblock_nid_range_rev(phys_addr_t start, 776 - phys_addr_t end, int *nid) 777 - { 778 - #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 779 - unsigned long start_pfn, end_pfn; 780 - int i; 781 - 782 - for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, nid) 783 - if (end > PFN_PHYS(start_pfn) && end <= PFN_PHYS(end_pfn)) 784 - return max(start, PFN_PHYS(start_pfn)); 785 - #endif 786 - *nid = 0; 787 - return start; 788 - } 789 - 790 - phys_addr_t __init memblock_find_in_range_node(phys_addr_t start, 791 - phys_addr_t end, 792 - phys_addr_t size, 793 - phys_addr_t align, int nid) 794 - { 795 - struct memblock_type *mem = &memblock.memory; 796 - int i; 797 - 798 - BUG_ON(0 == size); 799 - 800 - /* Pump up max_addr */ 801 - if (end == MEMBLOCK_ALLOC_ACCESSIBLE) 802 - end = memblock.current_limit; 803 - 804 - for (i = mem->cnt - 1; i >= 0; i--) { 805 - struct memblock_region *r = &mem->regions[i]; 806 - phys_addr_t base = max(start, r->base); 807 - phys_addr_t top = min(end, r->base + r->size); 808 - 809 - while (base < top) { 810 - phys_addr_t tbase, ret; 811 - int tnid; 812 - 813 - tbase = memblock_nid_range_rev(base, top, &tnid); 814 - if (nid == MAX_NUMNODES || tnid == nid) { 815 - ret = memblock_find_region(tbase, top, size, align); 816 - if (ret) 817 - return ret; 818 - } 819 - top = tbase; 820 - } 821 - } 822 - 823 - return 0; 824 - } 825 - 826 - phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid) 827 - { 828 - phys_addr_t found; 829 - 830 - /* 831 - * We align the size to limit fragmentation. 
Without this, a lot of 832 - * small allocs quickly eat up the whole reserve array on sparc 833 - */ 834 - size = round_up(size, align); 835 - 836 - found = memblock_find_in_range_node(0, MEMBLOCK_ALLOC_ACCESSIBLE, 837 - size, align, nid); 838 - if (found && !memblock_reserve(found, size)) 839 - return found; 840 - 841 - return 0; 842 705 } 843 706 844 707 phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid)