Merge branch 'akpm' (patches from Andrew)

Merge yet more updates from Andrew Morton:
"A few final bits:

- large changes to vmalloc, yielding large performance benefits

- tweak the console-flush-on-panic code

- a few fixes"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
panic: add an option to replay all the printk message in buffer
initramfs: don't free a non-existent initrd
fs/writeback.c: use rcu_barrier() to wait for inflight wb switches going into workqueue when umount
mm/compaction.c: correct zone boundary handling when isolating pages from a pageblock
mm/vmap: add DEBUG_AUGMENT_LOWEST_MATCH_CHECK macro
mm/vmap: add DEBUG_AUGMENT_PROPAGATE_CHECK macro
mm/vmalloc.c: keep track of free blocks for vmap allocation

+892 -260
+1
Documentation/admin-guide/kernel-parameters.txt
···
 			bit 2: print timer info
 			bit 3: print locks info if CONFIG_LOCKDEP is on
 			bit 4: print ftrace buffer
+			bit 5: print all printk messages in buffer

 	panic_on_warn	panic() instead of WARN().  Useful to cause kdump
 			on a WARN().
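As a usage example (not part of the patch itself): booting with panic_print=0x20 selects only the new replay behaviour, since bit 5 corresponds to the value 0x20, while panic_print=0x3f sets all six documented bits (0-5) at once.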
+1 -1
arch/powerpc/kernel/traps.c
···
 	kmsg_dump(KMSG_DUMP_PANIC);
 	bust_spinlocks(0);
 	debug_locks_off();
-	console_flush_on_panic();
+	console_flush_on_panic(CONSOLE_FLUSH_PENDING);
 }

 static unsigned long oops_begin(struct pt_regs *regs)
+8 -3
fs/fs-writeback.c
···
 
 	isw->inode = inode;
 
-	atomic_inc(&isw_nr_in_flight);
-
 	/*
 	 * In addition to synchronizing among switchers, I_WB_SWITCH tells
 	 * the RCU protected stat update paths to grab the i_page
···
 	 * Let's continue after I_WB_SWITCH is guaranteed to be visible.
 	 */
 	call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
+
+	atomic_inc(&isw_nr_in_flight);
+
 	goto out_unlock;
 
 out_free:
···
 void cgroup_writeback_umount(void)
 {
 	if (atomic_read(&isw_nr_in_flight)) {
-		synchronize_rcu();
+		/*
+		 * Use rcu_barrier() to wait for all pending callbacks to
+		 * ensure that all in-flight wb switches are in the workqueue.
+		 */
+		rcu_barrier();
 		flush_workqueue(isw_wq);
 	}
 }
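The switch-over is handed to the workqueue from inside an RCU callback: call_rcu() queues inode_switch_wbs_rcu_fn(), and only once that callback runs is the work item put on isw_wq. synchronize_rcu() merely waits for a grace period, not for the callbacks themselves, so a switch could still be sitting in RCU's callback list when the workqueue is flushed; rcu_barrier() waits for every previously queued call_rcu() callback to finish, which is exactly what the new comment describes. A minimal kernel-style sketch of that hand-off pattern, with made-up names (not the actual fs-writeback.c code), assuming deferred_wq was created elsewhere with alloc_workqueue():

    #include <linux/kernel.h>
    #include <linux/rcupdate.h>
    #include <linux/workqueue.h>
    #include <linux/slab.h>

    struct deferred_item {
    	struct rcu_head rcu_head;
    	struct work_struct work;
    };

    static struct workqueue_struct *deferred_wq;

    static void deferred_workfn(struct work_struct *work)
    {
    	kfree(container_of(work, struct deferred_item, work));
    }

    static void deferred_rcu_fn(struct rcu_head *head)
    {
    	struct deferred_item *item =
    		container_of(head, struct deferred_item, rcu_head);

    	/* Runs only after a grace period; the work is queued here. */
    	INIT_WORK(&item->work, deferred_workfn);
    	queue_work(deferred_wq, &item->work);
    }

    static void submit_item(struct deferred_item *item)
    {
    	call_rcu(&item->rcu_head, deferred_rcu_fn);
    }

    static void drain_all_items(void)
    {
    	/*
    	 * synchronize_rcu() only waits for a grace period, not for the
    	 * queued callbacks, so an item could still be sitting in RCU's
    	 * callback list when the workqueue is flushed.  rcu_barrier()
    	 * waits until every pending call_rcu() callback has run, so the
    	 * flush_workqueue() below then sees all of them.
    	 */
    	rcu_barrier();
    	flush_workqueue(deferred_wq);
    }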
+6 -1
include/linux/console.h
···
 extern int console_set_on_cmdline;
 extern struct console *early_console;
 
+enum con_flush_mode {
+	CONSOLE_FLUSH_PENDING,
+	CONSOLE_REPLAY_ALL,
+};
+
 extern int add_preferred_console(char *name, int idx, char *options);
 extern void register_console(struct console *);
 extern int unregister_console(struct console *);
···
 extern void console_unlock(void);
 extern void console_conditional_schedule(void);
 extern void console_unblank(void);
-extern void console_flush_on_panic(void);
+extern void console_flush_on_panic(enum con_flush_mode mode);
 extern struct tty_driver *console_device(int *);
 extern void console_stop(struct console *);
 extern void console_start(struct console *);
+5 -1
include/linux/vmalloc.h
···
 struct vmap_area {
 	unsigned long va_start;
 	unsigned long va_end;
+
+	/*
+	 * Largest available free size in subtree.
+	 */
+	unsigned long subtree_max_size;
 	unsigned long flags;
 	struct rb_node rb_node;         /* address sorted rbtree */
 	struct list_head list;          /* address sorted list */
 	struct llist_node purge_list;   /* "lazy purge" list */
 	struct vm_struct *vm;
-	struct rcu_head rcu_head;
 };
 
 /*
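The new subtree_max_size field augments the free-space rb-tree introduced in mm/vmalloc.c below: each node caches the size of the largest free block anywhere in its subtree, so the allocator can descend towards the lowest suitable address while skipping subtrees that cannot possibly satisfy the request, instead of walking the address-sorted list of areas as the old code did. (The rcu_head member is dropped because vmap_area objects are now freed directly back to a dedicated kmem_cache rather than via kfree_rcu().) A deliberately simplified, userspace-only sketch of the idea follows - plain binary nodes, no red-black balancing, no alignment or vstart handling - just to show how the cached maximum is maintained and used; all names here are hypothetical:

    #include <stddef.h>
    #include <stdio.h>

    struct node {
    	unsigned long start, end;       /* free range [start, end) */
    	unsigned long subtree_max_size; /* largest range size in subtree */
    	struct node *left, *right;
    };

    static unsigned long node_size(const struct node *n)
    {
    	return n->end - n->start;
    }

    static unsigned long subtree_max(const struct node *n)
    {
    	return n ? n->subtree_max_size : 0;
    }

    /* Recompute the cached value from a node and its two children. */
    static void update_node(struct node *n)
    {
    	unsigned long m = node_size(n);

    	if (subtree_max(n->left) > m)
    		m = subtree_max(n->left);
    	if (subtree_max(n->right) > m)
    		m = subtree_max(n->right);
    	n->subtree_max_size = m;
    }

    /* Lowest-address free range that can hold 'size', or NULL. */
    static struct node *lowest_fit(struct node *n, unsigned long size)
    {
    	while (n) {
    		if (subtree_max(n->left) >= size)
    			n = n->left;            /* something lower fits */
    		else if (node_size(n) >= size)
    			return n;               /* this node fits */
    		else
    			n = n->right;           /* only higher ranges can fit */
    	}
    	return NULL;
    }

    int main(void)
    {
    	struct node a = { 0, 4, 0, NULL, NULL };
    	struct node c = { 16, 64, 0, NULL, NULL };
    	struct node b = { 8, 12, 0, &a, &c };   /* root, keyed by address */

    	update_node(&a);
    	update_node(&c);
    	update_node(&b);

    	printf("fit for 16 starts at %lu\n", lowest_fit(&b, 16)->start);
    	return 0;
    }

In the real code the cached value is kept up to date by the rb-tree augment callbacks (RB_DECLARE_CALLBACKS) plus augment_tree_propagate_from(), and the search is find_vmap_lowest_match(), which additionally honours the vstart lower bound and alignment.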
+1 -1
init/initramfs.c
···
 	 * If the initrd region is overlapped with crashkernel reserved region,
 	 * free only memory that is not part of crashkernel region.
 	 */
-	if (!do_retain_initrd && !kexec_free_initrd())
+	if (!do_retain_initrd && initrd_start && !kexec_free_initrd())
 		free_initrd_mem(initrd_start, initrd_end);
 	initrd_start = 0;
 	initrd_end = 0;
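For instance (a hypothetical scenario, not spelled out in the hunk): on a kernel booted without an initrd, initrd_start stays 0, so the old condition could still reach free_initrd_mem(0, 0) whenever kexec_free_initrd() returned false; the added initrd_start test simply skips the call when there is nothing to free.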
+5 -1
kernel/panic.c
···
 #define PANIC_PRINT_TIMER_INFO		0x00000004
 #define PANIC_PRINT_LOCK_INFO		0x00000008
 #define PANIC_PRINT_FTRACE_INFO	0x00000010
+#define PANIC_PRINT_ALL_PRINTK_MSG	0x00000020
 unsigned long panic_print;
 
 ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
···
 
 static void panic_print_sys_info(void)
 {
+	if (panic_print & PANIC_PRINT_ALL_PRINTK_MSG)
+		console_flush_on_panic(CONSOLE_REPLAY_ALL);
+
 	if (panic_print & PANIC_PRINT_TASK_INFO)
 		show_state();
 
···
 	 * panic() is not being callled from OOPS.
 	 */
 	debug_locks_off();
-	console_flush_on_panic();
+	console_flush_on_panic(CONSOLE_FLUSH_PENDING);
 
 	panic_print_sys_info();
 
+11 -1
kernel/printk/printk.c
···
 
 /**
  * console_flush_on_panic - flush console content on panic
+ * @mode: flush all messages in buffer or just the pending ones
  *
  * Immediately output all pending messages no matter what.
  */
-void console_flush_on_panic(void)
+void console_flush_on_panic(enum con_flush_mode mode)
 {
 	/*
 	 * If someone else is holding the console lock, trylock will fail
···
 	 */
 	console_trylock();
 	console_may_schedule = 0;
+
+	if (mode == CONSOLE_REPLAY_ALL) {
+		unsigned long flags;
+
+		logbuf_lock_irqsave(flags);
+		console_seq = log_first_seq;
+		console_idx = log_first_idx;
+		logbuf_unlock_irqrestore(flags);
+	}
 	console_unlock();
 }
+2 -2
mm/compaction.c
···
 
 	/* Pageblock boundaries */
 	start_pfn = pageblock_start_pfn(pfn);
-	end_pfn = min(start_pfn + pageblock_nr_pages, zone_end_pfn(cc->zone));
+	end_pfn = min(pageblock_end_pfn(pfn), zone_end_pfn(cc->zone)) - 1;
 
 	/* Scan before */
 	if (start_pfn != pfn) {
···
 
 	/* Scan after */
 	start_pfn = pfn + nr_isolated;
-	if (start_pfn != end_pfn)
+	if (start_pfn < end_pfn)
 		isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false);
 
 	/* Skip this pageblock in the future as it's full or nearly full */
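A worked example with hypothetical numbers: with pageblock_nr_pages = 512 and a zone whose zone_end_pfn() is 1000, a target pfn of 900 gives start_pfn = 512. The old calculation produced end_pfn = min(512 + 512, 1000) = 1000, which is one past the last pfn of the zone (zone_end_pfn() is exclusive), and the follow-up scan ran whenever start_pfn merely differed from end_pfn. The new calculation gives end_pfn = min(1024, 1000) - 1 = 999, the last pfn that lies inside both the pageblock and the zone, and the follow-up scan only runs while start_pfn < end_pfn, so it can no longer be handed a range that has already run past the boundary.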
+852 -249
mm/vmalloc.c
··· 32 #include <linux/compiler.h> 33 #include <linux/llist.h> 34 #include <linux/bitops.h> 35 36 #include <linux/uaccess.h> 37 #include <asm/tlbflush.h> ··· 325 326 /*** Global kva allocator ***/ 327 328 #define VM_LAZY_FREE 0x02 329 #define VM_VM_AREA 0x04 330 ··· 336 LIST_HEAD(vmap_area_list); 337 static LLIST_HEAD(vmap_purge_list); 338 static struct rb_root vmap_area_root = RB_ROOT; 339 340 - /* The vmap cache globals are protected by vmap_area_lock */ 341 - static struct rb_node *free_vmap_cache; 342 - static unsigned long cached_hole_size; 343 - static unsigned long cached_vstart; 344 - static unsigned long cached_align; 345 346 - static unsigned long vmap_area_pcpu_hole; 347 348 static struct vmap_area *__find_vmap_area(unsigned long addr) 349 { ··· 417 return NULL; 418 } 419 420 - static void __insert_vmap_area(struct vmap_area *va) 421 { 422 - struct rb_node **p = &vmap_area_root.rb_node; 423 - struct rb_node *parent = NULL; 424 - struct rb_node *tmp; 425 426 - while (*p) { 427 - struct vmap_area *tmp_va; 428 - 429 - parent = *p; 430 - tmp_va = rb_entry(parent, struct vmap_area, rb_node); 431 - if (va->va_start < tmp_va->va_end) 432 - p = &(*p)->rb_left; 433 - else if (va->va_end > tmp_va->va_start) 434 - p = &(*p)->rb_right; 435 - else 436 - BUG(); 437 } 438 439 - rb_link_node(&va->rb_node, parent, p); 440 - rb_insert_color(&va->rb_node, &vmap_area_root); 441 442 - /* address-sort this list */ 443 - tmp = rb_prev(&va->rb_node); 444 - if (tmp) { 445 - struct vmap_area *prev; 446 - prev = rb_entry(tmp, struct vmap_area, rb_node); 447 - list_add_rcu(&va->list, &prev->list); 448 - } else 449 - list_add_rcu(&va->list, &vmap_area_list); 450 } 451 452 - static void purge_vmap_area_lazy(void); 453 454 - static BLOCKING_NOTIFIER_HEAD(vmap_notify_list); 455 456 /* 457 * Allocate a region of KVA of the specified size and alignment, within the ··· 1032 int node, gfp_t gfp_mask) 1033 { 1034 struct vmap_area *va; 1035 - struct rb_node *n; 1036 unsigned long addr; 1037 int purged = 0; 1038 - struct vmap_area *first; 1039 1040 BUG_ON(!size); 1041 BUG_ON(offset_in_page(size)); 1042 BUG_ON(!is_power_of_2(align)); 1043 1044 might_sleep(); 1045 1046 - va = kmalloc_node(sizeof(struct vmap_area), 1047 gfp_mask & GFP_RECLAIM_MASK, node); 1048 if (unlikely(!va)) 1049 return ERR_PTR(-ENOMEM); ··· 1057 1058 retry: 1059 spin_lock(&vmap_area_lock); 1060 /* 1061 - * Invalidate cache if we have more permissive parameters. 1062 - * cached_hole_size notes the largest hole noticed _below_ 1063 - * the vmap_area cached in free_vmap_cache: if size fits 1064 - * into that hole, we want to scan from vstart to reuse 1065 - * the hole instead of allocating above free_vmap_cache. 1066 - * Note that __free_vmap_area may update free_vmap_cache 1067 - * without updating cached_hole_size or cached_align. 
1068 */ 1069 - if (!free_vmap_cache || 1070 - size < cached_hole_size || 1071 - vstart < cached_vstart || 1072 - align < cached_align) { 1073 - nocache: 1074 - cached_hole_size = 0; 1075 - free_vmap_cache = NULL; 1076 - } 1077 - /* record if we encounter less permissive parameters */ 1078 - cached_vstart = vstart; 1079 - cached_align = align; 1080 - 1081 - /* find starting point for our search */ 1082 - if (free_vmap_cache) { 1083 - first = rb_entry(free_vmap_cache, struct vmap_area, rb_node); 1084 - addr = ALIGN(first->va_end, align); 1085 - if (addr < vstart) 1086 - goto nocache; 1087 - if (addr + size < addr) 1088 - goto overflow; 1089 - 1090 - } else { 1091 - addr = ALIGN(vstart, align); 1092 - if (addr + size < addr) 1093 - goto overflow; 1094 - 1095 - n = vmap_area_root.rb_node; 1096 - first = NULL; 1097 - 1098 - while (n) { 1099 - struct vmap_area *tmp; 1100 - tmp = rb_entry(n, struct vmap_area, rb_node); 1101 - if (tmp->va_end >= addr) { 1102 - first = tmp; 1103 - if (tmp->va_start <= addr) 1104 - break; 1105 - n = n->rb_left; 1106 - } else 1107 - n = n->rb_right; 1108 - } 1109 - 1110 - if (!first) 1111 - goto found; 1112 - } 1113 - 1114 - /* from the starting point, walk areas until a suitable hole is found */ 1115 - while (addr + size > first->va_start && addr + size <= vend) { 1116 - if (addr + cached_hole_size < first->va_start) 1117 - cached_hole_size = first->va_start - addr; 1118 - addr = ALIGN(first->va_end, align); 1119 - if (addr + size < addr) 1120 - goto overflow; 1121 - 1122 - if (list_is_last(&first->list, &vmap_area_list)) 1123 - goto found; 1124 - 1125 - first = list_next_entry(first, list); 1126 - } 1127 - 1128 - found: 1129 - /* 1130 - * Check also calculated address against the vstart, 1131 - * because it can be 0 because of big align request. 1132 - */ 1133 - if (addr + size > vend || addr < vstart) 1134 goto overflow; 1135 1136 va->va_start = addr; 1137 va->va_end = addr + size; 1138 va->flags = 0; 1139 - __insert_vmap_area(va); 1140 - free_vmap_cache = &va->rb_node; 1141 spin_unlock(&vmap_area_lock); 1142 1143 BUG_ON(!IS_ALIGNED(va->va_start, align)); ··· 1099 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) 1100 pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n", 1101 size); 1102 - kfree(va); 1103 return ERR_PTR(-EBUSY); 1104 } 1105 ··· 1120 { 1121 BUG_ON(RB_EMPTY_NODE(&va->rb_node)); 1122 1123 - if (free_vmap_cache) { 1124 - if (va->va_end < cached_vstart) { 1125 - free_vmap_cache = NULL; 1126 - } else { 1127 - struct vmap_area *cache; 1128 - cache = rb_entry(free_vmap_cache, struct vmap_area, rb_node); 1129 - if (va->va_start <= cache->va_start) { 1130 - free_vmap_cache = rb_prev(&va->rb_node); 1131 - /* 1132 - * We don't try to update cached_hole_size or 1133 - * cached_align, but it won't go very wrong. 1134 - */ 1135 - } 1136 - } 1137 - } 1138 - rb_erase(&va->rb_node, &vmap_area_root); 1139 - RB_CLEAR_NODE(&va->rb_node); 1140 - list_del_rcu(&va->list); 1141 1142 /* 1143 - * Track the highest possible candidate for pcpu area 1144 - * allocation. Areas outside of vmalloc area can be returned 1145 - * here too, consider only end addresses which fall inside 1146 - * vmalloc area proper. 
1147 */ 1148 - if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END) 1149 - vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end); 1150 - 1151 - kfree_rcu(va, rcu_head); 1152 } 1153 1154 /* ··· 1335 VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16)) 1336 1337 #define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) 1338 - 1339 - static bool vmap_initialized __read_mostly = false; 1340 1341 struct vmap_block_queue { 1342 spinlock_t lock; ··· 1796 vm_area_add_early(vm); 1797 } 1798 1799 void __init vmalloc_init(void) 1800 { 1801 struct vmap_area *va; 1802 struct vm_struct *tmp; 1803 int i; 1804 1805 for_each_possible_cpu(i) { 1806 struct vmap_block_queue *vbq; ··· 1862 1863 /* Import existing vmlist entries. */ 1864 for (tmp = vmlist; tmp; tmp = tmp->next) { 1865 - va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT); 1866 va->flags = VM_VM_AREA; 1867 va->va_start = (unsigned long)tmp->addr; 1868 va->va_end = va->va_start + tmp->size; 1869 va->vm = tmp; 1870 - __insert_vmap_area(va); 1871 } 1872 1873 - vmap_area_pcpu_hole = VMALLOC_END; 1874 - 1875 vmap_initialized = true; 1876 } 1877 ··· 3068 } 3069 3070 /** 3071 - * pvm_find_next_prev - find the next and prev vmap_area surrounding @end 3072 - * @end: target address 3073 - * @pnext: out arg for the next vmap_area 3074 - * @pprev: out arg for the previous vmap_area 3075 * 3076 - * Returns: %true if either or both of next and prev are found, 3077 - * %false if no vmap_area exists 3078 - * 3079 - * Find vmap_areas end addresses of which enclose @end. ie. if not 3080 - * NULL, *pnext->va_end > @end and *pprev->va_end <= @end. 3081 */ 3082 - static bool pvm_find_next_prev(unsigned long end, 3083 - struct vmap_area **pnext, 3084 - struct vmap_area **pprev) 3085 { 3086 - struct rb_node *n = vmap_area_root.rb_node; 3087 - struct vmap_area *va = NULL; 3088 3089 while (n) { 3090 - va = rb_entry(n, struct vmap_area, rb_node); 3091 - if (end < va->va_end) 3092 - n = n->rb_left; 3093 - else if (end > va->va_end) 3094 n = n->rb_right; 3095 - else 3096 - break; 3097 } 3098 3099 - if (!va) 3100 - return false; 3101 - 3102 - if (va->va_end > end) { 3103 - *pnext = va; 3104 - *pprev = node_to_va(rb_prev(&(*pnext)->rb_node)); 3105 - } else { 3106 - *pprev = va; 3107 - *pnext = node_to_va(rb_next(&(*pprev)->rb_node)); 3108 - } 3109 - return true; 3110 } 3111 3112 /** 3113 - * pvm_determine_end - find the highest aligned address between two vmap_areas 3114 - * @pnext: in/out arg for the next vmap_area 3115 - * @pprev: in/out arg for the previous vmap_area 3116 - * @align: alignment 3117 * 3118 - * Returns: determined end address 3119 - * 3120 - * Find the highest aligned address between *@pnext and *@pprev below 3121 - * VMALLOC_END. *@pnext and *@pprev are adjusted so that the aligned 3122 - * down address is between the end addresses of the two vmap_areas. 3123 - * 3124 - * Please note that the address returned by this function may fall 3125 - * inside *@pnext vmap_area. The caller is responsible for checking 3126 - * that. 
3127 */ 3128 - static unsigned long pvm_determine_end(struct vmap_area **pnext, 3129 - struct vmap_area **pprev, 3130 - unsigned long align) 3131 { 3132 - const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); 3133 unsigned long addr; 3134 3135 - if (*pnext) 3136 - addr = min((*pnext)->va_start & ~(align - 1), vmalloc_end); 3137 - else 3138 - addr = vmalloc_end; 3139 - 3140 - while (*pprev && (*pprev)->va_end > addr) { 3141 - *pnext = *pprev; 3142 - *pprev = node_to_va(rb_prev(&(*pnext)->rb_node)); 3143 } 3144 3145 - return addr; 3146 } 3147 3148 /** ··· 3145 * to gigabytes. To avoid interacting with regular vmallocs, these 3146 * areas are allocated from top. 3147 * 3148 - * Despite its complicated look, this allocator is rather simple. It 3149 - * does everything top-down and scans areas from the end looking for 3150 - * matching slot. While scanning, if any of the areas overlaps with 3151 - * existing vmap_area, the base address is pulled down to fit the 3152 - * area. Scanning is repeated till all the areas fit and then all 3153 - * necessary data structures are inserted and the result is returned. 3154 */ 3155 struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, 3156 const size_t *sizes, int nr_vms, ··· 3158 { 3159 const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); 3160 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); 3161 - struct vmap_area **vas, *prev, *next; 3162 struct vm_struct **vms; 3163 int area, area2, last_area, term_area; 3164 - unsigned long base, start, end, last_end; 3165 bool purged = false; 3166 3167 /* verify parameters and allocate data structures */ 3168 BUG_ON(offset_in_page(align) || !is_power_of_2(align)); ··· 3199 goto err_free2; 3200 3201 for (area = 0; area < nr_vms; area++) { 3202 - vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL); 3203 vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL); 3204 if (!vas[area] || !vms[area]) 3205 goto err_free; ··· 3212 start = offsets[area]; 3213 end = start + sizes[area]; 3214 3215 - if (!pvm_find_next_prev(vmap_area_pcpu_hole, &next, &prev)) { 3216 - base = vmalloc_end - last_end; 3217 - goto found; 3218 - } 3219 - base = pvm_determine_end(&next, &prev, align) - end; 3220 3221 while (true) { 3222 - BUG_ON(next && next->va_end <= base + end); 3223 - BUG_ON(prev && prev->va_end > base + end); 3224 - 3225 /* 3226 * base might have underflowed, add last_end before 3227 * comparing. 3228 */ 3229 - if (base + last_end < vmalloc_start + last_end) { 3230 - spin_unlock(&vmap_area_lock); 3231 - if (!purged) { 3232 - purge_vmap_area_lazy(); 3233 - purged = true; 3234 - goto retry; 3235 - } 3236 - goto err_free; 3237 - } 3238 3239 /* 3240 - * If next overlaps, move base downwards so that it's 3241 - * right below next and then recheck. 3242 */ 3243 - if (next && next->va_start < base + end) { 3244 - base = pvm_determine_end(&next, &prev, align) - end; 3245 - term_area = area; 3246 - continue; 3247 - } 3248 3249 /* 3250 - * If prev overlaps, shift down next and prev and move 3251 - * base so that it's right below new next and then 3252 - * recheck. 
3253 */ 3254 - if (prev && prev->va_end > base + start) { 3255 - next = prev; 3256 - prev = node_to_va(rb_prev(&next->rb_node)); 3257 - base = pvm_determine_end(&next, &prev, align) - end; 3258 term_area = area; 3259 continue; 3260 } ··· 3246 area = (area + nr_vms - 1) % nr_vms; 3247 if (area == term_area) 3248 break; 3249 start = offsets[area]; 3250 end = start + sizes[area]; 3251 - pvm_find_next_prev(base + end, &next, &prev); 3252 } 3253 - found: 3254 /* we've found a fitting base, insert all va's */ 3255 for (area = 0; area < nr_vms; area++) { 3256 - struct vmap_area *va = vas[area]; 3257 3258 - va->va_start = base + offsets[area]; 3259 - va->va_end = va->va_start + sizes[area]; 3260 - __insert_vmap_area(va); 3261 } 3262 - 3263 - vmap_area_pcpu_hole = base + offsets[last_area]; 3264 3265 spin_unlock(&vmap_area_lock); 3266 ··· 3291 kfree(vas); 3292 return vms; 3293 3294 err_free: 3295 for (area = 0; area < nr_vms; area++) { 3296 - kfree(vas[area]); 3297 kfree(vms[area]); 3298 } 3299 err_free2:
··· 32 #include <linux/compiler.h> 33 #include <linux/llist.h> 34 #include <linux/bitops.h> 35 + #include <linux/rbtree_augmented.h> 36 37 #include <linux/uaccess.h> 38 #include <asm/tlbflush.h> ··· 324 325 /*** Global kva allocator ***/ 326 327 + #define DEBUG_AUGMENT_PROPAGATE_CHECK 0 328 + #define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0 329 + 330 #define VM_LAZY_FREE 0x02 331 #define VM_VM_AREA 0x04 332 ··· 332 LIST_HEAD(vmap_area_list); 333 static LLIST_HEAD(vmap_purge_list); 334 static struct rb_root vmap_area_root = RB_ROOT; 335 + static bool vmap_initialized __read_mostly; 336 337 + /* 338 + * This kmem_cache is used for vmap_area objects. Instead of 339 + * allocating from slab we reuse an object from this cache to 340 + * make things faster. Especially in "no edge" splitting of 341 + * free block. 342 + */ 343 + static struct kmem_cache *vmap_area_cachep; 344 345 + /* 346 + * This linked list is used in pair with free_vmap_area_root. 347 + * It gives O(1) access to prev/next to perform fast coalescing. 348 + */ 349 + static LIST_HEAD(free_vmap_area_list); 350 + 351 + /* 352 + * This augment red-black tree represents the free vmap space. 353 + * All vmap_area objects in this tree are sorted by va->va_start 354 + * address. It is used for allocation and merging when a vmap 355 + * object is released. 356 + * 357 + * Each vmap_area node contains a maximum available free block 358 + * of its sub-tree, right or left. Therefore it is possible to 359 + * find a lowest match of free area. 360 + */ 361 + static struct rb_root free_vmap_area_root = RB_ROOT; 362 + 363 + static __always_inline unsigned long 364 + va_size(struct vmap_area *va) 365 + { 366 + return (va->va_end - va->va_start); 367 + } 368 + 369 + static __always_inline unsigned long 370 + get_subtree_max_size(struct rb_node *node) 371 + { 372 + struct vmap_area *va; 373 + 374 + va = rb_entry_safe(node, struct vmap_area, rb_node); 375 + return va ? va->subtree_max_size : 0; 376 + } 377 + 378 + /* 379 + * Gets called when remove the node and rotate. 380 + */ 381 + static __always_inline unsigned long 382 + compute_subtree_max_size(struct vmap_area *va) 383 + { 384 + return max3(va_size(va), 385 + get_subtree_max_size(va->rb_node.rb_left), 386 + get_subtree_max_size(va->rb_node.rb_right)); 387 + } 388 + 389 + RB_DECLARE_CALLBACKS(static, free_vmap_area_rb_augment_cb, 390 + struct vmap_area, rb_node, unsigned long, subtree_max_size, 391 + compute_subtree_max_size) 392 + 393 + static void purge_vmap_area_lazy(void); 394 + static BLOCKING_NOTIFIER_HEAD(vmap_notify_list); 395 + static unsigned long lazy_max_pages(void); 396 397 static struct vmap_area *__find_vmap_area(unsigned long addr) 398 { ··· 360 return NULL; 361 } 362 363 + /* 364 + * This function returns back addresses of parent node 365 + * and its left or right link for further processing. 366 + */ 367 + static __always_inline struct rb_node ** 368 + find_va_links(struct vmap_area *va, 369 + struct rb_root *root, struct rb_node *from, 370 + struct rb_node **parent) 371 { 372 + struct vmap_area *tmp_va; 373 + struct rb_node **link; 374 375 + if (root) { 376 + link = &root->rb_node; 377 + if (unlikely(!*link)) { 378 + *parent = NULL; 379 + return link; 380 + } 381 + } else { 382 + link = &from; 383 } 384 385 + /* 386 + * Go to the bottom of the tree. When we hit the last point 387 + * we end up with parent rb_node and correct direction, i name 388 + * it link, where the new va->rb_node will be attached to. 
389 + */ 390 + do { 391 + tmp_va = rb_entry(*link, struct vmap_area, rb_node); 392 393 + /* 394 + * During the traversal we also do some sanity check. 395 + * Trigger the BUG() if there are sides(left/right) 396 + * or full overlaps. 397 + */ 398 + if (va->va_start < tmp_va->va_end && 399 + va->va_end <= tmp_va->va_start) 400 + link = &(*link)->rb_left; 401 + else if (va->va_end > tmp_va->va_start && 402 + va->va_start >= tmp_va->va_end) 403 + link = &(*link)->rb_right; 404 + else 405 + BUG(); 406 + } while (*link); 407 + 408 + *parent = &tmp_va->rb_node; 409 + return link; 410 } 411 412 + static __always_inline struct list_head * 413 + get_va_next_sibling(struct rb_node *parent, struct rb_node **link) 414 + { 415 + struct list_head *list; 416 417 + if (unlikely(!parent)) 418 + /* 419 + * The red-black tree where we try to find VA neighbors 420 + * before merging or inserting is empty, i.e. it means 421 + * there is no free vmap space. Normally it does not 422 + * happen but we handle this case anyway. 423 + */ 424 + return NULL; 425 + 426 + list = &rb_entry(parent, struct vmap_area, rb_node)->list; 427 + return (&parent->rb_right == link ? list->next : list); 428 + } 429 + 430 + static __always_inline void 431 + link_va(struct vmap_area *va, struct rb_root *root, 432 + struct rb_node *parent, struct rb_node **link, struct list_head *head) 433 + { 434 + /* 435 + * VA is still not in the list, but we can 436 + * identify its future previous list_head node. 437 + */ 438 + if (likely(parent)) { 439 + head = &rb_entry(parent, struct vmap_area, rb_node)->list; 440 + if (&parent->rb_right != link) 441 + head = head->prev; 442 + } 443 + 444 + /* Insert to the rb-tree */ 445 + rb_link_node(&va->rb_node, parent, link); 446 + if (root == &free_vmap_area_root) { 447 + /* 448 + * Some explanation here. Just perform simple insertion 449 + * to the tree. We do not set va->subtree_max_size to 450 + * its current size before calling rb_insert_augmented(). 451 + * It is because of we populate the tree from the bottom 452 + * to parent levels when the node _is_ in the tree. 453 + * 454 + * Therefore we set subtree_max_size to zero after insertion, 455 + * to let __augment_tree_propagate_from() puts everything to 456 + * the correct order later on. 457 + */ 458 + rb_insert_augmented(&va->rb_node, 459 + root, &free_vmap_area_rb_augment_cb); 460 + va->subtree_max_size = 0; 461 + } else { 462 + rb_insert_color(&va->rb_node, root); 463 + } 464 + 465 + /* Address-sort this list */ 466 + list_add(&va->list, head); 467 + } 468 + 469 + static __always_inline void 470 + unlink_va(struct vmap_area *va, struct rb_root *root) 471 + { 472 + /* 473 + * During merging a VA node can be empty, therefore 474 + * not linked with the tree nor list. Just check it. 
475 + */ 476 + if (!RB_EMPTY_NODE(&va->rb_node)) { 477 + if (root == &free_vmap_area_root) 478 + rb_erase_augmented(&va->rb_node, 479 + root, &free_vmap_area_rb_augment_cb); 480 + else 481 + rb_erase(&va->rb_node, root); 482 + 483 + list_del(&va->list); 484 + RB_CLEAR_NODE(&va->rb_node); 485 + } 486 + } 487 + 488 + #if DEBUG_AUGMENT_PROPAGATE_CHECK 489 + static void 490 + augment_tree_propagate_check(struct rb_node *n) 491 + { 492 + struct vmap_area *va; 493 + struct rb_node *node; 494 + unsigned long size; 495 + bool found = false; 496 + 497 + if (n == NULL) 498 + return; 499 + 500 + va = rb_entry(n, struct vmap_area, rb_node); 501 + size = va->subtree_max_size; 502 + node = n; 503 + 504 + while (node) { 505 + va = rb_entry(node, struct vmap_area, rb_node); 506 + 507 + if (get_subtree_max_size(node->rb_left) == size) { 508 + node = node->rb_left; 509 + } else { 510 + if (va_size(va) == size) { 511 + found = true; 512 + break; 513 + } 514 + 515 + node = node->rb_right; 516 + } 517 + } 518 + 519 + if (!found) { 520 + va = rb_entry(n, struct vmap_area, rb_node); 521 + pr_emerg("tree is corrupted: %lu, %lu\n", 522 + va_size(va), va->subtree_max_size); 523 + } 524 + 525 + augment_tree_propagate_check(n->rb_left); 526 + augment_tree_propagate_check(n->rb_right); 527 + } 528 + #endif 529 + 530 + /* 531 + * This function populates subtree_max_size from bottom to upper 532 + * levels starting from VA point. The propagation must be done 533 + * when VA size is modified by changing its va_start/va_end. Or 534 + * in case of newly inserting of VA to the tree. 535 + * 536 + * It means that __augment_tree_propagate_from() must be called: 537 + * - After VA has been inserted to the tree(free path); 538 + * - After VA has been shrunk(allocation path); 539 + * - After VA has been increased(merging path). 540 + * 541 + * Please note that, it does not mean that upper parent nodes 542 + * and their subtree_max_size are recalculated all the time up 543 + * to the root node. 544 + * 545 + * 4--8 546 + * /\ 547 + * / \ 548 + * / \ 549 + * 2--2 8--8 550 + * 551 + * For example if we modify the node 4, shrinking it to 2, then 552 + * no any modification is required. If we shrink the node 2 to 1 553 + * its subtree_max_size is updated only, and set to 1. If we shrink 554 + * the node 8 to 6, then its subtree_max_size is set to 6 and parent 555 + * node becomes 4--6. 556 + */ 557 + static __always_inline void 558 + augment_tree_propagate_from(struct vmap_area *va) 559 + { 560 + struct rb_node *node = &va->rb_node; 561 + unsigned long new_va_sub_max_size; 562 + 563 + while (node) { 564 + va = rb_entry(node, struct vmap_area, rb_node); 565 + new_va_sub_max_size = compute_subtree_max_size(va); 566 + 567 + /* 568 + * If the newly calculated maximum available size of the 569 + * subtree is equal to the current one, then it means that 570 + * the tree is propagated correctly. So we have to stop at 571 + * this point to save cycles. 
572 + */ 573 + if (va->subtree_max_size == new_va_sub_max_size) 574 + break; 575 + 576 + va->subtree_max_size = new_va_sub_max_size; 577 + node = rb_parent(&va->rb_node); 578 + } 579 + 580 + #if DEBUG_AUGMENT_PROPAGATE_CHECK 581 + augment_tree_propagate_check(free_vmap_area_root.rb_node); 582 + #endif 583 + } 584 + 585 + static void 586 + insert_vmap_area(struct vmap_area *va, 587 + struct rb_root *root, struct list_head *head) 588 + { 589 + struct rb_node **link; 590 + struct rb_node *parent; 591 + 592 + link = find_va_links(va, root, NULL, &parent); 593 + link_va(va, root, parent, link, head); 594 + } 595 + 596 + static void 597 + insert_vmap_area_augment(struct vmap_area *va, 598 + struct rb_node *from, struct rb_root *root, 599 + struct list_head *head) 600 + { 601 + struct rb_node **link; 602 + struct rb_node *parent; 603 + 604 + if (from) 605 + link = find_va_links(va, NULL, from, &parent); 606 + else 607 + link = find_va_links(va, root, NULL, &parent); 608 + 609 + link_va(va, root, parent, link, head); 610 + augment_tree_propagate_from(va); 611 + } 612 + 613 + /* 614 + * Merge de-allocated chunk of VA memory with previous 615 + * and next free blocks. If coalesce is not done a new 616 + * free area is inserted. If VA has been merged, it is 617 + * freed. 618 + */ 619 + static __always_inline void 620 + merge_or_add_vmap_area(struct vmap_area *va, 621 + struct rb_root *root, struct list_head *head) 622 + { 623 + struct vmap_area *sibling; 624 + struct list_head *next; 625 + struct rb_node **link; 626 + struct rb_node *parent; 627 + bool merged = false; 628 + 629 + /* 630 + * Find a place in the tree where VA potentially will be 631 + * inserted, unless it is merged with its sibling/siblings. 632 + */ 633 + link = find_va_links(va, root, NULL, &parent); 634 + 635 + /* 636 + * Get next node of VA to check if merging can be done. 637 + */ 638 + next = get_va_next_sibling(parent, link); 639 + if (unlikely(next == NULL)) 640 + goto insert; 641 + 642 + /* 643 + * start end 644 + * | | 645 + * |<------VA------>|<-----Next----->| 646 + * | | 647 + * start end 648 + */ 649 + if (next != head) { 650 + sibling = list_entry(next, struct vmap_area, list); 651 + if (sibling->va_start == va->va_end) { 652 + sibling->va_start = va->va_start; 653 + 654 + /* Check and update the tree if needed. */ 655 + augment_tree_propagate_from(sibling); 656 + 657 + /* Remove this VA, it has been merged. */ 658 + unlink_va(va, root); 659 + 660 + /* Free vmap_area object. */ 661 + kmem_cache_free(vmap_area_cachep, va); 662 + 663 + /* Point to the new merged area. */ 664 + va = sibling; 665 + merged = true; 666 + } 667 + } 668 + 669 + /* 670 + * start end 671 + * | | 672 + * |<-----Prev----->|<------VA------>| 673 + * | | 674 + * start end 675 + */ 676 + if (next->prev != head) { 677 + sibling = list_entry(next->prev, struct vmap_area, list); 678 + if (sibling->va_end == va->va_start) { 679 + sibling->va_end = va->va_end; 680 + 681 + /* Check and update the tree if needed. */ 682 + augment_tree_propagate_from(sibling); 683 + 684 + /* Remove this VA, it has been merged. */ 685 + unlink_va(va, root); 686 + 687 + /* Free vmap_area object. 
*/ 688 + kmem_cache_free(vmap_area_cachep, va); 689 + 690 + return; 691 + } 692 + } 693 + 694 + insert: 695 + if (!merged) { 696 + link_va(va, root, parent, link, head); 697 + augment_tree_propagate_from(va); 698 + } 699 + } 700 + 701 + static __always_inline bool 702 + is_within_this_va(struct vmap_area *va, unsigned long size, 703 + unsigned long align, unsigned long vstart) 704 + { 705 + unsigned long nva_start_addr; 706 + 707 + if (va->va_start > vstart) 708 + nva_start_addr = ALIGN(va->va_start, align); 709 + else 710 + nva_start_addr = ALIGN(vstart, align); 711 + 712 + /* Can be overflowed due to big size or alignment. */ 713 + if (nva_start_addr + size < nva_start_addr || 714 + nva_start_addr < vstart) 715 + return false; 716 + 717 + return (nva_start_addr + size <= va->va_end); 718 + } 719 + 720 + /* 721 + * Find the first free block(lowest start address) in the tree, 722 + * that will accomplish the request corresponding to passing 723 + * parameters. 724 + */ 725 + static __always_inline struct vmap_area * 726 + find_vmap_lowest_match(unsigned long size, 727 + unsigned long align, unsigned long vstart) 728 + { 729 + struct vmap_area *va; 730 + struct rb_node *node; 731 + unsigned long length; 732 + 733 + /* Start from the root. */ 734 + node = free_vmap_area_root.rb_node; 735 + 736 + /* Adjust the search size for alignment overhead. */ 737 + length = size + align - 1; 738 + 739 + while (node) { 740 + va = rb_entry(node, struct vmap_area, rb_node); 741 + 742 + if (get_subtree_max_size(node->rb_left) >= length && 743 + vstart < va->va_start) { 744 + node = node->rb_left; 745 + } else { 746 + if (is_within_this_va(va, size, align, vstart)) 747 + return va; 748 + 749 + /* 750 + * Does not make sense to go deeper towards the right 751 + * sub-tree if it does not have a free block that is 752 + * equal or bigger to the requested search length. 753 + */ 754 + if (get_subtree_max_size(node->rb_right) >= length) { 755 + node = node->rb_right; 756 + continue; 757 + } 758 + 759 + /* 760 + * OK. We roll back and find the fist right sub-tree, 761 + * that will satisfy the search criteria. It can happen 762 + * only once due to "vstart" restriction. 
763 + */ 764 + while ((node = rb_parent(node))) { 765 + va = rb_entry(node, struct vmap_area, rb_node); 766 + if (is_within_this_va(va, size, align, vstart)) 767 + return va; 768 + 769 + if (get_subtree_max_size(node->rb_right) >= length && 770 + vstart <= va->va_start) { 771 + node = node->rb_right; 772 + break; 773 + } 774 + } 775 + } 776 + } 777 + 778 + return NULL; 779 + } 780 + 781 + #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK 782 + #include <linux/random.h> 783 + 784 + static struct vmap_area * 785 + find_vmap_lowest_linear_match(unsigned long size, 786 + unsigned long align, unsigned long vstart) 787 + { 788 + struct vmap_area *va; 789 + 790 + list_for_each_entry(va, &free_vmap_area_list, list) { 791 + if (!is_within_this_va(va, size, align, vstart)) 792 + continue; 793 + 794 + return va; 795 + } 796 + 797 + return NULL; 798 + } 799 + 800 + static void 801 + find_vmap_lowest_match_check(unsigned long size) 802 + { 803 + struct vmap_area *va_1, *va_2; 804 + unsigned long vstart; 805 + unsigned int rnd; 806 + 807 + get_random_bytes(&rnd, sizeof(rnd)); 808 + vstart = VMALLOC_START + rnd; 809 + 810 + va_1 = find_vmap_lowest_match(size, 1, vstart); 811 + va_2 = find_vmap_lowest_linear_match(size, 1, vstart); 812 + 813 + if (va_1 != va_2) 814 + pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n", 815 + va_1, va_2, vstart); 816 + } 817 + #endif 818 + 819 + enum fit_type { 820 + NOTHING_FIT = 0, 821 + FL_FIT_TYPE = 1, /* full fit */ 822 + LE_FIT_TYPE = 2, /* left edge fit */ 823 + RE_FIT_TYPE = 3, /* right edge fit */ 824 + NE_FIT_TYPE = 4 /* no edge fit */ 825 + }; 826 + 827 + static __always_inline enum fit_type 828 + classify_va_fit_type(struct vmap_area *va, 829 + unsigned long nva_start_addr, unsigned long size) 830 + { 831 + enum fit_type type; 832 + 833 + /* Check if it is within VA. */ 834 + if (nva_start_addr < va->va_start || 835 + nva_start_addr + size > va->va_end) 836 + return NOTHING_FIT; 837 + 838 + /* Now classify. */ 839 + if (va->va_start == nva_start_addr) { 840 + if (va->va_end == nva_start_addr + size) 841 + type = FL_FIT_TYPE; 842 + else 843 + type = LE_FIT_TYPE; 844 + } else if (va->va_end == nva_start_addr + size) { 845 + type = RE_FIT_TYPE; 846 + } else { 847 + type = NE_FIT_TYPE; 848 + } 849 + 850 + return type; 851 + } 852 + 853 + static __always_inline int 854 + adjust_va_to_fit_type(struct vmap_area *va, 855 + unsigned long nva_start_addr, unsigned long size, 856 + enum fit_type type) 857 + { 858 + struct vmap_area *lva; 859 + 860 + if (type == FL_FIT_TYPE) { 861 + /* 862 + * No need to split VA, it fully fits. 863 + * 864 + * | | 865 + * V NVA V 866 + * |---------------| 867 + */ 868 + unlink_va(va, &free_vmap_area_root); 869 + kmem_cache_free(vmap_area_cachep, va); 870 + } else if (type == LE_FIT_TYPE) { 871 + /* 872 + * Split left edge of fit VA. 873 + * 874 + * | | 875 + * V NVA V R 876 + * |-------|-------| 877 + */ 878 + va->va_start += size; 879 + } else if (type == RE_FIT_TYPE) { 880 + /* 881 + * Split right edge of fit VA. 882 + * 883 + * | | 884 + * L V NVA V 885 + * |-------|-------| 886 + */ 887 + va->va_end = nva_start_addr; 888 + } else if (type == NE_FIT_TYPE) { 889 + /* 890 + * Split no edge of fit VA. 891 + * 892 + * | | 893 + * L V NVA V R 894 + * |---|-------|---| 895 + */ 896 + lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT); 897 + if (unlikely(!lva)) 898 + return -1; 899 + 900 + /* 901 + * Build the remainder. 
902 + */ 903 + lva->va_start = va->va_start; 904 + lva->va_end = nva_start_addr; 905 + 906 + /* 907 + * Shrink this VA to remaining size. 908 + */ 909 + va->va_start = nva_start_addr + size; 910 + } else { 911 + return -1; 912 + } 913 + 914 + if (type != FL_FIT_TYPE) { 915 + augment_tree_propagate_from(va); 916 + 917 + if (type == NE_FIT_TYPE) 918 + insert_vmap_area_augment(lva, &va->rb_node, 919 + &free_vmap_area_root, &free_vmap_area_list); 920 + } 921 + 922 + return 0; 923 + } 924 + 925 + /* 926 + * Returns a start address of the newly allocated area, if success. 927 + * Otherwise a vend is returned that indicates failure. 928 + */ 929 + static __always_inline unsigned long 930 + __alloc_vmap_area(unsigned long size, unsigned long align, 931 + unsigned long vstart, unsigned long vend, int node) 932 + { 933 + unsigned long nva_start_addr; 934 + struct vmap_area *va; 935 + enum fit_type type; 936 + int ret; 937 + 938 + va = find_vmap_lowest_match(size, align, vstart); 939 + if (unlikely(!va)) 940 + return vend; 941 + 942 + if (va->va_start > vstart) 943 + nva_start_addr = ALIGN(va->va_start, align); 944 + else 945 + nva_start_addr = ALIGN(vstart, align); 946 + 947 + /* Check the "vend" restriction. */ 948 + if (nva_start_addr + size > vend) 949 + return vend; 950 + 951 + /* Classify what we have found. */ 952 + type = classify_va_fit_type(va, nva_start_addr, size); 953 + if (WARN_ON_ONCE(type == NOTHING_FIT)) 954 + return vend; 955 + 956 + /* Update the free vmap_area. */ 957 + ret = adjust_va_to_fit_type(va, nva_start_addr, size, type); 958 + if (ret) 959 + return vend; 960 + 961 + #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK 962 + find_vmap_lowest_match_check(size); 963 + #endif 964 + 965 + return nva_start_addr; 966 + } 967 968 /* 969 * Allocate a region of KVA of the specified size and alignment, within the ··· 406 int node, gfp_t gfp_mask) 407 { 408 struct vmap_area *va; 409 unsigned long addr; 410 int purged = 0; 411 412 BUG_ON(!size); 413 BUG_ON(offset_in_page(size)); 414 BUG_ON(!is_power_of_2(align)); 415 416 + if (unlikely(!vmap_initialized)) 417 + return ERR_PTR(-EBUSY); 418 + 419 might_sleep(); 420 421 + va = kmem_cache_alloc_node(vmap_area_cachep, 422 gfp_mask & GFP_RECLAIM_MASK, node); 423 if (unlikely(!va)) 424 return ERR_PTR(-ENOMEM); ··· 430 431 retry: 432 spin_lock(&vmap_area_lock); 433 + 434 /* 435 + * If an allocation fails, the "vend" address is 436 + * returned. Therefore trigger the overflow path. 437 */ 438 + addr = __alloc_vmap_area(size, align, vstart, vend, node); 439 + if (unlikely(addr == vend)) 440 goto overflow; 441 442 va->va_start = addr; 443 va->va_end = addr + size; 444 va->flags = 0; 445 + insert_vmap_area(va, &vmap_area_root, &vmap_area_list); 446 + 447 spin_unlock(&vmap_area_lock); 448 449 BUG_ON(!IS_ALIGNED(va->va_start, align)); ··· 539 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) 540 pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n", 541 size); 542 + 543 + kmem_cache_free(vmap_area_cachep, va); 544 return ERR_PTR(-EBUSY); 545 } 546 ··· 559 { 560 BUG_ON(RB_EMPTY_NODE(&va->rb_node)); 561 562 + /* 563 + * Remove from the busy tree/list. 564 + */ 565 + unlink_va(va, &vmap_area_root); 566 567 /* 568 + * Merge VA with its neighbors, otherwise just add it. 
569 */ 570 + merge_or_add_vmap_area(va, 571 + &free_vmap_area_root, &free_vmap_area_list); 572 } 573 574 /* ··· 793 VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16)) 794 795 #define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) 796 797 struct vmap_block_queue { 798 spinlock_t lock; ··· 1256 vm_area_add_early(vm); 1257 } 1258 1259 + static void vmap_init_free_space(void) 1260 + { 1261 + unsigned long vmap_start = 1; 1262 + const unsigned long vmap_end = ULONG_MAX; 1263 + struct vmap_area *busy, *free; 1264 + 1265 + /* 1266 + * B F B B B F 1267 + * -|-----|.....|-----|-----|-----|.....|- 1268 + * | The KVA space | 1269 + * |<--------------------------------->| 1270 + */ 1271 + list_for_each_entry(busy, &vmap_area_list, list) { 1272 + if (busy->va_start - vmap_start > 0) { 1273 + free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); 1274 + if (!WARN_ON_ONCE(!free)) { 1275 + free->va_start = vmap_start; 1276 + free->va_end = busy->va_start; 1277 + 1278 + insert_vmap_area_augment(free, NULL, 1279 + &free_vmap_area_root, 1280 + &free_vmap_area_list); 1281 + } 1282 + } 1283 + 1284 + vmap_start = busy->va_end; 1285 + } 1286 + 1287 + if (vmap_end - vmap_start > 0) { 1288 + free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); 1289 + if (!WARN_ON_ONCE(!free)) { 1290 + free->va_start = vmap_start; 1291 + free->va_end = vmap_end; 1292 + 1293 + insert_vmap_area_augment(free, NULL, 1294 + &free_vmap_area_root, 1295 + &free_vmap_area_list); 1296 + } 1297 + } 1298 + } 1299 + 1300 void __init vmalloc_init(void) 1301 { 1302 struct vmap_area *va; 1303 struct vm_struct *tmp; 1304 int i; 1305 + 1306 + /* 1307 + * Create the cache for vmap_area objects. 1308 + */ 1309 + vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC); 1310 1311 for_each_possible_cpu(i) { 1312 struct vmap_block_queue *vbq; ··· 1276 1277 /* Import existing vmlist entries. */ 1278 for (tmp = vmlist; tmp; tmp = tmp->next) { 1279 + va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); 1280 + if (WARN_ON_ONCE(!va)) 1281 + continue; 1282 + 1283 va->flags = VM_VM_AREA; 1284 va->va_start = (unsigned long)tmp->addr; 1285 va->va_end = va->va_start + tmp->size; 1286 va->vm = tmp; 1287 + insert_vmap_area(va, &vmap_area_root, &vmap_area_list); 1288 } 1289 1290 + /* 1291 + * Now we can initialize a free vmap space. 1292 + */ 1293 + vmap_init_free_space(); 1294 vmap_initialized = true; 1295 } 1296 ··· 2477 } 2478 2479 /** 2480 + * pvm_find_va_enclose_addr - find the vmap_area @addr belongs to 2481 + * @addr: target address 2482 * 2483 + * Returns: vmap_area if it is found. If there is no such area 2484 + * the first highest(reverse order) vmap_area is returned 2485 + * i.e. va->va_start < addr && va->va_end < addr or NULL 2486 + * if there are no any areas before @addr. 2487 */ 2488 + static struct vmap_area * 2489 + pvm_find_va_enclose_addr(unsigned long addr) 2490 { 2491 + struct vmap_area *va, *tmp; 2492 + struct rb_node *n; 2493 + 2494 + n = free_vmap_area_root.rb_node; 2495 + va = NULL; 2496 2497 while (n) { 2498 + tmp = rb_entry(n, struct vmap_area, rb_node); 2499 + if (tmp->va_start <= addr) { 2500 + va = tmp; 2501 + if (tmp->va_end >= addr) 2502 + break; 2503 + 2504 n = n->rb_right; 2505 + } else { 2506 + n = n->rb_left; 2507 + } 2508 } 2509 2510 + return va; 2511 } 2512 2513 /** 2514 + * pvm_determine_end_from_reverse - find the highest aligned address 2515 + * of free block below VMALLOC_END 2516 + * @va: 2517 + * in - the VA we start the search(reverse order); 2518 + * out - the VA with the highest aligned end address. 
2519 * 2520 + * Returns: determined end address within vmap_area 2521 */ 2522 + static unsigned long 2523 + pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align) 2524 { 2525 + unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); 2526 unsigned long addr; 2527 2528 + if (likely(*va)) { 2529 + list_for_each_entry_from_reverse((*va), 2530 + &free_vmap_area_list, list) { 2531 + addr = min((*va)->va_end & ~(align - 1), vmalloc_end); 2532 + if ((*va)->va_start < addr) 2533 + return addr; 2534 + } 2535 } 2536 2537 + return 0; 2538 } 2539 2540 /** ··· 2571 * to gigabytes. To avoid interacting with regular vmallocs, these 2572 * areas are allocated from top. 2573 * 2574 + * Despite its complicated look, this allocator is rather simple. It 2575 + * does everything top-down and scans free blocks from the end looking 2576 + * for matching base. While scanning, if any of the areas do not fit the 2577 + * base address is pulled down to fit the area. Scanning is repeated till 2578 + * all the areas fit and then all necessary data structures are inserted 2579 + * and the result is returned. 2580 */ 2581 struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, 2582 const size_t *sizes, int nr_vms, ··· 2584 { 2585 const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); 2586 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); 2587 + struct vmap_area **vas, *va; 2588 struct vm_struct **vms; 2589 int area, area2, last_area, term_area; 2590 + unsigned long base, start, size, end, last_end; 2591 bool purged = false; 2592 + enum fit_type type; 2593 2594 /* verify parameters and allocate data structures */ 2595 BUG_ON(offset_in_page(align) || !is_power_of_2(align)); ··· 2624 goto err_free2; 2625 2626 for (area = 0; area < nr_vms; area++) { 2627 + vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL); 2628 vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL); 2629 if (!vas[area] || !vms[area]) 2630 goto err_free; ··· 2637 start = offsets[area]; 2638 end = start + sizes[area]; 2639 2640 + va = pvm_find_va_enclose_addr(vmalloc_end); 2641 + base = pvm_determine_end_from_reverse(&va, align) - end; 2642 2643 while (true) { 2644 /* 2645 * base might have underflowed, add last_end before 2646 * comparing. 2647 */ 2648 + if (base + last_end < vmalloc_start + last_end) 2649 + goto overflow; 2650 2651 /* 2652 + * Fitting base has not been found. 2653 */ 2654 + if (va == NULL) 2655 + goto overflow; 2656 2657 /* 2658 + * If this VA does not fit, move base downwards and recheck. 2659 */ 2660 + if (base + start < va->va_start || base + end > va->va_end) { 2661 + va = node_to_va(rb_prev(&va->rb_node)); 2662 + base = pvm_determine_end_from_reverse(&va, align) - end; 2663 term_area = area; 2664 continue; 2665 } ··· 2691 area = (area + nr_vms - 1) % nr_vms; 2692 if (area == term_area) 2693 break; 2694 + 2695 start = offsets[area]; 2696 end = start + sizes[area]; 2697 + va = pvm_find_va_enclose_addr(base + end); 2698 } 2699 + 2700 /* we've found a fitting base, insert all va's */ 2701 for (area = 0; area < nr_vms; area++) { 2702 + int ret; 2703 2704 + start = base + offsets[area]; 2705 + size = sizes[area]; 2706 + 2707 + va = pvm_find_va_enclose_addr(start); 2708 + if (WARN_ON_ONCE(va == NULL)) 2709 + /* It is a BUG(), but trigger recovery instead. */ 2710 + goto recovery; 2711 + 2712 + type = classify_va_fit_type(va, start, size); 2713 + if (WARN_ON_ONCE(type == NOTHING_FIT)) 2714 + /* It is a BUG(), but trigger recovery instead. 
*/ 2715 + goto recovery; 2716 + 2717 + ret = adjust_va_to_fit_type(va, start, size, type); 2718 + if (unlikely(ret)) 2719 + goto recovery; 2720 + 2721 + /* Allocated area. */ 2722 + va = vas[area]; 2723 + va->va_start = start; 2724 + va->va_end = start + size; 2725 + 2726 + insert_vmap_area(va, &vmap_area_root, &vmap_area_list); 2727 } 2728 2729 spin_unlock(&vmap_area_lock); 2730 ··· 2717 kfree(vas); 2718 return vms; 2719 2720 + recovery: 2721 + /* Remove previously inserted areas. */ 2722 + while (area--) { 2723 + __free_vmap_area(vas[area]); 2724 + vas[area] = NULL; 2725 + } 2726 + 2727 + overflow: 2728 + spin_unlock(&vmap_area_lock); 2729 + if (!purged) { 2730 + purge_vmap_area_lazy(); 2731 + purged = true; 2732 + 2733 + /* Before "retry", check if we recover. */ 2734 + for (area = 0; area < nr_vms; area++) { 2735 + if (vas[area]) 2736 + continue; 2737 + 2738 + vas[area] = kmem_cache_zalloc( 2739 + vmap_area_cachep, GFP_KERNEL); 2740 + if (!vas[area]) 2741 + goto err_free; 2742 + } 2743 + 2744 + goto retry; 2745 + } 2746 + 2747 err_free: 2748 for (area = 0; area < nr_vms; area++) { 2749 + if (vas[area]) 2750 + kmem_cache_free(vmap_area_cachep, vas[area]); 2751 + 2752 kfree(vms[area]); 2753 } 2754 err_free2:
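When an allocation is carved out of a node of the free tree, the new code classifies the request against that free block as one of four fit types (classify_va_fit_type() / adjust_va_to_fit_type() above): a full fit removes the block, a left or right edge fit just shrinks it in place, and a "no edge" fit splits it in two, which is the case the dedicated vmap_area kmem_cache is meant to make cheap. A toy, userspace-only model of the classification, with hypothetical names (not the kernel code), just to make the four cases concrete:

    #include <stdio.h>

    struct range { unsigned long start, end; };

    enum fit_type { NOTHING_FIT, FL_FIT, LE_FIT, RE_FIT, NE_FIT };

    static enum fit_type classify(const struct range *free_blk,
    			      unsigned long start, unsigned long size)
    {
    	if (start < free_blk->start || start + size > free_blk->end)
    		return NOTHING_FIT;
    	if (free_blk->start == start)
    		return (free_blk->end == start + size) ? FL_FIT : LE_FIT;
    	if (free_blk->end == start + size)
    		return RE_FIT;
    	return NE_FIT;
    }

    int main(void)
    {
    	struct range free_blk = { 100, 200 };

    	/* FL: request covers the whole block, the block is removed.      */
    	printf("%d\n", classify(&free_blk, 100, 100));  /* FL_FIT */
    	/* LE: request starts at the block start, the block is shrunk.    */
    	printf("%d\n", classify(&free_blk, 100, 30));   /* LE_FIT */
    	/* RE: request ends at the block end, the block is shrunk.        */
    	printf("%d\n", classify(&free_blk, 170, 30));   /* RE_FIT */
    	/* NE: request sits in the middle, a second free block is needed. */
    	printf("%d\n", classify(&free_blk, 140, 20));   /* NE_FIT */
    	return 0;
    }

Freeing goes the other way: merge_or_add_vmap_area() looks up the would-be insertion point in the free tree and, if the released range is directly adjacent to the previous or next free block on the address-sorted list, extends that neighbour (re-propagating subtree_max_size upwards) instead of inserting a new node.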