Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

percpu: implement asynchronous chunk population

The percpu allocator now supports atomic allocations by only
allocating from already populated areas but the mechanism to ensure
that there's adequate amount of populated areas was missing.

This patch expands pcpu_balance_work so that in addition to freeing
excess free chunks it also populates chunks to maintain an adequate
level of populated areas. pcpu_alloc() schedules pcpu_balance_work if
the amount of free populated areas is too low or after an atomic
allocation failure.

* PERCPU_DYNAMIC_RESERVE is increased by two pages to account for
PCPU_EMPTY_POP_PAGES_LOW.

* pcpu_async_enabled is added to gate both async jobs -
chunk->map_extend_work and pcpu_balance_work - so that we don't end
up scheduling them while the needed subsystems aren't up yet.

Signed-off-by: Tejun Heo <tj@kernel.org>

Tejun Heo 1a4d7607 fe6bd8c3

+115 -6
+2 -2
include/linux/percpu.h
··· 48 48 * intelligent way to determine this would be nice. 49 49 */ 50 50 #if BITS_PER_LONG > 32 51 - #define PERCPU_DYNAMIC_RESERVE (20 << 10) 51 + #define PERCPU_DYNAMIC_RESERVE (28 << 10) 52 52 #else 53 - #define PERCPU_DYNAMIC_RESERVE (12 << 10) 53 + #define PERCPU_DYNAMIC_RESERVE (20 << 10) 54 54 #endif 55 55 56 56 extern void *pcpu_base_addr;
+113 -4
mm/percpu.c
··· 78 78 #define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ 79 79 #define PCPU_ATOMIC_MAP_MARGIN_LOW 32 80 80 #define PCPU_ATOMIC_MAP_MARGIN_HIGH 64 81 + #define PCPU_EMPTY_POP_PAGES_LOW 2 82 + #define PCPU_EMPTY_POP_PAGES_HIGH 4 81 83 82 84 #ifdef CONFIG_SMP 83 85 /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ ··· 170 168 */ 171 169 static int pcpu_nr_empty_pop_pages; 172 170 173 - /* balance work is used to populate or destroy chunks asynchronously */ 171 + /* 172 + * Balance work is used to populate or destroy chunks asynchronously. We 173 + * try to keep the number of populated free pages between 174 + * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one 175 + * empty chunk. 176 + */ 174 177 static void pcpu_balance_workfn(struct work_struct *work); 175 178 static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn); 179 + static bool pcpu_async_enabled __read_mostly; 180 + static bool pcpu_atomic_alloc_failed; 181 + 182 + static void pcpu_schedule_balance_work(void) 183 + { 184 + if (pcpu_async_enabled) 185 + schedule_work(&pcpu_balance_work); 186 + } 176 187 177 188 static bool pcpu_addr_in_first_chunk(void *addr) 178 189 { ··· 401 386 margin = 3; 402 387 403 388 if (chunk->map_alloc < 404 - chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW) 389 + chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW && 390 + pcpu_async_enabled) 405 391 schedule_work(&chunk->map_extend_work); 406 392 } else { 407 393 margin = PCPU_ATOMIC_MAP_MARGIN_HIGH; ··· 1021 1005 if (chunk != pcpu_reserved_chunk) 1022 1006 pcpu_nr_empty_pop_pages -= occ_pages; 1023 1007 1008 + if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW) 1009 + pcpu_schedule_balance_work(); 1010 + 1024 1011 /* clear the areas and return address relative to base address */ 1025 1012 for_each_possible_cpu(cpu) 1026 1013 memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); ··· 1041 1022 dump_stack(); 1042 1023 if (!--warn_limit) 1043 1024 
pr_info("PERCPU: limit reached, disable warning\n"); 1025 + } 1026 + if (is_atomic) { 1027 + /* see the flag handling in pcpu_balance_workfn() */ 1028 + pcpu_atomic_alloc_failed = true; 1029 + pcpu_schedule_balance_work(); 1044 1030 } 1045 1031 return NULL; 1046 1032 } ··· 1104 1080 } 1105 1081 1106 1082 /** 1107 - * pcpu_balance_workfn - reclaim fully free chunks, workqueue function 1083 + * pcpu_balance_workfn - manage the amount of free chunks and populated pages 1108 1084 * @work: unused 1109 1085 * 1110 1086 * Reclaim all fully free chunks except for the first one. ··· 1114 1090 LIST_HEAD(to_free); 1115 1091 struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1]; 1116 1092 struct pcpu_chunk *chunk, *next; 1093 + int slot, nr_to_pop, ret; 1117 1094 1095 + /* 1096 + * There's no reason to keep around multiple unused chunks and VM 1097 + * areas can be scarce. Destroy all free chunks except for one. 1098 + */ 1118 1099 mutex_lock(&pcpu_alloc_mutex); 1119 1100 spin_lock_irq(&pcpu_lock); 1120 1101 ··· 1145 1116 spin_unlock_irq(&pcpu_lock); 1146 1117 } 1147 1118 pcpu_destroy_chunk(chunk); 1119 + } 1120 + 1121 + /* 1122 + * Ensure there are a certain number of free populated pages for 1123 + * atomic allocs. Fill up from the most packed so that atomic 1124 + * allocs don't increase fragmentation. If atomic allocation 1125 + * failed previously, always populate the maximum amount. This 1126 + * should prevent atomic allocs larger than PAGE_SIZE from keeping 1127 + * failing indefinitely; however, large atomic allocs are not 1128 + * something we support properly and can be highly unreliable and 1129 + * inefficient.
1130 + */ 1131 + retry_pop: 1132 + if (pcpu_atomic_alloc_failed) { 1133 + nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH; 1134 + /* best effort anyway, don't worry about synchronization */ 1135 + pcpu_atomic_alloc_failed = false; 1136 + } else { 1137 + nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH - 1138 + pcpu_nr_empty_pop_pages, 1139 + 0, PCPU_EMPTY_POP_PAGES_HIGH); 1140 + } 1141 + 1142 + for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) { 1143 + int nr_unpop = 0, rs, re; 1144 + 1145 + if (!nr_to_pop) 1146 + break; 1147 + 1148 + spin_lock_irq(&pcpu_lock); 1149 + list_for_each_entry(chunk, &pcpu_slot[slot], list) { 1150 + nr_unpop = pcpu_unit_pages - chunk->nr_populated; 1151 + if (nr_unpop) 1152 + break; 1153 + } 1154 + spin_unlock_irq(&pcpu_lock); 1155 + 1156 + if (!nr_unpop) 1157 + continue; 1158 + 1159 + /* @chunk can't go away while pcpu_alloc_mutex is held */ 1160 + pcpu_for_each_unpop_region(chunk, rs, re, 0, pcpu_unit_pages) { 1161 + int nr = min(re - rs, nr_to_pop); 1162 + 1163 + ret = pcpu_populate_chunk(chunk, rs, rs + nr); 1164 + if (!ret) { 1165 + nr_to_pop -= nr; 1166 + spin_lock_irq(&pcpu_lock); 1167 + pcpu_chunk_populated(chunk, rs, rs + nr); 1168 + spin_unlock_irq(&pcpu_lock); 1169 + } else { 1170 + nr_to_pop = 0; 1171 + } 1172 + 1173 + if (!nr_to_pop) 1174 + break; 1175 + } 1176 + } 1177 + 1178 + if (nr_to_pop) { 1179 + /* ran out of chunks to populate, create a new one and retry */ 1180 + chunk = pcpu_create_chunk(); 1181 + if (chunk) { 1182 + spin_lock_irq(&pcpu_lock); 1183 + pcpu_chunk_relocate(chunk, -1); 1184 + spin_unlock_irq(&pcpu_lock); 1185 + goto retry_pop; 1186 + } 1148 1187 } 1149 1188 1150 1189 mutex_unlock(&pcpu_alloc_mutex); ··· 1257 1160 1258 1161 list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list) 1259 1162 if (pos != chunk) { 1260 - schedule_work(&pcpu_balance_work); 1163 + pcpu_schedule_balance_work(); 1261 1164 break; 1262 1165 } 1263 1166 } ··· 2284 2187 spin_unlock_irqrestore(&pcpu_lock, flags); 2285 
2188 } 2286 2189 } 2190 + 2191 + /* 2192 + * Percpu allocator is initialized early during boot when neither slab or 2193 + * workqueue is available. Plug async management until everything is up 2194 + * and running. 2195 + */ 2196 + static int __init percpu_enable_async(void) 2197 + { 2198 + pcpu_async_enabled = true; 2199 + return 0; 2200 + } 2201 + subsys_initcall(percpu_enable_async);