Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

powerpc/mm: Rework context management for CPUs with no hash table

This reworks the context management code used by 4xx, 8xx and
freescale BookE. It adds support for SMP by implementing a
concept of stale context map to lazily flush the TLB on
processors where a context may have been invalidated. This
also contains the ground work for generalizing such lazy TLB
flushing by just picking up a new PID and marking the old one
stale. This will be implemented later.

This is a first implementation that uses a global spinlock.

Ideally, we should try to get at least the fast path (context ID
already assigned) lockless or limited to a per context lock,
but for now this will do.

I tried to keep the UP case reasonably simple to avoid adding
too much overhead to 8xx which does a lot of context stealing
since it effectively has only 16 PIDs available.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: Kumar Gala <galak@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>

authored by

Benjamin Herrenschmidt and committed by
Paul Mackerras
2ca8cf73 5e696617

+237 -57
+3 -2
arch/powerpc/include/asm/mmu-40x.h
··· 54 54 #ifndef __ASSEMBLY__ 55 55 56 56 typedef struct { 57 - unsigned long id; 58 - unsigned long vdso_base; 57 + unsigned int id; 58 + unsigned int active; 59 + unsigned long vdso_base; 59 60 } mm_context_t; 60 61 61 62 #endif /* !__ASSEMBLY__ */
+3 -2
arch/powerpc/include/asm/mmu-44x.h
··· 56 56 extern unsigned int tlb_44x_hwater; 57 57 58 58 typedef struct { 59 - unsigned long id; 60 - unsigned long vdso_base; 59 + unsigned int id; 60 + unsigned int active; 61 + unsigned long vdso_base; 61 62 } mm_context_t; 62 63 63 64 #endif /* !__ASSEMBLY__ */
+2 -1
arch/powerpc/include/asm/mmu-8xx.h
··· 137 137 138 138 #ifndef __ASSEMBLY__ 139 139 typedef struct { 140 - unsigned long id; 140 + unsigned int id; 141 + unsigned int active; 141 142 unsigned long vdso_base; 142 143 } mm_context_t; 143 144 #endif /* !__ASSEMBLY__ */
+3 -2
arch/powerpc/include/asm/mmu-fsl-booke.h
··· 76 76 #ifndef __ASSEMBLY__ 77 77 78 78 typedef struct { 79 - unsigned long id; 80 - unsigned long vdso_base; 79 + unsigned int id; 80 + unsigned int active; 81 + unsigned long vdso_base; 81 82 } mm_context_t; 82 83 #endif /* !__ASSEMBLY__ */ 83 84
+2
arch/powerpc/include/asm/tlbflush.h
··· 29 29 30 30 #include <linux/mm.h> 31 31 32 + #define MMU_NO_CONTEXT ((unsigned int)-1) 33 + 32 34 extern void _tlbie(unsigned long address, unsigned int pid); 33 35 extern void _tlbil_all(void); 34 36 extern void _tlbil_pid(unsigned int pid);
+224 -50
arch/powerpc/mm/mmu_context_nohash.c
··· 14 14 * as published by the Free Software Foundation; either version 15 15 * 2 of the License, or (at your option) any later version. 16 16 * 17 + * TODO: 18 + * 19 + * - The global context lock will not scale very well 20 + * - The maps should be dynamically allocated to allow for processors 21 + * that support more PID bits at runtime 22 + * - Implement flush_tlb_mm() by making the context stale and picking 23 + * a new one 24 + * - More aggressively clear stale map bits and maybe find some way to 25 + * also clear mm->cpu_vm_mask bits when processes are migrated 17 26 */ 18 27 28 + #undef DEBUG 29 + #define DEBUG_STEAL_ONLY 30 + #undef DEBUG_MAP_CONSISTENCY 31 + 32 + #include <linux/kernel.h> 19 33 #include <linux/mm.h> 20 34 #include <linux/init.h> 21 35 22 36 #include <asm/mmu_context.h> 23 37 #include <asm/tlbflush.h> 38 + #include <linux/spinlock.h> 24 39 25 40 /* 26 41 * The MPC8xx has only 16 contexts. We rotate through them on each ··· 55 40 */ 56 41 57 42 #ifdef CONFIG_8xx 58 - #define NO_CONTEXT 16 59 43 #define LAST_CONTEXT 15 60 44 #define FIRST_CONTEXT 0 61 45 62 46 #elif defined(CONFIG_4xx) 63 - #define NO_CONTEXT 256 64 47 #define LAST_CONTEXT 255 65 48 #define FIRST_CONTEXT 1 66 49 67 50 #elif defined(CONFIG_E200) || defined(CONFIG_E500) 68 - #define NO_CONTEXT 256 69 51 #define LAST_CONTEXT 255 70 52 #define FIRST_CONTEXT 1 71 53 ··· 70 58 #error Unsupported processor type 71 59 #endif 72 60 73 - static unsigned long next_mmu_context; 61 + static unsigned int next_context, nr_free_contexts; 74 62 static unsigned long context_map[LAST_CONTEXT / BITS_PER_LONG + 1]; 75 - static atomic_t nr_free_contexts; 63 + static unsigned long stale_map[NR_CPUS][LAST_CONTEXT / BITS_PER_LONG + 1]; 76 64 static struct mm_struct *context_mm[LAST_CONTEXT+1]; 77 - static void steal_context(void); 65 + static spinlock_t context_lock = SPIN_LOCK_UNLOCKED; 78 66 79 67 /* Steal a context from a task that has one at the moment. 
80 - * This is only used on 8xx and 4xx and we presently assume that 81 - * they don't do SMP. If they do then this will have to check 82 - * whether the MM we steal is in use. 83 - * We also assume that this is only used on systems that don't 84 - * use an MMU hash table - this is true for 8xx and 4xx. 68 + * 69 + * This is used when we are running out of available PID numbers 70 + * on the processors. 71 + * 85 72 * This isn't an LRU system, it just frees up each context in 86 73 * turn (sort-of pseudo-random replacement :). This would be the 87 74 * place to implement an LRU scheme if anyone was motivated to do it. 88 75 * -- paulus 76 + * 77 + * For context stealing, we use a slightly different approach for 78 + * SMP and UP. Basically, the UP one is simpler and doesn't use 79 + * the stale map as we can just flush the local CPU 80 + * -- benh 89 81 */ 90 - static void steal_context(void) 82 + #ifdef CONFIG_SMP 83 + static unsigned int steal_context_smp(unsigned int id) 91 84 { 92 85 struct mm_struct *mm; 86 + unsigned int cpu, max; 93 87 94 - /* free up context `next_mmu_context' */ 95 - /* if we shouldn't free context 0, don't... */ 96 - if (next_mmu_context < FIRST_CONTEXT) 97 - next_mmu_context = FIRST_CONTEXT; 98 - mm = context_mm[next_mmu_context]; 99 - flush_tlb_mm(mm); 100 - destroy_context(mm); 101 - } 88 + again: 89 + max = LAST_CONTEXT - FIRST_CONTEXT; 102 90 91 + /* Attempt to free next_context first and then loop until we manage */ 92 + while (max--) { 93 + /* Pick up the victim mm */ 94 + mm = context_mm[id]; 103 95 104 - /* 105 - * Get a new mmu context for the address space described by `mm'. 
106 - */ 107 - static inline void get_mmu_context(struct mm_struct *mm) 108 - { 109 - unsigned long ctx; 96 + /* We have a candidate victim, check if it's active, on SMP 97 + * we cannot steal active contexts 98 + */ 99 + if (mm->context.active) { 100 + id++; 101 + if (id > LAST_CONTEXT) 102 + id = FIRST_CONTEXT; 103 + continue; 104 + } 105 + pr_debug("[%d] steal context %d from mm @%p\n", 106 + smp_processor_id(), id, mm); 110 107 111 - if (mm->context.id != NO_CONTEXT) 112 - return; 108 + /* Mark this mm has having no context anymore */ 109 + mm->context.id = MMU_NO_CONTEXT; 113 110 114 - while (atomic_dec_if_positive(&nr_free_contexts) < 0) 115 - steal_context(); 116 - 117 - ctx = next_mmu_context; 118 - while (test_and_set_bit(ctx, context_map)) { 119 - ctx = find_next_zero_bit(context_map, LAST_CONTEXT+1, ctx); 120 - if (ctx > LAST_CONTEXT) 121 - ctx = 0; 111 + /* Mark it stale on all CPUs that used this mm */ 112 + for_each_cpu_mask_nr(cpu, mm->cpu_vm_mask) 113 + __set_bit(id, stale_map[cpu]); 114 + return id; 122 115 } 123 - next_mmu_context = (ctx + 1) & LAST_CONTEXT; 124 - mm->context.id = ctx; 125 - context_mm[ctx] = mm; 116 + 117 + /* This will happen if you have more CPUs than available contexts, 118 + * all we can do here is wait a bit and try again 119 + */ 120 + spin_unlock(&context_lock); 121 + cpu_relax(); 122 + spin_lock(&context_lock); 123 + goto again; 126 124 } 125 + #endif /* CONFIG_SMP */ 126 + 127 + /* Note that this will also be called on SMP if all other CPUs are 128 + * offlined, which means that it may be called for cpu != 0. 
For 129 + * this to work, we somewhat assume that CPUs that are onlined 130 + * come up with a fully clean TLB (or are cleaned when offlined) 131 + */ 132 + static unsigned int steal_context_up(unsigned int id) 133 + { 134 + struct mm_struct *mm; 135 + int cpu = smp_processor_id(); 136 + 137 + /* Pick up the victim mm */ 138 + mm = context_mm[id]; 139 + 140 + pr_debug("[%d] steal context %d from mm @%p\n", cpu, id, mm); 141 + 142 + /* Mark this mm has having no context anymore */ 143 + mm->context.id = MMU_NO_CONTEXT; 144 + 145 + /* Flush the TLB for that context */ 146 + local_flush_tlb_mm(mm); 147 + 148 + /* XXX This clear should ultimately be part of local_flush_tlb_mm */ 149 + __clear_bit(id, stale_map[cpu]); 150 + 151 + return id; 152 + } 153 + 154 + #ifdef DEBUG_MAP_CONSISTENCY 155 + static void context_check_map(void) 156 + { 157 + unsigned int id, nrf, nact; 158 + 159 + nrf = nact = 0; 160 + for (id = FIRST_CONTEXT; id <= LAST_CONTEXT; id++) { 161 + int used = test_bit(id, context_map); 162 + if (!used) 163 + nrf++; 164 + if (used != (context_mm[id] != NULL)) 165 + pr_err("MMU: Context %d is %s and MM is %p !\n", 166 + id, used ? "used" : "free", context_mm[id]); 167 + if (context_mm[id] != NULL) 168 + nact += context_mm[id]->context.active; 169 + } 170 + if (nrf != nr_free_contexts) { 171 + pr_err("MMU: Free context count out of sync ! (%d vs %d)\n", 172 + nr_free_contexts, nrf); 173 + nr_free_contexts = nrf; 174 + } 175 + if (nact > num_online_cpus()) 176 + pr_err("MMU: More active contexts than CPUs ! (%d vs %d)\n", 177 + nact, num_online_cpus()); 178 + } 179 + #else 180 + static void context_check_map(void) { } 181 + #endif 127 182 128 183 void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) 129 184 { 130 - get_mmu_context(next); 185 + unsigned int id, cpu = smp_processor_id(); 186 + unsigned long *map; 131 187 132 - set_context(next->context.id, next->pgd); 188 + /* No lockless fast path .. 
yet */ 189 + spin_lock(&context_lock); 190 + 191 + #ifndef DEBUG_STEAL_ONLY 192 + pr_debug("[%d] activating context for mm @%p, active=%d, id=%d\n", 193 + cpu, next, next->context.active, next->context.id); 194 + #endif 195 + 196 + #ifdef CONFIG_SMP 197 + /* Mark us active and the previous one not anymore */ 198 + next->context.active++; 199 + if (prev) { 200 + WARN_ON(prev->context.active < 1); 201 + prev->context.active--; 202 + } 203 + #endif /* CONFIG_SMP */ 204 + 205 + /* If we already have a valid assigned context, skip all that */ 206 + id = next->context.id; 207 + if (likely(id != MMU_NO_CONTEXT)) 208 + goto ctxt_ok; 209 + 210 + /* We really don't have a context, let's try to acquire one */ 211 + id = next_context; 212 + if (id > LAST_CONTEXT) 213 + id = FIRST_CONTEXT; 214 + map = context_map; 215 + 216 + /* No more free contexts, let's try to steal one */ 217 + if (nr_free_contexts == 0) { 218 + #ifdef CONFIG_SMP 219 + if (num_online_cpus() > 1) { 220 + id = steal_context_smp(id); 221 + goto stolen; 222 + } 223 + #endif /* CONFIG_SMP */ 224 + id = steal_context_up(id); 225 + goto stolen; 226 + } 227 + nr_free_contexts--; 228 + 229 + /* We know there's at least one free context, try to find it */ 230 + while (__test_and_set_bit(id, map)) { 231 + id = find_next_zero_bit(map, LAST_CONTEXT+1, id); 232 + if (id > LAST_CONTEXT) 233 + id = FIRST_CONTEXT; 234 + } 235 + stolen: 236 + next_context = id + 1; 237 + context_mm[id] = next; 238 + next->context.id = id; 239 + 240 + #ifndef DEBUG_STEAL_ONLY 241 + pr_debug("[%d] picked up new id %d, nrf is now %d\n", 242 + cpu, id, nr_free_contexts); 243 + #endif 244 + 245 + context_check_map(); 246 + ctxt_ok: 247 + 248 + /* If that context got marked stale on this CPU, then flush the 249 + * local TLB for it and unmark it before we use it 250 + */ 251 + if (test_bit(id, stale_map[cpu])) { 252 + pr_debug("[%d] flushing stale context %d for mm @%p !\n", 253 + cpu, id, next); 254 + local_flush_tlb_mm(next); 255 + 256 + /* XXX 
This clear should ultimately be part of local_flush_tlb_mm */ 257 + __clear_bit(id, stale_map[cpu]); 258 + } 259 + 260 + /* Flick the MMU and release lock */ 261 + set_context(id, next->pgd); 262 + spin_unlock(&context_lock); 133 263 } 134 264 135 265 /* ··· 279 125 */ 280 126 int init_new_context(struct task_struct *t, struct mm_struct *mm) 281 127 { 282 - mm->context.id = NO_CONTEXT; 128 + mm->context.id = MMU_NO_CONTEXT; 129 + mm->context.active = 0; 130 + 283 131 return 0; 284 132 } 285 133 ··· 290 134 */ 291 135 void destroy_context(struct mm_struct *mm) 292 136 { 293 - preempt_disable(); 294 - if (mm->context.id != NO_CONTEXT) { 295 - clear_bit(mm->context.id, context_map); 296 - mm->context.id = NO_CONTEXT; 297 - atomic_inc(&nr_free_contexts); 137 + unsigned int id; 138 + 139 + if (mm->context.id == MMU_NO_CONTEXT) 140 + return; 141 + 142 + WARN_ON(mm->context.active != 0); 143 + 144 + spin_lock(&context_lock); 145 + id = mm->context.id; 146 + if (id != MMU_NO_CONTEXT) { 147 + __clear_bit(id, context_map); 148 + mm->context.id = MMU_NO_CONTEXT; 149 + #ifdef DEBUG_MAP_CONSISTENCY 150 + mm->context.active = 0; 151 + context_mm[id] = NULL; 152 + #endif 153 + nr_free_contexts++; 298 154 } 299 - preempt_enable(); 155 + spin_unlock(&context_lock); 300 156 } 301 157 302 158 ··· 317 149 */ 318 150 void __init mmu_context_init(void) 319 151 { 152 + /* Mark init_mm as being active on all possible CPUs since 153 + * we'll get called with prev == init_mm the first time 154 + * we schedule on a given CPU 155 + */ 156 + init_mm.context.active = NR_CPUS; 157 + 320 158 /* 321 159 * Some processors have too few contexts to reserve one for 322 160 * init_mm, and require using context 0 for a normal task. ··· 330 156 * This code assumes FIRST_CONTEXT < 32. 
331 157 */ 332 158 context_map[0] = (1 << FIRST_CONTEXT) - 1; 333 - next_mmu_context = FIRST_CONTEXT; 334 - atomic_set(&nr_free_contexts, LAST_CONTEXT - FIRST_CONTEXT + 1); 159 + next_context = FIRST_CONTEXT; 160 + nr_free_contexts = LAST_CONTEXT - FIRST_CONTEXT + 1; 335 161 } 336 162