Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

[PATCH] cpu state clean after hot remove

Clean CPU states in order to reuse smp boot code for CPU hotplug.

Signed-off-by: Li Shaohua <shaohua.li@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

authored by

Li Shaohua and committed by
Linus Torvalds
e1367daf 0bb3184d

+186 -42
+12
arch/i386/kernel/cpu/common.c
··· 651 651 clear_used_math(); 652 652 mxcsr_feature_mask_init(); 653 653 } 654 + 655 + #ifdef CONFIG_HOTPLUG_CPU 656 + void __devinit cpu_uninit(void) 657 + { 658 + int cpu = raw_smp_processor_id(); 659 + cpu_clear(cpu, cpu_initialized); 660 + 661 + /* lazy TLB state */ 662 + per_cpu(cpu_tlbstate, cpu).state = 0; 663 + per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm; 664 + } 665 + #endif
+5
arch/i386/kernel/irq.c
··· 156 156 cpu,hardirq_ctx[cpu],softirq_ctx[cpu]); 157 157 } 158 158 159 + void irq_ctx_exit(int cpu) 160 + { 161 + hardirq_ctx[cpu] = NULL; 162 + } 163 + 159 164 extern asmlinkage void __do_softirq(void); 160 165 161 166 asmlinkage void do_softirq(void)
+9 -11
arch/i386/kernel/process.c
··· 152 152 /* We don't actually take CPU down, just spin without interrupts. */ 153 153 static inline void play_dead(void) 154 154 { 155 + /* This must be done before dead CPU ack */ 156 + cpu_exit_clear(); 157 + wbinvd(); 158 + mb(); 155 159 /* Ack it */ 156 160 __get_cpu_var(cpu_state) = CPU_DEAD; 157 161 158 - /* We shouldn't have to disable interrupts while dead, but 159 - * some interrupts just don't seem to go away, and this makes 160 - * it "work" for testing purposes. */ 161 - /* Death loop */ 162 - while (__get_cpu_var(cpu_state) != CPU_UP_PREPARE) 163 - cpu_relax(); 164 - 162 + /* 163 + * With physical CPU hotplug, we should halt the cpu 164 + */ 165 165 local_irq_disable(); 166 - __flush_tlb_all(); 167 - cpu_set(smp_processor_id(), cpu_online_map); 168 - enable_APIC_timer(); 169 - local_irq_enable(); 166 + while (1) 167 + __asm__ __volatile__("hlt":::"memory"); 170 168 } 171 169 #else 172 170 static inline void play_dead(void)
+143 -30
arch/i386/kernel/smpboot.c
··· 90 90 EXPORT_SYMBOL(cpu_callout_map); 91 91 static cpumask_t smp_commenced_mask; 92 92 93 + /* TSC's upper 32 bits can't be written in eariler CPU (before prescott), there 94 + * is no way to resync one AP against BP. TBD: for prescott and above, we 95 + * should use IA64's algorithm 96 + */ 97 + static int __devinitdata tsc_sync_disabled; 98 + 93 99 /* Per CPU bogomips and other parameters */ 94 100 struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; 95 101 EXPORT_SYMBOL(cpu_data); ··· 433 427 /* 434 428 * Synchronize the TSC with the BP 435 429 */ 436 - if (cpu_has_tsc && cpu_khz) 430 + if (cpu_has_tsc && cpu_khz && !tsc_sync_disabled) 437 431 synchronize_tsc_ap(); 438 432 } 439 433 ··· 513 507 lock_ipi_call_lock(); 514 508 cpu_set(smp_processor_id(), cpu_online_map); 515 509 unlock_ipi_call_lock(); 510 + per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; 516 511 517 512 /* We can take interrupts now: we're officially "up". */ 518 513 local_irq_enable(); ··· 823 816 #endif /* WAKE_SECONDARY_VIA_INIT */ 824 817 825 818 extern cpumask_t cpu_initialized; 819 + static inline int alloc_cpu_id(void) 820 + { 821 + cpumask_t tmp_map; 822 + int cpu; 823 + cpus_complement(tmp_map, cpu_present_map); 824 + cpu = first_cpu(tmp_map); 825 + if (cpu >= NR_CPUS) 826 + return -ENODEV; 827 + return cpu; 828 + } 826 829 827 - static int __devinit do_boot_cpu(int apicid) 830 + #ifdef CONFIG_HOTPLUG_CPU 831 + static struct task_struct * __devinitdata cpu_idle_tasks[NR_CPUS]; 832 + static inline struct task_struct * alloc_idle_task(int cpu) 833 + { 834 + struct task_struct *idle; 835 + 836 + if ((idle = cpu_idle_tasks[cpu]) != NULL) { 837 + /* initialize thread_struct. 
we really want to avoid destroy 838 + * idle tread 839 + */ 840 + idle->thread.esp = (unsigned long)(((struct pt_regs *) 841 + (THREAD_SIZE + (unsigned long) idle->thread_info)) - 1); 842 + init_idle(idle, cpu); 843 + return idle; 844 + } 845 + idle = fork_idle(cpu); 846 + 847 + if (!IS_ERR(idle)) 848 + cpu_idle_tasks[cpu] = idle; 849 + return idle; 850 + } 851 + #else 852 + #define alloc_idle_task(cpu) fork_idle(cpu) 853 + #endif 854 + 855 + static int __devinit do_boot_cpu(int apicid, int cpu) 828 856 /* 829 857 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad 830 858 * (ie clustered apic addressing mode), this is a LOGICAL apic ID. ··· 868 826 { 869 827 struct task_struct *idle; 870 828 unsigned long boot_error; 871 - int timeout, cpu; 829 + int timeout; 872 830 unsigned long start_eip; 873 831 unsigned short nmi_high = 0, nmi_low = 0; 874 832 875 - cpu = ++cpucount; 833 + ++cpucount; 834 + 876 835 /* 877 836 * We can't use kernel_thread since we must avoid to 878 837 * reschedule the child. 879 838 */ 880 - idle = fork_idle(cpu); 839 + idle = alloc_idle_task(cpu); 881 840 if (IS_ERR(idle)) 882 841 panic("failed fork for CPU %d", cpu); 883 842 idle->thread.eip = (unsigned long) start_secondary; ··· 945 902 inquire_remote_apic(apicid); 946 903 } 947 904 } 948 - x86_cpu_to_apicid[cpu] = apicid; 905 + 949 906 if (boot_error) { 950 907 /* Try to put things back the way they were before ... 
*/ 951 908 unmap_cpu_to_logical_apicid(cpu); 952 909 cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */ 953 910 cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */ 954 911 cpucount--; 912 + } else { 913 + x86_cpu_to_apicid[cpu] = apicid; 914 + cpu_set(cpu, cpu_present_map); 955 915 } 956 916 957 917 /* mark "stuck" area as not stuck */ ··· 962 916 963 917 return boot_error; 964 918 } 919 + 920 + #ifdef CONFIG_HOTPLUG_CPU 921 + void cpu_exit_clear(void) 922 + { 923 + int cpu = raw_smp_processor_id(); 924 + 925 + idle_task_exit(); 926 + 927 + cpucount --; 928 + cpu_uninit(); 929 + irq_ctx_exit(cpu); 930 + 931 + cpu_clear(cpu, cpu_callout_map); 932 + cpu_clear(cpu, cpu_callin_map); 933 + cpu_clear(cpu, cpu_present_map); 934 + 935 + cpu_clear(cpu, smp_commenced_mask); 936 + unmap_cpu_to_logical_apicid(cpu); 937 + } 938 + 939 + struct warm_boot_cpu_info { 940 + struct completion *complete; 941 + int apicid; 942 + int cpu; 943 + }; 944 + 945 + static void __devinit do_warm_boot_cpu(void *p) 946 + { 947 + struct warm_boot_cpu_info *info = p; 948 + do_boot_cpu(info->apicid, info->cpu); 949 + complete(info->complete); 950 + } 951 + 952 + int __devinit smp_prepare_cpu(int cpu) 953 + { 954 + DECLARE_COMPLETION(done); 955 + struct warm_boot_cpu_info info; 956 + struct work_struct task; 957 + int apicid, ret; 958 + 959 + lock_cpu_hotplug(); 960 + apicid = x86_cpu_to_apicid[cpu]; 961 + if (apicid == BAD_APICID) { 962 + ret = -ENODEV; 963 + goto exit; 964 + } 965 + 966 + info.complete = &done; 967 + info.apicid = apicid; 968 + info.cpu = cpu; 969 + INIT_WORK(&task, do_warm_boot_cpu, &info); 970 + 971 + tsc_sync_disabled = 1; 972 + 973 + /* init low mem mapping */ 974 + memcpy(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, 975 + sizeof(swapper_pg_dir[0]) * KERNEL_PGD_PTRS); 976 + flush_tlb_all(); 977 + schedule_work(&task); 978 + wait_for_completion(&done); 979 + 980 + tsc_sync_disabled = 0; 981 + zap_low_mappings(); 982 + ret = 0; 983 + exit: 984 + 
unlock_cpu_hotplug(); 985 + return ret; 986 + } 987 + #endif 965 988 966 989 static void smp_tune_scheduling (void) 967 990 { ··· 1184 1069 if (max_cpus <= cpucount+1) 1185 1070 continue; 1186 1071 1187 - if (do_boot_cpu(apicid)) 1072 + if (((cpu = alloc_cpu_id()) <= 0) || do_boot_cpu(apicid, cpu)) 1188 1073 printk("CPU #%d not responding - cannot use it.\n", 1189 1074 apicid); 1190 1075 else ··· 1264 1149 { 1265 1150 cpu_set(smp_processor_id(), cpu_online_map); 1266 1151 cpu_set(smp_processor_id(), cpu_callout_map); 1152 + cpu_set(smp_processor_id(), cpu_present_map); 1153 + per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; 1267 1154 } 1268 1155 1269 1156 #ifdef CONFIG_HOTPLUG_CPU 1270 - 1271 - /* must be called with the cpucontrol mutex held */ 1272 - static int __devinit cpu_enable(unsigned int cpu) 1157 + static void 1158 + remove_siblinginfo(int cpu) 1273 1159 { 1274 - /* get the target out of its holding state */ 1275 - per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; 1276 - wmb(); 1160 + int sibling; 1277 1161 1278 - /* wait for the processor to ack it. timeout? 
*/ 1279 - while (!cpu_online(cpu)) 1280 - cpu_relax(); 1281 - 1282 - fixup_irqs(cpu_online_map); 1283 - /* counter the disable in fixup_irqs() */ 1284 - local_irq_enable(); 1285 - return 0; 1162 + for_each_cpu_mask(sibling, cpu_sibling_map[cpu]) 1163 + cpu_clear(cpu, cpu_sibling_map[sibling]); 1164 + for_each_cpu_mask(sibling, cpu_core_map[cpu]) 1165 + cpu_clear(cpu, cpu_core_map[sibling]); 1166 + cpus_clear(cpu_sibling_map[cpu]); 1167 + cpus_clear(cpu_core_map[cpu]); 1168 + phys_proc_id[cpu] = BAD_APICID; 1169 + cpu_core_id[cpu] = BAD_APICID; 1286 1170 } 1287 1171 1288 1172 int __cpu_disable(void) ··· 1307 1193 mdelay(1); 1308 1194 local_irq_disable(); 1309 1195 1196 + remove_siblinginfo(cpu); 1197 + 1310 1198 cpu_clear(cpu, map); 1311 1199 fixup_irqs(map); 1312 1200 /* It's now safe to remove this processor from the online map */ ··· 1323 1207 1324 1208 for (i = 0; i < 10; i++) { 1325 1209 /* They ack this in play_dead by setting CPU_DEAD */ 1326 - if (per_cpu(cpu_state, cpu) == CPU_DEAD) 1210 + if (per_cpu(cpu_state, cpu) == CPU_DEAD) { 1211 + printk ("CPU %d is now offline\n", cpu); 1327 1212 return; 1213 + } 1328 1214 current->state = TASK_UNINTERRUPTIBLE; 1329 1215 schedule_timeout(HZ/10); 1330 1216 } ··· 1354 1236 return -EIO; 1355 1237 } 1356 1238 1357 - #ifdef CONFIG_HOTPLUG_CPU 1358 - /* Already up, and in cpu_quiescent now? */ 1359 - if (cpu_isset(cpu, smp_commenced_mask)) { 1360 - cpu_enable(cpu); 1361 - return 0; 1362 - } 1363 - #endif 1364 - 1365 1239 local_irq_enable(); 1240 + per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; 1366 1241 /* Unleash the CPU! 
*/ 1367 1242 cpu_set(cpu, smp_commenced_mask); 1368 1243 while (!cpu_isset(cpu, cpu_online_map)) ··· 1369 1258 setup_ioapic_dest(); 1370 1259 #endif 1371 1260 zap_low_mappings(); 1261 + #ifndef CONFIG_HOTPLUG_CPU 1372 1262 /* 1373 1263 * Disable executability of the SMP trampoline: 1374 1264 */ 1375 1265 set_kernel_exec((unsigned long)trampoline_base, trampoline_exec); 1266 + #endif 1376 1267 } 1377 1268 1378 1269 void __init smp_intr_init(void)
+7 -1
drivers/base/cpu.c
··· 16 16 EXPORT_SYMBOL(cpu_sysdev_class); 17 17 18 18 #ifdef CONFIG_HOTPLUG_CPU 19 + #ifndef __HAVE_ARCH_SMP_PREPARE_CPU 20 + #define smp_prepare_cpu(cpu) (0) 21 + #endif 22 + 19 23 static ssize_t show_online(struct sys_device *dev, char *buf) 20 24 { 21 25 struct cpu *cpu = container_of(dev, struct cpu, sysdev); ··· 40 36 kobject_hotplug(&dev->kobj, KOBJ_OFFLINE); 41 37 break; 42 38 case '1': 43 - ret = cpu_up(cpu->sysdev.id); 39 + ret = smp_prepare_cpu(cpu->sysdev.id); 40 + if (ret == 0) 41 + ret = cpu_up(cpu->sysdev.id); 44 42 break; 45 43 default: 46 44 ret = -EINVAL;
+2
include/asm-i386/irq.h
··· 29 29 30 30 #ifdef CONFIG_4KSTACKS 31 31 extern void irq_ctx_init(int cpu); 32 + extern void irq_ctx_exit(int cpu); 32 33 # define __ARCH_HAS_DO_SOFTIRQ 33 34 #else 34 35 # define irq_ctx_init(cpu) do { } while (0) 36 + # define irq_ctx_exit(cpu) do { } while (0) 35 37 #endif 36 38 37 39 #ifdef CONFIG_IRQBALANCE
+8
include/asm-i386/smp.h
··· 48 48 #define MAX_APICID 256 49 49 extern u8 x86_cpu_to_apicid[]; 50 50 51 + #ifdef CONFIG_HOTPLUG_CPU 52 + extern void cpu_exit_clear(void); 53 + extern void cpu_uninit(void); 54 + 55 + #define __HAVE_ARCH_SMP_PREPARE_CPU 56 + extern int smp_prepare_cpu(int cpu); 57 + #endif 58 + 51 59 /* 52 60 * This function is needed by all SMP systems. It must _always_ be valid 53 61 * from the initial startup. We map APIC_BASE very early in page_setup(),