Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

[PATCH] x86_64: Collected NMI watchdog fixes.

Collected NMI watchdog fixes.

- Fix call of check_nmi_watchdog

- Revert the earlier change that moved check_nmi_watchdog to later in boot.
It did not fully fix the race it was supposed to fix.

- Remove unused P6 definitions

- Add support for performance counter based watchdog on P4 systems.

This allows the watchdog to run only once per second, which saves some CPU
time. Previously it would run at 1000Hz, which was too much.

Code ported from i386

Make this the default on Intel systems.

- Use check_nmi_watchdog with the local APIC based NMI watchdog

- Fix race in touch_nmi_watchdog

- Fix bug that caused incorrect performance counters to be programmed in a
few cases on K8.

- Remove useless check for local APIC

- Use local_t and per_cpu variables for per CPU data.

- Keep other CPUs busy during check_nmi_watchdog to make sure the NMI
watchdog really ticks on them when in local APIC mode.

- Only check CPUs that are actually online.

- Various other fixes.

- Fix fallback path when MSRs are unimplemented

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

authored by

Andi Kleen and committed by
Linus Torvalds
75152114 f3c5f5e7

+179 -79
+2 -1
arch/x86_64/kernel/apic.c
··· 33 33 #include <asm/mpspec.h> 34 34 #include <asm/pgalloc.h> 35 35 #include <asm/mach_apic.h> 36 + #include <asm/nmi.h> 36 37 37 38 int apic_verbosity; 38 39 ··· 1057 1056 nr_ioapics = 0; 1058 1057 #endif 1059 1058 setup_boot_APIC_clock(); 1060 - 1059 + check_nmi_watchdog(); 1061 1060 return 0; 1062 1061 } 1063 1062
+172 -78
arch/x86_64/kernel/nmi.c
··· 33 33 #include <asm/msr.h> 34 34 #include <asm/proto.h> 35 35 #include <asm/kdebug.h> 36 + #include <asm/local.h> 36 37 37 38 /* 38 39 * lapic_nmi_owner tracks the ownership of the lapic NMI hardware: ··· 60 59 61 60 unsigned int nmi_watchdog = NMI_DEFAULT; 62 61 static unsigned int nmi_hz = HZ; 63 - unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ 62 + static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ 63 + static unsigned int nmi_p4_cccr_val; 64 64 65 65 /* Note that these events don't tick when the CPU idles. This means 66 66 the frequency varies with CPU load. */ ··· 73 71 #define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 74 72 #define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 75 73 76 - #define P6_EVNTSEL0_ENABLE (1 << 22) 77 - #define P6_EVNTSEL_INT (1 << 20) 78 - #define P6_EVNTSEL_OS (1 << 17) 79 - #define P6_EVNTSEL_USR (1 << 16) 80 - #define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79 81 - #define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED 74 + #define MSR_P4_MISC_ENABLE 0x1A0 75 + #define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7) 76 + #define MSR_P4_MISC_ENABLE_PEBS_UNAVAIL (1<<12) 77 + #define MSR_P4_PERFCTR0 0x300 78 + #define MSR_P4_CCCR0 0x360 79 + #define P4_ESCR_EVENT_SELECT(N) ((N)<<25) 80 + #define P4_ESCR_OS (1<<3) 81 + #define P4_ESCR_USR (1<<2) 82 + #define P4_CCCR_OVF_PMI0 (1<<26) 83 + #define P4_CCCR_OVF_PMI1 (1<<27) 84 + #define P4_CCCR_THRESHOLD(N) ((N)<<20) 85 + #define P4_CCCR_COMPLEMENT (1<<19) 86 + #define P4_CCCR_COMPARE (1<<18) 87 + #define P4_CCCR_REQUIRED (3<<16) 88 + #define P4_CCCR_ESCR_SELECT(N) ((N)<<13) 89 + #define P4_CCCR_ENABLE (1<<12) 90 + /* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter 91 + CRU_ESCR0 (with any non-null event selector) through a complemented 92 + max threshold. 
[IA32-Vol3, Section 14.9.9] */ 93 + #define MSR_P4_IQ_COUNTER0 0x30C 94 + #define P4_NMI_CRU_ESCR0 (P4_ESCR_EVENT_SELECT(0x3F)|P4_ESCR_OS|P4_ESCR_USR) 95 + #define P4_NMI_IQ_CCCR0 \ 96 + (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \ 97 + P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE) 98 + 99 + static __init inline int nmi_known_cpu(void) 100 + { 101 + switch (boot_cpu_data.x86_vendor) { 102 + case X86_VENDOR_AMD: 103 + return boot_cpu_data.x86 == 15; 104 + case X86_VENDOR_INTEL: 105 + return boot_cpu_data.x86 == 15; 106 + } 107 + return 0; 108 + } 82 109 83 110 /* Run after command line and cpu_init init, but before all other checks */ 84 111 void __init nmi_watchdog_default(void) 85 112 { 86 113 if (nmi_watchdog != NMI_DEFAULT) 87 114 return; 88 - 89 - /* For some reason the IO APIC watchdog doesn't work on the AMD 90 - 8111 chipset. For now switch to local APIC mode using 91 - perfctr0 there. On Intel CPUs we don't have code to handle 92 - the perfctr and the IO-APIC seems to work, so use that. */ 93 - 94 - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { 95 - nmi_watchdog = NMI_LOCAL_APIC; 96 - printk(KERN_INFO 97 - "Using local APIC NMI watchdog using perfctr0\n"); 98 - } else { 99 - printk(KERN_INFO "Using IO APIC NMI watchdog\n"); 115 + if (nmi_known_cpu()) 116 + nmi_watchdog = NMI_LOCAL_APIC; 117 + else 100 118 nmi_watchdog = NMI_IO_APIC; 101 - } 102 119 } 103 120 104 - /* Why is there no CPUID flag for this? */ 105 - static __init int cpu_has_lapic(void) 121 + #ifdef CONFIG_SMP 122 + /* The performance counters used by NMI_LOCAL_APIC don't trigger when 123 + * the CPU is idle. To make sure the NMI watchdog really ticks on all 124 + * CPUs during the test make them busy. 125 + */ 126 + static __init void nmi_cpu_busy(void *data) 106 127 { 107 - switch (boot_cpu_data.x86_vendor) { 108 - case X86_VENDOR_INTEL: 109 - case X86_VENDOR_AMD: 110 - return boot_cpu_data.x86 >= 6; 111 - /* .... 
add more cpus here or find a different way to figure this out. */ 112 - default: 113 - return 0; 114 - } 128 + volatile int *endflag = data; 129 + local_irq_enable(); 130 + /* Intentionally don't use cpu_relax here. This is 131 + to make sure that the performance counter really ticks, 132 + even if there is a simulator or similar that catches the 133 + pause instruction. On a real HT machine this is fine because 134 + all other CPUs are busy with "useless" delay loops and don't 135 + care if they get somewhat less cycles. */ 136 + while (*endflag == 0) 137 + barrier(); 115 138 } 139 + #endif 116 140 117 - static int __init check_nmi_watchdog (void) 141 + int __init check_nmi_watchdog (void) 118 142 { 143 + volatile int endflag = 0; 119 144 int *counts; 120 145 int cpu; 121 146 122 - if (nmi_watchdog == NMI_NONE) 123 - return 0; 147 + counts = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); 148 + if (!counts) 149 + return -1; 124 150 125 - if (nmi_watchdog == NMI_LOCAL_APIC && !cpu_has_lapic()) { 126 - nmi_watchdog = NMI_NONE; 127 - return -1; 128 - } 151 + printk(KERN_INFO "testing NMI watchdog ... "); 129 152 130 - counts = kmalloc(NR_CPUS * sizeof(int),GFP_KERNEL); 131 - if (!counts) { 132 - nmi_watchdog = NMI_NONE; 133 - return 0; 134 - } 135 - 136 - printk(KERN_INFO "Testing NMI watchdog ... 
"); 153 + if (nmi_watchdog == NMI_LOCAL_APIC) 154 + smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0); 137 155 138 156 for (cpu = 0; cpu < NR_CPUS; cpu++) 139 157 counts[cpu] = cpu_pda[cpu].__nmi_count; ··· 161 139 mdelay((10*1000)/nmi_hz); // wait 10 ticks 162 140 163 141 for (cpu = 0; cpu < NR_CPUS; cpu++) { 142 + if (!cpu_online(cpu)) 143 + continue; 164 144 if (cpu_pda[cpu].__nmi_count - counts[cpu] <= 5) { 165 - printk("CPU#%d: NMI appears to be stuck (%d)!\n", 145 + endflag = 1; 146 + printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n", 166 147 cpu, 148 + counts[cpu], 167 149 cpu_pda[cpu].__nmi_count); 168 150 nmi_active = 0; 169 151 lapic_nmi_owner &= ~LAPIC_NMI_WATCHDOG; 152 + nmi_perfctr_msr = 0; 170 153 kfree(counts); 171 154 return -1; 172 155 } 173 156 } 157 + endflag = 1; 174 158 printk("OK.\n"); 175 159 176 160 /* now that we know it works we can reduce NMI frequency to ··· 187 159 kfree(counts); 188 160 return 0; 189 161 } 190 - /* Have this called later during boot so counters are updating */ 191 - late_initcall(check_nmi_watchdog); 192 162 193 163 int __init setup_nmi_watchdog(char *str) 194 164 { ··· 204 178 205 179 if (nmi >= NMI_INVALID) 206 180 return 0; 207 - nmi_watchdog = nmi; 181 + nmi_watchdog = nmi; 208 182 return 1; 209 183 } 210 184 ··· 219 193 wrmsr(MSR_K7_EVNTSEL0, 0, 0); 220 194 break; 221 195 case X86_VENDOR_INTEL: 222 - wrmsr(MSR_IA32_EVNTSEL0, 0, 0); 196 + if (boot_cpu_data.x86 == 15) { 197 + wrmsr(MSR_P4_IQ_CCCR0, 0, 0); 198 + wrmsr(MSR_P4_CRU_ESCR0, 0, 0); 199 + } 223 200 break; 224 201 } 225 202 nmi_active = -1; ··· 290 261 291 262 static int nmi_pm_active; /* nmi_active before suspend */ 292 263 293 - static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) 264 + static int lapic_nmi_suspend(struct sys_device *dev, u32 state) 294 265 { 295 266 nmi_pm_active = nmi_active; 296 267 disable_lapic_nmi_watchdog(); ··· 337 308 * Original code written by Keith Owens. 
338 309 */ 339 310 311 + static void clear_msr_range(unsigned int base, unsigned int n) 312 + { 313 + unsigned int i; 314 + 315 + for(i = 0; i < n; ++i) 316 + wrmsr(base+i, 0, 0); 317 + } 318 + 340 319 static void setup_k7_watchdog(void) 341 320 { 342 321 int i; 343 322 unsigned int evntsel; 344 323 345 - /* No check, so can start with slow frequency */ 346 - nmi_hz = 1; 347 - 348 - /* XXX should check these in EFER */ 349 - 350 324 nmi_perfctr_msr = MSR_K7_PERFCTR0; 351 325 352 326 for(i = 0; i < 4; ++i) { 353 327 /* Simulator may not support it */ 354 - if (checking_wrmsrl(MSR_K7_EVNTSEL0+i, 0UL)) 328 + if (checking_wrmsrl(MSR_K7_EVNTSEL0+i, 0UL)) { 329 + nmi_perfctr_msr = 0; 355 330 return; 331 + } 356 332 wrmsrl(MSR_K7_PERFCTR0+i, 0UL); 357 333 } 358 334 ··· 367 333 | K7_NMI_EVENT; 368 334 369 335 wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); 370 - wrmsrl(MSR_K7_PERFCTR0, -((u64)cpu_khz*1000) / nmi_hz); 336 + wrmsr(MSR_K7_PERFCTR0, -(cpu_khz/nmi_hz*1000), -1); 371 337 apic_write(APIC_LVTPC, APIC_DM_NMI); 372 338 evntsel |= K7_EVNTSEL_ENABLE; 373 339 wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); 340 + } 341 + 342 + 343 + static int setup_p4_watchdog(void) 344 + { 345 + unsigned int misc_enable, dummy; 346 + 347 + rdmsr(MSR_P4_MISC_ENABLE, misc_enable, dummy); 348 + if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL)) 349 + return 0; 350 + 351 + nmi_perfctr_msr = MSR_P4_IQ_COUNTER0; 352 + nmi_p4_cccr_val = P4_NMI_IQ_CCCR0; 353 + #ifdef CONFIG_SMP 354 + if (smp_num_siblings == 2) 355 + nmi_p4_cccr_val |= P4_CCCR_OVF_PMI1; 356 + #endif 357 + 358 + if (!(misc_enable & MSR_P4_MISC_ENABLE_PEBS_UNAVAIL)) 359 + clear_msr_range(0x3F1, 2); 360 + /* MSR 0x3F0 seems to have a default value of 0xFC00, but current 361 + docs doesn't fully define it, so leave it alone for now. 
*/ 362 + if (boot_cpu_data.x86_model >= 0x3) { 363 + /* MSR_P4_IQ_ESCR0/1 (0x3ba/0x3bb) removed */ 364 + clear_msr_range(0x3A0, 26); 365 + clear_msr_range(0x3BC, 3); 366 + } else { 367 + clear_msr_range(0x3A0, 31); 368 + } 369 + clear_msr_range(0x3C0, 6); 370 + clear_msr_range(0x3C8, 6); 371 + clear_msr_range(0x3E0, 2); 372 + clear_msr_range(MSR_P4_CCCR0, 18); 373 + clear_msr_range(MSR_P4_PERFCTR0, 18); 374 + 375 + wrmsr(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0, 0); 376 + wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE, 0); 377 + Dprintk("setting P4_IQ_COUNTER0 to 0x%08lx\n", -(cpu_khz/nmi_hz*1000)); 378 + wrmsr(MSR_P4_IQ_COUNTER0, -(cpu_khz/nmi_hz*1000), -1); 379 + apic_write(APIC_LVTPC, APIC_DM_NMI); 380 + wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0); 381 + return 1; 374 382 } 375 383 376 384 void setup_apic_nmi_watchdog(void) ··· 425 349 return; 426 350 setup_k7_watchdog(); 427 351 break; 352 + case X86_VENDOR_INTEL: 353 + if (boot_cpu_data.x86 != 15) 354 + return; 355 + if (!setup_p4_watchdog()) 356 + return; 357 + break; 358 + 428 359 default: 429 360 return; 430 361 } ··· 446 363 * 447 364 * as these watchdog NMI IRQs are generated on every CPU, we only 448 365 * have to check the current processor. 449 - * 450 - * since NMIs don't listen to _any_ locks, we have to be extremely 451 - * careful not to rely on unsafe variables. The printk might lock 452 - * up though, so we have to break up any console locks first ... 453 - * [when there will be more tty-related locks, break them up 454 - * here too!] 
455 366 */ 456 367 457 - static unsigned int 458 - last_irq_sums [NR_CPUS], 459 - alert_counter [NR_CPUS]; 368 + static DEFINE_PER_CPU(unsigned, last_irq_sum); 369 + static DEFINE_PER_CPU(local_t, alert_counter); 370 + static DEFINE_PER_CPU(int, nmi_touch); 460 371 461 372 void touch_nmi_watchdog (void) 462 373 { 463 374 int i; 464 375 465 376 /* 466 - * Just reset the alert counters, (other CPUs might be 467 - * spinning on locks we hold): 377 + * Tell other CPUs to reset their alert counters. We cannot 378 + * do it ourselves because the alert count increase is not 379 + * atomic. 468 380 */ 469 381 for (i = 0; i < NR_CPUS; i++) 470 - alert_counter[i] = 0; 382 + per_cpu(nmi_touch, i) = 1; 471 383 } 472 384 473 385 void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason) 474 386 { 475 - int sum, cpu; 387 + int sum; 388 + int touched = 0; 476 389 477 - cpu = safe_smp_processor_id(); 478 390 sum = read_pda(apic_timer_irqs); 479 - if (last_irq_sums[cpu] == sum) { 391 + if (__get_cpu_var(nmi_touch)) { 392 + __get_cpu_var(nmi_touch) = 0; 393 + touched = 1; 394 + } 395 + if (!touched && __get_cpu_var(last_irq_sum) == sum) { 480 396 /* 481 397 * Ayiee, looks like this CPU is stuck ... 482 398 * wait a few IRQs (5 seconds) before doing the oops ... 
483 399 */ 484 - alert_counter[cpu]++; 485 - if (alert_counter[cpu] == 5*nmi_hz) { 400 + local_inc(&__get_cpu_var(alert_counter)); 401 + if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) { 486 402 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) 487 403 == NOTIFY_STOP) { 488 - alert_counter[cpu] = 0; 404 + local_set(&__get_cpu_var(alert_counter), 0); 489 405 return; 490 406 } 491 407 die_nmi("NMI Watchdog detected LOCKUP on CPU%d", regs); 492 408 } 493 409 } else { 494 - last_irq_sums[cpu] = sum; 495 - alert_counter[cpu] = 0; 410 + __get_cpu_var(last_irq_sum) = sum; 411 + local_set(&__get_cpu_var(alert_counter), 0); 496 412 } 497 - if (nmi_perfctr_msr) 413 + if (nmi_perfctr_msr) { 414 + if (nmi_perfctr_msr == MSR_P4_IQ_COUNTER0) { 415 + /* 416 + * P4 quirks: 417 + * - An overflown perfctr will assert its interrupt 418 + * until the OVF flag in its CCCR is cleared. 419 + * - LVTPC is masked on interrupt and must be 420 + * unmasked by the LVTPC handler. 421 + */ 422 + wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0); 423 + apic_write(APIC_LVTPC, APIC_DM_NMI); 424 + } 498 425 wrmsr(nmi_perfctr_msr, -(cpu_khz/nmi_hz*1000), -1); 426 + } 499 427 } 500 428 501 429 static int dummy_nmi_callback(struct pt_regs * regs, int cpu)
+3
arch/x86_64/kernel/smpboot.c
··· 56 56 #include <asm/kdebug.h> 57 57 #include <asm/tlbflush.h> 58 58 #include <asm/proto.h> 59 + #include <asm/nmi.h> 59 60 60 61 /* Change for real CPU hotplug. Note other files need to be fixed 61 62 first too. */ ··· 1031 1030 1032 1031 detect_siblings(); 1033 1032 time_init_gtod(); 1033 + 1034 + check_nmi_watchdog(); 1034 1035 }
+2
include/asm-x86_64/nmi.h
··· 53 53 54 54 extern int panic_on_timeout; 55 55 extern int unknown_nmi_panic; 56 + 57 + extern int check_nmi_watchdog(void); 56 58 57 59 #endif /* ASM_NMI_H */