Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

x86, nmi: Add in logic to handle multiple events and unknown NMIs

Previous patches allow the NMI subsystem to process multiple NMI events
in one NMI. As previously discussed this can cause issues when an event
triggered another NMI but is processed in the current NMI. This causes the
next NMI to go unprocessed and become an 'unknown' NMI.

To handle this, we first have to flag whether or not the NMI handler handled
more than one event. If it did, then there exists a chance that
the next NMI might be already processed. Once the NMI is flagged as a
candidate to be swallowed, we next look for a back-to-back NMI condition.

This is determined by looking at the %rip from pt_regs. If it is the same
as the previous NMI, it is assumed the cpu did not have a chance to jump
back into a non-NMI context and execute code and instead handled another NMI.

If both of those conditions are true then we will swallow any unknown NMI.

There still exists a chance that we accidentally swallow a real unknown NMI,
but for now things seem better.

An optimization has also been added to the nmi notifier routine. Because x86
can latch up to one NMI while currently processing an NMI, we don't have to
worry about executing _all_ the handlers in a standalone NMI. The idea is
if multiple NMIs come in, the second NMI will represent them. For those
back-to-back NMI cases, we have the potential to drop NMIs. Therefore only
execute all the handlers in the second half of a detected back-to-back NMI.

Signed-off-by: Don Zickus <dzickus@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1317409584-23662-5-git-send-email-dzickus@redhat.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>

authored by

Don Zickus and committed by
Ingo Molnar
b227e233 9c48f1c6

+93 -9
+1
arch/x86/include/asm/nmi.h
··· 42 42 43 43 void stop_nmi(void); 44 44 void restart_nmi(void); 45 + void local_touch_nmi(void); 45 46 46 47 #endif /* _ASM_X86_NMI_H */
+88 -9
arch/x86/kernel/nmi.c
··· 71 71 72 72 #define nmi_to_desc(type) (&nmi_desc[type]) 73 73 74 - static int notrace __kprobes nmi_handle(unsigned int type, struct pt_regs *regs) 74 + static int notrace __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b) 75 75 { 76 76 struct nmi_desc *desc = nmi_to_desc(type); 77 77 struct nmiaction *a; ··· 85 85 * can be latched at any given time. Walk the whole list 86 86 * to handle those situations. 87 87 */ 88 - list_for_each_entry_rcu(a, &desc->head, list) { 89 - 88 + list_for_each_entry_rcu(a, &desc->head, list) 90 89 handled += a->handler(type, regs); 91 - 92 - } 93 90 94 91 rcu_read_unlock(); 95 92 ··· 100 103 unsigned long flags; 101 104 102 105 spin_lock_irqsave(&desc->lock, flags); 106 + 107 + /* 108 + * most handlers of type NMI_UNKNOWN never return because 109 + * they just assume the NMI is theirs. Just a sanity check 110 + * to manage expectations 111 + */ 112 + WARN_ON_ONCE(type == NMI_UNKNOWN && !list_empty(&desc->head)); 103 113 104 114 /* 105 115 * some handlers need to be executed first otherwise a fake ··· 255 251 { 256 252 int handled; 257 253 258 - handled = nmi_handle(NMI_UNKNOWN, regs); 254 + /* 255 + * Use 'false' as back-to-back NMIs are dealt with one level up. 
256 + * Of course this makes having multiple 'unknown' handlers useless 257 + * as only the first one is ever run (unless it can actually determine 258 + * if it caused the NMI) 259 + */ 260 + handled = nmi_handle(NMI_UNKNOWN, regs, false); 259 261 if (handled) 260 262 return; 261 263 #ifdef CONFIG_MCA ··· 284 274 pr_emerg("Dazed and confused, but trying to continue\n"); 285 275 } 286 276 277 + static DEFINE_PER_CPU(bool, swallow_nmi); 278 + static DEFINE_PER_CPU(unsigned long, last_nmi_rip); 279 + 287 280 static notrace __kprobes void default_do_nmi(struct pt_regs *regs) 288 281 { 289 282 unsigned char reason = 0; 290 283 int handled; 284 + bool b2b = false; 291 285 292 286 /* 293 287 * CPU-specific NMI must be processed before non-CPU-specific 294 288 * NMI, otherwise we may lose it, because the CPU-specific 295 289 * NMI can not be detected/processed on other CPUs. 296 290 */ 297 - handled = nmi_handle(NMI_LOCAL, regs); 298 - if (handled) 291 + 292 + /* 293 + * Back-to-back NMIs are interesting because they can either 294 + * be two NMI or more than two NMIs (any thing over two is dropped 295 + * due to NMI being edge-triggered). If this is the second half 296 + * of the back-to-back NMI, assume we dropped things and process 297 + * more handlers. Otherwise reset the 'swallow' NMI behaviour 298 + */ 299 + if (regs->ip == __this_cpu_read(last_nmi_rip)) 300 + b2b = true; 301 + else 302 + __this_cpu_write(swallow_nmi, false); 303 + 304 + __this_cpu_write(last_nmi_rip, regs->ip); 305 + 306 + handled = nmi_handle(NMI_LOCAL, regs, b2b); 307 + if (handled) { 308 + /* 309 + * There are cases when a NMI handler handles multiple 310 + * events in the current NMI. One of these events may 311 + * be queued for in the next NMI. Because the event is 312 + * already handled, the next NMI will result in an unknown 313 + * NMI. Instead lets flag this for a potential NMI to 314 + * swallow. 
315 + */ 316 + if (handled > 1) 317 + __this_cpu_write(swallow_nmi, true); 299 318 return; 319 + } 300 320 301 321 /* Non-CPU-specific NMI: NMI sources can be processed on any CPU */ 302 322 raw_spin_lock(&nmi_reason_lock); ··· 349 309 } 350 310 raw_spin_unlock(&nmi_reason_lock); 351 311 352 - unknown_nmi_error(reason, regs); 312 + /* 313 + * Only one NMI can be latched at a time. To handle 314 + * this we may process multiple nmi handlers at once to 315 + * cover the case where an NMI is dropped. The downside 316 + * to this approach is we may process an NMI prematurely, 317 + * while its real NMI is sitting latched. This will cause 318 + * an unknown NMI on the next run of the NMI processing. 319 + * 320 + * We tried to flag that condition above, by setting the 321 + * swallow_nmi flag when we process more than one event. 322 + * This condition is also only present on the second half 323 + * of a back-to-back NMI, so we flag that condition too. 324 + * 325 + * If both are true, we assume we already processed this 326 + * NMI previously and we swallow it. Otherwise we reset 327 + * the logic. 328 + * 329 + * There are scenarios where we may accidentally swallow 330 + * a 'real' unknown NMI. For example, while processing 331 + * a perf NMI another perf NMI comes in along with a 332 + * 'real' unknown NMI. These two NMIs get combined into 333 + * one (as descibed above). When the next NMI gets 334 + * processed, it will be flagged by perf as handled, but 335 + * noone will know that there was a 'real' unknown NMI sent 336 + * also. As a result it gets swallowed. Or if the first 337 + * perf NMI returns two events handled then the second 338 + * NMI will get eaten by the logic below, again losing a 339 + * 'real' unknown NMI. But this is the best we can do 340 + * for now. 
341 + */ 342 + if (b2b && __this_cpu_read(swallow_nmi)) 343 + ; 344 + else 345 + unknown_nmi_error(reason, regs); 353 346 } 354 347 355 348 dotraplinkage notrace __kprobes void ··· 406 333 void restart_nmi(void) 407 334 { 408 335 ignore_nmis--; 336 + } 337 + 338 + /* reset the back-to-back NMI logic */ 339 + void local_touch_nmi(void) 340 + { 341 + __this_cpu_write(last_nmi_rip, 0); 409 342 }
+2
arch/x86/kernel/process_32.c
··· 57 57 #include <asm/idle.h> 58 58 #include <asm/syscalls.h> 59 59 #include <asm/debugreg.h> 60 + #include <asm/nmi.h> 60 61 61 62 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 62 63 ··· 108 107 if (cpu_is_offline(cpu)) 109 108 play_dead(); 110 109 110 + local_touch_nmi(); 111 111 local_irq_disable(); 112 112 /* Don't trace irqs off for idle */ 113 113 stop_critical_timings();
+2
arch/x86/kernel/process_64.c
··· 51 51 #include <asm/idle.h> 52 52 #include <asm/syscalls.h> 53 53 #include <asm/debugreg.h> 54 + #include <asm/nmi.h> 54 55 55 56 asmlinkage extern void ret_from_fork(void); 56 57 ··· 134 133 * from here on, until they go to idle. 135 134 * Otherwise, idle callbacks can misfire. 136 135 */ 136 + local_touch_nmi(); 137 137 local_irq_disable(); 138 138 enter_idle(); 139 139 /* Don't trace irqs off for idle */