Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

powerpc/kdump: Use setjmp/longjmp to handle kdump and system reset recursion

We can handle recursion caused by system reset by reusing the crash
shutdown fault handler.

Since we don't have an OS-triggerable NMI, if not all CPUs make it
into kdump then we tell the user to issue a system reset. However, if
a panic timeout is set we cannot wait forever and must continue
the kdump.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

authored by

Anton Blanchard and committed by
Benjamin Herrenschmidt
07fe0c61 9b00ac06

+57 -15
+57 -15
arch/powerpc/kernel/crash.c
··· 53 53 static crash_shutdown_t crash_shutdown_handles[CRASH_HANDLER_MAX+1]; 54 54 static DEFINE_SPINLOCK(crash_handlers_lock); 55 55 56 + static unsigned long crash_shutdown_buf[JMP_BUF_LEN]; 57 + static int crash_shutdown_cpu = -1; 58 + 59 + static int handle_fault(struct pt_regs *regs) 60 + { 61 + if (crash_shutdown_cpu == smp_processor_id()) 62 + longjmp(crash_shutdown_buf, 1); 63 + return 0; 64 + } 65 + 56 66 #ifdef CONFIG_SMP 57 67 58 68 void crash_ipi_callback(struct pt_regs *regs) ··· 99 89 static void crash_kexec_prepare_cpus(int cpu) 100 90 { 101 91 unsigned int msecs; 102 - 103 92 unsigned int ncpus = num_online_cpus() - 1;/* Excluding the panic cpu */ 93 + int tries = 0; 94 + int (*old_handler)(struct pt_regs *regs); 104 95 105 96 printk(KERN_EMERG "Sending IPI to other CPUs\n"); 106 97 107 98 crash_send_ipi(crash_ipi_callback); 108 99 smp_wmb(); 109 100 101 + again: 110 102 /* 111 103 * FIXME: Until we will have the way to stop other CPUs reliably, 112 104 * the crash CPU will send an IPI and wait for other CPUs to ··· 123 111 124 112 /* Would it be better to replace the trap vector here? */ 125 113 126 - if (cpumask_weight(&cpus_in_crash) < ncpus) { 127 - printk(KERN_EMERG "ERROR: %d CPU(s) not responding\n", 128 - ncpus - cpumask_weight(&cpus_in_crash)); 114 + if (cpumask_weight(&cpus_in_crash) >= ncpus) { 115 + printk(KERN_EMERG "IPI complete\n"); 116 + return; 129 117 } 130 118 131 - printk(KERN_EMERG "IPI complete\n"); 119 + printk(KERN_EMERG "ERROR: %d cpu(s) not responding\n", 120 + ncpus - cpumask_weight(&cpus_in_crash)); 121 + 122 + /* 123 + * If we have a panic timeout set then we can't wait indefinitely 124 + * for someone to activate system reset. We also give up on the 125 + * second time through if system reset fail to work. 126 + */ 127 + if ((panic_timeout > 0) || (tries > 0)) 128 + return; 129 + 130 + /* 131 + * A system reset will cause all CPUs to take an 0x100 exception. 
132 + * The primary CPU returns here via setjmp, and the secondary 133 + * CPUs reexecute the crash_kexec_secondary path. 134 + */ 135 + old_handler = __debugger; 136 + __debugger = handle_fault; 137 + crash_shutdown_cpu = smp_processor_id(); 138 + 139 + if (setjmp(crash_shutdown_buf) == 0) { 140 + printk(KERN_EMERG "Activate system reset (dumprestart) " 141 + "to stop other cpu(s)\n"); 142 + 143 + /* 144 + * A system reset will force all CPUs to execute the 145 + * crash code again. We need to reset cpus_in_crash so we 146 + * wait for everyone to do this. 147 + */ 148 + cpus_in_crash = CPU_MASK_NONE; 149 + smp_mb(); 150 + 151 + while (cpumask_weight(&cpus_in_crash) < ncpus) 152 + cpu_relax(); 153 + } 154 + 155 + crash_shutdown_cpu = -1; 156 + __debugger = old_handler; 157 + 158 + tries++; 159 + goto again; 132 160 } 133 161 134 162 /* ··· 296 244 return rc; 297 245 } 298 246 EXPORT_SYMBOL(crash_shutdown_unregister); 299 - 300 - static unsigned long crash_shutdown_buf[JMP_BUF_LEN]; 301 - static int crash_shutdown_cpu = -1; 302 - 303 - static int handle_fault(struct pt_regs *regs) 304 - { 305 - if (crash_shutdown_cpu == smp_processor_id()) 306 - longjmp(crash_shutdown_buf, 1); 307 - return 0; 308 - } 309 247 310 248 void default_machine_crash_shutdown(struct pt_regs *regs) 311 249 {