Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at v2.6.26 541 lines 13 kB view raw
1/* 2 * drivers/s390/s390mach.c 3 * S/390 machine check handler 4 * 5 * S390 version 6 * Copyright (C) 2000 IBM Deutschland Entwicklung GmbH, IBM Corporation 7 * Author(s): Ingo Adlung (adlung@de.ibm.com) 8 * Martin Schwidefsky (schwidefsky@de.ibm.com) 9 */ 10 11#include <linux/init.h> 12#include <linux/sched.h> 13#include <linux/errno.h> 14#include <linux/workqueue.h> 15#include <linux/time.h> 16#include <linux/device.h> 17#include <linux/kthread.h> 18#include <asm/etr.h> 19#include <asm/lowcore.h> 20#include <asm/cio.h> 21#include "cio/cio.h" 22#include "cio/chsc.h" 23#include "cio/css.h" 24#include "cio/chp.h" 25#include "s390mach.h" 26 27static struct semaphore m_sem; 28 29static NORET_TYPE void 30s390_handle_damage(char *msg) 31{ 32#ifdef CONFIG_SMP 33 smp_send_stop(); 34#endif 35 disabled_wait((unsigned long) __builtin_return_address(0)); 36 for(;;); 37} 38 39/* 40 * Retrieve CRWs and call function to handle event. 41 * 42 * Note : we currently process CRWs for io and chsc subchannels only 43 */ 44static int 45s390_collect_crw_info(void *param) 46{ 47 struct crw crw[2]; 48 int ccode; 49 struct semaphore *sem; 50 unsigned int chain; 51 int ignore; 52 53 sem = (struct semaphore *)param; 54repeat: 55 ignore = down_interruptible(sem); 56 chain = 0; 57 while (1) { 58 if (unlikely(chain > 1)) { 59 struct crw tmp_crw; 60 61 printk(KERN_WARNING"%s: Code does not support more " 62 "than two chained crws; please report to " 63 "linux390@de.ibm.com!\n", __func__); 64 ccode = stcrw(&tmp_crw); 65 printk(KERN_WARNING"%s: crw reports slct=%d, oflw=%d, " 66 "chn=%d, rsc=%X, anc=%d, erc=%X, rsid=%X\n", 67 __func__, tmp_crw.slct, tmp_crw.oflw, 68 tmp_crw.chn, tmp_crw.rsc, tmp_crw.anc, 69 tmp_crw.erc, tmp_crw.rsid); 70 printk(KERN_WARNING"%s: This was crw number %x in the " 71 "chain\n", __func__, chain); 72 if (ccode != 0) 73 break; 74 chain = tmp_crw.chn ? chain + 1 : 0; 75 continue; 76 } 77 ccode = stcrw(&crw[chain]); 78 if (ccode != 0) 79 break; 80 printk(KERN_DEBUG "crw_info : CRW reports slct=%d, oflw=%d, " 81 "chn=%d, rsc=%X, anc=%d, erc=%X, rsid=%X\n", 82 crw[chain].slct, crw[chain].oflw, crw[chain].chn, 83 crw[chain].rsc, crw[chain].anc, crw[chain].erc, 84 crw[chain].rsid); 85 /* Check for overflows. */ 86 if (crw[chain].oflw) { 87 pr_debug("%s: crw overflow detected!\n", __func__); 88 css_schedule_eval_all(); 89 chain = 0; 90 continue; 91 } 92 switch (crw[chain].rsc) { 93 case CRW_RSC_SCH: 94 if (crw[0].chn && !chain) 95 break; 96 pr_debug("source is subchannel %04X\n", crw[0].rsid); 97 css_process_crw(crw[0].rsid, chain ? crw[1].rsid : 0); 98 break; 99 case CRW_RSC_MONITOR: 100 pr_debug("source is monitoring facility\n"); 101 break; 102 case CRW_RSC_CPATH: 103 pr_debug("source is channel path %02X\n", crw[0].rsid); 104 /* 105 * Check for solicited machine checks. These are 106 * created by reset channel path and need not be 107 * reported to the common I/O layer. 108 */ 109 if (crw[chain].slct) { 110 pr_debug("solicited machine check for " 111 "channel path %02X\n", crw[0].rsid); 112 break; 113 } 114 switch (crw[0].erc) { 115 case CRW_ERC_IPARM: /* Path has come. */ 116 chp_process_crw(crw[0].rsid, 1); 117 break; 118 case CRW_ERC_PERRI: /* Path has gone. */ 119 case CRW_ERC_PERRN: 120 chp_process_crw(crw[0].rsid, 0); 121 break; 122 default: 123 pr_debug("Don't know how to handle erc=%x\n", 124 crw[0].erc); 125 } 126 break; 127 case CRW_RSC_CONFIG: 128 pr_debug("source is configuration-alert facility\n"); 129 break; 130 case CRW_RSC_CSS: 131 pr_debug("source is channel subsystem\n"); 132 chsc_process_crw(); 133 break; 134 default: 135 pr_debug("unknown source\n"); 136 break; 137 } 138 /* chain is always 0 or 1 here. */ 139 chain = crw[chain].chn ? chain + 1 : 0; 140 } 141 goto repeat; 142 return 0; 143} 144 145struct mcck_struct { 146 int kill_task; 147 int channel_report; 148 int warning; 149 unsigned long long mcck_code; 150}; 151 152static DEFINE_PER_CPU(struct mcck_struct, cpu_mcck); 153 154/* 155 * Main machine check handler function. Will be called with interrupts enabled 156 * or disabled and machine checks enabled or disabled. 157 */ 158void 159s390_handle_mcck(void) 160{ 161 unsigned long flags; 162 struct mcck_struct mcck; 163 164 /* 165 * Disable machine checks and get the current state of accumulated 166 * machine checks. Afterwards delete the old state and enable machine 167 * checks again. 168 */ 169 local_irq_save(flags); 170 local_mcck_disable(); 171 mcck = __get_cpu_var(cpu_mcck); 172 memset(&__get_cpu_var(cpu_mcck), 0, sizeof(struct mcck_struct)); 173 clear_thread_flag(TIF_MCCK_PENDING); 174 local_mcck_enable(); 175 local_irq_restore(flags); 176 177 if (mcck.channel_report) 178 up(&m_sem); 179 180#ifdef CONFIG_MACHCHK_WARNING 181/* 182 * The warning may remain for a prolonged period on the bare iron. 183 * (actually till the machine is powered off, or until the problem is gone) 184 * So we just stop listening for the WARNING MCH and prevent continuously 185 * being interrupted. One caveat is however, that we must do this per 186 * processor and cannot use the smp version of ctl_clear_bit(). 187 * On VM we only get one interrupt per virtally presented machinecheck. 188 * Though one suffices, we may get one interrupt per (virtual) processor. 189 */ 190 if (mcck.warning) { /* WARNING pending ? */ 191 static int mchchk_wng_posted = 0; 192 /* 193 * Use single machine clear, as we cannot handle smp right now 194 */ 195 __ctl_clear_bit(14, 24); /* Disable WARNING MCH */ 196 if (xchg(&mchchk_wng_posted, 1) == 0) 197 kill_cad_pid(SIGPWR, 1); 198 } 199#endif 200 201 if (mcck.kill_task) { 202 local_irq_enable(); 203 printk(KERN_EMERG "mcck: Terminating task because of machine " 204 "malfunction (code 0x%016llx).\n", mcck.mcck_code); 205 printk(KERN_EMERG "mcck: task: %s, pid: %d.\n", 206 current->comm, current->pid); 207 do_exit(SIGSEGV); 208 } 209} 210EXPORT_SYMBOL_GPL(s390_handle_mcck); 211 212/* 213 * returns 0 if all registers could be validated 214 * returns 1 otherwise 215 */ 216static int 217s390_revalidate_registers(struct mci *mci) 218{ 219 int kill_task; 220 u64 tmpclock; 221 u64 zero; 222 void *fpt_save_area, *fpt_creg_save_area; 223 224 kill_task = 0; 225 zero = 0; 226 /* General purpose registers */ 227 if (!mci->gr) 228 /* 229 * General purpose registers couldn't be restored and have 230 * unknown contents. Process needs to be terminated. 231 */ 232 kill_task = 1; 233 234 /* Revalidate floating point registers */ 235 if (!mci->fp) 236 /* 237 * Floating point registers can't be restored and 238 * therefore the process needs to be terminated. 239 */ 240 kill_task = 1; 241 242#ifndef CONFIG_64BIT 243 asm volatile( 244 " ld 0,0(%0)\n" 245 " ld 2,8(%0)\n" 246 " ld 4,16(%0)\n" 247 " ld 6,24(%0)" 248 : : "a" (&S390_lowcore.floating_pt_save_area)); 249#endif 250 251 if (MACHINE_HAS_IEEE) { 252#ifdef CONFIG_64BIT 253 fpt_save_area = &S390_lowcore.floating_pt_save_area; 254 fpt_creg_save_area = &S390_lowcore.fpt_creg_save_area; 255#else 256 fpt_save_area = (void *) S390_lowcore.extended_save_area_addr; 257 fpt_creg_save_area = fpt_save_area+128; 258#endif 259 /* Floating point control register */ 260 if (!mci->fc) { 261 /* 262 * Floating point control register can't be restored. 263 * Task will be terminated. 264 */ 265 asm volatile("lfpc 0(%0)" : : "a" (&zero), "m" (zero)); 266 kill_task = 1; 267 268 } else 269 asm volatile("lfpc 0(%0)" : : "a" (fpt_creg_save_area)); 270 271 asm volatile( 272 " ld 0,0(%0)\n" 273 " ld 1,8(%0)\n" 274 " ld 2,16(%0)\n" 275 " ld 3,24(%0)\n" 276 " ld 4,32(%0)\n" 277 " ld 5,40(%0)\n" 278 " ld 6,48(%0)\n" 279 " ld 7,56(%0)\n" 280 " ld 8,64(%0)\n" 281 " ld 9,72(%0)\n" 282 " ld 10,80(%0)\n" 283 " ld 11,88(%0)\n" 284 " ld 12,96(%0)\n" 285 " ld 13,104(%0)\n" 286 " ld 14,112(%0)\n" 287 " ld 15,120(%0)\n" 288 : : "a" (fpt_save_area)); 289 } 290 291 /* Revalidate access registers */ 292 asm volatile( 293 " lam 0,15,0(%0)" 294 : : "a" (&S390_lowcore.access_regs_save_area)); 295 if (!mci->ar) 296 /* 297 * Access registers have unknown contents. 298 * Terminating task. 299 */ 300 kill_task = 1; 301 302 /* Revalidate control registers */ 303 if (!mci->cr) 304 /* 305 * Control registers have unknown contents. 306 * Can't recover and therefore stopping machine. 307 */ 308 s390_handle_damage("invalid control registers."); 309 else 310#ifdef CONFIG_64BIT 311 asm volatile( 312 " lctlg 0,15,0(%0)" 313 : : "a" (&S390_lowcore.cregs_save_area)); 314#else 315 asm volatile( 316 " lctl 0,15,0(%0)" 317 : : "a" (&S390_lowcore.cregs_save_area)); 318#endif 319 320 /* 321 * We don't even try to revalidate the TOD register, since we simply 322 * can't write something sensible into that register. 323 */ 324 325#ifdef CONFIG_64BIT 326 /* 327 * See if we can revalidate the TOD programmable register with its 328 * old contents (should be zero) otherwise set it to zero. 329 */ 330 if (!mci->pr) 331 asm volatile( 332 " sr 0,0\n" 333 " sckpf" 334 : : : "0", "cc"); 335 else 336 asm volatile( 337 " l 0,0(%0)\n" 338 " sckpf" 339 : : "a" (&S390_lowcore.tod_progreg_save_area) 340 : "0", "cc"); 341#endif 342 343 /* Revalidate clock comparator register */ 344 asm volatile( 345 " stck 0(%1)\n" 346 " sckc 0(%1)" 347 : "=m" (tmpclock) : "a" (&(tmpclock)) : "cc", "memory"); 348 349 /* Check if old PSW is valid */ 350 if (!mci->wp) 351 /* 352 * Can't tell if we come from user or kernel mode 353 * -> stopping machine. 354 */ 355 s390_handle_damage("old psw invalid."); 356 357 if (!mci->ms || !mci->pm || !mci->ia) 358 kill_task = 1; 359 360 return kill_task; 361} 362 363#define MAX_IPD_COUNT 29 364#define MAX_IPD_TIME (5 * 60 * USEC_PER_SEC) /* 5 minutes */ 365 366/* 367 * machine check handler. 368 */ 369void 370s390_do_machine_check(struct pt_regs *regs) 371{ 372 static DEFINE_SPINLOCK(ipd_lock); 373 static unsigned long long last_ipd; 374 static int ipd_count; 375 unsigned long long tmp; 376 struct mci *mci; 377 struct mcck_struct *mcck; 378 int umode; 379 380 lockdep_off(); 381 382 mci = (struct mci *) &S390_lowcore.mcck_interruption_code; 383 mcck = &__get_cpu_var(cpu_mcck); 384 umode = user_mode(regs); 385 386 if (mci->sd) 387 /* System damage -> stopping machine */ 388 s390_handle_damage("received system damage machine check."); 389 390 if (mci->pd) { 391 if (mci->b) { 392 /* Processing backup -> verify if we can survive this */ 393 u64 z_mcic, o_mcic, t_mcic; 394#ifdef CONFIG_64BIT 395 z_mcic = (1ULL<<63 | 1ULL<<59 | 1ULL<<29); 396 o_mcic = (1ULL<<43 | 1ULL<<42 | 1ULL<<41 | 1ULL<<40 | 397 1ULL<<36 | 1ULL<<35 | 1ULL<<34 | 1ULL<<32 | 398 1ULL<<30 | 1ULL<<21 | 1ULL<<20 | 1ULL<<17 | 399 1ULL<<16); 400#else 401 z_mcic = (1ULL<<63 | 1ULL<<59 | 1ULL<<57 | 1ULL<<50 | 402 1ULL<<29); 403 o_mcic = (1ULL<<43 | 1ULL<<42 | 1ULL<<41 | 1ULL<<40 | 404 1ULL<<36 | 1ULL<<35 | 1ULL<<34 | 1ULL<<32 | 405 1ULL<<30 | 1ULL<<20 | 1ULL<<17 | 1ULL<<16); 406#endif 407 t_mcic = *(u64 *)mci; 408 409 if (((t_mcic & z_mcic) != 0) || 410 ((t_mcic & o_mcic) != o_mcic)) { 411 s390_handle_damage("processing backup machine " 412 "check with damage."); 413 } 414 415 /* 416 * Nullifying exigent condition, therefore we might 417 * retry this instruction. 418 */ 419 420 spin_lock(&ipd_lock); 421 422 tmp = get_clock(); 423 424 if (((tmp - last_ipd) >> 12) < MAX_IPD_TIME) 425 ipd_count++; 426 else 427 ipd_count = 1; 428 429 last_ipd = tmp; 430 431 if (ipd_count == MAX_IPD_COUNT) 432 s390_handle_damage("too many ipd retries."); 433 434 spin_unlock(&ipd_lock); 435 } 436 else { 437 /* Processing damage -> stopping machine */ 438 s390_handle_damage("received instruction processing " 439 "damage machine check."); 440 } 441 } 442 if (s390_revalidate_registers(mci)) { 443 if (umode) { 444 /* 445 * Couldn't restore all register contents while in 446 * user mode -> mark task for termination. 447 */ 448 mcck->kill_task = 1; 449 mcck->mcck_code = *(unsigned long long *) mci; 450 set_thread_flag(TIF_MCCK_PENDING); 451 } 452 else 453 /* 454 * Couldn't restore all register contents while in 455 * kernel mode -> stopping machine. 456 */ 457 s390_handle_damage("unable to revalidate registers."); 458 } 459 460 if (mci->cd) { 461 /* Timing facility damage */ 462 s390_handle_damage("TOD clock damaged"); 463 } 464 465 if (mci->ed && mci->ec) { 466 /* External damage */ 467 if (S390_lowcore.external_damage_code & (1U << ED_ETR_SYNC)) 468 etr_sync_check(); 469 if (S390_lowcore.external_damage_code & (1U << ED_ETR_SWITCH)) 470 etr_switch_to_local(); 471 } 472 473 if (mci->se) 474 /* Storage error uncorrected */ 475 s390_handle_damage("received storage error uncorrected " 476 "machine check."); 477 478 if (mci->ke) 479 /* Storage key-error uncorrected */ 480 s390_handle_damage("received storage key-error uncorrected " 481 "machine check."); 482 483 if (mci->ds && mci->fa) 484 /* Storage degradation */ 485 s390_handle_damage("received storage degradation machine " 486 "check."); 487 488 if (mci->cp) { 489 /* Channel report word pending */ 490 mcck->channel_report = 1; 491 set_thread_flag(TIF_MCCK_PENDING); 492 } 493 494 if (mci->w) { 495 /* Warning pending */ 496 mcck->warning = 1; 497 set_thread_flag(TIF_MCCK_PENDING); 498 } 499 lockdep_on(); 500} 501 502/* 503 * s390_init_machine_check 504 * 505 * initialize machine check handling 506 */ 507static int 508machine_check_init(void) 509{ 510 init_MUTEX_LOCKED(&m_sem); 511 ctl_set_bit(14, 25); /* enable external damage MCH */ 512 ctl_set_bit(14, 27); /* enable system recovery MCH */ 513#ifdef CONFIG_MACHCHK_WARNING 514 ctl_set_bit(14, 24); /* enable warning MCH */ 515#endif 516 return 0; 517} 518 519/* 520 * Initialize the machine check handler really early to be able to 521 * catch all machine checks that happen during boot 522 */ 523arch_initcall(machine_check_init); 524 525/* 526 * Machine checks for the channel subsystem must be enabled 527 * after the channel subsystem is initialized 528 */ 529static int __init 530machine_check_crw_init (void) 531{ 532 struct task_struct *task; 533 534 task = kthread_run(s390_collect_crw_info, &m_sem, "kmcheck"); 535 if (IS_ERR(task)) 536 return PTR_ERR(task); 537 ctl_set_bit(14, 28); /* enable channel report MCH */ 538 return 0; 539} 540 541device_initcall (machine_check_crw_init);