Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

[S390] pfault: cpu hotplug vs missing completion interrupts

On cpu hot remove, a PFAULT CANCEL command is sent to the hypervisor,
which in turn will cancel all outstanding pfault requests that have
been issued on that cpu (the same happens with a SIGP cpu reset).

The result is that we end up with uninterruptible processes where
the interrupt that would wake up these processes never arrives.

In order to solve this, all processes that wait for a pfault
completion interrupt get woken up after a cpu hot remove. The worst
case that could happen is that they fault again and in turn need to
wait again.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>

authored by

Heiko Carstens and committed by
Martin Schwidefsky
f2db2e6c b456d94a

+71 -26
+2 -2
arch/s390/include/asm/lowcore.h
··· 124 124 /* Address space pointer. */ 125 125 __u32 kernel_asce; /* 0x02ac */ 126 126 __u32 user_asce; /* 0x02b0 */ 127 - __u8 pad_0x02b4[0x02b8-0x02b4]; /* 0x02b4 */ 127 + __u32 current_pid; /* 0x02b4 */ 128 128 129 129 /* SMP info area */ 130 130 __u32 cpu_nr; /* 0x02b8 */ ··· 255 255 /* Address space pointer. */ 256 256 __u64 kernel_asce; /* 0x0310 */ 257 257 __u64 user_asce; /* 0x0318 */ 258 - __u8 pad_0x0320[0x0328-0x0320]; /* 0x0320 */ 258 + __u64 current_pid; /* 0x0320 */ 259 259 260 260 /* SMP info area */ 261 261 __u32 cpu_nr; /* 0x0328 */
+1
arch/s390/include/asm/processor.h
··· 84 84 struct per_event per_event; /* Cause of the last PER trap */ 85 85 /* pfault_wait is used to block the process on a pfault event */ 86 86 unsigned long pfault_wait; 87 + struct list_head list; 87 88 }; 88 89 89 90 typedef struct thread_struct thread_struct;
+1
arch/s390/kernel/asm-offsets.c
··· 124 124 DEFINE(__LC_LAST_UPDATE_TIMER, offsetof(struct _lowcore, last_update_timer)); 125 125 DEFINE(__LC_LAST_UPDATE_CLOCK, offsetof(struct _lowcore, last_update_clock)); 126 126 DEFINE(__LC_CURRENT, offsetof(struct _lowcore, current_task)); 127 + DEFINE(__LC_CURRENT_PID, offsetof(struct _lowcore, current_pid)); 127 128 DEFINE(__LC_THREAD_INFO, offsetof(struct _lowcore, thread_info)); 128 129 DEFINE(__LC_KERNEL_STACK, offsetof(struct _lowcore, kernel_stack)); 129 130 DEFINE(__LC_ASYNC_STACK, offsetof(struct _lowcore, async_stack));
+1
arch/s390/kernel/entry.S
··· 212 212 lctl %c4,%c4,__TASK_pid(%r3) # load pid to control reg. 4 213 213 lm %r6,%r15,__SF_GPRS(%r15) # load gprs of next task 214 214 st %r3,__LC_CURRENT # store task struct of next 215 + mvc __LC_CURRENT_PID(4,%r0),__TASK_pid(%r3) # store pid of next 215 216 st %r5,__LC_THREAD_INFO # store thread info of next 216 217 ahi %r5,STACK_SIZE # end of kernel stack of next 217 218 st %r5,__LC_KERNEL_STACK # store end of kernel stack
+1
arch/s390/kernel/entry64.S
··· 220 220 lctl %c4,%c4,__TASK_pid(%r3) # load pid to control reg. 4 221 221 lmg %r6,%r15,__SF_GPRS(%r15) # load gprs of next task 222 222 stg %r3,__LC_CURRENT # store task struct of next 223 + mvc __LC_CURRENT_PID+4(4,%r0),__TASK_pid(%r3) # store pid of next 223 224 stg %r5,__LC_THREAD_INFO # store thread info of next 224 225 aghi %r5,STACK_SIZE # end of kernel stack of next 225 226 stg %r5,__LC_KERNEL_STACK # store end of kernel stack
+65 -24
arch/s390/mm/fault.c
··· 466 466 int pfault_init(void) 467 467 { 468 468 pfault_refbk_t refbk = 469 - { 0x258, 0, 5, 2, __LC_CURRENT, 1ULL << 48, 1ULL << 48, 469 + { 0x258, 0, 5, 2, __LC_CURRENT_PID, 1ULL << 48, 1ULL << 48, 470 470 __PF_RES_FIELD }; 471 471 int rc; 472 472 ··· 498 498 : : "a" (&refbk), "m" (refbk) : "cc"); 499 499 } 500 500 501 + static DEFINE_SPINLOCK(pfault_lock); 502 + static LIST_HEAD(pfault_list); 503 + 501 504 static void pfault_interrupt(unsigned int ext_int_code, 502 505 unsigned int param32, unsigned long param64) 503 506 { 504 507 struct task_struct *tsk; 505 508 __u16 subcode; 509 + pid_t pid; 506 510 507 511 /* 508 512 * Get the external interruption subcode & pfault ··· 518 514 if ((subcode & 0xff00) != __SUBCODE_MASK) 519 515 return; 520 516 kstat_cpu(smp_processor_id()).irqs[EXTINT_PFL]++; 521 - 522 - /* 523 - * Get the token (= address of the task structure of the affected task). 524 - */ 525 - #ifdef CONFIG_64BIT 526 - tsk = (struct task_struct *) param64; 527 - #else 528 - tsk = (struct task_struct *) param32; 529 - #endif 530 - 517 + if (subcode & 0x0080) { 518 + /* Get the token (= pid of the affected task). */ 519 + pid = sizeof(void *) == 4 ? param32 : param64; 520 + rcu_read_lock(); 521 + tsk = find_task_by_pid_ns(pid, &init_pid_ns); 522 + if (tsk) 523 + get_task_struct(tsk); 524 + rcu_read_unlock(); 525 + if (!tsk) 526 + return; 527 + } else { 528 + tsk = current; 529 + } 530 + spin_lock(&pfault_lock); 531 531 if (subcode & 0x0080) { 532 532 /* signal bit is set -> a page has been swapped in by VM */ 533 - if (xchg(&tsk->thread.pfault_wait, -1) != 0) { 533 + if (tsk->thread.pfault_wait == 1) { 534 534 /* Initial interrupt was faster than the completion 535 535 * interrupt. pfault_wait is valid. Set pfault_wait 536 536 * back to zero and wake up the process. This can 537 537 * safely be done because the task is still sleeping 538 538 * and can't produce new pfaults. 
*/ 539 539 tsk->thread.pfault_wait = 0; 540 + list_del(&tsk->thread.list); 540 541 wake_up_process(tsk); 541 - put_task_struct(tsk); 542 + } else { 543 + /* Completion interrupt was faster than initial 544 + * interrupt. Set pfault_wait to -1 so the initial 545 + * interrupt doesn't put the task to sleep. */ 546 + tsk->thread.pfault_wait = -1; 542 547 } 548 + put_task_struct(tsk); 543 549 } else { 544 550 /* signal bit not set -> a real page is missing. */ 545 - get_task_struct(tsk); 546 - set_task_state(tsk, TASK_UNINTERRUPTIBLE); 547 - if (xchg(&tsk->thread.pfault_wait, 1) != 0) { 551 + if (tsk->thread.pfault_wait == -1) { 548 552 /* Completion interrupt was faster than the initial 549 - * interrupt (swapped in a -1 for pfault_wait). Set 550 - * pfault_wait back to zero and exit. This can be 551 - * done safely because tsk is running in kernel 552 - * mode and can't produce new pfaults. */ 553 + * interrupt (pfault_wait == -1). Set pfault_wait 554 + * back to zero and exit. */ 553 555 tsk->thread.pfault_wait = 0; 554 - set_task_state(tsk, TASK_RUNNING); 555 - put_task_struct(tsk); 556 - } else 556 + } else { 557 + /* Initial interrupt arrived before completion 558 + * interrupt. Let the task sleep. 
*/ 559 + tsk->thread.pfault_wait = 1; 560 + list_add(&tsk->thread.list, &pfault_list); 561 + set_task_state(tsk, TASK_UNINTERRUPTIBLE); 557 562 set_tsk_need_resched(tsk); 563 + } 558 564 } 565 + spin_unlock(&pfault_lock); 566 + } 567 + 568 + static int __cpuinit pfault_cpu_notify(struct notifier_block *self, 569 + unsigned long action, void *hcpu) 570 + { 571 + struct thread_struct *thread, *next; 572 + struct task_struct *tsk; 573 + 574 + switch (action) { 575 + case CPU_DEAD: 576 + case CPU_DEAD_FROZEN: 577 + spin_lock_irq(&pfault_lock); 578 + list_for_each_entry_safe(thread, next, &pfault_list, list) { 579 + thread->pfault_wait = 0; 580 + list_del(&thread->list); 581 + tsk = container_of(thread, struct task_struct, thread); 582 + wake_up_process(tsk); 583 + } 584 + spin_unlock_irq(&pfault_lock); 585 + break; 586 + default: 587 + break; 588 + } 589 + return NOTIFY_OK; 559 590 } 560 591 561 592 static int __init pfault_irq_init(void) ··· 607 568 pfault_disable = 1; 608 569 return rc; 609 570 } 610 - if (pfault_init() == 0) 571 + if (pfault_init() == 0) { 572 + hotcpu_notifier(pfault_cpu_notify, 0); 611 573 return 0; 574 + } 612 575 613 576 /* Tough luck, no pfault. */ 614 577 pfault_disable = 1;