Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

s390/mm: Reimplement lazy ASCE handling

Reduce system call overhead time (round trip time for invoking a
non-existent system call) by 25%.

With the removal of set_fs() [1], lazy control register handling was removed
in order to keep kernel entry and exit simple. However, this made system
calls slower.

With the conversion to generic entry [2] and numerous follow up changes
which simplified the entry code significantly, adding support for lazy asce
handling doesn't add much complexity to the entry code anymore.

In particular this means:

- On kernel entry the primary asce is not modified and contains the user
asce

- Kernel accesses which require secondary-space mode (for example futex
operations) are surrounded by enable_sacf_uaccess() and
disable_sacf_uaccess() calls. enable_sacf_uaccess() sets the primary asce
to kernel asce so that the sacf instruction can be used to switch to
secondary-space mode. The primary asce is changed back to user asce with
disable_sacf_uaccess().

The state of the control register which contains the primary asce is
reflected with a new TIF_ASCE_PRIMARY bit. This is required on context
switch so that the correct asce is restored for the scheduled in process.

As a result, address spaces are now set up like this:

CPU running in               | %cr1 ASCE | %cr7 ASCE | %cr13 ASCE
-----------------------------|-----------|-----------|-----------
user space                   | user      | user      | kernel
kernel (no sacf)             | user      | user      | kernel
kernel (during sacf uaccess) | kernel    | user      | kernel
kernel (kvm guest execution) | guest     | user      | kernel

As a result, the cr1 control register content is not changed, except for:
- futex system calls
- legacy s390 PCI system calls
- the kvm specific cmpxchg_user_key() uaccess helper

This leads to faster system call execution.

[1] 87d598634521 ("s390/mm: remove set_fs / rework address space handling")
[2] 56e62a737028 ("s390: convert to generic entry")

Reviewed-by: Alexander Gordeev <agordeev@linux.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>

+97 -25
+36
arch/s390/include/asm/asce.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + 3 + #ifndef _ASM_S390_ASCE_H 4 + #define _ASM_S390_ASCE_H 5 + 6 + #include <linux/thread_info.h> 7 + #include <linux/irqflags.h> 8 + #include <asm/lowcore.h> 9 + #include <asm/ctlreg.h> 10 + 11 + static inline bool enable_sacf_uaccess(void) 12 + { 13 + unsigned long flags; 14 + 15 + if (test_thread_flag(TIF_ASCE_PRIMARY)) 16 + return true; 17 + local_irq_save(flags); 18 + local_ctl_load(1, &get_lowcore()->kernel_asce); 19 + set_thread_flag(TIF_ASCE_PRIMARY); 20 + local_irq_restore(flags); 21 + return false; 22 + } 23 + 24 + static inline void disable_sacf_uaccess(bool previous) 25 + { 26 + unsigned long flags; 27 + 28 + if (previous) 29 + return; 30 + local_irq_save(flags); 31 + local_ctl_load(1, &get_lowcore()->user_asce); 32 + clear_thread_flag(TIF_ASCE_PRIMARY); 33 + local_irq_restore(flags); 34 + } 35 + 36 + #endif /* _ASM_S390_ASCE_H */
+6
arch/s390/include/asm/futex.h
··· 13 13 static uaccess_kmsan_or_inline int \ 14 14 __futex_atomic_##name(int oparg, int *old, u32 __user *uaddr) \ 15 15 { \ 16 + bool sacf_flag; \ 16 17 int rc, new; \ 17 18 \ 18 19 instrument_copy_from_user_before(old, uaddr, sizeof(*old)); \ 20 + sacf_flag = enable_sacf_uaccess(); \ 19 21 asm_inline volatile( \ 20 22 " sacf 256\n" \ 21 23 "0: l %[old],%[uaddr]\n" \ ··· 34 32 [new] "=&d" (new), [uaddr] "+Q" (*uaddr) \ 35 33 : [oparg] "d" (oparg) \ 36 34 : "cc"); \ 35 + disable_sacf_uaccess(sacf_flag); \ 37 36 if (!rc) \ 38 37 instrument_copy_from_user_after(old, uaddr, sizeof(*old), 0); \ 39 38 return rc; \ ··· 78 75 static uaccess_kmsan_or_inline 79 76 int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval, u32 newval) 80 77 { 78 + bool sacf_flag; 81 79 int rc; 82 80 83 81 instrument_copy_from_user_before(uval, uaddr, sizeof(*uval)); 82 + sacf_flag = enable_sacf_uaccess(); 84 83 asm_inline volatile( 85 84 " sacf 256\n" 86 85 "0: cs %[old],%[new],%[uaddr]\n" ··· 93 88 : [rc] "=d" (rc), [old] "+d" (oldval), [uaddr] "+Q" (*uaddr) 94 89 : [new] "d" (newval) 95 90 : "cc", "memory"); 91 + disable_sacf_uaccess(sacf_flag); 96 92 *uval = oldval; 97 93 instrument_copy_from_user_after(uval, uaddr, sizeof(*uval), 0); 98 94 return rc;
+18 -1
arch/s390/include/asm/mmu_context.h
··· 13 13 #include <linux/mm_types.h> 14 14 #include <asm/tlbflush.h> 15 15 #include <asm/ctlreg.h> 16 + #include <asm/asce.h> 16 17 #include <asm-generic/mm_hooks.h> 17 18 18 19 #define init_new_context init_new_context ··· 78 77 else 79 78 get_lowcore()->user_asce.val = next->context.asce; 80 79 cpumask_set_cpu(cpu, &next->context.cpu_attach_mask); 81 - /* Clear previous user-ASCE from CR7 */ 80 + /* Clear previous user-ASCE from CR1 and CR7 */ 81 + local_ctl_load(1, &s390_invalid_asce); 82 82 local_ctl_load(7, &s390_invalid_asce); 83 83 if (prev != next) 84 84 cpumask_clear_cpu(cpu, &prev->context.cpu_attach_mask); ··· 101 99 { 102 100 struct task_struct *tsk = current; 103 101 struct mm_struct *mm = tsk->mm; 102 + unsigned long flags; 104 103 105 104 if (mm) { 106 105 preempt_disable(); ··· 111 108 __tlb_flush_mm_lazy(mm); 112 109 preempt_enable(); 113 110 } 111 + local_irq_save(flags); 112 + if (test_thread_flag(TIF_ASCE_PRIMARY)) 113 + local_ctl_load(1, &get_lowcore()->kernel_asce); 114 + else 115 + local_ctl_load(1, &get_lowcore()->user_asce); 114 116 local_ctl_load(7, &get_lowcore()->user_asce); 117 + local_irq_restore(flags); 115 118 } 116 119 117 120 #define activate_mm activate_mm 118 121 static inline void activate_mm(struct mm_struct *prev, 119 122 struct mm_struct *next) 120 123 { 124 + unsigned long flags; 125 + 121 126 switch_mm(prev, next, current); 122 127 cpumask_set_cpu(smp_processor_id(), mm_cpumask(next)); 128 + local_irq_save(flags); 129 + if (test_thread_flag(TIF_ASCE_PRIMARY)) 130 + local_ctl_load(1, &get_lowcore()->kernel_asce); 131 + else 132 + local_ctl_load(1, &get_lowcore()->user_asce); 123 133 local_ctl_load(7, &get_lowcore()->user_asce); 134 + local_irq_restore(flags); 124 135 } 125 136 126 137 #include <asm-generic/mmu_context.h>
-1
arch/s390/include/asm/ptrace.h
··· 126 126 struct tpi_info tpi_info; 127 127 }; 128 128 unsigned long flags; 129 - unsigned long cr1; 130 129 unsigned long last_break; 131 130 }; 132 131
+2
arch/s390/include/asm/thread_info.h
··· 64 64 #define TIF_NEED_RESCHED_LAZY 3 /* lazy rescheduling needed */ 65 65 #define TIF_UPROBE 4 /* breakpointed or single-stepping */ 66 66 #define TIF_PATCH_PENDING 5 /* pending live patching update */ 67 + #define TIF_ASCE_PRIMARY 6 /* primary asce is kernel asce */ 67 68 #define TIF_NOTIFY_SIGNAL 7 /* signal notifications exist */ 68 69 #define TIF_GUARDED_STORAGE 8 /* load guarded storage control block */ 69 70 #define TIF_ISOLATE_BP_GUEST 9 /* Run KVM guests with isolated BP */ ··· 86 85 #define _TIF_NEED_RESCHED_LAZY BIT(TIF_NEED_RESCHED_LAZY) 87 86 #define _TIF_UPROBE BIT(TIF_UPROBE) 88 87 #define _TIF_PATCH_PENDING BIT(TIF_PATCH_PENDING) 88 + #define _TIF_ASCE_PRIMARY BIT(TIF_ASCE_PRIMARY) 89 89 #define _TIF_NOTIFY_SIGNAL BIT(TIF_NOTIFY_SIGNAL) 90 90 #define _TIF_GUARDED_STORAGE BIT(TIF_GUARDED_STORAGE) 91 91 #define _TIF_ISOLATE_BP_GUEST BIT(TIF_ISOLATE_BP_GUEST)
+12
arch/s390/include/asm/uaccess.h
··· 19 19 #include <asm/extable.h> 20 20 #include <asm/facility.h> 21 21 #include <asm-generic/access_ok.h> 22 + #include <asm/asce.h> 22 23 #include <linux/instrumented.h> 23 24 24 25 void debug_user_asce(int exit); ··· 479 478 __uint128_t old, __uint128_t new, 480 479 unsigned long key, int size) 481 480 { 481 + bool sacf_flag; 482 482 int rc = 0; 483 483 484 484 switch (size) { ··· 492 490 _old = ((unsigned int)old & 0xff) << shift; 493 491 _new = ((unsigned int)new & 0xff) << shift; 494 492 mask = ~(0xff << shift); 493 + sacf_flag = enable_sacf_uaccess(); 495 494 asm_inline volatile( 496 495 " spka 0(%[key])\n" 497 496 " sacf 256\n" ··· 527 524 [default_key] "J" (PAGE_DEFAULT_KEY), 528 525 [max_loops] "J" (CMPXCHG_USER_KEY_MAX_LOOPS) 529 526 : "memory", "cc"); 527 + disable_sacf_uaccess(sacf_flag); 530 528 *(unsigned char *)uval = prev >> shift; 531 529 if (!count) 532 530 rc = -EAGAIN; ··· 542 538 _old = ((unsigned int)old & 0xffff) << shift; 543 539 _new = ((unsigned int)new & 0xffff) << shift; 544 540 mask = ~(0xffff << shift); 541 + sacf_flag = enable_sacf_uaccess(); 545 542 asm_inline volatile( 546 543 " spka 0(%[key])\n" 547 544 " sacf 256\n" ··· 577 572 [default_key] "J" (PAGE_DEFAULT_KEY), 578 573 [max_loops] "J" (CMPXCHG_USER_KEY_MAX_LOOPS) 579 574 : "memory", "cc"); 575 + disable_sacf_uaccess(sacf_flag); 580 576 *(unsigned short *)uval = prev >> shift; 581 577 if (!count) 582 578 rc = -EAGAIN; ··· 586 580 case 4: { 587 581 unsigned int prev = old; 588 582 583 + sacf_flag = enable_sacf_uaccess(); 589 584 asm_inline volatile( 590 585 " spka 0(%[key])\n" 591 586 " sacf 256\n" ··· 602 595 [key] "a" (key << 4), 603 596 [default_key] "J" (PAGE_DEFAULT_KEY) 604 597 : "memory", "cc"); 598 + disable_sacf_uaccess(sacf_flag); 605 599 *(unsigned int *)uval = prev; 606 600 return rc; 607 601 } 608 602 case 8: { 609 603 unsigned long prev = old; 610 604 605 + sacf_flag = enable_sacf_uaccess(); 611 606 asm_inline volatile( 612 607 " spka 0(%[key])\n" 613 608 " sacf 
256\n" ··· 625 616 [key] "a" (key << 4), 626 617 [default_key] "J" (PAGE_DEFAULT_KEY) 627 618 : "memory", "cc"); 619 + disable_sacf_uaccess(sacf_flag); 628 620 *(unsigned long *)uval = prev; 629 621 return rc; 630 622 } 631 623 case 16: { 632 624 __uint128_t prev = old; 633 625 626 + sacf_flag = enable_sacf_uaccess(); 634 627 asm_inline volatile( 635 628 " spka 0(%[key])\n" 636 629 " sacf 256\n" ··· 648 637 [key] "a" (key << 4), 649 638 [default_key] "J" (PAGE_DEFAULT_KEY) 650 639 : "memory", "cc"); 640 + disable_sacf_uaccess(sacf_flag); 651 641 *(__uint128_t *)uval = prev; 652 642 return rc; 653 643 }
-1
arch/s390/kernel/asm-offsets.c
··· 50 50 OFFSET(__PT_ORIG_GPR2, pt_regs, orig_gpr2); 51 51 OFFSET(__PT_INT_CODE, pt_regs, int_code); 52 52 OFFSET(__PT_FLAGS, pt_regs, flags); 53 - OFFSET(__PT_CR1, pt_regs, cr1); 54 53 OFFSET(__PT_LAST_BREAK, pt_regs, last_break); 55 54 DEFINE(__PT_SIZE, sizeof(struct pt_regs)); 56 55 BLANK();
+4 -16
arch/s390/kernel/entry.S
··· 116 116 .macro SIEEXIT sie_control,lowcore 117 117 lg %r9,\sie_control # get control block pointer 118 118 ni __SIE_PROG0C+3(%r9),0xfe # no longer in SIE 119 - lctlg %c1,%c1,__LC_KERNEL_ASCE(\lowcore) # load primary asce 119 + lctlg %c1,%c1,__LC_USER_ASCE(\lowcore) # load primary asce 120 120 lg %r9,__LC_CURRENT(\lowcore) 121 121 mvi __TI_sie(%r9),0 122 122 larl %r9,sie_exit # skip forward to sie_exit ··· 208 208 lg %r14,__SF_SIE_CONTROL(%r15) # get control block pointer 209 209 ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE 210 210 GET_LC %r14 211 - lctlg %c1,%c1,__LC_KERNEL_ASCE(%r14) # load primary asce 211 + lctlg %c1,%c1,__LC_USER_ASCE(%r14) # load primary asce 212 212 lg %r14,__LC_CURRENT(%r14) 213 213 mvi __TI_sie(%r14),0 214 214 SYM_INNER_LABEL(sie_exit, SYM_L_GLOBAL) ··· 240 240 lghi %r14,0 241 241 .Lsysc_per: 242 242 STBEAR __LC_LAST_BREAK(%r13) 243 - lctlg %c1,%c1,__LC_KERNEL_ASCE(%r13) 244 243 lg %r15,__LC_KERNEL_STACK(%r13) 245 244 xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) 246 245 stmg %r0,%r7,STACK_FRAME_OVERHEAD+__PT_R0(%r15) ··· 260 261 lgr %r3,%r14 261 262 brasl %r14,__do_syscall 262 263 STACKLEAK_ERASE 263 - lctlg %c1,%c1,__LC_USER_ASCE(%r13) 264 264 mvc __LC_RETURN_PSW(16,%r13),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) 265 265 BPON 266 266 LBEAR STACK_FRAME_OVERHEAD+__PT_LAST_BREAK(%r15) ··· 276 278 brasl %r14,__ret_from_fork 277 279 STACKLEAK_ERASE 278 280 GET_LC %r13 279 - lctlg %c1,%c1,__LC_USER_ASCE(%r13) 280 281 mvc __LC_RETURN_PSW(16,%r13),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) 281 282 BPON 282 283 LBEAR STACK_FRAME_OVERHEAD+__PT_LAST_BREAK(%r15) ··· 296 299 lmg %r8,%r9,__LC_PGM_OLD_PSW(%r13) 297 300 xgr %r10,%r10 298 301 tmhh %r8,0x0001 # coming from user space? 
299 - jno .Lpgm_skip_asce 300 - lctlg %c1,%c1,__LC_KERNEL_ASCE(%r13) 301 - j 3f # -> fault in user space 302 - .Lpgm_skip_asce: 302 + jo 3f # -> fault in user space 303 303 #if IS_ENABLED(CONFIG_KVM) 304 304 lg %r11,__LC_CURRENT(%r13) 305 305 tm __TI_sie(%r11),0xff ··· 334 340 tmhh %r8,0x0001 # returning to user space? 335 341 jno .Lpgm_exit_kernel 336 342 STACKLEAK_ERASE 337 - lctlg %c1,%c1,__LC_USER_ASCE(%r13) 338 343 BPON 339 344 stpt __LC_EXIT_TIMER(%r13) 340 345 .Lpgm_exit_kernel: ··· 377 384 #endif 378 385 0: aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) 379 386 j 2f 380 - 1: lctlg %c1,%c1,__LC_KERNEL_ASCE(%r13) 381 - lg %r15,__LC_KERNEL_STACK(%r13) 387 + 1: lg %r15,__LC_KERNEL_STACK(%r13) 382 388 2: xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) 383 389 la %r11,STACK_FRAME_OVERHEAD(%r15) 384 390 stmg %r0,%r7,__PT_R0(%r11) ··· 400 408 tmhh %r8,0x0001 # returning to user ? 401 409 jno 2f 402 410 STACKLEAK_ERASE 403 - lctlg %c1,%c1,__LC_USER_ASCE(%r13) 404 411 BPON 405 412 stpt __LC_EXIT_TIMER(%r13) 406 413 2: LBEAR __PT_LAST_BREAK(%r11) ··· 467 476 .Lmcck_user: 468 477 lg %r15,__LC_MCCK_STACK(%r13) 469 478 la %r11,STACK_FRAME_OVERHEAD(%r15) 470 - stctg %c1,%c1,__PT_CR1(%r11) 471 - lctlg %c1,%c1,__LC_KERNEL_ASCE(%r13) 472 479 xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) 473 480 lay %r14,__LC_GPREGS_SAVE_AREA(%r13) 474 481 mvc __PT_R0(128,%r11),0(%r14) ··· 484 495 xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) 485 496 lgr %r2,%r11 # pass pointer to pt_regs 486 497 brasl %r14,s390_do_machine_check 487 - lctlg %c1,%c1,__PT_CR1(%r11) 488 498 lmg %r0,%r10,__PT_R0(%r11) 489 499 mvc __LC_RETURN_MCCK_PSW(16,%r13),__PT_PSW(%r11) # move return PSW 490 500 tm __LC_RETURN_MCCK_PSW+1(%r13),0x01 # returning to user ?
+1 -1
arch/s390/kernel/smp.c
··· 263 263 abs_lc = get_abs_lowcore(); 264 264 memcpy(lc->cregs_save_area, abs_lc->cregs_save_area, sizeof(lc->cregs_save_area)); 265 265 put_abs_lowcore(abs_lc); 266 - lc->cregs_save_area[1] = lc->kernel_asce; 266 + lc->cregs_save_area[1] = lc->user_asce; 267 267 lc->cregs_save_area[7] = lc->user_asce; 268 268 save_access_regs((unsigned int *) lc->access_regs_save_area); 269 269 arch_spin_lock_setup(cpu);
+3 -2
arch/s390/lib/uaccess.c
··· 17 17 #ifdef CONFIG_DEBUG_ENTRY 18 18 void debug_user_asce(int exit) 19 19 { 20 + struct lowcore *lc = get_lowcore(); 20 21 struct ctlreg cr1, cr7; 21 22 22 23 local_ctl_store(1, &cr1); 23 24 local_ctl_store(7, &cr7); 24 - if (cr1.val == get_lowcore()->kernel_asce.val && cr7.val == get_lowcore()->user_asce.val) 25 + if (cr1.val == lc->user_asce.val && cr7.val == lc->user_asce.val) 25 26 return; 26 27 panic("incorrect ASCE on kernel %s\n" 27 28 "cr1: %016lx cr7: %016lx\n" 28 29 "kernel: %016lx user: %016lx\n", 29 30 exit ? "exit" : "entry", cr1.val, cr7.val, 30 - get_lowcore()->kernel_asce.val, get_lowcore()->user_asce.val); 31 + lc->kernel_asce.val, lc->user_asce.val); 31 32 } 32 33 #endif /*CONFIG_DEBUG_ENTRY */ 33 34
+6 -2
arch/s390/mm/pgalloc.c
··· 38 38 static void __crst_table_upgrade(void *arg) 39 39 { 40 40 struct mm_struct *mm = arg; 41 + struct ctlreg asce; 41 42 43 + asce.val = mm->context.asce; 42 44 /* change all active ASCEs to avoid the creation of new TLBs */ 43 45 if (current->active_mm == mm) { 44 - get_lowcore()->user_asce.val = mm->context.asce; 45 - local_ctl_load(7, &get_lowcore()->user_asce); 46 + get_lowcore()->user_asce = asce; 47 + local_ctl_load(7, &asce); 48 + if (!test_thread_flag(TIF_ASCE_PRIMARY)) 49 + local_ctl_load(1, &asce); 46 50 } 47 51 __tlb_flush_local(); 48 52 }
+9 -1
arch/s390/pci/pci_mmio.c
··· 32 32 u64 len, u8 *status) 33 33 { 34 34 int cc, exception; 35 + bool sacf_flag; 35 36 36 37 exception = 1; 38 + sacf_flag = enable_sacf_uaccess(); 37 39 asm_inline volatile ( 38 40 " sacf 256\n" 39 41 "0: .insn rsy,0xeb00000000d4,%[len],%[ioaddr],%[src]\n" ··· 46 44 : CC_OUT(cc, cc), [len] "+d" (len), [exc] "+d" (exception) 47 45 : [ioaddr] "a" (ioaddr), [src] "Q" (*((u8 __force *)src)) 48 46 : CC_CLOBBER_LIST("memory")); 47 + disable_sacf_uaccess(sacf_flag); 49 48 *status = len >> 24 & 0xff; 50 49 return exception ? -ENXIO : CC_TRANSFORM(cc); 51 50 } ··· 57 54 { 58 55 union register_pair ioaddr_len = {.even = (u64 __force)ioaddr, .odd = ulen}; 59 56 int cc, exception; 57 + bool sacf_flag; 60 58 u64 val = 0; 61 59 u64 cnt = ulen; 62 60 u8 tmp; ··· 68 64 * address space. pcistg then uses the user mappings. 69 65 */ 70 66 exception = 1; 67 + sacf_flag = enable_sacf_uaccess(); 71 68 asm_inline volatile ( 72 69 " sacf 256\n" 73 70 "0: llgc %[tmp],0(%[src])\n" ··· 86 81 CC_OUT(cc, cc), [ioaddr_len] "+&d" (ioaddr_len.pair) 87 82 : 88 83 : CC_CLOBBER_LIST("memory")); 84 + disable_sacf_uaccess(sacf_flag); 89 85 *status = ioaddr_len.odd >> 24 & 0xff; 90 86 91 87 cc = exception ? -ENXIO : CC_TRANSFORM(cc); ··· 210 204 u64 ulen, u8 *status) 211 205 { 212 206 union register_pair ioaddr_len = {.even = (u64 __force)ioaddr, .odd = ulen}; 207 + bool sacf_flag; 213 208 u64 cnt = ulen; 214 209 int shift = ulen * 8; 215 210 int cc, exception; ··· 222 215 * user address @dst 223 216 */ 224 217 exception = 1; 218 + sacf_flag = enable_sacf_uaccess(); 225 219 asm_inline volatile ( 226 220 " sacf 256\n" 227 221 "0: .insn rre,0xb9d60000,%[val],%[ioaddr_len]\n" ··· 247 239 [shift] "+d" (shift) 248 240 : 249 241 : CC_CLOBBER_LIST("memory")); 250 - 242 + disable_sacf_uaccess(sacf_flag); 251 243 cc = exception ? -ENXIO : CC_TRANSFORM(cc); 252 244 /* did we write everything to the user space buffer? */ 253 245 if (!cc && cnt != 0)