Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

x86, entry: Switch stacks on a paranoid entry from userspace

This causes all non-NMI, non-double-fault kernel entries from
userspace to run on the normal kernel stack. Double-fault is
exempt to minimize confusion if we double-fault directly from
userspace due to a bad kernel stack.

This is, surprisingly, simpler and shorter than the current code. It
removes the IMO rather frightening paranoid_userspace path, and it
makes sync_regs much simpler.

There is no risk of stack overflow due to this change -- the kernel
stack that we switch to is empty.

This will also enable us to create non-atomic sections within
machine checks from userspace, which will simplify memory failure
handling. It will also allow the upcoming fsgsbase code to be
simplified, because it doesn't need to worry about usergs when
scheduling in paranoid_exit, as that code no longer exists.

Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Tony Luck <tony.luck@intel.com>
Acked-by: Borislav Petkov <bp@alien8.de>
Signed-off-by: Andy Lutomirski <luto@amacapital.net>

+67 -68
+12 -6
Documentation/x86/entry_64.txt
··· 78 78 xorl %ebx,%ebx 79 79 1: ret 80 80 81 - and the whole paranoid non-paranoid macro complexity is about whether 82 - to suffer that RDMSR cost. 83 - 84 81 If we are at an interrupt or user-trap/gate-alike boundary then we can 85 82 use the faster check: the stack will be a reliable indicator of 86 83 whether SWAPGS was already done: if we see that we are a secondary ··· 90 93 stack but before we executed SWAPGS, then the only safe way to check 91 94 for GS is the slower method: the RDMSR. 92 95 93 - So we try only to mark those entry methods 'paranoid' that absolutely 94 - need the more expensive check for the GS base - and we generate all 95 - 'normal' entry points with the regular (faster) entry macros. 96 + Therefore, super-atomic entries (except NMI, which is handled separately) 97 + must use idtentry with paranoid=1 to handle gsbase correctly. This 98 + triggers three main behavior changes: 99 + 100 + - Interrupt entry will use the slower gsbase check. 101 + - Interrupt entry from user mode will switch off the IST stack. 102 + - Interrupt exit to kernel mode will not attempt to reschedule. 103 + 104 + We try to only use IST entries and the paranoid entry code for vectors 105 + that absolutely need the more expensive check for the GS base - and we 106 + generate all 'normal' entry points with the regular (faster) paranoid=0 107 + variant.
+5 -3
Documentation/x86/x86_64/kernel-stacks
··· 40 40 interrupt-gate descriptor. When an interrupt occurs and the hardware 41 41 loads such a descriptor, the hardware automatically sets the new stack 42 42 pointer based on the IST value, then invokes the interrupt handler. If 43 - software wants to allow nested IST interrupts then the handler must 44 - adjust the IST values on entry to and exit from the interrupt handler. 45 - (This is occasionally done, e.g. for debug exceptions.) 43 + the interrupt came from user mode, then the interrupt handler prologue 44 + will switch back to the per-thread stack. If software wants to allow 45 + nested IST interrupts then the handler must adjust the IST values on 46 + entry to and exit from the interrupt handler. (This is occasionally 47 + done, e.g. for debug exceptions.) 46 48 47 49 Events with different IST codes (i.e. with different stacks) can be 48 50 nested. For example, a debug interrupt can safely be interrupted by an
+45 -41
arch/x86/kernel/entry_64.S
··· 1048 1048 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 1049 1049 1050 1050 .if \paranoid 1051 + .if \paranoid == 1 1052 + CFI_REMEMBER_STATE 1053 + testl $3, CS(%rsp) /* If coming from userspace, switch */ 1054 + jnz 1f /* stacks. */ 1055 + .endif 1051 1056 call save_paranoid 1052 1057 .else 1053 1058 call error_entry ··· 1093 1088 jmp error_exit /* %ebx: no swapgs flag */ 1094 1089 .endif 1095 1090 1091 + .if \paranoid == 1 1092 + CFI_RESTORE_STATE 1093 + /* 1094 + * Paranoid entry from userspace. Switch stacks and treat it 1095 + * as a normal entry. This means that paranoid handlers 1096 + * run in real process context if user_mode(regs). 1097 + */ 1098 + 1: 1099 + call error_entry 1100 + 1101 + DEFAULT_FRAME 0 1102 + 1103 + movq %rsp,%rdi /* pt_regs pointer */ 1104 + call sync_regs 1105 + movq %rax,%rsp /* switch stack */ 1106 + 1107 + movq %rsp,%rdi /* pt_regs pointer */ 1108 + 1109 + .if \has_error_code 1110 + movq ORIG_RAX(%rsp),%rsi /* get error code */ 1111 + movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ 1112 + .else 1113 + xorl %esi,%esi /* no error code */ 1114 + .endif 1115 + 1116 + call \do_sym 1117 + 1118 + jmp error_exit /* %ebx: no swapgs flag */ 1119 + .endif 1120 + 1096 1121 CFI_ENDPROC 1097 1122 END(\sym) 1098 1123 .endm ··· 1143 1108 idtentry bounds do_bounds has_error_code=0 1144 1109 idtentry invalid_op do_invalid_op has_error_code=0 1145 1110 idtentry device_not_available do_device_not_available has_error_code=0 1146 - idtentry double_fault do_double_fault has_error_code=1 paranoid=1 1111 + idtentry double_fault do_double_fault has_error_code=1 paranoid=2 1147 1112 idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 1148 1113 idtentry invalid_TSS do_invalid_TSS has_error_code=1 1149 1114 idtentry segment_not_present do_segment_not_present has_error_code=1 ··· 1324 1289 #endif 1325 1290 1326 1291 /* 1327 - * "Paranoid" exit path from exception stack. 
1328 - * Paranoid because this is used by NMIs and cannot take 1329 - * any kernel state for granted. 1330 - * We don't do kernel preemption checks here, because only 1331 - * NMI should be common and it does not enable IRQs and 1332 - * cannot get reschedule ticks. 1292 + * "Paranoid" exit path from exception stack. This is invoked 1293 + * only on return from non-NMI IST interrupts that came 1294 + * from kernel space. 1333 1295 * 1334 - * "trace" is 0 for the NMI handler only, because irq-tracing 1335 - * is fundamentally NMI-unsafe. (we cannot change the soft and 1336 - * hard flags at once, atomically) 1296 + * We may be returning to very strange contexts (e.g. very early 1297 + * in syscall entry), so checking for preemption here would 1298 + * be complicated. Fortunately, there's no good reason 1299 + * to try to handle preemption here. 1337 1300 */ 1338 1301 1339 1302 /* ebx: no swapgs flag */ ··· 1341 1308 TRACE_IRQS_OFF_DEBUG 1342 1309 testl %ebx,%ebx /* swapgs needed? */ 1343 1310 jnz paranoid_restore 1344 - testl $3,CS(%rsp) 1345 - jnz paranoid_userspace 1346 - paranoid_swapgs: 1347 1311 TRACE_IRQS_IRETQ 0 1348 1312 SWAPGS_UNSAFE_STACK 1349 1313 RESTORE_ALL 8 1350 - jmp irq_return 1314 + INTERRUPT_RETURN 1351 1315 paranoid_restore: 1352 1316 TRACE_IRQS_IRETQ_DEBUG 0 1353 1317 RESTORE_ALL 8 1354 - jmp irq_return 1355 - paranoid_userspace: 1356 - GET_THREAD_INFO(%rcx) 1357 - movl TI_flags(%rcx),%ebx 1358 - andl $_TIF_WORK_MASK,%ebx 1359 - jz paranoid_swapgs 1360 - movq %rsp,%rdi /* &pt_regs */ 1361 - call sync_regs 1362 - movq %rax,%rsp /* switch stack for scheduling */ 1363 - testl $_TIF_NEED_RESCHED,%ebx 1364 - jnz paranoid_schedule 1365 - movl %ebx,%edx /* arg3: thread flags */ 1366 - TRACE_IRQS_ON 1367 - ENABLE_INTERRUPTS(CLBR_NONE) 1368 - xorl %esi,%esi /* arg2: oldset */ 1369 - movq %rsp,%rdi /* arg1: &pt_regs */ 1370 - call do_notify_resume 1371 - DISABLE_INTERRUPTS(CLBR_NONE) 1372 - TRACE_IRQS_OFF 1373 - jmp paranoid_userspace 1374 - 
paranoid_schedule: 1375 - TRACE_IRQS_ON 1376 - ENABLE_INTERRUPTS(CLBR_ANY) 1377 - SCHEDULE_USER 1378 - DISABLE_INTERRUPTS(CLBR_ANY) 1379 - TRACE_IRQS_OFF 1380 - jmp paranoid_userspace 1318 + INTERRUPT_RETURN 1381 1319 CFI_ENDPROC 1382 1320 END(paranoid_exit) 1383 1321
+5 -18
arch/x86/kernel/traps.c
··· 466 466 467 467 #ifdef CONFIG_X86_64 468 468 /* 469 - * Help handler running on IST stack to switch back to user stack 470 - * for scheduling or signal handling. The actual stack switch is done in 471 - * entry.S 469 + * Help handler running on IST stack to switch off the IST stack if the 470 + * interrupted code was in user mode. The actual stack switch is done in 471 + * entry_64.S 472 472 */ 473 473 asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs) 474 474 { 475 - struct pt_regs *regs = eregs; 476 - /* Did already sync */ 477 - if (eregs == (struct pt_regs *)eregs->sp) 478 - ; 479 - /* Exception from user space */ 480 - else if (user_mode(eregs)) 481 - regs = task_pt_regs(current); 482 - /* 483 - * Exception from kernel and interrupts are enabled. Move to 484 - * kernel process stack. 485 - */ 486 - else if (eregs->flags & X86_EFLAGS_IF) 487 - regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs)); 488 - if (eregs != regs) 489 - *regs = *eregs; 475 + struct pt_regs *regs = task_pt_regs(current); 476 + *regs = *eregs; 490 477 return regs; 491 478 } 492 479 NOKPROBE_SYMBOL(sync_regs);