Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

x86/entry/unwind: Create stack frames for saved interrupt registers

With frame pointers, when a task is interrupted, its stack is no longer
completely reliable because the function could have been interrupted
before it had a chance to save the previous frame pointer on the stack.
So the caller of the interrupted function could get skipped by a stack
trace.

This is problematic for live patching, which needs to know whether a
stack trace of a sleeping task can be relied upon. There's currently no
way to detect if a sleeping task was interrupted by a page fault
exception or preemption before it went to sleep.

Another issue is that when dumping the stack of an interrupted task, the
unwinder has no way of knowing where the saved pt_regs registers are, so
it can't print them.

This patch solves those issues by encoding the pt_regs pointer in the frame
pointer on entry from an interrupt or an exception.

This patch also updates the unwinder to be able to decode it, because
otherwise the unwinder would be broken by this change.

Note that this causes a change in the behavior of the unwinder: each
instance of a pt_regs on the stack is now considered a "frame". So
callers of unwind_get_return_address() will now get an occasional
'regs->ip' address that would have previously been skipped over.

Suggested-by: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/8b9f84a21e39d249049e0547b559ff8da0df0988.1476973742.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

Authored by Josh Poimboeuf and committed by Ingo Molnar
946c1911 29a6d796

+139 -16
+20
arch/x86/entry/calling.h
··· 192 192 .byte 0xf1 193 193 .endm 194 194 195 + /* 196 + * This is a sneaky trick to help the unwinder find pt_regs on the stack. The 197 + * frame pointer is replaced with an encoded pointer to pt_regs. The encoding 198 + * is just setting the LSB, which makes it an invalid stack address and is also 199 + * a signal to the unwinder that it's a pt_regs pointer in disguise. 200 + * 201 + * NOTE: This macro must be used *after* SAVE_EXTRA_REGS because it corrupts 202 + * the original rbp. 203 + */ 204 + .macro ENCODE_FRAME_POINTER ptregs_offset=0 205 + #ifdef CONFIG_FRAME_POINTER 206 + .if \ptregs_offset 207 + leaq \ptregs_offset(%rsp), %rbp 208 + .else 209 + mov %rsp, %rbp 210 + .endif 211 + orq $0x1, %rbp 212 + #endif 213 + .endm 214 + 195 215 #endif /* CONFIG_X86_64 */ 196 216 197 217 /*
+29 -4
arch/x86/entry/entry_32.S
··· 176 176 SET_KERNEL_GS %edx 177 177 .endm 178 178 179 + /* 180 + * This is a sneaky trick to help the unwinder find pt_regs on the stack. The 181 + * frame pointer is replaced with an encoded pointer to pt_regs. The encoding 182 + * is just setting the LSB, which makes it an invalid stack address and is also 183 + * a signal to the unwinder that it's a pt_regs pointer in disguise. 184 + * 185 + * NOTE: This macro must be used *after* SAVE_ALL because it corrupts the 186 + * original ebp. 187 + */ 188 + .macro ENCODE_FRAME_POINTER 189 + #ifdef CONFIG_FRAME_POINTER 190 + mov %esp, %ebp 191 + orl $0x1, %ebp 192 + #endif 193 + .endm 194 + 179 195 .macro RESTORE_INT_REGS 180 196 popl %ebx 181 197 popl %ecx ··· 657 641 ASM_CLAC 658 642 addl $-0x80, (%esp) /* Adjust vector into the [-256, -1] range */ 659 643 SAVE_ALL 644 + ENCODE_FRAME_POINTER 660 645 TRACE_IRQS_OFF 661 646 movl %esp, %eax 662 647 call do_IRQ ··· 669 652 ASM_CLAC; \ 670 653 pushl $~(nr); \ 671 654 SAVE_ALL; \ 655 + ENCODE_FRAME_POINTER; \ 672 656 TRACE_IRQS_OFF \ 673 657 movl %esp, %eax; \ 674 658 call fn; \ ··· 804 786 ENTRY(xen_hypervisor_callback) 805 787 pushl $-1 /* orig_ax = -1 => not a system call */ 806 788 SAVE_ALL 789 + ENCODE_FRAME_POINTER 807 790 TRACE_IRQS_OFF 808 791 809 792 /* ··· 859 840 jmp iret_exc 860 841 5: pushl $-1 /* orig_ax = -1 => not a system call */ 861 842 SAVE_ALL 843 + ENCODE_FRAME_POINTER 862 844 jmp ret_from_exception 863 845 864 846 .section .fixup, "ax" ··· 1087 1067 pushl %edx 1088 1068 pushl %ecx 1089 1069 pushl %ebx 1070 + ENCODE_FRAME_POINTER 1090 1071 cld 1091 1072 movl $(__KERNEL_PERCPU), %ecx 1092 1073 movl %ecx, %fs ··· 1120 1099 ASM_CLAC 1121 1100 pushl $-1 # mark this as an int 1122 1101 SAVE_ALL 1102 + ENCODE_FRAME_POINTER 1123 1103 xorl %edx, %edx # error code 0 1124 1104 movl %esp, %eax # pt_regs pointer 1125 1105 ··· 1136 1114 1137 1115 .Ldebug_from_sysenter_stack: 1138 1116 /* We're on the SYSENTER stack. Switch off. 
*/ 1139 - movl %esp, %ebp 1117 + movl %esp, %ebx 1140 1118 movl PER_CPU_VAR(cpu_current_top_of_stack), %esp 1141 1119 TRACE_IRQS_OFF 1142 1120 call do_debug 1143 - movl %ebp, %esp 1121 + movl %ebx, %esp 1144 1122 jmp ret_from_exception 1145 1123 END(debug) 1146 1124 ··· 1163 1141 1164 1142 pushl %eax # pt_regs->orig_ax 1165 1143 SAVE_ALL 1144 + ENCODE_FRAME_POINTER 1166 1145 xorl %edx, %edx # zero error code 1167 1146 movl %esp, %eax # pt_regs pointer 1168 1147 ··· 1182 1159 * We're on the SYSENTER stack. Switch off. No one (not even debug) 1183 1160 * is using the thread stack right now, so it's safe for us to use it. 1184 1161 */ 1185 - movl %esp, %ebp 1162 + movl %esp, %ebx 1186 1163 movl PER_CPU_VAR(cpu_current_top_of_stack), %esp 1187 1164 call do_nmi 1188 - movl %ebp, %esp 1165 + movl %ebx, %esp 1189 1166 jmp .Lrestore_all_notrace 1190 1167 1191 1168 #ifdef CONFIG_X86_ESPFIX32 ··· 1202 1179 .endr 1203 1180 pushl %eax 1204 1181 SAVE_ALL 1182 + ENCODE_FRAME_POINTER 1205 1183 FIXUP_ESPFIX_STACK # %eax == %esp 1206 1184 xorl %edx, %edx # zero error code 1207 1185 call do_nmi ··· 1216 1192 ASM_CLAC 1217 1193 pushl $-1 # mark this as an int 1218 1194 SAVE_ALL 1195 + ENCODE_FRAME_POINTER 1219 1196 TRACE_IRQS_OFF 1220 1197 xorl %edx, %edx # zero error code 1221 1198 movl %esp, %eax # pt_regs pointer
+7 -3
arch/x86/entry/entry_64.S
··· 469 469 ALLOC_PT_GPREGS_ON_STACK 470 470 SAVE_C_REGS 471 471 SAVE_EXTRA_REGS 472 + ENCODE_FRAME_POINTER 472 473 473 474 testb $3, CS(%rsp) 474 475 jz 1f ··· 986 985 ALLOC_PT_GPREGS_ON_STACK 987 986 SAVE_C_REGS 988 987 SAVE_EXTRA_REGS 988 + ENCODE_FRAME_POINTER 989 989 jmp error_exit 990 990 END(xen_failsafe_callback) 991 991 ··· 1030 1028 cld 1031 1029 SAVE_C_REGS 8 1032 1030 SAVE_EXTRA_REGS 8 1031 + ENCODE_FRAME_POINTER 8 1033 1032 movl $1, %ebx 1034 1033 movl $MSR_GS_BASE, %ecx 1035 1034 rdmsr ··· 1078 1075 cld 1079 1076 SAVE_C_REGS 8 1080 1077 SAVE_EXTRA_REGS 8 1078 + ENCODE_FRAME_POINTER 8 1081 1079 xorl %ebx, %ebx 1082 1080 testb $3, CS+8(%rsp) 1083 1081 jz .Lerror_kernelspace ··· 1261 1257 pushq %r13 /* pt_regs->r13 */ 1262 1258 pushq %r14 /* pt_regs->r14 */ 1263 1259 pushq %r15 /* pt_regs->r15 */ 1260 + ENCODE_FRAME_POINTER 1264 1261 1265 1262 /* 1266 1263 * At this point we no longer need to worry about stack damage ··· 1275 1270 1276 1271 /* 1277 1272 * Return back to user mode. We must *not* do the normal exit 1278 - * work, because we don't want to enable interrupts. Fortunately, 1279 - * do_nmi doesn't modify pt_regs. 1273 + * work, because we don't want to enable interrupts. 1280 1274 */ 1281 1275 SWAPGS 1282 - jmp restore_c_regs_and_iret 1276 + jmp restore_regs_and_iret 1283 1277 1284 1278 .Lnmi_from_kernel: 1285 1279 /*
+15 -1
arch/x86/include/asm/unwind.h
··· 13 13 int graph_idx; 14 14 #ifdef CONFIG_FRAME_POINTER 15 15 unsigned long *bp; 16 + struct pt_regs *regs; 16 17 #else 17 18 unsigned long *sp; 18 19 #endif ··· 48 47 if (unwind_done(state)) 49 48 return NULL; 50 49 51 - return state->bp + 1; 50 + return state->regs ? &state->regs->ip : state->bp + 1; 51 + } 52 + 53 + static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) 54 + { 55 + if (unwind_done(state)) 56 + return NULL; 57 + 58 + return state->regs; 52 59 } 53 60 54 61 #else /* !CONFIG_FRAME_POINTER */ 55 62 56 63 static inline 57 64 unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) 65 + { 66 + return NULL; 67 + } 68 + 69 + static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) 58 70 { 59 71 return NULL; 60 72 }
+68 -8
arch/x86/kernel/unwind_frame.c
··· 14 14 if (unwind_done(state)) 15 15 return 0; 16 16 17 + if (state->regs && user_mode(state->regs)) 18 + return 0; 19 + 17 20 addr = ftrace_graph_ret_addr(state->task, &state->graph_idx, *addr_p, 18 21 addr_p); 19 22 20 23 return __kernel_text_address(addr) ? addr : 0; 21 24 } 22 25 EXPORT_SYMBOL_GPL(unwind_get_return_address); 26 + 27 + /* 28 + * This determines if the frame pointer actually contains an encoded pointer to 29 + * pt_regs on the stack. See ENCODE_FRAME_POINTER. 30 + */ 31 + static struct pt_regs *decode_frame_pointer(unsigned long *bp) 32 + { 33 + unsigned long regs = (unsigned long)bp; 34 + 35 + if (!(regs & 0x1)) 36 + return NULL; 37 + 38 + return (struct pt_regs *)(regs & ~0x1); 39 + } 23 40 24 41 static bool update_stack_state(struct unwind_state *state, void *addr, 25 42 size_t len) ··· 60 43 61 44 bool unwind_next_frame(struct unwind_state *state) 62 45 { 63 - unsigned long *next_bp; 46 + struct pt_regs *regs; 47 + unsigned long *next_bp, *next_frame; 48 + size_t next_len; 64 49 65 50 if (unwind_done(state)) 66 51 return false; 67 52 68 - next_bp = (unsigned long *)*state->bp; 53 + /* have we reached the end? */ 54 + if (state->regs && user_mode(state->regs)) 55 + goto the_end; 56 + 57 + /* get the next frame pointer */ 58 + if (state->regs) 59 + next_bp = (unsigned long *)state->regs->bp; 60 + else 61 + next_bp = (unsigned long *)*state->bp; 62 + 63 + /* is the next frame pointer an encoded pointer to pt_regs? 
*/ 64 + regs = decode_frame_pointer(next_bp); 65 + if (regs) { 66 + next_frame = (unsigned long *)regs; 67 + next_len = sizeof(*regs); 68 + } else { 69 + next_frame = next_bp; 70 + next_len = FRAME_HEADER_SIZE; 71 + } 69 72 70 73 /* make sure the next frame's data is accessible */ 71 - if (!update_stack_state(state, next_bp, FRAME_HEADER_SIZE)) 74 + if (!update_stack_state(state, next_frame, next_len)) 72 75 return false; 73 - 74 76 /* move to the next frame */ 75 - state->bp = next_bp; 77 + if (regs) { 78 + state->regs = regs; 79 + state->bp = NULL; 80 + } else { 81 + state->bp = next_bp; 82 + state->regs = NULL; 83 + } 84 + 76 85 return true; 86 + 87 + the_end: 88 + state->stack_info.type = STACK_TYPE_UNKNOWN; 89 + return false; 77 90 } 78 91 EXPORT_SYMBOL_GPL(unwind_next_frame); 79 92 80 93 void __unwind_start(struct unwind_state *state, struct task_struct *task, 81 94 struct pt_regs *regs, unsigned long *first_frame) 82 95 { 96 + unsigned long *bp, *frame; 97 + size_t len; 98 + 83 99 memset(state, 0, sizeof(*state)); 84 100 state->task = task; 85 101 ··· 123 73 } 124 74 125 75 /* set up the starting stack frame */ 126 - state->bp = get_frame_pointer(task, regs); 76 + bp = get_frame_pointer(task, regs); 77 + regs = decode_frame_pointer(bp); 78 + if (regs) { 79 + state->regs = regs; 80 + frame = (unsigned long *)regs; 81 + len = sizeof(*regs); 82 + } else { 83 + state->bp = bp; 84 + frame = bp; 85 + len = FRAME_HEADER_SIZE; 86 + } 127 87 128 88 /* initialize stack info and make sure the frame data is accessible */ 129 - get_stack_info(state->bp, state->task, &state->stack_info, 89 + get_stack_info(frame, state->task, &state->stack_info, 130 90 &state->stack_mask); 131 - update_stack_state(state, state->bp, FRAME_HEADER_SIZE); 91 + update_stack_state(state, frame, len); 132 92 133 93 /* 134 94 * The caller can provide the address of the first frame directly