Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

stacktrace/x86: add function for detecting reliable stack traces

For live patching and possibly other use cases, a stack trace is only
useful if it can be assured that it's completely reliable. Add a new
save_stack_trace_tsk_reliable() function to achieve that.

Note that if the target task isn't the current task, and the target task
is allowed to run, then it could be writing the stack while the unwinder
is reading it, resulting in possible corruption. So the caller of
save_stack_trace_tsk_reliable() must ensure that the task is either
'current' or inactive.

save_stack_trace_tsk_reliable() relies on the x86 unwinder's detection
of pt_regs on the stack. If the pt_regs are not user-mode registers
from a syscall, then they indicate an in-kernel interrupt or exception
(e.g. preemption or a page fault), in which case the stack is considered
unreliable due to the nature of frame pointers.

It also relies on the x86 unwinder's detection of other issues, such as:

- corrupted stack data
- stack grows the wrong way
- stack walk doesn't reach the bottom
- user didn't provide a large enough entries array

Such issues are detected by checking unwind_error() and !unwind_done().

Also add CONFIG_HAVE_RELIABLE_STACKTRACE so arch-independent code can
determine at build time whether the function is implemented.

Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Reviewed-by: Miroslav Benes <mbenes@suse.cz>
Acked-by: Ingo Molnar <mingo@kernel.org> # for the x86 changes
Signed-off-by: Jiri Kosina <jkosina@suse.cz>

authored by

Josh Poimboeuf and committed by
Jiri Kosina
af085d90 c1ae3cfa

+126 -6
+6
arch/Kconfig
··· 713 713 Architecture supports the 'objtool check' host tool command, which 714 714 performs compile-time stack metadata validation. 715 715 716 + config HAVE_RELIABLE_STACKTRACE 717 + bool 718 + help 719 + Architecture has a save_stack_trace_tsk_reliable() function which 720 + only returns a stack trace if it can guarantee the trace is reliable. 721 + 716 722 config HAVE_ARCH_HASH 717 723 bool 718 724 default n
+1
arch/x86/Kconfig
··· 160 160 select HAVE_PERF_REGS 161 161 select HAVE_PERF_USER_STACK_DUMP 162 162 select HAVE_REGS_AND_STACK_ACCESS_API 163 + select HAVE_RELIABLE_STACKTRACE if X86_64 && FRAME_POINTER && STACK_VALIDATION 163 164 select HAVE_STACK_VALIDATION if X86_64 164 165 select HAVE_SYSCALL_TRACEPOINTS 165 166 select HAVE_UNSTABLE_SCHED_CLOCK
+6
arch/x86/include/asm/unwind.h
··· 11 11 unsigned long stack_mask; 12 12 struct task_struct *task; 13 13 int graph_idx; 14 + bool error; 14 15 #ifdef CONFIG_FRAME_POINTER 15 16 unsigned long *bp, *orig_sp; 16 17 struct pt_regs *regs; ··· 39 38 first_frame = first_frame ? : get_stack_pointer(task, regs); 40 39 41 40 __unwind_start(state, task, regs, first_frame); 41 + } 42 + 43 + /* Return the 'error' flag of the given unwind state; the frame-pointer unwinder sets it on its bad_address path. */ static inline bool unwind_error(struct unwind_state *state) 44 + { 45 + return state->error; 42 46 } 43 47 44 48 #ifdef CONFIG_FRAME_POINTER
+95 -1
arch/x86/kernel/stacktrace.c
··· 76 76 } 77 77 EXPORT_SYMBOL_GPL(save_stack_trace_tsk); 78 78 79 + #ifdef CONFIG_HAVE_RELIABLE_STACKTRACE 80 + 81 + /* Warn and dump the task's stack at most once per expansion site (each expansion declares its own static flag). */ #define STACKTRACE_DUMP_ONCE(task) ({ \ 82 + static bool __section(.data.unlikely) __dumped; \ 83 + \ 84 + if (!__dumped) { \ 85 + __dumped = true; \ 86 + WARN_ON(1); \ 87 + show_stack(task, NULL); \ 88 + } \ 89 + }) 90 + 91 + /* Walk the task's stack with the unwinder and record each return address in 'trace'. Returns 0 on success, or -EINVAL when kernel-mode pt_regs are found, when the unwind is not done after skipping the user-mode pt_regs frame, when a return address is 0, when save_stack_address() returns nonzero, or when unwind_error() is set. */ static int __save_stack_trace_reliable(struct stack_trace *trace, 92 + struct task_struct *task) 93 + { 94 + struct unwind_state state; 95 + struct pt_regs *regs; 96 + unsigned long addr; 97 + 98 + for (unwind_start(&state, task, NULL, NULL); !unwind_done(&state); 99 + unwind_next_frame(&state)) { 100 + 101 + regs = unwind_get_entry_regs(&state); 102 + if (regs) { 103 + /* 104 + * Kernel mode registers on the stack indicate an 105 + * in-kernel interrupt or exception (e.g., preemption 106 + * or a page fault), which can make frame pointers 107 + * unreliable. 108 + */ 109 + if (!user_mode(regs)) 110 + return -EINVAL; 111 + 112 + /* 113 + * The last frame contains the user mode syscall 114 + * pt_regs. Skip it and finish the unwind. 115 + */ 116 + unwind_next_frame(&state); 117 + if (!unwind_done(&state)) { 118 + STACKTRACE_DUMP_ONCE(task); 119 + return -EINVAL; 120 + } 121 + break; 122 + } 123 + 124 + addr = unwind_get_return_address(&state); 125 + 126 + /* 127 + * A NULL or invalid return address probably means there's some 128 + * generated code which __kernel_text_address() doesn't know 129 + * about. 
130 + */ 131 + if (!addr) { 132 + STACKTRACE_DUMP_ONCE(task); 133 + return -EINVAL; 134 + } 135 + 136 + /* A nonzero return from save_stack_address() is treated as failure. */ if (save_stack_address(trace, addr, false)) 137 + return -EINVAL; 138 + } 139 + 140 + /* Check for stack corruption */ 141 + if (unwind_error(&state)) { 142 + STACKTRACE_DUMP_ONCE(task); 143 + return -EINVAL; 144 + } 145 + 146 + /* Append a ULONG_MAX terminator when the entries array still has room. */ if (trace->nr_entries < trace->max_entries) 147 + trace->entries[trace->nr_entries++] = ULONG_MAX; 148 + 149 + return 0; 150 + } 151 + 152 + /* 153 + * This function returns an error if it detects any unreliable features of the 154 + * stack. Otherwise it guarantees that the stack trace is reliable. 155 + * 156 + * If the task is not 'current', the caller *must* ensure the task is inactive. 157 + */ /* Returns -EINVAL if try_get_task_stack() fails; otherwise returns the result of __save_stack_trace_reliable(), calling put_task_stack() before returning. */ 158 + int save_stack_trace_tsk_reliable(struct task_struct *tsk, 159 + struct stack_trace *trace) 160 + { 161 + int ret; 162 + 163 + if (!try_get_task_stack(tsk)) 164 + return -EINVAL; 165 + 166 + ret = __save_stack_trace_reliable(trace, tsk); 167 + 168 + put_task_stack(tsk); 169 + 170 + return ret; 171 + } 172 + #endif /* CONFIG_HAVE_RELIABLE_STACKTRACE */ 173 + 79 174 /* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */ 80 175 81 176 struct stack_frame_user { ··· 233 138 if (trace->nr_entries < trace->max_entries) 234 139 trace->entries[trace->nr_entries++] = ULONG_MAX; 235 140 } 236 -
+2
arch/x86/kernel/unwind_frame.c
··· 225 225 return true; 226 226 227 227 bad_address: 228 + state->error = true; 229 + 228 230 /* 229 231 * When unwinding a non-current task, the task might actually be 230 232 * running on another CPU, in which case it could be modifying its
+6 -3
include/linux/stacktrace.h
··· 18 18 struct stack_trace *trace); 19 19 extern void save_stack_trace_tsk(struct task_struct *tsk, 20 20 struct stack_trace *trace); 21 + extern int save_stack_trace_tsk_reliable(struct task_struct *tsk, 22 + struct stack_trace *trace); 21 23 22 24 extern void print_stack_trace(struct stack_trace *trace, int spaces); 23 25 extern int snprint_stack_trace(char *buf, size_t size, ··· 31 29 # define save_stack_trace_user(trace) do { } while (0) 32 30 #endif 33 31 34 - #else 32 + #else /* !CONFIG_STACKTRACE */ 35 33 # define save_stack_trace(trace) do { } while (0) 36 34 # define save_stack_trace_tsk(tsk, trace) do { } while (0) 37 35 # define save_stack_trace_user(trace) do { } while (0) 38 36 # define print_stack_trace(trace, spaces) do { } while (0) 39 37 # define snprint_stack_trace(buf, size, trace, spaces) do { } while (0) 40 - #endif 38 + # define save_stack_trace_tsk_reliable(tsk, trace) ({ -ENOSYS; }) 39 + #endif /* CONFIG_STACKTRACE */ 41 40 42 - #endif 41 + #endif /* __LINUX_STACKTRACE_H */
+10 -2
kernel/stacktrace.c
··· 54 54 EXPORT_SYMBOL_GPL(snprint_stack_trace); 55 55 56 56 /* 57 - * Architectures that do not implement save_stack_trace_tsk or 58 - * save_stack_trace_regs get this weak alias and a once-per-bootup warning 57 + * Architectures that do not implement save_stack_trace_*() 58 + * get these weak aliases and once-per-bootup warnings 59 59 * (whenever this facility is utilized - for example by procfs): 60 60 */ 61 61 __weak void ··· 68 68 save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) 69 69 { 70 70 WARN_ONCE(1, KERN_INFO "save_stack_trace_regs() not implemented yet.\n"); 71 + } 72 + 73 + /* Weak fallback: warns once, naming the missing function, and returns -ENOSYS. */ __weak int 74 + save_stack_trace_tsk_reliable(struct task_struct *tsk, 75 + struct stack_trace *trace) 76 + { 77 + WARN_ONCE(1, KERN_INFO "save_stack_trace_tsk_reliable() not implemented yet.\n"); 78 + return -ENOSYS; 71 79 }