Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

perf: Support deferred user unwind

Add support for deferred userspace unwind to perf.

Perf currently relies on in-place stack unwinding, from NMI
context and all that. This moves the userspace part of the unwind to
right before the return-to-userspace.

This has two distinct benefits, the biggest is that it moves the
unwind to a faultable context. It becomes possible to fault in debug
info (.eh_frame, SFrame etc.) that might not otherwise be readily
available. And secondly, it de-duplicates the user callchain where
multiple samples happen during the same kernel entry.

To facilitate this the perf interface is extended with a new record
type:

PERF_RECORD_CALLCHAIN_DEFERRED

and two new attribute flags:

perf_event_attr::defer_callchain - to request the user unwind be deferred
perf_event_attr::defer_output - to request PERF_RECORD_CALLCHAIN_DEFERRED records

The existing PERF_RECORD_SAMPLE callchain section gets a new
context type:

PERF_CONTEXT_USER_DEFERRED

After which will come a single entry, denoting the 'cookie' of the
deferred callchain that should be attached here, matching the 'cookie'
field of the above mentioned PERF_RECORD_CALLCHAIN_DEFERRED.

The 'defer_callchain' flag is expected on all events with
PERF_SAMPLE_CALLCHAIN. The 'defer_output' flag is expected on the event
responsible for collecting side-band events (like mmap, comm etc.).
Setting 'defer_output' on multiple events will get you duplicated
PERF_RECORD_CALLCHAIN_DEFERRED records.

Based on earlier patches by Josh and Steven.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251023150002.GR4067720@noisy.programming.kicks-ass.net

+145 -20
+1 -1
include/linux/perf_event.h
··· 1720 1720 extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs); 1721 1721 extern struct perf_callchain_entry * 1722 1722 get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, 1723 - u32 max_stack, bool crosstask, bool add_mark); 1723 + u32 max_stack, bool crosstask, bool add_mark, u64 defer_cookie); 1724 1724 extern int get_callchain_buffers(int max_stack); 1725 1725 extern void put_callchain_buffers(void); 1726 1726 extern struct perf_callchain_entry *get_callchain_entry(int *rctx);
-12
include/linux/unwind_deferred.h
··· 6 6 #include <linux/unwind_user.h> 7 7 #include <linux/unwind_deferred_types.h> 8 8 9 - struct unwind_work; 10 - 11 - typedef void (*unwind_callback_t)(struct unwind_work *work, 12 - struct unwind_stacktrace *trace, 13 - u64 cookie); 14 - 15 - struct unwind_work { 16 - struct list_head list; 17 - unwind_callback_t func; 18 - int bit; 19 - }; 20 - 21 9 #ifdef CONFIG_UNWIND_USER 22 10 23 11 enum {
+13
include/linux/unwind_deferred_types.h
··· 39 39 union unwind_task_id id; 40 40 }; 41 41 42 + struct unwind_work; 43 + struct unwind_stacktrace; 44 + 45 + typedef void (*unwind_callback_t)(struct unwind_work *work, 46 + struct unwind_stacktrace *trace, 47 + u64 cookie); 48 + 49 + struct unwind_work { 50 + struct list_head list; 51 + unwind_callback_t func; 52 + int bit; 53 + }; 54 + 42 55 #endif /* _LINUX_UNWIND_USER_DEFERRED_TYPES_H */
+20 -1
include/uapi/linux/perf_event.h
··· 463 463 inherit_thread : 1, /* children only inherit if cloned with CLONE_THREAD */ 464 464 remove_on_exec : 1, /* event is removed from task on exec */ 465 465 sigtrap : 1, /* send synchronous SIGTRAP on event */ 466 - __reserved_1 : 26; 466 + defer_callchain: 1, /* request PERF_RECORD_CALLCHAIN_DEFERRED records */ 467 + defer_output : 1, /* output PERF_RECORD_CALLCHAIN_DEFERRED records */ 468 + __reserved_1 : 24; 467 469 468 470 union { 469 471 __u32 wakeup_events; /* wake up every n events */ ··· 1241 1239 */ 1242 1240 PERF_RECORD_AUX_OUTPUT_HW_ID = 21, 1243 1241 1242 + /* 1243 + * This user callchain capture was deferred until shortly before 1244 + * returning to user space. Previous samples would have kernel 1245 + * callchains only and they need to be stitched with this to make full 1246 + * callchains. 1247 + * 1248 + * struct { 1249 + * struct perf_event_header header; 1250 + * u64 cookie; 1251 + * u64 nr; 1252 + * u64 ips[nr]; 1253 + * struct sample_id sample_id; 1254 + * }; 1255 + */ 1256 + PERF_RECORD_CALLCHAIN_DEFERRED = 22, 1257 + 1244 1258 PERF_RECORD_MAX, /* non-ABI */ 1245 1259 }; 1246 1260 ··· 1287 1269 PERF_CONTEXT_HV = (__u64)-32, 1288 1270 PERF_CONTEXT_KERNEL = (__u64)-128, 1289 1271 PERF_CONTEXT_USER = (__u64)-512, 1272 + PERF_CONTEXT_USER_DEFERRED = (__u64)-640, 1290 1273 1291 1274 PERF_CONTEXT_GUEST = (__u64)-2048, 1292 1275 PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176,
+2 -2
kernel/bpf/stackmap.c
··· 315 315 max_depth = sysctl_perf_event_max_stack; 316 316 317 317 trace = get_perf_callchain(regs, kernel, user, max_depth, 318 - false, false); 318 + false, false, 0); 319 319 320 320 if (unlikely(!trace)) 321 321 /* couldn't fetch the stack trace */ ··· 452 452 trace = get_callchain_entry_for_task(task, max_depth); 453 453 else 454 454 trace = get_perf_callchain(regs, kernel, user, max_depth, 455 - crosstask, false); 455 + crosstask, false, 0); 456 456 457 457 if (unlikely(!trace) || trace->nr < skip) { 458 458 if (may_fault)
+13 -1
kernel/events/callchain.c
··· 218 218 219 219 struct perf_callchain_entry * 220 220 get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, 221 - u32 max_stack, bool crosstask, bool add_mark) 221 + u32 max_stack, bool crosstask, bool add_mark, u64 defer_cookie) 222 222 { 223 223 struct perf_callchain_entry *entry; 224 224 struct perf_callchain_entry_ctx ctx; ··· 249 249 if (current->flags & (PF_KTHREAD | PF_USER_WORKER)) 250 250 goto exit_put; 251 251 regs = task_pt_regs(current); 252 + } 253 + 254 + if (defer_cookie) { 255 + /* 256 + * Foretell the coming of PERF_RECORD_CALLCHAIN_DEFERRED 257 + * which can be stitched to this one, and add 258 + * the cookie after it (it will be cut off when the 259 + * user stack is copied to the callchain). 260 + */ 261 + perf_callchain_store_context(&ctx, PERF_CONTEXT_USER_DEFERRED); 262 + perf_callchain_store_context(&ctx, defer_cookie); 263 + goto exit_put; 252 264 } 253 265 254 266 if (add_mark)
+76 -2
kernel/events/core.c
··· 56 56 #include <linux/buildid.h> 57 57 #include <linux/task_work.h> 58 58 #include <linux/percpu-rwsem.h> 59 + #include <linux/unwind_deferred.h> 59 60 60 61 #include "internal.h" 61 62 ··· 8201 8200 8202 8201 static struct perf_callchain_entry __empty_callchain = { .nr = 0, }; 8203 8202 8203 + static struct unwind_work perf_unwind_work; 8204 + 8204 8205 struct perf_callchain_entry * 8205 8206 perf_callchain(struct perf_event *event, struct pt_regs *regs) 8206 8207 { ··· 8211 8208 !(current->flags & (PF_KTHREAD | PF_USER_WORKER)); 8212 8209 /* Disallow cross-task user callchains. */ 8213 8210 bool crosstask = event->ctx->task && event->ctx->task != current; 8211 + bool defer_user = IS_ENABLED(CONFIG_UNWIND_USER) && user && 8212 + event->attr.defer_callchain; 8214 8213 const u32 max_stack = event->attr.sample_max_stack; 8215 8214 struct perf_callchain_entry *callchain; 8215 + u64 defer_cookie; 8216 8216 8217 8217 if (!current->mm) 8218 8218 user = false; ··· 8223 8217 if (!kernel && !user) 8224 8218 return &__empty_callchain; 8225 8219 8226 - callchain = get_perf_callchain(regs, kernel, user, 8227 - max_stack, crosstask, true); 8220 + if (!(user && defer_user && !crosstask && 8221 + unwind_deferred_request(&perf_unwind_work, &defer_cookie) >= 0)) 8222 + defer_cookie = 0; 8223 + 8224 + callchain = get_perf_callchain(regs, kernel, user, max_stack, 8225 + crosstask, true, defer_cookie); 8226 + 8228 8227 return callchain ?: &__empty_callchain; 8229 8228 } 8230 8229 ··· 10012 10001 10013 10002 memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE); 10014 10003 perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL); 10004 + } 10005 + 10006 + struct perf_callchain_deferred_event { 10007 + struct unwind_stacktrace *trace; 10008 + struct { 10009 + struct perf_event_header header; 10010 + u64 cookie; 10011 + u64 nr; 10012 + u64 ips[]; 10013 + } event; 10014 + }; 10015 + 10016 + static void perf_callchain_deferred_output(struct perf_event *event, void *data) 10017 + { 
10018 + struct perf_callchain_deferred_event *deferred_event = data; 10019 + struct perf_output_handle handle; 10020 + struct perf_sample_data sample; 10021 + int ret, size = deferred_event->event.header.size; 10022 + 10023 + if (!event->attr.defer_output) 10024 + return; 10025 + 10026 + /* XXX do we really need sample_id_all for this ??? */ 10027 + perf_event_header__init_id(&deferred_event->event.header, &sample, event); 10028 + 10029 + ret = perf_output_begin(&handle, &sample, event, 10030 + deferred_event->event.header.size); 10031 + if (ret) 10032 + goto out; 10033 + 10034 + perf_output_put(&handle, deferred_event->event); 10035 + for (int i = 0; i < deferred_event->trace->nr; i++) { 10036 + u64 entry = deferred_event->trace->entries[i]; 10037 + perf_output_put(&handle, entry); 10038 + } 10039 + perf_event__output_id_sample(event, &handle, &sample); 10040 + 10041 + perf_output_end(&handle); 10042 + out: 10043 + deferred_event->event.header.size = size; 10044 + } 10045 + 10046 + static void perf_unwind_deferred_callback(struct unwind_work *work, 10047 + struct unwind_stacktrace *trace, u64 cookie) 10048 + { 10049 + struct perf_callchain_deferred_event deferred_event = { 10050 + .trace = trace, 10051 + .event = { 10052 + .header = { 10053 + .type = PERF_RECORD_CALLCHAIN_DEFERRED, 10054 + .misc = PERF_RECORD_MISC_USER, 10055 + .size = sizeof(deferred_event.event) + 10056 + (trace->nr * sizeof(u64)), 10057 + }, 10058 + .cookie = cookie, 10059 + .nr = trace->nr, 10060 + }, 10061 + }; 10062 + 10063 + perf_iterate_sb(perf_callchain_deferred_output, &deferred_event, NULL); 10015 10064 } 10016 10065 10017 10066 struct perf_text_poke_event { ··· 14869 14798 int ret; 14870 14799 14871 14800 idr_init(&pmu_idr); 14801 + 14802 + unwind_deferred_init(&perf_unwind_work, 14803 + perf_unwind_deferred_callback); 14872 14804 14873 14805 perf_event_init_all_cpus(); 14874 14806 init_srcu_struct(&pmus_srcu);
+20 -1
tools/include/uapi/linux/perf_event.h
··· 463 463 inherit_thread : 1, /* children only inherit if cloned with CLONE_THREAD */ 464 464 remove_on_exec : 1, /* event is removed from task on exec */ 465 465 sigtrap : 1, /* send synchronous SIGTRAP on event */ 466 - __reserved_1 : 26; 466 + defer_callchain: 1, /* request PERF_RECORD_CALLCHAIN_DEFERRED records */ 467 + defer_output : 1, /* output PERF_RECORD_CALLCHAIN_DEFERRED records */ 468 + __reserved_1 : 24; 467 469 468 470 union { 469 471 __u32 wakeup_events; /* wake up every n events */ ··· 1241 1239 */ 1242 1240 PERF_RECORD_AUX_OUTPUT_HW_ID = 21, 1243 1241 1242 + /* 1243 + * This user callchain capture was deferred until shortly before 1244 + * returning to user space. Previous samples would have kernel 1245 + * callchains only and they need to be stitched with this to make full 1246 + * callchains. 1247 + * 1248 + * struct { 1249 + * struct perf_event_header header; 1250 + * u64 cookie; 1251 + * u64 nr; 1252 + * u64 ips[nr]; 1253 + * struct sample_id sample_id; 1254 + * }; 1255 + */ 1256 + PERF_RECORD_CALLCHAIN_DEFERRED = 22, 1257 + 1244 1258 PERF_RECORD_MAX, /* non-ABI */ 1245 1259 }; 1246 1260 ··· 1287 1269 PERF_CONTEXT_HV = (__u64)-32, 1288 1270 PERF_CONTEXT_KERNEL = (__u64)-128, 1289 1271 PERF_CONTEXT_USER = (__u64)-512, 1272 + PERF_CONTEXT_USER_DEFERRED = (__u64)-640, 1290 1273 1291 1274 PERF_CONTEXT_GUEST = (__u64)-2048, 1292 1275 PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176,