Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'perf-core-2025-09-26' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull performance events updates from Ingo Molnar:
"Core perf code updates:

- Convert mmap() related reference counts to refcount_t. This is in
reaction to the recently fixed refcount bugs, which could have been
detected earlier and could have mitigated the bug somewhat (Thomas
Gleixner, Peter Zijlstra)

- Clean up and simplify the callchain code, in preparation for
sframes (Steven Rostedt, Josh Poimboeuf)

Uprobes updates:

- Add support to optimize usdt probes on x86-64, which gives a
substantial speedup (Jiri Olsa)

- Cleanups and fixes on x86 (Peter Zijlstra)

PMU driver updates:

- Various optimizations and fixes to the Intel PMU driver (Dapeng Mi)

Misc cleanups and fixes:

- Remove redundant __GFP_NOWARN (Qianfeng Rong)"

* tag 'perf-core-2025-09-26' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (57 commits)
selftests/bpf: Fix uprobe_sigill test for uprobe syscall error value
uprobes/x86: Return error from uprobe syscall when not called from trampoline
perf: Skip user unwind if the task is a kernel thread
perf: Simplify get_perf_callchain() user logic
perf: Use current->flags & PF_KTHREAD|PF_USER_WORKER instead of current->mm == NULL
perf: Have get_perf_callchain() return NULL if crosstask and user are set
perf: Remove get_perf_callchain() init_nr argument
perf/x86: Print PMU counters bitmap in x86_pmu_show_pmu_cap()
perf/x86/intel: Add ICL_FIXED_0_ADAPTIVE bit into INTEL_FIXED_BITS_MASK
perf/x86/intel: Change macro GLOBAL_CTRL_EN_PERF_METRICS to BIT_ULL(48)
perf/x86: Add PERF_CAP_PEBS_TIMING_INFO flag
perf/x86/intel: Fix IA32_PMC_x_CFG_B MSRs access error
perf/x86/intel: Use early_initcall() to hook bts_init()
uprobes: Remove redundant __GFP_NOWARN
selftests/seccomp: validate uprobe syscall passes through seccomp
seccomp: passthrough uprobe systemcall without filtering
selftests/bpf: Fix uprobe syscall shadow stack test
selftests/bpf: Change test_uretprobe_regs_change for uprobe and uretprobe
selftests/bpf: Add uprobe_regs_equal test
selftests/bpf: Add optimized usdt variant for basic usdt test
...

+2261 -415
+1 -1
arch/arm/probes/uprobes/core.c
··· 30 30 unsigned long vaddr) 31 31 { 32 32 return uprobe_write_opcode(auprobe, vma, vaddr, 33 - __opcode_to_mem_arm(auprobe->bpinsn)); 33 + __opcode_to_mem_arm(auprobe->bpinsn), true); 34 34 } 35 35 36 36 bool arch_uprobe_ignore(struct arch_uprobe *auprobe, struct pt_regs *regs)
+1
arch/x86/entry/syscalls/syscall_64.tbl
··· 345 345 333 common io_pgetevents sys_io_pgetevents 346 346 334 common rseq sys_rseq 347 347 335 common uretprobe sys_uretprobe 348 + 336 common uprobe sys_uprobe 348 349 # don't use numbers 387 through 423, add new calls after the last 349 350 # 'common' entry 350 351 424 common pidfd_send_signal sys_pidfd_send_signal
+9 -7
arch/x86/events/core.c
··· 2069 2069 2070 2070 void x86_pmu_show_pmu_cap(struct pmu *pmu) 2071 2071 { 2072 - pr_info("... version: %d\n", x86_pmu.version); 2073 - pr_info("... bit width: %d\n", x86_pmu.cntval_bits); 2074 - pr_info("... generic registers: %d\n", x86_pmu_num_counters(pmu)); 2075 - pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask); 2076 - pr_info("... max period: %016Lx\n", x86_pmu.max_period); 2077 - pr_info("... fixed-purpose events: %d\n", x86_pmu_num_counters_fixed(pmu)); 2078 - pr_info("... event mask: %016Lx\n", hybrid(pmu, intel_ctrl)); 2072 + pr_info("... version: %d\n", x86_pmu.version); 2073 + pr_info("... bit width: %d\n", x86_pmu.cntval_bits); 2074 + pr_info("... generic counters: %d\n", x86_pmu_num_counters(pmu)); 2075 + pr_info("... generic bitmap: %016llx\n", hybrid(pmu, cntr_mask64)); 2076 + pr_info("... fixed-purpose counters: %d\n", x86_pmu_num_counters_fixed(pmu)); 2077 + pr_info("... fixed-purpose bitmap: %016llx\n", hybrid(pmu, fixed_cntr_mask64)); 2078 + pr_info("... value mask: %016llx\n", x86_pmu.cntval_mask); 2079 + pr_info("... max period: %016llx\n", x86_pmu.max_period); 2080 + pr_info("... global_ctrl mask: %016llx\n", hybrid(pmu, intel_ctrl)); 2079 2081 } 2080 2082 2081 2083 static int __init init_hw_perf_events(void)
+1 -1
arch/x86/events/intel/bts.c
··· 643 643 644 644 return perf_pmu_register(&bts_pmu, "intel_bts", -1); 645 645 } 646 - arch_initcall(bts_init); 646 + early_initcall(bts_init);
+9 -12
arch/x86/events/intel/core.c
··· 2845 2845 { 2846 2846 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 2847 2847 struct hw_perf_event *hwc = &event->hw; 2848 - u64 mask, bits = 0; 2849 2848 int idx = hwc->idx; 2849 + u64 bits = 0; 2850 2850 2851 2851 if (is_topdown_idx(idx)) { 2852 2852 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); ··· 2885 2885 2886 2886 idx -= INTEL_PMC_IDX_FIXED; 2887 2887 bits = intel_fixed_bits_by_idx(idx, bits); 2888 - mask = intel_fixed_bits_by_idx(idx, INTEL_FIXED_BITS_MASK); 2889 - 2890 - if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip) { 2888 + if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip) 2891 2889 bits |= intel_fixed_bits_by_idx(idx, ICL_FIXED_0_ADAPTIVE); 2892 - mask |= intel_fixed_bits_by_idx(idx, ICL_FIXED_0_ADAPTIVE); 2893 - } 2894 2890 2895 - cpuc->fixed_ctrl_val &= ~mask; 2891 + cpuc->fixed_ctrl_val &= ~intel_fixed_bits_by_idx(idx, INTEL_FIXED_BITS_MASK); 2896 2892 cpuc->fixed_ctrl_val |= bits; 2897 2893 } 2898 2894 ··· 2993 2997 if (event->group_leader != leader->group_leader) 2994 2998 break; 2995 2999 for_each_set_bit(idx, (unsigned long *)&event->attr.config2, X86_PMC_IDX_MAX) { 2996 - if (WARN_ON_ONCE(i + idx > cpuc->n_events)) 3000 + if (i + idx >= cpuc->n_events || 3001 + !is_acr_event_group(cpuc->event_list[i + idx])) 2997 3002 return; 2998 3003 __set_bit(cpuc->assign[i + idx], (unsigned long *)&event->hw.config1); 2999 3004 } ··· 5315 5318 0, x86_pmu_num_counters(&pmu->pmu), 0, 0); 5316 5319 5317 5320 if (pmu->intel_cap.perf_metrics) 5318 - pmu->intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS; 5321 + pmu->intel_ctrl |= GLOBAL_CTRL_EN_PERF_METRICS; 5319 5322 else 5320 - pmu->intel_ctrl &= ~(1ULL << GLOBAL_CTRL_EN_PERF_METRICS); 5323 + pmu->intel_ctrl &= ~GLOBAL_CTRL_EN_PERF_METRICS; 5321 5324 5322 5325 intel_pmu_check_event_constraints(pmu->event_constraints, 5323 5326 pmu->cntr_mask64, ··· 5452 5455 rdmsrq(MSR_IA32_PERF_CAPABILITIES, perf_cap.capabilities); 5453 5456 if (!perf_cap.perf_metrics) 
{ 5454 5457 x86_pmu.intel_cap.perf_metrics = 0; 5455 - x86_pmu.intel_ctrl &= ~(1ULL << GLOBAL_CTRL_EN_PERF_METRICS); 5458 + x86_pmu.intel_ctrl &= ~GLOBAL_CTRL_EN_PERF_METRICS; 5456 5459 } 5457 5460 } 5458 5461 ··· 7786 7789 } 7787 7790 7788 7791 if (!is_hybrid() && x86_pmu.intel_cap.perf_metrics) 7789 - x86_pmu.intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS; 7792 + x86_pmu.intel_ctrl |= GLOBAL_CTRL_EN_PERF_METRICS; 7790 7793 7791 7794 if (x86_pmu.intel_cap.pebs_timing_info) 7792 7795 x86_pmu.flags |= PMU_FL_RETIRE_LATENCY;
+8 -6
arch/x86/include/asm/msr-index.h
··· 315 315 #define PERF_CAP_PT_IDX 16 316 316 317 317 #define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 318 - #define PERF_CAP_PEBS_TRAP BIT_ULL(6) 319 - #define PERF_CAP_ARCH_REG BIT_ULL(7) 320 - #define PERF_CAP_PEBS_FORMAT 0xf00 321 - #define PERF_CAP_PEBS_BASELINE BIT_ULL(14) 322 - #define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \ 323 - PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE) 318 + #define PERF_CAP_PEBS_TRAP BIT_ULL(6) 319 + #define PERF_CAP_ARCH_REG BIT_ULL(7) 320 + #define PERF_CAP_PEBS_FORMAT 0xf00 321 + #define PERF_CAP_PEBS_BASELINE BIT_ULL(14) 322 + #define PERF_CAP_PEBS_TIMING_INFO BIT_ULL(17) 323 + #define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \ 324 + PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE | \ 325 + PERF_CAP_PEBS_TIMING_INFO) 324 326 325 327 #define MSR_IA32_RTIT_CTL 0x00000570 326 328 #define RTIT_CTL_TRACEEN BIT(0)
+6 -2
arch/x86/include/asm/perf_event.h
··· 35 35 #define ARCH_PERFMON_EVENTSEL_EQ (1ULL << 36) 36 36 #define ARCH_PERFMON_EVENTSEL_UMASK2 (0xFFULL << 40) 37 37 38 - #define INTEL_FIXED_BITS_MASK 0xFULL 39 38 #define INTEL_FIXED_BITS_STRIDE 4 40 39 #define INTEL_FIXED_0_KERNEL (1ULL << 0) 41 40 #define INTEL_FIXED_0_USER (1ULL << 1) ··· 46 47 #define HSW_IN_TX_CHECKPOINTED (1ULL << 33) 47 48 #define ICL_EVENTSEL_ADAPTIVE (1ULL << 34) 48 49 #define ICL_FIXED_0_ADAPTIVE (1ULL << 32) 50 + 51 + #define INTEL_FIXED_BITS_MASK \ 52 + (INTEL_FIXED_0_KERNEL | INTEL_FIXED_0_USER | \ 53 + INTEL_FIXED_0_ANYTHREAD | INTEL_FIXED_0_ENABLE_PMI | \ 54 + ICL_FIXED_0_ADAPTIVE) 49 55 50 56 #define intel_fixed_bits_by_idx(_idx, _bits) \ 51 57 ((_bits) << ((_idx) * INTEL_FIXED_BITS_STRIDE)) ··· 434 430 #define GLOBAL_STATUS_TRACE_TOPAPMI BIT_ULL(GLOBAL_STATUS_TRACE_TOPAPMI_BIT) 435 431 #define GLOBAL_STATUS_PERF_METRICS_OVF_BIT 48 436 432 437 - #define GLOBAL_CTRL_EN_PERF_METRICS 48 433 + #define GLOBAL_CTRL_EN_PERF_METRICS BIT_ULL(48) 438 434 /* 439 435 * We model guest LBR event tracing as another fixed-mode PMC like BTS. 440 436 *
+4
arch/x86/include/asm/shstk.h
··· 23 23 int restore_signal_shadow_stack(void); 24 24 int shstk_update_last_frame(unsigned long val); 25 25 bool shstk_is_enabled(void); 26 + int shstk_pop(u64 *val); 27 + int shstk_push(u64 val); 26 28 #else 27 29 static inline long shstk_prctl(struct task_struct *task, int option, 28 30 unsigned long arg2) { return -EINVAL; } ··· 37 35 static inline int restore_signal_shadow_stack(void) { return 0; } 38 36 static inline int shstk_update_last_frame(unsigned long val) { return 0; } 39 37 static inline bool shstk_is_enabled(void) { return false; } 38 + static inline int shstk_pop(u64 *val) { return -ENOTSUPP; } 39 + static inline int shstk_push(u64 val) { return -ENOTSUPP; } 40 40 #endif /* CONFIG_X86_USER_SHADOW_STACK */ 41 41 42 42 #endif /* __ASSEMBLER__ */
+7
arch/x86/include/asm/uprobes.h
··· 20 20 #define UPROBE_SWBP_INSN 0xcc 21 21 #define UPROBE_SWBP_INSN_SIZE 1 22 22 23 + enum { 24 + ARCH_UPROBE_FLAG_CAN_OPTIMIZE = 0, 25 + ARCH_UPROBE_FLAG_OPTIMIZE_FAIL = 1, 26 + }; 27 + 23 28 struct uprobe_xol_ops; 24 29 25 30 struct arch_uprobe { ··· 50 45 u8 ilen; 51 46 } push; 52 47 }; 48 + 49 + unsigned long flags; 53 50 }; 54 51 55 52 struct arch_uprobe_task {
+40
arch/x86/kernel/shstk.c
··· 246 246 return ssp; 247 247 } 248 248 249 + int shstk_pop(u64 *val) 250 + { 251 + int ret = 0; 252 + u64 ssp; 253 + 254 + if (!features_enabled(ARCH_SHSTK_SHSTK)) 255 + return -ENOTSUPP; 256 + 257 + fpregs_lock_and_load(); 258 + 259 + rdmsrq(MSR_IA32_PL3_SSP, ssp); 260 + if (val && get_user(*val, (__user u64 *)ssp)) 261 + ret = -EFAULT; 262 + else 263 + wrmsrq(MSR_IA32_PL3_SSP, ssp + SS_FRAME_SIZE); 264 + fpregs_unlock(); 265 + 266 + return ret; 267 + } 268 + 269 + int shstk_push(u64 val) 270 + { 271 + u64 ssp; 272 + int ret; 273 + 274 + if (!features_enabled(ARCH_SHSTK_SHSTK)) 275 + return -ENOTSUPP; 276 + 277 + fpregs_lock_and_load(); 278 + 279 + rdmsrq(MSR_IA32_PL3_SSP, ssp); 280 + ssp -= SS_FRAME_SIZE; 281 + ret = write_user_shstk_64((__user void *)ssp, val); 282 + if (!ret) 283 + wrmsrq(MSR_IA32_PL3_SSP, ssp); 284 + fpregs_unlock(); 285 + 286 + return ret; 287 + } 288 + 249 289 #define SHSTK_DATA_BIT BIT(63) 250 290 251 291 static int put_shstk_data(u64 __user *addr, u64 data)
+613 -22
arch/x86/kernel/uprobes.c
··· 18 18 #include <asm/processor.h> 19 19 #include <asm/insn.h> 20 20 #include <asm/mmu_context.h> 21 + #include <asm/nops.h> 21 22 22 23 /* Post-execution fixups. */ 23 24 ··· 311 310 312 311 #ifdef CONFIG_X86_64 313 312 313 + struct uretprobe_syscall_args { 314 + unsigned long r11; 315 + unsigned long cx; 316 + unsigned long ax; 317 + }; 318 + 314 319 asm ( 315 320 ".pushsection .rodata\n" 316 321 ".global uretprobe_trampoline_entry\n" 317 322 "uretprobe_trampoline_entry:\n" 318 - "pushq %rax\n" 319 - "pushq %rcx\n" 320 - "pushq %r11\n" 321 - "movq $" __stringify(__NR_uretprobe) ", %rax\n" 323 + "push %rax\n" 324 + "push %rcx\n" 325 + "push %r11\n" 326 + "mov $" __stringify(__NR_uretprobe) ", %rax\n" 322 327 "syscall\n" 323 328 ".global uretprobe_syscall_check\n" 324 329 "uretprobe_syscall_check:\n" 325 - "popq %r11\n" 326 - "popq %rcx\n" 327 - 328 - /* The uretprobe syscall replaces stored %rax value with final 330 + "pop %r11\n" 331 + "pop %rcx\n" 332 + /* 333 + * The uretprobe syscall replaces stored %rax value with final 329 334 * return address, so we don't restore %rax in here and just 330 335 * call ret. 331 336 */ 332 - "retq\n" 337 + "ret\n" 338 + "int3\n" 333 339 ".global uretprobe_trampoline_end\n" 334 340 "uretprobe_trampoline_end:\n" 335 341 ".popsection\n" ··· 346 338 extern u8 uretprobe_trampoline_end[]; 347 339 extern u8 uretprobe_syscall_check[]; 348 340 349 - void *arch_uprobe_trampoline(unsigned long *psize) 341 + void *arch_uretprobe_trampoline(unsigned long *psize) 350 342 { 351 343 static uprobe_opcode_t insn = UPROBE_SWBP_INSN; 352 344 struct pt_regs *regs = task_pt_regs(current); ··· 373 365 SYSCALL_DEFINE0(uretprobe) 374 366 { 375 367 struct pt_regs *regs = task_pt_regs(current); 376 - unsigned long err, ip, sp, r11_cx_ax[3], tramp; 368 + struct uretprobe_syscall_args args; 369 + unsigned long err, ip, sp, tramp; 377 370 378 371 /* If there's no trampoline, we are called from wrong place. 
*/ 379 372 tramp = uprobe_get_trampoline_vaddr(); ··· 385 376 if (unlikely(regs->ip != trampoline_check_ip(tramp))) 386 377 goto sigill; 387 378 388 - err = copy_from_user(r11_cx_ax, (void __user *)regs->sp, sizeof(r11_cx_ax)); 379 + err = copy_from_user(&args, (void __user *)regs->sp, sizeof(args)); 389 380 if (err) 390 381 goto sigill; 391 382 392 383 /* expose the "right" values of r11/cx/ax/sp to uprobe_consumer/s */ 393 - regs->r11 = r11_cx_ax[0]; 394 - regs->cx = r11_cx_ax[1]; 395 - regs->ax = r11_cx_ax[2]; 396 - regs->sp += sizeof(r11_cx_ax); 384 + regs->r11 = args.r11; 385 + regs->cx = args.cx; 386 + regs->ax = args.ax; 387 + regs->sp += sizeof(args); 397 388 regs->orig_ax = -1; 398 389 399 390 ip = regs->ip; ··· 409 400 */ 410 401 if (regs->sp != sp || shstk_is_enabled()) 411 402 return regs->ax; 412 - regs->sp -= sizeof(r11_cx_ax); 403 + regs->sp -= sizeof(args); 413 404 414 405 /* for the case uprobe_consumer has changed r11/cx */ 415 - r11_cx_ax[0] = regs->r11; 416 - r11_cx_ax[1] = regs->cx; 406 + args.r11 = regs->r11; 407 + args.cx = regs->cx; 417 408 418 409 /* 419 410 * ax register is passed through as return value, so we can use 420 411 * its space on stack for ip value and jump to it through the 421 412 * trampoline's ret instruction 422 413 */ 423 - r11_cx_ax[2] = regs->ip; 414 + args.ax = regs->ip; 424 415 regs->ip = ip; 425 416 426 - err = copy_to_user((void __user *)regs->sp, r11_cx_ax, sizeof(r11_cx_ax)); 417 + err = copy_to_user((void __user *)regs->sp, &args, sizeof(args)); 427 418 if (err) 428 419 goto sigill; 429 420 ··· 617 608 *sr = utask->autask.saved_scratch_register; 618 609 } 619 610 } 611 + 612 + static int tramp_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) 613 + { 614 + return -EPERM; 615 + } 616 + 617 + static struct page *tramp_mapping_pages[2] __ro_after_init; 618 + 619 + static struct vm_special_mapping tramp_mapping = { 620 + .name = "[uprobes-trampoline]", 621 + .mremap = tramp_mremap, 622 + 
.pages = tramp_mapping_pages, 623 + }; 624 + 625 + struct uprobe_trampoline { 626 + struct hlist_node node; 627 + unsigned long vaddr; 628 + }; 629 + 630 + static bool is_reachable_by_call(unsigned long vtramp, unsigned long vaddr) 631 + { 632 + long delta = (long)(vaddr + 5 - vtramp); 633 + 634 + return delta >= INT_MIN && delta <= INT_MAX; 635 + } 636 + 637 + static unsigned long find_nearest_trampoline(unsigned long vaddr) 638 + { 639 + struct vm_unmapped_area_info info = { 640 + .length = PAGE_SIZE, 641 + .align_mask = ~PAGE_MASK, 642 + }; 643 + unsigned long low_limit, high_limit; 644 + unsigned long low_tramp, high_tramp; 645 + unsigned long call_end = vaddr + 5; 646 + 647 + if (check_add_overflow(call_end, INT_MIN, &low_limit)) 648 + low_limit = PAGE_SIZE; 649 + 650 + high_limit = call_end + INT_MAX; 651 + 652 + /* Search up from the caller address. */ 653 + info.low_limit = call_end; 654 + info.high_limit = min(high_limit, TASK_SIZE); 655 + high_tramp = vm_unmapped_area(&info); 656 + 657 + /* Search down from the caller address. */ 658 + info.low_limit = max(low_limit, PAGE_SIZE); 659 + info.high_limit = call_end; 660 + info.flags = VM_UNMAPPED_AREA_TOPDOWN; 661 + low_tramp = vm_unmapped_area(&info); 662 + 663 + if (IS_ERR_VALUE(high_tramp) && IS_ERR_VALUE(low_tramp)) 664 + return -ENOMEM; 665 + if (IS_ERR_VALUE(high_tramp)) 666 + return low_tramp; 667 + if (IS_ERR_VALUE(low_tramp)) 668 + return high_tramp; 669 + 670 + /* Return address that's closest to the caller address. 
*/ 671 + if (call_end - low_tramp < high_tramp - call_end) 672 + return low_tramp; 673 + return high_tramp; 674 + } 675 + 676 + static struct uprobe_trampoline *create_uprobe_trampoline(unsigned long vaddr) 677 + { 678 + struct pt_regs *regs = task_pt_regs(current); 679 + struct mm_struct *mm = current->mm; 680 + struct uprobe_trampoline *tramp; 681 + struct vm_area_struct *vma; 682 + 683 + if (!user_64bit_mode(regs)) 684 + return NULL; 685 + 686 + vaddr = find_nearest_trampoline(vaddr); 687 + if (IS_ERR_VALUE(vaddr)) 688 + return NULL; 689 + 690 + tramp = kzalloc(sizeof(*tramp), GFP_KERNEL); 691 + if (unlikely(!tramp)) 692 + return NULL; 693 + 694 + tramp->vaddr = vaddr; 695 + vma = _install_special_mapping(mm, tramp->vaddr, PAGE_SIZE, 696 + VM_READ|VM_EXEC|VM_MAYEXEC|VM_MAYREAD|VM_DONTCOPY|VM_IO, 697 + &tramp_mapping); 698 + if (IS_ERR(vma)) { 699 + kfree(tramp); 700 + return NULL; 701 + } 702 + return tramp; 703 + } 704 + 705 + static struct uprobe_trampoline *get_uprobe_trampoline(unsigned long vaddr, bool *new) 706 + { 707 + struct uprobes_state *state = &current->mm->uprobes_state; 708 + struct uprobe_trampoline *tramp = NULL; 709 + 710 + if (vaddr > TASK_SIZE || vaddr < PAGE_SIZE) 711 + return NULL; 712 + 713 + hlist_for_each_entry(tramp, &state->head_tramps, node) { 714 + if (is_reachable_by_call(tramp->vaddr, vaddr)) { 715 + *new = false; 716 + return tramp; 717 + } 718 + } 719 + 720 + tramp = create_uprobe_trampoline(vaddr); 721 + if (!tramp) 722 + return NULL; 723 + 724 + *new = true; 725 + hlist_add_head(&tramp->node, &state->head_tramps); 726 + return tramp; 727 + } 728 + 729 + static void destroy_uprobe_trampoline(struct uprobe_trampoline *tramp) 730 + { 731 + /* 732 + * We do not unmap and release uprobe trampoline page itself, 733 + * because there's no easy way to make sure none of the threads 734 + * is still inside the trampoline. 
735 + */ 736 + hlist_del(&tramp->node); 737 + kfree(tramp); 738 + } 739 + 740 + void arch_uprobe_init_state(struct mm_struct *mm) 741 + { 742 + INIT_HLIST_HEAD(&mm->uprobes_state.head_tramps); 743 + } 744 + 745 + void arch_uprobe_clear_state(struct mm_struct *mm) 746 + { 747 + struct uprobes_state *state = &mm->uprobes_state; 748 + struct uprobe_trampoline *tramp; 749 + struct hlist_node *n; 750 + 751 + hlist_for_each_entry_safe(tramp, n, &state->head_tramps, node) 752 + destroy_uprobe_trampoline(tramp); 753 + } 754 + 755 + static bool __in_uprobe_trampoline(unsigned long ip) 756 + { 757 + struct vm_area_struct *vma = vma_lookup(current->mm, ip); 758 + 759 + return vma && vma_is_special_mapping(vma, &tramp_mapping); 760 + } 761 + 762 + static bool in_uprobe_trampoline(unsigned long ip) 763 + { 764 + struct mm_struct *mm = current->mm; 765 + bool found, retry = true; 766 + unsigned int seq; 767 + 768 + rcu_read_lock(); 769 + if (mmap_lock_speculate_try_begin(mm, &seq)) { 770 + found = __in_uprobe_trampoline(ip); 771 + retry = mmap_lock_speculate_retry(mm, seq); 772 + } 773 + rcu_read_unlock(); 774 + 775 + if (retry) { 776 + mmap_read_lock(mm); 777 + found = __in_uprobe_trampoline(ip); 778 + mmap_read_unlock(mm); 779 + } 780 + return found; 781 + } 782 + 783 + /* 784 + * See uprobe syscall trampoline; the call to the trampoline will push 785 + * the return address on the stack, the trampoline itself then pushes 786 + * cx, r11 and ax. 787 + */ 788 + struct uprobe_syscall_args { 789 + unsigned long ax; 790 + unsigned long r11; 791 + unsigned long cx; 792 + unsigned long retaddr; 793 + }; 794 + 795 + SYSCALL_DEFINE0(uprobe) 796 + { 797 + struct pt_regs *regs = task_pt_regs(current); 798 + struct uprobe_syscall_args args; 799 + unsigned long ip, sp, sret; 800 + int err; 801 + 802 + /* Allow execution only from uprobe trampolines. 
*/ 803 + if (!in_uprobe_trampoline(regs->ip)) 804 + return -ENXIO; 805 + 806 + err = copy_from_user(&args, (void __user *)regs->sp, sizeof(args)); 807 + if (err) 808 + goto sigill; 809 + 810 + ip = regs->ip; 811 + 812 + /* 813 + * expose the "right" values of ax/r11/cx/ip/sp to uprobe_consumer/s, plus: 814 + * - adjust ip to the probe address, call saved next instruction address 815 + * - adjust sp to the probe's stack frame (check trampoline code) 816 + */ 817 + regs->ax = args.ax; 818 + regs->r11 = args.r11; 819 + regs->cx = args.cx; 820 + regs->ip = args.retaddr - 5; 821 + regs->sp += sizeof(args); 822 + regs->orig_ax = -1; 823 + 824 + sp = regs->sp; 825 + 826 + err = shstk_pop((u64 *)&sret); 827 + if (err == -EFAULT || (!err && sret != args.retaddr)) 828 + goto sigill; 829 + 830 + handle_syscall_uprobe(regs, regs->ip); 831 + 832 + /* 833 + * Some of the uprobe consumers has changed sp, we can do nothing, 834 + * just return via iret. 835 + */ 836 + if (regs->sp != sp) { 837 + /* skip the trampoline call */ 838 + if (args.retaddr - 5 == regs->ip) 839 + regs->ip += 5; 840 + return regs->ax; 841 + } 842 + 843 + regs->sp -= sizeof(args); 844 + 845 + /* for the case uprobe_consumer has changed ax/r11/cx */ 846 + args.ax = regs->ax; 847 + args.r11 = regs->r11; 848 + args.cx = regs->cx; 849 + 850 + /* keep return address unless we are instructed otherwise */ 851 + if (args.retaddr - 5 != regs->ip) 852 + args.retaddr = regs->ip; 853 + 854 + if (shstk_push(args.retaddr) == -EFAULT) 855 + goto sigill; 856 + 857 + regs->ip = ip; 858 + 859 + err = copy_to_user((void __user *)regs->sp, &args, sizeof(args)); 860 + if (err) 861 + goto sigill; 862 + 863 + /* ensure sysret, see do_syscall_64() */ 864 + regs->r11 = regs->flags; 865 + regs->cx = regs->ip; 866 + return 0; 867 + 868 + sigill: 869 + force_sig(SIGILL); 870 + return -1; 871 + } 872 + 873 + asm ( 874 + ".pushsection .rodata\n" 875 + ".balign " __stringify(PAGE_SIZE) "\n" 876 + "uprobe_trampoline_entry:\n" 877 + "push 
%rcx\n" 878 + "push %r11\n" 879 + "push %rax\n" 880 + "mov $" __stringify(__NR_uprobe) ", %rax\n" 881 + "syscall\n" 882 + "pop %rax\n" 883 + "pop %r11\n" 884 + "pop %rcx\n" 885 + "ret\n" 886 + "int3\n" 887 + ".balign " __stringify(PAGE_SIZE) "\n" 888 + ".popsection\n" 889 + ); 890 + 891 + extern u8 uprobe_trampoline_entry[]; 892 + 893 + static int __init arch_uprobes_init(void) 894 + { 895 + tramp_mapping_pages[0] = virt_to_page(uprobe_trampoline_entry); 896 + return 0; 897 + } 898 + 899 + late_initcall(arch_uprobes_init); 900 + 901 + enum { 902 + EXPECT_SWBP, 903 + EXPECT_CALL, 904 + }; 905 + 906 + struct write_opcode_ctx { 907 + unsigned long base; 908 + int expect; 909 + }; 910 + 911 + static int is_call_insn(uprobe_opcode_t *insn) 912 + { 913 + return *insn == CALL_INSN_OPCODE; 914 + } 915 + 916 + /* 917 + * Verification callback used by int3_update uprobe_write calls to make sure 918 + * the underlying instruction is as expected - either int3 or call. 919 + */ 920 + static int verify_insn(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode, 921 + int nbytes, void *data) 922 + { 923 + struct write_opcode_ctx *ctx = data; 924 + uprobe_opcode_t old_opcode[5]; 925 + 926 + uprobe_copy_from_page(page, ctx->base, (uprobe_opcode_t *) &old_opcode, 5); 927 + 928 + switch (ctx->expect) { 929 + case EXPECT_SWBP: 930 + if (is_swbp_insn(&old_opcode[0])) 931 + return 1; 932 + break; 933 + case EXPECT_CALL: 934 + if (is_call_insn(&old_opcode[0])) 935 + return 1; 936 + break; 937 + } 938 + 939 + return -1; 940 + } 941 + 942 + /* 943 + * Modify multi-byte instructions by using INT3 breakpoints on SMP. 944 + * We completely avoid using stop_machine() here, and achieve the 945 + * synchronization using INT3 breakpoints and SMP cross-calls. 
946 + * (borrowed comment from smp_text_poke_batch_finish) 947 + * 948 + * The way it is done: 949 + * - Add an INT3 trap to the address that will be patched 950 + * - SMP sync all CPUs 951 + * - Update all but the first byte of the patched range 952 + * - SMP sync all CPUs 953 + * - Replace the first byte (INT3) by the first byte of the replacing opcode 954 + * - SMP sync all CPUs 955 + */ 956 + static int int3_update(struct arch_uprobe *auprobe, struct vm_area_struct *vma, 957 + unsigned long vaddr, char *insn, bool optimize) 958 + { 959 + uprobe_opcode_t int3 = UPROBE_SWBP_INSN; 960 + struct write_opcode_ctx ctx = { 961 + .base = vaddr, 962 + }; 963 + int err; 964 + 965 + /* 966 + * Write int3 trap. 967 + * 968 + * The swbp_optimize path comes with breakpoint already installed, 969 + * so we can skip this step for optimize == true. 970 + */ 971 + if (!optimize) { 972 + ctx.expect = EXPECT_CALL; 973 + err = uprobe_write(auprobe, vma, vaddr, &int3, 1, verify_insn, 974 + true /* is_register */, false /* do_update_ref_ctr */, 975 + &ctx); 976 + if (err) 977 + return err; 978 + } 979 + 980 + smp_text_poke_sync_each_cpu(); 981 + 982 + /* Write all but the first byte of the patched range. */ 983 + ctx.expect = EXPECT_SWBP; 984 + err = uprobe_write(auprobe, vma, vaddr + 1, insn + 1, 4, verify_insn, 985 + true /* is_register */, false /* do_update_ref_ctr */, 986 + &ctx); 987 + if (err) 988 + return err; 989 + 990 + smp_text_poke_sync_each_cpu(); 991 + 992 + /* 993 + * Write first byte. 994 + * 995 + * The swbp_unoptimize needs to finish uprobe removal together 996 + * with ref_ctr update, using uprobe_write with proper flags. 
997 + */ 998 + err = uprobe_write(auprobe, vma, vaddr, insn, 1, verify_insn, 999 + optimize /* is_register */, !optimize /* do_update_ref_ctr */, 1000 + &ctx); 1001 + if (err) 1002 + return err; 1003 + 1004 + smp_text_poke_sync_each_cpu(); 1005 + return 0; 1006 + } 1007 + 1008 + static int swbp_optimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma, 1009 + unsigned long vaddr, unsigned long tramp) 1010 + { 1011 + u8 call[5]; 1012 + 1013 + __text_gen_insn(call, CALL_INSN_OPCODE, (const void *) vaddr, 1014 + (const void *) tramp, CALL_INSN_SIZE); 1015 + return int3_update(auprobe, vma, vaddr, call, true /* optimize */); 1016 + } 1017 + 1018 + static int swbp_unoptimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma, 1019 + unsigned long vaddr) 1020 + { 1021 + return int3_update(auprobe, vma, vaddr, auprobe->insn, false /* optimize */); 1022 + } 1023 + 1024 + static int copy_from_vaddr(struct mm_struct *mm, unsigned long vaddr, void *dst, int len) 1025 + { 1026 + unsigned int gup_flags = FOLL_FORCE|FOLL_SPLIT_PMD; 1027 + struct vm_area_struct *vma; 1028 + struct page *page; 1029 + 1030 + page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma); 1031 + if (IS_ERR(page)) 1032 + return PTR_ERR(page); 1033 + uprobe_copy_from_page(page, vaddr, dst, len); 1034 + put_page(page); 1035 + return 0; 1036 + } 1037 + 1038 + static bool __is_optimized(uprobe_opcode_t *insn, unsigned long vaddr) 1039 + { 1040 + struct __packed __arch_relative_insn { 1041 + u8 op; 1042 + s32 raddr; 1043 + } *call = (struct __arch_relative_insn *) insn; 1044 + 1045 + if (!is_call_insn(insn)) 1046 + return false; 1047 + return __in_uprobe_trampoline(vaddr + 5 + call->raddr); 1048 + } 1049 + 1050 + static int is_optimized(struct mm_struct *mm, unsigned long vaddr) 1051 + { 1052 + uprobe_opcode_t insn[5]; 1053 + int err; 1054 + 1055 + err = copy_from_vaddr(mm, vaddr, &insn, 5); 1056 + if (err) 1057 + return err; 1058 + return __is_optimized((uprobe_opcode_t *)&insn, vaddr); 1059 + } 
1060 + 1061 + static bool should_optimize(struct arch_uprobe *auprobe) 1062 + { 1063 + return !test_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags) && 1064 + test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags); 1065 + } 1066 + 1067 + int set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma, 1068 + unsigned long vaddr) 1069 + { 1070 + if (should_optimize(auprobe)) { 1071 + /* 1072 + * We could race with another thread that already optimized the probe, 1073 + * so let's not overwrite it with int3 again in this case. 1074 + */ 1075 + int ret = is_optimized(vma->vm_mm, vaddr); 1076 + if (ret < 0) 1077 + return ret; 1078 + if (ret) 1079 + return 0; 1080 + } 1081 + return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN, 1082 + true /* is_register */); 1083 + } 1084 + 1085 + int set_orig_insn(struct arch_uprobe *auprobe, struct vm_area_struct *vma, 1086 + unsigned long vaddr) 1087 + { 1088 + if (test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags)) { 1089 + int ret = is_optimized(vma->vm_mm, vaddr); 1090 + if (ret < 0) 1091 + return ret; 1092 + if (ret) { 1093 + ret = swbp_unoptimize(auprobe, vma, vaddr); 1094 + WARN_ON_ONCE(ret); 1095 + return ret; 1096 + } 1097 + } 1098 + return uprobe_write_opcode(auprobe, vma, vaddr, *(uprobe_opcode_t *)&auprobe->insn, 1099 + false /* is_register */); 1100 + } 1101 + 1102 + static int __arch_uprobe_optimize(struct arch_uprobe *auprobe, struct mm_struct *mm, 1103 + unsigned long vaddr) 1104 + { 1105 + struct uprobe_trampoline *tramp; 1106 + struct vm_area_struct *vma; 1107 + bool new = false; 1108 + int err = 0; 1109 + 1110 + vma = find_vma(mm, vaddr); 1111 + if (!vma) 1112 + return -EINVAL; 1113 + tramp = get_uprobe_trampoline(vaddr, &new); 1114 + if (!tramp) 1115 + return -EINVAL; 1116 + err = swbp_optimize(auprobe, vma, vaddr, tramp->vaddr); 1117 + if (WARN_ON_ONCE(err) && new) 1118 + destroy_uprobe_trampoline(tramp); 1119 + return err; 1120 + } 1121 + 1122 + void arch_uprobe_optimize(struct 
arch_uprobe *auprobe, unsigned long vaddr) 1123 + { 1124 + struct mm_struct *mm = current->mm; 1125 + uprobe_opcode_t insn[5]; 1126 + 1127 + if (!should_optimize(auprobe)) 1128 + return; 1129 + 1130 + mmap_write_lock(mm); 1131 + 1132 + /* 1133 + * Check if some other thread already optimized the uprobe for us, 1134 + * if it's the case just go away silently. 1135 + */ 1136 + if (copy_from_vaddr(mm, vaddr, &insn, 5)) 1137 + goto unlock; 1138 + if (!is_swbp_insn((uprobe_opcode_t*) &insn)) 1139 + goto unlock; 1140 + 1141 + /* 1142 + * If we fail to optimize the uprobe we set the fail bit so the 1143 + * above should_optimize will fail from now on. 1144 + */ 1145 + if (__arch_uprobe_optimize(auprobe, mm, vaddr)) 1146 + set_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags); 1147 + 1148 + unlock: 1149 + mmap_write_unlock(mm); 1150 + } 1151 + 1152 + static bool insn_is_nop(struct insn *insn) 1153 + { 1154 + return insn->opcode.nbytes == 1 && insn->opcode.bytes[0] == 0x90; 1155 + } 1156 + 1157 + static bool insn_is_nopl(struct insn *insn) 1158 + { 1159 + if (insn->opcode.nbytes != 2) 1160 + return false; 1161 + 1162 + if (insn->opcode.bytes[0] != 0x0f || insn->opcode.bytes[1] != 0x1f) 1163 + return false; 1164 + 1165 + if (!insn->modrm.nbytes) 1166 + return false; 1167 + 1168 + if (X86_MODRM_REG(insn->modrm.bytes[0]) != 0) 1169 + return false; 1170 + 1171 + /* 0f 1f /0 - NOPL */ 1172 + return true; 1173 + } 1174 + 1175 + static bool can_optimize(struct insn *insn, unsigned long vaddr) 1176 + { 1177 + if (!insn->x86_64 || insn->length != 5) 1178 + return false; 1179 + 1180 + if (!insn_is_nop(insn) && !insn_is_nopl(insn)) 1181 + return false; 1182 + 1183 + /* We can't do cross page atomic writes yet. 
*/ 1184 + return PAGE_SIZE - (vaddr & ~PAGE_MASK) >= 5; 1185 + } 620 1186 #else /* 32-bit: */ 621 1187 /* 622 1188 * No RIP-relative addressing on 32-bit ··· 1204 620 } 1205 621 static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) 1206 622 { 623 + } 624 + static bool can_optimize(struct insn *insn, unsigned long vaddr) 625 + { 626 + return false; 1207 627 } 1208 628 #endif /* CONFIG_X86_64 */ 1209 629 ··· 1567 979 */ 1568 980 int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr) 1569 981 { 1570 - struct insn insn; 1571 982 u8 fix_ip_or_call = UPROBE_FIX_IP; 983 + struct insn insn; 1572 984 int ret; 1573 985 1574 986 ret = uprobe_init_insn(auprobe, &insn, is_64bit_mm(mm)); 1575 987 if (ret) 1576 988 return ret; 989 + 990 + if (can_optimize(&insn, addr)) 991 + set_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags); 1577 992 1578 993 ret = branch_setup_xol_ops(auprobe, &insn); 1579 994 if (ret != -ENOSYS)
+1 -1
arch/x86/kvm/pmu.h
··· 13 13 #define MSR_IA32_MISC_ENABLE_PMU_RO_MASK (MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL | \ 14 14 MSR_IA32_MISC_ENABLE_BTS_UNAVAIL) 15 15 16 - /* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */ 16 + /* retrieve a fixed counter bits out of IA32_FIXED_CTR_CTRL */ 17 17 #define fixed_ctrl_field(ctrl_reg, idx) \ 18 18 (((ctrl_reg) >> ((idx) * INTEL_FIXED_BITS_STRIDE)) & INTEL_FIXED_BITS_MASK) 19 19
+2 -2
include/linux/perf_event.h
··· 859 859 860 860 /* mmap bits */ 861 861 struct mutex mmap_mutex; 862 - atomic_t mmap_count; 862 + refcount_t mmap_count; 863 863 864 864 struct perf_buffer *rb; 865 865 struct list_head rb_entry; ··· 1719 1719 extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs); 1720 1720 extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs); 1721 1721 extern struct perf_callchain_entry * 1722 - get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, 1722 + get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, 1723 1723 u32 max_stack, bool crosstask, bool add_mark); 1724 1724 extern int get_callchain_buffers(int max_stack); 1725 1725 extern void put_callchain_buffers(void);
+2
include/linux/syscalls.h
··· 1005 1005 1006 1006 asmlinkage long sys_uretprobe(void); 1007 1007 1008 + asmlinkage long sys_uprobe(void); 1009 + 1008 1010 /* pciconfig: alpha, arm, arm64, ia64, sparc */ 1009 1011 asmlinkage long sys_pciconfig_read(unsigned long bus, unsigned long dfn, 1010 1012 unsigned long off, unsigned long len,
+18 -2
include/linux/uprobes.h
··· 17 17 #include <linux/wait.h> 18 18 #include <linux/timer.h> 19 19 #include <linux/seqlock.h> 20 + #include <linux/mutex.h> 20 21 21 22 struct uprobe; 22 23 struct vm_area_struct; ··· 186 185 187 186 struct uprobes_state { 188 187 struct xol_area *xol_area; 188 + #ifdef CONFIG_X86_64 189 + struct hlist_head head_tramps; 190 + #endif 189 191 }; 192 + 193 + typedef int (*uprobe_write_verify_t)(struct page *page, unsigned long vaddr, 194 + uprobe_opcode_t *insn, int nbytes, void *data); 190 195 191 196 extern void __init uprobes_init(void); 192 197 extern int set_swbp(struct arch_uprobe *aup, struct vm_area_struct *vma, unsigned long vaddr); ··· 201 194 extern bool is_trap_insn(uprobe_opcode_t *insn); 202 195 extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs); 203 196 extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); 204 - extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr, uprobe_opcode_t); 197 + extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr, uprobe_opcode_t, 198 + bool is_register); 199 + extern int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long opcode_vaddr, 200 + uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr, 201 + void *data); 205 202 extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); 206 203 extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool); 207 204 extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc); ··· 235 224 extern void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, 236 225 void *src, unsigned long len); 237 226 extern void uprobe_handle_trampoline(struct pt_regs *regs); 238 - extern void *arch_uprobe_trampoline(unsigned long *psize); 227 + extern void 
*arch_uretprobe_trampoline(unsigned long *psize); 239 228 extern unsigned long uprobe_get_trampoline_vaddr(void); 229 + extern void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len); 230 + extern void arch_uprobe_clear_state(struct mm_struct *mm); 231 + extern void arch_uprobe_init_state(struct mm_struct *mm); 232 + extern void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr); 233 + extern void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr); 240 234 #else /* !CONFIG_UPROBES */ 241 235 struct uprobes_state { 242 236 };
+2 -2
kernel/bpf/stackmap.c
··· 314 314 if (max_depth > sysctl_perf_event_max_stack) 315 315 max_depth = sysctl_perf_event_max_stack; 316 316 317 - trace = get_perf_callchain(regs, 0, kernel, user, max_depth, 317 + trace = get_perf_callchain(regs, kernel, user, max_depth, 318 318 false, false); 319 319 320 320 if (unlikely(!trace)) ··· 451 451 else if (kernel && task) 452 452 trace = get_callchain_entry_for_task(task, max_depth); 453 453 else 454 - trace = get_perf_callchain(regs, 0, kernel, user, max_depth, 454 + trace = get_perf_callchain(regs, kernel, user, max_depth, 455 455 crosstask, false); 456 456 457 457 if (unlikely(!trace) || trace->nr < skip) {
+20 -22
kernel/events/callchain.c
··· 217 217 } 218 218 219 219 struct perf_callchain_entry * 220 - get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, 220 + get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, 221 221 u32 max_stack, bool crosstask, bool add_mark) 222 222 { 223 223 struct perf_callchain_entry *entry; 224 224 struct perf_callchain_entry_ctx ctx; 225 225 int rctx, start_entry_idx; 226 226 227 + /* crosstask is not supported for user stacks */ 228 + if (crosstask && user && !kernel) 229 + return NULL; 230 + 227 231 entry = get_callchain_entry(&rctx); 228 232 if (!entry) 229 233 return NULL; 230 234 231 - ctx.entry = entry; 232 - ctx.max_stack = max_stack; 233 - ctx.nr = entry->nr = init_nr; 234 - ctx.contexts = 0; 235 - ctx.contexts_maxed = false; 235 + ctx.entry = entry; 236 + ctx.max_stack = max_stack; 237 + ctx.nr = entry->nr = 0; 238 + ctx.contexts = 0; 239 + ctx.contexts_maxed = false; 236 240 237 241 if (kernel && !user_mode(regs)) { 238 242 if (add_mark) ··· 244 240 perf_callchain_kernel(&ctx, regs); 245 241 } 246 242 247 - if (user) { 243 + if (user && !crosstask) { 248 244 if (!user_mode(regs)) { 249 - if (current->mm) 250 - regs = task_pt_regs(current); 251 - else 252 - regs = NULL; 253 - } 254 - 255 - if (regs) { 256 - if (crosstask) 245 + if (current->flags & (PF_KTHREAD | PF_USER_WORKER)) 257 246 goto exit_put; 258 - 259 - if (add_mark) 260 - perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); 261 - 262 - start_entry_idx = entry->nr; 263 - perf_callchain_user(&ctx, regs); 264 - fixup_uretprobe_trampoline_entries(entry, start_entry_idx); 247 + regs = task_pt_regs(current); 265 248 } 249 + 250 + if (add_mark) 251 + perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); 252 + 253 + start_entry_idx = entry->nr; 254 + perf_callchain_user(&ctx, regs); 255 + fixup_uretprobe_trampoline_entries(entry, start_entry_idx); 266 256 } 267 257 268 258 exit_put:
+217 -204
kernel/events/core.c
··· 3974 3974 */ 3975 3975 static inline bool event_update_userpage(struct perf_event *event) 3976 3976 { 3977 - if (likely(!atomic_read(&event->mmap_count))) 3977 + if (likely(!refcount_read(&event->mmap_count))) 3978 3978 return false; 3979 3979 3980 3980 perf_event_update_time(event); ··· 6710 6710 struct perf_event *event = vma->vm_file->private_data; 6711 6711 mapped_f mapped = get_mapped(event, event_mapped); 6712 6712 6713 - atomic_inc(&event->mmap_count); 6714 - atomic_inc(&event->rb->mmap_count); 6713 + refcount_inc(&event->mmap_count); 6714 + refcount_inc(&event->rb->mmap_count); 6715 6715 6716 6716 if (vma->vm_pgoff) 6717 - atomic_inc(&event->rb->aux_mmap_count); 6717 + refcount_inc(&event->rb->aux_mmap_count); 6718 6718 6719 6719 if (mapped) 6720 6720 mapped(event, vma->vm_mm); ··· 6749 6749 * to avoid complications. 6750 6750 */ 6751 6751 if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff && 6752 - atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) { 6752 + refcount_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) { 6753 6753 /* 6754 6754 * Stop all AUX events that are writing to this buffer, 6755 6755 * so that we can free its AUX pages and corresponding PMU ··· 6769 6769 mutex_unlock(&rb->aux_mutex); 6770 6770 } 6771 6771 6772 - if (atomic_dec_and_test(&rb->mmap_count)) 6772 + if (refcount_dec_and_test(&rb->mmap_count)) 6773 6773 detach_rest = true; 6774 6774 6775 - if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) 6775 + if (!refcount_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) 6776 6776 goto out_put; 6777 6777 6778 6778 ring_buffer_attach(event, NULL); ··· 6933 6933 return err; 6934 6934 } 6935 6935 6936 + static bool perf_mmap_calc_limits(struct vm_area_struct *vma, long *user_extra, long *extra) 6937 + { 6938 + unsigned long user_locked, user_lock_limit, locked, lock_limit; 6939 + struct user_struct *user = current_user(); 6940 + 6941 + user_lock_limit = sysctl_perf_event_mlock >> 
(PAGE_SHIFT - 10); 6942 + /* Increase the limit linearly with more CPUs */ 6943 + user_lock_limit *= num_online_cpus(); 6944 + 6945 + user_locked = atomic_long_read(&user->locked_vm); 6946 + 6947 + /* 6948 + * sysctl_perf_event_mlock may have changed, so that 6949 + * user->locked_vm > user_lock_limit 6950 + */ 6951 + if (user_locked > user_lock_limit) 6952 + user_locked = user_lock_limit; 6953 + user_locked += *user_extra; 6954 + 6955 + if (user_locked > user_lock_limit) { 6956 + /* 6957 + * charge locked_vm until it hits user_lock_limit; 6958 + * charge the rest from pinned_vm 6959 + */ 6960 + *extra = user_locked - user_lock_limit; 6961 + *user_extra -= *extra; 6962 + } 6963 + 6964 + lock_limit = rlimit(RLIMIT_MEMLOCK); 6965 + lock_limit >>= PAGE_SHIFT; 6966 + locked = atomic64_read(&vma->vm_mm->pinned_vm) + *extra; 6967 + 6968 + return locked <= lock_limit || !perf_is_paranoid() || capable(CAP_IPC_LOCK); 6969 + } 6970 + 6971 + static void perf_mmap_account(struct vm_area_struct *vma, long user_extra, long extra) 6972 + { 6973 + struct user_struct *user = current_user(); 6974 + 6975 + atomic_long_add(user_extra, &user->locked_vm); 6976 + atomic64_add(extra, &vma->vm_mm->pinned_vm); 6977 + } 6978 + 6979 + static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event, 6980 + unsigned long nr_pages) 6981 + { 6982 + long extra = 0, user_extra = nr_pages; 6983 + struct perf_buffer *rb; 6984 + int rb_flags = 0; 6985 + 6986 + nr_pages -= 1; 6987 + 6988 + /* 6989 + * If we have rb pages ensure they're a power-of-two number, so we 6990 + * can do bitmasks instead of modulo. 
6991 + */ 6992 + if (nr_pages != 0 && !is_power_of_2(nr_pages)) 6993 + return -EINVAL; 6994 + 6995 + WARN_ON_ONCE(event->ctx->parent_ctx); 6996 + 6997 + if (event->rb) { 6998 + if (data_page_nr(event->rb) != nr_pages) 6999 + return -EINVAL; 7000 + 7001 + if (refcount_inc_not_zero(&event->rb->mmap_count)) { 7002 + /* 7003 + * Success -- managed to mmap() the same buffer 7004 + * multiple times. 7005 + */ 7006 + perf_mmap_account(vma, user_extra, extra); 7007 + refcount_inc(&event->mmap_count); 7008 + return 0; 7009 + } 7010 + 7011 + /* 7012 + * Raced against perf_mmap_close()'s 7013 + * refcount_dec_and_mutex_lock() remove the 7014 + * event and continue as if !event->rb 7015 + */ 7016 + ring_buffer_attach(event, NULL); 7017 + } 7018 + 7019 + if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) 7020 + return -EPERM; 7021 + 7022 + if (vma->vm_flags & VM_WRITE) 7023 + rb_flags |= RING_BUFFER_WRITABLE; 7024 + 7025 + rb = rb_alloc(nr_pages, 7026 + event->attr.watermark ? event->attr.wakeup_watermark : 0, 7027 + event->cpu, rb_flags); 7028 + 7029 + if (!rb) 7030 + return -ENOMEM; 7031 + 7032 + refcount_set(&rb->mmap_count, 1); 7033 + rb->mmap_user = get_current_user(); 7034 + rb->mmap_locked = extra; 7035 + 7036 + ring_buffer_attach(event, rb); 7037 + 7038 + perf_event_update_time(event); 7039 + perf_event_init_userpage(event); 7040 + perf_event_update_userpage(event); 7041 + 7042 + perf_mmap_account(vma, user_extra, extra); 7043 + refcount_set(&event->mmap_count, 1); 7044 + 7045 + return 0; 7046 + } 7047 + 7048 + static int perf_mmap_aux(struct vm_area_struct *vma, struct perf_event *event, 7049 + unsigned long nr_pages) 7050 + { 7051 + long extra = 0, user_extra = nr_pages; 7052 + u64 aux_offset, aux_size; 7053 + struct perf_buffer *rb; 7054 + int ret, rb_flags = 0; 7055 + 7056 + rb = event->rb; 7057 + if (!rb) 7058 + return -EINVAL; 7059 + 7060 + guard(mutex)(&rb->aux_mutex); 7061 + 7062 + /* 7063 + * AUX area mapping: if rb->aux_nr_pages != 0, it's already 7064 + * 
mapped, all subsequent mappings should have the same size 7065 + * and offset. Must be above the normal perf buffer. 7066 + */ 7067 + aux_offset = READ_ONCE(rb->user_page->aux_offset); 7068 + aux_size = READ_ONCE(rb->user_page->aux_size); 7069 + 7070 + if (aux_offset < perf_data_size(rb) + PAGE_SIZE) 7071 + return -EINVAL; 7072 + 7073 + if (aux_offset != vma->vm_pgoff << PAGE_SHIFT) 7074 + return -EINVAL; 7075 + 7076 + /* already mapped with a different offset */ 7077 + if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff) 7078 + return -EINVAL; 7079 + 7080 + if (aux_size != nr_pages * PAGE_SIZE) 7081 + return -EINVAL; 7082 + 7083 + /* already mapped with a different size */ 7084 + if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages) 7085 + return -EINVAL; 7086 + 7087 + if (!is_power_of_2(nr_pages)) 7088 + return -EINVAL; 7089 + 7090 + if (!refcount_inc_not_zero(&rb->mmap_count)) 7091 + return -EINVAL; 7092 + 7093 + if (rb_has_aux(rb)) { 7094 + refcount_inc(&rb->aux_mmap_count); 7095 + 7096 + } else { 7097 + if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) { 7098 + refcount_dec(&rb->mmap_count); 7099 + return -EPERM; 7100 + } 7101 + 7102 + WARN_ON(!rb && event->rb); 7103 + 7104 + if (vma->vm_flags & VM_WRITE) 7105 + rb_flags |= RING_BUFFER_WRITABLE; 7106 + 7107 + ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, 7108 + event->attr.aux_watermark, rb_flags); 7109 + if (ret) { 7110 + refcount_dec(&rb->mmap_count); 7111 + return ret; 7112 + } 7113 + 7114 + refcount_set(&rb->aux_mmap_count, 1); 7115 + rb->aux_mmap_locked = extra; 7116 + } 7117 + 7118 + perf_mmap_account(vma, user_extra, extra); 7119 + refcount_inc(&event->mmap_count); 7120 + 7121 + return 0; 7122 + } 7123 + 6936 7124 static int perf_mmap(struct file *file, struct vm_area_struct *vma) 6937 7125 { 6938 7126 struct perf_event *event = file->private_data; 6939 - unsigned long user_locked, user_lock_limit; 6940 - struct user_struct *user = current_user(); 6941 - struct mutex *aux_mutex = NULL; 6942 
- struct perf_buffer *rb = NULL; 6943 - unsigned long locked, lock_limit; 6944 - unsigned long vma_size; 6945 - unsigned long nr_pages; 6946 - long user_extra = 0, extra = 0; 6947 - int ret, flags = 0; 7127 + unsigned long vma_size, nr_pages; 6948 7128 mapped_f mapped; 7129 + int ret; 6949 7130 6950 7131 /* 6951 7132 * Don't allow mmap() of inherited per-task counters. This would ··· 7152 6971 if (vma_size != PAGE_SIZE * nr_pages) 7153 6972 return -EINVAL; 7154 6973 7155 - user_extra = nr_pages; 7156 - 7157 - mutex_lock(&event->mmap_mutex); 7158 - ret = -EINVAL; 7159 - 7160 - /* 7161 - * This relies on __pmu_detach_event() taking mmap_mutex after marking 7162 - * the event REVOKED. Either we observe the state, or __pmu_detach_event() 7163 - * will detach the rb created here. 7164 - */ 7165 - if (event->state <= PERF_EVENT_STATE_REVOKED) { 7166 - ret = -ENODEV; 7167 - goto unlock; 7168 - } 7169 - 7170 - if (vma->vm_pgoff == 0) { 7171 - nr_pages -= 1; 7172 - 6974 + scoped_guard (mutex, &event->mmap_mutex) { 7173 6975 /* 7174 - * If we have rb pages ensure they're a power-of-two number, so we 7175 - * can do bitmasks instead of modulo. 6976 + * This relies on __pmu_detach_event() taking mmap_mutex after marking 6977 + * the event REVOKED. Either we observe the state, or __pmu_detach_event() 6978 + * will detach the rb created here. 7176 6979 */ 7177 - if (nr_pages != 0 && !is_power_of_2(nr_pages)) 7178 - goto unlock; 6980 + if (event->state <= PERF_EVENT_STATE_REVOKED) 6981 + return -ENODEV; 7179 6982 7180 - WARN_ON_ONCE(event->ctx->parent_ctx); 7181 - 7182 - if (event->rb) { 7183 - if (data_page_nr(event->rb) != nr_pages) 7184 - goto unlock; 7185 - 7186 - if (atomic_inc_not_zero(&event->rb->mmap_count)) { 7187 - /* 7188 - * Success -- managed to mmap() the same buffer 7189 - * multiple times. 7190 - */ 7191 - ret = 0; 7192 - /* We need the rb to map pages. 
*/ 7193 - rb = event->rb; 7194 - goto unlock; 7195 - } 7196 - 7197 - /* 7198 - * Raced against perf_mmap_close()'s 7199 - * atomic_dec_and_mutex_lock() remove the 7200 - * event and continue as if !event->rb 7201 - */ 7202 - ring_buffer_attach(event, NULL); 7203 - } 7204 - 7205 - } else { 7206 - /* 7207 - * AUX area mapping: if rb->aux_nr_pages != 0, it's already 7208 - * mapped, all subsequent mappings should have the same size 7209 - * and offset. Must be above the normal perf buffer. 7210 - */ 7211 - u64 aux_offset, aux_size; 7212 - 7213 - rb = event->rb; 7214 - if (!rb) 7215 - goto aux_unlock; 7216 - 7217 - aux_mutex = &rb->aux_mutex; 7218 - mutex_lock(aux_mutex); 7219 - 7220 - aux_offset = READ_ONCE(rb->user_page->aux_offset); 7221 - aux_size = READ_ONCE(rb->user_page->aux_size); 7222 - 7223 - if (aux_offset < perf_data_size(rb) + PAGE_SIZE) 7224 - goto aux_unlock; 7225 - 7226 - if (aux_offset != vma->vm_pgoff << PAGE_SHIFT) 7227 - goto aux_unlock; 7228 - 7229 - /* already mapped with a different offset */ 7230 - if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff) 7231 - goto aux_unlock; 7232 - 7233 - if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE) 7234 - goto aux_unlock; 7235 - 7236 - /* already mapped with a different size */ 7237 - if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages) 7238 - goto aux_unlock; 7239 - 7240 - if (!is_power_of_2(nr_pages)) 7241 - goto aux_unlock; 7242 - 7243 - if (!atomic_inc_not_zero(&rb->mmap_count)) 7244 - goto aux_unlock; 7245 - 7246 - if (rb_has_aux(rb)) { 7247 - atomic_inc(&rb->aux_mmap_count); 7248 - ret = 0; 7249 - goto unlock; 7250 - } 6983 + if (vma->vm_pgoff == 0) 6984 + ret = perf_mmap_rb(vma, event, nr_pages); 6985 + else 6986 + ret = perf_mmap_aux(vma, event, nr_pages); 6987 + if (ret) 6988 + return ret; 7251 6989 } 7252 - 7253 - user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); 7254 - 7255 - /* 7256 - * Increase the limit linearly with more CPUs: 7257 - */ 7258 - user_lock_limit *= 
num_online_cpus(); 7259 - 7260 - user_locked = atomic_long_read(&user->locked_vm); 7261 - 7262 - /* 7263 - * sysctl_perf_event_mlock may have changed, so that 7264 - * user->locked_vm > user_lock_limit 7265 - */ 7266 - if (user_locked > user_lock_limit) 7267 - user_locked = user_lock_limit; 7268 - user_locked += user_extra; 7269 - 7270 - if (user_locked > user_lock_limit) { 7271 - /* 7272 - * charge locked_vm until it hits user_lock_limit; 7273 - * charge the rest from pinned_vm 7274 - */ 7275 - extra = user_locked - user_lock_limit; 7276 - user_extra -= extra; 7277 - } 7278 - 7279 - lock_limit = rlimit(RLIMIT_MEMLOCK); 7280 - lock_limit >>= PAGE_SHIFT; 7281 - locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra; 7282 - 7283 - if ((locked > lock_limit) && perf_is_paranoid() && 7284 - !capable(CAP_IPC_LOCK)) { 7285 - ret = -EPERM; 7286 - goto unlock; 7287 - } 7288 - 7289 - WARN_ON(!rb && event->rb); 7290 - 7291 - if (vma->vm_flags & VM_WRITE) 7292 - flags |= RING_BUFFER_WRITABLE; 7293 - 7294 - if (!rb) { 7295 - rb = rb_alloc(nr_pages, 7296 - event->attr.watermark ? 
event->attr.wakeup_watermark : 0, 7297 - event->cpu, flags); 7298 - 7299 - if (!rb) { 7300 - ret = -ENOMEM; 7301 - goto unlock; 7302 - } 7303 - 7304 - atomic_set(&rb->mmap_count, 1); 7305 - rb->mmap_user = get_current_user(); 7306 - rb->mmap_locked = extra; 7307 - 7308 - ring_buffer_attach(event, rb); 7309 - 7310 - perf_event_update_time(event); 7311 - perf_event_init_userpage(event); 7312 - perf_event_update_userpage(event); 7313 - ret = 0; 7314 - } else { 7315 - ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, 7316 - event->attr.aux_watermark, flags); 7317 - if (!ret) { 7318 - atomic_set(&rb->aux_mmap_count, 1); 7319 - rb->aux_mmap_locked = extra; 7320 - } 7321 - } 7322 - 7323 - unlock: 7324 - if (!ret) { 7325 - atomic_long_add(user_extra, &user->locked_vm); 7326 - atomic64_add(extra, &vma->vm_mm->pinned_vm); 7327 - 7328 - atomic_inc(&event->mmap_count); 7329 - } else if (rb) { 7330 - /* AUX allocation failed */ 7331 - atomic_dec(&rb->mmap_count); 7332 - } 7333 - aux_unlock: 7334 - if (aux_mutex) 7335 - mutex_unlock(aux_mutex); 7336 - mutex_unlock(&event->mmap_mutex); 7337 - 7338 - if (ret) 7339 - return ret; 7340 6990 7341 6991 /* 7342 6992 * Since pinned accounting is per vm we cannot allow fork() to copy our ··· 7186 7174 * full cleanup in this case and therefore does not invoke 7187 7175 * vmops::close(). 7188 7176 */ 7189 - ret = map_range(rb, vma); 7177 + ret = map_range(event->rb, vma); 7190 7178 if (ret) 7191 7179 perf_mmap_close(vma); 7192 7180 ··· 7452 7440 if (user_mode(regs)) { 7453 7441 regs_user->abi = perf_reg_abi(current); 7454 7442 regs_user->regs = regs; 7455 - } else if (!(current->flags & PF_KTHREAD)) { 7443 + } else if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) { 7456 7444 perf_get_regs_user(regs_user, regs); 7457 7445 } else { 7458 7446 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; ··· 8092 8080 * Try IRQ-safe get_user_page_fast_only first. 8093 8081 * If failed, leave phys_addr as 0. 
8094 8082 */ 8095 - if (current->mm != NULL) { 8083 + if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) { 8096 8084 struct page *p; 8097 8085 8098 8086 pagefault_disable(); ··· 8204 8192 perf_callchain(struct perf_event *event, struct pt_regs *regs) 8205 8193 { 8206 8194 bool kernel = !event->attr.exclude_callchain_kernel; 8207 - bool user = !event->attr.exclude_callchain_user; 8195 + bool user = !event->attr.exclude_callchain_user && 8196 + !(current->flags & (PF_KTHREAD | PF_USER_WORKER)); 8208 8197 /* Disallow cross-task user callchains. */ 8209 8198 bool crosstask = event->ctx->task && event->ctx->task != current; 8210 8199 const u32 max_stack = event->attr.sample_max_stack; ··· 8217 8204 if (!kernel && !user) 8218 8205 return &__empty_callchain; 8219 8206 8220 - callchain = get_perf_callchain(regs, 0, kernel, user, 8207 + callchain = get_perf_callchain(regs, kernel, user, 8221 8208 max_stack, crosstask, true); 8222 8209 return callchain ?: &__empty_callchain; 8223 8210 } ··· 13262 13249 mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex); 13263 13250 set: 13264 13251 /* Can't redirect output if we've got an active mmap() */ 13265 - if (atomic_read(&event->mmap_count)) 13252 + if (refcount_read(&event->mmap_count)) 13266 13253 goto unlock; 13267 13254 13268 13255 if (output_event) { ··· 13275 13262 goto unlock; 13276 13263 13277 13264 /* did we race against perf_mmap_close() */ 13278 - if (!atomic_read(&rb->mmap_count)) { 13265 + if (!refcount_read(&rb->mmap_count)) { 13279 13266 ring_buffer_put(rb); 13280 13267 goto unlock; 13281 13268 }
+2 -2
kernel/events/internal.h
··· 35 35 spinlock_t event_lock; 36 36 struct list_head event_list; 37 37 38 - atomic_t mmap_count; 38 + refcount_t mmap_count; 39 39 unsigned long mmap_locked; 40 40 struct user_struct *mmap_user; 41 41 ··· 47 47 unsigned long aux_pgoff; 48 48 int aux_nr_pages; 49 49 int aux_overwrite; 50 - atomic_t aux_mmap_count; 50 + refcount_t aux_mmap_count; 51 51 unsigned long aux_mmap_locked; 52 52 void (*free_aux)(void *); 53 53 refcount_t aux_refcount;
+1 -1
kernel/events/ring_buffer.c
··· 400 400 * the same order, see perf_mmap_close. Otherwise we end up freeing 401 401 * aux pages in this path, which is a bug, because in_atomic(). 402 402 */ 403 - if (!atomic_read(&rb->aux_mmap_count)) 403 + if (!refcount_read(&rb->aux_mmap_count)) 404 404 goto err; 405 405 406 406 if (!refcount_inc_not_zero(&rb->aux_refcount))
+73 -29
kernel/events/uprobes.c
··· 177 177 return is_swbp_insn(insn); 178 178 } 179 179 180 - static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len) 180 + void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len) 181 181 { 182 182 void *kaddr = kmap_atomic(page); 183 183 memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len); ··· 191 191 kunmap_atomic(kaddr); 192 192 } 193 193 194 - static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode) 194 + static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *insn, 195 + int nbytes, void *data) 195 196 { 196 197 uprobe_opcode_t old_opcode; 197 198 bool is_swbp; ··· 206 205 * is a trap variant; uprobes always wins over any other (gdb) 207 206 * breakpoint. 208 207 */ 209 - copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE); 208 + uprobe_copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE); 210 209 is_swbp = is_swbp_insn(&old_opcode); 211 210 212 - if (is_swbp_insn(new_opcode)) { 211 + if (is_swbp_insn(insn)) { 213 212 if (is_swbp) /* register: already installed? */ 214 213 return 0; 215 214 } else { ··· 400 399 return identical; 401 400 } 402 401 403 - static int __uprobe_write_opcode(struct vm_area_struct *vma, 402 + static int __uprobe_write(struct vm_area_struct *vma, 404 403 struct folio_walk *fw, struct folio *folio, 405 - unsigned long opcode_vaddr, uprobe_opcode_t opcode) 404 + unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes, 405 + bool is_register) 406 406 { 407 - const unsigned long vaddr = opcode_vaddr & PAGE_MASK; 408 - const bool is_register = !!is_swbp_insn(&opcode); 407 + const unsigned long vaddr = insn_vaddr & PAGE_MASK; 409 408 bool pmd_mappable; 410 409 411 410 /* For now, we'll only handle PTE-mapped folios. 
*/ ··· 430 429 */ 431 430 flush_cache_page(vma, vaddr, pte_pfn(fw->pte)); 432 431 fw->pte = ptep_clear_flush(vma, vaddr, fw->ptep); 433 - copy_to_page(fw->page, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); 432 + copy_to_page(fw->page, insn_vaddr, insn, nbytes); 434 433 435 434 /* 436 435 * When unregistering, we may only zap a PTE if uffd is disabled and ··· 483 482 * @opcode_vaddr: the virtual address to store the opcode. 484 483 * @opcode: opcode to be written at @opcode_vaddr. 485 484 * 486 - * Called with mm->mmap_lock held for read or write. 485 + * Called with mm->mmap_lock held for write. 487 486 * Return 0 (success) or a negative errno. 488 487 */ 489 488 int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, 490 - const unsigned long opcode_vaddr, uprobe_opcode_t opcode) 489 + const unsigned long opcode_vaddr, uprobe_opcode_t opcode, 490 + bool is_register) 491 491 { 492 - const unsigned long vaddr = opcode_vaddr & PAGE_MASK; 492 + return uprobe_write(auprobe, vma, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE, 493 + verify_opcode, is_register, true /* do_update_ref_ctr */, NULL); 494 + } 495 + 496 + int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, 497 + const unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes, 498 + uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr, 499 + void *data) 500 + { 501 + const unsigned long vaddr = insn_vaddr & PAGE_MASK; 493 502 struct mm_struct *mm = vma->vm_mm; 494 503 struct uprobe *uprobe; 495 - int ret, is_register, ref_ctr_updated = 0; 504 + int ret, ref_ctr_updated = 0; 496 505 unsigned int gup_flags = FOLL_FORCE; 497 506 struct mmu_notifier_range range; 498 507 struct folio_walk fw; 499 508 struct folio *folio; 500 509 struct page *page; 501 510 502 - is_register = is_swbp_insn(&opcode); 503 511 uprobe = container_of(auprobe, struct uprobe, arch); 504 512 505 513 if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags))) ··· 519 509 * page that we 
can safely modify. Use FOLL_WRITE to trigger a write 520 510 * fault if required. When unregistering, we might be lucky and the 521 511 * anon page is already gone. So defer write faults until really 522 - * required. Use FOLL_SPLIT_PMD, because __uprobe_write_opcode() 512 + * required. Use FOLL_SPLIT_PMD, because __uprobe_write() 523 513 * cannot deal with PMDs yet. 524 514 */ 525 515 if (is_register) ··· 531 521 goto out; 532 522 folio = page_folio(page); 533 523 534 - ret = verify_opcode(page, opcode_vaddr, &opcode); 524 + ret = verify(page, insn_vaddr, insn, nbytes, data); 535 525 if (ret <= 0) { 536 526 folio_put(folio); 537 527 goto out; 538 528 } 539 529 540 530 /* We are going to replace instruction, update ref_ctr. */ 541 - if (!ref_ctr_updated && uprobe->ref_ctr_offset) { 531 + if (do_update_ref_ctr && !ref_ctr_updated && uprobe->ref_ctr_offset) { 542 532 ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1); 543 533 if (ret) { 544 534 folio_put(folio); ··· 570 560 /* Walk the page tables again, to perform the actual update. */ 571 561 if (folio_walk_start(&fw, vma, vaddr, 0)) { 572 562 if (fw.page == page) 573 - ret = __uprobe_write_opcode(vma, &fw, folio, opcode_vaddr, opcode); 563 + ret = __uprobe_write(vma, &fw, folio, insn_vaddr, insn, nbytes, is_register); 574 564 folio_walk_end(&fw, vma); 575 565 } 576 566 ··· 590 580 591 581 out: 592 582 /* Revert back reference counter if instruction update failed. */ 593 - if (ret < 0 && ref_ctr_updated) 583 + if (do_update_ref_ctr && ret < 0 && ref_ctr_updated) 594 584 update_ref_ctr(uprobe, mm, is_register ? 
-1 : 1); 595 585 596 586 /* try collapse pmd for compound page */ ··· 612 602 int __weak set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma, 613 603 unsigned long vaddr) 614 604 { 615 - return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN); 605 + return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN, true); 616 606 } 617 607 618 608 /** ··· 628 618 struct vm_area_struct *vma, unsigned long vaddr) 629 619 { 630 620 return uprobe_write_opcode(auprobe, vma, vaddr, 631 - *(uprobe_opcode_t *)&auprobe->insn); 621 + *(uprobe_opcode_t *)&auprobe->insn, false); 632 622 } 633 623 634 624 /* uprobe should have guaranteed positive refcount */ ··· 1061 1051 if (IS_ERR(page)) 1062 1052 return PTR_ERR(page); 1063 1053 1064 - copy_from_page(page, offset, insn, nbytes); 1054 + uprobe_copy_from_page(page, offset, insn, nbytes); 1065 1055 put_page(page); 1066 1056 1067 1057 return 0; ··· 1220 1210 * reclaim. This is optimistic, no harm done if it fails. 1221 1211 */ 1222 1212 prev = kmalloc(sizeof(struct map_info), 1223 - GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN); 1213 + GFP_NOWAIT | __GFP_NOMEMALLOC); 1224 1214 if (prev) 1225 1215 prev->next = NULL; 1226 1216 } ··· 1407 1397 return ERR_PTR(-EINVAL); 1408 1398 1409 1399 /* 1410 - * This ensures that copy_from_page(), copy_to_page() and 1400 + * This ensures that uprobe_copy_from_page(), copy_to_page() and 1411 1401 * __update_ref_ctr() can't cross page boundary. 
1412 1402 */ 1413 1403 if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE)) ··· 1473 1463 struct vm_area_struct *vma; 1474 1464 int err = 0; 1475 1465 1476 - mmap_read_lock(mm); 1466 + mmap_write_lock(mm); 1477 1467 for_each_vma(vmi, vma) { 1478 1468 unsigned long vaddr; 1479 1469 loff_t offset; ··· 1490 1480 vaddr = offset_to_vaddr(vma, uprobe->offset); 1491 1481 err |= remove_breakpoint(uprobe, vma, vaddr); 1492 1482 } 1493 - mmap_read_unlock(mm); 1483 + mmap_write_unlock(mm); 1494 1484 1495 1485 return err; 1496 1486 } ··· 1736 1726 return ret; 1737 1727 } 1738 1728 1739 - void * __weak arch_uprobe_trampoline(unsigned long *psize) 1729 + void * __weak arch_uretprobe_trampoline(unsigned long *psize) 1740 1730 { 1741 1731 static uprobe_opcode_t insn = UPROBE_SWBP_INSN; 1742 1732 ··· 1768 1758 init_waitqueue_head(&area->wq); 1769 1759 /* Reserve the 1st slot for get_trampoline_vaddr() */ 1770 1760 set_bit(0, area->bitmap); 1771 - insns = arch_uprobe_trampoline(&insns_size); 1761 + insns = arch_uretprobe_trampoline(&insns_size); 1772 1762 arch_uprobe_copy_ixol(area->page, 0, insns, insns_size); 1773 1763 1774 1764 if (!xol_add_vma(mm, area)) ··· 1802 1792 return area; 1803 1793 } 1804 1794 1795 + void __weak arch_uprobe_clear_state(struct mm_struct *mm) 1796 + { 1797 + } 1798 + 1799 + void __weak arch_uprobe_init_state(struct mm_struct *mm) 1800 + { 1801 + } 1802 + 1805 1803 /* 1806 1804 * uprobe_clear_state - Free the area allocated for slots. 
1807 1805 */ ··· 1820 1802 mutex_lock(&delayed_uprobe_lock); 1821 1803 delayed_uprobe_remove(NULL, mm); 1822 1804 mutex_unlock(&delayed_uprobe_lock); 1805 + 1806 + arch_uprobe_clear_state(mm); 1823 1807 1824 1808 if (!area) 1825 1809 return; ··· 2413 2393 if (result < 0) 2414 2394 return result; 2415 2395 2416 - copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); 2396 + uprobe_copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); 2417 2397 put_page(page); 2418 2398 out: 2419 2399 /* This needs to return true for any variant of the trap insn */ ··· 2697 2677 return true; 2698 2678 } 2699 2679 2680 + void __weak arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr) 2681 + { 2682 + } 2683 + 2700 2684 /* 2701 2685 * Run handler and ask thread to singlestep. 2702 2686 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. ··· 2765 2741 2766 2742 handler_chain(uprobe, regs); 2767 2743 2744 + /* Try to optimize after first hit. */ 2745 + arch_uprobe_optimize(&uprobe->arch, bp_vaddr); 2746 + 2768 2747 if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) 2769 2748 goto out; 2770 2749 ··· 2777 2750 out: 2778 2751 /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */ 2779 2752 rcu_read_unlock_trace(); 2753 + } 2754 + 2755 + void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr) 2756 + { 2757 + struct uprobe *uprobe; 2758 + int is_swbp; 2759 + 2760 + guard(rcu_tasks_trace)(); 2761 + 2762 + uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp); 2763 + if (!uprobe) 2764 + return; 2765 + if (!get_utask()) 2766 + return; 2767 + if (arch_uprobe_ignore(&uprobe->arch, regs)) 2768 + return; 2769 + handler_chain(uprobe, regs); 2780 2770 } 2781 2771 2782 2772 /*
+1
kernel/fork.c
··· 1014 1014 { 1015 1015 #ifdef CONFIG_UPROBES 1016 1016 mm->uprobes_state.xol_area = NULL; 1017 + arch_uprobe_init_state(mm); 1017 1018 #endif 1018 1019 } 1019 1020
+25 -7
kernel/seccomp.c
··· 741 741 } 742 742 743 743 #ifdef SECCOMP_ARCH_NATIVE 744 + static bool seccomp_uprobe_exception(struct seccomp_data *sd) 745 + { 746 + #if defined __NR_uretprobe || defined __NR_uprobe 747 + #ifdef SECCOMP_ARCH_COMPAT 748 + if (sd->arch == SECCOMP_ARCH_NATIVE) 749 + #endif 750 + { 751 + #ifdef __NR_uretprobe 752 + if (sd->nr == __NR_uretprobe) 753 + return true; 754 + #endif 755 + #ifdef __NR_uprobe 756 + if (sd->nr == __NR_uprobe) 757 + return true; 758 + #endif 759 + } 760 + #endif 761 + return false; 762 + } 763 + 744 764 /** 745 765 * seccomp_is_const_allow - check if filter is constant allow with given data 746 766 * @fprog: The BPF programs ··· 778 758 return false; 779 759 780 760 /* Our single exception to filtering. */ 781 - #ifdef __NR_uretprobe 782 - #ifdef SECCOMP_ARCH_COMPAT 783 - if (sd->arch == SECCOMP_ARCH_NATIVE) 784 - #endif 785 - if (sd->nr == __NR_uretprobe) 786 - return true; 787 - #endif 761 + if (seccomp_uprobe_exception(sd)) 762 + return true; 788 763 789 764 for (pc = 0; pc < fprog->len; pc++) { 790 765 struct sock_filter *insn = &fprog->filter[pc]; ··· 1057 1042 __NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn, 1058 1043 #ifdef __NR_uretprobe 1059 1044 __NR_uretprobe, 1045 + #endif 1046 + #ifdef __NR_uprobe 1047 + __NR_uprobe, 1060 1048 #endif 1061 1049 -1, /* negative terminated */ 1062 1050 };
+1
kernel/sys_ni.c
··· 392 392 COND_SYSCALL(rseq); 393 393 394 394 COND_SYSCALL(uretprobe); 395 + COND_SYSCALL(uprobe);
+8 -6
tools/arch/x86/include/asm/msr-index.h
··· 315 315 #define PERF_CAP_PT_IDX 16 316 316 317 317 #define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 318 - #define PERF_CAP_PEBS_TRAP BIT_ULL(6) 319 - #define PERF_CAP_ARCH_REG BIT_ULL(7) 320 - #define PERF_CAP_PEBS_FORMAT 0xf00 321 - #define PERF_CAP_PEBS_BASELINE BIT_ULL(14) 322 - #define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \ 323 - PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE) 318 + #define PERF_CAP_PEBS_TRAP BIT_ULL(6) 319 + #define PERF_CAP_ARCH_REG BIT_ULL(7) 320 + #define PERF_CAP_PEBS_FORMAT 0xf00 321 + #define PERF_CAP_PEBS_BASELINE BIT_ULL(14) 322 + #define PERF_CAP_PEBS_TIMING_INFO BIT_ULL(17) 323 + #define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \ 324 + PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE | \ 325 + PERF_CAP_PEBS_TIMING_INFO) 324 326 325 327 #define MSR_IA32_RTIT_CTL 0x00000570 326 328 #define RTIT_CTL_TRACEEN BIT(0)
+462 -44
tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
··· 8 8 #include <asm/ptrace.h> 9 9 #include <linux/compiler.h> 10 10 #include <linux/stringify.h> 11 + #include <linux/kernel.h> 11 12 #include <sys/wait.h> 12 13 #include <sys/syscall.h> 13 14 #include <sys/prctl.h> 14 15 #include <asm/prctl.h> 15 16 #include "uprobe_syscall.skel.h" 16 17 #include "uprobe_syscall_executed.skel.h" 18 + #include "bpf/libbpf_internal.h" 17 19 18 - __naked unsigned long uretprobe_regs_trigger(void) 20 + #define USDT_NOP .byte 0x0f, 0x1f, 0x44, 0x00, 0x00 21 + #include "usdt.h" 22 + 23 + #pragma GCC diagnostic ignored "-Wattributes" 24 + 25 + __attribute__((aligned(16))) 26 + __nocf_check __weak __naked unsigned long uprobe_regs_trigger(void) 19 27 { 20 28 asm volatile ( 29 + ".byte 0x0f, 0x1f, 0x44, 0x00, 0x00\n" /* nop5 */ 21 30 "movq $0xdeadbeef, %rax\n" 22 31 "ret\n" 23 32 ); 24 33 } 25 34 26 - __naked void uretprobe_regs(struct pt_regs *before, struct pt_regs *after) 35 + __naked void uprobe_regs(struct pt_regs *before, struct pt_regs *after) 27 36 { 28 37 asm volatile ( 29 38 "movq %r15, 0(%rdi)\n" ··· 53 44 "movq $0, 120(%rdi)\n" /* orig_rax */ 54 45 "movq $0, 128(%rdi)\n" /* rip */ 55 46 "movq $0, 136(%rdi)\n" /* cs */ 47 + "pushq %rax\n" 56 48 "pushf\n" 57 49 "pop %rax\n" 58 50 "movq %rax, 144(%rdi)\n" /* eflags */ 51 + "pop %rax\n" 59 52 "movq %rsp, 152(%rdi)\n" /* rsp */ 60 53 "movq $0, 160(%rdi)\n" /* ss */ 61 54 62 55 /* save 2nd argument */ 63 56 "pushq %rsi\n" 64 - "call uretprobe_regs_trigger\n" 57 + "call uprobe_regs_trigger\n" 65 58 66 59 /* save return value and load 2nd argument pointer to rax */ 67 60 "pushq %rax\n" ··· 103 92 ); 104 93 } 105 94 106 - static void test_uretprobe_regs_equal(void) 95 + static void test_uprobe_regs_equal(bool retprobe) 107 96 { 97 + LIBBPF_OPTS(bpf_uprobe_opts, opts, 98 + .retprobe = retprobe, 99 + ); 108 100 struct uprobe_syscall *skel = NULL; 109 101 struct pt_regs before = {}, after = {}; 110 102 unsigned long *pb = (unsigned long *) &before; 111 103 unsigned long *pa = (unsigned 
long *) &after; 112 104 unsigned long *pp; 105 + unsigned long offset; 113 106 unsigned int i, cnt; 114 - int err; 107 + 108 + offset = get_uprobe_offset(&uprobe_regs_trigger); 109 + if (!ASSERT_GE(offset, 0, "get_uprobe_offset")) 110 + return; 115 111 116 112 skel = uprobe_syscall__open_and_load(); 117 113 if (!ASSERT_OK_PTR(skel, "uprobe_syscall__open_and_load")) 118 114 goto cleanup; 119 115 120 - err = uprobe_syscall__attach(skel); 121 - if (!ASSERT_OK(err, "uprobe_syscall__attach")) 116 + skel->links.probe = bpf_program__attach_uprobe_opts(skel->progs.probe, 117 + 0, "/proc/self/exe", offset, &opts); 118 + if (!ASSERT_OK_PTR(skel->links.probe, "bpf_program__attach_uprobe_opts")) 122 119 goto cleanup; 123 120 124 - uretprobe_regs(&before, &after); 121 + /* make sure uprobe gets optimized */ 122 + if (!retprobe) 123 + uprobe_regs_trigger(); 124 + 125 + uprobe_regs(&before, &after); 125 126 126 127 pp = (unsigned long *) &skel->bss->regs; 127 128 cnt = sizeof(before)/sizeof(*pb); ··· 142 119 unsigned int offset = i * sizeof(unsigned long); 143 120 144 121 /* 145 - * Check register before and after uretprobe_regs_trigger call 122 + * Check register before and after uprobe_regs_trigger call 146 123 * that triggers the uretprobe. 147 124 */ 148 125 switch (offset) { ··· 156 133 157 134 /* 158 135 * Check register seen from bpf program and register after 159 - * uretprobe_regs_trigger call 136 + * uprobe_regs_trigger call (with rax exception, check below). 
160 137 */ 161 138 switch (offset) { 162 139 /* ··· 169 146 case offsetof(struct pt_regs, rsp): 170 147 case offsetof(struct pt_regs, ss): 171 148 break; 149 + /* 150 + * uprobe does not see return value in rax, it needs to see the 151 + * original (before) rax value 152 + */ 153 + case offsetof(struct pt_regs, rax): 154 + if (!retprobe) { 155 + ASSERT_EQ(pp[i], pb[i], "uprobe rax prog-before value check"); 156 + break; 157 + } 172 158 default: 173 159 if (!ASSERT_EQ(pp[i], pa[i], "register prog-after value check")) 174 160 fprintf(stdout, "failed register offset %u\n", offset); ··· 207 175 return ret != n ? (int) ret : 0; 208 176 } 209 177 210 - static void test_uretprobe_regs_change(void) 178 + static void test_regs_change(void) 211 179 { 212 180 struct pt_regs before = {}, after = {}; 213 181 unsigned long *pb = (unsigned long *) &before; ··· 215 183 unsigned long cnt = sizeof(before)/sizeof(*pb); 216 184 unsigned int i, err, offset; 217 185 218 - offset = get_uprobe_offset(uretprobe_regs_trigger); 186 + offset = get_uprobe_offset(uprobe_regs_trigger); 219 187 220 188 err = write_bpf_testmod_uprobe(offset); 221 189 if (!ASSERT_OK(err, "register_uprobe")) 222 190 return; 223 191 224 - uretprobe_regs(&before, &after); 192 + /* make sure uprobe gets optimized */ 193 + uprobe_regs_trigger(); 194 + 195 + uprobe_regs(&before, &after); 225 196 226 197 err = write_bpf_testmod_uprobe(0); 227 198 if (!ASSERT_OK(err, "unregister_uprobe")) ··· 287 252 ); 288 253 struct uprobe_syscall_executed *skel; 289 254 int pid, status, err, go[2], c = 0; 255 + struct bpf_link *link; 290 256 291 257 if (!ASSERT_OK(pipe(go), "pipe")) 292 258 return; ··· 313 277 _exit(0); 314 278 } 315 279 316 - skel->links.test = bpf_program__attach_uprobe_multi(skel->progs.test, pid, 317 - "/proc/self/exe", 318 - "uretprobe_syscall_call", &opts); 319 - if (!ASSERT_OK_PTR(skel->links.test, "bpf_program__attach_uprobe_multi")) 280 + skel->bss->pid = pid; 281 + 282 + link = 
bpf_program__attach_uprobe_multi(skel->progs.test_uretprobe_multi, 283 + pid, "/proc/self/exe", 284 + "uretprobe_syscall_call", &opts); 285 + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi")) 320 286 goto cleanup; 287 + skel->links.test_uretprobe_multi = link; 321 288 322 289 /* kick the child */ 323 290 write(go[1], &c, 1); ··· 338 299 uprobe_syscall_executed__destroy(skel); 339 300 close(go[1]); 340 301 close(go[0]); 302 + } 303 + 304 + #define TRAMP "[uprobes-trampoline]" 305 + 306 + __attribute__((aligned(16))) 307 + __nocf_check __weak __naked void uprobe_test(void) 308 + { 309 + asm volatile (" \n" 310 + ".byte 0x0f, 0x1f, 0x44, 0x00, 0x00 \n" 311 + "ret \n" 312 + ); 313 + } 314 + 315 + __attribute__((aligned(16))) 316 + __nocf_check __weak void usdt_test(void) 317 + { 318 + USDT(optimized_uprobe, usdt); 319 + } 320 + 321 + static int find_uprobes_trampoline(void *tramp_addr) 322 + { 323 + void *start, *end; 324 + char line[128]; 325 + int ret = -1; 326 + FILE *maps; 327 + 328 + maps = fopen("/proc/self/maps", "r"); 329 + if (!maps) { 330 + fprintf(stderr, "cannot open maps\n"); 331 + return -1; 332 + } 333 + 334 + while (fgets(line, sizeof(line), maps)) { 335 + int m = -1; 336 + 337 + /* We care only about private r-x mappings. 
*/ 338 + if (sscanf(line, "%p-%p r-xp %*x %*x:%*x %*u %n", &start, &end, &m) != 2) 339 + continue; 340 + if (m < 0) 341 + continue; 342 + if (!strncmp(&line[m], TRAMP, sizeof(TRAMP)-1) && (start == tramp_addr)) { 343 + ret = 0; 344 + break; 345 + } 346 + } 347 + 348 + fclose(maps); 349 + return ret; 350 + } 351 + 352 + static unsigned char nop5[5] = { 0x0f, 0x1f, 0x44, 0x00, 0x00 }; 353 + 354 + static void *find_nop5(void *fn) 355 + { 356 + int i; 357 + 358 + for (i = 0; i < 10; i++) { 359 + if (!memcmp(nop5, fn + i, 5)) 360 + return fn + i; 361 + } 362 + return NULL; 363 + } 364 + 365 + typedef void (__attribute__((nocf_check)) *trigger_t)(void); 366 + 367 + static void *check_attach(struct uprobe_syscall_executed *skel, trigger_t trigger, 368 + void *addr, int executed) 369 + { 370 + struct __arch_relative_insn { 371 + __u8 op; 372 + __s32 raddr; 373 + } __packed *call; 374 + void *tramp = NULL; 375 + 376 + /* Uprobe gets optimized after first trigger, so let's press twice. */ 377 + trigger(); 378 + trigger(); 379 + 380 + /* Make sure bpf program got executed.. */ 381 + ASSERT_EQ(skel->bss->executed, executed, "executed"); 382 + 383 + /* .. and check the trampoline is as expected. 
*/ 384 + call = (struct __arch_relative_insn *) addr; 385 + tramp = (void *) (call + 1) + call->raddr; 386 + ASSERT_EQ(call->op, 0xe8, "call"); 387 + ASSERT_OK(find_uprobes_trampoline(tramp), "uprobes_trampoline"); 388 + 389 + return tramp; 390 + } 391 + 392 + static void check_detach(void *addr, void *tramp) 393 + { 394 + /* [uprobes_trampoline] stays after detach */ 395 + ASSERT_OK(find_uprobes_trampoline(tramp), "uprobes_trampoline"); 396 + ASSERT_OK(memcmp(addr, nop5, 5), "nop5"); 397 + } 398 + 399 + static void check(struct uprobe_syscall_executed *skel, struct bpf_link *link, 400 + trigger_t trigger, void *addr, int executed) 401 + { 402 + void *tramp; 403 + 404 + tramp = check_attach(skel, trigger, addr, executed); 405 + bpf_link__destroy(link); 406 + check_detach(addr, tramp); 407 + } 408 + 409 + static void test_uprobe_legacy(void) 410 + { 411 + struct uprobe_syscall_executed *skel = NULL; 412 + LIBBPF_OPTS(bpf_uprobe_opts, opts, 413 + .retprobe = true, 414 + ); 415 + struct bpf_link *link; 416 + unsigned long offset; 417 + 418 + offset = get_uprobe_offset(&uprobe_test); 419 + if (!ASSERT_GE(offset, 0, "get_uprobe_offset")) 420 + goto cleanup; 421 + 422 + /* uprobe */ 423 + skel = uprobe_syscall_executed__open_and_load(); 424 + if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load")) 425 + return; 426 + 427 + skel->bss->pid = getpid(); 428 + 429 + link = bpf_program__attach_uprobe_opts(skel->progs.test_uprobe, 430 + 0, "/proc/self/exe", offset, NULL); 431 + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_opts")) 432 + goto cleanup; 433 + 434 + check(skel, link, uprobe_test, uprobe_test, 2); 435 + 436 + /* uretprobe */ 437 + skel->bss->executed = 0; 438 + 439 + link = bpf_program__attach_uprobe_opts(skel->progs.test_uretprobe, 440 + 0, "/proc/self/exe", offset, &opts); 441 + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_opts")) 442 + goto cleanup; 443 + 444 + check(skel, link, uprobe_test, uprobe_test, 2); 445 + 446 + cleanup: 447 + 
uprobe_syscall_executed__destroy(skel); 448 + } 449 + 450 + static void test_uprobe_multi(void) 451 + { 452 + struct uprobe_syscall_executed *skel = NULL; 453 + LIBBPF_OPTS(bpf_uprobe_multi_opts, opts); 454 + struct bpf_link *link; 455 + unsigned long offset; 456 + 457 + offset = get_uprobe_offset(&uprobe_test); 458 + if (!ASSERT_GE(offset, 0, "get_uprobe_offset")) 459 + goto cleanup; 460 + 461 + opts.offsets = &offset; 462 + opts.cnt = 1; 463 + 464 + skel = uprobe_syscall_executed__open_and_load(); 465 + if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load")) 466 + return; 467 + 468 + skel->bss->pid = getpid(); 469 + 470 + /* uprobe.multi */ 471 + link = bpf_program__attach_uprobe_multi(skel->progs.test_uprobe_multi, 472 + 0, "/proc/self/exe", NULL, &opts); 473 + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi")) 474 + goto cleanup; 475 + 476 + check(skel, link, uprobe_test, uprobe_test, 2); 477 + 478 + /* uretprobe.multi */ 479 + skel->bss->executed = 0; 480 + opts.retprobe = true; 481 + link = bpf_program__attach_uprobe_multi(skel->progs.test_uretprobe_multi, 482 + 0, "/proc/self/exe", NULL, &opts); 483 + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi")) 484 + goto cleanup; 485 + 486 + check(skel, link, uprobe_test, uprobe_test, 2); 487 + 488 + cleanup: 489 + uprobe_syscall_executed__destroy(skel); 490 + } 491 + 492 + static void test_uprobe_session(void) 493 + { 494 + struct uprobe_syscall_executed *skel = NULL; 495 + LIBBPF_OPTS(bpf_uprobe_multi_opts, opts, 496 + .session = true, 497 + ); 498 + struct bpf_link *link; 499 + unsigned long offset; 500 + 501 + offset = get_uprobe_offset(&uprobe_test); 502 + if (!ASSERT_GE(offset, 0, "get_uprobe_offset")) 503 + goto cleanup; 504 + 505 + opts.offsets = &offset; 506 + opts.cnt = 1; 507 + 508 + skel = uprobe_syscall_executed__open_and_load(); 509 + if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load")) 510 + return; 511 + 512 + skel->bss->pid = getpid(); 513 + 514 + link 
= bpf_program__attach_uprobe_multi(skel->progs.test_uprobe_session, 515 + 0, "/proc/self/exe", NULL, &opts); 516 + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi")) 517 + goto cleanup; 518 + 519 + check(skel, link, uprobe_test, uprobe_test, 4); 520 + 521 + cleanup: 522 + uprobe_syscall_executed__destroy(skel); 523 + } 524 + 525 + static void test_uprobe_usdt(void) 526 + { 527 + struct uprobe_syscall_executed *skel; 528 + struct bpf_link *link; 529 + void *addr; 530 + 531 + errno = 0; 532 + addr = find_nop5(usdt_test); 533 + if (!ASSERT_OK_PTR(addr, "find_nop5")) 534 + return; 535 + 536 + skel = uprobe_syscall_executed__open_and_load(); 537 + if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load")) 538 + return; 539 + 540 + skel->bss->pid = getpid(); 541 + 542 + link = bpf_program__attach_usdt(skel->progs.test_usdt, 543 + -1 /* all PIDs */, "/proc/self/exe", 544 + "optimized_uprobe", "usdt", NULL); 545 + if (!ASSERT_OK_PTR(link, "bpf_program__attach_usdt")) 546 + goto cleanup; 547 + 548 + check(skel, link, usdt_test, addr, 2); 549 + 550 + cleanup: 551 + uprobe_syscall_executed__destroy(skel); 341 552 } 342 553 343 554 /* ··· 632 343 return; 633 344 } 634 345 635 - /* Run all of the uretprobe tests. */ 636 - test_uretprobe_regs_equal(); 637 - test_uretprobe_regs_change(); 346 + /* Run all the tests with shadow stack in place. 
*/ 347 + 348 + test_uprobe_regs_equal(false); 349 + test_uprobe_regs_equal(true); 638 350 test_uretprobe_syscall_call(); 351 + 352 + test_uprobe_legacy(); 353 + test_uprobe_multi(); 354 + test_uprobe_session(); 355 + test_uprobe_usdt(); 356 + 357 + test_regs_change(); 639 358 640 359 ARCH_PRCTL(ARCH_SHSTK_DISABLE, ARCH_SHSTK_SHSTK); 641 360 } 361 + 362 + static volatile bool race_stop; 363 + 364 + static USDT_DEFINE_SEMA(race); 365 + 366 + static void *worker_trigger(void *arg) 367 + { 368 + unsigned long rounds = 0; 369 + 370 + while (!race_stop) { 371 + uprobe_test(); 372 + rounds++; 373 + } 374 + 375 + printf("tid %d trigger rounds: %lu\n", gettid(), rounds); 376 + return NULL; 377 + } 378 + 379 + static void *worker_attach(void *arg) 380 + { 381 + LIBBPF_OPTS(bpf_uprobe_opts, opts); 382 + struct uprobe_syscall_executed *skel; 383 + unsigned long rounds = 0, offset; 384 + const char *sema[2] = { 385 + __stringify(USDT_SEMA(race)), 386 + NULL, 387 + }; 388 + unsigned long *ref; 389 + int err; 390 + 391 + offset = get_uprobe_offset(&uprobe_test); 392 + if (!ASSERT_GE(offset, 0, "get_uprobe_offset")) 393 + return NULL; 394 + 395 + err = elf_resolve_syms_offsets("/proc/self/exe", 1, (const char **) &sema, &ref, STT_OBJECT); 396 + if (!ASSERT_OK(err, "elf_resolve_syms_offsets_sema")) 397 + return NULL; 398 + 399 + opts.ref_ctr_offset = *ref; 400 + 401 + skel = uprobe_syscall_executed__open_and_load(); 402 + if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load")) 403 + return NULL; 404 + 405 + skel->bss->pid = getpid(); 406 + 407 + while (!race_stop) { 408 + skel->links.test_uprobe = bpf_program__attach_uprobe_opts(skel->progs.test_uprobe, 409 + 0, "/proc/self/exe", offset, &opts); 410 + if (!ASSERT_OK_PTR(skel->links.test_uprobe, "bpf_program__attach_uprobe_opts")) 411 + break; 412 + 413 + bpf_link__destroy(skel->links.test_uprobe); 414 + skel->links.test_uprobe = NULL; 415 + rounds++; 416 + } 417 + 418 + printf("tid %d attach rounds: %lu hits: %d\n", 
gettid(), rounds, skel->bss->executed); 419 + uprobe_syscall_executed__destroy(skel); 420 + free(ref); 421 + return NULL; 422 + } 423 + 424 + static useconds_t race_msec(void) 425 + { 426 + char *env; 427 + 428 + env = getenv("BPF_SELFTESTS_UPROBE_SYSCALL_RACE_MSEC"); 429 + if (env) 430 + return atoi(env); 431 + 432 + /* default duration is 500ms */ 433 + return 500; 434 + } 435 + 436 + static void test_uprobe_race(void) 437 + { 438 + int err, i, nr_threads; 439 + pthread_t *threads; 440 + 441 + nr_threads = libbpf_num_possible_cpus(); 442 + if (!ASSERT_GT(nr_threads, 0, "libbpf_num_possible_cpus")) 443 + return; 444 + nr_threads = max(2, nr_threads); 445 + 446 + threads = alloca(sizeof(*threads) * nr_threads); 447 + if (!ASSERT_OK_PTR(threads, "malloc")) 448 + return; 449 + 450 + for (i = 0; i < nr_threads; i++) { 451 + err = pthread_create(&threads[i], NULL, i % 2 ? worker_trigger : worker_attach, 452 + NULL); 453 + if (!ASSERT_OK(err, "pthread_create")) 454 + goto cleanup; 455 + } 456 + 457 + usleep(race_msec() * 1000); 458 + 459 + cleanup: 460 + race_stop = true; 461 + for (nr_threads = i, i = 0; i < nr_threads; i++) 462 + pthread_join(threads[i], NULL); 463 + 464 + ASSERT_FALSE(USDT_SEMA_IS_ACTIVE(race), "race_semaphore"); 465 + } 466 + 467 + #ifndef __NR_uprobe 468 + #define __NR_uprobe 336 469 + #endif 470 + 471 + static void test_uprobe_error(void) 472 + { 473 + long err = syscall(__NR_uprobe); 474 + 475 + ASSERT_EQ(err, -1, "error"); 476 + ASSERT_EQ(errno, ENXIO, "errno"); 477 + } 478 + 479 + static void __test_uprobe_syscall(void) 480 + { 481 + if (test__start_subtest("uretprobe_regs_equal")) 482 + test_uprobe_regs_equal(true); 483 + if (test__start_subtest("uretprobe_syscall_call")) 484 + test_uretprobe_syscall_call(); 485 + if (test__start_subtest("uretprobe_shadow_stack")) 486 + test_uretprobe_shadow_stack(); 487 + if (test__start_subtest("uprobe_legacy")) 488 + test_uprobe_legacy(); 489 + if (test__start_subtest("uprobe_multi")) 490 + 
test_uprobe_multi(); 491 + if (test__start_subtest("uprobe_session")) 492 + test_uprobe_session(); 493 + if (test__start_subtest("uprobe_usdt")) 494 + test_uprobe_usdt(); 495 + if (test__start_subtest("uprobe_race")) 496 + test_uprobe_race(); 497 + if (test__start_subtest("uprobe_error")) 498 + test_uprobe_error(); 499 + if (test__start_subtest("uprobe_regs_equal")) 500 + test_uprobe_regs_equal(false); 501 + if (test__start_subtest("regs_change")) 502 + test_regs_change(); 503 + } 642 504 #else 643 - static void test_uretprobe_regs_equal(void) 644 - { 645 - test__skip(); 646 - } 647 - 648 - static void test_uretprobe_regs_change(void) 649 - { 650 - test__skip(); 651 - } 652 - 653 - static void test_uretprobe_syscall_call(void) 654 - { 655 - test__skip(); 656 - } 657 - 658 - static void test_uretprobe_shadow_stack(void) 505 + static void __test_uprobe_syscall(void) 659 506 { 660 507 test__skip(); 661 508 } ··· 799 374 800 375 void test_uprobe_syscall(void) 801 376 { 802 - if (test__start_subtest("uretprobe_regs_equal")) 803 - test_uretprobe_regs_equal(); 804 - if (test__start_subtest("uretprobe_regs_change")) 805 - test_uretprobe_regs_change(); 806 - if (test__start_subtest("uretprobe_syscall_call")) 807 - test_uretprobe_syscall_call(); 808 - if (test__start_subtest("uretprobe_shadow_stack")) 809 - test_uretprobe_shadow_stack(); 377 + __test_uprobe_syscall(); 810 378 }
+25 -13
tools/testing/selftests/bpf/prog_tests/usdt.c
··· 40 40 } 41 41 } 42 42 43 - static void subtest_basic_usdt(void) 43 + static void subtest_basic_usdt(bool optimized) 44 44 { 45 45 LIBBPF_OPTS(bpf_usdt_opts, opts); 46 46 struct test_usdt *skel; 47 47 struct test_usdt__bss *bss; 48 - int err, i; 48 + int err, i, called; 49 + 50 + #define TRIGGER(x) ({ \ 51 + trigger_func(x); \ 52 + if (optimized) \ 53 + trigger_func(x); \ 54 + optimized ? 2 : 1; \ 55 + }) 49 56 50 57 skel = test_usdt__open_and_load(); 51 58 if (!ASSERT_OK_PTR(skel, "skel_open")) ··· 73 66 if (!ASSERT_OK_PTR(skel->links.usdt0, "usdt0_link")) 74 67 goto cleanup; 75 68 76 - trigger_func(1); 69 + called = TRIGGER(1); 77 70 78 - ASSERT_EQ(bss->usdt0_called, 1, "usdt0_called"); 79 - ASSERT_EQ(bss->usdt3_called, 1, "usdt3_called"); 80 - ASSERT_EQ(bss->usdt12_called, 1, "usdt12_called"); 71 + ASSERT_EQ(bss->usdt0_called, called, "usdt0_called"); 72 + ASSERT_EQ(bss->usdt3_called, called, "usdt3_called"); 73 + ASSERT_EQ(bss->usdt12_called, called, "usdt12_called"); 81 74 82 75 ASSERT_EQ(bss->usdt0_cookie, 0xcafedeadbeeffeed, "usdt0_cookie"); 83 76 ASSERT_EQ(bss->usdt0_arg_cnt, 0, "usdt0_arg_cnt"); ··· 126 119 * bpf_program__attach_usdt() handles this properly and attaches to 127 120 * all possible places of USDT invocation. 
128 121 */ 129 - trigger_func(2); 122 + called += TRIGGER(2); 130 123 131 - ASSERT_EQ(bss->usdt0_called, 2, "usdt0_called"); 132 - ASSERT_EQ(bss->usdt3_called, 2, "usdt3_called"); 133 - ASSERT_EQ(bss->usdt12_called, 2, "usdt12_called"); 124 + ASSERT_EQ(bss->usdt0_called, called, "usdt0_called"); 125 + ASSERT_EQ(bss->usdt3_called, called, "usdt3_called"); 126 + ASSERT_EQ(bss->usdt12_called, called, "usdt12_called"); 134 127 135 128 /* only check values that depend on trigger_func()'s input value */ 136 129 ASSERT_EQ(bss->usdt3_args[0], 2, "usdt3_arg1"); ··· 149 142 if (!ASSERT_OK_PTR(skel->links.usdt3, "usdt3_reattach")) 150 143 goto cleanup; 151 144 152 - trigger_func(3); 145 + called += TRIGGER(3); 153 146 154 - ASSERT_EQ(bss->usdt3_called, 3, "usdt3_called"); 147 + ASSERT_EQ(bss->usdt3_called, called, "usdt3_called"); 155 148 /* this time usdt3 has custom cookie */ 156 149 ASSERT_EQ(bss->usdt3_cookie, 0xBADC00C51E, "usdt3_cookie"); 157 150 ASSERT_EQ(bss->usdt3_arg_cnt, 3, "usdt3_arg_cnt"); ··· 165 158 166 159 cleanup: 167 160 test_usdt__destroy(skel); 161 + #undef TRIGGER 168 162 } 169 163 170 164 unsigned short test_usdt_100_semaphore SEC(".probes"); ··· 433 425 void test_usdt(void) 434 426 { 435 427 if (test__start_subtest("basic")) 436 - subtest_basic_usdt(); 428 + subtest_basic_usdt(false); 429 + #ifdef __x86_64__ 430 + if (test__start_subtest("basic_optimized")) 431 + subtest_basic_usdt(true); 432 + #endif 437 433 if (test__start_subtest("multispec")) 438 434 subtest_multispec_usdt(); 439 435 if (test__start_subtest("urand_auto_attach"))
+2 -2
tools/testing/selftests/bpf/progs/uprobe_syscall.c
··· 7 7 8 8 char _license[] SEC("license") = "GPL"; 9 9 10 - SEC("uretprobe//proc/self/exe:uretprobe_regs_trigger") 11 - int uretprobe(struct pt_regs *ctx) 10 + SEC("uprobe") 11 + int probe(struct pt_regs *ctx) 12 12 { 13 13 __builtin_memcpy(&regs, ctx, sizeof(regs)); 14 14 return 0;
+58 -2
tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 #include "vmlinux.h" 3 3 #include <bpf/bpf_helpers.h> 4 + #include <bpf/bpf_tracing.h> 5 + #include <bpf/usdt.bpf.h> 4 6 #include <string.h> 5 7 6 8 struct pt_regs regs; ··· 10 8 char _license[] SEC("license") = "GPL"; 11 9 12 10 int executed = 0; 11 + int pid; 12 + 13 + SEC("uprobe") 14 + int BPF_UPROBE(test_uprobe) 15 + { 16 + if (bpf_get_current_pid_tgid() >> 32 != pid) 17 + return 0; 18 + 19 + executed++; 20 + return 0; 21 + } 22 + 23 + SEC("uretprobe") 24 + int BPF_URETPROBE(test_uretprobe) 25 + { 26 + if (bpf_get_current_pid_tgid() >> 32 != pid) 27 + return 0; 28 + 29 + executed++; 30 + return 0; 31 + } 32 + 33 + SEC("uprobe.multi") 34 + int test_uprobe_multi(struct pt_regs *ctx) 35 + { 36 + if (bpf_get_current_pid_tgid() >> 32 != pid) 37 + return 0; 38 + 39 + executed++; 40 + return 0; 41 + } 13 42 14 43 SEC("uretprobe.multi") 15 - int test(struct pt_regs *regs) 44 + int test_uretprobe_multi(struct pt_regs *ctx) 16 45 { 17 - executed = 1; 46 + if (bpf_get_current_pid_tgid() >> 32 != pid) 47 + return 0; 48 + 49 + executed++; 50 + return 0; 51 + } 52 + 53 + SEC("uprobe.session") 54 + int test_uprobe_session(struct pt_regs *ctx) 55 + { 56 + if (bpf_get_current_pid_tgid() >> 32 != pid) 57 + return 0; 58 + 59 + executed++; 60 + return 0; 61 + } 62 + 63 + SEC("usdt") 64 + int test_usdt(struct pt_regs *ctx) 65 + { 66 + if (bpf_get_current_pid_tgid() >> 32 != pid) 67 + return 0; 68 + 69 + executed++; 18 70 return 0; 19 71 }
+9 -2
tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
··· 501 501 #ifdef __x86_64__ 502 502 503 503 static int 504 + uprobe_handler(struct uprobe_consumer *self, struct pt_regs *regs, __u64 *data) 505 + { 506 + regs->cx = 0x87654321feebdaed; 507 + return 0; 508 + } 509 + 510 + static int 504 511 uprobe_ret_handler(struct uprobe_consumer *self, unsigned long func, 505 512 struct pt_regs *regs, __u64 *data) 506 513 507 514 { 508 515 regs->ax = 0x12345678deadbeef; 509 - regs->cx = 0x87654321feebdaed; 510 516 regs->r11 = (u64) -1; 511 - return true; 517 + return 0; 512 518 } 513 519 514 520 struct testmod_uprobe { ··· 526 520 static DEFINE_MUTEX(testmod_uprobe_mutex); 527 521 528 522 static struct testmod_uprobe uprobe = { 523 + .consumer.handler = uprobe_handler, 529 524 .consumer.ret_handler = uprobe_ret_handler, 530 525 }; 531 526
+545
tools/testing/selftests/bpf/usdt.h
··· 1 + // SPDX-License-Identifier: BSD-2-Clause 2 + /* 3 + * This single-header library defines a collection of variadic macros for 4 + * defining and triggering USDTs (User Statically-Defined Tracepoints): 5 + * 6 + * - For USDTs without associated semaphore: 7 + * USDT(group, name, args...) 8 + * 9 + * - For USDTs with implicit (transparent to the user) semaphore: 10 + * USDT_WITH_SEMA(group, name, args...) 11 + * USDT_IS_ACTIVE(group, name) 12 + * 13 + * - For USDTs with explicit (user-defined and provided) semaphore: 14 + * USDT_WITH_EXPLICIT_SEMA(sema, group, name, args...) 15 + * USDT_SEMA_IS_ACTIVE(sema) 16 + * 17 + * all of which emit a NOP instruction into the instruction stream, and so 18 + * have *zero* overhead for the surrounding code. USDTs are identified by 19 + * a combination of `group` and `name` identifiers, which is used by external 20 + * tracing tooling (tracers) for identifying exact USDTs of interest. 21 + * 22 + * USDTs can have an associated (2-byte) activity counter (USDT semaphore), 23 + * automatically maintained by Linux kernel whenever any correctly written 24 + * BPF-based tracer is attached to the USDT. This USDT semaphore can be used 25 + * to check whether there is a need to do any extra data collection and 26 + * processing for a given USDT (if necessary), and otherwise avoid extra work 27 + * for a common case of USDT not being traced ("active"). 28 + * 29 + * See documentation for USDT_WITH_SEMA()/USDT_IS_ACTIVE() or 30 + * USDT_WITH_EXPLICIT_SEMA()/USDT_SEMA_IS_ACTIVE() APIs below for details on 31 + * working with USDTs with implicitly or explicitly associated 32 + * USDT semaphores, respectively. 33 + * 34 + * There is also some additional data recorded into an auxiliary note 35 + * section. The data in the note section describes the operands, in terms of 36 + * size and location, used by tracing tooling to know where to find USDT 37 + * arguments. Each location is encoded as an assembler operand string. 
38 + * Tracing tools (bpftrace and BPF-based tracers, systemtap, etc) insert 39 + * breakpoints on top of the nop, and decode the location operand-strings, 40 + * like an assembler, to find the values being passed. 41 + * 42 + * The operand strings are selected by the compiler for each operand. 43 + * They are constrained by inline-assembler codes.The default is: 44 + * 45 + * #define USDT_ARG_CONSTRAINT nor 46 + * 47 + * This is a good default if the operands tend to be integral and 48 + * moderate in number (smaller than number of registers). In other 49 + * cases, the compiler may report "'asm' requires impossible reload" or 50 + * similar. In this case, consider simplifying the macro call (fewer 51 + * and simpler operands), reduce optimization, or override the default 52 + * constraints string via: 53 + * 54 + * #define USDT_ARG_CONSTRAINT g 55 + * #include <usdt.h> 56 + * 57 + * For some historical description of USDT v3 format (the one used by this 58 + * library and generally recognized and assumed by BPF-based tracing tools) 59 + * see [0]. The more formal specification can be found at [1]. Additional 60 + * argument constraints information can be found at [2]. 61 + * 62 + * Original SystemTap's sys/sdt.h implementation ([3]) was used as a base for 63 + * this USDT library implementation. Current implementation differs *a lot* in 64 + * terms of exposed user API and general usability, which was the main goal 65 + * and focus of the reimplementation work. Nevertheless, underlying recorded 66 + * USDT definitions are fully binary compatible and any USDT-based tooling 67 + * should work equally well with USDTs defined by either SystemTap's or this 68 + * library's USDT implementation. 
69 + * 70 + * [0] https://ecos.sourceware.org/ml/systemtap/2010-q3/msg00145.html 71 + * [1] https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation 72 + * [2] https://gcc.gnu.org/onlinedocs/gcc/Constraints.html 73 + * [3] https://sourceware.org/git/?p=systemtap.git;a=blob;f=includes/sys/sdt.h 74 + */ 75 + #ifndef __USDT_H 76 + #define __USDT_H 77 + 78 + /* 79 + * Changelog: 80 + * 81 + * 0.1.0 82 + * ----- 83 + * - Initial release 84 + */ 85 + #define USDT_MAJOR_VERSION 0 86 + #define USDT_MINOR_VERSION 1 87 + #define USDT_PATCH_VERSION 0 88 + 89 + /* C++20 and C23 added __VA_OPT__ as a standard replacement for non-standard `##__VA_ARGS__` extension */ 90 + #if (defined(__STDC_VERSION__) && __STDC_VERSION__ > 201710L) || (defined(__cplusplus) && __cplusplus > 201703L) 91 + #define __usdt_va_opt 1 92 + #define __usdt_va_args(...) __VA_OPT__(,) __VA_ARGS__ 93 + #else 94 + #define __usdt_va_args(...) , ##__VA_ARGS__ 95 + #endif 96 + 97 + /* 98 + * Trigger USDT with `group`:`name` identifier and pass through `args` as its 99 + * arguments. Zero arguments are acceptable as well. No USDT semaphore is 100 + * associated with this USDT. 101 + * 102 + * Such "semaphoreless" USDTs are commonly used when there is no extra data 103 + * collection or processing needed to collect and prepare USDT arguments and 104 + * they are just available in the surrounding code. USDT() macro will just 105 + * record their locations in CPU registers or in memory for tracing tooling to 106 + * be able to access them, if necessary. 107 + */ 108 + #ifdef __usdt_va_opt 109 + #define USDT(group, name, ...) \ 110 + __usdt_probe(group, name, __usdt_sema_none, 0 __VA_OPT__(,) __VA_ARGS__) 111 + #else 112 + #define USDT(group, name, ...) \ 113 + __usdt_probe(group, name, __usdt_sema_none, 0, ##__VA_ARGS__) 114 + #endif 115 + 116 + /* 117 + * Trigger USDT with `group`:`name` identifier and pass through `args` as its 118 + * arguments. Zero arguments are acceptable as well. 
USDT also get an 119 + * implicitly-defined associated USDT semaphore, which will be "activated" by 120 + * tracing tooling and can be used to check whether USDT is being actively 121 + * observed. 122 + * 123 + * USDTs with semaphore are commonly used when there is a need to perform 124 + * additional data collection and processing to prepare USDT arguments, which 125 + * otherwise might not be necessary for the rest of application logic. In such 126 + * case, USDT semaphore can be used to avoid unnecessary extra work. If USDT 127 + * is not traced (which is presumed to be a common situation), the associated 128 + * USDT semaphore is "inactive", and so there is no need to waste resources to 129 + * prepare USDT arguments. Use USDT_IS_ACTIVE(group, name) to check whether 130 + * USDT is "active". 131 + * 132 + * N.B. There is an inherent (albeit short) gap between checking whether USDT 133 + * is active and triggering corresponding USDT, in which external tracer can 134 + * be attached to an USDT and activate USDT semaphore after the activity check. 135 + * If such a race occurs, tracers might miss one USDT execution. Tracers are 136 + * expected to accommodate such possibility and this is expected to not be 137 + * a problem for applications and tracers. 138 + * 139 + * N.B. Implicit USDT semaphore defined by USDT_WITH_SEMA() is contained 140 + * within a single executable or shared library and is not shared outside 141 + * them. I.e., if you use USDT_WITH_SEMA() with the same USDT group and name 142 + * identifier across executable and shared library, it will work and won't 143 + * conflict, per se, but will define independent USDT semaphores, one for each 144 + * shared library/executable in which USDT_WITH_SEMA(group, name) is used. 
145 + * That is, if you attach to this USDT in one shared library (or executable), 146 + * then only USDT semaphore within that shared library (or executable) will be 147 + * updated by the kernel, while other libraries (or executable) will not see 148 + * activated USDT semaphore. In short, it's best to use unique USDT group:name 149 + * identifiers across different shared libraries (and, equivalently, between 150 + * executable and shared library). This is advanced consideration and is 151 + * rarely (if ever) seen in practice, but just to avoid surprises this is 152 + * called out here. (Static libraries become a part of final executable, once 153 + * linked by linker, so the above considerations don't apply to them.) 154 + */ 155 + #ifdef __usdt_va_opt 156 + #define USDT_WITH_SEMA(group, name, ...) \ 157 + __usdt_probe(group, name, \ 158 + __usdt_sema_implicit, __usdt_sema_name(group, name) \ 159 + __VA_OPT__(,) __VA_ARGS__) 160 + #else 161 + #define USDT_WITH_SEMA(group, name, ...) \ 162 + __usdt_probe(group, name, \ 163 + __usdt_sema_implicit, __usdt_sema_name(group, name), \ 164 + ##__VA_ARGS__) 165 + #endif 166 + 167 + struct usdt_sema { volatile unsigned short active; }; 168 + 169 + /* 170 + * Check if USDT with `group`:`name` identifier is "active" (i.e., whether it 171 + * is attached to by external tracing tooling and is actively observed). 172 + * 173 + * This macro can be used to decide whether any additional and potentially 174 + * expensive data collection or processing should be done to pass extra 175 + * information into the given USDT. It is assumed that USDT is triggered with 176 + * USDT_WITH_SEMA() macro which will implicitly define associated USDT 177 + * semaphore. (If one needs more control over USDT semaphore, see 178 + * USDT_DEFINE_SEMA() and USDT_WITH_EXPLICIT_SEMA() macros below.) 179 + * 180 + * N.B. Such checks are necessarily racy and speculative. 
Between checking 181 + * whether USDT is active and triggering the USDT itself, tracer can be 182 + * detached with no notification. This race should be extremely rare and worst 183 + * case should result in one-time wasted extra data collection and processing. 184 + */ 185 + #define USDT_IS_ACTIVE(group, name) ({ \ 186 + extern struct usdt_sema __usdt_sema_name(group, name) \ 187 + __usdt_asm_name(__usdt_sema_name(group, name)); \ 188 + __usdt_sema_implicit(__usdt_sema_name(group, name)); \ 189 + __usdt_sema_name(group, name).active > 0; \ 190 + }) 191 + 192 + /* 193 + * APIs for working with user-defined explicit USDT semaphores. 194 + * 195 + * This is a less commonly used advanced API for use cases in which user needs 196 + * an explicit control over (potentially shared across multiple USDTs) USDT 197 + * semaphore instance. This can be used when there is a group of logically 198 + * related USDTs that all need extra data collection and processing whenever 199 + * any of a family of related USDTs are "activated" (i.e., traced). In such 200 + * a case, all such related USDTs will be associated with the same shared USDT 201 + * semaphore defined with USDT_DEFINE_SEMA() and the USDTs themselves will be 202 + * triggered with USDT_WITH_EXPLICIT_SEMA() macros, taking an explicit extra 203 + * USDT semaphore identifier as an extra parameter. 204 + */ 205 + 206 + /** 207 + * Underlying C global variable name for user-defined USDT semaphore with 208 + * `sema` identifier. Could be useful for debugging, but normally shouldn't be 209 + * used explicitly. 210 + */ 211 + #define USDT_SEMA(sema) __usdt_sema_##sema 212 + 213 + /* 214 + * Define storage for user-defined USDT semaphore `sema`. 215 + * 216 + * Should be used only once in non-header source file to let compiler allocate 217 + * space for the semaphore variable. Just like with any other global variable. 218 + * 219 + * This macro can be used anywhere where global variable declaration is 220 + * allowed. 
Just like with global variable definitions, there should be only 221 + * one definition of user-defined USDT semaphore with given `sema` identifier, 222 + * otherwise compiler or linker will complain about duplicate variable 223 + * definition. 224 + * 225 + * For C++, it is allowed to use USDT_DEFINE_SEMA() both in global namespace 226 + * and inside namespaces (including nested namespaces). Just make sure that 227 + * USDT_DECLARE_SEMA() is placed within the namespace where this semaphore is 228 + * referenced, or any of its parent namespaces, so the C++ language-level 229 + * identifier is visible to the code that needs to reference the semaphore. 230 + * At the lowest layer, USDT semaphores have global naming and visibility 231 + * (they have a corresponding `__usdt_sema_<name>` symbol, which can be linked 232 + * against from C or C++ code, if necessary). To keep it simple, putting 233 + * USDT_DECLARE_SEMA() declarations into global namespaces is the simplest 234 + * no-brainer solution. All these aspects are irrelevant for plain C, because 235 + * C doesn't have namespaces and everything is always in the global namespace. 236 + * 237 + * N.B. Due to USDT metadata being recorded in non-allocatable ELF note 238 + * section, it has limitations when it comes to relocations, which, in 239 + * practice, means that it's not possible to correctly share USDT semaphores 240 + * between main executable and shared libraries, or even between multiple 241 + * shared libraries. USDT semaphore has to be contained to individual shared 242 + * library or executable to avoid unpleasant surprises with half-working USDT 243 + * semaphores. We enforce this by marking semaphore ELF symbols as having 244 + * a hidden visibility. This is quite an advanced use case and consideration 245 + * and for most users this should have no consequences whatsoever. 
246 + */ 247 + #define USDT_DEFINE_SEMA(sema) \ 248 + struct usdt_sema __usdt_sema_sec USDT_SEMA(sema) \ 249 + __usdt_asm_name(USDT_SEMA(sema)) \ 250 + __attribute__((visibility("hidden"))) = { 0 } 251 + 252 + /* 253 + * Declare extern reference to user-defined USDT semaphore `sema`. 254 + * 255 + * Refers to a variable defined in another compilation unit by 256 + * USDT_DEFINE_SEMA() and allows to use the same USDT semaphore across 257 + * multiple compilation units (i.e., .c and .cpp files). 258 + * 259 + * See USDT_DEFINE_SEMA() notes above for C++ language usage peculiarities. 260 + */ 261 + #define USDT_DECLARE_SEMA(sema) \ 262 + extern struct usdt_sema USDT_SEMA(sema) __usdt_asm_name(USDT_SEMA(sema)) 263 + 264 + /* 265 + * Check if user-defined USDT semaphore `sema` is "active" (i.e., whether it 266 + * is attached to by external tracing tooling and is actively observed). 267 + * 268 + * This macro can be used to decide whether any additional and potentially 269 + * expensive data collection or processing should be done to pass extra 270 + * information into USDT(s) associated with USDT semaphore `sema`. 271 + * 272 + * N.B. Such checks are necessarily racy. Between checking the state of USDT 273 + * semaphore and triggering associated USDT(s), the active tracer might attach 274 + * or detach. This race should be extremely rare and worst case should result 275 + * in one-time missed USDT event or wasted extra data collection and 276 + * processing. USDT-using tracers should be written with this in mind and is 277 + * not a concern of the application defining USDTs with associated semaphore. 278 + */ 279 + #define USDT_SEMA_IS_ACTIVE(sema) (USDT_SEMA(sema).active > 0) 280 + 281 + /* 282 + * Invoke USDT specified by `group` and `name` identifiers and associate 283 + * explicitly user-defined semaphore `sema` with it. Pass through `args` as 284 + * USDT arguments. `args` are optional and zero arguments are acceptable. 
285 + * 286 + * Semaphore is defined with the help of USDT_DEFINE_SEMA() macro and can be 287 + * checked whether active with USDT_SEMA_IS_ACTIVE(). 288 + */ 289 + #ifdef __usdt_va_opt 290 + #define USDT_WITH_EXPLICIT_SEMA(sema, group, name, ...) \ 291 + __usdt_probe(group, name, __usdt_sema_explicit, USDT_SEMA(sema) __VA_OPT__(,) __VA_ARGS__) 292 + #else 293 + #define USDT_WITH_EXPLICIT_SEMA(sema, group, name, ...) \ 294 + __usdt_probe(group, name, __usdt_sema_explicit, USDT_SEMA(sema), ##__VA_ARGS__) 295 + #endif 296 + 297 + /* 298 + * Adjustable implementation aspects 299 + */ 300 + #ifndef USDT_ARG_CONSTRAINT 301 + #if defined __powerpc__ 302 + #define USDT_ARG_CONSTRAINT nZr 303 + #elif defined __arm__ 304 + #define USDT_ARG_CONSTRAINT g 305 + #elif defined __loongarch__ 306 + #define USDT_ARG_CONSTRAINT nmr 307 + #else 308 + #define USDT_ARG_CONSTRAINT nor 309 + #endif 310 + #endif /* USDT_ARG_CONSTRAINT */ 311 + 312 + #ifndef USDT_NOP 313 + #if defined(__ia64__) || defined(__s390__) || defined(__s390x__) 314 + #define USDT_NOP nop 0 315 + #else 316 + #define USDT_NOP nop 317 + #endif 318 + #endif /* USDT_NOP */ 319 + 320 + /* 321 + * Implementation details 322 + */ 323 + /* USDT name for implicitly-defined USDT semaphore, derived from group:name */ 324 + #define __usdt_sema_name(group, name) __usdt_sema_##group##__##name 325 + /* ELF section into which USDT semaphores are put */ 326 + #define __usdt_sema_sec __attribute__((section(".probes"))) 327 + 328 + #define __usdt_concat(a, b) a ## b 329 + #define __usdt_apply(fn, n) __usdt_concat(fn, n) 330 + 331 + #ifndef __usdt_nth 332 + #define __usdt_nth(_, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, N, ...) N 333 + #endif 334 + 335 + #ifndef __usdt_narg 336 + #ifdef __usdt_va_opt 337 + #define __usdt_narg(...) __usdt_nth(_ __VA_OPT__(,) __VA_ARGS__, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) 338 + #else 339 + #define __usdt_narg(...) 
__usdt_nth(_, ##__VA_ARGS__, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) 340 + #endif 341 + #endif /* __usdt_narg */ 342 + 343 + #define __usdt_hash # 344 + #define __usdt_str_(x) #x 345 + #define __usdt_str(x) __usdt_str_(x) 346 + 347 + #ifndef __usdt_asm_name 348 + #define __usdt_asm_name(name) __asm__(__usdt_str(name)) 349 + #endif 350 + 351 + #define __usdt_asm0() "\n" 352 + #define __usdt_asm1(x) __usdt_str(x) "\n" 353 + #define __usdt_asm2(x, ...) __usdt_str(x) "," __usdt_asm1(__VA_ARGS__) 354 + #define __usdt_asm3(x, ...) __usdt_str(x) "," __usdt_asm2(__VA_ARGS__) 355 + #define __usdt_asm4(x, ...) __usdt_str(x) "," __usdt_asm3(__VA_ARGS__) 356 + #define __usdt_asm5(x, ...) __usdt_str(x) "," __usdt_asm4(__VA_ARGS__) 357 + #define __usdt_asm6(x, ...) __usdt_str(x) "," __usdt_asm5(__VA_ARGS__) 358 + #define __usdt_asm7(x, ...) __usdt_str(x) "," __usdt_asm6(__VA_ARGS__) 359 + #define __usdt_asm8(x, ...) __usdt_str(x) "," __usdt_asm7(__VA_ARGS__) 360 + #define __usdt_asm9(x, ...) __usdt_str(x) "," __usdt_asm8(__VA_ARGS__) 361 + #define __usdt_asm10(x, ...) __usdt_str(x) "," __usdt_asm9(__VA_ARGS__) 362 + #define __usdt_asm11(x, ...) __usdt_str(x) "," __usdt_asm10(__VA_ARGS__) 363 + #define __usdt_asm12(x, ...) __usdt_str(x) "," __usdt_asm11(__VA_ARGS__) 364 + #define __usdt_asm(...) 
__usdt_apply(__usdt_asm, __usdt_narg(__VA_ARGS__))(__VA_ARGS__) 365 + 366 + #ifdef __LP64__ 367 + #define __usdt_asm_addr .8byte 368 + #else 369 + #define __usdt_asm_addr .4byte 370 + #endif 371 + 372 + #define __usdt_asm_strz_(x) __usdt_asm1(.asciz #x) 373 + #define __usdt_asm_strz(x) __usdt_asm_strz_(x) 374 + #define __usdt_asm_str_(x) __usdt_asm1(.ascii #x) 375 + #define __usdt_asm_str(x) __usdt_asm_str_(x) 376 + 377 + /* "semaphoreless" USDT case */ 378 + #ifndef __usdt_sema_none 379 + #define __usdt_sema_none(sema) 380 + #endif 381 + 382 + /* implicitly defined __usdt_sema__group__name semaphore (using weak symbols) */ 383 + #ifndef __usdt_sema_implicit 384 + #define __usdt_sema_implicit(sema) \ 385 + __asm__ __volatile__ ( \ 386 + __usdt_asm1(.ifndef sema) \ 387 + __usdt_asm3( .pushsection .probes, "aw", "progbits") \ 388 + __usdt_asm1( .weak sema) \ 389 + __usdt_asm1( .hidden sema) \ 390 + __usdt_asm1( .align 2) \ 391 + __usdt_asm1(sema:) \ 392 + __usdt_asm1( .zero 2) \ 393 + __usdt_asm2( .type sema, @object) \ 394 + __usdt_asm2( .size sema, 2) \ 395 + __usdt_asm1( .popsection) \ 396 + __usdt_asm1(.endif) \ 397 + ); 398 + #endif 399 + 400 + /* externally defined semaphore using USDT_DEFINE_SEMA() and passed explicitly by user */ 401 + #ifndef __usdt_sema_explicit 402 + #define __usdt_sema_explicit(sema) \ 403 + __asm__ __volatile__ ("" :: "m" (sema)); 404 + #endif 405 + 406 + /* main USDT definition (nop and .note.stapsdt metadata) */ 407 + #define __usdt_probe(group, name, sema_def, sema, ...) 
do { \ 408 + sema_def(sema) \ 409 + __asm__ __volatile__ ( \ 410 + __usdt_asm( 990: USDT_NOP) \ 411 + __usdt_asm3( .pushsection .note.stapsdt, "", "note") \ 412 + __usdt_asm1( .balign 4) \ 413 + __usdt_asm3( .4byte 992f-991f,994f-993f,3) \ 414 + __usdt_asm1(991: .asciz "stapsdt") \ 415 + __usdt_asm1(992: .balign 4) \ 416 + __usdt_asm1(993: __usdt_asm_addr 990b) \ 417 + __usdt_asm1( __usdt_asm_addr _.stapsdt.base) \ 418 + __usdt_asm1( __usdt_asm_addr sema) \ 419 + __usdt_asm_strz(group) \ 420 + __usdt_asm_strz(name) \ 421 + __usdt_asm_args(__VA_ARGS__) \ 422 + __usdt_asm1( .ascii "\0") \ 423 + __usdt_asm1(994: .balign 4) \ 424 + __usdt_asm1( .popsection) \ 425 + __usdt_asm1(.ifndef _.stapsdt.base) \ 426 + __usdt_asm5( .pushsection .stapsdt.base,"aG","progbits",.stapsdt.base,comdat)\ 427 + __usdt_asm1( .weak _.stapsdt.base) \ 428 + __usdt_asm1( .hidden _.stapsdt.base) \ 429 + __usdt_asm1(_.stapsdt.base:) \ 430 + __usdt_asm1( .space 1) \ 431 + __usdt_asm2( .size _.stapsdt.base, 1) \ 432 + __usdt_asm1( .popsection) \ 433 + __usdt_asm1(.endif) \ 434 + :: __usdt_asm_ops(__VA_ARGS__) \ 435 + ); \ 436 + } while (0) 437 + 438 + /* 439 + * NB: gdb PR24541 highlighted an unspecified corner of the sdt.h 440 + * operand note format. 441 + * 442 + * The named register may be a longer or shorter (!) alias for the 443 + * storage where the value in question is found. For example, on 444 + * i386, 64-bit value may be put in register pairs, and a register 445 + * name stored would identify just one of them. Previously, gcc was 446 + * asked to emit the %w[id] (16-bit alias of some registers holding 447 + * operands), even when a wider 32-bit value was used. 448 + * 449 + * Bottom line: the byte-width given before the @ sign governs. 
If 450 + * there is a mismatch between that width and that of the named 451 + * register, then a sys/sdt.h note consumer may need to employ 452 + * architecture-specific heuristics to figure out where the compiler 453 + * has actually put the complete value. 454 + */ 455 + #if defined(__powerpc__) || defined(__powerpc64__) 456 + #define __usdt_argref(id) %I[id]%[id] 457 + #elif defined(__i386__) 458 + #define __usdt_argref(id) %k[id] /* gcc.gnu.org/PR80115 sourceware.org/PR24541 */ 459 + #else 460 + #define __usdt_argref(id) %[id] 461 + #endif 462 + 463 + #define __usdt_asm_arg(n) __usdt_asm_str(%c[__usdt_asz##n]) \ 464 + __usdt_asm1(.ascii "@") \ 465 + __usdt_asm_str(__usdt_argref(__usdt_aval##n)) 466 + 467 + #define __usdt_asm_args0 /* no arguments */ 468 + #define __usdt_asm_args1 __usdt_asm_arg(1) 469 + #define __usdt_asm_args2 __usdt_asm_args1 __usdt_asm1(.ascii " ") __usdt_asm_arg(2) 470 + #define __usdt_asm_args3 __usdt_asm_args2 __usdt_asm1(.ascii " ") __usdt_asm_arg(3) 471 + #define __usdt_asm_args4 __usdt_asm_args3 __usdt_asm1(.ascii " ") __usdt_asm_arg(4) 472 + #define __usdt_asm_args5 __usdt_asm_args4 __usdt_asm1(.ascii " ") __usdt_asm_arg(5) 473 + #define __usdt_asm_args6 __usdt_asm_args5 __usdt_asm1(.ascii " ") __usdt_asm_arg(6) 474 + #define __usdt_asm_args7 __usdt_asm_args6 __usdt_asm1(.ascii " ") __usdt_asm_arg(7) 475 + #define __usdt_asm_args8 __usdt_asm_args7 __usdt_asm1(.ascii " ") __usdt_asm_arg(8) 476 + #define __usdt_asm_args9 __usdt_asm_args8 __usdt_asm1(.ascii " ") __usdt_asm_arg(9) 477 + #define __usdt_asm_args10 __usdt_asm_args9 __usdt_asm1(.ascii " ") __usdt_asm_arg(10) 478 + #define __usdt_asm_args11 __usdt_asm_args10 __usdt_asm1(.ascii " ") __usdt_asm_arg(11) 479 + #define __usdt_asm_args12 __usdt_asm_args11 __usdt_asm1(.ascii " ") __usdt_asm_arg(12) 480 + #define __usdt_asm_args(...) 
__usdt_apply(__usdt_asm_args, __usdt_narg(__VA_ARGS__)) 481 + 482 + #define __usdt_is_arr(x) (__builtin_classify_type(x) == 14 || __builtin_classify_type(x) == 5) 483 + #define __usdt_arg_size(x) (__usdt_is_arr(x) ? sizeof(void *) : sizeof(x)) 484 + 485 + /* 486 + * We can't use __builtin_choose_expr() in C++, so fall back to table-based 487 + * signedness determination for known types, utilizing templates magic. 488 + */ 489 + #ifdef __cplusplus 490 + 491 + #define __usdt_is_signed(x) (!__usdt_is_arr(x) && __usdt_t<__typeof(x)>::is_signed) 492 + 493 + #include <cstddef> 494 + 495 + template<typename T> struct __usdt_t { static const bool is_signed = false; }; 496 + template<typename A> struct __usdt_t<A[]> : public __usdt_t<A *> {}; 497 + template<typename A, size_t N> struct __usdt_t<A[N]> : public __usdt_t<A *> {}; 498 + 499 + #define __usdt_def_signed(T) \ 500 + template<> struct __usdt_t<T> { static const bool is_signed = true; }; \ 501 + template<> struct __usdt_t<const T> { static const bool is_signed = true; }; \ 502 + template<> struct __usdt_t<volatile T> { static const bool is_signed = true; }; \ 503 + template<> struct __usdt_t<const volatile T> { static const bool is_signed = true; } 504 + #define __usdt_maybe_signed(T) \ 505 + template<> struct __usdt_t<T> { static const bool is_signed = (T)-1 < (T)1; }; \ 506 + template<> struct __usdt_t<const T> { static const bool is_signed = (T)-1 < (T)1; }; \ 507 + template<> struct __usdt_t<volatile T> { static const bool is_signed = (T)-1 < (T)1; }; \ 508 + template<> struct __usdt_t<const volatile T> { static const bool is_signed = (T)-1 < (T)1; } 509 + 510 + __usdt_def_signed(signed char); 511 + __usdt_def_signed(short); 512 + __usdt_def_signed(int); 513 + __usdt_def_signed(long); 514 + __usdt_def_signed(long long); 515 + __usdt_maybe_signed(char); 516 + __usdt_maybe_signed(wchar_t); 517 + 518 + #else /* !__cplusplus */ 519 + 520 + #define __usdt_is_inttype(x) (__builtin_classify_type(x) >= 1 && 
__builtin_classify_type(x) <= 4) 521 + #define __usdt_inttype(x) __typeof(__builtin_choose_expr(__usdt_is_inttype(x), (x), 0U)) 522 + #define __usdt_is_signed(x) ((__usdt_inttype(x))-1 < (__usdt_inttype(x))1) 523 + 524 + #endif /* __cplusplus */ 525 + 526 + #define __usdt_asm_op(n, x) \ 527 + [__usdt_asz##n] "n" ((__usdt_is_signed(x) ? (int)-1 : 1) * (int)__usdt_arg_size(x)), \ 528 + [__usdt_aval##n] __usdt_str(USDT_ARG_CONSTRAINT)(x) 529 + 530 + #define __usdt_asm_ops0() [__usdt_dummy] "g" (0) 531 + #define __usdt_asm_ops1(x) __usdt_asm_op(1, x) 532 + #define __usdt_asm_ops2(a,x) __usdt_asm_ops1(a), __usdt_asm_op(2, x) 533 + #define __usdt_asm_ops3(a,b,x) __usdt_asm_ops2(a,b), __usdt_asm_op(3, x) 534 + #define __usdt_asm_ops4(a,b,c,x) __usdt_asm_ops3(a,b,c), __usdt_asm_op(4, x) 535 + #define __usdt_asm_ops5(a,b,c,d,x) __usdt_asm_ops4(a,b,c,d), __usdt_asm_op(5, x) 536 + #define __usdt_asm_ops6(a,b,c,d,e,x) __usdt_asm_ops5(a,b,c,d,e), __usdt_asm_op(6, x) 537 + #define __usdt_asm_ops7(a,b,c,d,e,f,x) __usdt_asm_ops6(a,b,c,d,e,f), __usdt_asm_op(7, x) 538 + #define __usdt_asm_ops8(a,b,c,d,e,f,g,x) __usdt_asm_ops7(a,b,c,d,e,f,g), __usdt_asm_op(8, x) 539 + #define __usdt_asm_ops9(a,b,c,d,e,f,g,h,x) __usdt_asm_ops8(a,b,c,d,e,f,g,h), __usdt_asm_op(9, x) 540 + #define __usdt_asm_ops10(a,b,c,d,e,f,g,h,i,x) __usdt_asm_ops9(a,b,c,d,e,f,g,h,i), __usdt_asm_op(10, x) 541 + #define __usdt_asm_ops11(a,b,c,d,e,f,g,h,i,j,x) __usdt_asm_ops10(a,b,c,d,e,f,g,h,i,j), __usdt_asm_op(11, x) 542 + #define __usdt_asm_ops12(a,b,c,d,e,f,g,h,i,j,k,x) __usdt_asm_ops11(a,b,c,d,e,f,g,h,i,j,k), __usdt_asm_op(12, x) 543 + #define __usdt_asm_ops(...) __usdt_apply(__usdt_asm_ops, __usdt_narg(__VA_ARGS__))(__VA_ARGS__) 544 + 545 + #endif /* __USDT_H */
+88 -23
tools/testing/selftests/seccomp/seccomp_bpf.c
··· 74 74 #define noinline __attribute__((noinline)) 75 75 #endif 76 76 77 + #ifndef __nocf_check 78 + #define __nocf_check __attribute__((nocf_check)) 79 + #endif 80 + 81 + #ifndef __naked 82 + #define __naked __attribute__((__naked__)) 83 + #endif 84 + 77 85 #ifndef PR_SET_NO_NEW_PRIVS 78 86 #define PR_SET_NO_NEW_PRIVS 38 79 87 #define PR_GET_NO_NEW_PRIVS 39 ··· 5035 5027 EXPECT_EQ(0, status); 5036 5028 } 5037 5029 5038 - noinline int probed(void) 5030 + #ifdef __x86_64__ 5031 + 5032 + /* 5033 + * We need naked probed_uprobe function. Using __nocf_check 5034 + * check to skip possible endbr64 instruction and ignoring 5035 + * -Wattributes, otherwise the compilation might fail. 5036 + */ 5037 + #pragma GCC diagnostic push 5038 + #pragma GCC diagnostic ignored "-Wattributes" 5039 + 5040 + __naked __nocf_check noinline int probed_uprobe(void) 5041 + { 5042 + /* 5043 + * Optimized uprobe is possible only on top of nop5 instruction. 5044 + */ 5045 + asm volatile (" \n" 5046 + ".byte 0x0f, 0x1f, 0x44, 0x00, 0x00 \n" 5047 + "ret \n" 5048 + ); 5049 + } 5050 + #pragma GCC diagnostic pop 5051 + 5052 + #else 5053 + noinline int probed_uprobe(void) 5054 + { 5055 + return 1; 5056 + } 5057 + #endif 5058 + 5059 + noinline int probed_uretprobe(void) 5039 5060 { 5040 5061 return 1; 5041 5062 } ··· 5117 5080 return found ? (uintptr_t)addr - start + base : -1; 5118 5081 } 5119 5082 5120 - FIXTURE(URETPROBE) { 5083 + FIXTURE(UPROBE) { 5121 5084 int fd; 5122 5085 }; 5123 5086 5124 - FIXTURE_VARIANT(URETPROBE) { 5087 + FIXTURE_VARIANT(UPROBE) { 5125 5088 /* 5126 - * All of the URETPROBE behaviors can be tested with either 5127 - * uretprobe attached or not 5089 + * All of the U(RET)PROBE behaviors can be tested with either 5090 + * u(ret)probe attached or not 5128 5091 */ 5129 5092 bool attach; 5093 + /* 5094 + * Test both uprobe and uretprobe. 
5095 + */ 5096 + bool uretprobe; 5130 5097 }; 5131 5098 5132 - FIXTURE_VARIANT_ADD(URETPROBE, attached) { 5133 - .attach = true, 5134 - }; 5135 - 5136 - FIXTURE_VARIANT_ADD(URETPROBE, not_attached) { 5099 + FIXTURE_VARIANT_ADD(UPROBE, not_attached) { 5137 5100 .attach = false, 5101 + .uretprobe = false, 5138 5102 }; 5139 5103 5140 - FIXTURE_SETUP(URETPROBE) 5104 + FIXTURE_VARIANT_ADD(UPROBE, uprobe_attached) { 5105 + .attach = true, 5106 + .uretprobe = false, 5107 + }; 5108 + 5109 + FIXTURE_VARIANT_ADD(UPROBE, uretprobe_attached) { 5110 + .attach = true, 5111 + .uretprobe = true, 5112 + }; 5113 + 5114 + FIXTURE_SETUP(UPROBE) 5141 5115 { 5142 5116 const size_t attr_sz = sizeof(struct perf_event_attr); 5143 5117 struct perf_event_attr attr; 5144 5118 ssize_t offset; 5145 5119 int type, bit; 5146 5120 5147 - #ifndef __NR_uretprobe 5148 - SKIP(return, "__NR_uretprobe syscall not defined"); 5121 + #if !defined(__NR_uprobe) || !defined(__NR_uretprobe) 5122 + SKIP(return, "__NR_uprobe ot __NR_uretprobe syscalls not defined"); 5149 5123 #endif 5150 5124 5151 5125 if (!variant->attach) ··· 5166 5118 5167 5119 type = determine_uprobe_perf_type(); 5168 5120 ASSERT_GE(type, 0); 5169 - bit = determine_uprobe_retprobe_bit(); 5170 - ASSERT_GE(bit, 0); 5171 - offset = get_uprobe_offset(probed); 5121 + 5122 + if (variant->uretprobe) { 5123 + bit = determine_uprobe_retprobe_bit(); 5124 + ASSERT_GE(bit, 0); 5125 + } 5126 + 5127 + offset = get_uprobe_offset(variant->uretprobe ? 
probed_uretprobe : probed_uprobe); 5172 5128 ASSERT_GE(offset, 0); 5173 5129 5174 - attr.config |= 1 << bit; 5130 + if (variant->uretprobe) 5131 + attr.config |= 1 << bit; 5175 5132 attr.size = attr_sz; 5176 5133 attr.type = type; 5177 5134 attr.config1 = ptr_to_u64("/proc/self/exe"); ··· 5187 5134 PERF_FLAG_FD_CLOEXEC); 5188 5135 } 5189 5136 5190 - FIXTURE_TEARDOWN(URETPROBE) 5137 + FIXTURE_TEARDOWN(UPROBE) 5191 5138 { 5192 5139 /* we could call close(self->fd), but we'd need extra filter for 5193 5140 * that and since we are calling _exit right away.. ··· 5201 5148 return -1; 5202 5149 } 5203 5150 5204 - probed(); 5151 + /* 5152 + * Uprobe is optimized after first hit, so let's hit twice. 5153 + */ 5154 + probed_uprobe(); 5155 + probed_uprobe(); 5156 + 5157 + probed_uretprobe(); 5205 5158 return 0; 5206 5159 } 5207 5160 5208 - TEST_F(URETPROBE, uretprobe_default_allow) 5161 + TEST_F(UPROBE, uprobe_default_allow) 5209 5162 { 5210 5163 struct sock_filter filter[] = { 5211 5164 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), ··· 5224 5165 ASSERT_EQ(0, run_probed_with_filter(&prog)); 5225 5166 } 5226 5167 5227 - TEST_F(URETPROBE, uretprobe_default_block) 5168 + TEST_F(UPROBE, uprobe_default_block) 5228 5169 { 5229 5170 struct sock_filter filter[] = { 5230 5171 BPF_STMT(BPF_LD|BPF_W|BPF_ABS, ··· 5241 5182 ASSERT_EQ(0, run_probed_with_filter(&prog)); 5242 5183 } 5243 5184 5244 - TEST_F(URETPROBE, uretprobe_block_uretprobe_syscall) 5185 + TEST_F(UPROBE, uprobe_block_syscall) 5245 5186 { 5246 5187 struct sock_filter filter[] = { 5247 5188 BPF_STMT(BPF_LD|BPF_W|BPF_ABS, 5248 5189 offsetof(struct seccomp_data, nr)), 5190 + #ifdef __NR_uprobe 5191 + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uprobe, 1, 2), 5192 + #endif 5249 5193 #ifdef __NR_uretprobe 5250 5194 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uretprobe, 0, 1), 5251 5195 #endif ··· 5263 5201 ASSERT_EQ(0, run_probed_with_filter(&prog)); 5264 5202 } 5265 5203 5266 - TEST_F(URETPROBE, uretprobe_default_block_with_uretprobe_syscall) 
5204 + TEST_F(UPROBE, uprobe_default_block_with_syscall) 5267 5205 { 5268 5206 struct sock_filter filter[] = { 5269 5207 BPF_STMT(BPF_LD|BPF_W|BPF_ABS, 5270 5208 offsetof(struct seccomp_data, nr)), 5209 + #ifdef __NR_uprobe 5210 + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uprobe, 3, 0), 5211 + #endif 5271 5212 #ifdef __NR_uretprobe 5272 5213 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uretprobe, 2, 0), 5273 5214 #endif