at v6.15 2042 lines 60 kB view raw
1/* 2 * Performance events: 3 * 4 * Copyright (C) 2008-2009, Thomas Gleixner <tglx@linutronix.de> 5 * Copyright (C) 2008-2011, Red Hat, Inc., Ingo Molnar 6 * Copyright (C) 2008-2011, Red Hat, Inc., Peter Zijlstra 7 * 8 * Data type definitions, declarations, prototypes. 9 * 10 * Started by: Thomas Gleixner and Ingo Molnar 11 * 12 * For licencing details see kernel-base/COPYING 13 */ 14#ifndef _LINUX_PERF_EVENT_H 15#define _LINUX_PERF_EVENT_H 16 17#include <uapi/linux/perf_event.h> 18#include <uapi/linux/bpf_perf_event.h> 19 20/* 21 * Kernel-internal data types and definitions: 22 */ 23 24#ifdef CONFIG_PERF_EVENTS 25# include <asm/perf_event.h> 26# include <asm/local64.h> 27#endif 28 29#define PERF_GUEST_ACTIVE 0x01 30#define PERF_GUEST_USER 0x02 31 32struct perf_guest_info_callbacks { 33 unsigned int (*state)(void); 34 unsigned long (*get_ip)(void); 35 unsigned int (*handle_intel_pt_intr)(void); 36}; 37 38#ifdef CONFIG_HAVE_HW_BREAKPOINT 39#include <linux/rhashtable-types.h> 40#include <asm/hw_breakpoint.h> 41#endif 42 43#include <linux/list.h> 44#include <linux/mutex.h> 45#include <linux/rculist.h> 46#include <linux/rcupdate.h> 47#include <linux/spinlock.h> 48#include <linux/hrtimer.h> 49#include <linux/fs.h> 50#include <linux/pid_namespace.h> 51#include <linux/workqueue.h> 52#include <linux/ftrace.h> 53#include <linux/cpu.h> 54#include <linux/irq_work.h> 55#include <linux/static_key.h> 56#include <linux/jump_label_ratelimit.h> 57#include <linux/atomic.h> 58#include <linux/sysfs.h> 59#include <linux/perf_regs.h> 60#include <linux/cgroup.h> 61#include <linux/refcount.h> 62#include <linux/security.h> 63#include <linux/static_call.h> 64#include <linux/lockdep.h> 65#include <asm/local.h> 66 67struct perf_callchain_entry { 68 __u64 nr; 69 __u64 ip[]; /* /proc/sys/kernel/perf_event_max_stack */ 70}; 71 72struct perf_callchain_entry_ctx { 73 struct perf_callchain_entry *entry; 74 u32 max_stack; 75 u32 nr; 76 short contexts; 77 bool contexts_maxed; 78}; 79 80typedef unsigned long (*perf_copy_f)(void *dst, const void *src, 81 unsigned long off, unsigned long len); 82 83struct perf_raw_frag { 84 union { 85 struct perf_raw_frag *next; 86 unsigned long pad; 87 }; 88 perf_copy_f copy; 89 void *data; 90 u32 size; 91} __packed; 92 93struct perf_raw_record { 94 struct perf_raw_frag frag; 95 u32 size; 96}; 97 98static __always_inline bool perf_raw_frag_last(const struct perf_raw_frag *frag) 99{ 100 return frag->pad < sizeof(u64); 101} 102 103/* 104 * branch stack layout: 105 * nr: number of taken branches stored in entries[] 106 * hw_idx: The low level index of raw branch records 107 * for the most recent branch. 108 * -1ULL means invalid/unknown. 109 * 110 * Note that nr can vary from sample to sample 111 * branches (to, from) are stored from most recent 112 * to least recent, i.e., entries[0] contains the most 113 * recent branch. 114 * The entries[] is an abstraction of raw branch records, 115 * which may not be stored in age order in HW, e.g. Intel LBR. 116 * The hw_idx is to expose the low level index of raw 117 * branch record for the most recent branch aka entries[0]. 118 * The hw_idx index is between -1 (unknown) and max depth, 119 * which can be retrieved in /sys/devices/cpu/caps/branches. 120 * For the architectures whose raw branch records are 121 * already stored in age order, the hw_idx should be 0. 122 */ 123struct perf_branch_stack { 124 __u64 nr; 125 __u64 hw_idx; 126 struct perf_branch_entry entries[]; 127}; 128 129struct task_struct; 130 131/* 132 * extra PMU register associated with an event 133 */ 134struct hw_perf_event_extra { 135 u64 config; /* register value */ 136 unsigned int reg; /* register address or index */ 137 int alloc; /* extra register already allocated */ 138 int idx; /* index in shared_regs->regs[] */ 139}; 140 141/** 142 * hw_perf_event::flag values 143 * 144 * PERF_EVENT_FLAG_ARCH bits are reserved for architecture-specific 145 * usage. 146 */ 147#define PERF_EVENT_FLAG_ARCH 0x000fffff 148#define PERF_EVENT_FLAG_USER_READ_CNT 0x80000000 149 150static_assert((PERF_EVENT_FLAG_USER_READ_CNT & PERF_EVENT_FLAG_ARCH) == 0); 151 152/** 153 * struct hw_perf_event - performance event hardware details: 154 */ 155struct hw_perf_event { 156#ifdef CONFIG_PERF_EVENTS 157 union { 158 struct { /* hardware */ 159 u64 config; 160 u64 last_tag; 161 unsigned long config_base; 162 unsigned long event_base; 163 int event_base_rdpmc; 164 int idx; 165 int last_cpu; 166 int flags; 167 168 struct hw_perf_event_extra extra_reg; 169 struct hw_perf_event_extra branch_reg; 170 }; 171 struct { /* aux / Intel-PT */ 172 u64 aux_config; 173 /* 174 * For AUX area events, aux_paused cannot be a state 175 * flag because it can be updated asynchronously to 176 * state. 177 */ 178 unsigned int aux_paused; 179 }; 180 struct { /* software */ 181 struct hrtimer hrtimer; 182 }; 183 struct { /* tracepoint */ 184 /* for tp_event->class */ 185 struct list_head tp_list; 186 }; 187 struct { /* amd_power */ 188 u64 pwr_acc; 189 u64 ptsc; 190 }; 191#ifdef CONFIG_HAVE_HW_BREAKPOINT 192 struct { /* breakpoint */ 193 /* 194 * Crufty hack to avoid the chicken and egg 195 * problem hw_breakpoint has with context 196 * creation and event initalization. 197 */ 198 struct arch_hw_breakpoint info; 199 struct rhlist_head bp_list; 200 }; 201#endif 202 struct { /* amd_iommu */ 203 u8 iommu_bank; 204 u8 iommu_cntr; 205 u16 padding; 206 u64 conf; 207 u64 conf1; 208 }; 209 }; 210 /* 211 * If the event is a per task event, this will point to the task in 212 * question. See the comment in perf_event_alloc(). 213 */ 214 struct task_struct *target; 215 216 /* 217 * PMU would store hardware filter configuration 218 * here. 219 */ 220 void *addr_filters; 221 222 /* Last sync'ed generation of filters */ 223 unsigned long addr_filters_gen; 224 225/* 226 * hw_perf_event::state flags; used to track the PERF_EF_* state. 227 */ 228#define PERF_HES_STOPPED 0x01 /* the counter is stopped */ 229#define PERF_HES_UPTODATE 0x02 /* event->count up-to-date */ 230#define PERF_HES_ARCH 0x04 231 232 int state; 233 234 /* 235 * The last observed hardware counter value, updated with a 236 * local64_cmpxchg() such that pmu::read() can be called nested. 237 */ 238 local64_t prev_count; 239 240 /* 241 * The period to start the next sample with. 242 */ 243 u64 sample_period; 244 245 union { 246 struct { /* Sampling */ 247 /* 248 * The period we started this sample with. 249 */ 250 u64 last_period; 251 252 /* 253 * However much is left of the current period; 254 * note that this is a full 64bit value and 255 * allows for generation of periods longer 256 * than hardware might allow. 257 */ 258 local64_t period_left; 259 }; 260 struct { /* Topdown events counting for context switch */ 261 u64 saved_metric; 262 u64 saved_slots; 263 }; 264 }; 265 266 /* 267 * State for throttling the event, see __perf_event_overflow() and 268 * perf_adjust_freq_unthr_context(). 269 */ 270 u64 interrupts_seq; 271 u64 interrupts; 272 273 /* 274 * State for freq target events, see __perf_event_overflow() and 275 * perf_adjust_freq_unthr_context(). 276 */ 277 u64 freq_time_stamp; 278 u64 freq_count_stamp; 279#endif 280}; 281 282struct perf_event; 283struct perf_event_pmu_context; 284 285/* 286 * Common implementation detail of pmu::{start,commit,cancel}_txn 287 */ 288#define PERF_PMU_TXN_ADD 0x1 /* txn to add/schedule event on PMU */ 289#define PERF_PMU_TXN_READ 0x2 /* txn to read event group from PMU */ 290 291/** 292 * pmu::capabilities flags 293 */ 294#define PERF_PMU_CAP_NO_INTERRUPT 0x0001 295#define PERF_PMU_CAP_NO_NMI 0x0002 296#define PERF_PMU_CAP_AUX_NO_SG 0x0004 297#define PERF_PMU_CAP_EXTENDED_REGS 0x0008 298#define PERF_PMU_CAP_EXCLUSIVE 0x0010 299#define PERF_PMU_CAP_ITRACE 0x0020 300#define PERF_PMU_CAP_NO_EXCLUDE 0x0040 301#define PERF_PMU_CAP_AUX_OUTPUT 0x0080 302#define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100 303#define PERF_PMU_CAP_AUX_PAUSE 0x0200 304 305/** 306 * pmu::scope 307 */ 308enum perf_pmu_scope { 309 PERF_PMU_SCOPE_NONE = 0, 310 PERF_PMU_SCOPE_CORE, 311 PERF_PMU_SCOPE_DIE, 312 PERF_PMU_SCOPE_CLUSTER, 313 PERF_PMU_SCOPE_PKG, 314 PERF_PMU_SCOPE_SYS_WIDE, 315 PERF_PMU_MAX_SCOPE, 316}; 317 318struct perf_output_handle; 319 320#define PMU_NULL_DEV ((void *)(~0UL)) 321 322/** 323 * struct pmu - generic performance monitoring unit 324 */ 325struct pmu { 326 struct list_head entry; 327 328 struct module *module; 329 struct device *dev; 330 struct device *parent; 331 const struct attribute_group **attr_groups; 332 const struct attribute_group **attr_update; 333 const char *name; 334 int type; 335 336 /* 337 * various common per-pmu feature flags 338 */ 339 int capabilities; 340 341 /* 342 * PMU scope 343 */ 344 unsigned int scope; 345 346 struct perf_cpu_pmu_context * __percpu *cpu_pmu_context; 347 atomic_t exclusive_cnt; /* < 0: cpu; > 0: tsk */ 348 int task_ctx_nr; 349 int hrtimer_interval_ms; 350 351 /* number of address filters this PMU can do */ 352 unsigned int nr_addr_filters; 353 354 /* 355 * Fully disable/enable this PMU, can be used to protect from the PMI 356 * as well as for lazy/batch writing of the MSRs. 357 */ 358 void (*pmu_enable) (struct pmu *pmu); /* optional */ 359 void (*pmu_disable) (struct pmu *pmu); /* optional */ 360 361 /* 362 * Try and initialize the event for this PMU. 363 * 364 * Returns: 365 * -ENOENT -- @event is not for this PMU 366 * 367 * -ENODEV -- @event is for this PMU but PMU not present 368 * -EBUSY -- @event is for this PMU but PMU temporarily unavailable 369 * -EINVAL -- @event is for this PMU but @event is not valid 370 * -EOPNOTSUPP -- @event is for this PMU, @event is valid, but not supported 371 * -EACCES -- @event is for this PMU, @event is valid, but no privileges 372 * 373 * 0 -- @event is for this PMU and valid 374 * 375 * Other error return values are allowed. 376 */ 377 int (*event_init) (struct perf_event *event); 378 379 /* 380 * Notification that the event was mapped or unmapped. Called 381 * in the context of the mapping task. 382 */ 383 void (*event_mapped) (struct perf_event *event, struct mm_struct *mm); /* optional */ 384 void (*event_unmapped) (struct perf_event *event, struct mm_struct *mm); /* optional */ 385 386 /* 387 * Flags for ->add()/->del()/ ->start()/->stop(). There are 388 * matching hw_perf_event::state flags. 389 */ 390#define PERF_EF_START 0x01 /* start the counter when adding */ 391#define PERF_EF_RELOAD 0x02 /* reload the counter when starting */ 392#define PERF_EF_UPDATE 0x04 /* update the counter when stopping */ 393#define PERF_EF_PAUSE 0x08 /* AUX area event, pause tracing */ 394#define PERF_EF_RESUME 0x10 /* AUX area event, resume tracing */ 395 396 /* 397 * Adds/Removes a counter to/from the PMU, can be done inside a 398 * transaction, see the ->*_txn() methods. 399 * 400 * The add/del callbacks will reserve all hardware resources required 401 * to service the event, this includes any counter constraint 402 * scheduling etc. 403 * 404 * Called with IRQs disabled and the PMU disabled on the CPU the event 405 * is on. 406 * 407 * ->add() called without PERF_EF_START should result in the same state 408 * as ->add() followed by ->stop(). 409 * 410 * ->del() must always PERF_EF_UPDATE stop an event. If it calls 411 * ->stop() that must deal with already being stopped without 412 * PERF_EF_UPDATE. 413 */ 414 int (*add) (struct perf_event *event, int flags); 415 void (*del) (struct perf_event *event, int flags); 416 417 /* 418 * Starts/Stops a counter present on the PMU. 419 * 420 * The PMI handler should stop the counter when perf_event_overflow() 421 * returns !0. ->start() will be used to continue. 422 * 423 * Also used to change the sample period. 424 * 425 * Called with IRQs disabled and the PMU disabled on the CPU the event 426 * is on -- will be called from NMI context with the PMU generates 427 * NMIs. 428 * 429 * ->stop() with PERF_EF_UPDATE will read the counter and update 430 * period/count values like ->read() would. 431 * 432 * ->start() with PERF_EF_RELOAD will reprogram the counter 433 * value, must be preceded by a ->stop() with PERF_EF_UPDATE. 434 * 435 * ->stop() with PERF_EF_PAUSE will stop as simply as possible. Will not 436 * overlap another ->stop() with PERF_EF_PAUSE nor ->start() with 437 * PERF_EF_RESUME. 438 * 439 * ->start() with PERF_EF_RESUME will start as simply as possible but 440 * only if the counter is not otherwise stopped. Will not overlap 441 * another ->start() with PERF_EF_RESUME nor ->stop() with 442 * PERF_EF_PAUSE. 443 * 444 * Notably, PERF_EF_PAUSE/PERF_EF_RESUME *can* be concurrent with other 445 * ->stop()/->start() invocations, just not itself. 446 */ 447 void (*start) (struct perf_event *event, int flags); 448 void (*stop) (struct perf_event *event, int flags); 449 450 /* 451 * Updates the counter value of the event. 452 * 453 * For sampling capable PMUs this will also update the software period 454 * hw_perf_event::period_left field. 455 */ 456 void (*read) (struct perf_event *event); 457 458 /* 459 * Group events scheduling is treated as a transaction, add 460 * group events as a whole and perform one schedulability test. 461 * If the test fails, roll back the whole group 462 * 463 * Start the transaction, after this ->add() doesn't need to 464 * do schedulability tests. 465 * 466 * Optional. 467 */ 468 void (*start_txn) (struct pmu *pmu, unsigned int txn_flags); 469 /* 470 * If ->start_txn() disabled the ->add() schedulability test 471 * then ->commit_txn() is required to perform one. On success 472 * the transaction is closed. On error the transaction is kept 473 * open until ->cancel_txn() is called. 474 * 475 * Optional. 476 */ 477 int (*commit_txn) (struct pmu *pmu); 478 /* 479 * Will cancel the transaction, assumes ->del() is called 480 * for each successful ->add() during the transaction. 481 * 482 * Optional. 483 */ 484 void (*cancel_txn) (struct pmu *pmu); 485 486 /* 487 * Will return the value for perf_event_mmap_page::index for this event, 488 * if no implementation is provided it will default to 0 (see 489 * perf_event_idx_default). 490 */ 491 int (*event_idx) (struct perf_event *event); /*optional */ 492 493 /* 494 * context-switches callback 495 */ 496 void (*sched_task) (struct perf_event_pmu_context *pmu_ctx, 497 struct task_struct *task, bool sched_in); 498 499 /* 500 * Kmem cache of PMU specific data 501 */ 502 struct kmem_cache *task_ctx_cache; 503 504 /* 505 * Set up pmu-private data structures for an AUX area 506 */ 507 void *(*setup_aux) (struct perf_event *event, void **pages, 508 int nr_pages, bool overwrite); 509 /* optional */ 510 511 /* 512 * Free pmu-private AUX data structures 513 */ 514 void (*free_aux) (void *aux); /* optional */ 515 516 /* 517 * Take a snapshot of the AUX buffer without touching the event 518 * state, so that preempting ->start()/->stop() callbacks does 519 * not interfere with their logic. Called in PMI context. 520 * 521 * Returns the size of AUX data copied to the output handle. 522 * 523 * Optional. 524 */ 525 long (*snapshot_aux) (struct perf_event *event, 526 struct perf_output_handle *handle, 527 unsigned long size); 528 529 /* 530 * Validate address range filters: make sure the HW supports the 531 * requested configuration and number of filters; return 0 if the 532 * supplied filters are valid, -errno otherwise. 533 * 534 * Runs in the context of the ioctl()ing process and is not serialized 535 * with the rest of the PMU callbacks. 536 */ 537 int (*addr_filters_validate) (struct list_head *filters); 538 /* optional */ 539 540 /* 541 * Synchronize address range filter configuration: 542 * translate hw-agnostic filters into hardware configuration in 543 * event::hw::addr_filters. 544 * 545 * Runs as a part of filter sync sequence that is done in ->start() 546 * callback by calling perf_event_addr_filters_sync(). 547 * 548 * May (and should) traverse event::addr_filters::list, for which its 549 * caller provides necessary serialization. 550 */ 551 void (*addr_filters_sync) (struct perf_event *event); 552 /* optional */ 553 554 /* 555 * Check if event can be used for aux_output purposes for 556 * events of this PMU. 557 * 558 * Runs from perf_event_open(). Should return 0 for "no match" 559 * or non-zero for "match". 560 */ 561 int (*aux_output_match) (struct perf_event *event); 562 /* optional */ 563 564 /* 565 * Skip programming this PMU on the given CPU. Typically needed for 566 * big.LITTLE things. 567 */ 568 bool (*filter) (struct pmu *pmu, int cpu); /* optional */ 569 570 /* 571 * Check period value for PERF_EVENT_IOC_PERIOD ioctl. 572 */ 573 int (*check_period) (struct perf_event *event, u64 value); /* optional */ 574}; 575 576enum perf_addr_filter_action_t { 577 PERF_ADDR_FILTER_ACTION_STOP = 0, 578 PERF_ADDR_FILTER_ACTION_START, 579 PERF_ADDR_FILTER_ACTION_FILTER, 580}; 581 582/** 583 * struct perf_addr_filter - address range filter definition 584 * @entry: event's filter list linkage 585 * @path: object file's path for file-based filters 586 * @offset: filter range offset 587 * @size: filter range size (size==0 means single address trigger) 588 * @action: filter/start/stop 589 * 590 * This is a hardware-agnostic filter configuration as specified by the user. 591 */ 592struct perf_addr_filter { 593 struct list_head entry; 594 struct path path; 595 unsigned long offset; 596 unsigned long size; 597 enum perf_addr_filter_action_t action; 598}; 599 600/** 601 * struct perf_addr_filters_head - container for address range filters 602 * @list: list of filters for this event 603 * @lock: spinlock that serializes accesses to the @list and event's 604 * (and its children's) filter generations. 605 * @nr_file_filters: number of file-based filters 606 * 607 * A child event will use parent's @list (and therefore @lock), so they are 608 * bundled together; see perf_event_addr_filters(). 609 */ 610struct perf_addr_filters_head { 611 struct list_head list; 612 raw_spinlock_t lock; 613 unsigned int nr_file_filters; 614}; 615 616struct perf_addr_filter_range { 617 unsigned long start; 618 unsigned long size; 619}; 620 621/** 622 * enum perf_event_state - the states of an event: 623 */ 624enum perf_event_state { 625 PERF_EVENT_STATE_DEAD = -4, 626 PERF_EVENT_STATE_EXIT = -3, 627 PERF_EVENT_STATE_ERROR = -2, 628 PERF_EVENT_STATE_OFF = -1, 629 PERF_EVENT_STATE_INACTIVE = 0, 630 PERF_EVENT_STATE_ACTIVE = 1, 631}; 632 633struct file; 634struct perf_sample_data; 635 636typedef void (*perf_overflow_handler_t)(struct perf_event *, 637 struct perf_sample_data *, 638 struct pt_regs *regs); 639 640/* 641 * Event capabilities. For event_caps and groups caps. 642 * 643 * PERF_EV_CAP_SOFTWARE: Is a software event. 644 * PERF_EV_CAP_READ_ACTIVE_PKG: A CPU event (or cgroup event) that can be read 645 * from any CPU in the package where it is active. 646 * PERF_EV_CAP_SIBLING: An event with this flag must be a group sibling and 647 * cannot be a group leader. If an event with this flag is detached from the 648 * group it is scheduled out and moved into an unrecoverable ERROR state. 649 * PERF_EV_CAP_READ_SCOPE: A CPU event that can be read from any CPU of the 650 * PMU scope where it is active. 651 */ 652#define PERF_EV_CAP_SOFTWARE BIT(0) 653#define PERF_EV_CAP_READ_ACTIVE_PKG BIT(1) 654#define PERF_EV_CAP_SIBLING BIT(2) 655#define PERF_EV_CAP_READ_SCOPE BIT(3) 656 657#define SWEVENT_HLIST_BITS 8 658#define SWEVENT_HLIST_SIZE (1 << SWEVENT_HLIST_BITS) 659 660struct swevent_hlist { 661 struct hlist_head heads[SWEVENT_HLIST_SIZE]; 662 struct rcu_head rcu_head; 663}; 664 665#define PERF_ATTACH_CONTEXT 0x0001 666#define PERF_ATTACH_GROUP 0x0002 667#define PERF_ATTACH_TASK 0x0004 668#define PERF_ATTACH_TASK_DATA 0x0008 669#define PERF_ATTACH_GLOBAL_DATA 0x0010 670#define PERF_ATTACH_SCHED_CB 0x0020 671#define PERF_ATTACH_CHILD 0x0040 672#define PERF_ATTACH_EXCLUSIVE 0x0080 673#define PERF_ATTACH_CALLCHAIN 0x0100 674#define PERF_ATTACH_ITRACE 0x0200 675 676struct bpf_prog; 677struct perf_cgroup; 678struct perf_buffer; 679 680struct pmu_event_list { 681 raw_spinlock_t lock; 682 struct list_head list; 683}; 684 685/* 686 * event->sibling_list is modified whole holding both ctx->lock and ctx->mutex 687 * as such iteration must hold either lock. However, since ctx->lock is an IRQ 688 * safe lock, and is only held by the CPU doing the modification, having IRQs 689 * disabled is sufficient since it will hold-off the IPIs. 690 */ 691#ifdef CONFIG_PROVE_LOCKING 692#define lockdep_assert_event_ctx(event) \ 693 WARN_ON_ONCE(__lockdep_enabled && \ 694 (this_cpu_read(hardirqs_enabled) && \ 695 lockdep_is_held(&(event)->ctx->mutex) != LOCK_STATE_HELD)) 696#else 697#define lockdep_assert_event_ctx(event) 698#endif 699 700#define for_each_sibling_event(sibling, event) \ 701 lockdep_assert_event_ctx(event); \ 702 if ((event)->group_leader == (event)) \ 703 list_for_each_entry((sibling), &(event)->sibling_list, sibling_list) 704 705/** 706 * struct perf_event - performance event kernel representation: 707 */ 708struct perf_event { 709#ifdef CONFIG_PERF_EVENTS 710 /* 711 * entry onto perf_event_context::event_list; 712 * modifications require ctx->lock 713 * RCU safe iterations. 714 */ 715 struct list_head event_entry; 716 717 /* 718 * Locked for modification by both ctx->mutex and ctx->lock; holding 719 * either sufficies for read. 720 */ 721 struct list_head sibling_list; 722 struct list_head active_list; 723 /* 724 * Node on the pinned or flexible tree located at the event context; 725 */ 726 struct rb_node group_node; 727 u64 group_index; 728 /* 729 * We need storage to track the entries in perf_pmu_migrate_context; we 730 * cannot use the event_entry because of RCU and we want to keep the 731 * group in tact which avoids us using the other two entries. 732 */ 733 struct list_head migrate_entry; 734 735 struct hlist_node hlist_entry; 736 struct list_head active_entry; 737 int nr_siblings; 738 739 /* Not serialized. Only written during event initialization. */ 740 int event_caps; 741 /* The cumulative AND of all event_caps for events in this group. */ 742 int group_caps; 743 744 unsigned int group_generation; 745 struct perf_event *group_leader; 746 /* 747 * event->pmu will always point to pmu in which this event belongs. 748 * Whereas event->pmu_ctx->pmu may point to other pmu when group of 749 * different pmu events is created. 750 */ 751 struct pmu *pmu; 752 void *pmu_private; 753 754 enum perf_event_state state; 755 unsigned int attach_state; 756 local64_t count; 757 atomic64_t child_count; 758 759 /* 760 * These are the total time in nanoseconds that the event 761 * has been enabled (i.e. eligible to run, and the task has 762 * been scheduled in, if this is a per-task event) 763 * and running (scheduled onto the CPU), respectively. 764 */ 765 u64 total_time_enabled; 766 u64 total_time_running; 767 u64 tstamp; 768 769 struct perf_event_attr attr; 770 u16 header_size; 771 u16 id_header_size; 772 u16 read_size; 773 struct hw_perf_event hw; 774 775 struct perf_event_context *ctx; 776 /* 777 * event->pmu_ctx points to perf_event_pmu_context in which the event 778 * is added. This pmu_ctx can be of other pmu for sw event when that 779 * sw event is part of a group which also contains non-sw events. 780 */ 781 struct perf_event_pmu_context *pmu_ctx; 782 atomic_long_t refcount; 783 784 /* 785 * These accumulate total time (in nanoseconds) that children 786 * events have been enabled and running, respectively. 787 */ 788 atomic64_t child_total_time_enabled; 789 atomic64_t child_total_time_running; 790 791 /* 792 * Protect attach/detach and child_list: 793 */ 794 struct mutex child_mutex; 795 struct list_head child_list; 796 struct perf_event *parent; 797 798 int oncpu; 799 int cpu; 800 801 struct list_head owner_entry; 802 struct task_struct *owner; 803 804 /* mmap bits */ 805 struct mutex mmap_mutex; 806 atomic_t mmap_count; 807 808 struct perf_buffer *rb; 809 struct list_head rb_entry; 810 unsigned long rcu_batches; 811 int rcu_pending; 812 813 /* poll related */ 814 wait_queue_head_t waitq; 815 struct fasync_struct *fasync; 816 817 /* delayed work for NMIs and such */ 818 unsigned int pending_wakeup; 819 unsigned int pending_kill; 820 unsigned int pending_disable; 821 unsigned long pending_addr; /* SIGTRAP */ 822 struct irq_work pending_irq; 823 struct irq_work pending_disable_irq; 824 struct callback_head pending_task; 825 unsigned int pending_work; 826 827 atomic_t event_limit; 828 829 /* address range filters */ 830 struct perf_addr_filters_head addr_filters; 831 /* vma address array for file-based filders */ 832 struct perf_addr_filter_range *addr_filter_ranges; 833 unsigned long addr_filters_gen; 834 835 /* for aux_output events */ 836 struct perf_event *aux_event; 837 838 void (*destroy)(struct perf_event *); 839 struct rcu_head rcu_head; 840 841 struct pid_namespace *ns; 842 u64 id; 843 844 atomic64_t lost_samples; 845 846 u64 (*clock)(void); 847 perf_overflow_handler_t overflow_handler; 848 void *overflow_handler_context; 849 struct bpf_prog *prog; 850 u64 bpf_cookie; 851 852#ifdef CONFIG_EVENT_TRACING 853 struct trace_event_call *tp_event; 854 struct event_filter *filter; 855#ifdef CONFIG_FUNCTION_TRACER 856 struct ftrace_ops ftrace_ops; 857#endif 858#endif 859 860#ifdef CONFIG_CGROUP_PERF 861 struct perf_cgroup *cgrp; /* cgroup event is attach to */ 862#endif 863 864#ifdef CONFIG_SECURITY 865 void *security; 866#endif 867 struct list_head sb_list; 868 869 /* 870 * Certain events gets forwarded to another pmu internally by over- 871 * writing kernel copy of event->attr.type without user being aware 872 * of it. event->orig_type contains original 'type' requested by 873 * user. 874 */ 875 __u32 orig_type; 876#endif /* CONFIG_PERF_EVENTS */ 877}; 878 879/* 880 * ,-----------------------[1:n]------------------------. 881 * V V 882 * perf_event_context <-[1:n]-> perf_event_pmu_context <-[1:n]- perf_event 883 * | | 884 * `--[n:1]-> pmu <-[1:n]--' 885 * 886 * 887 * struct perf_event_pmu_context lifetime is refcount based and RCU freed 888 * (similar to perf_event_context). Locking is as if it were a member of 889 * perf_event_context; specifically: 890 * 891 * modification, both: ctx->mutex && ctx->lock 892 * reading, either: ctx->mutex || ctx->lock 893 * 894 * There is one exception to this; namely put_pmu_ctx() isn't always called 895 * with ctx->mutex held; this means that as long as we can guarantee the epc 896 * has events the above rules hold. 897 * 898 * Specificially, sys_perf_event_open()'s group_leader case depends on 899 * ctx->mutex pinning the configuration. Since we hold a reference on 900 * group_leader (through the filedesc) it can't go away, therefore it's 901 * associated pmu_ctx must exist and cannot change due to ctx->mutex. 902 * 903 * perf_event holds a refcount on perf_event_context 904 * perf_event holds a refcount on perf_event_pmu_context 905 */ 906struct perf_event_pmu_context { 907 struct pmu *pmu; 908 struct perf_event_context *ctx; 909 910 struct list_head pmu_ctx_entry; 911 912 struct list_head pinned_active; 913 struct list_head flexible_active; 914 915 /* Used to identify the per-cpu perf_event_pmu_context */ 916 unsigned int embedded : 1; 917 918 unsigned int nr_events; 919 unsigned int nr_cgroups; 920 unsigned int nr_freq; 921 922 atomic_t refcount; /* event <-> epc */ 923 struct rcu_head rcu_head; 924 925 /* 926 * Set when one or more (plausibly active) event can't be scheduled 927 * due to pmu overcommit or pmu constraints, except tolerant to 928 * events not necessary to be active due to scheduling constraints, 929 * such as cgroups. 930 */ 931 int rotate_necessary; 932}; 933 934static inline bool perf_pmu_ctx_is_active(struct perf_event_pmu_context *epc) 935{ 936 return !list_empty(&epc->flexible_active) || !list_empty(&epc->pinned_active); 937} 938 939struct perf_event_groups { 940 struct rb_root tree; 941 u64 index; 942}; 943 944 945/** 946 * struct perf_event_context - event context structure 947 * 948 * Used as a container for task events and CPU events as well: 949 */ 950struct perf_event_context { 951 /* 952 * Protect the states of the events in the list, 953 * nr_active, and the list: 954 */ 955 raw_spinlock_t lock; 956 /* 957 * Protect the list of events. Locking either mutex or lock 958 * is sufficient to ensure the list doesn't change; to change 959 * the list you need to lock both the mutex and the spinlock. 960 */ 961 struct mutex mutex; 962 963 struct list_head pmu_ctx_list; 964 struct perf_event_groups pinned_groups; 965 struct perf_event_groups flexible_groups; 966 struct list_head event_list; 967 968 int nr_events; 969 int nr_user; 970 int is_active; 971 972 int nr_stat; 973 int nr_freq; 974 int rotate_disable; 975 976 refcount_t refcount; /* event <-> ctx */ 977 struct task_struct *task; 978 979 /* 980 * Context clock, runs when context enabled. 981 */ 982 u64 time; 983 u64 timestamp; 984 u64 timeoffset; 985 986 /* 987 * These fields let us detect when two contexts have both 988 * been cloned (inherited) from a common ancestor. 989 */ 990 struct perf_event_context *parent_ctx; 991 u64 parent_gen; 992 u64 generation; 993 int pin_count; 994#ifdef CONFIG_CGROUP_PERF 995 int nr_cgroups; /* cgroup evts */ 996#endif 997 struct rcu_head rcu_head; 998 999 /* 1000 * The count of events for which using the switch-out fast path 1001 * should be avoided. 1002 * 1003 * Sum (event->pending_work + events with 1004 * (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))) 1005 * 1006 * The SIGTRAP is targeted at ctx->task, as such it won't do changing 1007 * that until the signal is delivered. 1008 */ 1009 local_t nr_no_switch_fast; 1010}; 1011 1012/** 1013 * struct perf_ctx_data - PMU specific data for a task 1014 * @rcu_head: To avoid the race on free PMU specific data 1015 * @refcount: To track users 1016 * @global: To track system-wide users 1017 * @ctx_cache: Kmem cache of PMU specific data 1018 * @data: PMU specific data 1019 * 1020 * Currently, the struct is only used in Intel LBR call stack mode to 1021 * save/restore the call stack of a task on context switches. 1022 * 1023 * The rcu_head is used to prevent the race on free the data. 1024 * The data only be allocated when Intel LBR call stack mode is enabled. 1025 * The data will be freed when the mode is disabled. 1026 * The content of the data will only be accessed in context switch, which 1027 * should be protected by rcu_read_lock(). 1028 * 1029 * Because of the alignment requirement of Intel Arch LBR, the Kmem cache 1030 * is used to allocate the PMU specific data. The ctx_cache is to track 1031 * the Kmem cache. 1032 * 1033 * Careful: Struct perf_ctx_data is added as a pointer in struct task_struct. 1034 * When system-wide Intel LBR call stack mode is enabled, a buffer with 1035 * constant size will be allocated for each task. 1036 * Also, system memory consumption can further grow when the size of 1037 * struct perf_ctx_data enlarges. 1038 */ 1039struct perf_ctx_data { 1040 struct rcu_head rcu_head; 1041 refcount_t refcount; 1042 int global; 1043 struct kmem_cache *ctx_cache; 1044 void *data; 1045}; 1046 1047struct perf_cpu_pmu_context { 1048 struct perf_event_pmu_context epc; 1049 struct perf_event_pmu_context *task_epc; 1050 1051 struct list_head sched_cb_entry; 1052 int sched_cb_usage; 1053 1054 int active_oncpu; 1055 int exclusive; 1056 int pmu_disable_count; 1057 1058 raw_spinlock_t hrtimer_lock; 1059 struct hrtimer hrtimer; 1060 ktime_t hrtimer_interval; 1061 unsigned int hrtimer_active; 1062}; 1063 1064/** 1065 * struct perf_event_cpu_context - per cpu event context structure 1066 */ 1067struct perf_cpu_context { 1068 struct perf_event_context ctx; 1069 struct perf_event_context *task_ctx; 1070 int online; 1071 1072#ifdef CONFIG_CGROUP_PERF 1073 struct perf_cgroup *cgrp; 1074#endif 1075 1076 /* 1077 * Per-CPU storage for iterators used in visit_groups_merge. The default 1078 * storage is of size 2 to hold the CPU and any CPU event iterators. 1079 */ 1080 int heap_size; 1081 struct perf_event **heap; 1082 struct perf_event *heap_default[2]; 1083}; 1084 1085struct perf_output_handle { 1086 struct perf_event *event; 1087 struct perf_buffer *rb; 1088 unsigned long wakeup; 1089 unsigned long size; 1090 union { 1091 u64 flags; /* perf_output*() */ 1092 u64 aux_flags; /* perf_aux_output*() */ 1093 struct { 1094 u64 skip_read : 1; 1095 }; 1096 }; 1097 union { 1098 void *addr; 1099 unsigned long head; 1100 }; 1101 int page; 1102}; 1103 1104struct bpf_perf_event_data_kern { 1105 bpf_user_pt_regs_t *regs; 1106 struct perf_sample_data *data; 1107 struct perf_event *event; 1108}; 1109 1110#ifdef CONFIG_CGROUP_PERF 1111 1112/* 1113 * perf_cgroup_info keeps track of time_enabled for a cgroup. 1114 * This is a per-cpu dynamically allocated data structure. 1115 */ 1116struct perf_cgroup_info { 1117 u64 time; 1118 u64 timestamp; 1119 u64 timeoffset; 1120 int active; 1121}; 1122 1123struct perf_cgroup { 1124 struct cgroup_subsys_state css; 1125 struct perf_cgroup_info __percpu *info; 1126}; 1127 1128/* 1129 * Must ensure cgroup is pinned (css_get) before calling 1130 * this function. In other words, we cannot call this function 1131 * if there is no cgroup event for the current CPU context. 1132 */ 1133static inline struct perf_cgroup * 1134perf_cgroup_from_task(struct task_struct *task, struct perf_event_context *ctx) 1135{ 1136 return container_of(task_css_check(task, perf_event_cgrp_id, 1137 ctx ? lockdep_is_held(&ctx->lock) 1138 : true), 1139 struct perf_cgroup, css); 1140} 1141#endif /* CONFIG_CGROUP_PERF */ 1142 1143#ifdef CONFIG_PERF_EVENTS 1144 1145extern struct perf_event_context *perf_cpu_task_ctx(void); 1146 1147extern void *perf_aux_output_begin(struct perf_output_handle *handle, 1148 struct perf_event *event); 1149extern void perf_aux_output_end(struct perf_output_handle *handle, 1150 unsigned long size); 1151extern int perf_aux_output_skip(struct perf_output_handle *handle, 1152 unsigned long size); 1153extern void *perf_get_aux(struct perf_output_handle *handle); 1154extern void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags); 1155extern void perf_event_itrace_started(struct perf_event *event); 1156 1157extern int perf_pmu_register(struct pmu *pmu, const char *name, int type); 1158extern void perf_pmu_unregister(struct pmu *pmu); 1159 1160extern void __perf_event_task_sched_in(struct task_struct *prev, 1161 struct task_struct *task); 1162extern void __perf_event_task_sched_out(struct task_struct *prev, 1163 struct task_struct *next); 1164extern int perf_event_init_task(struct task_struct *child, u64 clone_flags); 1165extern void perf_event_exit_task(struct task_struct *child); 1166extern void perf_event_free_task(struct task_struct *task); 1167extern void perf_event_delayed_put(struct task_struct *task); 1168extern struct file *perf_event_get(unsigned int fd); 1169extern const struct perf_event *perf_get_event(struct file *file); 1170extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event); 1171extern void perf_event_print_debug(void); 1172extern void perf_pmu_disable(struct pmu *pmu); 1173extern void perf_pmu_enable(struct pmu *pmu); 1174extern void perf_sched_cb_dec(struct pmu *pmu); 1175extern void perf_sched_cb_inc(struct pmu *pmu); 1176extern int perf_event_task_disable(void); 1177extern int perf_event_task_enable(void); 1178 1179extern void perf_pmu_resched(struct pmu *pmu); 1180 1181extern int perf_event_refresh(struct perf_event *event, int refresh); 1182extern void perf_event_update_userpage(struct perf_event *event); 1183extern int perf_event_release_kernel(struct perf_event *event); 1184extern struct perf_event * 1185perf_event_create_kernel_counter(struct perf_event_attr *attr, 1186 int cpu, 1187 struct task_struct *task, 1188 perf_overflow_handler_t callback, 1189 void *context); 1190extern void perf_pmu_migrate_context(struct pmu *pmu, 1191 int src_cpu, int dst_cpu); 1192int perf_event_read_local(struct perf_event *event, u64 *value, 1193 u64 *enabled, u64 *running); 1194extern u64 perf_event_read_value(struct perf_event *event, 1195 u64 *enabled, u64 *running); 1196 1197extern struct perf_callchain_entry *perf_callchain(struct perf_event *event, struct pt_regs *regs); 1198 1199static inline bool branch_sample_no_flags(const struct perf_event *event) 1200{ 1201 return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_NO_FLAGS; 1202} 1203 1204static inline bool branch_sample_no_cycles(const struct perf_event *event) 1205{ 1206 return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_NO_CYCLES; 1207} 1208 1209static inline bool branch_sample_type(const struct perf_event *event) 1210{ 1211 return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_TYPE_SAVE; 1212} 1213 1214static inline bool branch_sample_hw_index(const struct perf_event *event) 1215{ 1216 return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX; 1217} 1218 1219static inline bool branch_sample_priv(const struct perf_event *event) 1220{ 1221 return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_PRIV_SAVE; 1222} 1223 1224static inline bool branch_sample_counters(const struct perf_event *event) 1225{ 1226 return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_COUNTERS; 1227} 1228 1229static inline bool branch_sample_call_stack(const struct perf_event *event) 1230{ 1231 return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK; 1232} 1233 1234struct perf_sample_data { 1235 /* 1236 * Fields set by perf_sample_data_init() unconditionally, 1237 * group so as to minimize the cachelines touched. 1238 */ 1239 u64 sample_flags; 1240 u64 period; 1241 u64 dyn_size; 1242 1243 /* 1244 * Fields commonly set by __perf_event_header__init_id(), 1245 * group so as to minimize the cachelines touched. 1246 */ 1247 u64 type; 1248 struct { 1249 u32 pid; 1250 u32 tid; 1251 } tid_entry; 1252 u64 time; 1253 u64 id; 1254 struct { 1255 u32 cpu; 1256 u32 reserved; 1257 } cpu_entry; 1258 1259 /* 1260 * The other fields, optionally {set,used} by 1261 * perf_{prepare,output}_sample(). 1262 */ 1263 u64 ip; 1264 struct perf_callchain_entry *callchain; 1265 struct perf_raw_record *raw; 1266 struct perf_branch_stack *br_stack; 1267 u64 *br_stack_cntr; 1268 union perf_sample_weight weight; 1269 union perf_mem_data_src data_src; 1270 u64 txn; 1271 1272 struct perf_regs regs_user; 1273 struct perf_regs regs_intr; 1274 u64 stack_user_size; 1275 1276 u64 stream_id; 1277 u64 cgroup; 1278 u64 addr; 1279 u64 phys_addr; 1280 u64 data_page_size; 1281 u64 code_page_size; 1282 u64 aux_size; 1283} ____cacheline_aligned; 1284 1285/* default value for data source */ 1286#define PERF_MEM_NA (PERF_MEM_S(OP, NA) |\ 1287 PERF_MEM_S(LVL, NA) |\ 1288 PERF_MEM_S(SNOOP, NA) |\ 1289 PERF_MEM_S(LOCK, NA) |\ 1290 PERF_MEM_S(TLB, NA) |\ 1291 PERF_MEM_S(LVLNUM, NA)) 1292 1293static inline void perf_sample_data_init(struct perf_sample_data *data, 1294 u64 addr, u64 period) 1295{ 1296 /* remaining struct members initialized in perf_prepare_sample() */ 1297 data->sample_flags = PERF_SAMPLE_PERIOD; 1298 data->period = period; 1299 data->dyn_size = 0; 1300 1301 if (addr) { 1302 data->addr = addr; 1303 data->sample_flags |= PERF_SAMPLE_ADDR; 1304 } 1305} 1306 1307static inline void perf_sample_save_callchain(struct perf_sample_data *data, 1308 struct perf_event *event, 1309 struct pt_regs *regs) 1310{ 1311 int size = 1; 1312 1313 if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)) 1314 return; 1315 if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_CALLCHAIN)) 1316 return; 1317 1318 data->callchain = perf_callchain(event, regs); 1319 size += data->callchain->nr; 1320 1321 data->dyn_size += size * sizeof(u64); 1322 data->sample_flags |= PERF_SAMPLE_CALLCHAIN; 1323} 1324 1325static inline void perf_sample_save_raw_data(struct perf_sample_data *data, 1326 struct perf_event *event, 1327 struct perf_raw_record *raw) 1328{ 1329 struct perf_raw_frag *frag = &raw->frag; 1330 u32 sum = 0; 1331 int size; 1332 1333 if (!(event->attr.sample_type & PERF_SAMPLE_RAW)) 1334 return; 1335 if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_RAW)) 1336 return; 1337 1338 do { 1339 sum += frag->size; 1340 if (perf_raw_frag_last(frag)) 1341 break; 1342 frag = frag->next; 1343 } while (1); 1344 1345 size = round_up(sum + sizeof(u32), sizeof(u64)); 1346 raw->size = size - sizeof(u32); 1347 frag->pad = raw->size - sum; 1348 1349 data->raw = raw; 1350 data->dyn_size += size; 1351 data->sample_flags |= PERF_SAMPLE_RAW; 1352} 1353 1354static inline bool has_branch_stack(struct perf_event *event) 1355{ 1356 return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK; 1357} 1358 1359static inline void perf_sample_save_brstack(struct perf_sample_data *data, 1360 struct perf_event *event, 1361 struct perf_branch_stack *brs, 1362 u64 *brs_cntr) 1363{ 1364 int size = sizeof(u64); /* nr */ 1365 1366 if (!has_branch_stack(event)) 1367 return; 1368 if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_BRANCH_STACK)) 1369 return; 1370 1371 if (branch_sample_hw_index(event)) 1372 size += sizeof(u64); 1373 1374 brs->nr = min_t(u16, event->attr.sample_max_stack, brs->nr); 1375 1376 size += brs->nr * sizeof(struct perf_branch_entry); 1377 1378 /* 1379 * The extension space for counters is appended after the 1380 * struct perf_branch_stack. It is used to store the occurrences 1381 * of events of each branch. 1382 */ 1383 if (brs_cntr) 1384 size += brs->nr * sizeof(u64); 1385 1386 data->br_stack = brs; 1387 data->br_stack_cntr = brs_cntr; 1388 data->dyn_size += size; 1389 data->sample_flags |= PERF_SAMPLE_BRANCH_STACK; 1390} 1391 1392static inline u32 perf_sample_data_size(struct perf_sample_data *data, 1393 struct perf_event *event) 1394{ 1395 u32 size = sizeof(struct perf_event_header); 1396 1397 size += event->header_size + event->id_header_size; 1398 size += data->dyn_size; 1399 1400 return size; 1401} 1402 1403/* 1404 * Clear all bitfields in the perf_branch_entry. 1405 * The to and from fields are not cleared because they are 1406 * systematically modified by caller. 1407 */ 1408static inline void perf_clear_branch_entry_bitfields(struct perf_branch_entry *br) 1409{ 1410 br->mispred = 0; 1411 br->predicted = 0; 1412 br->in_tx = 0; 1413 br->abort = 0; 1414 br->cycles = 0; 1415 br->type = 0; 1416 br->spec = PERF_BR_SPEC_NA; 1417 br->reserved = 0; 1418} 1419 1420extern void perf_output_sample(struct perf_output_handle *handle, 1421 struct perf_event_header *header, 1422 struct perf_sample_data *data, 1423 struct perf_event *event); 1424extern void perf_prepare_sample(struct perf_sample_data *data, 1425 struct perf_event *event, 1426 struct pt_regs *regs); 1427extern void perf_prepare_header(struct perf_event_header *header, 1428 struct perf_sample_data *data, 1429 struct perf_event *event, 1430 struct pt_regs *regs); 1431 1432extern int perf_event_overflow(struct perf_event *event, 1433 struct perf_sample_data *data, 1434 struct pt_regs *regs); 1435 1436extern void perf_event_output_forward(struct perf_event *event, 1437 struct perf_sample_data *data, 1438 struct pt_regs *regs); 1439extern void perf_event_output_backward(struct perf_event *event, 1440 struct perf_sample_data *data, 1441 struct pt_regs *regs); 1442extern int perf_event_output(struct perf_event *event, 1443 struct perf_sample_data *data, 1444 struct pt_regs *regs); 1445 1446static inline bool 1447is_default_overflow_handler(struct perf_event *event) 1448{ 1449 perf_overflow_handler_t overflow_handler = event->overflow_handler; 1450 1451 if (likely(overflow_handler == perf_event_output_forward)) 1452 return true; 1453 if (unlikely(overflow_handler == perf_event_output_backward)) 1454 return true; 1455 return false; 1456} 1457 1458extern void 1459perf_event_header__init_id(struct perf_event_header *header, 1460 struct perf_sample_data *data, 1461 struct perf_event *event); 1462extern void 1463perf_event__output_id_sample(struct perf_event *event, 1464 struct perf_output_handle *handle, 1465 struct perf_sample_data *sample); 1466 1467extern void 1468perf_log_lost_samples(struct perf_event *event, u64 lost); 1469 1470static inline bool event_has_any_exclude_flag(struct perf_event *event) 1471{ 1472 struct perf_event_attr *attr = &event->attr; 1473 1474 return attr->exclude_idle || attr->exclude_user || 1475 attr->exclude_kernel || attr->exclude_hv || 1476 attr->exclude_guest || attr->exclude_host; 1477} 1478 1479static inline bool is_sampling_event(struct perf_event *event) 1480{ 1481 return event->attr.sample_period != 0; 1482} 1483 1484/* 1485 * Return 1 for a software event, 0 for a hardware event 1486 */ 1487static inline int is_software_event(struct perf_event *event) 1488{ 1489 return event->event_caps & PERF_EV_CAP_SOFTWARE; 1490} 1491 1492/* 1493 * Return 1 for event in sw context, 0 for event in hw context 1494 */ 1495static inline int in_software_context(struct perf_event *event) 1496{ 1497 return event->pmu_ctx->pmu->task_ctx_nr == perf_sw_context; 1498} 1499 1500static inline int is_exclusive_pmu(struct pmu *pmu) 1501{ 1502 return pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE; 1503} 1504 1505extern struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; 1506 1507extern void ___perf_sw_event(u32, u64, struct pt_regs *, u64); 1508extern void __perf_sw_event(u32, u64, struct pt_regs *, u64); 1509 1510#ifndef perf_arch_fetch_caller_regs 1511static inline void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip) { } 1512#endif 1513 1514/* 1515 * When generating a perf sample in-line, instead of from an interrupt / 1516 * exception, we lack a pt_regs. This is typically used from software events 1517 * like: SW_CONTEXT_SWITCHES, SW_MIGRATIONS and the tie-in with tracepoints. 1518 * 1519 * We typically don't need a full set, but (for x86) do require: 1520 * - ip for PERF_SAMPLE_IP 1521 * - cs for user_mode() tests 1522 * - sp for PERF_SAMPLE_CALLCHAIN 1523 * - eflags for MISC bits and CALLCHAIN (see: perf_hw_regs()) 1524 * 1525 * NOTE: assumes @regs is otherwise already 0 filled; this is important for 1526 * things like PERF_SAMPLE_REGS_INTR. 1527 */ 1528static inline void perf_fetch_caller_regs(struct pt_regs *regs) 1529{ 1530 perf_arch_fetch_caller_regs(regs, CALLER_ADDR0); 1531} 1532 1533static __always_inline void 1534perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) 1535{ 1536 if (static_key_false(&perf_swevent_enabled[event_id])) 1537 __perf_sw_event(event_id, nr, regs, addr); 1538} 1539 1540DECLARE_PER_CPU(struct pt_regs, __perf_regs[4]); 1541 1542/* 1543 * 'Special' version for the scheduler, it hard assumes no recursion, 1544 * which is guaranteed by us not actually scheduling inside other swevents 1545 * because those disable preemption. 1546 */ 1547static __always_inline void __perf_sw_event_sched(u32 event_id, u64 nr, u64 addr) 1548{ 1549 struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]); 1550 1551 perf_fetch_caller_regs(regs); 1552 ___perf_sw_event(event_id, nr, regs, addr); 1553} 1554 1555extern struct static_key_false perf_sched_events; 1556 1557static __always_inline bool __perf_sw_enabled(int swevt) 1558{ 1559 return static_key_false(&perf_swevent_enabled[swevt]); 1560} 1561 1562static inline void perf_event_task_migrate(struct task_struct *task) 1563{ 1564 if (__perf_sw_enabled(PERF_COUNT_SW_CPU_MIGRATIONS)) 1565 task->sched_migrated = 1; 1566} 1567 1568static inline void perf_event_task_sched_in(struct task_struct *prev, 1569 struct task_struct *task) 1570{ 1571 if (static_branch_unlikely(&perf_sched_events)) 1572 __perf_event_task_sched_in(prev, task); 1573 1574 if (__perf_sw_enabled(PERF_COUNT_SW_CPU_MIGRATIONS) && 1575 task->sched_migrated) { 1576 __perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0); 1577 task->sched_migrated = 0; 1578 } 1579} 1580 1581static inline void perf_event_task_sched_out(struct task_struct *prev, 1582 struct task_struct *next) 1583{ 1584 if (__perf_sw_enabled(PERF_COUNT_SW_CONTEXT_SWITCHES)) 1585 __perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0); 1586 1587#ifdef CONFIG_CGROUP_PERF 1588 if (__perf_sw_enabled(PERF_COUNT_SW_CGROUP_SWITCHES) && 1589 perf_cgroup_from_task(prev, NULL) != 1590 perf_cgroup_from_task(next, NULL)) 1591 __perf_sw_event_sched(PERF_COUNT_SW_CGROUP_SWITCHES, 1, 0); 1592#endif 1593 1594 if (static_branch_unlikely(&perf_sched_events)) 1595 __perf_event_task_sched_out(prev, next); 1596} 1597 1598extern void perf_event_mmap(struct vm_area_struct *vma); 1599 1600extern void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, 1601 bool unregister, const char *sym); 1602extern void perf_event_bpf_event(struct bpf_prog *prog, 1603 enum perf_bpf_event_type type, 1604 u16 flags); 1605 1606#ifdef CONFIG_GUEST_PERF_EVENTS 1607extern struct perf_guest_info_callbacks __rcu *perf_guest_cbs; 1608 1609DECLARE_STATIC_CALL(__perf_guest_state, *perf_guest_cbs->state); 1610DECLARE_STATIC_CALL(__perf_guest_get_ip, *perf_guest_cbs->get_ip); 1611DECLARE_STATIC_CALL(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr); 1612 1613static inline unsigned int perf_guest_state(void) 1614{ 1615 return static_call(__perf_guest_state)(); 1616} 1617static inline unsigned long perf_guest_get_ip(void) 1618{ 1619 return static_call(__perf_guest_get_ip)(); 1620} 1621static inline unsigned int perf_guest_handle_intel_pt_intr(void) 1622{ 1623 return static_call(__perf_guest_handle_intel_pt_intr)(); 1624} 1625extern void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); 1626extern void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); 1627#else 1628static inline unsigned int perf_guest_state(void) { return 0; } 1629static inline unsigned long perf_guest_get_ip(void) { return 0; } 1630static inline unsigned int perf_guest_handle_intel_pt_intr(void) { return 0; } 1631#endif /* CONFIG_GUEST_PERF_EVENTS */ 1632 1633extern void perf_event_exec(void); 1634extern void perf_event_comm(struct task_struct *tsk, bool exec); 1635extern void perf_event_namespaces(struct task_struct *tsk); 1636extern void perf_event_fork(struct task_struct *tsk); 1637extern void perf_event_text_poke(const void *addr, 1638 const void *old_bytes, size_t old_len, 1639 const void *new_bytes, size_t new_len); 1640 1641/* Callchains */ 1642DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry); 1643 1644extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs); 1645extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs); 1646extern struct perf_callchain_entry * 1647get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, 1648 u32 max_stack, bool crosstask, bool add_mark); 1649extern int get_callchain_buffers(int max_stack); 1650extern void put_callchain_buffers(void); 1651extern struct perf_callchain_entry *get_callchain_entry(int *rctx); 1652extern void put_callchain_entry(int rctx); 1653 1654extern int sysctl_perf_event_max_stack; 1655extern int sysctl_perf_event_max_contexts_per_stack; 1656 1657static inline int perf_callchain_store_context(struct perf_callchain_entry_ctx *ctx, u64 ip) 1658{ 1659 if (ctx->contexts < sysctl_perf_event_max_contexts_per_stack) { 1660 struct perf_callchain_entry *entry = ctx->entry; 1661 entry->ip[entry->nr++] = ip; 1662 ++ctx->contexts; 1663 return 0; 1664 } else { 1665 ctx->contexts_maxed = true; 1666 return -1; /* no more room, stop walking the stack */ 1667 } 1668} 1669 1670static inline int perf_callchain_store(struct perf_callchain_entry_ctx *ctx, u64 ip) 1671{ 1672 if (ctx->nr < ctx->max_stack && !ctx->contexts_maxed) { 1673 struct perf_callchain_entry *entry = ctx->entry; 1674 entry->ip[entry->nr++] = ip; 1675 ++ctx->nr; 1676 return 0; 1677 } else { 1678 return -1; /* no more room, stop walking the stack */ 1679 } 1680} 1681 1682extern int sysctl_perf_event_paranoid; 1683extern int sysctl_perf_event_sample_rate; 1684 1685extern void perf_sample_event_took(u64 sample_len_ns); 1686 1687/* Access to perf_event_open(2) syscall. */ 1688#define PERF_SECURITY_OPEN 0 1689 1690/* Finer grained perf_event_open(2) access control. */ 1691#define PERF_SECURITY_CPU 1 1692#define PERF_SECURITY_KERNEL 2 1693#define PERF_SECURITY_TRACEPOINT 3 1694 1695static inline int perf_is_paranoid(void) 1696{ 1697 return sysctl_perf_event_paranoid > -1; 1698} 1699 1700int perf_allow_kernel(void); 1701 1702static inline int perf_allow_cpu(void) 1703{ 1704 if (sysctl_perf_event_paranoid > 0 && !perfmon_capable()) 1705 return -EACCES; 1706 1707 return security_perf_event_open(PERF_SECURITY_CPU); 1708} 1709 1710static inline int perf_allow_tracepoint(void) 1711{ 1712 if (sysctl_perf_event_paranoid > -1 && !perfmon_capable()) 1713 return -EPERM; 1714 1715 return security_perf_event_open(PERF_SECURITY_TRACEPOINT); 1716} 1717 1718extern int perf_exclude_event(struct perf_event *event, struct pt_regs *regs); 1719 1720extern void perf_event_init(void); 1721extern void perf_tp_event(u16 event_type, u64 count, void *record, 1722 int entry_size, struct pt_regs *regs, 1723 struct hlist_head *head, int rctx, 1724 struct task_struct *task); 1725extern void perf_bp_event(struct perf_event *event, void *data); 1726 1727extern unsigned long perf_misc_flags(struct perf_event *event, struct pt_regs *regs); 1728extern unsigned long perf_instruction_pointer(struct perf_event *event, 1729 struct pt_regs *regs); 1730 1731#ifndef perf_arch_misc_flags 1732# define perf_arch_misc_flags(regs) \ 1733 (user_mode(regs) ? PERF_RECORD_MISC_USER : PERF_RECORD_MISC_KERNEL) 1734# define perf_arch_instruction_pointer(regs) instruction_pointer(regs) 1735#endif 1736#ifndef perf_arch_bpf_user_pt_regs 1737# define perf_arch_bpf_user_pt_regs(regs) regs 1738#endif 1739 1740#ifndef perf_arch_guest_misc_flags 1741static inline unsigned long perf_arch_guest_misc_flags(struct pt_regs *regs) 1742{ 1743 unsigned long guest_state = perf_guest_state(); 1744 1745 if (!(guest_state & PERF_GUEST_ACTIVE)) 1746 return 0; 1747 1748 if (guest_state & PERF_GUEST_USER) 1749 return PERF_RECORD_MISC_GUEST_USER; 1750 else 1751 return PERF_RECORD_MISC_GUEST_KERNEL; 1752} 1753# define perf_arch_guest_misc_flags(regs) perf_arch_guest_misc_flags(regs) 1754#endif 1755 1756static inline bool needs_branch_stack(struct perf_event *event) 1757{ 1758 return event->attr.branch_sample_type != 0; 1759} 1760 1761static inline bool has_aux(struct perf_event *event) 1762{ 1763 return event->pmu->setup_aux; 1764} 1765 1766static inline bool has_aux_action(struct perf_event *event) 1767{ 1768 return event->attr.aux_sample_size || 1769 event->attr.aux_pause || 1770 event->attr.aux_resume; 1771} 1772 1773static inline bool is_write_backward(struct perf_event *event) 1774{ 1775 return !!event->attr.write_backward; 1776} 1777 1778static inline bool has_addr_filter(struct perf_event *event) 1779{ 1780 return event->pmu->nr_addr_filters; 1781} 1782 1783/* 1784 * An inherited event uses parent's filters 1785 */ 1786static inline struct perf_addr_filters_head * 1787perf_event_addr_filters(struct perf_event *event) 1788{ 1789 struct perf_addr_filters_head *ifh = &event->addr_filters; 1790 1791 if (event->parent) 1792 ifh = &event->parent->addr_filters; 1793 1794 return ifh; 1795} 1796 1797static inline struct fasync_struct **perf_event_fasync(struct perf_event *event) 1798{ 1799 /* Only the parent has fasync state */ 1800 if (event->parent) 1801 event = event->parent; 1802 return &event->fasync; 1803} 1804 1805extern void perf_event_addr_filters_sync(struct perf_event *event); 1806extern void perf_report_aux_output_id(struct perf_event *event, u64 hw_id); 1807 1808extern int perf_output_begin(struct perf_output_handle *handle, 1809 struct perf_sample_data *data, 1810 struct perf_event *event, unsigned int size); 1811extern int perf_output_begin_forward(struct perf_output_handle *handle, 1812 struct perf_sample_data *data, 1813 struct perf_event *event, 1814 unsigned int size); 1815extern int perf_output_begin_backward(struct perf_output_handle *handle, 1816 struct perf_sample_data *data, 1817 struct perf_event *event, 1818 unsigned int size); 1819 1820extern void perf_output_end(struct perf_output_handle *handle); 1821extern unsigned int perf_output_copy(struct perf_output_handle *handle, 1822 const void *buf, unsigned int len); 1823extern unsigned int perf_output_skip(struct perf_output_handle *handle, 1824 unsigned int len); 1825extern long perf_output_copy_aux(struct perf_output_handle *aux_handle, 1826 struct perf_output_handle *handle, 1827 unsigned long from, unsigned long to); 1828extern int perf_swevent_get_recursion_context(void); 1829extern void perf_swevent_put_recursion_context(int rctx); 1830extern u64 perf_swevent_set_period(struct perf_event *event); 1831extern void perf_event_enable(struct perf_event *event); 1832extern void perf_event_disable(struct perf_event *event); 1833extern void perf_event_disable_local(struct perf_event *event); 1834extern void perf_event_disable_inatomic(struct perf_event *event); 1835extern void perf_event_task_tick(void); 1836extern int perf_event_account_interrupt(struct perf_event *event); 1837extern int perf_event_period(struct perf_event *event, u64 value); 1838extern u64 perf_event_pause(struct perf_event *event, bool reset); 1839#else /* !CONFIG_PERF_EVENTS: */ 1840static inline void * 1841perf_aux_output_begin(struct perf_output_handle *handle, 1842 struct perf_event *event) { return NULL; } 1843static inline void 1844perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) 1845 { } 1846static inline int 1847perf_aux_output_skip(struct perf_output_handle *handle, 1848 unsigned long size) { return -EINVAL; } 1849static inline void * 1850perf_get_aux(struct perf_output_handle *handle) { return NULL; } 1851static inline void 1852perf_event_task_migrate(struct task_struct *task) { } 1853static inline void 1854perf_event_task_sched_in(struct task_struct *prev, 1855 struct task_struct *task) { } 1856static inline void 1857perf_event_task_sched_out(struct task_struct *prev, 1858 struct task_struct *next) { } 1859static inline int perf_event_init_task(struct task_struct *child, 1860 u64 clone_flags) { return 0; } 1861static inline void perf_event_exit_task(struct task_struct *child) { } 1862static inline void perf_event_free_task(struct task_struct *task) { } 1863static inline void perf_event_delayed_put(struct task_struct *task) { } 1864static inline struct file *perf_event_get(unsigned int fd) { return ERR_PTR(-EINVAL); } 1865static inline const struct perf_event *perf_get_event(struct file *file) 1866{ 1867 return ERR_PTR(-EINVAL); 1868} 1869static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *event) 1870{ 1871 return ERR_PTR(-EINVAL); 1872} 1873static inline int perf_event_read_local(struct perf_event *event, u64 *value, 1874 u64 *enabled, u64 *running) 1875{ 1876 return -EINVAL; 1877} 1878static inline void perf_event_print_debug(void) { } 1879static inline int perf_event_task_disable(void) { return -EINVAL; } 1880static inline int perf_event_task_enable(void) { return -EINVAL; } 1881static inline int perf_event_refresh(struct perf_event *event, int refresh) 1882{ 1883 return -EINVAL; 1884} 1885 1886static inline void 1887perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { } 1888static inline void 1889perf_bp_event(struct perf_event *event, void *data) { } 1890 1891static inline void perf_event_mmap(struct vm_area_struct *vma) { } 1892 1893typedef int (perf_ksymbol_get_name_f)(char *name, int name_len, void *data); 1894static inline void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, 1895 bool unregister, const char *sym) { } 1896static inline void perf_event_bpf_event(struct bpf_prog *prog, 1897 enum perf_bpf_event_type type, 1898 u16 flags) { } 1899static inline void perf_event_exec(void) { } 1900static inline void perf_event_comm(struct task_struct *tsk, bool exec) { } 1901static inline void perf_event_namespaces(struct task_struct *tsk) { } 1902static inline void perf_event_fork(struct task_struct *tsk) { } 1903static inline void perf_event_text_poke(const void *addr, 1904 const void *old_bytes, 1905 size_t old_len, 1906 const void *new_bytes, 1907 size_t new_len) { } 1908static inline void perf_event_init(void) { } 1909static inline int perf_swevent_get_recursion_context(void) { return -1; } 1910static inline void perf_swevent_put_recursion_context(int rctx) { } 1911static inline u64 perf_swevent_set_period(struct perf_event *event) { return 0; } 1912static inline void perf_event_enable(struct perf_event *event) { } 1913static inline void perf_event_disable(struct perf_event *event) { } 1914static inline int __perf_event_disable(void *info) { return -1; } 1915static inline void perf_event_task_tick(void) { } 1916static inline int perf_event_release_kernel(struct perf_event *event) { return 0; } 1917static inline int perf_event_period(struct perf_event *event, u64 value) 1918{ 1919 return -EINVAL; 1920} 1921static inline u64 perf_event_pause(struct perf_event *event, bool reset) 1922{ 1923 return 0; 1924} 1925static inline int perf_exclude_event(struct perf_event *event, struct pt_regs *regs) 1926{ 1927 return 0; 1928} 1929#endif 1930 1931#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) 1932extern void perf_restore_debug_store(void); 1933#else 1934static inline void perf_restore_debug_store(void) { } 1935#endif 1936 1937#define perf_output_put(handle, x) perf_output_copy((handle), &(x), sizeof(x)) 1938 1939struct perf_pmu_events_attr { 1940 struct device_attribute attr; 1941 u64 id; 1942 const char *event_str; 1943}; 1944 1945struct perf_pmu_events_ht_attr { 1946 struct device_attribute attr; 1947 u64 id; 1948 const char *event_str_ht; 1949 const char *event_str_noht; 1950}; 1951 1952struct perf_pmu_events_hybrid_attr { 1953 struct device_attribute attr; 1954 u64 id; 1955 const char *event_str; 1956 u64 pmu_type; 1957}; 1958 1959struct perf_pmu_format_hybrid_attr { 1960 struct device_attribute attr; 1961 u64 pmu_type; 1962}; 1963 1964ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr, 1965 char *page); 1966 1967#define PMU_EVENT_ATTR(_name, _var, _id, _show) \ 1968static struct perf_pmu_events_attr _var = { \ 1969 .attr = __ATTR(_name, 0444, _show, NULL), \ 1970 .id = _id, \ 1971}; 1972 1973#define PMU_EVENT_ATTR_STRING(_name, _var, _str) \ 1974static struct perf_pmu_events_attr _var = { \ 1975 .attr = __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \ 1976 .id = 0, \ 1977 .event_str = _str, \ 1978}; 1979 1980#define PMU_EVENT_ATTR_ID(_name, _show, _id) \ 1981 (&((struct perf_pmu_events_attr[]) { \ 1982 { .attr = __ATTR(_name, 0444, _show, NULL), \ 1983 .id = _id, } \ 1984 })[0].attr.attr) 1985 1986#define PMU_FORMAT_ATTR_SHOW(_name, _format) \ 1987static ssize_t \ 1988_name##_show(struct device *dev, \ 1989 struct device_attribute *attr, \ 1990 char *page) \ 1991{ \ 1992 BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ 1993 return sprintf(page, _format "\n"); \ 1994} \ 1995 1996#define PMU_FORMAT_ATTR(_name, _format) \ 1997 PMU_FORMAT_ATTR_SHOW(_name, _format) \ 1998 \ 1999static struct device_attribute format_attr_##_name = __ATTR_RO(_name) 2000 2001/* Performance counter hotplug functions */ 2002#ifdef CONFIG_PERF_EVENTS 2003int perf_event_init_cpu(unsigned int cpu); 2004int perf_event_exit_cpu(unsigned int cpu); 2005#else 2006#define perf_event_init_cpu NULL 2007#define perf_event_exit_cpu NULL 2008#endif 2009 2010extern void arch_perf_update_userpage(struct perf_event *event, 2011 struct perf_event_mmap_page *userpg, 2012 u64 now); 2013 2014/* 2015 * Snapshot branch stack on software events. 2016 * 2017 * Branch stack can be very useful in understanding software events. For 2018 * example, when a long function, e.g. sys_perf_event_open, returns an 2019 * errno, it is not obvious why the function failed. Branch stack could 2020 * provide very helpful information in this type of scenarios. 2021 * 2022 * On software event, it is necessary to stop the hardware branch recorder 2023 * fast. Otherwise, the hardware register/buffer will be flushed with 2024 * entries of the triggering event. Therefore, static call is used to 2025 * stop the hardware recorder. 2026 */ 2027 2028/* 2029 * cnt is the number of entries allocated for entries. 2030 * Return number of entries copied to . 2031 */ 2032typedef int (perf_snapshot_branch_stack_t)(struct perf_branch_entry *entries, 2033 unsigned int cnt); 2034DECLARE_STATIC_CALL(perf_snapshot_branch_stack, perf_snapshot_branch_stack_t); 2035 2036#ifndef PERF_NEEDS_LOPWR_CB 2037static inline void perf_lopwr_cb(bool mode) 2038{ 2039} 2040#endif 2041 2042#endif /* _LINUX_PERF_EVENT_H */