/*
 * Performance events:
 *
 *    Copyright (C) 2008-2009, Thomas Gleixner <tglx@linutronix.de>
 *    Copyright (C) 2008-2011, Red Hat, Inc., Ingo Molnar
 *    Copyright (C) 2008-2011, Red Hat, Inc., Peter Zijlstra
 *
 * Data type definitions, declarations, prototypes.
 *
 *    Started by: Thomas Gleixner and Ingo Molnar
 *
 * For licencing details see kernel-base/COPYING
 */
#ifndef _LINUX_PERF_EVENT_H
#define _LINUX_PERF_EVENT_H

#include <uapi/linux/perf_event.h>
#include <uapi/linux/bpf_perf_event.h>

/*
 * Kernel-internal data types and definitions:
 */

#ifdef CONFIG_PERF_EVENTS
# include <asm/perf_event.h>
# include <asm/local64.h>
#endif

#define PERF_GUEST_ACTIVE	0x01
#define PERF_GUEST_USER		0x02

struct perf_guest_info_callbacks {
	unsigned int	(*state)(void);
	unsigned long	(*get_ip)(void);
	unsigned int	(*handle_intel_pt_intr)(void);
};

#ifdef CONFIG_HAVE_HW_BREAKPOINT
#include <linux/rhashtable-types.h>
#include <asm/hw_breakpoint.h>
#endif

#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/hrtimer.h>
#include <linux/fs.h>
#include <linux/pid_namespace.h>
#include <linux/workqueue.h>
#include <linux/ftrace.h>
#include <linux/cpu.h>
#include <linux/irq_work.h>
#include <linux/static_key.h>
#include <linux/jump_label_ratelimit.h>
#include <linux/atomic.h>
#include <linux/sysfs.h>
#include <linux/perf_regs.h>
#include <linux/cgroup.h>
#include <linux/refcount.h>
#include <linux/security.h>
#include <linux/static_call.h>
#include <linux/lockdep.h>
#include <asm/local.h>

struct perf_callchain_entry {
	__u64	nr;
	__u64	ip[]; /* /proc/sys/kernel/perf_event_max_stack */
};

struct perf_callchain_entry_ctx {
	struct perf_callchain_entry	*entry;
	u32				max_stack;
	u32				nr;
	short				contexts;
	bool				contexts_maxed;
};

typedef unsigned long (*perf_copy_f)(void *dst, const void *src,
				     unsigned long off, unsigned long len);

struct perf_raw_frag {
	union {
		struct perf_raw_frag	*next;
		unsigned long		pad;
	};
	perf_copy_f			copy;
	void				*data;
	u32				size;
} __packed;

struct perf_raw_record {
	struct perf_raw_frag		frag;
	u32				size;
};

static __always_inline bool perf_raw_frag_last(const struct perf_raw_frag *frag)
{
	return frag->pad < sizeof(u64);
}
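
/*
 * Illustrative sketch (not part of this header): a raw sample payload is
 * described by a perf_raw_record whose fragment is either a single inline
 * fragment or the head of a chain. perf_raw_frag_last() exploits the
 * next/pad union: a terminating fragment carries no ->next pointer, only a
 * small pad value (< sizeof(u64)). A minimal single-fragment record could
 * be set up roughly like this ("buf"/"len" are made-up locals):
 *
 *	struct perf_raw_record raw = {
 *		.frag = {
 *			.size = len,	// payload size in bytes
 *			.data = buf,	// payload to copy into the sample
 *		},
 *	};
 *
 *	perf_sample_save_raw_data(&data, &raw);	// see helper further down
 */
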
/*
 * branch stack layout:
 *  nr: number of taken branches stored in entries[]
 *  hw_idx: The low level index of raw branch records
 *          for the most recent branch.
 *          -1ULL means invalid/unknown.
 *
 * Note that nr can vary from sample to sample
 * branches (to, from) are stored from most recent
 * to least recent, i.e., entries[0] contains the most
 * recent branch.
 * The entries[] is an abstraction of raw branch records,
 * which may not be stored in age order in HW, e.g. Intel LBR.
 * The hw_idx is to expose the low level index of raw
 * branch record for the most recent branch aka entries[0].
 * The hw_idx index is between -1 (unknown) and max depth,
 * which can be retrieved in /sys/devices/cpu/caps/branches.
 * For the architectures whose raw branch records are
 * already stored in age order, the hw_idx should be 0.
 */
struct perf_branch_stack {
	__u64				nr;
	__u64				hw_idx;
	struct perf_branch_entry	entries[];
};
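
/*
 * Illustrative sketch (not part of this header): consumers can rely on the
 * age ordering described above, i.e. entries[0] is the youngest branch.
 * The handle_branch() helper below is made up:
 *
 *	static void walk_branch_stack(struct perf_branch_stack *br)
 *	{
 *		u64 i;
 *
 *		for (i = 0; i < br->nr; i++) {
 *			struct perf_branch_entry *e = &br->entries[i];
 *
 *			handle_branch(e->from, e->to);	// most recent first
 *		}
 *	}
 */
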
struct task_struct;

/*
 * extra PMU register associated with an event
 */
struct hw_perf_event_extra {
	u64		config;	/* register value */
	unsigned int	reg;	/* register address or index */
	int		alloc;	/* extra register already allocated */
	int		idx;	/* index in shared_regs->regs[] */
};

/**
 * hw_perf_event::flag values
 *
 * PERF_EVENT_FLAG_ARCH bits are reserved for architecture-specific
 * usage.
 */
#define PERF_EVENT_FLAG_ARCH		0x000fffff
#define PERF_EVENT_FLAG_USER_READ_CNT	0x80000000

static_assert((PERF_EVENT_FLAG_USER_READ_CNT & PERF_EVENT_FLAG_ARCH) == 0);

/**
 * struct hw_perf_event - performance event hardware details:
 */
struct hw_perf_event {
#ifdef CONFIG_PERF_EVENTS
	union {
		struct { /* hardware */
			u64		config;
			u64		last_tag;
			unsigned long	config_base;
			unsigned long	event_base;
			int		event_base_rdpmc;
			int		idx;
			int		last_cpu;
			int		flags;

			struct hw_perf_event_extra extra_reg;
			struct hw_perf_event_extra branch_reg;
		};
		struct { /* aux / Intel-PT */
			u64		aux_config;
		};
		struct { /* software */
			struct hrtimer	hrtimer;
		};
		struct { /* tracepoint */
			/* for tp_event->class */
			struct list_head	tp_list;
		};
		struct { /* amd_power */
			u64	pwr_acc;
			u64	ptsc;
		};
#ifdef CONFIG_HAVE_HW_BREAKPOINT
		struct { /* breakpoint */
			/*
			 * Crufty hack to avoid the chicken and egg
			 * problem hw_breakpoint has with context
			 * creation and event initialization.
			 */
			struct arch_hw_breakpoint	info;
			struct rhlist_head		bp_list;
		};
#endif
		struct { /* amd_iommu */
			u8	iommu_bank;
			u8	iommu_cntr;
			u16	padding;
			u64	conf;
			u64	conf1;
		};
	};
	/*
	 * If the event is a per task event, this will point to the task in
	 * question. See the comment in perf_event_alloc().
	 */
	struct task_struct		*target;

	/*
	 * PMU would store hardware filter configuration
	 * here.
	 */
	void				*addr_filters;

	/* Last sync'ed generation of filters */
	unsigned long			addr_filters_gen;

/*
 * hw_perf_event::state flags; used to track the PERF_EF_* state.
 */
#define PERF_HES_STOPPED	0x01 /* the counter is stopped */
#define PERF_HES_UPTODATE	0x02 /* event->count up-to-date */
#define PERF_HES_ARCH		0x04

	int				state;

	/*
	 * The last observed hardware counter value, updated with a
	 * local64_cmpxchg() such that pmu::read() can be called nested.
	 */
	local64_t			prev_count;

	/*
	 * The period to start the next sample with.
	 */
	u64				sample_period;

	union {
		struct { /* Sampling */
			/*
			 * The period we started this sample with.
			 */
			u64		last_period;

			/*
			 * However much is left of the current period;
			 * note that this is a full 64bit value and
			 * allows for generation of periods longer
			 * than hardware might allow.
			 */
			local64_t	period_left;
		};
		struct { /* Topdown events counting for context switch */
			u64		saved_metric;
			u64		saved_slots;
		};
	};

	/*
	 * State for throttling the event, see __perf_event_overflow() and
	 * perf_adjust_freq_unthr_context().
	 */
	u64				interrupts_seq;
	u64				interrupts;

	/*
	 * State for freq target events, see __perf_event_overflow() and
	 * perf_adjust_freq_unthr_context().
	 */
	u64				freq_time_stamp;
	u64				freq_count_stamp;
#endif
};

struct perf_event;
struct perf_event_pmu_context;

/*
 * Common implementation detail of pmu::{start,commit,cancel}_txn
 */
#define PERF_PMU_TXN_ADD	0x1	/* txn to add/schedule event on PMU */
#define PERF_PMU_TXN_READ	0x2	/* txn to read event group from PMU */

/**
 * pmu::capabilities flags
 */
#define PERF_PMU_CAP_NO_INTERRUPT	0x0001
#define PERF_PMU_CAP_NO_NMI		0x0002
#define PERF_PMU_CAP_AUX_NO_SG		0x0004
#define PERF_PMU_CAP_EXTENDED_REGS	0x0008
#define PERF_PMU_CAP_EXCLUSIVE		0x0010
#define PERF_PMU_CAP_ITRACE		0x0020
#define PERF_PMU_CAP_NO_EXCLUDE		0x0040
#define PERF_PMU_CAP_AUX_OUTPUT		0x0080
#define PERF_PMU_CAP_EXTENDED_HW_TYPE	0x0100

/**
 * pmu::scope
 */
enum perf_pmu_scope {
	PERF_PMU_SCOPE_NONE	= 0,
	PERF_PMU_SCOPE_CORE,
	PERF_PMU_SCOPE_DIE,
	PERF_PMU_SCOPE_CLUSTER,
	PERF_PMU_SCOPE_PKG,
	PERF_PMU_SCOPE_SYS_WIDE,
	PERF_PMU_MAX_SCOPE,
};

struct perf_output_handle;

#define PMU_NULL_DEV	((void *)(~0UL))

/**
 * struct pmu - generic performance monitoring unit
 */
struct pmu {
	struct list_head		entry;

	struct module			*module;
	struct device			*dev;
	struct device			*parent;
	const struct attribute_group	**attr_groups;
	const struct attribute_group	**attr_update;
	const char			*name;
	int				type;

	/*
	 * various common per-pmu feature flags
	 */
	int				capabilities;

	/*
	 * PMU scope
	 */
	unsigned int			scope;

	int __percpu			*pmu_disable_count;
	struct perf_cpu_pmu_context __percpu *cpu_pmu_context;
	atomic_t			exclusive_cnt; /* < 0: cpu; > 0: tsk */
	int				task_ctx_nr;
	int				hrtimer_interval_ms;

	/* number of address filters this PMU can do */
	unsigned int			nr_addr_filters;

	/*
	 * Fully disable/enable this PMU, can be used to protect from the PMI
	 * as well as for lazy/batch writing of the MSRs.
	 */
	void (*pmu_enable)		(struct pmu *pmu); /* optional */
	void (*pmu_disable)		(struct pmu *pmu); /* optional */

	/*
	 * Try and initialize the event for this PMU.
	 *
	 * Returns:
	 *  -ENOENT	-- @event is not for this PMU
	 *
	 *  -ENODEV	-- @event is for this PMU but PMU not present
	 *  -EBUSY	-- @event is for this PMU but PMU temporarily unavailable
	 *  -EINVAL	-- @event is for this PMU but @event is not valid
	 *  -EOPNOTSUPP -- @event is for this PMU, @event is valid, but not supported
	 *  -EACCES	-- @event is for this PMU, @event is valid, but no privileges
	 *
	 *  0		-- @event is for this PMU and valid
	 *
	 * Other error return values are allowed.
	 */
	int (*event_init)		(struct perf_event *event);

	/*
	 * Notification that the event was mapped or unmapped.  Called
	 * in the context of the mapping task.
	 */
	void (*event_mapped)		(struct perf_event *event, struct mm_struct *mm); /* optional */
	void (*event_unmapped)		(struct perf_event *event, struct mm_struct *mm); /* optional */
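
	/*
	 * Illustrative sketch (not part of this header): the ->event_init()
	 * convention above is what lets multiple PMUs coexist; a driver
	 * typically starts by disowning events of a foreign type.
	 * "my_pmu_event_init" is a made-up name:
	 *
	 *	static int my_pmu_event_init(struct perf_event *event)
	 *	{
	 *		if (event->attr.type != event->pmu->type)
	 *			return -ENOENT;		// not ours, let the next PMU try
	 *
	 *		if (event->attr.sample_period)
	 *			return -EOPNOTSUPP;	// ours, but sampling unsupported
	 *
	 *		// ... validate attr.config and set up event->hw ...
	 *		return 0;
	 *	}
	 */
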
	/*
	 * Flags for ->add()/->del()/->start()/->stop(). There are
	 * matching hw_perf_event::state flags.
	 */
#define PERF_EF_START	0x01	/* start the counter when adding    */
#define PERF_EF_RELOAD	0x02	/* reload the counter when starting */
#define PERF_EF_UPDATE	0x04	/* update the counter when stopping */

	/*
	 * Adds/Removes a counter to/from the PMU, can be done inside a
	 * transaction, see the ->*_txn() methods.
	 *
	 * The add/del callbacks will reserve all hardware resources required
	 * to service the event, this includes any counter constraint
	 * scheduling etc.
	 *
	 * Called with IRQs disabled and the PMU disabled on the CPU the event
	 * is on.
	 *
	 * ->add() called without PERF_EF_START should result in the same state
	 *  as ->add() followed by ->stop().
	 *
	 * ->del() must always PERF_EF_UPDATE stop an event. If it calls
	 *  ->stop() that must deal with already being stopped without
	 *  PERF_EF_UPDATE.
	 */
	int  (*add)			(struct perf_event *event, int flags);
	void (*del)			(struct perf_event *event, int flags);

	/*
	 * Starts/Stops a counter present on the PMU.
	 *
	 * The PMI handler should stop the counter when perf_event_overflow()
	 * returns !0. ->start() will be used to continue.
	 *
	 * Also used to change the sample period.
	 *
	 * Called with IRQs disabled and the PMU disabled on the CPU the event
	 * is on -- will be called from NMI context when the PMU generates
	 * NMIs.
	 *
	 * ->stop() with PERF_EF_UPDATE will read the counter and update
	 *  period/count values like ->read() would.
	 *
	 * ->start() with PERF_EF_RELOAD will reprogram the counter
	 *  value, must be preceded by a ->stop() with PERF_EF_UPDATE.
	 */
	void (*start)			(struct perf_event *event, int flags);
	void (*stop)			(struct perf_event *event, int flags);

	/*
	 * Updates the counter value of the event.
	 *
	 * For sampling capable PMUs this will also update the software period
	 * hw_perf_event::period_left field.
	 */
	void (*read)			(struct perf_event *event);

	/*
	 * Group events scheduling is treated as a transaction, add
	 * group events as a whole and perform one schedulability test.
	 * If the test fails, roll back the whole group.
	 *
	 * Start the transaction, after this ->add() doesn't need to
	 * do schedulability tests.
	 *
	 * Optional.
	 */
	void (*start_txn)		(struct pmu *pmu, unsigned int txn_flags);
	/*
	 * If ->start_txn() disabled the ->add() schedulability test
	 * then ->commit_txn() is required to perform one. On success
	 * the transaction is closed. On error the transaction is kept
	 * open until ->cancel_txn() is called.
	 *
	 * Optional.
	 */
	int  (*commit_txn)		(struct pmu *pmu);
	/*
	 * Will cancel the transaction, assumes ->del() is called
	 * for each successful ->add() during the transaction.
	 *
	 * Optional.
	 */
	void (*cancel_txn)		(struct pmu *pmu);

	/*
	 * Will return the value for perf_event_mmap_page::index for this event,
	 * if no implementation is provided it will default to 0 (see
	 * perf_event_idx_default).
	 */
	int (*event_idx)		(struct perf_event *event); /* optional */

	/*
	 * context-switches callback
	 */
	void (*sched_task)		(struct perf_event_pmu_context *pmu_ctx,
					 bool sched_in);

	/*
	 * Kmem cache of PMU specific data
	 */
	struct kmem_cache		*task_ctx_cache;
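
	/*
	 * Illustrative sketch (not part of this header): roughly how the core
	 * schedules a group using the transaction callbacks above. If the
	 * group does not fit, every event that was successfully ->add()ed is
	 * ->del()ed again before ->cancel_txn():
	 *
	 *	pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
	 *
	 *	if (pmu->add(leader, PERF_EF_START))
	 *		goto error;
	 *	for_each_sibling_event(sibling, leader)
	 *		if (pmu->add(sibling, PERF_EF_START))
	 *			goto error;
	 *
	 *	if (!pmu->commit_txn(pmu))
	 *		return 0;	// whole group is now scheduled
	 * error:
	 *	// ->del() whatever was ->add()ed, then:
	 *	pmu->cancel_txn(pmu);
	 */
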
	/*
	 * PMU specific parts of task perf event context (i.e. ctx->task_ctx_data)
	 * can be synchronized using this function. See Intel LBR callstack support
	 * implementation and Perf core context switch handling callbacks for usage
	 * examples.
	 */
	void (*swap_task_ctx)		(struct perf_event_pmu_context *prev_epc,
					 struct perf_event_pmu_context *next_epc);
					/* optional */

	/*
	 * Set up pmu-private data structures for an AUX area
	 */
	void *(*setup_aux)		(struct perf_event *event, void **pages,
					 int nr_pages, bool overwrite);
					/* optional */

	/*
	 * Free pmu-private AUX data structures
	 */
	void (*free_aux)		(void *aux); /* optional */

	/*
	 * Take a snapshot of the AUX buffer without touching the event
	 * state, so that preempting ->start()/->stop() callbacks does
	 * not interfere with their logic. Called in PMI context.
	 *
	 * Returns the size of AUX data copied to the output handle.
	 *
	 * Optional.
	 */
	long (*snapshot_aux)		(struct perf_event *event,
					 struct perf_output_handle *handle,
					 unsigned long size);

	/*
	 * Validate address range filters: make sure the HW supports the
	 * requested configuration and number of filters; return 0 if the
	 * supplied filters are valid, -errno otherwise.
	 *
	 * Runs in the context of the ioctl()ing process and is not serialized
	 * with the rest of the PMU callbacks.
	 */
	int (*addr_filters_validate)	(struct list_head *filters);
					/* optional */

	/*
	 * Synchronize address range filter configuration:
	 * translate hw-agnostic filters into hardware configuration in
	 * event::hw::addr_filters.
	 *
	 * Runs as a part of filter sync sequence that is done in ->start()
	 * callback by calling perf_event_addr_filters_sync().
	 *
	 * May (and should) traverse event::addr_filters::list, for which its
	 * caller provides necessary serialization.
	 */
	void (*addr_filters_sync)	(struct perf_event *event);
					/* optional */

	/*
	 * Check if event can be used for aux_output purposes for
	 * events of this PMU.
	 *
	 * Runs from perf_event_open(). Should return 0 for "no match"
	 * or non-zero for "match".
	 */
	int (*aux_output_match)		(struct perf_event *event);
					/* optional */

	/*
	 * Skip programming this PMU on the given CPU. Typically needed for
	 * big.LITTLE things.
	 */
	bool (*filter)			(struct pmu *pmu, int cpu); /* optional */

	/*
	 * Check period value for PERF_EVENT_IOC_PERIOD ioctl.
	 */
	int (*check_period)		(struct perf_event *event, u64 value); /* optional */
};

enum perf_addr_filter_action_t {
	PERF_ADDR_FILTER_ACTION_STOP = 0,
	PERF_ADDR_FILTER_ACTION_START,
	PERF_ADDR_FILTER_ACTION_FILTER,
};

/**
 * struct perf_addr_filter - address range filter definition
 * @entry:	event's filter list linkage
 * @path:	object file's path for file-based filters
 * @offset:	filter range offset
 * @size:	filter range size (size==0 means single address trigger)
 * @action:	filter/start/stop
 *
 * This is a hardware-agnostic filter configuration as specified by the user.
 */
struct perf_addr_filter {
	struct list_head		entry;
	struct path			path;
	unsigned long			offset;
	unsigned long			size;
	enum perf_addr_filter_action_t	action;
};
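
/*
 * Illustrative sketch (not part of this header): a user-supplied filter
 * spec set via PERF_EVENT_IOC_SET_FILTER is parsed by the core into one
 * perf_addr_filter per range. Tracing only a 4 KiB region at file offset
 * 0x1000 of some object file would end up roughly as:
 *
 *	struct perf_addr_filter f = {
 *		.path	= ...,		// struct path of the object file
 *		.offset	= 0x1000,
 *		.size	= 0x1000,
 *		.action	= PERF_ADDR_FILTER_ACTION_FILTER,
 *	};
 *
 * The PMU driver later translates such entries into hardware ranges in its
 * ->addr_filters_validate()/->addr_filters_sync() callbacks.
 */
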
/**
 * struct perf_addr_filters_head - container for address range filters
 * @list:	list of filters for this event
 * @lock:	spinlock that serializes accesses to the @list and event's
 *		(and its children's) filter generations.
 * @nr_file_filters:	number of file-based filters
 *
 * A child event will use parent's @list (and therefore @lock), so they are
 * bundled together; see perf_event_addr_filters().
 */
struct perf_addr_filters_head {
	struct list_head	list;
	raw_spinlock_t		lock;
	unsigned int		nr_file_filters;
};

struct perf_addr_filter_range {
	unsigned long		start;
	unsigned long		size;
};

/**
 * enum perf_event_state - the states of an event:
 */
enum perf_event_state {
	PERF_EVENT_STATE_DEAD		= -4,
	PERF_EVENT_STATE_EXIT		= -3,
	PERF_EVENT_STATE_ERROR		= -2,
	PERF_EVENT_STATE_OFF		= -1,
	PERF_EVENT_STATE_INACTIVE	=  0,
	PERF_EVENT_STATE_ACTIVE		=  1,
};

struct file;
struct perf_sample_data;

typedef void (*perf_overflow_handler_t)(struct perf_event *,
					struct perf_sample_data *,
					struct pt_regs *regs);

/*
 * Event capabilities. For event_caps and groups caps.
 *
 * PERF_EV_CAP_SOFTWARE: Is a software event.
 * PERF_EV_CAP_READ_ACTIVE_PKG: A CPU event (or cgroup event) that can be read
 * from any CPU in the package where it is active.
 * PERF_EV_CAP_SIBLING: An event with this flag must be a group sibling and
 * cannot be a group leader. If an event with this flag is detached from the
 * group it is scheduled out and moved into an unrecoverable ERROR state.
 * PERF_EV_CAP_READ_SCOPE: A CPU event that can be read from any CPU of the
 * PMU scope where it is active.
 */
#define PERF_EV_CAP_SOFTWARE		BIT(0)
#define PERF_EV_CAP_READ_ACTIVE_PKG	BIT(1)
#define PERF_EV_CAP_SIBLING		BIT(2)
#define PERF_EV_CAP_READ_SCOPE		BIT(3)

#define SWEVENT_HLIST_BITS		8
#define SWEVENT_HLIST_SIZE		(1 << SWEVENT_HLIST_BITS)

struct swevent_hlist {
	struct hlist_head		heads[SWEVENT_HLIST_SIZE];
	struct rcu_head			rcu_head;
};

#define PERF_ATTACH_CONTEXT	0x01
#define PERF_ATTACH_GROUP	0x02
#define PERF_ATTACH_TASK	0x04
#define PERF_ATTACH_TASK_DATA	0x08
#define PERF_ATTACH_ITRACE	0x10
#define PERF_ATTACH_SCHED_CB	0x20
#define PERF_ATTACH_CHILD	0x40

struct bpf_prog;
struct perf_cgroup;
struct perf_buffer;

struct pmu_event_list {
	raw_spinlock_t		lock;
	struct list_head	list;
};

/*
 * event->sibling_list is modified while holding both ctx->lock and ctx->mutex,
 * as such iteration must hold either lock. However, since ctx->lock is an IRQ
 * safe lock, and is only held by the CPU doing the modification, having IRQs
 * disabled is sufficient since it will hold-off the IPIs.
 */
#ifdef CONFIG_PROVE_LOCKING
#define lockdep_assert_event_ctx(event)				\
	WARN_ON_ONCE(__lockdep_enabled &&			\
		     (this_cpu_read(hardirqs_enabled) &&	\
		      lockdep_is_held(&(event)->ctx->mutex) != LOCK_STATE_HELD))
#else
#define lockdep_assert_event_ctx(event)
#endif

#define for_each_sibling_event(sibling, event)			\
	lockdep_assert_event_ctx(event);			\
	if ((event)->group_leader == (event))			\
		list_for_each_entry((sibling), &(event)->sibling_list, sibling_list)
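
/*
 * Illustrative sketch (not part of this header): for_each_sibling_event()
 * iterates the siblings of a group leader (and does nothing when passed a
 * non-leader), so callers usually handle the leader separately and must
 * satisfy lockdep_assert_event_ctx(), i.e. hold ctx->mutex or hold ctx->lock
 * with IRQs disabled:
 *
 *	struct perf_event *sibling;
 *
 *	do_something(leader);
 *	for_each_sibling_event(sibling, leader)
 *		do_something(sibling);
 */
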
/**
 * struct perf_event - performance event kernel representation:
 */
struct perf_event {
#ifdef CONFIG_PERF_EVENTS
	/*
	 * entry onto perf_event_context::event_list;
	 *   modifications require ctx->lock
	 *   RCU safe iterations.
	 */
	struct list_head		event_entry;

	/*
	 * Locked for modification by both ctx->mutex and ctx->lock; holding
	 * either suffices for read.
	 */
	struct list_head		sibling_list;
	struct list_head		active_list;
	/*
	 * Node on the pinned or flexible tree located at the event context;
	 */
	struct rb_node			group_node;
	u64				group_index;
	/*
	 * We need storage to track the entries in perf_pmu_migrate_context; we
	 * cannot use the event_entry because of RCU and we want to keep the
	 * group intact, which avoids us using the other two entries.
	 */
	struct list_head		migrate_entry;

	struct hlist_node		hlist_entry;
	struct list_head		active_entry;
	int				nr_siblings;

	/* Not serialized. Only written during event initialization. */
	int				event_caps;
	/* The cumulative AND of all event_caps for events in this group. */
	int				group_caps;

	unsigned int			group_generation;
	struct perf_event		*group_leader;
	/*
	 * event->pmu always points to the pmu this event belongs to, whereas
	 * event->pmu_ctx->pmu may point to a different pmu when a group of
	 * events from different pmus is created.
	 */
	struct pmu			*pmu;
	void				*pmu_private;

	enum perf_event_state		state;
	unsigned int			attach_state;
	local64_t			count;
	atomic64_t			child_count;

	/*
	 * These are the total time in nanoseconds that the event
	 * has been enabled (i.e. eligible to run, and the task has
	 * been scheduled in, if this is a per-task event)
	 * and running (scheduled onto the CPU), respectively.
	 */
	u64				total_time_enabled;
	u64				total_time_running;
	u64				tstamp;

	struct perf_event_attr		attr;
	u16				header_size;
	u16				id_header_size;
	u16				read_size;
	struct hw_perf_event		hw;

	struct perf_event_context	*ctx;
	/*
	 * event->pmu_ctx points to the perf_event_pmu_context in which the
	 * event is added. This pmu_ctx can belong to another pmu for a sw
	 * event when that sw event is part of a group which also contains
	 * non-sw events.
	 */
	struct perf_event_pmu_context	*pmu_ctx;
	atomic_long_t			refcount;

	/*
	 * These accumulate total time (in nanoseconds) that children
	 * events have been enabled and running, respectively.
	 */
	atomic64_t			child_total_time_enabled;
	atomic64_t			child_total_time_running;

	/*
	 * Protect attach/detach and child_list:
	 */
	struct mutex			child_mutex;
	struct list_head		child_list;
	struct perf_event		*parent;

	int				oncpu;
	int				cpu;

	struct list_head		owner_entry;
	struct task_struct		*owner;

	/* mmap bits */
	struct mutex			mmap_mutex;
	atomic_t			mmap_count;

	struct perf_buffer		*rb;
	struct list_head		rb_entry;
	unsigned long			rcu_batches;
	int				rcu_pending;

	/* poll related */
	wait_queue_head_t		waitq;
	struct fasync_struct		*fasync;

	/* delayed work for NMIs and such */
	unsigned int			pending_wakeup;
	unsigned int			pending_kill;
	unsigned int			pending_disable;
	unsigned long			pending_addr;	/* SIGTRAP */
	struct irq_work			pending_irq;
	struct irq_work			pending_disable_irq;
	struct callback_head		pending_task;
	unsigned int			pending_work;
	struct rcuwait			pending_work_wait;

	atomic_t			event_limit;

	/* address range filters */
	struct perf_addr_filters_head	addr_filters;
	/* vma address array for file-based filters */
	struct perf_addr_filter_range	*addr_filter_ranges;
	unsigned long			addr_filters_gen;

	/* for aux_output events */
	struct perf_event		*aux_event;

	void (*destroy)(struct perf_event *);
	struct rcu_head			rcu_head;

	struct pid_namespace		*ns;
	u64				id;

	atomic64_t			lost_samples;

	u64				(*clock)(void);
	perf_overflow_handler_t		overflow_handler;
	void				*overflow_handler_context;
	struct bpf_prog			*prog;
	u64				bpf_cookie;

#ifdef CONFIG_EVENT_TRACING
	struct trace_event_call		*tp_event;
	struct event_filter		*filter;
#ifdef CONFIG_FUNCTION_TRACER
	struct ftrace_ops		ftrace_ops;
#endif
#endif

#ifdef CONFIG_CGROUP_PERF
	struct perf_cgroup		*cgrp; /* cgroup event is attached to */
#endif

#ifdef CONFIG_SECURITY
	void				*security;
#endif
	struct list_head		sb_list;

	/*
	 * Certain events get forwarded to another pmu internally by over-
	 * writing the kernel copy of event->attr.type without the user being
	 * aware of it. event->orig_type contains the original 'type' requested
	 * by the user.
	 */
	__u32				orig_type;
#endif /* CONFIG_PERF_EVENTS */
};
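
/*
 * Illustrative sketch (not part of this header): total_time_enabled and
 * total_time_running (plus the child_* totals) are what let a consumer
 * estimate the "true" count of an event that was time-multiplexed and hence
 * only running for part of the time it was enabled. perf_event_read_value(),
 * declared further down, returns all three values:
 *
 *	u64 enabled, running, count, estimate;
 *
 *	count = perf_event_read_value(event, &enabled, &running);
 *	if (running)
 *		estimate = mul_u64_u64_div_u64(count, enabled, running);
 */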

/*
 *           ,-----------------------[1:n]------------------------.
 *           V                                                     V
 * perf_event_context <-[1:n]-> perf_event_pmu_context <-[1:n]- perf_event
 *                                        |                       |
 *                                        `--[n:1]-> pmu <-[1:n]--'
 *
 *
 * struct perf_event_pmu_context lifetime is refcount based and RCU freed
 * (similar to perf_event_context). Locking is as if it were a member of
 * perf_event_context; specifically:
 *
 *   modification, both: ctx->mutex && ctx->lock
 *   reading, either:    ctx->mutex || ctx->lock
 *
 * There is one exception to this; namely put_pmu_ctx() isn't always called
 * with ctx->mutex held; this means that as long as we can guarantee the epc
 * has events the above rules hold.
 *
 * Specifically, sys_perf_event_open()'s group_leader case depends on
 * ctx->mutex pinning the configuration. Since we hold a reference on
 * group_leader (through the filedesc) it can't go away, therefore its
 * associated pmu_ctx must exist and cannot change due to ctx->mutex.
 *
 * perf_event holds a refcount on perf_event_context
 * perf_event holds a refcount on perf_event_pmu_context
 */
struct perf_event_pmu_context {
	struct pmu			*pmu;
	struct perf_event_context	*ctx;

	struct list_head		pmu_ctx_entry;

	struct list_head		pinned_active;
	struct list_head		flexible_active;

	/* Used to avoid freeing per-cpu perf_event_pmu_context */
	unsigned int			embedded : 1;

	unsigned int			nr_events;
	unsigned int			nr_cgroups;
	unsigned int			nr_freq;

	atomic_t			refcount; /* event <-> epc */
	struct rcu_head			rcu_head;

	void				*task_ctx_data; /* pmu specific data */
	/*
	 * Set when one or more (plausibly active) events can't be scheduled
	 * due to pmu overcommit or pmu constraints; tolerant of events that
	 * need not be active due to scheduling constraints, such as cgroups.
	 */
	int				rotate_necessary;
};

static inline bool perf_pmu_ctx_is_active(struct perf_event_pmu_context *epc)
{
	return !list_empty(&epc->flexible_active) || !list_empty(&epc->pinned_active);
}

struct perf_event_groups {
	struct rb_root	tree;
	u64		index;
};


/**
 * struct perf_event_context - event context structure
 *
 * Used as a container for task events and CPU events as well:
 */
struct perf_event_context {
	/*
	 * Protect the states of the events in the list,
	 * nr_active, and the list:
	 */
	raw_spinlock_t			lock;
	/*
	 * Protect the list of events.  Locking either mutex or lock
	 * is sufficient to ensure the list doesn't change; to change
	 * the list you need to lock both the mutex and the spinlock.
	 */
	struct mutex			mutex;

	struct list_head		pmu_ctx_list;
	struct perf_event_groups	pinned_groups;
	struct perf_event_groups	flexible_groups;
	struct list_head		event_list;

	int				nr_events;
	int				nr_user;
	int				is_active;

	int				nr_task_data;
	int				nr_stat;
	int				nr_freq;
	int				rotate_disable;

	refcount_t			refcount; /* event <-> ctx */
	struct task_struct		*task;

	/*
	 * Context clock, runs when context enabled.
	 */
	u64				time;
	u64				timestamp;
	u64				timeoffset;

	/*
	 * These fields let us detect when two contexts have both
	 * been cloned (inherited) from a common ancestor.
	 */
	struct perf_event_context	*parent_ctx;
	u64				parent_gen;
	u64				generation;
	int				pin_count;
#ifdef CONFIG_CGROUP_PERF
	int				nr_cgroups;	/* cgroup evts */
#endif
	struct rcu_head			rcu_head;

	/*
	 * The count of events for which using the switch-out fast path
	 * should be avoided.
	 *
	 * Sum (event->pending_work + events with
	 *      (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ)))
	 *
	 * The SIGTRAP is targeted at ctx->task, so it won't do to change that
	 * until the signal is delivered.
	 */
	local_t				nr_no_switch_fast;
};

struct perf_cpu_pmu_context {
	struct perf_event_pmu_context	epc;
	struct perf_event_pmu_context	*task_epc;

	struct list_head		sched_cb_entry;
	int				sched_cb_usage;

	int				active_oncpu;
	int				exclusive;

	raw_spinlock_t			hrtimer_lock;
	struct hrtimer			hrtimer;
	ktime_t				hrtimer_interval;
	unsigned int			hrtimer_active;
};

/**
 * struct perf_cpu_context - per cpu event context structure
 */
struct perf_cpu_context {
	struct perf_event_context	ctx;
	struct perf_event_context	*task_ctx;
	int				online;

#ifdef CONFIG_CGROUP_PERF
	struct perf_cgroup		*cgrp;
#endif

	/*
	 * Per-CPU storage for iterators used in visit_groups_merge. The default
	 * storage is of size 2 to hold the CPU and any CPU event iterators.
	 */
	int				heap_size;
	struct perf_event		**heap;
	struct perf_event		*heap_default[2];
};

struct perf_output_handle {
	struct perf_event		*event;
	struct perf_buffer		*rb;
	unsigned long			wakeup;
	unsigned long			size;
	u64				aux_flags;
	union {
		void			*addr;
		unsigned long		head;
	};
	int				page;
};

struct bpf_perf_event_data_kern {
	bpf_user_pt_regs_t	*regs;
	struct perf_sample_data *data;
	struct perf_event	*event;
};

#ifdef CONFIG_CGROUP_PERF

/*
 * perf_cgroup_info keeps track of time_enabled for a cgroup.
 * This is a per-cpu dynamically allocated data structure.
 */
struct perf_cgroup_info {
	u64				time;
	u64				timestamp;
	u64				timeoffset;
	int				active;
};

struct perf_cgroup {
	struct cgroup_subsys_state	css;
	struct perf_cgroup_info	__percpu *info;
};

/*
 * Must ensure cgroup is pinned (css_get) before calling
 * this function. In other words, we cannot call this function
 * if there is no cgroup event for the current CPU context.
 */
static inline struct perf_cgroup *
perf_cgroup_from_task(struct task_struct *task, struct perf_event_context *ctx)
{
	return container_of(task_css_check(task, perf_event_cgrp_id,
					   ctx ? lockdep_is_held(&ctx->lock)
					       : true),
			    struct perf_cgroup, css);
}
#endif /* CONFIG_CGROUP_PERF */

#ifdef CONFIG_PERF_EVENTS

extern struct perf_event_context *perf_cpu_task_ctx(void);

extern void *perf_aux_output_begin(struct perf_output_handle *handle,
				   struct perf_event *event);
extern void perf_aux_output_end(struct perf_output_handle *handle,
				unsigned long size);
extern int perf_aux_output_skip(struct perf_output_handle *handle,
				unsigned long size);
extern void *perf_get_aux(struct perf_output_handle *handle);
extern void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags);
extern void perf_event_itrace_started(struct perf_event *event);

extern int perf_pmu_register(struct pmu *pmu, const char *name, int type);
extern void perf_pmu_unregister(struct pmu *pmu);

extern void __perf_event_task_sched_in(struct task_struct *prev,
				       struct task_struct *task);
extern void __perf_event_task_sched_out(struct task_struct *prev,
					struct task_struct *next);
extern int perf_event_init_task(struct task_struct *child, u64 clone_flags);
extern void perf_event_exit_task(struct task_struct *child);
extern void perf_event_free_task(struct task_struct *task);
extern void perf_event_delayed_put(struct task_struct *task);
extern struct file *perf_event_get(unsigned int fd);
extern const struct perf_event *perf_get_event(struct file *file);
extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event);
extern void perf_event_print_debug(void);
extern void perf_pmu_disable(struct pmu *pmu);
extern void perf_pmu_enable(struct pmu *pmu);
extern void perf_sched_cb_dec(struct pmu *pmu);
extern void perf_sched_cb_inc(struct pmu *pmu);
extern int perf_event_task_disable(void);
extern int perf_event_task_enable(void);

extern void perf_pmu_resched(struct pmu *pmu);

extern int perf_event_refresh(struct perf_event *event, int refresh);
extern void perf_event_update_userpage(struct perf_event *event);
extern int perf_event_release_kernel(struct perf_event *event);
extern struct perf_event *
perf_event_create_kernel_counter(struct perf_event_attr *attr,
				 int cpu,
				 struct task_struct *task,
				 perf_overflow_handler_t callback,
				 void *context);
extern void perf_pmu_migrate_context(struct pmu *pmu,
				     int src_cpu, int dst_cpu);
int perf_event_read_local(struct perf_event *event, u64 *value,
			  u64 *enabled, u64 *running);
extern u64 perf_event_read_value(struct perf_event *event,
				 u64 *enabled, u64 *running);

extern struct perf_callchain_entry *perf_callchain(struct perf_event *event, struct pt_regs *regs);

static inline bool branch_sample_no_flags(const struct perf_event *event)
{
	return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_NO_FLAGS;
}

static inline bool branch_sample_no_cycles(const struct perf_event *event)
{
	return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_NO_CYCLES;
}

static inline bool branch_sample_type(const struct perf_event *event)
{
	return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_TYPE_SAVE;
}

static inline bool branch_sample_hw_index(const struct perf_event *event)
{
	return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX;
}

static inline bool branch_sample_priv(const struct perf_event *event)
{
	return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_PRIV_SAVE;
}

static inline bool branch_sample_counters(const struct perf_event *event)
{
	return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_COUNTERS;
}

static inline bool branch_sample_call_stack(const struct perf_event *event)
{
	return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK;
}

struct perf_sample_data {
	/*
	 * Fields set by perf_sample_data_init() unconditionally,
	 * group so as to minimize the cachelines touched.
	 */
	u64				sample_flags;
	u64				period;
	u64				dyn_size;

	/*
	 * Fields commonly set by __perf_event_header__init_id(),
	 * group so as to minimize the cachelines touched.
	 */
	u64				type;
	struct {
		u32	pid;
		u32	tid;
	}				tid_entry;
	u64				time;
	u64				id;
	struct {
		u32	cpu;
		u32	reserved;
	}				cpu_entry;

	/*
	 * The other fields, optionally {set,used} by
	 * perf_{prepare,output}_sample().
	 */
	u64				ip;
	struct perf_callchain_entry	*callchain;
	struct perf_raw_record		*raw;
	struct perf_branch_stack	*br_stack;
	u64				*br_stack_cntr;
	union perf_sample_weight	weight;
	union perf_mem_data_src		data_src;
	u64				txn;

	struct perf_regs		regs_user;
	struct perf_regs		regs_intr;
	u64				stack_user_size;

	u64				stream_id;
	u64				cgroup;
	u64				addr;
	u64				phys_addr;
	u64				data_page_size;
	u64				code_page_size;
	u64				aux_size;
} ____cacheline_aligned;

/* default value for data source */
#define PERF_MEM_NA (PERF_MEM_S(OP, NA)   |\
		     PERF_MEM_S(LVL, NA)   |\
		     PERF_MEM_S(SNOOP, NA) |\
		     PERF_MEM_S(LOCK, NA)  |\
		     PERF_MEM_S(TLB, NA)   |\
		     PERF_MEM_S(LVLNUM, NA))

static inline void perf_sample_data_init(struct perf_sample_data *data,
					 u64 addr, u64 period)
{
	/* remaining struct members initialized in perf_prepare_sample() */
	data->sample_flags = PERF_SAMPLE_PERIOD;
	data->period = period;
	data->dyn_size = 0;

	if (addr) {
		data->addr = addr;
		data->sample_flags |= PERF_SAMPLE_ADDR;
	}
}

static inline void perf_sample_save_callchain(struct perf_sample_data *data,
					      struct perf_event *event,
					      struct pt_regs *regs)
{
	int size = 1;

	data->callchain = perf_callchain(event, regs);
	size += data->callchain->nr;

	data->dyn_size += size * sizeof(u64);
	data->sample_flags |= PERF_SAMPLE_CALLCHAIN;
}

static inline void perf_sample_save_raw_data(struct perf_sample_data *data,
					     struct perf_raw_record *raw)
{
	struct perf_raw_frag *frag = &raw->frag;
	u32 sum = 0;
	int size;

	do {
		sum += frag->size;
		if (perf_raw_frag_last(frag))
			break;
		frag = frag->next;
	} while (1);

	size = round_up(sum + sizeof(u32), sizeof(u64));
	raw->size = size - sizeof(u32);
	frag->pad = raw->size - sum;

	data->raw = raw;
	data->dyn_size += size;
	data->sample_flags |= PERF_SAMPLE_RAW;
}
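
/*
 * Illustrative sketch (not part of this header): a typical PMI handler in a
 * PMU driver initializes a perf_sample_data on the stack, attaches whatever
 * optional payloads it has, and lets perf_event_overflow() decide whether
 * the event must be stopped (throttling, event_limit, ...). "cpuc" and its
 * lbr_stack member are made-up, driver-local state:
 *
 *	struct perf_sample_data data;
 *	struct hw_perf_event *hwc = &event->hw;
 *
 *	perf_sample_data_init(&data, 0, hwc->last_period);
 *
 *	if (has_branch_stack(event))
 *		perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL);
 *
 *	if (perf_event_overflow(event, &data, regs))
 *		event->pmu->stop(event, 0);
 */
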
static inline void perf_sample_save_brstack(struct perf_sample_data *data,
					    struct perf_event *event,
					    struct perf_branch_stack *brs,
					    u64 *brs_cntr)
{
	int size = sizeof(u64); /* nr */

	if (branch_sample_hw_index(event))
		size += sizeof(u64);
	size += brs->nr * sizeof(struct perf_branch_entry);

	/*
	 * The extension space for counters is appended after the
	 * struct perf_branch_stack. It is used to store the occurrences
	 * of events of each branch.
	 */
	if (brs_cntr)
		size += brs->nr * sizeof(u64);

	data->br_stack = brs;
	data->br_stack_cntr = brs_cntr;
	data->dyn_size += size;
	data->sample_flags |= PERF_SAMPLE_BRANCH_STACK;
}

static inline u32 perf_sample_data_size(struct perf_sample_data *data,
					struct perf_event *event)
{
	u32 size = sizeof(struct perf_event_header);

	size += event->header_size + event->id_header_size;
	size += data->dyn_size;

	return size;
}

/*
 * Clear all bitfields in the perf_branch_entry.
 * The to and from fields are not cleared because they are
 * systematically modified by caller.
 */
static inline void perf_clear_branch_entry_bitfields(struct perf_branch_entry *br)
{
	br->mispred = 0;
	br->predicted = 0;
	br->in_tx = 0;
	br->abort = 0;
	br->cycles = 0;
	br->type = 0;
	br->spec = PERF_BR_SPEC_NA;
	br->reserved = 0;
}

extern void perf_output_sample(struct perf_output_handle *handle,
			       struct perf_event_header *header,
			       struct perf_sample_data *data,
			       struct perf_event *event);
extern void perf_prepare_sample(struct perf_sample_data *data,
				struct perf_event *event,
				struct pt_regs *regs);
extern void perf_prepare_header(struct perf_event_header *header,
				struct perf_sample_data *data,
				struct perf_event *event,
				struct pt_regs *regs);

extern int perf_event_overflow(struct perf_event *event,
			       struct perf_sample_data *data,
			       struct pt_regs *regs);

extern void perf_event_output_forward(struct perf_event *event,
				      struct perf_sample_data *data,
				      struct pt_regs *regs);
extern void perf_event_output_backward(struct perf_event *event,
				       struct perf_sample_data *data,
				       struct pt_regs *regs);
extern int perf_event_output(struct perf_event *event,
			     struct perf_sample_data *data,
			     struct pt_regs *regs);

static inline bool
is_default_overflow_handler(struct perf_event *event)
{
	perf_overflow_handler_t overflow_handler = event->overflow_handler;

	if (likely(overflow_handler == perf_event_output_forward))
		return true;
	if (unlikely(overflow_handler == perf_event_output_backward))
		return true;
	return false;
}

extern void
perf_event_header__init_id(struct perf_event_header *header,
			   struct perf_sample_data *data,
			   struct perf_event *event);
extern void
perf_event__output_id_sample(struct perf_event *event,
			     struct perf_output_handle *handle,
			     struct perf_sample_data *sample);

extern void
perf_log_lost_samples(struct perf_event *event, u64 lost);

static inline bool event_has_any_exclude_flag(struct perf_event *event)
{
	struct perf_event_attr *attr = &event->attr;

	return attr->exclude_idle || attr->exclude_user ||
	       attr->exclude_kernel || attr->exclude_hv ||
	       attr->exclude_guest || attr->exclude_host;
}

static inline bool is_sampling_event(struct perf_event *event)
{
	return event->attr.sample_period != 0;
}

/*
 * Return 1 for a software event, 0 for a hardware event
 */
static inline int is_software_event(struct perf_event *event)
{
	return event->event_caps & PERF_EV_CAP_SOFTWARE;
}

/*
 * Return 1 for event in sw context, 0 for event in hw context
 */
static inline int in_software_context(struct perf_event *event)
{
	return event->pmu_ctx->pmu->task_ctx_nr == perf_sw_context;
}

static inline int is_exclusive_pmu(struct pmu *pmu)
{
	return pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE;
}

extern struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];

extern void ___perf_sw_event(u32, u64, struct pt_regs *, u64);
extern void __perf_sw_event(u32, u64, struct pt_regs *, u64);

#ifndef perf_arch_fetch_caller_regs
static inline void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip) { }
#endif

/*
 * When generating a perf sample in-line, instead of from an interrupt /
 * exception, we lack a pt_regs. This is typically used from software events
 * like: SW_CONTEXT_SWITCHES, SW_MIGRATIONS and the tie-in with tracepoints.
 *
 * We typically don't need a full set, but (for x86) do require:
 * - ip for PERF_SAMPLE_IP
 * - cs for user_mode() tests
 * - sp for PERF_SAMPLE_CALLCHAIN
 * - eflags for MISC bits and CALLCHAIN (see: perf_hw_regs())
 *
 * NOTE: assumes @regs is otherwise already 0 filled; this is important for
 * things like PERF_SAMPLE_REGS_INTR.
 */
static inline void perf_fetch_caller_regs(struct pt_regs *regs)
{
	perf_arch_fetch_caller_regs(regs, CALLER_ADDR0);
}

static __always_inline void
perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
{
	if (static_key_false(&perf_swevent_enabled[event_id]))
		__perf_sw_event(event_id, nr, regs, addr);
}

DECLARE_PER_CPU(struct pt_regs, __perf_regs[4]);
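
/*
 * Illustrative sketch (not part of this header): callers that already have a
 * pt_regs simply count an occurrence with perf_sw_event(); thanks to the
 * static key above this compiles down to a patched-out branch when no such
 * software event exists anywhere in the system. E.g. a fault path might do:
 *
 *	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
 */
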
/*
 * 'Special' version for the scheduler, it hard assumes no recursion,
 * which is guaranteed by us not actually scheduling inside other swevents
 * because those disable preemption.
 */
static __always_inline void __perf_sw_event_sched(u32 event_id, u64 nr, u64 addr)
{
	struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]);

	perf_fetch_caller_regs(regs);
	___perf_sw_event(event_id, nr, regs, addr);
}

extern struct static_key_false perf_sched_events;

static __always_inline bool __perf_sw_enabled(int swevt)
{
	return static_key_false(&perf_swevent_enabled[swevt]);
}

static inline void perf_event_task_migrate(struct task_struct *task)
{
	if (__perf_sw_enabled(PERF_COUNT_SW_CPU_MIGRATIONS))
		task->sched_migrated = 1;
}

static inline void perf_event_task_sched_in(struct task_struct *prev,
					    struct task_struct *task)
{
	if (static_branch_unlikely(&perf_sched_events))
		__perf_event_task_sched_in(prev, task);

	if (__perf_sw_enabled(PERF_COUNT_SW_CPU_MIGRATIONS) &&
	    task->sched_migrated) {
		__perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
		task->sched_migrated = 0;
	}
}

static inline void perf_event_task_sched_out(struct task_struct *prev,
					     struct task_struct *next)
{
	if (__perf_sw_enabled(PERF_COUNT_SW_CONTEXT_SWITCHES))
		__perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0);

#ifdef CONFIG_CGROUP_PERF
	if (__perf_sw_enabled(PERF_COUNT_SW_CGROUP_SWITCHES) &&
	    perf_cgroup_from_task(prev, NULL) !=
	    perf_cgroup_from_task(next, NULL))
		__perf_sw_event_sched(PERF_COUNT_SW_CGROUP_SWITCHES, 1, 0);
#endif

	if (static_branch_unlikely(&perf_sched_events))
		__perf_event_task_sched_out(prev, next);
}

extern void perf_event_mmap(struct vm_area_struct *vma);

extern void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len,
			       bool unregister, const char *sym);
extern void perf_event_bpf_event(struct bpf_prog *prog,
				 enum perf_bpf_event_type type,
				 u16 flags);

#ifdef CONFIG_GUEST_PERF_EVENTS
extern struct perf_guest_info_callbacks __rcu *perf_guest_cbs;

DECLARE_STATIC_CALL(__perf_guest_state, *perf_guest_cbs->state);
DECLARE_STATIC_CALL(__perf_guest_get_ip, *perf_guest_cbs->get_ip);
DECLARE_STATIC_CALL(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr);

static inline unsigned int perf_guest_state(void)
{
	return static_call(__perf_guest_state)();
}
static inline unsigned long perf_guest_get_ip(void)
{
	return static_call(__perf_guest_get_ip)();
}
static inline unsigned int perf_guest_handle_intel_pt_intr(void)
{
	return static_call(__perf_guest_handle_intel_pt_intr)();
}
extern void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs);
extern void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs);
#else
static inline unsigned int perf_guest_state(void)		  { return 0; }
static inline unsigned long perf_guest_get_ip(void)		  { return 0; }
static inline unsigned int perf_guest_handle_intel_pt_intr(void) { return 0; }
#endif /* CONFIG_GUEST_PERF_EVENTS */
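
/*
 * Illustrative sketch (not part of this header): a hypervisor (KVM being the
 * in-tree user) supplies the PERF_GUEST_* state so that samples taken while
 * a vCPU is running get attributed to the guest. The callback names below
 * are made up; ->handle_intel_pt_intr may be left NULL:
 *
 *	static struct perf_guest_info_callbacks my_guest_cbs = {
 *		.state	= my_guest_state,	// 0, or PERF_GUEST_ACTIVE [| PERF_GUEST_USER]
 *		.get_ip	= my_guest_get_ip,	// guest instruction pointer
 *	};
 *
 *	perf_register_guest_info_callbacks(&my_guest_cbs);
 *	...
 *	perf_unregister_guest_info_callbacks(&my_guest_cbs);
 */
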
extern void perf_event_exec(void);
extern void perf_event_comm(struct task_struct *tsk, bool exec);
extern void perf_event_namespaces(struct task_struct *tsk);
extern void perf_event_fork(struct task_struct *tsk);
extern void perf_event_text_poke(const void *addr,
				 const void *old_bytes, size_t old_len,
				 const void *new_bytes, size_t new_len);

/* Callchains */
DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry);

extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
extern struct perf_callchain_entry *
get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
		   u32 max_stack, bool crosstask, bool add_mark);
extern int get_callchain_buffers(int max_stack);
extern void put_callchain_buffers(void);
extern struct perf_callchain_entry *get_callchain_entry(int *rctx);
extern void put_callchain_entry(int rctx);

extern int sysctl_perf_event_max_stack;
extern int sysctl_perf_event_max_contexts_per_stack;

static inline int perf_callchain_store_context(struct perf_callchain_entry_ctx *ctx, u64 ip)
{
	if (ctx->contexts < sysctl_perf_event_max_contexts_per_stack) {
		struct perf_callchain_entry *entry = ctx->entry;
		entry->ip[entry->nr++] = ip;
		++ctx->contexts;
		return 0;
	} else {
		ctx->contexts_maxed = true;
		return -1; /* no more room, stop walking the stack */
	}
}

static inline int perf_callchain_store(struct perf_callchain_entry_ctx *ctx, u64 ip)
{
	if (ctx->nr < ctx->max_stack && !ctx->contexts_maxed) {
		struct perf_callchain_entry *entry = ctx->entry;
		entry->ip[entry->nr++] = ip;
		++ctx->nr;
		return 0;
	} else {
		return -1; /* no more room, stop walking the stack */
	}
}
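
/*
 * Illustrative sketch (not part of this header): architecture back-ends
 * implement perf_callchain_kernel()/perf_callchain_user() by walking stack
 * frames and feeding each return address to perf_callchain_store(), stopping
 * as soon as it reports that the entry is full. "unwind_next_frame_ip" is a
 * made-up stand-in for the arch-specific unwinder:
 *
 *	void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
 *				   struct pt_regs *regs)
 *	{
 *		unsigned long ip = instruction_pointer(regs);
 *
 *		if (perf_callchain_store(entry, ip))
 *			return;
 *
 *		while (unwind_next_frame_ip(regs, &ip))
 *			if (perf_callchain_store(entry, ip))
 *				return;
 *	}
 */
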
extern int sysctl_perf_event_paranoid;
extern int sysctl_perf_event_mlock;
extern int sysctl_perf_event_sample_rate;
extern int sysctl_perf_cpu_time_max_percent;

extern void perf_sample_event_took(u64 sample_len_ns);

int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write,
				       void *buffer, size_t *lenp, loff_t *ppos);
int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write,
				      void *buffer, size_t *lenp, loff_t *ppos);
int perf_event_max_stack_handler(const struct ctl_table *table, int write,
				 void *buffer, size_t *lenp, loff_t *ppos);

/* Access to perf_event_open(2) syscall. */
#define PERF_SECURITY_OPEN		0

/* Finer grained perf_event_open(2) access control. */
#define PERF_SECURITY_CPU		1
#define PERF_SECURITY_KERNEL		2
#define PERF_SECURITY_TRACEPOINT	3

static inline int perf_is_paranoid(void)
{
	return sysctl_perf_event_paranoid > -1;
}

int perf_allow_kernel(struct perf_event_attr *attr);

static inline int perf_allow_cpu(struct perf_event_attr *attr)
{
	if (sysctl_perf_event_paranoid > 0 && !perfmon_capable())
		return -EACCES;

	return security_perf_event_open(attr, PERF_SECURITY_CPU);
}

static inline int perf_allow_tracepoint(struct perf_event_attr *attr)
{
	if (sysctl_perf_event_paranoid > -1 && !perfmon_capable())
		return -EPERM;

	return security_perf_event_open(attr, PERF_SECURITY_TRACEPOINT);
}

extern void perf_event_init(void);
extern void perf_tp_event(u16 event_type, u64 count, void *record,
			  int entry_size, struct pt_regs *regs,
			  struct hlist_head *head, int rctx,
			  struct task_struct *task);
extern void perf_bp_event(struct perf_event *event, void *data);

#ifndef perf_misc_flags
# define perf_misc_flags(regs) \
		(user_mode(regs) ? PERF_RECORD_MISC_USER : PERF_RECORD_MISC_KERNEL)
# define perf_instruction_pointer(regs)	instruction_pointer(regs)
#endif
#ifndef perf_arch_bpf_user_pt_regs
# define perf_arch_bpf_user_pt_regs(regs) regs
#endif

static inline bool has_branch_stack(struct perf_event *event)
{
	return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK;
}

static inline bool needs_branch_stack(struct perf_event *event)
{
	return event->attr.branch_sample_type != 0;
}

static inline bool has_aux(struct perf_event *event)
{
	return event->pmu->setup_aux;
}

static inline bool is_write_backward(struct perf_event *event)
{
	return !!event->attr.write_backward;
}

static inline bool has_addr_filter(struct perf_event *event)
{
	return event->pmu->nr_addr_filters;
}

/*
 * An inherited event uses parent's filters
 */
static inline struct perf_addr_filters_head *
perf_event_addr_filters(struct perf_event *event)
{
	struct perf_addr_filters_head *ifh = &event->addr_filters;

	if (event->parent)
		ifh = &event->parent->addr_filters;

	return ifh;
}

static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
{
	/* Only the parent has fasync state */
	if (event->parent)
		event = event->parent;
	return &event->fasync;
}

extern void perf_event_addr_filters_sync(struct perf_event *event);
extern void perf_report_aux_output_id(struct perf_event *event, u64 hw_id);

extern int perf_output_begin(struct perf_output_handle *handle,
			     struct perf_sample_data *data,
			     struct perf_event *event, unsigned int size);
extern int perf_output_begin_forward(struct perf_output_handle *handle,
				     struct perf_sample_data *data,
				     struct perf_event *event,
				     unsigned int size);
extern int perf_output_begin_backward(struct perf_output_handle *handle,
				      struct perf_sample_data *data,
				      struct perf_event *event,
				      unsigned int size);

extern void perf_output_end(struct perf_output_handle *handle);
extern unsigned int perf_output_copy(struct perf_output_handle *handle,
				     const void *buf, unsigned int len);
extern unsigned int perf_output_skip(struct perf_output_handle *handle,
				     unsigned int len);
extern long perf_output_copy_aux(struct perf_output_handle *aux_handle,
				 struct perf_output_handle *handle,
				 unsigned long from, unsigned long to);
extern int perf_swevent_get_recursion_context(void);
extern void perf_swevent_put_recursion_context(int rctx);
extern u64 perf_swevent_set_period(struct perf_event *event);
extern void perf_event_enable(struct perf_event *event);
extern void perf_event_disable(struct perf_event *event);
extern void perf_event_disable_local(struct perf_event *event);
extern void perf_event_disable_inatomic(struct perf_event *event);
extern void perf_event_task_tick(void);
extern int perf_event_account_interrupt(struct perf_event *event);
extern int perf_event_period(struct perf_event *event, u64 value);
extern u64 perf_event_pause(struct perf_event *event, bool reset);
#else /* !CONFIG_PERF_EVENTS: */
static inline void *
perf_aux_output_begin(struct perf_output_handle *handle,
		      struct perf_event *event)			{ return NULL; }
static inline void
perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
								{ }
static inline int
perf_aux_output_skip(struct perf_output_handle *handle,
		     unsigned long size)			{ return -EINVAL; }
static inline void *
perf_get_aux(struct perf_output_handle *handle)			{ return NULL; }
static inline void
perf_event_task_migrate(struct task_struct *task)		{ }
static inline void
perf_event_task_sched_in(struct task_struct *prev,
			 struct task_struct *task)		{ }
static inline void
perf_event_task_sched_out(struct task_struct *prev,
			  struct task_struct *next)		{ }
static inline int perf_event_init_task(struct task_struct *child,
				       u64 clone_flags)		{ return 0; }
static inline void perf_event_exit_task(struct task_struct *child)	{ }
static inline void perf_event_free_task(struct task_struct *task)	{ }
static inline void perf_event_delayed_put(struct task_struct *task)	{ }
static inline struct file *perf_event_get(unsigned int fd)	{ return ERR_PTR(-EINVAL); }
static inline const struct perf_event *perf_get_event(struct file *file)
{
	return ERR_PTR(-EINVAL);
}
static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
{
	return ERR_PTR(-EINVAL);
}
static inline int perf_event_read_local(struct perf_event *event, u64 *value,
					u64 *enabled, u64 *running)
{
	return -EINVAL;
}
static inline void perf_event_print_debug(void)			{ }
static inline int perf_event_task_disable(void)			{ return -EINVAL; }
static inline int perf_event_task_enable(void)			{ return -EINVAL; }
static inline int perf_event_refresh(struct perf_event *event, int refresh)
{
	return -EINVAL;
}

static inline void
perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)	{ }
static inline void
perf_bp_event(struct perf_event *event, void *data)			{ }

static inline void perf_event_mmap(struct vm_area_struct *vma)		{ }

typedef int (perf_ksymbol_get_name_f)(char *name, int name_len, void *data);
static inline void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len,
				      bool unregister, const char *sym)	{ }
static inline void perf_event_bpf_event(struct bpf_prog *prog,
					enum perf_bpf_event_type type,
					u16 flags)			{ }
static inline void perf_event_exec(void)				{ }
static inline void perf_event_comm(struct task_struct *tsk, bool exec)	{ }
static inline void perf_event_namespaces(struct task_struct *tsk)	{ }
static inline void perf_event_fork(struct task_struct *tsk)		{ }
static inline void perf_event_text_poke(const void *addr,
					const void *old_bytes,
					size_t old_len,
					const void *new_bytes,
					size_t new_len)			{ }
static inline void perf_event_init(void)				{ }
static inline int  perf_swevent_get_recursion_context(void)		{ return -1; }
static inline void perf_swevent_put_recursion_context(int rctx)		{ }
static inline u64 perf_swevent_set_period(struct perf_event *event)	{ return 0; }
static inline void perf_event_enable(struct perf_event *event)		{ }
static inline void perf_event_disable(struct perf_event *event)		{ }
static inline int __perf_event_disable(void *info)			{ return -1; }
static inline void perf_event_task_tick(void)				{ }
static inline int perf_event_release_kernel(struct perf_event *event)	{ return 0; }
static inline int perf_event_period(struct perf_event *event, u64 value)
{
	return -EINVAL;
}
static inline u64 perf_event_pause(struct perf_event *event, bool reset)
{
	return 0;
}
#endif

#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
extern void perf_restore_debug_store(void);
#else
static inline void perf_restore_debug_store(void)			{ }
#endif

#define perf_output_put(handle, x) perf_output_copy((handle), &(x), sizeof(x))

struct perf_pmu_events_attr {
	struct device_attribute attr;
	u64 id;
	const char *event_str;
};

struct perf_pmu_events_ht_attr {
	struct device_attribute			attr;
	u64					id;
	const char				*event_str_ht;
	const char				*event_str_noht;
};

struct perf_pmu_events_hybrid_attr {
	struct device_attribute			attr;
	u64					id;
	const char				*event_str;
	u64					pmu_type;
};

struct perf_pmu_format_hybrid_attr {
	struct device_attribute			attr;
	u64					pmu_type;
};

ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
			      char *page);

#define PMU_EVENT_ATTR(_name, _var, _id, _show)				\
static struct perf_pmu_events_attr _var = {				\
	.attr = __ATTR(_name, 0444, _show, NULL),			\
	.id   =  _id,							\
};

#define PMU_EVENT_ATTR_STRING(_name, _var, _str)			\
static struct perf_pmu_events_attr _var = {				\
	.attr		= __ATTR(_name, 0444, perf_event_sysfs_show, NULL),	\
	.id		= 0,						\
	.event_str	= _str,						\
};

#define PMU_EVENT_ATTR_ID(_name, _show, _id)				\
	(&((struct perf_pmu_events_attr[]) {				\
		{ .attr = __ATTR(_name, 0444, _show, NULL),		\
		  .id = _id, }						\
	})[0].attr.attr)

#define PMU_FORMAT_ATTR_SHOW(_name, _format)				\
static ssize_t								\
_name##_show(struct device *dev,					\
	     struct device_attribute *attr,				\
	     char *page)						\
{									\
	BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE);			\
	return sprintf(page, _format "\n");				\
}									\

#define PMU_FORMAT_ATTR(_name, _format)					\
	PMU_FORMAT_ATTR_SHOW(_name, _format)				\
									\
static struct device_attribute format_attr_##_name = __ATTR_RO(_name)

/* Performance counter hotplug functions */
#ifdef CONFIG_PERF_EVENTS
int perf_event_init_cpu(unsigned int cpu);
int perf_event_exit_cpu(unsigned int cpu);
#else
#define perf_event_init_cpu	NULL
#define perf_event_exit_cpu	NULL
#endif

extern void arch_perf_update_userpage(struct perf_event *event,
				      struct perf_event_mmap_page *userpg,
				      u64 now);

/*
 * Snapshot branch stack on software events.
 *
 * Branch stack can be very useful in understanding software events. For
 * example, when a long function, e.g. sys_perf_event_open, returns an
 * errno, it is not obvious why the function failed. Branch stack could
 * provide very helpful information in this type of scenario.
 *
 * For software events, it is necessary to stop the hardware branch recorder
 * quickly. Otherwise, the hardware register/buffer will be flushed with
 * entries of the triggering event. Therefore, static call is used to
 * stop the hardware recorder.
 */

/*
 * cnt is the number of entries allocated for entries.
 * Return number of entries copied to @entries.
 */
typedef int (perf_snapshot_branch_stack_t)(struct perf_branch_entry *entries,
					   unsigned int cnt);
DECLARE_STATIC_CALL(perf_snapshot_branch_stack, perf_snapshot_branch_stack_t);

#ifndef PERF_NEEDS_LOPWR_CB
static inline void perf_lopwr_cb(bool mode)
{
}
#endif

#endif /* _LINUX_PERF_EVENT_H */
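
/*
 * Illustrative sketch (not part of this header): an arch PMU driver with a
 * suitable hardware recorder (e.g. LBR) updates the static call above, and
 * in-kernel consumers then snapshot the branch stack at the point of
 * interest like this:
 *
 *	struct perf_branch_entry entries[16];
 *	int cnt;
 *
 *	cnt = static_call(perf_snapshot_branch_stack)(entries, ARRAY_SIZE(entries));
 *	// entries[0..cnt) now hold the most recent branches, youngest first
 */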