Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull perf fixes from Thomas Gleixner:
"This is much bigger than typical fixes, but Peter found a category of
races that spurred more fixes and more debugging enhancements. Work
started before the merge window, but got finished only now.

Aside of that this contains the usual small fixes to perf and tools.
Nothing particularly exciting"

* 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (43 commits)
perf: Remove/simplify lockdep annotation
perf: Synchronously clean up child events
perf: Untangle 'owner' confusion
perf: Add flags argument to perf_remove_from_context()
perf: Clean up sync_child_event()
perf: Robustify event->owner usage and SMP ordering
perf: Fix STATE_EXIT usage
perf: Update locking order
perf: Remove __free_event()
perf/bpf: Convert perf_event_array to use struct file
perf: Fix NULL deref
perf/x86: De-obfuscate code
perf/x86: Fix uninitialized value usage
perf: Fix race in perf_event_exit_task_context()
perf: Fix orphan hole
perf stat: Do not clean event's private stats
perf hists: Fix HISTC_MEM_DCACHELINE width setting
perf annotate browser: Fix behaviour of Shift-Tab with nothing focussed
perf tests: Remove wrong semicolon in while loop in CQM test
perf: Synchronously free aux pages in case of allocation failure
...

+752 -673
+3 -4
arch/x86/kernel/cpu/perf_event_intel.c
··· 1960 1960 1961 1961 static int intel_alt_er(int idx, u64 config) 1962 1962 { 1963 - int alt_idx; 1963 + int alt_idx = idx; 1964 + 1964 1965 if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1)) 1965 1966 return idx; 1966 1967 ··· 2898 2897 return; 2899 2898 2900 2899 if (!(x86_pmu.flags & PMU_FL_NO_HT_SHARING)) { 2901 - void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED]; 2902 - 2903 2900 for_each_cpu(i, topology_sibling_cpumask(cpu)) { 2904 2901 struct intel_shared_regs *pc; 2905 2902 2906 2903 pc = per_cpu(cpu_hw_events, i).shared_regs; 2907 2904 if (pc && pc->core_id == core_id) { 2908 - *onln = cpuc->shared_regs; 2905 + cpuc->kfree_on_online[0] = cpuc->shared_regs; 2909 2906 cpuc->shared_regs = pc; 2910 2907 break; 2911 2908 }
+3
arch/x86/kernel/cpu/perf_event_intel_uncore.c
··· 995 995 case 87: /* Knights Landing */ 996 996 ret = knl_uncore_pci_init(); 997 997 break; 998 + case 94: /* SkyLake */ 999 + ret = skl_uncore_pci_init(); 1000 + break; 998 1001 default: 999 1002 return 0; 1000 1003 }
+1
arch/x86/kernel/cpu/perf_event_intel_uncore.h
··· 336 336 int ivb_uncore_pci_init(void); 337 337 int hsw_uncore_pci_init(void); 338 338 int bdw_uncore_pci_init(void); 339 + int skl_uncore_pci_init(void); 339 340 void snb_uncore_cpu_init(void); 340 341 void nhm_uncore_cpu_init(void); 341 342 int snb_pci2phy_map_init(int devid);
+20
arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
··· 8 8 #define PCI_DEVICE_ID_INTEL_HSW_IMC 0x0c00 9 9 #define PCI_DEVICE_ID_INTEL_HSW_U_IMC 0x0a04 10 10 #define PCI_DEVICE_ID_INTEL_BDW_IMC 0x1604 11 + #define PCI_DEVICE_ID_INTEL_SKL_IMC 0x191f 11 12 12 13 /* SNB event control */ 13 14 #define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff ··· 525 524 { /* end: all zeroes */ }, 526 525 }; 527 526 527 + static const struct pci_device_id skl_uncore_pci_ids[] = { 528 + { /* IMC */ 529 + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_IMC), 530 + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), 531 + }, 532 + { /* end: all zeroes */ }, 533 + }; 534 + 528 535 static struct pci_driver snb_uncore_pci_driver = { 529 536 .name = "snb_uncore", 530 537 .id_table = snb_uncore_pci_ids, ··· 553 544 .id_table = bdw_uncore_pci_ids, 554 545 }; 555 546 547 + static struct pci_driver skl_uncore_pci_driver = { 548 + .name = "skl_uncore", 549 + .id_table = skl_uncore_pci_ids, 550 + }; 551 + 556 552 struct imc_uncore_pci_dev { 557 553 __u32 pci_id; 558 554 struct pci_driver *driver; ··· 572 558 IMC_DEV(HSW_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core Processor */ 573 559 IMC_DEV(HSW_U_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core ULT Mobile Processor */ 574 560 IMC_DEV(BDW_IMC, &bdw_uncore_pci_driver), /* 5th Gen Core U */ 561 + IMC_DEV(SKL_IMC, &skl_uncore_pci_driver), /* 6th Gen Core */ 575 562 { /* end marker */ } 576 563 }; 577 564 ··· 621 606 } 622 607 623 608 int bdw_uncore_pci_init(void) 609 + { 610 + return imc_uncore_pci_init(); 611 + } 612 + 613 + int skl_uncore_pci_init(void) 624 614 { 625 615 return imc_uncore_pci_init(); 626 616 }
+3 -6
include/linux/perf_event.h
··· 634 634 int nr_cgroups; /* cgroup evts */ 635 635 void *task_ctx_data; /* pmu specific data */ 636 636 struct rcu_head rcu_head; 637 - 638 - struct delayed_work orphans_remove; 639 - bool orphans_remove_sched; 640 637 }; 641 638 642 639 /* ··· 726 729 extern void perf_event_exit_task(struct task_struct *child); 727 730 extern void perf_event_free_task(struct task_struct *task); 728 731 extern void perf_event_delayed_put(struct task_struct *task); 729 - extern struct perf_event *perf_event_get(unsigned int fd); 732 + extern struct file *perf_event_get(unsigned int fd); 730 733 extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event); 731 734 extern void perf_event_print_debug(void); 732 735 extern void perf_pmu_disable(struct pmu *pmu); ··· 1041 1044 extern u64 perf_swevent_set_period(struct perf_event *event); 1042 1045 extern void perf_event_enable(struct perf_event *event); 1043 1046 extern void perf_event_disable(struct perf_event *event); 1044 - extern int __perf_event_disable(void *info); 1047 + extern void perf_event_disable_local(struct perf_event *event); 1045 1048 extern void perf_event_task_tick(void); 1046 1049 #else /* !CONFIG_PERF_EVENTS: */ 1047 1050 static inline void * ··· 1067 1070 static inline void perf_event_exit_task(struct task_struct *child) { } 1068 1071 static inline void perf_event_free_task(struct task_struct *task) { } 1069 1072 static inline void perf_event_delayed_put(struct task_struct *task) { } 1070 - static inline struct perf_event *perf_event_get(unsigned int fd) { return ERR_PTR(-EINVAL); } 1073 + static inline struct file *perf_event_get(unsigned int fd) { return ERR_PTR(-EINVAL); } 1071 1074 static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *event) 1072 1075 { 1073 1076 return ERR_PTR(-EINVAL);
+11 -10
kernel/bpf/arraymap.c
··· 291 291 { 292 292 struct perf_event *event; 293 293 const struct perf_event_attr *attr; 294 + struct file *file; 294 295 295 - event = perf_event_get(fd); 296 - if (IS_ERR(event)) 297 - return event; 296 + file = perf_event_get(fd); 297 + if (IS_ERR(file)) 298 + return file; 299 + 300 + event = file->private_data; 298 301 299 302 attr = perf_event_attrs(event); 300 303 if (IS_ERR(attr)) ··· 307 304 goto err; 308 305 309 306 if (attr->type == PERF_TYPE_RAW) 310 - return event; 307 + return file; 311 308 312 309 if (attr->type == PERF_TYPE_HARDWARE) 313 - return event; 310 + return file; 314 311 315 312 if (attr->type == PERF_TYPE_SOFTWARE && 316 313 attr->config == PERF_COUNT_SW_BPF_OUTPUT) 317 - return event; 314 + return file; 318 315 err: 319 - perf_event_release_kernel(event); 316 + fput(file); 320 317 return ERR_PTR(-EINVAL); 321 318 } 322 319 323 320 static void perf_event_fd_array_put_ptr(void *ptr) 324 321 { 325 - struct perf_event *event = ptr; 326 - 327 - perf_event_release_kernel(event); 322 + fput((struct file *)ptr); 328 323 } 329 324 330 325 static const struct bpf_map_ops perf_event_array_ops = {
+605 -606
kernel/events/core.c
··· 49 49 50 50 #include <asm/irq_regs.h> 51 51 52 - static struct workqueue_struct *perf_wq; 53 - 54 52 typedef int (*remote_function_f)(void *); 55 53 56 54 struct remote_function_call { ··· 124 126 return data.ret; 125 127 } 126 128 127 - static void event_function_call(struct perf_event *event, 128 - int (*active)(void *), 129 - void (*inactive)(void *), 130 - void *data) 129 + static inline struct perf_cpu_context * 130 + __get_cpu_context(struct perf_event_context *ctx) 131 + { 132 + return this_cpu_ptr(ctx->pmu->pmu_cpu_context); 133 + } 134 + 135 + static void perf_ctx_lock(struct perf_cpu_context *cpuctx, 136 + struct perf_event_context *ctx) 137 + { 138 + raw_spin_lock(&cpuctx->ctx.lock); 139 + if (ctx) 140 + raw_spin_lock(&ctx->lock); 141 + } 142 + 143 + static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, 144 + struct perf_event_context *ctx) 145 + { 146 + if (ctx) 147 + raw_spin_unlock(&ctx->lock); 148 + raw_spin_unlock(&cpuctx->ctx.lock); 149 + } 150 + 151 + #define TASK_TOMBSTONE ((void *)-1L) 152 + 153 + static bool is_kernel_event(struct perf_event *event) 154 + { 155 + return READ_ONCE(event->owner) == TASK_TOMBSTONE; 156 + } 157 + 158 + /* 159 + * On task ctx scheduling... 160 + * 161 + * When !ctx->nr_events a task context will not be scheduled. This means 162 + * we can disable the scheduler hooks (for performance) without leaving 163 + * pending task ctx state. 164 + * 165 + * This however results in two special cases: 166 + * 167 + * - removing the last event from a task ctx; this is relatively straight 168 + * forward and is done in __perf_remove_from_context. 169 + * 170 + * - adding the first event to a task ctx; this is tricky because we cannot 171 + * rely on ctx->is_active and therefore cannot use event_function_call(). 172 + * See perf_install_in_context(). 173 + * 174 + * This is because we need a ctx->lock serialized variable (ctx->is_active) 175 + * to reliably determine if a particular task/context is scheduled in. 
The 176 + * task_curr() use in task_function_call() is racy in that a remote context 177 + * switch is not a single atomic operation. 178 + * 179 + * As is, the situation is 'safe' because we set rq->curr before we do the 180 + * actual context switch. This means that task_curr() will fail early, but 181 + * we'll continue spinning on ctx->is_active until we've passed 182 + * perf_event_task_sched_out(). 183 + * 184 + * Without this ctx->lock serialized variable we could have race where we find 185 + * the task (and hence the context) would not be active while in fact they are. 186 + * 187 + * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set. 188 + */ 189 + 190 + typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *, 191 + struct perf_event_context *, void *); 192 + 193 + struct event_function_struct { 194 + struct perf_event *event; 195 + event_f func; 196 + void *data; 197 + }; 198 + 199 + static int event_function(void *info) 200 + { 201 + struct event_function_struct *efs = info; 202 + struct perf_event *event = efs->event; 203 + struct perf_event_context *ctx = event->ctx; 204 + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 205 + struct perf_event_context *task_ctx = cpuctx->task_ctx; 206 + int ret = 0; 207 + 208 + WARN_ON_ONCE(!irqs_disabled()); 209 + 210 + perf_ctx_lock(cpuctx, task_ctx); 211 + /* 212 + * Since we do the IPI call without holding ctx->lock things can have 213 + * changed, double check we hit the task we set out to hit. 214 + */ 215 + if (ctx->task) { 216 + if (ctx->task != current) { 217 + ret = -EAGAIN; 218 + goto unlock; 219 + } 220 + 221 + /* 222 + * We only use event_function_call() on established contexts, 223 + * and event_function() is only ever called when active (or 224 + * rather, we'll have bailed in task_function_call() or the 225 + * above ctx->task != current test), therefore we must have 226 + * ctx->is_active here. 
227 + */ 228 + WARN_ON_ONCE(!ctx->is_active); 229 + /* 230 + * And since we have ctx->is_active, cpuctx->task_ctx must 231 + * match. 232 + */ 233 + WARN_ON_ONCE(task_ctx != ctx); 234 + } else { 235 + WARN_ON_ONCE(&cpuctx->ctx != ctx); 236 + } 237 + 238 + efs->func(event, cpuctx, ctx, efs->data); 239 + unlock: 240 + perf_ctx_unlock(cpuctx, task_ctx); 241 + 242 + return ret; 243 + } 244 + 245 + static void event_function_local(struct perf_event *event, event_f func, void *data) 246 + { 247 + struct event_function_struct efs = { 248 + .event = event, 249 + .func = func, 250 + .data = data, 251 + }; 252 + 253 + int ret = event_function(&efs); 254 + WARN_ON_ONCE(ret); 255 + } 256 + 257 + static void event_function_call(struct perf_event *event, event_f func, void *data) 131 258 { 132 259 struct perf_event_context *ctx = event->ctx; 133 - struct task_struct *task = ctx->task; 260 + struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */ 261 + struct event_function_struct efs = { 262 + .event = event, 263 + .func = func, 264 + .data = data, 265 + }; 266 + 267 + if (!event->parent) { 268 + /* 269 + * If this is a !child event, we must hold ctx::mutex to 270 + * stabilize the the event->ctx relation. See 271 + * perf_event_ctx_lock(). 272 + */ 273 + lockdep_assert_held(&ctx->mutex); 274 + } 134 275 135 276 if (!task) { 136 - cpu_function_call(event->cpu, active, data); 277 + cpu_function_call(event->cpu, event_function, &efs); 137 278 return; 138 279 } 139 280 140 281 again: 141 - if (!task_function_call(task, active, data)) 282 + if (task == TASK_TOMBSTONE) 283 + return; 284 + 285 + if (!task_function_call(task, event_function, &efs)) 142 286 return; 143 287 144 288 raw_spin_lock_irq(&ctx->lock); 145 - if (ctx->is_active) { 146 - /* 147 - * Reload the task pointer, it might have been changed by 148 - * a concurrent perf_event_context_sched_out(). 
149 - */ 150 - task = ctx->task; 151 - raw_spin_unlock_irq(&ctx->lock); 152 - goto again; 289 + /* 290 + * Reload the task pointer, it might have been changed by 291 + * a concurrent perf_event_context_sched_out(). 292 + */ 293 + task = ctx->task; 294 + if (task != TASK_TOMBSTONE) { 295 + if (ctx->is_active) { 296 + raw_spin_unlock_irq(&ctx->lock); 297 + goto again; 298 + } 299 + func(event, NULL, ctx, data); 153 300 } 154 - inactive(data); 155 301 raw_spin_unlock_irq(&ctx->lock); 156 - } 157 - 158 - #define EVENT_OWNER_KERNEL ((void *) -1) 159 - 160 - static bool is_kernel_event(struct perf_event *event) 161 - { 162 - return event->owner == EVENT_OWNER_KERNEL; 163 302 } 164 303 165 304 #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ ··· 503 368 return event->clock(); 504 369 } 505 370 506 - static inline struct perf_cpu_context * 507 - __get_cpu_context(struct perf_event_context *ctx) 508 - { 509 - return this_cpu_ptr(ctx->pmu->pmu_cpu_context); 510 - } 511 - 512 - static void perf_ctx_lock(struct perf_cpu_context *cpuctx, 513 - struct perf_event_context *ctx) 514 - { 515 - raw_spin_lock(&cpuctx->ctx.lock); 516 - if (ctx) 517 - raw_spin_lock(&ctx->lock); 518 - } 519 - 520 - static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, 521 - struct perf_event_context *ctx) 522 - { 523 - if (ctx) 524 - raw_spin_unlock(&ctx->lock); 525 - raw_spin_unlock(&cpuctx->ctx.lock); 526 - } 527 - 528 371 #ifdef CONFIG_CGROUP_PERF 529 372 530 373 static inline bool ··· 692 579 * we are holding the rcu lock 693 580 */ 694 581 cgrp1 = perf_cgroup_from_task(task, NULL); 695 - 696 - /* 697 - * next is NULL when called from perf_event_enable_on_exec() 698 - * that will systematically cause a cgroup_switch() 699 - */ 700 - if (next) 701 - cgrp2 = perf_cgroup_from_task(next, NULL); 582 + cgrp2 = perf_cgroup_from_task(next, NULL); 702 583 703 584 /* 704 585 * only schedule out current cgroup events if we know ··· 718 611 * we are holding the rcu lock 719 612 */ 720 613 cgrp1 = 
perf_cgroup_from_task(task, NULL); 721 - 722 - /* prev can never be NULL */ 723 614 cgrp2 = perf_cgroup_from_task(prev, NULL); 724 615 725 616 /* ··· 1022 917 if (atomic_dec_and_test(&ctx->refcount)) { 1023 918 if (ctx->parent_ctx) 1024 919 put_ctx(ctx->parent_ctx); 1025 - if (ctx->task) 920 + if (ctx->task && ctx->task != TASK_TOMBSTONE) 1026 921 put_task_struct(ctx->task); 1027 922 call_rcu(&ctx->rcu_head, free_ctx); 1028 923 } ··· 1039 934 * perf_event_context::mutex nests and those are: 1040 935 * 1041 936 * - perf_event_exit_task_context() [ child , 0 ] 1042 - * __perf_event_exit_task() 1043 - * sync_child_event() 1044 - * put_event() [ parent, 1 ] 937 + * perf_event_exit_event() 938 + * put_event() [ parent, 1 ] 1045 939 * 1046 940 * - perf_event_init_context() [ parent, 0 ] 1047 941 * inherit_task_group() ··· 1083 979 * Lock order: 1084 980 * task_struct::perf_event_mutex 1085 981 * perf_event_context::mutex 1086 - * perf_event_context::lock 1087 982 * perf_event::child_mutex; 983 + * perf_event_context::lock 1088 984 * perf_event::mmap_mutex 1089 985 * mmap_sem 1090 986 */ ··· 1182 1078 1183 1079 /* 1184 1080 * Get the perf_event_context for a task and lock it. 1081 + * 1185 1082 * This has to cope with with the fact that until it is locked, 1186 1083 * the context could get moved to another task. 
1187 1084 */ ··· 1223 1118 goto retry; 1224 1119 } 1225 1120 1226 - if (!atomic_inc_not_zero(&ctx->refcount)) { 1121 + if (ctx->task == TASK_TOMBSTONE || 1122 + !atomic_inc_not_zero(&ctx->refcount)) { 1227 1123 raw_spin_unlock(&ctx->lock); 1228 1124 ctx = NULL; 1125 + } else { 1126 + WARN_ON_ONCE(ctx->task != task); 1229 1127 } 1230 1128 } 1231 1129 rcu_read_unlock(); ··· 1354 1246 static void 1355 1247 list_add_event(struct perf_event *event, struct perf_event_context *ctx) 1356 1248 { 1249 + lockdep_assert_held(&ctx->lock); 1250 + 1357 1251 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); 1358 1252 event->attach_state |= PERF_ATTACH_CONTEXT; 1359 1253 ··· 1558 1448 1559 1449 if (is_cgroup_event(event)) { 1560 1450 ctx->nr_cgroups--; 1451 + /* 1452 + * Because cgroup events are always per-cpu events, this will 1453 + * always be called from the right CPU. 1454 + */ 1561 1455 cpuctx = __get_cpu_context(ctx); 1562 1456 /* 1563 - * if there are no more cgroup events 1564 - * then cler cgrp to avoid stale pointer 1565 - * in update_cgrp_time_from_cpuctx() 1457 + * If there are no more cgroup events then clear cgrp to avoid 1458 + * stale pointer in update_cgrp_time_from_cpuctx(). 1566 1459 */ 1567 1460 if (!ctx->nr_cgroups) 1568 1461 cpuctx->cgrp = NULL; ··· 1643 1530 perf_event__header_size(tmp); 1644 1531 } 1645 1532 1646 - /* 1647 - * User event without the task. 1648 - */ 1649 1533 static bool is_orphaned_event(struct perf_event *event) 1650 1534 { 1651 - return event && !is_kernel_event(event) && !event->owner; 1535 + return event->state == PERF_EVENT_STATE_EXIT; 1652 1536 } 1653 - 1654 - /* 1655 - * Event has a parent but parent's task finished and it's 1656 - * alive only because of children holding refference. 
1657 - */ 1658 - static bool is_orphaned_child(struct perf_event *event) 1659 - { 1660 - return is_orphaned_event(event->parent); 1661 - } 1662 - 1663 - static void orphans_remove_work(struct work_struct *work); 1664 - 1665 - static void schedule_orphans_remove(struct perf_event_context *ctx) 1666 - { 1667 - if (!ctx->task || ctx->orphans_remove_sched || !perf_wq) 1668 - return; 1669 - 1670 - if (queue_delayed_work(perf_wq, &ctx->orphans_remove, 1)) { 1671 - get_ctx(ctx); 1672 - ctx->orphans_remove_sched = true; 1673 - } 1674 - } 1675 - 1676 - static int __init perf_workqueue_init(void) 1677 - { 1678 - perf_wq = create_singlethread_workqueue("perf"); 1679 - WARN(!perf_wq, "failed to create perf workqueue\n"); 1680 - return perf_wq ? 0 : -1; 1681 - } 1682 - 1683 - core_initcall(perf_workqueue_init); 1684 1537 1685 1538 static inline int pmu_filter_match(struct perf_event *event) 1686 1539 { ··· 1708 1629 if (event->attr.exclusive || !cpuctx->active_oncpu) 1709 1630 cpuctx->exclusive = 0; 1710 1631 1711 - if (is_orphaned_child(event)) 1712 - schedule_orphans_remove(ctx); 1713 - 1714 1632 perf_pmu_enable(event->pmu); 1715 1633 } 1716 1634 ··· 1731 1655 cpuctx->exclusive = 0; 1732 1656 } 1733 1657 1734 - struct remove_event { 1735 - struct perf_event *event; 1736 - bool detach_group; 1737 - }; 1738 - 1739 - static void ___perf_remove_from_context(void *info) 1740 - { 1741 - struct remove_event *re = info; 1742 - struct perf_event *event = re->event; 1743 - struct perf_event_context *ctx = event->ctx; 1744 - 1745 - if (re->detach_group) 1746 - perf_group_detach(event); 1747 - list_del_event(event, ctx); 1748 - } 1658 + #define DETACH_GROUP 0x01UL 1659 + #define DETACH_STATE 0x02UL 1749 1660 1750 1661 /* 1751 1662 * Cross CPU call to remove a performance event ··· 1740 1677 * We disable the event on the hardware level first. After that we 1741 1678 * remove it from the context list. 
1742 1679 */ 1743 - static int __perf_remove_from_context(void *info) 1680 + static void 1681 + __perf_remove_from_context(struct perf_event *event, 1682 + struct perf_cpu_context *cpuctx, 1683 + struct perf_event_context *ctx, 1684 + void *info) 1744 1685 { 1745 - struct remove_event *re = info; 1746 - struct perf_event *event = re->event; 1747 - struct perf_event_context *ctx = event->ctx; 1748 - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 1686 + unsigned long flags = (unsigned long)info; 1749 1687 1750 - raw_spin_lock(&ctx->lock); 1751 1688 event_sched_out(event, cpuctx, ctx); 1752 - if (re->detach_group) 1689 + if (flags & DETACH_GROUP) 1753 1690 perf_group_detach(event); 1754 1691 list_del_event(event, ctx); 1755 - if (!ctx->nr_events && cpuctx->task_ctx == ctx) { 1756 - ctx->is_active = 0; 1757 - cpuctx->task_ctx = NULL; 1758 - } 1759 - raw_spin_unlock(&ctx->lock); 1692 + if (flags & DETACH_STATE) 1693 + event->state = PERF_EVENT_STATE_EXIT; 1760 1694 1761 - return 0; 1695 + if (!ctx->nr_events && ctx->is_active) { 1696 + ctx->is_active = 0; 1697 + if (ctx->task) { 1698 + WARN_ON_ONCE(cpuctx->task_ctx != ctx); 1699 + cpuctx->task_ctx = NULL; 1700 + } 1701 + } 1762 1702 } 1763 1703 1764 1704 /* 1765 1705 * Remove the event from a task's (or a CPU's) list of events. 1766 - * 1767 - * CPU events are removed with a smp call. For task events we only 1768 - * call when the task is on a CPU. 1769 1706 * 1770 1707 * If event->ctx is a cloned context, callers must make sure that 1771 1708 * every task struct that event->ctx->task could possibly point to ··· 1774 1711 * When called from perf_event_exit_task, it's OK because the 1775 1712 * context has been detached from its task. 
1776 1713 */ 1777 - static void perf_remove_from_context(struct perf_event *event, bool detach_group) 1714 + static void perf_remove_from_context(struct perf_event *event, unsigned long flags) 1778 1715 { 1779 - struct perf_event_context *ctx = event->ctx; 1780 - struct remove_event re = { 1781 - .event = event, 1782 - .detach_group = detach_group, 1783 - }; 1716 + lockdep_assert_held(&event->ctx->mutex); 1784 1717 1785 - lockdep_assert_held(&ctx->mutex); 1786 - 1787 - event_function_call(event, __perf_remove_from_context, 1788 - ___perf_remove_from_context, &re); 1718 + event_function_call(event, __perf_remove_from_context, (void *)flags); 1789 1719 } 1790 1720 1791 1721 /* 1792 1722 * Cross CPU call to disable a performance event 1793 1723 */ 1794 - int __perf_event_disable(void *info) 1724 + static void __perf_event_disable(struct perf_event *event, 1725 + struct perf_cpu_context *cpuctx, 1726 + struct perf_event_context *ctx, 1727 + void *info) 1795 1728 { 1796 - struct perf_event *event = info; 1797 - struct perf_event_context *ctx = event->ctx; 1798 - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 1729 + if (event->state < PERF_EVENT_STATE_INACTIVE) 1730 + return; 1799 1731 1800 - /* 1801 - * If this is a per-task event, need to check whether this 1802 - * event's task is the current task on this cpu. 1803 - * 1804 - * Can trigger due to concurrent perf_event_context_sched_out() 1805 - * flipping contexts around. 1806 - */ 1807 - if (ctx->task && cpuctx->task_ctx != ctx) 1808 - return -EINVAL; 1809 - 1810 - raw_spin_lock(&ctx->lock); 1811 - 1812 - /* 1813 - * If the event is on, turn it off. 1814 - * If it is in error state, leave it in error state. 
1815 - */ 1816 - if (event->state >= PERF_EVENT_STATE_INACTIVE) { 1817 - update_context_time(ctx); 1818 - update_cgrp_time_from_event(event); 1819 - update_group_times(event); 1820 - if (event == event->group_leader) 1821 - group_sched_out(event, cpuctx, ctx); 1822 - else 1823 - event_sched_out(event, cpuctx, ctx); 1824 - event->state = PERF_EVENT_STATE_OFF; 1825 - } 1826 - 1827 - raw_spin_unlock(&ctx->lock); 1828 - 1829 - return 0; 1830 - } 1831 - 1832 - void ___perf_event_disable(void *info) 1833 - { 1834 - struct perf_event *event = info; 1835 - 1836 - /* 1837 - * Since we have the lock this context can't be scheduled 1838 - * in, so we can change the state safely. 1839 - */ 1840 - if (event->state == PERF_EVENT_STATE_INACTIVE) { 1841 - update_group_times(event); 1842 - event->state = PERF_EVENT_STATE_OFF; 1843 - } 1732 + update_context_time(ctx); 1733 + update_cgrp_time_from_event(event); 1734 + update_group_times(event); 1735 + if (event == event->group_leader) 1736 + group_sched_out(event, cpuctx, ctx); 1737 + else 1738 + event_sched_out(event, cpuctx, ctx); 1739 + event->state = PERF_EVENT_STATE_OFF; 1844 1740 } 1845 1741 1846 1742 /* ··· 1810 1788 * remains valid. This condition is satisifed when called through 1811 1789 * perf_event_for_each_child or perf_event_for_each because they 1812 1790 * hold the top-level event's child_mutex, so any descendant that 1813 - * goes to exit will block in sync_child_event. 1791 + * goes to exit will block in perf_event_exit_event(). 1792 + * 1814 1793 * When called from perf_pending_event it's OK because event->ctx 1815 1794 * is the current context on this CPU and preemption is disabled, 1816 1795 * hence we can't get into perf_event_task_sched_out for this context. 
··· 1827 1804 } 1828 1805 raw_spin_unlock_irq(&ctx->lock); 1829 1806 1830 - event_function_call(event, __perf_event_disable, 1831 - ___perf_event_disable, event); 1807 + event_function_call(event, __perf_event_disable, NULL); 1808 + } 1809 + 1810 + void perf_event_disable_local(struct perf_event *event) 1811 + { 1812 + event_function_local(event, __perf_event_disable, NULL); 1832 1813 } 1833 1814 1834 1815 /* ··· 1944 1917 1945 1918 if (event->attr.exclusive) 1946 1919 cpuctx->exclusive = 1; 1947 - 1948 - if (is_orphaned_child(event)) 1949 - schedule_orphans_remove(ctx); 1950 1920 1951 1921 out: 1952 1922 perf_pmu_enable(event->pmu); ··· 2063 2039 event->tstamp_stopped = tstamp; 2064 2040 } 2065 2041 2066 - static void task_ctx_sched_out(struct perf_event_context *ctx); 2042 + static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, 2043 + struct perf_event_context *ctx); 2067 2044 static void 2068 2045 ctx_sched_in(struct perf_event_context *ctx, 2069 2046 struct perf_cpu_context *cpuctx, ··· 2083 2058 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); 2084 2059 } 2085 2060 2086 - static void ___perf_install_in_context(void *info) 2061 + static void ctx_resched(struct perf_cpu_context *cpuctx, 2062 + struct perf_event_context *task_ctx) 2087 2063 { 2088 - struct perf_event *event = info; 2089 - struct perf_event_context *ctx = event->ctx; 2090 - 2091 - /* 2092 - * Since the task isn't running, its safe to add the event, us holding 2093 - * the ctx->lock ensures the task won't get scheduled in. 
2094 - */ 2095 - add_event_to_ctx(event, ctx); 2064 + perf_pmu_disable(cpuctx->ctx.pmu); 2065 + if (task_ctx) 2066 + task_ctx_sched_out(cpuctx, task_ctx); 2067 + cpu_ctx_sched_out(cpuctx, EVENT_ALL); 2068 + perf_event_sched_in(cpuctx, task_ctx, current); 2069 + perf_pmu_enable(cpuctx->ctx.pmu); 2096 2070 } 2097 2071 2098 2072 /* ··· 2101 2077 */ 2102 2078 static int __perf_install_in_context(void *info) 2103 2079 { 2104 - struct perf_event *event = info; 2105 - struct perf_event_context *ctx = event->ctx; 2080 + struct perf_event_context *ctx = info; 2106 2081 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 2107 2082 struct perf_event_context *task_ctx = cpuctx->task_ctx; 2108 - struct task_struct *task = current; 2109 2083 2110 - perf_ctx_lock(cpuctx, task_ctx); 2111 - perf_pmu_disable(cpuctx->ctx.pmu); 2112 - 2113 - /* 2114 - * If there was an active task_ctx schedule it out. 2115 - */ 2116 - if (task_ctx) 2117 - task_ctx_sched_out(task_ctx); 2118 - 2119 - /* 2120 - * If the context we're installing events in is not the 2121 - * active task_ctx, flip them. 2122 - */ 2123 - if (ctx->task && task_ctx != ctx) { 2124 - if (task_ctx) 2125 - raw_spin_unlock(&task_ctx->lock); 2084 + raw_spin_lock(&cpuctx->ctx.lock); 2085 + if (ctx->task) { 2126 2086 raw_spin_lock(&ctx->lock); 2087 + /* 2088 + * If we hit the 'wrong' task, we've since scheduled and 2089 + * everything should be sorted, nothing to do! 2090 + */ 2127 2091 task_ctx = ctx; 2092 + if (ctx->task != current) 2093 + goto unlock; 2094 + 2095 + /* 2096 + * If task_ctx is set, it had better be to us. 
2097 + */ 2098 + WARN_ON_ONCE(cpuctx->task_ctx != ctx && cpuctx->task_ctx); 2099 + } else if (task_ctx) { 2100 + raw_spin_lock(&task_ctx->lock); 2128 2101 } 2129 2102 2130 - if (task_ctx) { 2131 - cpuctx->task_ctx = task_ctx; 2132 - task = task_ctx->task; 2133 - } 2134 - 2135 - cpu_ctx_sched_out(cpuctx, EVENT_ALL); 2136 - 2137 - update_context_time(ctx); 2138 - /* 2139 - * update cgrp time only if current cgrp 2140 - * matches event->cgrp. Must be done before 2141 - * calling add_event_to_ctx() 2142 - */ 2143 - update_cgrp_time_from_event(event); 2144 - 2145 - add_event_to_ctx(event, ctx); 2146 - 2147 - /* 2148 - * Schedule everything back in 2149 - */ 2150 - perf_event_sched_in(cpuctx, task_ctx, task); 2151 - 2152 - perf_pmu_enable(cpuctx->ctx.pmu); 2103 + ctx_resched(cpuctx, task_ctx); 2104 + unlock: 2153 2105 perf_ctx_unlock(cpuctx, task_ctx); 2154 2106 2155 2107 return 0; ··· 2133 2133 2134 2134 /* 2135 2135 * Attach a performance event to a context 2136 - * 2137 - * First we add the event to the list with the hardware enable bit 2138 - * in event->hw_config cleared. 2139 - * 2140 - * If the event is attached to a task which is on a CPU we use a smp 2141 - * call to enable it in the task context. The task might have been 2142 - * scheduled away, but we check this in the smp call again. 2143 2136 */ 2144 2137 static void 2145 2138 perf_install_in_context(struct perf_event_context *ctx, 2146 2139 struct perf_event *event, 2147 2140 int cpu) 2148 2141 { 2142 + struct task_struct *task = NULL; 2143 + 2149 2144 lockdep_assert_held(&ctx->mutex); 2150 2145 2151 2146 event->ctx = ctx; 2152 2147 if (event->cpu != -1) 2153 2148 event->cpu = cpu; 2154 2149 2155 - event_function_call(event, __perf_install_in_context, 2156 - ___perf_install_in_context, event); 2150 + /* 2151 + * Installing events is tricky because we cannot rely on ctx->is_active 2152 + * to be set in case this is the nr_events 0 -> 1 transition. 
2153 + * 2154 + * So what we do is we add the event to the list here, which will allow 2155 + * a future context switch to DTRT and then send a racy IPI. If the IPI 2156 + * fails to hit the right task, this means a context switch must have 2157 + * happened and that will have taken care of business. 2158 + */ 2159 + raw_spin_lock_irq(&ctx->lock); 2160 + task = ctx->task; 2161 + /* 2162 + * Worse, we cannot even rely on the ctx actually existing anymore. If 2163 + * between find_get_context() and perf_install_in_context() the task 2164 + * went through perf_event_exit_task() its dead and we should not be 2165 + * adding new events. 2166 + */ 2167 + if (task == TASK_TOMBSTONE) { 2168 + raw_spin_unlock_irq(&ctx->lock); 2169 + return; 2170 + } 2171 + update_context_time(ctx); 2172 + /* 2173 + * Update cgrp time only if current cgrp matches event->cgrp. 2174 + * Must be done before calling add_event_to_ctx(). 2175 + */ 2176 + update_cgrp_time_from_event(event); 2177 + add_event_to_ctx(event, ctx); 2178 + raw_spin_unlock_irq(&ctx->lock); 2179 + 2180 + if (task) 2181 + task_function_call(task, __perf_install_in_context, ctx); 2182 + else 2183 + cpu_function_call(cpu, __perf_install_in_context, ctx); 2157 2184 } 2158 2185 2159 2186 /* ··· 2207 2180 /* 2208 2181 * Cross CPU call to enable a performance event 2209 2182 */ 2210 - static int __perf_event_enable(void *info) 2183 + static void __perf_event_enable(struct perf_event *event, 2184 + struct perf_cpu_context *cpuctx, 2185 + struct perf_event_context *ctx, 2186 + void *info) 2211 2187 { 2212 - struct perf_event *event = info; 2213 - struct perf_event_context *ctx = event->ctx; 2214 2188 struct perf_event *leader = event->group_leader; 2215 - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 2216 - int err; 2189 + struct perf_event_context *task_ctx; 2217 2190 2218 - /* 2219 - * There's a time window between 'ctx->is_active' check 2220 - * in perf_event_enable function and this place having: 2221 - * - IRQs on 
2222 - * - ctx->lock unlocked 2223 - * 2224 - * where the task could be killed and 'ctx' deactivated 2225 - * by perf_event_exit_task. 2226 - */ 2227 - if (!ctx->is_active) 2228 - return -EINVAL; 2191 + if (event->state >= PERF_EVENT_STATE_INACTIVE || 2192 + event->state <= PERF_EVENT_STATE_ERROR) 2193 + return; 2229 2194 2230 - raw_spin_lock(&ctx->lock); 2231 2195 update_context_time(ctx); 2232 - 2233 - if (event->state >= PERF_EVENT_STATE_INACTIVE) 2234 - goto unlock; 2235 - 2236 - /* 2237 - * set current task's cgroup time reference point 2238 - */ 2239 - perf_cgroup_set_timestamp(current, ctx); 2240 - 2241 2196 __perf_event_mark_enabled(event); 2242 2197 2198 + if (!ctx->is_active) 2199 + return; 2200 + 2243 2201 if (!event_filter_match(event)) { 2244 - if (is_cgroup_event(event)) 2202 + if (is_cgroup_event(event)) { 2203 + perf_cgroup_set_timestamp(current, ctx); // XXX ? 2245 2204 perf_cgroup_defer_enabled(event); 2246 - goto unlock; 2205 + } 2206 + return; 2247 2207 } 2248 2208 2249 2209 /* ··· 2238 2224 * then don't put it on unless the group is on. 2239 2225 */ 2240 2226 if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) 2241 - goto unlock; 2227 + return; 2242 2228 2243 - if (!group_can_go_on(event, cpuctx, 1)) { 2244 - err = -EEXIST; 2245 - } else { 2246 - if (event == leader) 2247 - err = group_sched_in(event, cpuctx, ctx); 2248 - else 2249 - err = event_sched_in(event, cpuctx, ctx); 2250 - } 2229 + task_ctx = cpuctx->task_ctx; 2230 + if (ctx->task) 2231 + WARN_ON_ONCE(task_ctx != ctx); 2251 2232 2252 - if (err) { 2253 - /* 2254 - * If this event can't go on and it's part of a 2255 - * group, then the whole group has to come off. 
2256 - */ 2257 - if (leader != event) { 2258 - group_sched_out(leader, cpuctx, ctx); 2259 - perf_mux_hrtimer_restart(cpuctx); 2260 - } 2261 - if (leader->attr.pinned) { 2262 - update_group_times(leader); 2263 - leader->state = PERF_EVENT_STATE_ERROR; 2264 - } 2265 - } 2266 - 2267 - unlock: 2268 - raw_spin_unlock(&ctx->lock); 2269 - 2270 - return 0; 2271 - } 2272 - 2273 - void ___perf_event_enable(void *info) 2274 - { 2275 - __perf_event_mark_enabled((struct perf_event *)info); 2233 + ctx_resched(cpuctx, task_ctx); 2276 2234 } 2277 2235 2278 2236 /* ··· 2261 2275 struct perf_event_context *ctx = event->ctx; 2262 2276 2263 2277 raw_spin_lock_irq(&ctx->lock); 2264 - if (event->state >= PERF_EVENT_STATE_INACTIVE) { 2278 + if (event->state >= PERF_EVENT_STATE_INACTIVE || 2279 + event->state < PERF_EVENT_STATE_ERROR) { 2265 2280 raw_spin_unlock_irq(&ctx->lock); 2266 2281 return; 2267 2282 } ··· 2278 2291 event->state = PERF_EVENT_STATE_OFF; 2279 2292 raw_spin_unlock_irq(&ctx->lock); 2280 2293 2281 - event_function_call(event, __perf_event_enable, 2282 - ___perf_event_enable, event); 2294 + event_function_call(event, __perf_event_enable, NULL); 2283 2295 } 2284 2296 2285 2297 /* ··· 2328 2342 struct perf_cpu_context *cpuctx, 2329 2343 enum event_type_t event_type) 2330 2344 { 2331 - struct perf_event *event; 2332 2345 int is_active = ctx->is_active; 2346 + struct perf_event *event; 2347 + 2348 + lockdep_assert_held(&ctx->lock); 2349 + 2350 + if (likely(!ctx->nr_events)) { 2351 + /* 2352 + * See __perf_remove_from_context(). 
2353 + */ 2354 + WARN_ON_ONCE(ctx->is_active); 2355 + if (ctx->task) 2356 + WARN_ON_ONCE(cpuctx->task_ctx); 2357 + return; 2358 + } 2333 2359 2334 2360 ctx->is_active &= ~event_type; 2335 - if (likely(!ctx->nr_events)) 2336 - return; 2361 + if (ctx->task) { 2362 + WARN_ON_ONCE(cpuctx->task_ctx != ctx); 2363 + if (!ctx->is_active) 2364 + cpuctx->task_ctx = NULL; 2365 + } 2337 2366 2338 2367 update_context_time(ctx); 2339 2368 update_cgrp_time_from_cpuctx(cpuctx); ··· 2519 2518 raw_spin_lock(&ctx->lock); 2520 2519 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); 2521 2520 if (context_equiv(ctx, next_ctx)) { 2522 - /* 2523 - * XXX do we need a memory barrier of sorts 2524 - * wrt to rcu_dereference() of perf_event_ctxp 2525 - */ 2526 - task->perf_event_ctxp[ctxn] = next_ctx; 2527 - next->perf_event_ctxp[ctxn] = ctx; 2528 - ctx->task = next; 2529 - next_ctx->task = task; 2521 + WRITE_ONCE(ctx->task, next); 2522 + WRITE_ONCE(next_ctx->task, task); 2530 2523 2531 2524 swap(ctx->task_ctx_data, next_ctx->task_ctx_data); 2525 + 2526 + /* 2527 + * RCU_INIT_POINTER here is safe because we've not 2528 + * modified the ctx and the above modification of 2529 + * ctx->task and ctx->task_ctx_data are immaterial 2530 + * since those values are always verified under 2531 + * ctx->lock which we're now holding. 
2532 + */ 2533 + RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx); 2534 + RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx); 2532 2535 2533 2536 do_switch = 0; 2534 2537 ··· 2546 2541 2547 2542 if (do_switch) { 2548 2543 raw_spin_lock(&ctx->lock); 2549 - ctx_sched_out(ctx, cpuctx, EVENT_ALL); 2550 - cpuctx->task_ctx = NULL; 2544 + task_ctx_sched_out(cpuctx, ctx); 2551 2545 raw_spin_unlock(&ctx->lock); 2552 2546 } 2553 2547 } ··· 2641 2637 perf_cgroup_sched_out(task, next); 2642 2638 } 2643 2639 2644 - static void task_ctx_sched_out(struct perf_event_context *ctx) 2640 + static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, 2641 + struct perf_event_context *ctx) 2645 2642 { 2646 - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 2647 - 2648 2643 if (!cpuctx->task_ctx) 2649 2644 return; 2650 2645 ··· 2651 2648 return; 2652 2649 2653 2650 ctx_sched_out(ctx, cpuctx, EVENT_ALL); 2654 - cpuctx->task_ctx = NULL; 2655 2651 } 2656 2652 2657 2653 /* ··· 2727 2725 enum event_type_t event_type, 2728 2726 struct task_struct *task) 2729 2727 { 2730 - u64 now; 2731 2728 int is_active = ctx->is_active; 2729 + u64 now; 2732 2730 2733 - ctx->is_active |= event_type; 2731 + lockdep_assert_held(&ctx->lock); 2732 + 2734 2733 if (likely(!ctx->nr_events)) 2735 2734 return; 2735 + 2736 + ctx->is_active |= event_type; 2737 + if (ctx->task) { 2738 + if (!is_active) 2739 + cpuctx->task_ctx = ctx; 2740 + else 2741 + WARN_ON_ONCE(cpuctx->task_ctx != ctx); 2742 + } 2736 2743 2737 2744 now = perf_clock(); 2738 2745 ctx->timestamp = now; ··· 2784 2773 * cpu flexible, task flexible. 
2785 2774 */ 2786 2775 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 2787 - 2788 - if (ctx->nr_events) 2789 - cpuctx->task_ctx = ctx; 2790 - 2791 - perf_event_sched_in(cpuctx, cpuctx->task_ctx, task); 2792 - 2776 + perf_event_sched_in(cpuctx, ctx, task); 2793 2777 perf_pmu_enable(ctx->pmu); 2794 2778 perf_ctx_unlock(cpuctx, ctx); 2795 2779 } ··· 2806 2800 struct perf_event_context *ctx; 2807 2801 int ctxn; 2808 2802 2803 + /* 2804 + * If cgroup events exist on this CPU, then we need to check if we have 2805 + * to switch in PMU state; cgroup event are system-wide mode only. 2806 + * 2807 + * Since cgroup events are CPU events, we must schedule these in before 2808 + * we schedule in the task events. 2809 + */ 2810 + if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) 2811 + perf_cgroup_sched_in(prev, task); 2812 + 2809 2813 for_each_task_context_nr(ctxn) { 2810 2814 ctx = task->perf_event_ctxp[ctxn]; 2811 2815 if (likely(!ctx)) ··· 2823 2807 2824 2808 perf_event_context_sched_in(ctx, task); 2825 2809 } 2826 - /* 2827 - * if cgroup events exist on this CPU, then we need 2828 - * to check if we have to switch in PMU state. 2829 - * cgroup event are system-wide mode only 2830 - */ 2831 - if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) 2832 - perf_cgroup_sched_in(prev, task); 2833 2810 2834 2811 if (atomic_read(&nr_switch_events)) 2835 2812 perf_event_switch(task, prev, true); ··· 3108 3099 static void perf_event_enable_on_exec(int ctxn) 3109 3100 { 3110 3101 struct perf_event_context *ctx, *clone_ctx = NULL; 3102 + struct perf_cpu_context *cpuctx; 3111 3103 struct perf_event *event; 3112 3104 unsigned long flags; 3113 3105 int enabled = 0; 3114 - int ret; 3115 3106 3116 3107 local_irq_save(flags); 3117 3108 ctx = current->perf_event_ctxp[ctxn]; 3118 3109 if (!ctx || !ctx->nr_events) 3119 3110 goto out; 3120 3111 3121 - /* 3122 - * We must ctxsw out cgroup events to avoid conflict 3123 - * when invoking perf_task_event_sched_in() later on 3124 - * in this function. 
Otherwise we end up trying to 3125 - * ctxswin cgroup events which are already scheduled 3126 - * in. 3127 - */ 3128 - perf_cgroup_sched_out(current, NULL); 3129 - 3130 - raw_spin_lock(&ctx->lock); 3131 - task_ctx_sched_out(ctx); 3132 - 3133 - list_for_each_entry(event, &ctx->event_list, event_entry) { 3134 - ret = event_enable_on_exec(event, ctx); 3135 - if (ret) 3136 - enabled = 1; 3137 - } 3112 + cpuctx = __get_cpu_context(ctx); 3113 + perf_ctx_lock(cpuctx, ctx); 3114 + list_for_each_entry(event, &ctx->event_list, event_entry) 3115 + enabled |= event_enable_on_exec(event, ctx); 3138 3116 3139 3117 /* 3140 - * Unclone this context if we enabled any event. 3118 + * Unclone and reschedule this context if we enabled any event. 3141 3119 */ 3142 - if (enabled) 3120 + if (enabled) { 3143 3121 clone_ctx = unclone_ctx(ctx); 3122 + ctx_resched(cpuctx, ctx); 3123 + } 3124 + perf_ctx_unlock(cpuctx, ctx); 3144 3125 3145 - raw_spin_unlock(&ctx->lock); 3146 - 3147 - /* 3148 - * Also calls ctxswin for cgroup events, if any: 3149 - */ 3150 - perf_event_context_sched_in(ctx, ctx->task); 3151 3126 out: 3152 3127 local_irq_restore(flags); 3153 3128 ··· 3327 3334 INIT_LIST_HEAD(&ctx->flexible_groups); 3328 3335 INIT_LIST_HEAD(&ctx->event_list); 3329 3336 atomic_set(&ctx->refcount, 1); 3330 - INIT_DELAYED_WORK(&ctx->orphans_remove, orphans_remove_work); 3331 3337 } 3332 3338 3333 3339 static struct perf_event_context * ··· 3513 3521 3514 3522 static void unaccount_event(struct perf_event *event) 3515 3523 { 3524 + bool dec = false; 3525 + 3516 3526 if (event->parent) 3517 3527 return; 3518 3528 3519 3529 if (event->attach_state & PERF_ATTACH_TASK) 3520 - static_key_slow_dec_deferred(&perf_sched_events); 3530 + dec = true; 3521 3531 if (event->attr.mmap || event->attr.mmap_data) 3522 3532 atomic_dec(&nr_mmap_events); 3523 3533 if (event->attr.comm) ··· 3529 3535 if (event->attr.freq) 3530 3536 atomic_dec(&nr_freq_events); 3531 3537 if (event->attr.context_switch) { 3532 - 
static_key_slow_dec_deferred(&perf_sched_events); 3538 + dec = true; 3533 3539 atomic_dec(&nr_switch_events); 3534 3540 } 3535 3541 if (is_cgroup_event(event)) 3536 - static_key_slow_dec_deferred(&perf_sched_events); 3542 + dec = true; 3537 3543 if (has_branch_stack(event)) 3544 + dec = true; 3545 + 3546 + if (dec) 3538 3547 static_key_slow_dec_deferred(&perf_sched_events); 3539 3548 3540 3549 unaccount_event_cpu(event, event->cpu); ··· 3553 3556 * 3) two matching events on the same context. 3554 3557 * 3555 3558 * The former two cases are handled in the allocation path (perf_event_alloc(), 3556 - * __free_event()), the latter -- before the first perf_install_in_context(). 3559 + * _free_event()), the latter -- before the first perf_install_in_context(). 3557 3560 */ 3558 3561 static int exclusive_event_init(struct perf_event *event) 3559 3562 { ··· 3628 3631 return true; 3629 3632 } 3630 3633 3631 - static void __free_event(struct perf_event *event) 3632 - { 3633 - if (!event->parent) { 3634 - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) 3635 - put_callchain_buffers(); 3636 - } 3637 - 3638 - perf_event_free_bpf_prog(event); 3639 - 3640 - if (event->destroy) 3641 - event->destroy(event); 3642 - 3643 - if (event->ctx) 3644 - put_ctx(event->ctx); 3645 - 3646 - if (event->pmu) { 3647 - exclusive_event_destroy(event); 3648 - module_put(event->pmu->module); 3649 - } 3650 - 3651 - call_rcu(&event->rcu_head, free_event_rcu); 3652 - } 3653 - 3654 3634 static void _free_event(struct perf_event *event) 3655 3635 { 3656 3636 irq_work_sync(&event->pending); ··· 3649 3675 if (is_cgroup_event(event)) 3650 3676 perf_detach_cgroup(event); 3651 3677 3652 - __free_event(event); 3678 + if (!event->parent) { 3679 + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) 3680 + put_callchain_buffers(); 3681 + } 3682 + 3683 + perf_event_free_bpf_prog(event); 3684 + 3685 + if (event->destroy) 3686 + event->destroy(event); 3687 + 3688 + if (event->ctx) 3689 + put_ctx(event->ctx); 
3690 + 3691 + if (event->pmu) { 3692 + exclusive_event_destroy(event); 3693 + module_put(event->pmu->module); 3694 + } 3695 + 3696 + call_rcu(&event->rcu_head, free_event_rcu); 3653 3697 } 3654 3698 3655 3699 /* ··· 3694 3702 struct task_struct *owner; 3695 3703 3696 3704 rcu_read_lock(); 3697 - owner = ACCESS_ONCE(event->owner); 3698 3705 /* 3699 - * Matches the smp_wmb() in perf_event_exit_task(). If we observe 3700 - * !owner it means the list deletion is complete and we can indeed 3701 - * free this event, otherwise we need to serialize on 3706 + * Matches the smp_store_release() in perf_event_exit_task(). If we 3707 + * observe !owner it means the list deletion is complete and we can 3708 + * indeed free this event, otherwise we need to serialize on 3702 3709 * owner->perf_event_mutex. 3703 3710 */ 3704 - smp_read_barrier_depends(); 3711 + owner = lockless_dereference(event->owner); 3705 3712 if (owner) { 3706 3713 /* 3707 3714 * Since delayed_put_task_struct() also drops the last ··· 3728 3737 * ensured they're done, and we can proceed with freeing the 3729 3738 * event. 3730 3739 */ 3731 - if (event->owner) 3740 + if (event->owner) { 3732 3741 list_del_init(&event->owner_entry); 3742 + smp_store_release(&event->owner, NULL); 3743 + } 3733 3744 mutex_unlock(&owner->perf_event_mutex); 3734 3745 put_task_struct(owner); 3735 3746 } ··· 3739 3746 3740 3747 static void put_event(struct perf_event *event) 3741 3748 { 3742 - struct perf_event_context *ctx; 3743 - 3744 3749 if (!atomic_long_dec_and_test(&event->refcount)) 3745 3750 return; 3746 - 3747 - if (!is_kernel_event(event)) 3748 - perf_remove_from_owner(event); 3749 - 3750 - /* 3751 - * There are two ways this annotation is useful: 3752 - * 3753 - * 1) there is a lock recursion from perf_event_exit_task 3754 - * see the comment there. 
3755 - * 3756 - * 2) there is a lock-inversion with mmap_sem through 3757 - * perf_read_group(), which takes faults while 3758 - * holding ctx->mutex, however this is called after 3759 - * the last filedesc died, so there is no possibility 3760 - * to trigger the AB-BA case. 3761 - */ 3762 - ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING); 3763 - WARN_ON_ONCE(ctx->parent_ctx); 3764 - perf_remove_from_context(event, true); 3765 - perf_event_ctx_unlock(event, ctx); 3766 3751 3767 3752 _free_event(event); 3768 3753 } 3769 3754 3755 + /* 3756 + * Kill an event dead; while event:refcount will preserve the event 3757 + * object, it will not preserve its functionality. Once the last 'user' 3758 + * gives up the object, we'll destroy the thing. 3759 + */ 3770 3760 int perf_event_release_kernel(struct perf_event *event) 3771 3761 { 3762 + struct perf_event_context *ctx; 3763 + struct perf_event *child, *tmp; 3764 + 3765 + if (!is_kernel_event(event)) 3766 + perf_remove_from_owner(event); 3767 + 3768 + ctx = perf_event_ctx_lock(event); 3769 + WARN_ON_ONCE(ctx->parent_ctx); 3770 + perf_remove_from_context(event, DETACH_GROUP | DETACH_STATE); 3771 + perf_event_ctx_unlock(event, ctx); 3772 + 3773 + /* 3774 + * At this point we must have event->state == PERF_EVENT_STATE_EXIT, 3775 + * either from the above perf_remove_from_context() or through 3776 + * perf_event_exit_event(). 3777 + * 3778 + * Therefore, anybody acquiring event->child_mutex after the below 3779 + * loop _must_ also see this, most importantly inherit_event() which 3780 + * will avoid placing more children on the list. 3781 + * 3782 + * Thus this guarantees that we will in fact observe and kill _ALL_ 3783 + * child events. 
3784 + */ 3785 + WARN_ON_ONCE(event->state != PERF_EVENT_STATE_EXIT); 3786 + 3787 + again: 3788 + mutex_lock(&event->child_mutex); 3789 + list_for_each_entry(child, &event->child_list, child_list) { 3790 + 3791 + /* 3792 + * Cannot change, child events are not migrated, see the 3793 + * comment with perf_event_ctx_lock_nested(). 3794 + */ 3795 + ctx = lockless_dereference(child->ctx); 3796 + /* 3797 + * Since child_mutex nests inside ctx::mutex, we must jump 3798 + * through hoops. We start by grabbing a reference on the ctx. 3799 + * 3800 + * Since the event cannot get freed while we hold the 3801 + * child_mutex, the context must also exist and have a !0 3802 + * reference count. 3803 + */ 3804 + get_ctx(ctx); 3805 + 3806 + /* 3807 + * Now that we have a ctx ref, we can drop child_mutex, and 3808 + * acquire ctx::mutex without fear of it going away. Then we 3809 + * can re-acquire child_mutex. 3810 + */ 3811 + mutex_unlock(&event->child_mutex); 3812 + mutex_lock(&ctx->mutex); 3813 + mutex_lock(&event->child_mutex); 3814 + 3815 + /* 3816 + * Now that we hold ctx::mutex and child_mutex, revalidate our 3817 + * state, if child is still the first entry, it didn't get freed 3818 + * and we can continue doing so. 3819 + */ 3820 + tmp = list_first_entry_or_null(&event->child_list, 3821 + struct perf_event, child_list); 3822 + if (tmp == child) { 3823 + perf_remove_from_context(child, DETACH_GROUP); 3824 + list_del(&child->child_list); 3825 + free_event(child); 3826 + /* 3827 + * This matches the refcount bump in inherit_event(); 3828 + * this can't be the last reference. 
3829 + */ 3830 + put_event(event); 3831 + } 3832 + 3833 + mutex_unlock(&event->child_mutex); 3834 + mutex_unlock(&ctx->mutex); 3835 + put_ctx(ctx); 3836 + goto again; 3837 + } 3838 + mutex_unlock(&event->child_mutex); 3839 + 3840 + /* Must be the last reference */ 3772 3841 put_event(event); 3773 3842 return 0; 3774 3843 } ··· 3841 3786 */ 3842 3787 static int perf_release(struct inode *inode, struct file *file) 3843 3788 { 3844 - put_event(file->private_data); 3789 + perf_event_release_kernel(file->private_data); 3845 3790 return 0; 3846 - } 3847 - 3848 - /* 3849 - * Remove all orphanes events from the context. 3850 - */ 3851 - static void orphans_remove_work(struct work_struct *work) 3852 - { 3853 - struct perf_event_context *ctx; 3854 - struct perf_event *event, *tmp; 3855 - 3856 - ctx = container_of(work, struct perf_event_context, 3857 - orphans_remove.work); 3858 - 3859 - mutex_lock(&ctx->mutex); 3860 - list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) { 3861 - struct perf_event *parent_event = event->parent; 3862 - 3863 - if (!is_orphaned_child(event)) 3864 - continue; 3865 - 3866 - perf_remove_from_context(event, true); 3867 - 3868 - mutex_lock(&parent_event->child_mutex); 3869 - list_del_init(&event->child_list); 3870 - mutex_unlock(&parent_event->child_mutex); 3871 - 3872 - free_event(event); 3873 - put_event(parent_event); 3874 - } 3875 - 3876 - raw_spin_lock_irq(&ctx->lock); 3877 - ctx->orphans_remove_sched = false; 3878 - raw_spin_unlock_irq(&ctx->lock); 3879 - mutex_unlock(&ctx->mutex); 3880 - 3881 - put_ctx(ctx); 3882 3791 } 3883 3792 3884 3793 u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) ··· 4073 4054 /* 4074 4055 * Holding the top-level event's child_mutex means that any 4075 4056 * descendant process that has inherited this event will block 4076 - * in sync_child_event if it goes to exit, thus satisfying the 4057 + * in perf_event_exit_event() if it goes to exit, thus satisfying the 4077 4058 * 
task existence requirements of perf_event_enable/disable. 4078 4059 */ 4079 4060 static void perf_event_for_each_child(struct perf_event *event, ··· 4105 4086 perf_event_for_each_child(sibling, func); 4106 4087 } 4107 4088 4108 - struct period_event { 4109 - struct perf_event *event; 4110 - u64 value; 4111 - }; 4112 - 4113 - static void ___perf_event_period(void *info) 4089 + static void __perf_event_period(struct perf_event *event, 4090 + struct perf_cpu_context *cpuctx, 4091 + struct perf_event_context *ctx, 4092 + void *info) 4114 4093 { 4115 - struct period_event *pe = info; 4116 - struct perf_event *event = pe->event; 4117 - u64 value = pe->value; 4118 - 4119 - if (event->attr.freq) { 4120 - event->attr.sample_freq = value; 4121 - } else { 4122 - event->attr.sample_period = value; 4123 - event->hw.sample_period = value; 4124 - } 4125 - 4126 - local64_set(&event->hw.period_left, 0); 4127 - } 4128 - 4129 - static int __perf_event_period(void *info) 4130 - { 4131 - struct period_event *pe = info; 4132 - struct perf_event *event = pe->event; 4133 - struct perf_event_context *ctx = event->ctx; 4134 - u64 value = pe->value; 4094 + u64 value = *((u64 *)info); 4135 4095 bool active; 4136 4096 4137 - raw_spin_lock(&ctx->lock); 4138 4097 if (event->attr.freq) { 4139 4098 event->attr.sample_freq = value; 4140 4099 } else { ··· 4132 4135 event->pmu->start(event, PERF_EF_RELOAD); 4133 4136 perf_pmu_enable(ctx->pmu); 4134 4137 } 4135 - raw_spin_unlock(&ctx->lock); 4136 - 4137 - return 0; 4138 4138 } 4139 4139 4140 4140 static int perf_event_period(struct perf_event *event, u64 __user *arg) 4141 4141 { 4142 - struct period_event pe = { .event = event, }; 4143 4142 u64 value; 4144 4143 4145 4144 if (!is_sampling_event(event)) ··· 4150 4157 if (event->attr.freq && value > sysctl_perf_event_sample_rate) 4151 4158 return -EINVAL; 4152 4159 4153 - pe.value = value; 4154 - 4155 - event_function_call(event, __perf_event_period, 4156 - ___perf_event_period, &pe); 4160 + 
event_function_call(event, __perf_event_period, &value); 4157 4161 4158 4162 return 0; 4159 4163 } ··· 4922 4932 4923 4933 if (event->pending_disable) { 4924 4934 event->pending_disable = 0; 4925 - __perf_event_disable(event); 4935 + perf_event_disable_local(event); 4926 4936 } 4927 4937 4928 4938 if (event->pending_wakeup) { ··· 7743 7753 7744 7754 static void account_event(struct perf_event *event) 7745 7755 { 7756 + bool inc = false; 7757 + 7746 7758 if (event->parent) 7747 7759 return; 7748 7760 7749 7761 if (event->attach_state & PERF_ATTACH_TASK) 7750 - static_key_slow_inc(&perf_sched_events.key); 7762 + inc = true; 7751 7763 if (event->attr.mmap || event->attr.mmap_data) 7752 7764 atomic_inc(&nr_mmap_events); 7753 7765 if (event->attr.comm) ··· 7762 7770 } 7763 7771 if (event->attr.context_switch) { 7764 7772 atomic_inc(&nr_switch_events); 7765 - static_key_slow_inc(&perf_sched_events.key); 7773 + inc = true; 7766 7774 } 7767 7775 if (has_branch_stack(event)) 7768 - static_key_slow_inc(&perf_sched_events.key); 7776 + inc = true; 7769 7777 if (is_cgroup_event(event)) 7778 + inc = true; 7779 + 7780 + if (inc) 7770 7781 static_key_slow_inc(&perf_sched_events.key); 7771 7782 7772 7783 account_event_cpu(event, event->cpu); ··· 8417 8422 * See perf_event_ctx_lock() for comments on the details 8418 8423 * of swizzling perf_event::ctx. 
8419 8424 */ 8420 - perf_remove_from_context(group_leader, false); 8425 + perf_remove_from_context(group_leader, 0); 8421 8426 8422 8427 list_for_each_entry(sibling, &group_leader->sibling_list, 8423 8428 group_entry) { 8424 - perf_remove_from_context(sibling, false); 8429 + perf_remove_from_context(sibling, 0); 8425 8430 put_ctx(gctx); 8426 8431 } 8427 8432 ··· 8474 8479 perf_event__header_size(event); 8475 8480 perf_event__id_header_size(event); 8476 8481 8482 + event->owner = current; 8483 + 8477 8484 perf_install_in_context(ctx, event, event->cpu); 8478 8485 perf_unpin_context(ctx); 8479 8486 ··· 8484 8487 mutex_unlock(&ctx->mutex); 8485 8488 8486 8489 put_online_cpus(); 8487 - 8488 - event->owner = current; 8489 8490 8490 8491 mutex_lock(&current->perf_event_mutex); 8491 8492 list_add_tail(&event->owner_entry, &current->perf_event_list); ··· 8551 8556 } 8552 8557 8553 8558 /* Mark owner so we could distinguish it from user events. */ 8554 - event->owner = EVENT_OWNER_KERNEL; 8559 + event->owner = TASK_TOMBSTONE; 8555 8560 8556 8561 account_event(event); 8557 8562 ··· 8601 8606 mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex); 8602 8607 list_for_each_entry_safe(event, tmp, &src_ctx->event_list, 8603 8608 event_entry) { 8604 - perf_remove_from_context(event, false); 8609 + perf_remove_from_context(event, 0); 8605 8610 unaccount_event_cpu(event, src_cpu); 8606 8611 put_ctx(src_ctx); 8607 8612 list_add(&event->migrate_entry, &events); ··· 8668 8673 &parent_event->child_total_time_enabled); 8669 8674 atomic64_add(child_event->total_time_running, 8670 8675 &parent_event->child_total_time_running); 8671 - 8672 - /* 8673 - * Remove this event from the parent's list 8674 - */ 8675 - WARN_ON_ONCE(parent_event->ctx->parent_ctx); 8676 - mutex_lock(&parent_event->child_mutex); 8677 - list_del_init(&child_event->child_list); 8678 - mutex_unlock(&parent_event->child_mutex); 8679 - 8680 - /* 8681 - * Make sure user/parent get notified, that we just 8682 - * lost one event. 
8683 - */ 8684 - perf_event_wakeup(parent_event); 8685 - 8686 - /* 8687 - * Release the parent event, if this was the last 8688 - * reference to it. 8689 - */ 8690 - put_event(parent_event); 8691 8676 } 8692 8677 8693 8678 static void 8694 - __perf_event_exit_task(struct perf_event *child_event, 8695 - struct perf_event_context *child_ctx, 8696 - struct task_struct *child) 8679 + perf_event_exit_event(struct perf_event *child_event, 8680 + struct perf_event_context *child_ctx, 8681 + struct task_struct *child) 8697 8682 { 8683 + struct perf_event *parent_event = child_event->parent; 8684 + 8698 8685 /* 8699 8686 * Do not destroy the 'original' grouping; because of the context 8700 8687 * switch optimization the original events could've ended up in a ··· 8689 8712 * Do destroy all inherited groups, we don't care about those 8690 8713 * and being thorough is better. 8691 8714 */ 8692 - perf_remove_from_context(child_event, !!child_event->parent); 8715 + raw_spin_lock_irq(&child_ctx->lock); 8716 + WARN_ON_ONCE(child_ctx->is_active); 8717 + 8718 + if (parent_event) 8719 + perf_group_detach(child_event); 8720 + list_del_event(child_event, child_ctx); 8721 + child_event->state = PERF_EVENT_STATE_EXIT; /* see perf_event_release_kernel() */ 8722 + raw_spin_unlock_irq(&child_ctx->lock); 8693 8723 8694 8724 /* 8695 - * It can happen that the parent exits first, and has events 8696 - * that are still around due to the child reference. These 8697 - * events need to be zapped. 8725 + * Parent events are governed by their filedesc, retain them. 8698 8726 */ 8699 - if (child_event->parent) { 8700 - sync_child_event(child_event, child); 8701 - free_event(child_event); 8702 - } else { 8703 - child_event->state = PERF_EVENT_STATE_EXIT; 8727 + if (!parent_event) { 8704 8728 perf_event_wakeup(child_event); 8729 + return; 8705 8730 } 8731 + /* 8732 + * Child events can be cleaned up. 
8733 + */ 8734 + 8735 + sync_child_event(child_event, child); 8736 + 8737 + /* 8738 + * Remove this event from the parent's list 8739 + */ 8740 + WARN_ON_ONCE(parent_event->ctx->parent_ctx); 8741 + mutex_lock(&parent_event->child_mutex); 8742 + list_del_init(&child_event->child_list); 8743 + mutex_unlock(&parent_event->child_mutex); 8744 + 8745 + /* 8746 + * Kick perf_poll() for is_event_hup(). 8747 + */ 8748 + perf_event_wakeup(parent_event); 8749 + free_event(child_event); 8750 + put_event(parent_event); 8706 8751 } 8707 8752 8708 8753 static void perf_event_exit_task_context(struct task_struct *child, int ctxn) 8709 8754 { 8710 - struct perf_event *child_event, *next; 8711 8755 struct perf_event_context *child_ctx, *clone_ctx = NULL; 8712 - unsigned long flags; 8756 + struct perf_event *child_event, *next; 8713 8757 8714 - if (likely(!child->perf_event_ctxp[ctxn])) 8758 + WARN_ON_ONCE(child != current); 8759 + 8760 + child_ctx = perf_pin_task_context(child, ctxn); 8761 + if (!child_ctx) 8715 8762 return; 8716 8763 8717 - local_irq_save(flags); 8718 8764 /* 8719 - * We can't reschedule here because interrupts are disabled, 8720 - * and either child is current or it is a task that can't be 8721 - * scheduled, so we are now safe from rescheduling changing 8722 - * our context. 8765 + * In order to reduce the amount of tricky in ctx tear-down, we hold 8766 + * ctx::mutex over the entire thing. This serializes against almost 8767 + * everything that wants to access the ctx. 8768 + * 8769 + * The exception is sys_perf_event_open() / 8770 + * perf_event_create_kernel_count() which does find_get_context() 8771 + * without ctx::mutex (it cannot because of the move_group double mutex 8772 + * lock thing). See the comments in perf_install_in_context(). 
8723 8773 */ 8724 - child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]); 8774 + mutex_lock(&child_ctx->mutex); 8725 8775 8726 8776 /* 8727 - * Take the context lock here so that if find_get_context is 8728 - * reading child->perf_event_ctxp, we wait until it has 8729 - * incremented the context's refcount before we do put_ctx below. 8777 + * In a single ctx::lock section, de-schedule the events and detach the 8778 + * context from the task such that we cannot ever get it scheduled back 8779 + * in. 8730 8780 */ 8731 - raw_spin_lock(&child_ctx->lock); 8732 - task_ctx_sched_out(child_ctx); 8733 - child->perf_event_ctxp[ctxn] = NULL; 8781 + raw_spin_lock_irq(&child_ctx->lock); 8782 + task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx); 8734 8783 8735 8784 /* 8736 - * If this context is a clone; unclone it so it can't get 8737 - * swapped to another process while we're removing all 8738 - * the events from it. 8785 + * Now that the context is inactive, destroy the task <-> ctx relation 8786 + * and mark the context dead. 8739 8787 */ 8788 + RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL); 8789 + put_ctx(child_ctx); /* cannot be last */ 8790 + WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE); 8791 + put_task_struct(current); /* cannot be last */ 8792 + 8740 8793 clone_ctx = unclone_ctx(child_ctx); 8741 - update_context_time(child_ctx); 8742 - raw_spin_unlock_irqrestore(&child_ctx->lock, flags); 8794 + raw_spin_unlock_irq(&child_ctx->lock); 8743 8795 8744 8796 if (clone_ctx) 8745 8797 put_ctx(clone_ctx); ··· 8780 8774 */ 8781 8775 perf_event_task(child, child_ctx, 0); 8782 8776 8783 - /* 8784 - * We can recurse on the same lock type through: 8785 - * 8786 - * __perf_event_exit_task() 8787 - * sync_child_event() 8788 - * put_event() 8789 - * mutex_lock(&ctx->mutex) 8790 - * 8791 - * But since its the parent context it won't be the same instance. 
8792 - */ 8793 - mutex_lock(&child_ctx->mutex); 8794 - 8795 8777 list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry) 8796 - __perf_event_exit_task(child_event, child_ctx, child); 8778 + perf_event_exit_event(child_event, child_ctx, child); 8797 8779 8798 8780 mutex_unlock(&child_ctx->mutex); 8799 8781 ··· 8806 8812 * the owner, closes a race against perf_release() where 8807 8813 * we need to serialize on the owner->perf_event_mutex. 8808 8814 */ 8809 - smp_wmb(); 8810 - event->owner = NULL; 8815 + smp_store_release(&event->owner, NULL); 8811 8816 } 8812 8817 mutex_unlock(&child->perf_event_mutex); 8813 8818 ··· 8889 8896 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]); 8890 8897 } 8891 8898 8892 - struct perf_event *perf_event_get(unsigned int fd) 8899 + struct file *perf_event_get(unsigned int fd) 8893 8900 { 8894 - int err; 8895 - struct fd f; 8896 - struct perf_event *event; 8901 + struct file *file; 8897 8902 8898 - err = perf_fget_light(fd, &f); 8899 - if (err) 8900 - return ERR_PTR(err); 8903 + file = fget_raw(fd); 8904 + if (!file) 8905 + return ERR_PTR(-EBADF); 8901 8906 8902 - event = f.file->private_data; 8903 - atomic_long_inc(&event->refcount); 8904 - fdput(f); 8907 + if (file->f_op != &perf_fops) { 8908 + fput(file); 8909 + return ERR_PTR(-EBADF); 8910 + } 8905 8911 8906 - return event; 8912 + return file; 8907 8913 } 8908 8914 8909 8915 const struct perf_event_attr *perf_event_attrs(struct perf_event *event) ··· 8945 8953 if (IS_ERR(child_event)) 8946 8954 return child_event; 8947 8955 8956 + /* 8957 + * is_orphaned_event() and list_add_tail(&parent_event->child_list) 8958 + * must be under the same lock in order to serialize against 8959 + * perf_event_release_kernel(), such that either we must observe 8960 + * is_orphaned_event() or they will observe us on the child_list. 
8961 + */ 8962 + mutex_lock(&parent_event->child_mutex); 8948 8963 if (is_orphaned_event(parent_event) || 8949 8964 !atomic_long_inc_not_zero(&parent_event->refcount)) { 8965 + mutex_unlock(&parent_event->child_mutex); 8950 8966 free_event(child_event); 8951 8967 return NULL; 8952 8968 } ··· 9002 9002 /* 9003 9003 * Link this into the parent event's child list 9004 9004 */ 9005 - WARN_ON_ONCE(parent_event->ctx->parent_ctx); 9006 - mutex_lock(&parent_event->child_mutex); 9007 9005 list_add_tail(&child_event->child_list, &parent_event->child_list); 9008 9006 mutex_unlock(&parent_event->child_mutex); 9009 9007 ··· 9219 9221 #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE 9220 9222 static void __perf_event_exit_context(void *__info) 9221 9223 { 9222 - struct remove_event re = { .detach_group = true }; 9223 9224 struct perf_event_context *ctx = __info; 9225 + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 9226 + struct perf_event *event; 9224 9227 9225 - rcu_read_lock(); 9226 - list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry) 9227 - __perf_remove_from_context(&re); 9228 - rcu_read_unlock(); 9228 + raw_spin_lock(&ctx->lock); 9229 + list_for_each_entry(event, &ctx->event_list, event_entry) 9230 + __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP); 9231 + raw_spin_unlock(&ctx->lock); 9229 9232 } 9230 9233 9231 9234 static void perf_event_exit_cpu_context(int cpu)
+1 -1
kernel/events/hw_breakpoint.c
··· 444 444 * current task. 445 445 */ 446 446 if (irqs_disabled() && bp->ctx && bp->ctx->task == current) 447 - __perf_event_disable(bp); 447 + perf_event_disable_local(bp); 448 448 else 449 449 perf_event_disable(bp); 450 450
+20 -20
kernel/events/ring_buffer.c
··· 459 459 __free_page(page); 460 460 } 461 461 462 + static void __rb_free_aux(struct ring_buffer *rb) 463 + { 464 + int pg; 465 + 466 + if (rb->aux_priv) { 467 + rb->free_aux(rb->aux_priv); 468 + rb->free_aux = NULL; 469 + rb->aux_priv = NULL; 470 + } 471 + 472 + if (rb->aux_nr_pages) { 473 + for (pg = 0; pg < rb->aux_nr_pages; pg++) 474 + rb_free_aux_page(rb, pg); 475 + 476 + kfree(rb->aux_pages); 477 + rb->aux_nr_pages = 0; 478 + } 479 + } 480 + 462 481 int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, 463 482 pgoff_t pgoff, int nr_pages, long watermark, int flags) 464 483 { ··· 566 547 if (!ret) 567 548 rb->aux_pgoff = pgoff; 568 549 else 569 - rb_free_aux(rb); 550 + __rb_free_aux(rb); 570 551 571 552 return ret; 572 - } 573 - 574 - static void __rb_free_aux(struct ring_buffer *rb) 575 - { 576 - int pg; 577 - 578 - if (rb->aux_priv) { 579 - rb->free_aux(rb->aux_priv); 580 - rb->free_aux = NULL; 581 - rb->aux_priv = NULL; 582 - } 583 - 584 - if (rb->aux_nr_pages) { 585 - for (pg = 0; pg < rb->aux_nr_pages; pg++) 586 - rb_free_aux_page(rb, pg); 587 - 588 - kfree(rb->aux_pages); 589 - rb->aux_nr_pages = 0; 590 - } 591 553 } 592 554 593 555 void rb_free_aux(struct ring_buffer *rb)
+10 -4
kernel/trace/bpf_trace.c
··· 191 191 struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; 192 192 struct bpf_array *array = container_of(map, struct bpf_array, map); 193 193 struct perf_event *event; 194 + struct file *file; 194 195 195 196 if (unlikely(index >= array->map.max_entries)) 196 197 return -E2BIG; 197 198 198 - event = (struct perf_event *)array->ptrs[index]; 199 - if (!event) 199 + file = (struct file *)array->ptrs[index]; 200 + if (unlikely(!file)) 200 201 return -ENOENT; 202 + 203 + event = file->private_data; 201 204 202 205 /* make sure event is local and doesn't have pmu::count */ 203 206 if (event->oncpu != smp_processor_id() || ··· 231 228 void *data = (void *) (long) r4; 232 229 struct perf_sample_data sample_data; 233 230 struct perf_event *event; 231 + struct file *file; 234 232 struct perf_raw_record raw = { 235 233 .size = size, 236 234 .data = data, ··· 240 236 if (unlikely(index >= array->map.max_entries)) 241 237 return -E2BIG; 242 238 243 - event = (struct perf_event *)array->ptrs[index]; 244 - if (unlikely(!event)) 239 + file = (struct file *)array->ptrs[index]; 240 + if (unlikely(!file)) 245 241 return -ENOENT; 242 + 243 + event = file->private_data; 246 244 247 245 if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE || 248 246 event->attr.config != PERF_COUNT_SW_BPF_OUTPUT))
+24 -1
tools/perf/Makefile.perf
··· 77 77 # Define NO_AUXTRACE if you do not want AUX area tracing support 78 78 # 79 79 # Define NO_LIBBPF if you do not want BPF support 80 + # 81 + # Define FEATURES_DUMP to provide features detection dump file 82 + # and bypass the feature detection 80 83 81 84 # As per kernel Makefile, avoid funny character set dependencies 82 85 unexport LC_ALL ··· 167 164 168 165 ifeq ($(config),1) 169 166 include config/Makefile 167 + endif 168 + 169 + # The FEATURE_DUMP_EXPORT holds location of the actual 170 + # FEATURE_DUMP file to be used to bypass feature detection 171 + # (for bpf or any other subproject) 172 + ifeq ($(FEATURES_DUMP),) 173 + FEATURE_DUMP_EXPORT := $(realpath $(OUTPUT)FEATURE-DUMP) 174 + else 175 + FEATURE_DUMP_EXPORT := $(FEATURES_DUMP) 170 176 endif 171 177 172 178 export prefix bindir sharedir sysconfdir DESTDIR ··· 448 436 $(Q)$(MAKE) -C $(LIB_DIR) O=$(OUTPUT) clean >/dev/null 449 437 450 438 $(LIBBPF): fixdep FORCE 451 - $(Q)$(MAKE) -C $(BPF_DIR) O=$(OUTPUT) $(OUTPUT)libbpf.a FEATURES_DUMP=$(realpath $(OUTPUT)FEATURE-DUMP) 439 + $(Q)$(MAKE) -C $(BPF_DIR) O=$(OUTPUT) $(OUTPUT)libbpf.a FEATURES_DUMP=$(FEATURE_DUMP_EXPORT) 452 440 453 441 $(LIBBPF)-clean: 454 442 $(call QUIET_CLEAN, libbpf) ··· 621 609 $(OUTPUT)tests/llvm-src-{base,kbuild,prologue}.c 622 610 $(QUIET_SUBDIR0)Documentation $(QUIET_SUBDIR1) clean 623 611 $(python-clean) 612 + 613 + # 614 + # To provide FEATURE-DUMP into $(FEATURE_DUMP_COPY) 615 + # file if defined, with no further action. 616 + feature-dump: 617 + ifdef FEATURE_DUMP_COPY 618 + @cp $(OUTPUT)FEATURE-DUMP $(FEATURE_DUMP_COPY) 619 + @echo "FEATURE-DUMP file copied into $(FEATURE_DUMP_COPY)" 620 + else 621 + @echo "FEATURE-DUMP file available in $(OUTPUT)FEATURE-DUMP" 622 + endif 624 623 625 624 # 626 625 # Trick: if ../../.git does not exist - we are building out of tree for example,
+1 -1
tools/perf/arch/x86/tests/intel-cqm.c
··· 17 17 if (pid) 18 18 return pid; 19 19 20 - while(1); 20 + while(1) 21 21 sleep(5); 22 22 return 0; 23 23 }
+4
tools/perf/config/Makefile
··· 181 181 182 182 EXTLIBS = -lpthread -lrt -lm -ldl 183 183 184 + ifeq ($(FEATURES_DUMP),) 184 185 include $(srctree)/tools/build/Makefile.feature 186 + else 187 + include $(FEATURES_DUMP) 188 + endif 185 189 186 190 ifeq ($(feature-stackprotector-all), 1) 187 191 CFLAGS += -fstack-protector-all
+40 -15
tools/perf/tests/make
··· 5 5 # no target specified, trigger the whole suite 6 6 all: 7 7 @echo "Testing Makefile"; $(MAKE) -sf tests/make MK=Makefile 8 - @echo "Testing Makefile.perf"; $(MAKE) -sf tests/make MK=Makefile.perf 8 + @echo "Testing Makefile.perf"; $(MAKE) -sf tests/make MK=Makefile.perf SET_PARALLEL=1 SET_O=1 9 9 else 10 10 # run only specific test over 'Makefile' 11 11 %: ··· 13 13 endif 14 14 else 15 15 PERF := . 16 + PERF_O := $(PERF) 17 + O_OPT := 18 + 19 + ifneq ($(O),) 20 + FULL_O := $(shell readlink -f $(O) || echo $(O)) 21 + PERF_O := $(FULL_O) 22 + ifeq ($(SET_O),1) 23 + O_OPT := 'O=$(FULL_O)' 24 + endif 25 + K_O_OPT := 'O=$(FULL_O)' 26 + endif 27 + 28 + PARALLEL_OPT= 29 + ifeq ($(SET_PARALLEL),1) 30 + cores := $(shell (getconf _NPROCESSORS_ONLN || egrep -c '^processor|^CPU[0-9]' /proc/cpuinfo) 2>/dev/null) 31 + ifeq ($(cores),0) 32 + cores := 1 33 + endif 34 + PARALLEL_OPT="-j$(cores)" 35 + endif 16 36 17 37 # As per kernel Makefile, avoid funny character set dependencies 18 38 unexport LC_ALL ··· 176 156 test_make_help_O := $(test_ok) 177 157 test_make_doc_O := $(test_ok) 178 158 179 - test_make_python_perf_so := test -f $(PERF)/python/perf.so 159 + test_make_python_perf_so := test -f $(PERF_O)/python/perf.so 180 160 181 - test_make_perf_o := test -f $(PERF)/perf.o 182 - test_make_util_map_o := test -f $(PERF)/util/map.o 183 - test_make_util_pmu_bison_o := test -f $(PERF)/util/pmu-bison.o 161 + test_make_perf_o := test -f $(PERF_O)/perf.o 162 + test_make_util_map_o := test -f $(PERF_O)/util/map.o 163 + test_make_util_pmu_bison_o := test -f $(PERF_O)/util/pmu-bison.o 184 164 185 165 define test_dest_files 186 166 for file in $(1); do \ ··· 247 227 test_make_util_map_o_O := test -f $$TMP_O/util/map.o 248 228 test_make_util_pmu_bison_o_O := test -f $$TMP_O/util/pmu-bison.o 249 229 250 - test_default = test -x $(PERF)/perf 230 + test_default = test -x $(PERF_O)/perf 251 231 test = $(if $(test_$1),$(test_$1),$(test_default)) 252 232 253 233 test_default_O = test -x 
$$TMP_O/perf ··· 267 247 268 248 MAKEFLAGS := --no-print-directory 269 249 270 - clean := @(cd $(PERF); make -s -f $(MK) clean >/dev/null) 250 + clean := @(cd $(PERF); make -s -f $(MK) $(O_OPT) clean >/dev/null) 271 251 272 252 $(run): 273 253 $(call clean) 274 254 @TMP_DEST=$$(mktemp -d); \ 275 - cmd="cd $(PERF) && make -f $(MK) DESTDIR=$$TMP_DEST $($@)"; \ 255 + cmd="cd $(PERF) && make -f $(MK) $(PARALLEL_OPT) $(O_OPT) DESTDIR=$$TMP_DEST $($@)"; \ 276 256 echo "- $@: $$cmd" && echo $$cmd > $@ && \ 277 257 ( eval $$cmd ) >> $@ 2>&1; \ 278 258 echo " test: $(call test,$@)" >> $@ 2>&1; \ ··· 283 263 $(call clean) 284 264 @TMP_O=$$(mktemp -d); \ 285 265 TMP_DEST=$$(mktemp -d); \ 286 - cmd="cd $(PERF) && make -f $(MK) O=$$TMP_O DESTDIR=$$TMP_DEST $($(patsubst %_O,%,$@))"; \ 266 + cmd="cd $(PERF) && make -f $(MK) $(PARALLEL_OPT) O=$$TMP_O DESTDIR=$$TMP_DEST $($(patsubst %_O,%,$@))"; \ 287 267 echo "- $@: $$cmd" && echo $$cmd > $@ && \ 288 268 ( eval $$cmd ) >> $@ 2>&1 && \ 289 269 echo " test: $(call test_O,$@)" >> $@ 2>&1; \ ··· 296 276 ( eval $$cmd ) >> $@ 2>&1 && \ 297 277 rm -f $@ 298 278 279 + KERNEL_O := ../.. 280 + ifneq ($(O),) 281 + KERNEL_O := $(O) 282 + endif 283 + 299 284 make_kernelsrc: 300 - @echo "- make -C <kernelsrc> tools/perf" 285 + @echo "- make -C <kernelsrc> $(PARALLEL_OPT) $(K_O_OPT) tools/perf" 301 286 $(call clean); \ 302 - (make -C ../.. tools/perf) > $@ 2>&1 && \ 303 - test -x perf && rm -f $@ || (cat $@ ; false) 287 + (make -C ../.. 
$(PARALLEL_OPT) $(K_O_OPT) tools/perf) > $@ 2>&1 && \ 288 + test -x $(KERNEL_O)/tools/perf/perf && rm -f $@ || (cat $@ ; false) 304 289 305 290 make_kernelsrc_tools: 306 - @echo "- make -C <kernelsrc>/tools perf" 291 + @echo "- make -C <kernelsrc>/tools $(PARALLEL_OPT) $(K_O_OPT) perf" 307 292 $(call clean); \ 308 - (make -C ../../tools perf) > $@ 2>&1 && \ 309 - test -x perf && rm -f $@ || (cat $@ ; false) 293 + (make -C ../../tools $(PARALLEL_OPT) $(K_O_OPT) perf) > $@ 2>&1 && \ 294 + test -x $(KERNEL_O)/tools/perf/perf && rm -f $@ || (cat $@ ; false) 310 295 311 296 all: $(run) $(run_O) tarpkg make_kernelsrc make_kernelsrc_tools 312 297 @echo OK
+2 -2
tools/perf/ui/browsers/annotate.c
··· 755 755 nd = browser->curr_hot; 756 756 break; 757 757 case K_UNTAB: 758 - if (nd != NULL) 758 + if (nd != NULL) { 759 759 nd = rb_next(nd); 760 760 if (nd == NULL) 761 761 nd = rb_first(&browser->entries); 762 - else 762 + } else 763 763 nd = browser->curr_hot; 764 764 break; 765 765 case K_F1:
+2
tools/perf/util/hist.c
··· 131 131 symlen = unresolved_col_width + 4 + 2; 132 132 hists__new_col_len(hists, HISTC_MEM_DADDR_SYMBOL, 133 133 symlen); 134 + hists__new_col_len(hists, HISTC_MEM_DCACHELINE, 135 + symlen); 134 136 } 135 137 136 138 if (h->mem_info->iaddr.sym) {
+1 -1
tools/perf/util/session.c
··· 1149 1149 1150 1150 machine = machines__find(machines, pid); 1151 1151 if (!machine) 1152 - machine = machines__find(machines, DEFAULT_GUEST_KERNEL_ID); 1152 + machine = machines__findnew(machines, DEFAULT_GUEST_KERNEL_ID); 1153 1153 return machine; 1154 1154 } 1155 1155
-1
tools/perf/util/stat.c
··· 310 310 int i, ret; 311 311 312 312 aggr->val = aggr->ena = aggr->run = 0; 313 - init_stats(ps->res_stats); 314 313 315 314 if (counter->per_pkg) 316 315 zero_per_pkg(counter);
+1 -1
tools/perf/util/symbol.c
··· 1466 1466 * Read the build id if possible. This is required for 1467 1467 * DSO_BINARY_TYPE__BUILDID_DEBUGINFO to work 1468 1468 */ 1469 - if (filename__read_build_id(dso->name, build_id, BUILD_ID_SIZE) > 0) 1469 + if (filename__read_build_id(dso->long_name, build_id, BUILD_ID_SIZE) > 0) 1470 1470 dso__set_build_id(dso, build_id); 1471 1471 1472 1472 /*