Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull perf fixes from Thomas Gleixner:
"This is much bigger than typical fixes, but Peter found a category of
races that spurred more fixes and more debugging enhancements. Work
started before the merge window, but got finished only now.

Aside of that this contains the usual small fixes to perf and tools.
Nothing particularly exciting"

* 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (43 commits)
perf: Remove/simplify lockdep annotation
perf: Synchronously clean up child events
perf: Untangle 'owner' confusion
perf: Add flags argument to perf_remove_from_context()
perf: Clean up sync_child_event()
perf: Robustify event->owner usage and SMP ordering
perf: Fix STATE_EXIT usage
perf: Update locking order
perf: Remove __free_event()
perf/bpf: Convert perf_event_array to use struct file
perf: Fix NULL deref
perf/x86: De-obfuscate code
perf/x86: Fix uninitialized value usage
perf: Fix race in perf_event_exit_task_context()
perf: Fix orphan hole
perf stat: Do not clean event's private stats
perf hists: Fix HISTC_MEM_DCACHELINE width setting
perf annotate browser: Fix behaviour of Shift-Tab with nothing focussed
perf tests: Remove wrong semicolon in while loop in CQM test
perf: Synchronously free aux pages in case of allocation failure
...

+752 -673
+3 -4
arch/x86/kernel/cpu/perf_event_intel.c
··· 1960 1960 1961 1961 static int intel_alt_er(int idx, u64 config) 1962 1962 { 1963 - int alt_idx; 1963 + int alt_idx = idx; 1964 + 1964 1965 if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1)) 1965 1966 return idx; 1966 1967 ··· 2898 2897 return; 2899 2898 2900 2899 if (!(x86_pmu.flags & PMU_FL_NO_HT_SHARING)) { 2901 - void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED]; 2902 - 2903 2900 for_each_cpu(i, topology_sibling_cpumask(cpu)) { 2904 2901 struct intel_shared_regs *pc; 2905 2902 2906 2903 pc = per_cpu(cpu_hw_events, i).shared_regs; 2907 2904 if (pc && pc->core_id == core_id) { 2908 - *onln = cpuc->shared_regs; 2905 + cpuc->kfree_on_online[0] = cpuc->shared_regs; 2909 2906 cpuc->shared_regs = pc; 2910 2907 break; 2911 2908 }
+3
arch/x86/kernel/cpu/perf_event_intel_uncore.c
··· 995 995 case 87: /* Knights Landing */ 996 996 ret = knl_uncore_pci_init(); 997 997 break; 998 + case 94: /* SkyLake */ 999 + ret = skl_uncore_pci_init(); 1000 + break; 998 1001 default: 999 1002 return 0; 1000 1003 }
+1
arch/x86/kernel/cpu/perf_event_intel_uncore.h
··· 336 336 int ivb_uncore_pci_init(void); 337 337 int hsw_uncore_pci_init(void); 338 338 int bdw_uncore_pci_init(void); 339 + int skl_uncore_pci_init(void); 339 340 void snb_uncore_cpu_init(void); 340 341 void nhm_uncore_cpu_init(void); 341 342 int snb_pci2phy_map_init(int devid);
+20
arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
··· 8 8 #define PCI_DEVICE_ID_INTEL_HSW_IMC 0x0c00 9 9 #define PCI_DEVICE_ID_INTEL_HSW_U_IMC 0x0a04 10 10 #define PCI_DEVICE_ID_INTEL_BDW_IMC 0x1604 11 + #define PCI_DEVICE_ID_INTEL_SKL_IMC 0x191f 11 12 12 13 /* SNB event control */ 13 14 #define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff ··· 525 524 { /* end: all zeroes */ }, 526 525 }; 527 526 527 + static const struct pci_device_id skl_uncore_pci_ids[] = { 528 + { /* IMC */ 529 + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_IMC), 530 + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), 531 + }, 532 + { /* end: all zeroes */ }, 533 + }; 534 + 528 535 static struct pci_driver snb_uncore_pci_driver = { 529 536 .name = "snb_uncore", 530 537 .id_table = snb_uncore_pci_ids, ··· 553 544 .id_table = bdw_uncore_pci_ids, 554 545 }; 555 546 547 + static struct pci_driver skl_uncore_pci_driver = { 548 + .name = "skl_uncore", 549 + .id_table = skl_uncore_pci_ids, 550 + }; 551 + 556 552 struct imc_uncore_pci_dev { 557 553 __u32 pci_id; 558 554 struct pci_driver *driver; ··· 572 558 IMC_DEV(HSW_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core Processor */ 573 559 IMC_DEV(HSW_U_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core ULT Mobile Processor */ 574 560 IMC_DEV(BDW_IMC, &bdw_uncore_pci_driver), /* 5th Gen Core U */ 561 + IMC_DEV(SKL_IMC, &skl_uncore_pci_driver), /* 6th Gen Core */ 575 562 { /* end marker */ } 576 563 }; 577 564 ··· 621 606 } 622 607 623 608 int bdw_uncore_pci_init(void) 609 + { 610 + return imc_uncore_pci_init(); 611 + } 612 + 613 + int skl_uncore_pci_init(void) 624 614 { 625 615 return imc_uncore_pci_init(); 626 616 }
+3 -6
include/linux/perf_event.h
··· 634 634 int nr_cgroups; /* cgroup evts */ 635 635 void *task_ctx_data; /* pmu specific data */ 636 636 struct rcu_head rcu_head; 637 - 638 - struct delayed_work orphans_remove; 639 - bool orphans_remove_sched; 640 637 }; 641 638 642 639 /* ··· 726 729 extern void perf_event_exit_task(struct task_struct *child); 727 730 extern void perf_event_free_task(struct task_struct *task); 728 731 extern void perf_event_delayed_put(struct task_struct *task); 729 - extern struct perf_event *perf_event_get(unsigned int fd); 732 + extern struct file *perf_event_get(unsigned int fd); 730 733 extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event); 731 734 extern void perf_event_print_debug(void); 732 735 extern void perf_pmu_disable(struct pmu *pmu); ··· 1041 1044 extern u64 perf_swevent_set_period(struct perf_event *event); 1042 1045 extern void perf_event_enable(struct perf_event *event); 1043 1046 extern void perf_event_disable(struct perf_event *event); 1044 - extern int __perf_event_disable(void *info); 1047 + extern void perf_event_disable_local(struct perf_event *event); 1045 1048 extern void perf_event_task_tick(void); 1046 1049 #else /* !CONFIG_PERF_EVENTS: */ 1047 1050 static inline void * ··· 1067 1070 static inline void perf_event_exit_task(struct task_struct *child) { } 1068 1071 static inline void perf_event_free_task(struct task_struct *task) { } 1069 1072 static inline void perf_event_delayed_put(struct task_struct *task) { } 1070 - static inline struct perf_event *perf_event_get(unsigned int fd) { return ERR_PTR(-EINVAL); } 1073 + static inline struct file *perf_event_get(unsigned int fd) { return ERR_PTR(-EINVAL); } 1071 1074 static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *event) 1072 1075 { 1073 1076 return ERR_PTR(-EINVAL);
+11 -10
kernel/bpf/arraymap.c
··· 291 291 { 292 292 struct perf_event *event; 293 293 const struct perf_event_attr *attr; 294 + struct file *file; 294 295 295 - event = perf_event_get(fd); 296 - if (IS_ERR(event)) 297 - return event; 296 + file = perf_event_get(fd); 297 + if (IS_ERR(file)) 298 + return file; 299 + 300 + event = file->private_data; 298 301 299 302 attr = perf_event_attrs(event); 300 303 if (IS_ERR(attr)) ··· 307 304 goto err; 308 305 309 306 if (attr->type == PERF_TYPE_RAW) 310 - return event; 307 + return file; 311 308 312 309 if (attr->type == PERF_TYPE_HARDWARE) 313 - return event; 310 + return file; 314 311 315 312 if (attr->type == PERF_TYPE_SOFTWARE && 316 313 attr->config == PERF_COUNT_SW_BPF_OUTPUT) 317 - return event; 314 + return file; 318 315 err: 319 - perf_event_release_kernel(event); 316 + fput(file); 320 317 return ERR_PTR(-EINVAL); 321 318 } 322 319 323 320 static void perf_event_fd_array_put_ptr(void *ptr) 324 321 { 325 - struct perf_event *event = ptr; 326 - 327 - perf_event_release_kernel(event); 322 + fput((struct file *)ptr); 328 323 } 329 324 330 325 static const struct bpf_map_ops perf_event_array_ops = {
+605 -606
kernel/events/core.c
··· 49 49 50 50 #include <asm/irq_regs.h> 51 51 52 - static struct workqueue_struct *perf_wq; 53 - 54 52 typedef int (*remote_function_f)(void *); 55 53 56 54 struct remote_function_call { ··· 124 126 return data.ret; 125 127 } 126 128 127 - static void event_function_call(struct perf_event *event, 128 - int (*active)(void *), 129 - void (*inactive)(void *), 130 - void *data) 129 + static inline struct perf_cpu_context * 130 + __get_cpu_context(struct perf_event_context *ctx) 131 + { 132 + return this_cpu_ptr(ctx->pmu->pmu_cpu_context); 133 + } 134 + 135 + static void perf_ctx_lock(struct perf_cpu_context *cpuctx, 136 + struct perf_event_context *ctx) 137 + { 138 + raw_spin_lock(&cpuctx->ctx.lock); 139 + if (ctx) 140 + raw_spin_lock(&ctx->lock); 141 + } 142 + 143 + static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, 144 + struct perf_event_context *ctx) 145 + { 146 + if (ctx) 147 + raw_spin_unlock(&ctx->lock); 148 + raw_spin_unlock(&cpuctx->ctx.lock); 149 + } 150 + 151 + #define TASK_TOMBSTONE ((void *)-1L) 152 + 153 + static bool is_kernel_event(struct perf_event *event) 154 + { 155 + return READ_ONCE(event->owner) == TASK_TOMBSTONE; 156 + } 157 + 158 + /* 159 + * On task ctx scheduling... 160 + * 161 + * When !ctx->nr_events a task context will not be scheduled. This means 162 + * we can disable the scheduler hooks (for performance) without leaving 163 + * pending task ctx state. 164 + * 165 + * This however results in two special cases: 166 + * 167 + * - removing the last event from a task ctx; this is relatively straight 168 + * forward and is done in __perf_remove_from_context. 169 + * 170 + * - adding the first event to a task ctx; this is tricky because we cannot 171 + * rely on ctx->is_active and therefore cannot use event_function_call(). 172 + * See perf_install_in_context(). 173 + * 174 + * This is because we need a ctx->lock serialized variable (ctx->is_active) 175 + * to reliably determine if a particular task/context is scheduled in. 
The 176 + * task_curr() use in task_function_call() is racy in that a remote context 177 + * switch is not a single atomic operation. 178 + * 179 + * As is, the situation is 'safe' because we set rq->curr before we do the 180 + * actual context switch. This means that task_curr() will fail early, but 181 + * we'll continue spinning on ctx->is_active until we've passed 182 + * perf_event_task_sched_out(). 183 + * 184 + * Without this ctx->lock serialized variable we could have race where we find 185 + * the task (and hence the context) would not be active while in fact they are. 186 + * 187 + * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set. 188 + */ 189 + 190 + typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *, 191 + struct perf_event_context *, void *); 192 + 193 + struct event_function_struct { 194 + struct perf_event *event; 195 + event_f func; 196 + void *data; 197 + }; 198 + 199 + static int event_function(void *info) 200 + { 201 + struct event_function_struct *efs = info; 202 + struct perf_event *event = efs->event; 203 + struct perf_event_context *ctx = event->ctx; 204 + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 205 + struct perf_event_context *task_ctx = cpuctx->task_ctx; 206 + int ret = 0; 207 + 208 + WARN_ON_ONCE(!irqs_disabled()); 209 + 210 + perf_ctx_lock(cpuctx, task_ctx); 211 + /* 212 + * Since we do the IPI call without holding ctx->lock things can have 213 + * changed, double check we hit the task we set out to hit. 214 + */ 215 + if (ctx->task) { 216 + if (ctx->task != current) { 217 + ret = -EAGAIN; 218 + goto unlock; 219 + } 220 + 221 + /* 222 + * We only use event_function_call() on established contexts, 223 + * and event_function() is only ever called when active (or 224 + * rather, we'll have bailed in task_function_call() or the 225 + * above ctx->task != current test), therefore we must have 226 + * ctx->is_active here. 
227 + */ 228 + WARN_ON_ONCE(!ctx->is_active); 229 + /* 230 + * And since we have ctx->is_active, cpuctx->task_ctx must 231 + * match. 232 + */ 233 + WARN_ON_ONCE(task_ctx != ctx); 234 + } else { 235 + WARN_ON_ONCE(&cpuctx->ctx != ctx); 236 + } 237 + 238 + efs->func(event, cpuctx, ctx, efs->data); 239 + unlock: 240 + perf_ctx_unlock(cpuctx, task_ctx); 241 + 242 + return ret; 243 + } 244 + 245 + static void event_function_local(struct perf_event *event, event_f func, void *data) 246 + { 247 + struct event_function_struct efs = { 248 + .event = event, 249 + .func = func, 250 + .data = data, 251 + }; 252 + 253 + int ret = event_function(&efs); 254 + WARN_ON_ONCE(ret); 255 + } 256 + 257 + static void event_function_call(struct perf_event *event, event_f func, void *data) 131 258 { 132 259 struct perf_event_context *ctx = event->ctx; 133 - struct task_struct *task = ctx->task; 260 + struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */ 261 + struct event_function_struct efs = { 262 + .event = event, 263 + .func = func, 264 + .data = data, 265 + }; 266 + 267 + if (!event->parent) { 268 + /* 269 + * If this is a !child event, we must hold ctx::mutex to 270 + * stabilize the the event->ctx relation. See 271 + * perf_event_ctx_lock(). 272 + */ 273 + lockdep_assert_held(&ctx->mutex); 274 + } 134 275 135 276 if (!task) { 136 - cpu_function_call(event->cpu, active, data); 277 + cpu_function_call(event->cpu, event_function, &efs); 137 278 return; 138 279 } 139 280 140 281 again: 141 - if (!task_function_call(task, active, data)) 282 + if (task == TASK_TOMBSTONE) 283 + return; 284 + 285 + if (!task_function_call(task, event_function, &efs)) 142 286 return; 143 287 144 288 raw_spin_lock_irq(&ctx->lock); 145 - if (ctx->is_active) { 146 - /* 147 - * Reload the task pointer, it might have been changed by 148 - * a concurrent perf_event_context_sched_out(). 
149 - */ 150 - task = ctx->task; 151 - raw_spin_unlock_irq(&ctx->lock); 152 - goto again; 289 + /* 290 + * Reload the task pointer, it might have been changed by 291 + * a concurrent perf_event_context_sched_out(). 292 + */ 293 + task = ctx->task; 294 + if (task != TASK_TOMBSTONE) { 295 + if (ctx->is_active) { 296 + raw_spin_unlock_irq(&ctx->lock); 297 + goto again; 298 + } 299 + func(event, NULL, ctx, data); 153 300 } 154 - inactive(data); 155 301 raw_spin_unlock_irq(&ctx->lock); 156 - } 157 - 158 - #define EVENT_OWNER_KERNEL ((void *) -1) 159 - 160 - static bool is_kernel_event(struct perf_event *event) 161 - { 162 - return event->owner == EVENT_OWNER_KERNEL; 163 302 } 164 303 165 304 #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ ··· 503 368 return event->clock(); 504 369 } 505 370 506 - static inline struct perf_cpu_context * 507 - __get_cpu_context(struct perf_event_context *ctx) 508 - { 509 - return this_cpu_ptr(ctx->pmu->pmu_cpu_context); 510 - } 511 - 512 - static void perf_ctx_lock(struct perf_cpu_context *cpuctx, 513 - struct perf_event_context *ctx) 514 - { 515 - raw_spin_lock(&cpuctx->ctx.lock); 516 - if (ctx) 517 - raw_spin_lock(&ctx->lock); 518 - } 519 - 520 - static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, 521 - struct perf_event_context *ctx) 522 - { 523 - if (ctx) 524 - raw_spin_unlock(&ctx->lock); 525 - raw_spin_unlock(&cpuctx->ctx.lock); 526 - } 527 - 528 371 #ifdef CONFIG_CGROUP_PERF 529 372 530 373 static inline bool ··· 692 579 * we are holding the rcu lock 693 580 */ 694 581 cgrp1 = perf_cgroup_from_task(task, NULL); 695 - 696 - /* 697 - * next is NULL when called from perf_event_enable_on_exec() 698 - * that will systematically cause a cgroup_switch() 699 - */ 700 - if (next) 701 - cgrp2 = perf_cgroup_from_task(next, NULL); 582 + cgrp2 = perf_cgroup_from_task(next, NULL); 702 583 703 584 /* 704 585 * only schedule out current cgroup events if we know ··· 718 611 * we are holding the rcu lock 719 612 */ 720 613 cgrp1 = 
perf_cgroup_from_task(task, NULL); 721 - 722 - /* prev can never be NULL */ 723 614 cgrp2 = perf_cgroup_from_task(prev, NULL); 724 615 725 616 /* ··· 1022 917 if (atomic_dec_and_test(&ctx->refcount)) { 1023 918 if (ctx->parent_ctx) 1024 919 put_ctx(ctx->parent_ctx); 1025 - if (ctx->task) 920 + if (ctx->task && ctx->task != TASK_TOMBSTONE) 1026 921 put_task_struct(ctx->task); 1027 922 call_rcu(&ctx->rcu_head, free_ctx); 1028 923 } ··· 1039 934 * perf_event_context::mutex nests and those are: 1040 935 * 1041 936 * - perf_event_exit_task_context() [ child , 0 ] 1042 - * __perf_event_exit_task() 1043 - * sync_child_event() 1044 - * put_event() [ parent, 1 ] 937 + * perf_event_exit_event() 938 + * put_event() [ parent, 1 ] 1045 939 * 1046 940 * - perf_event_init_context() [ parent, 0 ] 1047 941 * inherit_task_group() ··· 1083 979 * Lock order: 1084 980 * task_struct::perf_event_mutex 1085 981 * perf_event_context::mutex 1086 - * perf_event_context::lock 1087 982 * perf_event::child_mutex; 983 + * perf_event_context::lock 1088 984 * perf_event::mmap_mutex 1089 985 * mmap_sem 1090 986 */ ··· 1182 1078 1183 1079 /* 1184 1080 * Get the perf_event_context for a task and lock it. 1081 + * 1185 1082 * This has to cope with with the fact that until it is locked, 1186 1083 * the context could get moved to another task. 
1187 1084 */ ··· 1223 1118 goto retry; 1224 1119 } 1225 1120 1226 - if (!atomic_inc_not_zero(&ctx->refcount)) { 1121 + if (ctx->task == TASK_TOMBSTONE || 1122 + !atomic_inc_not_zero(&ctx->refcount)) { 1227 1123 raw_spin_unlock(&ctx->lock); 1228 1124 ctx = NULL; 1125 + } else { 1126 + WARN_ON_ONCE(ctx->task != task); 1229 1127 } 1230 1128 } 1231 1129 rcu_read_unlock(); ··· 1354 1246 static void 1355 1247 list_add_event(struct perf_event *event, struct perf_event_context *ctx) 1356 1248 { 1249 + lockdep_assert_held(&ctx->lock); 1250 + 1357 1251 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); 1358 1252 event->attach_state |= PERF_ATTACH_CONTEXT; 1359 1253 ··· 1558 1448 1559 1449 if (is_cgroup_event(event)) { 1560 1450 ctx->nr_cgroups--; 1451 + /* 1452 + * Because cgroup events are always per-cpu events, this will 1453 + * always be called from the right CPU. 1454 + */ 1561 1455 cpuctx = __get_cpu_context(ctx); 1562 1456 /* 1563 - * if there are no more cgroup events 1564 - * then cler cgrp to avoid stale pointer 1565 - * in update_cgrp_time_from_cpuctx() 1457 + * If there are no more cgroup events then clear cgrp to avoid 1458 + * stale pointer in update_cgrp_time_from_cpuctx(). 1566 1459 */ 1567 1460 if (!ctx->nr_cgroups) 1568 1461 cpuctx->cgrp = NULL; ··· 1643 1530 perf_event__header_size(tmp); 1644 1531 } 1645 1532 1646 - /* 1647 - * User event without the task. 1648 - */ 1649 1533 static bool is_orphaned_event(struct perf_event *event) 1650 1534 { 1651 - return event && !is_kernel_event(event) && !event->owner; 1535 + return event->state == PERF_EVENT_STATE_EXIT; 1652 1536 } 1653 - 1654 - /* 1655 - * Event has a parent but parent's task finished and it's 1656 - * alive only because of children holding refference. 
1657 - */ 1658 - static bool is_orphaned_child(struct perf_event *event) 1659 - { 1660 - return is_orphaned_event(event->parent); 1661 - } 1662 - 1663 - static void orphans_remove_work(struct work_struct *work); 1664 - 1665 - static void schedule_orphans_remove(struct perf_event_context *ctx) 1666 - { 1667 - if (!ctx->task || ctx->orphans_remove_sched || !perf_wq) 1668 - return; 1669 - 1670 - if (queue_delayed_work(perf_wq, &ctx->orphans_remove, 1)) { 1671 - get_ctx(ctx); 1672 - ctx->orphans_remove_sched = true; 1673 - } 1674 - } 1675 - 1676 - static int __init perf_workqueue_init(void) 1677 - { 1678 - perf_wq = create_singlethread_workqueue("perf"); 1679 - WARN(!perf_wq, "failed to create perf workqueue\n"); 1680 - return perf_wq ? 0 : -1; 1681 - } 1682 - 1683 - core_initcall(perf_workqueue_init); 1684 1537 1685 1538 static inline int pmu_filter_match(struct perf_event *event) 1686 1539 { ··· 1708 1629 if (event->attr.exclusive || !cpuctx->active_oncpu) 1709 1630 cpuctx->exclusive = 0; 1710 1631 1711 - if (is_orphaned_child(event)) 1712 - schedule_orphans_remove(ctx); 1713 - 1714 1632 perf_pmu_enable(event->pmu); 1715 1633 } 1716 1634 ··· 1731 1655 cpuctx->exclusive = 0; 1732 1656 } 1733 1657 1734 - struct remove_event { 1735 - struct perf_event *event; 1736 - bool detach_group; 1737 - }; 1738 - 1739 - static void ___perf_remove_from_context(void *info) 1740 - { 1741 - struct remove_event *re = info; 1742 - struct perf_event *event = re->event; 1743 - struct perf_event_context *ctx = event->ctx; 1744 - 1745 - if (re->detach_group) 1746 - perf_group_detach(event); 1747 - list_del_event(event, ctx); 1748 - } 1658 + #define DETACH_GROUP 0x01UL 1659 + #define DETACH_STATE 0x02UL 1749 1660 1750 1661 /* 1751 1662 * Cross CPU call to remove a performance event ··· 1740 1677 * We disable the event on the hardware level first. After that we 1741 1678 * remove it from the context list. 
1742 1679 */ 1743 - static int __perf_remove_from_context(void *info) 1680 + static void 1681 + __perf_remove_from_context(struct perf_event *event, 1682 + struct perf_cpu_context *cpuctx, 1683 + struct perf_event_context *ctx, 1684 + void *info) 1744 1685 { 1745 - struct remove_event *re = info; 1746 - struct perf_event *event = re->event; 1747 - struct perf_event_context *ctx = event->ctx; 1748 - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 1686 + unsigned long flags = (unsigned long)info; 1749 1687 1750 - raw_spin_lock(&ctx->lock); 1751 1688 event_sched_out(event, cpuctx, ctx); 1752 - if (re->detach_group) 1689 + if (flags & DETACH_GROUP) 1753 1690 perf_group_detach(event); 1754 1691 list_del_event(event, ctx); 1755 - if (!ctx->nr_events && cpuctx->task_ctx == ctx) { 1756 - ctx->is_active = 0; 1757 - cpuctx->task_ctx = NULL; 1758 - } 1759 - raw_spin_unlock(&ctx->lock); 1692 + if (flags & DETACH_STATE) 1693 + event->state = PERF_EVENT_STATE_EXIT; 1760 1694 1761 - return 0; 1695 + if (!ctx->nr_events && ctx->is_active) { 1696 + ctx->is_active = 0; 1697 + if (ctx->task) { 1698 + WARN_ON_ONCE(cpuctx->task_ctx != ctx); 1699 + cpuctx->task_ctx = NULL; 1700 + } 1701 + } 1762 1702 } 1763 1703 1764 1704 /* 1765 1705 * Remove the event from a task's (or a CPU's) list of events. 1766 - * 1767 - * CPU events are removed with a smp call. For task events we only 1768 - * call when the task is on a CPU. 1769 1706 * 1770 1707 * If event->ctx is a cloned context, callers must make sure that 1771 1708 * every task struct that event->ctx->task could possibly point to ··· 1774 1711 * When called from perf_event_exit_task, it's OK because the 1775 1712 * context has been detached from its task. 
1776 1713 */ 1777 - static void perf_remove_from_context(struct perf_event *event, bool detach_group) 1714 + static void perf_remove_from_context(struct perf_event *event, unsigned long flags) 1778 1715 { 1779 - struct perf_event_context *ctx = event->ctx; 1780 - struct remove_event re = { 1781 - .event = event, 1782 - .detach_group = detach_group, 1783 - }; 1716 + lockdep_assert_held(&event->ctx->mutex); 1784 1717 1785 - lockdep_assert_held(&ctx->mutex); 1786 - 1787 - event_function_call(event, __perf_remove_from_context, 1788 - ___perf_remove_from_context, &re); 1718 + event_function_call(event, __perf_remove_from_context, (void *)flags); 1789 1719 } 1790 1720 1791 1721 /* 1792 1722 * Cross CPU call to disable a performance event 1793 1723 */ 1794 - int __perf_event_disable(void *info) 1724 + static void __perf_event_disable(struct perf_event *event, 1725 + struct perf_cpu_context *cpuctx, 1726 + struct perf_event_context *ctx, 1727 + void *info) 1795 1728 { 1796 - struct perf_event *event = info; 1797 - struct perf_event_context *ctx = event->ctx; 1798 - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 1729 + if (event->state < PERF_EVENT_STATE_INACTIVE) 1730 + return; 1799 1731 1800 - /* 1801 - * If this is a per-task event, need to check whether this 1802 - * event's task is the current task on this cpu. 1803 - * 1804 - * Can trigger due to concurrent perf_event_context_sched_out() 1805 - * flipping contexts around. 1806 - */ 1807 - if (ctx->task && cpuctx->task_ctx != ctx) 1808 - return -EINVAL; 1809 - 1810 - raw_spin_lock(&ctx->lock); 1811 - 1812 - /* 1813 - * If the event is on, turn it off. 1814 - * If it is in error state, leave it in error state. 
1815 - */ 1816 - if (event->state >= PERF_EVENT_STATE_INACTIVE) { 1817 - update_context_time(ctx); 1818 - update_cgrp_time_from_event(event); 1819 - update_group_times(event); 1820 - if (event == event->group_leader) 1821 - group_sched_out(event, cpuctx, ctx); 1822 - else 1823 - event_sched_out(event, cpuctx, ctx); 1824 - event->state = PERF_EVENT_STATE_OFF; 1825 - } 1826 - 1827 - raw_spin_unlock(&ctx->lock); 1828 - 1829 - return 0; 1830 - } 1831 - 1832 - void ___perf_event_disable(void *info) 1833 - { 1834 - struct perf_event *event = info; 1835 - 1836 - /* 1837 - * Since we have the lock this context can't be scheduled 1838 - * in, so we can change the state safely. 1839 - */ 1840 - if (event->state == PERF_EVENT_STATE_INACTIVE) { 1841 - update_group_times(event); 1842 - event->state = PERF_EVENT_STATE_OFF; 1843 - } 1732 + update_context_time(ctx); 1733 + update_cgrp_time_from_event(event); 1734 + update_group_times(event); 1735 + if (event == event->group_leader) 1736 + group_sched_out(event, cpuctx, ctx); 1737 + else 1738 + event_sched_out(event, cpuctx, ctx); 1739 + event->state = PERF_EVENT_STATE_OFF; 1844 1740 } 1845 1741 1846 1742 /* ··· 1810 1788 * remains valid. This condition is satisifed when called through 1811 1789 * perf_event_for_each_child or perf_event_for_each because they 1812 1790 * hold the top-level event's child_mutex, so any descendant that 1813 - * goes to exit will block in sync_child_event. 1791 + * goes to exit will block in perf_event_exit_event(). 1792 + * 1814 1793 * When called from perf_pending_event it's OK because event->ctx 1815 1794 * is the current context on this CPU and preemption is disabled, 1816 1795 * hence we can't get into perf_event_task_sched_out for this context. 
··· 1827 1804 } 1828 1805 raw_spin_unlock_irq(&ctx->lock); 1829 1806 1830 - event_function_call(event, __perf_event_disable, 1831 - ___perf_event_disable, event); 1807 + event_function_call(event, __perf_event_disable, NULL); 1808 + } 1809 + 1810 + void perf_event_disable_local(struct perf_event *event) 1811 + { 1812 + event_function_local(event, __perf_event_disable, NULL); 1832 1813 } 1833 1814 1834 1815 /* ··· 1944 1917 1945 1918 if (event->attr.exclusive) 1946 1919 cpuctx->exclusive = 1; 1947 - 1948 - if (is_orphaned_child(event)) 1949 - schedule_orphans_remove(ctx); 1950 1920 1951 1921 out: 1952 1922 perf_pmu_enable(event->pmu); ··· 2063 2039 event->tstamp_stopped = tstamp; 2064 2040 } 2065 2041 2066 - static void task_ctx_sched_out(struct perf_event_context *ctx); 2042 + static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, 2043 + struct perf_event_context *ctx); 2067 2044 static void 2068 2045 ctx_sched_in(struct perf_event_context *ctx, 2069 2046 struct perf_cpu_context *cpuctx, ··· 2083 2058 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); 2084 2059 } 2085 2060 2086 - static void ___perf_install_in_context(void *info) 2061 + static void ctx_resched(struct perf_cpu_context *cpuctx, 2062 + struct perf_event_context *task_ctx) 2087 2063 { 2088 - struct perf_event *event = info; 2089 - struct perf_event_context *ctx = event->ctx; 2090 - 2091 - /* 2092 - * Since the task isn't running, its safe to add the event, us holding 2093 - * the ctx->lock ensures the task won't get scheduled in. 
2094 - */ 2095 - add_event_to_ctx(event, ctx); 2064 + perf_pmu_disable(cpuctx->ctx.pmu); 2065 + if (task_ctx) 2066 + task_ctx_sched_out(cpuctx, task_ctx); 2067 + cpu_ctx_sched_out(cpuctx, EVENT_ALL); 2068 + perf_event_sched_in(cpuctx, task_ctx, current); 2069 + perf_pmu_enable(cpuctx->ctx.pmu); 2096 2070 } 2097 2071 2098 2072 /* ··· 2101 2077 */ 2102 2078 static int __perf_install_in_context(void *info) 2103 2079 { 2104 - struct perf_event *event = info; 2105 - struct perf_event_context *ctx = event->ctx; 2080 + struct perf_event_context *ctx = info; 2106 2081 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 2107 2082 struct perf_event_context *task_ctx = cpuctx->task_ctx; 2108 - struct task_struct *task = current; 2109 2083 2110 - perf_ctx_lock(cpuctx, task_ctx); 2111 - perf_pmu_disable(cpuctx->ctx.pmu); 2112 - 2113 - /* 2114 - * If there was an active task_ctx schedule it out. 2115 - */ 2116 - if (task_ctx) 2117 - task_ctx_sched_out(task_ctx); 2118 - 2119 - /* 2120 - * If the context we're installing events in is not the 2121 - * active task_ctx, flip them. 2122 - */ 2123 - if (ctx->task && task_ctx != ctx) { 2124 - if (task_ctx) 2125 - raw_spin_unlock(&task_ctx->lock); 2084 + raw_spin_lock(&cpuctx->ctx.lock); 2085 + if (ctx->task) { 2126 2086 raw_spin_lock(&ctx->lock); 2087 + /* 2088 + * If we hit the 'wrong' task, we've since scheduled and 2089 + * everything should be sorted, nothing to do! 2090 + */ 2127 2091 task_ctx = ctx; 2092 + if (ctx->task != current) 2093 + goto unlock; 2094 + 2095 + /* 2096 + * If task_ctx is set, it had better be to us. 
2097 + */ 2098 + WARN_ON_ONCE(cpuctx->task_ctx != ctx && cpuctx->task_ctx); 2099 + } else if (task_ctx) { 2100 + raw_spin_lock(&task_ctx->lock); 2128 2101 } 2129 2102 2130 - if (task_ctx) { 2131 - cpuctx->task_ctx = task_ctx; 2132 - task = task_ctx->task; 2133 - } 2134 - 2135 - cpu_ctx_sched_out(cpuctx, EVENT_ALL); 2136 - 2137 - update_context_time(ctx); 2138 - /* 2139 - * update cgrp time only if current cgrp 2140 - * matches event->cgrp. Must be done before 2141 - * calling add_event_to_ctx() 2142 - */ 2143 - update_cgrp_time_from_event(event); 2144 - 2145 - add_event_to_ctx(event, ctx); 2146 - 2147 - /* 2148 - * Schedule everything back in 2149 - */ 2150 - perf_event_sched_in(cpuctx, task_ctx, task); 2151 - 2152 - perf_pmu_enable(cpuctx->ctx.pmu); 2103 + ctx_resched(cpuctx, task_ctx); 2104 + unlock: 2153 2105 perf_ctx_unlock(cpuctx, task_ctx); 2154 2106 2155 2107 return 0; ··· 2133 2133 2134 2134 /* 2135 2135 * Attach a performance event to a context 2136 - * 2137 - * First we add the event to the list with the hardware enable bit 2138 - * in event->hw_config cleared. 2139 - * 2140 - * If the event is attached to a task which is on a CPU we use a smp 2141 - * call to enable it in the task context. The task might have been 2142 - * scheduled away, but we check this in the smp call again. 2143 2136 */ 2144 2137 static void 2145 2138 perf_install_in_context(struct perf_event_context *ctx, 2146 2139 struct perf_event *event, 2147 2140 int cpu) 2148 2141 { 2142 + struct task_struct *task = NULL; 2143 + 2149 2144 lockdep_assert_held(&ctx->mutex); 2150 2145 2151 2146 event->ctx = ctx; 2152 2147 if (event->cpu != -1) 2153 2148 event->cpu = cpu; 2154 2149 2155 - event_function_call(event, __perf_install_in_context, 2156 - ___perf_install_in_context, event); 2150 + /* 2151 + * Installing events is tricky because we cannot rely on ctx->is_active 2152 + * to be set in case this is the nr_events 0 -> 1 transition. 
2153 + * 2154 + * So what we do is we add the event to the list here, which will allow 2155 + * a future context switch to DTRT and then send a racy IPI. If the IPI 2156 + * fails to hit the right task, this means a context switch must have 2157 + * happened and that will have taken care of business. 2158 + */ 2159 + raw_spin_lock_irq(&ctx->lock); 2160 + task = ctx->task; 2161 + /* 2162 + * Worse, we cannot even rely on the ctx actually existing anymore. If 2163 + * between find_get_context() and perf_install_in_context() the task 2164 + * went through perf_event_exit_task() its dead and we should not be 2165 + * adding new events. 2166 + */ 2167 + if (task == TASK_TOMBSTONE) { 2168 + raw_spin_unlock_irq(&ctx->lock); 2169 + return; 2170 + } 2171 + update_context_time(ctx); 2172 + /* 2173 + * Update cgrp time only if current cgrp matches event->cgrp. 2174 + * Must be done before calling add_event_to_ctx(). 2175 + */ 2176 + update_cgrp_time_from_event(event); 2177 + add_event_to_ctx(event, ctx); 2178 + raw_spin_unlock_irq(&ctx->lock); 2179 + 2180 + if (task) 2181 + task_function_call(task, __perf_install_in_context, ctx); 2182 + else 2183 + cpu_function_call(cpu, __perf_install_in_context, ctx); 2157 2184 } 2158 2185 2159 2186 /* ··· 2207 2180 /* 2208 2181 * Cross CPU call to enable a performance event 2209 2182 */ 2210 - static int __perf_event_enable(void *info) 2183 + static void __perf_event_enable(struct perf_event *event, 2184 + struct perf_cpu_context *cpuctx, 2185 + struct perf_event_context *ctx, 2186 + void *info) 2211 2187 { 2212 - struct perf_event *event = info; 2213 - struct perf_event_context *ctx = event->ctx; 2214 2188 struct perf_event *leader = event->group_leader; 2215 - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 2216 - int err; 2189 + struct perf_event_context *task_ctx; 2217 2190 2218 - /* 2219 - * There's a time window between 'ctx->is_active' check 2220 - * in perf_event_enable function and this place having: 2221 - * - IRQs on 
2222 - * - ctx->lock unlocked 2223 - * 2224 - * where the task could be killed and 'ctx' deactivated 2225 - * by perf_event_exit_task. 2226 - */ 2227 - if (!ctx->is_active) 2228 - return -EINVAL; 2191 + if (event->state >= PERF_EVENT_STATE_INACTIVE || 2192 + event->state <= PERF_EVENT_STATE_ERROR) 2193 + return; 2229 2194 2230 - raw_spin_lock(&ctx->lock); 2231 2195 update_context_time(ctx); 2232 - 2233 - if (event->state >= PERF_EVENT_STATE_INACTIVE) 2234 - goto unlock; 2235 - 2236 - /* 2237 - * set current task's cgroup time reference point 2238 - */ 2239 - perf_cgroup_set_timestamp(current, ctx); 2240 - 2241 2196 __perf_event_mark_enabled(event); 2242 2197 2198 + if (!ctx->is_active) 2199 + return; 2200 + 2243 2201 if (!event_filter_match(event)) { 2244 - if (is_cgroup_event(event)) 2202 + if (is_cgroup_event(event)) { 2203 + perf_cgroup_set_timestamp(current, ctx); // XXX ? 2245 2204 perf_cgroup_defer_enabled(event); 2246 - goto unlock; 2205 + } 2206 + return; 2247 2207 } 2248 2208 2249 2209 /* ··· 2238 2224 * then don't put it on unless the group is on. 2239 2225 */ 2240 2226 if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) 2241 - goto unlock; 2227 + return; 2242 2228 2243 - if (!group_can_go_on(event, cpuctx, 1)) { 2244 - err = -EEXIST; 2245 - } else { 2246 - if (event == leader) 2247 - err = group_sched_in(event, cpuctx, ctx); 2248 - else 2249 - err = event_sched_in(event, cpuctx, ctx); 2250 - } 2229 + task_ctx = cpuctx->task_ctx; 2230 + if (ctx->task) 2231 + WARN_ON_ONCE(task_ctx != ctx); 2251 2232 2252 - if (err) { 2253 - /* 2254 - * If this event can't go on and it's part of a 2255 - * group, then the whole group has to come off. 
2256 - */ 2257 - if (leader != event) { 2258 - group_sched_out(leader, cpuctx, ctx); 2259 - perf_mux_hrtimer_restart(cpuctx); 2260 - } 2261 - if (leader->attr.pinned) { 2262 - update_group_times(leader); 2263 - leader->state = PERF_EVENT_STATE_ERROR; 2264 - } 2265 - } 2266 - 2267 - unlock: 2268 - raw_spin_unlock(&ctx->lock); 2269 - 2270 - return 0; 2271 - } 2272 - 2273 - void ___perf_event_enable(void *info) 2274 - { 2275 - __perf_event_mark_enabled((struct perf_event *)info); 2233 + ctx_resched(cpuctx, task_ctx); 2276 2234 } 2277 2235 2278 2236 /* ··· 2261 2275 struct perf_event_context *ctx = event->ctx; 2262 2276 2263 2277 raw_spin_lock_irq(&ctx->lock); 2264 - if (event->state >= PERF_EVENT_STATE_INACTIVE) { 2278 + if (event->state >= PERF_EVENT_STATE_INACTIVE || 2279 + event->state < PERF_EVENT_STATE_ERROR) { 2265 2280 raw_spin_unlock_irq(&ctx->lock); 2266 2281 return; 2267 2282 } ··· 2278 2291 event->state = PERF_EVENT_STATE_OFF; 2279 2292 raw_spin_unlock_irq(&ctx->lock); 2280 2293 2281 - event_function_call(event, __perf_event_enable, 2282 - ___perf_event_enable, event); 2294 + event_function_call(event, __perf_event_enable, NULL); 2283 2295 } 2284 2296 2285 2297 /* ··· 2328 2342 struct perf_cpu_context *cpuctx, 2329 2343 enum event_type_t event_type) 2330 2344 { 2331 - struct perf_event *event; 2332 2345 int is_active = ctx->is_active; 2346 + struct perf_event *event; 2347 + 2348 + lockdep_assert_held(&ctx->lock); 2349 + 2350 + if (likely(!ctx->nr_events)) { 2351 + /* 2352 + * See __perf_remove_from_context(). 
2353 + */ 2354 + WARN_ON_ONCE(ctx->is_active); 2355 + if (ctx->task) 2356 + WARN_ON_ONCE(cpuctx->task_ctx); 2357 + return; 2358 + } 2333 2359 2334 2360 ctx->is_active &= ~event_type; 2335 - if (likely(!ctx->nr_events)) 2336 - return; 2361 + if (ctx->task) { 2362 + WARN_ON_ONCE(cpuctx->task_ctx != ctx); 2363 + if (!ctx->is_active) 2364 + cpuctx->task_ctx = NULL; 2365 + } 2337 2366 2338 2367 update_context_time(ctx); 2339 2368 update_cgrp_time_from_cpuctx(cpuctx); ··· 2519 2518 raw_spin_lock(&ctx->lock); 2520 2519 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); 2521 2520 if (context_equiv(ctx, next_ctx)) { 2522 - /* 2523 - * XXX do we need a memory barrier of sorts 2524 - * wrt to rcu_dereference() of perf_event_ctxp 2525 - */ 2526 - task->perf_event_ctxp[ctxn] = next_ctx; 2527 - next->perf_event_ctxp[ctxn] = ctx; 2528 - ctx->task = next; 2529 - next_ctx->task = task; 2521 + WRITE_ONCE(ctx->task, next); 2522 + WRITE_ONCE(next_ctx->task, task); 2530 2523 2531 2524 swap(ctx->task_ctx_data, next_ctx->task_ctx_data); 2525 + 2526 + /* 2527 + * RCU_INIT_POINTER here is safe because we've not 2528 + * modified the ctx and the above modification of 2529 + * ctx->task and ctx->task_ctx_data are immaterial 2530 + * since those values are always verified under 2531 + * ctx->lock which we're now holding. 
2532 + */ 2533 + RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx); 2534 + RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx); 2532 2535 2533 2536 do_switch = 0; 2534 2537 ··· 2546 2541 2547 2542 if (do_switch) { 2548 2543 raw_spin_lock(&ctx->lock); 2549 - ctx_sched_out(ctx, cpuctx, EVENT_ALL); 2550 - cpuctx->task_ctx = NULL; 2544 + task_ctx_sched_out(cpuctx, ctx); 2551 2545 raw_spin_unlock(&ctx->lock); 2552 2546 } 2553 2547 } ··· 2641 2637 perf_cgroup_sched_out(task, next); 2642 2638 } 2643 2639 2644 - static void task_ctx_sched_out(struct perf_event_context *ctx) 2640 + static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, 2641 + struct perf_event_context *ctx) 2645 2642 { 2646 - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 2647 - 2648 2643 if (!cpuctx->task_ctx) 2649 2644 return; 2650 2645 ··· 2651 2648 return; 2652 2649 2653 2650 ctx_sched_out(ctx, cpuctx, EVENT_ALL); 2654 - cpuctx->task_ctx = NULL; 2655 2651 } 2656 2652 2657 2653 /* ··· 2727 2725 enum event_type_t event_type, 2728 2726 struct task_struct *task) 2729 2727 { 2730 - u64 now; 2731 2728 int is_active = ctx->is_active; 2729 + u64 now; 2732 2730 2733 - ctx->is_active |= event_type; 2731 + lockdep_assert_held(&ctx->lock); 2732 + 2734 2733 if (likely(!ctx->nr_events)) 2735 2734 return; 2735 + 2736 + ctx->is_active |= event_type; 2737 + if (ctx->task) { 2738 + if (!is_active) 2739 + cpuctx->task_ctx = ctx; 2740 + else 2741 + WARN_ON_ONCE(cpuctx->task_ctx != ctx); 2742 + } 2736 2743 2737 2744 now = perf_clock(); 2738 2745 ctx->timestamp = now; ··· 2784 2773 * cpu flexible, task flexible. 
2785 2774 */ 2786 2775 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 2787 - 2788 - if (ctx->nr_events) 2789 - cpuctx->task_ctx = ctx; 2790 - 2791 - perf_event_sched_in(cpuctx, cpuctx->task_ctx, task); 2792 - 2776 + perf_event_sched_in(cpuctx, ctx, task); 2793 2777 perf_pmu_enable(ctx->pmu); 2794 2778 perf_ctx_unlock(cpuctx, ctx); 2795 2779 } ··· 2806 2800 struct perf_event_context *ctx; 2807 2801 int ctxn; 2808 2802 2803 + /* 2804 + * If cgroup events exist on this CPU, then we need to check if we have 2805 + * to switch in PMU state; cgroup event are system-wide mode only. 2806 + * 2807 + * Since cgroup events are CPU events, we must schedule these in before 2808 + * we schedule in the task events. 2809 + */ 2810 + if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) 2811 + perf_cgroup_sched_in(prev, task); 2812 + 2809 2813 for_each_task_context_nr(ctxn) { 2810 2814 ctx = task->perf_event_ctxp[ctxn]; 2811 2815 if (likely(!ctx)) ··· 2823 2807 2824 2808 perf_event_context_sched_in(ctx, task); 2825 2809 } 2826 - /* 2827 - * if cgroup events exist on this CPU, then we need 2828 - * to check if we have to switch in PMU state. 2829 - * cgroup event are system-wide mode only 2830 - */ 2831 - if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) 2832 - perf_cgroup_sched_in(prev, task); 2833 2810 2834 2811 if (atomic_read(&nr_switch_events)) 2835 2812 perf_event_switch(task, prev, true); ··· 3108 3099 static void perf_event_enable_on_exec(int ctxn) 3109 3100 { 3110 3101 struct perf_event_context *ctx, *clone_ctx = NULL; 3102 + struct perf_cpu_context *cpuctx; 3111 3103 struct perf_event *event; 3112 3104 unsigned long flags; 3113 3105 int enabled = 0; 3114 - int ret; 3115 3106 3116 3107 local_irq_save(flags); 3117 3108 ctx = current->perf_event_ctxp[ctxn]; 3118 3109 if (!ctx || !ctx->nr_events) 3119 3110 goto out; 3120 3111 3121 - /* 3122 - * We must ctxsw out cgroup events to avoid conflict 3123 - * when invoking perf_task_event_sched_in() later on 3124 - * in this function. 
Otherwise we end up trying to 3125 - * ctxswin cgroup events which are already scheduled 3126 - * in. 3127 - */ 3128 - perf_cgroup_sched_out(current, NULL); 3129 - 3130 - raw_spin_lock(&ctx->lock); 3131 - task_ctx_sched_out(ctx); 3132 - 3133 - list_for_each_entry(event, &ctx->event_list, event_entry) { 3134 - ret = event_enable_on_exec(event, ctx); 3135 - if (ret) 3136 - enabled = 1; 3137 - } 3112 + cpuctx = __get_cpu_context(ctx); 3113 + perf_ctx_lock(cpuctx, ctx); 3114 + list_for_each_entry(event, &ctx->event_list, event_entry) 3115 + enabled |= event_enable_on_exec(event, ctx); 3138 3116 3139 3117 /* 3140 - * Unclone this context if we enabled any event. 3118 + * Unclone and reschedule this context if we enabled any event. 3141 3119 */ 3142 - if (enabled) 3120 + if (enabled) { 3143 3121 clone_ctx = unclone_ctx(ctx); 3122 + ctx_resched(cpuctx, ctx); 3123 + } 3124 + perf_ctx_unlock(cpuctx, ctx); 3144 3125 3145 - raw_spin_unlock(&ctx->lock); 3146 - 3147 - /* 3148 - * Also calls ctxswin for cgroup events, if any: 3149 - */ 3150 - perf_event_context_sched_in(ctx, ctx->task); 3151 3126 out: 3152 3127 local_irq_restore(flags); 3153 3128 ··· 3327 3334 INIT_LIST_HEAD(&ctx->flexible_groups); 3328 3335 INIT_LIST_HEAD(&ctx->event_list); 3329 3336 atomic_set(&ctx->refcount, 1); 3330 - INIT_DELAYED_WORK(&ctx->orphans_remove, orphans_remove_work); 3331 3337 } 3332 3338 3333 3339 static struct perf_event_context * ··· 3513 3521 3514 3522 static void unaccount_event(struct perf_event *event) 3515 3523 { 3524 + bool dec = false; 3525 + 3516 3526 if (event->parent) 3517 3527 return; 3518 3528 3519 3529 if (event->attach_state & PERF_ATTACH_TASK) 3520 - static_key_slow_dec_deferred(&perf_sched_events); 3530 + dec = true; 3521 3531 if (event->attr.mmap || event->attr.mmap_data) 3522 3532 atomic_dec(&nr_mmap_events); 3523 3533 if (event->attr.comm) ··· 3529 3535 if (event->attr.freq) 3530 3536 atomic_dec(&nr_freq_events); 3531 3537 if (event->attr.context_switch) { 3532 - 
static_key_slow_dec_deferred(&perf_sched_events); 3538 + dec = true; 3533 3539 atomic_dec(&nr_switch_events); 3534 3540 } 3535 3541 if (is_cgroup_event(event)) 3536 - static_key_slow_dec_deferred(&perf_sched_events); 3542 + dec = true; 3537 3543 if (has_branch_stack(event)) 3544 + dec = true; 3545 + 3546 + if (dec) 3538 3547 static_key_slow_dec_deferred(&perf_sched_events); 3539 3548 3540 3549 unaccount_event_cpu(event, event->cpu); ··· 3553 3556 * 3) two matching events on the same context. 3554 3557 * 3555 3558 * The former two cases are handled in the allocation path (perf_event_alloc(), 3556 - * __free_event()), the latter -- before the first perf_install_in_context(). 3559 + * _free_event()), the latter -- before the first perf_install_in_context(). 3557 3560 */ 3558 3561 static int exclusive_event_init(struct perf_event *event) 3559 3562 { ··· 3628 3631 return true; 3629 3632 } 3630 3633 3631 - static void __free_event(struct perf_event *event) 3632 - { 3633 - if (!event->parent) { 3634 - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) 3635 - put_callchain_buffers(); 3636 - } 3637 - 3638 - perf_event_free_bpf_prog(event); 3639 - 3640 - if (event->destroy) 3641 - event->destroy(event); 3642 - 3643 - if (event->ctx) 3644 - put_ctx(event->ctx); 3645 - 3646 - if (event->pmu) { 3647 - exclusive_event_destroy(event); 3648 - module_put(event->pmu->module); 3649 - } 3650 - 3651 - call_rcu(&event->rcu_head, free_event_rcu); 3652 - } 3653 - 3654 3634 static void _free_event(struct perf_event *event) 3655 3635 { 3656 3636 irq_work_sync(&event->pending); ··· 3649 3675 if (is_cgroup_event(event)) 3650 3676 perf_detach_cgroup(event); 3651 3677 3652 - __free_event(event); 3678 + if (!event->parent) { 3679 + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) 3680 + put_callchain_buffers(); 3681 + } 3682 + 3683 + perf_event_free_bpf_prog(event); 3684 + 3685 + if (event->destroy) 3686 + event->destroy(event); 3687 + 3688 + if (event->ctx) 3689 + put_ctx(event->ctx); 
3690 + 3691 + if (event->pmu) { 3692 + exclusive_event_destroy(event); 3693 + module_put(event->pmu->module); 3694 + } 3695 + 3696 + call_rcu(&event->rcu_head, free_event_rcu); 3653 3697 } 3654 3698 3655 3699 /* ··· 3694 3702 struct task_struct *owner; 3695 3703 3696 3704 rcu_read_lock(); 3697 - owner = ACCESS_ONCE(event->owner); 3698 3705 /* 3699 - * Matches the smp_wmb() in perf_event_exit_task(). If we observe 3700 - * !owner it means the list deletion is complete and we can indeed 3701 - * free this event, otherwise we need to serialize on 3706 + * Matches the smp_store_release() in perf_event_exit_task(). If we 3707 + * observe !owner it means the list deletion is complete and we can 3708 + * indeed free this event, otherwise we need to serialize on 3702 3709 * owner->perf_event_mutex. 3703 3710 */ 3704 - smp_read_barrier_depends(); 3711 + owner = lockless_dereference(event->owner); 3705 3712 if (owner) { 3706 3713 /* 3707 3714 * Since delayed_put_task_struct() also drops the last ··· 3728 3737 * ensured they're done, and we can proceed with freeing the 3729 3738 * event. 3730 3739 */ 3731 - if (event->owner) 3740 + if (event->owner) { 3732 3741 list_del_init(&event->owner_entry); 3742 + smp_store_release(&event->owner, NULL); 3743 + } 3733 3744 mutex_unlock(&owner->perf_event_mutex); 3734 3745 put_task_struct(owner); 3735 3746 } ··· 3739 3746 3740 3747 static void put_event(struct perf_event *event) 3741 3748 { 3742 - struct perf_event_context *ctx; 3743 - 3744 3749 if (!atomic_long_dec_and_test(&event->refcount)) 3745 3750 return; 3746 - 3747 - if (!is_kernel_event(event)) 3748 - perf_remove_from_owner(event); 3749 - 3750 - /* 3751 - * There are two ways this annotation is useful: 3752 - * 3753 - * 1) there is a lock recursion from perf_event_exit_task 3754 - * see the comment there. 
3755 - * 3756 - * 2) there is a lock-inversion with mmap_sem through 3757 - * perf_read_group(), which takes faults while 3758 - * holding ctx->mutex, however this is called after 3759 - * the last filedesc died, so there is no possibility 3760 - * to trigger the AB-BA case. 3761 - */ 3762 - ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING); 3763 - WARN_ON_ONCE(ctx->parent_ctx); 3764 - perf_remove_from_context(event, true); 3765 - perf_event_ctx_unlock(event, ctx); 3766 3751 3767 3752 _free_event(event); 3768 3753 } 3769 3754 3755 + /* 3756 + * Kill an event dead; while event:refcount will preserve the event 3757 + * object, it will not preserve its functionality. Once the last 'user' 3758 + * gives up the object, we'll destroy the thing. 3759 + */ 3770 3760 int perf_event_release_kernel(struct perf_event *event) 3771 3761 { 3762 + struct perf_event_context *ctx; 3763 + struct perf_event *child, *tmp; 3764 + 3765 + if (!is_kernel_event(event)) 3766 + perf_remove_from_owner(event); 3767 + 3768 + ctx = perf_event_ctx_lock(event); 3769 + WARN_ON_ONCE(ctx->parent_ctx); 3770 + perf_remove_from_context(event, DETACH_GROUP | DETACH_STATE); 3771 + perf_event_ctx_unlock(event, ctx); 3772 + 3773 + /* 3774 + * At this point we must have event->state == PERF_EVENT_STATE_EXIT, 3775 + * either from the above perf_remove_from_context() or through 3776 + * perf_event_exit_event(). 3777 + * 3778 + * Therefore, anybody acquiring event->child_mutex after the below 3779 + * loop _must_ also see this, most importantly inherit_event() which 3780 + * will avoid placing more children on the list. 3781 + * 3782 + * Thus this guarantees that we will in fact observe and kill _ALL_ 3783 + * child events. 
3784 + */ 3785 + WARN_ON_ONCE(event->state != PERF_EVENT_STATE_EXIT); 3786 + 3787 + again: 3788 + mutex_lock(&event->child_mutex); 3789 + list_for_each_entry(child, &event->child_list, child_list) { 3790 + 3791 + /* 3792 + * Cannot change, child events are not migrated, see the 3793 + * comment with perf_event_ctx_lock_nested(). 3794 + */ 3795 + ctx = lockless_dereference(child->ctx); 3796 + /* 3797 + * Since child_mutex nests inside ctx::mutex, we must jump 3798 + * through hoops. We start by grabbing a reference on the ctx. 3799 + * 3800 + * Since the event cannot get freed while we hold the 3801 + * child_mutex, the context must also exist and have a !0 3802 + * reference count. 3803 + */ 3804 + get_ctx(ctx); 3805 + 3806 + /* 3807 + * Now that we have a ctx ref, we can drop child_mutex, and 3808 + * acquire ctx::mutex without fear of it going away. Then we 3809 + * can re-acquire child_mutex. 3810 + */ 3811 + mutex_unlock(&event->child_mutex); 3812 + mutex_lock(&ctx->mutex); 3813 + mutex_lock(&event->child_mutex); 3814 + 3815 + /* 3816 + * Now that we hold ctx::mutex and child_mutex, revalidate our 3817 + * state, if child is still the first entry, it didn't get freed 3818 + * and we can continue doing so. 3819 + */ 3820 + tmp = list_first_entry_or_null(&event->child_list, 3821 + struct perf_event, child_list); 3822 + if (tmp == child) { 3823 + perf_remove_from_context(child, DETACH_GROUP); 3824 + list_del(&child->child_list); 3825 + free_event(child); 3826 + /* 3827 + * This matches the refcount bump in inherit_event(); 3828 + * this can't be the last reference. 
3829 + */ 3830 + put_event(event); 3831 + } 3832 + 3833 + mutex_unlock(&event->child_mutex); 3834 + mutex_unlock(&ctx->mutex); 3835 + put_ctx(ctx); 3836 + goto again; 3837 + } 3838 + mutex_unlock(&event->child_mutex); 3839 + 3840 + /* Must be the last reference */ 3772 3841 put_event(event); 3773 3842 return 0; 3774 3843 } ··· 3841 3786 */ 3842 3787 static int perf_release(struct inode *inode, struct file *file) 3843 3788 { 3844 - put_event(file->private_data); 3789 + perf_event_release_kernel(file->private_data); 3845 3790 return 0; 3846 - } 3847 - 3848 - /* 3849 - * Remove all orphanes events from the context. 3850 - */ 3851 - static void orphans_remove_work(struct work_struct *work) 3852 - { 3853 - struct perf_event_context *ctx; 3854 - struct perf_event *event, *tmp; 3855 - 3856 - ctx = container_of(work, struct perf_event_context, 3857 - orphans_remove.work); 3858 - 3859 - mutex_lock(&ctx->mutex); 3860 - list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) { 3861 - struct perf_event *parent_event = event->parent; 3862 - 3863 - if (!is_orphaned_child(event)) 3864 - continue; 3865 - 3866 - perf_remove_from_context(event, true); 3867 - 3868 - mutex_lock(&parent_event->child_mutex); 3869 - list_del_init(&event->child_list); 3870 - mutex_unlock(&parent_event->child_mutex); 3871 - 3872 - free_event(event); 3873 - put_event(parent_event); 3874 - } 3875 - 3876 - raw_spin_lock_irq(&ctx->lock); 3877 - ctx->orphans_remove_sched = false; 3878 - raw_spin_unlock_irq(&ctx->lock); 3879 - mutex_unlock(&ctx->mutex); 3880 - 3881 - put_ctx(ctx); 3882 3791 } 3883 3792 3884 3793 u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) ··· 4073 4054 /* 4074 4055 * Holding the top-level event's child_mutex means that any 4075 4056 * descendant process that has inherited this event will block 4076 - * in sync_child_event if it goes to exit, thus satisfying the 4057 + * in perf_event_exit_event() if it goes to exit, thus satisfying the 4077 4058 * 
task existence requirements of perf_event_enable/disable. 4078 4059 */ 4079 4060 static void perf_event_for_each_child(struct perf_event *event, ··· 4105 4086 perf_event_for_each_child(sibling, func); 4106 4087 } 4107 4088 4108 - struct period_event { 4109 - struct perf_event *event; 4110 - u64 value; 4111 - }; 4112 - 4113 - static void ___perf_event_period(void *info) 4089 + static void __perf_event_period(struct perf_event *event, 4090 + struct perf_cpu_context *cpuctx, 4091 + struct perf_event_context *ctx, 4092 + void *info) 4114 4093 { 4115 - struct period_event *pe = info; 4116 - struct perf_event *event = pe->event; 4117 - u64 value = pe->value; 4118 - 4119 - if (event->attr.freq) { 4120 - event->attr.sample_freq = value; 4121 - } else { 4122 - event->attr.sample_period = value; 4123 - event->hw.sample_period = value; 4124 - } 4125 - 4126 - local64_set(&event->hw.period_left, 0); 4127 - } 4128 - 4129 - static int __perf_event_period(void *info) 4130 - { 4131 - struct period_event *pe = info; 4132 - struct perf_event *event = pe->event; 4133 - struct perf_event_context *ctx = event->ctx; 4134 - u64 value = pe->value; 4094 + u64 value = *((u64 *)info); 4135 4095 bool active; 4136 4096 4137 - raw_spin_lock(&ctx->lock); 4138 4097 if (event->attr.freq) { 4139 4098 event->attr.sample_freq = value; 4140 4099 } else { ··· 4132 4135 event->pmu->start(event, PERF_EF_RELOAD); 4133 4136 perf_pmu_enable(ctx->pmu); 4134 4137 } 4135 - raw_spin_unlock(&ctx->lock); 4136 - 4137 - return 0; 4138 4138 } 4139 4139 4140 4140 static int perf_event_period(struct perf_event *event, u64 __user *arg) 4141 4141 { 4142 - struct period_event pe = { .event = event, }; 4143 4142 u64 value; 4144 4143 4145 4144 if (!is_sampling_event(event)) ··· 4150 4157 if (event->attr.freq && value > sysctl_perf_event_sample_rate) 4151 4158 return -EINVAL; 4152 4159 4153 - pe.value = value; 4154 - 4155 - event_function_call(event, __perf_event_period, 4156 - ___perf_event_period, &pe); 4160 + 
event_function_call(event, __perf_event_period, &value); 4157 4161 4158 4162 return 0; 4159 4163 } ··· 4922 4932 4923 4933 if (event->pending_disable) { 4924 4934 event->pending_disable = 0; 4925 - __perf_event_disable(event); 4935 + perf_event_disable_local(event); 4926 4936 } 4927 4937 4928 4938 if (event->pending_wakeup) { ··· 7743 7753 7744 7754 static void account_event(struct perf_event *event) 7745 7755 { 7756 + bool inc = false; 7757 + 7746 7758 if (event->parent) 7747 7759 return; 7748 7760 7749 7761 if (event->attach_state & PERF_ATTACH_TASK) 7750 - static_key_slow_inc(&perf_sched_events.key); 7762 + inc = true; 7751 7763 if (event->attr.mmap || event->attr.mmap_data) 7752 7764 atomic_inc(&nr_mmap_events); 7753 7765 if (event->attr.comm) ··· 7762 7770 } 7763 7771 if (event->attr.context_switch) { 7764 7772 atomic_inc(&nr_switch_events); 7765 - static_key_slow_inc(&perf_sched_events.key); 7773 + inc = true; 7766 7774 } 7767 7775 if (has_branch_stack(event)) 7768 - static_key_slow_inc(&perf_sched_events.key); 7776 + inc = true; 7769 7777 if (is_cgroup_event(event)) 7778 + inc = true; 7779 + 7780 + if (inc) 7770 7781 static_key_slow_inc(&perf_sched_events.key); 7771 7782 7772 7783 account_event_cpu(event, event->cpu); ··· 8417 8422 * See perf_event_ctx_lock() for comments on the details 8418 8423 * of swizzling perf_event::ctx. 
8419 8424 */ 8420 - perf_remove_from_context(group_leader, false); 8425 + perf_remove_from_context(group_leader, 0); 8421 8426 8422 8427 list_for_each_entry(sibling, &group_leader->sibling_list, 8423 8428 group_entry) { 8424 - perf_remove_from_context(sibling, false); 8429 + perf_remove_from_context(sibling, 0); 8425 8430 put_ctx(gctx); 8426 8431 } 8427 8432 ··· 8474 8479 perf_event__header_size(event); 8475 8480 perf_event__id_header_size(event); 8476 8481 8482 + event->owner = current; 8483 + 8477 8484 perf_install_in_context(ctx, event, event->cpu); 8478 8485 perf_unpin_context(ctx); 8479 8486 ··· 8484 8487 mutex_unlock(&ctx->mutex); 8485 8488 8486 8489 put_online_cpus(); 8487 - 8488 - event->owner = current; 8489 8490 8490 8491 mutex_lock(&current->perf_event_mutex); 8491 8492 list_add_tail(&event->owner_entry, &current->perf_event_list); ··· 8551 8556 } 8552 8557 8553 8558 /* Mark owner so we could distinguish it from user events. */ 8554 - event->owner = EVENT_OWNER_KERNEL; 8559 + event->owner = TASK_TOMBSTONE; 8555 8560 8556 8561 account_event(event); 8557 8562 ··· 8601 8606 mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex); 8602 8607 list_for_each_entry_safe(event, tmp, &src_ctx->event_list, 8603 8608 event_entry) { 8604 - perf_remove_from_context(event, false); 8609 + perf_remove_from_context(event, 0); 8605 8610 unaccount_event_cpu(event, src_cpu); 8606 8611 put_ctx(src_ctx); 8607 8612 list_add(&event->migrate_entry, &events); ··· 8668 8673 &parent_event->child_total_time_enabled); 8669 8674 atomic64_add(child_event->total_time_running, 8670 8675 &parent_event->child_total_time_running); 8671 - 8672 - /* 8673 - * Remove this event from the parent's list 8674 - */ 8675 - WARN_ON_ONCE(parent_event->ctx->parent_ctx); 8676 - mutex_lock(&parent_event->child_mutex); 8677 - list_del_init(&child_event->child_list); 8678 - mutex_unlock(&parent_event->child_mutex); 8679 - 8680 - /* 8681 - * Make sure user/parent get notified, that we just 8682 - * lost one event. 
8683 - */ 8684 - perf_event_wakeup(parent_event); 8685 - 8686 - /* 8687 - * Release the parent event, if this was the last 8688 - * reference to it. 8689 - */ 8690 - put_event(parent_event); 8691 8676 } 8692 8677 8693 8678 static void 8694 - __perf_event_exit_task(struct perf_event *child_event, 8695 - struct perf_event_context *child_ctx, 8696 - struct task_struct *child) 8679 + perf_event_exit_event(struct perf_event *child_event, 8680 + struct perf_event_context *child_ctx, 8681 + struct task_struct *child) 8697 8682 { 8683 + struct perf_event *parent_event = child_event->parent; 8684 + 8698 8685 /* 8699 8686 * Do not destroy the 'original' grouping; because of the context 8700 8687 * switch optimization the original events could've ended up in a ··· 8689 8712 * Do destroy all inherited groups, we don't care about those 8690 8713 * and being thorough is better. 8691 8714 */ 8692 - perf_remove_from_context(child_event, !!child_event->parent); 8715 + raw_spin_lock_irq(&child_ctx->lock); 8716 + WARN_ON_ONCE(child_ctx->is_active); 8717 + 8718 + if (parent_event) 8719 + perf_group_detach(child_event); 8720 + list_del_event(child_event, child_ctx); 8721 + child_event->state = PERF_EVENT_STATE_EXIT; /* see perf_event_release_kernel() */ 8722 + raw_spin_unlock_irq(&child_ctx->lock); 8693 8723 8694 8724 /* 8695 - * It can happen that the parent exits first, and has events 8696 - * that are still around due to the child reference. These 8697 - * events need to be zapped. 8725 + * Parent events are governed by their filedesc, retain them. 8698 8726 */ 8699 - if (child_event->parent) { 8700 - sync_child_event(child_event, child); 8701 - free_event(child_event); 8702 - } else { 8703 - child_event->state = PERF_EVENT_STATE_EXIT; 8727 + if (!parent_event) { 8704 8728 perf_event_wakeup(child_event); 8729 + return; 8705 8730 } 8731 + /* 8732 + * Child events can be cleaned up. 
8733 + */ 8734 + 8735 + sync_child_event(child_event, child); 8736 + 8737 + /* 8738 + * Remove this event from the parent's list 8739 + */ 8740 + WARN_ON_ONCE(parent_event->ctx->parent_ctx); 8741 + mutex_lock(&parent_event->child_mutex); 8742 + list_del_init(&child_event->child_list); 8743 + mutex_unlock(&parent_event->child_mutex); 8744 + 8745 + /* 8746 + * Kick perf_poll() for is_event_hup(). 8747 + */ 8748 + perf_event_wakeup(parent_event); 8749 + free_event(child_event); 8750 + put_event(parent_event); 8706 8751 } 8707 8752 8708 8753 static void perf_event_exit_task_context(struct task_struct *child, int ctxn) 8709 8754 { 8710 - struct perf_event *child_event, *next; 8711 8755 struct perf_event_context *child_ctx, *clone_ctx = NULL; 8712 - unsigned long flags; 8756 + struct perf_event *child_event, *next; 8713 8757 8714 - if (likely(!child->perf_event_ctxp[ctxn])) 8758 + WARN_ON_ONCE(child != current); 8759 + 8760 + child_ctx = perf_pin_task_context(child, ctxn); 8761 + if (!child_ctx) 8715 8762 return; 8716 8763 8717 - local_irq_save(flags); 8718 8764 /* 8719 - * We can't reschedule here because interrupts are disabled, 8720 - * and either child is current or it is a task that can't be 8721 - * scheduled, so we are now safe from rescheduling changing 8722 - * our context. 8765 + * In order to reduce the amount of tricky in ctx tear-down, we hold 8766 + * ctx::mutex over the entire thing. This serializes against almost 8767 + * everything that wants to access the ctx. 8768 + * 8769 + * The exception is sys_perf_event_open() / 8770 + * perf_event_create_kernel_count() which does find_get_context() 8771 + * without ctx::mutex (it cannot because of the move_group double mutex 8772 + * lock thing). See the comments in perf_install_in_context(). 
8723 8773 */ 8724 - child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]); 8774 + mutex_lock(&child_ctx->mutex); 8725 8775 8726 8776 /* 8727 - * Take the context lock here so that if find_get_context is 8728 - * reading child->perf_event_ctxp, we wait until it has 8729 - * incremented the context's refcount before we do put_ctx below. 8777 + * In a single ctx::lock section, de-schedule the events and detach the 8778 + * context from the task such that we cannot ever get it scheduled back 8779 + * in. 8730 8780 */ 8731 - raw_spin_lock(&child_ctx->lock); 8732 - task_ctx_sched_out(child_ctx); 8733 - child->perf_event_ctxp[ctxn] = NULL; 8781 + raw_spin_lock_irq(&child_ctx->lock); 8782 + task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx); 8734 8783 8735 8784 /* 8736 - * If this context is a clone; unclone it so it can't get 8737 - * swapped to another process while we're removing all 8738 - * the events from it. 8785 + * Now that the context is inactive, destroy the task <-> ctx relation 8786 + * and mark the context dead. 8739 8787 */ 8788 + RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL); 8789 + put_ctx(child_ctx); /* cannot be last */ 8790 + WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE); 8791 + put_task_struct(current); /* cannot be last */ 8792 + 8740 8793 clone_ctx = unclone_ctx(child_ctx); 8741 - update_context_time(child_ctx); 8742 - raw_spin_unlock_irqrestore(&child_ctx->lock, flags); 8794 + raw_spin_unlock_irq(&child_ctx->lock); 8743 8795 8744 8796 if (clone_ctx) 8745 8797 put_ctx(clone_ctx); ··· 8780 8774 */ 8781 8775 perf_event_task(child, child_ctx, 0); 8782 8776 8783 - /* 8784 - * We can recurse on the same lock type through: 8785 - * 8786 - * __perf_event_exit_task() 8787 - * sync_child_event() 8788 - * put_event() 8789 - * mutex_lock(&ctx->mutex) 8790 - * 8791 - * But since its the parent context it won't be the same instance. 
8792 - */ 8793 - mutex_lock(&child_ctx->mutex); 8794 - 8795 8777 list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry) 8796 - __perf_event_exit_task(child_event, child_ctx, child); 8778 + perf_event_exit_event(child_event, child_ctx, child); 8797 8779 8798 8780 mutex_unlock(&child_ctx->mutex); 8799 8781 ··· 8806 8812 * the owner, closes a race against perf_release() where 8807 8813 * we need to serialize on the owner->perf_event_mutex. 8808 8814 */ 8809 - smp_wmb(); 8810 - event->owner = NULL; 8815 + smp_store_release(&event->owner, NULL); 8811 8816 } 8812 8817 mutex_unlock(&child->perf_event_mutex); 8813 8818 ··· 8889 8896 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]); 8890 8897 } 8891 8898 8892 - struct perf_event *perf_event_get(unsigned int fd) 8899 + struct file *perf_event_get(unsigned int fd) 8893 8900 { 8894 - int err; 8895 - struct fd f; 8896 - struct perf_event *event; 8901 + struct file *file; 8897 8902 8898 - err = perf_fget_light(fd, &f); 8899 - if (err) 8900 - return ERR_PTR(err); 8903 + file = fget_raw(fd); 8904 + if (!file) 8905 + return ERR_PTR(-EBADF); 8901 8906 8902 - event = f.file->private_data; 8903 - atomic_long_inc(&event->refcount); 8904 - fdput(f); 8907 + if (file->f_op != &perf_fops) { 8908 + fput(file); 8909 + return ERR_PTR(-EBADF); 8910 + } 8905 8911 8906 - return event; 8912 + return file; 8907 8913 } 8908 8914 8909 8915 const struct perf_event_attr *perf_event_attrs(struct perf_event *event) ··· 8945 8953 if (IS_ERR(child_event)) 8946 8954 return child_event; 8947 8955 8956 + /* 8957 + * is_orphaned_event() and list_add_tail(&parent_event->child_list) 8958 + * must be under the same lock in order to serialize against 8959 + * perf_event_release_kernel(), such that either we must observe 8960 + * is_orphaned_event() or they will observe us on the child_list. 
8961 + */ 8962 + mutex_lock(&parent_event->child_mutex); 8948 8963 if (is_orphaned_event(parent_event) || 8949 8964 !atomic_long_inc_not_zero(&parent_event->refcount)) { 8965 + mutex_unlock(&parent_event->child_mutex); 8950 8966 free_event(child_event); 8951 8967 return NULL; 8952 8968 } ··· 9002 9002 /* 9003 9003 * Link this into the parent event's child list 9004 9004 */ 9005 - WARN_ON_ONCE(parent_event->ctx->parent_ctx); 9006 - mutex_lock(&parent_event->child_mutex); 9007 9005 list_add_tail(&child_event->child_list, &parent_event->child_list); 9008 9006 mutex_unlock(&parent_event->child_mutex); 9009 9007 ··· 9219 9221 #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE 9220 9222 static void __perf_event_exit_context(void *__info) 9221 9223 { 9222 - struct remove_event re = { .detach_group = true }; 9223 9224 struct perf_event_context *ctx = __info; 9225 + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 9226 + struct perf_event *event; 9224 9227 9225 - rcu_read_lock(); 9226 - list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry) 9227 - __perf_remove_from_context(&re); 9228 - rcu_read_unlock(); 9228 + raw_spin_lock(&ctx->lock); 9229 + list_for_each_entry(event, &ctx->event_list, event_entry) 9230 + __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP); 9231 + raw_spin_unlock(&ctx->lock); 9229 9232 } 9230 9233 9231 9234 static void perf_event_exit_cpu_context(int cpu)
+1 -1
kernel/events/hw_breakpoint.c
··· 444 444 * current task. 445 445 */ 446 446 if (irqs_disabled() && bp->ctx && bp->ctx->task == current) 447 - __perf_event_disable(bp); 447 + perf_event_disable_local(bp); 448 448 else 449 449 perf_event_disable(bp); 450 450
+20 -20
kernel/events/ring_buffer.c
··· 459 459 __free_page(page); 460 460 } 461 461 462 + static void __rb_free_aux(struct ring_buffer *rb) 463 + { 464 + int pg; 465 + 466 + if (rb->aux_priv) { 467 + rb->free_aux(rb->aux_priv); 468 + rb->free_aux = NULL; 469 + rb->aux_priv = NULL; 470 + } 471 + 472 + if (rb->aux_nr_pages) { 473 + for (pg = 0; pg < rb->aux_nr_pages; pg++) 474 + rb_free_aux_page(rb, pg); 475 + 476 + kfree(rb->aux_pages); 477 + rb->aux_nr_pages = 0; 478 + } 479 + } 480 + 462 481 int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, 463 482 pgoff_t pgoff, int nr_pages, long watermark, int flags) 464 483 { ··· 566 547 if (!ret) 567 548 rb->aux_pgoff = pgoff; 568 549 else 569 - rb_free_aux(rb); 550 + __rb_free_aux(rb); 570 551 571 552 return ret; 572 - } 573 - 574 - static void __rb_free_aux(struct ring_buffer *rb) 575 - { 576 - int pg; 577 - 578 - if (rb->aux_priv) { 579 - rb->free_aux(rb->aux_priv); 580 - rb->free_aux = NULL; 581 - rb->aux_priv = NULL; 582 - } 583 - 584 - if (rb->aux_nr_pages) { 585 - for (pg = 0; pg < rb->aux_nr_pages; pg++) 586 - rb_free_aux_page(rb, pg); 587 - 588 - kfree(rb->aux_pages); 589 - rb->aux_nr_pages = 0; 590 - } 591 553 } 592 554 593 555 void rb_free_aux(struct ring_buffer *rb)
+10 -4
kernel/trace/bpf_trace.c
··· 191 191 struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; 192 192 struct bpf_array *array = container_of(map, struct bpf_array, map); 193 193 struct perf_event *event; 194 + struct file *file; 194 195 195 196 if (unlikely(index >= array->map.max_entries)) 196 197 return -E2BIG; 197 198 198 - event = (struct perf_event *)array->ptrs[index]; 199 - if (!event) 199 + file = (struct file *)array->ptrs[index]; 200 + if (unlikely(!file)) 200 201 return -ENOENT; 202 + 203 + event = file->private_data; 201 204 202 205 /* make sure event is local and doesn't have pmu::count */ 203 206 if (event->oncpu != smp_processor_id() || ··· 231 228 void *data = (void *) (long) r4; 232 229 struct perf_sample_data sample_data; 233 230 struct perf_event *event; 231 + struct file *file; 234 232 struct perf_raw_record raw = { 235 233 .size = size, 236 234 .data = data, ··· 240 236 if (unlikely(index >= array->map.max_entries)) 241 237 return -E2BIG; 242 238 243 - event = (struct perf_event *)array->ptrs[index]; 244 - if (unlikely(!event)) 239 + file = (struct file *)array->ptrs[index]; 240 + if (unlikely(!file)) 245 241 return -ENOENT; 242 + 243 + event = file->private_data; 246 244 247 245 if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE || 248 246 event->attr.config != PERF_COUNT_SW_BPF_OUTPUT))
+24 -1
tools/perf/Makefile.perf
··· 77 77 # Define NO_AUXTRACE if you do not want AUX area tracing support 78 78 # 79 79 # Define NO_LIBBPF if you do not want BPF support 80 + # 81 + # Define FEATURES_DUMP to provide features detection dump file 82 + # and bypass the feature detection 80 83 81 84 # As per kernel Makefile, avoid funny character set dependencies 82 85 unexport LC_ALL ··· 167 164 168 165 ifeq ($(config),1) 169 166 include config/Makefile 167 + endif 168 + 169 + # The FEATURE_DUMP_EXPORT holds location of the actual 170 + # FEATURE_DUMP file to be used to bypass feature detection 171 + # (for bpf or any other subproject) 172 + ifeq ($(FEATURES_DUMP),) 173 + FEATURE_DUMP_EXPORT := $(realpath $(OUTPUT)FEATURE-DUMP) 174 + else 175 + FEATURE_DUMP_EXPORT := $(FEATURES_DUMP) 170 176 endif 171 177 172 178 export prefix bindir sharedir sysconfdir DESTDIR ··· 448 436 $(Q)$(MAKE) -C $(LIB_DIR) O=$(OUTPUT) clean >/dev/null 449 437 450 438 $(LIBBPF): fixdep FORCE 451 - $(Q)$(MAKE) -C $(BPF_DIR) O=$(OUTPUT) $(OUTPUT)libbpf.a FEATURES_DUMP=$(realpath $(OUTPUT)FEATURE-DUMP) 439 + $(Q)$(MAKE) -C $(BPF_DIR) O=$(OUTPUT) $(OUTPUT)libbpf.a FEATURES_DUMP=$(FEATURE_DUMP_EXPORT) 452 440 453 441 $(LIBBPF)-clean: 454 442 $(call QUIET_CLEAN, libbpf) ··· 621 609 $(OUTPUT)tests/llvm-src-{base,kbuild,prologue}.c 622 610 $(QUIET_SUBDIR0)Documentation $(QUIET_SUBDIR1) clean 623 611 $(python-clean) 612 + 613 + # 614 + # To provide FEATURE-DUMP into $(FEATURE_DUMP_COPY) 615 + # file if defined, with no further action. 616 + feature-dump: 617 + ifdef FEATURE_DUMP_COPY 618 + @cp $(OUTPUT)FEATURE-DUMP $(FEATURE_DUMP_COPY) 619 + @echo "FEATURE-DUMP file copied into $(FEATURE_DUMP_COPY)" 620 + else 621 + @echo "FEATURE-DUMP file available in $(OUTPUT)FEATURE-DUMP" 622 + endif 624 623 625 624 # 626 625 # Trick: if ../../.git does not exist - we are building out of tree for example,
+1 -1
tools/perf/arch/x86/tests/intel-cqm.c
··· 17 17 if (pid) 18 18 return pid; 19 19 20 - while(1); 20 + while(1) 21 21 sleep(5); 22 22 return 0; 23 23 }
+4
tools/perf/config/Makefile
··· 181 181 182 182 EXTLIBS = -lpthread -lrt -lm -ldl 183 183 184 + ifeq ($(FEATURES_DUMP),) 184 185 include $(srctree)/tools/build/Makefile.feature 186 + else 187 + include $(FEATURES_DUMP) 188 + endif 185 189 186 190 ifeq ($(feature-stackprotector-all), 1) 187 191 CFLAGS += -fstack-protector-all
+40 -15
tools/perf/tests/make
··· 5 5 # no target specified, trigger the whole suite 6 6 all: 7 7 @echo "Testing Makefile"; $(MAKE) -sf tests/make MK=Makefile 8 - @echo "Testing Makefile.perf"; $(MAKE) -sf tests/make MK=Makefile.perf 8 + @echo "Testing Makefile.perf"; $(MAKE) -sf tests/make MK=Makefile.perf SET_PARALLEL=1 SET_O=1 9 9 else 10 10 # run only specific test over 'Makefile' 11 11 %: ··· 13 13 endif 14 14 else 15 15 PERF := . 16 + PERF_O := $(PERF) 17 + O_OPT := 18 + 19 + ifneq ($(O),) 20 + FULL_O := $(shell readlink -f $(O) || echo $(O)) 21 + PERF_O := $(FULL_O) 22 + ifeq ($(SET_O),1) 23 + O_OPT := 'O=$(FULL_O)' 24 + endif 25 + K_O_OPT := 'O=$(FULL_O)' 26 + endif 27 + 28 + PARALLEL_OPT= 29 + ifeq ($(SET_PARALLEL),1) 30 + cores := $(shell (getconf _NPROCESSORS_ONLN || egrep -c '^processor|^CPU[0-9]' /proc/cpuinfo) 2>/dev/null) 31 + ifeq ($(cores),0) 32 + cores := 1 33 + endif 34 + PARALLEL_OPT="-j$(cores)" 35 + endif 16 36 17 37 # As per kernel Makefile, avoid funny character set dependencies 18 38 unexport LC_ALL ··· 176 156 test_make_help_O := $(test_ok) 177 157 test_make_doc_O := $(test_ok) 178 158 179 - test_make_python_perf_so := test -f $(PERF)/python/perf.so 159 + test_make_python_perf_so := test -f $(PERF_O)/python/perf.so 180 160 181 - test_make_perf_o := test -f $(PERF)/perf.o 182 - test_make_util_map_o := test -f $(PERF)/util/map.o 183 - test_make_util_pmu_bison_o := test -f $(PERF)/util/pmu-bison.o 161 + test_make_perf_o := test -f $(PERF_O)/perf.o 162 + test_make_util_map_o := test -f $(PERF_O)/util/map.o 163 + test_make_util_pmu_bison_o := test -f $(PERF_O)/util/pmu-bison.o 184 164 185 165 define test_dest_files 186 166 for file in $(1); do \ ··· 247 227 test_make_util_map_o_O := test -f $$TMP_O/util/map.o 248 228 test_make_util_pmu_bison_o_O := test -f $$TMP_O/util/pmu-bison.o 249 229 250 - test_default = test -x $(PERF)/perf 230 + test_default = test -x $(PERF_O)/perf 251 231 test = $(if $(test_$1),$(test_$1),$(test_default)) 252 232 253 233 test_default_O = test -x 
$$TMP_O/perf ··· 267 247 268 248 MAKEFLAGS := --no-print-directory 269 249 270 - clean := @(cd $(PERF); make -s -f $(MK) clean >/dev/null) 250 + clean := @(cd $(PERF); make -s -f $(MK) $(O_OPT) clean >/dev/null) 271 251 272 252 $(run): 273 253 $(call clean) 274 254 @TMP_DEST=$$(mktemp -d); \ 275 - cmd="cd $(PERF) && make -f $(MK) DESTDIR=$$TMP_DEST $($@)"; \ 255 + cmd="cd $(PERF) && make -f $(MK) $(PARALLEL_OPT) $(O_OPT) DESTDIR=$$TMP_DEST $($@)"; \ 276 256 echo "- $@: $$cmd" && echo $$cmd > $@ && \ 277 257 ( eval $$cmd ) >> $@ 2>&1; \ 278 258 echo " test: $(call test,$@)" >> $@ 2>&1; \ ··· 283 263 $(call clean) 284 264 @TMP_O=$$(mktemp -d); \ 285 265 TMP_DEST=$$(mktemp -d); \ 286 - cmd="cd $(PERF) && make -f $(MK) O=$$TMP_O DESTDIR=$$TMP_DEST $($(patsubst %_O,%,$@))"; \ 266 + cmd="cd $(PERF) && make -f $(MK) $(PARALLEL_OPT) O=$$TMP_O DESTDIR=$$TMP_DEST $($(patsubst %_O,%,$@))"; \ 287 267 echo "- $@: $$cmd" && echo $$cmd > $@ && \ 288 268 ( eval $$cmd ) >> $@ 2>&1 && \ 289 269 echo " test: $(call test_O,$@)" >> $@ 2>&1; \ ··· 296 276 ( eval $$cmd ) >> $@ 2>&1 && \ 297 277 rm -f $@ 298 278 279 + KERNEL_O := ../.. 280 + ifneq ($(O),) 281 + KERNEL_O := $(O) 282 + endif 283 + 299 284 make_kernelsrc: 300 - @echo "- make -C <kernelsrc> tools/perf" 285 + @echo "- make -C <kernelsrc> $(PARALLEL_OPT) $(K_O_OPT) tools/perf" 301 286 $(call clean); \ 302 - (make -C ../.. tools/perf) > $@ 2>&1 && \ 303 - test -x perf && rm -f $@ || (cat $@ ; false) 287 + (make -C ../.. 
$(PARALLEL_OPT) $(K_O_OPT) tools/perf) > $@ 2>&1 && \ 288 + test -x $(KERNEL_O)/tools/perf/perf && rm -f $@ || (cat $@ ; false) 304 289 305 290 make_kernelsrc_tools: 306 - @echo "- make -C <kernelsrc>/tools perf" 291 + @echo "- make -C <kernelsrc>/tools $(PARALLEL_OPT) $(K_O_OPT) perf" 307 292 $(call clean); \ 308 - (make -C ../../tools perf) > $@ 2>&1 && \ 309 - test -x perf && rm -f $@ || (cat $@ ; false) 293 + (make -C ../../tools $(PARALLEL_OPT) $(K_O_OPT) perf) > $@ 2>&1 && \ 294 + test -x $(KERNEL_O)/tools/perf/perf && rm -f $@ || (cat $@ ; false) 310 295 311 296 all: $(run) $(run_O) tarpkg make_kernelsrc make_kernelsrc_tools 312 297 @echo OK
+2 -2
tools/perf/ui/browsers/annotate.c
··· 755 755 nd = browser->curr_hot; 756 756 break; 757 757 case K_UNTAB: 758 - if (nd != NULL) 758 + if (nd != NULL) { 759 759 nd = rb_next(nd); 760 760 if (nd == NULL) 761 761 nd = rb_first(&browser->entries); 762 - else 762 + } else 763 763 nd = browser->curr_hot; 764 764 break; 765 765 case K_F1:
+2
tools/perf/util/hist.c
··· 131 131 symlen = unresolved_col_width + 4 + 2; 132 132 hists__new_col_len(hists, HISTC_MEM_DADDR_SYMBOL, 133 133 symlen); 134 + hists__new_col_len(hists, HISTC_MEM_DCACHELINE, 135 + symlen); 134 136 } 135 137 136 138 if (h->mem_info->iaddr.sym) {
+1 -1
tools/perf/util/session.c
··· 1149 1149 1150 1150 machine = machines__find(machines, pid); 1151 1151 if (!machine) 1152 - machine = machines__find(machines, DEFAULT_GUEST_KERNEL_ID); 1152 + machine = machines__findnew(machines, DEFAULT_GUEST_KERNEL_ID); 1153 1153 return machine; 1154 1154 } 1155 1155
-1
tools/perf/util/stat.c
··· 310 310 int i, ret; 311 311 312 312 aggr->val = aggr->ena = aggr->run = 0; 313 - init_stats(ps->res_stats); 314 313 315 314 if (counter->per_pkg) 316 315 zero_per_pkg(counter);
+1 -1
tools/perf/util/symbol.c
··· 1466 1466 * Read the build id if possible. This is required for 1467 1467 * DSO_BINARY_TYPE__BUILDID_DEBUGINFO to work 1468 1468 */ 1469 - if (filename__read_build_id(dso->name, build_id, BUILD_ID_SIZE) > 0) 1469 + if (filename__read_build_id(dso->long_name, build_id, BUILD_ID_SIZE) > 0) 1470 1470 dso__set_build_id(dso, build_id); 1471 1471 1472 1472 /*