Merge branch 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull perf fixes from Thomas Gleixner:
"A rather largish series of 12 patches addressing a maze of race
conditions in the perf core code from Peter Zijlstra"

* 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
perf: Robustify task_function_call()
perf: Fix scaling vs. perf_install_in_context()
perf: Fix scaling vs. perf_event_enable()
perf: Fix scaling vs. perf_event_enable_on_exec()
perf: Fix ctx time tracking by introducing EVENT_TIME
perf: Cure event->pending_disable race
perf: Fix race between event install and jump_labels
perf: Fix cloning
perf: Only update context time when active
perf: Allow perf_release() with !event->ctx
perf: Do not double free
perf: Close install vs. exit race
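
Most of these fixes share one shape: force an IPI onto the CPU the target task runs on, re-validate the task and context state there with interrupts disabled, and let the caller retry if the task migrated in the meantime. The sketch below is a condensed, non-verbatim restatement of the reworked task_function_call() from "perf: Robustify task_function_call()"; the struct and typedef names are illustrative, and the real code is in the kernel/events/core.c hunks further down.

    #include <linux/errno.h>
    #include <linux/sched.h>
    #include <linux/smp.h>

    typedef int (*remote_fn_t)(void *);     /* illustrative typedef */

    struct remote_call {                    /* illustrative name */
            struct task_struct *p;
            remote_fn_t        func;
            void               *info;
            int                ret;
    };

    /* Runs on the remote CPU with IRQs disabled (IPI context). */
    static void remote_function(void *data)
    {
            struct remote_call *tfc = data;
            struct task_struct *p = tfc->p;

            if (p) {
                    /* Wrong CPU: the task migrated, tell the caller to retry. */
                    tfc->ret = -EAGAIN;
                    if (task_cpu(p) != smp_processor_id())
                            return;

                    /* Right CPU, IRQs off: no context switch can race us now. */
                    tfc->ret = -ESRCH;
                    if (p != current)
                            return;
            }

            tfc->ret = tfc->func(tfc->info);
    }

    /* Keep re-issuing the cross-CPU call until it lands on the right task's CPU. */
    static int task_function_call(struct task_struct *p, remote_fn_t func, void *info)
    {
            struct remote_call data = { .p = p, .func = func, .info = info, .ret = -EAGAIN };
            int ret;

            do {
                    ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1);
                    if (!ret)
                            ret = data.ret;
            } while (ret == -EAGAIN);

            return ret;
    }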

Changed files (+245 -132):

include/linux/perf_event.h (+4 -3)

···
  * enum perf_event_active_state - the states of a event
  */
 enum perf_event_active_state {
+        PERF_EVENT_STATE_DEAD   = -4,
         PERF_EVENT_STATE_EXIT   = -3,
         PERF_EVENT_STATE_ERROR  = -2,
         PERF_EVENT_STATE_OFF    = -1,
···
         }
 }
 
-extern struct static_key_deferred perf_sched_events;
+extern struct static_key_false perf_sched_events;
 
 static __always_inline bool
 perf_sw_migrate_enabled(void)
···
 static inline void perf_event_task_sched_in(struct task_struct *prev,
                                             struct task_struct *task)
 {
-        if (static_key_false(&perf_sched_events.key))
+        if (static_branch_unlikely(&perf_sched_events))
                 __perf_event_task_sched_in(prev, task);
 
         if (perf_sw_migrate_enabled() && task->sched_migrated) {
···
 {
         perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0);
 
-        if (static_key_false(&perf_sched_events.key))
+        if (static_branch_unlikely(&perf_sched_events))
                 __perf_event_task_sched_out(prev, next);
 }
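
The header hunk above is the visible half of the jump-label fix: perf_sched_events is no longer a struct static_key_deferred but a plain static_key_false, declared extern here and tested with static_branch_unlikely() in the scheduler hooks. The rate limiting that the deferred key used to provide is reimplemented in kernel/events/core.c as a reference count plus delayed work. Below is a minimal, generic sketch of that enable/disable pairing with hypothetical my_feature_* names; the jump-label, atomic, workqueue and synchronize_sched() calls are the same ones the real patch uses.

    #include <linux/atomic.h>
    #include <linux/jump_label.h>
    #include <linux/mutex.h>
    #include <linux/rcupdate.h>
    #include <linux/workqueue.h>

    static void my_feature_delayed(struct work_struct *work);

    static DEFINE_STATIC_KEY_FALSE(my_feature_key);         /* hypothetical key */
    static DECLARE_DELAYED_WORK(my_feature_work, my_feature_delayed);
    static DEFINE_MUTEX(my_feature_mutex);
    static atomic_t my_feature_count;

    static void my_feature_account(void)
    {
            if (atomic_inc_not_zero(&my_feature_count))
                    return;                 /* fast path: already enabled */

            mutex_lock(&my_feature_mutex);
            if (!atomic_read(&my_feature_count)) {
                    static_branch_enable(&my_feature_key);
                    /*
                     * Make sure all CPUs observe the flipped key (and start
                     * calling the hooks) before the first user relies on it.
                     */
                    synchronize_sched();
            }
            /* Further increments can now bypass the mutex. */
            atomic_inc(&my_feature_count);
            mutex_unlock(&my_feature_mutex);
    }

    static void my_feature_unaccount(void)
    {
            /* Defer the final 1 -> 0 transition so the key isn't repatched too often. */
            if (!atomic_add_unless(&my_feature_count, -1, 1))
                    schedule_delayed_work(&my_feature_work, HZ);
    }

    static void my_feature_delayed(struct work_struct *work)
    {
            mutex_lock(&my_feature_mutex);
            if (atomic_dec_and_test(&my_feature_count))
                    static_branch_disable(&my_feature_key);
            mutex_unlock(&my_feature_mutex);
    }

Fast paths then test static_branch_unlikely(&my_feature_key), which the jump-label machinery patches into a no-op or an unconditional jump rather than a load and compare.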

kernel/events/core.c (+241 -129)

··· 64 struct task_struct *p = tfc->p; 65 66 if (p) { 67 - tfc->ret = -EAGAIN; 68 - if (task_cpu(p) != smp_processor_id() || !task_curr(p)) 69 return; 70 } 71 ··· 101 .p = p, 102 .func = func, 103 .info = info, 104 - .ret = -ESRCH, /* No such (running) process */ 105 }; 106 107 - if (task_curr(p)) 108 - smp_call_function_single(task_cpu(p), remote_function, &data, 1); 109 110 - return data.ret; 111 } 112 113 /** ··· 182 * rely on ctx->is_active and therefore cannot use event_function_call(). 183 * See perf_install_in_context(). 184 * 185 - * This is because we need a ctx->lock serialized variable (ctx->is_active) 186 - * to reliably determine if a particular task/context is scheduled in. The 187 - * task_curr() use in task_function_call() is racy in that a remote context 188 - * switch is not a single atomic operation. 189 - * 190 - * As is, the situation is 'safe' because we set rq->curr before we do the 191 - * actual context switch. This means that task_curr() will fail early, but 192 - * we'll continue spinning on ctx->is_active until we've passed 193 - * perf_event_task_sched_out(). 194 - * 195 - * Without this ctx->lock serialized variable we could have race where we find 196 - * the task (and hence the context) would not be active while in fact they are. 197 - * 198 * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set. 199 */ 200 ··· 212 */ 213 if (ctx->task) { 214 if (ctx->task != current) { 215 - ret = -EAGAIN; 216 goto unlock; 217 } 218 ··· 276 return; 277 } 278 279 - again: 280 if (task == TASK_TOMBSTONE) 281 return; 282 283 if (!task_function_call(task, event_function, &efs)) 284 return; 285 ··· 289 * a concurrent perf_event_context_sched_out(). 290 */ 291 task = ctx->task; 292 - if (task != TASK_TOMBSTONE) { 293 - if (ctx->is_active) { 294 - raw_spin_unlock_irq(&ctx->lock); 295 - goto again; 296 - } 297 - func(event, NULL, ctx, data); 298 } 299 raw_spin_unlock_irq(&ctx->lock); 300 } 301 ··· 316 enum event_type_t { 317 EVENT_FLEXIBLE = 0x1, 318 EVENT_PINNED = 0x2, 319 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, 320 }; 321 ··· 324 * perf_sched_events : >0 events exist 325 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu 326 */ 327 - struct static_key_deferred perf_sched_events __read_mostly; 328 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); 329 static DEFINE_PER_CPU(int, perf_sched_cb_usages); 330 ··· 1297 1298 /* 1299 * Update the total_time_enabled and total_time_running fields for a event. 1300 - * The caller of this function needs to hold the ctx->lock. 
1301 */ 1302 static void update_event_times(struct perf_event *event) 1303 { 1304 struct perf_event_context *ctx = event->ctx; 1305 u64 run_end; 1306 1307 if (event->state < PERF_EVENT_STATE_INACTIVE || 1308 event->group_leader->state < PERF_EVENT_STATE_INACTIVE) 1309 return; 1310 /* 1311 * in cgroup mode, time_enabled represents 1312 * the time the event was enabled AND active ··· 1656 1657 static bool is_orphaned_event(struct perf_event *event) 1658 { 1659 - return event->state == PERF_EVENT_STATE_EXIT; 1660 } 1661 1662 static inline int pmu_filter_match(struct perf_event *event) ··· 1701 1702 perf_pmu_disable(event->pmu); 1703 1704 event->state = PERF_EVENT_STATE_INACTIVE; 1705 if (event->pending_disable) { 1706 event->pending_disable = 0; 1707 event->state = PERF_EVENT_STATE_OFF; 1708 } 1709 - event->tstamp_stopped = tstamp; 1710 - event->pmu->del(event, 0); 1711 - event->oncpu = -1; 1712 1713 if (!is_software_event(event)) 1714 cpuctx->active_oncpu--; ··· 1743 } 1744 1745 #define DETACH_GROUP 0x01UL 1746 - #define DETACH_STATE 0x02UL 1747 1748 /* 1749 * Cross CPU call to remove a performance event ··· 1762 if (flags & DETACH_GROUP) 1763 perf_group_detach(event); 1764 list_del_event(event, ctx); 1765 - if (flags & DETACH_STATE) 1766 - event->state = PERF_EVENT_STATE_EXIT; 1767 1768 if (!ctx->nr_events && ctx->is_active) { 1769 ctx->is_active = 0; ··· 2071 event->tstamp_stopped = tstamp; 2072 } 2073 2074 - static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, 2075 - struct perf_event_context *ctx); 2076 static void 2077 ctx_sched_in(struct perf_event_context *ctx, 2078 struct perf_cpu_context *cpuctx, 2079 enum event_type_t event_type, 2080 struct task_struct *task); 2081 2082 static void perf_event_sched_in(struct perf_cpu_context *cpuctx, 2083 struct perf_event_context *ctx, ··· 2118 /* 2119 * Cross CPU call to install and enable a performance event 2120 * 2121 - * Must be called with ctx->mutex held 2122 */ 2123 static int __perf_install_in_context(void *info) 2124 { 2125 - struct perf_event_context *ctx = info; 2126 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 2127 struct perf_event_context *task_ctx = cpuctx->task_ctx; 2128 2129 raw_spin_lock(&cpuctx->ctx.lock); 2130 if (ctx->task) { 2131 raw_spin_lock(&ctx->lock); 2132 - /* 2133 - * If we hit the 'wrong' task, we've since scheduled and 2134 - * everything should be sorted, nothing to do! 2135 - */ 2136 task_ctx = ctx; 2137 - if (ctx->task != current) 2138 goto unlock; 2139 2140 /* 2141 - * If task_ctx is set, it had better be to us. 2142 */ 2143 - WARN_ON_ONCE(cpuctx->task_ctx != ctx && cpuctx->task_ctx); 2144 } else if (task_ctx) { 2145 raw_spin_lock(&task_ctx->lock); 2146 } 2147 2148 - ctx_resched(cpuctx, task_ctx); 2149 unlock: 2150 perf_ctx_unlock(cpuctx, task_ctx); 2151 2152 - return 0; 2153 } 2154 2155 /* 2156 - * Attach a performance event to a context 2157 */ 2158 static void 2159 perf_install_in_context(struct perf_event_context *ctx, 2160 struct perf_event *event, 2161 int cpu) 2162 { 2163 - struct task_struct *task = NULL; 2164 2165 lockdep_assert_held(&ctx->mutex); 2166 ··· 2187 if (event->cpu != -1) 2188 event->cpu = cpu; 2189 2190 /* 2191 * Installing events is tricky because we cannot rely on ctx->is_active 2192 * to be set in case this is the nr_events 0 -> 1 transition. 2193 - * 2194 - * So what we do is we add the event to the list here, which will allow 2195 - * a future context switch to DTRT and then send a racy IPI. 
If the IPI 2196 - * fails to hit the right task, this means a context switch must have 2197 - * happened and that will have taken care of business. 2198 */ 2199 raw_spin_lock_irq(&ctx->lock); 2200 task = ctx->task; 2201 - /* 2202 - * Worse, we cannot even rely on the ctx actually existing anymore. If 2203 - * between find_get_context() and perf_install_in_context() the task 2204 - * went through perf_event_exit_task() its dead and we should not be 2205 - * adding new events. 2206 - */ 2207 - if (task == TASK_TOMBSTONE) { 2208 raw_spin_unlock_irq(&ctx->lock); 2209 return; 2210 } 2211 - update_context_time(ctx); 2212 - /* 2213 - * Update cgrp time only if current cgrp matches event->cgrp. 2214 - * Must be done before calling add_event_to_ctx(). 2215 - */ 2216 - update_cgrp_time_from_event(event); 2217 - add_event_to_ctx(event, ctx); 2218 raw_spin_unlock_irq(&ctx->lock); 2219 - 2220 - if (task) 2221 - task_function_call(task, __perf_install_in_context, ctx); 2222 - else 2223 - cpu_function_call(cpu, __perf_install_in_context, ctx); 2224 } 2225 2226 /* ··· 2265 event->state <= PERF_EVENT_STATE_ERROR) 2266 return; 2267 2268 - update_context_time(ctx); 2269 __perf_event_mark_enabled(event); 2270 2271 if (!ctx->is_active) 2272 return; 2273 2274 if (!event_filter_match(event)) { 2275 - if (is_cgroup_event(event)) { 2276 - perf_cgroup_set_timestamp(current, ctx); // XXX ? 2277 perf_cgroup_defer_enabled(event); 2278 - } 2279 return; 2280 } 2281 ··· 2284 * If the event is in a group and isn't the group leader, 2285 * then don't put it on unless the group is on. 2286 */ 2287 - if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) 2288 return; 2289 2290 task_ctx = cpuctx->task_ctx; 2291 if (ctx->task) ··· 2393 } 2394 2395 ctx->is_active &= ~event_type; 2396 if (ctx->task) { 2397 WARN_ON_ONCE(cpuctx->task_ctx != ctx); 2398 if (!ctx->is_active) 2399 cpuctx->task_ctx = NULL; 2400 } 2401 2402 - update_context_time(ctx); 2403 - update_cgrp_time_from_cpuctx(cpuctx); 2404 - if (!ctx->nr_active) 2405 return; 2406 2407 perf_pmu_disable(ctx->pmu); 2408 - if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) { 2409 list_for_each_entry(event, &ctx->pinned_groups, group_entry) 2410 group_sched_out(event, cpuctx, ctx); 2411 } 2412 2413 - if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) { 2414 list_for_each_entry(event, &ctx->flexible_groups, group_entry) 2415 group_sched_out(event, cpuctx, ctx); 2416 } ··· 2699 perf_cgroup_sched_out(task, next); 2700 } 2701 2702 - static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, 2703 - struct perf_event_context *ctx) 2704 - { 2705 - if (!cpuctx->task_ctx) 2706 - return; 2707 - 2708 - if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) 2709 - return; 2710 - 2711 - ctx_sched_out(ctx, cpuctx, EVENT_ALL); 2712 - } 2713 - 2714 /* 2715 * Called with IRQs disabled 2716 */ ··· 2781 if (likely(!ctx->nr_events)) 2782 return; 2783 2784 - ctx->is_active |= event_type; 2785 if (ctx->task) { 2786 if (!is_active) 2787 cpuctx->task_ctx = ctx; ··· 2789 WARN_ON_ONCE(cpuctx->task_ctx != ctx); 2790 } 2791 2792 - now = perf_clock(); 2793 - ctx->timestamp = now; 2794 - perf_cgroup_set_timestamp(task, ctx); 2795 /* 2796 * First go through the list and put on any pinned groups 2797 * in order to give them the best chance of going on. 
2798 */ 2799 - if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) 2800 ctx_pinned_sched_in(ctx, cpuctx); 2801 2802 /* Then walk through the lower prio flexible groups */ 2803 - if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) 2804 ctx_flexible_sched_in(ctx, cpuctx); 2805 } 2806 ··· 3172 3173 cpuctx = __get_cpu_context(ctx); 3174 perf_ctx_lock(cpuctx, ctx); 3175 list_for_each_entry(event, &ctx->event_list, event_entry) 3176 enabled |= event_enable_on_exec(event, ctx); 3177 ··· 3590 if (has_branch_stack(event)) 3591 dec = true; 3592 3593 - if (dec) 3594 - static_key_slow_dec_deferred(&perf_sched_events); 3595 3596 unaccount_event_cpu(event, event->cpu); 3597 } 3598 3599 /* ··· 3815 */ 3816 int perf_event_release_kernel(struct perf_event *event) 3817 { 3818 - struct perf_event_context *ctx; 3819 struct perf_event *child, *tmp; 3820 3821 if (!is_kernel_event(event)) 3822 perf_remove_from_owner(event); 3823 3824 ctx = perf_event_ctx_lock(event); 3825 WARN_ON_ONCE(ctx->parent_ctx); 3826 - perf_remove_from_context(event, DETACH_GROUP | DETACH_STATE); 3827 - perf_event_ctx_unlock(event, ctx); 3828 3829 /* 3830 - * At this point we must have event->state == PERF_EVENT_STATE_EXIT, 3831 - * either from the above perf_remove_from_context() or through 3832 - * perf_event_exit_event(). 3833 * 3834 - * Therefore, anybody acquiring event->child_mutex after the below 3835 - * loop _must_ also see this, most importantly inherit_event() which 3836 - * will avoid placing more children on the list. 3837 * 3838 * Thus this guarantees that we will in fact observe and kill _ALL_ 3839 * child events. 3840 */ 3841 - WARN_ON_ONCE(event->state != PERF_EVENT_STATE_EXIT); 3842 3843 again: 3844 mutex_lock(&event->child_mutex); ··· 3905 } 3906 mutex_unlock(&event->child_mutex); 3907 3908 - /* Must be the last reference */ 3909 - put_event(event); 3910 return 0; 3911 } 3912 EXPORT_SYMBOL_GPL(perf_event_release_kernel); ··· 4063 { 4064 bool no_children; 4065 4066 - if (event->state != PERF_EVENT_STATE_EXIT) 4067 return false; 4068 4069 mutex_lock(&event->child_mutex); ··· 7844 if (is_cgroup_event(event)) 7845 inc = true; 7846 7847 - if (inc) 7848 - static_key_slow_inc(&perf_sched_events.key); 7849 7850 account_event_cpu(event, event->cpu); 7851 } ··· 8484 if (move_group) { 8485 gctx = group_leader->ctx; 8486 mutex_lock_double(&gctx->mutex, &ctx->mutex); 8487 } else { 8488 mutex_lock(&ctx->mutex); 8489 } 8490 8491 if (!perf_event_validate_size(event)) { ··· 8613 perf_unpin_context(ctx); 8614 put_ctx(ctx); 8615 err_alloc: 8616 - free_event(event); 8617 err_cpus: 8618 put_online_cpus(); 8619 err_task: ··· 8672 8673 WARN_ON_ONCE(ctx->parent_ctx); 8674 mutex_lock(&ctx->mutex); 8675 if (!exclusive_event_installable(event, ctx)) { 8676 - mutex_unlock(&ctx->mutex); 8677 - perf_unpin_context(ctx); 8678 - put_ctx(ctx); 8679 err = -EBUSY; 8680 - goto err_free; 8681 } 8682 8683 perf_install_in_context(ctx, event, cpu); ··· 8688 8689 return event; 8690 8691 err_free: 8692 free_event(event); 8693 err: ··· 8810 if (parent_event) 8811 perf_group_detach(child_event); 8812 list_del_event(child_event, child_ctx); 8813 - child_event->state = PERF_EVENT_STATE_EXIT; /* see perf_event_release_kernel() */ 8814 raw_spin_unlock_irq(&child_ctx->lock); 8815 8816 /* ··· 9427 9428 ret = init_hw_breakpoint(); 9429 WARN(ret, "hw_breakpoint initialization failed with: %d", ret); 9430 - 9431 - /* do not patch jump label more than once per second */ 9432 - jump_label_rate_limit(&perf_sched_events, HZ); 9433 9434 /* 9435 * 
Build time assertion that we keep the data_head at the intended
··· 64 struct task_struct *p = tfc->p; 65 66 if (p) { 67 + /* -EAGAIN */ 68 + if (task_cpu(p) != smp_processor_id()) 69 + return; 70 + 71 + /* 72 + * Now that we're on right CPU with IRQs disabled, we can test 73 + * if we hit the right task without races. 74 + */ 75 + 76 + tfc->ret = -ESRCH; /* No such (running) process */ 77 + if (p != current) 78 return; 79 } 80 ··· 92 .p = p, 93 .func = func, 94 .info = info, 95 + .ret = -EAGAIN, 96 }; 97 + int ret; 98 99 + do { 100 + ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1); 101 + if (!ret) 102 + ret = data.ret; 103 + } while (ret == -EAGAIN); 104 105 + return ret; 106 } 107 108 /** ··· 169 * rely on ctx->is_active and therefore cannot use event_function_call(). 170 * See perf_install_in_context(). 171 * 172 * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set. 173 */ 174 ··· 212 */ 213 if (ctx->task) { 214 if (ctx->task != current) { 215 + ret = -ESRCH; 216 goto unlock; 217 } 218 ··· 276 return; 277 } 278 279 if (task == TASK_TOMBSTONE) 280 return; 281 282 + again: 283 if (!task_function_call(task, event_function, &efs)) 284 return; 285 ··· 289 * a concurrent perf_event_context_sched_out(). 290 */ 291 task = ctx->task; 292 + if (task == TASK_TOMBSTONE) { 293 + raw_spin_unlock_irq(&ctx->lock); 294 + return; 295 } 296 + if (ctx->is_active) { 297 + raw_spin_unlock_irq(&ctx->lock); 298 + goto again; 299 + } 300 + func(event, NULL, ctx, data); 301 raw_spin_unlock_irq(&ctx->lock); 302 } 303 ··· 314 enum event_type_t { 315 EVENT_FLEXIBLE = 0x1, 316 EVENT_PINNED = 0x2, 317 + EVENT_TIME = 0x4, 318 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, 319 }; 320 ··· 321 * perf_sched_events : >0 events exist 322 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu 323 */ 324 + 325 + static void perf_sched_delayed(struct work_struct *work); 326 + DEFINE_STATIC_KEY_FALSE(perf_sched_events); 327 + static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed); 328 + static DEFINE_MUTEX(perf_sched_mutex); 329 + static atomic_t perf_sched_count; 330 + 331 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); 332 static DEFINE_PER_CPU(int, perf_sched_cb_usages); 333 ··· 1288 1289 /* 1290 * Update the total_time_enabled and total_time_running fields for a event. 
1291 */ 1292 static void update_event_times(struct perf_event *event) 1293 { 1294 struct perf_event_context *ctx = event->ctx; 1295 u64 run_end; 1296 1297 + lockdep_assert_held(&ctx->lock); 1298 + 1299 if (event->state < PERF_EVENT_STATE_INACTIVE || 1300 event->group_leader->state < PERF_EVENT_STATE_INACTIVE) 1301 return; 1302 + 1303 /* 1304 * in cgroup mode, time_enabled represents 1305 * the time the event was enabled AND active ··· 1645 1646 static bool is_orphaned_event(struct perf_event *event) 1647 { 1648 + return event->state == PERF_EVENT_STATE_DEAD; 1649 } 1650 1651 static inline int pmu_filter_match(struct perf_event *event) ··· 1690 1691 perf_pmu_disable(event->pmu); 1692 1693 + event->tstamp_stopped = tstamp; 1694 + event->pmu->del(event, 0); 1695 + event->oncpu = -1; 1696 event->state = PERF_EVENT_STATE_INACTIVE; 1697 if (event->pending_disable) { 1698 event->pending_disable = 0; 1699 event->state = PERF_EVENT_STATE_OFF; 1700 } 1701 1702 if (!is_software_event(event)) 1703 cpuctx->active_oncpu--; ··· 1732 } 1733 1734 #define DETACH_GROUP 0x01UL 1735 1736 /* 1737 * Cross CPU call to remove a performance event ··· 1752 if (flags & DETACH_GROUP) 1753 perf_group_detach(event); 1754 list_del_event(event, ctx); 1755 1756 if (!ctx->nr_events && ctx->is_active) { 1757 ctx->is_active = 0; ··· 2063 event->tstamp_stopped = tstamp; 2064 } 2065 2066 + static void ctx_sched_out(struct perf_event_context *ctx, 2067 + struct perf_cpu_context *cpuctx, 2068 + enum event_type_t event_type); 2069 static void 2070 ctx_sched_in(struct perf_event_context *ctx, 2071 struct perf_cpu_context *cpuctx, 2072 enum event_type_t event_type, 2073 struct task_struct *task); 2074 + 2075 + static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, 2076 + struct perf_event_context *ctx) 2077 + { 2078 + if (!cpuctx->task_ctx) 2079 + return; 2080 + 2081 + if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) 2082 + return; 2083 + 2084 + ctx_sched_out(ctx, cpuctx, EVENT_ALL); 2085 + } 2086 2087 static void perf_event_sched_in(struct perf_cpu_context *cpuctx, 2088 struct perf_event_context *ctx, ··· 2097 /* 2098 * Cross CPU call to install and enable a performance event 2099 * 2100 + * Very similar to remote_function() + event_function() but cannot assume that 2101 + * things like ctx->is_active and cpuctx->task_ctx are set. 2102 */ 2103 static int __perf_install_in_context(void *info) 2104 { 2105 + struct perf_event *event = info; 2106 + struct perf_event_context *ctx = event->ctx; 2107 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 2108 struct perf_event_context *task_ctx = cpuctx->task_ctx; 2109 + bool activate = true; 2110 + int ret = 0; 2111 2112 raw_spin_lock(&cpuctx->ctx.lock); 2113 if (ctx->task) { 2114 raw_spin_lock(&ctx->lock); 2115 task_ctx = ctx; 2116 + 2117 + /* If we're on the wrong CPU, try again */ 2118 + if (task_cpu(ctx->task) != smp_processor_id()) { 2119 + ret = -ESRCH; 2120 goto unlock; 2121 + } 2122 2123 /* 2124 + * If we're on the right CPU, see if the task we target is 2125 + * current, if not we don't have to activate the ctx, a future 2126 + * context switch will do that for us. 
2127 */ 2128 + if (ctx->task != current) 2129 + activate = false; 2130 + else 2131 + WARN_ON_ONCE(cpuctx->task_ctx && cpuctx->task_ctx != ctx); 2132 + 2133 } else if (task_ctx) { 2134 raw_spin_lock(&task_ctx->lock); 2135 } 2136 2137 + if (activate) { 2138 + ctx_sched_out(ctx, cpuctx, EVENT_TIME); 2139 + add_event_to_ctx(event, ctx); 2140 + ctx_resched(cpuctx, task_ctx); 2141 + } else { 2142 + add_event_to_ctx(event, ctx); 2143 + } 2144 + 2145 unlock: 2146 perf_ctx_unlock(cpuctx, task_ctx); 2147 2148 + return ret; 2149 } 2150 2151 /* 2152 + * Attach a performance event to a context. 2153 + * 2154 + * Very similar to event_function_call, see comment there. 2155 */ 2156 static void 2157 perf_install_in_context(struct perf_event_context *ctx, 2158 struct perf_event *event, 2159 int cpu) 2160 { 2161 + struct task_struct *task = READ_ONCE(ctx->task); 2162 2163 lockdep_assert_held(&ctx->mutex); 2164 ··· 2147 if (event->cpu != -1) 2148 event->cpu = cpu; 2149 2150 + if (!task) { 2151 + cpu_function_call(cpu, __perf_install_in_context, event); 2152 + return; 2153 + } 2154 + 2155 + /* 2156 + * Should not happen, we validate the ctx is still alive before calling. 2157 + */ 2158 + if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) 2159 + return; 2160 + 2161 /* 2162 * Installing events is tricky because we cannot rely on ctx->is_active 2163 * to be set in case this is the nr_events 0 -> 1 transition. 2164 */ 2165 + again: 2166 + /* 2167 + * Cannot use task_function_call() because we need to run on the task's 2168 + * CPU regardless of whether its current or not. 2169 + */ 2170 + if (!cpu_function_call(task_cpu(task), __perf_install_in_context, event)) 2171 + return; 2172 + 2173 raw_spin_lock_irq(&ctx->lock); 2174 task = ctx->task; 2175 + if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) { 2176 + /* 2177 + * Cannot happen because we already checked above (which also 2178 + * cannot happen), and we hold ctx->mutex, which serializes us 2179 + * against perf_event_exit_task_context(). 2180 + */ 2181 raw_spin_unlock_irq(&ctx->lock); 2182 return; 2183 } 2184 raw_spin_unlock_irq(&ctx->lock); 2185 + /* 2186 + * Since !ctx->is_active doesn't mean anything, we must IPI 2187 + * unconditionally. 2188 + */ 2189 + goto again; 2190 } 2191 2192 /* ··· 2219 event->state <= PERF_EVENT_STATE_ERROR) 2220 return; 2221 2222 + if (ctx->is_active) 2223 + ctx_sched_out(ctx, cpuctx, EVENT_TIME); 2224 + 2225 __perf_event_mark_enabled(event); 2226 2227 if (!ctx->is_active) 2228 return; 2229 2230 if (!event_filter_match(event)) { 2231 + if (is_cgroup_event(event)) 2232 perf_cgroup_defer_enabled(event); 2233 + ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); 2234 return; 2235 } 2236 ··· 2237 * If the event is in a group and isn't the group leader, 2238 * then don't put it on unless the group is on. 
2239 */ 2240 + if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) { 2241 + ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); 2242 return; 2243 + } 2244 2245 task_ctx = cpuctx->task_ctx; 2246 if (ctx->task) ··· 2344 } 2345 2346 ctx->is_active &= ~event_type; 2347 + if (!(ctx->is_active & EVENT_ALL)) 2348 + ctx->is_active = 0; 2349 + 2350 if (ctx->task) { 2351 WARN_ON_ONCE(cpuctx->task_ctx != ctx); 2352 if (!ctx->is_active) 2353 cpuctx->task_ctx = NULL; 2354 } 2355 2356 + is_active ^= ctx->is_active; /* changed bits */ 2357 + 2358 + if (is_active & EVENT_TIME) { 2359 + /* update (and stop) ctx time */ 2360 + update_context_time(ctx); 2361 + update_cgrp_time_from_cpuctx(cpuctx); 2362 + } 2363 + 2364 + if (!ctx->nr_active || !(is_active & EVENT_ALL)) 2365 return; 2366 2367 perf_pmu_disable(ctx->pmu); 2368 + if (is_active & EVENT_PINNED) { 2369 list_for_each_entry(event, &ctx->pinned_groups, group_entry) 2370 group_sched_out(event, cpuctx, ctx); 2371 } 2372 2373 + if (is_active & EVENT_FLEXIBLE) { 2374 list_for_each_entry(event, &ctx->flexible_groups, group_entry) 2375 group_sched_out(event, cpuctx, ctx); 2376 } ··· 2641 perf_cgroup_sched_out(task, next); 2642 } 2643 2644 /* 2645 * Called with IRQs disabled 2646 */ ··· 2735 if (likely(!ctx->nr_events)) 2736 return; 2737 2738 + ctx->is_active |= (event_type | EVENT_TIME); 2739 if (ctx->task) { 2740 if (!is_active) 2741 cpuctx->task_ctx = ctx; ··· 2743 WARN_ON_ONCE(cpuctx->task_ctx != ctx); 2744 } 2745 2746 + is_active ^= ctx->is_active; /* changed bits */ 2747 + 2748 + if (is_active & EVENT_TIME) { 2749 + /* start ctx time */ 2750 + now = perf_clock(); 2751 + ctx->timestamp = now; 2752 + perf_cgroup_set_timestamp(task, ctx); 2753 + } 2754 + 2755 /* 2756 * First go through the list and put on any pinned groups 2757 * in order to give them the best chance of going on. 2758 */ 2759 + if (is_active & EVENT_PINNED) 2760 ctx_pinned_sched_in(ctx, cpuctx); 2761 2762 /* Then walk through the lower prio flexible groups */ 2763 + if (is_active & EVENT_FLEXIBLE) 2764 ctx_flexible_sched_in(ctx, cpuctx); 2765 } 2766 ··· 3120 3121 cpuctx = __get_cpu_context(ctx); 3122 perf_ctx_lock(cpuctx, ctx); 3123 + ctx_sched_out(ctx, cpuctx, EVENT_TIME); 3124 list_for_each_entry(event, &ctx->event_list, event_entry) 3125 enabled |= event_enable_on_exec(event, ctx); 3126 ··· 3537 if (has_branch_stack(event)) 3538 dec = true; 3539 3540 + if (dec) { 3541 + if (!atomic_add_unless(&perf_sched_count, -1, 1)) 3542 + schedule_delayed_work(&perf_sched_work, HZ); 3543 + } 3544 3545 unaccount_event_cpu(event, event->cpu); 3546 + } 3547 + 3548 + static void perf_sched_delayed(struct work_struct *work) 3549 + { 3550 + mutex_lock(&perf_sched_mutex); 3551 + if (atomic_dec_and_test(&perf_sched_count)) 3552 + static_branch_disable(&perf_sched_events); 3553 + mutex_unlock(&perf_sched_mutex); 3554 } 3555 3556 /* ··· 3752 */ 3753 int perf_event_release_kernel(struct perf_event *event) 3754 { 3755 + struct perf_event_context *ctx = event->ctx; 3756 struct perf_event *child, *tmp; 3757 + 3758 + /* 3759 + * If we got here through err_file: fput(event_file); we will not have 3760 + * attached to a context yet. 
3761 + */ 3762 + if (!ctx) { 3763 + WARN_ON_ONCE(event->attach_state & 3764 + (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP)); 3765 + goto no_ctx; 3766 + } 3767 3768 if (!is_kernel_event(event)) 3769 perf_remove_from_owner(event); 3770 3771 ctx = perf_event_ctx_lock(event); 3772 WARN_ON_ONCE(ctx->parent_ctx); 3773 + perf_remove_from_context(event, DETACH_GROUP); 3774 3775 + raw_spin_lock_irq(&ctx->lock); 3776 /* 3777 + * Mark this even as STATE_DEAD, there is no external reference to it 3778 + * anymore. 3779 * 3780 + * Anybody acquiring event->child_mutex after the below loop _must_ 3781 + * also see this, most importantly inherit_event() which will avoid 3782 + * placing more children on the list. 3783 * 3784 * Thus this guarantees that we will in fact observe and kill _ALL_ 3785 * child events. 3786 */ 3787 + event->state = PERF_EVENT_STATE_DEAD; 3788 + raw_spin_unlock_irq(&ctx->lock); 3789 + 3790 + perf_event_ctx_unlock(event, ctx); 3791 3792 again: 3793 mutex_lock(&event->child_mutex); ··· 3830 } 3831 mutex_unlock(&event->child_mutex); 3832 3833 + no_ctx: 3834 + put_event(event); /* Must be the 'last' reference */ 3835 return 0; 3836 } 3837 EXPORT_SYMBOL_GPL(perf_event_release_kernel); ··· 3988 { 3989 bool no_children; 3990 3991 + if (event->state > PERF_EVENT_STATE_EXIT) 3992 return false; 3993 3994 mutex_lock(&event->child_mutex); ··· 7769 if (is_cgroup_event(event)) 7770 inc = true; 7771 7772 + if (inc) { 7773 + if (atomic_inc_not_zero(&perf_sched_count)) 7774 + goto enabled; 7775 + 7776 + mutex_lock(&perf_sched_mutex); 7777 + if (!atomic_read(&perf_sched_count)) { 7778 + static_branch_enable(&perf_sched_events); 7779 + /* 7780 + * Guarantee that all CPUs observe they key change and 7781 + * call the perf scheduling hooks before proceeding to 7782 + * install events that need them. 7783 + */ 7784 + synchronize_sched(); 7785 + } 7786 + /* 7787 + * Now that we have waited for the sync_sched(), allow further 7788 + * increments to by-pass the mutex. 7789 + */ 7790 + atomic_inc(&perf_sched_count); 7791 + mutex_unlock(&perf_sched_mutex); 7792 + } 7793 + enabled: 7794 7795 account_event_cpu(event, event->cpu); 7796 } ··· 8389 if (move_group) { 8390 gctx = group_leader->ctx; 8391 mutex_lock_double(&gctx->mutex, &ctx->mutex); 8392 + if (gctx->task == TASK_TOMBSTONE) { 8393 + err = -ESRCH; 8394 + goto err_locked; 8395 + } 8396 } else { 8397 mutex_lock(&ctx->mutex); 8398 + } 8399 + 8400 + if (ctx->task == TASK_TOMBSTONE) { 8401 + err = -ESRCH; 8402 + goto err_locked; 8403 } 8404 8405 if (!perf_event_validate_size(event)) { ··· 8509 perf_unpin_context(ctx); 8510 put_ctx(ctx); 8511 err_alloc: 8512 + /* 8513 + * If event_file is set, the fput() above will have called ->release() 8514 + * and that will take care of freeing the event. 
8515 + */ 8516 + if (!event_file) 8517 + free_event(event); 8518 err_cpus: 8519 put_online_cpus(); 8520 err_task: ··· 8563 8564 WARN_ON_ONCE(ctx->parent_ctx); 8565 mutex_lock(&ctx->mutex); 8566 + if (ctx->task == TASK_TOMBSTONE) { 8567 + err = -ESRCH; 8568 + goto err_unlock; 8569 + } 8570 + 8571 if (!exclusive_event_installable(event, ctx)) { 8572 err = -EBUSY; 8573 + goto err_unlock; 8574 } 8575 8576 perf_install_in_context(ctx, event, cpu); ··· 8577 8578 return event; 8579 8580 + err_unlock: 8581 + mutex_unlock(&ctx->mutex); 8582 + perf_unpin_context(ctx); 8583 + put_ctx(ctx); 8584 err_free: 8585 free_event(event); 8586 err: ··· 8695 if (parent_event) 8696 perf_group_detach(child_event); 8697 list_del_event(child_event, child_ctx); 8698 + child_event->state = PERF_EVENT_STATE_EXIT; /* is_event_hup() */ 8699 raw_spin_unlock_irq(&child_ctx->lock); 8700 8701 /* ··· 9312 9313 ret = init_hw_breakpoint(); 9314 WARN(ret, "hw_breakpoint initialization failed with: %d", ret); 9315 9316 /* 9317 * Build time assertion that we keep the data_head at the intended