perf: Fix scaling vs. perf_install_in_context()

Completely rework perf_install_in_context() (again!) to ensure that
there is no ctx time hole between add_event_to_ctx() and any potential
ctx_sched_in().
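
In short: the old code added the event to the context under ctx->lock and
then sent a racy IPI (see the comment removed from perf_install_in_context()
below), so context time could keep running before the new event was actually
scheduled in, which is what skews the scaling. The new code IPIs the task's
CPU unconditionally and lets the cross-call itself add the event with
EVENT_TIME scheduled out, retrying when the task migrates under us. Roughly,
as a condensed sketch using the names from the diff (locking, the cpu-event
path and the TOMBSTONE checks elided):

        /* __perf_install_in_context(), runs via IPI on the target CPU */
        if (task_cpu(ctx->task) != smp_processor_id())
                return -ESRCH;          /* task moved; caller retries */

        if (ctx->task == current) {
                /* stop ctx time, add the event, then reschedule everything */
                ctx_sched_out(ctx, cpuctx, EVENT_TIME);
                add_event_to_ctx(event, ctx);
                ctx_resched(cpuctx, task_ctx);
        } else {
                /* task not running here; the next context switch finishes it */
                add_event_to_ctx(event, ctx);
        }

        /* perf_install_in_context(), caller side */
        do {
                if (!cpu_function_call(task_cpu(task), __perf_install_in_context, event))
                        return;         /* cross-call ran on the right CPU */
                /* re-read ctx->task under ctx->lock, then IPI again */
        } while (1);

Because !ctx->is_active no longer tells us anything, the caller must keep
IPIing until the cross-call actually observes the task on its own CPU.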

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dvyukov@google.com
Cc: eranian@google.com
Cc: oleg@redhat.com
Cc: panand@redhat.com
Cc: sasha.levin@oracle.com
Cc: vince@deater.net
Link: http://lkml.kernel.org/r/20160224174948.279399438@infradead.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

Authored by Peter Zijlstra, committed by Ingo Molnar (commit a096309b, parent bd2afa49)

Changed files: kernel/events/core.c (+76 -51)
···
                 return;
         }
 
-again:
         if (task == TASK_TOMBSTONE)
                 return;
 
+again:
         if (!task_function_call(task, event_function, &efs))
                 return;
 
···
          * a concurrent perf_event_context_sched_out().
          */
         task = ctx->task;
-        if (task != TASK_TOMBSTONE) {
-                if (ctx->is_active) {
-                        raw_spin_unlock_irq(&ctx->lock);
-                        goto again;
-                }
-                func(event, NULL, ctx, data);
+        if (task == TASK_TOMBSTONE) {
+                raw_spin_unlock_irq(&ctx->lock);
+                return;
         }
+        if (ctx->is_active) {
+                raw_spin_unlock_irq(&ctx->lock);
+                goto again;
+        }
+        func(event, NULL, ctx, data);
         raw_spin_unlock_irq(&ctx->lock);
 }
 
···
 /*
  * Cross CPU call to install and enable a performance event
  *
- * Must be called with ctx->mutex held
+ * Very similar to remote_function() + event_function() but cannot assume that
+ * things like ctx->is_active and cpuctx->task_ctx are set.
  */
 static int __perf_install_in_context(void *info)
 {
-        struct perf_event_context *ctx = info;
+        struct perf_event *event = info;
+        struct perf_event_context *ctx = event->ctx;
         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
         struct perf_event_context *task_ctx = cpuctx->task_ctx;
+        bool activate = true;
+        int ret = 0;
 
         raw_spin_lock(&cpuctx->ctx.lock);
         if (ctx->task) {
                 raw_spin_lock(&ctx->lock);
-                /*
-                 * If we hit the 'wrong' task, we've since scheduled and
-                 * everything should be sorted, nothing to do!
-                 */
                 task_ctx = ctx;
-                if (ctx->task != current)
+
+                /* If we're on the wrong CPU, try again */
+                if (task_cpu(ctx->task) != smp_processor_id()) {
+                        ret = -ESRCH;
                         goto unlock;
+                }
 
                 /*
-                 * If task_ctx is set, it had better be to us.
+                 * If we're on the right CPU, see if the task we target is
+                 * current, if not we don't have to activate the ctx, a future
+                 * context switch will do that for us.
                  */
-                WARN_ON_ONCE(cpuctx->task_ctx != ctx && cpuctx->task_ctx);
+                if (ctx->task != current)
+                        activate = false;
+                else
+                        WARN_ON_ONCE(cpuctx->task_ctx && cpuctx->task_ctx != ctx);
+
         } else if (task_ctx) {
                 raw_spin_lock(&task_ctx->lock);
         }
 
-        ctx_resched(cpuctx, task_ctx);
+        if (activate) {
+                ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+                add_event_to_ctx(event, ctx);
+                ctx_resched(cpuctx, task_ctx);
+        } else {
+                add_event_to_ctx(event, ctx);
+        }
+
 unlock:
         perf_ctx_unlock(cpuctx, task_ctx);
 
-        return 0;
+        return ret;
 }
 
 /*
- * Attach a performance event to a context
+ * Attach a performance event to a context.
+ *
+ * Very similar to event_function_call, see comment there.
  */
 static void
 perf_install_in_context(struct perf_event_context *ctx,
                         struct perf_event *event,
                         int cpu)
 {
-        struct task_struct *task = NULL;
+        struct task_struct *task = READ_ONCE(ctx->task);
 
         lockdep_assert_held(&ctx->mutex);
 
···
         if (event->cpu != -1)
                 event->cpu = cpu;
 
-        /*
-         * Installing events is tricky because we cannot rely on ctx->is_active
-         * to be set in case this is the nr_events 0 -> 1 transition.
-         *
-         * So what we do is we add the event to the list here, which will allow
-         * a future context switch to DTRT and then send a racy IPI. If the IPI
-         * fails to hit the right task, this means a context switch must have
-         * happened and that will have taken care of business.
-         */
-        raw_spin_lock_irq(&ctx->lock);
-        task = ctx->task;
-
-        /*
-         * If between ctx = find_get_context() and mutex_lock(&ctx->mutex) the
-         * ctx gets destroyed, we must not install an event into it.
-         *
-         * This is normally tested for after we acquire the mutex, so this is
-         * a sanity check.
-         */
-        if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
-                raw_spin_unlock_irq(&ctx->lock);
+        if (!task) {
+                cpu_function_call(cpu, __perf_install_in_context, event);
                 return;
         }
 
-        if (ctx->is_active) {
-                update_context_time(ctx);
-                update_cgrp_time_from_event(event);
+        /*
+         * Should not happen, we validate the ctx is still alive before calling.
+         */
+        if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
+                return;
+
+        /*
+         * Installing events is tricky because we cannot rely on ctx->is_active
+         * to be set in case this is the nr_events 0 -> 1 transition.
+         */
+again:
+        /*
+         * Cannot use task_function_call() because we need to run on the task's
+         * CPU regardless of whether its current or not.
+         */
+        if (!cpu_function_call(task_cpu(task), __perf_install_in_context, event))
+                return;
+
+        raw_spin_lock_irq(&ctx->lock);
+        task = ctx->task;
+        if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
+                /*
+                 * Cannot happen because we already checked above (which also
+                 * cannot happen), and we hold ctx->mutex, which serializes us
+                 * against perf_event_exit_task_context().
+                 */
+                raw_spin_unlock_irq(&ctx->lock);
+                return;
         }
-
-        add_event_to_ctx(event, ctx);
         raw_spin_unlock_irq(&ctx->lock);
-
-        if (task)
-                task_function_call(task, __perf_install_in_context, ctx);
-        else
-                cpu_function_call(cpu, __perf_install_in_context, ctx);
+        /*
+         * Since !ctx->is_active doesn't mean anything, we must IPI
+         * unconditionally.
+         */
+        goto again;
 }
 
 /*