Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

perf: Fix race between event install and jump_labels

perf_install_in_context() relies upon the context switch hooks to have
scheduled in events when the IPI misses its target -- after all, if
the task has moved from the CPU (or wasn't running at all), it will
have to context switch to run elsewhere.

This, however, does not appear to be happening.

It is possible for the IPI to not happen (the task wasn't running), only
for the task to later be observed running with an inactive context.

The only possible explanation is that the context switch hooks are not
called. Therefore, put in a sync_sched() after toggling the jump_label
to guarantee all CPUs will have the hooks enabled before we install an
event.

A simple if (0->1) sync_sched() will not in fact work: a later
increment can race with the 0->1 transition, observe a nonzero count,
and complete before the first caller's sync_sched() has finished, at
which point the hooks are not yet guaranteed on all CPUs. Therefore we
must jump through some hoops.
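
[Editor's note] The shape of those hoops is easier to see outside the
kernel. Below is a minimal userspace C11 sketch of the same pattern,
not the kernel code: enable_key() stands in for static_branch_enable(),
synchronize() for synchronize_sched(), and sched_count / sched_mutex
for perf_sched_count / perf_sched_mutex. All names here are
illustrative stand-ins, not kernel symbols.

#include <pthread.h>
#include <stdatomic.h>

static atomic_int sched_count;                  /* ~perf_sched_count */
static pthread_mutex_t sched_mutex = PTHREAD_MUTEX_INITIALIZER;

static void enable_key(void)  { /* ~static_branch_enable(): flip the branch */ }
static void synchronize(void) { /* ~synchronize_sched(): wait for all CPUs  */ }

/* The naive "if (0->1) sync_sched()" version. A second caller can
 * observe count != 0 and return at once, installing its event before
 * the first caller's synchronize() has completed. */
static void account_broken(void)
{
        if (atomic_fetch_add(&sched_count, 1) == 0) {
                enable_key();
                synchronize();
        }
}

/* The fixed version, mirroring the patch: the 0->1 transition happens
 * only under the mutex, and the count is published only after the
 * synchronize(), so whoever sees a nonzero count on the fast path is
 * guaranteed the hooks are already enabled everywhere. */
static void account_fixed(void)
{
        int cur = atomic_load(&sched_count);

        while (cur != 0) {                      /* ~atomic_inc_not_zero() */
                if (atomic_compare_exchange_weak(&sched_count, &cur, cur + 1))
                        return;                 /* fast path, no mutex */
        }

        pthread_mutex_lock(&sched_mutex);
        if (atomic_load(&sched_count) == 0) {
                enable_key();
                synchronize();
        }
        atomic_fetch_add(&sched_count, 1);      /* publish after the sync */
        pthread_mutex_unlock(&sched_mutex);
}

The release side of the pattern is sketched after the
kernel/events/core.c diff below.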

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dvyukov@google.com
Cc: eranian@google.com
Cc: oleg@redhat.com
Cc: panand@redhat.com
Cc: sasha.levin@oracle.com
Cc: vince@deater.net
Link: http://lkml.kernel.org/r/20160224174947.980211985@infradead.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

Authored by Peter Zijlstra, committed by Ingo Molnar
Commit 9107c89e (parent a69b0ca4)

Total: +44 -11

include/linux/perf_event.h (+3 -3)
@@ -906,7 +906,7 @@
        }
 }
 
-extern struct static_key_deferred perf_sched_events;
+extern struct static_key_false perf_sched_events;
 
 static __always_inline bool
 perf_sw_migrate_enabled(void)
@@ -925,7 +925,7 @@
 static inline void perf_event_task_sched_in(struct task_struct *prev,
                                             struct task_struct *task)
 {
-       if (static_key_false(&perf_sched_events.key))
+       if (static_branch_unlikely(&perf_sched_events))
                __perf_event_task_sched_in(prev, task);
 
        if (perf_sw_migrate_enabled() && task->sched_migrated) {
@@ -942,7 +942,7 @@
 {
        perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0);
 
-       if (static_key_false(&perf_sched_events.key))
+       if (static_branch_unlikely(&perf_sched_events))
                __perf_event_task_sched_out(prev, next);
 }
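
[Editor's note] For readers who have not followed the jump-label API
churn: the header change above is a mechanical conversion between two
generations of that interface. A kernel-style sketch of how each reads;
my_key, my_new_key and do_slow_path() are hypothetical names, and this
is not a buildable module on its own:

static void do_slow_path(void) { }

/* Old style: a rate-limited "deferred" key; the branch test takes the
 * embedded plain key (deferral itself is configured separately via
 * jump_label_rate_limit(), the very call the core.c diff removes). */
static struct static_key_deferred my_key;

static void old_style(void)
{
        if (static_key_false(&my_key.key))      /* statically-unlikely branch */
                do_slow_path();
}

/* New style, as converted to above: the default value is part of the
 * type, and the test macro names both the key kind and the expectation. */
static DEFINE_STATIC_KEY_FALSE(my_new_key);

static void new_style(void)
{
        if (static_branch_unlikely(&my_new_key))
                do_slow_path();
}

Note that the conversion gives up the rate limiting that
static_key_deferred provided; the core.c diff below reintroduces it by
hand, via delayed work on the disable path.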
kernel/events/core.c (+41 -8)
@@ -321,7 +321,13 @@
  * perf_sched_events : >0 events exist
  * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
  */
-struct static_key_deferred perf_sched_events __read_mostly;
+
+static void perf_sched_delayed(struct work_struct *work);
+DEFINE_STATIC_KEY_FALSE(perf_sched_events);
+static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
+static DEFINE_MUTEX(perf_sched_mutex);
+static atomic_t perf_sched_count;
+
 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
 static DEFINE_PER_CPU(int, perf_sched_cb_usages);
 
@@ -3536,10 +3542,20 @@ static void unaccount_event(struct perf_event *event)
        if (has_branch_stack(event))
                dec = true;
 
-       if (dec)
-               static_key_slow_dec_deferred(&perf_sched_events);
+       if (dec) {
+               if (!atomic_add_unless(&perf_sched_count, -1, 1))
+                       schedule_delayed_work(&perf_sched_work, HZ);
+       }
 
        unaccount_event_cpu(event, event->cpu);
+}
+
+static void perf_sched_delayed(struct work_struct *work)
+{
+       mutex_lock(&perf_sched_mutex);
+       if (atomic_dec_and_test(&perf_sched_count))
+               static_branch_disable(&perf_sched_events);
+       mutex_unlock(&perf_sched_mutex);
 }
 
 /*
@@ -7780,8 +7796,28 @@ static void account_event(struct perf_event *event)
        if (is_cgroup_event(event))
                inc = true;
 
-       if (inc)
-               static_key_slow_inc(&perf_sched_events.key);
+       if (inc) {
+               if (atomic_inc_not_zero(&perf_sched_count))
+                       goto enabled;
+
+               mutex_lock(&perf_sched_mutex);
+               if (!atomic_read(&perf_sched_count)) {
+                       static_branch_enable(&perf_sched_events);
+                       /*
+                        * Guarantee that all CPUs observe they key change and
+                        * call the perf scheduling hooks before proceeding to
+                        * install events that need them.
+                        */
+                       synchronize_sched();
+               }
+               /*
+                * Now that we have waited for the sync_sched(), allow further
+                * increments to by-pass the mutex.
+                */
+               atomic_inc(&perf_sched_count);
+               mutex_unlock(&perf_sched_mutex);
+       }
+enabled:
 
        account_event_cpu(event, event->cpu);
 }
@@ -9343,9 +9379,6 @@ void __init perf_event_init(void)
 
        ret = init_hw_breakpoint();
        WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
-
-       /* do not patch jump label more than once per second */
-       jump_label_rate_limit(&perf_sched_events, HZ);
 
        /*
         * Build time assertion that we keep the data_head at the intended
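
[Editor's note] The disable path deserves a note, continuing the
userspace sketch from after the commit message (same hypothetical
sched_count and sched_mutex; disable_key() stands in for
static_branch_disable()). Decrements that cannot reach zero complete
lock-free, which is the atomic_add_unless(&count, -1, 1) idiom above;
the final 1->0 transition is serialized against the enable path by the
same mutex. In the kernel that last step is additionally punted to
delayed work (perf_sched_delayed(), scheduled HZ ticks out), which is
what replaces the jump_label_rate_limit() call deleted at the end of
the diff.

static void disable_key(void) { /* ~static_branch_disable() */ }

static void unaccount_fixed(void)
{
        int cur = atomic_load(&sched_count);

        /* ~atomic_add_unless(&count, -1, 1): drop our reference unless
         * we would be the one taking the count to zero */
        while (cur != 1) {
                if (atomic_compare_exchange_weak(&sched_count, &cur, cur - 1))
                        return;                 /* not the last user */
        }

        /* possibly the last user: do the 1->0 transition under the same
         * mutex as the enable path (~perf_sched_delayed()) */
        pthread_mutex_lock(&sched_mutex);
        if (atomic_fetch_sub(&sched_count, 1) == 1)   /* ~atomic_dec_and_test() */
                disable_key();
        pthread_mutex_unlock(&sched_mutex);
}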