Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'perf/urgent' into perf/core

Conflicts:
arch/x86/kernel/apic/hw_nmi.c

Merge reason: Resolve conflict, queue up dependent patch.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

+144 -46
+1 -1
arch/x86/Kconfig
··· 21 21 select HAVE_UNSTABLE_SCHED_CLOCK 22 22 select HAVE_IDE 23 23 select HAVE_OPROFILE 24 - select HAVE_PERF_EVENTS if (!M386 && !M486) 24 + select HAVE_PERF_EVENTS 25 25 select HAVE_IRQ_WORK 26 26 select HAVE_IOREMAP_PROT 27 27 select HAVE_KPROBES
+4 -3
arch/x86/kernel/apic/hw_nmi.c
··· 17 17 #include <linux/nmi.h> 18 18 #include <linux/module.h> 19 19 20 - /* For reliability, we're prepared to waste bits here. */ 21 - static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; 22 - 23 20 #ifdef CONFIG_HARDLOCKUP_DETECTOR 24 21 u64 hw_nmi_get_sample_period(void) 25 22 { 26 23 return (u64)(cpu_khz) * 1000 * 60; 27 24 } 28 25 #endif 26 + 27 + 28 + /* For reliability, we're prepared to waste bits here. */ 29 + static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; 29 30 30 31 #ifdef arch_trigger_all_cpu_backtrace 31 32 void arch_trigger_all_cpu_backtrace(void)
+20
arch/x86/kernel/cpu/perf_event.c
··· 372 372 373 373 #endif 374 374 375 + static bool check_hw_exists(void) 376 + { 377 + u64 val, val_new = 0; 378 + int ret = 0; 379 + 380 + val = 0xabcdUL; 381 + ret |= checking_wrmsrl(x86_pmu.perfctr, val); 382 + ret |= rdmsrl_safe(x86_pmu.perfctr, &val_new); 383 + if (ret || val != val_new) 384 + return false; 385 + 386 + return true; 387 + } 388 + 375 389 static void reserve_ds_buffers(void); 376 390 static void release_ds_buffers(void); 377 391 ··· 1376 1362 } 1377 1363 1378 1364 pmu_check_apic(); 1365 + 1366 + /* sanity check that the hardware exists or is emulated */ 1367 + if (!check_hw_exists()) { 1368 + pr_cont("Broken PMU hardware detected, software events only.\n"); 1369 + return; 1370 + } 1379 1371 1380 1372 pr_cont("%s PMU driver.\n", x86_pmu.name); 1381 1373
+2
arch/x86/kernel/entry_64.S
··· 295 295 .endm 296 296 297 297 /* save partial stack frame */ 298 + .pushsection .kprobes.text, "ax" 298 299 ENTRY(save_args) 299 300 XCPT_FRAME 300 301 cld ··· 335 334 ret 336 335 CFI_ENDPROC 337 336 END(save_args) 337 + .popsection 338 338 339 339 ENTRY(save_rest) 340 340 PARTIAL_FRAME 1 REST_SKIP+8
+4
arch/x86/kernel/hw_breakpoint.c
··· 433 433 dr6_p = (unsigned long *)ERR_PTR(args->err); 434 434 dr6 = *dr6_p; 435 435 436 + /* If it's a single step, TRAP bits are random */ 437 + if (dr6 & DR_STEP) 438 + return NOTIFY_DONE; 439 + 436 440 /* Do an early return if no trap bits are set in DR6 */ 437 441 if ((dr6 & DR_TRAP_BITS) == 0) 438 442 return NOTIFY_DONE;
+4
include/linux/hw_breakpoint.h
··· 33 33 34 34 #ifdef CONFIG_HAVE_HW_BREAKPOINT 35 35 36 + extern int __init init_hw_breakpoint(void); 37 + 36 38 static inline void hw_breakpoint_init(struct perf_event_attr *attr) 37 39 { 38 40 memset(attr, 0, sizeof(*attr)); ··· 109 107 } 110 108 111 109 #else /* !CONFIG_HAVE_HW_BREAKPOINT */ 110 + 111 + static inline int __init init_hw_breakpoint(void) { return 0; } 112 112 113 113 static inline struct perf_event * 114 114 register_user_hw_breakpoint(struct perf_event_attr *attr,
+16 -14
include/linux/perf_event.h
··· 850 850 int nr_active; 851 851 int is_active; 852 852 int nr_stat; 853 + int rotate_disable; 853 854 atomic_t refcount; 854 855 struct task_struct *task; 855 856 ··· 909 908 extern const char *perf_pmu_name(void); 910 909 extern void __perf_event_task_sched_in(struct task_struct *task); 911 910 extern void __perf_event_task_sched_out(struct task_struct *task, struct task_struct *next); 912 - 913 - extern atomic_t perf_task_events; 914 - 915 - static inline void perf_event_task_sched_in(struct task_struct *task) 916 - { 917 - COND_STMT(&perf_task_events, __perf_event_task_sched_in(task)); 918 - } 919 - 920 - static inline 921 - void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next) 922 - { 923 - COND_STMT(&perf_task_events, __perf_event_task_sched_out(task, next)); 924 - } 925 - 926 911 extern int perf_event_init_task(struct task_struct *child); 927 912 extern void perf_event_exit_task(struct task_struct *child); 928 913 extern void perf_event_free_task(struct task_struct *task); ··· 1015 1028 regs = &hot_regs; 1016 1029 } 1017 1030 __perf_sw_event(event_id, nr, nmi, regs, addr); 1031 + } 1032 + 1033 + extern atomic_t perf_task_events; 1034 + 1035 + static inline void perf_event_task_sched_in(struct task_struct *task) 1036 + { 1037 + COND_STMT(&perf_task_events, __perf_event_task_sched_in(task)); 1038 + } 1039 + 1040 + static inline 1041 + void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next) 1042 + { 1043 + perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); 1044 + 1045 + COND_STMT(&perf_task_events, __perf_event_task_sched_out(task, next)); 1018 1046 } 1019 1047 1020 1048 extern void perf_event_mmap(struct vm_area_struct *vma);
+1 -2
kernel/hw_breakpoint.c
··· 620 620 .read = hw_breakpoint_pmu_read, 621 621 }; 622 622 623 - static int __init init_hw_breakpoint(void) 623 + int __init init_hw_breakpoint(void) 624 624 { 625 625 unsigned int **task_bp_pinned; 626 626 int cpu, err_cpu; ··· 655 655 656 656 return -ENOMEM; 657 657 } 658 - core_initcall(init_hw_breakpoint); 659 658 660 659
+3 -1
kernel/irq_work.c
··· 145 145 * Clear the BUSY bit and return to the free state if 146 146 * no-one else claimed it meanwhile. 147 147 */ 148 - cmpxchg(&entry->next, next_flags(NULL, IRQ_WORK_BUSY), NULL); 148 + (void)cmpxchg(&entry->next, 149 + next_flags(NULL, IRQ_WORK_BUSY), 150 + NULL); 149 151 } 150 152 } 151 153 EXPORT_SYMBOL_GPL(irq_work_run);
+77 -16
kernel/perf_event.c
··· 31 31 #include <linux/kernel_stat.h> 32 32 #include <linux/perf_event.h> 33 33 #include <linux/ftrace_event.h> 34 + #include <linux/hw_breakpoint.h> 34 35 35 36 #include <asm/irq_regs.h> 36 37 ··· 1287 1286 { 1288 1287 int ctxn; 1289 1288 1290 - perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); 1291 - 1292 1289 for_each_task_context_nr(ctxn) 1293 1290 perf_event_context_sched_out(task, ctxn, next); 1294 1291 } ··· 1620 1621 { 1621 1622 raw_spin_lock(&ctx->lock); 1622 1623 1623 - /* Rotate the first entry last of non-pinned groups */ 1624 - list_rotate_left(&ctx->flexible_groups); 1624 + /* 1625 + * Rotate the first entry last of non-pinned groups. Rotation might be 1626 + * disabled by the inheritance code. 1627 + */ 1628 + if (!ctx->rotate_disable) 1629 + list_rotate_left(&ctx->flexible_groups); 1625 1630 1626 1631 raw_spin_unlock(&ctx->lock); 1627 1632 } ··· 2237 2234 raw_spin_unlock_irq(&ctx->lock); 2238 2235 mutex_unlock(&ctx->mutex); 2239 2236 2240 - mutex_lock(&event->owner->perf_event_mutex); 2241 - list_del_init(&event->owner_entry); 2242 - mutex_unlock(&event->owner->perf_event_mutex); 2243 - put_task_struct(event->owner); 2244 - 2245 2237 free_event(event); 2246 2238 2247 2239 return 0; ··· 2249 2251 static int perf_release(struct inode *inode, struct file *file) 2250 2252 { 2251 2253 struct perf_event *event = file->private_data; 2254 + struct task_struct *owner; 2252 2255 2253 2256 file->private_data = NULL; 2257 + 2258 + rcu_read_lock(); 2259 + owner = ACCESS_ONCE(event->owner); 2260 + /* 2261 + * Matches the smp_wmb() in perf_event_exit_task(). If we observe 2262 + * !owner it means the list deletion is complete and we can indeed 2263 + * free this event, otherwise we need to serialize on 2264 + * owner->perf_event_mutex. 2265 + */ 2266 + smp_read_barrier_depends(); 2267 + if (owner) { 2268 + /* 2269 + * Since delayed_put_task_struct() also drops the last 2270 + * task reference we can safely take a new reference 2271 + * while holding the rcu_read_lock(). 2272 + */ 2273 + get_task_struct(owner); 2274 + } 2275 + rcu_read_unlock(); 2276 + 2277 + if (owner) { 2278 + mutex_lock(&owner->perf_event_mutex); 2279 + /* 2280 + * We have to re-check the event->owner field, if it is cleared 2281 + * we raced with perf_event_exit_task(), acquiring the mutex 2282 + * ensured they're done, and we can proceed with freeing the 2283 + * event. 2284 + */ 2285 + if (event->owner) 2286 + list_del_init(&event->owner_entry); 2287 + mutex_unlock(&owner->perf_event_mutex); 2288 + put_task_struct(owner); 2289 + } 2254 2290 2255 2291 return perf_event_release_kernel(event); 2256 2292 } ··· 5700 5668 mutex_unlock(&ctx->mutex); 5701 5669 5702 5670 event->owner = current; 5703 - get_task_struct(current); 5671 + 5704 5672 mutex_lock(&current->perf_event_mutex); 5705 5673 list_add_tail(&event->owner_entry, &current->perf_event_list); 5706 5674 mutex_unlock(&current->perf_event_mutex); ··· 5767 5735 perf_install_in_context(ctx, event, cpu); 5768 5736 ++ctx->generation; 5769 5737 mutex_unlock(&ctx->mutex); 5770 - 5771 - event->owner = current; 5772 - get_task_struct(current); 5773 - mutex_lock(&current->perf_event_mutex); 5774 - list_add_tail(&event->owner_entry, &current->perf_event_list); 5775 - mutex_unlock(&current->perf_event_mutex); 5776 5738 5777 5739 return event; 5778 5740 ··· 5918 5892 */ 5919 5893 void perf_event_exit_task(struct task_struct *child) 5920 5894 { 5895 + struct perf_event *event, *tmp; 5921 5896 int ctxn; 5897 + 5898 + mutex_lock(&child->perf_event_mutex); 5899 + list_for_each_entry_safe(event, tmp, &child->perf_event_list, 5900 + owner_entry) { 5901 + list_del_init(&event->owner_entry); 5902 + 5903 + /* 5904 + * Ensure the list deletion is visible before we clear 5905 + * the owner, closes a race against perf_release() where 5906 + * we need to serialize on the owner->perf_event_mutex. 5907 + */ 5908 + smp_wmb(); 5909 + event->owner = NULL; 5910 + } 5911 + mutex_unlock(&child->perf_event_mutex); 5922 5912 5923 5913 for_each_task_context_nr(ctxn) 5924 5914 perf_event_exit_task_context(child, ctxn); ··· 6155 6113 struct perf_event *event; 6156 6114 struct task_struct *parent = current; 6157 6115 int inherited_all = 1; 6116 + unsigned long flags; 6158 6117 int ret = 0; 6159 6118 6160 6119 child->perf_event_ctxp[ctxn] = NULL; ··· 6196 6153 break; 6197 6154 } 6198 6155 6156 + /* 6157 + * We can't hold ctx->lock when iterating the ->flexible_group list due 6158 + * to allocations, but we need to prevent rotation because 6159 + * rotate_ctx() will change the list from interrupt context. 6160 + */ 6161 + raw_spin_lock_irqsave(&parent_ctx->lock, flags); 6162 + parent_ctx->rotate_disable = 1; 6163 + raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); 6164 + 6199 6165 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { 6200 6166 ret = inherit_task_group(event, parent, parent_ctx, 6201 6167 child, ctxn, &inherited_all); 6202 6168 if (ret) 6203 6169 break; 6204 6170 } 6171 + 6172 + raw_spin_lock_irqsave(&parent_ctx->lock, flags); 6173 + parent_ctx->rotate_disable = 0; 6174 + raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); 6205 6175 6206 6176 child_ctx = child->perf_event_ctxp[ctxn]; 6207 6177 ··· 6368 6312 6369 6313 void __init perf_event_init(void) 6370 6314 { 6315 + int ret; 6316 + 6371 6317 perf_event_init_all_cpus(); 6372 6318 init_srcu_struct(&pmus_srcu); 6373 6319 perf_pmu_register(&perf_swevent); ··· 6377 6319 perf_pmu_register(&perf_task_clock); 6378 6320 perf_tp_register(); 6379 6321 perf_cpu_notifier(perf_cpu_notify); 6322 + 6323 + ret = init_hw_breakpoint(); 6324 + WARN(ret, "hw_breakpoint initialization failed with: %d", ret); 6380 6325 }
+9 -8
tools/perf/builtin-record.c
··· 697 697 if (err < 0) 698 698 err = event__synthesize_kernel_mmap(process_synthesized_event, 699 699 session, machine, "_stext"); 700 - if (err < 0) { 701 - pr_err("Couldn't record kernel reference relocation symbol.\n"); 702 - return err; 703 - } 700 + if (err < 0) 701 + pr_err("Couldn't record kernel reference relocation symbol\n" 702 + "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n" 703 + "Check /proc/kallsyms permission or run as root.\n"); 704 704 705 705 err = event__synthesize_modules(process_synthesized_event, 706 706 session, machine); 707 - if (err < 0) { 708 - pr_err("Couldn't record kernel reference relocation symbol.\n"); 709 - return err; 710 - } 707 + if (err < 0) 708 + pr_err("Couldn't record kernel module information.\n" 709 + "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n" 710 + "Check /proc/modules permission or run as root.\n"); 711 + 711 712 if (perf_guest) 712 713 perf_session__process_machines(session, event__synthesize_guest_os); 713 714
+3 -1
tools/perf/util/symbol.c
··· 295 295 { 296 296 struct rb_node **p = &self->rb_node; 297 297 struct rb_node *parent = NULL; 298 - struct symbol_name_rb_node *symn = ((void *)sym) - sizeof(*parent), *s; 298 + struct symbol_name_rb_node *symn, *s; 299 + 300 + symn = container_of(sym, struct symbol_name_rb_node, sym); 299 301 300 302 while (*p != NULL) { 301 303 parent = *p;