Merge branch 'perf-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'perf-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
perf, x86: Try to handle unknown nmis with an enabled PMU
perf, x86: Fix handle_irq return values
perf, x86: Fix accidentally ack'ing a second event on intel perf counter
oprofile, x86: fix init_sysfs() function stub
lockup_detector: Sync touch_*_watchdog back to old semantics
tracing: Fix a race in function profile
oprofile, x86: fix init_sysfs error handling
perf_events: Fix time tracking for events with pid != -1 and cpu != -1
perf: Initialize callchain roots' children hits
oprofile: fix crash when accessing freed task structs

+133 -53
+46 -13
arch/x86/kernel/cpu/perf_event.c
···
 		/*
 		 * event overflow
 		 */
-		handled = 1;
+		handled++;
 		data.period = event->hw.last_period;
 
 		if (!x86_perf_event_set_period(event))
···
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
 }
 
+struct pmu_nmi_state {
+	unsigned int marked;
+	int handled;
+};
+
+static DEFINE_PER_CPU(struct pmu_nmi_state, pmu_nmi);
+
 static int __kprobes
 perf_event_nmi_handler(struct notifier_block *self,
 			 unsigned long cmd, void *__args)
 {
 	struct die_args *args = __args;
-	struct pt_regs *regs;
+	unsigned int this_nmi;
+	int handled;
 
 	if (!atomic_read(&active_events))
 		return NOTIFY_DONE;
···
 	case DIE_NMI:
 	case DIE_NMI_IPI:
 		break;
-
+	case DIE_NMIUNKNOWN:
+		this_nmi = percpu_read(irq_stat.__nmi_count);
+		if (this_nmi != __get_cpu_var(pmu_nmi).marked)
+			/* let the kernel handle the unknown nmi */
+			return NOTIFY_DONE;
+		/*
+		 * This one is a PMU back-to-back nmi. Two events
+		 * trigger 'simultaneously' raising two back-to-back
+		 * NMIs. If the first NMI handles both, the latter
+		 * will be empty and daze the CPU. So, we drop it to
+		 * avoid false-positive 'unknown nmi' messages.
+		 */
+		return NOTIFY_STOP;
 	default:
 		return NOTIFY_DONE;
 	}
 
-	regs = args->regs;
-
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
-	/*
-	 * Can't rely on the handled return value to say it was our NMI, two
-	 * events could trigger 'simultaneously' raising two back-to-back NMIs.
-	 *
-	 * If the first NMI handles both, the latter will be empty and daze
-	 * the CPU.
-	 */
-	x86_pmu.handle_irq(regs);
+
+	handled = x86_pmu.handle_irq(args->regs);
+	if (!handled)
+		return NOTIFY_DONE;
+
+	this_nmi = percpu_read(irq_stat.__nmi_count);
+	if ((handled > 1) ||
+		/* the next nmi could be a back-to-back nmi */
+	    ((__get_cpu_var(pmu_nmi).marked == this_nmi) &&
+	     (__get_cpu_var(pmu_nmi).handled > 1))) {
+		/*
+		 * We could have two subsequent back-to-back nmis: The
+		 * first handles more than one counter, the 2nd
+		 * handles only one counter and the 3rd handles no
+		 * counter.
+		 *
+		 * This is the 2nd nmi because the previous was
+		 * handling more than one counter. We will mark the
+		 * next (3rd) and then drop it if unhandled.
+		 */
+		__get_cpu_var(pmu_nmi).marked = this_nmi + 1;
+		__get_cpu_var(pmu_nmi).handled = handled;
+	}
 
 	return NOTIFY_STOP;
 }
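
The comments in the hunk above describe the back-to-back NMI accounting that perf_event_nmi_handler() now keeps in the per-CPU pmu_nmi state. As a minimal sketch of just that bookkeeping, here is a userspace C model (illustration only, not kernel code): nmi_count stands in for irq_stat.__nmi_count, and pmu_nmi()/unknown_nmi() are invented stand-ins for the DIE_NMI and DIE_NMIUNKNOWN paths.

/* Userspace model of the PMU back-to-back NMI bookkeeping (illustration only). */
#include <stdio.h>

struct pmu_nmi_state {
	unsigned int marked;	/* NMI number at which an empty PMU NMI is expected */
	int handled;		/* events handled by the NMI that set the mark */
};

static struct pmu_nmi_state pmu_state;	/* per-CPU in the real code */
static unsigned int nmi_count;		/* stands in for irq_stat.__nmi_count */

/* An NMI the PMU claims: 'handled' counters overflowed in this NMI. */
static void pmu_nmi(int handled)
{
	nmi_count++;
	if (handled > 1 ||
	    (pmu_state.marked == nmi_count && pmu_state.handled > 1)) {
		/* the next NMI may be a back-to-back NMI with nothing left to do */
		pmu_state.marked = nmi_count + 1;
		pmu_state.handled = handled;
	}
	printf("NMI %u: PMU handled %d event(s)\n", nmi_count, handled);
}

/* An NMI nobody claims (the DIE_NMIUNKNOWN path). */
static void unknown_nmi(void)
{
	nmi_count++;
	if (nmi_count == pmu_state.marked)
		printf("NMI %u: empty back-to-back PMU NMI, silently dropped\n", nmi_count);
	else
		printf("NMI %u: genuinely unknown NMI, would be reported\n", nmi_count);
}

int main(void)
{
	pmu_nmi(2);	/* two events raced; the first NMI handled both */
	unknown_nmi();	/* the second NMI finds no work: swallowed, no 'dazed' warning */
	unknown_nmi();	/* a later unknown NMI is still passed on */
	return 0;
}

Running it shows the empty second NMI being swallowed while a later unknown NMI is still reported, which is exactly the false positive the patch targets.
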
+9 -6
arch/x86/kernel/cpu/perf_event_intel.c
···
 	struct perf_sample_data data;
 	struct cpu_hw_events *cpuc;
 	int bit, loops;
-	u64 ack, status;
+	u64 status;
+	int handled = 0;
 
 	perf_sample_data_init(&data, 0);
 
···
 
 	loops = 0;
 again:
+	intel_pmu_ack_status(status);
 	if (++loops > 100) {
 		WARN_ONCE(1, "perfevents: irq loop stuck!\n");
 		perf_event_print_debug();
···
 	}
 
 	inc_irq_stat(apic_perf_irqs);
-	ack = status;
 
 	intel_pmu_lbr_read();
 
 	/*
 	 * PEBS overflow sets bit 62 in the global status register
 	 */
-	if (__test_and_clear_bit(62, (unsigned long *)&status))
+	if (__test_and_clear_bit(62, (unsigned long *)&status)) {
+		handled++;
 		x86_pmu.drain_pebs(regs);
+	}
 
 	for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
 		struct perf_event *event = cpuc->events[bit];
+
+		handled++;
 
 		if (!test_bit(bit, cpuc->active_mask))
 			continue;
···
 			x86_pmu_stop(event);
 	}
 
-	intel_pmu_ack_status(ack);
-
 	/*
 	 * Repeat if there is more work to be done:
 	 */
···
 
 done:
 	intel_pmu_enable_all(0);
-	return 1;
+	return handled;
 }
 
 static struct event_constraint *
+1 -1
arch/x86/kernel/cpu/perf_event_p4.c
···
 		inc_irq_stat(apic_perf_irqs);
 	}
 
-	return handled > 0;
+	return handled;
 }
 
 /*
+17 -5
arch/x86/oprofile/nmi_int.c
···
 	int error;
 
 	error = sysdev_class_register(&oprofile_sysclass);
-	if (!error)
-		error = sysdev_register(&device_oprofile);
+	if (error)
+		return error;
+
+	error = sysdev_register(&device_oprofile);
+	if (error)
+		sysdev_class_unregister(&oprofile_sysclass);
+
 	return error;
 }
 
···
 }
 
 #else
-#define init_sysfs() do { } while (0)
-#define exit_sysfs() do { } while (0)
+
+static inline int init_sysfs(void) { return 0; }
+static inline void exit_sysfs(void) { }
+
 #endif /* CONFIG_PM */
 
 static int __init p4_init(char **cpu_type)
···
 	char *cpu_type = NULL;
 	int ret = 0;
 
+	using_nmi = 0;
+
 	if (!cpu_has_apic)
 		return -ENODEV;
 
···
 
 	mux_init(ops);
 
-	init_sysfs();
+	ret = init_sysfs();
+	if (ret)
+		return ret;
+
 	using_nmi = 1;
 	printk(KERN_INFO "oprofile: using NMI interrupt.\n");
 	return 0;
+14 -13
drivers/oprofile/buffer_sync.c
···
 	.notifier_call = module_load_notify,
 };
 
-
-static void end_sync(void)
-{
-	end_cpu_work();
-	/* make sure we don't leak task structs */
-	process_task_mortuary();
-	process_task_mortuary();
-}
-
-
 int sync_start(void)
 {
 	int err;
···
 	if (!zalloc_cpumask_var(&marked_cpus, GFP_KERNEL))
 		return -ENOMEM;
 
-	start_cpu_work();
+	mutex_lock(&buffer_mutex);
 
 	err = task_handoff_register(&task_free_nb);
 	if (err)
···
 	if (err)
 		goto out4;
 
+	start_cpu_work();
+
 out:
+	mutex_unlock(&buffer_mutex);
 	return err;
 out4:
 	profile_event_unregister(PROFILE_MUNMAP, &munmap_nb);
···
 out2:
 	task_handoff_unregister(&task_free_nb);
 out1:
-	end_sync();
 	free_cpumask_var(marked_cpus);
 	goto out;
 }
···
 
 void sync_stop(void)
 {
+	/* flush buffers */
+	mutex_lock(&buffer_mutex);
+	end_cpu_work();
 	unregister_module_notifier(&module_load_nb);
 	profile_event_unregister(PROFILE_MUNMAP, &munmap_nb);
 	profile_event_unregister(PROFILE_TASK_EXIT, &task_exit_nb);
 	task_handoff_unregister(&task_free_nb);
-	end_sync();
+	mutex_unlock(&buffer_mutex);
+	flush_scheduled_work();
+
+	/* make sure we don't leak task structs */
+	process_task_mortuary();
+	process_task_mortuary();
+
 	free_cpumask_var(marked_cpus);
 }
 
-2
drivers/oprofile/cpu_buffer.c
···
 
 		cancel_delayed_work(&b->work);
 	}
-
-	flush_scheduled_work();
 }
 
 /*
+22 -4
kernel/perf_event.c
···
 	}
 }
 
+static inline int
+event_filter_match(struct perf_event *event)
+{
+	return event->cpu == -1 || event->cpu == smp_processor_id();
+}
+
 static void
 event_sched_out(struct perf_event *event,
 		  struct perf_cpu_context *cpuctx,
 		  struct perf_event_context *ctx)
 {
+	u64 delta;
+	/*
+	 * An event which could not be activated because of
+	 * filter mismatch still needs to have its timings
+	 * maintained, otherwise bogus information is return
+	 * via read() for time_enabled, time_running:
+	 */
+	if (event->state == PERF_EVENT_STATE_INACTIVE
+	    && !event_filter_match(event)) {
+		delta = ctx->time - event->tstamp_stopped;
+		event->tstamp_running += delta;
+		event->tstamp_stopped = ctx->time;
+	}
+
 	if (event->state != PERF_EVENT_STATE_ACTIVE)
 		return;
 
···
 		struct perf_event_context *ctx)
 {
 	struct perf_event *event;
-
-	if (group_event->state != PERF_EVENT_STATE_ACTIVE)
-		return;
+	int state = group_event->state;
 
 	event_sched_out(group_event, cpuctx, ctx);
 
···
 	list_for_each_entry(event, &group_event->sibling_list, group_entry)
 		event_sched_out(event, cpuctx, ctx);
 
-	if (group_event->attr.exclusive)
+	if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
 		cpuctx->exclusive = 0;
 }
 
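
The event_sched_out() hunk above advances tstamp_running and tstamp_stopped by the same delta for an inactive event that merely failed the CPU filter, so the difference between them stays put while the event's enabled time keeps growing. A minimal userspace sketch of that arithmetic, assuming the simplified read-side formula time_running = tstamp_stopped - tstamp_running for an inactive event; the field names mirror the kernel's, everything else is invented for illustration.

/* Illustration of the timestamp bookkeeping above; not kernel code. */
#include <stdio.h>

typedef unsigned long long u64;

struct fake_event {
	u64 tstamp_running;	/* offset subtracted from the running time */
	u64 tstamp_stopped;	/* ctx->time when the event last stopped */
};

/* The bookkeeping the patch adds for an inactive, filter-mismatched event. */
static void sched_out_filtered(struct fake_event *e, u64 ctx_time)
{
	u64 delta = ctx_time - e->tstamp_stopped;

	e->tstamp_running += delta;
	e->tstamp_stopped = ctx_time;
}

int main(void)
{
	/* Event enabled at ctx->time == 100 on a CPU it never matches. */
	struct fake_event e = { .tstamp_running = 100, .tstamp_stopped = 100 };
	u64 now;

	for (now = 200; now <= 400; now += 100) {
		sched_out_filtered(&e, now);
		/* simplified read-side view for an inactive event */
		printf("ctx->time=%llu time_running=%llu\n",
		       now, e.tstamp_stopped - e.tstamp_running);
	}
	/* time_running stays 0: the event never counted, and read() now agrees. */
	return 0;
}

With the two timestamps moving in lockstep, time_running stays at zero across every sched-out instead of picking up time the event never spent counting.
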
+11 -4
kernel/trace/ftrace.c
···
 {
 	struct ftrace_profile *rec = v;
 	char str[KSYM_SYMBOL_LEN];
+	int ret = 0;
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
-	static DEFINE_MUTEX(mutex);
 	static struct trace_seq s;
 	unsigned long long avg;
 	unsigned long long stddev;
 #endif
+	mutex_lock(&ftrace_profile_lock);
+
+	/* we raced with function_profile_reset() */
+	if (unlikely(rec->counter == 0)) {
+		ret = -EBUSY;
+		goto out;
+	}
 
 	kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
 	seq_printf(m, " %-30.30s %10lu", str, rec->counter);
···
 		do_div(stddev, (rec->counter - 1) * 1000);
 	}
 
-	mutex_lock(&mutex);
 	trace_seq_init(&s);
 	trace_print_graph_duration(rec->time, &s);
 	trace_seq_puts(&s, " ");
···
 	trace_seq_puts(&s, " ");
 	trace_print_graph_duration(stddev, &s);
 	trace_print_seq(m, &s);
-	mutex_unlock(&mutex);
 #endif
 	seq_putc(m, '\n');
+out:
+	mutex_unlock(&ftrace_profile_lock);
 
-	return 0;
+	return ret;
 }
 
 static void ftrace_profile_reset(struct ftrace_profile_stat *stat)
+12 -5
kernel/watchdog.c
···
 
 void touch_softlockup_watchdog(void)
 {
-	__get_cpu_var(watchdog_touch_ts) = 0;
+	__raw_get_cpu_var(watchdog_touch_ts) = 0;
 }
 EXPORT_SYMBOL(touch_softlockup_watchdog);
 
···
 #ifdef CONFIG_HARDLOCKUP_DETECTOR
 void touch_nmi_watchdog(void)
 {
-	__get_cpu_var(watchdog_nmi_touch) = true;
+	if (watchdog_enabled) {
+		unsigned cpu;
+
+		for_each_present_cpu(cpu) {
+			if (per_cpu(watchdog_nmi_touch, cpu) != true)
+				per_cpu(watchdog_nmi_touch, cpu) = true;
+		}
+	}
 	touch_softlockup_watchdog();
 }
 EXPORT_SYMBOL(touch_nmi_watchdog);
···
 		wake_up_process(p);
 	}
 
+	/* if any cpu succeeds, watchdog is considered enabled for the system */
+	watchdog_enabled = 1;
+
 	return 0;
 }
 
···
 		per_cpu(softlockup_watchdog, cpu) = NULL;
 		kthread_stop(p);
 	}
-
-	/* if any cpu succeeds, watchdog is considered enabled for the system */
-	watchdog_enabled = 1;
 }
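
The touch_nmi_watchdog() hunk above restores the old semantics: a single touch resets the hardlockup check on every present CPU, not just the caller's. A minimal userspace sketch of that behaviour; the NR_FAKE_CPUS array is an invented stand-in for the per-CPU watchdog_nmi_touch variable, and is_hardlockup() here is a simplified placeholder for the real hrtimer-interrupt comparison.

/* Userspace sketch of the restored touch_nmi_watchdog() semantics (illustration only). */
#include <stdbool.h>
#include <stdio.h>

#define NR_FAKE_CPUS 4	/* invented; the kernel iterates over present CPUs */

static bool watchdog_nmi_touch[NR_FAKE_CPUS];	/* per-CPU flag in the kernel */
static bool watchdog_enabled = true;

/* Touching from any CPU marks every CPU, mirroring the hunk above. */
static void touch_nmi_watchdog(void)
{
	int cpu;

	if (!watchdog_enabled)
		return;

	for (cpu = 0; cpu < NR_FAKE_CPUS; cpu++)
		watchdog_nmi_touch[cpu] = true;
}

/* Simplified per-CPU check: consume the touch and skip the lockup test. */
static bool is_hardlockup(int cpu)
{
	if (watchdog_nmi_touch[cpu]) {
		watchdog_nmi_touch[cpu] = false;
		return false;	/* recently touched: not a lockup */
	}
	return true;		/* stand-in for the real hrtimer-interrupt comparison */
}

int main(void)
{
	int cpu;

	touch_nmi_watchdog();
	for (cpu = 0; cpu < NR_FAKE_CPUS; cpu++)
		printf("cpu%d hardlockup? %s\n", cpu,
		       is_hardlockup(cpu) ? "yes" : "no");
	return 0;
}
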
+1
tools/perf/util/callchain.h
···
 	INIT_LIST_HEAD(&node->children);
 	INIT_LIST_HEAD(&node->val);
 
+	node->children_hit = 0;
 	node->parent = NULL;
 	node->hit = 0;
 }