Merge branch 'perf-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'perf-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
perf, x86: Try to handle unknown nmis with an enabled PMU
perf, x86: Fix handle_irq return values
perf, x86: Fix accidentally ack'ing a second event on intel perf counter
oprofile, x86: fix init_sysfs() function stub
lockup_detector: Sync touch_*_watchdog back to old semantics
tracing: Fix a race in function profile
oprofile, x86: fix init_sysfs error handling
perf_events: Fix time tracking for events with pid != -1 and cpu != -1
perf: Initialize callchains roots's childen hits
oprofile: fix crash when accessing freed task structs

10 files changed, 133 insertions(+), 53 deletions(-)
arch/x86/kernel/cpu/perf_event.c | +46 -13

···
                /*
                 * event overflow
                 */
-               handled = 1;
+               handled++;
                data.period = event->hw.last_period;

                if (!x86_perf_event_set_period(event))
···
        apic_write(APIC_LVTPC, APIC_DM_NMI);
 }

+struct pmu_nmi_state {
+       unsigned int marked;
+       int handled;
+};
+
+static DEFINE_PER_CPU(struct pmu_nmi_state, pmu_nmi);
+
 static int __kprobes
 perf_event_nmi_handler(struct notifier_block *self,
                        unsigned long cmd, void *__args)
 {
        struct die_args *args = __args;
-       struct pt_regs *regs;
+       unsigned int this_nmi;
+       int handled;

        if (!atomic_read(&active_events))
                return NOTIFY_DONE;
···
        case DIE_NMI:
        case DIE_NMI_IPI:
                break;
-
+       case DIE_NMIUNKNOWN:
+               this_nmi = percpu_read(irq_stat.__nmi_count);
+               if (this_nmi != __get_cpu_var(pmu_nmi).marked)
+                       /* let the kernel handle the unknown nmi */
+                       return NOTIFY_DONE;
+               /*
+                * This one is a PMU back-to-back nmi. Two events
+                * trigger 'simultaneously' raising two back-to-back
+                * NMIs. If the first NMI handles both, the latter
+                * will be empty and daze the CPU. So, we drop it to
+                * avoid false-positive 'unknown nmi' messages.
+                */
+               return NOTIFY_STOP;
        default:
                return NOTIFY_DONE;
        }

-       regs = args->regs;
-
        apic_write(APIC_LVTPC, APIC_DM_NMI);
-       /*
-        * Can't rely on the handled return value to say it was our NMI, two
-        * events could trigger 'simultaneously' raising two back-to-back NMIs.
-        *
-        * If the first NMI handles both, the latter will be empty and daze
-        * the CPU.
-        */
-       x86_pmu.handle_irq(regs);
+
+       handled = x86_pmu.handle_irq(args->regs);
+       if (!handled)
+               return NOTIFY_DONE;
+
+       this_nmi = percpu_read(irq_stat.__nmi_count);
+       if ((handled > 1) ||
+               /* the next nmi could be a back-to-back nmi */
+           ((__get_cpu_var(pmu_nmi).marked == this_nmi) &&
+            (__get_cpu_var(pmu_nmi).handled > 1))) {
+               /*
+                * We could have two subsequent back-to-back nmis: The
+                * first handles more than one counter, the 2nd
+                * handles only one counter and the 3rd handles no
+                * counter.
+                *
+                * This is the 2nd nmi because the previous was
+                * handling more than one counter. We will mark the
+                * next (3rd) and then drop it if unhandled.
+                */
+               __get_cpu_var(pmu_nmi).marked = this_nmi + 1;
+               __get_cpu_var(pmu_nmi).handled = handled;
+       }

        return NOTIFY_STOP;
 }
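The marked/handled bookkeeping above is easier to follow in isolation; the following stand-alone user-space sketch replays the same logic. The function names (pmu_nmi_handled(), unknown_nmi()), the nmi_count variable and the sample NMI sequence are invented for illustration; this is not kernel code.

/* User-space sketch of the pmu_nmi marked/handled bookkeeping.
 * The nmi_count and handled values are invented for illustration;
 * this is not kernel code. */
#include <stdio.h>

struct pmu_nmi_state {
        unsigned int marked;    /* nmi_count we expect a back-to-back NMI at */
        int handled;            /* how many counters the marking NMI handled */
};

static struct pmu_nmi_state pmu_nmi;    /* per-CPU in the real code */
static unsigned int nmi_count;          /* stands in for irq_stat.__nmi_count */

/* Called for an NMI the PMU claimed; 'handled' is what handle_irq() returned. */
static void pmu_nmi_handled(int handled)
{
        unsigned int this_nmi = nmi_count;

        if (handled > 1 ||
            (pmu_nmi.marked == this_nmi && pmu_nmi.handled > 1)) {
                /* the next NMI may be an empty back-to-back NMI: mark it */
                pmu_nmi.marked = this_nmi + 1;
                pmu_nmi.handled = handled;
        }
}

/* Called for an NMI nobody claimed: swallow it only if it was marked. */
static const char *unknown_nmi(void)
{
        if (nmi_count == pmu_nmi.marked)
                return "dropped (expected back-to-back PMU NMI)";
        return "passed on as a real unknown NMI";
}

int main(void)
{
        nmi_count = 1; pmu_nmi_handled(2);                      /* one NMI handled two counters ... */
        nmi_count = 2; printf("NMI 2: %s\n", unknown_nmi());    /* ... so NMI 2 may be empty */
        nmi_count = 3; printf("NMI 3: %s\n", unknown_nmi());    /* unrelated unknown NMI */
        return 0;
}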
arch/x86/kernel/cpu/perf_event_intel.c | +9 -6

···
        struct perf_sample_data data;
        struct cpu_hw_events *cpuc;
        int bit, loops;
-       u64 ack, status;
+       u64 status;
+       int handled = 0;

        perf_sample_data_init(&data, 0);

···

        loops = 0;
again:
+       intel_pmu_ack_status(status);
        if (++loops > 100) {
                WARN_ONCE(1, "perfevents: irq loop stuck!\n");
                perf_event_print_debug();
···
        }

        inc_irq_stat(apic_perf_irqs);
-       ack = status;

        intel_pmu_lbr_read();

        /*
         * PEBS overflow sets bit 62 in the global status register
         */
-       if (__test_and_clear_bit(62, (unsigned long *)&status))
+       if (__test_and_clear_bit(62, (unsigned long *)&status)) {
+               handled++;
                x86_pmu.drain_pebs(regs);
+       }

        for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
                struct perf_event *event = cpuc->events[bit];
+
+               handled++;

                if (!test_bit(bit, cpuc->active_mask))
                        continue;
···
                        x86_pmu_stop(event);
        }

-       intel_pmu_ack_status(ack);
-
        /*
         * Repeat if there is more work to be done:
         */
···

done:
        intel_pmu_enable_all(0);
-       return 1;
+       return handled;
 }

 static struct event_constraint *
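The handle_irq change amounts to counting how many overflow conditions were actually serviced instead of returning a constant 1, so the NMI path can tell "ours" from "not ours". A minimal stand-alone sketch of that counting, with bit 62 standing in for the PEBS overflow bit as in the Intel handler; handle_status() and the sample status words are invented for illustration:

/* Stand-alone sketch: count serviced overflow bits in a 64-bit status word.
 * Bit 62 stands in for the PEBS overflow bit; everything else here is
 * illustrative user-space code, not the kernel handler. */
#include <stdio.h>
#include <stdint.h>

static int handle_status(uint64_t status)
{
        int bit, handled = 0;

        if (status & (1ULL << 62)) {            /* PEBS overflow */
                status &= ~(1ULL << 62);
                handled++;
        }

        for (bit = 0; bit < 64; bit++) {        /* one per overflowed counter */
                if (status & (1ULL << bit))
                        handled++;
        }

        return handled;         /* 0 tells the caller "not ours" */
}

int main(void)
{
        printf("handled = %d\n", handle_status((1ULL << 62) | 0x3));    /* PEBS + 2 counters */
        printf("handled = %d\n", handle_status(0));                     /* nothing pending */
        return 0;
}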
arch/x86/kernel/cpu/perf_event_p4.c | +1 -1

···
                inc_irq_stat(apic_perf_irqs);
        }

-       return handled > 0;
+       return handled;
 }

 /*
arch/x86/oprofile/nmi_int.c | +17 -5

···
        int error;

        error = sysdev_class_register(&oprofile_sysclass);
-       if (!error)
-               error = sysdev_register(&device_oprofile);
+       if (error)
+               return error;
+
+       error = sysdev_register(&device_oprofile);
+       if (error)
+               sysdev_class_unregister(&oprofile_sysclass);
+
        return error;
 }

···
 }

 #else
-#define init_sysfs() do { } while (0)
-#define exit_sysfs() do { } while (0)
+
+static inline int init_sysfs(void) { return 0; }
+static inline void exit_sysfs(void) { }
+
 #endif /* CONFIG_PM */

 static int __init p4_init(char **cpu_type)
···
        char *cpu_type = NULL;
        int ret = 0;

+       using_nmi = 0;
+
        if (!cpu_has_apic)
                return -ENODEV;

···

        mux_init(ops);

-       init_sysfs();
+       ret = init_sysfs();
+       if (ret)
+               return ret;
+
        using_nmi = 1;
        printk(KERN_INFO "oprofile: using NMI interrupt.\n");
        return 0;
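The init_sysfs() change follows the usual register-then-roll-back shape: if the second registration fails, undo the first and propagate the error. A hedged sketch of that shape with stub functions; class_register(), device_register_() and friends are invented stand-ins, not the sysdev API:

/* Sketch of the error-handling shape init_sysfs() now follows: register the
 * class, then the device, and unregister the class again if the second step
 * fails. The functions below are stubs for illustration only. */
#include <stdio.h>

static int class_register(void)      { puts("class registered");   return 0; }
static void class_unregister(void)   { puts("class unregistered"); }
static int device_register_(void)    { puts("device register failed"); return -1; }  /* simulated failure */

static int init_sysfs_like(void)
{
        int error;

        error = class_register();
        if (error)
                return error;           /* nothing to undo yet */

        error = device_register_();
        if (error)
                class_unregister();     /* roll back the first step */

        return error;
}

int main(void)
{
        printf("init returned %d\n", init_sysfs_like());
        return 0;
}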
drivers/oprofile/buffer_sync.c | +14 -13

···
        .notifier_call = module_load_notify,
 };

-
-static void end_sync(void)
-{
-       end_cpu_work();
-       /* make sure we don't leak task structs */
-       process_task_mortuary();
-       process_task_mortuary();
-}
-
-
 int sync_start(void)
 {
        int err;
···
        if (!zalloc_cpumask_var(&marked_cpus, GFP_KERNEL))
                return -ENOMEM;

-       start_cpu_work();
+       mutex_lock(&buffer_mutex);

        err = task_handoff_register(&task_free_nb);
        if (err)
···
        if (err)
                goto out4;

+       start_cpu_work();
+
out:
+       mutex_unlock(&buffer_mutex);
        return err;
out4:
        profile_event_unregister(PROFILE_MUNMAP, &munmap_nb);
···
out2:
        task_handoff_unregister(&task_free_nb);
out1:
-       end_sync();
        free_cpumask_var(marked_cpus);
        goto out;
 }
···

 void sync_stop(void)
 {
+       /* flush buffers */
+       mutex_lock(&buffer_mutex);
+       end_cpu_work();
        unregister_module_notifier(&module_load_nb);
        profile_event_unregister(PROFILE_MUNMAP, &munmap_nb);
        profile_event_unregister(PROFILE_TASK_EXIT, &task_exit_nb);
        task_handoff_unregister(&task_free_nb);
-       end_sync();
+       mutex_unlock(&buffer_mutex);
+       flush_scheduled_work();
+
+       /* make sure we don't leak task structs */
+       process_task_mortuary();
+       process_task_mortuary();
+
        free_cpumask_var(marked_cpus);
 }
drivers/oprofile/cpu_buffer.c | -2

···

                cancel_delayed_work(&b->work);
        }
-
-       flush_scheduled_work();
 }

 /*
kernel/perf_event.c | +22 -4

···
        }
 }

+static inline int
+event_filter_match(struct perf_event *event)
+{
+       return event->cpu == -1 || event->cpu == smp_processor_id();
+}
+
 static void
 event_sched_out(struct perf_event *event,
                  struct perf_cpu_context *cpuctx,
                  struct perf_event_context *ctx)
 {
+       u64 delta;
+       /*
+        * An event which could not be activated because of
+        * filter mismatch still needs to have its timings
+        * maintained, otherwise bogus information is return
+        * via read() for time_enabled, time_running:
+        */
+       if (event->state == PERF_EVENT_STATE_INACTIVE
+           && !event_filter_match(event)) {
+               delta = ctx->time - event->tstamp_stopped;
+               event->tstamp_running += delta;
+               event->tstamp_stopped = ctx->time;
+       }
+
        if (event->state != PERF_EVENT_STATE_ACTIVE)
                return;

···
                struct perf_event_context *ctx)
 {
        struct perf_event *event;
-
-       if (group_event->state != PERF_EVENT_STATE_ACTIVE)
-               return;
+       int state = group_event->state;

        event_sched_out(group_event, cpuctx, ctx);

···
        list_for_each_entry(event, &group_event->sibling_list, group_entry)
                event_sched_out(event, cpuctx, ctx);

-       if (group_event->attr.exclusive)
+       if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
                cpuctx->exclusive = 0;
 }
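The fix keeps time accounting sane for events that stay INACTIVE only because they are bound to a different CPU: at sched-out, tstamp_running and tstamp_stopped are advanced by the same delta, so the reported running time does not grow while the event had no chance to run (for an INACTIVE event, time_running is roughly tstamp_stopped - tstamp_running). The sketch below replays that arithmetic with invented values; struct fake_event and sched_out_filtered() are illustrative, not kernel code:

/* User-space sketch of the timestamp bookkeeping. Values are arbitrary. */
#include <stdio.h>
#include <stdint.h>

struct fake_event {
        uint64_t tstamp_running;
        uint64_t tstamp_stopped;
};

/* Roughly how an INACTIVE event's running time is derived. */
static uint64_t time_running(const struct fake_event *e)
{
        return e->tstamp_stopped - e->tstamp_running;
}

/* What event_sched_out() now does for an inactive, filter-mismatched event. */
static void sched_out_filtered(struct fake_event *e, uint64_t ctx_time)
{
        uint64_t delta = ctx_time - e->tstamp_stopped;

        e->tstamp_running += delta;
        e->tstamp_stopped = ctx_time;
}

int main(void)
{
        struct fake_event e = { .tstamp_running = 40, .tstamp_stopped = 100 };

        printf("before: time_running=%llu\n", (unsigned long long)time_running(&e));
        sched_out_filtered(&e, 250);    /* context time advanced to 250 */
        printf("after:  time_running=%llu\n", (unsigned long long)time_running(&e));
        return 0;
}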
kernel/trace/ftrace.c | +11 -4

···
 {
        struct ftrace_profile *rec = v;
        char str[KSYM_SYMBOL_LEN];
+       int ret = 0;
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
-       static DEFINE_MUTEX(mutex);
        static struct trace_seq s;
        unsigned long long avg;
        unsigned long long stddev;
 #endif
+       mutex_lock(&ftrace_profile_lock);
+
+       /* we raced with function_profile_reset() */
+       if (unlikely(rec->counter == 0)) {
+               ret = -EBUSY;
+               goto out;
+       }

        kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
        seq_printf(m, "  %-30.30s  %10lu", str, rec->counter);
···
                do_div(stddev, (rec->counter - 1) * 1000);
        }

-       mutex_lock(&mutex);
        trace_seq_init(&s);
        trace_print_graph_duration(rec->time, &s);
        trace_seq_puts(&s, "    ");
···
        trace_seq_puts(&s, "    ");
        trace_print_graph_duration(stddev, &s);
        trace_print_seq(m, &s);
-       mutex_unlock(&mutex);
 #endif
        seq_putc(m, '\n');
+out:
+       mutex_unlock(&ftrace_profile_lock);

-       return 0;
+       return ret;
 }

 static void ftrace_profile_reset(struct ftrace_profile_stat *stat)
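The ftrace fix takes ftrace_profile_lock around the whole show path and bails out with -EBUSY when it finds a record that a concurrent function_profile_reset() has already wiped (counter == 0), which would otherwise feed a zero divisor into the average/stddev math. A small stand-alone sketch of that guard; struct fake_rec and stat_show() are invented for illustration:

/* Sketch of the counter == 0 guard; not the tracing infrastructure. */
#include <errno.h>
#include <stdio.h>

struct fake_rec {
        unsigned long counter;
        unsigned long long time;
};

static int stat_show(const struct fake_rec *rec)
{
        unsigned long long avg;

        /* we raced with a reset: the record was wiped under us */
        if (rec->counter == 0)
                return -EBUSY;

        avg = rec->time / rec->counter; /* safe: counter is non-zero here */
        printf("hits=%lu avg=%llu\n", rec->counter, avg);
        return 0;
}

int main(void)
{
        struct fake_rec live = { .counter = 4, .time = 2000 };
        struct fake_rec wiped = { .counter = 0, .time = 0 };

        printf("live:  %d\n", stat_show(&live));
        printf("wiped: %d\n", stat_show(&wiped));
        return 0;
}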
kernel/watchdog.c | +12 -5

···

 void touch_softlockup_watchdog(void)
 {
-       __get_cpu_var(watchdog_touch_ts) = 0;
+       __raw_get_cpu_var(watchdog_touch_ts) = 0;
 }
 EXPORT_SYMBOL(touch_softlockup_watchdog);

···
 #ifdef CONFIG_HARDLOCKUP_DETECTOR
 void touch_nmi_watchdog(void)
 {
-       __get_cpu_var(watchdog_nmi_touch) = true;
+       if (watchdog_enabled) {
+               unsigned cpu;
+
+               for_each_present_cpu(cpu) {
+                       if (per_cpu(watchdog_nmi_touch, cpu) != true)
+                               per_cpu(watchdog_nmi_touch, cpu) = true;
+               }
+       }
        touch_softlockup_watchdog();
 }
 EXPORT_SYMBOL(touch_nmi_watchdog);
···
                wake_up_process(p);
        }

+       /* if any cpu succeeds, watchdog is considered enabled for the system */
+       watchdog_enabled = 1;
+
        return 0;
 }

···
                per_cpu(softlockup_watchdog, cpu) = NULL;
                kthread_stop(p);
        }
-
-       /* if any cpu succeeds, watchdog is considered enabled for the system */
-       watchdog_enabled = 1;
 }

 static void watchdog_enable_all_cpus(void)
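The watchdog change restores the old touch_nmi_watchdog() semantics: when the watchdog is enabled, every present CPU gets its touch flag set, not just the CPU the caller happens to run on. A user-space sketch of that behaviour, with plain arrays standing in for per-CPU state; NR_CPUS, watchdog_nmi_touch[] and touch_nmi_watchdog_like() are invented for illustration:

/* Sketch only: arrays stand in for per-CPU variables, a fixed CPU count
 * stands in for for_each_present_cpu(). */
#include <stdio.h>
#include <stdbool.h>

#define NR_CPUS 4

static bool watchdog_enabled = true;
static bool watchdog_nmi_touch[NR_CPUS];

static void touch_nmi_watchdog_like(void)
{
        int cpu;

        if (!watchdog_enabled)
                return;

        for (cpu = 0; cpu < NR_CPUS; cpu++)     /* touch all CPUs, old-semantics style */
                watchdog_nmi_touch[cpu] = true;
}

int main(void)
{
        int cpu;

        touch_nmi_watchdog_like();
        for (cpu = 0; cpu < NR_CPUS; cpu++)
                printf("cpu%d touched=%d\n", cpu, watchdog_nmi_touch[cpu]);
        return 0;
}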
tools/perf/util/callchain.h | +1

···
        INIT_LIST_HEAD(&node->children);
        INIT_LIST_HEAD(&node->val);

+       node->children_hit = 0;
        node->parent = NULL;
        node->hit = 0;
 }