Merge branch 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull perf fixes from Ingo Molnar:
"Two fixes:

- Fix 'NMI handler took too long to run' false positives

[ Genuine NMI overhead speedups will come for v3.13, this commit
only fixes a measurement bug ]

- Fix perf ring-buffer missed barrier causing (rare) ring-buffer data
corruption on ppc64"

* 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  perf/x86: Fix NMI measurements
  perf: Fix perf ring buffer memory ordering

Changed files (+39 -14):

 arch/x86/kernel/cpu/perf_event.c |  6 +++---
 arch/x86/kernel/nmi.c            |  4 ++--
 include/uapi/linux/perf_event.h  | 12 +++++++-----
 kernel/events/ring_buffer.c      | 31 +++++++++++++++++++++++++++----
 4 files changed, 39 insertions(+), 14 deletions(-)
+3 -3  arch/x86/kernel/cpu/perf_event.c

@@ -1276,16 +1276,16 @@
 static int __kprobes
 perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
 {
-        int ret;
         u64 start_clock;
         u64 finish_clock;
+        int ret;
 
         if (!atomic_read(&active_events))
                 return NMI_DONE;
 
-        start_clock = local_clock();
+        start_clock = sched_clock();
         ret = x86_pmu.handle_irq(regs);
-        finish_clock = local_clock();
+        finish_clock = sched_clock();
 
         perf_sample_event_took(finish_clock - start_clock);
 
+2 -2  arch/x86/kernel/nmi.c

@@ -113,10 +113,10 @@
         u64 before, delta, whole_msecs;
         int remainder_ns, decimal_msecs, thishandled;
 
-        before = local_clock();
+        before = sched_clock();
         thishandled = a->handler(type, regs);
         handled += thishandled;
-        delta = local_clock() - before;
+        delta = sched_clock() - before;
         trace_nmi_handler(a->handler, (int)delta, thishandled);
 
         if (delta < nmi_longest_ns)
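
Both hunks make the same change: the duration warning is only as good as the clock used to bracket the handler, so both timestamps now come from sched_clock(). As a rough userspace illustration of the stamp/call/stamp/compare pattern (a sketch only; now_ns(), timed_call() and budget_ns are invented names for this example, not kernel interfaces):

#include <stdio.h>
#include <stdint.h>
#include <time.h>

/* Read one monotonic clock for both timestamps, mirroring the
 * before/delta bracketing in the nmi.c hunk above. */
static uint64_t now_ns(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec;
}

static void timed_call(void (*handler)(void), uint64_t budget_ns)
{
        uint64_t before = now_ns();
        uint64_t delta;

        handler();
        delta = now_ns() - before;

        /* The userspace analogue of the nmi_longest_ns check. */
        if (delta > budget_ns)
                fprintf(stderr, "handler took too long: %llu ns\n",
                        (unsigned long long)delta);
}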
+7 -5  include/uapi/linux/perf_event.h

@@ -456,13 +456,15 @@
         /*
          * Control data for the mmap() data buffer.
          *
-         * User-space reading the @data_head value should issue an rmb(), on
-         * SMP capable platforms, after reading this value -- see
-         * perf_event_wakeup().
+         * User-space reading the @data_head value should issue an smp_rmb(),
+         * after reading this value.
          *
          * When the mapping is PROT_WRITE the @data_tail value should be
-         * written by userspace to reflect the last read data. In this case
-         * the kernel will not over-write unread data.
+         * written by userspace to reflect the last read data, after issueing
+         * an smp_mb() to separate the data read from the ->data_tail store.
+         * In this case the kernel will not over-write unread data.
+         *
+         * See perf_output_put_handle() for the data ordering.
          */
         __u64   data_head;              /* head in the data section */
         __u64   data_tail;              /* user-space written tail */
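
A minimal user-space reader following the contract documented above could look like the sketch below. None of this is from the patch: drain(), consume(), the power-of-two size assumption and the (elided) handling of records that wrap the buffer end are all this example's own, and atomic_thread_fence() merely stands in for the smp_rmb()/smp_mb() the comment asks for (strict C11 would also want the head/tail accesses themselves to be atomic).

#include <stdatomic.h>
#include <stdint.h>
#include <linux/perf_event.h>

/* Drain complete records between data_tail and data_head.
 * page: the mmap()ed control page; data: the data area;
 * size: data area length in bytes, assumed a power of two. */
static void drain(struct perf_event_mmap_page *page, char *data,
                  uint64_t size, void (*consume)(const void *, uint16_t))
{
        uint64_t head = page->data_head;            /* READ ->data_head  */
        atomic_thread_fence(memory_order_acquire);  /* the smp_rmb()     */

        uint64_t tail = page->data_tail;            /* we own data_tail  */
        while (tail != head) {
                struct perf_event_header *hdr =
                        (void *)(data + (tail & (size - 1)));
                /* Records can wrap the end of the buffer; stitching
                 * them back together is elided here. */
                consume(hdr, hdr->size);
                tail += hdr->size;
        }

        atomic_thread_fence(memory_order_seq_cst);  /* the smp_mb()      */
        page->data_tail = tail;                     /* WRITE ->data_tail */
}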
+27 -4  kernel/events/ring_buffer.c

@@ -87,10 +87,31 @@
                 goto out;
 
         /*
-         * Publish the known good head. Rely on the full barrier implied
-         * by atomic_dec_and_test() order the rb->head read and this
-         * write.
+         * Since the mmap() consumer (userspace) can run on a different CPU:
+         *
+         *   kernel                             user
+         *
+         *   READ ->data_tail                   READ ->data_head
+         *   smp_mb()     (A)                   smp_rmb()       (C)
+         *   WRITE $data                        READ $data
+         *   smp_wmb()    (B)                   smp_mb()        (D)
+         *   STORE ->data_head                  WRITE ->data_tail
+         *
+         * Where A pairs with D, and B pairs with C.
+         *
+         * I don't think A needs to be a full barrier because we won't in fact
+         * write data until we see the store from userspace. So we simply don't
+         * issue the data WRITE until we observe it. Be conservative for now.
+         *
+         * OTOH, D needs to be a full barrier since it separates the data READ
+         * from the tail WRITE.
+         *
+         * For B a WMB is sufficient since it separates two WRITEs, and for C
+         * an RMB is sufficient since it separates two READs.
+         *
+         * See perf_output_begin().
          */
+        smp_wmb();
         rb->user_page->data_head = head;
 
         /*
@@ -154,9 +175,11 @@
          * Userspace could choose to issue a mb() before updating the
          * tail pointer. So that all reads will be completed before the
          * write is issued.
+         *
+         * See perf_output_put_handle().
          */
         tail = ACCESS_ONCE(rb->user_page->data_tail);
-        smp_rmb();
+        smp_mb();
         offset = head = local_read(&rb->head);
         head += size;
         if (unlikely(!perf_output_space(rb, tail, offset, head)))
··· 87 87 goto out; 88 88 89 89 /* 90 - * Publish the known good head. Rely on the full barrier implied 91 - * by atomic_dec_and_test() order the rb->head read and this 92 - * write. 90 + * Since the mmap() consumer (userspace) can run on a different CPU: 91 + * 92 + * kernel user 93 + * 94 + * READ ->data_tail READ ->data_head 95 + * smp_mb() (A) smp_rmb() (C) 96 + * WRITE $data READ $data 97 + * smp_wmb() (B) smp_mb() (D) 98 + * STORE ->data_head WRITE ->data_tail 99 + * 100 + * Where A pairs with D, and B pairs with C. 101 + * 102 + * I don't think A needs to be a full barrier because we won't in fact 103 + * write data until we see the store from userspace. So we simply don't 104 + * issue the data WRITE until we observe it. Be conservative for now. 105 + * 106 + * OTOH, D needs to be a full barrier since it separates the data READ 107 + * from the tail WRITE. 108 + * 109 + * For B a WMB is sufficient since it separates two WRITEs, and for C 110 + * an RMB is sufficient since it separates two READs. 111 + * 112 + * See perf_output_begin(). 93 113 */ 114 + smp_wmb(); 94 115 rb->user_page->data_head = head; 95 116 96 117 /* ··· 175 154 * Userspace could choose to issue a mb() before updating the 176 155 * tail pointer. So that all reads will be completed before the 177 156 * write is issued. 157 + * 158 + * See perf_output_put_handle(). 178 159 */ 179 160 tail = ACCESS_ONCE(rb->user_page->data_tail); 180 - smp_rmb(); 161 + smp_mb(); 181 162 offset = head = local_read(&rb->head); 182 163 head += size; 183 164 if (unlikely(!perf_output_space(rb, tail, offset, head)))