Merge branch 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull perf fixes from Ingo Molnar:
"Two fixes:

- Fix 'NMI handler took too long to run' false positives

[ Genuine NMI overhead speedups will come for v3.13, this commit
only fixes a measurement bug ]

- Fix perf ring-buffer missed barrier causing (rare) ring-buffer data
corruption on ppc64"

* 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  perf/x86: Fix NMI measurements
  perf: Fix perf ring buffer memory ordering

Changed files (+39 -14):

 arch/x86/kernel/cpu/perf_event.c |  6 +++---
 arch/x86/kernel/nmi.c            |  4 ++--
 include/uapi/linux/perf_event.h  | 12 +++++++-----
 kernel/events/ring_buffer.c      | 31 +++++++++++++++++++++++++++----
 4 files changed, 39 insertions(+), 14 deletions(-)
+3 -3  arch/x86/kernel/cpu/perf_event.c

@@ -1276,16 +1276,16 @@
 static int __kprobes
 perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
 {
-        int ret;
         u64 start_clock;
         u64 finish_clock;
+        int ret;
 
         if (!atomic_read(&active_events))
                 return NMI_DONE;
 
-        start_clock = local_clock();
+        start_clock = sched_clock();
         ret = x86_pmu.handle_irq(regs);
-        finish_clock = local_clock();
+        finish_clock = sched_clock();
 
         perf_sample_event_took(finish_clock - start_clock);
 
+2 -2  arch/x86/kernel/nmi.c

@@ -113,10 +113,10 @@
         u64 before, delta, whole_msecs;
         int remainder_ns, decimal_msecs, thishandled;
 
-        before = local_clock();
+        before = sched_clock();
         thishandled = a->handler(type, regs);
         handled += thishandled;
-        delta = local_clock() - before;
+        delta = sched_clock() - before;
         trace_nmi_handler(a->handler, (int)delta, thishandled);
 
         if (delta < nmi_longest_ns)
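
Both hunks make the same change: the duration warning is only as good as the clock used to bracket the handler, so both timestamps now come from sched_clock(). As a rough userspace illustration of the stamp/call/stamp/compare pattern (a sketch only; now_ns(), timed_call() and budget_ns are invented names for this example, not kernel interfaces):

#include <stdio.h>
#include <stdint.h>
#include <time.h>

/* Read one monotonic clock for both timestamps, mirroring the
 * before/delta bracketing in the nmi.c hunk above. */
static uint64_t now_ns(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec;
}

static void timed_call(void (*handler)(void), uint64_t budget_ns)
{
        uint64_t before = now_ns();
        uint64_t delta;

        handler();
        delta = now_ns() - before;

        /* The userspace analogue of the nmi_longest_ns check. */
        if (delta > budget_ns)
                fprintf(stderr, "handler took too long: %llu ns\n",
                        (unsigned long long)delta);
}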
+7 -5  include/uapi/linux/perf_event.h

@@ -456,13 +456,15 @@
         /*
          * Control data for the mmap() data buffer.
          *
-         * User-space reading the @data_head value should issue an rmb(), on
-         * SMP capable platforms, after reading this value -- see
-         * perf_event_wakeup().
+         * User-space reading the @data_head value should issue an smp_rmb(),
+         * after reading this value.
          *
          * When the mapping is PROT_WRITE the @data_tail value should be
-         * written by userspace to reflect the last read data. In this case
-         * the kernel will not over-write unread data.
+         * written by userspace to reflect the last read data, after issueing
+         * an smp_mb() to separate the data read from the ->data_tail store.
+         * In this case the kernel will not over-write unread data.
+         *
+         * See perf_output_put_handle() for the data ordering.
          */
         __u64   data_head;              /* head in the data section */
         __u64   data_tail;              /* user-space written tail */
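
A minimal user-space reader following the contract documented above could look like the sketch below. None of this is from the patch: drain(), consume(), the power-of-two size assumption and the (elided) handling of records that wrap the buffer end are all this example's own, and atomic_thread_fence() merely stands in for the smp_rmb()/smp_mb() the comment asks for (strict C11 would also want the head/tail accesses themselves to be atomic).

#include <stdatomic.h>
#include <stdint.h>
#include <linux/perf_event.h>

/* Drain complete records between data_tail and data_head.
 * page: the mmap()ed control page; data: the data area;
 * size: data area length in bytes, assumed a power of two. */
static void drain(struct perf_event_mmap_page *page, char *data,
                  uint64_t size, void (*consume)(const void *, uint16_t))
{
        uint64_t head = page->data_head;            /* READ ->data_head  */
        atomic_thread_fence(memory_order_acquire);  /* the smp_rmb()     */

        uint64_t tail = page->data_tail;            /* we own data_tail  */
        while (tail != head) {
                struct perf_event_header *hdr =
                        (void *)(data + (tail & (size - 1)));
                /* Records can wrap the end of the buffer; stitching
                 * them back together is elided here. */
                consume(hdr, hdr->size);
                tail += hdr->size;
        }

        atomic_thread_fence(memory_order_seq_cst);  /* the smp_mb()      */
        page->data_tail = tail;                     /* WRITE ->data_tail */
}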
+27 -4  kernel/events/ring_buffer.c

@@ -87,10 +87,31 @@
                 goto out;
 
         /*
-         * Publish the known good head. Rely on the full barrier implied
-         * by atomic_dec_and_test() order the rb->head read and this
-         * write.
+         * Since the mmap() consumer (userspace) can run on a different CPU:
+         *
+         *   kernel                             user
+         *
+         *   READ ->data_tail                   READ ->data_head
+         *   smp_mb()     (A)                   smp_rmb()       (C)
+         *   WRITE $data                        READ $data
+         *   smp_wmb()    (B)                   smp_mb()        (D)
+         *   STORE ->data_head                  WRITE ->data_tail
+         *
+         * Where A pairs with D, and B pairs with C.
+         *
+         * I don't think A needs to be a full barrier because we won't in fact
+         * write data until we see the store from userspace. So we simply don't
+         * issue the data WRITE until we observe it. Be conservative for now.
+         *
+         * OTOH, D needs to be a full barrier since it separates the data READ
+         * from the tail WRITE.
+         *
+         * For B a WMB is sufficient since it separates two WRITEs, and for C
+         * an RMB is sufficient since it separates two READs.
+         *
+         * See perf_output_begin().
          */
+        smp_wmb();
         rb->user_page->data_head = head;
 
         /*
@@ -154,9 +175,11 @@
          * Userspace could choose to issue a mb() before updating the
          * tail pointer. So that all reads will be completed before the
          * write is issued.
+         *
+         * See perf_output_put_handle().
          */
         tail = ACCESS_ONCE(rb->user_page->data_tail);
-        smp_rmb();
+        smp_mb();
         offset = head = local_read(&rb->head);
         head += size;
         if (unlikely(!perf_output_space(rb, tail, offset, head)))
··· 87 87 goto out; 88 88 89 89 /* 90 - * Publish the known good head. Rely on the full barrier implied 91 - * by atomic_dec_and_test() order the rb->head read and this 92 - * write. 90 + * Since the mmap() consumer (userspace) can run on a different CPU: 91 + * 92 + * kernel user 93 + * 94 + * READ ->data_tail READ ->data_head 95 + * smp_mb() (A) smp_rmb() (C) 96 + * WRITE $data READ $data 97 + * smp_wmb() (B) smp_mb() (D) 98 + * STORE ->data_head WRITE ->data_tail 99 + * 100 + * Where A pairs with D, and B pairs with C. 101 + * 102 + * I don't think A needs to be a full barrier because we won't in fact 103 + * write data until we see the store from userspace. So we simply don't 104 + * issue the data WRITE until we observe it. Be conservative for now. 105 + * 106 + * OTOH, D needs to be a full barrier since it separates the data READ 107 + * from the tail WRITE. 108 + * 109 + * For B a WMB is sufficient since it separates two WRITEs, and for C 110 + * an RMB is sufficient since it separates two READs. 111 + * 112 + * See perf_output_begin(). 93 113 */ 114 + smp_wmb(); 94 115 rb->user_page->data_head = head; 95 116 96 117 /* ··· 175 154 * Userspace could choose to issue a mb() before updating the 176 155 * tail pointer. So that all reads will be completed before the 177 156 * write is issued. 157 + * 158 + * See perf_output_put_handle(). 178 159 */ 179 160 tail = ACCESS_ONCE(rb->user_page->data_tail); 180 - smp_rmb(); 161 + smp_mb(); 181 162 offset = head = local_read(&rb->head); 182 163 head += size; 183 164 if (unlikely(!perf_output_space(rb, tail, offset, head)))