/**
 * @file cpu_buffer.c
 *
 * @remark Copyright 2002 OProfile authors
 * @remark Read the file COPYING
 *
 * @author John Levon <levon@movementarian.org>
 *
 * Each CPU has a local buffer that stores PC value/event
 * pairs. We also log context switches when we notice them.
 * Eventually each CPU's buffer is processed into the global
 * event buffer by sync_buffer().
 *
 * We use a local buffer for two reasons: an NMI or similar
 * interrupt cannot synchronise, and high sampling rates
 * would lead to catastrophic global synchronisation if
 * a global buffer were used.
 */

#include <linux/sched.h>
#include <linux/oprofile.h>
#include <linux/vmalloc.h>
#include <linux/errno.h>

#include "event_buffer.h"
#include "cpu_buffer.h"
#include "buffer_sync.h"
#include "oprof.h"

struct oprofile_cpu_buffer cpu_buffer[NR_CPUS] __cacheline_aligned;

static void wq_sync_buffer(void *);

#define DEFAULT_TIMER_EXPIRE (HZ / 10)
static int work_enabled;

void free_cpu_buffers(void)
{
	int i;

	for_each_online_cpu(i) {
		vfree(cpu_buffer[i].buffer);
	}
}


int alloc_cpu_buffers(void)
{
	int i;

	unsigned long buffer_size = fs_cpu_buffer_size;

	for_each_online_cpu(i) {
		struct oprofile_cpu_buffer * b = &cpu_buffer[i];

		b->buffer = vmalloc(sizeof(struct op_sample) * buffer_size);
		if (!b->buffer)
			goto fail;

		b->last_task = NULL;
		b->last_is_kernel = -1;
		b->tracing = 0;
		b->buffer_size = buffer_size;
		b->tail_pos = 0;
		b->head_pos = 0;
		b->sample_received = 0;
		b->sample_lost_overflow = 0;
		b->cpu = i;
		INIT_WORK(&b->work, wq_sync_buffer, b);
	}
	return 0;

fail:
	free_cpu_buffers();
	return -ENOMEM;
}


void start_cpu_work(void)
{
	int i;

	work_enabled = 1;

	for_each_online_cpu(i) {
		struct oprofile_cpu_buffer * b = &cpu_buffer[i];

		/*
		 * Spread the work by 1 jiffy per cpu so they don't all
		 * fire at once.
		 */
		schedule_delayed_work_on(i, &b->work, DEFAULT_TIMER_EXPIRE + i);
	}
}


void end_cpu_work(void)
{
	int i;

	work_enabled = 0;

	for_each_online_cpu(i) {
		struct oprofile_cpu_buffer * b = &cpu_buffer[i];

		cancel_delayed_work(&b->work);
	}

	flush_scheduled_work();
}
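
/*
 * A note on the ring buffer manipulated below (summary added for
 * clarity, inferred from this file): the interrupt-side writer adds
 * entries at head_pos, while the reader, sync_buffer() running from
 * the workqueue, consumes them from tail_pos.  One slot is always
 * kept unused so that head_pos == tail_pos unambiguously means
 * "empty".  Worked example: with buffer_size == 8, head_pos == 5 and
 * tail_pos == 2, nr_available_slots() returns 2 + (8 - 5) - 1 == 4.
 */
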
/* Resets the cpu buffer to a sane state. */
void cpu_buffer_reset(struct oprofile_cpu_buffer * cpu_buf)
{
	/* reset these to invalid values; the next sample
	 * collected will populate the buffer with proper
	 * values to initialize the buffer
	 */
	cpu_buf->last_is_kernel = -1;
	cpu_buf->last_task = NULL;
}


/* compute number of available slots in cpu_buffer queue */
static unsigned long nr_available_slots(struct oprofile_cpu_buffer const * b)
{
	unsigned long head = b->head_pos;
	unsigned long tail = b->tail_pos;

	/* one slot is always left free so a full buffer can be
	 * told apart from an empty one */
	if (tail > head)
		return (tail - head) - 1;

	return tail + (b->buffer_size - head) - 1;
}


static void increment_head(struct oprofile_cpu_buffer * b)
{
	unsigned long new_head = b->head_pos + 1;

	/* Ensure anything written to the slot before we
	 * increment is visible */
	wmb();

	if (new_head < b->buffer_size)
		b->head_pos = new_head;
	else
		b->head_pos = 0;
}


static inline void
add_sample(struct oprofile_cpu_buffer * cpu_buf,
           unsigned long pc, unsigned long event)
{
	struct op_sample * entry = &cpu_buf->buffer[cpu_buf->head_pos];
	entry->eip = pc;
	entry->event = event;
	increment_head(cpu_buf);
}


static inline void
add_code(struct oprofile_cpu_buffer * buffer, unsigned long value)
{
	add_sample(buffer, ESCAPE_CODE, value);
}


/* This must be safe from any context. It's safe writing here
 * because of the head/tail separation of the writer and reader
 * of the CPU buffer.
 *
 * is_kernel is needed because on some architectures you cannot
 * tell if you are in kernel or user space simply by looking at
 * pc. We tag this in the buffer by generating kernel enter/exit
 * events whenever is_kernel changes.
 */
static int log_sample(struct oprofile_cpu_buffer * cpu_buf, unsigned long pc,
		      int is_kernel, unsigned long event)
{
	struct task_struct * task;

	cpu_buf->sample_received++;

	/* we may need up to three slots: a kernel/user transition code,
	 * a task switch code, and the sample itself */
	if (nr_available_slots(cpu_buf) < 3) {
		cpu_buf->sample_lost_overflow++;
		return 0;
	}

	is_kernel = !!is_kernel;

	task = current;

	/* notice a switch from user->kernel or vice versa */
	if (cpu_buf->last_is_kernel != is_kernel) {
		cpu_buf->last_is_kernel = is_kernel;
		add_code(cpu_buf, is_kernel);
	}

	/* notice a task switch */
	if (cpu_buf->last_task != task) {
		cpu_buf->last_task = task;
		add_code(cpu_buf, (unsigned long)task);
	}

	add_sample(cpu_buf, pc, event);
	return 1;
}


static int oprofile_begin_trace(struct oprofile_cpu_buffer * cpu_buf)
{
	/* one slot for CPU_TRACE_BEGIN plus up to three for the
	 * sample that follows */
	if (nr_available_slots(cpu_buf) < 4) {
		cpu_buf->sample_lost_overflow++;
		return 0;
	}

	add_code(cpu_buf, CPU_TRACE_BEGIN);
	cpu_buf->tracing = 1;
	return 1;
}


static void oprofile_end_trace(struct oprofile_cpu_buffer * cpu_buf)
{
	cpu_buf->tracing = 0;
}


void oprofile_add_sample(struct pt_regs * const regs, unsigned long event)
{
	struct oprofile_cpu_buffer * cpu_buf = &cpu_buffer[smp_processor_id()];
	unsigned long pc = profile_pc(regs);
	int is_kernel = !user_mode(regs);

	if (!backtrace_depth) {
		log_sample(cpu_buf, pc, is_kernel, event);
		return;
	}

	if (!oprofile_begin_trace(cpu_buf))
		return;

	/* if log_sample() fails we can't backtrace since we lost the source
	 * of this event */
	if (log_sample(cpu_buf, pc, is_kernel, event))
		oprofile_ops.backtrace(regs, backtrace_depth);
	oprofile_end_trace(cpu_buf);
}
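
/*
 * Illustration (added for clarity; the record layout follows from the
 * helpers above): with backtrace_depth set, a single interrupt that
 * also crosses a task and kernel/user boundary emits a sequence like
 * the following, which sync_buffer() later decodes:
 *
 *	{ eip = ESCAPE_CODE, event = CPU_TRACE_BEGIN }      oprofile_begin_trace()
 *	{ eip = ESCAPE_CODE, event = is_kernel }            kernel/user transition
 *	{ eip = ESCAPE_CODE, event = (unsigned long)task }  task switch
 *	{ eip = pc,          event = event }                the sample itself
 *	{ eip = frame pc,    event = 0 } ...                oprofile_add_trace()
 */
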
void oprofile_add_pc(unsigned long pc, int is_kernel, unsigned long event)
{
	struct oprofile_cpu_buffer * cpu_buf = &cpu_buffer[smp_processor_id()];
	log_sample(cpu_buf, pc, is_kernel, event);
}


void oprofile_add_trace(unsigned long pc)
{
	struct oprofile_cpu_buffer * cpu_buf = &cpu_buffer[smp_processor_id()];

	if (!cpu_buf->tracing)
		return;

	if (nr_available_slots(cpu_buf) < 1) {
		cpu_buf->tracing = 0;
		cpu_buf->sample_lost_overflow++;
		return;
	}

	/* a broken frame can give an eip with the same value as an
	 * escape code; abort the trace if we get it */
	if (pc == ESCAPE_CODE) {
		cpu_buf->tracing = 0;
		cpu_buf->backtrace_aborted++;
		return;
	}

	add_sample(cpu_buf, pc, 0);
}


/*
 * This serves to avoid cpu buffer overflow, and makes sure
 * the task mortuary progresses.
 *
 * By using schedule_delayed_work_on and then schedule_delayed_work
 * we guarantee this will stay on the correct cpu.
 */
static void wq_sync_buffer(void * data)
{
	struct oprofile_cpu_buffer * b = data;
	if (b->cpu != smp_processor_id()) {
		printk("WQ on CPU%d, prefer CPU%d\n",
		       smp_processor_id(), b->cpu);
	}
	sync_buffer(b->cpu);

	/* don't re-add the work if we're shutting down */
	if (work_enabled)
		schedule_delayed_work(&b->work, DEFAULT_TIMER_EXPIRE);
}
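
/*
 * Usage sketch (illustrative only; example_counter_overflow and
 * "counter" are hypothetical names, not part of this file): an
 * architecture's performance-counter overflow handler feeds this
 * buffer by passing the interrupted registers and the number of the
 * counter that overflowed to oprofile_add_sample():
 *
 *	static void example_counter_overflow(struct pt_regs * const regs,
 *					     unsigned long counter)
 *	{
 *		oprofile_add_sample(regs, counter);
 *	}
 */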