/**
 * @file cpu_buffer.c
 *
 * @remark Copyright 2002-2009 OProfile authors
 * @remark Read the file COPYING
 *
 * @author John Levon <levon@movementarian.org>
 * @author Barry Kasindorf <barry.kasindorf@amd.com>
 * @author Robert Richter <robert.richter@amd.com>
 *
 * Each CPU has a local buffer that stores PC value/event
 * pairs. We also log context switches when we notice them.
 * Eventually each CPU's buffer is processed into the global
 * event buffer by sync_buffer().
 *
 * We use a local buffer for two reasons: an NMI or similar
 * interrupt cannot synchronise, and high sampling rates
 * would lead to catastrophic global synchronisation if
 * a global buffer was used.
 */

#include <linux/sched.h>
#include <linux/oprofile.h>
#include <linux/errno.h>

#include "event_buffer.h"
#include "cpu_buffer.h"
#include "buffer_sync.h"
#include "oprof.h"

#define OP_BUFFER_FLAGS 0

static struct ring_buffer *op_ring_buffer;
DEFINE_PER_CPU(struct oprofile_cpu_buffer, op_cpu_buffer);

static void wq_sync_buffer(struct work_struct *work);

#define DEFAULT_TIMER_EXPIRE (HZ / 10)
static int work_enabled;

unsigned long oprofile_get_cpu_buffer_size(void)
{
        return oprofile_cpu_buffer_size;
}

void oprofile_cpu_buffer_inc_smpl_lost(void)
{
        struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(op_cpu_buffer);

        cpu_buf->sample_lost_overflow++;
}

void free_cpu_buffers(void)
{
        if (op_ring_buffer)
                ring_buffer_free(op_ring_buffer);
        op_ring_buffer = NULL;
}

#define RB_EVENT_HDR_SIZE 4

int alloc_cpu_buffers(void)
{
        int i;

        unsigned long buffer_size = oprofile_cpu_buffer_size;
        unsigned long byte_size = buffer_size * (sizeof(struct op_sample) +
                                                 RB_EVENT_HDR_SIZE);

        op_ring_buffer = ring_buffer_alloc(byte_size, OP_BUFFER_FLAGS);
        if (!op_ring_buffer)
                goto fail;

        for_each_possible_cpu(i) {
                struct oprofile_cpu_buffer *b = &per_cpu(op_cpu_buffer, i);

                b->last_task = NULL;
                b->last_is_kernel = -1;
                b->tracing = 0;
                b->buffer_size = buffer_size;
                b->sample_received = 0;
                b->sample_lost_overflow = 0;
                b->backtrace_aborted = 0;
                b->sample_invalid_eip = 0;
                b->cpu = i;
                INIT_DELAYED_WORK(&b->work, wq_sync_buffer);
        }
        return 0;

fail:
        free_cpu_buffers();
        return -ENOMEM;
}

void start_cpu_work(void)
{
        int i;

        work_enabled = 1;

        for_each_online_cpu(i) {
                struct oprofile_cpu_buffer *b = &per_cpu(op_cpu_buffer, i);

                /*
                 * Spread the work by 1 jiffy per cpu so they don't all
                 * fire at once.
                 */
                schedule_delayed_work_on(i, &b->work, DEFAULT_TIMER_EXPIRE + i);
        }
}

void end_cpu_work(void)
{
        int i;

        work_enabled = 0;

        for_each_online_cpu(i) {
                struct oprofile_cpu_buffer *b = &per_cpu(op_cpu_buffer, i);

                cancel_delayed_work(&b->work);
        }

        flush_scheduled_work();
}

/*
 * This function prepares the cpu buffer to write a sample.
 *
 * Struct op_entry is used during operations on the ring buffer while
 * struct op_sample contains the data that is stored in the ring
 * buffer. Struct op_entry can be uninitialized. The function reserves
 * a data array of the given size. Use op_cpu_buffer_write_commit()
 * after preparing the sample. On error a NULL pointer is returned,
 * otherwise a pointer to the sample.
 */
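/*
 * Illustrative sketch (editorial addition, not part of the original
 * driver): the typical write sequence built on the helpers below, here
 * for a record carrying one extra data word. The variable names and
 * values are hypothetical.
 *
 *      struct op_entry entry;
 *      struct op_sample *sample;
 *
 *      sample = op_cpu_buffer_write_reserve(&entry, 1);
 *      if (!sample)
 *              return -ENOMEM;
 *      sample->eip = pc;
 *      sample->event = event;
 *      op_cpu_buffer_add_data(&entry, extra);
 *      op_cpu_buffer_write_commit(&entry);
 *
 * op_add_code() and op_add_sample() below follow exactly this pattern.
 */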
struct op_sample
*op_cpu_buffer_write_reserve(struct op_entry *entry, unsigned long size)
{
        entry->event = ring_buffer_lock_reserve
                (op_ring_buffer, sizeof(struct op_sample) +
                 size * sizeof(entry->sample->data[0]));
        if (!entry->event)
                return NULL;
        entry->sample = ring_buffer_event_data(entry->event);
        entry->size = size;
        entry->data = entry->sample->data;

        return entry->sample;
}

int op_cpu_buffer_write_commit(struct op_entry *entry)
{
        return ring_buffer_unlock_commit(op_ring_buffer, entry->event);
}

struct op_sample *op_cpu_buffer_read_entry(struct op_entry *entry, int cpu)
{
        struct ring_buffer_event *e;

        e = ring_buffer_consume(op_ring_buffer, cpu, NULL, NULL);
        if (!e)
                return NULL;

        entry->event = e;
        entry->sample = ring_buffer_event_data(e);
        entry->size = (ring_buffer_event_length(e) - sizeof(struct op_sample))
                / sizeof(entry->sample->data[0]);
        entry->data = entry->sample->data;
        return entry->sample;
}

unsigned long op_cpu_buffer_entries(int cpu)
{
        return ring_buffer_entries_cpu(op_ring_buffer, cpu);
}

static int
op_add_code(struct oprofile_cpu_buffer *cpu_buf, unsigned long backtrace,
            int is_kernel, struct task_struct *task)
{
        struct op_entry entry;
        struct op_sample *sample;
        unsigned long flags;
        int size;

        flags = 0;

        if (backtrace)
                flags |= TRACE_BEGIN;

        /* notice a switch from user->kernel or vice versa */
        is_kernel = !!is_kernel;
        if (cpu_buf->last_is_kernel != is_kernel) {
                cpu_buf->last_is_kernel = is_kernel;
                flags |= KERNEL_CTX_SWITCH;
                if (is_kernel)
                        flags |= IS_KERNEL;
        }

        /* notice a task switch */
        if (cpu_buf->last_task != task) {
                cpu_buf->last_task = task;
                flags |= USER_CTX_SWITCH;
        }

        if (!flags)
                /* nothing to do */
                return 0;

        if (flags & USER_CTX_SWITCH)
                size = 1;
        else
                size = 0;

        sample = op_cpu_buffer_write_reserve(&entry, size);
        if (!sample)
                return -ENOMEM;

        sample->eip = ESCAPE_CODE;
        sample->event = flags;

        if (size)
                op_cpu_buffer_add_data(&entry, (unsigned long)task);

        op_cpu_buffer_write_commit(&entry);

        return 0;
}

static inline int
op_add_sample(struct oprofile_cpu_buffer *cpu_buf,
              unsigned long pc, unsigned long event)
{
        struct op_entry entry;
        struct op_sample *sample;

        sample = op_cpu_buffer_write_reserve(&entry, 0);
        if (!sample)
                return -ENOMEM;

        sample->eip = pc;
        sample->event = event;

        return op_cpu_buffer_write_commit(&entry);
}
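/*
 * Read-side sketch (editorial addition): roughly how a consumer such as
 * sync_buffer() in buffer_sync.c can drain one CPU's buffer with the
 * helpers above; this is an illustration, not a copy of that code.
 *
 *      struct op_entry entry;
 *      struct op_sample *sample;
 *      unsigned long i, available = op_cpu_buffer_entries(cpu);
 *
 *      for (i = 0; i < available; i++) {
 *              sample = op_cpu_buffer_read_entry(&entry, cpu);
 *              if (!sample)
 *                      break;
 *              // process sample->eip, sample->event and the
 *              // entry.size data words in entry.data
 *      }
 */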
/*
 * This must be safe from any context.
 *
 * is_kernel is needed because on some architectures you cannot
 * tell if you are in kernel or user space simply by looking at
 * pc. We tag this in the buffer by generating kernel enter/exit
 * events whenever is_kernel changes.
 */
static int
log_sample(struct oprofile_cpu_buffer *cpu_buf, unsigned long pc,
           unsigned long backtrace, int is_kernel, unsigned long event)
{
        cpu_buf->sample_received++;

        if (pc == ESCAPE_CODE) {
                cpu_buf->sample_invalid_eip++;
                return 0;
        }

        if (op_add_code(cpu_buf, backtrace, is_kernel, current))
                goto fail;

        if (op_add_sample(cpu_buf, pc, event))
                goto fail;

        return 1;

fail:
        cpu_buf->sample_lost_overflow++;
        return 0;
}

static inline void oprofile_begin_trace(struct oprofile_cpu_buffer *cpu_buf)
{
        cpu_buf->tracing = 1;
}

static inline void oprofile_end_trace(struct oprofile_cpu_buffer *cpu_buf)
{
        cpu_buf->tracing = 0;
}

static inline void
__oprofile_add_ext_sample(unsigned long pc, struct pt_regs * const regs,
                          unsigned long event, int is_kernel)
{
        struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(op_cpu_buffer);
        unsigned long backtrace = oprofile_backtrace_depth;

        /*
         * if log_sample() fails we can't backtrace since we lost the
         * source of this event
         */
        if (!log_sample(cpu_buf, pc, backtrace, is_kernel, event))
                /* failed */
                return;

        if (!backtrace)
                return;

        oprofile_begin_trace(cpu_buf);
        oprofile_ops.backtrace(regs, backtrace);
        oprofile_end_trace(cpu_buf);
}

void oprofile_add_ext_sample(unsigned long pc, struct pt_regs * const regs,
                             unsigned long event, int is_kernel)
{
        __oprofile_add_ext_sample(pc, regs, event, is_kernel);
}

void oprofile_add_sample(struct pt_regs * const regs, unsigned long event)
{
        int is_kernel;
        unsigned long pc;

        if (likely(regs)) {
                is_kernel = !user_mode(regs);
                pc = profile_pc(regs);
        } else {
                is_kernel = 0;    /* This value will not be used */
                pc = ESCAPE_CODE; /* as this causes an early return. */
        }

        __oprofile_add_ext_sample(pc, regs, event, is_kernel);
}

/*
 * Add samples with data to the ring buffer.
 *
 * Use oprofile_add_data(&entry, val) to add data and
 * oprofile_write_commit(&entry) to commit the sample.
 */
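/*
 * Illustrative sketch (editorial addition): how a caller might emit one
 * sample with two attached data words through the interface below. The
 * code value and data words are hypothetical.
 *
 *      struct op_entry entry;
 *
 *      oprofile_write_reserve(&entry, regs, pc, sample_code, 2);
 *      oprofile_add_data(&entry, data0);
 *      oprofile_add_data(&entry, data1);
 *      oprofile_write_commit(&entry);
 *
 * A failed reservation is flagged by clearing entry.event, so the
 * add/commit helpers degrade to no-ops (returning 0 or -EINVAL) and the
 * caller needs no extra error checks between reserve and commit.
 */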
void
oprofile_write_reserve(struct op_entry *entry, struct pt_regs * const regs,
                       unsigned long pc, int code, int size)
{
        struct op_sample *sample;
        int is_kernel = !user_mode(regs);
        struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(op_cpu_buffer);

        cpu_buf->sample_received++;

        /* no backtraces for samples with data */
        if (op_add_code(cpu_buf, 0, is_kernel, current))
                goto fail;

        sample = op_cpu_buffer_write_reserve(entry, size + 2);
        if (!sample)
                goto fail;
        sample->eip = ESCAPE_CODE;
        sample->event = 0;              /* no flags */

        op_cpu_buffer_add_data(entry, code);
        op_cpu_buffer_add_data(entry, pc);

        return;

fail:
        entry->event = NULL;
        cpu_buf->sample_lost_overflow++;
}

int oprofile_add_data(struct op_entry *entry, unsigned long val)
{
        if (!entry->event)
                return 0;
        return op_cpu_buffer_add_data(entry, val);
}

int oprofile_add_data64(struct op_entry *entry, u64 val)
{
        if (!entry->event)
                return 0;
        if (op_cpu_buffer_get_size(entry) < 2)
                /*
                 * the function returns 0 to indicate a too small
                 * buffer, even if there is some space left
                 */
                return 0;
        if (!op_cpu_buffer_add_data(entry, (u32)val))
                return 0;
        return op_cpu_buffer_add_data(entry, (u32)(val >> 32));
}

int oprofile_write_commit(struct op_entry *entry)
{
        if (!entry->event)
                return -EINVAL;
        return op_cpu_buffer_write_commit(entry);
}

void oprofile_add_pc(unsigned long pc, int is_kernel, unsigned long event)
{
        struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(op_cpu_buffer);

        log_sample(cpu_buf, pc, 0, is_kernel, event);
}

void oprofile_add_trace(unsigned long pc)
{
        struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(op_cpu_buffer);

        if (!cpu_buf->tracing)
                return;

        /*
         * A broken frame can give an eip with the same value as an
         * escape code; abort the trace if we see one.
         */
        if (pc == ESCAPE_CODE)
                goto fail;

        if (op_add_sample(cpu_buf, pc, 0))
                goto fail;

        return;
fail:
        cpu_buf->tracing = 0;
        cpu_buf->backtrace_aborted++;
        return;
}

/*
 * This serves to avoid cpu buffer overflow, and makes sure
 * the task mortuary progresses.
 *
 * By using schedule_delayed_work_on and then schedule_delayed_work
 * we guarantee this will stay on the correct cpu.
 */
static void wq_sync_buffer(struct work_struct *work)
{
        struct oprofile_cpu_buffer *b =
                container_of(work, struct oprofile_cpu_buffer, work.work);
        if (b->cpu != smp_processor_id()) {
                printk(KERN_DEBUG "WQ on CPU%d, prefer CPU%d\n",
                       smp_processor_id(), b->cpu);

                if (!cpu_online(b->cpu)) {
                        cancel_delayed_work(&b->work);
                        return;
                }
        }
        sync_buffer(b->cpu);

        /* don't re-add the work if we're shutting down */
        if (work_enabled)
                schedule_delayed_work(&b->work, DEFAULT_TIMER_EXPIRE);
}
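/*
 * Editorial note (not part of the original file): the trace path above
 * implies the following contract for an architecture's
 * oprofile_ops.backtrace() callback; walk_one_frame() is a hypothetical
 * frame walker used only for illustration.
 *
 *      static void arch_backtrace(struct pt_regs * const regs,
 *                                 unsigned int depth)
 *      {
 *              unsigned long frame_pc;
 *
 *              while (depth-- && walk_one_frame(regs, &frame_pc))
 *                      oprofile_add_trace(frame_pc);
 *      }
 *
 * oprofile_add_trace() records only while cpu_buf->tracing is set, i.e.
 * between oprofile_begin_trace() and oprofile_end_trace() in
 * __oprofile_add_ext_sample(), and it aborts the trace on ESCAPE_CODE
 * or on a failed write.
 */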