buffer_sync.c at v2.6.13
/**
 * @file buffer_sync.c
 *
 * @remark Copyright 2002 OProfile authors
 * @remark Read the file COPYING
 *
 * @author John Levon <levon@movementarian.org>
 *
 * This is the core of the buffer management. Each
 * CPU buffer is processed and entered into the
 * global event buffer. Such processing is necessary
 * in several circumstances, mentioned below.
 *
 * The processing does the job of converting the
 * transitory EIP value into a persistent dentry/offset
 * value that the profiler can record at its leisure.
 *
 * See fs/dcookies.c for a description of the dentry/offset
 * objects.
 */

#include <linux/mm.h>
#include <linux/workqueue.h>
#include <linux/notifier.h>
#include <linux/dcookies.h>
#include <linux/profile.h>
#include <linux/module.h>
#include <linux/fs.h>

#include "oprofile_stats.h"
#include "event_buffer.h"
#include "cpu_buffer.h"
#include "buffer_sync.h"

static LIST_HEAD(dying_tasks);
static LIST_HEAD(dead_tasks);
static cpumask_t marked_cpus = CPU_MASK_NONE;
static DEFINE_SPINLOCK(task_mortuary);
static void process_task_mortuary(void);


/* Take ownership of the task struct and place it on the
 * list for processing. Only after two full buffer syncs
 * does the task eventually get freed, because by then
 * we are sure we will not reference it again.
 */
static int task_free_notify(struct notifier_block * self, unsigned long val, void * data)
{
        struct task_struct * task = data;
        spin_lock(&task_mortuary);
        list_add(&task->tasks, &dying_tasks);
        spin_unlock(&task_mortuary);
        return NOTIFY_OK;
}


/* The task is on its way out. A sync of the buffer means we can catch
 * any remaining samples for this task.
 */
static int task_exit_notify(struct notifier_block * self, unsigned long val, void * data)
{
        /* To avoid latency problems, we only process the current CPU,
         * hoping that most samples for the task are on this CPU
         */
        sync_buffer(raw_smp_processor_id());
        return 0;
}


/* The task is about to try a do_munmap(). We peek at what it's going to
 * do, and if it's an executable region, process the samples first, so
 * we don't lose any. This does not have to be exact, it's a QoI issue
 * only.
 */
static int munmap_notify(struct notifier_block * self, unsigned long val, void * data)
{
        unsigned long addr = (unsigned long)data;
        struct mm_struct * mm = current->mm;
        struct vm_area_struct * mpnt;

        down_read(&mm->mmap_sem);

        mpnt = find_vma(mm, addr);
        if (mpnt && mpnt->vm_file && (mpnt->vm_flags & VM_EXEC)) {
                up_read(&mm->mmap_sem);
                /* To avoid latency problems, we only process the current CPU,
                 * hoping that most samples for the task are on this CPU
                 */
                sync_buffer(raw_smp_processor_id());
                return 0;
        }

        up_read(&mm->mmap_sem);
        return 0;
}


/* We need to be told about new modules so we don't attribute to a previously
 * loaded module, or drop the samples on the floor.
 */
static int module_load_notify(struct notifier_block * self, unsigned long val, void * data)
{
#ifdef CONFIG_MODULES
        if (val != MODULE_STATE_COMING)
                return 0;

        /* FIXME: should we process all CPU buffers ? */
        down(&buffer_sem);
        add_event_entry(ESCAPE_CODE);
        add_event_entry(MODULE_LOADED_CODE);
        up(&buffer_sem);
#endif
        return 0;
}

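/* The four notifier callbacks above are wired up in sync_start() below,
 * via task_handoff_register(), profile_event_register() (for the task
 * exit and munmap events) and register_module_notifier(), and torn down
 * again in sync_stop().
 */
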
static struct notifier_block task_free_nb = {
        .notifier_call = task_free_notify,
};

static struct notifier_block task_exit_nb = {
        .notifier_call = task_exit_notify,
};

static struct notifier_block munmap_nb = {
        .notifier_call = munmap_notify,
};

static struct notifier_block module_load_nb = {
        .notifier_call = module_load_notify,
};


static void end_sync(void)
{
        end_cpu_work();
        /* make sure we don't leak task structs */
        process_task_mortuary();
        process_task_mortuary();
}


int sync_start(void)
{
        int err;

        start_cpu_work();

        err = task_handoff_register(&task_free_nb);
        if (err)
                goto out1;
        err = profile_event_register(PROFILE_TASK_EXIT, &task_exit_nb);
        if (err)
                goto out2;
        err = profile_event_register(PROFILE_MUNMAP, &munmap_nb);
        if (err)
                goto out3;
        err = register_module_notifier(&module_load_nb);
        if (err)
                goto out4;

out:
        return err;
out4:
        profile_event_unregister(PROFILE_MUNMAP, &munmap_nb);
out3:
        profile_event_unregister(PROFILE_TASK_EXIT, &task_exit_nb);
out2:
        task_handoff_unregister(&task_free_nb);
out1:
        end_sync();
        goto out;
}


void sync_stop(void)
{
        unregister_module_notifier(&module_load_nb);
        profile_event_unregister(PROFILE_MUNMAP, &munmap_nb);
        profile_event_unregister(PROFILE_TASK_EXIT, &task_exit_nb);
        task_handoff_unregister(&task_free_nb);
        end_sync();
}


/* Optimisation. We can manage without taking the dcookie sem
 * because we cannot reach this code without at least one
 * dcookie user still being registered (namely, the reader
 * of the event buffer). */
static inline unsigned long fast_get_dcookie(struct dentry * dentry,
        struct vfsmount * vfsmnt)
{
        unsigned long cookie;

        if (dentry->d_cookie)
                return (unsigned long)dentry;
        get_dcookie(dentry, vfsmnt, &cookie);
        return cookie;
}


/* Look up the dcookie for the task's first VM_EXECUTABLE mapping,
 * which corresponds loosely to "application name". This is
 * not strictly necessary but allows oprofile to associate
 * shared-library samples with particular applications
 */
static unsigned long get_exec_dcookie(struct mm_struct * mm)
{
        unsigned long cookie = NO_COOKIE;
        struct vm_area_struct * vma;

        if (!mm)
                goto out;

        for (vma = mm->mmap; vma; vma = vma->vm_next) {
                if (!vma->vm_file)
                        continue;
                if (!(vma->vm_flags & VM_EXECUTABLE))
                        continue;
                cookie = fast_get_dcookie(vma->vm_file->f_dentry,
                        vma->vm_file->f_vfsmnt);
                break;
        }

out:
        return cookie;
}

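/* A note on cookie values, assuming the fs/dcookies.c behaviour referenced
 * in the file header: fast_get_dcookie() hands back the dentry pointer
 * directly once a cookie has already been allocated for it, and otherwise
 * lets get_dcookie() pin the dentry/vfsmount pair so that the userspace
 * reader of the event buffer can later turn the value back into a
 * pathname. A rough sketch of that userspace side (not part of this file)
 * would be something like:
 *
 *      char buf[PATH_MAX];
 *      syscall(__NR_lookup_dcookie, cookie, buf, sizeof(buf));
 */
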
/* Convert the EIP value of a sample into a persistent dentry/offset
 * pair that can then be added to the global event buffer. We make
 * sure to do this lookup before a mm->mmap modification happens so
 * we don't lose track.
 */
static unsigned long lookup_dcookie(struct mm_struct * mm, unsigned long addr, off_t * offset)
{
        unsigned long cookie = NO_COOKIE;
        struct vm_area_struct * vma;

        for (vma = find_vma(mm, addr); vma; vma = vma->vm_next) {

                if (addr < vma->vm_start || addr >= vma->vm_end)
                        continue;

                if (vma->vm_file) {
                        cookie = fast_get_dcookie(vma->vm_file->f_dentry,
                                vma->vm_file->f_vfsmnt);
                        *offset = (vma->vm_pgoff << PAGE_SHIFT) + addr -
                                vma->vm_start;
                } else {
                        /* must be an anonymous map */
                        *offset = addr;
                }

                break;
        }

        if (!vma)
                cookie = INVALID_COOKIE;

        return cookie;
}


static unsigned long last_cookie = INVALID_COOKIE;

static void add_cpu_switch(int i)
{
        add_event_entry(ESCAPE_CODE);
        add_event_entry(CPU_SWITCH_CODE);
        add_event_entry(i);
        last_cookie = INVALID_COOKIE;
}

static void add_kernel_ctx_switch(unsigned int in_kernel)
{
        add_event_entry(ESCAPE_CODE);
        if (in_kernel)
                add_event_entry(KERNEL_ENTER_SWITCH_CODE);
        else
                add_event_entry(KERNEL_EXIT_SWITCH_CODE);
}

static void
add_user_ctx_switch(struct task_struct const * task, unsigned long cookie)
{
        add_event_entry(ESCAPE_CODE);
        add_event_entry(CTX_SWITCH_CODE);
        add_event_entry(task->pid);
        add_event_entry(cookie);
        /* Another code for daemon back-compat */
        add_event_entry(ESCAPE_CODE);
        add_event_entry(CTX_TGID_CODE);
        add_event_entry(task->tgid);
}


static void add_cookie_switch(unsigned long cookie)
{
        add_event_entry(ESCAPE_CODE);
        add_event_entry(COOKIE_SWITCH_CODE);
        add_event_entry(cookie);
}


static void add_trace_begin(void)
{
        add_event_entry(ESCAPE_CODE);
        add_event_entry(TRACE_BEGIN_CODE);
}


static void add_sample_entry(unsigned long offset, unsigned long event)
{
        add_event_entry(offset);
        add_event_entry(event);
}


static int add_us_sample(struct mm_struct * mm, struct op_sample * s)
{
        unsigned long cookie;
        off_t offset;

        cookie = lookup_dcookie(mm, s->eip, &offset);

        if (cookie == INVALID_COOKIE) {
                atomic_inc(&oprofile_stats.sample_lost_no_mapping);
                return 0;
        }

        if (cookie != last_cookie) {
                add_cookie_switch(cookie);
                last_cookie = cookie;
        }

        add_sample_entry(offset, s->event);

        return 1;
}

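/* Taken together, the add_* helpers above emit a flat stream of values
 * into the event buffer. For one batch of userspace samples the stream
 * looks roughly like:
 *
 *      ESCAPE_CODE, CTX_SWITCH_CODE, pid, cookie,
 *      ESCAPE_CODE, CTX_TGID_CODE, tgid,
 *      ESCAPE_CODE, COOKIE_SWITCH_CODE, cookie,
 *      offset, event, offset, event, ...
 *
 * which is the framing the reader of the event buffer has to parse back out.
 */
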
/* Add a sample to the global event buffer. If possible the
 * sample is converted into a persistent dentry/offset pair
 * for later lookup from userspace.
 */
static int
add_sample(struct mm_struct * mm, struct op_sample * s, int in_kernel)
{
        if (in_kernel) {
                add_sample_entry(s->eip, s->event);
                return 1;
        } else if (mm) {
                return add_us_sample(mm, s);
        } else {
                atomic_inc(&oprofile_stats.sample_lost_no_mm);
        }
        return 0;
}


static void release_mm(struct mm_struct * mm)
{
        if (!mm)
                return;
        up_read(&mm->mmap_sem);
        mmput(mm);
}


static struct mm_struct * take_tasks_mm(struct task_struct * task)
{
        struct mm_struct * mm = get_task_mm(task);
        if (mm)
                down_read(&mm->mmap_sem);
        return mm;
}


static inline int is_code(unsigned long val)
{
        return val == ESCAPE_CODE;
}


/* "acquire" as many cpu buffer slots as we can */
static unsigned long get_slots(struct oprofile_cpu_buffer * b)
{
        unsigned long head = b->head_pos;
        unsigned long tail = b->tail_pos;

        /*
         * Subtle. This resets the persistent last_task
         * and in_kernel values used for switching notes.
         * BUT, there is a small window between reading
         * head_pos, and this call, that means samples
         * can appear at the new head position, but not
         * be prefixed with the notes for switching
         * kernel mode or a task switch. This small hole
         * can lead to mis-attribution or samples where
         * we don't know if it's in the kernel or not,
         * at the start of an event buffer.
         */
        cpu_buffer_reset(b);

        if (head >= tail)
                return head - tail;

        return head + (b->buffer_size - tail);
}


static void increment_tail(struct oprofile_cpu_buffer * b)
{
        unsigned long new_tail = b->tail_pos + 1;

        rmb();

        if (new_tail < b->buffer_size)
                b->tail_pos = new_tail;
        else
                b->tail_pos = 0;
}


/* Move tasks along towards death. Any tasks on dead_tasks
 * will definitely have no remaining references in any
 * CPU buffers at this point, because we use two lists,
 * and to have reached the list, it must have gone through
 * one full sync already.
 */
static void process_task_mortuary(void)
{
        struct list_head * pos;
        struct list_head * pos2;
        struct task_struct * task;

        spin_lock(&task_mortuary);

        list_for_each_safe(pos, pos2, &dead_tasks) {
                task = list_entry(pos, struct task_struct, tasks);
                list_del(&task->tasks);
                free_task(task);
        }

        list_for_each_safe(pos, pos2, &dying_tasks) {
                task = list_entry(pos, struct task_struct, tasks);
                list_del(&task->tasks);
                list_add_tail(&task->tasks, &dead_tasks);
        }

        spin_unlock(&task_mortuary);
}


static void mark_done(int cpu)
{
        int i;

        cpu_set(cpu, marked_cpus);

        for_each_online_cpu(i) {
                if (!cpu_isset(i, marked_cpus))
                        return;
        }

        /* All CPUs have been processed at least once,
         * we can process the mortuary once
         */
        process_task_mortuary();

        cpus_clear(marked_cpus);
}


/* FIXME: this is not sufficient if we implement syscall barrier backtrace
 * traversal: the code switches to sb_sample_start at the first kernel
 * enter/exit switch, so we need a fifth state and some special handling
 * in sync_buffer()
 */
typedef enum {
        sb_bt_ignore = -2,
        sb_buffer_start,
        sb_bt_start,
        sb_sample_start,
} sync_buffer_state;

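/* How sync_buffer() below drives these states: we start in sb_buffer_start
 * and move to sb_sample_start at the first kernel enter/exit switch; a
 * CPU_TRACE_BEGIN code moves us to sb_bt_start, and a backtrace sample
 * that cannot be mapped drops us to sb_bt_ignore (counted in
 * bt_lost_no_mapping).
 */
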
/* Sync one of the CPU's buffers into the global event buffer.
 * Here we need to go through each batch of samples punctuated
 * by context switch notes, taking the task's mmap_sem and doing
 * lookup in task->mm->mmap to convert EIP into dcookie/offset
 * value.
 */
void sync_buffer(int cpu)
{
        struct oprofile_cpu_buffer * cpu_buf = &cpu_buffer[cpu];
        struct mm_struct *mm = NULL;
        struct task_struct * new;
        unsigned long cookie = 0;
        int in_kernel = 1;
        unsigned int i;
        sync_buffer_state state = sb_buffer_start;
        unsigned long available;

        down(&buffer_sem);

        add_cpu_switch(cpu);

        /* Remember, only we can modify tail_pos */

        available = get_slots(cpu_buf);

        for (i = 0; i < available; ++i) {
                struct op_sample * s = &cpu_buf->buffer[cpu_buf->tail_pos];

                if (is_code(s->eip)) {
                        if (s->event <= CPU_IS_KERNEL) {
                                /* kernel/userspace switch */
                                in_kernel = s->event;
                                if (state == sb_buffer_start)
                                        state = sb_sample_start;
                                add_kernel_ctx_switch(s->event);
                        } else if (s->event == CPU_TRACE_BEGIN) {
                                state = sb_bt_start;
                                add_trace_begin();
                        } else {
                                struct mm_struct * oldmm = mm;

                                /* userspace context switch */
                                new = (struct task_struct *)s->event;

                                release_mm(oldmm);
                                mm = take_tasks_mm(new);
                                if (mm != oldmm)
                                        cookie = get_exec_dcookie(mm);
                                add_user_ctx_switch(new, cookie);
                        }
                } else {
                        if (state >= sb_bt_start &&
                            !add_sample(mm, s, in_kernel)) {
                                if (state == sb_bt_start) {
                                        state = sb_bt_ignore;
                                        atomic_inc(&oprofile_stats.bt_lost_no_mapping);
                                }
                        }
                }

                increment_tail(cpu_buf);
        }
        release_mm(mm);

        mark_done(cpu);

        up(&buffer_sem);
}
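
/* sync_buffer() is reached from the task exit and munmap notifiers above,
 * and presumably also from the periodic work started by start_cpu_work()
 * in cpu_buffer.c; in every case buffer_sem serialises writers of the
 * global event buffer, and only this code moves tail_pos forward.
 */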