/**
 * @file buffer_sync.c
 *
 * @remark Copyright 2002 OProfile authors
 * @remark Read the file COPYING
 *
 * @author John Levon <levon@movementarian.org>
 *
 * This is the core of the buffer management. Each
 * CPU buffer is processed and entered into the
 * global event buffer. Such processing is necessary
 * in several circumstances, mentioned below.
 *
 * The processing does the job of converting the
 * transitory EIP value into a persistent dentry/offset
 * value that the profiler can record at its leisure.
 *
 * See fs/dcookies.c for a description of the dentry/offset
 * objects.
 */

#include <linux/mm.h>
#include <linux/workqueue.h>
#include <linux/notifier.h>
#include <linux/dcookies.h>
#include <linux/profile.h>
#include <linux/module.h>
#include <linux/fs.h>

#include "oprofile_stats.h"
#include "event_buffer.h"
#include "cpu_buffer.h"
#include "buffer_sync.h"

static LIST_HEAD(dying_tasks);
static LIST_HEAD(dead_tasks);
static cpumask_t marked_cpus = CPU_MASK_NONE;
static DEFINE_SPINLOCK(task_mortuary);
static void process_task_mortuary(void);


/* Take ownership of the task struct and place it on the
 * list for processing. Only after two full buffer syncs
 * does the task eventually get freed, because by then
 * we are sure we will not reference it again.
 */
static int task_free_notify(struct notifier_block * self, unsigned long val, void * data)
{
	struct task_struct * task = data;
	spin_lock(&task_mortuary);
	list_add(&task->tasks, &dying_tasks);
	spin_unlock(&task_mortuary);
	return NOTIFY_OK;
}


/* The task is on its way out. A sync of the buffer means we can catch
 * any remaining samples for this task.
 */
static int task_exit_notify(struct notifier_block * self, unsigned long val, void * data)
{
	/* To avoid latency problems, we only process the current CPU,
	 * hoping that most samples for the task are on this CPU
	 */
	sync_buffer(_smp_processor_id());
	return 0;
}


/* The task is about to try a do_munmap(). We peek at what it's going to
 * do, and if it's an executable region, process the samples first, so
 * we don't lose any. This does not have to be exact, it's a QoI issue
 * only.
 */
static int munmap_notify(struct notifier_block * self, unsigned long val, void * data)
{
	unsigned long addr = (unsigned long)data;
	struct mm_struct * mm = current->mm;
	struct vm_area_struct * mpnt;

	down_read(&mm->mmap_sem);

	mpnt = find_vma(mm, addr);
	if (mpnt && mpnt->vm_file && (mpnt->vm_flags & VM_EXEC)) {
		up_read(&mm->mmap_sem);
		/* To avoid latency problems, we only process the current CPU,
		 * hoping that most samples for the task are on this CPU
		 */
		sync_buffer(_smp_processor_id());
		return 0;
	}

	up_read(&mm->mmap_sem);
	return 0;
}


/* We need to be told about new modules so we don't attribute to a previously
 * loaded module, or drop the samples on the floor.
 */
static int module_load_notify(struct notifier_block * self, unsigned long val, void * data)
{
#ifdef CONFIG_MODULES
	if (val != MODULE_STATE_COMING)
		return 0;

	/* FIXME: should we process all CPU buffers ? */
	down(&buffer_sem);
	add_event_entry(ESCAPE_CODE);
	add_event_entry(MODULE_LOADED_CODE);
	up(&buffer_sem);
#endif
	return 0;
}
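

/* Notifier blocks for the hooks above; they are registered in
 * sync_start() and unregistered again in sync_stop() below.
 */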
static struct notifier_block task_free_nb = {
	.notifier_call = task_free_notify,
};

static struct notifier_block task_exit_nb = {
	.notifier_call = task_exit_notify,
};

static struct notifier_block munmap_nb = {
	.notifier_call = munmap_notify,
};

static struct notifier_block module_load_nb = {
	.notifier_call = module_load_notify,
};


static void end_sync(void)
{
	end_cpu_work();
	/* make sure we don't leak task structs */
	process_task_mortuary();
	process_task_mortuary();
}


int sync_start(void)
{
	int err;

	start_cpu_work();

	err = task_handoff_register(&task_free_nb);
	if (err)
		goto out1;
	err = profile_event_register(PROFILE_TASK_EXIT, &task_exit_nb);
	if (err)
		goto out2;
	err = profile_event_register(PROFILE_MUNMAP, &munmap_nb);
	if (err)
		goto out3;
	err = register_module_notifier(&module_load_nb);
	if (err)
		goto out4;

out:
	return err;
out4:
	profile_event_unregister(PROFILE_MUNMAP, &munmap_nb);
out3:
	profile_event_unregister(PROFILE_TASK_EXIT, &task_exit_nb);
out2:
	task_handoff_unregister(&task_free_nb);
out1:
	end_sync();
	goto out;
}


void sync_stop(void)
{
	unregister_module_notifier(&module_load_nb);
	profile_event_unregister(PROFILE_MUNMAP, &munmap_nb);
	profile_event_unregister(PROFILE_TASK_EXIT, &task_exit_nb);
	task_handoff_unregister(&task_free_nb);
	end_sync();
}


/* Optimisation. We can manage without taking the dcookie sem
 * because we cannot reach this code without at least one
 * dcookie user still being registered (namely, the reader
 * of the event buffer). */
static inline unsigned long fast_get_dcookie(struct dentry * dentry,
	struct vfsmount * vfsmnt)
{
	unsigned long cookie;

	if (dentry->d_cookie)
		return (unsigned long)dentry;
	get_dcookie(dentry, vfsmnt, &cookie);
	return cookie;
}


/* Look up the dcookie for the task's first VM_EXECUTABLE mapping,
 * which corresponds loosely to "application name". This is
 * not strictly necessary but allows oprofile to associate
 * shared-library samples with particular applications
 */
static unsigned long get_exec_dcookie(struct mm_struct * mm)
{
	unsigned long cookie = 0;
	struct vm_area_struct * vma;

	if (!mm)
		goto out;

	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		if (!vma->vm_file)
			continue;
		if (!(vma->vm_flags & VM_EXECUTABLE))
			continue;
		cookie = fast_get_dcookie(vma->vm_file->f_dentry,
			vma->vm_file->f_vfsmnt);
		break;
	}

out:
	return cookie;
}


/* Convert the EIP value of a sample into a persistent dentry/offset
 * pair that can then be added to the global event buffer. We make
 * sure to do this lookup before a mm->mmap modification happens so
 * we don't lose track.
 */
static unsigned long lookup_dcookie(struct mm_struct * mm, unsigned long addr, off_t * offset)
{
	unsigned long cookie = 0;
	struct vm_area_struct * vma;

	for (vma = find_vma(mm, addr); vma; vma = vma->vm_next) {

		if (!vma->vm_file)
			continue;

		if (addr < vma->vm_start || addr >= vma->vm_end)
			continue;

		cookie = fast_get_dcookie(vma->vm_file->f_dentry,
			vma->vm_file->f_vfsmnt);
		*offset = (vma->vm_pgoff << PAGE_SHIFT) + addr - vma->vm_start;
		break;
	}

	return cookie;
}
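

/* The helpers below emit the record framing used in the event buffer:
 * an ESCAPE_CODE entry introduces a record made of one of the *_CODE
 * words plus its arguments, if any (CPU number, pid, cookie, ...);
 * anything not introduced by ESCAPE_CODE is a plain (offset, event)
 * sample pair as written by add_sample_entry().
 */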
static unsigned long last_cookie = ~0UL;

static void add_cpu_switch(int i)
{
	add_event_entry(ESCAPE_CODE);
	add_event_entry(CPU_SWITCH_CODE);
	add_event_entry(i);
	last_cookie = ~0UL;
}

static void add_kernel_ctx_switch(unsigned int in_kernel)
{
	add_event_entry(ESCAPE_CODE);
	if (in_kernel)
		add_event_entry(KERNEL_ENTER_SWITCH_CODE);
	else
		add_event_entry(KERNEL_EXIT_SWITCH_CODE);
}

static void
add_user_ctx_switch(struct task_struct const * task, unsigned long cookie)
{
	add_event_entry(ESCAPE_CODE);
	add_event_entry(CTX_SWITCH_CODE);
	add_event_entry(task->pid);
	add_event_entry(cookie);
	/* Another code for daemon back-compat */
	add_event_entry(ESCAPE_CODE);
	add_event_entry(CTX_TGID_CODE);
	add_event_entry(task->tgid);
}


static void add_cookie_switch(unsigned long cookie)
{
	add_event_entry(ESCAPE_CODE);
	add_event_entry(COOKIE_SWITCH_CODE);
	add_event_entry(cookie);
}


static void add_trace_begin(void)
{
	add_event_entry(ESCAPE_CODE);
	add_event_entry(TRACE_BEGIN_CODE);
}


static void add_sample_entry(unsigned long offset, unsigned long event)
{
	add_event_entry(offset);
	add_event_entry(event);
}


static int add_us_sample(struct mm_struct * mm, struct op_sample * s)
{
	unsigned long cookie;
	off_t offset;

	cookie = lookup_dcookie(mm, s->eip, &offset);

	if (!cookie) {
		atomic_inc(&oprofile_stats.sample_lost_no_mapping);
		return 0;
	}

	if (cookie != last_cookie) {
		add_cookie_switch(cookie);
		last_cookie = cookie;
	}

	add_sample_entry(offset, s->event);

	return 1;
}


/* Add a sample to the global event buffer. If possible the
 * sample is converted into a persistent dentry/offset pair
 * for later lookup from userspace.
 */
static int
add_sample(struct mm_struct * mm, struct op_sample * s, int in_kernel)
{
	if (in_kernel) {
		add_sample_entry(s->eip, s->event);
		return 1;
	} else if (mm) {
		return add_us_sample(mm, s);
	} else {
		atomic_inc(&oprofile_stats.sample_lost_no_mm);
	}
	return 0;
}


static void release_mm(struct mm_struct * mm)
{
	if (!mm)
		return;
	up_read(&mm->mmap_sem);
	mmput(mm);
}


static struct mm_struct * take_tasks_mm(struct task_struct * task)
{
	struct mm_struct * mm = get_task_mm(task);
	if (mm)
		down_read(&mm->mmap_sem);
	return mm;
}


static inline int is_code(unsigned long val)
{
	return val == ESCAPE_CODE;
}


/* "acquire" as many cpu buffer slots as we can */
static unsigned long get_slots(struct oprofile_cpu_buffer * b)
{
	unsigned long head = b->head_pos;
	unsigned long tail = b->tail_pos;

	/*
	 * Subtle. This resets the persistent last_task
	 * and in_kernel values used for switching notes.
	 * BUT, there is a small window between reading
	 * head_pos, and this call, that means samples
	 * can appear at the new head position, but not
	 * be prefixed with the notes for switching
	 * kernel mode or a task switch. This small hole
	 * can lead to mis-attribution or samples where
	 * we don't know if it's in the kernel or not,
	 * at the start of an event buffer.
	 */
	cpu_buffer_reset(b);

	if (head >= tail)
		return head - tail;

	return head + (b->buffer_size - tail);
}


static void increment_tail(struct oprofile_cpu_buffer * b)
{
	unsigned long new_tail = b->tail_pos + 1;

	rmb();

	if (new_tail < b->buffer_size)
		b->tail_pos = new_tail;
	else
		b->tail_pos = 0;
}


/* Move tasks along towards death. Any tasks on dead_tasks
 * will definitely have no remaining references in any
 * CPU buffers at this point, because we use two lists,
 * and to have reached the list, it must have gone through
 * one full sync already.
 */
static void process_task_mortuary(void)
{
	struct list_head * pos;
	struct list_head * pos2;
	struct task_struct * task;

	spin_lock(&task_mortuary);

	list_for_each_safe(pos, pos2, &dead_tasks) {
		task = list_entry(pos, struct task_struct, tasks);
		list_del(&task->tasks);
		free_task(task);
	}

	list_for_each_safe(pos, pos2, &dying_tasks) {
		task = list_entry(pos, struct task_struct, tasks);
		list_del(&task->tasks);
		list_add_tail(&task->tasks, &dead_tasks);
	}

	spin_unlock(&task_mortuary);
}


static void mark_done(int cpu)
{
	int i;

	cpu_set(cpu, marked_cpus);

	for_each_online_cpu(i) {
		if (!cpu_isset(i, marked_cpus))
			return;
	}

	/* All CPUs have been processed at least once,
	 * we can process the mortuary once
	 */
	process_task_mortuary();

	cpus_clear(marked_cpus);
}


/* FIXME: this is not sufficient if we implement syscall barrier backtrace
 * traversal, the code switch to sb_sample_start at first kernel enter/exit
 * switch so we need a fifth state and some special handling in sync_buffer()
 */
typedef enum {
	sb_bt_ignore = -2,
	sb_buffer_start,
	sb_bt_start,
	sb_sample_start,
} sync_buffer_state;

/* Sync one of the CPU's buffers into the global event buffer.
 * Here we need to go through each batch of samples punctuated
 * by context switch notes, taking the task's mmap_sem and doing
 * lookup in task->mm->mmap to convert EIP into dcookie/offset
 * value.
 */
void sync_buffer(int cpu)
{
	struct oprofile_cpu_buffer * cpu_buf = &cpu_buffer[cpu];
	struct mm_struct *mm = NULL;
	struct task_struct * new;
	unsigned long cookie = 0;
	int in_kernel = 1;
	unsigned int i;
	sync_buffer_state state = sb_buffer_start;
	unsigned long available;

	down(&buffer_sem);

	add_cpu_switch(cpu);

	/* Remember, only we can modify tail_pos */

	available = get_slots(cpu_buf);

	for (i = 0; i < available; ++i) {
		struct op_sample * s = &cpu_buf->buffer[cpu_buf->tail_pos];

		if (is_code(s->eip)) {
			if (s->event <= CPU_IS_KERNEL) {
				/* kernel/userspace switch */
				in_kernel = s->event;
				if (state == sb_buffer_start)
					state = sb_sample_start;
				add_kernel_ctx_switch(s->event);
			} else if (s->event == CPU_TRACE_BEGIN) {
				state = sb_bt_start;
				add_trace_begin();
			} else {
				struct mm_struct * oldmm = mm;

				/* userspace context switch */
				new = (struct task_struct *)s->event;

				release_mm(oldmm);
				mm = take_tasks_mm(new);
				if (mm != oldmm)
					cookie = get_exec_dcookie(mm);
				add_user_ctx_switch(new, cookie);
			}
		} else {
			if (state >= sb_bt_start &&
			    !add_sample(mm, s, in_kernel)) {
				if (state == sb_bt_start) {
					state = sb_bt_ignore;
					atomic_inc(&oprofile_stats.bt_lost_no_mapping);
				}
			}
		}

		increment_tail(cpu_buf);
	}
	release_mm(mm);

	mark_done(cpu);

	up(&buffer_sem);
}
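
/* Note: besides the notifier callbacks above, sync_buffer() is also
 * expected to be driven periodically by the per-CPU delayed work set
 * up via start_cpu_work()/end_cpu_work() (see cpu_buffer.c), so that
 * samples from long-running tasks still reach the event buffer.
 */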