/**
 * @file buffer_sync.c
 *
 * @remark Copyright 2002 OProfile authors
 * @remark Read the file COPYING
 *
 * @author John Levon <levon@movementarian.org>
 *
 * This is the core of the buffer management. Each
 * CPU buffer is processed and entered into the
 * global event buffer. Such processing is necessary
 * in several circumstances, mentioned below.
 *
 * The processing does the job of converting the
 * transitory EIP value into a persistent dentry/offset
 * value that the profiler can record at its leisure.
 *
 * See fs/dcookies.c for a description of the dentry/offset
 * objects.
 */

#include <linux/mm.h>
#include <linux/workqueue.h>
#include <linux/notifier.h>
#include <linux/dcookies.h>
#include <linux/profile.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/oprofile.h>
#include <linux/sched.h>

#include "oprofile_stats.h"
#include "event_buffer.h"
#include "cpu_buffer.h"
#include "buffer_sync.h"

static LIST_HEAD(dying_tasks);
static LIST_HEAD(dead_tasks);
static cpumask_t marked_cpus = CPU_MASK_NONE;
static DEFINE_SPINLOCK(task_mortuary);
static void process_task_mortuary(void);


/* Take ownership of the task struct and place it on the
 * list for processing. Only after two full buffer syncs
 * does the task eventually get freed, because by then
 * we are sure we will not reference it again.
 * Can be invoked from softirq via RCU callback due to
 * call_rcu() of the task struct, hence the _irqsave.
 */
static int task_free_notify(struct notifier_block * self, unsigned long val, void * data)
{
	unsigned long flags;
	struct task_struct * task = data;
	spin_lock_irqsave(&task_mortuary, flags);
	list_add(&task->tasks, &dying_tasks);
	spin_unlock_irqrestore(&task_mortuary, flags);
	return NOTIFY_OK;
}


/* The task is on its way out. A sync of the buffer means we can catch
 * any remaining samples for this task.
 */
static int task_exit_notify(struct notifier_block * self, unsigned long val, void * data)
{
	/* To avoid latency problems, we only process the current CPU,
	 * hoping that most samples for the task are on this CPU
	 */
	sync_buffer(raw_smp_processor_id());
	return 0;
}


/* The task is about to try a do_munmap(). We peek at what it's going to
 * do, and if it's an executable region, process the samples first, so
 * we don't lose any. This does not have to be exact, it's a QoI issue
 * only.
 */
static int munmap_notify(struct notifier_block * self, unsigned long val, void * data)
{
	unsigned long addr = (unsigned long)data;
	struct mm_struct * mm = current->mm;
	struct vm_area_struct * mpnt;

	down_read(&mm->mmap_sem);

	mpnt = find_vma(mm, addr);
	if (mpnt && mpnt->vm_file && (mpnt->vm_flags & VM_EXEC)) {
		up_read(&mm->mmap_sem);
		/* To avoid latency problems, we only process the current CPU,
		 * hoping that most samples for the task are on this CPU
		 */
		sync_buffer(raw_smp_processor_id());
		return 0;
	}

	up_read(&mm->mmap_sem);
	return 0;
}


/* We need to be told about new modules so we don't attribute to a previously
 * loaded module, or drop the samples on the floor.
 */
static int module_load_notify(struct notifier_block * self, unsigned long val, void * data)
{
#ifdef CONFIG_MODULES
	if (val != MODULE_STATE_COMING)
		return 0;

	/* FIXME: should we process all CPU buffers ? */
	mutex_lock(&buffer_mutex);
	add_event_entry(ESCAPE_CODE);
	add_event_entry(MODULE_LOADED_CODE);
	mutex_unlock(&buffer_mutex);
#endif
	return 0;
}


static struct notifier_block task_free_nb = {
	.notifier_call	= task_free_notify,
};

static struct notifier_block task_exit_nb = {
	.notifier_call	= task_exit_notify,
};

static struct notifier_block munmap_nb = {
	.notifier_call	= munmap_notify,
};

static struct notifier_block module_load_nb = {
	.notifier_call = module_load_notify,
};


static void end_sync(void)
{
	end_cpu_work();
	/* make sure we don't leak task structs */
	process_task_mortuary();
	process_task_mortuary();
}


int sync_start(void)
{
	int err;

	start_cpu_work();

	err = task_handoff_register(&task_free_nb);
	if (err)
		goto out1;
	err = profile_event_register(PROFILE_TASK_EXIT, &task_exit_nb);
	if (err)
		goto out2;
	err = profile_event_register(PROFILE_MUNMAP, &munmap_nb);
	if (err)
		goto out3;
	err = register_module_notifier(&module_load_nb);
	if (err)
		goto out4;

out:
	return err;
out4:
	profile_event_unregister(PROFILE_MUNMAP, &munmap_nb);
out3:
	profile_event_unregister(PROFILE_TASK_EXIT, &task_exit_nb);
out2:
	task_handoff_unregister(&task_free_nb);
out1:
	end_sync();
	goto out;
}


void sync_stop(void)
{
	unregister_module_notifier(&module_load_nb);
	profile_event_unregister(PROFILE_MUNMAP, &munmap_nb);
	profile_event_unregister(PROFILE_TASK_EXIT, &task_exit_nb);
	task_handoff_unregister(&task_free_nb);
	end_sync();
}


/* Optimisation. We can manage without taking the dcookie sem
 * because we cannot reach this code without at least one
 * dcookie user still being registered (namely, the reader
 * of the event buffer). */
static inline unsigned long fast_get_dcookie(struct path *path)
{
	unsigned long cookie;

	if (path->dentry->d_cookie)
		return (unsigned long)path->dentry;
	get_dcookie(path, &cookie);
	return cookie;
}


/* Look up the dcookie for the task's first VM_EXECUTABLE mapping,
 * which corresponds loosely to "application name". This is
 * not strictly necessary but allows oprofile to associate
 * shared-library samples with particular applications
 */
static unsigned long get_exec_dcookie(struct mm_struct * mm)
{
	unsigned long cookie = NO_COOKIE;
	struct vm_area_struct * vma;

	if (!mm)
		goto out;

	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		if (!vma->vm_file)
			continue;
		if (!(vma->vm_flags & VM_EXECUTABLE))
			continue;
		cookie = fast_get_dcookie(&vma->vm_file->f_path);
		break;
	}

out:
	return cookie;
}


/* Convert the EIP value of a sample into a persistent dentry/offset
 * pair that can then be added to the global event buffer. We make
 * sure to do this lookup before a mm->mmap modification happens so
 * we don't lose track.
 */
static unsigned long lookup_dcookie(struct mm_struct * mm, unsigned long addr, off_t * offset)
{
	unsigned long cookie = NO_COOKIE;
	struct vm_area_struct * vma;

	for (vma = find_vma(mm, addr); vma; vma = vma->vm_next) {

		if (addr < vma->vm_start || addr >= vma->vm_end)
			continue;

		if (vma->vm_file) {
			cookie = fast_get_dcookie(&vma->vm_file->f_path);
			*offset = (vma->vm_pgoff << PAGE_SHIFT) + addr -
				vma->vm_start;
		} else {
			/* must be an anonymous map */
			*offset = addr;
		}

		break;
	}

	if (!vma)
		cookie = INVALID_COOKIE;

	return cookie;
}


static unsigned long last_cookie = INVALID_COOKIE;

static void add_cpu_switch(int i)
{
	add_event_entry(ESCAPE_CODE);
	add_event_entry(CPU_SWITCH_CODE);
	add_event_entry(i);
	last_cookie = INVALID_COOKIE;
}

static void add_kernel_ctx_switch(unsigned int in_kernel)
{
	add_event_entry(ESCAPE_CODE);
	if (in_kernel)
		add_event_entry(KERNEL_ENTER_SWITCH_CODE);
	else
		add_event_entry(KERNEL_EXIT_SWITCH_CODE);
}

static void
add_user_ctx_switch(struct task_struct const * task, unsigned long cookie)
{
	add_event_entry(ESCAPE_CODE);
	add_event_entry(CTX_SWITCH_CODE);
	add_event_entry(task->pid);
	add_event_entry(cookie);
	/* Another code for daemon back-compat */
	add_event_entry(ESCAPE_CODE);
	add_event_entry(CTX_TGID_CODE);
	add_event_entry(task->tgid);
}


static void add_cookie_switch(unsigned long cookie)
{
	add_event_entry(ESCAPE_CODE);
	add_event_entry(COOKIE_SWITCH_CODE);
	add_event_entry(cookie);
}


static void add_trace_begin(void)
{
	add_event_entry(ESCAPE_CODE);
	add_event_entry(TRACE_BEGIN_CODE);
}


static void add_sample_entry(unsigned long offset, unsigned long event)
{
	add_event_entry(offset);
	add_event_entry(event);
}


static int add_us_sample(struct mm_struct * mm, struct op_sample * s)
{
	unsigned long cookie;
	off_t offset;

	cookie = lookup_dcookie(mm, s->eip, &offset);

	if (cookie == INVALID_COOKIE) {
		atomic_inc(&oprofile_stats.sample_lost_no_mapping);
		return 0;
	}

	if (cookie != last_cookie) {
		add_cookie_switch(cookie);
		last_cookie = cookie;
	}

	add_sample_entry(offset, s->event);

	return 1;
}


/* Add a sample to the global event buffer. If possible the
 * sample is converted into a persistent dentry/offset pair
 * for later lookup from userspace.
 */
static int
add_sample(struct mm_struct * mm, struct op_sample * s, int in_kernel)
{
	if (in_kernel) {
		add_sample_entry(s->eip, s->event);
		return 1;
	} else if (mm) {
		return add_us_sample(mm, s);
	} else {
		atomic_inc(&oprofile_stats.sample_lost_no_mm);
	}
	return 0;
}


static void release_mm(struct mm_struct * mm)
{
	if (!mm)
		return;
	up_read(&mm->mmap_sem);
	mmput(mm);
}


static struct mm_struct * take_tasks_mm(struct task_struct * task)
{
	struct mm_struct * mm = get_task_mm(task);
	if (mm)
		down_read(&mm->mmap_sem);
	return mm;
}


static inline int is_code(unsigned long val)
{
	return val == ESCAPE_CODE;
}


/* "acquire" as many cpu buffer slots as we can */
static unsigned long get_slots(struct oprofile_cpu_buffer * b)
{
	unsigned long head = b->head_pos;
	unsigned long tail = b->tail_pos;

	/*
	 * Subtle. This resets the persistent last_task
	 * and in_kernel values used for switching notes.
	 * BUT, there is a small window between reading
	 * head_pos, and this call, that means samples
	 * can appear at the new head position, but not
	 * be prefixed with the notes for switching
	 * kernel mode or a task switch. This small hole
	 * can lead to mis-attribution or samples where
	 * we don't know if it's in the kernel or not,
	 * at the start of an event buffer.
	 */
	cpu_buffer_reset(b);

	if (head >= tail)
		return head - tail;

	return head + (b->buffer_size - tail);
}


static void increment_tail(struct oprofile_cpu_buffer * b)
{
	unsigned long new_tail = b->tail_pos + 1;

	rmb();

	if (new_tail < b->buffer_size)
		b->tail_pos = new_tail;
	else
		b->tail_pos = 0;
}


/* Move tasks along towards death. Any tasks on dead_tasks
 * will definitely have no remaining references in any
 * CPU buffers at this point, because we use two lists,
 * and to have reached the list, it must have gone through
 * one full sync already.
 */
static void process_task_mortuary(void)
{
	unsigned long flags;
	LIST_HEAD(local_dead_tasks);
	struct task_struct * task;
	struct task_struct * ttask;

	spin_lock_irqsave(&task_mortuary, flags);

	list_splice_init(&dead_tasks, &local_dead_tasks);
	list_splice_init(&dying_tasks, &dead_tasks);

	spin_unlock_irqrestore(&task_mortuary, flags);

	list_for_each_entry_safe(task, ttask, &local_dead_tasks, tasks) {
		list_del(&task->tasks);
		free_task(task);
	}
}


static void mark_done(int cpu)
{
	int i;

	cpu_set(cpu, marked_cpus);

	for_each_online_cpu(i) {
		if (!cpu_isset(i, marked_cpus))
			return;
	}

	/* All CPUs have been processed at least once,
	 * we can process the mortuary once
	 */
	process_task_mortuary();

	cpus_clear(marked_cpus);
}


/* FIXME: this is not sufficient if we implement syscall barrier backtrace
 * traversal, the code switch to sb_sample_start at first kernel enter/exit
 * switch so we need a fifth state and some special handling in sync_buffer()
 */
typedef enum {
	sb_bt_ignore = -2,
	sb_buffer_start,
	sb_bt_start,
	sb_sample_start,
} sync_buffer_state;

/* Sync one of the CPU's buffers into the global event buffer.
 * Here we need to go through each batch of samples punctuated
 * by context switch notes, taking the task's mmap_sem and doing
 * lookup in task->mm->mmap to convert EIP into dcookie/offset
 * value.
 */
void sync_buffer(int cpu)
{
	struct oprofile_cpu_buffer *cpu_buf = &per_cpu(cpu_buffer, cpu);
	struct mm_struct *mm = NULL;
	struct task_struct * new;
	unsigned long cookie = 0;
	int in_kernel = 1;
	unsigned int i;
	sync_buffer_state state = sb_buffer_start;
	unsigned long available;

	mutex_lock(&buffer_mutex);

	add_cpu_switch(cpu);

	/* Remember, only we can modify tail_pos */

	available = get_slots(cpu_buf);

	for (i = 0; i < available; ++i) {
		struct op_sample * s = &cpu_buf->buffer[cpu_buf->tail_pos];

		if (is_code(s->eip)) {
			if (s->event <= CPU_IS_KERNEL) {
				/* kernel/userspace switch */
				in_kernel = s->event;
				if (state == sb_buffer_start)
					state = sb_sample_start;
				add_kernel_ctx_switch(s->event);
			} else if (s->event == CPU_TRACE_BEGIN) {
				state = sb_bt_start;
				add_trace_begin();
			} else {
				struct mm_struct * oldmm = mm;

				/* userspace context switch */
				new = (struct task_struct *)s->event;

				release_mm(oldmm);
				mm = take_tasks_mm(new);
				if (mm != oldmm)
					cookie = get_exec_dcookie(mm);
				add_user_ctx_switch(new, cookie);
			}
		} else {
			if (state >= sb_bt_start &&
			    !add_sample(mm, s, in_kernel)) {
				if (state == sb_bt_start) {
					state = sb_bt_ignore;
					atomic_inc(&oprofile_stats.bt_lost_no_mapping);
				}
			}
		}

		increment_tail(cpu_buf);
	}
	release_mm(mm);

	mark_done(cpu);

	mutex_unlock(&buffer_mutex);
}
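/*
 * For reference: a sketch of the record stream that sync_buffer() appends to
 * the global event buffer, as reconstructed from the add_* helpers above.
 * This is descriptive only; event_buffer.h and the userspace daemon remain
 * the authoritative definition of the format.
 *
 *   ESCAPE_CODE, CPU_SWITCH_CODE, cpu              -- add_cpu_switch()
 *   ESCAPE_CODE, KERNEL_ENTER_SWITCH_CODE          -- add_kernel_ctx_switch(1)
 *   ESCAPE_CODE, KERNEL_EXIT_SWITCH_CODE           -- add_kernel_ctx_switch(0)
 *   ESCAPE_CODE, CTX_SWITCH_CODE, pid, cookie,
 *   ESCAPE_CODE, CTX_TGID_CODE, tgid               -- add_user_ctx_switch()
 *   ESCAPE_CODE, COOKIE_SWITCH_CODE, cookie        -- add_cookie_switch()
 *   ESCAPE_CODE, TRACE_BEGIN_CODE                  -- add_trace_begin()
 *   offset, event                                  -- add_sample_entry(), one pair per sample
 */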