/**
 * @file buffer_sync.c
 *
 * @remark Copyright 2002 OProfile authors
 * @remark Read the file COPYING
 *
 * @author John Levon <levon@movementarian.org>
 *
 * This is the core of the buffer management. Each
 * CPU buffer is processed and entered into the
 * global event buffer. Such processing is necessary
 * in several circumstances, mentioned below.
 *
 * The processing does the job of converting the
 * transitory EIP value into a persistent dentry/offset
 * value that the profiler can record at its leisure.
 *
 * See fs/dcookies.c for a description of the dentry/offset
 * objects.
 */

#include <linux/mm.h>
#include <linux/workqueue.h>
#include <linux/notifier.h>
#include <linux/dcookies.h>
#include <linux/profile.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/oprofile.h>
#include <linux/sched.h>

#include "oprofile_stats.h"
#include "event_buffer.h"
#include "cpu_buffer.h"
#include "buffer_sync.h"

static LIST_HEAD(dying_tasks);
static LIST_HEAD(dead_tasks);
static cpumask_t marked_cpus = CPU_MASK_NONE;
static DEFINE_SPINLOCK(task_mortuary);
static void process_task_mortuary(void);


/* Take ownership of the task struct and place it on the
 * list for processing. Only after two full buffer syncs
 * does the task eventually get freed, because by then
 * we are sure we will not reference it again.
 * Can be invoked from softirq via RCU callback due to
 * call_rcu() of the task struct, hence the _irqsave.
 */
static int task_free_notify(struct notifier_block * self, unsigned long val, void * data)
{
	unsigned long flags;
	struct task_struct * task = data;
	spin_lock_irqsave(&task_mortuary, flags);
	list_add(&task->tasks, &dying_tasks);
	spin_unlock_irqrestore(&task_mortuary, flags);
	return NOTIFY_OK;
}


/* The task is on its way out. A sync of the buffer means we can catch
 * any remaining samples for this task.
 */
static int task_exit_notify(struct notifier_block * self, unsigned long val, void * data)
{
	/* To avoid latency problems, we only process the current CPU,
	 * hoping that most samples for the task are on this CPU
	 */
	sync_buffer(raw_smp_processor_id());
	return 0;
}


/* The task is about to try a do_munmap(). We peek at what it's going to
 * do, and if it's an executable region, process the samples first, so
 * we don't lose any. This does not have to be exact; it's a QoI issue
 * only.
 */
static int munmap_notify(struct notifier_block * self, unsigned long val, void * data)
{
	unsigned long addr = (unsigned long)data;
	struct mm_struct * mm = current->mm;
	struct vm_area_struct * mpnt;

	down_read(&mm->mmap_sem);

	mpnt = find_vma(mm, addr);
	if (mpnt && mpnt->vm_file && (mpnt->vm_flags & VM_EXEC)) {
		up_read(&mm->mmap_sem);
		/* To avoid latency problems, we only process the current CPU,
		 * hoping that most samples for the task are on this CPU
		 */
		sync_buffer(raw_smp_processor_id());
		return 0;
	}

	up_read(&mm->mmap_sem);
	return 0;
}


/* We need to be told about new modules so we don't attribute to a previously
 * loaded module, or drop the samples on the floor.
 */
static int module_load_notify(struct notifier_block * self, unsigned long val, void * data)
{
#ifdef CONFIG_MODULES
	if (val != MODULE_STATE_COMING)
		return 0;

	/* FIXME: should we process all CPU buffers? */
	mutex_lock(&buffer_mutex);
	add_event_entry(ESCAPE_CODE);
	add_event_entry(MODULE_LOADED_CODE);
	mutex_unlock(&buffer_mutex);
#endif
	return 0;
}


static struct notifier_block task_free_nb = {
	.notifier_call = task_free_notify,
};

static struct notifier_block task_exit_nb = {
	.notifier_call = task_exit_notify,
};

static struct notifier_block munmap_nb = {
	.notifier_call = munmap_notify,
};

static struct notifier_block module_load_nb = {
	.notifier_call = module_load_notify,
};


static void end_sync(void)
{
	end_cpu_work();
	/* make sure we don't leak task structs */
	process_task_mortuary();
	process_task_mortuary();
}


int sync_start(void)
{
	int err;

	start_cpu_work();

	err = task_handoff_register(&task_free_nb);
	if (err)
		goto out1;
	err = profile_event_register(PROFILE_TASK_EXIT, &task_exit_nb);
	if (err)
		goto out2;
	err = profile_event_register(PROFILE_MUNMAP, &munmap_nb);
	if (err)
		goto out3;
	err = register_module_notifier(&module_load_nb);
	if (err)
		goto out4;

out:
	return err;
out4:
	profile_event_unregister(PROFILE_MUNMAP, &munmap_nb);
out3:
	profile_event_unregister(PROFILE_TASK_EXIT, &task_exit_nb);
out2:
	task_handoff_unregister(&task_free_nb);
out1:
	end_sync();
	goto out;
}


void sync_stop(void)
{
	unregister_module_notifier(&module_load_nb);
	profile_event_unregister(PROFILE_MUNMAP, &munmap_nb);
	profile_event_unregister(PROFILE_TASK_EXIT, &task_exit_nb);
	task_handoff_unregister(&task_free_nb);
	end_sync();
}


/* Optimisation. We can manage without taking the dcookie sem
 * because we cannot reach this code without at least one
 * dcookie user still being registered (namely, the reader
 * of the event buffer). */
static inline unsigned long fast_get_dcookie(struct dentry * dentry,
	struct vfsmount * vfsmnt)
{
	unsigned long cookie;

	if (dentry->d_cookie)
		return (unsigned long)dentry;
	get_dcookie(dentry, vfsmnt, &cookie);
	return cookie;
}


/* Look up the dcookie for the task's first VM_EXECUTABLE mapping,
 * which corresponds loosely to "application name". This is
 * not strictly necessary but allows oprofile to associate
 * shared-library samples with particular applications
 */
static unsigned long get_exec_dcookie(struct mm_struct * mm)
{
	unsigned long cookie = NO_COOKIE;
	struct vm_area_struct * vma;

	if (!mm)
		goto out;

	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		if (!vma->vm_file)
			continue;
		if (!(vma->vm_flags & VM_EXECUTABLE))
			continue;
		cookie = fast_get_dcookie(vma->vm_file->f_path.dentry,
			vma->vm_file->f_path.mnt);
		break;
	}

out:
	return cookie;
}
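
/* For example: a sample whose EIP falls inside a file-backed vma is
 * turned by lookup_dcookie() below into the pair
 * (dcookie of vma->vm_file, (vm_pgoff << PAGE_SHIFT) + eip - vm_start),
 * i.e. the offset of the sampled instruction within the mapped file.
 * Samples in anonymous mappings keep the raw EIP as their offset.
 */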

/* Convert the EIP value of a sample into a persistent dentry/offset
 * pair that can then be added to the global event buffer. We make
 * sure to do this lookup before a mm->mmap modification happens so
 * we don't lose track.
 */
static unsigned long lookup_dcookie(struct mm_struct * mm, unsigned long addr, off_t * offset)
{
	unsigned long cookie = NO_COOKIE;
	struct vm_area_struct * vma;

	for (vma = find_vma(mm, addr); vma; vma = vma->vm_next) {

		if (addr < vma->vm_start || addr >= vma->vm_end)
			continue;

		if (vma->vm_file) {
			cookie = fast_get_dcookie(vma->vm_file->f_path.dentry,
				vma->vm_file->f_path.mnt);
			*offset = (vma->vm_pgoff << PAGE_SHIFT) + addr -
				vma->vm_start;
		} else {
			/* must be an anonymous map */
			*offset = addr;
		}

		break;
	}

	if (!vma)
		cookie = INVALID_COOKIE;

	return cookie;
}


static unsigned long last_cookie = INVALID_COOKIE;

static void add_cpu_switch(int i)
{
	add_event_entry(ESCAPE_CODE);
	add_event_entry(CPU_SWITCH_CODE);
	add_event_entry(i);
	last_cookie = INVALID_COOKIE;
}

static void add_kernel_ctx_switch(unsigned int in_kernel)
{
	add_event_entry(ESCAPE_CODE);
	if (in_kernel)
		add_event_entry(KERNEL_ENTER_SWITCH_CODE);
	else
		add_event_entry(KERNEL_EXIT_SWITCH_CODE);
}

static void
add_user_ctx_switch(struct task_struct const * task, unsigned long cookie)
{
	add_event_entry(ESCAPE_CODE);
	add_event_entry(CTX_SWITCH_CODE);
	add_event_entry(task->pid);
	add_event_entry(cookie);
	/* Another code for daemon back-compat */
	add_event_entry(ESCAPE_CODE);
	add_event_entry(CTX_TGID_CODE);
	add_event_entry(task->tgid);
}


static void add_cookie_switch(unsigned long cookie)
{
	add_event_entry(ESCAPE_CODE);
	add_event_entry(COOKIE_SWITCH_CODE);
	add_event_entry(cookie);
}


static void add_trace_begin(void)
{
	add_event_entry(ESCAPE_CODE);
	add_event_entry(TRACE_BEGIN_CODE);
}


static void add_sample_entry(unsigned long offset, unsigned long event)
{
	add_event_entry(offset);
	add_event_entry(event);
}


static int add_us_sample(struct mm_struct * mm, struct op_sample * s)
{
	unsigned long cookie;
	off_t offset;

	cookie = lookup_dcookie(mm, s->eip, &offset);

	if (cookie == INVALID_COOKIE) {
		atomic_inc(&oprofile_stats.sample_lost_no_mapping);
		return 0;
	}

	if (cookie != last_cookie) {
		add_cookie_switch(cookie);
		last_cookie = cookie;
	}

	add_sample_entry(offset, s->event);

	return 1;
}


/* Add a sample to the global event buffer. If possible the
 * sample is converted into a persistent dentry/offset pair
 * for later lookup from userspace.
 */
static int
add_sample(struct mm_struct * mm, struct op_sample * s, int in_kernel)
{
	if (in_kernel) {
		add_sample_entry(s->eip, s->event);
		return 1;
	} else if (mm) {
		return add_us_sample(mm, s);
	} else {
		atomic_inc(&oprofile_stats.sample_lost_no_mm);
	}
	return 0;
}


static void release_mm(struct mm_struct * mm)
{
	if (!mm)
		return;
	up_read(&mm->mmap_sem);
	mmput(mm);
}


static struct mm_struct * take_tasks_mm(struct task_struct * task)
{
	struct mm_struct * mm = get_task_mm(task);
	if (mm)
		down_read(&mm->mmap_sem);
	return mm;
}


static inline int is_code(unsigned long val)
{
	return val == ESCAPE_CODE;
}


/* "acquire" as many cpu buffer slots as we can */
static unsigned long get_slots(struct oprofile_cpu_buffer * b)
{
	unsigned long head = b->head_pos;
	unsigned long tail = b->tail_pos;

	/*
	 * Subtle. This resets the persistent last_task
	 * and in_kernel values used for switching notes.
	 * BUT, there is a small window between reading
	 * head_pos and this call during which samples
	 * can appear at the new head position without
	 * being prefixed with the notes for a kernel-mode
	 * or task switch. This small hole can lead to
	 * mis-attribution, or to samples whose kernel/user
	 * state is unknown, at the start of an event buffer.
	 */
	cpu_buffer_reset(b);

	if (head >= tail)
		return head - tail;

	return head + (b->buffer_size - tail);
}


static void increment_tail(struct oprofile_cpu_buffer * b)
{
	unsigned long new_tail = b->tail_pos + 1;

	rmb();

	if (new_tail < b->buffer_size)
		b->tail_pos = new_tail;
	else
		b->tail_pos = 0;
}


/* Move tasks along towards death. Any tasks on dead_tasks
 * will definitely have no remaining references in any
 * CPU buffers at this point, because we use two lists,
 * and to have reached the list, it must have gone through
 * one full sync already.
 */
static void process_task_mortuary(void)
{
	unsigned long flags;
	LIST_HEAD(local_dead_tasks);
	struct task_struct * task;
	struct task_struct * ttask;

	spin_lock_irqsave(&task_mortuary, flags);

	list_splice_init(&dead_tasks, &local_dead_tasks);
	list_splice_init(&dying_tasks, &dead_tasks);

	spin_unlock_irqrestore(&task_mortuary, flags);

	list_for_each_entry_safe(task, ttask, &local_dead_tasks, tasks) {
		list_del(&task->tasks);
		free_task(task);
	}
}


static void mark_done(int cpu)
{
	int i;

	cpu_set(cpu, marked_cpus);

	for_each_online_cpu(i) {
		if (!cpu_isset(i, marked_cpus))
			return;
	}

	/* All CPUs have been processed at least once;
	 * we can process the mortuary once
	 */
	process_task_mortuary();

	cpus_clear(marked_cpus);
}


/* FIXME: this is not sufficient if we implement syscall barrier backtrace
 * traversal; the code switches to sb_sample_start at the first kernel
 * enter/exit switch, so we need a fifth state and some special handling
 * in sync_buffer()
 */
typedef enum {
	sb_bt_ignore = -2,
	sb_buffer_start,
	sb_bt_start,
	sb_sample_start,
} sync_buffer_state;
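
/* A sketch of what sync_buffer() below ends up writing into the event
 * buffer: a flat stream of unsigned longs made of escape sequences from
 * the add_*() helpers above, e.g.
 *
 *	ESCAPE_CODE, CPU_SWITCH_CODE, cpu
 *	ESCAPE_CODE, CTX_SWITCH_CODE, pid, cookie,
 *		ESCAPE_CODE, CTX_TGID_CODE, tgid
 *	ESCAPE_CODE, KERNEL_ENTER_SWITCH_CODE (or KERNEL_EXIT_SWITCH_CODE)
 *	ESCAPE_CODE, COOKIE_SWITCH_CODE, cookie
 *
 * interleaved with plain (offset, event) pairs from add_sample_entry().
 * Userspace (the oprofile daemon) reads this stream back from the event
 * buffer and decodes it at its leisure.
 */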

/* Sync one of the CPU's buffers into the global event buffer.
 * Here we need to go through each batch of samples punctuated
 * by context switch notes, taking the task's mmap_sem and doing
 * a lookup in task->mm->mmap to convert the EIP into a
 * dcookie/offset value.
 */
void sync_buffer(int cpu)
{
	struct oprofile_cpu_buffer * cpu_buf = &cpu_buffer[cpu];
	struct mm_struct *mm = NULL;
	struct task_struct * new;
	unsigned long cookie = 0;
	int in_kernel = 1;
	unsigned int i;
	sync_buffer_state state = sb_buffer_start;
	unsigned long available;

	mutex_lock(&buffer_mutex);

	add_cpu_switch(cpu);

	/* Remember, only we can modify tail_pos */

	available = get_slots(cpu_buf);

	for (i = 0; i < available; ++i) {
		struct op_sample * s = &cpu_buf->buffer[cpu_buf->tail_pos];

		if (is_code(s->eip)) {
			if (s->event <= CPU_IS_KERNEL) {
				/* kernel/userspace switch */
				in_kernel = s->event;
				if (state == sb_buffer_start)
					state = sb_sample_start;
				add_kernel_ctx_switch(s->event);
			} else if (s->event == CPU_TRACE_BEGIN) {
				state = sb_bt_start;
				add_trace_begin();
			} else {
				struct mm_struct * oldmm = mm;

				/* userspace context switch */
				new = (struct task_struct *)s->event;

				release_mm(oldmm);
				mm = take_tasks_mm(new);
				if (mm != oldmm)
					cookie = get_exec_dcookie(mm);
				add_user_ctx_switch(new, cookie);
			}
		} else {
			if (state >= sb_bt_start &&
			    !add_sample(mm, s, in_kernel)) {
				if (state == sb_bt_start) {
					state = sb_bt_ignore;
					atomic_inc(&oprofile_stats.bt_lost_no_mapping);
				}
			}
		}

		increment_tail(cpu_buf);
	}
	release_mm(mm);

	mark_done(cpu);

	mutex_unlock(&buffer_mutex);
}