drivers/oprofile/buffer_sync.c at v2.6.28 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / drivers / oprofile / buffer_sync.c
at v2.6.28 657 lines 16 kB view raw
  1/**
  2 * @file buffer_sync.c
  3 *
  4 * @remark Copyright 2002 OProfile authors
  5 * @remark Read the file COPYING
  6 *
  7 * @author John Levon <levon@movementarian.org>
  8 * @author Barry Kasindorf
  9 *
 10 * This is the core of the buffer management. Each
 11 * CPU buffer is processed and entered into the
 12 * global event buffer. Such processing is necessary
 13 * in several circumstances, mentioned below.
 14 *
 15 * The processing does the job of converting the
 16 * transitory EIP value into a persistent dentry/offset
 17 * value that the profiler can record at its leisure.
 18 *
 19 * See fs/dcookies.c for a description of the dentry/offset
 20 * objects.
 21 */
 22
 23#include <linux/mm.h>
 24#include <linux/workqueue.h>
 25#include <linux/notifier.h>
 26#include <linux/dcookies.h>
 27#include <linux/profile.h>
 28#include <linux/module.h>
 29#include <linux/fs.h>
 30#include <linux/oprofile.h>
 31#include <linux/sched.h>
 32
 33#include "oprofile_stats.h"
 34#include "event_buffer.h"
 35#include "cpu_buffer.h"
 36#include "buffer_sync.h"
 37
 38static LIST_HEAD(dying_tasks);
 39static LIST_HEAD(dead_tasks);
 40static cpumask_t marked_cpus = CPU_MASK_NONE;
 41static DEFINE_SPINLOCK(task_mortuary);
 42static void process_task_mortuary(void);
 43
 44/* Take ownership of the task struct and place it on the
 45 * list for processing. Only after two full buffer syncs
 46 * does the task eventually get freed, because by then
 47 * we are sure we will not reference it again.
 48 * Can be invoked from softirq via RCU callback due to
 49 * call_rcu() of the task struct, hence the _irqsave.
 50 */
 51static int
 52task_free_notify(struct notifier_block *self, unsigned long val, void *data)
 53{
 54	unsigned long flags;
 55	struct task_struct *task = data;
 56	spin_lock_irqsave(&task_mortuary, flags);
 57	list_add(&task->tasks, &dying_tasks);
 58	spin_unlock_irqrestore(&task_mortuary, flags);
 59	return NOTIFY_OK;
 60}
 61
 62
 63/* The task is on its way out. A sync of the buffer means we can catch
 64 * any remaining samples for this task.
 65 */
 66static int
 67task_exit_notify(struct notifier_block *self, unsigned long val, void *data)
 68{
 69	/* To avoid latency problems, we only process the current CPU,
 70	 * hoping that most samples for the task are on this CPU
 71	 */
 72	sync_buffer(raw_smp_processor_id());
 73	return 0;
 74}
 75
 76
 77/* The task is about to try a do_munmap(). We peek at what it's going to
 78 * do, and if it's an executable region, process the samples first, so
 79 * we don't lose any. This does not have to be exact, it's a QoI issue
 80 * only.
 81 */
 82static int
 83munmap_notify(struct notifier_block *self, unsigned long val, void *data)
 84{
 85	unsigned long addr = (unsigned long)data;
 86	struct mm_struct *mm = current->mm;
 87	struct vm_area_struct *mpnt;
 88
 89	down_read(&mm->mmap_sem);
 90
 91	mpnt = find_vma(mm, addr);
 92	if (mpnt && mpnt->vm_file && (mpnt->vm_flags & VM_EXEC)) {
 93		up_read(&mm->mmap_sem);
 94		/* To avoid latency problems, we only process the current CPU,
 95		 * hoping that most samples for the task are on this CPU
 96		 */
 97		sync_buffer(raw_smp_processor_id());
 98		return 0;
 99	}
100
101	up_read(&mm->mmap_sem);
102	return 0;
103}
104
105
/*
 * Module notifier. We need to be told about new modules so that samples
 * are neither attributed to a previously loaded module nor dropped on
 * the floor. For MODULE_STATE_COMING an ESCAPE_CODE/MODULE_LOADED_CODE
 * pair is written to the event buffer under buffer_mutex; every other
 * state is ignored. Without CONFIG_MODULES this is a no-op.
 *
 * Always returns 0.
 */
static int
module_load_notify(struct notifier_block *self, unsigned long val, void *data)
{
#ifdef CONFIG_MODULES
	if (val == MODULE_STATE_COMING) {
		/* FIXME: should we process all CPU buffers ? */
		mutex_lock(&buffer_mutex);
		add_event_entry(ESCAPE_CODE);
		add_event_entry(MODULE_LOADED_CODE);
		mutex_unlock(&buffer_mutex);
	}
#endif
	return 0;
}
124
125
126static struct notifier_block task_free_nb = {
127	.notifier_call	= task_free_notify,
128};
129
130static struct notifier_block task_exit_nb = {
131	.notifier_call	= task_exit_notify,
132};
133
134static struct notifier_block munmap_nb = {
135	.notifier_call	= munmap_notify,
136};
137
138static struct notifier_block module_load_nb = {
139	.notifier_call = module_load_notify,
140};
141
142
/*
 * Stop the per-CPU work and drain the task mortuary.
 *
 * process_task_mortuary() frees what is on dead_tasks and moves
 * dying_tasks onto dead_tasks, so two passes are needed to make sure
 * we don't leak task structs that were still on dying_tasks.
 */
static void end_sync(void)
{
	int pass;

	end_cpu_work();

	for (pass = 0; pass < 2; pass++)
		process_task_mortuary();
}
150
151
152int sync_start(void)
153{
154	int err;
155
156	start_cpu_work();
157
158	err = task_handoff_register(&task_free_nb);
159	if (err)
160		goto out1;
161	err = profile_event_register(PROFILE_TASK_EXIT, &task_exit_nb);
162	if (err)
163		goto out2;
164	err = profile_event_register(PROFILE_MUNMAP, &munmap_nb);
165	if (err)
166		goto out3;
167	err = register_module_notifier(&module_load_nb);
168	if (err)
169		goto out4;
170
171out:
172	return err;
173out4:
174	profile_event_unregister(PROFILE_MUNMAP, &munmap_nb);
175out3:
176	profile_event_unregister(PROFILE_TASK_EXIT, &task_exit_nb);
177out2:
178	task_handoff_unregister(&task_free_nb);
179out1:
180	end_sync();
181	goto out;
182}
183
184
185void sync_stop(void)
186{
187	unregister_module_notifier(&module_load_nb);
188	profile_event_unregister(PROFILE_MUNMAP, &munmap_nb);
189	profile_event_unregister(PROFILE_TASK_EXIT, &task_exit_nb);
190	task_handoff_unregister(&task_free_nb);
191	end_sync();
192}
193
194
195/* Optimisation. We can manage without taking the dcookie sem
196 * because we cannot reach this code without at least one
197 * dcookie user still being registered (namely, the reader
198 * of the event buffer). */
199static inline unsigned long fast_get_dcookie(struct path *path)
200{
201	unsigned long cookie;
202
203	if (path->dentry->d_cookie)
204		return (unsigned long)path->dentry;
205	get_dcookie(path, &cookie);
206	return cookie;
207}
208
209
210/* Look up the dcookie for the task's first VM_EXECUTABLE mapping,
211 * which corresponds loosely to "application name". This is
212 * not strictly necessary but allows oprofile to associate
213 * shared-library samples with particular applications
214 */
215static unsigned long get_exec_dcookie(struct mm_struct *mm)
216{
217	unsigned long cookie = NO_COOKIE;
218	struct vm_area_struct *vma;
219
220	if (!mm)
221		goto out;
222
223	for (vma = mm->mmap; vma; vma = vma->vm_next) {
224		if (!vma->vm_file)
225			continue;
226		if (!(vma->vm_flags & VM_EXECUTABLE))
227			continue;
228		cookie = fast_get_dcookie(&vma->vm_file->f_path);
229		break;
230	}
231
232out:
233	return cookie;
234}
235
236
237/* Convert the EIP value of a sample into a persistent dentry/offset
238 * pair that can then be added to the global event buffer. We make
239 * sure to do this lookup before a mm->mmap modification happens so
240 * we don't lose track.
241 */
242static unsigned long
243lookup_dcookie(struct mm_struct *mm, unsigned long addr, off_t *offset)
244{
245	unsigned long cookie = NO_COOKIE;
246	struct vm_area_struct *vma;
247
248	for (vma = find_vma(mm, addr); vma; vma = vma->vm_next) {
249
250		if (addr < vma->vm_start || addr >= vma->vm_end)
251			continue;
252
253		if (vma->vm_file) {
254			cookie = fast_get_dcookie(&vma->vm_file->f_path);
255			*offset = (vma->vm_pgoff << PAGE_SHIFT) + addr -
256				vma->vm_start;
257		} else {
258			/* must be an anonymous map */
259			*offset = addr;
260		}
261
262		break;
263	}
264
265	if (!vma)
266		cookie = INVALID_COOKIE;
267
268	return cookie;
269}
270
271static void increment_tail(struct oprofile_cpu_buffer *b)
272{
273	unsigned long new_tail = b->tail_pos + 1;
274
275	rmb();	/* be sure fifo pointers are synchromized */
276
277	if (new_tail < b->buffer_size)
278		b->tail_pos = new_tail;
279	else
280		b->tail_pos = 0;
281}
282
283static unsigned long last_cookie = INVALID_COOKIE;
284
285static void add_cpu_switch(int i)
286{
287	add_event_entry(ESCAPE_CODE);
288	add_event_entry(CPU_SWITCH_CODE);
289	add_event_entry(i);
290	last_cookie = INVALID_COOKIE;
291}
292
293static void add_kernel_ctx_switch(unsigned int in_kernel)
294{
295	add_event_entry(ESCAPE_CODE);
296	if (in_kernel)
297		add_event_entry(KERNEL_ENTER_SWITCH_CODE);
298	else
299		add_event_entry(KERNEL_EXIT_SWITCH_CODE);
300}
301
302static void
303add_user_ctx_switch(struct task_struct const *task, unsigned long cookie)
304{
305	add_event_entry(ESCAPE_CODE);
306	add_event_entry(CTX_SWITCH_CODE);
307	add_event_entry(task->pid);
308	add_event_entry(cookie);
309	/* Another code for daemon back-compat */
310	add_event_entry(ESCAPE_CODE);
311	add_event_entry(CTX_TGID_CODE);
312	add_event_entry(task->tgid);
313}
314
315
316static void add_cookie_switch(unsigned long cookie)
317{
318	add_event_entry(ESCAPE_CODE);
319	add_event_entry(COOKIE_SWITCH_CODE);
320	add_event_entry(cookie);
321}
322
323
324static void add_trace_begin(void)
325{
326	add_event_entry(ESCAPE_CODE);
327	add_event_entry(TRACE_BEGIN_CODE);
328}
329
#ifdef CONFIG_OPROFILE_IBS

/* Number of payload entries following the RIP entry of an IBS record. */
#define IBS_FETCH_CODE_SIZE	2	/* IBS FETCH is 2 int64s */
#define IBS_OP_CODE_SIZE	5	/* IBS OP is 5 int64s */

/*
 * Add an IBS fetch or op record to the event buffer.
 *
 * @cpu_buf: CPU buffer whose tail currently points at the IBS begin
 *           marker; the tail is advanced over the RIP entry and over
 *           all payload entries of the record
 * @code:    IBS_FETCH_CODE or IBS_OP_CODE
 * @mm:      address space used to translate the RIP, may be NULL
 *
 * The record is written as ESCAPE_CODE, @code, the dcookie offset, the
 * raw eip/event pair of the RIP entry and then the eip/event pairs of
 * the payload entries. When the RIP cannot be translated to a file
 * offset, the RIP itself is written as the offset.
 */
static void add_ibs_begin(struct oprofile_cpu_buffer *cpu_buf, int code,
			  struct mm_struct *mm)
{
	struct op_sample *entry;
	unsigned long rip;
	off_t offset;
	int remaining;

	increment_tail(cpu_buf);	/* move to RIP entry */
	entry = &cpu_buf->buffer[cpu_buf->tail_pos];

	rip = entry->eip;
#ifdef __LP64__
	/* the upper half of the RIP travels in the event field */
	rip += entry->event << 32;
#endif

	if (!mm) {
		offset = rip;
	} else {
		unsigned long ibs_cookie = lookup_dcookie(mm, rip, &offset);

		if (ibs_cookie == INVALID_COOKIE)
			atomic_inc(&oprofile_stats.sample_lost_no_mapping);
		if (ibs_cookie == NO_COOKIE || ibs_cookie == INVALID_COOKIE)
			offset = rip;

		if (ibs_cookie != last_cookie) {
			add_cookie_switch(ibs_cookie);
			last_cookie = ibs_cookie;
		}
	}

	add_event_entry(ESCAPE_CODE);
	add_event_entry(code);
	add_event_entry(offset);	/* Offset from Dcookie */

	/* we send the Dcookie offset, but send the raw linear address also */
	add_event_entry(entry->eip);
	add_event_entry(entry->event);

	remaining = (code == IBS_FETCH_CODE) ? IBS_FETCH_CODE_SIZE
					     : IBS_OP_CODE_SIZE;

	while (remaining-- > 0) {
		increment_tail(cpu_buf);
		entry = &cpu_buf->buffer[cpu_buf->tail_pos];
		add_event_entry(entry->eip);
		add_event_entry(entry->event);
	}
}

#endif
395
/* Write one sample to the event buffer as an (offset, event) pair. */
static void add_sample_entry(unsigned long offset, unsigned long event)
{
	add_event_entry(offset);
	add_event_entry(event);
}
401
402
403static int add_us_sample(struct mm_struct *mm, struct op_sample *s)
404{
405	unsigned long cookie;
406	off_t offset;
407
408	cookie = lookup_dcookie(mm, s->eip, &offset);
409
410	if (cookie == INVALID_COOKIE) {
411		atomic_inc(&oprofile_stats.sample_lost_no_mapping);
412		return 0;
413	}
414
415	if (cookie != last_cookie) {
416		add_cookie_switch(cookie);
417		last_cookie = cookie;
418	}
419
420	add_sample_entry(offset, s->event);
421
422	return 1;
423}
424
425
426/* Add a sample to the global event buffer. If possible the
427 * sample is converted into a persistent dentry/offset pair
428 * for later lookup from userspace.
429 */
430static int
431add_sample(struct mm_struct *mm, struct op_sample *s, int in_kernel)
432{
433	if (in_kernel) {
434		add_sample_entry(s->eip, s->event);
435		return 1;
436	} else if (mm) {
437		return add_us_sample(mm, s);
438	} else {
439		atomic_inc(&oprofile_stats.sample_lost_no_mm);
440	}
441	return 0;
442}
443
444
445static void release_mm(struct mm_struct *mm)
446{
447	if (!mm)
448		return;
449	up_read(&mm->mmap_sem);
450	mmput(mm);
451}
452
453
454static struct mm_struct *take_tasks_mm(struct task_struct *task)
455{
456	struct mm_struct *mm = get_task_mm(task);
457	if (mm)
458		down_read(&mm->mmap_sem);
459	return mm;
460}
461
462
463static inline int is_code(unsigned long val)
464{
465	return val == ESCAPE_CODE;
466}
467
468
469/* "acquire" as many cpu buffer slots as we can */
470static unsigned long get_slots(struct oprofile_cpu_buffer *b)
471{
472	unsigned long head = b->head_pos;
473	unsigned long tail = b->tail_pos;
474
475	/*
476	 * Subtle. This resets the persistent last_task
477	 * and in_kernel values used for switching notes.
478	 * BUT, there is a small window between reading
479	 * head_pos, and this call, that means samples
480	 * can appear at the new head position, but not
481	 * be prefixed with the notes for switching
482	 * kernel mode or a task switch. This small hole
483	 * can lead to mis-attribution or samples where
484	 * we don't know if it's in the kernel or not,
485	 * at the start of an event buffer.
486	 */
487	cpu_buffer_reset(b);
488
489	if (head >= tail)
490		return head - tail;
491
492	return head + (b->buffer_size - tail);
493}
494
495
496/* Move tasks along towards death. Any tasks on dead_tasks
497 * will definitely have no remaining references in any
498 * CPU buffers at this point, because we use two lists,
499 * and to have reached the list, it must have gone through
500 * one full sync already.
501 */
502static void process_task_mortuary(void)
503{
504	unsigned long flags;
505	LIST_HEAD(local_dead_tasks);
506	struct task_struct *task;
507	struct task_struct *ttask;
508
509	spin_lock_irqsave(&task_mortuary, flags);
510
511	list_splice_init(&dead_tasks, &local_dead_tasks);
512	list_splice_init(&dying_tasks, &dead_tasks);
513
514	spin_unlock_irqrestore(&task_mortuary, flags);
515
516	list_for_each_entry_safe(task, ttask, &local_dead_tasks, tasks) {
517		list_del(&task->tasks);
518		free_task(task);
519	}
520}
521
522
523static void mark_done(int cpu)
524{
525	int i;
526
527	cpu_set(cpu, marked_cpus);
528
529	for_each_online_cpu(i) {
530		if (!cpu_isset(i, marked_cpus))
531			return;
532	}
533
534	/* All CPUs have been processed at least once,
535	 * we can process the mortuary once
536	 */
537	process_task_mortuary();
538
539	cpus_clear(marked_cpus);
540}
541
542
/*
 * State of sync_buffer() while it walks one CPU buffer. Samples are
 * only added while the state is >= sb_bt_start.
 *
 * FIXME: this is not sufficient if we implement syscall barrier backtrace
 * traversal, the code switch to sb_sample_start at first kernel enter/exit
 * switch so we need a fifth state and some special handling in sync_buffer()
 */
typedef enum {
	/* a sample right after a trace/IBS begin could not be added */
	sb_bt_ignore	= -2,
	/* initial state, no kernel/user switch note seen yet */
	sb_buffer_start	= -1,
	/* a trace-begin or IBS-begin marker has been seen */
	sb_bt_start	= 0,
	/* first kernel/user switch note seen after the buffer start */
	sb_sample_start	= 1,
} sync_buffer_state;
553
554/* Sync one of the CPU's buffers into the global event buffer.
555 * Here we need to go through each batch of samples punctuated
556 * by context switch notes, taking the task's mmap_sem and doing
557 * lookup in task->mm->mmap to convert EIP into dcookie/offset
558 * value.
559 */
560void sync_buffer(int cpu)
561{
562	struct oprofile_cpu_buffer *cpu_buf = &per_cpu(cpu_buffer, cpu);
563	struct mm_struct *mm = NULL;
564	struct task_struct *new;
565	unsigned long cookie = 0;
566	int in_kernel = 1;
567	sync_buffer_state state = sb_buffer_start;
568#ifndef CONFIG_OPROFILE_IBS
569	unsigned int i;
570	unsigned long available;
571#endif
572
573	mutex_lock(&buffer_mutex);
574
575	add_cpu_switch(cpu);
576
577	/* Remember, only we can modify tail_pos */
578
579#ifndef CONFIG_OPROFILE_IBS
580	available = get_slots(cpu_buf);
581
582	for (i = 0; i < available; ++i) {
583#else
584	while (get_slots(cpu_buf)) {
585#endif
586		struct op_sample *s = &cpu_buf->buffer[cpu_buf->tail_pos];
587
588		if (is_code(s->eip)) {
589			if (s->event <= CPU_IS_KERNEL) {
590				/* kernel/userspace switch */
591				in_kernel = s->event;
592				if (state == sb_buffer_start)
593					state = sb_sample_start;
594				add_kernel_ctx_switch(s->event);
595			} else if (s->event == CPU_TRACE_BEGIN) {
596				state = sb_bt_start;
597				add_trace_begin();
598#ifdef CONFIG_OPROFILE_IBS
599			} else if (s->event == IBS_FETCH_BEGIN) {
600				state = sb_bt_start;
601				add_ibs_begin(cpu_buf, IBS_FETCH_CODE, mm);
602			} else if (s->event == IBS_OP_BEGIN) {
603				state = sb_bt_start;
604				add_ibs_begin(cpu_buf, IBS_OP_CODE, mm);
605#endif
606			} else {
607				struct mm_struct *oldmm = mm;
608
609				/* userspace context switch */
610				new = (struct task_struct *)s->event;
611
612				release_mm(oldmm);
613				mm = take_tasks_mm(new);
614				if (mm != oldmm)
615					cookie = get_exec_dcookie(mm);
616				add_user_ctx_switch(new, cookie);
617			}
618		} else if (state >= sb_bt_start &&
619			   !add_sample(mm, s, in_kernel)) {
620			if (state == sb_bt_start) {
621				state = sb_bt_ignore;
622				atomic_inc(&oprofile_stats.bt_lost_no_mapping);
623			}
624		}
625
626		increment_tail(cpu_buf);
627	}
628	release_mm(mm);
629
630	mark_done(cpu);
631
632	mutex_unlock(&buffer_mutex);
633}
634
635/* The function can be used to add a buffer worth of data directly to
636 * the kernel buffer. The buffer is assumed to be a circular buffer.
637 * Take the entries from index start and end at index end, wrapping
638 * at max_entries.
639 */
640void oprofile_put_buff(unsigned long *buf, unsigned int start,
641		       unsigned int stop, unsigned int max)
642{
643	int i;
644
645	i = start;
646
647	mutex_lock(&buffer_mutex);
648	while (i != stop) {
649		add_event_entry(buf[i++]);
650
651		if (i >= max)
652			i = 0;
653	}
654
655	mutex_unlock(&buffer_mutex);
656}
657