
sched: latencytop support

The LatencyTOP kernel infrastructure: it measures latencies in the
scheduler and tracks them system-wide and per process.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

Authored by Arjan van de Ven, committed by Ingo Molnar
9745512c 326587b8
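
Before the diffs, a sketch of how the new interfaces fit together: the patch adds a global /proc/latency_stats file, a per-task /proc/<pid>/latency file, and a "latencytop" sysctl (latencytop_enabled, default off) that gates collection. The following minimal userspace reader is an illustration, not part of the patch; it assumes CONFIG_LATENCYTOP=y and that the sysctl is exposed as /proc/sys/kernel/latencytop:

/* Sketch, not part of the patch: enable LatencyTOP via the sysctl
 * added in kernel/sysctl.c, then dump the global records. */
#include <stdio.h>

int main(void)
{
        FILE *f;
        char line[512];

        f = fopen("/proc/sys/kernel/latencytop", "w");
        if (!f) {
                perror("latencytop sysctl");
                return 1;
        }
        fputs("1\n", f);
        fclose(f);

        /* Each record is "<count> <total usecs> <max usecs>" followed
         * by the symbolized backtrace, one record per line. */
        f = fopen("/proc/latency_stats", "r");
        if (!f) {
                perror("/proc/latency_stats");
                return 1;
        }
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);
        fclose(f);
        return 0;
}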

+429 -1
+27
arch/x86/kernel/stacktrace.c
···
 		trace->entries[trace->nr_entries++] = addr;
 }
 
+static void save_stack_address_nosched(void *data, unsigned long addr)
+{
+	struct stack_trace *trace = (struct stack_trace *)data;
+	if (in_sched_functions(addr))
+		return;
+	if (trace->skip > 0) {
+		trace->skip--;
+		return;
+	}
+	if (trace->nr_entries < trace->max_entries)
+		trace->entries[trace->nr_entries++] = addr;
+}
+
 static const struct stacktrace_ops save_stack_ops = {
 	.warning = save_stack_warning,
 	.warning_symbol = save_stack_warning_symbol,
 	.stack = save_stack_stack,
 	.address = save_stack_address,
+};
+
+static const struct stacktrace_ops save_stack_ops_nosched = {
+	.warning = save_stack_warning,
+	.warning_symbol = save_stack_warning_symbol,
+	.stack = save_stack_stack,
+	.address = save_stack_address_nosched,
 };
 
 /*
···
 	trace->entries[trace->nr_entries++] = ULONG_MAX;
 }
 EXPORT_SYMBOL(save_stack_trace);
+
+void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
+{
+	dump_trace(tsk, NULL, NULL, &save_stack_ops_nosched, trace);
+	if (trace->nr_entries < trace->max_entries)
+		trace->entries[trace->nr_entries++] = ULONG_MAX;
+}
+78
fs/proc/base.c
···
 }
 #endif
 
+#ifdef CONFIG_LATENCYTOP
+static int lstats_show_proc(struct seq_file *m, void *v)
+{
+	int i;
+	struct task_struct *task = m->private;
+	seq_puts(m, "Latency Top version : v0.1\n");
+
+	for (i = 0; i < 32; i++) {
+		if (task->latency_record[i].backtrace[0]) {
+			int q;
+			seq_printf(m, "%i %li %li ",
+				task->latency_record[i].count,
+				task->latency_record[i].time,
+				task->latency_record[i].max);
+			for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
+				char sym[KSYM_NAME_LEN];
+				char *c;
+				if (!task->latency_record[i].backtrace[q])
+					break;
+				if (task->latency_record[i].backtrace[q] == ULONG_MAX)
+					break;
+				sprint_symbol(sym, task->latency_record[i].backtrace[q]);
+				c = strchr(sym, '+');
+				if (c)
+					*c = 0;
+				seq_printf(m, "%s ", sym);
+			}
+			seq_printf(m, "\n");
+		}
+
+	}
+	return 0;
+}
+
+static int lstats_open(struct inode *inode, struct file *file)
+{
+	int ret;
+	struct seq_file *m;
+	struct task_struct *task = get_proc_task(inode);
+
+	ret = single_open(file, lstats_show_proc, NULL);
+	if (!ret) {
+		m = file->private_data;
+		m->private = task;
+	}
+	return ret;
+}
+
+static ssize_t lstats_write(struct file *file, const char __user *buf,
+		size_t count, loff_t *offs)
+{
+	struct seq_file *m;
+	struct task_struct *task;
+
+	m = file->private_data;
+	task = m->private;
+	clear_all_latency_tracing(task);
+
+	return count;
+}
+
+static const struct file_operations proc_lstats_operations = {
+	.open = lstats_open,
+	.read = seq_read,
+	.write = lstats_write,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+#endif
+
 /* The badness from the OOM killer */
 unsigned long badness(struct task_struct *p, unsigned long uptime);
 static int proc_oom_score(struct task_struct *task, char *buffer)
···
 	.write = proc_fault_inject_write,
 };
 #endif
+
 
 #ifdef CONFIG_SCHED_DEBUG
 /*
···
 #ifdef CONFIG_SCHEDSTATS
 	INF("schedstat", S_IRUGO, pid_schedstat),
 #endif
+#ifdef CONFIG_LATENCYTOP
+	REG("latency", S_IRUGO, lstats),
+#endif
 #ifdef CONFIG_PROC_PID_CPUSET
 	REG("cpuset", S_IRUGO, cpuset),
 #endif
···
 #endif
 #ifdef CONFIG_SCHEDSTATS
 	INF("schedstat", S_IRUGO, pid_schedstat),
+#endif
+#ifdef CONFIG_LATENCYTOP
+	REG("latency", S_IRUGO, lstats),
 #endif
 #ifdef CONFIG_PROC_PID_CPUSET
 	REG("cpuset", S_IRUGO, cpuset),
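
The per-task file above is read-mostly; its write handler ignores the payload and simply calls clear_all_latency_tracing(). A hypothetical userspace helper (not part of the patch) that resets a task's records; note the file is registered S_IRUGO, so the write ordinarily requires root:

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>

/* Clear a task's per-process latency records. Returns 0 on success. */
static int clear_task_latency(pid_t pid)
{
        char path[64];
        FILE *f;

        snprintf(path, sizeof(path), "/proc/%d/latency", (int)pid);
        f = fopen(path, "w");
        if (!f)
                return -1;
        fputs("0\n", f);        /* payload is ignored by lstats_write() */
        return fclose(f) ? -1 : 0;
}

int main(int argc, char **argv)
{
        pid_t pid = (argc > 1) ? (pid_t)atoi(argv[1]) : getpid();

        if (clear_task_latency(pid)) {
                perror("clear_task_latency");
                return 1;
        }
        return 0;
}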
+44
include/linux/latencytop.h
···
+/*
+ * latencytop.h: Infrastructure for displaying latency
+ *
+ * (C) Copyright 2008 Intel Corporation
+ * Author: Arjan van de Ven <arjan@linux.intel.com>
+ *
+ */
+
+#ifndef _INCLUDE_GUARD_LATENCYTOP_H_
+#define _INCLUDE_GUARD_LATENCYTOP_H_
+
+#ifdef CONFIG_LATENCYTOP
+
+#define LT_SAVECOUNT 32
+#define LT_BACKTRACEDEPTH 12
+
+struct latency_record {
+	unsigned long backtrace[LT_BACKTRACEDEPTH];
+	unsigned int count;
+	unsigned long time;
+	unsigned long max;
+};
+
+
+struct task_struct;
+
+void account_scheduler_latency(struct task_struct *task, int usecs, int inter);
+
+void clear_all_latency_tracing(struct task_struct *p);
+
+#else
+
+static inline void
+account_scheduler_latency(struct task_struct *task, int usecs, int inter)
+{
+}
+
+static inline void clear_all_latency_tracing(struct task_struct *p)
+{
+}
+
+#endif
+
+#endif
+5
include/linux/sched.h
···
 #include <linux/hrtimer.h>
 #include <linux/task_io_accounting.h>
 #include <linux/kobject.h>
+#include <linux/latencytop.h>
 
 #include <asm/processor.h>
 
···
 	int make_it_fail;
 #endif
 	struct prop_local_single dirties;
+#ifdef CONFIG_LATENCYTOP
+	int latency_record_count;
+	struct latency_record latency_record[LT_SAVECOUNT];
+#endif
 };
 
 /*
+3
include/linux/stacktrace.h
···
 };
 
 extern void save_stack_trace(struct stack_trace *trace);
+extern void save_stack_trace_tsk(struct task_struct *tsk,
+				struct stack_trace *trace);
 
 extern void print_stack_trace(struct stack_trace *trace, int spaces);
 #else
 # define save_stack_trace(trace)		do { } while (0)
+# define save_stack_trace_tsk(tsk, trace)	do { } while (0)
 # define print_stack_trace(trace, spaces)	do { } while (0)
 #endif
 
+1
kernel/Makefile
···
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
 obj-$(CONFIG_MARKERS) += marker.o
+obj-$(CONFIG_LATENCYTOP) += latencytop.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
+1
kernel/fork.c
···
 #ifdef TIF_SYSCALL_EMU
 	clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
 #endif
+	clear_all_latency_tracing(p);
 
 	/* Our parent execution domain becomes current domain
 	   These must match for thread signalling to apply */
+239
kernel/latencytop.c
···
+/*
+ * latencytop.c: Latency display infrastructure
+ *
+ * (C) Copyright 2008 Intel Corporation
+ * Author: Arjan van de Ven <arjan@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+#include <linux/latencytop.h>
+#include <linux/kallsyms.h>
+#include <linux/seq_file.h>
+#include <linux/notifier.h>
+#include <linux/spinlock.h>
+#include <linux/proc_fs.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/stacktrace.h>
+
+static DEFINE_SPINLOCK(latency_lock);
+
+#define MAXLR 128
+static struct latency_record latency_record[MAXLR];
+
+int latencytop_enabled;
+
+void clear_all_latency_tracing(struct task_struct *p)
+{
+	unsigned long flags;
+
+	if (!latencytop_enabled)
+		return;
+
+	spin_lock_irqsave(&latency_lock, flags);
+	memset(&p->latency_record, 0, sizeof(p->latency_record));
+	p->latency_record_count = 0;
+	spin_unlock_irqrestore(&latency_lock, flags);
+}
+
+static void clear_global_latency_tracing(void)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&latency_lock, flags);
+	memset(&latency_record, 0, sizeof(latency_record));
+	spin_unlock_irqrestore(&latency_lock, flags);
+}
+
+static void __sched
+account_global_scheduler_latency(struct task_struct *tsk, struct latency_record *lat)
+{
+	int firstnonnull = MAXLR + 1;
+	int i;
+
+	if (!latencytop_enabled)
+		return;
+
+	/* skip kernel threads for now */
+	if (!tsk->mm)
+		return;
+
+	for (i = 0; i < MAXLR; i++) {
+		int q;
+		int same = 1;
+		/* Nothing stored: */
+		if (!latency_record[i].backtrace[0]) {
+			if (firstnonnull > i)
+				firstnonnull = i;
+			continue;
+		}
+		for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) {
+			if (latency_record[i].backtrace[q] !=
+				lat->backtrace[q])
+				same = 0;
+			if (same && lat->backtrace[q] == 0)
+				break;
+			if (same && lat->backtrace[q] == ULONG_MAX)
+				break;
+		}
+		if (same) {
+			latency_record[i].count++;
+			latency_record[i].time += lat->time;
+			if (lat->time > latency_record[i].max)
+				latency_record[i].max = lat->time;
+			return;
+		}
+	}
+
+	i = firstnonnull;
+	if (i >= MAXLR - 1)
+		return;
+
+	/* Allocated a new one: */
+	memcpy(&latency_record[i], lat, sizeof(struct latency_record));
+}
+
+static inline void store_stacktrace(struct task_struct *tsk, struct latency_record *lat)
+{
+	struct stack_trace trace;
+
+	memset(&trace, 0, sizeof(trace));
+	trace.max_entries = LT_BACKTRACEDEPTH;
+	trace.entries = &lat->backtrace[0];
+	trace.skip = 0;
+	save_stack_trace_tsk(tsk, &trace);
+}
+
+void __sched
+account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
+{
+	unsigned long flags;
+	int i, q;
+	struct latency_record lat;
+
+	if (!latencytop_enabled)
+		return;
+
+	/* Long interruptible waits are generally user requested... */
+	if (inter && usecs > 5000)
+		return;
+
+	memset(&lat, 0, sizeof(lat));
+	lat.count = 1;
+	lat.time = usecs;
+	lat.max = usecs;
+	store_stacktrace(tsk, &lat);
+
+	spin_lock_irqsave(&latency_lock, flags);
+
+	account_global_scheduler_latency(tsk, &lat);
+
+	/*
+	 * short term hack; if we're > 32 we stop; future we recycle:
+	 */
+	tsk->latency_record_count++;
+	if (tsk->latency_record_count >= LT_SAVECOUNT)
+		goto out_unlock;
+
+	for (i = 0; i < LT_SAVECOUNT ; i++) {
+		struct latency_record *mylat;
+		int same = 1;
+		mylat = &tsk->latency_record[i];
+		for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) {
+			if (mylat->backtrace[q] !=
+				lat.backtrace[q])
+				same = 0;
+			if (same && lat.backtrace[q] == 0)
+				break;
+			if (same && lat.backtrace[q] == ULONG_MAX)
+				break;
+		}
+		if (same) {
+			mylat->count++;
+			mylat->time += lat.time;
+			if (lat.time > mylat->max)
+				mylat->max = lat.time;
+			goto out_unlock;
+		}
+	}
+
+	/* Allocated a new one: */
+	i = tsk->latency_record_count;
+	memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record));
+
+out_unlock:
+	spin_unlock_irqrestore(&latency_lock, flags);
+}
+
+static int lstats_show(struct seq_file *m, void *v)
+{
+	int i;
+
+	seq_puts(m, "Latency Top version : v0.1\n");
+
+	for (i = 0; i < MAXLR; i++) {
+		if (latency_record[i].backtrace[0]) {
+			int q;
+			seq_printf(m, "%i %li %li ",
+				latency_record[i].count,
+				latency_record[i].time,
+				latency_record[i].max);
+			for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
+				char sym[KSYM_NAME_LEN];
+				char *c;
+				if (!latency_record[i].backtrace[q])
+					break;
+				if (latency_record[i].backtrace[q] == ULONG_MAX)
+					break;
+				sprint_symbol(sym, latency_record[i].backtrace[q]);
+				c = strchr(sym, '+');
+				if (c)
+					*c = 0;
+				seq_printf(m, "%s ", sym);
+			}
+			seq_printf(m, "\n");
+		}
+	}
+	return 0;
+}
+
+static ssize_t
+lstats_write(struct file *file, const char __user *buf, size_t count,
+		loff_t *offs)
+{
+	clear_global_latency_tracing();
+
+	return count;
+}
+
+static int lstats_open(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, lstats_show, NULL);
+}
+
+static struct file_operations lstats_fops = {
+	.open = lstats_open,
+	.read = seq_read,
+	.write = lstats_write,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static int __init init_lstats_procfs(void)
+{
+	struct proc_dir_entry *pe;
+
+	pe = create_proc_entry("latency_stats", 0644, NULL);
+	if (!pe)
+		return -ENOMEM;
+
+	pe->proc_fops = &lstats_fops;
+
+	return 0;
+}
+__initcall(init_lstats_procfs);
+7 -1
kernel/sched_fair.c
···
  * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
  */
 
+#include <linux/latencytop.h>
+
 /*
  * Targeted preemption latency for CPU-bound tasks:
  * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds)
···
 #ifdef CONFIG_SCHEDSTATS
 	if (se->sleep_start) {
 		u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;
+		struct task_struct *tsk = task_of(se);
 
 		if ((s64)delta < 0)
 			delta = 0;
···
 
 		se->sleep_start = 0;
 		se->sum_sleep_runtime += delta;
+
+		account_scheduler_latency(tsk, delta >> 10, 1);
 	}
 	if (se->block_start) {
 		u64 delta = rq_of(cfs_rq)->clock - se->block_start;
+		struct task_struct *tsk = task_of(se);
 
 		if ((s64)delta < 0)
 			delta = 0;
···
 		 * time that the task spent sleeping:
 		 */
 		if (unlikely(prof_on == SLEEP_PROFILING)) {
-			struct task_struct *tsk = task_of(se);
 
 			profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk),
 					delta >> 20);
 		}
+		account_scheduler_latency(tsk, delta >> 10, 0);
 	}
 #endif
 }
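
A note on the new calls above: delta is in nanoseconds, and delta >> 10 divides by 1024 as a cheap stand-in for the /1000 needed for microseconds, so the reported "usecs" read about 2.3% low; account_scheduler_latency() then discards interruptible waits longer than 5000 usecs as presumably user-requested. A standalone check of the scale, with illustrative values:

/* Illustrative only: the ns -> usecs approximation used by the
 * hooks added to kernel/sched_fair.c in this patch. */
#include <stdio.h>

int main(void)
{
        unsigned long long delta_ns = 12000000ULL;      /* a 12 ms block */

        /* >> 10 divides by 1024 instead of 1000 (~2.3% low). */
        printf("approx usecs: %llu\n", delta_ns >> 10); /* prints 11718 */
        return 0;
}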
+10
kernel/sysctl.c
···
 extern int maps_protect;
 extern int sysctl_stat_interval;
 extern int audit_argv_kb;
+extern int latencytop_enabled;
 
 /* Constants used for minimum and maximum */
 #ifdef CONFIG_DETECT_SOFTLOCKUP
···
 		.maxlen = sizeof(int),
 		.mode = 0644,
 		.proc_handler = &proc_dointvec_taint,
+	},
+#endif
+#ifdef CONFIG_LATENCYTOP
+	{
+		.procname = "latencytop",
+		.data = &latencytop_enabled,
+		.maxlen = sizeof(int),
+		.mode = 0644,
+		.proc_handler = &proc_dointvec,
 	},
 #endif
 #ifdef CONFIG_SECURITY_CAPABILITIES
+14
lib/Kconfig.debug
···
 	help
 	  Provide stacktrace filter for fault-injection capabilities
 
+config LATENCYTOP
+	bool "Latency measuring infrastructure"
+	select FRAME_POINTER if !MIPS
+	select KALLSYMS
+	select KALLSYMS_ALL
+	select STACKTRACE
+	select SCHEDSTATS
+	select SCHED_DEBUG
+	depends on X86 || X86_64
+	help
+	  Enable this option if you want to use the LatencyTOP tool
+	  to find out which userspace is blocking on what kernel operations.
+
+
 source "samples/Kconfig"