Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

at 5c3e985a2c1908aa97221d3806f85ce7e2fbfa88 · 540 lines · 12 kB
/*
 * Copyright (C) 2006 Jens Axboe <axboe@suse.de>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */
#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/blktrace_api.h>
#include <linux/percpu.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/debugfs.h>
#include <asm/uaccess.h>

static DEFINE_PER_CPU(unsigned long long, blk_trace_cpu_offset) = { 0, };
static unsigned int blktrace_seq __read_mostly = 1;

/*
 * Send out a notify for this process, if we haven't done so since a trace
 * started
 */
static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk)
{
	struct blk_io_trace *t;

	t = relay_reserve(bt->rchan, sizeof(*t) + sizeof(tsk->comm));
	if (t) {
		t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
		t->device = bt->dev;
		t->action = BLK_TC_ACT(BLK_TC_NOTIFY);
		t->pid = tsk->pid;
		t->cpu = smp_processor_id();
		t->pdu_len = sizeof(tsk->comm);
		memcpy((void *) t + sizeof(*t), tsk->comm, t->pdu_len);
		tsk->btrace_seq = blktrace_seq;
	}
}

static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
			 pid_t pid)
{
	if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0)
		return 1;
	if (sector < bt->start_lba || sector > bt->end_lba)
		return 1;
	if (bt->pid && pid != bt->pid)
		return 1;

	return 0;
}

/*
 * Data direction bit lookup
 */
static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK_TC_WRITE) };

/*
 * Bio action bits of interest
 */
static u32 bio_act[5] __read_mostly = { 0, BLK_TC_ACT(BLK_TC_BARRIER), BLK_TC_ACT(BLK_TC_SYNC), 0, BLK_TC_ACT(BLK_TC_AHEAD) };

/*
 * More could be added as needed, taking care to increment the decrementer
 * to get correct indexing
 */
#define trace_barrier_bit(rw)	\
	(((rw) & (1 << BIO_RW_BARRIER)) >> (BIO_RW_BARRIER - 0))
#define trace_sync_bit(rw)	\
	(((rw) & (1 << BIO_RW_SYNC)) >> (BIO_RW_SYNC - 1))
#define trace_ahead_bit(rw)	\
	(((rw) & (1 << BIO_RW_AHEAD)) << (2 - BIO_RW_AHEAD))

/*
 * The worker for the various blk_add_trace*() types. Fills out a
 * blk_io_trace structure and places it in a per-cpu subbuffer.
 */
void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
		     int rw, u32 what, int error, int pdu_len, void *pdu_data)
{
	struct task_struct *tsk = current;
	struct blk_io_trace *t;
	unsigned long flags;
	unsigned long *sequence;
	pid_t pid;
	int cpu;

	if (unlikely(bt->trace_state != Blktrace_running))
		return;

	what |= ddir_act[rw & WRITE];
	what |= bio_act[trace_barrier_bit(rw)];
	what |= bio_act[trace_sync_bit(rw)];
	what |= bio_act[trace_ahead_bit(rw)];

	pid = tsk->pid;
	if (unlikely(act_log_check(bt, what, sector, pid)))
		return;

	/*
	 * A word about the locking here - we disable interrupts to reserve
	 * some space in the relay per-cpu buffer, to prevent an irq
	 * from coming in and stepping on our toes. Once reserved, it's
	 * enough to get preemption disabled to prevent read of this data
	 * before we are through filling it. get_cpu()/put_cpu() does this
	 * for us
	 */
	local_irq_save(flags);

	if (unlikely(tsk->btrace_seq != blktrace_seq))
		trace_note_tsk(bt, tsk);

	t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);
	if (t) {
		cpu = smp_processor_id();
		sequence = per_cpu_ptr(bt->sequence, cpu);

		t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
		t->sequence = ++(*sequence);
		t->time = sched_clock() - per_cpu(blk_trace_cpu_offset, cpu);
		t->sector = sector;
		t->bytes = bytes;
		t->action = what;
		t->pid = pid;
		t->device = bt->dev;
		t->cpu = cpu;
		t->error = error;
		t->pdu_len = pdu_len;

		if (pdu_len)
			memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
	}

	local_irq_restore(flags);
}

EXPORT_SYMBOL_GPL(__blk_add_trace);

static struct dentry *blk_tree_root;
static struct mutex blk_tree_mutex;
static unsigned int root_users;

static inline void blk_remove_root(void)
{
	if (blk_tree_root) {
		debugfs_remove(blk_tree_root);
		blk_tree_root = NULL;
	}
}

static void blk_remove_tree(struct dentry *dir)
{
	mutex_lock(&blk_tree_mutex);
	debugfs_remove(dir);
	if (--root_users == 0)
		blk_remove_root();
	mutex_unlock(&blk_tree_mutex);
}

static struct dentry *blk_create_tree(const char *blk_name)
{
	struct dentry *dir = NULL;

	mutex_lock(&blk_tree_mutex);

	if (!blk_tree_root) {
		blk_tree_root = debugfs_create_dir("block", NULL);
		if (!blk_tree_root)
			goto err;
	}

	dir = debugfs_create_dir(blk_name, blk_tree_root);
	if (dir)
		root_users++;
	else
		blk_remove_root();

err:
	mutex_unlock(&blk_tree_mutex);
	return dir;
}

static void blk_trace_cleanup(struct blk_trace *bt)
{
	relay_close(bt->rchan);
	debugfs_remove(bt->dropped_file);
	blk_remove_tree(bt->dir);
	free_percpu(bt->sequence);
	kfree(bt);
}

static int blk_trace_remove(request_queue_t *q)
{
	struct blk_trace *bt;

	bt = xchg(&q->blk_trace, NULL);
	if (!bt)
		return -EINVAL;

	if (bt->trace_state == Blktrace_setup ||
	    bt->trace_state == Blktrace_stopped)
		blk_trace_cleanup(bt);

	return 0;
}

static int blk_dropped_open(struct inode *inode, struct file *filp)
{
	filp->private_data = inode->u.generic_ip;

	return 0;
}

static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
				size_t count, loff_t *ppos)
{
	struct blk_trace *bt = filp->private_data;
	char buf[16];

	snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped));

	return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf));
}

static struct file_operations blk_dropped_fops = {
	.owner =	THIS_MODULE,
	.open =		blk_dropped_open,
	.read =		blk_dropped_read,
};

/*
 * Keep track of how many times we encountered a full subbuffer, to aid
 * the user space app in telling how many lost events there were.
 */
static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
				     void *prev_subbuf, size_t prev_padding)
{
	struct blk_trace *bt;

	if (!relay_buf_full(buf))
		return 1;

	bt = buf->chan->private_data;
	atomic_inc(&bt->dropped);
	return 0;
}

static int blk_remove_buf_file_callback(struct dentry *dentry)
{
	debugfs_remove(dentry);
	return 0;
}

static struct dentry *blk_create_buf_file_callback(const char *filename,
						   struct dentry *parent,
						   int mode,
						   struct rchan_buf *buf,
						   int *is_global)
{
	return debugfs_create_file(filename, mode, parent, buf,
				   &relay_file_operations);
}

static struct rchan_callbacks blk_relay_callbacks = {
	.subbuf_start		= blk_subbuf_start_callback,
	.create_buf_file	= blk_create_buf_file_callback,
	.remove_buf_file	= blk_remove_buf_file_callback,
};

/*
 * Setup everything required to start tracing
 */
static int blk_trace_setup(request_queue_t *q, struct block_device *bdev,
			   char __user *arg)
{
	struct blk_user_trace_setup buts;
	struct blk_trace *old_bt, *bt = NULL;
	struct dentry *dir = NULL;
	char b[BDEVNAME_SIZE];
	int ret, i;

	if (copy_from_user(&buts, arg, sizeof(buts)))
		return -EFAULT;

	if (!buts.buf_size || !buts.buf_nr)
		return -EINVAL;

	strcpy(buts.name, bdevname(bdev, b));

	/*
	 * some device names have larger paths - convert the slashes
	 * to underscores for this to work as expected
	 */
	for (i = 0; i < strlen(buts.name); i++)
		if (buts.name[i] == '/')
			buts.name[i] = '_';

	if (copy_to_user(arg, &buts, sizeof(buts)))
		return -EFAULT;

	ret = -ENOMEM;
	bt = kzalloc(sizeof(*bt), GFP_KERNEL);
	if (!bt)
		goto err;

	bt->sequence = alloc_percpu(unsigned long);
	if (!bt->sequence)
		goto err;

	ret = -ENOENT;
	dir = blk_create_tree(buts.name);
	if (!dir)
		goto err;

	bt->dir = dir;
	bt->dev = bdev->bd_dev;
	atomic_set(&bt->dropped, 0);

	ret = -EIO;
	bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops);
	if (!bt->dropped_file)
		goto err;

	bt->rchan = relay_open("trace", dir, buts.buf_size, buts.buf_nr, &blk_relay_callbacks);
	if (!bt->rchan)
		goto err;
	bt->rchan->private_data = bt;

	bt->act_mask = buts.act_mask;
	if (!bt->act_mask)
		bt->act_mask = (u16) -1;

	bt->start_lba = buts.start_lba;
	bt->end_lba = buts.end_lba;
	if (!bt->end_lba)
		bt->end_lba = -1ULL;

	bt->pid = buts.pid;
	bt->trace_state = Blktrace_setup;

	ret = -EBUSY;
	old_bt = xchg(&q->blk_trace, bt);
	if (old_bt) {
		(void) xchg(&q->blk_trace, old_bt);
		goto err;
	}

	return 0;
err:
	if (dir)
		blk_remove_tree(dir);
	if (bt) {
		if (bt->dropped_file)
			debugfs_remove(bt->dropped_file);
		if (bt->sequence)
			free_percpu(bt->sequence);
		if (bt->rchan)
			relay_close(bt->rchan);
		kfree(bt);
	}
	return ret;
}

static int blk_trace_startstop(request_queue_t *q, int start)
{
	struct blk_trace *bt;
	int ret;

	if ((bt = q->blk_trace) == NULL)
		return -EINVAL;

	/*
	 * For starting a trace, we can transition from a setup or stopped
	 * trace. For stopping a trace, the state must be running
	 */
	ret = -EINVAL;
	if (start) {
		if (bt->trace_state == Blktrace_setup ||
		    bt->trace_state == Blktrace_stopped) {
			blktrace_seq++;
			smp_mb();
			bt->trace_state = Blktrace_running;
			ret = 0;
		}
	} else {
		if (bt->trace_state == Blktrace_running) {
			bt->trace_state = Blktrace_stopped;
			relay_flush(bt->rchan);
			ret = 0;
		}
	}

	return ret;
}

/**
 * blk_trace_ioctl: - handle the ioctls associated with tracing
 * @bdev:	the block device
 * @cmd:	the ioctl cmd
 * @arg:	the argument data, if any
 *
 **/
int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
{
	request_queue_t *q;
	int ret, start = 0;

	q = bdev_get_queue(bdev);
	if (!q)
		return -ENXIO;

	mutex_lock(&bdev->bd_mutex);

	switch (cmd) {
	case BLKTRACESETUP:
		ret = blk_trace_setup(q, bdev, arg);
		break;
	case BLKTRACESTART:
		start = 1;
	case BLKTRACESTOP:
		ret = blk_trace_startstop(q, start);
		break;
	case BLKTRACETEARDOWN:
		ret = blk_trace_remove(q);
		break;
	default:
		ret = -ENOTTY;
		break;
	}

	mutex_unlock(&bdev->bd_mutex);
	return ret;
}

/**
 * blk_trace_shutdown: - stop and cleanup trace structures
 * @q:	the request queue associated with the device
 *
 **/
void blk_trace_shutdown(request_queue_t *q)
{
	blk_trace_startstop(q, 0);
	blk_trace_remove(q);
}

/*
 * Average offset over two calls to sched_clock() with a gettimeofday()
 * in the middle
 */
static void blk_check_time(unsigned long long *t)
{
	unsigned long long a, b;
	struct timeval tv;

	a = sched_clock();
	do_gettimeofday(&tv);
	b = sched_clock();

	*t = tv.tv_sec * 1000000000 + tv.tv_usec * 1000;
	*t -= (a + b) / 2;
}

static void blk_trace_check_cpu_time(void *data)
{
	unsigned long long *t;
	int cpu = get_cpu();

	t = &per_cpu(blk_trace_cpu_offset, cpu);

	/*
	 * Just call it twice, hopefully the second call will be cache hot
	 * and a little more precise
	 */
	blk_check_time(t);
	blk_check_time(t);

	put_cpu();
}

/*
 * Call blk_trace_check_cpu_time() on each CPU to calibrate our inter-CPU
 * timings
 */
static void blk_trace_calibrate_offsets(void)
{
	unsigned long flags;

	smp_call_function(blk_trace_check_cpu_time, NULL, 1, 1);
	local_irq_save(flags);
	blk_trace_check_cpu_time(NULL);
	local_irq_restore(flags);
}

static void blk_trace_set_ht_offsets(void)
{
#if defined(CONFIG_SCHED_SMT)
	int cpu, i;

	/*
	 * now make sure HT siblings have the same time offset
	 */
	preempt_disable();
	for_each_online_cpu(cpu) {
		unsigned long long *cpu_off, *sibling_off;

		for_each_cpu_mask(i, cpu_sibling_map[cpu]) {
			if (i == cpu)
				continue;

			cpu_off = &per_cpu(blk_trace_cpu_offset, cpu);
			sibling_off = &per_cpu(blk_trace_cpu_offset, i);
			*sibling_off = *cpu_off;
		}
	}
	preempt_enable();
#endif
}

static __init int blk_trace_init(void)
{
	mutex_init(&blk_tree_mutex);
	blk_trace_calibrate_offsets();
	blk_trace_set_ht_offsets();

	return 0;
}

module_init(blk_trace_init);
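
For context, the four ioctls dispatched in blk_trace_ioctl() above are what a userspace tracer drives. The following is a minimal sketch of that flow, not the actual blktrace(8) implementation: it assumes struct blk_user_trace_setup and the BLKTRACE* constants are visible to userspace (via <linux/blktrace_api.h> and <linux/fs.h>), and /dev/sda is only a stand-in device path; most error handling is trimmed.

/*
 * Hypothetical userspace sketch of the blktrace ioctl sequence:
 * BLKTRACESETUP -> BLKTRACESTART -> BLKTRACESTOP -> BLKTRACETEARDOWN.
 * Device path and buffer sizing below are illustrative assumptions.
 */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/blktrace_api.h>

int main(void)
{
	struct blk_user_trace_setup buts;
	int fd;

	fd = open("/dev/sda", O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&buts, 0, sizeof(buts));
	buts.buf_size = 512 * 1024;	/* bytes per relay subbuffer */
	buts.buf_nr = 4;		/* subbuffers per cpu */
	buts.act_mask = 0;		/* 0: blk_trace_setup() widens to all actions */

	/* creates the debugfs block/<name>/ directory with trace and dropped files */
	if (ioctl(fd, BLKTRACESETUP, &buts) < 0) {
		perror("BLKTRACESETUP");
		return 1;
	}
	/* the kernel copies the sanitized device name back into buts.name */
	printf("tracing as %s\n", buts.name);

	ioctl(fd, BLKTRACESTART);
	sleep(5);			/* let some I/O happen */
	ioctl(fd, BLKTRACESTOP);
	ioctl(fd, BLKTRACETEARDOWN);

	close(fd);
	return 0;
}

The real blktrace(8) utility follows the same sequence, and additionally reads the per-cpu relay files (created through blk_create_buf_file_callback() under debugfs) while the trace runs; the "dropped" file served by blk_dropped_read() tells it how many events were lost to full subbuffers.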