Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

dm stats: support precise timestamps

Make it possible to use precise timestamps with nanosecond granularity
in dm statistics.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>

Authored by Mikulas Patocka and committed by Mike Snitzer
c96aec34 dd4c1b7d

+127 -43
+24 -4
Documentation/device-mapper/statistics.txt
··· 13 13 The I/O statistics counters for each step-sized area of a region are 14 14 in the same format as /sys/block/*/stat or /proc/diskstats (see: 15 15 Documentation/iostats.txt). But two extra counters (12 and 13) are 16 - provided: total time spent reading and writing in milliseconds. All 17 - these counters may be accessed by sending the @stats_print message to 18 - the appropriate DM device via dmsetup. 16 + provided: total time spent reading and writing. All these counters may 17 + be accessed by sending the @stats_print message to the appropriate DM 18 + device via dmsetup. 19 + 20 + The reported times are in milliseconds and the granularity depends on 21 + the kernel ticks. When the option precise_timestamps is used, the 22 + reported times are in nanoseconds. 19 23 20 24 Each region has a corresponding unique identifier, which we call a 21 25 region_id, that is assigned when the region is created. The region_id ··· 37 33 Messages 38 34 ======== 39 35 40 - @stats_create <range> <step> [<program_id> [<aux_data>]] 36 + @stats_create <range> <step> 37 + [<number_of_optional_arguments> <optional_arguments>...] 38 + [<program_id> [<aux_data>]] 41 39 42 40 Create a new region and return the region_id. 43 41 ··· 54 48 "/<number_of_areas>" - the range is subdivided into the specified 55 49 number of areas. 56 50 51 + <number_of_optional_arguments> 52 + The number of optional arguments 53 + 54 + <optional_arguments> 55 + The following optional arguments are supported 56 + precise_timestamps - use precise timer with nanosecond resolution 57 + instead of the "jiffies" variable. When this argument is 58 + used, the resulting times are in nanoseconds instead of 59 + milliseconds. Precise timestamps are a little bit slower 60 + to obtain than jiffies-based timestamps. 61 + 57 62 <program_id> 58 63 An optional parameter. A name that uniquely identifies 59 64 the userspace owner of the range. 
This groups ranges together ··· 72 55 created and ignore those created by others. 73 56 The kernel returns this string back in the output of 74 57 @stats_list message, but it doesn't use it for anything else. 58 + If we omit the number of optional arguments, program id must not 59 + be a number, otherwise it would be interpreted as the number of 60 + optional arguments. 75 61 76 62 <aux_data> 77 63 An optional parameter. A word that provides auxiliary data
+100 -38
drivers/md/dm-stats.c
··· 33 33 34 34 struct dm_stat_shared { 35 35 atomic_t in_flight[2]; 36 - unsigned long stamp; 36 + unsigned long long stamp; 37 37 struct dm_stat_percpu tmp; 38 38 }; 39 39 40 40 struct dm_stat { 41 41 struct list_head list_entry; 42 42 int id; 43 + unsigned stat_flags; 43 44 size_t n_entries; 44 45 sector_t start; 45 46 sector_t end; ··· 53 52 struct dm_stat_percpu *stat_percpu[NR_CPUS]; 54 53 struct dm_stat_shared stat_shared[0]; 55 54 }; 55 + 56 + #define STAT_PRECISE_TIMESTAMPS 1 56 57 57 58 struct dm_stats_last_position { 58 59 sector_t last_sector; ··· 227 224 } 228 225 229 226 static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end, 230 - sector_t step, const char *program_id, const char *aux_data, 227 + sector_t step, unsigned stat_flags, 228 + const char *program_id, const char *aux_data, 231 229 void (*suspend_callback)(struct mapped_device *), 232 230 void (*resume_callback)(struct mapped_device *), 233 231 struct mapped_device *md) ··· 269 265 if (!s) 270 266 return -ENOMEM; 271 267 268 + s->stat_flags = stat_flags; 272 269 s->n_entries = n_entries; 273 270 s->start = start; 274 271 s->end = end; ··· 419 414 return 1; 420 415 } 421 416 422 - static void dm_stat_round(struct dm_stat_shared *shared, struct dm_stat_percpu *p) 417 + static void dm_stat_round(struct dm_stat *s, struct dm_stat_shared *shared, 418 + struct dm_stat_percpu *p) 423 419 { 424 420 /* 425 421 * This is racy, but so is part_round_stats_single. 
426 422 */ 427 - unsigned long now = jiffies; 428 - unsigned in_flight_read; 429 - unsigned in_flight_write; 430 - unsigned long difference = now - shared->stamp; 423 + unsigned long long now, difference; 424 + unsigned in_flight_read, in_flight_write; 431 425 426 + if (likely(!(s->stat_flags & STAT_PRECISE_TIMESTAMPS))) 427 + now = jiffies; 428 + else 429 + now = ktime_to_ns(ktime_get()); 430 + 431 + difference = now - shared->stamp; 432 432 if (!difference) 433 433 return; 434 + 434 435 in_flight_read = (unsigned)atomic_read(&shared->in_flight[READ]); 435 436 in_flight_write = (unsigned)atomic_read(&shared->in_flight[WRITE]); 436 437 if (in_flight_read) ··· 451 440 } 452 441 453 442 static void dm_stat_for_entry(struct dm_stat *s, size_t entry, 454 - unsigned long bi_rw, sector_t len, bool merged, 455 - bool end, unsigned long duration) 443 + unsigned long bi_rw, sector_t len, 444 + struct dm_stats_aux *stats_aux, bool end, 445 + unsigned long duration_jiffies) 456 446 { 457 447 unsigned long idx = bi_rw & REQ_WRITE; 458 448 struct dm_stat_shared *shared = &s->stat_shared[entry]; ··· 483 471 p = &s->stat_percpu[smp_processor_id()][entry]; 484 472 485 473 if (!end) { 486 - dm_stat_round(shared, p); 474 + dm_stat_round(s, shared, p); 487 475 atomic_inc(&shared->in_flight[idx]); 488 476 } else { 489 - dm_stat_round(shared, p); 477 + dm_stat_round(s, shared, p); 490 478 atomic_dec(&shared->in_flight[idx]); 491 479 p->sectors[idx] += len; 492 480 p->ios[idx] += 1; 493 - p->merges[idx] += merged; 494 - p->ticks[idx] += duration; 481 + p->merges[idx] += stats_aux->merged; 482 + if (!(s->stat_flags & STAT_PRECISE_TIMESTAMPS)) 483 + p->ticks[idx] += duration_jiffies; 484 + else 485 + p->ticks[idx] += stats_aux->duration_ns; 495 486 } 496 487 497 488 #if BITS_PER_LONG == 32 ··· 506 491 507 492 static void __dm_stat_bio(struct dm_stat *s, unsigned long bi_rw, 508 493 sector_t bi_sector, sector_t end_sector, 509 - bool end, unsigned long duration, 494 + bool end, unsigned 
long duration_jiffies, 510 495 struct dm_stats_aux *stats_aux) 511 496 { 512 497 sector_t rel_sector, offset, todo, fragment_len; ··· 535 520 if (fragment_len > s->step - offset) 536 521 fragment_len = s->step - offset; 537 522 dm_stat_for_entry(s, entry, bi_rw, fragment_len, 538 - stats_aux->merged, end, duration); 523 + stats_aux, end, duration_jiffies); 539 524 todo -= fragment_len; 540 525 entry++; 541 526 offset = 0; ··· 544 529 545 530 void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw, 546 531 sector_t bi_sector, unsigned bi_sectors, bool end, 547 - unsigned long duration, struct dm_stats_aux *stats_aux) 532 + unsigned long duration_jiffies, 533 + struct dm_stats_aux *stats_aux) 548 534 { 549 535 struct dm_stat *s; 550 536 sector_t end_sector; 551 537 struct dm_stats_last_position *last; 538 + bool got_precise_time; 552 539 553 540 if (unlikely(!bi_sectors)) 554 541 return; ··· 574 557 575 558 rcu_read_lock(); 576 559 577 - list_for_each_entry_rcu(s, &stats->list, list_entry) 578 - __dm_stat_bio(s, bi_rw, bi_sector, end_sector, end, duration, stats_aux); 560 + got_precise_time = false; 561 + list_for_each_entry_rcu(s, &stats->list, list_entry) { 562 + if (s->stat_flags & STAT_PRECISE_TIMESTAMPS && !got_precise_time) { 563 + if (!end) 564 + stats_aux->duration_ns = ktime_to_ns(ktime_get()); 565 + else 566 + stats_aux->duration_ns = ktime_to_ns(ktime_get()) - stats_aux->duration_ns; 567 + got_precise_time = true; 568 + } 569 + __dm_stat_bio(s, bi_rw, bi_sector, end_sector, end, duration_jiffies, stats_aux); 570 + } 579 571 580 572 rcu_read_unlock(); 581 573 } ··· 597 571 598 572 local_irq_disable(); 599 573 p = &s->stat_percpu[smp_processor_id()][x]; 600 - dm_stat_round(shared, p); 574 + dm_stat_round(s, shared, p); 601 575 local_irq_enable(); 602 576 603 577 memset(&shared->tmp, 0, sizeof(shared->tmp)); ··· 669 643 /* 670 644 * This is like jiffies_to_msec, but works for 64-bit values. 
671 645 */ 672 - static unsigned long long dm_jiffies_to_msec64(unsigned long long j) 646 + static unsigned long long dm_jiffies_to_msec64(struct dm_stat *s, unsigned long long j) 673 647 { 674 - unsigned long long result = 0; 648 + unsigned long long result; 675 649 unsigned mult; 676 650 651 + if (s->stat_flags & STAT_PRECISE_TIMESTAMPS) 652 + return j; 653 + 654 + result = 0; 677 655 if (j) 678 656 result = jiffies_to_msecs(j & 0x3fffff); 679 657 if (j >= 1 << 22) { ··· 739 709 shared->tmp.ios[READ], 740 710 shared->tmp.merges[READ], 741 711 shared->tmp.sectors[READ], 742 - dm_jiffies_to_msec64(shared->tmp.ticks[READ]), 712 + dm_jiffies_to_msec64(s, shared->tmp.ticks[READ]), 743 713 shared->tmp.ios[WRITE], 744 714 shared->tmp.merges[WRITE], 745 715 shared->tmp.sectors[WRITE], 746 - dm_jiffies_to_msec64(shared->tmp.ticks[WRITE]), 716 + dm_jiffies_to_msec64(s, shared->tmp.ticks[WRITE]), 747 717 dm_stat_in_flight(shared), 748 - dm_jiffies_to_msec64(shared->tmp.io_ticks_total), 749 - dm_jiffies_to_msec64(shared->tmp.time_in_queue), 750 - dm_jiffies_to_msec64(shared->tmp.io_ticks[READ]), 751 - dm_jiffies_to_msec64(shared->tmp.io_ticks[WRITE])); 718 + dm_jiffies_to_msec64(s, shared->tmp.io_ticks_total), 719 + dm_jiffies_to_msec64(s, shared->tmp.time_in_queue), 720 + dm_jiffies_to_msec64(s, shared->tmp.io_ticks[READ]), 721 + dm_jiffies_to_msec64(s, shared->tmp.io_ticks[WRITE])); 752 722 753 723 if (unlikely(sz + 1 >= maxlen)) 754 724 goto buffer_overflow; ··· 799 769 unsigned long long start, end, len, step; 800 770 unsigned divisor; 801 771 const char *program_id, *aux_data; 772 + unsigned stat_flags = 0; 773 + 774 + struct dm_arg_set as, as_backup; 775 + const char *a; 776 + unsigned feature_args; 802 777 803 778 /* 804 779 * Input format: 805 - * <range> <step> [<program_id> [<aux_data>]] 780 + * <range> <step> [<extra_parameters> <parameters>] [<program_id> [<aux_data>]] 806 781 */ 807 782 808 - if (argc < 3 || argc > 5) 783 + if (argc < 3) 809 784 return -EINVAL; 
810 785 811 - if (!strcmp(argv[1], "-")) { 786 + as.argc = argc; 787 + as.argv = argv; 788 + dm_consume_args(&as, 1); 789 + 790 + a = dm_shift_arg(&as); 791 + if (!strcmp(a, "-")) { 812 792 start = 0; 813 793 len = dm_get_size(md); 814 794 if (!len) 815 795 len = 1; 816 - } else if (sscanf(argv[1], "%llu+%llu%c", &start, &len, &dummy) != 2 || 796 + } else if (sscanf(a, "%llu+%llu%c", &start, &len, &dummy) != 2 || 817 797 start != (sector_t)start || len != (sector_t)len) 818 798 return -EINVAL; 819 799 ··· 831 791 if (start >= end) 832 792 return -EINVAL; 833 793 834 - if (sscanf(argv[2], "/%u%c", &divisor, &dummy) == 1) { 794 + a = dm_shift_arg(&as); 795 + if (sscanf(a, "/%u%c", &divisor, &dummy) == 1) { 835 796 if (!divisor) 836 797 return -EINVAL; 837 798 step = end - start; ··· 840 799 step++; 841 800 if (!step) 842 801 step = 1; 843 - } else if (sscanf(argv[2], "%llu%c", &step, &dummy) != 1 || 802 + } else if (sscanf(a, "%llu%c", &step, &dummy) != 1 || 844 803 step != (sector_t)step || !step) 845 804 return -EINVAL; 805 + 806 + as_backup = as; 807 + a = dm_shift_arg(&as); 808 + if (a && sscanf(a, "%u%c", &feature_args, &dummy) == 1) { 809 + while (feature_args--) { 810 + a = dm_shift_arg(&as); 811 + if (!a) 812 + return -EINVAL; 813 + if (!strcasecmp(a, "precise_timestamps")) 814 + stat_flags |= STAT_PRECISE_TIMESTAMPS; 815 + else 816 + return -EINVAL; 817 + } 818 + } else { 819 + as = as_backup; 820 + } 846 821 847 822 program_id = "-"; 848 823 aux_data = "-"; 849 824 850 - if (argc > 3) 851 - program_id = argv[3]; 825 + a = dm_shift_arg(&as); 826 + if (a) 827 + program_id = a; 852 828 853 - if (argc > 4) 854 - aux_data = argv[4]; 829 + a = dm_shift_arg(&as); 830 + if (a) 831 + aux_data = a; 832 + 833 + if (as.argc) 834 + return -EINVAL; 855 835 856 836 /* 857 837 * If a buffer overflow happens after we created the region, ··· 884 822 if (dm_message_test_buffer_overflow(result, maxlen)) 885 823 return 1; 886 824 887 - id = dm_stats_create(dm_get_stats(md), 
start, end, step, program_id, aux_data, 825 + id = dm_stats_create(dm_get_stats(md), start, end, step, stat_flags, program_id, aux_data, 888 826 dm_internal_suspend_fast, dm_internal_resume_fast, md); 889 827 if (id < 0) 890 828 return id;
+3 -1
drivers/md/dm-stats.h
··· 18 18 19 19 struct dm_stats_aux { 20 20 bool merged; 21 + unsigned long long duration_ns; 21 22 }; 22 23 23 24 void dm_stats_init(struct dm_stats *st); ··· 31 30 32 31 void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw, 33 32 sector_t bi_sector, unsigned bi_sectors, bool end, 34 - unsigned long duration, struct dm_stats_aux *aux); 33 + unsigned long duration_jiffies, 34 + struct dm_stats_aux *aux); 35 35 36 36 static inline bool dm_stats_used(struct dm_stats *st) 37 37 {