Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

dm stats: collect and report histogram of IO latencies

Add an option to dm statistics to collect and report a histogram of
IO latencies.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>

Authored by Mikulas Patocka and committed by Mike Snitzer
dfcfac3e c96aec34

+199 -25
+16 -3
Documentation/device-mapper/statistics.txt
··· 13 13 The I/O statistics counters for each step-sized area of a region are 14 14 in the same format as /sys/block/*/stat or /proc/diskstats (see: 15 15 Documentation/iostats.txt). But two extra counters (12 and 13) are 16 - provided: total time spent reading and writing. All these counters may 17 - be accessed by sending the @stats_print message to the appropriate DM 18 - device via dmsetup. 16 + provided: total time spent reading and writing. When the histogram 17 + argument is used, the 14th parameter is reported that represents the 18 + histogram of latencies. All these counters may be accessed by sending 19 + the @stats_print message to the appropriate DM device via dmsetup. 19 20 20 21 The reported times are in milliseconds and the granularity depends on 21 22 the kernel ticks. When the option precise_timestamps is used, the ··· 65 64 used, the resulting times are in nanoseconds instead of 66 65 milliseconds. Precise timestamps are a little bit slower 67 66 to obtain than jiffies-based timestamps. 67 + histogram:n1,n2,n3,n4,... - collect histogram of latencies. The 68 + numbers n1, n2, etc are times that represent the boundaries 69 + of the histogram. If precise_timestamps is not used, the 70 + times are in milliseconds, otherwise they are in 71 + nanoseconds. For each range, the kernel will report the 72 + number of requests that completed within this range. For 73 + example, if we use "histogram:10,20,30", the kernel will 74 + report four numbers a:b:c:d. a is the number of requests 75 + that took 0-10 ms to complete, b is the number of requests 76 + that took 10-20 ms to complete, c is the number of requests 77 + that took 20-30 ms to complete and d is the number of 78 + requests that took more than 30 ms to complete. 68 79 69 80 <program_id> 70 81 An optional parameter. A name that uniquely identifies
+183 -22
drivers/md/dm-stats.c
··· 29 29 unsigned long long io_ticks[2]; 30 30 unsigned long long io_ticks_total; 31 31 unsigned long long time_in_queue; 32 + unsigned long long *histogram; 32 33 }; 33 34 34 35 struct dm_stat_shared { ··· 46 45 sector_t start; 47 46 sector_t end; 48 47 sector_t step; 48 + unsigned n_histogram_entries; 49 + unsigned long long *histogram_boundaries; 49 50 const char *program_id; 50 51 const char *aux_data; 51 52 struct rcu_head rcu_head; 52 53 size_t shared_alloc_size; 53 54 size_t percpu_alloc_size; 55 + size_t histogram_alloc_size; 54 56 struct dm_stat_percpu *stat_percpu[NR_CPUS]; 55 57 struct dm_stat_shared stat_shared[0]; 56 58 }; ··· 177 173 178 174 kfree(s->program_id); 179 175 kfree(s->aux_data); 180 - for_each_possible_cpu(cpu) 176 + for_each_possible_cpu(cpu) { 177 + dm_kvfree(s->stat_percpu[cpu][0].histogram, s->histogram_alloc_size); 181 178 dm_kvfree(s->stat_percpu[cpu], s->percpu_alloc_size); 179 + } 180 + dm_kvfree(s->stat_shared[0].tmp.histogram, s->histogram_alloc_size); 182 181 dm_kvfree(s, s->shared_alloc_size); 183 182 } 184 183 ··· 235 228 236 229 static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end, 237 230 sector_t step, unsigned stat_flags, 231 + unsigned n_histogram_entries, 232 + unsigned long long *histogram_boundaries, 238 233 const char *program_id, const char *aux_data, 239 234 void (*suspend_callback)(struct mapped_device *), 240 235 void (*resume_callback)(struct mapped_device *), ··· 248 239 size_t ni; 249 240 size_t shared_alloc_size; 250 241 size_t percpu_alloc_size; 242 + size_t histogram_alloc_size; 251 243 struct dm_stat_percpu *p; 252 244 int cpu; 253 245 int ret_id; ··· 272 262 if (percpu_alloc_size / sizeof(struct dm_stat_percpu) != n_entries) 273 263 return -EOVERFLOW; 274 264 275 - if (!check_shared_memory(shared_alloc_size + num_possible_cpus() * percpu_alloc_size)) 265 + histogram_alloc_size = (n_histogram_entries + 1) * (size_t)n_entries * sizeof(unsigned long long); 266 + if 
(histogram_alloc_size / (n_histogram_entries + 1) != (size_t)n_entries * sizeof(unsigned long long)) 267 + return -EOVERFLOW; 268 + 269 + if (!check_shared_memory(shared_alloc_size + histogram_alloc_size + 270 + num_possible_cpus() * (percpu_alloc_size + histogram_alloc_size))) 276 271 return -ENOMEM; 277 272 278 273 s = dm_kvzalloc(shared_alloc_size, NUMA_NO_NODE); ··· 291 276 s->step = step; 292 277 s->shared_alloc_size = shared_alloc_size; 293 278 s->percpu_alloc_size = percpu_alloc_size; 279 + s->histogram_alloc_size = histogram_alloc_size; 280 + 281 + s->n_histogram_entries = n_histogram_entries; 282 + s->histogram_boundaries = kmemdup(histogram_boundaries, 283 + s->n_histogram_entries * sizeof(unsigned long long), GFP_KERNEL); 284 + if (!s->histogram_boundaries) { 285 + r = -ENOMEM; 286 + goto out; 287 + } 294 288 295 289 s->program_id = kstrdup(program_id, GFP_KERNEL); 296 290 if (!s->program_id) { ··· 317 293 atomic_set(&s->stat_shared[ni].in_flight[WRITE], 0); 318 294 } 319 295 296 + if (s->n_histogram_entries) { 297 + unsigned long long *hi; 298 + hi = dm_kvzalloc(s->histogram_alloc_size, NUMA_NO_NODE); 299 + if (!hi) { 300 + r = -ENOMEM; 301 + goto out; 302 + } 303 + for (ni = 0; ni < n_entries; ni++) { 304 + s->stat_shared[ni].tmp.histogram = hi; 305 + hi += s->n_histogram_entries + 1; 306 + } 307 + } 308 + 320 309 for_each_possible_cpu(cpu) { 321 310 p = dm_kvzalloc(percpu_alloc_size, cpu_to_node(cpu)); 322 311 if (!p) { ··· 337 300 goto out; 338 301 } 339 302 s->stat_percpu[cpu] = p; 303 + if (s->n_histogram_entries) { 304 + unsigned long long *hi; 305 + hi = dm_kvzalloc(s->histogram_alloc_size, cpu_to_node(cpu)); 306 + if (!hi) { 307 + r = -ENOMEM; 308 + goto out; 309 + } 310 + for (ni = 0; ni < n_entries; ni++) { 311 + p[ni].histogram = hi; 312 + hi += s->n_histogram_entries + 1; 313 + } 314 + } 340 315 } 341 316 342 317 /* ··· 426 377 * vfree can't be called from RCU callback 427 378 */ 428 379 for_each_possible_cpu(cpu) 429 - if 
(is_vmalloc_addr(s->stat_percpu)) 380 + if (is_vmalloc_addr(s->stat_percpu) || 381 + is_vmalloc_addr(s->stat_percpu[cpu][0].histogram)) 430 382 goto do_sync_free; 431 - if (is_vmalloc_addr(s)) { 383 + if (is_vmalloc_addr(s) || 384 + is_vmalloc_addr(s->stat_shared[0].tmp.histogram)) { 432 385 do_sync_free: 433 386 synchronize_rcu_expedited(); 434 387 dm_stat_free(&s->rcu_head); ··· 537 486 dm_stat_round(s, shared, p); 538 487 atomic_inc(&shared->in_flight[idx]); 539 488 } else { 489 + unsigned long long duration; 540 490 dm_stat_round(s, shared, p); 541 491 atomic_dec(&shared->in_flight[idx]); 542 492 p->sectors[idx] += len; 543 493 p->ios[idx] += 1; 544 494 p->merges[idx] += stats_aux->merged; 545 - if (!(s->stat_flags & STAT_PRECISE_TIMESTAMPS)) 495 + if (!(s->stat_flags & STAT_PRECISE_TIMESTAMPS)) { 546 496 p->ticks[idx] += duration_jiffies; 547 - else 497 + duration = jiffies_to_msecs(duration_jiffies); 498 + } else { 548 499 p->ticks[idx] += stats_aux->duration_ns; 500 + duration = stats_aux->duration_ns; 501 + } 502 + if (s->n_histogram_entries) { 503 + unsigned lo = 0, hi = s->n_histogram_entries + 1; 504 + while (lo + 1 < hi) { 505 + unsigned mid = (lo + hi) / 2; 506 + if (s->histogram_boundaries[mid - 1] > duration) { 507 + hi = mid; 508 + } else { 509 + lo = mid; 510 + } 511 + 512 + } 513 + p->histogram[lo]++; 514 + } 549 515 } 550 516 551 517 #if BITS_PER_LONG == 32 ··· 668 600 dm_stat_round(s, shared, p); 669 601 local_irq_enable(); 670 602 671 - memset(&shared->tmp, 0, sizeof(shared->tmp)); 603 + shared->tmp.sectors[READ] = 0; 604 + shared->tmp.sectors[WRITE] = 0; 605 + shared->tmp.ios[READ] = 0; 606 + shared->tmp.ios[WRITE] = 0; 607 + shared->tmp.merges[READ] = 0; 608 + shared->tmp.merges[WRITE] = 0; 609 + shared->tmp.ticks[READ] = 0; 610 + shared->tmp.ticks[WRITE] = 0; 611 + shared->tmp.io_ticks[READ] = 0; 612 + shared->tmp.io_ticks[WRITE] = 0; 613 + shared->tmp.io_ticks_total = 0; 614 + shared->tmp.time_in_queue = 0; 615 + 616 + if 
(s->n_histogram_entries) 617 + memset(shared->tmp.histogram, 0, (s->n_histogram_entries + 1) * sizeof(unsigned long long)); 618 + 672 619 for_each_possible_cpu(cpu) { 673 620 p = &s->stat_percpu[cpu][x]; 674 621 shared->tmp.sectors[READ] += ACCESS_ONCE(p->sectors[READ]); ··· 698 615 shared->tmp.io_ticks[WRITE] += ACCESS_ONCE(p->io_ticks[WRITE]); 699 616 shared->tmp.io_ticks_total += ACCESS_ONCE(p->io_ticks_total); 700 617 shared->tmp.time_in_queue += ACCESS_ONCE(p->time_in_queue); 618 + if (s->n_histogram_entries) { 619 + unsigned i; 620 + for (i = 0; i < s->n_histogram_entries + 1; i++) 621 + shared->tmp.histogram[i] += ACCESS_ONCE(p->histogram[i]); 622 + } 701 623 } 702 624 } 703 625 ··· 732 644 p->io_ticks_total -= shared->tmp.io_ticks_total; 733 645 p->time_in_queue -= shared->tmp.time_in_queue; 734 646 local_irq_enable(); 647 + if (s->n_histogram_entries) { 648 + unsigned i; 649 + for (i = 0; i < s->n_histogram_entries + 1; i++) { 650 + local_irq_disable(); 651 + p = &s->stat_percpu[smp_processor_id()][x]; 652 + p->histogram[i] -= shared->tmp.histogram[i]; 653 + local_irq_enable(); 654 + } 655 + } 735 656 } 736 657 } 737 658 ··· 830 733 831 734 __dm_stat_init_temporary_percpu_totals(shared, s, x); 832 735 833 - DMEMIT("%llu+%llu %llu %llu %llu %llu %llu %llu %llu %llu %d %llu %llu %llu %llu\n", 736 + DMEMIT("%llu+%llu %llu %llu %llu %llu %llu %llu %llu %llu %d %llu %llu %llu %llu", 834 737 (unsigned long long)start, 835 738 (unsigned long long)step, 836 739 shared->tmp.ios[READ], ··· 846 749 dm_jiffies_to_msec64(s, shared->tmp.time_in_queue), 847 750 dm_jiffies_to_msec64(s, shared->tmp.io_ticks[READ]), 848 751 dm_jiffies_to_msec64(s, shared->tmp.io_ticks[WRITE])); 752 + if (s->n_histogram_entries) { 753 + unsigned i; 754 + for (i = 0; i < s->n_histogram_entries + 1; i++) { 755 + DMEMIT("%s%llu", !i ? 
" " : ":", shared->tmp.histogram[i]); 756 + } 757 + } 758 + DMEMIT("\n"); 849 759 850 760 if (unlikely(sz + 1 >= maxlen)) 851 761 goto buffer_overflow; ··· 894 790 return 0; 895 791 } 896 792 793 + static int parse_histogram(const char *h, unsigned *n_histogram_entries, 794 + unsigned long long **histogram_boundaries) 795 + { 796 + const char *q; 797 + unsigned n; 798 + unsigned long long last; 799 + 800 + *n_histogram_entries = 1; 801 + for (q = h; *q; q++) 802 + if (*q == ',') 803 + (*n_histogram_entries)++; 804 + 805 + *histogram_boundaries = kmalloc(*n_histogram_entries * sizeof(unsigned long long), GFP_KERNEL); 806 + if (!*histogram_boundaries) 807 + return -ENOMEM; 808 + 809 + n = 0; 810 + last = 0; 811 + while (1) { 812 + unsigned long long hi; 813 + int s; 814 + char ch; 815 + s = sscanf(h, "%llu%c", &hi, &ch); 816 + if (!s || (s == 2 && ch != ',')) 817 + return -EINVAL; 818 + if (hi <= last) 819 + return -EINVAL; 820 + last = hi; 821 + (*histogram_boundaries)[n] = hi; 822 + if (s == 1) 823 + return 0; 824 + h = strchr(h, ',') + 1; 825 + n++; 826 + } 827 + } 828 + 897 829 static int message_stats_create(struct mapped_device *md, 898 830 unsigned argc, char **argv, 899 831 char *result, unsigned maxlen) 900 832 { 833 + int r; 901 834 int id; 902 835 char dummy; 903 836 unsigned long long start, end, len, step; 904 837 unsigned divisor; 905 838 const char *program_id, *aux_data; 906 839 unsigned stat_flags = 0; 840 + 841 + unsigned n_histogram_entries = 0; 842 + unsigned long long *histogram_boundaries = NULL; 907 843 908 844 struct dm_arg_set as, as_backup; 909 845 const char *a; ··· 955 811 */ 956 812 957 813 if (argc < 3) 958 - return -EINVAL; 814 + goto ret_einval; 959 815 960 816 as.argc = argc; 961 817 as.argv = argv; ··· 969 825 len = 1; 970 826 } else if (sscanf(a, "%llu+%llu%c", &start, &len, &dummy) != 2 || 971 827 start != (sector_t)start || len != (sector_t)len) 972 - return -EINVAL; 828 + goto ret_einval; 973 829 974 830 end = start + len; 975 
831 if (start >= end) 976 - return -EINVAL; 832 + goto ret_einval; 977 833 978 834 a = dm_shift_arg(&as); 979 835 if (sscanf(a, "/%u%c", &divisor, &dummy) == 1) { ··· 986 842 step = 1; 987 843 } else if (sscanf(a, "%llu%c", &step, &dummy) != 1 || 988 844 step != (sector_t)step || !step) 989 - return -EINVAL; 845 + goto ret_einval; 990 846 991 847 as_backup = as; 992 848 a = dm_shift_arg(&as); ··· 994 850 while (feature_args--) { 995 851 a = dm_shift_arg(&as); 996 852 if (!a) 997 - return -EINVAL; 853 + goto ret_einval; 998 854 if (!strcasecmp(a, "precise_timestamps")) 999 855 stat_flags |= STAT_PRECISE_TIMESTAMPS; 1000 - else 1001 - return -EINVAL; 856 + else if (!strncasecmp(a, "histogram:", 10)) { 857 + if (n_histogram_entries) 858 + goto ret_einval; 859 + if ((r = parse_histogram(a + 10, &n_histogram_entries, &histogram_boundaries))) 860 + goto ret; 861 + } else 862 + goto ret_einval; 1002 863 } 1003 864 } else { 1004 865 as = as_backup; ··· 1021 872 aux_data = a; 1022 873 1023 874 if (as.argc) 1024 - return -EINVAL; 875 + goto ret_einval; 1025 876 1026 877 /* 1027 878 * If a buffer overflow happens after we created the region, ··· 1030 881 * leaked). So we must detect buffer overflow in advance. 
1031 882 */ 1032 883 snprintf(result, maxlen, "%d", INT_MAX); 1033 - if (dm_message_test_buffer_overflow(result, maxlen)) 1034 - return 1; 884 + if (dm_message_test_buffer_overflow(result, maxlen)) { 885 + r = 1; 886 + goto ret; 887 + } 1035 888 1036 - id = dm_stats_create(dm_get_stats(md), start, end, step, stat_flags, program_id, aux_data, 889 + id = dm_stats_create(dm_get_stats(md), start, end, step, stat_flags, 890 + n_histogram_entries, histogram_boundaries, program_id, aux_data, 1037 891 dm_internal_suspend_fast, dm_internal_resume_fast, md); 1038 - if (id < 0) 1039 - return id; 892 + if (id < 0) { 893 + r = id; 894 + goto ret; 895 + } 1040 896 1041 897 snprintf(result, maxlen, "%d", id); 1042 898 1043 - return 1; 899 + r = 1; 900 + goto ret; 901 + 902 + ret_einval: 903 + r = -EINVAL; 904 + ret: 905 + kfree(histogram_boundaries); 906 + return r; 1044 907 } 1045 908 1046 909 static int message_stats_delete(struct mapped_device *md,