Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

blk-iolatency: use a percentile approach for ssd's

We use an average latency approach for determining if we're missing our
latency target. This works well for rotational storage where we have
generally consistent latencies, but for ssd's and other low latency
devices you have more of a spiky behavior, which means we often won't
throttle misbehaving groups because a lot of IO completes at drastically
faster times than our latency target. Instead keep track of how many
IO's miss our target and how many IO's are done in our time window. If
the p(90) latency is above our target then we know we need to throttle.
With this change in place we are seeing the same throttling behavior
with our testcase on ssd's as we see with rotational drives.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>

authored by

Josef Bacik and committed by
Jens Axboe
1fa2840e 22ed8a93

+145 -34
+145 -34
block/blk-iolatency.c
··· 115 115 atomic_t scale_cookie; 116 116 }; 117 117 118 + struct percentile_stats { 119 + u64 total; 120 + u64 missed; 121 + }; 122 + 123 + struct latency_stat { 124 + union { 125 + struct percentile_stats ps; 126 + struct blk_rq_stat rqs; 127 + }; 128 + }; 129 + 118 130 struct iolatency_grp { 119 131 struct blkg_policy_data pd; 120 - struct blk_rq_stat __percpu *stats; 132 + struct latency_stat __percpu *stats; 121 133 struct blk_iolatency *blkiolat; 122 134 struct rq_depth rq_depth; 123 135 struct rq_wait rq_wait; ··· 144 132 /* Our current number of IO's for the last summation. */ 145 133 u64 nr_samples; 146 134 135 + bool ssd; 147 136 struct child_latency_info child_lat; 148 137 }; 149 138 ··· 183 170 static inline struct blkcg_gq *lat_to_blkg(struct iolatency_grp *iolat) 184 171 { 185 172 return pd_to_blkg(&iolat->pd); 173 + } 174 + 175 + static inline void latency_stat_init(struct iolatency_grp *iolat, 176 + struct latency_stat *stat) 177 + { 178 + if (iolat->ssd) { 179 + stat->ps.total = 0; 180 + stat->ps.missed = 0; 181 + } else 182 + blk_rq_stat_init(&stat->rqs); 183 + } 184 + 185 + static inline void latency_stat_sum(struct iolatency_grp *iolat, 186 + struct latency_stat *sum, 187 + struct latency_stat *stat) 188 + { 189 + if (iolat->ssd) { 190 + sum->ps.total += stat->ps.total; 191 + sum->ps.missed += stat->ps.missed; 192 + } else 193 + blk_rq_stat_sum(&sum->rqs, &stat->rqs); 194 + } 195 + 196 + static inline void latency_stat_record_time(struct iolatency_grp *iolat, 197 + u64 req_time) 198 + { 199 + struct latency_stat *stat = get_cpu_ptr(iolat->stats); 200 + if (iolat->ssd) { 201 + if (req_time >= iolat->min_lat_nsec) 202 + stat->ps.missed++; 203 + stat->ps.total++; 204 + } else 205 + blk_rq_stat_add(&stat->rqs, req_time); 206 + put_cpu_ptr(stat); 207 + } 208 + 209 + static inline bool latency_sum_ok(struct iolatency_grp *iolat, 210 + struct latency_stat *stat) 211 + { 212 + if (iolat->ssd) { 213 + u64 thresh = div64_u64(stat->ps.total, 10); 214 + 
thresh = max(thresh, 1ULL); 215 + return stat->ps.missed < thresh; 216 + } 217 + return stat->rqs.mean <= iolat->min_lat_nsec; 218 + } 219 + 220 + static inline u64 latency_stat_samples(struct iolatency_grp *iolat, 221 + struct latency_stat *stat) 222 + { 223 + if (iolat->ssd) 224 + return stat->ps.total; 225 + return stat->rqs.nr_samples; 226 + } 227 + 228 + static inline void iolat_update_total_lat_avg(struct iolatency_grp *iolat, 229 + struct latency_stat *stat) 230 + { 231 + int exp_idx; 232 + 233 + if (iolat->ssd) 234 + return; 235 + 236 + /* 237 + * CALC_LOAD takes in a number stored in fixed point representation. 238 + * Because we are using this for IO time in ns, the values stored 239 + * are significantly larger than the FIXED_1 denominator (2048). 240 + * Therefore, rounding errors in the calculation are negligible and 241 + * can be ignored. 242 + */ 243 + exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1, 244 + div64_u64(iolat->cur_win_nsec, 245 + BLKIOLATENCY_EXP_BUCKET_SIZE)); 246 + CALC_LOAD(iolat->lat_avg, iolatency_exp_factors[exp_idx], stat->rqs.mean); 186 247 } 187 248 188 249 static inline bool iolatency_may_queue(struct iolatency_grp *iolat, ··· 505 418 struct bio_issue *issue, u64 now, 506 419 bool issue_as_root) 507 420 { 508 - struct blk_rq_stat *rq_stat; 509 421 u64 start = bio_issue_time(issue); 510 422 u64 req_time; 511 423 ··· 530 444 return; 531 445 } 532 446 533 - rq_stat = get_cpu_ptr(iolat->stats); 534 - blk_rq_stat_add(rq_stat, req_time); 535 - put_cpu_ptr(rq_stat); 447 + latency_stat_record_time(iolat, req_time); 536 448 } 537 449 538 450 #define BLKIOLATENCY_MIN_ADJUST_TIME (500 * NSEC_PER_MSEC) ··· 541 457 struct blkcg_gq *blkg = lat_to_blkg(iolat); 542 458 struct iolatency_grp *parent; 543 459 struct child_latency_info *lat_info; 544 - struct blk_rq_stat stat; 460 + struct latency_stat stat; 545 461 unsigned long flags; 546 - int cpu, exp_idx; 462 + int cpu; 547 463 548 - blk_rq_stat_init(&stat); 464 + 
latency_stat_init(iolat, &stat); 549 465 preempt_disable(); 550 466 for_each_online_cpu(cpu) { 551 - struct blk_rq_stat *s; 467 + struct latency_stat *s; 552 468 s = per_cpu_ptr(iolat->stats, cpu); 553 - blk_rq_stat_sum(&stat, s); 554 - blk_rq_stat_init(s); 469 + latency_stat_sum(iolat, &stat, s); 470 + latency_stat_init(iolat, s); 555 471 } 556 472 preempt_enable(); 557 473 ··· 561 477 562 478 lat_info = &parent->child_lat; 563 479 564 - /* 565 - * CALC_LOAD takes in a number stored in fixed point representation. 566 - * Because we are using this for IO time in ns, the values stored 567 - * are significantly larger than the FIXED_1 denominator (2048). 568 - * Therefore, rounding errors in the calculation are negligible and 569 - * can be ignored. 570 - */ 571 - exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1, 572 - div64_u64(iolat->cur_win_nsec, 573 - BLKIOLATENCY_EXP_BUCKET_SIZE)); 574 - CALC_LOAD(iolat->lat_avg, iolatency_exp_factors[exp_idx], stat.mean); 480 + iolat_update_total_lat_avg(iolat, &stat); 575 481 576 482 /* Everything is ok and we don't need to adjust the scale. */ 577 - if (stat.mean <= iolat->min_lat_nsec && 483 + if (latency_sum_ok(iolat, &stat) && 578 484 atomic_read(&lat_info->scale_cookie) == DEFAULT_SCALE_COOKIE) 579 485 return; 580 486 581 487 /* Somebody beat us to the punch, just bail. 
*/ 582 488 spin_lock_irqsave(&lat_info->lock, flags); 583 489 lat_info->nr_samples -= iolat->nr_samples; 584 - lat_info->nr_samples += stat.nr_samples; 585 - iolat->nr_samples = stat.nr_samples; 490 + lat_info->nr_samples += latency_stat_samples(iolat, &stat); 491 + iolat->nr_samples = latency_stat_samples(iolat, &stat); 586 492 587 493 if ((lat_info->last_scale_event >= now || 588 494 now - lat_info->last_scale_event < BLKIOLATENCY_MIN_ADJUST_TIME) && 589 495 lat_info->scale_lat <= iolat->min_lat_nsec) 590 496 goto out; 591 497 592 - if (stat.mean <= iolat->min_lat_nsec && 593 - stat.nr_samples >= BLKIOLATENCY_MIN_GOOD_SAMPLES) { 498 + if (latency_sum_ok(iolat, &stat)) { 499 + if (latency_stat_samples(iolat, &stat) < 500 + BLKIOLATENCY_MIN_GOOD_SAMPLES) 501 + goto out; 594 502 if (lat_info->scale_grp == iolat) { 595 503 lat_info->last_scale_event = now; 596 504 scale_cookie_change(iolat->blkiolat, lat_info, true); 597 505 } 598 - } else if (stat.mean > iolat->min_lat_nsec) { 506 + } else { 599 507 lat_info->last_scale_event = now; 600 508 if (!lat_info->scale_grp || 601 509 lat_info->scale_lat > iolat->min_lat_nsec) { ··· 884 808 return 0; 885 809 } 886 810 811 + static size_t iolatency_ssd_stat(struct iolatency_grp *iolat, char *buf, 812 + size_t size) 813 + { 814 + struct latency_stat stat; 815 + int cpu; 816 + 817 + latency_stat_init(iolat, &stat); 818 + preempt_disable(); 819 + for_each_online_cpu(cpu) { 820 + struct latency_stat *s; 821 + s = per_cpu_ptr(iolat->stats, cpu); 822 + latency_stat_sum(iolat, &stat, s); 823 + } 824 + preempt_enable(); 825 + 826 + if (iolat->rq_depth.max_depth == UINT_MAX) 827 + return scnprintf(buf, size, " missed=%llu total=%llu depth=max", 828 + (unsigned long long)stat.ps.missed, 829 + (unsigned long long)stat.ps.total); 830 + return scnprintf(buf, size, " missed=%llu total=%llu depth=%u", 831 + (unsigned long long)stat.ps.missed, 832 + (unsigned long long)stat.ps.total, 833 + iolat->rq_depth.max_depth); 834 + } 835 + 887 836 
static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf, 888 837 size_t size) 889 838 { 890 839 struct iolatency_grp *iolat = pd_to_lat(pd); 891 - unsigned long long avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC); 892 - unsigned long long cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC); 840 + unsigned long long avg_lat; 841 + unsigned long long cur_win; 893 842 843 + if (iolat->ssd) 844 + return iolatency_ssd_stat(iolat, buf, size); 845 + 846 + avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC); 847 + cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC); 894 848 if (iolat->rq_depth.max_depth == UINT_MAX) 895 849 return scnprintf(buf, size, " depth=max avg_lat=%llu win=%llu", 896 850 avg_lat, cur_win); ··· 937 831 iolat = kzalloc_node(sizeof(*iolat), gfp, node); 938 832 if (!iolat) 939 833 return NULL; 940 - iolat->stats = __alloc_percpu_gfp(sizeof(struct blk_rq_stat), 941 - __alignof__(struct blk_rq_stat), gfp); 834 + iolat->stats = __alloc_percpu_gfp(sizeof(struct latency_stat), 835 + __alignof__(struct latency_stat), gfp); 942 836 if (!iolat->stats) { 943 837 kfree(iolat); 944 838 return NULL; ··· 955 849 u64 now = ktime_to_ns(ktime_get()); 956 850 int cpu; 957 851 852 + if (blk_queue_nonrot(blkg->q)) 853 + iolat->ssd = true; 854 + else 855 + iolat->ssd = false; 856 + 958 857 for_each_possible_cpu(cpu) { 959 - struct blk_rq_stat *stat; 858 + struct latency_stat *stat; 960 859 stat = per_cpu_ptr(iolat->stats, cpu); 961 - blk_rq_stat_init(stat); 860 + latency_stat_init(iolat, stat); 962 861 } 963 862 964 863 rq_wait_init(&iolat->rq_wait);