Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
at v4.8-rc5, 4920 lines, 130 kB
/*
 * CFQ, or complete fairness queueing, disk scheduler.
 *
 * Based on ideas from a previously unfinished io
 * scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
 *
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 */
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/ktime.h>
#include <linux/rbtree.h>
#include <linux/ioprio.h>
#include <linux/blktrace_api.h>
#include <linux/blk-cgroup.h>
#include "blk.h"

/*
 * tunables
 */
/* max queue in one round of service */
static const int cfq_quantum = 8;
static const u64 cfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };
/* maximum backwards seek, in KiB */
static const int cfq_back_max = 16 * 1024;
/* penalty of a backwards seek */
static const int cfq_back_penalty = 2;
static const u64 cfq_slice_sync = NSEC_PER_SEC / 10;
static u64 cfq_slice_async = NSEC_PER_SEC / 25;
static const int cfq_slice_async_rq = 2;
static u64 cfq_slice_idle = NSEC_PER_SEC / 125;
static u64 cfq_group_idle = NSEC_PER_SEC / 125;
static const u64 cfq_target_latency = (u64)NSEC_PER_SEC * 3/10; /* 300 ms */
static const int cfq_hist_divisor = 4;

/*
 * offset from end of service tree
 */
#define CFQ_IDLE_DELAY		(NSEC_PER_SEC / 5)

/*
 * below this threshold, we consider thinktime immediate
 */
#define CFQ_MIN_TT		(2 * NSEC_PER_SEC / HZ)

#define CFQ_SLICE_SCALE		(5)
#define CFQ_HW_QUEUE_MIN	(5)
#define CFQ_SERVICE_SHIFT	12

#define CFQQ_SEEK_THR		(sector_t)(8 * 100)
#define CFQQ_CLOSE_THR		(sector_t)(8 * 1024)
#define CFQQ_SECT_THR_NONROT	(sector_t)(2 * 32)
#define CFQQ_SEEKY(cfqq)	(hweight32(cfqq->seek_history) > 32/8)

#define RQ_CIC(rq)		icq_to_cic((rq)->elv.icq)
#define RQ_CFQQ(rq)		(struct cfq_queue *) ((rq)->elv.priv[0])
#define RQ_CFQG(rq)		(struct cfq_group *) ((rq)->elv.priv[1])

static struct kmem_cache *cfq_pool;

#define CFQ_PRIO_LISTS		IOPRIO_BE_NR
#define cfq_class_idle(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
#define cfq_class_rt(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_RT)

#define sample_valid(samples)	((samples) > 80)
#define rb_entry_cfqg(node)	rb_entry((node), struct cfq_group, rb_node)

/* blkio-related constants */
#define CFQ_WEIGHT_LEGACY_MIN	10
#define CFQ_WEIGHT_LEGACY_DFL	500
#define CFQ_WEIGHT_LEGACY_MAX	1000

struct cfq_ttime {
	u64 last_end_request;

	u64 ttime_total;
	u64 ttime_mean;
	unsigned long ttime_samples;
};

/*
 * Most of our rbtree usage is for sorting with min extraction, so
 * if we cache the leftmost node we don't have to walk down the tree
 * to find it. Idea borrowed from Ingo Molnar's CFS scheduler. We should
 * move this into the elevator for the rq sorting as well.
88 */ 89struct cfq_rb_root { 90 struct rb_root rb; 91 struct rb_node *left; 92 unsigned count; 93 u64 min_vdisktime; 94 struct cfq_ttime ttime; 95}; 96#define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, \ 97 .ttime = {.last_end_request = ktime_get_ns(),},} 98 99/* 100 * Per process-grouping structure 101 */ 102struct cfq_queue { 103 /* reference count */ 104 int ref; 105 /* various state flags, see below */ 106 unsigned int flags; 107 /* parent cfq_data */ 108 struct cfq_data *cfqd; 109 /* service_tree member */ 110 struct rb_node rb_node; 111 /* service_tree key */ 112 u64 rb_key; 113 /* prio tree member */ 114 struct rb_node p_node; 115 /* prio tree root we belong to, if any */ 116 struct rb_root *p_root; 117 /* sorted list of pending requests */ 118 struct rb_root sort_list; 119 /* if fifo isn't expired, next request to serve */ 120 struct request *next_rq; 121 /* requests queued in sort_list */ 122 int queued[2]; 123 /* currently allocated requests */ 124 int allocated[2]; 125 /* fifo list of requests in sort_list */ 126 struct list_head fifo; 127 128 /* time when queue got scheduled in to dispatch first request. */ 129 u64 dispatch_start; 130 u64 allocated_slice; 131 u64 slice_dispatch; 132 /* time when first request from queue completed and slice started. */ 133 u64 slice_start; 134 u64 slice_end; 135 s64 slice_resid; 136 137 /* pending priority requests */ 138 int prio_pending; 139 /* number of requests that are on the dispatch list or inside driver */ 140 int dispatched; 141 142 /* io prio of this group */ 143 unsigned short ioprio, org_ioprio; 144 unsigned short ioprio_class, org_ioprio_class; 145 146 pid_t pid; 147 148 u32 seek_history; 149 sector_t last_request_pos; 150 151 struct cfq_rb_root *service_tree; 152 struct cfq_queue *new_cfqq; 153 struct cfq_group *cfqg; 154 /* Number of sectors dispatched from queue in single dispatch round */ 155 unsigned long nr_sectors; 156}; 157 158/* 159 * First index in the service_trees. 160 * IDLE is handled separately, so it has negative index 161 */ 162enum wl_class_t { 163 BE_WORKLOAD = 0, 164 RT_WORKLOAD = 1, 165 IDLE_WORKLOAD = 2, 166 CFQ_PRIO_NR, 167}; 168 169/* 170 * Second index in the service_trees. 171 */ 172enum wl_type_t { 173 ASYNC_WORKLOAD = 0, 174 SYNC_NOIDLE_WORKLOAD = 1, 175 SYNC_WORKLOAD = 2 176}; 177 178struct cfqg_stats { 179#ifdef CONFIG_CFQ_GROUP_IOSCHED 180 /* number of ios merged */ 181 struct blkg_rwstat merged; 182 /* total time spent on device in ns, may not be accurate w/ queueing */ 183 struct blkg_rwstat service_time; 184 /* total time spent waiting in scheduler queue in ns */ 185 struct blkg_rwstat wait_time; 186 /* number of IOs queued up */ 187 struct blkg_rwstat queued; 188 /* total disk time and nr sectors dispatched by this group */ 189 struct blkg_stat time; 190#ifdef CONFIG_DEBUG_BLK_CGROUP 191 /* time not charged to this cgroup */ 192 struct blkg_stat unaccounted_time; 193 /* sum of number of ios queued across all samples */ 194 struct blkg_stat avg_queue_size_sum; 195 /* count of samples taken for average */ 196 struct blkg_stat avg_queue_size_samples; 197 /* how many times this group has been removed from service tree */ 198 struct blkg_stat dequeue; 199 /* total time spent waiting for it to be assigned a timeslice. 
*/ 200 struct blkg_stat group_wait_time; 201 /* time spent idling for this blkcg_gq */ 202 struct blkg_stat idle_time; 203 /* total time with empty current active q with other requests queued */ 204 struct blkg_stat empty_time; 205 /* fields after this shouldn't be cleared on stat reset */ 206 uint64_t start_group_wait_time; 207 uint64_t start_idle_time; 208 uint64_t start_empty_time; 209 uint16_t flags; 210#endif /* CONFIG_DEBUG_BLK_CGROUP */ 211#endif /* CONFIG_CFQ_GROUP_IOSCHED */ 212}; 213 214/* Per-cgroup data */ 215struct cfq_group_data { 216 /* must be the first member */ 217 struct blkcg_policy_data cpd; 218 219 unsigned int weight; 220 unsigned int leaf_weight; 221}; 222 223/* This is per cgroup per device grouping structure */ 224struct cfq_group { 225 /* must be the first member */ 226 struct blkg_policy_data pd; 227 228 /* group service_tree member */ 229 struct rb_node rb_node; 230 231 /* group service_tree key */ 232 u64 vdisktime; 233 234 /* 235 * The number of active cfqgs and sum of their weights under this 236 * cfqg. This covers this cfqg's leaf_weight and all children's 237 * weights, but does not cover weights of further descendants. 238 * 239 * If a cfqg is on the service tree, it's active. An active cfqg 240 * also activates its parent and contributes to the children_weight 241 * of the parent. 242 */ 243 int nr_active; 244 unsigned int children_weight; 245 246 /* 247 * vfraction is the fraction of vdisktime that the tasks in this 248 * cfqg are entitled to. This is determined by compounding the 249 * ratios walking up from this cfqg to the root. 250 * 251 * It is in fixed point w/ CFQ_SERVICE_SHIFT and the sum of all 252 * vfractions on a service tree is approximately 1. The sum may 253 * deviate a bit due to rounding errors and fluctuations caused by 254 * cfqgs entering and leaving the service tree. 255 */ 256 unsigned int vfraction; 257 258 /* 259 * There are two weights - (internal) weight is the weight of this 260 * cfqg against the sibling cfqgs. leaf_weight is the wight of 261 * this cfqg against the child cfqgs. For the root cfqg, both 262 * weights are kept in sync for backward compatibility. 263 */ 264 unsigned int weight; 265 unsigned int new_weight; 266 unsigned int dev_weight; 267 268 unsigned int leaf_weight; 269 unsigned int new_leaf_weight; 270 unsigned int dev_leaf_weight; 271 272 /* number of cfqq currently on this group */ 273 int nr_cfqq; 274 275 /* 276 * Per group busy queues average. Useful for workload slice calc. We 277 * create the array for each prio class but at run time it is used 278 * only for RT and BE class and slot for IDLE class remains unused. 279 * This is primarily done to avoid confusion and a gcc warning. 280 */ 281 unsigned int busy_queues_avg[CFQ_PRIO_NR]; 282 /* 283 * rr lists of queues with requests. We maintain service trees for 284 * RT and BE classes. These trees are subdivided in subclasses 285 * of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For IDLE 286 * class there is no subclassification and all the cfq queues go on 287 * a single tree service_tree_idle. 
288 * Counts are embedded in the cfq_rb_root 289 */ 290 struct cfq_rb_root service_trees[2][3]; 291 struct cfq_rb_root service_tree_idle; 292 293 u64 saved_wl_slice; 294 enum wl_type_t saved_wl_type; 295 enum wl_class_t saved_wl_class; 296 297 /* number of requests that are on the dispatch list or inside driver */ 298 int dispatched; 299 struct cfq_ttime ttime; 300 struct cfqg_stats stats; /* stats for this cfqg */ 301 302 /* async queue for each priority case */ 303 struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR]; 304 struct cfq_queue *async_idle_cfqq; 305 306}; 307 308struct cfq_io_cq { 309 struct io_cq icq; /* must be the first member */ 310 struct cfq_queue *cfqq[2]; 311 struct cfq_ttime ttime; 312 int ioprio; /* the current ioprio */ 313#ifdef CONFIG_CFQ_GROUP_IOSCHED 314 uint64_t blkcg_serial_nr; /* the current blkcg serial */ 315#endif 316}; 317 318/* 319 * Per block device queue structure 320 */ 321struct cfq_data { 322 struct request_queue *queue; 323 /* Root service tree for cfq_groups */ 324 struct cfq_rb_root grp_service_tree; 325 struct cfq_group *root_group; 326 327 /* 328 * The priority currently being served 329 */ 330 enum wl_class_t serving_wl_class; 331 enum wl_type_t serving_wl_type; 332 u64 workload_expires; 333 struct cfq_group *serving_group; 334 335 /* 336 * Each priority tree is sorted by next_request position. These 337 * trees are used when determining if two or more queues are 338 * interleaving requests (see cfq_close_cooperator). 339 */ 340 struct rb_root prio_trees[CFQ_PRIO_LISTS]; 341 342 unsigned int busy_queues; 343 unsigned int busy_sync_queues; 344 345 int rq_in_driver; 346 int rq_in_flight[2]; 347 348 /* 349 * queue-depth detection 350 */ 351 int rq_queued; 352 int hw_tag; 353 /* 354 * hw_tag can be 355 * -1 => indeterminate, (cfq will behave as if NCQ is present, to allow better detection) 356 * 1 => NCQ is present (hw_tag_est_depth is the estimated max depth) 357 * 0 => no NCQ 358 */ 359 int hw_tag_est_depth; 360 unsigned int hw_tag_samples; 361 362 /* 363 * idle window management 364 */ 365 struct hrtimer idle_slice_timer; 366 struct work_struct unplug_work; 367 368 struct cfq_queue *active_queue; 369 struct cfq_io_cq *active_cic; 370 371 sector_t last_position; 372 373 /* 374 * tunables, see top of file 375 */ 376 unsigned int cfq_quantum; 377 unsigned int cfq_back_penalty; 378 unsigned int cfq_back_max; 379 unsigned int cfq_slice_async_rq; 380 unsigned int cfq_latency; 381 u64 cfq_fifo_expire[2]; 382 u64 cfq_slice[2]; 383 u64 cfq_slice_idle; 384 u64 cfq_group_idle; 385 u64 cfq_target_latency; 386 387 /* 388 * Fallback dummy cfqq for extreme OOM conditions 389 */ 390 struct cfq_queue oom_cfqq; 391 392 u64 last_delayed_sync; 393}; 394 395static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); 396static void cfq_put_queue(struct cfq_queue *cfqq); 397 398static struct cfq_rb_root *st_for(struct cfq_group *cfqg, 399 enum wl_class_t class, 400 enum wl_type_t type) 401{ 402 if (!cfqg) 403 return NULL; 404 405 if (class == IDLE_WORKLOAD) 406 return &cfqg->service_tree_idle; 407 408 return &cfqg->service_trees[class][type]; 409} 410 411enum cfqq_state_flags { 412 CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */ 413 CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */ 414 CFQ_CFQQ_FLAG_must_dispatch, /* must be allowed a dispatch */ 415 CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */ 416 CFQ_CFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ 417 CFQ_CFQQ_FLAG_idle_window, /* slice idling enabled */ 418 
CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */ 419 CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */ 420 CFQ_CFQQ_FLAG_sync, /* synchronous queue */ 421 CFQ_CFQQ_FLAG_coop, /* cfqq is shared */ 422 CFQ_CFQQ_FLAG_split_coop, /* shared cfqq will be splitted */ 423 CFQ_CFQQ_FLAG_deep, /* sync cfqq experienced large depth */ 424 CFQ_CFQQ_FLAG_wait_busy, /* Waiting for next request */ 425}; 426 427#define CFQ_CFQQ_FNS(name) \ 428static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq) \ 429{ \ 430 (cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name); \ 431} \ 432static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq) \ 433{ \ 434 (cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name); \ 435} \ 436static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq) \ 437{ \ 438 return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0; \ 439} 440 441CFQ_CFQQ_FNS(on_rr); 442CFQ_CFQQ_FNS(wait_request); 443CFQ_CFQQ_FNS(must_dispatch); 444CFQ_CFQQ_FNS(must_alloc_slice); 445CFQ_CFQQ_FNS(fifo_expire); 446CFQ_CFQQ_FNS(idle_window); 447CFQ_CFQQ_FNS(prio_changed); 448CFQ_CFQQ_FNS(slice_new); 449CFQ_CFQQ_FNS(sync); 450CFQ_CFQQ_FNS(coop); 451CFQ_CFQQ_FNS(split_coop); 452CFQ_CFQQ_FNS(deep); 453CFQ_CFQQ_FNS(wait_busy); 454#undef CFQ_CFQQ_FNS 455 456#if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) 457 458/* cfqg stats flags */ 459enum cfqg_stats_flags { 460 CFQG_stats_waiting = 0, 461 CFQG_stats_idling, 462 CFQG_stats_empty, 463}; 464 465#define CFQG_FLAG_FNS(name) \ 466static inline void cfqg_stats_mark_##name(struct cfqg_stats *stats) \ 467{ \ 468 stats->flags |= (1 << CFQG_stats_##name); \ 469} \ 470static inline void cfqg_stats_clear_##name(struct cfqg_stats *stats) \ 471{ \ 472 stats->flags &= ~(1 << CFQG_stats_##name); \ 473} \ 474static inline int cfqg_stats_##name(struct cfqg_stats *stats) \ 475{ \ 476 return (stats->flags & (1 << CFQG_stats_##name)) != 0; \ 477} \ 478 479CFQG_FLAG_FNS(waiting) 480CFQG_FLAG_FNS(idling) 481CFQG_FLAG_FNS(empty) 482#undef CFQG_FLAG_FNS 483 484/* This should be called with the queue_lock held. */ 485static void cfqg_stats_update_group_wait_time(struct cfqg_stats *stats) 486{ 487 unsigned long long now; 488 489 if (!cfqg_stats_waiting(stats)) 490 return; 491 492 now = sched_clock(); 493 if (time_after64(now, stats->start_group_wait_time)) 494 blkg_stat_add(&stats->group_wait_time, 495 now - stats->start_group_wait_time); 496 cfqg_stats_clear_waiting(stats); 497} 498 499/* This should be called with the queue_lock held. */ 500static void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg, 501 struct cfq_group *curr_cfqg) 502{ 503 struct cfqg_stats *stats = &cfqg->stats; 504 505 if (cfqg_stats_waiting(stats)) 506 return; 507 if (cfqg == curr_cfqg) 508 return; 509 stats->start_group_wait_time = sched_clock(); 510 cfqg_stats_mark_waiting(stats); 511} 512 513/* This should be called with the queue_lock held. 
*/ 514static void cfqg_stats_end_empty_time(struct cfqg_stats *stats) 515{ 516 unsigned long long now; 517 518 if (!cfqg_stats_empty(stats)) 519 return; 520 521 now = sched_clock(); 522 if (time_after64(now, stats->start_empty_time)) 523 blkg_stat_add(&stats->empty_time, 524 now - stats->start_empty_time); 525 cfqg_stats_clear_empty(stats); 526} 527 528static void cfqg_stats_update_dequeue(struct cfq_group *cfqg) 529{ 530 blkg_stat_add(&cfqg->stats.dequeue, 1); 531} 532 533static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) 534{ 535 struct cfqg_stats *stats = &cfqg->stats; 536 537 if (blkg_rwstat_total(&stats->queued)) 538 return; 539 540 /* 541 * group is already marked empty. This can happen if cfqq got new 542 * request in parent group and moved to this group while being added 543 * to service tree. Just ignore the event and move on. 544 */ 545 if (cfqg_stats_empty(stats)) 546 return; 547 548 stats->start_empty_time = sched_clock(); 549 cfqg_stats_mark_empty(stats); 550} 551 552static void cfqg_stats_update_idle_time(struct cfq_group *cfqg) 553{ 554 struct cfqg_stats *stats = &cfqg->stats; 555 556 if (cfqg_stats_idling(stats)) { 557 unsigned long long now = sched_clock(); 558 559 if (time_after64(now, stats->start_idle_time)) 560 blkg_stat_add(&stats->idle_time, 561 now - stats->start_idle_time); 562 cfqg_stats_clear_idling(stats); 563 } 564} 565 566static void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) 567{ 568 struct cfqg_stats *stats = &cfqg->stats; 569 570 BUG_ON(cfqg_stats_idling(stats)); 571 572 stats->start_idle_time = sched_clock(); 573 cfqg_stats_mark_idling(stats); 574} 575 576static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) 577{ 578 struct cfqg_stats *stats = &cfqg->stats; 579 580 blkg_stat_add(&stats->avg_queue_size_sum, 581 blkg_rwstat_total(&stats->queued)); 582 blkg_stat_add(&stats->avg_queue_size_samples, 1); 583 cfqg_stats_update_group_wait_time(stats); 584} 585 586#else /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */ 587 588static inline void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg, struct cfq_group *curr_cfqg) { } 589static inline void cfqg_stats_end_empty_time(struct cfqg_stats *stats) { } 590static inline void cfqg_stats_update_dequeue(struct cfq_group *cfqg) { } 591static inline void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) { } 592static inline void cfqg_stats_update_idle_time(struct cfq_group *cfqg) { } 593static inline void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) { } 594static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { } 595 596#endif /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */ 597 598#ifdef CONFIG_CFQ_GROUP_IOSCHED 599 600static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd) 601{ 602 return pd ? container_of(pd, struct cfq_group, pd) : NULL; 603} 604 605static struct cfq_group_data 606*cpd_to_cfqgd(struct blkcg_policy_data *cpd) 607{ 608 return cpd ? 
container_of(cpd, struct cfq_group_data, cpd) : NULL; 609} 610 611static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg) 612{ 613 return pd_to_blkg(&cfqg->pd); 614} 615 616static struct blkcg_policy blkcg_policy_cfq; 617 618static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg) 619{ 620 return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq)); 621} 622 623static struct cfq_group_data *blkcg_to_cfqgd(struct blkcg *blkcg) 624{ 625 return cpd_to_cfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_cfq)); 626} 627 628static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) 629{ 630 struct blkcg_gq *pblkg = cfqg_to_blkg(cfqg)->parent; 631 632 return pblkg ? blkg_to_cfqg(pblkg) : NULL; 633} 634 635static inline bool cfqg_is_descendant(struct cfq_group *cfqg, 636 struct cfq_group *ancestor) 637{ 638 return cgroup_is_descendant(cfqg_to_blkg(cfqg)->blkcg->css.cgroup, 639 cfqg_to_blkg(ancestor)->blkcg->css.cgroup); 640} 641 642static inline void cfqg_get(struct cfq_group *cfqg) 643{ 644 return blkg_get(cfqg_to_blkg(cfqg)); 645} 646 647static inline void cfqg_put(struct cfq_group *cfqg) 648{ 649 return blkg_put(cfqg_to_blkg(cfqg)); 650} 651 652#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) do { \ 653 char __pbuf[128]; \ 654 \ 655 blkg_path(cfqg_to_blkg((cfqq)->cfqg), __pbuf, sizeof(__pbuf)); \ 656 blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c %s " fmt, (cfqq)->pid, \ 657 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ 658 cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\ 659 __pbuf, ##args); \ 660} while (0) 661 662#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do { \ 663 char __pbuf[128]; \ 664 \ 665 blkg_path(cfqg_to_blkg(cfqg), __pbuf, sizeof(__pbuf)); \ 666 blk_add_trace_msg((cfqd)->queue, "%s " fmt, __pbuf, ##args); \ 667} while (0) 668 669static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg, 670 struct cfq_group *curr_cfqg, int op, 671 int op_flags) 672{ 673 blkg_rwstat_add(&cfqg->stats.queued, op, op_flags, 1); 674 cfqg_stats_end_empty_time(&cfqg->stats); 675 cfqg_stats_set_start_group_wait_time(cfqg, curr_cfqg); 676} 677 678static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg, 679 uint64_t time, unsigned long unaccounted_time) 680{ 681 blkg_stat_add(&cfqg->stats.time, time); 682#ifdef CONFIG_DEBUG_BLK_CGROUP 683 blkg_stat_add(&cfqg->stats.unaccounted_time, unaccounted_time); 684#endif 685} 686 687static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int op, 688 int op_flags) 689{ 690 blkg_rwstat_add(&cfqg->stats.queued, op, op_flags, -1); 691} 692 693static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int op, 694 int op_flags) 695{ 696 blkg_rwstat_add(&cfqg->stats.merged, op, op_flags, 1); 697} 698 699static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, 700 uint64_t start_time, uint64_t io_start_time, int op, 701 int op_flags) 702{ 703 struct cfqg_stats *stats = &cfqg->stats; 704 unsigned long long now = sched_clock(); 705 706 if (time_after64(now, io_start_time)) 707 blkg_rwstat_add(&stats->service_time, op, op_flags, 708 now - io_start_time); 709 if (time_after64(io_start_time, start_time)) 710 blkg_rwstat_add(&stats->wait_time, op, op_flags, 711 io_start_time - start_time); 712} 713 714/* @stats = 0 */ 715static void cfqg_stats_reset(struct cfqg_stats *stats) 716{ 717 /* queued stats shouldn't be cleared */ 718 blkg_rwstat_reset(&stats->merged); 719 blkg_rwstat_reset(&stats->service_time); 720 blkg_rwstat_reset(&stats->wait_time); 721 blkg_stat_reset(&stats->time); 722#ifdef 
CONFIG_DEBUG_BLK_CGROUP 723 blkg_stat_reset(&stats->unaccounted_time); 724 blkg_stat_reset(&stats->avg_queue_size_sum); 725 blkg_stat_reset(&stats->avg_queue_size_samples); 726 blkg_stat_reset(&stats->dequeue); 727 blkg_stat_reset(&stats->group_wait_time); 728 blkg_stat_reset(&stats->idle_time); 729 blkg_stat_reset(&stats->empty_time); 730#endif 731} 732 733/* @to += @from */ 734static void cfqg_stats_add_aux(struct cfqg_stats *to, struct cfqg_stats *from) 735{ 736 /* queued stats shouldn't be cleared */ 737 blkg_rwstat_add_aux(&to->merged, &from->merged); 738 blkg_rwstat_add_aux(&to->service_time, &from->service_time); 739 blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); 740 blkg_stat_add_aux(&from->time, &from->time); 741#ifdef CONFIG_DEBUG_BLK_CGROUP 742 blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time); 743 blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); 744 blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples); 745 blkg_stat_add_aux(&to->dequeue, &from->dequeue); 746 blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time); 747 blkg_stat_add_aux(&to->idle_time, &from->idle_time); 748 blkg_stat_add_aux(&to->empty_time, &from->empty_time); 749#endif 750} 751 752/* 753 * Transfer @cfqg's stats to its parent's aux counts so that the ancestors' 754 * recursive stats can still account for the amount used by this cfqg after 755 * it's gone. 756 */ 757static void cfqg_stats_xfer_dead(struct cfq_group *cfqg) 758{ 759 struct cfq_group *parent = cfqg_parent(cfqg); 760 761 lockdep_assert_held(cfqg_to_blkg(cfqg)->q->queue_lock); 762 763 if (unlikely(!parent)) 764 return; 765 766 cfqg_stats_add_aux(&parent->stats, &cfqg->stats); 767 cfqg_stats_reset(&cfqg->stats); 768} 769 770#else /* CONFIG_CFQ_GROUP_IOSCHED */ 771 772static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) { return NULL; } 773static inline bool cfqg_is_descendant(struct cfq_group *cfqg, 774 struct cfq_group *ancestor) 775{ 776 return true; 777} 778static inline void cfqg_get(struct cfq_group *cfqg) { } 779static inline void cfqg_put(struct cfq_group *cfqg) { } 780 781#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ 782 blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c " fmt, (cfqq)->pid, \ 783 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ 784 cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\ 785 ##args) 786#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0) 787 788static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg, 789 struct cfq_group *curr_cfqg, int op, int op_flags) { } 790static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg, 791 uint64_t time, unsigned long unaccounted_time) { } 792static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int op, 793 int op_flags) { } 794static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int op, 795 int op_flags) { } 796static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, 797 uint64_t start_time, uint64_t io_start_time, int op, 798 int op_flags) { } 799 800#endif /* CONFIG_CFQ_GROUP_IOSCHED */ 801 802#define cfq_log(cfqd, fmt, args...) \ 803 blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) 804 805/* Traverses through cfq group service trees */ 806#define for_each_cfqg_st(cfqg, i, j, st) \ 807 for (i = 0; i <= IDLE_WORKLOAD; i++) \ 808 for (j = 0, st = i < IDLE_WORKLOAD ? 
&cfqg->service_trees[i][j]\ 809 : &cfqg->service_tree_idle; \ 810 (i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \ 811 (i == IDLE_WORKLOAD && j == 0); \ 812 j++, st = i < IDLE_WORKLOAD ? \ 813 &cfqg->service_trees[i][j]: NULL) \ 814 815static inline bool cfq_io_thinktime_big(struct cfq_data *cfqd, 816 struct cfq_ttime *ttime, bool group_idle) 817{ 818 u64 slice; 819 if (!sample_valid(ttime->ttime_samples)) 820 return false; 821 if (group_idle) 822 slice = cfqd->cfq_group_idle; 823 else 824 slice = cfqd->cfq_slice_idle; 825 return ttime->ttime_mean > slice; 826} 827 828static inline bool iops_mode(struct cfq_data *cfqd) 829{ 830 /* 831 * If we are not idling on queues and it is a NCQ drive, parallel 832 * execution of requests is on and measuring time is not possible 833 * in most of the cases until and unless we drive shallower queue 834 * depths and that becomes a performance bottleneck. In such cases 835 * switch to start providing fairness in terms of number of IOs. 836 */ 837 if (!cfqd->cfq_slice_idle && cfqd->hw_tag) 838 return true; 839 else 840 return false; 841} 842 843static inline enum wl_class_t cfqq_class(struct cfq_queue *cfqq) 844{ 845 if (cfq_class_idle(cfqq)) 846 return IDLE_WORKLOAD; 847 if (cfq_class_rt(cfqq)) 848 return RT_WORKLOAD; 849 return BE_WORKLOAD; 850} 851 852 853static enum wl_type_t cfqq_type(struct cfq_queue *cfqq) 854{ 855 if (!cfq_cfqq_sync(cfqq)) 856 return ASYNC_WORKLOAD; 857 if (!cfq_cfqq_idle_window(cfqq)) 858 return SYNC_NOIDLE_WORKLOAD; 859 return SYNC_WORKLOAD; 860} 861 862static inline int cfq_group_busy_queues_wl(enum wl_class_t wl_class, 863 struct cfq_data *cfqd, 864 struct cfq_group *cfqg) 865{ 866 if (wl_class == IDLE_WORKLOAD) 867 return cfqg->service_tree_idle.count; 868 869 return cfqg->service_trees[wl_class][ASYNC_WORKLOAD].count + 870 cfqg->service_trees[wl_class][SYNC_NOIDLE_WORKLOAD].count + 871 cfqg->service_trees[wl_class][SYNC_WORKLOAD].count; 872} 873 874static inline int cfqg_busy_async_queues(struct cfq_data *cfqd, 875 struct cfq_group *cfqg) 876{ 877 return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count + 878 cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count; 879} 880 881static void cfq_dispatch_insert(struct request_queue *, struct request *); 882static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync, 883 struct cfq_io_cq *cic, struct bio *bio); 884 885static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq) 886{ 887 /* cic->icq is the first member, %NULL will convert to %NULL */ 888 return container_of(icq, struct cfq_io_cq, icq); 889} 890 891static inline struct cfq_io_cq *cfq_cic_lookup(struct cfq_data *cfqd, 892 struct io_context *ioc) 893{ 894 if (ioc) 895 return icq_to_cic(ioc_lookup_icq(ioc, cfqd->queue)); 896 return NULL; 897} 898 899static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_cq *cic, bool is_sync) 900{ 901 return cic->cfqq[is_sync]; 902} 903 904static inline void cic_set_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq, 905 bool is_sync) 906{ 907 cic->cfqq[is_sync] = cfqq; 908} 909 910static inline struct cfq_data *cic_to_cfqd(struct cfq_io_cq *cic) 911{ 912 return cic->icq.q->elevator->elevator_data; 913} 914 915/* 916 * We regard a request as SYNC, if it's either a read or has the SYNC bit 917 * set (in which case it could also be direct WRITE). 
918 */ 919static inline bool cfq_bio_sync(struct bio *bio) 920{ 921 return bio_data_dir(bio) == READ || (bio->bi_opf & REQ_SYNC); 922} 923 924/* 925 * scheduler run of queue, if there are requests pending and no one in the 926 * driver that will restart queueing 927 */ 928static inline void cfq_schedule_dispatch(struct cfq_data *cfqd) 929{ 930 if (cfqd->busy_queues) { 931 cfq_log(cfqd, "schedule dispatch"); 932 kblockd_schedule_work(&cfqd->unplug_work); 933 } 934} 935 936/* 937 * Scale schedule slice based on io priority. Use the sync time slice only 938 * if a queue is marked sync and has sync io queued. A sync queue with async 939 * io only, should not get full sync slice length. 940 */ 941static inline u64 cfq_prio_slice(struct cfq_data *cfqd, bool sync, 942 unsigned short prio) 943{ 944 u64 base_slice = cfqd->cfq_slice[sync]; 945 u64 slice = div_u64(base_slice, CFQ_SLICE_SCALE); 946 947 WARN_ON(prio >= IOPRIO_BE_NR); 948 949 return base_slice + (slice * (4 - prio)); 950} 951 952static inline u64 953cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) 954{ 955 return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio); 956} 957 958/** 959 * cfqg_scale_charge - scale disk time charge according to cfqg weight 960 * @charge: disk time being charged 961 * @vfraction: vfraction of the cfqg, fixed point w/ CFQ_SERVICE_SHIFT 962 * 963 * Scale @charge according to @vfraction, which is in range (0, 1]. The 964 * scaling is inversely proportional. 965 * 966 * scaled = charge / vfraction 967 * 968 * The result is also in fixed point w/ CFQ_SERVICE_SHIFT. 969 */ 970static inline u64 cfqg_scale_charge(u64 charge, 971 unsigned int vfraction) 972{ 973 u64 c = charge << CFQ_SERVICE_SHIFT; /* make it fixed point */ 974 975 /* charge / vfraction */ 976 c <<= CFQ_SERVICE_SHIFT; 977 return div_u64(c, vfraction); 978} 979 980static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime) 981{ 982 s64 delta = (s64)(vdisktime - min_vdisktime); 983 if (delta > 0) 984 min_vdisktime = vdisktime; 985 986 return min_vdisktime; 987} 988 989static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime) 990{ 991 s64 delta = (s64)(vdisktime - min_vdisktime); 992 if (delta < 0) 993 min_vdisktime = vdisktime; 994 995 return min_vdisktime; 996} 997 998static void update_min_vdisktime(struct cfq_rb_root *st) 999{ 1000 struct cfq_group *cfqg; 1001 1002 if (st->left) { 1003 cfqg = rb_entry_cfqg(st->left); 1004 st->min_vdisktime = max_vdisktime(st->min_vdisktime, 1005 cfqg->vdisktime); 1006 } 1007} 1008 1009/* 1010 * get averaged number of queues of RT/BE priority. 
1011 * average is updated, with a formula that gives more weight to higher numbers, 1012 * to quickly follows sudden increases and decrease slowly 1013 */ 1014 1015static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd, 1016 struct cfq_group *cfqg, bool rt) 1017{ 1018 unsigned min_q, max_q; 1019 unsigned mult = cfq_hist_divisor - 1; 1020 unsigned round = cfq_hist_divisor / 2; 1021 unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg); 1022 1023 min_q = min(cfqg->busy_queues_avg[rt], busy); 1024 max_q = max(cfqg->busy_queues_avg[rt], busy); 1025 cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) / 1026 cfq_hist_divisor; 1027 return cfqg->busy_queues_avg[rt]; 1028} 1029 1030static inline u64 1031cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg) 1032{ 1033 return cfqd->cfq_target_latency * cfqg->vfraction >> CFQ_SERVICE_SHIFT; 1034} 1035 1036static inline u64 1037cfq_scaled_cfqq_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1038{ 1039 u64 slice = cfq_prio_to_slice(cfqd, cfqq); 1040 if (cfqd->cfq_latency) { 1041 /* 1042 * interested queues (we consider only the ones with the same 1043 * priority class in the cfq group) 1044 */ 1045 unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg, 1046 cfq_class_rt(cfqq)); 1047 u64 sync_slice = cfqd->cfq_slice[1]; 1048 u64 expect_latency = sync_slice * iq; 1049 u64 group_slice = cfq_group_slice(cfqd, cfqq->cfqg); 1050 1051 if (expect_latency > group_slice) { 1052 u64 base_low_slice = 2 * cfqd->cfq_slice_idle; 1053 u64 low_slice; 1054 1055 /* scale low_slice according to IO priority 1056 * and sync vs async */ 1057 low_slice = div64_u64(base_low_slice*slice, sync_slice); 1058 low_slice = min(slice, low_slice); 1059 /* the adapted slice value is scaled to fit all iqs 1060 * into the target latency */ 1061 slice = div64_u64(slice*group_slice, expect_latency); 1062 slice = max(slice, low_slice); 1063 } 1064 } 1065 return slice; 1066} 1067 1068static inline void 1069cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1070{ 1071 u64 slice = cfq_scaled_cfqq_slice(cfqd, cfqq); 1072 u64 now = ktime_get_ns(); 1073 1074 cfqq->slice_start = now; 1075 cfqq->slice_end = now + slice; 1076 cfqq->allocated_slice = slice; 1077 cfq_log_cfqq(cfqd, cfqq, "set_slice=%llu", cfqq->slice_end - now); 1078} 1079 1080/* 1081 * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end 1082 * isn't valid until the first request from the dispatch is activated 1083 * and the slice time set. 1084 */ 1085static inline bool cfq_slice_used(struct cfq_queue *cfqq) 1086{ 1087 if (cfq_cfqq_slice_new(cfqq)) 1088 return false; 1089 if (ktime_get_ns() < cfqq->slice_end) 1090 return false; 1091 1092 return true; 1093} 1094 1095/* 1096 * Lifted from AS - choose which of rq1 and rq2 that is best served now. 1097 * We choose the request that is closest to the head right now. Distance 1098 * behind the head is penalized and only allowed to a certain extent. 1099 */ 1100static struct request * 1101cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last) 1102{ 1103 sector_t s1, s2, d1 = 0, d2 = 0; 1104 unsigned long back_max; 1105#define CFQ_RQ1_WRAP 0x01 /* request 1 wraps */ 1106#define CFQ_RQ2_WRAP 0x02 /* request 2 wraps */ 1107 unsigned wrap = 0; /* bit mask: requests behind the disk head? */ 1108 1109 if (rq1 == NULL || rq1 == rq2) 1110 return rq2; 1111 if (rq2 == NULL) 1112 return rq1; 1113 1114 if (rq_is_sync(rq1) != rq_is_sync(rq2)) 1115 return rq_is_sync(rq1) ? 
rq1 : rq2; 1116 1117 if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_PRIO) 1118 return rq1->cmd_flags & REQ_PRIO ? rq1 : rq2; 1119 1120 s1 = blk_rq_pos(rq1); 1121 s2 = blk_rq_pos(rq2); 1122 1123 /* 1124 * by definition, 1KiB is 2 sectors 1125 */ 1126 back_max = cfqd->cfq_back_max * 2; 1127 1128 /* 1129 * Strict one way elevator _except_ in the case where we allow 1130 * short backward seeks which are biased as twice the cost of a 1131 * similar forward seek. 1132 */ 1133 if (s1 >= last) 1134 d1 = s1 - last; 1135 else if (s1 + back_max >= last) 1136 d1 = (last - s1) * cfqd->cfq_back_penalty; 1137 else 1138 wrap |= CFQ_RQ1_WRAP; 1139 1140 if (s2 >= last) 1141 d2 = s2 - last; 1142 else if (s2 + back_max >= last) 1143 d2 = (last - s2) * cfqd->cfq_back_penalty; 1144 else 1145 wrap |= CFQ_RQ2_WRAP; 1146 1147 /* Found required data */ 1148 1149 /* 1150 * By doing switch() on the bit mask "wrap" we avoid having to 1151 * check two variables for all permutations: --> faster! 1152 */ 1153 switch (wrap) { 1154 case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ 1155 if (d1 < d2) 1156 return rq1; 1157 else if (d2 < d1) 1158 return rq2; 1159 else { 1160 if (s1 >= s2) 1161 return rq1; 1162 else 1163 return rq2; 1164 } 1165 1166 case CFQ_RQ2_WRAP: 1167 return rq1; 1168 case CFQ_RQ1_WRAP: 1169 return rq2; 1170 case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */ 1171 default: 1172 /* 1173 * Since both rqs are wrapped, 1174 * start with the one that's further behind head 1175 * (--> only *one* back seek required), 1176 * since back seek takes more time than forward. 1177 */ 1178 if (s1 <= s2) 1179 return rq1; 1180 else 1181 return rq2; 1182 } 1183} 1184 1185/* 1186 * The below is leftmost cache rbtree addon 1187 */ 1188static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root) 1189{ 1190 /* Service tree is empty */ 1191 if (!root->count) 1192 return NULL; 1193 1194 if (!root->left) 1195 root->left = rb_first(&root->rb); 1196 1197 if (root->left) 1198 return rb_entry(root->left, struct cfq_queue, rb_node); 1199 1200 return NULL; 1201} 1202 1203static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root) 1204{ 1205 if (!root->left) 1206 root->left = rb_first(&root->rb); 1207 1208 if (root->left) 1209 return rb_entry_cfqg(root->left); 1210 1211 return NULL; 1212} 1213 1214static void rb_erase_init(struct rb_node *n, struct rb_root *root) 1215{ 1216 rb_erase(n, root); 1217 RB_CLEAR_NODE(n); 1218} 1219 1220static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root) 1221{ 1222 if (root->left == n) 1223 root->left = NULL; 1224 rb_erase_init(n, &root->rb); 1225 --root->count; 1226} 1227 1228/* 1229 * would be nice to take fifo expire time into account as well 1230 */ 1231static struct request * 1232cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq, 1233 struct request *last) 1234{ 1235 struct rb_node *rbnext = rb_next(&last->rb_node); 1236 struct rb_node *rbprev = rb_prev(&last->rb_node); 1237 struct request *next = NULL, *prev = NULL; 1238 1239 BUG_ON(RB_EMPTY_NODE(&last->rb_node)); 1240 1241 if (rbprev) 1242 prev = rb_entry_rq(rbprev); 1243 1244 if (rbnext) 1245 next = rb_entry_rq(rbnext); 1246 else { 1247 rbnext = rb_first(&cfqq->sort_list); 1248 if (rbnext && rbnext != &last->rb_node) 1249 next = rb_entry_rq(rbnext); 1250 } 1251 1252 return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last)); 1253} 1254 1255static u64 cfq_slice_offset(struct cfq_data *cfqd, 1256 struct cfq_queue *cfqq) 1257{ 1258 /* 1259 * just an approximation, should be ok. 
1260 */ 1261 return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) - 1262 cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio)); 1263} 1264 1265static inline s64 1266cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg) 1267{ 1268 return cfqg->vdisktime - st->min_vdisktime; 1269} 1270 1271static void 1272__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) 1273{ 1274 struct rb_node **node = &st->rb.rb_node; 1275 struct rb_node *parent = NULL; 1276 struct cfq_group *__cfqg; 1277 s64 key = cfqg_key(st, cfqg); 1278 int left = 1; 1279 1280 while (*node != NULL) { 1281 parent = *node; 1282 __cfqg = rb_entry_cfqg(parent); 1283 1284 if (key < cfqg_key(st, __cfqg)) 1285 node = &parent->rb_left; 1286 else { 1287 node = &parent->rb_right; 1288 left = 0; 1289 } 1290 } 1291 1292 if (left) 1293 st->left = &cfqg->rb_node; 1294 1295 rb_link_node(&cfqg->rb_node, parent, node); 1296 rb_insert_color(&cfqg->rb_node, &st->rb); 1297} 1298 1299/* 1300 * This has to be called only on activation of cfqg 1301 */ 1302static void 1303cfq_update_group_weight(struct cfq_group *cfqg) 1304{ 1305 if (cfqg->new_weight) { 1306 cfqg->weight = cfqg->new_weight; 1307 cfqg->new_weight = 0; 1308 } 1309} 1310 1311static void 1312cfq_update_group_leaf_weight(struct cfq_group *cfqg) 1313{ 1314 BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); 1315 1316 if (cfqg->new_leaf_weight) { 1317 cfqg->leaf_weight = cfqg->new_leaf_weight; 1318 cfqg->new_leaf_weight = 0; 1319 } 1320} 1321 1322static void 1323cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) 1324{ 1325 unsigned int vfr = 1 << CFQ_SERVICE_SHIFT; /* start with 1 */ 1326 struct cfq_group *pos = cfqg; 1327 struct cfq_group *parent; 1328 bool propagate; 1329 1330 /* add to the service tree */ 1331 BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); 1332 1333 /* 1334 * Update leaf_weight. We cannot update weight at this point 1335 * because cfqg might already have been activated and is 1336 * contributing its current weight to the parent's child_weight. 1337 */ 1338 cfq_update_group_leaf_weight(cfqg); 1339 __cfq_group_service_tree_add(st, cfqg); 1340 1341 /* 1342 * Activate @cfqg and calculate the portion of vfraction @cfqg is 1343 * entitled to. vfraction is calculated by walking the tree 1344 * towards the root calculating the fraction it has at each level. 1345 * The compounded ratio is how much vfraction @cfqg owns. 1346 * 1347 * Start with the proportion tasks in this cfqg has against active 1348 * children cfqgs - its leaf_weight against children_weight. 1349 */ 1350 propagate = !pos->nr_active++; 1351 pos->children_weight += pos->leaf_weight; 1352 vfr = vfr * pos->leaf_weight / pos->children_weight; 1353 1354 /* 1355 * Compound ->weight walking up the tree. Both activation and 1356 * vfraction calculation are done in the same loop. Propagation 1357 * stops once an already activated node is met. vfraction 1358 * calculation should always continue to the root. 
1359 */ 1360 while ((parent = cfqg_parent(pos))) { 1361 if (propagate) { 1362 cfq_update_group_weight(pos); 1363 propagate = !parent->nr_active++; 1364 parent->children_weight += pos->weight; 1365 } 1366 vfr = vfr * pos->weight / parent->children_weight; 1367 pos = parent; 1368 } 1369 1370 cfqg->vfraction = max_t(unsigned, vfr, 1); 1371} 1372 1373static void 1374cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg) 1375{ 1376 struct cfq_rb_root *st = &cfqd->grp_service_tree; 1377 struct cfq_group *__cfqg; 1378 struct rb_node *n; 1379 1380 cfqg->nr_cfqq++; 1381 if (!RB_EMPTY_NODE(&cfqg->rb_node)) 1382 return; 1383 1384 /* 1385 * Currently put the group at the end. Later implement something 1386 * so that groups get lesser vtime based on their weights, so that 1387 * if group does not loose all if it was not continuously backlogged. 1388 */ 1389 n = rb_last(&st->rb); 1390 if (n) { 1391 __cfqg = rb_entry_cfqg(n); 1392 cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY; 1393 } else 1394 cfqg->vdisktime = st->min_vdisktime; 1395 cfq_group_service_tree_add(st, cfqg); 1396} 1397 1398static void 1399cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg) 1400{ 1401 struct cfq_group *pos = cfqg; 1402 bool propagate; 1403 1404 /* 1405 * Undo activation from cfq_group_service_tree_add(). Deactivate 1406 * @cfqg and propagate deactivation upwards. 1407 */ 1408 propagate = !--pos->nr_active; 1409 pos->children_weight -= pos->leaf_weight; 1410 1411 while (propagate) { 1412 struct cfq_group *parent = cfqg_parent(pos); 1413 1414 /* @pos has 0 nr_active at this point */ 1415 WARN_ON_ONCE(pos->children_weight); 1416 pos->vfraction = 0; 1417 1418 if (!parent) 1419 break; 1420 1421 propagate = !--parent->nr_active; 1422 parent->children_weight -= pos->weight; 1423 pos = parent; 1424 } 1425 1426 /* remove from the service tree */ 1427 if (!RB_EMPTY_NODE(&cfqg->rb_node)) 1428 cfq_rb_erase(&cfqg->rb_node, st); 1429} 1430 1431static void 1432cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg) 1433{ 1434 struct cfq_rb_root *st = &cfqd->grp_service_tree; 1435 1436 BUG_ON(cfqg->nr_cfqq < 1); 1437 cfqg->nr_cfqq--; 1438 1439 /* If there are other cfq queues under this group, don't delete it */ 1440 if (cfqg->nr_cfqq) 1441 return; 1442 1443 cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); 1444 cfq_group_service_tree_del(st, cfqg); 1445 cfqg->saved_wl_slice = 0; 1446 cfqg_stats_update_dequeue(cfqg); 1447} 1448 1449static inline u64 cfq_cfqq_slice_usage(struct cfq_queue *cfqq, 1450 u64 *unaccounted_time) 1451{ 1452 u64 slice_used; 1453 u64 now = ktime_get_ns(); 1454 1455 /* 1456 * Queue got expired before even a single request completed or 1457 * got expired immediately after first request completion. 1458 */ 1459 if (!cfqq->slice_start || cfqq->slice_start == now) { 1460 /* 1461 * Also charge the seek time incurred to the group, otherwise 1462 * if there are mutiple queues in the group, each can dispatch 1463 * a single request on seeky media and cause lots of seek time 1464 * and group will never know it. 
1465 */ 1466 slice_used = max_t(u64, (now - cfqq->dispatch_start), 1467 jiffies_to_nsecs(1)); 1468 } else { 1469 slice_used = now - cfqq->slice_start; 1470 if (slice_used > cfqq->allocated_slice) { 1471 *unaccounted_time = slice_used - cfqq->allocated_slice; 1472 slice_used = cfqq->allocated_slice; 1473 } 1474 if (cfqq->slice_start > cfqq->dispatch_start) 1475 *unaccounted_time += cfqq->slice_start - 1476 cfqq->dispatch_start; 1477 } 1478 1479 return slice_used; 1480} 1481 1482static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, 1483 struct cfq_queue *cfqq) 1484{ 1485 struct cfq_rb_root *st = &cfqd->grp_service_tree; 1486 u64 used_sl, charge, unaccounted_sl = 0; 1487 int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) 1488 - cfqg->service_tree_idle.count; 1489 unsigned int vfr; 1490 u64 now = ktime_get_ns(); 1491 1492 BUG_ON(nr_sync < 0); 1493 used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl); 1494 1495 if (iops_mode(cfqd)) 1496 charge = cfqq->slice_dispatch; 1497 else if (!cfq_cfqq_sync(cfqq) && !nr_sync) 1498 charge = cfqq->allocated_slice; 1499 1500 /* 1501 * Can't update vdisktime while on service tree and cfqg->vfraction 1502 * is valid only while on it. Cache vfr, leave the service tree, 1503 * update vdisktime and go back on. The re-addition to the tree 1504 * will also update the weights as necessary. 1505 */ 1506 vfr = cfqg->vfraction; 1507 cfq_group_service_tree_del(st, cfqg); 1508 cfqg->vdisktime += cfqg_scale_charge(charge, vfr); 1509 cfq_group_service_tree_add(st, cfqg); 1510 1511 /* This group is being expired. Save the context */ 1512 if (cfqd->workload_expires > now) { 1513 cfqg->saved_wl_slice = cfqd->workload_expires - now; 1514 cfqg->saved_wl_type = cfqd->serving_wl_type; 1515 cfqg->saved_wl_class = cfqd->serving_wl_class; 1516 } else 1517 cfqg->saved_wl_slice = 0; 1518 1519 cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, 1520 st->min_vdisktime); 1521 cfq_log_cfqq(cfqq->cfqd, cfqq, 1522 "sl_used=%llu disp=%llu charge=%llu iops=%u sect=%lu", 1523 used_sl, cfqq->slice_dispatch, charge, 1524 iops_mode(cfqd), cfqq->nr_sectors); 1525 cfqg_stats_update_timeslice_used(cfqg, used_sl, unaccounted_sl); 1526 cfqg_stats_set_start_empty_time(cfqg); 1527} 1528 1529/** 1530 * cfq_init_cfqg_base - initialize base part of a cfq_group 1531 * @cfqg: cfq_group to initialize 1532 * 1533 * Initialize the base part which is used whether %CONFIG_CFQ_GROUP_IOSCHED 1534 * is enabled or not. 
1535 */ 1536static void cfq_init_cfqg_base(struct cfq_group *cfqg) 1537{ 1538 struct cfq_rb_root *st; 1539 int i, j; 1540 1541 for_each_cfqg_st(cfqg, i, j, st) 1542 *st = CFQ_RB_ROOT; 1543 RB_CLEAR_NODE(&cfqg->rb_node); 1544 1545 cfqg->ttime.last_end_request = ktime_get_ns(); 1546} 1547 1548#ifdef CONFIG_CFQ_GROUP_IOSCHED 1549static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val, 1550 bool on_dfl, bool reset_dev, bool is_leaf_weight); 1551 1552static void cfqg_stats_exit(struct cfqg_stats *stats) 1553{ 1554 blkg_rwstat_exit(&stats->merged); 1555 blkg_rwstat_exit(&stats->service_time); 1556 blkg_rwstat_exit(&stats->wait_time); 1557 blkg_rwstat_exit(&stats->queued); 1558 blkg_stat_exit(&stats->time); 1559#ifdef CONFIG_DEBUG_BLK_CGROUP 1560 blkg_stat_exit(&stats->unaccounted_time); 1561 blkg_stat_exit(&stats->avg_queue_size_sum); 1562 blkg_stat_exit(&stats->avg_queue_size_samples); 1563 blkg_stat_exit(&stats->dequeue); 1564 blkg_stat_exit(&stats->group_wait_time); 1565 blkg_stat_exit(&stats->idle_time); 1566 blkg_stat_exit(&stats->empty_time); 1567#endif 1568} 1569 1570static int cfqg_stats_init(struct cfqg_stats *stats, gfp_t gfp) 1571{ 1572 if (blkg_rwstat_init(&stats->merged, gfp) || 1573 blkg_rwstat_init(&stats->service_time, gfp) || 1574 blkg_rwstat_init(&stats->wait_time, gfp) || 1575 blkg_rwstat_init(&stats->queued, gfp) || 1576 blkg_stat_init(&stats->time, gfp)) 1577 goto err; 1578 1579#ifdef CONFIG_DEBUG_BLK_CGROUP 1580 if (blkg_stat_init(&stats->unaccounted_time, gfp) || 1581 blkg_stat_init(&stats->avg_queue_size_sum, gfp) || 1582 blkg_stat_init(&stats->avg_queue_size_samples, gfp) || 1583 blkg_stat_init(&stats->dequeue, gfp) || 1584 blkg_stat_init(&stats->group_wait_time, gfp) || 1585 blkg_stat_init(&stats->idle_time, gfp) || 1586 blkg_stat_init(&stats->empty_time, gfp)) 1587 goto err; 1588#endif 1589 return 0; 1590err: 1591 cfqg_stats_exit(stats); 1592 return -ENOMEM; 1593} 1594 1595static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp) 1596{ 1597 struct cfq_group_data *cgd; 1598 1599 cgd = kzalloc(sizeof(*cgd), GFP_KERNEL); 1600 if (!cgd) 1601 return NULL; 1602 return &cgd->cpd; 1603} 1604 1605static void cfq_cpd_init(struct blkcg_policy_data *cpd) 1606{ 1607 struct cfq_group_data *cgd = cpd_to_cfqgd(cpd); 1608 unsigned int weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ? 1609 CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL; 1610 1611 if (cpd_to_blkcg(cpd) == &blkcg_root) 1612 weight *= 2; 1613 1614 cgd->weight = weight; 1615 cgd->leaf_weight = weight; 1616} 1617 1618static void cfq_cpd_free(struct blkcg_policy_data *cpd) 1619{ 1620 kfree(cpd_to_cfqgd(cpd)); 1621} 1622 1623static void cfq_cpd_bind(struct blkcg_policy_data *cpd) 1624{ 1625 struct blkcg *blkcg = cpd_to_blkcg(cpd); 1626 bool on_dfl = cgroup_subsys_on_dfl(io_cgrp_subsys); 1627 unsigned int weight = on_dfl ? 
CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL; 1628 1629 if (blkcg == &blkcg_root) 1630 weight *= 2; 1631 1632 WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, false)); 1633 WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, true)); 1634} 1635 1636static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node) 1637{ 1638 struct cfq_group *cfqg; 1639 1640 cfqg = kzalloc_node(sizeof(*cfqg), gfp, node); 1641 if (!cfqg) 1642 return NULL; 1643 1644 cfq_init_cfqg_base(cfqg); 1645 if (cfqg_stats_init(&cfqg->stats, gfp)) { 1646 kfree(cfqg); 1647 return NULL; 1648 } 1649 1650 return &cfqg->pd; 1651} 1652 1653static void cfq_pd_init(struct blkg_policy_data *pd) 1654{ 1655 struct cfq_group *cfqg = pd_to_cfqg(pd); 1656 struct cfq_group_data *cgd = blkcg_to_cfqgd(pd->blkg->blkcg); 1657 1658 cfqg->weight = cgd->weight; 1659 cfqg->leaf_weight = cgd->leaf_weight; 1660} 1661 1662static void cfq_pd_offline(struct blkg_policy_data *pd) 1663{ 1664 struct cfq_group *cfqg = pd_to_cfqg(pd); 1665 int i; 1666 1667 for (i = 0; i < IOPRIO_BE_NR; i++) { 1668 if (cfqg->async_cfqq[0][i]) 1669 cfq_put_queue(cfqg->async_cfqq[0][i]); 1670 if (cfqg->async_cfqq[1][i]) 1671 cfq_put_queue(cfqg->async_cfqq[1][i]); 1672 } 1673 1674 if (cfqg->async_idle_cfqq) 1675 cfq_put_queue(cfqg->async_idle_cfqq); 1676 1677 /* 1678 * @blkg is going offline and will be ignored by 1679 * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so 1680 * that they don't get lost. If IOs complete after this point, the 1681 * stats for them will be lost. Oh well... 1682 */ 1683 cfqg_stats_xfer_dead(cfqg); 1684} 1685 1686static void cfq_pd_free(struct blkg_policy_data *pd) 1687{ 1688 struct cfq_group *cfqg = pd_to_cfqg(pd); 1689 1690 cfqg_stats_exit(&cfqg->stats); 1691 return kfree(cfqg); 1692} 1693 1694static void cfq_pd_reset_stats(struct blkg_policy_data *pd) 1695{ 1696 struct cfq_group *cfqg = pd_to_cfqg(pd); 1697 1698 cfqg_stats_reset(&cfqg->stats); 1699} 1700 1701static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd, 1702 struct blkcg *blkcg) 1703{ 1704 struct blkcg_gq *blkg; 1705 1706 blkg = blkg_lookup(blkcg, cfqd->queue); 1707 if (likely(blkg)) 1708 return blkg_to_cfqg(blkg); 1709 return NULL; 1710} 1711 1712static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) 1713{ 1714 cfqq->cfqg = cfqg; 1715 /* cfqq reference on cfqg */ 1716 cfqg_get(cfqg); 1717} 1718 1719static u64 cfqg_prfill_weight_device(struct seq_file *sf, 1720 struct blkg_policy_data *pd, int off) 1721{ 1722 struct cfq_group *cfqg = pd_to_cfqg(pd); 1723 1724 if (!cfqg->dev_weight) 1725 return 0; 1726 return __blkg_prfill_u64(sf, pd, cfqg->dev_weight); 1727} 1728 1729static int cfqg_print_weight_device(struct seq_file *sf, void *v) 1730{ 1731 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), 1732 cfqg_prfill_weight_device, &blkcg_policy_cfq, 1733 0, false); 1734 return 0; 1735} 1736 1737static u64 cfqg_prfill_leaf_weight_device(struct seq_file *sf, 1738 struct blkg_policy_data *pd, int off) 1739{ 1740 struct cfq_group *cfqg = pd_to_cfqg(pd); 1741 1742 if (!cfqg->dev_leaf_weight) 1743 return 0; 1744 return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight); 1745} 1746 1747static int cfqg_print_leaf_weight_device(struct seq_file *sf, void *v) 1748{ 1749 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), 1750 cfqg_prfill_leaf_weight_device, &blkcg_policy_cfq, 1751 0, false); 1752 return 0; 1753} 1754 1755static int cfq_print_weight(struct seq_file *sf, void *v) 1756{ 1757 struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); 
1758 struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg); 1759 unsigned int val = 0; 1760 1761 if (cgd) 1762 val = cgd->weight; 1763 1764 seq_printf(sf, "%u\n", val); 1765 return 0; 1766} 1767 1768static int cfq_print_leaf_weight(struct seq_file *sf, void *v) 1769{ 1770 struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); 1771 struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg); 1772 unsigned int val = 0; 1773 1774 if (cgd) 1775 val = cgd->leaf_weight; 1776 1777 seq_printf(sf, "%u\n", val); 1778 return 0; 1779} 1780 1781static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of, 1782 char *buf, size_t nbytes, loff_t off, 1783 bool on_dfl, bool is_leaf_weight) 1784{ 1785 unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN; 1786 unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX; 1787 struct blkcg *blkcg = css_to_blkcg(of_css(of)); 1788 struct blkg_conf_ctx ctx; 1789 struct cfq_group *cfqg; 1790 struct cfq_group_data *cfqgd; 1791 int ret; 1792 u64 v; 1793 1794 ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx); 1795 if (ret) 1796 return ret; 1797 1798 if (sscanf(ctx.body, "%llu", &v) == 1) { 1799 /* require "default" on dfl */ 1800 ret = -ERANGE; 1801 if (!v && on_dfl) 1802 goto out_finish; 1803 } else if (!strcmp(strim(ctx.body), "default")) { 1804 v = 0; 1805 } else { 1806 ret = -EINVAL; 1807 goto out_finish; 1808 } 1809 1810 cfqg = blkg_to_cfqg(ctx.blkg); 1811 cfqgd = blkcg_to_cfqgd(blkcg); 1812 1813 ret = -ERANGE; 1814 if (!v || (v >= min && v <= max)) { 1815 if (!is_leaf_weight) { 1816 cfqg->dev_weight = v; 1817 cfqg->new_weight = v ?: cfqgd->weight; 1818 } else { 1819 cfqg->dev_leaf_weight = v; 1820 cfqg->new_leaf_weight = v ?: cfqgd->leaf_weight; 1821 } 1822 ret = 0; 1823 } 1824out_finish: 1825 blkg_conf_finish(&ctx); 1826 return ret ?: nbytes; 1827} 1828 1829static ssize_t cfqg_set_weight_device(struct kernfs_open_file *of, 1830 char *buf, size_t nbytes, loff_t off) 1831{ 1832 return __cfqg_set_weight_device(of, buf, nbytes, off, false, false); 1833} 1834 1835static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of, 1836 char *buf, size_t nbytes, loff_t off) 1837{ 1838 return __cfqg_set_weight_device(of, buf, nbytes, off, false, true); 1839} 1840 1841static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val, 1842 bool on_dfl, bool reset_dev, bool is_leaf_weight) 1843{ 1844 unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN; 1845 unsigned int max = on_dfl ? 
CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX; 1846 struct blkcg *blkcg = css_to_blkcg(css); 1847 struct blkcg_gq *blkg; 1848 struct cfq_group_data *cfqgd; 1849 int ret = 0; 1850 1851 if (val < min || val > max) 1852 return -ERANGE; 1853 1854 spin_lock_irq(&blkcg->lock); 1855 cfqgd = blkcg_to_cfqgd(blkcg); 1856 if (!cfqgd) { 1857 ret = -EINVAL; 1858 goto out; 1859 } 1860 1861 if (!is_leaf_weight) 1862 cfqgd->weight = val; 1863 else 1864 cfqgd->leaf_weight = val; 1865 1866 hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { 1867 struct cfq_group *cfqg = blkg_to_cfqg(blkg); 1868 1869 if (!cfqg) 1870 continue; 1871 1872 if (!is_leaf_weight) { 1873 if (reset_dev) 1874 cfqg->dev_weight = 0; 1875 if (!cfqg->dev_weight) 1876 cfqg->new_weight = cfqgd->weight; 1877 } else { 1878 if (reset_dev) 1879 cfqg->dev_leaf_weight = 0; 1880 if (!cfqg->dev_leaf_weight) 1881 cfqg->new_leaf_weight = cfqgd->leaf_weight; 1882 } 1883 } 1884 1885out: 1886 spin_unlock_irq(&blkcg->lock); 1887 return ret; 1888} 1889 1890static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, 1891 u64 val) 1892{ 1893 return __cfq_set_weight(css, val, false, false, false); 1894} 1895 1896static int cfq_set_leaf_weight(struct cgroup_subsys_state *css, 1897 struct cftype *cft, u64 val) 1898{ 1899 return __cfq_set_weight(css, val, false, false, true); 1900} 1901 1902static int cfqg_print_stat(struct seq_file *sf, void *v) 1903{ 1904 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, 1905 &blkcg_policy_cfq, seq_cft(sf)->private, false); 1906 return 0; 1907} 1908 1909static int cfqg_print_rwstat(struct seq_file *sf, void *v) 1910{ 1911 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, 1912 &blkcg_policy_cfq, seq_cft(sf)->private, true); 1913 return 0; 1914} 1915 1916static u64 cfqg_prfill_stat_recursive(struct seq_file *sf, 1917 struct blkg_policy_data *pd, int off) 1918{ 1919 u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd), 1920 &blkcg_policy_cfq, off); 1921 return __blkg_prfill_u64(sf, pd, sum); 1922} 1923 1924static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf, 1925 struct blkg_policy_data *pd, int off) 1926{ 1927 struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd), 1928 &blkcg_policy_cfq, off); 1929 return __blkg_prfill_rwstat(sf, pd, &sum); 1930} 1931 1932static int cfqg_print_stat_recursive(struct seq_file *sf, void *v) 1933{ 1934 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), 1935 cfqg_prfill_stat_recursive, &blkcg_policy_cfq, 1936 seq_cft(sf)->private, false); 1937 return 0; 1938} 1939 1940static int cfqg_print_rwstat_recursive(struct seq_file *sf, void *v) 1941{ 1942 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), 1943 cfqg_prfill_rwstat_recursive, &blkcg_policy_cfq, 1944 seq_cft(sf)->private, true); 1945 return 0; 1946} 1947 1948static u64 cfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd, 1949 int off) 1950{ 1951 u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes); 1952 1953 return __blkg_prfill_u64(sf, pd, sum >> 9); 1954} 1955 1956static int cfqg_print_stat_sectors(struct seq_file *sf, void *v) 1957{ 1958 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), 1959 cfqg_prfill_sectors, &blkcg_policy_cfq, 0, false); 1960 return 0; 1961} 1962 1963static u64 cfqg_prfill_sectors_recursive(struct seq_file *sf, 1964 struct blkg_policy_data *pd, int off) 1965{ 1966 struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL, 1967 offsetof(struct blkcg_gq, stat_bytes)); 1968 u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) + 
1969 atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]); 1970 1971 return __blkg_prfill_u64(sf, pd, sum >> 9); 1972} 1973 1974static int cfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v) 1975{ 1976 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), 1977 cfqg_prfill_sectors_recursive, &blkcg_policy_cfq, 0, 1978 false); 1979 return 0; 1980} 1981 1982#ifdef CONFIG_DEBUG_BLK_CGROUP 1983static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf, 1984 struct blkg_policy_data *pd, int off) 1985{ 1986 struct cfq_group *cfqg = pd_to_cfqg(pd); 1987 u64 samples = blkg_stat_read(&cfqg->stats.avg_queue_size_samples); 1988 u64 v = 0; 1989 1990 if (samples) { 1991 v = blkg_stat_read(&cfqg->stats.avg_queue_size_sum); 1992 v = div64_u64(v, samples); 1993 } 1994 __blkg_prfill_u64(sf, pd, v); 1995 return 0; 1996} 1997 1998/* print avg_queue_size */ 1999static int cfqg_print_avg_queue_size(struct seq_file *sf, void *v) 2000{ 2001 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), 2002 cfqg_prfill_avg_queue_size, &blkcg_policy_cfq, 2003 0, false); 2004 return 0; 2005} 2006#endif /* CONFIG_DEBUG_BLK_CGROUP */ 2007 2008static struct cftype cfq_blkcg_legacy_files[] = { 2009 /* on root, weight is mapped to leaf_weight */ 2010 { 2011 .name = "weight_device", 2012 .flags = CFTYPE_ONLY_ON_ROOT, 2013 .seq_show = cfqg_print_leaf_weight_device, 2014 .write = cfqg_set_leaf_weight_device, 2015 }, 2016 { 2017 .name = "weight", 2018 .flags = CFTYPE_ONLY_ON_ROOT, 2019 .seq_show = cfq_print_leaf_weight, 2020 .write_u64 = cfq_set_leaf_weight, 2021 }, 2022 2023 /* no such mapping necessary for !roots */ 2024 { 2025 .name = "weight_device", 2026 .flags = CFTYPE_NOT_ON_ROOT, 2027 .seq_show = cfqg_print_weight_device, 2028 .write = cfqg_set_weight_device, 2029 }, 2030 { 2031 .name = "weight", 2032 .flags = CFTYPE_NOT_ON_ROOT, 2033 .seq_show = cfq_print_weight, 2034 .write_u64 = cfq_set_weight, 2035 }, 2036 2037 { 2038 .name = "leaf_weight_device", 2039 .seq_show = cfqg_print_leaf_weight_device, 2040 .write = cfqg_set_leaf_weight_device, 2041 }, 2042 { 2043 .name = "leaf_weight", 2044 .seq_show = cfq_print_leaf_weight, 2045 .write_u64 = cfq_set_leaf_weight, 2046 }, 2047 2048 /* statistics, covers only the tasks in the cfqg */ 2049 { 2050 .name = "time", 2051 .private = offsetof(struct cfq_group, stats.time), 2052 .seq_show = cfqg_print_stat, 2053 }, 2054 { 2055 .name = "sectors", 2056 .seq_show = cfqg_print_stat_sectors, 2057 }, 2058 { 2059 .name = "io_service_bytes", 2060 .private = (unsigned long)&blkcg_policy_cfq, 2061 .seq_show = blkg_print_stat_bytes, 2062 }, 2063 { 2064 .name = "io_serviced", 2065 .private = (unsigned long)&blkcg_policy_cfq, 2066 .seq_show = blkg_print_stat_ios, 2067 }, 2068 { 2069 .name = "io_service_time", 2070 .private = offsetof(struct cfq_group, stats.service_time), 2071 .seq_show = cfqg_print_rwstat, 2072 }, 2073 { 2074 .name = "io_wait_time", 2075 .private = offsetof(struct cfq_group, stats.wait_time), 2076 .seq_show = cfqg_print_rwstat, 2077 }, 2078 { 2079 .name = "io_merged", 2080 .private = offsetof(struct cfq_group, stats.merged), 2081 .seq_show = cfqg_print_rwstat, 2082 }, 2083 { 2084 .name = "io_queued", 2085 .private = offsetof(struct cfq_group, stats.queued), 2086 .seq_show = cfqg_print_rwstat, 2087 }, 2088 2089 /* the same statictics which cover the cfqg and its descendants */ 2090 { 2091 .name = "time_recursive", 2092 .private = offsetof(struct cfq_group, stats.time), 2093 .seq_show = cfqg_print_stat_recursive, 2094 }, 2095 { 2096 .name = "sectors_recursive", 2097 .seq_show = 
cfqg_print_stat_sectors_recursive, 2098 }, 2099 { 2100 .name = "io_service_bytes_recursive", 2101 .private = (unsigned long)&blkcg_policy_cfq, 2102 .seq_show = blkg_print_stat_bytes_recursive, 2103 }, 2104 { 2105 .name = "io_serviced_recursive", 2106 .private = (unsigned long)&blkcg_policy_cfq, 2107 .seq_show = blkg_print_stat_ios_recursive, 2108 }, 2109 { 2110 .name = "io_service_time_recursive", 2111 .private = offsetof(struct cfq_group, stats.service_time), 2112 .seq_show = cfqg_print_rwstat_recursive, 2113 }, 2114 { 2115 .name = "io_wait_time_recursive", 2116 .private = offsetof(struct cfq_group, stats.wait_time), 2117 .seq_show = cfqg_print_rwstat_recursive, 2118 }, 2119 { 2120 .name = "io_merged_recursive", 2121 .private = offsetof(struct cfq_group, stats.merged), 2122 .seq_show = cfqg_print_rwstat_recursive, 2123 }, 2124 { 2125 .name = "io_queued_recursive", 2126 .private = offsetof(struct cfq_group, stats.queued), 2127 .seq_show = cfqg_print_rwstat_recursive, 2128 }, 2129#ifdef CONFIG_DEBUG_BLK_CGROUP 2130 { 2131 .name = "avg_queue_size", 2132 .seq_show = cfqg_print_avg_queue_size, 2133 }, 2134 { 2135 .name = "group_wait_time", 2136 .private = offsetof(struct cfq_group, stats.group_wait_time), 2137 .seq_show = cfqg_print_stat, 2138 }, 2139 { 2140 .name = "idle_time", 2141 .private = offsetof(struct cfq_group, stats.idle_time), 2142 .seq_show = cfqg_print_stat, 2143 }, 2144 { 2145 .name = "empty_time", 2146 .private = offsetof(struct cfq_group, stats.empty_time), 2147 .seq_show = cfqg_print_stat, 2148 }, 2149 { 2150 .name = "dequeue", 2151 .private = offsetof(struct cfq_group, stats.dequeue), 2152 .seq_show = cfqg_print_stat, 2153 }, 2154 { 2155 .name = "unaccounted_time", 2156 .private = offsetof(struct cfq_group, stats.unaccounted_time), 2157 .seq_show = cfqg_print_stat, 2158 }, 2159#endif /* CONFIG_DEBUG_BLK_CGROUP */ 2160 { } /* terminate */ 2161}; 2162 2163static int cfq_print_weight_on_dfl(struct seq_file *sf, void *v) 2164{ 2165 struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); 2166 struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg); 2167 2168 seq_printf(sf, "default %u\n", cgd->weight); 2169 blkcg_print_blkgs(sf, blkcg, cfqg_prfill_weight_device, 2170 &blkcg_policy_cfq, 0, false); 2171 return 0; 2172} 2173 2174static ssize_t cfq_set_weight_on_dfl(struct kernfs_open_file *of, 2175 char *buf, size_t nbytes, loff_t off) 2176{ 2177 char *endp; 2178 int ret; 2179 u64 v; 2180 2181 buf = strim(buf); 2182 2183 /* "WEIGHT" or "default WEIGHT" sets the default weight */ 2184 v = simple_strtoull(buf, &endp, 0); 2185 if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) { 2186 ret = __cfq_set_weight(of_css(of), v, true, false, false); 2187 return ret ?: nbytes; 2188 } 2189 2190 /* "MAJ:MIN WEIGHT" */ 2191 return __cfqg_set_weight_device(of, buf, nbytes, off, true, false); 2192} 2193 2194static struct cftype cfq_blkcg_files[] = { 2195 { 2196 .name = "weight", 2197 .flags = CFTYPE_NOT_ON_ROOT, 2198 .seq_show = cfq_print_weight_on_dfl, 2199 .write = cfq_set_weight_on_dfl, 2200 }, 2201 { } /* terminate */ 2202}; 2203 2204#else /* GROUP_IOSCHED */ 2205static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd, 2206 struct blkcg *blkcg) 2207{ 2208 return cfqd->root_group; 2209} 2210 2211static inline void 2212cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { 2213 cfqq->cfqg = cfqg; 2214} 2215 2216#endif /* GROUP_IOSCHED */ 2217 2218/* 2219 * The cfqd->service_trees holds all pending cfq_queue's that have 2220 * requests waiting to be processed. 
It is sorted in the order that 2221 * we will service the queues. 2222 */ 2223static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, 2224 bool add_front) 2225{ 2226 struct rb_node **p, *parent; 2227 struct cfq_queue *__cfqq; 2228 u64 rb_key; 2229 struct cfq_rb_root *st; 2230 int left; 2231 int new_cfqq = 1; 2232 u64 now = ktime_get_ns(); 2233 2234 st = st_for(cfqq->cfqg, cfqq_class(cfqq), cfqq_type(cfqq)); 2235 if (cfq_class_idle(cfqq)) { 2236 rb_key = CFQ_IDLE_DELAY; 2237 parent = rb_last(&st->rb); 2238 if (parent && parent != &cfqq->rb_node) { 2239 __cfqq = rb_entry(parent, struct cfq_queue, rb_node); 2240 rb_key += __cfqq->rb_key; 2241 } else 2242 rb_key += now; 2243 } else if (!add_front) { 2244 /* 2245 * Get our rb key offset. Subtract any residual slice 2246 * value carried from last service. A negative resid 2247 * count indicates slice overrun, and this should position 2248 * the next service time further away in the tree. 2249 */ 2250 rb_key = cfq_slice_offset(cfqd, cfqq) + now; 2251 rb_key -= cfqq->slice_resid; 2252 cfqq->slice_resid = 0; 2253 } else { 2254 rb_key = -NSEC_PER_SEC; 2255 __cfqq = cfq_rb_first(st); 2256 rb_key += __cfqq ? __cfqq->rb_key : now; 2257 } 2258 2259 if (!RB_EMPTY_NODE(&cfqq->rb_node)) { 2260 new_cfqq = 0; 2261 /* 2262 * same position, nothing more to do 2263 */ 2264 if (rb_key == cfqq->rb_key && cfqq->service_tree == st) 2265 return; 2266 2267 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree); 2268 cfqq->service_tree = NULL; 2269 } 2270 2271 left = 1; 2272 parent = NULL; 2273 cfqq->service_tree = st; 2274 p = &st->rb.rb_node; 2275 while (*p) { 2276 parent = *p; 2277 __cfqq = rb_entry(parent, struct cfq_queue, rb_node); 2278 2279 /* 2280 * sort by key, that represents service time. 2281 */ 2282 if (rb_key < __cfqq->rb_key) 2283 p = &parent->rb_left; 2284 else { 2285 p = &parent->rb_right; 2286 left = 0; 2287 } 2288 } 2289 2290 if (left) 2291 st->left = &cfqq->rb_node; 2292 2293 cfqq->rb_key = rb_key; 2294 rb_link_node(&cfqq->rb_node, parent, p); 2295 rb_insert_color(&cfqq->rb_node, &st->rb); 2296 st->count++; 2297 if (add_front || !new_cfqq) 2298 return; 2299 cfq_group_notify_queue_add(cfqd, cfqq->cfqg); 2300} 2301 2302static struct cfq_queue * 2303cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root, 2304 sector_t sector, struct rb_node **ret_parent, 2305 struct rb_node ***rb_link) 2306{ 2307 struct rb_node **p, *parent; 2308 struct cfq_queue *cfqq = NULL; 2309 2310 parent = NULL; 2311 p = &root->rb_node; 2312 while (*p) { 2313 struct rb_node **n; 2314 2315 parent = *p; 2316 cfqq = rb_entry(parent, struct cfq_queue, p_node); 2317 2318 /* 2319 * Sort strictly based on sector. Smallest to the left, 2320 * largest to the right. 
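 * The walk below follows blk_rq_pos(cfqq->next_rq) at each node: it ends
 * either at an exact sector match (that cfqq is returned) or at a NULL
 * child, in which case *ret_parent and *rb_link describe the insert point.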
2321 */ 2322 if (sector > blk_rq_pos(cfqq->next_rq)) 2323 n = &(*p)->rb_right; 2324 else if (sector < blk_rq_pos(cfqq->next_rq)) 2325 n = &(*p)->rb_left; 2326 else 2327 break; 2328 p = n; 2329 cfqq = NULL; 2330 } 2331 2332 *ret_parent = parent; 2333 if (rb_link) 2334 *rb_link = p; 2335 return cfqq; 2336} 2337 2338static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq) 2339{ 2340 struct rb_node **p, *parent; 2341 struct cfq_queue *__cfqq; 2342 2343 if (cfqq->p_root) { 2344 rb_erase(&cfqq->p_node, cfqq->p_root); 2345 cfqq->p_root = NULL; 2346 } 2347 2348 if (cfq_class_idle(cfqq)) 2349 return; 2350 if (!cfqq->next_rq) 2351 return; 2352 2353 cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio]; 2354 __cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root, 2355 blk_rq_pos(cfqq->next_rq), &parent, &p); 2356 if (!__cfqq) { 2357 rb_link_node(&cfqq->p_node, parent, p); 2358 rb_insert_color(&cfqq->p_node, cfqq->p_root); 2359 } else 2360 cfqq->p_root = NULL; 2361} 2362 2363/* 2364 * Update cfqq's position in the service tree. 2365 */ 2366static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq) 2367{ 2368 /* 2369 * Resorting requires the cfqq to be on the RR list already. 2370 */ 2371 if (cfq_cfqq_on_rr(cfqq)) { 2372 cfq_service_tree_add(cfqd, cfqq, 0); 2373 cfq_prio_tree_add(cfqd, cfqq); 2374 } 2375} 2376 2377/* 2378 * add to busy list of queues for service, trying to be fair in ordering 2379 * the pending list according to last request service 2380 */ 2381static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) 2382{ 2383 cfq_log_cfqq(cfqd, cfqq, "add_to_rr"); 2384 BUG_ON(cfq_cfqq_on_rr(cfqq)); 2385 cfq_mark_cfqq_on_rr(cfqq); 2386 cfqd->busy_queues++; 2387 if (cfq_cfqq_sync(cfqq)) 2388 cfqd->busy_sync_queues++; 2389 2390 cfq_resort_rr_list(cfqd, cfqq); 2391} 2392 2393/* 2394 * Called when the cfqq no longer has requests pending, remove it from 2395 * the service tree. 2396 */ 2397static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) 2398{ 2399 cfq_log_cfqq(cfqd, cfqq, "del_from_rr"); 2400 BUG_ON(!cfq_cfqq_on_rr(cfqq)); 2401 cfq_clear_cfqq_on_rr(cfqq); 2402 2403 if (!RB_EMPTY_NODE(&cfqq->rb_node)) { 2404 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree); 2405 cfqq->service_tree = NULL; 2406 } 2407 if (cfqq->p_root) { 2408 rb_erase(&cfqq->p_node, cfqq->p_root); 2409 cfqq->p_root = NULL; 2410 } 2411 2412 cfq_group_notify_queue_del(cfqd, cfqq->cfqg); 2413 BUG_ON(!cfqd->busy_queues); 2414 cfqd->busy_queues--; 2415 if (cfq_cfqq_sync(cfqq)) 2416 cfqd->busy_sync_queues--; 2417} 2418 2419/* 2420 * rb tree support functions 2421 */ 2422static void cfq_del_rq_rb(struct request *rq) 2423{ 2424 struct cfq_queue *cfqq = RQ_CFQQ(rq); 2425 const int sync = rq_is_sync(rq); 2426 2427 BUG_ON(!cfqq->queued[sync]); 2428 cfqq->queued[sync]--; 2429 2430 elv_rb_del(&cfqq->sort_list, rq); 2431 2432 if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) { 2433 /* 2434 * Queue will be deleted from service tree when we actually 2435 * expire it later. Right now just remove it from prio tree 2436 * as it is empty. 
2437 */ 2438 if (cfqq->p_root) { 2439 rb_erase(&cfqq->p_node, cfqq->p_root); 2440 cfqq->p_root = NULL; 2441 } 2442 } 2443} 2444 2445static void cfq_add_rq_rb(struct request *rq) 2446{ 2447 struct cfq_queue *cfqq = RQ_CFQQ(rq); 2448 struct cfq_data *cfqd = cfqq->cfqd; 2449 struct request *prev; 2450 2451 cfqq->queued[rq_is_sync(rq)]++; 2452 2453 elv_rb_add(&cfqq->sort_list, rq); 2454 2455 if (!cfq_cfqq_on_rr(cfqq)) 2456 cfq_add_cfqq_rr(cfqd, cfqq); 2457 2458 /* 2459 * check if this request is a better next-serve candidate 2460 */ 2461 prev = cfqq->next_rq; 2462 cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position); 2463 2464 /* 2465 * adjust priority tree position, if ->next_rq changes 2466 */ 2467 if (prev != cfqq->next_rq) 2468 cfq_prio_tree_add(cfqd, cfqq); 2469 2470 BUG_ON(!cfqq->next_rq); 2471} 2472 2473static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq) 2474{ 2475 elv_rb_del(&cfqq->sort_list, rq); 2476 cfqq->queued[rq_is_sync(rq)]--; 2477 cfqg_stats_update_io_remove(RQ_CFQG(rq), req_op(rq), rq->cmd_flags); 2478 cfq_add_rq_rb(rq); 2479 cfqg_stats_update_io_add(RQ_CFQG(rq), cfqq->cfqd->serving_group, 2480 req_op(rq), rq->cmd_flags); 2481} 2482 2483static struct request * 2484cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio) 2485{ 2486 struct task_struct *tsk = current; 2487 struct cfq_io_cq *cic; 2488 struct cfq_queue *cfqq; 2489 2490 cic = cfq_cic_lookup(cfqd, tsk->io_context); 2491 if (!cic) 2492 return NULL; 2493 2494 cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio)); 2495 if (cfqq) 2496 return elv_rb_find(&cfqq->sort_list, bio_end_sector(bio)); 2497 2498 return NULL; 2499} 2500 2501static void cfq_activate_request(struct request_queue *q, struct request *rq) 2502{ 2503 struct cfq_data *cfqd = q->elevator->elevator_data; 2504 2505 cfqd->rq_in_driver++; 2506 cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d", 2507 cfqd->rq_in_driver); 2508 2509 cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); 2510} 2511 2512static void cfq_deactivate_request(struct request_queue *q, struct request *rq) 2513{ 2514 struct cfq_data *cfqd = q->elevator->elevator_data; 2515 2516 WARN_ON(!cfqd->rq_in_driver); 2517 cfqd->rq_in_driver--; 2518 cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d", 2519 cfqd->rq_in_driver); 2520} 2521 2522static void cfq_remove_request(struct request *rq) 2523{ 2524 struct cfq_queue *cfqq = RQ_CFQQ(rq); 2525 2526 if (cfqq->next_rq == rq) 2527 cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq); 2528 2529 list_del_init(&rq->queuelist); 2530 cfq_del_rq_rb(rq); 2531 2532 cfqq->cfqd->rq_queued--; 2533 cfqg_stats_update_io_remove(RQ_CFQG(rq), req_op(rq), rq->cmd_flags); 2534 if (rq->cmd_flags & REQ_PRIO) { 2535 WARN_ON(!cfqq->prio_pending); 2536 cfqq->prio_pending--; 2537 } 2538} 2539 2540static int cfq_merge(struct request_queue *q, struct request **req, 2541 struct bio *bio) 2542{ 2543 struct cfq_data *cfqd = q->elevator->elevator_data; 2544 struct request *__rq; 2545 2546 __rq = cfq_find_rq_fmerge(cfqd, bio); 2547 if (__rq && elv_bio_merge_ok(__rq, bio)) { 2548 *req = __rq; 2549 return ELEVATOR_FRONT_MERGE; 2550 } 2551 2552 return ELEVATOR_NO_MERGE; 2553} 2554 2555static void cfq_merged_request(struct request_queue *q, struct request *req, 2556 int type) 2557{ 2558 if (type == ELEVATOR_FRONT_MERGE) { 2559 struct cfq_queue *cfqq = RQ_CFQQ(req); 2560 2561 cfq_reposition_rq_rb(cfqq, req); 2562 } 2563} 2564 2565static void cfq_bio_merged(struct request_queue *q, struct request *req, 2566 struct bio *bio) 2567{ 2568 
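	/* a bio was merged into req; charge the merge to req's cfq_group stats */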
cfqg_stats_update_io_merged(RQ_CFQG(req), bio_op(bio), bio->bi_opf); 2569} 2570 2571static void 2572cfq_merged_requests(struct request_queue *q, struct request *rq, 2573 struct request *next) 2574{ 2575 struct cfq_queue *cfqq = RQ_CFQQ(rq); 2576 struct cfq_data *cfqd = q->elevator->elevator_data; 2577 2578 /* 2579 * reposition in fifo if next is older than rq 2580 */ 2581 if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && 2582 next->fifo_time < rq->fifo_time && 2583 cfqq == RQ_CFQQ(next)) { 2584 list_move(&rq->queuelist, &next->queuelist); 2585 rq->fifo_time = next->fifo_time; 2586 } 2587 2588 if (cfqq->next_rq == next) 2589 cfqq->next_rq = rq; 2590 cfq_remove_request(next); 2591 cfqg_stats_update_io_merged(RQ_CFQG(rq), req_op(next), next->cmd_flags); 2592 2593 cfqq = RQ_CFQQ(next); 2594 /* 2595 * all requests of this queue are merged to other queues, delete it 2596 * from the service tree. If it's the active_queue, 2597 * cfq_dispatch_requests() will choose to expire it or do idle 2598 */ 2599 if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list) && 2600 cfqq != cfqd->active_queue) 2601 cfq_del_cfqq_rr(cfqd, cfqq); 2602} 2603 2604static int cfq_allow_bio_merge(struct request_queue *q, struct request *rq, 2605 struct bio *bio) 2606{ 2607 struct cfq_data *cfqd = q->elevator->elevator_data; 2608 struct cfq_io_cq *cic; 2609 struct cfq_queue *cfqq; 2610 2611 /* 2612 * Disallow merge of a sync bio into an async request. 2613 */ 2614 if (cfq_bio_sync(bio) && !rq_is_sync(rq)) 2615 return false; 2616 2617 /* 2618 * Lookup the cfqq that this bio will be queued with and allow 2619 * merge only if rq is queued there. 2620 */ 2621 cic = cfq_cic_lookup(cfqd, current->io_context); 2622 if (!cic) 2623 return false; 2624 2625 cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio)); 2626 return cfqq == RQ_CFQQ(rq); 2627} 2628 2629static int cfq_allow_rq_merge(struct request_queue *q, struct request *rq, 2630 struct request *next) 2631{ 2632 return RQ_CFQQ(rq) == RQ_CFQQ(next); 2633} 2634 2635static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq) 2636{ 2637 hrtimer_try_to_cancel(&cfqd->idle_slice_timer); 2638 cfqg_stats_update_idle_time(cfqq->cfqg); 2639} 2640 2641static void __cfq_set_active_queue(struct cfq_data *cfqd, 2642 struct cfq_queue *cfqq) 2643{ 2644 if (cfqq) { 2645 cfq_log_cfqq(cfqd, cfqq, "set_active wl_class:%d wl_type:%d", 2646 cfqd->serving_wl_class, cfqd->serving_wl_type); 2647 cfqg_stats_update_avg_queue_size(cfqq->cfqg); 2648 cfqq->slice_start = 0; 2649 cfqq->dispatch_start = ktime_get_ns(); 2650 cfqq->allocated_slice = 0; 2651 cfqq->slice_end = 0; 2652 cfqq->slice_dispatch = 0; 2653 cfqq->nr_sectors = 0; 2654 2655 cfq_clear_cfqq_wait_request(cfqq); 2656 cfq_clear_cfqq_must_dispatch(cfqq); 2657 cfq_clear_cfqq_must_alloc_slice(cfqq); 2658 cfq_clear_cfqq_fifo_expire(cfqq); 2659 cfq_mark_cfqq_slice_new(cfqq); 2660 2661 cfq_del_timer(cfqd, cfqq); 2662 } 2663 2664 cfqd->active_queue = cfqq; 2665} 2666 2667/* 2668 * current cfqq expired its slice (or was too idle), select new one 2669 */ 2670static void 2671__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, 2672 bool timed_out) 2673{ 2674 cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out); 2675 2676 if (cfq_cfqq_wait_request(cfqq)) 2677 cfq_del_timer(cfqd, cfqq); 2678 2679 cfq_clear_cfqq_wait_request(cfqq); 2680 cfq_clear_cfqq_wait_busy(cfqq); 2681 2682 /* 2683 * If this cfqq is shared between multiple processes, check to 2684 * make sure that those processes are still issuing I/Os 
within 2685 * the mean seek distance. If not, it may be time to break the 2686 * queues apart again. 2687 */ 2688 if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq)) 2689 cfq_mark_cfqq_split_coop(cfqq); 2690 2691 /* 2692 * store what was left of this slice, if the queue idled/timed out 2693 */ 2694 if (timed_out) { 2695 if (cfq_cfqq_slice_new(cfqq)) 2696 cfqq->slice_resid = cfq_scaled_cfqq_slice(cfqd, cfqq); 2697 else 2698 cfqq->slice_resid = cfqq->slice_end - ktime_get_ns(); 2699 cfq_log_cfqq(cfqd, cfqq, "resid=%lld", cfqq->slice_resid); 2700 } 2701 2702 cfq_group_served(cfqd, cfqq->cfqg, cfqq); 2703 2704 if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) 2705 cfq_del_cfqq_rr(cfqd, cfqq); 2706 2707 cfq_resort_rr_list(cfqd, cfqq); 2708 2709 if (cfqq == cfqd->active_queue) 2710 cfqd->active_queue = NULL; 2711 2712 if (cfqd->active_cic) { 2713 put_io_context(cfqd->active_cic->icq.ioc); 2714 cfqd->active_cic = NULL; 2715 } 2716} 2717 2718static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out) 2719{ 2720 struct cfq_queue *cfqq = cfqd->active_queue; 2721 2722 if (cfqq) 2723 __cfq_slice_expired(cfqd, cfqq, timed_out); 2724} 2725 2726/* 2727 * Get next queue for service. Unless we have a queue preemption, 2728 * we'll simply select the first cfqq in the service tree. 2729 */ 2730static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) 2731{ 2732 struct cfq_rb_root *st = st_for(cfqd->serving_group, 2733 cfqd->serving_wl_class, cfqd->serving_wl_type); 2734 2735 if (!cfqd->rq_queued) 2736 return NULL; 2737 2738 /* There is nothing to dispatch */ 2739 if (!st) 2740 return NULL; 2741 if (RB_EMPTY_ROOT(&st->rb)) 2742 return NULL; 2743 return cfq_rb_first(st); 2744} 2745 2746static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd) 2747{ 2748 struct cfq_group *cfqg; 2749 struct cfq_queue *cfqq; 2750 int i, j; 2751 struct cfq_rb_root *st; 2752 2753 if (!cfqd->rq_queued) 2754 return NULL; 2755 2756 cfqg = cfq_get_next_cfqg(cfqd); 2757 if (!cfqg) 2758 return NULL; 2759 2760 for_each_cfqg_st(cfqg, i, j, st) 2761 if ((cfqq = cfq_rb_first(st)) != NULL) 2762 return cfqq; 2763 return NULL; 2764} 2765 2766/* 2767 * Get and set a new active queue for service. 2768 */ 2769static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd, 2770 struct cfq_queue *cfqq) 2771{ 2772 if (!cfqq) 2773 cfqq = cfq_get_next_queue(cfqd); 2774 2775 __cfq_set_active_queue(cfqd, cfqq); 2776 return cfqq; 2777} 2778 2779static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd, 2780 struct request *rq) 2781{ 2782 if (blk_rq_pos(rq) >= cfqd->last_position) 2783 return blk_rq_pos(rq) - cfqd->last_position; 2784 else 2785 return cfqd->last_position - blk_rq_pos(rq); 2786} 2787 2788static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq, 2789 struct request *rq) 2790{ 2791 return cfq_dist_from_last(cfqd, rq) <= CFQQ_CLOSE_THR; 2792} 2793 2794static struct cfq_queue *cfqq_close(struct cfq_data *cfqd, 2795 struct cfq_queue *cur_cfqq) 2796{ 2797 struct rb_root *root = &cfqd->prio_trees[cur_cfqq->org_ioprio]; 2798 struct rb_node *parent, *node; 2799 struct cfq_queue *__cfqq; 2800 sector_t sector = cfqd->last_position; 2801 2802 if (RB_EMPTY_ROOT(root)) 2803 return NULL; 2804 2805 /* 2806 * First, if we find a request starting at the end of the last 2807 * request, choose it. 
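 * Failing an exact match, the nearest entries in the tree are checked and
 * one is returned only if its next_rq lies within CFQQ_CLOSE_THR of that
 * position.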
2808 */ 2809 __cfqq = cfq_prio_tree_lookup(cfqd, root, sector, &parent, NULL); 2810 if (__cfqq) 2811 return __cfqq; 2812 2813 /* 2814 * If the exact sector wasn't found, the parent of the NULL leaf 2815 * will contain the closest sector. 2816 */ 2817 __cfqq = rb_entry(parent, struct cfq_queue, p_node); 2818 if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq)) 2819 return __cfqq; 2820 2821 if (blk_rq_pos(__cfqq->next_rq) < sector) 2822 node = rb_next(&__cfqq->p_node); 2823 else 2824 node = rb_prev(&__cfqq->p_node); 2825 if (!node) 2826 return NULL; 2827 2828 __cfqq = rb_entry(node, struct cfq_queue, p_node); 2829 if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq)) 2830 return __cfqq; 2831 2832 return NULL; 2833} 2834 2835/* 2836 * cfqd - obvious 2837 * cur_cfqq - passed in so that we don't decide that the current queue is 2838 * closely cooperating with itself. 2839 * 2840 * So, basically we're assuming that that cur_cfqq has dispatched at least 2841 * one request, and that cfqd->last_position reflects a position on the disk 2842 * associated with the I/O issued by cur_cfqq. I'm not sure this is a valid 2843 * assumption. 2844 */ 2845static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, 2846 struct cfq_queue *cur_cfqq) 2847{ 2848 struct cfq_queue *cfqq; 2849 2850 if (cfq_class_idle(cur_cfqq)) 2851 return NULL; 2852 if (!cfq_cfqq_sync(cur_cfqq)) 2853 return NULL; 2854 if (CFQQ_SEEKY(cur_cfqq)) 2855 return NULL; 2856 2857 /* 2858 * Don't search priority tree if it's the only queue in the group. 2859 */ 2860 if (cur_cfqq->cfqg->nr_cfqq == 1) 2861 return NULL; 2862 2863 /* 2864 * We should notice if some of the queues are cooperating, eg 2865 * working closely on the same area of the disk. In that case, 2866 * we can group them together and don't waste time idling. 2867 */ 2868 cfqq = cfqq_close(cfqd, cur_cfqq); 2869 if (!cfqq) 2870 return NULL; 2871 2872 /* If new queue belongs to different cfq_group, don't choose it */ 2873 if (cur_cfqq->cfqg != cfqq->cfqg) 2874 return NULL; 2875 2876 /* 2877 * It only makes sense to merge sync queues. 2878 */ 2879 if (!cfq_cfqq_sync(cfqq)) 2880 return NULL; 2881 if (CFQQ_SEEKY(cfqq)) 2882 return NULL; 2883 2884 /* 2885 * Do not merge queues of different priority classes 2886 */ 2887 if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq)) 2888 return NULL; 2889 2890 return cfqq; 2891} 2892 2893/* 2894 * Determine whether we should enforce idle window for this queue. 2895 */ 2896 2897static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) 2898{ 2899 enum wl_class_t wl_class = cfqq_class(cfqq); 2900 struct cfq_rb_root *st = cfqq->service_tree; 2901 2902 BUG_ON(!st); 2903 BUG_ON(!st->count); 2904 2905 if (!cfqd->cfq_slice_idle) 2906 return false; 2907 2908 /* We never do for idle class queues. */ 2909 if (wl_class == IDLE_WORKLOAD) 2910 return false; 2911 2912 /* We do for queues that were marked with idle window flag. */ 2913 if (cfq_cfqq_idle_window(cfqq) && 2914 !(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag)) 2915 return true; 2916 2917 /* 2918 * Otherwise, we do only if they are the last ones 2919 * in their service tree. 2920 */ 2921 if (st->count == 1 && cfq_cfqq_sync(cfqq) && 2922 !cfq_io_thinktime_big(cfqd, &st->ttime, false)) 2923 return true; 2924 cfq_log_cfqq(cfqd, cfqq, "Not idling. 
st->count:%d", st->count); 2925 return false; 2926} 2927 2928static void cfq_arm_slice_timer(struct cfq_data *cfqd) 2929{ 2930 struct cfq_queue *cfqq = cfqd->active_queue; 2931 struct cfq_rb_root *st = cfqq->service_tree; 2932 struct cfq_io_cq *cic; 2933 u64 sl, group_idle = 0; 2934 u64 now = ktime_get_ns(); 2935 2936 /* 2937 * SSD device without seek penalty, disable idling. But only do so 2938 * for devices that support queuing, otherwise we still have a problem 2939 * with sync vs async workloads. 2940 */ 2941 if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag) 2942 return; 2943 2944 WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list)); 2945 WARN_ON(cfq_cfqq_slice_new(cfqq)); 2946 2947 /* 2948 * idle is disabled, either manually or by past process history 2949 */ 2950 if (!cfq_should_idle(cfqd, cfqq)) { 2951 /* no queue idling. Check for group idling */ 2952 if (cfqd->cfq_group_idle) 2953 group_idle = cfqd->cfq_group_idle; 2954 else 2955 return; 2956 } 2957 2958 /* 2959 * still active requests from this queue, don't idle 2960 */ 2961 if (cfqq->dispatched) 2962 return; 2963 2964 /* 2965 * task has exited, don't wait 2966 */ 2967 cic = cfqd->active_cic; 2968 if (!cic || !atomic_read(&cic->icq.ioc->active_ref)) 2969 return; 2970 2971 /* 2972 * If our average think time is larger than the remaining time 2973 * slice, then don't idle. This avoids overrunning the allotted 2974 * time slice. 2975 */ 2976 if (sample_valid(cic->ttime.ttime_samples) && 2977 (cfqq->slice_end - now < cic->ttime.ttime_mean)) { 2978 cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%llu", 2979 cic->ttime.ttime_mean); 2980 return; 2981 } 2982 2983 /* 2984 * There are other queues in the group or this is the only group and 2985 * it has too big thinktime, don't do group idle. 2986 */ 2987 if (group_idle && 2988 (cfqq->cfqg->nr_cfqq > 1 || 2989 cfq_io_thinktime_big(cfqd, &st->ttime, true))) 2990 return; 2991 2992 cfq_mark_cfqq_wait_request(cfqq); 2993 2994 if (group_idle) 2995 sl = cfqd->cfq_group_idle; 2996 else 2997 sl = cfqd->cfq_slice_idle; 2998 2999 hrtimer_start(&cfqd->idle_slice_timer, ns_to_ktime(sl), 3000 HRTIMER_MODE_REL); 3001 cfqg_stats_set_start_idle_time(cfqq->cfqg); 3002 cfq_log_cfqq(cfqd, cfqq, "arm_idle: %llu group_idle: %d", sl, 3003 group_idle ? 1 : 0); 3004} 3005 3006/* 3007 * Move request from internal lists to the request queue dispatch list. 
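 * The request leaves the cfqq's fifo and sort_list via cfq_remove_request()
 * and is handed to the elevator core with elv_dispatch_sort(); per-queue
 * and per-group dispatch counters are updated to match.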
3008 */ 3009static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) 3010{ 3011 struct cfq_data *cfqd = q->elevator->elevator_data; 3012 struct cfq_queue *cfqq = RQ_CFQQ(rq); 3013 3014 cfq_log_cfqq(cfqd, cfqq, "dispatch_insert"); 3015 3016 cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq); 3017 cfq_remove_request(rq); 3018 cfqq->dispatched++; 3019 (RQ_CFQG(rq))->dispatched++; 3020 elv_dispatch_sort(q, rq); 3021 3022 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; 3023 cfqq->nr_sectors += blk_rq_sectors(rq); 3024} 3025 3026/* 3027 * return expired entry, or NULL to just start from scratch in rbtree 3028 */ 3029static struct request *cfq_check_fifo(struct cfq_queue *cfqq) 3030{ 3031 struct request *rq = NULL; 3032 3033 if (cfq_cfqq_fifo_expire(cfqq)) 3034 return NULL; 3035 3036 cfq_mark_cfqq_fifo_expire(cfqq); 3037 3038 if (list_empty(&cfqq->fifo)) 3039 return NULL; 3040 3041 rq = rq_entry_fifo(cfqq->fifo.next); 3042 if (ktime_get_ns() < rq->fifo_time) 3043 rq = NULL; 3044 3045 cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq); 3046 return rq; 3047} 3048 3049static inline int 3050cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq) 3051{ 3052 const int base_rq = cfqd->cfq_slice_async_rq; 3053 3054 WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR); 3055 3056 return 2 * base_rq * (IOPRIO_BE_NR - cfqq->ioprio); 3057} 3058 3059/* 3060 * Must be called with the queue_lock held. 3061 */ 3062static int cfqq_process_refs(struct cfq_queue *cfqq) 3063{ 3064 int process_refs, io_refs; 3065 3066 io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE]; 3067 process_refs = cfqq->ref - io_refs; 3068 BUG_ON(process_refs < 0); 3069 return process_refs; 3070} 3071 3072static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq) 3073{ 3074 int process_refs, new_process_refs; 3075 struct cfq_queue *__cfqq; 3076 3077 /* 3078 * If there are no process references on the new_cfqq, then it is 3079 * unsafe to follow the ->new_cfqq chain as other cfqq's in the 3080 * chain may have dropped their last reference (not just their 3081 * last process reference). 3082 */ 3083 if (!cfqq_process_refs(new_cfqq)) 3084 return; 3085 3086 /* Avoid a circular list and skip interim queue merges */ 3087 while ((__cfqq = new_cfqq->new_cfqq)) { 3088 if (__cfqq == cfqq) 3089 return; 3090 new_cfqq = __cfqq; 3091 } 3092 3093 process_refs = cfqq_process_refs(cfqq); 3094 new_process_refs = cfqq_process_refs(new_cfqq); 3095 /* 3096 * If the process for the cfqq has gone away, there is no 3097 * sense in merging the queues. 3098 */ 3099 if (process_refs == 0 || new_process_refs == 0) 3100 return; 3101 3102 /* 3103 * Merge in the direction of the lesser amount of work. 
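 * That is, the queue holding fewer process references is redirected (via
 * ->new_cfqq) towards the busier queue, which absorbs those references.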
3104 */ 3105 if (new_process_refs >= process_refs) { 3106 cfqq->new_cfqq = new_cfqq; 3107 new_cfqq->ref += process_refs; 3108 } else { 3109 new_cfqq->new_cfqq = cfqq; 3110 cfqq->ref += new_process_refs; 3111 } 3112} 3113 3114static enum wl_type_t cfq_choose_wl_type(struct cfq_data *cfqd, 3115 struct cfq_group *cfqg, enum wl_class_t wl_class) 3116{ 3117 struct cfq_queue *queue; 3118 int i; 3119 bool key_valid = false; 3120 u64 lowest_key = 0; 3121 enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD; 3122 3123 for (i = 0; i <= SYNC_WORKLOAD; ++i) { 3124 /* select the one with lowest rb_key */ 3125 queue = cfq_rb_first(st_for(cfqg, wl_class, i)); 3126 if (queue && 3127 (!key_valid || queue->rb_key < lowest_key)) { 3128 lowest_key = queue->rb_key; 3129 cur_best = i; 3130 key_valid = true; 3131 } 3132 } 3133 3134 return cur_best; 3135} 3136 3137static void 3138choose_wl_class_and_type(struct cfq_data *cfqd, struct cfq_group *cfqg) 3139{ 3140 u64 slice; 3141 unsigned count; 3142 struct cfq_rb_root *st; 3143 u64 group_slice; 3144 enum wl_class_t original_class = cfqd->serving_wl_class; 3145 u64 now = ktime_get_ns(); 3146 3147 /* Choose next priority. RT > BE > IDLE */ 3148 if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg)) 3149 cfqd->serving_wl_class = RT_WORKLOAD; 3150 else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg)) 3151 cfqd->serving_wl_class = BE_WORKLOAD; 3152 else { 3153 cfqd->serving_wl_class = IDLE_WORKLOAD; 3154 cfqd->workload_expires = now + jiffies_to_nsecs(1); 3155 return; 3156 } 3157 3158 if (original_class != cfqd->serving_wl_class) 3159 goto new_workload; 3160 3161 /* 3162 * For RT and BE, we have to choose also the type 3163 * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload 3164 * expiration time 3165 */ 3166 st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type); 3167 count = st->count; 3168 3169 /* 3170 * check workload expiration, and that we still have other queues ready 3171 */ 3172 if (count && !(now > cfqd->workload_expires)) 3173 return; 3174 3175new_workload: 3176 /* otherwise select new workload type */ 3177 cfqd->serving_wl_type = cfq_choose_wl_type(cfqd, cfqg, 3178 cfqd->serving_wl_class); 3179 st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type); 3180 count = st->count; 3181 3182 /* 3183 * the workload slice is computed as a fraction of target latency 3184 * proportional to the number of queues in that workload, over 3185 * all the queues in the same priority class 3186 */ 3187 group_slice = cfq_group_slice(cfqd, cfqg); 3188 3189 slice = div_u64(group_slice * count, 3190 max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_wl_class], 3191 cfq_group_busy_queues_wl(cfqd->serving_wl_class, cfqd, 3192 cfqg))); 3193 3194 if (cfqd->serving_wl_type == ASYNC_WORKLOAD) { 3195 u64 tmp; 3196 3197 /* 3198 * Async queues are currently system wide. Just taking 3199 * proportion of queues with-in same group will lead to higher 3200 * async ratio system wide as generally root group is going 3201 * to have higher weight. A more accurate thing would be to 3202 * calculate system wide asnc/sync ratio. 3203 */ 3204 tmp = cfqd->cfq_target_latency * 3205 cfqg_busy_async_queues(cfqd, cfqg); 3206 tmp = div_u64(tmp, cfqd->busy_queues); 3207 slice = min_t(u64, slice, tmp); 3208 3209 /* async workload slice is scaled down according to 3210 * the sync/async slice ratio. 
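 * (cfq_slice[0] and cfq_slice[1] are the async and sync slice lengths,
 * so the scaling factor here is async/sync.)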
*/ 3211 slice = div64_u64(slice*cfqd->cfq_slice[0], cfqd->cfq_slice[1]); 3212 } else 3213 /* sync workload slice is at least 2 * cfq_slice_idle */ 3214 slice = max(slice, 2 * cfqd->cfq_slice_idle); 3215 3216 slice = max_t(u64, slice, CFQ_MIN_TT); 3217 cfq_log(cfqd, "workload slice:%llu", slice); 3218 cfqd->workload_expires = now + slice; 3219} 3220 3221static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd) 3222{ 3223 struct cfq_rb_root *st = &cfqd->grp_service_tree; 3224 struct cfq_group *cfqg; 3225 3226 if (RB_EMPTY_ROOT(&st->rb)) 3227 return NULL; 3228 cfqg = cfq_rb_first_group(st); 3229 update_min_vdisktime(st); 3230 return cfqg; 3231} 3232 3233static void cfq_choose_cfqg(struct cfq_data *cfqd) 3234{ 3235 struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd); 3236 u64 now = ktime_get_ns(); 3237 3238 cfqd->serving_group = cfqg; 3239 3240 /* Restore the workload type data */ 3241 if (cfqg->saved_wl_slice) { 3242 cfqd->workload_expires = now + cfqg->saved_wl_slice; 3243 cfqd->serving_wl_type = cfqg->saved_wl_type; 3244 cfqd->serving_wl_class = cfqg->saved_wl_class; 3245 } else 3246 cfqd->workload_expires = now - 1; 3247 3248 choose_wl_class_and_type(cfqd, cfqg); 3249} 3250 3251/* 3252 * Select a queue for service. If we have a current active queue, 3253 * check whether to continue servicing it, or retrieve and set a new one. 3254 */ 3255static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) 3256{ 3257 struct cfq_queue *cfqq, *new_cfqq = NULL; 3258 u64 now = ktime_get_ns(); 3259 3260 cfqq = cfqd->active_queue; 3261 if (!cfqq) 3262 goto new_queue; 3263 3264 if (!cfqd->rq_queued) 3265 return NULL; 3266 3267 /* 3268 * We were waiting for group to get backlogged. Expire the queue 3269 */ 3270 if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list)) 3271 goto expire; 3272 3273 /* 3274 * The active queue has run out of time, expire it and select new. 3275 */ 3276 if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) { 3277 /* 3278 * If slice had not expired at the completion of last request 3279 * we might not have turned on wait_busy flag. Don't expire 3280 * the queue yet. Allow the group to get backlogged. 3281 * 3282 * The very fact that we have used the slice, that means we 3283 * have been idling all along on this queue and it should be 3284 * ok to wait for this request to complete. 3285 */ 3286 if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list) 3287 && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) { 3288 cfqq = NULL; 3289 goto keep_queue; 3290 } else 3291 goto check_group_idle; 3292 } 3293 3294 /* 3295 * The active queue has requests and isn't expired, allow it to 3296 * dispatch. 3297 */ 3298 if (!RB_EMPTY_ROOT(&cfqq->sort_list)) 3299 goto keep_queue; 3300 3301 /* 3302 * If another queue has a request waiting within our mean seek 3303 * distance, let it run. The expire code will check for close 3304 * cooperators and put the close queue at the front of the service 3305 * tree. If possible, merge the expiring queue with the new cfqq. 3306 */ 3307 new_cfqq = cfq_close_cooperator(cfqd, cfqq); 3308 if (new_cfqq) { 3309 if (!cfqq->new_cfqq) 3310 cfq_setup_merge(cfqq, new_cfqq); 3311 goto expire; 3312 } 3313 3314 /* 3315 * No requests pending. If the active queue still has requests in 3316 * flight or is idling for a new request, allow either of these 3317 * conditions to happen (or time out) before selecting a new queue. 
3318 */ 3319 if (hrtimer_active(&cfqd->idle_slice_timer)) { 3320 cfqq = NULL; 3321 goto keep_queue; 3322 } 3323 3324 /* 3325 * This is a deep seek queue, but the device is much faster than 3326 * the queue can deliver, don't idle 3327 **/ 3328 if (CFQQ_SEEKY(cfqq) && cfq_cfqq_idle_window(cfqq) && 3329 (cfq_cfqq_slice_new(cfqq) || 3330 (cfqq->slice_end - now > now - cfqq->slice_start))) { 3331 cfq_clear_cfqq_deep(cfqq); 3332 cfq_clear_cfqq_idle_window(cfqq); 3333 } 3334 3335 if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) { 3336 cfqq = NULL; 3337 goto keep_queue; 3338 } 3339 3340 /* 3341 * If group idle is enabled and there are requests dispatched from 3342 * this group, wait for requests to complete. 3343 */ 3344check_group_idle: 3345 if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1 && 3346 cfqq->cfqg->dispatched && 3347 !cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true)) { 3348 cfqq = NULL; 3349 goto keep_queue; 3350 } 3351 3352expire: 3353 cfq_slice_expired(cfqd, 0); 3354new_queue: 3355 /* 3356 * Current queue expired. Check if we have to switch to a new 3357 * service tree 3358 */ 3359 if (!new_cfqq) 3360 cfq_choose_cfqg(cfqd); 3361 3362 cfqq = cfq_set_active_queue(cfqd, new_cfqq); 3363keep_queue: 3364 return cfqq; 3365} 3366 3367static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq) 3368{ 3369 int dispatched = 0; 3370 3371 while (cfqq->next_rq) { 3372 cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq); 3373 dispatched++; 3374 } 3375 3376 BUG_ON(!list_empty(&cfqq->fifo)); 3377 3378 /* By default cfqq is not expired if it is empty. Do it explicitly */ 3379 __cfq_slice_expired(cfqq->cfqd, cfqq, 0); 3380 return dispatched; 3381} 3382 3383/* 3384 * Drain our current requests. Used for barriers and when switching 3385 * io schedulers on-the-fly. 3386 */ 3387static int cfq_forced_dispatch(struct cfq_data *cfqd) 3388{ 3389 struct cfq_queue *cfqq; 3390 int dispatched = 0; 3391 3392 /* Expire the timeslice of the current active queue first */ 3393 cfq_slice_expired(cfqd, 0); 3394 while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) { 3395 __cfq_set_active_queue(cfqd, cfqq); 3396 dispatched += __cfq_forced_dispatch_cfqq(cfqq); 3397 } 3398 3399 BUG_ON(cfqd->busy_queues); 3400 3401 cfq_log(cfqd, "forced_dispatch=%d", dispatched); 3402 return dispatched; 3403} 3404 3405static inline bool cfq_slice_used_soon(struct cfq_data *cfqd, 3406 struct cfq_queue *cfqq) 3407{ 3408 u64 now = ktime_get_ns(); 3409 3410 /* the queue hasn't finished any request, can't estimate */ 3411 if (cfq_cfqq_slice_new(cfqq)) 3412 return true; 3413 if (now + cfqd->cfq_slice_idle * cfqq->dispatched > cfqq->slice_end) 3414 return true; 3415 3416 return false; 3417} 3418 3419static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) 3420{ 3421 unsigned int max_dispatch; 3422 3423 /* 3424 * Drain async requests before we start sync IO 3425 */ 3426 if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_flight[BLK_RW_ASYNC]) 3427 return false; 3428 3429 /* 3430 * If this is an async queue and we have sync IO in flight, let it wait 3431 */ 3432 if (cfqd->rq_in_flight[BLK_RW_SYNC] && !cfq_cfqq_sync(cfqq)) 3433 return false; 3434 3435 max_dispatch = max_t(unsigned int, cfqd->cfq_quantum / 2, 1); 3436 if (cfq_class_idle(cfqq)) 3437 max_dispatch = 1; 3438 3439 /* 3440 * Does this cfqq already have too much IO in flight? 
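 * If so, dispatch is throttled unless this is the sole busy queue or the
 * only sync queue (promote_sync below), which get an unlimited depth.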
3441 */ 3442 if (cfqq->dispatched >= max_dispatch) { 3443 bool promote_sync = false; 3444 /* 3445 * idle queue must always only have a single IO in flight 3446 */ 3447 if (cfq_class_idle(cfqq)) 3448 return false; 3449 3450 /* 3451 * If there is only one sync queue 3452 * we can ignore async queue here and give the sync 3453 * queue no dispatch limit. The reason is a sync queue can 3454 * preempt async queue, limiting the sync queue doesn't make 3455 * sense. This is useful for aiostress test. 3456 */ 3457 if (cfq_cfqq_sync(cfqq) && cfqd->busy_sync_queues == 1) 3458 promote_sync = true; 3459 3460 /* 3461 * We have other queues, don't allow more IO from this one 3462 */ 3463 if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq) && 3464 !promote_sync) 3465 return false; 3466 3467 /* 3468 * Sole queue user, no limit 3469 */ 3470 if (cfqd->busy_queues == 1 || promote_sync) 3471 max_dispatch = -1; 3472 else 3473 /* 3474 * Normally we start throttling cfqq when cfq_quantum/2 3475 * requests have been dispatched. But we can drive 3476 * deeper queue depths at the beginning of slice 3477 * subjected to upper limit of cfq_quantum. 3478 * */ 3479 max_dispatch = cfqd->cfq_quantum; 3480 } 3481 3482 /* 3483 * Async queues must wait a bit before being allowed dispatch. 3484 * We also ramp up the dispatch depth gradually for async IO, 3485 * based on the last sync IO we serviced 3486 */ 3487 if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) { 3488 u64 last_sync = ktime_get_ns() - cfqd->last_delayed_sync; 3489 unsigned int depth; 3490 3491 depth = div64_u64(last_sync, cfqd->cfq_slice[1]); 3492 if (!depth && !cfqq->dispatched) 3493 depth = 1; 3494 if (depth < max_dispatch) 3495 max_dispatch = depth; 3496 } 3497 3498 /* 3499 * If we're below the current max, allow a dispatch 3500 */ 3501 return cfqq->dispatched < max_dispatch; 3502} 3503 3504/* 3505 * Dispatch a request from cfqq, moving them to the request queue 3506 * dispatch list. 3507 */ 3508static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq) 3509{ 3510 struct request *rq; 3511 3512 BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list)); 3513 3514 if (!cfq_may_dispatch(cfqd, cfqq)) 3515 return false; 3516 3517 /* 3518 * follow expired path, else get first next available 3519 */ 3520 rq = cfq_check_fifo(cfqq); 3521 if (!rq) 3522 rq = cfqq->next_rq; 3523 3524 /* 3525 * insert request into driver dispatch list 3526 */ 3527 cfq_dispatch_insert(cfqd->queue, rq); 3528 3529 if (!cfqd->active_cic) { 3530 struct cfq_io_cq *cic = RQ_CIC(rq); 3531 3532 atomic_long_inc(&cic->icq.ioc->refcount); 3533 cfqd->active_cic = cic; 3534 } 3535 3536 return true; 3537} 3538 3539/* 3540 * Find the cfqq that we need to service and move a request from that to the 3541 * dispatch list 3542 */ 3543static int cfq_dispatch_requests(struct request_queue *q, int force) 3544{ 3545 struct cfq_data *cfqd = q->elevator->elevator_data; 3546 struct cfq_queue *cfqq; 3547 3548 if (!cfqd->busy_queues) 3549 return 0; 3550 3551 if (unlikely(force)) 3552 return cfq_forced_dispatch(cfqd); 3553 3554 cfqq = cfq_select_queue(cfqd); 3555 if (!cfqq) 3556 return 0; 3557 3558 /* 3559 * Dispatch a request from this cfqq, if it is allowed 3560 */ 3561 if (!cfq_dispatch_request(cfqd, cfqq)) 3562 return 0; 3563 3564 cfqq->slice_dispatch++; 3565 cfq_clear_cfqq_must_dispatch(cfqq); 3566 3567 /* 3568 * expire an async queue immediately if it has used up its slice. idle 3569 * queue always expire after 1 dispatch round. 
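 * (slice_end is set to just past "now" so the expiry path treats the
 * slice as used up.)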
3570 */ 3571 if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) && 3572 cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) || 3573 cfq_class_idle(cfqq))) { 3574 cfqq->slice_end = ktime_get_ns() + 1; 3575 cfq_slice_expired(cfqd, 0); 3576 } 3577 3578 cfq_log_cfqq(cfqd, cfqq, "dispatched a request"); 3579 return 1; 3580} 3581 3582/* 3583 * task holds one reference to the queue, dropped when task exits. each rq 3584 * in-flight on this queue also holds a reference, dropped when rq is freed. 3585 * 3586 * Each cfq queue took a reference on the parent group. Drop it now. 3587 * queue lock must be held here. 3588 */ 3589static void cfq_put_queue(struct cfq_queue *cfqq) 3590{ 3591 struct cfq_data *cfqd = cfqq->cfqd; 3592 struct cfq_group *cfqg; 3593 3594 BUG_ON(cfqq->ref <= 0); 3595 3596 cfqq->ref--; 3597 if (cfqq->ref) 3598 return; 3599 3600 cfq_log_cfqq(cfqd, cfqq, "put_queue"); 3601 BUG_ON(rb_first(&cfqq->sort_list)); 3602 BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); 3603 cfqg = cfqq->cfqg; 3604 3605 if (unlikely(cfqd->active_queue == cfqq)) { 3606 __cfq_slice_expired(cfqd, cfqq, 0); 3607 cfq_schedule_dispatch(cfqd); 3608 } 3609 3610 BUG_ON(cfq_cfqq_on_rr(cfqq)); 3611 kmem_cache_free(cfq_pool, cfqq); 3612 cfqg_put(cfqg); 3613} 3614 3615static void cfq_put_cooperator(struct cfq_queue *cfqq) 3616{ 3617 struct cfq_queue *__cfqq, *next; 3618 3619 /* 3620 * If this queue was scheduled to merge with another queue, be 3621 * sure to drop the reference taken on that queue (and others in 3622 * the merge chain). See cfq_setup_merge and cfq_merge_cfqqs. 3623 */ 3624 __cfqq = cfqq->new_cfqq; 3625 while (__cfqq) { 3626 if (__cfqq == cfqq) { 3627 WARN(1, "cfqq->new_cfqq loop detected\n"); 3628 break; 3629 } 3630 next = __cfqq->new_cfqq; 3631 cfq_put_queue(__cfqq); 3632 __cfqq = next; 3633 } 3634} 3635 3636static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) 3637{ 3638 if (unlikely(cfqq == cfqd->active_queue)) { 3639 __cfq_slice_expired(cfqd, cfqq, 0); 3640 cfq_schedule_dispatch(cfqd); 3641 } 3642 3643 cfq_put_cooperator(cfqq); 3644 3645 cfq_put_queue(cfqq); 3646} 3647 3648static void cfq_init_icq(struct io_cq *icq) 3649{ 3650 struct cfq_io_cq *cic = icq_to_cic(icq); 3651 3652 cic->ttime.last_end_request = ktime_get_ns(); 3653} 3654 3655static void cfq_exit_icq(struct io_cq *icq) 3656{ 3657 struct cfq_io_cq *cic = icq_to_cic(icq); 3658 struct cfq_data *cfqd = cic_to_cfqd(cic); 3659 3660 if (cic_to_cfqq(cic, false)) { 3661 cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, false)); 3662 cic_set_cfqq(cic, NULL, false); 3663 } 3664 3665 if (cic_to_cfqq(cic, true)) { 3666 cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, true)); 3667 cic_set_cfqq(cic, NULL, true); 3668 } 3669} 3670 3671static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic) 3672{ 3673 struct task_struct *tsk = current; 3674 int ioprio_class; 3675 3676 if (!cfq_cfqq_prio_changed(cfqq)) 3677 return; 3678 3679 ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); 3680 switch (ioprio_class) { 3681 default: 3682 printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class); 3683 case IOPRIO_CLASS_NONE: 3684 /* 3685 * no prio set, inherit CPU scheduling settings 3686 */ 3687 cfqq->ioprio = task_nice_ioprio(tsk); 3688 cfqq->ioprio_class = task_nice_ioclass(tsk); 3689 break; 3690 case IOPRIO_CLASS_RT: 3691 cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio); 3692 cfqq->ioprio_class = IOPRIO_CLASS_RT; 3693 break; 3694 case IOPRIO_CLASS_BE: 3695 cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio); 3696 cfqq->ioprio_class = IOPRIO_CLASS_BE; 3697 break; 
3698 case IOPRIO_CLASS_IDLE: 3699 cfqq->ioprio_class = IOPRIO_CLASS_IDLE; 3700 cfqq->ioprio = 7; 3701 cfq_clear_cfqq_idle_window(cfqq); 3702 break; 3703 } 3704 3705 /* 3706 * keep track of original prio settings in case we have to temporarily 3707 * elevate the priority of this queue 3708 */ 3709 cfqq->org_ioprio = cfqq->ioprio; 3710 cfqq->org_ioprio_class = cfqq->ioprio_class; 3711 cfq_clear_cfqq_prio_changed(cfqq); 3712} 3713 3714static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio) 3715{ 3716 int ioprio = cic->icq.ioc->ioprio; 3717 struct cfq_data *cfqd = cic_to_cfqd(cic); 3718 struct cfq_queue *cfqq; 3719 3720 /* 3721 * Check whether ioprio has changed. The condition may trigger 3722 * spuriously on a newly created cic but there's no harm. 3723 */ 3724 if (unlikely(!cfqd) || likely(cic->ioprio == ioprio)) 3725 return; 3726 3727 cfqq = cic_to_cfqq(cic, false); 3728 if (cfqq) { 3729 cfq_put_queue(cfqq); 3730 cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio); 3731 cic_set_cfqq(cic, cfqq, false); 3732 } 3733 3734 cfqq = cic_to_cfqq(cic, true); 3735 if (cfqq) 3736 cfq_mark_cfqq_prio_changed(cfqq); 3737 3738 cic->ioprio = ioprio; 3739} 3740 3741static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, 3742 pid_t pid, bool is_sync) 3743{ 3744 RB_CLEAR_NODE(&cfqq->rb_node); 3745 RB_CLEAR_NODE(&cfqq->p_node); 3746 INIT_LIST_HEAD(&cfqq->fifo); 3747 3748 cfqq->ref = 0; 3749 cfqq->cfqd = cfqd; 3750 3751 cfq_mark_cfqq_prio_changed(cfqq); 3752 3753 if (is_sync) { 3754 if (!cfq_class_idle(cfqq)) 3755 cfq_mark_cfqq_idle_window(cfqq); 3756 cfq_mark_cfqq_sync(cfqq); 3757 } 3758 cfqq->pid = pid; 3759} 3760 3761#ifdef CONFIG_CFQ_GROUP_IOSCHED 3762static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) 3763{ 3764 struct cfq_data *cfqd = cic_to_cfqd(cic); 3765 struct cfq_queue *cfqq; 3766 uint64_t serial_nr; 3767 3768 rcu_read_lock(); 3769 serial_nr = bio_blkcg(bio)->css.serial_nr; 3770 rcu_read_unlock(); 3771 3772 /* 3773 * Check whether blkcg has changed. The condition may trigger 3774 * spuriously on a newly created cic but there's no harm. 3775 */ 3776 if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr)) 3777 return; 3778 3779 /* 3780 * Drop reference to queues. New queues will be assigned in new 3781 * group upon arrival of fresh requests. 
3782 */ 3783 cfqq = cic_to_cfqq(cic, false); 3784 if (cfqq) { 3785 cfq_log_cfqq(cfqd, cfqq, "changed cgroup"); 3786 cic_set_cfqq(cic, NULL, false); 3787 cfq_put_queue(cfqq); 3788 } 3789 3790 cfqq = cic_to_cfqq(cic, true); 3791 if (cfqq) { 3792 cfq_log_cfqq(cfqd, cfqq, "changed cgroup"); 3793 cic_set_cfqq(cic, NULL, true); 3794 cfq_put_queue(cfqq); 3795 } 3796 3797 cic->blkcg_serial_nr = serial_nr; 3798} 3799#else 3800static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { } 3801#endif /* CONFIG_CFQ_GROUP_IOSCHED */ 3802 3803static struct cfq_queue ** 3804cfq_async_queue_prio(struct cfq_group *cfqg, int ioprio_class, int ioprio) 3805{ 3806 switch (ioprio_class) { 3807 case IOPRIO_CLASS_RT: 3808 return &cfqg->async_cfqq[0][ioprio]; 3809 case IOPRIO_CLASS_NONE: 3810 ioprio = IOPRIO_NORM; 3811 /* fall through */ 3812 case IOPRIO_CLASS_BE: 3813 return &cfqg->async_cfqq[1][ioprio]; 3814 case IOPRIO_CLASS_IDLE: 3815 return &cfqg->async_idle_cfqq; 3816 default: 3817 BUG(); 3818 } 3819} 3820 3821static struct cfq_queue * 3822cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, 3823 struct bio *bio) 3824{ 3825 int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); 3826 int ioprio = IOPRIO_PRIO_DATA(cic->ioprio); 3827 struct cfq_queue **async_cfqq = NULL; 3828 struct cfq_queue *cfqq; 3829 struct cfq_group *cfqg; 3830 3831 rcu_read_lock(); 3832 cfqg = cfq_lookup_cfqg(cfqd, bio_blkcg(bio)); 3833 if (!cfqg) { 3834 cfqq = &cfqd->oom_cfqq; 3835 goto out; 3836 } 3837 3838 if (!is_sync) { 3839 if (!ioprio_valid(cic->ioprio)) { 3840 struct task_struct *tsk = current; 3841 ioprio = task_nice_ioprio(tsk); 3842 ioprio_class = task_nice_ioclass(tsk); 3843 } 3844 async_cfqq = cfq_async_queue_prio(cfqg, ioprio_class, ioprio); 3845 cfqq = *async_cfqq; 3846 if (cfqq) 3847 goto out; 3848 } 3849 3850 cfqq = kmem_cache_alloc_node(cfq_pool, GFP_NOWAIT | __GFP_ZERO, 3851 cfqd->queue->node); 3852 if (!cfqq) { 3853 cfqq = &cfqd->oom_cfqq; 3854 goto out; 3855 } 3856 3857 cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); 3858 cfq_init_prio_data(cfqq, cic); 3859 cfq_link_cfqq_cfqg(cfqq, cfqg); 3860 cfq_log_cfqq(cfqd, cfqq, "alloced"); 3861 3862 if (async_cfqq) { 3863 /* a new async queue is created, pin and remember */ 3864 cfqq->ref++; 3865 *async_cfqq = cfqq; 3866 } 3867out: 3868 cfqq->ref++; 3869 rcu_read_unlock(); 3870 return cfqq; 3871} 3872 3873static void 3874__cfq_update_io_thinktime(struct cfq_ttime *ttime, u64 slice_idle) 3875{ 3876 u64 elapsed = ktime_get_ns() - ttime->last_end_request; 3877 elapsed = min(elapsed, 2UL * slice_idle); 3878 3879 ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8; 3880 ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8); 3881 ttime->ttime_mean = div64_ul(ttime->ttime_total + 128, 3882 ttime->ttime_samples); 3883} 3884 3885static void 3886cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_queue *cfqq, 3887 struct cfq_io_cq *cic) 3888{ 3889 if (cfq_cfqq_sync(cfqq)) { 3890 __cfq_update_io_thinktime(&cic->ttime, cfqd->cfq_slice_idle); 3891 __cfq_update_io_thinktime(&cfqq->service_tree->ttime, 3892 cfqd->cfq_slice_idle); 3893 } 3894#ifdef CONFIG_CFQ_GROUP_IOSCHED 3895 __cfq_update_io_thinktime(&cfqq->cfqg->ttime, cfqd->cfq_group_idle); 3896#endif 3897} 3898 3899static void 3900cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq, 3901 struct request *rq) 3902{ 3903 sector_t sdist = 0; 3904 sector_t n_sec = blk_rq_sectors(rq); 3905 if (cfqq->last_request_pos) { 3906 if (cfqq->last_request_pos < 
blk_rq_pos(rq)) 3907 sdist = blk_rq_pos(rq) - cfqq->last_request_pos; 3908 else 3909 sdist = cfqq->last_request_pos - blk_rq_pos(rq); 3910 } 3911 3912 cfqq->seek_history <<= 1; 3913 if (blk_queue_nonrot(cfqd->queue)) 3914 cfqq->seek_history |= (n_sec < CFQQ_SECT_THR_NONROT); 3915 else 3916 cfqq->seek_history |= (sdist > CFQQ_SEEK_THR); 3917} 3918 3919/* 3920 * Disable idle window if the process thinks too long or seeks so much that 3921 * it doesn't matter 3922 */ 3923static void 3924cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, 3925 struct cfq_io_cq *cic) 3926{ 3927 int old_idle, enable_idle; 3928 3929 /* 3930 * Don't idle for async or idle io prio class 3931 */ 3932 if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq)) 3933 return; 3934 3935 enable_idle = old_idle = cfq_cfqq_idle_window(cfqq); 3936 3937 if (cfqq->queued[0] + cfqq->queued[1] >= 4) 3938 cfq_mark_cfqq_deep(cfqq); 3939 3940 if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE)) 3941 enable_idle = 0; 3942 else if (!atomic_read(&cic->icq.ioc->active_ref) || 3943 !cfqd->cfq_slice_idle || 3944 (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq))) 3945 enable_idle = 0; 3946 else if (sample_valid(cic->ttime.ttime_samples)) { 3947 if (cic->ttime.ttime_mean > cfqd->cfq_slice_idle) 3948 enable_idle = 0; 3949 else 3950 enable_idle = 1; 3951 } 3952 3953 if (old_idle != enable_idle) { 3954 cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle); 3955 if (enable_idle) 3956 cfq_mark_cfqq_idle_window(cfqq); 3957 else 3958 cfq_clear_cfqq_idle_window(cfqq); 3959 } 3960} 3961 3962/* 3963 * Check if new_cfqq should preempt the currently active queue. Return 0 for 3964 * no or if we aren't sure, a 1 will cause a preempt. 3965 */ 3966static bool 3967cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, 3968 struct request *rq) 3969{ 3970 struct cfq_queue *cfqq; 3971 3972 cfqq = cfqd->active_queue; 3973 if (!cfqq) 3974 return false; 3975 3976 if (cfq_class_idle(new_cfqq)) 3977 return false; 3978 3979 if (cfq_class_idle(cfqq)) 3980 return true; 3981 3982 /* 3983 * Don't allow a non-RT request to preempt an ongoing RT cfqq timeslice. 3984 */ 3985 if (cfq_class_rt(cfqq) && !cfq_class_rt(new_cfqq)) 3986 return false; 3987 3988 /* 3989 * if the new request is sync, but the currently running queue is 3990 * not, let the sync request have priority. 3991 */ 3992 if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq)) 3993 return true; 3994 3995 /* 3996 * Treat ancestors of current cgroup the same way as current cgroup. 3997 * For anybody else we disallow preemption to guarantee service 3998 * fairness among cgroups. 3999 */ 4000 if (!cfqg_is_descendant(cfqq->cfqg, new_cfqq->cfqg)) 4001 return false; 4002 4003 if (cfq_slice_used(cfqq)) 4004 return true; 4005 4006 /* 4007 * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice. 4008 */ 4009 if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq)) 4010 return true; 4011 4012 WARN_ON_ONCE(cfqq->ioprio_class != new_cfqq->ioprio_class); 4013 /* Allow preemption only if we are idling on sync-noidle tree */ 4014 if (cfqd->serving_wl_type == SYNC_NOIDLE_WORKLOAD && 4015 cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD && 4016 RB_EMPTY_ROOT(&cfqq->sort_list)) 4017 return true; 4018 4019 /* 4020 * So both queues are sync. Let the new request get disk time if 4021 * it's a metadata request and the current queue is doing regular IO. 
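 * (a "metadata" request here is one carrying REQ_PRIO)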
4022 */ 4023 if ((rq->cmd_flags & REQ_PRIO) && !cfqq->prio_pending) 4024 return true; 4025 4026 /* An idle queue should not be idle now for some reason */ 4027 if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq)) 4028 return true; 4029 4030 if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq)) 4031 return false; 4032 4033 /* 4034 * if this request is as-good as one we would expect from the 4035 * current cfqq, let it preempt 4036 */ 4037 if (cfq_rq_close(cfqd, cfqq, rq)) 4038 return true; 4039 4040 return false; 4041} 4042 4043/* 4044 * cfqq preempts the active queue. if we allowed preempt with no slice left, 4045 * let it have half of its nominal slice. 4046 */ 4047static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) 4048{ 4049 enum wl_type_t old_type = cfqq_type(cfqd->active_queue); 4050 4051 cfq_log_cfqq(cfqd, cfqq, "preempt"); 4052 cfq_slice_expired(cfqd, 1); 4053 4054 /* 4055 * workload type is changed, don't save slice, otherwise preempt 4056 * doesn't happen 4057 */ 4058 if (old_type != cfqq_type(cfqq)) 4059 cfqq->cfqg->saved_wl_slice = 0; 4060 4061 /* 4062 * Put the new queue at the front of the of the current list, 4063 * so we know that it will be selected next. 4064 */ 4065 BUG_ON(!cfq_cfqq_on_rr(cfqq)); 4066 4067 cfq_service_tree_add(cfqd, cfqq, 1); 4068 4069 cfqq->slice_end = 0; 4070 cfq_mark_cfqq_slice_new(cfqq); 4071} 4072 4073/* 4074 * Called when a new fs request (rq) is added (to cfqq). Check if there's 4075 * something we should do about it 4076 */ 4077static void 4078cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, 4079 struct request *rq) 4080{ 4081 struct cfq_io_cq *cic = RQ_CIC(rq); 4082 4083 cfqd->rq_queued++; 4084 if (rq->cmd_flags & REQ_PRIO) 4085 cfqq->prio_pending++; 4086 4087 cfq_update_io_thinktime(cfqd, cfqq, cic); 4088 cfq_update_io_seektime(cfqd, cfqq, rq); 4089 cfq_update_idle_window(cfqd, cfqq, cic); 4090 4091 cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); 4092 4093 if (cfqq == cfqd->active_queue) { 4094 /* 4095 * Remember that we saw a request from this process, but 4096 * don't start queuing just yet. Otherwise we risk seeing lots 4097 * of tiny requests, because we disrupt the normal plugging 4098 * and merging. If the request is already larger than a single 4099 * page, let it rip immediately. For that case we assume that 4100 * merging is already done. Ditto for a busy system that 4101 * has other work pending, don't risk delaying until the 4102 * idle timer unplug to continue working. 
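 * In the other branch below the queue is instead marked must_dispatch, so the
 * request it is holding back is still let through when the idle timer fires.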
4103 */ 4104 if (cfq_cfqq_wait_request(cfqq)) { 4105 if (blk_rq_bytes(rq) > PAGE_SIZE || 4106 cfqd->busy_queues > 1) { 4107 cfq_del_timer(cfqd, cfqq); 4108 cfq_clear_cfqq_wait_request(cfqq); 4109 __blk_run_queue(cfqd->queue); 4110 } else { 4111 cfqg_stats_update_idle_time(cfqq->cfqg); 4112 cfq_mark_cfqq_must_dispatch(cfqq); 4113 } 4114 } 4115 } else if (cfq_should_preempt(cfqd, cfqq, rq)) { 4116 /* 4117 * not the active queue - expire current slice if it is 4118 * idle and has expired it's mean thinktime or this new queue 4119 * has some old slice time left and is of higher priority or 4120 * this new queue is RT and the current one is BE 4121 */ 4122 cfq_preempt_queue(cfqd, cfqq); 4123 __blk_run_queue(cfqd->queue); 4124 } 4125} 4126 4127static void cfq_insert_request(struct request_queue *q, struct request *rq) 4128{ 4129 struct cfq_data *cfqd = q->elevator->elevator_data; 4130 struct cfq_queue *cfqq = RQ_CFQQ(rq); 4131 4132 cfq_log_cfqq(cfqd, cfqq, "insert_request"); 4133 cfq_init_prio_data(cfqq, RQ_CIC(rq)); 4134 4135 rq->fifo_time = ktime_get_ns() + cfqd->cfq_fifo_expire[rq_is_sync(rq)]; 4136 list_add_tail(&rq->queuelist, &cfqq->fifo); 4137 cfq_add_rq_rb(rq); 4138 cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group, req_op(rq), 4139 rq->cmd_flags); 4140 cfq_rq_enqueued(cfqd, cfqq, rq); 4141} 4142 4143/* 4144 * Update hw_tag based on peak queue depth over 50 samples under 4145 * sufficient load. 4146 */ 4147static void cfq_update_hw_tag(struct cfq_data *cfqd) 4148{ 4149 struct cfq_queue *cfqq = cfqd->active_queue; 4150 4151 if (cfqd->rq_in_driver > cfqd->hw_tag_est_depth) 4152 cfqd->hw_tag_est_depth = cfqd->rq_in_driver; 4153 4154 if (cfqd->hw_tag == 1) 4155 return; 4156 4157 if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN && 4158 cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN) 4159 return; 4160 4161 /* 4162 * If active queue hasn't enough requests and can idle, cfq might not 4163 * dispatch sufficient requests to hardware. Don't zero hw_tag in this 4164 * case 4165 */ 4166 if (cfqq && cfq_cfqq_idle_window(cfqq) && 4167 cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] < 4168 CFQ_HW_QUEUE_MIN && cfqd->rq_in_driver < CFQ_HW_QUEUE_MIN) 4169 return; 4170 4171 if (cfqd->hw_tag_samples++ < 50) 4172 return; 4173 4174 if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN) 4175 cfqd->hw_tag = 1; 4176 else 4177 cfqd->hw_tag = 0; 4178} 4179 4180static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq) 4181{ 4182 struct cfq_io_cq *cic = cfqd->active_cic; 4183 u64 now = ktime_get_ns(); 4184 4185 /* If the queue already has requests, don't wait */ 4186 if (!RB_EMPTY_ROOT(&cfqq->sort_list)) 4187 return false; 4188 4189 /* If there are other queues in the group, don't wait */ 4190 if (cfqq->cfqg->nr_cfqq > 1) 4191 return false; 4192 4193 /* the only queue in the group, but think time is big */ 4194 if (cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true)) 4195 return false; 4196 4197 if (cfq_slice_used(cfqq)) 4198 return true; 4199 4200 /* if slice left is less than think time, wait busy */ 4201 if (cic && sample_valid(cic->ttime.ttime_samples) 4202 && (cfqq->slice_end - now < cic->ttime.ttime_mean)) 4203 return true; 4204 4205 /* 4206 * If think times is less than a jiffy than ttime_mean=0 and above 4207 * will not be true. It might happen that slice has not expired yet 4208 * but will expire soon (4-5 ns) during select_queue(). To cover the 4209 * case where think time is less than a jiffy, mark the queue wait 4210 * busy if only 1 jiffy is left in the slice. 
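 * For example, with HZ == 250 one jiffy is 4 ms, so the queue is marked
 * wait-busy once less than 4 ms of its slice remains.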
4211 */ 4212 if (cfqq->slice_end - now <= jiffies_to_nsecs(1)) 4213 return true; 4214 4215 return false; 4216} 4217 4218static void cfq_completed_request(struct request_queue *q, struct request *rq) 4219{ 4220 struct cfq_queue *cfqq = RQ_CFQQ(rq); 4221 struct cfq_data *cfqd = cfqq->cfqd; 4222 const int sync = rq_is_sync(rq); 4223 u64 now = ktime_get_ns(); 4224 4225 cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", 4226 !!(rq->cmd_flags & REQ_NOIDLE)); 4227 4228 cfq_update_hw_tag(cfqd); 4229 4230 WARN_ON(!cfqd->rq_in_driver); 4231 WARN_ON(!cfqq->dispatched); 4232 cfqd->rq_in_driver--; 4233 cfqq->dispatched--; 4234 (RQ_CFQG(rq))->dispatched--; 4235 cfqg_stats_update_completion(cfqq->cfqg, rq_start_time_ns(rq), 4236 rq_io_start_time_ns(rq), req_op(rq), 4237 rq->cmd_flags); 4238 4239 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; 4240 4241 if (sync) { 4242 struct cfq_rb_root *st; 4243 4244 RQ_CIC(rq)->ttime.last_end_request = now; 4245 4246 if (cfq_cfqq_on_rr(cfqq)) 4247 st = cfqq->service_tree; 4248 else 4249 st = st_for(cfqq->cfqg, cfqq_class(cfqq), 4250 cfqq_type(cfqq)); 4251 4252 st->ttime.last_end_request = now; 4253 /* 4254 * We have to do this check in jiffies since start_time is in 4255 * jiffies and it is not trivial to convert to ns. If 4256 * cfq_fifo_expire[1] ever comes close to 1 jiffie, this test 4257 * will become problematic but so far we are fine (the default 4258 * is 128 ms). 4259 */ 4260 if (!time_after(rq->start_time + 4261 nsecs_to_jiffies(cfqd->cfq_fifo_expire[1]), 4262 jiffies)) 4263 cfqd->last_delayed_sync = now; 4264 } 4265 4266#ifdef CONFIG_CFQ_GROUP_IOSCHED 4267 cfqq->cfqg->ttime.last_end_request = now; 4268#endif 4269 4270 /* 4271 * If this is the active queue, check if it needs to be expired, 4272 * or if we want to idle in case it has no pending requests. 4273 */ 4274 if (cfqd->active_queue == cfqq) { 4275 const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list); 4276 4277 if (cfq_cfqq_slice_new(cfqq)) { 4278 cfq_set_prio_slice(cfqd, cfqq); 4279 cfq_clear_cfqq_slice_new(cfqq); 4280 } 4281 4282 /* 4283 * Should we wait for next request to come in before we expire 4284 * the queue. 4285 */ 4286 if (cfq_should_wait_busy(cfqd, cfqq)) { 4287 u64 extend_sl = cfqd->cfq_slice_idle; 4288 if (!cfqd->cfq_slice_idle) 4289 extend_sl = cfqd->cfq_group_idle; 4290 cfqq->slice_end = now + extend_sl; 4291 cfq_mark_cfqq_wait_busy(cfqq); 4292 cfq_log_cfqq(cfqd, cfqq, "will busy wait"); 4293 } 4294 4295 /* 4296 * Idling is not enabled on: 4297 * - expired queues 4298 * - idle-priority queues 4299 * - async queues 4300 * - queues with still some requests queued 4301 * - when there is a close cooperator 4302 */ 4303 if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq)) 4304 cfq_slice_expired(cfqd, 1); 4305 else if (sync && cfqq_empty && 4306 !cfq_close_cooperator(cfqd, cfqq)) { 4307 cfq_arm_slice_timer(cfqd); 4308 } 4309 } 4310 4311 if (!cfqd->rq_in_driver) 4312 cfq_schedule_dispatch(cfqd); 4313} 4314 4315static void cfqq_boost_on_prio(struct cfq_queue *cfqq, int op_flags) 4316{ 4317 /* 4318 * If REQ_PRIO is set, boost class and prio level, if it's below 4319 * BE/NORM. If prio is not set, restore the potentially boosted 4320 * class/prio level. 
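 * For example, an IDLE-class queue issuing a REQ_PRIO request is treated as
 * BE/NORM for that request; the original class and priority are restored as
 * soon as a request without REQ_PRIO is seen.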
4321 */ 4322 if (!(op_flags & REQ_PRIO)) { 4323 cfqq->ioprio_class = cfqq->org_ioprio_class; 4324 cfqq->ioprio = cfqq->org_ioprio; 4325 } else { 4326 if (cfq_class_idle(cfqq)) 4327 cfqq->ioprio_class = IOPRIO_CLASS_BE; 4328 if (cfqq->ioprio > IOPRIO_NORM) 4329 cfqq->ioprio = IOPRIO_NORM; 4330 } 4331} 4332 4333static inline int __cfq_may_queue(struct cfq_queue *cfqq) 4334{ 4335 if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) { 4336 cfq_mark_cfqq_must_alloc_slice(cfqq); 4337 return ELV_MQUEUE_MUST; 4338 } 4339 4340 return ELV_MQUEUE_MAY; 4341} 4342 4343static int cfq_may_queue(struct request_queue *q, int op, int op_flags) 4344{ 4345 struct cfq_data *cfqd = q->elevator->elevator_data; 4346 struct task_struct *tsk = current; 4347 struct cfq_io_cq *cic; 4348 struct cfq_queue *cfqq; 4349 4350 /* 4351 * don't force setup of a queue from here, as a call to may_queue 4352 * does not necessarily imply that a request actually will be queued. 4353 * so just lookup a possibly existing queue, or return 'may queue' 4354 * if that fails 4355 */ 4356 cic = cfq_cic_lookup(cfqd, tsk->io_context); 4357 if (!cic) 4358 return ELV_MQUEUE_MAY; 4359 4360 cfqq = cic_to_cfqq(cic, rw_is_sync(op, op_flags)); 4361 if (cfqq) { 4362 cfq_init_prio_data(cfqq, cic); 4363 cfqq_boost_on_prio(cfqq, op_flags); 4364 4365 return __cfq_may_queue(cfqq); 4366 } 4367 4368 return ELV_MQUEUE_MAY; 4369} 4370 4371/* 4372 * queue lock held here 4373 */ 4374static void cfq_put_request(struct request *rq) 4375{ 4376 struct cfq_queue *cfqq = RQ_CFQQ(rq); 4377 4378 if (cfqq) { 4379 const int rw = rq_data_dir(rq); 4380 4381 BUG_ON(!cfqq->allocated[rw]); 4382 cfqq->allocated[rw]--; 4383 4384 /* Put down rq reference on cfqg */ 4385 cfqg_put(RQ_CFQG(rq)); 4386 rq->elv.priv[0] = NULL; 4387 rq->elv.priv[1] = NULL; 4388 4389 cfq_put_queue(cfqq); 4390 } 4391} 4392 4393static struct cfq_queue * 4394cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_cq *cic, 4395 struct cfq_queue *cfqq) 4396{ 4397 cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq); 4398 cic_set_cfqq(cic, cfqq->new_cfqq, 1); 4399 cfq_mark_cfqq_coop(cfqq->new_cfqq); 4400 cfq_put_queue(cfqq); 4401 return cic_to_cfqq(cic, 1); 4402} 4403 4404/* 4405 * Returns NULL if a new cfqq should be allocated, or the old cfqq if this 4406 * was the last process referring to said cfqq. 4407 */ 4408static struct cfq_queue * 4409split_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq) 4410{ 4411 if (cfqq_process_refs(cfqq) == 1) { 4412 cfqq->pid = current->pid; 4413 cfq_clear_cfqq_coop(cfqq); 4414 cfq_clear_cfqq_split_coop(cfqq); 4415 return cfqq; 4416 } 4417 4418 cic_set_cfqq(cic, NULL, 1); 4419 4420 cfq_put_cooperator(cfqq); 4421 4422 cfq_put_queue(cfqq); 4423 return NULL; 4424} 4425/* 4426 * Allocate cfq data structures associated with this request. 
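 * Called without the queue lock held; cfq_set_request() takes and releases
 * q->queue_lock itself.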
4427 */ 4428static int 4429cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio, 4430 gfp_t gfp_mask) 4431{ 4432 struct cfq_data *cfqd = q->elevator->elevator_data; 4433 struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq); 4434 const int rw = rq_data_dir(rq); 4435 const bool is_sync = rq_is_sync(rq); 4436 struct cfq_queue *cfqq; 4437 4438 spin_lock_irq(q->queue_lock); 4439 4440 check_ioprio_changed(cic, bio); 4441 check_blkcg_changed(cic, bio); 4442new_queue: 4443 cfqq = cic_to_cfqq(cic, is_sync); 4444 if (!cfqq || cfqq == &cfqd->oom_cfqq) { 4445 if (cfqq) 4446 cfq_put_queue(cfqq); 4447 cfqq = cfq_get_queue(cfqd, is_sync, cic, bio); 4448 cic_set_cfqq(cic, cfqq, is_sync); 4449 } else { 4450 /* 4451 * If the queue was seeky for too long, break it apart. 4452 */ 4453 if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) { 4454 cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq"); 4455 cfqq = split_cfqq(cic, cfqq); 4456 if (!cfqq) 4457 goto new_queue; 4458 } 4459 4460 /* 4461 * Check to see if this queue is scheduled to merge with 4462 * another, closely cooperating queue. The merging of 4463 * queues happens here as it must be done in process context. 4464 * The reference on new_cfqq was taken in merge_cfqqs. 4465 */ 4466 if (cfqq->new_cfqq) 4467 cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq); 4468 } 4469 4470 cfqq->allocated[rw]++; 4471 4472 cfqq->ref++; 4473 cfqg_get(cfqq->cfqg); 4474 rq->elv.priv[0] = cfqq; 4475 rq->elv.priv[1] = cfqq->cfqg; 4476 spin_unlock_irq(q->queue_lock); 4477 return 0; 4478} 4479 4480static void cfq_kick_queue(struct work_struct *work) 4481{ 4482 struct cfq_data *cfqd = 4483 container_of(work, struct cfq_data, unplug_work); 4484 struct request_queue *q = cfqd->queue; 4485 4486 spin_lock_irq(q->queue_lock); 4487 __blk_run_queue(cfqd->queue); 4488 spin_unlock_irq(q->queue_lock); 4489} 4490 4491/* 4492 * Timer running if the active_queue is currently idling inside its time slice 4493 */ 4494static enum hrtimer_restart cfq_idle_slice_timer(struct hrtimer *timer) 4495{ 4496 struct cfq_data *cfqd = container_of(timer, struct cfq_data, 4497 idle_slice_timer); 4498 struct cfq_queue *cfqq; 4499 unsigned long flags; 4500 int timed_out = 1; 4501 4502 cfq_log(cfqd, "idle timer fired"); 4503 4504 spin_lock_irqsave(cfqd->queue->queue_lock, flags); 4505 4506 cfqq = cfqd->active_queue; 4507 if (cfqq) { 4508 timed_out = 0; 4509 4510 /* 4511 * We saw a request before the queue expired, let it through 4512 */ 4513 if (cfq_cfqq_must_dispatch(cfqq)) 4514 goto out_kick; 4515 4516 /* 4517 * expired 4518 */ 4519 if (cfq_slice_used(cfqq)) 4520 goto expire; 4521 4522 /* 4523 * only expire and reinvoke request handler, if there are 4524 * other queues with pending requests 4525 */ 4526 if (!cfqd->busy_queues) 4527 goto out_cont; 4528 4529 /* 4530 * not expired and it has a request pending, let it dispatch 4531 */ 4532 if (!RB_EMPTY_ROOT(&cfqq->sort_list)) 4533 goto out_kick; 4534 4535 /* 4536 * Queue depth flag is reset only when the idle didn't succeed 4537 */ 4538 cfq_clear_cfqq_deep(cfqq); 4539 } 4540expire: 4541 cfq_slice_expired(cfqd, timed_out); 4542out_kick: 4543 cfq_schedule_dispatch(cfqd); 4544out_cont: 4545 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); 4546 return HRTIMER_NORESTART; 4547} 4548 4549static void cfq_shutdown_timer_wq(struct cfq_data *cfqd) 4550{ 4551 hrtimer_cancel(&cfqd->idle_slice_timer); 4552 cancel_work_sync(&cfqd->unplug_work); 4553} 4554 4555static void cfq_exit_queue(struct elevator_queue *e) 4556{ 4557 struct cfq_data *cfqd = e->elevator_data; 
4558 struct request_queue *q = cfqd->queue; 4559 4560 cfq_shutdown_timer_wq(cfqd); 4561 4562 spin_lock_irq(q->queue_lock); 4563 4564 if (cfqd->active_queue) 4565 __cfq_slice_expired(cfqd, cfqd->active_queue, 0); 4566 4567 spin_unlock_irq(q->queue_lock); 4568 4569 cfq_shutdown_timer_wq(cfqd); 4570 4571#ifdef CONFIG_CFQ_GROUP_IOSCHED 4572 blkcg_deactivate_policy(q, &blkcg_policy_cfq); 4573#else 4574 kfree(cfqd->root_group); 4575#endif 4576 kfree(cfqd); 4577} 4578 4579static int cfq_init_queue(struct request_queue *q, struct elevator_type *e) 4580{ 4581 struct cfq_data *cfqd; 4582 struct blkcg_gq *blkg __maybe_unused; 4583 int i, ret; 4584 struct elevator_queue *eq; 4585 4586 eq = elevator_alloc(q, e); 4587 if (!eq) 4588 return -ENOMEM; 4589 4590 cfqd = kzalloc_node(sizeof(*cfqd), GFP_KERNEL, q->node); 4591 if (!cfqd) { 4592 kobject_put(&eq->kobj); 4593 return -ENOMEM; 4594 } 4595 eq->elevator_data = cfqd; 4596 4597 cfqd->queue = q; 4598 spin_lock_irq(q->queue_lock); 4599 q->elevator = eq; 4600 spin_unlock_irq(q->queue_lock); 4601 4602 /* Init root service tree */ 4603 cfqd->grp_service_tree = CFQ_RB_ROOT; 4604 4605 /* Init root group and prefer root group over other groups by default */ 4606#ifdef CONFIG_CFQ_GROUP_IOSCHED 4607 ret = blkcg_activate_policy(q, &blkcg_policy_cfq); 4608 if (ret) 4609 goto out_free; 4610 4611 cfqd->root_group = blkg_to_cfqg(q->root_blkg); 4612#else 4613 ret = -ENOMEM; 4614 cfqd->root_group = kzalloc_node(sizeof(*cfqd->root_group), 4615 GFP_KERNEL, cfqd->queue->node); 4616 if (!cfqd->root_group) 4617 goto out_free; 4618 4619 cfq_init_cfqg_base(cfqd->root_group); 4620 cfqd->root_group->weight = 2 * CFQ_WEIGHT_LEGACY_DFL; 4621 cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_LEGACY_DFL; 4622#endif 4623 4624 /* 4625 * Not strictly needed (since RB_ROOT just clears the node and we 4626 * zeroed cfqd on alloc), but better be safe in case someone decides 4627 * to add magic to the rb code 4628 */ 4629 for (i = 0; i < CFQ_PRIO_LISTS; i++) 4630 cfqd->prio_trees[i] = RB_ROOT; 4631 4632 /* 4633 * Our fallback cfqq if cfq_get_queue() runs into OOM issues. 4634 * Grab a permanent reference to it, so that the normal code flow 4635 * will not attempt to free it. oom_cfqq is linked to root_group 4636 * but shouldn't hold a reference as it'll never be unlinked. Lose 4637 * the reference from linking right away. 4638 */ 4639 cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0); 4640 cfqd->oom_cfqq.ref++; 4641 4642 spin_lock_irq(q->queue_lock); 4643 cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, cfqd->root_group); 4644 cfqg_put(cfqd->root_group); 4645 spin_unlock_irq(q->queue_lock); 4646 4647 hrtimer_init(&cfqd->idle_slice_timer, CLOCK_MONOTONIC, 4648 HRTIMER_MODE_REL); 4649 cfqd->idle_slice_timer.function = cfq_idle_slice_timer; 4650 4651 INIT_WORK(&cfqd->unplug_work, cfq_kick_queue); 4652 4653 cfqd->cfq_quantum = cfq_quantum; 4654 cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0]; 4655 cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1]; 4656 cfqd->cfq_back_max = cfq_back_max; 4657 cfqd->cfq_back_penalty = cfq_back_penalty; 4658 cfqd->cfq_slice[0] = cfq_slice_async; 4659 cfqd->cfq_slice[1] = cfq_slice_sync; 4660 cfqd->cfq_target_latency = cfq_target_latency; 4661 cfqd->cfq_slice_async_rq = cfq_slice_async_rq; 4662 cfqd->cfq_slice_idle = cfq_slice_idle; 4663 cfqd->cfq_group_idle = cfq_group_idle; 4664 cfqd->cfq_latency = 1; 4665 cfqd->hw_tag = -1; 4666 /* 4667 * we optimistically start assuming sync ops weren't delayed in last 4668 * second, in order to have larger depth for async operations. 
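 * (Seeding last_delayed_sync one second in the past marks the whole preceding
 * second as free of delayed sync IO.)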
4669 */ 4670 cfqd->last_delayed_sync = ktime_get_ns() - NSEC_PER_SEC; 4671 return 0; 4672 4673out_free: 4674 kfree(cfqd); 4675 kobject_put(&eq->kobj); 4676 return ret; 4677} 4678 4679static void cfq_registered_queue(struct request_queue *q) 4680{ 4681 struct elevator_queue *e = q->elevator; 4682 struct cfq_data *cfqd = e->elevator_data; 4683 4684 /* 4685 * Default to IOPS mode with no idling for SSDs 4686 */ 4687 if (blk_queue_nonrot(q)) 4688 cfqd->cfq_slice_idle = 0; 4689} 4690 4691/* 4692 * sysfs parts below --> 4693 */ 4694static ssize_t 4695cfq_var_show(unsigned int var, char *page) 4696{ 4697 return sprintf(page, "%u\n", var); 4698} 4699 4700static ssize_t 4701cfq_var_store(unsigned int *var, const char *page, size_t count) 4702{ 4703 char *p = (char *) page; 4704 4705 *var = simple_strtoul(p, &p, 10); 4706 return count; 4707} 4708 4709#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ 4710static ssize_t __FUNC(struct elevator_queue *e, char *page) \ 4711{ \ 4712 struct cfq_data *cfqd = e->elevator_data; \ 4713 u64 __data = __VAR; \ 4714 if (__CONV) \ 4715 __data = div_u64(__data, NSEC_PER_MSEC); \ 4716 return cfq_var_show(__data, (page)); \ 4717} 4718SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0); 4719SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1); 4720SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1); 4721SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0); 4722SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0); 4723SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1); 4724SHOW_FUNCTION(cfq_group_idle_show, cfqd->cfq_group_idle, 1); 4725SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1); 4726SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); 4727SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); 4728SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0); 4729SHOW_FUNCTION(cfq_target_latency_show, cfqd->cfq_target_latency, 1); 4730#undef SHOW_FUNCTION 4731 4732#define USEC_SHOW_FUNCTION(__FUNC, __VAR) \ 4733static ssize_t __FUNC(struct elevator_queue *e, char *page) \ 4734{ \ 4735 struct cfq_data *cfqd = e->elevator_data; \ 4736 u64 __data = __VAR; \ 4737 __data = div_u64(__data, NSEC_PER_USEC); \ 4738 return cfq_var_show(__data, (page)); \ 4739} 4740USEC_SHOW_FUNCTION(cfq_slice_idle_us_show, cfqd->cfq_slice_idle); 4741USEC_SHOW_FUNCTION(cfq_group_idle_us_show, cfqd->cfq_group_idle); 4742USEC_SHOW_FUNCTION(cfq_slice_sync_us_show, cfqd->cfq_slice[1]); 4743USEC_SHOW_FUNCTION(cfq_slice_async_us_show, cfqd->cfq_slice[0]); 4744USEC_SHOW_FUNCTION(cfq_target_latency_us_show, cfqd->cfq_target_latency); 4745#undef USEC_SHOW_FUNCTION 4746 4747#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ 4748static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ 4749{ \ 4750 struct cfq_data *cfqd = e->elevator_data; \ 4751 unsigned int __data; \ 4752 int ret = cfq_var_store(&__data, (page), count); \ 4753 if (__data < (MIN)) \ 4754 __data = (MIN); \ 4755 else if (__data > (MAX)) \ 4756 __data = (MAX); \ 4757 if (__CONV) \ 4758 *(__PTR) = (u64)__data * NSEC_PER_MSEC; \ 4759 else \ 4760 *(__PTR) = __data; \ 4761 return ret; \ 4762} 4763STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0); 4764STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1, 4765 UINT_MAX, 1); 4766STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1, 4767 UINT_MAX, 1); 4768STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 
0, UINT_MAX, 0); 4769STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1, 4770 UINT_MAX, 0); 4771STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1); 4772STORE_FUNCTION(cfq_group_idle_store, &cfqd->cfq_group_idle, 0, UINT_MAX, 1); 4773STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1); 4774STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1); 4775STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, 4776 UINT_MAX, 0); 4777STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0); 4778STORE_FUNCTION(cfq_target_latency_store, &cfqd->cfq_target_latency, 1, UINT_MAX, 1); 4779#undef STORE_FUNCTION 4780 4781#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ 4782static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ 4783{ \ 4784 struct cfq_data *cfqd = e->elevator_data; \ 4785 unsigned int __data; \ 4786 int ret = cfq_var_store(&__data, (page), count); \ 4787 if (__data < (MIN)) \ 4788 __data = (MIN); \ 4789 else if (__data > (MAX)) \ 4790 __data = (MAX); \ 4791 *(__PTR) = (u64)__data * NSEC_PER_USEC; \ 4792 return ret; \ 4793} 4794USEC_STORE_FUNCTION(cfq_slice_idle_us_store, &cfqd->cfq_slice_idle, 0, UINT_MAX); 4795USEC_STORE_FUNCTION(cfq_group_idle_us_store, &cfqd->cfq_group_idle, 0, UINT_MAX); 4796USEC_STORE_FUNCTION(cfq_slice_sync_us_store, &cfqd->cfq_slice[1], 1, UINT_MAX); 4797USEC_STORE_FUNCTION(cfq_slice_async_us_store, &cfqd->cfq_slice[0], 1, UINT_MAX); 4798USEC_STORE_FUNCTION(cfq_target_latency_us_store, &cfqd->cfq_target_latency, 1, UINT_MAX); 4799#undef USEC_STORE_FUNCTION 4800 4801#define CFQ_ATTR(name) \ 4802 __ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store) 4803 4804static struct elv_fs_entry cfq_attrs[] = { 4805 CFQ_ATTR(quantum), 4806 CFQ_ATTR(fifo_expire_sync), 4807 CFQ_ATTR(fifo_expire_async), 4808 CFQ_ATTR(back_seek_max), 4809 CFQ_ATTR(back_seek_penalty), 4810 CFQ_ATTR(slice_sync), 4811 CFQ_ATTR(slice_sync_us), 4812 CFQ_ATTR(slice_async), 4813 CFQ_ATTR(slice_async_us), 4814 CFQ_ATTR(slice_async_rq), 4815 CFQ_ATTR(slice_idle), 4816 CFQ_ATTR(slice_idle_us), 4817 CFQ_ATTR(group_idle), 4818 CFQ_ATTR(group_idle_us), 4819 CFQ_ATTR(low_latency), 4820 CFQ_ATTR(target_latency), 4821 CFQ_ATTR(target_latency_us), 4822 __ATTR_NULL 4823}; 4824 4825static struct elevator_type iosched_cfq = { 4826 .ops = { 4827 .elevator_merge_fn = cfq_merge, 4828 .elevator_merged_fn = cfq_merged_request, 4829 .elevator_merge_req_fn = cfq_merged_requests, 4830 .elevator_allow_bio_merge_fn = cfq_allow_bio_merge, 4831 .elevator_allow_rq_merge_fn = cfq_allow_rq_merge, 4832 .elevator_bio_merged_fn = cfq_bio_merged, 4833 .elevator_dispatch_fn = cfq_dispatch_requests, 4834 .elevator_add_req_fn = cfq_insert_request, 4835 .elevator_activate_req_fn = cfq_activate_request, 4836 .elevator_deactivate_req_fn = cfq_deactivate_request, 4837 .elevator_completed_req_fn = cfq_completed_request, 4838 .elevator_former_req_fn = elv_rb_former_request, 4839 .elevator_latter_req_fn = elv_rb_latter_request, 4840 .elevator_init_icq_fn = cfq_init_icq, 4841 .elevator_exit_icq_fn = cfq_exit_icq, 4842 .elevator_set_req_fn = cfq_set_request, 4843 .elevator_put_req_fn = cfq_put_request, 4844 .elevator_may_queue_fn = cfq_may_queue, 4845 .elevator_init_fn = cfq_init_queue, 4846 .elevator_exit_fn = cfq_exit_queue, 4847 .elevator_registered_fn = cfq_registered_queue, 4848 }, 4849 .icq_size = sizeof(struct cfq_io_cq), 4850 .icq_align = __alignof__(struct cfq_io_cq), 4851 
.elevator_attrs = cfq_attrs, 4852 .elevator_name = "cfq", 4853 .elevator_owner = THIS_MODULE, 4854}; 4855 4856#ifdef CONFIG_CFQ_GROUP_IOSCHED 4857static struct blkcg_policy blkcg_policy_cfq = { 4858 .dfl_cftypes = cfq_blkcg_files, 4859 .legacy_cftypes = cfq_blkcg_legacy_files, 4860 4861 .cpd_alloc_fn = cfq_cpd_alloc, 4862 .cpd_init_fn = cfq_cpd_init, 4863 .cpd_free_fn = cfq_cpd_free, 4864 .cpd_bind_fn = cfq_cpd_bind, 4865 4866 .pd_alloc_fn = cfq_pd_alloc, 4867 .pd_init_fn = cfq_pd_init, 4868 .pd_offline_fn = cfq_pd_offline, 4869 .pd_free_fn = cfq_pd_free, 4870 .pd_reset_stats_fn = cfq_pd_reset_stats, 4871}; 4872#endif 4873 4874static int __init cfq_init(void) 4875{ 4876 int ret; 4877 4878#ifdef CONFIG_CFQ_GROUP_IOSCHED 4879 ret = blkcg_policy_register(&blkcg_policy_cfq); 4880 if (ret) 4881 return ret; 4882#else 4883 cfq_group_idle = 0; 4884#endif 4885 4886 ret = -ENOMEM; 4887 cfq_pool = KMEM_CACHE(cfq_queue, 0); 4888 if (!cfq_pool) 4889 goto err_pol_unreg; 4890 4891 ret = elv_register(&iosched_cfq); 4892 if (ret) 4893 goto err_free_pool; 4894 4895 return 0; 4896 4897err_free_pool: 4898 kmem_cache_destroy(cfq_pool); 4899err_pol_unreg: 4900#ifdef CONFIG_CFQ_GROUP_IOSCHED 4901 blkcg_policy_unregister(&blkcg_policy_cfq); 4902#endif 4903 return ret; 4904} 4905 4906static void __exit cfq_exit(void) 4907{ 4908#ifdef CONFIG_CFQ_GROUP_IOSCHED 4909 blkcg_policy_unregister(&blkcg_policy_cfq); 4910#endif 4911 elv_unregister(&iosched_cfq); 4912 kmem_cache_destroy(cfq_pool); 4913} 4914 4915module_init(cfq_init); 4916module_exit(cfq_exit); 4917 4918MODULE_AUTHOR("Jens Axboe"); 4919MODULE_LICENSE("GPL"); 4920MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");
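
The two per-queue heuristics used above lend themselves to a small standalone illustration. The userspace sketch below is not part of this file; it only mirrors the arithmetic of __cfq_update_io_thinktime() (a 7/8-weighted decaying think-time average, with each sample clamped to twice the idle slice) and of cfq_update_io_seektime()/CFQQ_SEEKY() (a 32-bit shift register in which a set bit records a seeky request). The struct, helper names and the main() driver are illustrative only.

#include <stdint.h>
#include <stdio.h>

/* Decaying think-time average, mirroring __cfq_update_io_thinktime(). */
struct ttime_sketch {
	uint64_t samples;	/* decayed sample weight, scaled by 256 */
	uint64_t total;		/* decayed sum of think times, scaled by 256 */
	uint64_t mean;		/* total / samples, in nanoseconds */
};

static void update_thinktime(struct ttime_sketch *t, uint64_t elapsed_ns,
			     uint64_t slice_idle_ns)
{
	/* One long pause must not dominate the average. */
	if (elapsed_ns > 2 * slice_idle_ns)
		elapsed_ns = 2 * slice_idle_ns;

	t->samples = (7 * t->samples + 256) / 8;
	t->total   = (7 * t->total + 256 * elapsed_ns) / 8;
	t->mean    = (t->total + 128) / t->samples;
}

/* Seek classifier, mirroring cfq_update_io_seektime() on rotational disks;
 * 800 sectors is the same threshold CFQ uses as CFQQ_SEEK_THR. */
#define SEEK_THR_SECTORS	(8 * 100)

static int update_seek_history(uint32_t *history, uint64_t last_pos,
			       uint64_t pos)
{
	uint64_t sdist = pos > last_pos ? pos - last_pos : last_pos - pos;

	/* Shift in one bit per request; a set bit means "this one seeked". */
	*history = (*history << 1) | (sdist > SEEK_THR_SECTORS);

	/* CFQQ_SEEKY(): more than 32/8 of the last 32 requests seeked. */
	return __builtin_popcount(*history) > 32 / 8;
}

int main(void)
{
	struct ttime_sketch t = { 0 };
	uint32_t history = 0;
	uint64_t last = 0, pos = 0;
	int i;

	for (i = 0; i < 8; i++) {
		/* 2 ms of think time per request, 8 ms idle slice. */
		update_thinktime(&t, 2000000ULL, 8000000ULL);

		last = pos;
		pos += (i < 5) ? 100000 : 8;	/* five long seeks, then sequential */
		printf("request %d: mean think time %llu ns, seeky=%d\n",
		       i, (unsigned long long)t.mean,
		       update_seek_history(&history, last, pos));
	}
	return 0;
}

With CFQ's default slice_idle of 8 ms, a mean think time above that value causes cfq_update_idle_window() above to turn idling off for the queue, and a queue whose seek history has more than 32/8 bits set is treated as seeky for the same decision (unless the queue has been marked deep).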