Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at v4.2-rc7 4754 lines 125 kB view raw
1/* 2 * CFQ, or complete fairness queueing, disk scheduler. 3 * 4 * Based on ideas from a previously unfinished io 5 * scheduler (round robin per-process disk scheduling) and Andrea Arcangeli. 6 * 7 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> 8 */ 9#include <linux/module.h> 10#include <linux/slab.h> 11#include <linux/blkdev.h> 12#include <linux/elevator.h> 13#include <linux/jiffies.h> 14#include <linux/rbtree.h> 15#include <linux/ioprio.h> 16#include <linux/blktrace_api.h> 17#include <linux/blk-cgroup.h> 18#include "blk.h" 19 20/* 21 * tunables 22 */ 23/* max queue in one round of service */ 24static const int cfq_quantum = 8; 25static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; 26/* maximum backwards seek, in KiB */ 27static const int cfq_back_max = 16 * 1024; 28/* penalty of a backwards seek */ 29static const int cfq_back_penalty = 2; 30static const int cfq_slice_sync = HZ / 10; 31static int cfq_slice_async = HZ / 25; 32static const int cfq_slice_async_rq = 2; 33static int cfq_slice_idle = HZ / 125; 34static int cfq_group_idle = HZ / 125; 35static const int cfq_target_latency = HZ * 3/10; /* 300 ms */ 36static const int cfq_hist_divisor = 4; 37 38/* 39 * offset from end of service tree 40 */ 41#define CFQ_IDLE_DELAY (HZ / 5) 42 43/* 44 * below this threshold, we consider thinktime immediate 45 */ 46#define CFQ_MIN_TT (2) 47 48#define CFQ_SLICE_SCALE (5) 49#define CFQ_HW_QUEUE_MIN (5) 50#define CFQ_SERVICE_SHIFT 12 51 52#define CFQQ_SEEK_THR (sector_t)(8 * 100) 53#define CFQQ_CLOSE_THR (sector_t)(8 * 1024) 54#define CFQQ_SECT_THR_NONROT (sector_t)(2 * 32) 55#define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8) 56 57#define RQ_CIC(rq) icq_to_cic((rq)->elv.icq) 58#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elv.priv[0]) 59#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elv.priv[1]) 60 61static struct kmem_cache *cfq_pool; 62 63#define CFQ_PRIO_LISTS IOPRIO_BE_NR 64#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE) 65#define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) 66 67#define sample_valid(samples) ((samples) > 80) 68#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node) 69 70/* blkio-related constants */ 71#define CFQ_WEIGHT_MIN 10 72#define CFQ_WEIGHT_MAX 1000 73#define CFQ_WEIGHT_DEFAULT 500 74 75struct cfq_ttime { 76 unsigned long last_end_request; 77 78 unsigned long ttime_total; 79 unsigned long ttime_samples; 80 unsigned long ttime_mean; 81}; 82 83/* 84 * Most of our rbtree usage is for sorting with min extraction, so 85 * if we cache the leftmost node we don't have to walk down the tree 86 * to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should 87 * move this into the elevator for the rq sorting as well. 88 */ 89struct cfq_rb_root { 90 struct rb_root rb; 91 struct rb_node *left; 92 unsigned count; 93 u64 min_vdisktime; 94 struct cfq_ttime ttime; 95}; 96#define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, \ 97 .ttime = {.last_end_request = jiffies,},} 98 99/* 100 * Per process-grouping structure 101 */ 102struct cfq_queue { 103 /* reference count */ 104 int ref; 105 /* various state flags, see below */ 106 unsigned int flags; 107 /* parent cfq_data */ 108 struct cfq_data *cfqd; 109 /* service_tree member */ 110 struct rb_node rb_node; 111 /* service_tree key */ 112 unsigned long rb_key; 113 /* prio tree member */ 114 struct rb_node p_node; 115 /* prio tree root we belong to, if any */ 116 struct rb_root *p_root; 117 /* sorted list of pending requests */ 118 struct rb_root sort_list; 119 /* if fifo isn't expired, next request to serve */ 120 struct request *next_rq; 121 /* requests queued in sort_list */ 122 int queued[2]; 123 /* currently allocated requests */ 124 int allocated[2]; 125 /* fifo list of requests in sort_list */ 126 struct list_head fifo; 127 128 /* time when queue got scheduled in to dispatch first request. */ 129 unsigned long dispatch_start; 130 unsigned int allocated_slice; 131 unsigned int slice_dispatch; 132 /* time when first request from queue completed and slice started. */ 133 unsigned long slice_start; 134 unsigned long slice_end; 135 long slice_resid; 136 137 /* pending priority requests */ 138 int prio_pending; 139 /* number of requests that are on the dispatch list or inside driver */ 140 int dispatched; 141 142 /* io prio of this group */ 143 unsigned short ioprio, org_ioprio; 144 unsigned short ioprio_class; 145 146 pid_t pid; 147 148 u32 seek_history; 149 sector_t last_request_pos; 150 151 struct cfq_rb_root *service_tree; 152 struct cfq_queue *new_cfqq; 153 struct cfq_group *cfqg; 154 /* Number of sectors dispatched from queue in single dispatch round */ 155 unsigned long nr_sectors; 156}; 157 158/* 159 * First index in the service_trees. 160 * IDLE is handled separately, so it has negative index 161 */ 162enum wl_class_t { 163 BE_WORKLOAD = 0, 164 RT_WORKLOAD = 1, 165 IDLE_WORKLOAD = 2, 166 CFQ_PRIO_NR, 167}; 168 169/* 170 * Second index in the service_trees. 171 */ 172enum wl_type_t { 173 ASYNC_WORKLOAD = 0, 174 SYNC_NOIDLE_WORKLOAD = 1, 175 SYNC_WORKLOAD = 2 176}; 177 178struct cfqg_stats { 179#ifdef CONFIG_CFQ_GROUP_IOSCHED 180 /* total bytes transferred */ 181 struct blkg_rwstat service_bytes; 182 /* total IOs serviced, post merge */ 183 struct blkg_rwstat serviced; 184 /* number of ios merged */ 185 struct blkg_rwstat merged; 186 /* total time spent on device in ns, may not be accurate w/ queueing */ 187 struct blkg_rwstat service_time; 188 /* total time spent waiting in scheduler queue in ns */ 189 struct blkg_rwstat wait_time; 190 /* number of IOs queued up */ 191 struct blkg_rwstat queued; 192 /* total sectors transferred */ 193 struct blkg_stat sectors; 194 /* total disk time and nr sectors dispatched by this group */ 195 struct blkg_stat time; 196#ifdef CONFIG_DEBUG_BLK_CGROUP 197 /* time not charged to this cgroup */ 198 struct blkg_stat unaccounted_time; 199 /* sum of number of ios queued across all samples */ 200 struct blkg_stat avg_queue_size_sum; 201 /* count of samples taken for average */ 202 struct blkg_stat avg_queue_size_samples; 203 /* how many times this group has been removed from service tree */ 204 struct blkg_stat dequeue; 205 /* total time spent waiting for it to be assigned a timeslice. */ 206 struct blkg_stat group_wait_time; 207 /* time spent idling for this blkcg_gq */ 208 struct blkg_stat idle_time; 209 /* total time with empty current active q with other requests queued */ 210 struct blkg_stat empty_time; 211 /* fields after this shouldn't be cleared on stat reset */ 212 uint64_t start_group_wait_time; 213 uint64_t start_idle_time; 214 uint64_t start_empty_time; 215 uint16_t flags; 216#endif /* CONFIG_DEBUG_BLK_CGROUP */ 217#endif /* CONFIG_CFQ_GROUP_IOSCHED */ 218}; 219 220/* Per-cgroup data */ 221struct cfq_group_data { 222 /* must be the first member */ 223 struct blkcg_policy_data pd; 224 225 unsigned int weight; 226 unsigned int leaf_weight; 227}; 228 229/* This is per cgroup per device grouping structure */ 230struct cfq_group { 231 /* must be the first member */ 232 struct blkg_policy_data pd; 233 234 /* group service_tree member */ 235 struct rb_node rb_node; 236 237 /* group service_tree key */ 238 u64 vdisktime; 239 240 /* 241 * The number of active cfqgs and sum of their weights under this 242 * cfqg. This covers this cfqg's leaf_weight and all children's 243 * weights, but does not cover weights of further descendants. 244 * 245 * If a cfqg is on the service tree, it's active. An active cfqg 246 * also activates its parent and contributes to the children_weight 247 * of the parent. 248 */ 249 int nr_active; 250 unsigned int children_weight; 251 252 /* 253 * vfraction is the fraction of vdisktime that the tasks in this 254 * cfqg are entitled to. This is determined by compounding the 255 * ratios walking up from this cfqg to the root. 256 * 257 * It is in fixed point w/ CFQ_SERVICE_SHIFT and the sum of all 258 * vfractions on a service tree is approximately 1. The sum may 259 * deviate a bit due to rounding errors and fluctuations caused by 260 * cfqgs entering and leaving the service tree. 261 */ 262 unsigned int vfraction; 263 264 /* 265 * There are two weights - (internal) weight is the weight of this 266 * cfqg against the sibling cfqgs. leaf_weight is the wight of 267 * this cfqg against the child cfqgs. For the root cfqg, both 268 * weights are kept in sync for backward compatibility. 269 */ 270 unsigned int weight; 271 unsigned int new_weight; 272 unsigned int dev_weight; 273 274 unsigned int leaf_weight; 275 unsigned int new_leaf_weight; 276 unsigned int dev_leaf_weight; 277 278 /* number of cfqq currently on this group */ 279 int nr_cfqq; 280 281 /* 282 * Per group busy queues average. Useful for workload slice calc. We 283 * create the array for each prio class but at run time it is used 284 * only for RT and BE class and slot for IDLE class remains unused. 285 * This is primarily done to avoid confusion and a gcc warning. 286 */ 287 unsigned int busy_queues_avg[CFQ_PRIO_NR]; 288 /* 289 * rr lists of queues with requests. We maintain service trees for 290 * RT and BE classes. These trees are subdivided in subclasses 291 * of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For IDLE 292 * class there is no subclassification and all the cfq queues go on 293 * a single tree service_tree_idle. 294 * Counts are embedded in the cfq_rb_root 295 */ 296 struct cfq_rb_root service_trees[2][3]; 297 struct cfq_rb_root service_tree_idle; 298 299 unsigned long saved_wl_slice; 300 enum wl_type_t saved_wl_type; 301 enum wl_class_t saved_wl_class; 302 303 /* number of requests that are on the dispatch list or inside driver */ 304 int dispatched; 305 struct cfq_ttime ttime; 306 struct cfqg_stats stats; /* stats for this cfqg */ 307 struct cfqg_stats dead_stats; /* stats pushed from dead children */ 308}; 309 310struct cfq_io_cq { 311 struct io_cq icq; /* must be the first member */ 312 struct cfq_queue *cfqq[2]; 313 struct cfq_ttime ttime; 314 int ioprio; /* the current ioprio */ 315#ifdef CONFIG_CFQ_GROUP_IOSCHED 316 uint64_t blkcg_serial_nr; /* the current blkcg serial */ 317#endif 318}; 319 320/* 321 * Per block device queue structure 322 */ 323struct cfq_data { 324 struct request_queue *queue; 325 /* Root service tree for cfq_groups */ 326 struct cfq_rb_root grp_service_tree; 327 struct cfq_group *root_group; 328 329 /* 330 * The priority currently being served 331 */ 332 enum wl_class_t serving_wl_class; 333 enum wl_type_t serving_wl_type; 334 unsigned long workload_expires; 335 struct cfq_group *serving_group; 336 337 /* 338 * Each priority tree is sorted by next_request position. These 339 * trees are used when determining if two or more queues are 340 * interleaving requests (see cfq_close_cooperator). 341 */ 342 struct rb_root prio_trees[CFQ_PRIO_LISTS]; 343 344 unsigned int busy_queues; 345 unsigned int busy_sync_queues; 346 347 int rq_in_driver; 348 int rq_in_flight[2]; 349 350 /* 351 * queue-depth detection 352 */ 353 int rq_queued; 354 int hw_tag; 355 /* 356 * hw_tag can be 357 * -1 => indeterminate, (cfq will behave as if NCQ is present, to allow better detection) 358 * 1 => NCQ is present (hw_tag_est_depth is the estimated max depth) 359 * 0 => no NCQ 360 */ 361 int hw_tag_est_depth; 362 unsigned int hw_tag_samples; 363 364 /* 365 * idle window management 366 */ 367 struct timer_list idle_slice_timer; 368 struct work_struct unplug_work; 369 370 struct cfq_queue *active_queue; 371 struct cfq_io_cq *active_cic; 372 373 /* 374 * async queue for each priority case 375 */ 376 struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR]; 377 struct cfq_queue *async_idle_cfqq; 378 379 sector_t last_position; 380 381 /* 382 * tunables, see top of file 383 */ 384 unsigned int cfq_quantum; 385 unsigned int cfq_fifo_expire[2]; 386 unsigned int cfq_back_penalty; 387 unsigned int cfq_back_max; 388 unsigned int cfq_slice[2]; 389 unsigned int cfq_slice_async_rq; 390 unsigned int cfq_slice_idle; 391 unsigned int cfq_group_idle; 392 unsigned int cfq_latency; 393 unsigned int cfq_target_latency; 394 395 /* 396 * Fallback dummy cfqq for extreme OOM conditions 397 */ 398 struct cfq_queue oom_cfqq; 399 400 unsigned long last_delayed_sync; 401}; 402 403static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); 404 405static struct cfq_rb_root *st_for(struct cfq_group *cfqg, 406 enum wl_class_t class, 407 enum wl_type_t type) 408{ 409 if (!cfqg) 410 return NULL; 411 412 if (class == IDLE_WORKLOAD) 413 return &cfqg->service_tree_idle; 414 415 return &cfqg->service_trees[class][type]; 416} 417 418enum cfqq_state_flags { 419 CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */ 420 CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */ 421 CFQ_CFQQ_FLAG_must_dispatch, /* must be allowed a dispatch */ 422 CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */ 423 CFQ_CFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ 424 CFQ_CFQQ_FLAG_idle_window, /* slice idling enabled */ 425 CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */ 426 CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */ 427 CFQ_CFQQ_FLAG_sync, /* synchronous queue */ 428 CFQ_CFQQ_FLAG_coop, /* cfqq is shared */ 429 CFQ_CFQQ_FLAG_split_coop, /* shared cfqq will be splitted */ 430 CFQ_CFQQ_FLAG_deep, /* sync cfqq experienced large depth */ 431 CFQ_CFQQ_FLAG_wait_busy, /* Waiting for next request */ 432}; 433 434#define CFQ_CFQQ_FNS(name) \ 435static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq) \ 436{ \ 437 (cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name); \ 438} \ 439static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq) \ 440{ \ 441 (cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name); \ 442} \ 443static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq) \ 444{ \ 445 return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0; \ 446} 447 448CFQ_CFQQ_FNS(on_rr); 449CFQ_CFQQ_FNS(wait_request); 450CFQ_CFQQ_FNS(must_dispatch); 451CFQ_CFQQ_FNS(must_alloc_slice); 452CFQ_CFQQ_FNS(fifo_expire); 453CFQ_CFQQ_FNS(idle_window); 454CFQ_CFQQ_FNS(prio_changed); 455CFQ_CFQQ_FNS(slice_new); 456CFQ_CFQQ_FNS(sync); 457CFQ_CFQQ_FNS(coop); 458CFQ_CFQQ_FNS(split_coop); 459CFQ_CFQQ_FNS(deep); 460CFQ_CFQQ_FNS(wait_busy); 461#undef CFQ_CFQQ_FNS 462 463#if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) 464 465/* cfqg stats flags */ 466enum cfqg_stats_flags { 467 CFQG_stats_waiting = 0, 468 CFQG_stats_idling, 469 CFQG_stats_empty, 470}; 471 472#define CFQG_FLAG_FNS(name) \ 473static inline void cfqg_stats_mark_##name(struct cfqg_stats *stats) \ 474{ \ 475 stats->flags |= (1 << CFQG_stats_##name); \ 476} \ 477static inline void cfqg_stats_clear_##name(struct cfqg_stats *stats) \ 478{ \ 479 stats->flags &= ~(1 << CFQG_stats_##name); \ 480} \ 481static inline int cfqg_stats_##name(struct cfqg_stats *stats) \ 482{ \ 483 return (stats->flags & (1 << CFQG_stats_##name)) != 0; \ 484} \ 485 486CFQG_FLAG_FNS(waiting) 487CFQG_FLAG_FNS(idling) 488CFQG_FLAG_FNS(empty) 489#undef CFQG_FLAG_FNS 490 491/* This should be called with the queue_lock held. */ 492static void cfqg_stats_update_group_wait_time(struct cfqg_stats *stats) 493{ 494 unsigned long long now; 495 496 if (!cfqg_stats_waiting(stats)) 497 return; 498 499 now = sched_clock(); 500 if (time_after64(now, stats->start_group_wait_time)) 501 blkg_stat_add(&stats->group_wait_time, 502 now - stats->start_group_wait_time); 503 cfqg_stats_clear_waiting(stats); 504} 505 506/* This should be called with the queue_lock held. */ 507static void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg, 508 struct cfq_group *curr_cfqg) 509{ 510 struct cfqg_stats *stats = &cfqg->stats; 511 512 if (cfqg_stats_waiting(stats)) 513 return; 514 if (cfqg == curr_cfqg) 515 return; 516 stats->start_group_wait_time = sched_clock(); 517 cfqg_stats_mark_waiting(stats); 518} 519 520/* This should be called with the queue_lock held. */ 521static void cfqg_stats_end_empty_time(struct cfqg_stats *stats) 522{ 523 unsigned long long now; 524 525 if (!cfqg_stats_empty(stats)) 526 return; 527 528 now = sched_clock(); 529 if (time_after64(now, stats->start_empty_time)) 530 blkg_stat_add(&stats->empty_time, 531 now - stats->start_empty_time); 532 cfqg_stats_clear_empty(stats); 533} 534 535static void cfqg_stats_update_dequeue(struct cfq_group *cfqg) 536{ 537 blkg_stat_add(&cfqg->stats.dequeue, 1); 538} 539 540static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) 541{ 542 struct cfqg_stats *stats = &cfqg->stats; 543 544 if (blkg_rwstat_total(&stats->queued)) 545 return; 546 547 /* 548 * group is already marked empty. This can happen if cfqq got new 549 * request in parent group and moved to this group while being added 550 * to service tree. Just ignore the event and move on. 551 */ 552 if (cfqg_stats_empty(stats)) 553 return; 554 555 stats->start_empty_time = sched_clock(); 556 cfqg_stats_mark_empty(stats); 557} 558 559static void cfqg_stats_update_idle_time(struct cfq_group *cfqg) 560{ 561 struct cfqg_stats *stats = &cfqg->stats; 562 563 if (cfqg_stats_idling(stats)) { 564 unsigned long long now = sched_clock(); 565 566 if (time_after64(now, stats->start_idle_time)) 567 blkg_stat_add(&stats->idle_time, 568 now - stats->start_idle_time); 569 cfqg_stats_clear_idling(stats); 570 } 571} 572 573static void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) 574{ 575 struct cfqg_stats *stats = &cfqg->stats; 576 577 BUG_ON(cfqg_stats_idling(stats)); 578 579 stats->start_idle_time = sched_clock(); 580 cfqg_stats_mark_idling(stats); 581} 582 583static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) 584{ 585 struct cfqg_stats *stats = &cfqg->stats; 586 587 blkg_stat_add(&stats->avg_queue_size_sum, 588 blkg_rwstat_total(&stats->queued)); 589 blkg_stat_add(&stats->avg_queue_size_samples, 1); 590 cfqg_stats_update_group_wait_time(stats); 591} 592 593#else /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */ 594 595static inline void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg, struct cfq_group *curr_cfqg) { } 596static inline void cfqg_stats_end_empty_time(struct cfqg_stats *stats) { } 597static inline void cfqg_stats_update_dequeue(struct cfq_group *cfqg) { } 598static inline void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) { } 599static inline void cfqg_stats_update_idle_time(struct cfq_group *cfqg) { } 600static inline void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) { } 601static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { } 602 603#endif /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */ 604 605#ifdef CONFIG_CFQ_GROUP_IOSCHED 606 607static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd) 608{ 609 return pd ? container_of(pd, struct cfq_group, pd) : NULL; 610} 611 612static struct cfq_group_data 613*cpd_to_cfqgd(struct blkcg_policy_data *cpd) 614{ 615 return cpd ? container_of(cpd, struct cfq_group_data, pd) : NULL; 616} 617 618static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg) 619{ 620 return pd_to_blkg(&cfqg->pd); 621} 622 623static struct blkcg_policy blkcg_policy_cfq; 624 625static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg) 626{ 627 return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq)); 628} 629 630static struct cfq_group_data *blkcg_to_cfqgd(struct blkcg *blkcg) 631{ 632 return cpd_to_cfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_cfq)); 633} 634 635static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) 636{ 637 struct blkcg_gq *pblkg = cfqg_to_blkg(cfqg)->parent; 638 639 return pblkg ? blkg_to_cfqg(pblkg) : NULL; 640} 641 642static inline void cfqg_get(struct cfq_group *cfqg) 643{ 644 return blkg_get(cfqg_to_blkg(cfqg)); 645} 646 647static inline void cfqg_put(struct cfq_group *cfqg) 648{ 649 return blkg_put(cfqg_to_blkg(cfqg)); 650} 651 652#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) do { \ 653 char __pbuf[128]; \ 654 \ 655 blkg_path(cfqg_to_blkg((cfqq)->cfqg), __pbuf, sizeof(__pbuf)); \ 656 blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c %s " fmt, (cfqq)->pid, \ 657 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ 658 cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\ 659 __pbuf, ##args); \ 660} while (0) 661 662#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do { \ 663 char __pbuf[128]; \ 664 \ 665 blkg_path(cfqg_to_blkg(cfqg), __pbuf, sizeof(__pbuf)); \ 666 blk_add_trace_msg((cfqd)->queue, "%s " fmt, __pbuf, ##args); \ 667} while (0) 668 669static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg, 670 struct cfq_group *curr_cfqg, int rw) 671{ 672 blkg_rwstat_add(&cfqg->stats.queued, rw, 1); 673 cfqg_stats_end_empty_time(&cfqg->stats); 674 cfqg_stats_set_start_group_wait_time(cfqg, curr_cfqg); 675} 676 677static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg, 678 unsigned long time, unsigned long unaccounted_time) 679{ 680 blkg_stat_add(&cfqg->stats.time, time); 681#ifdef CONFIG_DEBUG_BLK_CGROUP 682 blkg_stat_add(&cfqg->stats.unaccounted_time, unaccounted_time); 683#endif 684} 685 686static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw) 687{ 688 blkg_rwstat_add(&cfqg->stats.queued, rw, -1); 689} 690 691static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) 692{ 693 blkg_rwstat_add(&cfqg->stats.merged, rw, 1); 694} 695 696static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg, 697 uint64_t bytes, int rw) 698{ 699 blkg_stat_add(&cfqg->stats.sectors, bytes >> 9); 700 blkg_rwstat_add(&cfqg->stats.serviced, rw, 1); 701 blkg_rwstat_add(&cfqg->stats.service_bytes, rw, bytes); 702} 703 704static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, 705 uint64_t start_time, uint64_t io_start_time, int rw) 706{ 707 struct cfqg_stats *stats = &cfqg->stats; 708 unsigned long long now = sched_clock(); 709 710 if (time_after64(now, io_start_time)) 711 blkg_rwstat_add(&stats->service_time, rw, now - io_start_time); 712 if (time_after64(io_start_time, start_time)) 713 blkg_rwstat_add(&stats->wait_time, rw, 714 io_start_time - start_time); 715} 716 717/* @stats = 0 */ 718static void cfqg_stats_reset(struct cfqg_stats *stats) 719{ 720 /* queued stats shouldn't be cleared */ 721 blkg_rwstat_reset(&stats->service_bytes); 722 blkg_rwstat_reset(&stats->serviced); 723 blkg_rwstat_reset(&stats->merged); 724 blkg_rwstat_reset(&stats->service_time); 725 blkg_rwstat_reset(&stats->wait_time); 726 blkg_stat_reset(&stats->time); 727#ifdef CONFIG_DEBUG_BLK_CGROUP 728 blkg_stat_reset(&stats->unaccounted_time); 729 blkg_stat_reset(&stats->avg_queue_size_sum); 730 blkg_stat_reset(&stats->avg_queue_size_samples); 731 blkg_stat_reset(&stats->dequeue); 732 blkg_stat_reset(&stats->group_wait_time); 733 blkg_stat_reset(&stats->idle_time); 734 blkg_stat_reset(&stats->empty_time); 735#endif 736} 737 738/* @to += @from */ 739static void cfqg_stats_merge(struct cfqg_stats *to, struct cfqg_stats *from) 740{ 741 /* queued stats shouldn't be cleared */ 742 blkg_rwstat_merge(&to->service_bytes, &from->service_bytes); 743 blkg_rwstat_merge(&to->serviced, &from->serviced); 744 blkg_rwstat_merge(&to->merged, &from->merged); 745 blkg_rwstat_merge(&to->service_time, &from->service_time); 746 blkg_rwstat_merge(&to->wait_time, &from->wait_time); 747 blkg_stat_merge(&from->time, &from->time); 748#ifdef CONFIG_DEBUG_BLK_CGROUP 749 blkg_stat_merge(&to->unaccounted_time, &from->unaccounted_time); 750 blkg_stat_merge(&to->avg_queue_size_sum, &from->avg_queue_size_sum); 751 blkg_stat_merge(&to->avg_queue_size_samples, &from->avg_queue_size_samples); 752 blkg_stat_merge(&to->dequeue, &from->dequeue); 753 blkg_stat_merge(&to->group_wait_time, &from->group_wait_time); 754 blkg_stat_merge(&to->idle_time, &from->idle_time); 755 blkg_stat_merge(&to->empty_time, &from->empty_time); 756#endif 757} 758 759/* 760 * Transfer @cfqg's stats to its parent's dead_stats so that the ancestors' 761 * recursive stats can still account for the amount used by this cfqg after 762 * it's gone. 763 */ 764static void cfqg_stats_xfer_dead(struct cfq_group *cfqg) 765{ 766 struct cfq_group *parent = cfqg_parent(cfqg); 767 768 lockdep_assert_held(cfqg_to_blkg(cfqg)->q->queue_lock); 769 770 if (unlikely(!parent)) 771 return; 772 773 cfqg_stats_merge(&parent->dead_stats, &cfqg->stats); 774 cfqg_stats_merge(&parent->dead_stats, &cfqg->dead_stats); 775 cfqg_stats_reset(&cfqg->stats); 776 cfqg_stats_reset(&cfqg->dead_stats); 777} 778 779#else /* CONFIG_CFQ_GROUP_IOSCHED */ 780 781static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) { return NULL; } 782static inline void cfqg_get(struct cfq_group *cfqg) { } 783static inline void cfqg_put(struct cfq_group *cfqg) { } 784 785#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ 786 blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c " fmt, (cfqq)->pid, \ 787 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ 788 cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\ 789 ##args) 790#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0) 791 792static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg, 793 struct cfq_group *curr_cfqg, int rw) { } 794static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg, 795 unsigned long time, unsigned long unaccounted_time) { } 796static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw) { } 797static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) { } 798static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg, 799 uint64_t bytes, int rw) { } 800static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, 801 uint64_t start_time, uint64_t io_start_time, int rw) { } 802 803#endif /* CONFIG_CFQ_GROUP_IOSCHED */ 804 805#define cfq_log(cfqd, fmt, args...) \ 806 blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) 807 808/* Traverses through cfq group service trees */ 809#define for_each_cfqg_st(cfqg, i, j, st) \ 810 for (i = 0; i <= IDLE_WORKLOAD; i++) \ 811 for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\ 812 : &cfqg->service_tree_idle; \ 813 (i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \ 814 (i == IDLE_WORKLOAD && j == 0); \ 815 j++, st = i < IDLE_WORKLOAD ? \ 816 &cfqg->service_trees[i][j]: NULL) \ 817 818static inline bool cfq_io_thinktime_big(struct cfq_data *cfqd, 819 struct cfq_ttime *ttime, bool group_idle) 820{ 821 unsigned long slice; 822 if (!sample_valid(ttime->ttime_samples)) 823 return false; 824 if (group_idle) 825 slice = cfqd->cfq_group_idle; 826 else 827 slice = cfqd->cfq_slice_idle; 828 return ttime->ttime_mean > slice; 829} 830 831static inline bool iops_mode(struct cfq_data *cfqd) 832{ 833 /* 834 * If we are not idling on queues and it is a NCQ drive, parallel 835 * execution of requests is on and measuring time is not possible 836 * in most of the cases until and unless we drive shallower queue 837 * depths and that becomes a performance bottleneck. In such cases 838 * switch to start providing fairness in terms of number of IOs. 839 */ 840 if (!cfqd->cfq_slice_idle && cfqd->hw_tag) 841 return true; 842 else 843 return false; 844} 845 846static inline enum wl_class_t cfqq_class(struct cfq_queue *cfqq) 847{ 848 if (cfq_class_idle(cfqq)) 849 return IDLE_WORKLOAD; 850 if (cfq_class_rt(cfqq)) 851 return RT_WORKLOAD; 852 return BE_WORKLOAD; 853} 854 855 856static enum wl_type_t cfqq_type(struct cfq_queue *cfqq) 857{ 858 if (!cfq_cfqq_sync(cfqq)) 859 return ASYNC_WORKLOAD; 860 if (!cfq_cfqq_idle_window(cfqq)) 861 return SYNC_NOIDLE_WORKLOAD; 862 return SYNC_WORKLOAD; 863} 864 865static inline int cfq_group_busy_queues_wl(enum wl_class_t wl_class, 866 struct cfq_data *cfqd, 867 struct cfq_group *cfqg) 868{ 869 if (wl_class == IDLE_WORKLOAD) 870 return cfqg->service_tree_idle.count; 871 872 return cfqg->service_trees[wl_class][ASYNC_WORKLOAD].count + 873 cfqg->service_trees[wl_class][SYNC_NOIDLE_WORKLOAD].count + 874 cfqg->service_trees[wl_class][SYNC_WORKLOAD].count; 875} 876 877static inline int cfqg_busy_async_queues(struct cfq_data *cfqd, 878 struct cfq_group *cfqg) 879{ 880 return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count + 881 cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count; 882} 883 884static void cfq_dispatch_insert(struct request_queue *, struct request *); 885static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync, 886 struct cfq_io_cq *cic, struct bio *bio, 887 gfp_t gfp_mask); 888 889static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq) 890{ 891 /* cic->icq is the first member, %NULL will convert to %NULL */ 892 return container_of(icq, struct cfq_io_cq, icq); 893} 894 895static inline struct cfq_io_cq *cfq_cic_lookup(struct cfq_data *cfqd, 896 struct io_context *ioc) 897{ 898 if (ioc) 899 return icq_to_cic(ioc_lookup_icq(ioc, cfqd->queue)); 900 return NULL; 901} 902 903static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_cq *cic, bool is_sync) 904{ 905 return cic->cfqq[is_sync]; 906} 907 908static inline void cic_set_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq, 909 bool is_sync) 910{ 911 cic->cfqq[is_sync] = cfqq; 912} 913 914static inline struct cfq_data *cic_to_cfqd(struct cfq_io_cq *cic) 915{ 916 return cic->icq.q->elevator->elevator_data; 917} 918 919/* 920 * We regard a request as SYNC, if it's either a read or has the SYNC bit 921 * set (in which case it could also be direct WRITE). 922 */ 923static inline bool cfq_bio_sync(struct bio *bio) 924{ 925 return bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC); 926} 927 928/* 929 * scheduler run of queue, if there are requests pending and no one in the 930 * driver that will restart queueing 931 */ 932static inline void cfq_schedule_dispatch(struct cfq_data *cfqd) 933{ 934 if (cfqd->busy_queues) { 935 cfq_log(cfqd, "schedule dispatch"); 936 kblockd_schedule_work(&cfqd->unplug_work); 937 } 938} 939 940/* 941 * Scale schedule slice based on io priority. Use the sync time slice only 942 * if a queue is marked sync and has sync io queued. A sync queue with async 943 * io only, should not get full sync slice length. 944 */ 945static inline int cfq_prio_slice(struct cfq_data *cfqd, bool sync, 946 unsigned short prio) 947{ 948 const int base_slice = cfqd->cfq_slice[sync]; 949 950 WARN_ON(prio >= IOPRIO_BE_NR); 951 952 return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio)); 953} 954 955static inline int 956cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) 957{ 958 return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio); 959} 960 961/** 962 * cfqg_scale_charge - scale disk time charge according to cfqg weight 963 * @charge: disk time being charged 964 * @vfraction: vfraction of the cfqg, fixed point w/ CFQ_SERVICE_SHIFT 965 * 966 * Scale @charge according to @vfraction, which is in range (0, 1]. The 967 * scaling is inversely proportional. 968 * 969 * scaled = charge / vfraction 970 * 971 * The result is also in fixed point w/ CFQ_SERVICE_SHIFT. 972 */ 973static inline u64 cfqg_scale_charge(unsigned long charge, 974 unsigned int vfraction) 975{ 976 u64 c = charge << CFQ_SERVICE_SHIFT; /* make it fixed point */ 977 978 /* charge / vfraction */ 979 c <<= CFQ_SERVICE_SHIFT; 980 do_div(c, vfraction); 981 return c; 982} 983 984static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime) 985{ 986 s64 delta = (s64)(vdisktime - min_vdisktime); 987 if (delta > 0) 988 min_vdisktime = vdisktime; 989 990 return min_vdisktime; 991} 992 993static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime) 994{ 995 s64 delta = (s64)(vdisktime - min_vdisktime); 996 if (delta < 0) 997 min_vdisktime = vdisktime; 998 999 return min_vdisktime; 1000} 1001 1002static void update_min_vdisktime(struct cfq_rb_root *st) 1003{ 1004 struct cfq_group *cfqg; 1005 1006 if (st->left) { 1007 cfqg = rb_entry_cfqg(st->left); 1008 st->min_vdisktime = max_vdisktime(st->min_vdisktime, 1009 cfqg->vdisktime); 1010 } 1011} 1012 1013/* 1014 * get averaged number of queues of RT/BE priority. 1015 * average is updated, with a formula that gives more weight to higher numbers, 1016 * to quickly follows sudden increases and decrease slowly 1017 */ 1018 1019static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd, 1020 struct cfq_group *cfqg, bool rt) 1021{ 1022 unsigned min_q, max_q; 1023 unsigned mult = cfq_hist_divisor - 1; 1024 unsigned round = cfq_hist_divisor / 2; 1025 unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg); 1026 1027 min_q = min(cfqg->busy_queues_avg[rt], busy); 1028 max_q = max(cfqg->busy_queues_avg[rt], busy); 1029 cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) / 1030 cfq_hist_divisor; 1031 return cfqg->busy_queues_avg[rt]; 1032} 1033 1034static inline unsigned 1035cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg) 1036{ 1037 return cfqd->cfq_target_latency * cfqg->vfraction >> CFQ_SERVICE_SHIFT; 1038} 1039 1040static inline unsigned 1041cfq_scaled_cfqq_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1042{ 1043 unsigned slice = cfq_prio_to_slice(cfqd, cfqq); 1044 if (cfqd->cfq_latency) { 1045 /* 1046 * interested queues (we consider only the ones with the same 1047 * priority class in the cfq group) 1048 */ 1049 unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg, 1050 cfq_class_rt(cfqq)); 1051 unsigned sync_slice = cfqd->cfq_slice[1]; 1052 unsigned expect_latency = sync_slice * iq; 1053 unsigned group_slice = cfq_group_slice(cfqd, cfqq->cfqg); 1054 1055 if (expect_latency > group_slice) { 1056 unsigned base_low_slice = 2 * cfqd->cfq_slice_idle; 1057 /* scale low_slice according to IO priority 1058 * and sync vs async */ 1059 unsigned low_slice = 1060 min(slice, base_low_slice * slice / sync_slice); 1061 /* the adapted slice value is scaled to fit all iqs 1062 * into the target latency */ 1063 slice = max(slice * group_slice / expect_latency, 1064 low_slice); 1065 } 1066 } 1067 return slice; 1068} 1069 1070static inline void 1071cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1072{ 1073 unsigned slice = cfq_scaled_cfqq_slice(cfqd, cfqq); 1074 1075 cfqq->slice_start = jiffies; 1076 cfqq->slice_end = jiffies + slice; 1077 cfqq->allocated_slice = slice; 1078 cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies); 1079} 1080 1081/* 1082 * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end 1083 * isn't valid until the first request from the dispatch is activated 1084 * and the slice time set. 1085 */ 1086static inline bool cfq_slice_used(struct cfq_queue *cfqq) 1087{ 1088 if (cfq_cfqq_slice_new(cfqq)) 1089 return false; 1090 if (time_before(jiffies, cfqq->slice_end)) 1091 return false; 1092 1093 return true; 1094} 1095 1096/* 1097 * Lifted from AS - choose which of rq1 and rq2 that is best served now. 1098 * We choose the request that is closest to the head right now. Distance 1099 * behind the head is penalized and only allowed to a certain extent. 1100 */ 1101static struct request * 1102cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last) 1103{ 1104 sector_t s1, s2, d1 = 0, d2 = 0; 1105 unsigned long back_max; 1106#define CFQ_RQ1_WRAP 0x01 /* request 1 wraps */ 1107#define CFQ_RQ2_WRAP 0x02 /* request 2 wraps */ 1108 unsigned wrap = 0; /* bit mask: requests behind the disk head? */ 1109 1110 if (rq1 == NULL || rq1 == rq2) 1111 return rq2; 1112 if (rq2 == NULL) 1113 return rq1; 1114 1115 if (rq_is_sync(rq1) != rq_is_sync(rq2)) 1116 return rq_is_sync(rq1) ? rq1 : rq2; 1117 1118 if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_PRIO) 1119 return rq1->cmd_flags & REQ_PRIO ? rq1 : rq2; 1120 1121 s1 = blk_rq_pos(rq1); 1122 s2 = blk_rq_pos(rq2); 1123 1124 /* 1125 * by definition, 1KiB is 2 sectors 1126 */ 1127 back_max = cfqd->cfq_back_max * 2; 1128 1129 /* 1130 * Strict one way elevator _except_ in the case where we allow 1131 * short backward seeks which are biased as twice the cost of a 1132 * similar forward seek. 1133 */ 1134 if (s1 >= last) 1135 d1 = s1 - last; 1136 else if (s1 + back_max >= last) 1137 d1 = (last - s1) * cfqd->cfq_back_penalty; 1138 else 1139 wrap |= CFQ_RQ1_WRAP; 1140 1141 if (s2 >= last) 1142 d2 = s2 - last; 1143 else if (s2 + back_max >= last) 1144 d2 = (last - s2) * cfqd->cfq_back_penalty; 1145 else 1146 wrap |= CFQ_RQ2_WRAP; 1147 1148 /* Found required data */ 1149 1150 /* 1151 * By doing switch() on the bit mask "wrap" we avoid having to 1152 * check two variables for all permutations: --> faster! 1153 */ 1154 switch (wrap) { 1155 case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ 1156 if (d1 < d2) 1157 return rq1; 1158 else if (d2 < d1) 1159 return rq2; 1160 else { 1161 if (s1 >= s2) 1162 return rq1; 1163 else 1164 return rq2; 1165 } 1166 1167 case CFQ_RQ2_WRAP: 1168 return rq1; 1169 case CFQ_RQ1_WRAP: 1170 return rq2; 1171 case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */ 1172 default: 1173 /* 1174 * Since both rqs are wrapped, 1175 * start with the one that's further behind head 1176 * (--> only *one* back seek required), 1177 * since back seek takes more time than forward. 1178 */ 1179 if (s1 <= s2) 1180 return rq1; 1181 else 1182 return rq2; 1183 } 1184} 1185 1186/* 1187 * The below is leftmost cache rbtree addon 1188 */ 1189static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root) 1190{ 1191 /* Service tree is empty */ 1192 if (!root->count) 1193 return NULL; 1194 1195 if (!root->left) 1196 root->left = rb_first(&root->rb); 1197 1198 if (root->left) 1199 return rb_entry(root->left, struct cfq_queue, rb_node); 1200 1201 return NULL; 1202} 1203 1204static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root) 1205{ 1206 if (!root->left) 1207 root->left = rb_first(&root->rb); 1208 1209 if (root->left) 1210 return rb_entry_cfqg(root->left); 1211 1212 return NULL; 1213} 1214 1215static void rb_erase_init(struct rb_node *n, struct rb_root *root) 1216{ 1217 rb_erase(n, root); 1218 RB_CLEAR_NODE(n); 1219} 1220 1221static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root) 1222{ 1223 if (root->left == n) 1224 root->left = NULL; 1225 rb_erase_init(n, &root->rb); 1226 --root->count; 1227} 1228 1229/* 1230 * would be nice to take fifo expire time into account as well 1231 */ 1232static struct request * 1233cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq, 1234 struct request *last) 1235{ 1236 struct rb_node *rbnext = rb_next(&last->rb_node); 1237 struct rb_node *rbprev = rb_prev(&last->rb_node); 1238 struct request *next = NULL, *prev = NULL; 1239 1240 BUG_ON(RB_EMPTY_NODE(&last->rb_node)); 1241 1242 if (rbprev) 1243 prev = rb_entry_rq(rbprev); 1244 1245 if (rbnext) 1246 next = rb_entry_rq(rbnext); 1247 else { 1248 rbnext = rb_first(&cfqq->sort_list); 1249 if (rbnext && rbnext != &last->rb_node) 1250 next = rb_entry_rq(rbnext); 1251 } 1252 1253 return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last)); 1254} 1255 1256static unsigned long cfq_slice_offset(struct cfq_data *cfqd, 1257 struct cfq_queue *cfqq) 1258{ 1259 /* 1260 * just an approximation, should be ok. 1261 */ 1262 return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) - 1263 cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio)); 1264} 1265 1266static inline s64 1267cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg) 1268{ 1269 return cfqg->vdisktime - st->min_vdisktime; 1270} 1271 1272static void 1273__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) 1274{ 1275 struct rb_node **node = &st->rb.rb_node; 1276 struct rb_node *parent = NULL; 1277 struct cfq_group *__cfqg; 1278 s64 key = cfqg_key(st, cfqg); 1279 int left = 1; 1280 1281 while (*node != NULL) { 1282 parent = *node; 1283 __cfqg = rb_entry_cfqg(parent); 1284 1285 if (key < cfqg_key(st, __cfqg)) 1286 node = &parent->rb_left; 1287 else { 1288 node = &parent->rb_right; 1289 left = 0; 1290 } 1291 } 1292 1293 if (left) 1294 st->left = &cfqg->rb_node; 1295 1296 rb_link_node(&cfqg->rb_node, parent, node); 1297 rb_insert_color(&cfqg->rb_node, &st->rb); 1298} 1299 1300/* 1301 * This has to be called only on activation of cfqg 1302 */ 1303static void 1304cfq_update_group_weight(struct cfq_group *cfqg) 1305{ 1306 if (cfqg->new_weight) { 1307 cfqg->weight = cfqg->new_weight; 1308 cfqg->new_weight = 0; 1309 } 1310} 1311 1312static void 1313cfq_update_group_leaf_weight(struct cfq_group *cfqg) 1314{ 1315 BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); 1316 1317 if (cfqg->new_leaf_weight) { 1318 cfqg->leaf_weight = cfqg->new_leaf_weight; 1319 cfqg->new_leaf_weight = 0; 1320 } 1321} 1322 1323static void 1324cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) 1325{ 1326 unsigned int vfr = 1 << CFQ_SERVICE_SHIFT; /* start with 1 */ 1327 struct cfq_group *pos = cfqg; 1328 struct cfq_group *parent; 1329 bool propagate; 1330 1331 /* add to the service tree */ 1332 BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); 1333 1334 /* 1335 * Update leaf_weight. We cannot update weight at this point 1336 * because cfqg might already have been activated and is 1337 * contributing its current weight to the parent's child_weight. 1338 */ 1339 cfq_update_group_leaf_weight(cfqg); 1340 __cfq_group_service_tree_add(st, cfqg); 1341 1342 /* 1343 * Activate @cfqg and calculate the portion of vfraction @cfqg is 1344 * entitled to. vfraction is calculated by walking the tree 1345 * towards the root calculating the fraction it has at each level. 1346 * The compounded ratio is how much vfraction @cfqg owns. 1347 * 1348 * Start with the proportion tasks in this cfqg has against active 1349 * children cfqgs - its leaf_weight against children_weight. 1350 */ 1351 propagate = !pos->nr_active++; 1352 pos->children_weight += pos->leaf_weight; 1353 vfr = vfr * pos->leaf_weight / pos->children_weight; 1354 1355 /* 1356 * Compound ->weight walking up the tree. Both activation and 1357 * vfraction calculation are done in the same loop. Propagation 1358 * stops once an already activated node is met. vfraction 1359 * calculation should always continue to the root. 1360 */ 1361 while ((parent = cfqg_parent(pos))) { 1362 if (propagate) { 1363 cfq_update_group_weight(pos); 1364 propagate = !parent->nr_active++; 1365 parent->children_weight += pos->weight; 1366 } 1367 vfr = vfr * pos->weight / parent->children_weight; 1368 pos = parent; 1369 } 1370 1371 cfqg->vfraction = max_t(unsigned, vfr, 1); 1372} 1373 1374static void 1375cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg) 1376{ 1377 struct cfq_rb_root *st = &cfqd->grp_service_tree; 1378 struct cfq_group *__cfqg; 1379 struct rb_node *n; 1380 1381 cfqg->nr_cfqq++; 1382 if (!RB_EMPTY_NODE(&cfqg->rb_node)) 1383 return; 1384 1385 /* 1386 * Currently put the group at the end. Later implement something 1387 * so that groups get lesser vtime based on their weights, so that 1388 * if group does not loose all if it was not continuously backlogged. 1389 */ 1390 n = rb_last(&st->rb); 1391 if (n) { 1392 __cfqg = rb_entry_cfqg(n); 1393 cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY; 1394 } else 1395 cfqg->vdisktime = st->min_vdisktime; 1396 cfq_group_service_tree_add(st, cfqg); 1397} 1398 1399static void 1400cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg) 1401{ 1402 struct cfq_group *pos = cfqg; 1403 bool propagate; 1404 1405 /* 1406 * Undo activation from cfq_group_service_tree_add(). Deactivate 1407 * @cfqg and propagate deactivation upwards. 1408 */ 1409 propagate = !--pos->nr_active; 1410 pos->children_weight -= pos->leaf_weight; 1411 1412 while (propagate) { 1413 struct cfq_group *parent = cfqg_parent(pos); 1414 1415 /* @pos has 0 nr_active at this point */ 1416 WARN_ON_ONCE(pos->children_weight); 1417 pos->vfraction = 0; 1418 1419 if (!parent) 1420 break; 1421 1422 propagate = !--parent->nr_active; 1423 parent->children_weight -= pos->weight; 1424 pos = parent; 1425 } 1426 1427 /* remove from the service tree */ 1428 if (!RB_EMPTY_NODE(&cfqg->rb_node)) 1429 cfq_rb_erase(&cfqg->rb_node, st); 1430} 1431 1432static void 1433cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg) 1434{ 1435 struct cfq_rb_root *st = &cfqd->grp_service_tree; 1436 1437 BUG_ON(cfqg->nr_cfqq < 1); 1438 cfqg->nr_cfqq--; 1439 1440 /* If there are other cfq queues under this group, don't delete it */ 1441 if (cfqg->nr_cfqq) 1442 return; 1443 1444 cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); 1445 cfq_group_service_tree_del(st, cfqg); 1446 cfqg->saved_wl_slice = 0; 1447 cfqg_stats_update_dequeue(cfqg); 1448} 1449 1450static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq, 1451 unsigned int *unaccounted_time) 1452{ 1453 unsigned int slice_used; 1454 1455 /* 1456 * Queue got expired before even a single request completed or 1457 * got expired immediately after first request completion. 1458 */ 1459 if (!cfqq->slice_start || cfqq->slice_start == jiffies) { 1460 /* 1461 * Also charge the seek time incurred to the group, otherwise 1462 * if there are mutiple queues in the group, each can dispatch 1463 * a single request on seeky media and cause lots of seek time 1464 * and group will never know it. 1465 */ 1466 slice_used = max_t(unsigned, (jiffies - cfqq->dispatch_start), 1467 1); 1468 } else { 1469 slice_used = jiffies - cfqq->slice_start; 1470 if (slice_used > cfqq->allocated_slice) { 1471 *unaccounted_time = slice_used - cfqq->allocated_slice; 1472 slice_used = cfqq->allocated_slice; 1473 } 1474 if (time_after(cfqq->slice_start, cfqq->dispatch_start)) 1475 *unaccounted_time += cfqq->slice_start - 1476 cfqq->dispatch_start; 1477 } 1478 1479 return slice_used; 1480} 1481 1482static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, 1483 struct cfq_queue *cfqq) 1484{ 1485 struct cfq_rb_root *st = &cfqd->grp_service_tree; 1486 unsigned int used_sl, charge, unaccounted_sl = 0; 1487 int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) 1488 - cfqg->service_tree_idle.count; 1489 unsigned int vfr; 1490 1491 BUG_ON(nr_sync < 0); 1492 used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl); 1493 1494 if (iops_mode(cfqd)) 1495 charge = cfqq->slice_dispatch; 1496 else if (!cfq_cfqq_sync(cfqq) && !nr_sync) 1497 charge = cfqq->allocated_slice; 1498 1499 /* 1500 * Can't update vdisktime while on service tree and cfqg->vfraction 1501 * is valid only while on it. Cache vfr, leave the service tree, 1502 * update vdisktime and go back on. The re-addition to the tree 1503 * will also update the weights as necessary. 1504 */ 1505 vfr = cfqg->vfraction; 1506 cfq_group_service_tree_del(st, cfqg); 1507 cfqg->vdisktime += cfqg_scale_charge(charge, vfr); 1508 cfq_group_service_tree_add(st, cfqg); 1509 1510 /* This group is being expired. Save the context */ 1511 if (time_after(cfqd->workload_expires, jiffies)) { 1512 cfqg->saved_wl_slice = cfqd->workload_expires 1513 - jiffies; 1514 cfqg->saved_wl_type = cfqd->serving_wl_type; 1515 cfqg->saved_wl_class = cfqd->serving_wl_class; 1516 } else 1517 cfqg->saved_wl_slice = 0; 1518 1519 cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, 1520 st->min_vdisktime); 1521 cfq_log_cfqq(cfqq->cfqd, cfqq, 1522 "sl_used=%u disp=%u charge=%u iops=%u sect=%lu", 1523 used_sl, cfqq->slice_dispatch, charge, 1524 iops_mode(cfqd), cfqq->nr_sectors); 1525 cfqg_stats_update_timeslice_used(cfqg, used_sl, unaccounted_sl); 1526 cfqg_stats_set_start_empty_time(cfqg); 1527} 1528 1529/** 1530 * cfq_init_cfqg_base - initialize base part of a cfq_group 1531 * @cfqg: cfq_group to initialize 1532 * 1533 * Initialize the base part which is used whether %CONFIG_CFQ_GROUP_IOSCHED 1534 * is enabled or not. 1535 */ 1536static void cfq_init_cfqg_base(struct cfq_group *cfqg) 1537{ 1538 struct cfq_rb_root *st; 1539 int i, j; 1540 1541 for_each_cfqg_st(cfqg, i, j, st) 1542 *st = CFQ_RB_ROOT; 1543 RB_CLEAR_NODE(&cfqg->rb_node); 1544 1545 cfqg->ttime.last_end_request = jiffies; 1546} 1547 1548#ifdef CONFIG_CFQ_GROUP_IOSCHED 1549static void cfqg_stats_init(struct cfqg_stats *stats) 1550{ 1551 blkg_rwstat_init(&stats->service_bytes); 1552 blkg_rwstat_init(&stats->serviced); 1553 blkg_rwstat_init(&stats->merged); 1554 blkg_rwstat_init(&stats->service_time); 1555 blkg_rwstat_init(&stats->wait_time); 1556 blkg_rwstat_init(&stats->queued); 1557 1558 blkg_stat_init(&stats->sectors); 1559 blkg_stat_init(&stats->time); 1560 1561#ifdef CONFIG_DEBUG_BLK_CGROUP 1562 blkg_stat_init(&stats->unaccounted_time); 1563 blkg_stat_init(&stats->avg_queue_size_sum); 1564 blkg_stat_init(&stats->avg_queue_size_samples); 1565 blkg_stat_init(&stats->dequeue); 1566 blkg_stat_init(&stats->group_wait_time); 1567 blkg_stat_init(&stats->idle_time); 1568 blkg_stat_init(&stats->empty_time); 1569#endif 1570} 1571 1572static void cfq_cpd_init(const struct blkcg *blkcg) 1573{ 1574 struct cfq_group_data *cgd = 1575 cpd_to_cfqgd(blkcg->pd[blkcg_policy_cfq.plid]); 1576 1577 if (blkcg == &blkcg_root) { 1578 cgd->weight = 2 * CFQ_WEIGHT_DEFAULT; 1579 cgd->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT; 1580 } else { 1581 cgd->weight = CFQ_WEIGHT_DEFAULT; 1582 cgd->leaf_weight = CFQ_WEIGHT_DEFAULT; 1583 } 1584} 1585 1586static void cfq_pd_init(struct blkcg_gq *blkg) 1587{ 1588 struct cfq_group *cfqg = blkg_to_cfqg(blkg); 1589 struct cfq_group_data *cgd = blkcg_to_cfqgd(blkg->blkcg); 1590 1591 cfq_init_cfqg_base(cfqg); 1592 cfqg->weight = cgd->weight; 1593 cfqg->leaf_weight = cgd->leaf_weight; 1594 cfqg_stats_init(&cfqg->stats); 1595 cfqg_stats_init(&cfqg->dead_stats); 1596} 1597 1598static void cfq_pd_offline(struct blkcg_gq *blkg) 1599{ 1600 /* 1601 * @blkg is going offline and will be ignored by 1602 * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so 1603 * that they don't get lost. If IOs complete after this point, the 1604 * stats for them will be lost. Oh well... 1605 */ 1606 cfqg_stats_xfer_dead(blkg_to_cfqg(blkg)); 1607} 1608 1609/* offset delta from cfqg->stats to cfqg->dead_stats */ 1610static const int dead_stats_off_delta = offsetof(struct cfq_group, dead_stats) - 1611 offsetof(struct cfq_group, stats); 1612 1613/* to be used by recursive prfill, sums live and dead stats recursively */ 1614static u64 cfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off) 1615{ 1616 u64 sum = 0; 1617 1618 sum += blkg_stat_recursive_sum(pd, off); 1619 sum += blkg_stat_recursive_sum(pd, off + dead_stats_off_delta); 1620 return sum; 1621} 1622 1623/* to be used by recursive prfill, sums live and dead rwstats recursively */ 1624static struct blkg_rwstat cfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd, 1625 int off) 1626{ 1627 struct blkg_rwstat a, b; 1628 1629 a = blkg_rwstat_recursive_sum(pd, off); 1630 b = blkg_rwstat_recursive_sum(pd, off + dead_stats_off_delta); 1631 blkg_rwstat_merge(&a, &b); 1632 return a; 1633} 1634 1635static void cfq_pd_reset_stats(struct blkcg_gq *blkg) 1636{ 1637 struct cfq_group *cfqg = blkg_to_cfqg(blkg); 1638 1639 cfqg_stats_reset(&cfqg->stats); 1640 cfqg_stats_reset(&cfqg->dead_stats); 1641} 1642 1643/* 1644 * Search for the cfq group current task belongs to. request_queue lock must 1645 * be held. 1646 */ 1647static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd, 1648 struct blkcg *blkcg) 1649{ 1650 struct request_queue *q = cfqd->queue; 1651 struct cfq_group *cfqg = NULL; 1652 1653 /* avoid lookup for the common case where there's no blkcg */ 1654 if (blkcg == &blkcg_root) { 1655 cfqg = cfqd->root_group; 1656 } else { 1657 struct blkcg_gq *blkg; 1658 1659 blkg = blkg_lookup_create(blkcg, q); 1660 if (!IS_ERR(blkg)) 1661 cfqg = blkg_to_cfqg(blkg); 1662 } 1663 1664 return cfqg; 1665} 1666 1667static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) 1668{ 1669 /* Currently, all async queues are mapped to root group */ 1670 if (!cfq_cfqq_sync(cfqq)) 1671 cfqg = cfqq->cfqd->root_group; 1672 1673 cfqq->cfqg = cfqg; 1674 /* cfqq reference on cfqg */ 1675 cfqg_get(cfqg); 1676} 1677 1678static u64 cfqg_prfill_weight_device(struct seq_file *sf, 1679 struct blkg_policy_data *pd, int off) 1680{ 1681 struct cfq_group *cfqg = pd_to_cfqg(pd); 1682 1683 if (!cfqg->dev_weight) 1684 return 0; 1685 return __blkg_prfill_u64(sf, pd, cfqg->dev_weight); 1686} 1687 1688static int cfqg_print_weight_device(struct seq_file *sf, void *v) 1689{ 1690 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), 1691 cfqg_prfill_weight_device, &blkcg_policy_cfq, 1692 0, false); 1693 return 0; 1694} 1695 1696static u64 cfqg_prfill_leaf_weight_device(struct seq_file *sf, 1697 struct blkg_policy_data *pd, int off) 1698{ 1699 struct cfq_group *cfqg = pd_to_cfqg(pd); 1700 1701 if (!cfqg->dev_leaf_weight) 1702 return 0; 1703 return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight); 1704} 1705 1706static int cfqg_print_leaf_weight_device(struct seq_file *sf, void *v) 1707{ 1708 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), 1709 cfqg_prfill_leaf_weight_device, &blkcg_policy_cfq, 1710 0, false); 1711 return 0; 1712} 1713 1714static int cfq_print_weight(struct seq_file *sf, void *v) 1715{ 1716 struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); 1717 struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg); 1718 unsigned int val = 0; 1719 1720 if (cgd) 1721 val = cgd->weight; 1722 1723 seq_printf(sf, "%u\n", val); 1724 return 0; 1725} 1726 1727static int cfq_print_leaf_weight(struct seq_file *sf, void *v) 1728{ 1729 struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); 1730 struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg); 1731 unsigned int val = 0; 1732 1733 if (cgd) 1734 val = cgd->leaf_weight; 1735 1736 seq_printf(sf, "%u\n", val); 1737 return 0; 1738} 1739 1740static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of, 1741 char *buf, size_t nbytes, loff_t off, 1742 bool is_leaf_weight) 1743{ 1744 struct blkcg *blkcg = css_to_blkcg(of_css(of)); 1745 struct blkg_conf_ctx ctx; 1746 struct cfq_group *cfqg; 1747 struct cfq_group_data *cfqgd; 1748 int ret; 1749 1750 ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx); 1751 if (ret) 1752 return ret; 1753 1754 ret = -EINVAL; 1755 cfqg = blkg_to_cfqg(ctx.blkg); 1756 cfqgd = blkcg_to_cfqgd(blkcg); 1757 if (!cfqg || !cfqgd) 1758 goto err; 1759 1760 if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) { 1761 if (!is_leaf_weight) { 1762 cfqg->dev_weight = ctx.v; 1763 cfqg->new_weight = ctx.v ?: cfqgd->weight; 1764 } else { 1765 cfqg->dev_leaf_weight = ctx.v; 1766 cfqg->new_leaf_weight = ctx.v ?: cfqgd->leaf_weight; 1767 } 1768 ret = 0; 1769 } 1770 1771err: 1772 blkg_conf_finish(&ctx); 1773 return ret ?: nbytes; 1774} 1775 1776static ssize_t cfqg_set_weight_device(struct kernfs_open_file *of, 1777 char *buf, size_t nbytes, loff_t off) 1778{ 1779 return __cfqg_set_weight_device(of, buf, nbytes, off, false); 1780} 1781 1782static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of, 1783 char *buf, size_t nbytes, loff_t off) 1784{ 1785 return __cfqg_set_weight_device(of, buf, nbytes, off, true); 1786} 1787 1788static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, 1789 u64 val, bool is_leaf_weight) 1790{ 1791 struct blkcg *blkcg = css_to_blkcg(css); 1792 struct blkcg_gq *blkg; 1793 struct cfq_group_data *cfqgd; 1794 int ret = 0; 1795 1796 if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX) 1797 return -EINVAL; 1798 1799 spin_lock_irq(&blkcg->lock); 1800 cfqgd = blkcg_to_cfqgd(blkcg); 1801 if (!cfqgd) { 1802 ret = -EINVAL; 1803 goto out; 1804 } 1805 1806 if (!is_leaf_weight) 1807 cfqgd->weight = val; 1808 else 1809 cfqgd->leaf_weight = val; 1810 1811 hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { 1812 struct cfq_group *cfqg = blkg_to_cfqg(blkg); 1813 1814 if (!cfqg) 1815 continue; 1816 1817 if (!is_leaf_weight) { 1818 if (!cfqg->dev_weight) 1819 cfqg->new_weight = cfqgd->weight; 1820 } else { 1821 if (!cfqg->dev_leaf_weight) 1822 cfqg->new_leaf_weight = cfqgd->leaf_weight; 1823 } 1824 } 1825 1826out: 1827 spin_unlock_irq(&blkcg->lock); 1828 return ret; 1829} 1830 1831static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, 1832 u64 val) 1833{ 1834 return __cfq_set_weight(css, cft, val, false); 1835} 1836 1837static int cfq_set_leaf_weight(struct cgroup_subsys_state *css, 1838 struct cftype *cft, u64 val) 1839{ 1840 return __cfq_set_weight(css, cft, val, true); 1841} 1842 1843static int cfqg_print_stat(struct seq_file *sf, void *v) 1844{ 1845 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, 1846 &blkcg_policy_cfq, seq_cft(sf)->private, false); 1847 return 0; 1848} 1849 1850static int cfqg_print_rwstat(struct seq_file *sf, void *v) 1851{ 1852 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, 1853 &blkcg_policy_cfq, seq_cft(sf)->private, true); 1854 return 0; 1855} 1856 1857static u64 cfqg_prfill_stat_recursive(struct seq_file *sf, 1858 struct blkg_policy_data *pd, int off) 1859{ 1860 u64 sum = cfqg_stat_pd_recursive_sum(pd, off); 1861 1862 return __blkg_prfill_u64(sf, pd, sum); 1863} 1864 1865static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf, 1866 struct blkg_policy_data *pd, int off) 1867{ 1868 struct blkg_rwstat sum = cfqg_rwstat_pd_recursive_sum(pd, off); 1869 1870 return __blkg_prfill_rwstat(sf, pd, &sum); 1871} 1872 1873static int cfqg_print_stat_recursive(struct seq_file *sf, void *v) 1874{ 1875 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), 1876 cfqg_prfill_stat_recursive, &blkcg_policy_cfq, 1877 seq_cft(sf)->private, false); 1878 return 0; 1879} 1880 1881static int cfqg_print_rwstat_recursive(struct seq_file *sf, void *v) 1882{ 1883 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), 1884 cfqg_prfill_rwstat_recursive, &blkcg_policy_cfq, 1885 seq_cft(sf)->private, true); 1886 return 0; 1887} 1888 1889#ifdef CONFIG_DEBUG_BLK_CGROUP 1890static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf, 1891 struct blkg_policy_data *pd, int off) 1892{ 1893 struct cfq_group *cfqg = pd_to_cfqg(pd); 1894 u64 samples = blkg_stat_read(&cfqg->stats.avg_queue_size_samples); 1895 u64 v = 0; 1896 1897 if (samples) { 1898 v = blkg_stat_read(&cfqg->stats.avg_queue_size_sum); 1899 v = div64_u64(v, samples); 1900 } 1901 __blkg_prfill_u64(sf, pd, v); 1902 return 0; 1903} 1904 1905/* print avg_queue_size */ 1906static int cfqg_print_avg_queue_size(struct seq_file *sf, void *v) 1907{ 1908 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), 1909 cfqg_prfill_avg_queue_size, &blkcg_policy_cfq, 1910 0, false); 1911 return 0; 1912} 1913#endif /* CONFIG_DEBUG_BLK_CGROUP */ 1914 1915static struct cftype cfq_blkcg_files[] = { 1916 /* on root, weight is mapped to leaf_weight */ 1917 { 1918 .name = "weight_device", 1919 .flags = CFTYPE_ONLY_ON_ROOT, 1920 .seq_show = cfqg_print_leaf_weight_device, 1921 .write = cfqg_set_leaf_weight_device, 1922 }, 1923 { 1924 .name = "weight", 1925 .flags = CFTYPE_ONLY_ON_ROOT, 1926 .seq_show = cfq_print_leaf_weight, 1927 .write_u64 = cfq_set_leaf_weight, 1928 }, 1929 1930 /* no such mapping necessary for !roots */ 1931 { 1932 .name = "weight_device", 1933 .flags = CFTYPE_NOT_ON_ROOT, 1934 .seq_show = cfqg_print_weight_device, 1935 .write = cfqg_set_weight_device, 1936 }, 1937 { 1938 .name = "weight", 1939 .flags = CFTYPE_NOT_ON_ROOT, 1940 .seq_show = cfq_print_weight, 1941 .write_u64 = cfq_set_weight, 1942 }, 1943 1944 { 1945 .name = "leaf_weight_device", 1946 .seq_show = cfqg_print_leaf_weight_device, 1947 .write = cfqg_set_leaf_weight_device, 1948 }, 1949 { 1950 .name = "leaf_weight", 1951 .seq_show = cfq_print_leaf_weight, 1952 .write_u64 = cfq_set_leaf_weight, 1953 }, 1954 1955 /* statistics, covers only the tasks in the cfqg */ 1956 { 1957 .name = "time", 1958 .private = offsetof(struct cfq_group, stats.time), 1959 .seq_show = cfqg_print_stat, 1960 }, 1961 { 1962 .name = "sectors", 1963 .private = offsetof(struct cfq_group, stats.sectors), 1964 .seq_show = cfqg_print_stat, 1965 }, 1966 { 1967 .name = "io_service_bytes", 1968 .private = offsetof(struct cfq_group, stats.service_bytes), 1969 .seq_show = cfqg_print_rwstat, 1970 }, 1971 { 1972 .name = "io_serviced", 1973 .private = offsetof(struct cfq_group, stats.serviced), 1974 .seq_show = cfqg_print_rwstat, 1975 }, 1976 { 1977 .name = "io_service_time", 1978 .private = offsetof(struct cfq_group, stats.service_time), 1979 .seq_show = cfqg_print_rwstat, 1980 }, 1981 { 1982 .name = "io_wait_time", 1983 .private = offsetof(struct cfq_group, stats.wait_time), 1984 .seq_show = cfqg_print_rwstat, 1985 }, 1986 { 1987 .name = "io_merged", 1988 .private = offsetof(struct cfq_group, stats.merged), 1989 .seq_show = cfqg_print_rwstat, 1990 }, 1991 { 1992 .name = "io_queued", 1993 .private = offsetof(struct cfq_group, stats.queued), 1994 .seq_show = cfqg_print_rwstat, 1995 }, 1996 1997 /* the same statictics which cover the cfqg and its descendants */ 1998 { 1999 .name = "time_recursive", 2000 .private = offsetof(struct cfq_group, stats.time), 2001 .seq_show = cfqg_print_stat_recursive, 2002 }, 2003 { 2004 .name = "sectors_recursive", 2005 .private = offsetof(struct cfq_group, stats.sectors), 2006 .seq_show = cfqg_print_stat_recursive, 2007 }, 2008 { 2009 .name = "io_service_bytes_recursive", 2010 .private = offsetof(struct cfq_group, stats.service_bytes), 2011 .seq_show = cfqg_print_rwstat_recursive, 2012 }, 2013 { 2014 .name = "io_serviced_recursive", 2015 .private = offsetof(struct cfq_group, stats.serviced), 2016 .seq_show = cfqg_print_rwstat_recursive, 2017 }, 2018 { 2019 .name = "io_service_time_recursive", 2020 .private = offsetof(struct cfq_group, stats.service_time), 2021 .seq_show = cfqg_print_rwstat_recursive, 2022 }, 2023 { 2024 .name = "io_wait_time_recursive", 2025 .private = offsetof(struct cfq_group, stats.wait_time), 2026 .seq_show = cfqg_print_rwstat_recursive, 2027 }, 2028 { 2029 .name = "io_merged_recursive", 2030 .private = offsetof(struct cfq_group, stats.merged), 2031 .seq_show = cfqg_print_rwstat_recursive, 2032 }, 2033 { 2034 .name = "io_queued_recursive", 2035 .private = offsetof(struct cfq_group, stats.queued), 2036 .seq_show = cfqg_print_rwstat_recursive, 2037 }, 2038#ifdef CONFIG_DEBUG_BLK_CGROUP 2039 { 2040 .name = "avg_queue_size", 2041 .seq_show = cfqg_print_avg_queue_size, 2042 }, 2043 { 2044 .name = "group_wait_time", 2045 .private = offsetof(struct cfq_group, stats.group_wait_time), 2046 .seq_show = cfqg_print_stat, 2047 }, 2048 { 2049 .name = "idle_time", 2050 .private = offsetof(struct cfq_group, stats.idle_time), 2051 .seq_show = cfqg_print_stat, 2052 }, 2053 { 2054 .name = "empty_time", 2055 .private = offsetof(struct cfq_group, stats.empty_time), 2056 .seq_show = cfqg_print_stat, 2057 }, 2058 { 2059 .name = "dequeue", 2060 .private = offsetof(struct cfq_group, stats.dequeue), 2061 .seq_show = cfqg_print_stat, 2062 }, 2063 { 2064 .name = "unaccounted_time", 2065 .private = offsetof(struct cfq_group, stats.unaccounted_time), 2066 .seq_show = cfqg_print_stat, 2067 }, 2068#endif /* CONFIG_DEBUG_BLK_CGROUP */ 2069 { } /* terminate */ 2070}; 2071#else /* GROUP_IOSCHED */ 2072static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd, 2073 struct blkcg *blkcg) 2074{ 2075 return cfqd->root_group; 2076} 2077 2078static inline void 2079cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { 2080 cfqq->cfqg = cfqg; 2081} 2082 2083#endif /* GROUP_IOSCHED */ 2084 2085/* 2086 * The cfqd->service_trees holds all pending cfq_queue's that have 2087 * requests waiting to be processed. It is sorted in the order that 2088 * we will service the queues. 2089 */ 2090static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, 2091 bool add_front) 2092{ 2093 struct rb_node **p, *parent; 2094 struct cfq_queue *__cfqq; 2095 unsigned long rb_key; 2096 struct cfq_rb_root *st; 2097 int left; 2098 int new_cfqq = 1; 2099 2100 st = st_for(cfqq->cfqg, cfqq_class(cfqq), cfqq_type(cfqq)); 2101 if (cfq_class_idle(cfqq)) { 2102 rb_key = CFQ_IDLE_DELAY; 2103 parent = rb_last(&st->rb); 2104 if (parent && parent != &cfqq->rb_node) { 2105 __cfqq = rb_entry(parent, struct cfq_queue, rb_node); 2106 rb_key += __cfqq->rb_key; 2107 } else 2108 rb_key += jiffies; 2109 } else if (!add_front) { 2110 /* 2111 * Get our rb key offset. Subtract any residual slice 2112 * value carried from last service. A negative resid 2113 * count indicates slice overrun, and this should position 2114 * the next service time further away in the tree. 2115 */ 2116 rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies; 2117 rb_key -= cfqq->slice_resid; 2118 cfqq->slice_resid = 0; 2119 } else { 2120 rb_key = -HZ; 2121 __cfqq = cfq_rb_first(st); 2122 rb_key += __cfqq ? __cfqq->rb_key : jiffies; 2123 } 2124 2125 if (!RB_EMPTY_NODE(&cfqq->rb_node)) { 2126 new_cfqq = 0; 2127 /* 2128 * same position, nothing more to do 2129 */ 2130 if (rb_key == cfqq->rb_key && cfqq->service_tree == st) 2131 return; 2132 2133 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree); 2134 cfqq->service_tree = NULL; 2135 } 2136 2137 left = 1; 2138 parent = NULL; 2139 cfqq->service_tree = st; 2140 p = &st->rb.rb_node; 2141 while (*p) { 2142 parent = *p; 2143 __cfqq = rb_entry(parent, struct cfq_queue, rb_node); 2144 2145 /* 2146 * sort by key, that represents service time. 2147 */ 2148 if (time_before(rb_key, __cfqq->rb_key)) 2149 p = &parent->rb_left; 2150 else { 2151 p = &parent->rb_right; 2152 left = 0; 2153 } 2154 } 2155 2156 if (left) 2157 st->left = &cfqq->rb_node; 2158 2159 cfqq->rb_key = rb_key; 2160 rb_link_node(&cfqq->rb_node, parent, p); 2161 rb_insert_color(&cfqq->rb_node, &st->rb); 2162 st->count++; 2163 if (add_front || !new_cfqq) 2164 return; 2165 cfq_group_notify_queue_add(cfqd, cfqq->cfqg); 2166} 2167 2168static struct cfq_queue * 2169cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root, 2170 sector_t sector, struct rb_node **ret_parent, 2171 struct rb_node ***rb_link) 2172{ 2173 struct rb_node **p, *parent; 2174 struct cfq_queue *cfqq = NULL; 2175 2176 parent = NULL; 2177 p = &root->rb_node; 2178 while (*p) { 2179 struct rb_node **n; 2180 2181 parent = *p; 2182 cfqq = rb_entry(parent, struct cfq_queue, p_node); 2183 2184 /* 2185 * Sort strictly based on sector. Smallest to the left, 2186 * largest to the right. 2187 */ 2188 if (sector > blk_rq_pos(cfqq->next_rq)) 2189 n = &(*p)->rb_right; 2190 else if (sector < blk_rq_pos(cfqq->next_rq)) 2191 n = &(*p)->rb_left; 2192 else 2193 break; 2194 p = n; 2195 cfqq = NULL; 2196 } 2197 2198 *ret_parent = parent; 2199 if (rb_link) 2200 *rb_link = p; 2201 return cfqq; 2202} 2203 2204static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq) 2205{ 2206 struct rb_node **p, *parent; 2207 struct cfq_queue *__cfqq; 2208 2209 if (cfqq->p_root) { 2210 rb_erase(&cfqq->p_node, cfqq->p_root); 2211 cfqq->p_root = NULL; 2212 } 2213 2214 if (cfq_class_idle(cfqq)) 2215 return; 2216 if (!cfqq->next_rq) 2217 return; 2218 2219 cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio]; 2220 __cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root, 2221 blk_rq_pos(cfqq->next_rq), &parent, &p); 2222 if (!__cfqq) { 2223 rb_link_node(&cfqq->p_node, parent, p); 2224 rb_insert_color(&cfqq->p_node, cfqq->p_root); 2225 } else 2226 cfqq->p_root = NULL; 2227} 2228 2229/* 2230 * Update cfqq's position in the service tree. 2231 */ 2232static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq) 2233{ 2234 /* 2235 * Resorting requires the cfqq to be on the RR list already. 2236 */ 2237 if (cfq_cfqq_on_rr(cfqq)) { 2238 cfq_service_tree_add(cfqd, cfqq, 0); 2239 cfq_prio_tree_add(cfqd, cfqq); 2240 } 2241} 2242 2243/* 2244 * add to busy list of queues for service, trying to be fair in ordering 2245 * the pending list according to last request service 2246 */ 2247static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) 2248{ 2249 cfq_log_cfqq(cfqd, cfqq, "add_to_rr"); 2250 BUG_ON(cfq_cfqq_on_rr(cfqq)); 2251 cfq_mark_cfqq_on_rr(cfqq); 2252 cfqd->busy_queues++; 2253 if (cfq_cfqq_sync(cfqq)) 2254 cfqd->busy_sync_queues++; 2255 2256 cfq_resort_rr_list(cfqd, cfqq); 2257} 2258 2259/* 2260 * Called when the cfqq no longer has requests pending, remove it from 2261 * the service tree. 2262 */ 2263static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) 2264{ 2265 cfq_log_cfqq(cfqd, cfqq, "del_from_rr"); 2266 BUG_ON(!cfq_cfqq_on_rr(cfqq)); 2267 cfq_clear_cfqq_on_rr(cfqq); 2268 2269 if (!RB_EMPTY_NODE(&cfqq->rb_node)) { 2270 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree); 2271 cfqq->service_tree = NULL; 2272 } 2273 if (cfqq->p_root) { 2274 rb_erase(&cfqq->p_node, cfqq->p_root); 2275 cfqq->p_root = NULL; 2276 } 2277 2278 cfq_group_notify_queue_del(cfqd, cfqq->cfqg); 2279 BUG_ON(!cfqd->busy_queues); 2280 cfqd->busy_queues--; 2281 if (cfq_cfqq_sync(cfqq)) 2282 cfqd->busy_sync_queues--; 2283} 2284 2285/* 2286 * rb tree support functions 2287 */ 2288static void cfq_del_rq_rb(struct request *rq) 2289{ 2290 struct cfq_queue *cfqq = RQ_CFQQ(rq); 2291 const int sync = rq_is_sync(rq); 2292 2293 BUG_ON(!cfqq->queued[sync]); 2294 cfqq->queued[sync]--; 2295 2296 elv_rb_del(&cfqq->sort_list, rq); 2297 2298 if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) { 2299 /* 2300 * Queue will be deleted from service tree when we actually 2301 * expire it later. Right now just remove it from prio tree 2302 * as it is empty. 2303 */ 2304 if (cfqq->p_root) { 2305 rb_erase(&cfqq->p_node, cfqq->p_root); 2306 cfqq->p_root = NULL; 2307 } 2308 } 2309} 2310 2311static void cfq_add_rq_rb(struct request *rq) 2312{ 2313 struct cfq_queue *cfqq = RQ_CFQQ(rq); 2314 struct cfq_data *cfqd = cfqq->cfqd; 2315 struct request *prev; 2316 2317 cfqq->queued[rq_is_sync(rq)]++; 2318 2319 elv_rb_add(&cfqq->sort_list, rq); 2320 2321 if (!cfq_cfqq_on_rr(cfqq)) 2322 cfq_add_cfqq_rr(cfqd, cfqq); 2323 2324 /* 2325 * check if this request is a better next-serve candidate 2326 */ 2327 prev = cfqq->next_rq; 2328 cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position); 2329 2330 /* 2331 * adjust priority tree position, if ->next_rq changes 2332 */ 2333 if (prev != cfqq->next_rq) 2334 cfq_prio_tree_add(cfqd, cfqq); 2335 2336 BUG_ON(!cfqq->next_rq); 2337} 2338 2339static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq) 2340{ 2341 elv_rb_del(&cfqq->sort_list, rq); 2342 cfqq->queued[rq_is_sync(rq)]--; 2343 cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags); 2344 cfq_add_rq_rb(rq); 2345 cfqg_stats_update_io_add(RQ_CFQG(rq), cfqq->cfqd->serving_group, 2346 rq->cmd_flags); 2347} 2348 2349static struct request * 2350cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio) 2351{ 2352 struct task_struct *tsk = current; 2353 struct cfq_io_cq *cic; 2354 struct cfq_queue *cfqq; 2355 2356 cic = cfq_cic_lookup(cfqd, tsk->io_context); 2357 if (!cic) 2358 return NULL; 2359 2360 cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio)); 2361 if (cfqq) 2362 return elv_rb_find(&cfqq->sort_list, bio_end_sector(bio)); 2363 2364 return NULL; 2365} 2366 2367static void cfq_activate_request(struct request_queue *q, struct request *rq) 2368{ 2369 struct cfq_data *cfqd = q->elevator->elevator_data; 2370 2371 cfqd->rq_in_driver++; 2372 cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d", 2373 cfqd->rq_in_driver); 2374 2375 cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); 2376} 2377 2378static void cfq_deactivate_request(struct request_queue *q, struct request *rq) 2379{ 2380 struct cfq_data *cfqd = q->elevator->elevator_data; 2381 2382 WARN_ON(!cfqd->rq_in_driver); 2383 cfqd->rq_in_driver--; 2384 cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d", 2385 cfqd->rq_in_driver); 2386} 2387 2388static void cfq_remove_request(struct request *rq) 2389{ 2390 struct cfq_queue *cfqq = RQ_CFQQ(rq); 2391 2392 if (cfqq->next_rq == rq) 2393 cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq); 2394 2395 list_del_init(&rq->queuelist); 2396 cfq_del_rq_rb(rq); 2397 2398 cfqq->cfqd->rq_queued--; 2399 cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags); 2400 if (rq->cmd_flags & REQ_PRIO) { 2401 WARN_ON(!cfqq->prio_pending); 2402 cfqq->prio_pending--; 2403 } 2404} 2405 2406static int cfq_merge(struct request_queue *q, struct request **req, 2407 struct bio *bio) 2408{ 2409 struct cfq_data *cfqd = q->elevator->elevator_data; 2410 struct request *__rq; 2411 2412 __rq = cfq_find_rq_fmerge(cfqd, bio); 2413 if (__rq && elv_rq_merge_ok(__rq, bio)) { 2414 *req = __rq; 2415 return ELEVATOR_FRONT_MERGE; 2416 } 2417 2418 return ELEVATOR_NO_MERGE; 2419} 2420 2421static void cfq_merged_request(struct request_queue *q, struct request *req, 2422 int type) 2423{ 2424 if (type == ELEVATOR_FRONT_MERGE) { 2425 struct cfq_queue *cfqq = RQ_CFQQ(req); 2426 2427 cfq_reposition_rq_rb(cfqq, req); 2428 } 2429} 2430 2431static void cfq_bio_merged(struct request_queue *q, struct request *req, 2432 struct bio *bio) 2433{ 2434 cfqg_stats_update_io_merged(RQ_CFQG(req), bio->bi_rw); 2435} 2436 2437static void 2438cfq_merged_requests(struct request_queue *q, struct request *rq, 2439 struct request *next) 2440{ 2441 struct cfq_queue *cfqq = RQ_CFQQ(rq); 2442 struct cfq_data *cfqd = q->elevator->elevator_data; 2443 2444 /* 2445 * reposition in fifo if next is older than rq 2446 */ 2447 if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && 2448 time_before(next->fifo_time, rq->fifo_time) && 2449 cfqq == RQ_CFQQ(next)) { 2450 list_move(&rq->queuelist, &next->queuelist); 2451 rq->fifo_time = next->fifo_time; 2452 } 2453 2454 if (cfqq->next_rq == next) 2455 cfqq->next_rq = rq; 2456 cfq_remove_request(next); 2457 cfqg_stats_update_io_merged(RQ_CFQG(rq), next->cmd_flags); 2458 2459 cfqq = RQ_CFQQ(next); 2460 /* 2461 * all requests of this queue are merged to other queues, delete it 2462 * from the service tree. If it's the active_queue, 2463 * cfq_dispatch_requests() will choose to expire it or do idle 2464 */ 2465 if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list) && 2466 cfqq != cfqd->active_queue) 2467 cfq_del_cfqq_rr(cfqd, cfqq); 2468} 2469 2470static int cfq_allow_merge(struct request_queue *q, struct request *rq, 2471 struct bio *bio) 2472{ 2473 struct cfq_data *cfqd = q->elevator->elevator_data; 2474 struct cfq_io_cq *cic; 2475 struct cfq_queue *cfqq; 2476 2477 /* 2478 * Disallow merge of a sync bio into an async request. 2479 */ 2480 if (cfq_bio_sync(bio) && !rq_is_sync(rq)) 2481 return false; 2482 2483 /* 2484 * Lookup the cfqq that this bio will be queued with and allow 2485 * merge only if rq is queued there. 2486 */ 2487 cic = cfq_cic_lookup(cfqd, current->io_context); 2488 if (!cic) 2489 return false; 2490 2491 cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio)); 2492 return cfqq == RQ_CFQQ(rq); 2493} 2494 2495static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq) 2496{ 2497 del_timer(&cfqd->idle_slice_timer); 2498 cfqg_stats_update_idle_time(cfqq->cfqg); 2499} 2500 2501static void __cfq_set_active_queue(struct cfq_data *cfqd, 2502 struct cfq_queue *cfqq) 2503{ 2504 if (cfqq) { 2505 cfq_log_cfqq(cfqd, cfqq, "set_active wl_class:%d wl_type:%d", 2506 cfqd->serving_wl_class, cfqd->serving_wl_type); 2507 cfqg_stats_update_avg_queue_size(cfqq->cfqg); 2508 cfqq->slice_start = 0; 2509 cfqq->dispatch_start = jiffies; 2510 cfqq->allocated_slice = 0; 2511 cfqq->slice_end = 0; 2512 cfqq->slice_dispatch = 0; 2513 cfqq->nr_sectors = 0; 2514 2515 cfq_clear_cfqq_wait_request(cfqq); 2516 cfq_clear_cfqq_must_dispatch(cfqq); 2517 cfq_clear_cfqq_must_alloc_slice(cfqq); 2518 cfq_clear_cfqq_fifo_expire(cfqq); 2519 cfq_mark_cfqq_slice_new(cfqq); 2520 2521 cfq_del_timer(cfqd, cfqq); 2522 } 2523 2524 cfqd->active_queue = cfqq; 2525} 2526 2527/* 2528 * current cfqq expired its slice (or was too idle), select new one 2529 */ 2530static void 2531__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, 2532 bool timed_out) 2533{ 2534 cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out); 2535 2536 if (cfq_cfqq_wait_request(cfqq)) 2537 cfq_del_timer(cfqd, cfqq); 2538 2539 cfq_clear_cfqq_wait_request(cfqq); 2540 cfq_clear_cfqq_wait_busy(cfqq); 2541 2542 /* 2543 * If this cfqq is shared between multiple processes, check to 2544 * make sure that those processes are still issuing I/Os within 2545 * the mean seek distance. If not, it may be time to break the 2546 * queues apart again. 2547 */ 2548 if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq)) 2549 cfq_mark_cfqq_split_coop(cfqq); 2550 2551 /* 2552 * store what was left of this slice, if the queue idled/timed out 2553 */ 2554 if (timed_out) { 2555 if (cfq_cfqq_slice_new(cfqq)) 2556 cfqq->slice_resid = cfq_scaled_cfqq_slice(cfqd, cfqq); 2557 else 2558 cfqq->slice_resid = cfqq->slice_end - jiffies; 2559 cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid); 2560 } 2561 2562 cfq_group_served(cfqd, cfqq->cfqg, cfqq); 2563 2564 if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) 2565 cfq_del_cfqq_rr(cfqd, cfqq); 2566 2567 cfq_resort_rr_list(cfqd, cfqq); 2568 2569 if (cfqq == cfqd->active_queue) 2570 cfqd->active_queue = NULL; 2571 2572 if (cfqd->active_cic) { 2573 put_io_context(cfqd->active_cic->icq.ioc); 2574 cfqd->active_cic = NULL; 2575 } 2576} 2577 2578static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out) 2579{ 2580 struct cfq_queue *cfqq = cfqd->active_queue; 2581 2582 if (cfqq) 2583 __cfq_slice_expired(cfqd, cfqq, timed_out); 2584} 2585 2586/* 2587 * Get next queue for service. Unless we have a queue preemption, 2588 * we'll simply select the first cfqq in the service tree. 2589 */ 2590static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) 2591{ 2592 struct cfq_rb_root *st = st_for(cfqd->serving_group, 2593 cfqd->serving_wl_class, cfqd->serving_wl_type); 2594 2595 if (!cfqd->rq_queued) 2596 return NULL; 2597 2598 /* There is nothing to dispatch */ 2599 if (!st) 2600 return NULL; 2601 if (RB_EMPTY_ROOT(&st->rb)) 2602 return NULL; 2603 return cfq_rb_first(st); 2604} 2605 2606static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd) 2607{ 2608 struct cfq_group *cfqg; 2609 struct cfq_queue *cfqq; 2610 int i, j; 2611 struct cfq_rb_root *st; 2612 2613 if (!cfqd->rq_queued) 2614 return NULL; 2615 2616 cfqg = cfq_get_next_cfqg(cfqd); 2617 if (!cfqg) 2618 return NULL; 2619 2620 for_each_cfqg_st(cfqg, i, j, st) 2621 if ((cfqq = cfq_rb_first(st)) != NULL) 2622 return cfqq; 2623 return NULL; 2624} 2625 2626/* 2627 * Get and set a new active queue for service. 2628 */ 2629static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd, 2630 struct cfq_queue *cfqq) 2631{ 2632 if (!cfqq) 2633 cfqq = cfq_get_next_queue(cfqd); 2634 2635 __cfq_set_active_queue(cfqd, cfqq); 2636 return cfqq; 2637} 2638 2639static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd, 2640 struct request *rq) 2641{ 2642 if (blk_rq_pos(rq) >= cfqd->last_position) 2643 return blk_rq_pos(rq) - cfqd->last_position; 2644 else 2645 return cfqd->last_position - blk_rq_pos(rq); 2646} 2647 2648static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq, 2649 struct request *rq) 2650{ 2651 return cfq_dist_from_last(cfqd, rq) <= CFQQ_CLOSE_THR; 2652} 2653 2654static struct cfq_queue *cfqq_close(struct cfq_data *cfqd, 2655 struct cfq_queue *cur_cfqq) 2656{ 2657 struct rb_root *root = &cfqd->prio_trees[cur_cfqq->org_ioprio]; 2658 struct rb_node *parent, *node; 2659 struct cfq_queue *__cfqq; 2660 sector_t sector = cfqd->last_position; 2661 2662 if (RB_EMPTY_ROOT(root)) 2663 return NULL; 2664 2665 /* 2666 * First, if we find a request starting at the end of the last 2667 * request, choose it. 2668 */ 2669 __cfqq = cfq_prio_tree_lookup(cfqd, root, sector, &parent, NULL); 2670 if (__cfqq) 2671 return __cfqq; 2672 2673 /* 2674 * If the exact sector wasn't found, the parent of the NULL leaf 2675 * will contain the closest sector. 2676 */ 2677 __cfqq = rb_entry(parent, struct cfq_queue, p_node); 2678 if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq)) 2679 return __cfqq; 2680 2681 if (blk_rq_pos(__cfqq->next_rq) < sector) 2682 node = rb_next(&__cfqq->p_node); 2683 else 2684 node = rb_prev(&__cfqq->p_node); 2685 if (!node) 2686 return NULL; 2687 2688 __cfqq = rb_entry(node, struct cfq_queue, p_node); 2689 if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq)) 2690 return __cfqq; 2691 2692 return NULL; 2693} 2694 2695/* 2696 * cfqd - obvious 2697 * cur_cfqq - passed in so that we don't decide that the current queue is 2698 * closely cooperating with itself. 2699 * 2700 * So, basically we're assuming that that cur_cfqq has dispatched at least 2701 * one request, and that cfqd->last_position reflects a position on the disk 2702 * associated with the I/O issued by cur_cfqq. I'm not sure this is a valid 2703 * assumption. 2704 */ 2705static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, 2706 struct cfq_queue *cur_cfqq) 2707{ 2708 struct cfq_queue *cfqq; 2709 2710 if (cfq_class_idle(cur_cfqq)) 2711 return NULL; 2712 if (!cfq_cfqq_sync(cur_cfqq)) 2713 return NULL; 2714 if (CFQQ_SEEKY(cur_cfqq)) 2715 return NULL; 2716 2717 /* 2718 * Don't search priority tree if it's the only queue in the group. 2719 */ 2720 if (cur_cfqq->cfqg->nr_cfqq == 1) 2721 return NULL; 2722 2723 /* 2724 * We should notice if some of the queues are cooperating, eg 2725 * working closely on the same area of the disk. In that case, 2726 * we can group them together and don't waste time idling. 2727 */ 2728 cfqq = cfqq_close(cfqd, cur_cfqq); 2729 if (!cfqq) 2730 return NULL; 2731 2732 /* If new queue belongs to different cfq_group, don't choose it */ 2733 if (cur_cfqq->cfqg != cfqq->cfqg) 2734 return NULL; 2735 2736 /* 2737 * It only makes sense to merge sync queues. 2738 */ 2739 if (!cfq_cfqq_sync(cfqq)) 2740 return NULL; 2741 if (CFQQ_SEEKY(cfqq)) 2742 return NULL; 2743 2744 /* 2745 * Do not merge queues of different priority classes 2746 */ 2747 if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq)) 2748 return NULL; 2749 2750 return cfqq; 2751} 2752 2753/* 2754 * Determine whether we should enforce idle window for this queue. 2755 */ 2756 2757static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) 2758{ 2759 enum wl_class_t wl_class = cfqq_class(cfqq); 2760 struct cfq_rb_root *st = cfqq->service_tree; 2761 2762 BUG_ON(!st); 2763 BUG_ON(!st->count); 2764 2765 if (!cfqd->cfq_slice_idle) 2766 return false; 2767 2768 /* We never do for idle class queues. */ 2769 if (wl_class == IDLE_WORKLOAD) 2770 return false; 2771 2772 /* We do for queues that were marked with idle window flag. */ 2773 if (cfq_cfqq_idle_window(cfqq) && 2774 !(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag)) 2775 return true; 2776 2777 /* 2778 * Otherwise, we do only if they are the last ones 2779 * in their service tree. 2780 */ 2781 if (st->count == 1 && cfq_cfqq_sync(cfqq) && 2782 !cfq_io_thinktime_big(cfqd, &st->ttime, false)) 2783 return true; 2784 cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", st->count); 2785 return false; 2786} 2787 2788static void cfq_arm_slice_timer(struct cfq_data *cfqd) 2789{ 2790 struct cfq_queue *cfqq = cfqd->active_queue; 2791 struct cfq_io_cq *cic; 2792 unsigned long sl, group_idle = 0; 2793 2794 /* 2795 * SSD device without seek penalty, disable idling. But only do so 2796 * for devices that support queuing, otherwise we still have a problem 2797 * with sync vs async workloads. 2798 */ 2799 if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag) 2800 return; 2801 2802 WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list)); 2803 WARN_ON(cfq_cfqq_slice_new(cfqq)); 2804 2805 /* 2806 * idle is disabled, either manually or by past process history 2807 */ 2808 if (!cfq_should_idle(cfqd, cfqq)) { 2809 /* no queue idling. Check for group idling */ 2810 if (cfqd->cfq_group_idle) 2811 group_idle = cfqd->cfq_group_idle; 2812 else 2813 return; 2814 } 2815 2816 /* 2817 * still active requests from this queue, don't idle 2818 */ 2819 if (cfqq->dispatched) 2820 return; 2821 2822 /* 2823 * task has exited, don't wait 2824 */ 2825 cic = cfqd->active_cic; 2826 if (!cic || !atomic_read(&cic->icq.ioc->active_ref)) 2827 return; 2828 2829 /* 2830 * If our average think time is larger than the remaining time 2831 * slice, then don't idle. This avoids overrunning the allotted 2832 * time slice. 2833 */ 2834 if (sample_valid(cic->ttime.ttime_samples) && 2835 (cfqq->slice_end - jiffies < cic->ttime.ttime_mean)) { 2836 cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%lu", 2837 cic->ttime.ttime_mean); 2838 return; 2839 } 2840 2841 /* There are other queues in the group, don't do group idle */ 2842 if (group_idle && cfqq->cfqg->nr_cfqq > 1) 2843 return; 2844 2845 cfq_mark_cfqq_wait_request(cfqq); 2846 2847 if (group_idle) 2848 sl = cfqd->cfq_group_idle; 2849 else 2850 sl = cfqd->cfq_slice_idle; 2851 2852 mod_timer(&cfqd->idle_slice_timer, jiffies + sl); 2853 cfqg_stats_set_start_idle_time(cfqq->cfqg); 2854 cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl, 2855 group_idle ? 1 : 0); 2856} 2857 2858/* 2859 * Move request from internal lists to the request queue dispatch list. 2860 */ 2861static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) 2862{ 2863 struct cfq_data *cfqd = q->elevator->elevator_data; 2864 struct cfq_queue *cfqq = RQ_CFQQ(rq); 2865 2866 cfq_log_cfqq(cfqd, cfqq, "dispatch_insert"); 2867 2868 cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq); 2869 cfq_remove_request(rq); 2870 cfqq->dispatched++; 2871 (RQ_CFQG(rq))->dispatched++; 2872 elv_dispatch_sort(q, rq); 2873 2874 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; 2875 cfqq->nr_sectors += blk_rq_sectors(rq); 2876 cfqg_stats_update_dispatch(cfqq->cfqg, blk_rq_bytes(rq), rq->cmd_flags); 2877} 2878 2879/* 2880 * return expired entry, or NULL to just start from scratch in rbtree 2881 */ 2882static struct request *cfq_check_fifo(struct cfq_queue *cfqq) 2883{ 2884 struct request *rq = NULL; 2885 2886 if (cfq_cfqq_fifo_expire(cfqq)) 2887 return NULL; 2888 2889 cfq_mark_cfqq_fifo_expire(cfqq); 2890 2891 if (list_empty(&cfqq->fifo)) 2892 return NULL; 2893 2894 rq = rq_entry_fifo(cfqq->fifo.next); 2895 if (time_before(jiffies, rq->fifo_time)) 2896 rq = NULL; 2897 2898 cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq); 2899 return rq; 2900} 2901 2902static inline int 2903cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq) 2904{ 2905 const int base_rq = cfqd->cfq_slice_async_rq; 2906 2907 WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR); 2908 2909 return 2 * base_rq * (IOPRIO_BE_NR - cfqq->ioprio); 2910} 2911 2912/* 2913 * Must be called with the queue_lock held. 2914 */ 2915static int cfqq_process_refs(struct cfq_queue *cfqq) 2916{ 2917 int process_refs, io_refs; 2918 2919 io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE]; 2920 process_refs = cfqq->ref - io_refs; 2921 BUG_ON(process_refs < 0); 2922 return process_refs; 2923} 2924 2925static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq) 2926{ 2927 int process_refs, new_process_refs; 2928 struct cfq_queue *__cfqq; 2929 2930 /* 2931 * If there are no process references on the new_cfqq, then it is 2932 * unsafe to follow the ->new_cfqq chain as other cfqq's in the 2933 * chain may have dropped their last reference (not just their 2934 * last process reference). 2935 */ 2936 if (!cfqq_process_refs(new_cfqq)) 2937 return; 2938 2939 /* Avoid a circular list and skip interim queue merges */ 2940 while ((__cfqq = new_cfqq->new_cfqq)) { 2941 if (__cfqq == cfqq) 2942 return; 2943 new_cfqq = __cfqq; 2944 } 2945 2946 process_refs = cfqq_process_refs(cfqq); 2947 new_process_refs = cfqq_process_refs(new_cfqq); 2948 /* 2949 * If the process for the cfqq has gone away, there is no 2950 * sense in merging the queues. 2951 */ 2952 if (process_refs == 0 || new_process_refs == 0) 2953 return; 2954 2955 /* 2956 * Merge in the direction of the lesser amount of work. 2957 */ 2958 if (new_process_refs >= process_refs) { 2959 cfqq->new_cfqq = new_cfqq; 2960 new_cfqq->ref += process_refs; 2961 } else { 2962 new_cfqq->new_cfqq = cfqq; 2963 cfqq->ref += new_process_refs; 2964 } 2965} 2966 2967static enum wl_type_t cfq_choose_wl_type(struct cfq_data *cfqd, 2968 struct cfq_group *cfqg, enum wl_class_t wl_class) 2969{ 2970 struct cfq_queue *queue; 2971 int i; 2972 bool key_valid = false; 2973 unsigned long lowest_key = 0; 2974 enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD; 2975 2976 for (i = 0; i <= SYNC_WORKLOAD; ++i) { 2977 /* select the one with lowest rb_key */ 2978 queue = cfq_rb_first(st_for(cfqg, wl_class, i)); 2979 if (queue && 2980 (!key_valid || time_before(queue->rb_key, lowest_key))) { 2981 lowest_key = queue->rb_key; 2982 cur_best = i; 2983 key_valid = true; 2984 } 2985 } 2986 2987 return cur_best; 2988} 2989 2990static void 2991choose_wl_class_and_type(struct cfq_data *cfqd, struct cfq_group *cfqg) 2992{ 2993 unsigned slice; 2994 unsigned count; 2995 struct cfq_rb_root *st; 2996 unsigned group_slice; 2997 enum wl_class_t original_class = cfqd->serving_wl_class; 2998 2999 /* Choose next priority. RT > BE > IDLE */ 3000 if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg)) 3001 cfqd->serving_wl_class = RT_WORKLOAD; 3002 else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg)) 3003 cfqd->serving_wl_class = BE_WORKLOAD; 3004 else { 3005 cfqd->serving_wl_class = IDLE_WORKLOAD; 3006 cfqd->workload_expires = jiffies + 1; 3007 return; 3008 } 3009 3010 if (original_class != cfqd->serving_wl_class) 3011 goto new_workload; 3012 3013 /* 3014 * For RT and BE, we have to choose also the type 3015 * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload 3016 * expiration time 3017 */ 3018 st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type); 3019 count = st->count; 3020 3021 /* 3022 * check workload expiration, and that we still have other queues ready 3023 */ 3024 if (count && !time_after(jiffies, cfqd->workload_expires)) 3025 return; 3026 3027new_workload: 3028 /* otherwise select new workload type */ 3029 cfqd->serving_wl_type = cfq_choose_wl_type(cfqd, cfqg, 3030 cfqd->serving_wl_class); 3031 st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type); 3032 count = st->count; 3033 3034 /* 3035 * the workload slice is computed as a fraction of target latency 3036 * proportional to the number of queues in that workload, over 3037 * all the queues in the same priority class 3038 */ 3039 group_slice = cfq_group_slice(cfqd, cfqg); 3040 3041 slice = group_slice * count / 3042 max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_wl_class], 3043 cfq_group_busy_queues_wl(cfqd->serving_wl_class, cfqd, 3044 cfqg)); 3045 3046 if (cfqd->serving_wl_type == ASYNC_WORKLOAD) { 3047 unsigned int tmp; 3048 3049 /* 3050 * Async queues are currently system wide. Just taking 3051 * proportion of queues with-in same group will lead to higher 3052 * async ratio system wide as generally root group is going 3053 * to have higher weight. A more accurate thing would be to 3054 * calculate system wide asnc/sync ratio. 3055 */ 3056 tmp = cfqd->cfq_target_latency * 3057 cfqg_busy_async_queues(cfqd, cfqg); 3058 tmp = tmp/cfqd->busy_queues; 3059 slice = min_t(unsigned, slice, tmp); 3060 3061 /* async workload slice is scaled down according to 3062 * the sync/async slice ratio. */ 3063 slice = slice * cfqd->cfq_slice[0] / cfqd->cfq_slice[1]; 3064 } else 3065 /* sync workload slice is at least 2 * cfq_slice_idle */ 3066 slice = max(slice, 2 * cfqd->cfq_slice_idle); 3067 3068 slice = max_t(unsigned, slice, CFQ_MIN_TT); 3069 cfq_log(cfqd, "workload slice:%d", slice); 3070 cfqd->workload_expires = jiffies + slice; 3071} 3072 3073static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd) 3074{ 3075 struct cfq_rb_root *st = &cfqd->grp_service_tree; 3076 struct cfq_group *cfqg; 3077 3078 if (RB_EMPTY_ROOT(&st->rb)) 3079 return NULL; 3080 cfqg = cfq_rb_first_group(st); 3081 update_min_vdisktime(st); 3082 return cfqg; 3083} 3084 3085static void cfq_choose_cfqg(struct cfq_data *cfqd) 3086{ 3087 struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd); 3088 3089 cfqd->serving_group = cfqg; 3090 3091 /* Restore the workload type data */ 3092 if (cfqg->saved_wl_slice) { 3093 cfqd->workload_expires = jiffies + cfqg->saved_wl_slice; 3094 cfqd->serving_wl_type = cfqg->saved_wl_type; 3095 cfqd->serving_wl_class = cfqg->saved_wl_class; 3096 } else 3097 cfqd->workload_expires = jiffies - 1; 3098 3099 choose_wl_class_and_type(cfqd, cfqg); 3100} 3101 3102/* 3103 * Select a queue for service. If we have a current active queue, 3104 * check whether to continue servicing it, or retrieve and set a new one. 3105 */ 3106static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) 3107{ 3108 struct cfq_queue *cfqq, *new_cfqq = NULL; 3109 3110 cfqq = cfqd->active_queue; 3111 if (!cfqq) 3112 goto new_queue; 3113 3114 if (!cfqd->rq_queued) 3115 return NULL; 3116 3117 /* 3118 * We were waiting for group to get backlogged. Expire the queue 3119 */ 3120 if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list)) 3121 goto expire; 3122 3123 /* 3124 * The active queue has run out of time, expire it and select new. 3125 */ 3126 if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) { 3127 /* 3128 * If slice had not expired at the completion of last request 3129 * we might not have turned on wait_busy flag. Don't expire 3130 * the queue yet. Allow the group to get backlogged. 3131 * 3132 * The very fact that we have used the slice, that means we 3133 * have been idling all along on this queue and it should be 3134 * ok to wait for this request to complete. 3135 */ 3136 if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list) 3137 && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) { 3138 cfqq = NULL; 3139 goto keep_queue; 3140 } else 3141 goto check_group_idle; 3142 } 3143 3144 /* 3145 * The active queue has requests and isn't expired, allow it to 3146 * dispatch. 3147 */ 3148 if (!RB_EMPTY_ROOT(&cfqq->sort_list)) 3149 goto keep_queue; 3150 3151 /* 3152 * If another queue has a request waiting within our mean seek 3153 * distance, let it run. The expire code will check for close 3154 * cooperators and put the close queue at the front of the service 3155 * tree. If possible, merge the expiring queue with the new cfqq. 3156 */ 3157 new_cfqq = cfq_close_cooperator(cfqd, cfqq); 3158 if (new_cfqq) { 3159 if (!cfqq->new_cfqq) 3160 cfq_setup_merge(cfqq, new_cfqq); 3161 goto expire; 3162 } 3163 3164 /* 3165 * No requests pending. If the active queue still has requests in 3166 * flight or is idling for a new request, allow either of these 3167 * conditions to happen (or time out) before selecting a new queue. 3168 */ 3169 if (timer_pending(&cfqd->idle_slice_timer)) { 3170 cfqq = NULL; 3171 goto keep_queue; 3172 } 3173 3174 /* 3175 * This is a deep seek queue, but the device is much faster than 3176 * the queue can deliver, don't idle 3177 **/ 3178 if (CFQQ_SEEKY(cfqq) && cfq_cfqq_idle_window(cfqq) && 3179 (cfq_cfqq_slice_new(cfqq) || 3180 (cfqq->slice_end - jiffies > jiffies - cfqq->slice_start))) { 3181 cfq_clear_cfqq_deep(cfqq); 3182 cfq_clear_cfqq_idle_window(cfqq); 3183 } 3184 3185 if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) { 3186 cfqq = NULL; 3187 goto keep_queue; 3188 } 3189 3190 /* 3191 * If group idle is enabled and there are requests dispatched from 3192 * this group, wait for requests to complete. 3193 */ 3194check_group_idle: 3195 if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1 && 3196 cfqq->cfqg->dispatched && 3197 !cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true)) { 3198 cfqq = NULL; 3199 goto keep_queue; 3200 } 3201 3202expire: 3203 cfq_slice_expired(cfqd, 0); 3204new_queue: 3205 /* 3206 * Current queue expired. Check if we have to switch to a new 3207 * service tree 3208 */ 3209 if (!new_cfqq) 3210 cfq_choose_cfqg(cfqd); 3211 3212 cfqq = cfq_set_active_queue(cfqd, new_cfqq); 3213keep_queue: 3214 return cfqq; 3215} 3216 3217static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq) 3218{ 3219 int dispatched = 0; 3220 3221 while (cfqq->next_rq) { 3222 cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq); 3223 dispatched++; 3224 } 3225 3226 BUG_ON(!list_empty(&cfqq->fifo)); 3227 3228 /* By default cfqq is not expired if it is empty. Do it explicitly */ 3229 __cfq_slice_expired(cfqq->cfqd, cfqq, 0); 3230 return dispatched; 3231} 3232 3233/* 3234 * Drain our current requests. Used for barriers and when switching 3235 * io schedulers on-the-fly. 3236 */ 3237static int cfq_forced_dispatch(struct cfq_data *cfqd) 3238{ 3239 struct cfq_queue *cfqq; 3240 int dispatched = 0; 3241 3242 /* Expire the timeslice of the current active queue first */ 3243 cfq_slice_expired(cfqd, 0); 3244 while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) { 3245 __cfq_set_active_queue(cfqd, cfqq); 3246 dispatched += __cfq_forced_dispatch_cfqq(cfqq); 3247 } 3248 3249 BUG_ON(cfqd->busy_queues); 3250 3251 cfq_log(cfqd, "forced_dispatch=%d", dispatched); 3252 return dispatched; 3253} 3254 3255static inline bool cfq_slice_used_soon(struct cfq_data *cfqd, 3256 struct cfq_queue *cfqq) 3257{ 3258 /* the queue hasn't finished any request, can't estimate */ 3259 if (cfq_cfqq_slice_new(cfqq)) 3260 return true; 3261 if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched, 3262 cfqq->slice_end)) 3263 return true; 3264 3265 return false; 3266} 3267 3268static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) 3269{ 3270 unsigned int max_dispatch; 3271 3272 /* 3273 * Drain async requests before we start sync IO 3274 */ 3275 if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_flight[BLK_RW_ASYNC]) 3276 return false; 3277 3278 /* 3279 * If this is an async queue and we have sync IO in flight, let it wait 3280 */ 3281 if (cfqd->rq_in_flight[BLK_RW_SYNC] && !cfq_cfqq_sync(cfqq)) 3282 return false; 3283 3284 max_dispatch = max_t(unsigned int, cfqd->cfq_quantum / 2, 1); 3285 if (cfq_class_idle(cfqq)) 3286 max_dispatch = 1; 3287 3288 /* 3289 * Does this cfqq already have too much IO in flight? 3290 */ 3291 if (cfqq->dispatched >= max_dispatch) { 3292 bool promote_sync = false; 3293 /* 3294 * idle queue must always only have a single IO in flight 3295 */ 3296 if (cfq_class_idle(cfqq)) 3297 return false; 3298 3299 /* 3300 * If there is only one sync queue 3301 * we can ignore async queue here and give the sync 3302 * queue no dispatch limit. The reason is a sync queue can 3303 * preempt async queue, limiting the sync queue doesn't make 3304 * sense. This is useful for aiostress test. 3305 */ 3306 if (cfq_cfqq_sync(cfqq) && cfqd->busy_sync_queues == 1) 3307 promote_sync = true; 3308 3309 /* 3310 * We have other queues, don't allow more IO from this one 3311 */ 3312 if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq) && 3313 !promote_sync) 3314 return false; 3315 3316 /* 3317 * Sole queue user, no limit 3318 */ 3319 if (cfqd->busy_queues == 1 || promote_sync) 3320 max_dispatch = -1; 3321 else 3322 /* 3323 * Normally we start throttling cfqq when cfq_quantum/2 3324 * requests have been dispatched. But we can drive 3325 * deeper queue depths at the beginning of slice 3326 * subjected to upper limit of cfq_quantum. 3327 * */ 3328 max_dispatch = cfqd->cfq_quantum; 3329 } 3330 3331 /* 3332 * Async queues must wait a bit before being allowed dispatch. 3333 * We also ramp up the dispatch depth gradually for async IO, 3334 * based on the last sync IO we serviced 3335 */ 3336 if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) { 3337 unsigned long last_sync = jiffies - cfqd->last_delayed_sync; 3338 unsigned int depth; 3339 3340 depth = last_sync / cfqd->cfq_slice[1]; 3341 if (!depth && !cfqq->dispatched) 3342 depth = 1; 3343 if (depth < max_dispatch) 3344 max_dispatch = depth; 3345 } 3346 3347 /* 3348 * If we're below the current max, allow a dispatch 3349 */ 3350 return cfqq->dispatched < max_dispatch; 3351} 3352 3353/* 3354 * Dispatch a request from cfqq, moving them to the request queue 3355 * dispatch list. 3356 */ 3357static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq) 3358{ 3359 struct request *rq; 3360 3361 BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list)); 3362 3363 if (!cfq_may_dispatch(cfqd, cfqq)) 3364 return false; 3365 3366 /* 3367 * follow expired path, else get first next available 3368 */ 3369 rq = cfq_check_fifo(cfqq); 3370 if (!rq) 3371 rq = cfqq->next_rq; 3372 3373 /* 3374 * insert request into driver dispatch list 3375 */ 3376 cfq_dispatch_insert(cfqd->queue, rq); 3377 3378 if (!cfqd->active_cic) { 3379 struct cfq_io_cq *cic = RQ_CIC(rq); 3380 3381 atomic_long_inc(&cic->icq.ioc->refcount); 3382 cfqd->active_cic = cic; 3383 } 3384 3385 return true; 3386} 3387 3388/* 3389 * Find the cfqq that we need to service and move a request from that to the 3390 * dispatch list 3391 */ 3392static int cfq_dispatch_requests(struct request_queue *q, int force) 3393{ 3394 struct cfq_data *cfqd = q->elevator->elevator_data; 3395 struct cfq_queue *cfqq; 3396 3397 if (!cfqd->busy_queues) 3398 return 0; 3399 3400 if (unlikely(force)) 3401 return cfq_forced_dispatch(cfqd); 3402 3403 cfqq = cfq_select_queue(cfqd); 3404 if (!cfqq) 3405 return 0; 3406 3407 /* 3408 * Dispatch a request from this cfqq, if it is allowed 3409 */ 3410 if (!cfq_dispatch_request(cfqd, cfqq)) 3411 return 0; 3412 3413 cfqq->slice_dispatch++; 3414 cfq_clear_cfqq_must_dispatch(cfqq); 3415 3416 /* 3417 * expire an async queue immediately if it has used up its slice. idle 3418 * queue always expire after 1 dispatch round. 3419 */ 3420 if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) && 3421 cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) || 3422 cfq_class_idle(cfqq))) { 3423 cfqq->slice_end = jiffies + 1; 3424 cfq_slice_expired(cfqd, 0); 3425 } 3426 3427 cfq_log_cfqq(cfqd, cfqq, "dispatched a request"); 3428 return 1; 3429} 3430 3431/* 3432 * task holds one reference to the queue, dropped when task exits. each rq 3433 * in-flight on this queue also holds a reference, dropped when rq is freed. 3434 * 3435 * Each cfq queue took a reference on the parent group. Drop it now. 3436 * queue lock must be held here. 3437 */ 3438static void cfq_put_queue(struct cfq_queue *cfqq) 3439{ 3440 struct cfq_data *cfqd = cfqq->cfqd; 3441 struct cfq_group *cfqg; 3442 3443 BUG_ON(cfqq->ref <= 0); 3444 3445 cfqq->ref--; 3446 if (cfqq->ref) 3447 return; 3448 3449 cfq_log_cfqq(cfqd, cfqq, "put_queue"); 3450 BUG_ON(rb_first(&cfqq->sort_list)); 3451 BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); 3452 cfqg = cfqq->cfqg; 3453 3454 if (unlikely(cfqd->active_queue == cfqq)) { 3455 __cfq_slice_expired(cfqd, cfqq, 0); 3456 cfq_schedule_dispatch(cfqd); 3457 } 3458 3459 BUG_ON(cfq_cfqq_on_rr(cfqq)); 3460 kmem_cache_free(cfq_pool, cfqq); 3461 cfqg_put(cfqg); 3462} 3463 3464static void cfq_put_cooperator(struct cfq_queue *cfqq) 3465{ 3466 struct cfq_queue *__cfqq, *next; 3467 3468 /* 3469 * If this queue was scheduled to merge with another queue, be 3470 * sure to drop the reference taken on that queue (and others in 3471 * the merge chain). See cfq_setup_merge and cfq_merge_cfqqs. 3472 */ 3473 __cfqq = cfqq->new_cfqq; 3474 while (__cfqq) { 3475 if (__cfqq == cfqq) { 3476 WARN(1, "cfqq->new_cfqq loop detected\n"); 3477 break; 3478 } 3479 next = __cfqq->new_cfqq; 3480 cfq_put_queue(__cfqq); 3481 __cfqq = next; 3482 } 3483} 3484 3485static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) 3486{ 3487 if (unlikely(cfqq == cfqd->active_queue)) { 3488 __cfq_slice_expired(cfqd, cfqq, 0); 3489 cfq_schedule_dispatch(cfqd); 3490 } 3491 3492 cfq_put_cooperator(cfqq); 3493 3494 cfq_put_queue(cfqq); 3495} 3496 3497static void cfq_init_icq(struct io_cq *icq) 3498{ 3499 struct cfq_io_cq *cic = icq_to_cic(icq); 3500 3501 cic->ttime.last_end_request = jiffies; 3502} 3503 3504static void cfq_exit_icq(struct io_cq *icq) 3505{ 3506 struct cfq_io_cq *cic = icq_to_cic(icq); 3507 struct cfq_data *cfqd = cic_to_cfqd(cic); 3508 3509 if (cic->cfqq[BLK_RW_ASYNC]) { 3510 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]); 3511 cic->cfqq[BLK_RW_ASYNC] = NULL; 3512 } 3513 3514 if (cic->cfqq[BLK_RW_SYNC]) { 3515 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]); 3516 cic->cfqq[BLK_RW_SYNC] = NULL; 3517 } 3518} 3519 3520static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic) 3521{ 3522 struct task_struct *tsk = current; 3523 int ioprio_class; 3524 3525 if (!cfq_cfqq_prio_changed(cfqq)) 3526 return; 3527 3528 ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); 3529 switch (ioprio_class) { 3530 default: 3531 printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class); 3532 case IOPRIO_CLASS_NONE: 3533 /* 3534 * no prio set, inherit CPU scheduling settings 3535 */ 3536 cfqq->ioprio = task_nice_ioprio(tsk); 3537 cfqq->ioprio_class = task_nice_ioclass(tsk); 3538 break; 3539 case IOPRIO_CLASS_RT: 3540 cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio); 3541 cfqq->ioprio_class = IOPRIO_CLASS_RT; 3542 break; 3543 case IOPRIO_CLASS_BE: 3544 cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio); 3545 cfqq->ioprio_class = IOPRIO_CLASS_BE; 3546 break; 3547 case IOPRIO_CLASS_IDLE: 3548 cfqq->ioprio_class = IOPRIO_CLASS_IDLE; 3549 cfqq->ioprio = 7; 3550 cfq_clear_cfqq_idle_window(cfqq); 3551 break; 3552 } 3553 3554 /* 3555 * keep track of original prio settings in case we have to temporarily 3556 * elevate the priority of this queue 3557 */ 3558 cfqq->org_ioprio = cfqq->ioprio; 3559 cfq_clear_cfqq_prio_changed(cfqq); 3560} 3561 3562static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio) 3563{ 3564 int ioprio = cic->icq.ioc->ioprio; 3565 struct cfq_data *cfqd = cic_to_cfqd(cic); 3566 struct cfq_queue *cfqq; 3567 3568 /* 3569 * Check whether ioprio has changed. The condition may trigger 3570 * spuriously on a newly created cic but there's no harm. 3571 */ 3572 if (unlikely(!cfqd) || likely(cic->ioprio == ioprio)) 3573 return; 3574 3575 cfqq = cic->cfqq[BLK_RW_ASYNC]; 3576 if (cfqq) { 3577 struct cfq_queue *new_cfqq; 3578 new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio, 3579 GFP_ATOMIC); 3580 if (new_cfqq) { 3581 cic->cfqq[BLK_RW_ASYNC] = new_cfqq; 3582 cfq_put_queue(cfqq); 3583 } 3584 } 3585 3586 cfqq = cic->cfqq[BLK_RW_SYNC]; 3587 if (cfqq) 3588 cfq_mark_cfqq_prio_changed(cfqq); 3589 3590 cic->ioprio = ioprio; 3591} 3592 3593static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, 3594 pid_t pid, bool is_sync) 3595{ 3596 RB_CLEAR_NODE(&cfqq->rb_node); 3597 RB_CLEAR_NODE(&cfqq->p_node); 3598 INIT_LIST_HEAD(&cfqq->fifo); 3599 3600 cfqq->ref = 0; 3601 cfqq->cfqd = cfqd; 3602 3603 cfq_mark_cfqq_prio_changed(cfqq); 3604 3605 if (is_sync) { 3606 if (!cfq_class_idle(cfqq)) 3607 cfq_mark_cfqq_idle_window(cfqq); 3608 cfq_mark_cfqq_sync(cfqq); 3609 } 3610 cfqq->pid = pid; 3611} 3612 3613#ifdef CONFIG_CFQ_GROUP_IOSCHED 3614static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) 3615{ 3616 struct cfq_data *cfqd = cic_to_cfqd(cic); 3617 struct cfq_queue *sync_cfqq; 3618 uint64_t serial_nr; 3619 3620 rcu_read_lock(); 3621 serial_nr = bio_blkcg(bio)->css.serial_nr; 3622 rcu_read_unlock(); 3623 3624 /* 3625 * Check whether blkcg has changed. The condition may trigger 3626 * spuriously on a newly created cic but there's no harm. 3627 */ 3628 if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr)) 3629 return; 3630 3631 sync_cfqq = cic_to_cfqq(cic, 1); 3632 if (sync_cfqq) { 3633 /* 3634 * Drop reference to sync queue. A new sync queue will be 3635 * assigned in new group upon arrival of a fresh request. 3636 */ 3637 cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup"); 3638 cic_set_cfqq(cic, NULL, 1); 3639 cfq_put_queue(sync_cfqq); 3640 } 3641 3642 cic->blkcg_serial_nr = serial_nr; 3643} 3644#else 3645static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { } 3646#endif /* CONFIG_CFQ_GROUP_IOSCHED */ 3647 3648static struct cfq_queue * 3649cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, 3650 struct bio *bio, gfp_t gfp_mask) 3651{ 3652 struct blkcg *blkcg; 3653 struct cfq_queue *cfqq, *new_cfqq = NULL; 3654 struct cfq_group *cfqg; 3655 3656retry: 3657 rcu_read_lock(); 3658 3659 blkcg = bio_blkcg(bio); 3660 cfqg = cfq_lookup_create_cfqg(cfqd, blkcg); 3661 if (!cfqg) { 3662 cfqq = &cfqd->oom_cfqq; 3663 goto out; 3664 } 3665 3666 cfqq = cic_to_cfqq(cic, is_sync); 3667 3668 /* 3669 * Always try a new alloc if we fell back to the OOM cfqq 3670 * originally, since it should just be a temporary situation. 3671 */ 3672 if (!cfqq || cfqq == &cfqd->oom_cfqq) { 3673 cfqq = NULL; 3674 if (new_cfqq) { 3675 cfqq = new_cfqq; 3676 new_cfqq = NULL; 3677 } else if (gfp_mask & __GFP_WAIT) { 3678 rcu_read_unlock(); 3679 spin_unlock_irq(cfqd->queue->queue_lock); 3680 new_cfqq = kmem_cache_alloc_node(cfq_pool, 3681 gfp_mask | __GFP_ZERO, 3682 cfqd->queue->node); 3683 spin_lock_irq(cfqd->queue->queue_lock); 3684 if (new_cfqq) 3685 goto retry; 3686 else 3687 return &cfqd->oom_cfqq; 3688 } else { 3689 cfqq = kmem_cache_alloc_node(cfq_pool, 3690 gfp_mask | __GFP_ZERO, 3691 cfqd->queue->node); 3692 } 3693 3694 if (cfqq) { 3695 cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); 3696 cfq_init_prio_data(cfqq, cic); 3697 cfq_link_cfqq_cfqg(cfqq, cfqg); 3698 cfq_log_cfqq(cfqd, cfqq, "alloced"); 3699 } else 3700 cfqq = &cfqd->oom_cfqq; 3701 } 3702out: 3703 if (new_cfqq) 3704 kmem_cache_free(cfq_pool, new_cfqq); 3705 3706 rcu_read_unlock(); 3707 return cfqq; 3708} 3709 3710static struct cfq_queue ** 3711cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio) 3712{ 3713 switch (ioprio_class) { 3714 case IOPRIO_CLASS_RT: 3715 return &cfqd->async_cfqq[0][ioprio]; 3716 case IOPRIO_CLASS_NONE: 3717 ioprio = IOPRIO_NORM; 3718 /* fall through */ 3719 case IOPRIO_CLASS_BE: 3720 return &cfqd->async_cfqq[1][ioprio]; 3721 case IOPRIO_CLASS_IDLE: 3722 return &cfqd->async_idle_cfqq; 3723 default: 3724 BUG(); 3725 } 3726} 3727 3728static struct cfq_queue * 3729cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, 3730 struct bio *bio, gfp_t gfp_mask) 3731{ 3732 int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); 3733 int ioprio = IOPRIO_PRIO_DATA(cic->ioprio); 3734 struct cfq_queue **async_cfqq = NULL; 3735 struct cfq_queue *cfqq = NULL; 3736 3737 if (!is_sync) { 3738 if (!ioprio_valid(cic->ioprio)) { 3739 struct task_struct *tsk = current; 3740 ioprio = task_nice_ioprio(tsk); 3741 ioprio_class = task_nice_ioclass(tsk); 3742 } 3743 async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio); 3744 cfqq = *async_cfqq; 3745 } 3746 3747 if (!cfqq) 3748 cfqq = cfq_find_alloc_queue(cfqd, is_sync, cic, bio, gfp_mask); 3749 3750 /* 3751 * pin the queue now that it's allocated, scheduler exit will prune it 3752 */ 3753 if (!is_sync && !(*async_cfqq)) { 3754 cfqq->ref++; 3755 *async_cfqq = cfqq; 3756 } 3757 3758 cfqq->ref++; 3759 return cfqq; 3760} 3761 3762static void 3763__cfq_update_io_thinktime(struct cfq_ttime *ttime, unsigned long slice_idle) 3764{ 3765 unsigned long elapsed = jiffies - ttime->last_end_request; 3766 elapsed = min(elapsed, 2UL * slice_idle); 3767 3768 ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8; 3769 ttime->ttime_total = (7*ttime->ttime_total + 256*elapsed) / 8; 3770 ttime->ttime_mean = (ttime->ttime_total + 128) / ttime->ttime_samples; 3771} 3772 3773static void 3774cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_queue *cfqq, 3775 struct cfq_io_cq *cic) 3776{ 3777 if (cfq_cfqq_sync(cfqq)) { 3778 __cfq_update_io_thinktime(&cic->ttime, cfqd->cfq_slice_idle); 3779 __cfq_update_io_thinktime(&cfqq->service_tree->ttime, 3780 cfqd->cfq_slice_idle); 3781 } 3782#ifdef CONFIG_CFQ_GROUP_IOSCHED 3783 __cfq_update_io_thinktime(&cfqq->cfqg->ttime, cfqd->cfq_group_idle); 3784#endif 3785} 3786 3787static void 3788cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq, 3789 struct request *rq) 3790{ 3791 sector_t sdist = 0; 3792 sector_t n_sec = blk_rq_sectors(rq); 3793 if (cfqq->last_request_pos) { 3794 if (cfqq->last_request_pos < blk_rq_pos(rq)) 3795 sdist = blk_rq_pos(rq) - cfqq->last_request_pos; 3796 else 3797 sdist = cfqq->last_request_pos - blk_rq_pos(rq); 3798 } 3799 3800 cfqq->seek_history <<= 1; 3801 if (blk_queue_nonrot(cfqd->queue)) 3802 cfqq->seek_history |= (n_sec < CFQQ_SECT_THR_NONROT); 3803 else 3804 cfqq->seek_history |= (sdist > CFQQ_SEEK_THR); 3805} 3806 3807/* 3808 * Disable idle window if the process thinks too long or seeks so much that 3809 * it doesn't matter 3810 */ 3811static void 3812cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, 3813 struct cfq_io_cq *cic) 3814{ 3815 int old_idle, enable_idle; 3816 3817 /* 3818 * Don't idle for async or idle io prio class 3819 */ 3820 if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq)) 3821 return; 3822 3823 enable_idle = old_idle = cfq_cfqq_idle_window(cfqq); 3824 3825 if (cfqq->queued[0] + cfqq->queued[1] >= 4) 3826 cfq_mark_cfqq_deep(cfqq); 3827 3828 if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE)) 3829 enable_idle = 0; 3830 else if (!atomic_read(&cic->icq.ioc->active_ref) || 3831 !cfqd->cfq_slice_idle || 3832 (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq))) 3833 enable_idle = 0; 3834 else if (sample_valid(cic->ttime.ttime_samples)) { 3835 if (cic->ttime.ttime_mean > cfqd->cfq_slice_idle) 3836 enable_idle = 0; 3837 else 3838 enable_idle = 1; 3839 } 3840 3841 if (old_idle != enable_idle) { 3842 cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle); 3843 if (enable_idle) 3844 cfq_mark_cfqq_idle_window(cfqq); 3845 else 3846 cfq_clear_cfqq_idle_window(cfqq); 3847 } 3848} 3849 3850/* 3851 * Check if new_cfqq should preempt the currently active queue. Return 0 for 3852 * no or if we aren't sure, a 1 will cause a preempt. 3853 */ 3854static bool 3855cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, 3856 struct request *rq) 3857{ 3858 struct cfq_queue *cfqq; 3859 3860 cfqq = cfqd->active_queue; 3861 if (!cfqq) 3862 return false; 3863 3864 if (cfq_class_idle(new_cfqq)) 3865 return false; 3866 3867 if (cfq_class_idle(cfqq)) 3868 return true; 3869 3870 /* 3871 * Don't allow a non-RT request to preempt an ongoing RT cfqq timeslice. 3872 */ 3873 if (cfq_class_rt(cfqq) && !cfq_class_rt(new_cfqq)) 3874 return false; 3875 3876 /* 3877 * if the new request is sync, but the currently running queue is 3878 * not, let the sync request have priority. 3879 */ 3880 if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq)) 3881 return true; 3882 3883 if (new_cfqq->cfqg != cfqq->cfqg) 3884 return false; 3885 3886 if (cfq_slice_used(cfqq)) 3887 return true; 3888 3889 /* Allow preemption only if we are idling on sync-noidle tree */ 3890 if (cfqd->serving_wl_type == SYNC_NOIDLE_WORKLOAD && 3891 cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD && 3892 new_cfqq->service_tree->count == 2 && 3893 RB_EMPTY_ROOT(&cfqq->sort_list)) 3894 return true; 3895 3896 /* 3897 * So both queues are sync. Let the new request get disk time if 3898 * it's a metadata request and the current queue is doing regular IO. 3899 */ 3900 if ((rq->cmd_flags & REQ_PRIO) && !cfqq->prio_pending) 3901 return true; 3902 3903 /* 3904 * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice. 3905 */ 3906 if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq)) 3907 return true; 3908 3909 /* An idle queue should not be idle now for some reason */ 3910 if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq)) 3911 return true; 3912 3913 if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq)) 3914 return false; 3915 3916 /* 3917 * if this request is as-good as one we would expect from the 3918 * current cfqq, let it preempt 3919 */ 3920 if (cfq_rq_close(cfqd, cfqq, rq)) 3921 return true; 3922 3923 return false; 3924} 3925 3926/* 3927 * cfqq preempts the active queue. if we allowed preempt with no slice left, 3928 * let it have half of its nominal slice. 3929 */ 3930static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) 3931{ 3932 enum wl_type_t old_type = cfqq_type(cfqd->active_queue); 3933 3934 cfq_log_cfqq(cfqd, cfqq, "preempt"); 3935 cfq_slice_expired(cfqd, 1); 3936 3937 /* 3938 * workload type is changed, don't save slice, otherwise preempt 3939 * doesn't happen 3940 */ 3941 if (old_type != cfqq_type(cfqq)) 3942 cfqq->cfqg->saved_wl_slice = 0; 3943 3944 /* 3945 * Put the new queue at the front of the of the current list, 3946 * so we know that it will be selected next. 3947 */ 3948 BUG_ON(!cfq_cfqq_on_rr(cfqq)); 3949 3950 cfq_service_tree_add(cfqd, cfqq, 1); 3951 3952 cfqq->slice_end = 0; 3953 cfq_mark_cfqq_slice_new(cfqq); 3954} 3955 3956/* 3957 * Called when a new fs request (rq) is added (to cfqq). Check if there's 3958 * something we should do about it 3959 */ 3960static void 3961cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, 3962 struct request *rq) 3963{ 3964 struct cfq_io_cq *cic = RQ_CIC(rq); 3965 3966 cfqd->rq_queued++; 3967 if (rq->cmd_flags & REQ_PRIO) 3968 cfqq->prio_pending++; 3969 3970 cfq_update_io_thinktime(cfqd, cfqq, cic); 3971 cfq_update_io_seektime(cfqd, cfqq, rq); 3972 cfq_update_idle_window(cfqd, cfqq, cic); 3973 3974 cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); 3975 3976 if (cfqq == cfqd->active_queue) { 3977 /* 3978 * Remember that we saw a request from this process, but 3979 * don't start queuing just yet. Otherwise we risk seeing lots 3980 * of tiny requests, because we disrupt the normal plugging 3981 * and merging. If the request is already larger than a single 3982 * page, let it rip immediately. For that case we assume that 3983 * merging is already done. Ditto for a busy system that 3984 * has other work pending, don't risk delaying until the 3985 * idle timer unplug to continue working. 3986 */ 3987 if (cfq_cfqq_wait_request(cfqq)) { 3988 if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE || 3989 cfqd->busy_queues > 1) { 3990 cfq_del_timer(cfqd, cfqq); 3991 cfq_clear_cfqq_wait_request(cfqq); 3992 __blk_run_queue(cfqd->queue); 3993 } else { 3994 cfqg_stats_update_idle_time(cfqq->cfqg); 3995 cfq_mark_cfqq_must_dispatch(cfqq); 3996 } 3997 } 3998 } else if (cfq_should_preempt(cfqd, cfqq, rq)) { 3999 /* 4000 * not the active queue - expire current slice if it is 4001 * idle and has expired it's mean thinktime or this new queue 4002 * has some old slice time left and is of higher priority or 4003 * this new queue is RT and the current one is BE 4004 */ 4005 cfq_preempt_queue(cfqd, cfqq); 4006 __blk_run_queue(cfqd->queue); 4007 } 4008} 4009 4010static void cfq_insert_request(struct request_queue *q, struct request *rq) 4011{ 4012 struct cfq_data *cfqd = q->elevator->elevator_data; 4013 struct cfq_queue *cfqq = RQ_CFQQ(rq); 4014 4015 cfq_log_cfqq(cfqd, cfqq, "insert_request"); 4016 cfq_init_prio_data(cfqq, RQ_CIC(rq)); 4017 4018 rq->fifo_time = jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]; 4019 list_add_tail(&rq->queuelist, &cfqq->fifo); 4020 cfq_add_rq_rb(rq); 4021 cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group, 4022 rq->cmd_flags); 4023 cfq_rq_enqueued(cfqd, cfqq, rq); 4024} 4025 4026/* 4027 * Update hw_tag based on peak queue depth over 50 samples under 4028 * sufficient load. 4029 */ 4030static void cfq_update_hw_tag(struct cfq_data *cfqd) 4031{ 4032 struct cfq_queue *cfqq = cfqd->active_queue; 4033 4034 if (cfqd->rq_in_driver > cfqd->hw_tag_est_depth) 4035 cfqd->hw_tag_est_depth = cfqd->rq_in_driver; 4036 4037 if (cfqd->hw_tag == 1) 4038 return; 4039 4040 if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN && 4041 cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN) 4042 return; 4043 4044 /* 4045 * If active queue hasn't enough requests and can idle, cfq might not 4046 * dispatch sufficient requests to hardware. Don't zero hw_tag in this 4047 * case 4048 */ 4049 if (cfqq && cfq_cfqq_idle_window(cfqq) && 4050 cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] < 4051 CFQ_HW_QUEUE_MIN && cfqd->rq_in_driver < CFQ_HW_QUEUE_MIN) 4052 return; 4053 4054 if (cfqd->hw_tag_samples++ < 50) 4055 return; 4056 4057 if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN) 4058 cfqd->hw_tag = 1; 4059 else 4060 cfqd->hw_tag = 0; 4061} 4062 4063static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq) 4064{ 4065 struct cfq_io_cq *cic = cfqd->active_cic; 4066 4067 /* If the queue already has requests, don't wait */ 4068 if (!RB_EMPTY_ROOT(&cfqq->sort_list)) 4069 return false; 4070 4071 /* If there are other queues in the group, don't wait */ 4072 if (cfqq->cfqg->nr_cfqq > 1) 4073 return false; 4074 4075 /* the only queue in the group, but think time is big */ 4076 if (cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true)) 4077 return false; 4078 4079 if (cfq_slice_used(cfqq)) 4080 return true; 4081 4082 /* if slice left is less than think time, wait busy */ 4083 if (cic && sample_valid(cic->ttime.ttime_samples) 4084 && (cfqq->slice_end - jiffies < cic->ttime.ttime_mean)) 4085 return true; 4086 4087 /* 4088 * If think times is less than a jiffy than ttime_mean=0 and above 4089 * will not be true. It might happen that slice has not expired yet 4090 * but will expire soon (4-5 ns) during select_queue(). To cover the 4091 * case where think time is less than a jiffy, mark the queue wait 4092 * busy if only 1 jiffy is left in the slice. 4093 */ 4094 if (cfqq->slice_end - jiffies == 1) 4095 return true; 4096 4097 return false; 4098} 4099 4100static void cfq_completed_request(struct request_queue *q, struct request *rq) 4101{ 4102 struct cfq_queue *cfqq = RQ_CFQQ(rq); 4103 struct cfq_data *cfqd = cfqq->cfqd; 4104 const int sync = rq_is_sync(rq); 4105 unsigned long now; 4106 4107 now = jiffies; 4108 cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", 4109 !!(rq->cmd_flags & REQ_NOIDLE)); 4110 4111 cfq_update_hw_tag(cfqd); 4112 4113 WARN_ON(!cfqd->rq_in_driver); 4114 WARN_ON(!cfqq->dispatched); 4115 cfqd->rq_in_driver--; 4116 cfqq->dispatched--; 4117 (RQ_CFQG(rq))->dispatched--; 4118 cfqg_stats_update_completion(cfqq->cfqg, rq_start_time_ns(rq), 4119 rq_io_start_time_ns(rq), rq->cmd_flags); 4120 4121 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; 4122 4123 if (sync) { 4124 struct cfq_rb_root *st; 4125 4126 RQ_CIC(rq)->ttime.last_end_request = now; 4127 4128 if (cfq_cfqq_on_rr(cfqq)) 4129 st = cfqq->service_tree; 4130 else 4131 st = st_for(cfqq->cfqg, cfqq_class(cfqq), 4132 cfqq_type(cfqq)); 4133 4134 st->ttime.last_end_request = now; 4135 if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now)) 4136 cfqd->last_delayed_sync = now; 4137 } 4138 4139#ifdef CONFIG_CFQ_GROUP_IOSCHED 4140 cfqq->cfqg->ttime.last_end_request = now; 4141#endif 4142 4143 /* 4144 * If this is the active queue, check if it needs to be expired, 4145 * or if we want to idle in case it has no pending requests. 4146 */ 4147 if (cfqd->active_queue == cfqq) { 4148 const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list); 4149 4150 if (cfq_cfqq_slice_new(cfqq)) { 4151 cfq_set_prio_slice(cfqd, cfqq); 4152 cfq_clear_cfqq_slice_new(cfqq); 4153 } 4154 4155 /* 4156 * Should we wait for next request to come in before we expire 4157 * the queue. 4158 */ 4159 if (cfq_should_wait_busy(cfqd, cfqq)) { 4160 unsigned long extend_sl = cfqd->cfq_slice_idle; 4161 if (!cfqd->cfq_slice_idle) 4162 extend_sl = cfqd->cfq_group_idle; 4163 cfqq->slice_end = jiffies + extend_sl; 4164 cfq_mark_cfqq_wait_busy(cfqq); 4165 cfq_log_cfqq(cfqd, cfqq, "will busy wait"); 4166 } 4167 4168 /* 4169 * Idling is not enabled on: 4170 * - expired queues 4171 * - idle-priority queues 4172 * - async queues 4173 * - queues with still some requests queued 4174 * - when there is a close cooperator 4175 */ 4176 if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq)) 4177 cfq_slice_expired(cfqd, 1); 4178 else if (sync && cfqq_empty && 4179 !cfq_close_cooperator(cfqd, cfqq)) { 4180 cfq_arm_slice_timer(cfqd); 4181 } 4182 } 4183 4184 if (!cfqd->rq_in_driver) 4185 cfq_schedule_dispatch(cfqd); 4186} 4187 4188static inline int __cfq_may_queue(struct cfq_queue *cfqq) 4189{ 4190 if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) { 4191 cfq_mark_cfqq_must_alloc_slice(cfqq); 4192 return ELV_MQUEUE_MUST; 4193 } 4194 4195 return ELV_MQUEUE_MAY; 4196} 4197 4198static int cfq_may_queue(struct request_queue *q, int rw) 4199{ 4200 struct cfq_data *cfqd = q->elevator->elevator_data; 4201 struct task_struct *tsk = current; 4202 struct cfq_io_cq *cic; 4203 struct cfq_queue *cfqq; 4204 4205 /* 4206 * don't force setup of a queue from here, as a call to may_queue 4207 * does not necessarily imply that a request actually will be queued. 4208 * so just lookup a possibly existing queue, or return 'may queue' 4209 * if that fails 4210 */ 4211 cic = cfq_cic_lookup(cfqd, tsk->io_context); 4212 if (!cic) 4213 return ELV_MQUEUE_MAY; 4214 4215 cfqq = cic_to_cfqq(cic, rw_is_sync(rw)); 4216 if (cfqq) { 4217 cfq_init_prio_data(cfqq, cic); 4218 4219 return __cfq_may_queue(cfqq); 4220 } 4221 4222 return ELV_MQUEUE_MAY; 4223} 4224 4225/* 4226 * queue lock held here 4227 */ 4228static void cfq_put_request(struct request *rq) 4229{ 4230 struct cfq_queue *cfqq = RQ_CFQQ(rq); 4231 4232 if (cfqq) { 4233 const int rw = rq_data_dir(rq); 4234 4235 BUG_ON(!cfqq->allocated[rw]); 4236 cfqq->allocated[rw]--; 4237 4238 /* Put down rq reference on cfqg */ 4239 cfqg_put(RQ_CFQG(rq)); 4240 rq->elv.priv[0] = NULL; 4241 rq->elv.priv[1] = NULL; 4242 4243 cfq_put_queue(cfqq); 4244 } 4245} 4246 4247static struct cfq_queue * 4248cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_cq *cic, 4249 struct cfq_queue *cfqq) 4250{ 4251 cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq); 4252 cic_set_cfqq(cic, cfqq->new_cfqq, 1); 4253 cfq_mark_cfqq_coop(cfqq->new_cfqq); 4254 cfq_put_queue(cfqq); 4255 return cic_to_cfqq(cic, 1); 4256} 4257 4258/* 4259 * Returns NULL if a new cfqq should be allocated, or the old cfqq if this 4260 * was the last process referring to said cfqq. 4261 */ 4262static struct cfq_queue * 4263split_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq) 4264{ 4265 if (cfqq_process_refs(cfqq) == 1) { 4266 cfqq->pid = current->pid; 4267 cfq_clear_cfqq_coop(cfqq); 4268 cfq_clear_cfqq_split_coop(cfqq); 4269 return cfqq; 4270 } 4271 4272 cic_set_cfqq(cic, NULL, 1); 4273 4274 cfq_put_cooperator(cfqq); 4275 4276 cfq_put_queue(cfqq); 4277 return NULL; 4278} 4279/* 4280 * Allocate cfq data structures associated with this request. 4281 */ 4282static int 4283cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio, 4284 gfp_t gfp_mask) 4285{ 4286 struct cfq_data *cfqd = q->elevator->elevator_data; 4287 struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq); 4288 const int rw = rq_data_dir(rq); 4289 const bool is_sync = rq_is_sync(rq); 4290 struct cfq_queue *cfqq; 4291 4292 might_sleep_if(gfp_mask & __GFP_WAIT); 4293 4294 spin_lock_irq(q->queue_lock); 4295 4296 check_ioprio_changed(cic, bio); 4297 check_blkcg_changed(cic, bio); 4298new_queue: 4299 cfqq = cic_to_cfqq(cic, is_sync); 4300 if (!cfqq || cfqq == &cfqd->oom_cfqq) { 4301 cfqq = cfq_get_queue(cfqd, is_sync, cic, bio, gfp_mask); 4302 cic_set_cfqq(cic, cfqq, is_sync); 4303 } else { 4304 /* 4305 * If the queue was seeky for too long, break it apart. 4306 */ 4307 if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) { 4308 cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq"); 4309 cfqq = split_cfqq(cic, cfqq); 4310 if (!cfqq) 4311 goto new_queue; 4312 } 4313 4314 /* 4315 * Check to see if this queue is scheduled to merge with 4316 * another, closely cooperating queue. The merging of 4317 * queues happens here as it must be done in process context. 4318 * The reference on new_cfqq was taken in merge_cfqqs. 4319 */ 4320 if (cfqq->new_cfqq) 4321 cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq); 4322 } 4323 4324 cfqq->allocated[rw]++; 4325 4326 cfqq->ref++; 4327 cfqg_get(cfqq->cfqg); 4328 rq->elv.priv[0] = cfqq; 4329 rq->elv.priv[1] = cfqq->cfqg; 4330 spin_unlock_irq(q->queue_lock); 4331 return 0; 4332} 4333 4334static void cfq_kick_queue(struct work_struct *work) 4335{ 4336 struct cfq_data *cfqd = 4337 container_of(work, struct cfq_data, unplug_work); 4338 struct request_queue *q = cfqd->queue; 4339 4340 spin_lock_irq(q->queue_lock); 4341 __blk_run_queue(cfqd->queue); 4342 spin_unlock_irq(q->queue_lock); 4343} 4344 4345/* 4346 * Timer running if the active_queue is currently idling inside its time slice 4347 */ 4348static void cfq_idle_slice_timer(unsigned long data) 4349{ 4350 struct cfq_data *cfqd = (struct cfq_data *) data; 4351 struct cfq_queue *cfqq; 4352 unsigned long flags; 4353 int timed_out = 1; 4354 4355 cfq_log(cfqd, "idle timer fired"); 4356 4357 spin_lock_irqsave(cfqd->queue->queue_lock, flags); 4358 4359 cfqq = cfqd->active_queue; 4360 if (cfqq) { 4361 timed_out = 0; 4362 4363 /* 4364 * We saw a request before the queue expired, let it through 4365 */ 4366 if (cfq_cfqq_must_dispatch(cfqq)) 4367 goto out_kick; 4368 4369 /* 4370 * expired 4371 */ 4372 if (cfq_slice_used(cfqq)) 4373 goto expire; 4374 4375 /* 4376 * only expire and reinvoke request handler, if there are 4377 * other queues with pending requests 4378 */ 4379 if (!cfqd->busy_queues) 4380 goto out_cont; 4381 4382 /* 4383 * not expired and it has a request pending, let it dispatch 4384 */ 4385 if (!RB_EMPTY_ROOT(&cfqq->sort_list)) 4386 goto out_kick; 4387 4388 /* 4389 * Queue depth flag is reset only when the idle didn't succeed 4390 */ 4391 cfq_clear_cfqq_deep(cfqq); 4392 } 4393expire: 4394 cfq_slice_expired(cfqd, timed_out); 4395out_kick: 4396 cfq_schedule_dispatch(cfqd); 4397out_cont: 4398 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); 4399} 4400 4401static void cfq_shutdown_timer_wq(struct cfq_data *cfqd) 4402{ 4403 del_timer_sync(&cfqd->idle_slice_timer); 4404 cancel_work_sync(&cfqd->unplug_work); 4405} 4406 4407static void cfq_put_async_queues(struct cfq_data *cfqd) 4408{ 4409 int i; 4410 4411 for (i = 0; i < IOPRIO_BE_NR; i++) { 4412 if (cfqd->async_cfqq[0][i]) 4413 cfq_put_queue(cfqd->async_cfqq[0][i]); 4414 if (cfqd->async_cfqq[1][i]) 4415 cfq_put_queue(cfqd->async_cfqq[1][i]); 4416 } 4417 4418 if (cfqd->async_idle_cfqq) 4419 cfq_put_queue(cfqd->async_idle_cfqq); 4420} 4421 4422static void cfq_exit_queue(struct elevator_queue *e) 4423{ 4424 struct cfq_data *cfqd = e->elevator_data; 4425 struct request_queue *q = cfqd->queue; 4426 4427 cfq_shutdown_timer_wq(cfqd); 4428 4429 spin_lock_irq(q->queue_lock); 4430 4431 if (cfqd->active_queue) 4432 __cfq_slice_expired(cfqd, cfqd->active_queue, 0); 4433 4434 cfq_put_async_queues(cfqd); 4435 4436 spin_unlock_irq(q->queue_lock); 4437 4438 cfq_shutdown_timer_wq(cfqd); 4439 4440#ifdef CONFIG_CFQ_GROUP_IOSCHED 4441 blkcg_deactivate_policy(q, &blkcg_policy_cfq); 4442#else 4443 kfree(cfqd->root_group); 4444#endif 4445 kfree(cfqd); 4446} 4447 4448static int cfq_init_queue(struct request_queue *q, struct elevator_type *e) 4449{ 4450 struct cfq_data *cfqd; 4451 struct blkcg_gq *blkg __maybe_unused; 4452 int i, ret; 4453 struct elevator_queue *eq; 4454 4455 eq = elevator_alloc(q, e); 4456 if (!eq) 4457 return -ENOMEM; 4458 4459 cfqd = kzalloc_node(sizeof(*cfqd), GFP_KERNEL, q->node); 4460 if (!cfqd) { 4461 kobject_put(&eq->kobj); 4462 return -ENOMEM; 4463 } 4464 eq->elevator_data = cfqd; 4465 4466 cfqd->queue = q; 4467 spin_lock_irq(q->queue_lock); 4468 q->elevator = eq; 4469 spin_unlock_irq(q->queue_lock); 4470 4471 /* Init root service tree */ 4472 cfqd->grp_service_tree = CFQ_RB_ROOT; 4473 4474 /* Init root group and prefer root group over other groups by default */ 4475#ifdef CONFIG_CFQ_GROUP_IOSCHED 4476 ret = blkcg_activate_policy(q, &blkcg_policy_cfq); 4477 if (ret) 4478 goto out_free; 4479 4480 cfqd->root_group = blkg_to_cfqg(q->root_blkg); 4481#else 4482 ret = -ENOMEM; 4483 cfqd->root_group = kzalloc_node(sizeof(*cfqd->root_group), 4484 GFP_KERNEL, cfqd->queue->node); 4485 if (!cfqd->root_group) 4486 goto out_free; 4487 4488 cfq_init_cfqg_base(cfqd->root_group); 4489#endif 4490 cfqd->root_group->weight = 2 * CFQ_WEIGHT_DEFAULT; 4491 cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT; 4492 4493 /* 4494 * Not strictly needed (since RB_ROOT just clears the node and we 4495 * zeroed cfqd on alloc), but better be safe in case someone decides 4496 * to add magic to the rb code 4497 */ 4498 for (i = 0; i < CFQ_PRIO_LISTS; i++) 4499 cfqd->prio_trees[i] = RB_ROOT; 4500 4501 /* 4502 * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues. 4503 * Grab a permanent reference to it, so that the normal code flow 4504 * will not attempt to free it. oom_cfqq is linked to root_group 4505 * but shouldn't hold a reference as it'll never be unlinked. Lose 4506 * the reference from linking right away. 4507 */ 4508 cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0); 4509 cfqd->oom_cfqq.ref++; 4510 4511 spin_lock_irq(q->queue_lock); 4512 cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, cfqd->root_group); 4513 cfqg_put(cfqd->root_group); 4514 spin_unlock_irq(q->queue_lock); 4515 4516 init_timer(&cfqd->idle_slice_timer); 4517 cfqd->idle_slice_timer.function = cfq_idle_slice_timer; 4518 cfqd->idle_slice_timer.data = (unsigned long) cfqd; 4519 4520 INIT_WORK(&cfqd->unplug_work, cfq_kick_queue); 4521 4522 cfqd->cfq_quantum = cfq_quantum; 4523 cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0]; 4524 cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1]; 4525 cfqd->cfq_back_max = cfq_back_max; 4526 cfqd->cfq_back_penalty = cfq_back_penalty; 4527 cfqd->cfq_slice[0] = cfq_slice_async; 4528 cfqd->cfq_slice[1] = cfq_slice_sync; 4529 cfqd->cfq_target_latency = cfq_target_latency; 4530 cfqd->cfq_slice_async_rq = cfq_slice_async_rq; 4531 cfqd->cfq_slice_idle = cfq_slice_idle; 4532 cfqd->cfq_group_idle = cfq_group_idle; 4533 cfqd->cfq_latency = 1; 4534 cfqd->hw_tag = -1; 4535 /* 4536 * we optimistically start assuming sync ops weren't delayed in last 4537 * second, in order to have larger depth for async operations. 4538 */ 4539 cfqd->last_delayed_sync = jiffies - HZ; 4540 return 0; 4541 4542out_free: 4543 kfree(cfqd); 4544 kobject_put(&eq->kobj); 4545 return ret; 4546} 4547 4548static void cfq_registered_queue(struct request_queue *q) 4549{ 4550 struct elevator_queue *e = q->elevator; 4551 struct cfq_data *cfqd = e->elevator_data; 4552 4553 /* 4554 * Default to IOPS mode with no idling for SSDs 4555 */ 4556 if (blk_queue_nonrot(q)) 4557 cfqd->cfq_slice_idle = 0; 4558} 4559 4560/* 4561 * sysfs parts below --> 4562 */ 4563static ssize_t 4564cfq_var_show(unsigned int var, char *page) 4565{ 4566 return sprintf(page, "%u\n", var); 4567} 4568 4569static ssize_t 4570cfq_var_store(unsigned int *var, const char *page, size_t count) 4571{ 4572 char *p = (char *) page; 4573 4574 *var = simple_strtoul(p, &p, 10); 4575 return count; 4576} 4577 4578#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ 4579static ssize_t __FUNC(struct elevator_queue *e, char *page) \ 4580{ \ 4581 struct cfq_data *cfqd = e->elevator_data; \ 4582 unsigned int __data = __VAR; \ 4583 if (__CONV) \ 4584 __data = jiffies_to_msecs(__data); \ 4585 return cfq_var_show(__data, (page)); \ 4586} 4587SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0); 4588SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1); 4589SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1); 4590SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0); 4591SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0); 4592SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1); 4593SHOW_FUNCTION(cfq_group_idle_show, cfqd->cfq_group_idle, 1); 4594SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1); 4595SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); 4596SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); 4597SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0); 4598SHOW_FUNCTION(cfq_target_latency_show, cfqd->cfq_target_latency, 1); 4599#undef SHOW_FUNCTION 4600 4601#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ 4602static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ 4603{ \ 4604 struct cfq_data *cfqd = e->elevator_data; \ 4605 unsigned int __data; \ 4606 int ret = cfq_var_store(&__data, (page), count); \ 4607 if (__data < (MIN)) \ 4608 __data = (MIN); \ 4609 else if (__data > (MAX)) \ 4610 __data = (MAX); \ 4611 if (__CONV) \ 4612 *(__PTR) = msecs_to_jiffies(__data); \ 4613 else \ 4614 *(__PTR) = __data; \ 4615 return ret; \ 4616} 4617STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0); 4618STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1, 4619 UINT_MAX, 1); 4620STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1, 4621 UINT_MAX, 1); 4622STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0); 4623STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1, 4624 UINT_MAX, 0); 4625STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1); 4626STORE_FUNCTION(cfq_group_idle_store, &cfqd->cfq_group_idle, 0, UINT_MAX, 1); 4627STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1); 4628STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1); 4629STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, 4630 UINT_MAX, 0); 4631STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0); 4632STORE_FUNCTION(cfq_target_latency_store, &cfqd->cfq_target_latency, 1, UINT_MAX, 1); 4633#undef STORE_FUNCTION 4634 4635#define CFQ_ATTR(name) \ 4636 __ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store) 4637 4638static struct elv_fs_entry cfq_attrs[] = { 4639 CFQ_ATTR(quantum), 4640 CFQ_ATTR(fifo_expire_sync), 4641 CFQ_ATTR(fifo_expire_async), 4642 CFQ_ATTR(back_seek_max), 4643 CFQ_ATTR(back_seek_penalty), 4644 CFQ_ATTR(slice_sync), 4645 CFQ_ATTR(slice_async), 4646 CFQ_ATTR(slice_async_rq), 4647 CFQ_ATTR(slice_idle), 4648 CFQ_ATTR(group_idle), 4649 CFQ_ATTR(low_latency), 4650 CFQ_ATTR(target_latency), 4651 __ATTR_NULL 4652}; 4653 4654static struct elevator_type iosched_cfq = { 4655 .ops = { 4656 .elevator_merge_fn = cfq_merge, 4657 .elevator_merged_fn = cfq_merged_request, 4658 .elevator_merge_req_fn = cfq_merged_requests, 4659 .elevator_allow_merge_fn = cfq_allow_merge, 4660 .elevator_bio_merged_fn = cfq_bio_merged, 4661 .elevator_dispatch_fn = cfq_dispatch_requests, 4662 .elevator_add_req_fn = cfq_insert_request, 4663 .elevator_activate_req_fn = cfq_activate_request, 4664 .elevator_deactivate_req_fn = cfq_deactivate_request, 4665 .elevator_completed_req_fn = cfq_completed_request, 4666 .elevator_former_req_fn = elv_rb_former_request, 4667 .elevator_latter_req_fn = elv_rb_latter_request, 4668 .elevator_init_icq_fn = cfq_init_icq, 4669 .elevator_exit_icq_fn = cfq_exit_icq, 4670 .elevator_set_req_fn = cfq_set_request, 4671 .elevator_put_req_fn = cfq_put_request, 4672 .elevator_may_queue_fn = cfq_may_queue, 4673 .elevator_init_fn = cfq_init_queue, 4674 .elevator_exit_fn = cfq_exit_queue, 4675 .elevator_registered_fn = cfq_registered_queue, 4676 }, 4677 .icq_size = sizeof(struct cfq_io_cq), 4678 .icq_align = __alignof__(struct cfq_io_cq), 4679 .elevator_attrs = cfq_attrs, 4680 .elevator_name = "cfq", 4681 .elevator_owner = THIS_MODULE, 4682}; 4683 4684#ifdef CONFIG_CFQ_GROUP_IOSCHED 4685static struct blkcg_policy blkcg_policy_cfq = { 4686 .pd_size = sizeof(struct cfq_group), 4687 .cpd_size = sizeof(struct cfq_group_data), 4688 .cftypes = cfq_blkcg_files, 4689 4690 .cpd_init_fn = cfq_cpd_init, 4691 .pd_init_fn = cfq_pd_init, 4692 .pd_offline_fn = cfq_pd_offline, 4693 .pd_reset_stats_fn = cfq_pd_reset_stats, 4694}; 4695#endif 4696 4697static int __init cfq_init(void) 4698{ 4699 int ret; 4700 4701 /* 4702 * could be 0 on HZ < 1000 setups 4703 */ 4704 if (!cfq_slice_async) 4705 cfq_slice_async = 1; 4706 if (!cfq_slice_idle) 4707 cfq_slice_idle = 1; 4708 4709#ifdef CONFIG_CFQ_GROUP_IOSCHED 4710 if (!cfq_group_idle) 4711 cfq_group_idle = 1; 4712 4713 ret = blkcg_policy_register(&blkcg_policy_cfq); 4714 if (ret) 4715 return ret; 4716#else 4717 cfq_group_idle = 0; 4718#endif 4719 4720 ret = -ENOMEM; 4721 cfq_pool = KMEM_CACHE(cfq_queue, 0); 4722 if (!cfq_pool) 4723 goto err_pol_unreg; 4724 4725 ret = elv_register(&iosched_cfq); 4726 if (ret) 4727 goto err_free_pool; 4728 4729 return 0; 4730 4731err_free_pool: 4732 kmem_cache_destroy(cfq_pool); 4733err_pol_unreg: 4734#ifdef CONFIG_CFQ_GROUP_IOSCHED 4735 blkcg_policy_unregister(&blkcg_policy_cfq); 4736#endif 4737 return ret; 4738} 4739 4740static void __exit cfq_exit(void) 4741{ 4742#ifdef CONFIG_CFQ_GROUP_IOSCHED 4743 blkcg_policy_unregister(&blkcg_policy_cfq); 4744#endif 4745 elv_unregister(&iosched_cfq); 4746 kmem_cache_destroy(cfq_pool); 4747} 4748 4749module_init(cfq_init); 4750module_exit(cfq_exit); 4751 4752MODULE_AUTHOR("Jens Axboe"); 4753MODULE_LICENSE("GPL"); 4754MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");