/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BLK_MQ_H
#define BLK_MQ_H

#include <linux/blkdev.h>
#include <linux/sbitmap.h>
#include <linux/lockdep.h>
#include <linux/scatterlist.h>
#include <linux/prefetch.h>
#include <linux/srcu.h>
#include <linux/rw_hint.h>
#include <linux/rwsem.h>

struct blk_mq_tags;
struct blk_flush_queue;

#define BLKDEV_MIN_RQ 4
#define BLKDEV_DEFAULT_RQ 128

enum rq_end_io_ret {
	RQ_END_IO_NONE,
	RQ_END_IO_FREE,
};

typedef enum rq_end_io_ret (rq_end_io_fn)(struct request *, blk_status_t);

/*
 * request flags
 */
typedef __u32 __bitwise req_flags_t;

/* Keep rqf_name[] in sync with the definitions below */
enum rqf_flags {
	/* drive already may have started this one */
	__RQF_STARTED,
	/* request for flush sequence */
	__RQF_FLUSH_SEQ,
	/* merge of different types, fail separately */
	__RQF_MIXED_MERGE,
	/* don't call prep for this one */
	__RQF_DONTPREP,
	/* use hctx->sched_tags */
	__RQF_SCHED_TAGS,
	/* use an I/O scheduler for this request */
	__RQF_USE_SCHED,
	/* vaguely specified driver internal error. Ignored by block layer */
	__RQF_FAILED,
	/* don't warn about errors */
	__RQF_QUIET,
	/* account into disk and partition IO statistics */
	__RQF_IO_STAT,
	/* runtime pm request */
	__RQF_PM,
	/* on IO scheduler merge hash */
	__RQF_HASHED,
	/* track IO completion time */
	__RQF_STATS,
	/* Look at ->special_vec for the actual data payload instead of the
	   bio chain. */
	__RQF_SPECIAL_PAYLOAD,
	/* request completion needs to be signaled to zone write plugging. */
	__RQF_ZONE_WRITE_PLUGGING,
	/* ->timeout has been called, don't expire again */
	__RQF_TIMED_OUT,
	__RQF_RESV,
	__RQF_BITS
};

#define RQF_STARTED ((__force req_flags_t)(1 << __RQF_STARTED))
#define RQF_FLUSH_SEQ ((__force req_flags_t)(1 << __RQF_FLUSH_SEQ))
#define RQF_MIXED_MERGE ((__force req_flags_t)(1 << __RQF_MIXED_MERGE))
#define RQF_DONTPREP ((__force req_flags_t)(1 << __RQF_DONTPREP))
#define RQF_SCHED_TAGS ((__force req_flags_t)(1 << __RQF_SCHED_TAGS))
#define RQF_USE_SCHED ((__force req_flags_t)(1 << __RQF_USE_SCHED))
#define RQF_FAILED ((__force req_flags_t)(1 << __RQF_FAILED))
#define RQF_QUIET ((__force req_flags_t)(1 << __RQF_QUIET))
#define RQF_IO_STAT ((__force req_flags_t)(1 << __RQF_IO_STAT))
#define RQF_PM ((__force req_flags_t)(1 << __RQF_PM))
#define RQF_HASHED ((__force req_flags_t)(1 << __RQF_HASHED))
#define RQF_STATS ((__force req_flags_t)(1 << __RQF_STATS))
#define RQF_SPECIAL_PAYLOAD \
	((__force req_flags_t)(1 << __RQF_SPECIAL_PAYLOAD))
#define RQF_ZONE_WRITE_PLUGGING \
	((__force req_flags_t)(1 << __RQF_ZONE_WRITE_PLUGGING))
#define RQF_TIMED_OUT ((__force req_flags_t)(1 << __RQF_TIMED_OUT))
#define RQF_RESV ((__force req_flags_t)(1 << __RQF_RESV))

/* flags that prevent us from merging requests: */
#define RQF_NOMERGE_FLAGS \
	(RQF_STARTED | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD)

enum mq_rq_state {
	MQ_RQ_IDLE = 0,
	MQ_RQ_IN_FLIGHT = 1,
	MQ_RQ_COMPLETE = 2,
};
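
/*
 * Illustrative sketch (not part of this header): a driver that does expensive
 * one-time command setup can use RQF_DONTPREP to skip that setup when a
 * request is requeued and dispatched again.  my_setup_cmd() and my_issue_cmd()
 * are hypothetical driver helpers.
 *
 *	static blk_status_t my_queue_rq(struct blk_mq_hw_ctx *hctx,
 *					const struct blk_mq_queue_data *bd)
 *	{
 *		struct request *rq = bd->rq;
 *
 *		if (!(rq->rq_flags & RQF_DONTPREP)) {
 *			if (my_setup_cmd(rq))
 *				return BLK_STS_RESOURCE;
 *			rq->rq_flags |= RQF_DONTPREP;
 *		}
 *		blk_mq_start_request(rq);
 *		return my_issue_cmd(hctx, rq);
 *	}
 */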

/*
 * Try to put the fields that are referenced together in the same cacheline.
 *
 * If you modify this structure, make sure to update blk_rq_init() and
 * especially blk_mq_rq_ctx_init() to take care of the added fields.
 */
struct request {
	struct request_queue *q;
	struct blk_mq_ctx *mq_ctx;
	struct blk_mq_hw_ctx *mq_hctx;

	blk_opf_t cmd_flags;	/* op and common flags */
	req_flags_t rq_flags;

	int tag;
	int internal_tag;

	unsigned int timeout;

	/* the following two fields are internal, NEVER access directly */
	unsigned int __data_len;	/* total data len */
	sector_t __sector;		/* sector cursor */

	struct bio *bio;
	struct bio *biotail;

	union {
		struct list_head queuelist;
		struct request *rq_next;
	};

	struct block_device *part;
#ifdef CONFIG_BLK_RQ_ALLOC_TIME
	/* Time that the first bio started allocating this request. */
	u64 alloc_time_ns;
#endif
	/* Time that this request was allocated for this IO. */
	u64 start_time_ns;
	/* Time that I/O was submitted to the device. */
	u64 io_start_time_ns;

#ifdef CONFIG_BLK_WBT
	unsigned short wbt_flags;
#endif
	/*
	 * rq sectors used for blk stats. It has the same value as
	 * blk_rq_sectors(rq), except that it is never zeroed by completion.
	 */
	unsigned short stats_sectors;

	/*
	 * Number of scatter-gather DMA addr+len pairs after
	 * physical address coalescing is performed.
	 */
	unsigned short nr_phys_segments;
	unsigned short nr_integrity_segments;

	/*
	 * The lowest set bit for address gaps between physical segments. This
	 * provides information necessary for dma optimization opportunities,
	 * like for testing if the segments can be coalesced against the
	 * device's iommu granule.
	 */
	unsigned char phys_gap_bit;

#ifdef CONFIG_BLK_INLINE_ENCRYPTION
	struct bio_crypt_ctx *crypt_ctx;
	struct blk_crypto_keyslot *crypt_keyslot;
#endif

	enum mq_rq_state state;
	atomic_t ref;

	unsigned long deadline;

	/*
	 * The hash is used inside the scheduler, and killed once the
	 * request reaches the dispatch list. The ipi_list is only used
	 * to queue the request for softirq completion, which is long
	 * after the request has been unhashed (and even removed from
	 * the dispatch list).
	 */
	union {
		struct hlist_node hash;	/* merge hash */
		struct llist_node ipi_list;
	};

	/*
	 * The rb_node is only used inside the io scheduler, requests
	 * are pruned when moved to the dispatch queue. special_vec must
	 * only be used if RQF_SPECIAL_PAYLOAD is set, and those cannot be
	 * inserted into an IO scheduler.
	 */
	union {
		struct rb_node rb_node;	/* sort/lookup */
		struct bio_vec special_vec;
	};

	/*
	 * Three pointers are available for the IO schedulers, if they need
	 * more they have to dynamically allocate it.
	 */
	struct {
		struct io_cq *icq;
		void *priv[2];
	} elv;

	struct {
		unsigned int seq;
		rq_end_io_fn *saved_end_io;
	} flush;

	u64 fifo_time;

	/*
	 * completion callback.
	 */
	rq_end_io_fn *end_io;
	void *end_io_data;
};
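
/*
 * Illustrative sketch: drivers derive the position, size and direction of a
 * request from the accessors declared later in this header (blk_rq_pos(),
 * blk_rq_sectors(), rq_data_dir()) instead of touching the internal
 * __sector/__data_len fields.  struct my_cmd and my_fill_cmd() are
 * hypothetical.
 *
 *	static void my_fill_cmd(struct my_cmd *cmd, struct request *rq)
 *	{
 *		cmd->lba = blk_rq_pos(rq);
 *		cmd->nr_sectors = blk_rq_sectors(rq);
 *		cmd->is_write = rq_data_dir(rq) == WRITE;
 *	}
 */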

/*
 * Returns a mask with all bits starting at req->phys_gap_bit set to 1.
 */
static inline unsigned long req_phys_gap_mask(const struct request *req)
{
	return ~(((1 << req->phys_gap_bit) >> 1) - 1);
}

static inline enum req_op req_op(const struct request *req)
{
	return req->cmd_flags & REQ_OP_MASK;
}

static inline bool blk_rq_is_passthrough(struct request *rq)
{
	return blk_op_is_passthrough(rq->cmd_flags);
}

static inline unsigned short req_get_ioprio(struct request *req)
{
	if (req->bio)
		return req->bio->bi_ioprio;
	return 0;
}

#define rq_data_dir(rq) (op_is_write(req_op(rq)) ? WRITE : READ)

#define rq_dma_dir(rq) \
	(op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE)

static inline int rq_list_empty(const struct rq_list *rl)
{
	return rl->head == NULL;
}

static inline void rq_list_init(struct rq_list *rl)
{
	rl->head = NULL;
	rl->tail = NULL;
}

static inline void rq_list_add_tail(struct rq_list *rl, struct request *rq)
{
	rq->rq_next = NULL;
	if (rl->tail)
		rl->tail->rq_next = rq;
	else
		rl->head = rq;
	rl->tail = rq;
}

static inline void rq_list_add_head(struct rq_list *rl, struct request *rq)
{
	rq->rq_next = rl->head;
	rl->head = rq;
	if (!rl->tail)
		rl->tail = rq;
}

static inline struct request *rq_list_pop(struct rq_list *rl)
{
	struct request *rq = rl->head;

	if (rq) {
		rl->head = rl->head->rq_next;
		if (!rl->head)
			rl->tail = NULL;
		rq->rq_next = NULL;
	}

	return rq;
}

static inline struct request *rq_list_peek(struct rq_list *rl)
{
	return rl->head;
}

#define rq_list_for_each(rl, pos) \
	for (pos = rq_list_peek((rl)); (pos); pos = pos->rq_next)

#define rq_list_for_each_safe(rl, pos, nxt) \
	for (pos = rq_list_peek((rl)), nxt = pos->rq_next; \
		pos; pos = nxt, nxt = pos ? pos->rq_next : NULL)
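
/*
 * Illustrative sketch: the rq_list helpers above maintain a singly linked
 * list through rq->rq_next.  A driver-local completion list could be drained
 * like this, where finished_rq is a hypothetical already-completed request
 * and blk_mq_end_request() is declared further down in this header.
 *
 *	struct rq_list done;
 *	struct request *rq;
 *
 *	rq_list_init(&done);
 *	rq_list_add_tail(&done, finished_rq);
 *	while ((rq = rq_list_pop(&done)))
 *		blk_mq_end_request(rq, BLK_STS_OK);
 */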

/**
 * enum blk_eh_timer_return - How the timeout handler should proceed
 * @BLK_EH_DONE: The block driver completed the command or will complete it at
 *	a later time.
 * @BLK_EH_RESET_TIMER: Reset the request timer and continue waiting for the
 *	request to complete.
 */
enum blk_eh_timer_return {
	BLK_EH_DONE,
	BLK_EH_RESET_TIMER,
};

/**
 * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware
 * block device
 */
struct blk_mq_hw_ctx {
	struct {
		/** @lock: Protects the dispatch list. */
		spinlock_t lock;
		/**
		 * @dispatch: Used for requests that are ready to be
		 * dispatched to the hardware but for some reason (e.g. lack of
		 * resources) could not be sent to the hardware. As soon as the
		 * driver can send new requests, requests in this list will
		 * be sent first for a fairer dispatch.
		 */
		struct list_head dispatch;
		/**
		 * @state: BLK_MQ_S_* flags. Defines the state of the hw
		 * queue (active, scheduled to restart, stopped).
		 */
		unsigned long state;
	} ____cacheline_aligned_in_smp;

	/**
	 * @run_work: Used for scheduling a hardware queue run at a later time.
	 */
	struct delayed_work run_work;
	/** @cpumask: Map of available CPUs where this hctx can run. */
	cpumask_var_t cpumask;
	/**
	 * @next_cpu: Used by blk_mq_hctx_next_cpu() for round-robin CPU
	 * selection from @cpumask.
	 */
	int next_cpu;
	/**
	 * @next_cpu_batch: Counter of how many works are left in the batch
	 * before changing to the next CPU.
	 */
	int next_cpu_batch;

	/** @flags: BLK_MQ_F_* flags. Defines the behaviour of the queue. */
	unsigned long flags;

	/**
	 * @sched_data: Pointer owned by the IO scheduler attached to a request
	 * queue. It's up to the IO scheduler how to use this pointer.
	 */
	void *sched_data;
	/**
	 * @queue: Pointer to the request queue that owns this hardware context.
	 */
	struct request_queue *queue;
	/** @fq: Queue of requests that need to perform a flush operation. */
	struct blk_flush_queue *fq;

	/**
	 * @driver_data: Pointer to data owned by the block driver that created
	 * this hctx.
	 */
	void *driver_data;

	/**
	 * @ctx_map: Bitmap for each software queue. If bit is on, there is a
	 * pending request in that software queue.
	 */
	struct sbitmap ctx_map;

	/**
	 * @dispatch_from: Software queue to be used when no scheduler was
	 * selected.
	 */
	struct blk_mq_ctx *dispatch_from;
	/**
	 * @dispatch_busy: Number used by blk_mq_update_dispatch_busy() to
	 * decide if the hw_queue is busy using Exponential Weighted Moving
	 * Average algorithm.
	 */
	unsigned int dispatch_busy;

	/** @type: HCTX_TYPE_* flags. Type of hardware queue. */
	unsigned short type;
	/** @nr_ctx: Number of software queues. */
	unsigned short nr_ctx;
	/** @ctxs: Array of software queues. */
	struct blk_mq_ctx **ctxs;

	/** @dispatch_wait_lock: Lock for dispatch_wait queue. */
	spinlock_t dispatch_wait_lock;
	/**
	 * @dispatch_wait: Waitqueue to put requests when there is no tag
	 * available at the moment, to wait for another try in the future.
	 */
	wait_queue_entry_t dispatch_wait;

	/**
	 * @wait_index: Index of next available dispatch_wait queue to insert
	 * requests.
	 */
	atomic_t wait_index;

	/**
	 * @tags: Tags owned by the block driver. A tag in this set is only
	 * assigned when a request is dispatched from a hardware queue.
	 */
	struct blk_mq_tags *tags;
	/**
	 * @sched_tags: Tags owned by I/O scheduler. If there is an I/O
	 * scheduler associated with a request queue, a tag is assigned when
	 * that request is allocated. Else, this member is not used.
	 */
	struct blk_mq_tags *sched_tags;

	/** @numa_node: NUMA node the storage adapter has been connected to. */
	unsigned int numa_node;
	/** @queue_num: Index of this hardware queue. */
	unsigned int queue_num;

	/**
	 * @nr_active: Number of active requests. Only used when a tag set is
	 * shared across request queues.
	 */
	atomic_t nr_active;

	/** @cpuhp_online: List to store requests when a CPU is going offline. */
	struct hlist_node cpuhp_online;
	/** @cpuhp_dead: List to store requests when a CPU dies. */
	struct hlist_node cpuhp_dead;
	/** @kobj: Kernel object for sysfs. */
	struct kobject kobj;

#ifdef CONFIG_BLK_DEBUG_FS
	/**
	 * @debugfs_dir: debugfs directory for this hardware queue. Named
	 * as cpu<cpu_number>.
	 */
	struct dentry *debugfs_dir;
	/** @sched_debugfs_dir: debugfs directory for the scheduler. */
	struct dentry *sched_debugfs_dir;
#endif

	/**
	 * @hctx_list: if this hctx is not in use, this is an entry in
	 * q->unused_hctx_list.
	 */
	struct list_head hctx_list;
};
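
/*
 * Illustrative sketch: a driver can attach per-hardware-queue state through
 * the ->init_hctx() callback (see struct blk_mq_ops below) and read it back
 * in its fast path via hctx->driver_data.  struct my_dev, its hw_queues[]
 * array and my_init_hctx() are hypothetical.
 *
 *	static int my_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
 *				unsigned int hctx_idx)
 *	{
 *		struct my_dev *dev = driver_data;
 *
 *		hctx->driver_data = &dev->hw_queues[hctx_idx];
 *		return 0;
 *	}
 */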

/**
 * struct blk_mq_queue_map - Map software queues to hardware queues
 * @mq_map: CPU ID to hardware queue index map. This is an array
 *	with nr_cpu_ids elements. Each element has a value in the range
 *	[@queue_offset, @queue_offset + @nr_queues).
 * @nr_queues: Number of hardware queues to map CPU IDs onto.
 * @queue_offset: First hardware queue to map onto. Used by the PCIe NVMe
 *	driver to map each hardware queue type (enum hctx_type) onto a distinct
 *	set of hardware queues.
 */
struct blk_mq_queue_map {
	unsigned int *mq_map;
	unsigned int nr_queues;
	unsigned int queue_offset;
};

/**
 * enum hctx_type - Type of hardware queue
 * @HCTX_TYPE_DEFAULT: All I/O not otherwise accounted for.
 * @HCTX_TYPE_READ: Just for READ I/O.
 * @HCTX_TYPE_POLL: Polled I/O of any kind.
 * @HCTX_MAX_TYPES: Number of types of hctx.
 */
enum hctx_type {
	HCTX_TYPE_DEFAULT,
	HCTX_TYPE_READ,
	HCTX_TYPE_POLL,

	HCTX_MAX_TYPES,
};

/**
 * struct blk_mq_tag_set - tag set that can be shared between request queues
 * @ops: Pointers to functions that implement block driver behavior.
 * @map: One or more ctx -> hctx mappings. One map exists for each
 *	hardware queue type (enum hctx_type) that the driver wishes
 *	to support. There are no restrictions on maps being of the
 *	same size, and it's perfectly legal to share maps between
 *	types.
 * @nr_maps: Number of elements in the @map array. A number in the range
 *	[1, HCTX_MAX_TYPES].
 * @nr_hw_queues: Number of hardware queues supported by the block driver that
 *	owns this data structure.
 * @queue_depth: Number of tags per hardware queue, reserved tags included.
 * @reserved_tags: Number of tags to set aside for BLK_MQ_REQ_RESERVED tag
 *	allocations.
 * @cmd_size: Number of additional bytes to allocate per request. The block
 *	driver owns these additional bytes.
 * @numa_node: NUMA node the storage adapter has been connected to.
 * @timeout: Request processing timeout in jiffies.
 * @flags: Zero or more BLK_MQ_F_* flags.
 * @driver_data: Pointer to data owned by the block driver that created this
 *	tag set.
 * @tags: Tag sets. One tag set per hardware queue. Has @nr_hw_queues
 *	elements.
 * @shared_tags:
 *	Shared set of tags. Has @nr_hw_queues elements. If set,
 *	shared by all @tags.
 * @tag_list_lock: Serializes tag_list accesses.
 * @tag_list: List of the request queues that use this tag set. See also
 *	request_queue.tag_set_list.
 * @srcu: Used as a lock when the type of the request queue is blocking
 *	(BLK_MQ_F_BLOCKING).
 * @tags_srcu: SRCU used to defer freeing of tags page_list to prevent
 *	use-after-free when iterating tags.
 * @update_nr_hwq_lock:
 *	Synchronize updating nr_hw_queues with add/del disk &
 *	switching elevator.
 */
struct blk_mq_tag_set {
	const struct blk_mq_ops *ops;
	struct blk_mq_queue_map map[HCTX_MAX_TYPES];
	unsigned int nr_maps;
	unsigned int nr_hw_queues;
	unsigned int queue_depth;
	unsigned int reserved_tags;
	unsigned int cmd_size;
	int numa_node;
	unsigned int timeout;
	unsigned int flags;
	void *driver_data;

	struct blk_mq_tags **tags;

	struct blk_mq_tags *shared_tags;

	struct mutex tag_list_lock;
	struct list_head tag_list;
	struct srcu_struct *srcu;
	struct srcu_struct tags_srcu;

	struct rw_semaphore update_nr_hwq_lock;
};
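
/*
 * Illustrative sketch: typical tag set initialization for a simple
 * single-queue driver, followed by disk allocation.  Names prefixed with my_
 * are hypothetical and error unwinding beyond the tag set is omitted.
 *
 *	struct blk_mq_tag_set *set = &my_dev->tag_set;
 *	struct gendisk *disk;
 *	int ret;
 *
 *	memset(set, 0, sizeof(*set));
 *	set->ops = &my_mq_ops;
 *	set->nr_hw_queues = 1;
 *	set->queue_depth = 128;
 *	set->numa_node = NUMA_NO_NODE;
 *	set->cmd_size = sizeof(struct my_cmd);
 *
 *	ret = blk_mq_alloc_tag_set(set);
 *	if (ret)
 *		return ret;
 *
 *	disk = blk_mq_alloc_disk(set, NULL, my_dev);
 *	if (IS_ERR(disk)) {
 *		blk_mq_free_tag_set(set);
 *		return PTR_ERR(disk);
 *	}
 */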

/**
 * struct blk_mq_queue_data - Data about a request inserted in a queue
 *
 * @rq:   Request pointer.
 * @last: If it is the last request in the queue.
 */
struct blk_mq_queue_data {
	struct request *rq;
	bool last;
};

typedef bool (busy_tag_iter_fn)(struct request *, void *);

/**
 * struct blk_mq_ops - Callback functions that implement block driver
 * behaviour.
 */
struct blk_mq_ops {
	/**
	 * @queue_rq: Queue a new request from block IO.
	 */
	blk_status_t (*queue_rq)(struct blk_mq_hw_ctx *,
				 const struct blk_mq_queue_data *);

	/**
	 * @commit_rqs: If a driver uses bd->last to judge when to submit
	 * requests to hardware, it must define this function. In case of errors
	 * that make us stop issuing further requests, this hook serves the
	 * purpose of kicking the hardware (which the last request otherwise
	 * would have done).
	 */
	void (*commit_rqs)(struct blk_mq_hw_ctx *);

	/**
	 * @queue_rqs: Queue a list of new requests. The driver is guaranteed
	 * that each request belongs to the same queue. If the driver doesn't
	 * empty the @rqlist completely, then the rest will be queued
	 * individually by the block layer upon return.
	 */
	void (*queue_rqs)(struct rq_list *rqlist);

	/**
	 * @get_budget: Reserve a budget before queueing a request. Once
	 * .queue_rq is run, it is the driver's responsibility to release the
	 * reserved budget. The failure case of .get_budget also has to be
	 * handled to avoid I/O deadlock.
	 */
	int (*get_budget)(struct request_queue *);

	/**
	 * @put_budget: Release the reserved budget.
	 */
	void (*put_budget)(struct request_queue *, int);

	/**
	 * @set_rq_budget_token: store rq's budget token
	 */
	void (*set_rq_budget_token)(struct request *, int);
	/**
	 * @get_rq_budget_token: retrieve rq's budget token
	 */
	int (*get_rq_budget_token)(struct request *);

	/**
	 * @timeout: Called on request timeout.
	 */
	enum blk_eh_timer_return (*timeout)(struct request *);

	/**
	 * @poll: Called to poll for completion of a specific tag.
	 */
	int (*poll)(struct blk_mq_hw_ctx *, struct io_comp_batch *);

	/**
	 * @complete: Mark the request as complete.
	 */
	void (*complete)(struct request *);

	/**
	 * @init_hctx: Called when the block layer side of a hardware queue has
	 * been set up, allowing the driver to allocate/init matching
	 * structures.
	 */
	int (*init_hctx)(struct blk_mq_hw_ctx *, void *, unsigned int);
	/**
	 * @exit_hctx: Ditto for exit/teardown.
	 */
	void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int);

	/**
	 * @init_request: Called for every command allocated by the block layer
	 * to allow the driver to set up driver specific data.
	 *
	 * Tags greater than or equal to queue_depth are used to set up the
	 * flush request.
	 */
	int (*init_request)(struct blk_mq_tag_set *set, struct request *,
			    unsigned int, unsigned int);
	/**
	 * @exit_request: Ditto for exit/teardown.
	 */
	void (*exit_request)(struct blk_mq_tag_set *set, struct request *,
			     unsigned int);

	/**
	 * @cleanup_rq: Called before freeing a request that has not completed
	 * yet, usually to free driver-private data.
	 */
	void (*cleanup_rq)(struct request *);

	/**
	 * @busy: If set, returns whether or not this queue currently is busy.
	 */
	bool (*busy)(struct request_queue *);

	/**
	 * @map_queues: This allows drivers to specify their own queue mapping
	 * by overriding the setup-time function that builds the mq_map.
	 */
	void (*map_queues)(struct blk_mq_tag_set *set);

#ifdef CONFIG_BLK_DEBUG_FS
	/**
	 * @show_rq: Used by the debugfs implementation to show driver-specific
	 * information about a request.
	 */
	void (*show_rq)(struct seq_file *m, struct request *rq);
#endif
};
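
/*
 * Illustrative sketch: a minimal ops table.  ->queue_rq() marks the request
 * as started and hands it to a hypothetical hardware submission helper,
 * my_hw_submit(); returning BLK_STS_RESOURCE tells the core to retry the
 * request later.
 *
 *	static blk_status_t my_queue_rq(struct blk_mq_hw_ctx *hctx,
 *					const struct blk_mq_queue_data *bd)
 *	{
 *		struct request *rq = bd->rq;
 *
 *		blk_mq_start_request(rq);
 *		if (!my_hw_submit(hctx->driver_data, rq))
 *			return BLK_STS_RESOURCE;
 *		return BLK_STS_OK;
 *	}
 *
 *	static const struct blk_mq_ops my_mq_ops = {
 *		.queue_rq	= my_queue_rq,
 *	};
 */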

/* Keep hctx_flag_name[] in sync with the definitions below */
enum {
	BLK_MQ_F_TAG_QUEUE_SHARED = 1 << 1,
	/*
	 * Set when this device requires an underlying blk-mq device for
	 * completing IO.
	 */
	BLK_MQ_F_STACKING = 1 << 2,
	BLK_MQ_F_TAG_HCTX_SHARED = 1 << 3,
	BLK_MQ_F_BLOCKING = 1 << 4,

	/*
	 * Allocate tags on a round-robin basis instead of the first available
	 * one.
	 */
	BLK_MQ_F_TAG_RR = 1 << 5,

	/*
	 * Select 'none' during queue registration in case of a single hwq
	 * or shared hwqs instead of 'mq-deadline'.
	 */
	BLK_MQ_F_NO_SCHED_BY_DEFAULT = 1 << 6,

	BLK_MQ_F_MAX = 1 << 7,
};

#define BLK_MQ_MAX_DEPTH (10240)
#define BLK_MQ_NO_HCTX_IDX (-1U)

enum {
	/* Keep hctx_state_name[] in sync with the definitions below */
	BLK_MQ_S_STOPPED,
	BLK_MQ_S_TAG_ACTIVE,
	BLK_MQ_S_SCHED_RESTART,
	/* hw queue is inactive after all its CPUs become offline */
	BLK_MQ_S_INACTIVE,
	BLK_MQ_S_MAX
};

struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set,
		struct queue_limits *lim, void *queuedata,
		struct lock_class_key *lkclass);
#define blk_mq_alloc_disk(set, lim, queuedata)			\
({								\
	static struct lock_class_key __key;			\
								\
	__blk_mq_alloc_disk(set, lim, queuedata, &__key);	\
})
struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q,
		struct lock_class_key *lkclass);
struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set,
		struct queue_limits *lim, void *queuedata);
int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
		struct request_queue *q);
void blk_mq_destroy_queue(struct request_queue *);

int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set);
int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set,
		const struct blk_mq_ops *ops, unsigned int queue_depth,
		unsigned int set_flags);
void blk_mq_free_tag_set(struct blk_mq_tag_set *set);

void blk_mq_free_request(struct request *rq);
int blk_rq_poll(struct request *rq, struct io_comp_batch *iob,
		unsigned int poll_flags);

bool blk_mq_queue_inflight(struct request_queue *q);

enum {
	/* return when out of requests */
	BLK_MQ_REQ_NOWAIT = (__force blk_mq_req_flags_t)(1 << 0),
	/* allocate from reserved pool */
	BLK_MQ_REQ_RESERVED = (__force blk_mq_req_flags_t)(1 << 1),
	/* set RQF_PM */
	BLK_MQ_REQ_PM = (__force blk_mq_req_flags_t)(1 << 2),
};

struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf,
		blk_mq_req_flags_t flags);
struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
		blk_opf_t opf, blk_mq_req_flags_t flags,
		unsigned int hctx_idx);
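
/*
 * Illustrative sketch: allocating a driver-private (passthrough) request,
 * executing it synchronously and freeing it again, using only functions
 * declared in this header.  "q" stands for the target request queue.
 *
 *	struct request *rq;
 *	blk_status_t status;
 *
 *	rq = blk_mq_alloc_request(q, REQ_OP_DRV_IN, 0);
 *	if (IS_ERR(rq))
 *		return PTR_ERR(rq);
 *
 *	rq->timeout = 30 * HZ;
 *	status = blk_execute_rq(rq, false);
 *	blk_mq_free_request(rq);
 */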

/*
 * Tag address space map.
 */
struct blk_mq_tags {
	unsigned int nr_tags;
	unsigned int nr_reserved_tags;
	unsigned int active_queues;

	struct sbitmap_queue bitmap_tags;
	struct sbitmap_queue breserved_tags;

	struct request **rqs;
	struct request **static_rqs;
	struct list_head page_list;

	/*
	 * used to clear request reference in rqs[] before freeing one
	 * request pool
	 */
	spinlock_t lock;
	struct rcu_head rcu_head;
};

static inline struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags,
					       unsigned int tag)
{
	if (tag < tags->nr_tags) {
		prefetch(tags->rqs[tag]);
		return tags->rqs[tag];
	}

	return NULL;
}

enum {
	BLK_MQ_UNIQUE_TAG_BITS = 16,
	BLK_MQ_UNIQUE_TAG_MASK = (1 << BLK_MQ_UNIQUE_TAG_BITS) - 1,
};

u32 blk_mq_unique_tag(struct request *rq);

static inline u16 blk_mq_unique_tag_to_hwq(u32 unique_tag)
{
	return unique_tag >> BLK_MQ_UNIQUE_TAG_BITS;
}

static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag)
{
	return unique_tag & BLK_MQ_UNIQUE_TAG_MASK;
}

/**
 * blk_mq_rq_state() - read the current MQ_RQ_* state of a request
 * @rq: target request.
 */
static inline enum mq_rq_state blk_mq_rq_state(struct request *rq)
{
	return READ_ONCE(rq->state);
}

static inline int blk_mq_request_started(struct request *rq)
{
	return blk_mq_rq_state(rq) != MQ_RQ_IDLE;
}

static inline int blk_mq_request_completed(struct request *rq)
{
	return blk_mq_rq_state(rq) == MQ_RQ_COMPLETE;
}

/*
 * Set the state to complete when completing a request from inside ->queue_rq.
 * This is used by drivers that want to ensure special complete actions that
 * need access to the request are called on failure, e.g. by nvme for
 * multipathing.
 */
static inline void blk_mq_set_request_complete(struct request *rq)
{
	WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
}

/*
 * Complete the request directly instead of deferring it to softirq or
 * completing it on another CPU. Useful in preemptible context instead of in
 * interrupt context.
 */
static inline void blk_mq_complete_request_direct(struct request *rq,
		void (*complete)(struct request *rq))
{
	WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
	complete(rq);
}

void blk_mq_start_request(struct request *rq);
void blk_mq_end_request(struct request *rq, blk_status_t error);
void __blk_mq_end_request(struct request *rq, blk_status_t error);
void blk_mq_end_request_batch(struct io_comp_batch *ib);

/*
 * Only need start/end time stamping if we have iostat or
 * blk stats enabled, or using an IO scheduler.
 */
static inline bool blk_mq_need_time_stamp(struct request *rq)
{
	return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS | RQF_USE_SCHED));
}

static inline bool blk_mq_is_reserved_rq(struct request *rq)
{
	return rq->rq_flags & RQF_RESV;
}
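
/*
 * Illustrative sketch: a completion path that maps a hardware completion
 * entry back to its request via the driver tags and lets the block layer
 * finish it.  my_handle_completion() is hypothetical;
 * blk_mq_complete_request() is declared further down.
 *
 *	static void my_handle_completion(struct blk_mq_hw_ctx *hctx, u32 tag)
 *	{
 *		struct request *rq = blk_mq_tag_to_rq(hctx->tags, tag);
 *
 *		if (rq)
 *			blk_mq_complete_request(rq);
 *	}
 */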

/**
 * blk_mq_add_to_batch() - add a request to the completion batch
 * @req: The request to add to the batch
 * @iob: The batch to add the request to
 * @is_error: Specify true if the request failed with an error
 * @complete: The completion handler for the request
 *
 * Batched completions only work when there is no I/O error and no special
 * ->end_io handler.
 *
 * Return: true when the request was added to the batch, otherwise false
 */
static inline bool blk_mq_add_to_batch(struct request *req,
				       struct io_comp_batch *iob, bool is_error,
				       void (*complete)(struct io_comp_batch *))
{
	/*
	 * Check various conditions that exclude batch processing:
	 * 1) No batch container
	 * 2) Has scheduler data attached
	 * 3) Not a passthrough request and end_io set
	 * 4) Not a passthrough request and failed with an error
	 */
	if (!iob)
		return false;
	if (req->rq_flags & RQF_SCHED_TAGS)
		return false;
	if (!blk_rq_is_passthrough(req)) {
		if (req->end_io)
			return false;
		if (is_error)
			return false;
	}

	if (!iob->complete)
		iob->complete = complete;
	else if (iob->complete != complete)
		return false;
	iob->need_ts |= blk_mq_need_time_stamp(req);
	rq_list_add_tail(&iob->req_list, req);
	return true;
}

void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list);
void blk_mq_kick_requeue_list(struct request_queue *q);
void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs);
void blk_mq_complete_request(struct request *rq);
bool blk_mq_complete_request_remote(struct request *rq);
void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
void blk_mq_stop_hw_queues(struct request_queue *q);
void blk_mq_start_hw_queues(struct request_queue *q);
void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
void blk_mq_quiesce_queue(struct request_queue *q);
void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set);
void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set);
void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set);
void blk_mq_unquiesce_queue(struct request_queue *q);
void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_run_hw_queues(struct request_queue *q, bool async);
void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs);
void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
		busy_tag_iter_fn *fn, void *priv);
void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset);
void blk_mq_freeze_queue_nomemsave(struct request_queue *q);
void blk_mq_unfreeze_queue_nomemrestore(struct request_queue *q);
static inline unsigned int __must_check
blk_mq_freeze_queue(struct request_queue *q)
{
	unsigned int memflags = memalloc_noio_save();

	blk_mq_freeze_queue_nomemsave(q);
	return memflags;
}
static inline void
blk_mq_unfreeze_queue(struct request_queue *q, unsigned int memflags)
{
	blk_mq_unfreeze_queue_nomemrestore(q);
	memalloc_noio_restore(memflags);
}
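
/*
 * Illustrative sketch: blk_mq_freeze_queue() and blk_mq_unfreeze_queue() must
 * be balanced, and the memalloc flags returned by the freeze have to be
 * handed back on unfreeze.  my_update_queue_settings() is hypothetical.
 *
 *	unsigned int memflags;
 *
 *	memflags = blk_mq_freeze_queue(q);
 *	my_update_queue_settings(q);
 *	blk_mq_unfreeze_queue(q, memflags);
 */
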
void blk_freeze_queue_start(struct request_queue *q);
void blk_mq_freeze_queue_wait(struct request_queue *q);
int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
				     unsigned long timeout);
void blk_mq_unfreeze_queue_non_owner(struct request_queue *q);
void blk_freeze_queue_start_non_owner(struct request_queue *q);

unsigned int blk_mq_num_possible_queues(unsigned int max_queues);
unsigned int blk_mq_num_online_queues(unsigned int max_queues);
void blk_mq_map_queues(struct blk_mq_queue_map *qmap);
void blk_mq_map_hw_queues(struct blk_mq_queue_map *qmap,
			  struct device *dev, unsigned int offset);
void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);

void blk_mq_quiesce_queue_nowait(struct request_queue *q);

unsigned int blk_mq_rq_cpu(struct request *rq);

bool __blk_should_fake_timeout(struct request_queue *q);
static inline bool blk_should_fake_timeout(struct request_queue *q)
{
	if (IS_ENABLED(CONFIG_FAIL_IO_TIMEOUT) &&
	    test_bit(QUEUE_FLAG_FAIL_IO, &q->queue_flags))
		return __blk_should_fake_timeout(q);
	return false;
}

/**
 * blk_mq_rq_from_pdu - cast a PDU to a request
 * @pdu: the PDU (Protocol Data Unit) to be cast
 *
 * Return: request
 *
 * Driver command data is immediately after the request. So subtract request
 * size to get back to the original request.
 */
static inline struct request *blk_mq_rq_from_pdu(void *pdu)
{
	return pdu - sizeof(struct request);
}

/**
 * blk_mq_rq_to_pdu - cast a request to a PDU
 * @rq: the request to be cast
 *
 * Return: pointer to the PDU
 *
 * Driver command data is immediately after the request. So add request to get
 * the PDU.
 */
static inline void *blk_mq_rq_to_pdu(struct request *rq)
{
	return rq + 1;
}
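
/*
 * Illustrative sketch: with blk_mq_tag_set.cmd_size set to
 * sizeof(struct my_cmd), the per-request driver data lives directly behind
 * struct request and is reached with the two helpers above.  struct my_cmd
 * and my_use_pdu() are hypothetical.
 *
 *	struct my_cmd {
 *		int status;
 *	};
 *
 *	static void my_use_pdu(struct request *rq)
 *	{
 *		struct my_cmd *cmd = blk_mq_rq_to_pdu(rq);
 *
 *		cmd->status = 0;
 *		WARN_ON(blk_mq_rq_from_pdu(cmd) != rq);
 *	}
 */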

static inline struct blk_mq_hw_ctx *queue_hctx(struct request_queue *q, int id)
{
	struct blk_mq_hw_ctx *hctx;

	rcu_read_lock();
	hctx = rcu_dereference(q->queue_hw_ctx)[id];
	rcu_read_unlock();

	return hctx;
}

#define queue_for_each_hw_ctx(q, hctx, i)				\
	for ((i) = 0; (i) < (q)->nr_hw_queues &&			\
	     ({ hctx = queue_hctx((q), i); 1; }); (i)++)

#define hctx_for_each_ctx(hctx, ctx, i)					\
	for ((i) = 0; (i) < (hctx)->nr_ctx &&				\
	     ({ ctx = (hctx)->ctxs[(i)]; 1; }); (i)++)

static inline void blk_mq_cleanup_rq(struct request *rq)
{
	if (rq->q->mq_ops->cleanup_rq)
		rq->q->mq_ops->cleanup_rq(rq);
}

void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx,
		struct lock_class_key *key);

static inline bool rq_is_sync(struct request *rq)
{
	return op_is_sync(rq->cmd_flags);
}

void blk_rq_init(struct request_queue *q, struct request *rq);
int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
		struct bio_set *bs, gfp_t gfp_mask,
		int (*bio_ctr)(struct bio *, struct bio *, void *), void *data);
void blk_rq_unprep_clone(struct request *rq);
blk_status_t blk_insert_cloned_request(struct request *rq);

struct rq_map_data {
	struct page **pages;
	unsigned long offset;
	unsigned short page_order;
	unsigned short nr_entries;
	bool null_mapped;
	bool from_user;
};

int blk_rq_map_user(struct request_queue *, struct request *,
		struct rq_map_data *, void __user *, unsigned long, gfp_t);
int blk_rq_map_user_io(struct request *, struct rq_map_data *,
		void __user *, unsigned long, gfp_t, bool, int, bool, int);
int blk_rq_map_user_iov(struct request_queue *, struct request *,
		struct rq_map_data *, const struct iov_iter *, gfp_t);
int blk_rq_unmap_user(struct bio *);
int blk_rq_map_kern(struct request *rq, void *kbuf, unsigned int len,
		gfp_t gfp);
int blk_rq_append_bio(struct request *rq, struct bio *bio);
void blk_execute_rq_nowait(struct request *rq, bool at_head);
blk_status_t blk_execute_rq(struct request *rq, bool at_head);
bool blk_rq_is_poll(struct request *rq);

struct req_iterator {
	struct bvec_iter iter;
	struct bio *bio;
};

#define __rq_for_each_bio(_bio, rq)	\
	if ((rq->bio))			\
		for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next)

#define rq_for_each_segment(bvl, _rq, _iter)			\
	__rq_for_each_bio(_iter.bio, _rq)			\
		bio_for_each_segment(bvl, _iter.bio, _iter.iter)

#define rq_for_each_bvec(bvl, _rq, _iter)			\
	__rq_for_each_bio(_iter.bio, _rq)			\
		bio_for_each_bvec(bvl, _iter.bio, _iter.iter)

#define rq_iter_last(bvec, _iter)				\
		(_iter.bio->bi_next == NULL &&			\
		 bio_iter_last(bvec, _iter.iter))

/*
 * blk_rq_pos()			: the current sector
 * blk_rq_bytes()		: bytes left in the entire request
 * blk_rq_cur_bytes()		: bytes left in the current segment
 * blk_rq_sectors()		: sectors left in the entire request
 * blk_rq_cur_sectors()		: sectors left in the current segment
 * blk_rq_stats_sectors()	: sectors of the entire request used for stats
 */
static inline sector_t blk_rq_pos(const struct request *rq)
{
	return rq->__sector;
}

static inline unsigned int blk_rq_bytes(const struct request *rq)
{
	return rq->__data_len;
}

static inline int blk_rq_cur_bytes(const struct request *rq)
{
	if (!rq->bio)
		return 0;
	if (!bio_has_data(rq->bio))	/* dataless requests such as discard */
		return rq->bio->bi_iter.bi_size;
	return bio_iovec(rq->bio).bv_len;
}

static inline unsigned int blk_rq_sectors(const struct request *rq)
{
	return blk_rq_bytes(rq) >> SECTOR_SHIFT;
}

static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
{
	return blk_rq_cur_bytes(rq) >> SECTOR_SHIFT;
}

static inline unsigned int blk_rq_stats_sectors(const struct request *rq)
{
	return rq->stats_sectors;
}

/*
 * Some commands like WRITE SAME have a payload or data transfer size which
 * is different from the size of the request. Any driver that supports such
 * commands using the RQF_SPECIAL_PAYLOAD flag needs to use this helper to
 * calculate the data transfer size.
 */
static inline unsigned int blk_rq_payload_bytes(struct request *rq)
{
	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
		return rq->special_vec.bv_len;
	return blk_rq_bytes(rq);
}

/*
 * Return the first full biovec in the request. The caller needs to check that
 * there is at least one bvec before calling this helper.
 */
static inline struct bio_vec req_bvec(struct request *rq)
{
	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
		return rq->special_vec;
	return mp_bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter);
}

static inline unsigned int blk_rq_count_bios(struct request *rq)
{
	unsigned int nr_bios = 0;
	struct bio *bio;

	__rq_for_each_bio(bio, rq)
		nr_bios++;

	return nr_bios;
}

void blk_steal_bios(struct bio_list *list, struct request *rq);

/*
 * Request completion related functions.
 *
 * blk_update_request() completes given number of bytes and updates
 * the request without completing it.
 */
bool blk_update_request(struct request *rq, blk_status_t error,
			unsigned int nr_bytes);
void blk_abort_request(struct request *);
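
/*
 * Illustrative sketch: a driver that finishes a request in chunks calls
 * blk_update_request() for each chunk and only ends the request once no data
 * is left.  my_complete_bytes() is hypothetical; __blk_mq_end_request() is
 * declared earlier in this header.
 *
 *	static void my_complete_bytes(struct request *rq, blk_status_t error,
 *				      unsigned int nr_bytes)
 *	{
 *		if (!blk_update_request(rq, error, nr_bytes))
 *			__blk_mq_end_request(rq, error);
 *	}
 */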

/*
 * Number of physical segments as sent to the device.
 *
 * Normally this is the number of discontiguous data segments sent by the
 * submitter. But for data-less commands like discard we might have no
 * actual data segments submitted, but the driver might have to add its
 * own special payload. In that case we still return 1 here so that this
 * special payload will be mapped.
 */
static inline unsigned short blk_rq_nr_phys_segments(struct request *rq)
{
	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
		return 1;
	return rq->nr_phys_segments;
}

/*
 * Number of discard segments (or ranges) the driver needs to fill in.
 * Each discard bio merged into a request is counted as one segment.
 */
static inline unsigned short blk_rq_nr_discard_segments(struct request *rq)
{
	return max_t(unsigned short, rq->nr_phys_segments, 1);
}

/**
 * blk_rq_nr_bvec - return number of bvecs in a request
 * @rq: request to calculate bvecs for
 *
 * Returns the number of bvecs.
 */
static inline unsigned int blk_rq_nr_bvec(struct request *rq)
{
	struct req_iterator rq_iter;
	struct bio_vec bv;
	unsigned int nr_bvec = 0;

	rq_for_each_bvec(bv, rq, rq_iter)
		nr_bvec++;

	return nr_bvec;
}

int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist,
		struct scatterlist **last_sg);
static inline int blk_rq_map_sg(struct request *rq, struct scatterlist *sglist)
{
	struct scatterlist *last_sg = NULL;

	return __blk_rq_map_sg(rq, sglist, &last_sg);
}
void blk_dump_rq_flags(struct request *, char *);

#endif /* BLK_MQ_H */