/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BLK_MQ_H
#define BLK_MQ_H

#include <linux/blkdev.h>
#include <linux/sbitmap.h>
#include <linux/lockdep.h>
#include <linux/scatterlist.h>
#include <linux/prefetch.h>
#include <linux/srcu.h>
#include <linux/rw_hint.h>
#include <linux/rwsem.h>

struct blk_mq_tags;
struct blk_flush_queue;

#define BLKDEV_MIN_RQ		4
#define BLKDEV_DEFAULT_RQ	128

enum rq_end_io_ret {
	RQ_END_IO_NONE,
	RQ_END_IO_FREE,
};

typedef enum rq_end_io_ret (rq_end_io_fn)(struct request *, blk_status_t);

/* request flags */
typedef __u32 __bitwise req_flags_t;

/* Keep rqf_name[] in sync with the definitions below */
enum rqf_flags {
	/* drive already may have started this one */
	__RQF_STARTED,
	/* request for flush sequence */
	__RQF_FLUSH_SEQ,
	/* merge of different types, fail separately */
	__RQF_MIXED_MERGE,
	/* don't call prep for this one */
	__RQF_DONTPREP,
	/* use hctx->sched_tags */
	__RQF_SCHED_TAGS,
	/* use an I/O scheduler for this request */
	__RQF_USE_SCHED,
	/* vaguely specified driver internal error. Ignored by block layer */
	__RQF_FAILED,
	/* don't warn about errors */
	__RQF_QUIET,
	/* account into disk and partition IO statistics */
	__RQF_IO_STAT,
	/* runtime pm request */
	__RQF_PM,
	/* on IO scheduler merge hash */
	__RQF_HASHED,
	/* track IO completion time */
	__RQF_STATS,
	/*
	 * Look at ->special_vec for the actual data payload instead of the
	 * bio chain.
	 */
	__RQF_SPECIAL_PAYLOAD,
	/* request completion needs to be signaled to zone write plugging. */
	__RQF_ZONE_WRITE_PLUGGING,
	/* ->timeout has been called, don't expire again */
	__RQF_TIMED_OUT,
	__RQF_RESV,
	__RQF_BITS
};

#define RQF_STARTED		((__force req_flags_t)(1 << __RQF_STARTED))
#define RQF_FLUSH_SEQ		((__force req_flags_t)(1 << __RQF_FLUSH_SEQ))
#define RQF_MIXED_MERGE		((__force req_flags_t)(1 << __RQF_MIXED_MERGE))
#define RQF_DONTPREP		((__force req_flags_t)(1 << __RQF_DONTPREP))
#define RQF_SCHED_TAGS		((__force req_flags_t)(1 << __RQF_SCHED_TAGS))
#define RQF_USE_SCHED		((__force req_flags_t)(1 << __RQF_USE_SCHED))
#define RQF_FAILED		((__force req_flags_t)(1 << __RQF_FAILED))
#define RQF_QUIET		((__force req_flags_t)(1 << __RQF_QUIET))
#define RQF_IO_STAT		((__force req_flags_t)(1 << __RQF_IO_STAT))
#define RQF_PM			((__force req_flags_t)(1 << __RQF_PM))
#define RQF_HASHED		((__force req_flags_t)(1 << __RQF_HASHED))
#define RQF_STATS		((__force req_flags_t)(1 << __RQF_STATS))
#define RQF_SPECIAL_PAYLOAD	\
	((__force req_flags_t)(1 << __RQF_SPECIAL_PAYLOAD))
#define RQF_ZONE_WRITE_PLUGGING	\
	((__force req_flags_t)(1 << __RQF_ZONE_WRITE_PLUGGING))
#define RQF_TIMED_OUT		((__force req_flags_t)(1 << __RQF_TIMED_OUT))
#define RQF_RESV		((__force req_flags_t)(1 << __RQF_RESV))

/* flags that prevent us from merging requests: */
#define RQF_NOMERGE_FLAGS \
	(RQF_STARTED | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD)

enum mq_rq_state {
	MQ_RQ_IDLE		= 0,
	MQ_RQ_IN_FLIGHT		= 1,
	MQ_RQ_COMPLETE		= 2,
};
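
/*
 * Illustrative sketch (not part of this header): a driver that does one-time
 * per-request preparation can use RQF_DONTPREP so a requeued request is not
 * prepared twice. my_drv_prep() is a hypothetical driver helper.
 *
 *	if (!(rq->rq_flags & RQF_DONTPREP)) {
 *		if (my_drv_prep(rq))
 *			return BLK_STS_RESOURCE;
 *		rq->rq_flags |= RQF_DONTPREP;
 *	}
 *
 * The driver clears the flag again before the preparation has to be redone.
 */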

/*
 * Try to put the fields that are referenced together in the same cacheline.
 *
 * If you modify this structure, make sure to update blk_rq_init() and
 * especially blk_mq_rq_ctx_init() to take care of the added fields.
 */
struct request {
	struct request_queue *q;
	struct blk_mq_ctx *mq_ctx;
	struct blk_mq_hw_ctx *mq_hctx;

	blk_opf_t cmd_flags;		/* op and common flags */
	req_flags_t rq_flags;

	int tag;
	int internal_tag;

	unsigned int timeout;

	/* the following two fields are internal, NEVER access directly */
	unsigned int __data_len;	/* total data len */
	sector_t __sector;		/* sector cursor */

	struct bio *bio;
	struct bio *biotail;

	union {
		struct list_head queuelist;
		struct request *rq_next;
	};

	struct block_device *part;
#ifdef CONFIG_BLK_RQ_ALLOC_TIME
	/* Time that the first bio started allocating this request. */
	u64 alloc_time_ns;
#endif
	/* Time that this request was allocated for this IO. */
	u64 start_time_ns;
	/* Time that I/O was submitted to the device. */
	u64 io_start_time_ns;

#ifdef CONFIG_BLK_WBT
	unsigned short wbt_flags;
#endif
	/*
	 * rq sectors used for blk stats. It has the same value as
	 * blk_rq_sectors(rq), except that it is never zeroed by completion.
	 */
	unsigned short stats_sectors;

	/*
	 * Number of scatter-gather DMA addr+len pairs after
	 * physical address coalescing is performed.
	 */
	unsigned short nr_phys_segments;
	unsigned short nr_integrity_segments;

#ifdef CONFIG_BLK_INLINE_ENCRYPTION
	struct bio_crypt_ctx *crypt_ctx;
	struct blk_crypto_keyslot *crypt_keyslot;
#endif

	enum mq_rq_state state;
	atomic_t ref;

	unsigned long deadline;

	/*
	 * The hash is used inside the scheduler, and killed once the
	 * request reaches the dispatch list. The ipi_list is only used
	 * to queue the request for softirq completion, which is long
	 * after the request has been unhashed (and even removed from
	 * the dispatch list).
	 */
	union {
		struct hlist_node hash;	/* merge hash */
		struct llist_node ipi_list;
	};

	/*
	 * The rb_node is only used inside the io scheduler, requests
	 * are pruned when moved to the dispatch queue. special_vec must
	 * only be used if RQF_SPECIAL_PAYLOAD is set, and those cannot be
	 * inserted into an IO scheduler.
	 */
	union {
		struct rb_node rb_node;	/* sort/lookup */
		struct bio_vec special_vec;
	};

	/*
	 * Three pointers are available for the IO schedulers, if they need
	 * more they have to dynamically allocate it.
	 */
	struct {
		struct io_cq		*icq;
		void			*priv[2];
	} elv;

	struct {
		unsigned int		seq;
		rq_end_io_fn		*saved_end_io;
	} flush;

	u64 fifo_time;

	/*
	 * completion callback.
	 */
	rq_end_io_fn *end_io;
	void *end_io_data;
};

static inline enum req_op req_op(const struct request *req)
{
	return req->cmd_flags & REQ_OP_MASK;
}

static inline bool blk_rq_is_passthrough(struct request *rq)
{
	return blk_op_is_passthrough(rq->cmd_flags);
}

static inline unsigned short req_get_ioprio(struct request *req)
{
	if (req->bio)
		return req->bio->bi_ioprio;
	return 0;
}

#define rq_data_dir(rq)		(op_is_write(req_op(rq)) ? WRITE : READ)

#define rq_dma_dir(rq) \
	(op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE)
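
/*
 * Illustrative sketch (not part of this header): rq_dma_dir() yields the DMA
 * direction matching the request's data direction, e.g. when mapping a
 * scatterlist. my_dev, sgl and nents are hypothetical driver state.
 *
 *	if (req_op(rq) == REQ_OP_FLUSH)
 *		return my_dev_issue_flush(my_dev);
 *	dma_map_sg(my_dev->dma_dev, sgl, nents, rq_dma_dir(rq));
 */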

static inline int rq_list_empty(const struct rq_list *rl)
{
	return rl->head == NULL;
}

static inline void rq_list_init(struct rq_list *rl)
{
	rl->head = NULL;
	rl->tail = NULL;
}

static inline void rq_list_add_tail(struct rq_list *rl, struct request *rq)
{
	rq->rq_next = NULL;
	if (rl->tail)
		rl->tail->rq_next = rq;
	else
		rl->head = rq;
	rl->tail = rq;
}

static inline void rq_list_add_head(struct rq_list *rl, struct request *rq)
{
	rq->rq_next = rl->head;
	rl->head = rq;
	if (!rl->tail)
		rl->tail = rq;
}

static inline struct request *rq_list_pop(struct rq_list *rl)
{
	struct request *rq = rl->head;

	if (rq) {
		rl->head = rl->head->rq_next;
		if (!rl->head)
			rl->tail = NULL;
		rq->rq_next = NULL;
	}

	return rq;
}

static inline struct request *rq_list_peek(struct rq_list *rl)
{
	return rl->head;
}

#define rq_list_for_each(rl, pos)					\
	for (pos = rq_list_peek((rl)); (pos); pos = pos->rq_next)

#define rq_list_for_each_safe(rl, pos, nxt)				\
	for (pos = rq_list_peek((rl)), nxt = pos->rq_next;		\
		pos; pos = nxt, nxt = pos ? pos->rq_next : NULL)

/**
 * enum blk_eh_timer_return - How the timeout handler should proceed
 * @BLK_EH_DONE: The block driver completed the command or will complete it at
 *	a later time.
 * @BLK_EH_RESET_TIMER: Reset the request timer and continue waiting for the
 *	request to complete.
 */
enum blk_eh_timer_return {
	BLK_EH_DONE,
	BLK_EH_RESET_TIMER,
};
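
/*
 * Illustrative sketch (not part of this header): draining an rq_list, e.g.
 * from a ->queue_rqs() style path. my_drv_queue_one() is hypothetical.
 *
 *	struct request *rq;
 *
 *	while ((rq = rq_list_pop(rqlist)))
 *		my_drv_queue_one(rq);
 *
 * rq_list_for_each(rqlist, rq) can be used instead when the requests should
 * stay on the list while being inspected.
 */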

/**
 * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware
 * block device
 */
struct blk_mq_hw_ctx {
	struct {
		/** @lock: Protects the dispatch list. */
		spinlock_t		lock;
		/**
		 * @dispatch: Used for requests that are ready to be
		 * dispatched to the hardware but for some reason (e.g. lack of
		 * resources) could not be sent to the hardware. As soon as the
		 * driver can send new requests, requests on this list will
		 * be sent first for a fairer dispatch.
		 */
		struct list_head	dispatch;
		/**
		 * @state: BLK_MQ_S_* flags. Defines the state of the hw
		 * queue (active, scheduled to restart, stopped).
		 */
		unsigned long		state;
	} ____cacheline_aligned_in_smp;

	/**
	 * @run_work: Used for scheduling a hardware queue run at a later time.
	 */
	struct delayed_work	run_work;
	/** @cpumask: Map of available CPUs where this hctx can run. */
	cpumask_var_t		cpumask;
	/**
	 * @next_cpu: Used by blk_mq_hctx_next_cpu() for round-robin CPU
	 * selection from @cpumask.
	 */
	int			next_cpu;
	/**
	 * @next_cpu_batch: Counter of how many runs are left in the batch
	 * before changing to the next CPU.
	 */
	int			next_cpu_batch;

	/** @flags: BLK_MQ_F_* flags. Defines the behaviour of the queue. */
	unsigned long		flags;

	/**
	 * @sched_data: Pointer owned by the IO scheduler attached to a request
	 * queue. It's up to the IO scheduler how to use this pointer.
	 */
	void			*sched_data;
	/**
	 * @queue: Pointer to the request queue that owns this hardware context.
	 */
	struct request_queue	*queue;
	/** @fq: Queue of requests that need to perform a flush operation. */
	struct blk_flush_queue	*fq;

	/**
	 * @driver_data: Pointer to data owned by the block driver that created
	 * this hctx
	 */
	void			*driver_data;

	/**
	 * @ctx_map: Bitmap for each software queue. If bit is on, there is a
	 * pending request in that software queue.
	 */
	struct sbitmap		ctx_map;

	/**
	 * @dispatch_from: Software queue to be used when no scheduler was
	 * selected.
	 */
	struct blk_mq_ctx	*dispatch_from;
	/**
	 * @dispatch_busy: Number used by blk_mq_update_dispatch_busy() to
	 * decide if the hw_queue is busy using Exponential Weighted Moving
	 * Average algorithm.
	 */
	unsigned int		dispatch_busy;

	/** @type: HCTX_TYPE_* flags. Type of hardware queue. */
	unsigned short		type;
	/** @nr_ctx: Number of software queues. */
	unsigned short		nr_ctx;
	/** @ctxs: Array of software queues. */
	struct blk_mq_ctx	**ctxs;

	/** @dispatch_wait_lock: Lock for dispatch_wait queue. */
	spinlock_t		dispatch_wait_lock;
	/**
	 * @dispatch_wait: Waitqueue to put requests when there is no tag
	 * available at the moment, to wait for another try in the future.
	 */
	wait_queue_entry_t	dispatch_wait;

	/**
	 * @wait_index: Index of next available dispatch_wait queue to insert
	 * requests.
	 */
	atomic_t		wait_index;

	/**
	 * @tags: Tags owned by the block driver. A tag in this set is only
	 * assigned when a request is dispatched from a hardware queue.
	 */
	struct blk_mq_tags	*tags;
	/**
	 * @sched_tags: Tags owned by I/O scheduler. If there is an I/O
	 * scheduler associated with a request queue, a tag is assigned when
	 * that request is allocated. Else, this member is not used.
	 */
	struct blk_mq_tags	*sched_tags;

	/** @numa_node: NUMA node the storage adapter has been connected to. */
	unsigned int		numa_node;
	/** @queue_num: Index of this hardware queue. */
	unsigned int		queue_num;

	/**
	 * @nr_active: Number of active requests. Only used when a tag set is
	 * shared across request queues.
	 */
	atomic_t		nr_active;

	/** @cpuhp_online: List to store requests if a CPU is going to die. */
	struct hlist_node	cpuhp_online;
	/** @cpuhp_dead: List to store requests if some CPU dies. */
	struct hlist_node	cpuhp_dead;
	/** @kobj: Kernel object for sysfs. */
	struct kobject		kobj;

#ifdef CONFIG_BLK_DEBUG_FS
	/**
	 * @debugfs_dir: debugfs directory for this hardware queue. Named
	 * as cpu<cpu_number>.
	 */
	struct dentry		*debugfs_dir;
	/** @sched_debugfs_dir: debugfs directory for the scheduler. */
	struct dentry		*sched_debugfs_dir;
#endif

	/**
	 * @hctx_list: if this hctx is not in use, this is an entry in
	 * q->unused_hctx_list.
	 */
	struct list_head	hctx_list;
};
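
/*
 * Illustrative sketch (not part of this header): drivers commonly hang their
 * per-hardware-queue state off hctx->driver_data from ->init_hctx() and read
 * it back in the fast path. struct my_driver and its hw_queues[] array are
 * hypothetical.
 *
 *	static int my_init_hctx(struct blk_mq_hw_ctx *hctx, void *drv_data,
 *				unsigned int hctx_idx)
 *	{
 *		struct my_driver *drv = drv_data;
 *
 *		hctx->driver_data = &drv->hw_queues[hctx_idx];
 *		return 0;
 *	}
 */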

/**
 * struct blk_mq_queue_map - Map software queues to hardware queues
 * @mq_map:       CPU ID to hardware queue index map. This is an array
 *	with nr_cpu_ids elements. Each element has a value in the range
 *	[@queue_offset, @queue_offset + @nr_queues).
 * @nr_queues:    Number of hardware queues to map CPU IDs onto.
 * @queue_offset: First hardware queue to map onto. Used by the PCIe NVMe
 *	driver to map each hardware queue type (enum hctx_type) onto a distinct
 *	set of hardware queues.
 */
struct blk_mq_queue_map {
	unsigned int *mq_map;
	unsigned int nr_queues;
	unsigned int queue_offset;
};

/**
 * enum hctx_type - Type of hardware queue
 * @HCTX_TYPE_DEFAULT:	All I/O not otherwise accounted for.
 * @HCTX_TYPE_READ:	Just for READ I/O.
 * @HCTX_TYPE_POLL:	Polled I/O of any kind.
 * @HCTX_MAX_TYPES:	Number of types of hctx.
 */
enum hctx_type {
	HCTX_TYPE_DEFAULT,
	HCTX_TYPE_READ,
	HCTX_TYPE_POLL,

	HCTX_MAX_TYPES,
};

/**
 * struct blk_mq_tag_set - tag set that can be shared between request queues
 * @ops:	   Pointers to functions that implement block driver behavior.
 * @map:	   One or more ctx -> hctx mappings. One map exists for each
 *		   hardware queue type (enum hctx_type) that the driver wishes
 *		   to support. There are no restrictions on maps being of the
 *		   same size, and it's perfectly legal to share maps between
 *		   types.
 * @nr_maps:	   Number of elements in the @map array. A number in the range
 *		   [1, HCTX_MAX_TYPES].
 * @nr_hw_queues:  Number of hardware queues supported by the block driver that
 *		   owns this data structure.
 * @queue_depth:   Number of tags per hardware queue, reserved tags included.
 * @reserved_tags: Number of tags to set aside for BLK_MQ_REQ_RESERVED tag
 *		   allocations.
 * @cmd_size:	   Number of additional bytes to allocate per request. The block
 *		   driver owns these additional bytes.
 * @numa_node:	   NUMA node the storage adapter has been connected to.
 * @timeout:	   Request processing timeout in jiffies.
 * @flags:	   Zero or more BLK_MQ_F_* flags.
 * @driver_data:   Pointer to data owned by the block driver that created this
 *		   tag set.
 * @tags:	   Tag sets. One tag set per hardware queue. Has @nr_hw_queues
 *		   elements.
 * @shared_tags:
 *		   Shared set of tags. Has @nr_hw_queues elements. If set,
 *		   shared by all @tags.
 * @tag_list_lock: Serializes tag_list accesses.
 * @tag_list:	   List of the request queues that use this tag set. See also
 *		   request_queue.tag_set_list.
 * @srcu:	   Use as lock when type of the request queue is blocking
 *		   (BLK_MQ_F_BLOCKING).
 * @tags_srcu:	   SRCU used to defer freeing of tags page_list to prevent
 *		   use-after-free when iterating tags.
 * @update_nr_hwq_lock:
 *		   Synchronize updating nr_hw_queues with add/del disk &
 *		   switching elevator.
 */
struct blk_mq_tag_set {
	const struct blk_mq_ops	*ops;
	struct blk_mq_queue_map	map[HCTX_MAX_TYPES];
	unsigned int		nr_maps;
	unsigned int		nr_hw_queues;
	unsigned int		queue_depth;
	unsigned int		reserved_tags;
	unsigned int		cmd_size;
	int			numa_node;
	unsigned int		timeout;
	unsigned int		flags;
	void			*driver_data;

	struct blk_mq_tags	**tags;

	struct blk_mq_tags	*shared_tags;

	struct mutex		tag_list_lock;
	struct list_head	tag_list;
	struct srcu_struct	*srcu;
	struct srcu_struct	tags_srcu;

	struct rw_semaphore	update_nr_hwq_lock;
};

/**
 * struct blk_mq_queue_data - Data about a request inserted in a queue
 *
 * @rq: Request pointer.
 * @last: If it is the last request in the queue.
 */
struct blk_mq_queue_data {
	struct request *rq;
	bool last;
};

typedef bool (busy_tag_iter_fn)(struct request *, void *);
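
/*
 * Illustrative sketch (not part of this header): a minimal single-map tag set
 * as a driver might fill it in before calling blk_mq_alloc_tag_set() (declared
 * further down). my_mq_ops and struct my_cmd are hypothetical.
 *
 *	struct blk_mq_tag_set set = {
 *		.ops		= &my_mq_ops,
 *		.nr_hw_queues	= 1,
 *		.queue_depth	= 64,
 *		.cmd_size	= sizeof(struct my_cmd),
 *		.numa_node	= NUMA_NO_NODE,
 *	};
 */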

/**
 * struct blk_mq_ops - Callback functions that implement block driver
 * behaviour.
 */
struct blk_mq_ops {
	/**
	 * @queue_rq: Queue a new request from block IO.
	 */
	blk_status_t (*queue_rq)(struct blk_mq_hw_ctx *,
				 const struct blk_mq_queue_data *);

	/**
	 * @commit_rqs: If a driver uses bd->last to judge when to submit
	 * requests to hardware, it must define this function. In case of errors
	 * that make us stop issuing further requests, this hook serves the
	 * purpose of kicking the hardware (which the last request otherwise
	 * would have done).
	 */
	void (*commit_rqs)(struct blk_mq_hw_ctx *);

	/**
	 * @queue_rqs: Queue a list of new requests. Driver is guaranteed
	 * that each request belongs to the same queue. If the driver doesn't
	 * empty the @rqlist completely, then the rest will be queued
	 * individually by the block layer upon return.
	 */
	void (*queue_rqs)(struct rq_list *rqlist);

	/**
	 * @get_budget: Reserve budget before queueing a request; once
	 * .queue_rq is run, it is the driver's responsibility to release the
	 * reserved budget. The failure case of .get_budget also has to be
	 * handled to avoid I/O deadlock.
	 */
	int (*get_budget)(struct request_queue *);

	/**
	 * @put_budget: Release the reserved budget.
	 */
	void (*put_budget)(struct request_queue *, int);

	/**
	 * @set_rq_budget_token: store rq's budget token
	 */
	void (*set_rq_budget_token)(struct request *, int);
	/**
	 * @get_rq_budget_token: retrieve rq's budget token
	 */
	int (*get_rq_budget_token)(struct request *);

	/**
	 * @timeout: Called on request timeout.
	 */
	enum blk_eh_timer_return (*timeout)(struct request *);

	/**
	 * @poll: Called to poll for completion of a specific tag.
	 */
	int (*poll)(struct blk_mq_hw_ctx *, struct io_comp_batch *);

	/**
	 * @complete: Mark the request as complete.
	 */
	void (*complete)(struct request *);

	/**
	 * @init_hctx: Called when the block layer side of a hardware queue has
	 * been set up, allowing the driver to allocate/init matching
	 * structures.
	 */
	int (*init_hctx)(struct blk_mq_hw_ctx *, void *, unsigned int);
	/**
	 * @exit_hctx: Ditto for exit/teardown.
	 */
	void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int);

	/**
	 * @init_request: Called for every command allocated by the block layer
	 * to allow the driver to set up driver specific data.
	 *
	 * Tag greater than or equal to queue_depth is for setting up
	 * flush request.
	 */
	int (*init_request)(struct blk_mq_tag_set *set, struct request *,
			    unsigned int, unsigned int);
	/**
	 * @exit_request: Ditto for exit/teardown.
	 */
	void (*exit_request)(struct blk_mq_tag_set *set, struct request *,
			     unsigned int);

	/**
	 * @cleanup_rq: Called before freeing one request which isn't completed
	 * yet, and usually for freeing the driver private data.
	 */
	void (*cleanup_rq)(struct request *);

	/**
	 * @busy: If set, returns whether or not this queue currently is busy.
	 */
	bool (*busy)(struct request_queue *);

	/**
	 * @map_queues: This allows drivers to specify their own queue mapping
	 * by overriding the setup-time function that builds the mq_map.
	 */
	void (*map_queues)(struct blk_mq_tag_set *set);

#ifdef CONFIG_BLK_DEBUG_FS
	/**
	 * @show_rq: Used by the debugfs implementation to show driver-specific
	 * information about a request.
	 */
	void (*show_rq)(struct seq_file *m, struct request *rq);
#endif
};
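
/*
 * Illustrative sketch (not part of this header): the smallest useful ops table
 * is a single ->queue_rq() implementation. Everything prefixed my_ is
 * hypothetical; blk_mq_start_request() is declared later in this header.
 *
 *	static blk_status_t my_queue_rq(struct blk_mq_hw_ctx *hctx,
 *					const struct blk_mq_queue_data *bd)
 *	{
 *		struct request *rq = bd->rq;
 *
 *		blk_mq_start_request(rq);
 *		if (my_drv_submit(hctx->driver_data, rq))
 *			return BLK_STS_RESOURCE;
 *		return BLK_STS_OK;
 *	}
 *
 *	static const struct blk_mq_ops my_mq_ops = {
 *		.queue_rq	= my_queue_rq,
 *	};
 */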

/* Keep hctx_flag_name[] in sync with the definitions below */
enum {
	BLK_MQ_F_TAG_QUEUE_SHARED	= 1 << 1,
	/*
	 * Set when this device requires underlying blk-mq device for
	 * completing IO:
	 */
	BLK_MQ_F_STACKING		= 1 << 2,
	BLK_MQ_F_TAG_HCTX_SHARED	= 1 << 3,
	BLK_MQ_F_BLOCKING		= 1 << 4,

	/*
	 * Alloc tags on a round-robin base instead of the first available one.
	 */
	BLK_MQ_F_TAG_RR			= 1 << 5,

	/*
	 * Select 'none' during queue registration in case of a single hwq
	 * or shared hwqs instead of 'mq-deadline'.
	 */
	BLK_MQ_F_NO_SCHED_BY_DEFAULT	= 1 << 6,

	BLK_MQ_F_MAX			= 1 << 7,
};

#define BLK_MQ_MAX_DEPTH	(10240)
#define BLK_MQ_NO_HCTX_IDX	(-1U)

enum {
	/* Keep hctx_state_name[] in sync with the definitions below */
	BLK_MQ_S_STOPPED,
	BLK_MQ_S_TAG_ACTIVE,
	BLK_MQ_S_SCHED_RESTART,
	/* hw queue is inactive after all its CPUs become offline */
	BLK_MQ_S_INACTIVE,
	BLK_MQ_S_MAX
};

struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set,
		struct queue_limits *lim, void *queuedata,
		struct lock_class_key *lkclass);
#define blk_mq_alloc_disk(set, lim, queuedata)				\
({									\
	static struct lock_class_key __key;				\
									\
	__blk_mq_alloc_disk(set, lim, queuedata, &__key);		\
})
struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q,
		struct lock_class_key *lkclass);
struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set,
		struct queue_limits *lim, void *queuedata);
int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
		struct request_queue *q);
void blk_mq_destroy_queue(struct request_queue *);

int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set);
int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set,
		const struct blk_mq_ops *ops, unsigned int queue_depth,
		unsigned int set_flags);
void blk_mq_free_tag_set(struct blk_mq_tag_set *set);

void blk_mq_free_request(struct request *rq);
int blk_rq_poll(struct request *rq, struct io_comp_batch *iob,
		unsigned int poll_flags);

bool blk_mq_queue_inflight(struct request_queue *q);

enum {
	/* return when out of requests */
	BLK_MQ_REQ_NOWAIT	= (__force blk_mq_req_flags_t)(1 << 0),
	/* allocate from reserved pool */
	BLK_MQ_REQ_RESERVED	= (__force blk_mq_req_flags_t)(1 << 1),
	/* set RQF_PM */
	BLK_MQ_REQ_PM		= (__force blk_mq_req_flags_t)(1 << 2),
};

struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf,
		blk_mq_req_flags_t flags);
struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
		blk_opf_t opf, blk_mq_req_flags_t flags,
		unsigned int hctx_idx);
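
/*
 * Illustrative sketch (not part of this header): typical bring-up order for a
 * blk-mq based driver using the helpers above. Error handling is trimmed and
 * my_tag_set, lim and drv are hypothetical.
 *
 *	err = blk_mq_alloc_tag_set(&my_tag_set);
 *	if (err)
 *		return err;
 *
 *	disk = blk_mq_alloc_disk(&my_tag_set, &lim, drv);
 *	if (IS_ERR(disk)) {
 *		blk_mq_free_tag_set(&my_tag_set);
 *		return PTR_ERR(disk);
 *	}
 */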

/*
 * Tag address space map.
 */
struct blk_mq_tags {
	unsigned int nr_tags;
	unsigned int nr_reserved_tags;
	unsigned int active_queues;

	struct sbitmap_queue bitmap_tags;
	struct sbitmap_queue breserved_tags;

	struct request **rqs;
	struct request **static_rqs;
	struct list_head page_list;

	/*
	 * used to clear request reference in rqs[] before freeing one
	 * request pool
	 */
	spinlock_t lock;
	struct rcu_head rcu_head;
};

static inline struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags,
					       unsigned int tag)
{
	if (tag < tags->nr_tags) {
		prefetch(tags->rqs[tag]);
		return tags->rqs[tag];
	}

	return NULL;
}

enum {
	BLK_MQ_UNIQUE_TAG_BITS = 16,
	BLK_MQ_UNIQUE_TAG_MASK = (1 << BLK_MQ_UNIQUE_TAG_BITS) - 1,
};

u32 blk_mq_unique_tag(struct request *rq);

static inline u16 blk_mq_unique_tag_to_hwq(u32 unique_tag)
{
	return unique_tag >> BLK_MQ_UNIQUE_TAG_BITS;
}

static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag)
{
	return unique_tag & BLK_MQ_UNIQUE_TAG_MASK;
}

/**
 * blk_mq_rq_state() - read the current MQ_RQ_* state of a request
 * @rq: target request.
 */
static inline enum mq_rq_state blk_mq_rq_state(struct request *rq)
{
	return READ_ONCE(rq->state);
}

static inline int blk_mq_request_started(struct request *rq)
{
	return blk_mq_rq_state(rq) != MQ_RQ_IDLE;
}

static inline int blk_mq_request_completed(struct request *rq)
{
	return blk_mq_rq_state(rq) == MQ_RQ_COMPLETE;
}

/*
 * Set the state to complete when completing a request from inside ->queue_rq.
 * This is used by drivers that want to ensure special complete actions that
 * need access to the request are called on failure, e.g. by nvme for
 * multipathing.
 */
static inline void blk_mq_set_request_complete(struct request *rq)
{
	WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
}

/*
 * Complete the request directly instead of deferring it to softirq or
 * completing it on another CPU. Useful in preemptible context rather than
 * in interrupt context.
 */
static inline void blk_mq_complete_request_direct(struct request *rq,
		   void (*complete)(struct request *rq))
{
	WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
	complete(rq);
}

void blk_mq_start_request(struct request *rq);
void blk_mq_end_request(struct request *rq, blk_status_t error);
void __blk_mq_end_request(struct request *rq, blk_status_t error);
void blk_mq_end_request_batch(struct io_comp_batch *ib);

/*
 * Only need start/end time stamping if we have iostat or
 * blk stats enabled, or using an IO scheduler.
 */
static inline bool blk_mq_need_time_stamp(struct request *rq)
{
	return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS | RQF_USE_SCHED));
}

static inline bool blk_mq_is_reserved_rq(struct request *rq)
{
	return rq->rq_flags & RQF_RESV;
}
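
/*
 * Illustrative sketch (not part of this header): the driver marks a request
 * in flight with blk_mq_start_request() before issuing it, and finishes it
 * with blk_mq_end_request() once the hardware signals completion, e.g. from
 * an interrupt handler. my_irq_handler() and the lookup are hypothetical.
 *
 *	static irqreturn_t my_irq_handler(int irq, void *data)
 *	{
 *		struct request *rq = my_drv_next_completed(data);
 *
 *		blk_mq_end_request(rq, BLK_STS_OK);
 *		return IRQ_HANDLED;
 *	}
 *
 * Drivers that want the completion routed back to the submitting CPU use
 * blk_mq_complete_request() (declared below) instead.
 */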

/**
 * blk_mq_add_to_batch() - add a request to the completion batch
 * @req: The request to add to batch
 * @iob: The batch to add the request
 * @is_error: Specify true if the request failed with an error
 * @complete: The completion handler for the request
 *
 * Batched completions only work when there is no I/O error and no special
 * ->end_io handler.
 *
 * Return: true when the request was added to the batch, otherwise false
 */
static inline bool blk_mq_add_to_batch(struct request *req,
				       struct io_comp_batch *iob, bool is_error,
				       void (*complete)(struct io_comp_batch *))
{
	/*
	 * Check various conditions that exclude batch processing:
	 * 1) No batch container
	 * 2) Has scheduler data attached
	 * 3) Not a passthrough request and end_io set
	 * 4) Not a passthrough request and failed with an error
	 */
	if (!iob)
		return false;
	if (req->rq_flags & RQF_SCHED_TAGS)
		return false;
	if (!blk_rq_is_passthrough(req)) {
		if (req->end_io)
			return false;
		if (is_error)
			return false;
	}

	if (!iob->complete)
		iob->complete = complete;
	else if (iob->complete != complete)
		return false;
	iob->need_ts |= blk_mq_need_time_stamp(req);
	rq_list_add_tail(&iob->req_list, req);
	return true;
}

void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list);
void blk_mq_kick_requeue_list(struct request_queue *q);
void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs);
void blk_mq_complete_request(struct request *rq);
bool blk_mq_complete_request_remote(struct request *rq);
void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
void blk_mq_stop_hw_queues(struct request_queue *q);
void blk_mq_start_hw_queues(struct request_queue *q);
void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
void blk_mq_quiesce_queue(struct request_queue *q);
void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set);
void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set);
void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set);
void blk_mq_unquiesce_queue(struct request_queue *q);
void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_run_hw_queues(struct request_queue *q, bool async);
void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs);
void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
		busy_tag_iter_fn *fn, void *priv);
void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset);
void blk_mq_freeze_queue_nomemsave(struct request_queue *q);
void blk_mq_unfreeze_queue_nomemrestore(struct request_queue *q);
static inline unsigned int __must_check
blk_mq_freeze_queue(struct request_queue *q)
{
	unsigned int memflags = memalloc_noio_save();

	blk_mq_freeze_queue_nomemsave(q);
	return memflags;
}
static inline void
blk_mq_unfreeze_queue(struct request_queue *q, unsigned int memflags)
{
	blk_mq_unfreeze_queue_nomemrestore(q);
	memalloc_noio_restore(memflags);
}
void blk_freeze_queue_start(struct request_queue *q);
void blk_mq_freeze_queue_wait(struct request_queue *q);
int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
				     unsigned long timeout);
void blk_mq_unfreeze_queue_non_owner(struct request_queue *q);
void blk_freeze_queue_start_non_owner(struct request_queue *q);

unsigned int blk_mq_num_possible_queues(unsigned int max_queues);
unsigned int blk_mq_num_online_queues(unsigned int max_queues);
void blk_mq_map_queues(struct blk_mq_queue_map *qmap);
void blk_mq_map_hw_queues(struct blk_mq_queue_map *qmap,
			  struct device *dev, unsigned int offset);
void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);
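
/*
 * Illustrative sketch (not part of this header): blk_mq_freeze_queue() returns
 * the memalloc_noio cookie that must be handed back to
 * blk_mq_unfreeze_queue(), so the usual pattern brackets a queue update like
 * this (my_drv_change_queue_parameters() is hypothetical):
 *
 *	unsigned int memflags;
 *
 *	memflags = blk_mq_freeze_queue(q);
 *	my_drv_change_queue_parameters(q);
 *	blk_mq_unfreeze_queue(q, memflags);
 */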

void blk_mq_quiesce_queue_nowait(struct request_queue *q);

unsigned int blk_mq_rq_cpu(struct request *rq);

bool __blk_should_fake_timeout(struct request_queue *q);
static inline bool blk_should_fake_timeout(struct request_queue *q)
{
	if (IS_ENABLED(CONFIG_FAIL_IO_TIMEOUT) &&
	    test_bit(QUEUE_FLAG_FAIL_IO, &q->queue_flags))
		return __blk_should_fake_timeout(q);
	return false;
}

/**
 * blk_mq_rq_from_pdu - cast a PDU to a request
 * @pdu: the PDU (Protocol Data Unit) to be cast
 *
 * Return: request
 *
 * Driver command data is immediately after the request. So subtract request
 * size to get back to the original request.
 */
static inline struct request *blk_mq_rq_from_pdu(void *pdu)
{
	return pdu - sizeof(struct request);
}

/**
 * blk_mq_rq_to_pdu - cast a request to a PDU
 * @rq: the request to be cast
 *
 * Return: pointer to the PDU
 *
 * Driver command data is immediately after the request. So add request to get
 * the PDU.
 */
static inline void *blk_mq_rq_to_pdu(struct request *rq)
{
	return rq + 1;
}

#define queue_for_each_hw_ctx(q, hctx, i)				\
	xa_for_each(&(q)->hctx_table, (i), (hctx))

#define hctx_for_each_ctx(hctx, ctx, i)					\
	for ((i) = 0; (i) < (hctx)->nr_ctx &&				\
	     ({ ctx = (hctx)->ctxs[(i)]; 1; }); (i)++)

static inline void blk_mq_cleanup_rq(struct request *rq)
{
	if (rq->q->mq_ops->cleanup_rq)
		rq->q->mq_ops->cleanup_rq(rq);
}

void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx,
		struct lock_class_key *key);

static inline bool rq_is_sync(struct request *rq)
{
	return op_is_sync(rq->cmd_flags);
}

void blk_rq_init(struct request_queue *q, struct request *rq);
int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
		struct bio_set *bs, gfp_t gfp_mask,
		int (*bio_ctr)(struct bio *, struct bio *, void *), void *data);
void blk_rq_unprep_clone(struct request *rq);
blk_status_t blk_insert_cloned_request(struct request *rq);

struct rq_map_data {
	struct page **pages;
	unsigned long offset;
	unsigned short page_order;
	unsigned short nr_entries;
	bool null_mapped;
	bool from_user;
};

int blk_rq_map_user(struct request_queue *, struct request *,
		struct rq_map_data *, void __user *, unsigned long, gfp_t);
int blk_rq_map_user_io(struct request *, struct rq_map_data *,
		void __user *, unsigned long, gfp_t, bool, int, bool, int);
int blk_rq_map_user_iov(struct request_queue *, struct request *,
		struct rq_map_data *, const struct iov_iter *, gfp_t);
int blk_rq_unmap_user(struct bio *);
int blk_rq_map_kern(struct request *rq, void *kbuf, unsigned int len,
		gfp_t gfp);
int blk_rq_append_bio(struct request *rq, struct bio *bio);
void blk_execute_rq_nowait(struct request *rq, bool at_head);
blk_status_t blk_execute_rq(struct request *rq, bool at_head);
bool blk_rq_is_poll(struct request *rq);
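
/*
 * Illustrative sketch (not part of this header): with .cmd_size set in the
 * tag set, the per-request driver data lives right behind the request and is
 * reached with blk_mq_rq_to_pdu(). struct my_cmd is hypothetical.
 *
 *	struct my_cmd *cmd = blk_mq_rq_to_pdu(rq);
 *
 *	cmd->tag = rq->tag;
 *
 * blk_mq_rq_from_pdu() goes the other way when only the PDU pointer is at
 * hand, e.g. in a hardware completion path.
 */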

struct req_iterator {
	struct bvec_iter iter;
	struct bio *bio;
};

#define __rq_for_each_bio(_bio, rq)	\
	if ((rq->bio))			\
		for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next)

#define rq_for_each_segment(bvl, _rq, _iter)			\
	__rq_for_each_bio(_iter.bio, _rq)			\
		bio_for_each_segment(bvl, _iter.bio, _iter.iter)

#define rq_for_each_bvec(bvl, _rq, _iter)			\
	__rq_for_each_bio(_iter.bio, _rq)			\
		bio_for_each_bvec(bvl, _iter.bio, _iter.iter)

#define rq_iter_last(bvec, _iter)				\
		(_iter.bio->bi_next == NULL &&			\
		 bio_iter_last(bvec, _iter.iter))

/*
 * blk_rq_pos()			: the current sector
 * blk_rq_bytes()		: bytes left in the entire request
 * blk_rq_cur_bytes()		: bytes left in the current segment
 * blk_rq_sectors()		: sectors left in the entire request
 * blk_rq_cur_sectors()		: sectors left in the current segment
 * blk_rq_stats_sectors()	: sectors of the entire request used for stats
 */
static inline sector_t blk_rq_pos(const struct request *rq)
{
	return rq->__sector;
}

static inline unsigned int blk_rq_bytes(const struct request *rq)
{
	return rq->__data_len;
}

static inline int blk_rq_cur_bytes(const struct request *rq)
{
	if (!rq->bio)
		return 0;
	if (!bio_has_data(rq->bio))	/* dataless requests such as discard */
		return rq->bio->bi_iter.bi_size;
	return bio_iovec(rq->bio).bv_len;
}

static inline unsigned int blk_rq_sectors(const struct request *rq)
{
	return blk_rq_bytes(rq) >> SECTOR_SHIFT;
}

static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
{
	return blk_rq_cur_bytes(rq) >> SECTOR_SHIFT;
}

static inline unsigned int blk_rq_stats_sectors(const struct request *rq)
{
	return rq->stats_sectors;
}

/*
 * Some commands like WRITE SAME have a payload or data transfer size which
 * is different from the size of the request. Any driver that supports such
 * commands using the RQF_SPECIAL_PAYLOAD flag needs to use this helper to
 * calculate the data transfer size.
 */
static inline unsigned int blk_rq_payload_bytes(struct request *rq)
{
	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
		return rq->special_vec.bv_len;
	return blk_rq_bytes(rq);
}

/*
 * Return the first full biovec in the request. The caller needs to check that
 * there are any bvecs before calling this helper.
 */
static inline struct bio_vec req_bvec(struct request *rq)
{
	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
		return rq->special_vec;
	return mp_bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter);
}

static inline unsigned int blk_rq_count_bios(struct request *rq)
{
	unsigned int nr_bios = 0;
	struct bio *bio;

	__rq_for_each_bio(bio, rq)
		nr_bios++;

	return nr_bios;
}

void blk_steal_bios(struct bio_list *list, struct request *rq);

/*
 * Request completion related functions.
 *
 * blk_update_request() completes given number of bytes and updates
 * the request without completing it.
 */
bool blk_update_request(struct request *rq, blk_status_t error,
			unsigned int nr_bytes);
void blk_abort_request(struct request *);
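
/*
 * Illustrative sketch (not part of this header): building a device command
 * from the position/size helpers and walking the data with
 * rq_for_each_segment(). Everything prefixed my_ is hypothetical.
 *
 *	struct req_iterator iter;
 *	struct bio_vec bvec;
 *
 *	my_cmd->start_sector = blk_rq_pos(rq);
 *	my_cmd->nr_sectors = blk_rq_sectors(rq);
 *
 *	rq_for_each_segment(bvec, rq, iter)
 *		my_add_segment(my_cmd, &bvec);
 */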

/*
 * Number of physical segments as sent to the device.
 *
 * Normally this is the number of discontiguous data segments sent by the
 * submitter. But for data-less commands like discard we might have no
 * actual data segments submitted, but the driver might have to add its
 * own special payload. In that case we still return 1 here so that this
 * special payload will be mapped.
 */
static inline unsigned short blk_rq_nr_phys_segments(struct request *rq)
{
	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
		return 1;
	return rq->nr_phys_segments;
}

/*
 * Number of discard segments (or ranges) the driver needs to fill in.
 * Each discard bio merged into a request is counted as one segment.
 */
static inline unsigned short blk_rq_nr_discard_segments(struct request *rq)
{
	return max_t(unsigned short, rq->nr_phys_segments, 1);
}

int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist,
		struct scatterlist **last_sg);
static inline int blk_rq_map_sg(struct request *rq, struct scatterlist *sglist)
{
	struct scatterlist *last_sg = NULL;

	return __blk_rq_map_sg(rq, sglist, &last_sg);
}
void blk_dump_rq_flags(struct request *, char *);

#endif /* BLK_MQ_H */