/* Source viewer snapshot: at v5.8, 594 lines, 19 kB. */
1/* SPDX-License-Identifier: GPL-2.0 */ 2#ifndef BLK_MQ_H 3#define BLK_MQ_H 4 5#include <linux/blkdev.h> 6#include <linux/sbitmap.h> 7#include <linux/srcu.h> 8 9struct blk_mq_tags; 10struct blk_flush_queue; 11 12/** 13 * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware 14 * block device 15 */ 16struct blk_mq_hw_ctx { 17 struct { 18 /** @lock: Protects the dispatch list. */ 19 spinlock_t lock; 20 /** 21 * @dispatch: Used for requests that are ready to be 22 * dispatched to the hardware but for some reason (e.g. lack of 23 * resources) could not be sent to the hardware. As soon as the 24 * driver can send new requests, requests at this list will 25 * be sent first for a fairer dispatch. 26 */ 27 struct list_head dispatch; 28 /** 29 * @state: BLK_MQ_S_* flags. Defines the state of the hw 30 * queue (active, scheduled to restart, stopped). 31 */ 32 unsigned long state; 33 } ____cacheline_aligned_in_smp; 34 35 /** 36 * @run_work: Used for scheduling a hardware queue run at a later time. 37 */ 38 struct delayed_work run_work; 39 /** @cpumask: Map of available CPUs where this hctx can run. */ 40 cpumask_var_t cpumask; 41 /** 42 * @next_cpu: Used by blk_mq_hctx_next_cpu() for round-robin CPU 43 * selection from @cpumask. 44 */ 45 int next_cpu; 46 /** 47 * @next_cpu_batch: Counter of how many works left in the batch before 48 * changing to the next CPU. 49 */ 50 int next_cpu_batch; 51 52 /** @flags: BLK_MQ_F_* flags. Defines the behaviour of the queue. */ 53 unsigned long flags; 54 55 /** 56 * @sched_data: Pointer owned by the IO scheduler attached to a request 57 * queue. It's up to the IO scheduler how to use this pointer. 58 */ 59 void *sched_data; 60 /** 61 * @queue: Pointer to the request queue that owns this hardware context. 62 */ 63 struct request_queue *queue; 64 /** @fq: Queue of requests that need to perform a flush operation. 
*/ 65 struct blk_flush_queue *fq; 66 67 /** 68 * @driver_data: Pointer to data owned by the block driver that created 69 * this hctx 70 */ 71 void *driver_data; 72 73 /** 74 * @ctx_map: Bitmap for each software queue. If bit is on, there is a 75 * pending request in that software queue. 76 */ 77 struct sbitmap ctx_map; 78 79 /** 80 * @dispatch_from: Software queue to be used when no scheduler was 81 * selected. 82 */ 83 struct blk_mq_ctx *dispatch_from; 84 /** 85 * @dispatch_busy: Number used by blk_mq_update_dispatch_busy() to 86 * decide if the hw_queue is busy using Exponential Weighted Moving 87 * Average algorithm. 88 */ 89 unsigned int dispatch_busy; 90 91 /** @type: HCTX_TYPE_* flags. Type of hardware queue. */ 92 unsigned short type; 93 /** @nr_ctx: Number of software queues. */ 94 unsigned short nr_ctx; 95 /** @ctxs: Array of software queues. */ 96 struct blk_mq_ctx **ctxs; 97 98 /** @dispatch_wait_lock: Lock for dispatch_wait queue. */ 99 spinlock_t dispatch_wait_lock; 100 /** 101 * @dispatch_wait: Waitqueue to put requests when there is no tag 102 * available at the moment, to wait for another try in the future. 103 */ 104 wait_queue_entry_t dispatch_wait; 105 106 /** 107 * @wait_index: Index of next available dispatch_wait queue to insert 108 * requests. 109 */ 110 atomic_t wait_index; 111 112 /** 113 * @tags: Tags owned by the block driver. A tag at this set is only 114 * assigned when a request is dispatched from a hardware queue. 115 */ 116 struct blk_mq_tags *tags; 117 /** 118 * @sched_tags: Tags owned by I/O scheduler. If there is an I/O 119 * scheduler associated with a request queue, a tag is assigned when 120 * that request is allocated. Else, this member is not used. 121 */ 122 struct blk_mq_tags *sched_tags; 123 124 /** @queued: Number of queued requests. */ 125 unsigned long queued; 126 /** @run: Number of dispatched requests. 
*/ 127 unsigned long run; 128#define BLK_MQ_MAX_DISPATCH_ORDER 7 129 /** @dispatched: Number of dispatch requests by queue. */ 130 unsigned long dispatched[BLK_MQ_MAX_DISPATCH_ORDER]; 131 132 /** @numa_node: NUMA node the storage adapter has been connected to. */ 133 unsigned int numa_node; 134 /** @queue_num: Index of this hardware queue. */ 135 unsigned int queue_num; 136 137 /** 138 * @nr_active: Number of active requests. Only used when a tag set is 139 * shared across request queues. 140 */ 141 atomic_t nr_active; 142 143 /** @cpuhp_online: List to store request if CPU is going to die */ 144 struct hlist_node cpuhp_online; 145 /** @cpuhp_dead: List to store request if some CPU die. */ 146 struct hlist_node cpuhp_dead; 147 /** @kobj: Kernel object for sysfs. */ 148 struct kobject kobj; 149 150 /** @poll_considered: Count times blk_poll() was called. */ 151 unsigned long poll_considered; 152 /** @poll_invoked: Count how many requests blk_poll() polled. */ 153 unsigned long poll_invoked; 154 /** @poll_success: Count how many polled requests were completed. */ 155 unsigned long poll_success; 156 157#ifdef CONFIG_BLK_DEBUG_FS 158 /** 159 * @debugfs_dir: debugfs directory for this hardware queue. Named 160 * as cpu<cpu_number>. 161 */ 162 struct dentry *debugfs_dir; 163 /** @sched_debugfs_dir: debugfs directory for the scheduler. */ 164 struct dentry *sched_debugfs_dir; 165#endif 166 167 /** 168 * @hctx_list: if this hctx is not in use, this is an entry in 169 * q->unused_hctx_list. 170 */ 171 struct list_head hctx_list; 172 173 /** 174 * @srcu: Sleepable RCU. Use as lock when type of the hardware queue is 175 * blocking (BLK_MQ_F_BLOCKING). Must be the last member - see also 176 * blk_mq_hw_ctx_size(). 177 */ 178 struct srcu_struct srcu[]; 179}; 180 181/** 182 * struct blk_mq_queue_map - Map software queues to hardware queues 183 * @mq_map: CPU ID to hardware queue index map. This is an array 184 * with nr_cpu_ids elements. 
Each element has a value in the range 185 * [@queue_offset, @queue_offset + @nr_queues). 186 * @nr_queues: Number of hardware queues to map CPU IDs onto. 187 * @queue_offset: First hardware queue to map onto. Used by the PCIe NVMe 188 * driver to map each hardware queue type (enum hctx_type) onto a distinct 189 * set of hardware queues. 190 */ 191struct blk_mq_queue_map { 192 unsigned int *mq_map; 193 unsigned int nr_queues; 194 unsigned int queue_offset; 195}; 196 197/** 198 * enum hctx_type - Type of hardware queue 199 * @HCTX_TYPE_DEFAULT: All I/O not otherwise accounted for. 200 * @HCTX_TYPE_READ: Just for READ I/O. 201 * @HCTX_TYPE_POLL: Polled I/O of any kind. 202 * @HCTX_MAX_TYPES: Number of types of hctx. 203 */ 204enum hctx_type { 205 HCTX_TYPE_DEFAULT, 206 HCTX_TYPE_READ, 207 HCTX_TYPE_POLL, 208 209 HCTX_MAX_TYPES, 210}; 211 212/** 213 * struct blk_mq_tag_set - tag set that can be shared between request queues 214 * @map: One or more ctx -> hctx mappings. One map exists for each 215 * hardware queue type (enum hctx_type) that the driver wishes 216 * to support. There are no restrictions on maps being of the 217 * same size, and it's perfectly legal to share maps between 218 * types. 219 * @nr_maps: Number of elements in the @map array. A number in the range 220 * [1, HCTX_MAX_TYPES]. 221 * @ops: Pointers to functions that implement block driver behavior. 222 * @nr_hw_queues: Number of hardware queues supported by the block driver that 223 * owns this data structure. 224 * @queue_depth: Number of tags per hardware queue, reserved tags included. 225 * @reserved_tags: Number of tags to set aside for BLK_MQ_REQ_RESERVED tag 226 * allocations. 227 * @cmd_size: Number of additional bytes to allocate per request. The block 228 * driver owns these additional bytes. 229 * @numa_node: NUMA node the storage adapter has been connected to. 230 * @timeout: Request processing timeout in jiffies. 231 * @flags: Zero or more BLK_MQ_F_* flags. 
232 * @driver_data: Pointer to data owned by the block driver that created this 233 * tag set. 234 * @tags: Tag sets. One tag set per hardware queue. Has @nr_hw_queues 235 * elements. 236 * @tag_list_lock: Serializes tag_list accesses. 237 * @tag_list: List of the request queues that use this tag set. See also 238 * request_queue.tag_set_list. 239 */ 240struct blk_mq_tag_set { 241 struct blk_mq_queue_map map[HCTX_MAX_TYPES]; 242 unsigned int nr_maps; 243 const struct blk_mq_ops *ops; 244 unsigned int nr_hw_queues; 245 unsigned int queue_depth; 246 unsigned int reserved_tags; 247 unsigned int cmd_size; 248 int numa_node; 249 unsigned int timeout; 250 unsigned int flags; 251 void *driver_data; 252 253 struct blk_mq_tags **tags; 254 255 struct mutex tag_list_lock; 256 struct list_head tag_list; 257}; 258 259/** 260 * struct blk_mq_queue_data - Data about a request inserted in a queue 261 * 262 * @rq: Request pointer. 263 * @last: If it is the last request in the queue. 264 */ 265struct blk_mq_queue_data { 266 struct request *rq; 267 bool last; 268}; 269 270typedef blk_status_t (queue_rq_fn)(struct blk_mq_hw_ctx *, 271 const struct blk_mq_queue_data *); 272typedef void (commit_rqs_fn)(struct blk_mq_hw_ctx *); 273typedef bool (get_budget_fn)(struct blk_mq_hw_ctx *); 274typedef void (put_budget_fn)(struct blk_mq_hw_ctx *); 275typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool); 276typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int); 277typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int); 278typedef int (init_request_fn)(struct blk_mq_tag_set *set, struct request *, 279 unsigned int, unsigned int); 280typedef void (exit_request_fn)(struct blk_mq_tag_set *set, struct request *, 281 unsigned int); 282 283typedef bool (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *, 284 bool); 285typedef bool (busy_tag_iter_fn)(struct request *, void *, bool); 286typedef int (poll_fn)(struct blk_mq_hw_ctx *); 287typedef int 
(map_queues_fn)(struct blk_mq_tag_set *set); 288typedef bool (busy_fn)(struct request_queue *); 289typedef void (complete_fn)(struct request *); 290typedef void (cleanup_rq_fn)(struct request *); 291 292/** 293 * struct blk_mq_ops - Callback functions that implements block driver 294 * behaviour. 295 */ 296struct blk_mq_ops { 297 /** 298 * @queue_rq: Queue a new request from block IO. 299 */ 300 queue_rq_fn *queue_rq; 301 302 /** 303 * @commit_rqs: If a driver uses bd->last to judge when to submit 304 * requests to hardware, it must define this function. In case of errors 305 * that make us stop issuing further requests, this hook serves the 306 * purpose of kicking the hardware (which the last request otherwise 307 * would have done). 308 */ 309 commit_rqs_fn *commit_rqs; 310 311 /** 312 * @get_budget: Reserve budget before queue request, once .queue_rq is 313 * run, it is driver's responsibility to release the 314 * reserved budget. Also we have to handle failure case 315 * of .get_budget for avoiding I/O deadlock. 316 */ 317 get_budget_fn *get_budget; 318 /** 319 * @put_budget: Release the reserved budget. 320 */ 321 put_budget_fn *put_budget; 322 323 /** 324 * @timeout: Called on request timeout. 325 */ 326 timeout_fn *timeout; 327 328 /** 329 * @poll: Called to poll for completion of a specific tag. 330 */ 331 poll_fn *poll; 332 333 /** 334 * @complete: Mark the request as complete. 335 */ 336 complete_fn *complete; 337 338 /** 339 * @init_hctx: Called when the block layer side of a hardware queue has 340 * been set up, allowing the driver to allocate/init matching 341 * structures. 342 */ 343 init_hctx_fn *init_hctx; 344 /** 345 * @exit_hctx: Ditto for exit/teardown. 346 */ 347 exit_hctx_fn *exit_hctx; 348 349 /** 350 * @init_request: Called for every command allocated by the block layer 351 * to allow the driver to set up driver specific data. 352 * 353 * Tag greater than or equal to queue_depth is for setting up 354 * flush request. 
355 */ 356 init_request_fn *init_request; 357 /** 358 * @exit_request: Ditto for exit/teardown. 359 */ 360 exit_request_fn *exit_request; 361 362 /** 363 * @initialize_rq_fn: Called from inside blk_get_request(). 364 */ 365 void (*initialize_rq_fn)(struct request *rq); 366 367 /** 368 * @cleanup_rq: Called before freeing one request which isn't completed 369 * yet, and usually for freeing the driver private data. 370 */ 371 cleanup_rq_fn *cleanup_rq; 372 373 /** 374 * @busy: If set, returns whether or not this queue currently is busy. 375 */ 376 busy_fn *busy; 377 378 /** 379 * @map_queues: This allows drivers specify their own queue mapping by 380 * overriding the setup-time function that builds the mq_map. 381 */ 382 map_queues_fn *map_queues; 383 384#ifdef CONFIG_BLK_DEBUG_FS 385 /** 386 * @show_rq: Used by the debugfs implementation to show driver-specific 387 * information about a request. 388 */ 389 void (*show_rq)(struct seq_file *m, struct request *rq); 390#endif 391}; 392 393enum { 394 BLK_MQ_F_SHOULD_MERGE = 1 << 0, 395 BLK_MQ_F_TAG_SHARED = 1 << 1, 396 /* 397 * Set when this device requires underlying blk-mq device for 398 * completing IO: 399 */ 400 BLK_MQ_F_STACKING = 1 << 2, 401 BLK_MQ_F_BLOCKING = 1 << 5, 402 BLK_MQ_F_NO_SCHED = 1 << 6, 403 BLK_MQ_F_ALLOC_POLICY_START_BIT = 8, 404 BLK_MQ_F_ALLOC_POLICY_BITS = 1, 405 406 BLK_MQ_S_STOPPED = 0, 407 BLK_MQ_S_TAG_ACTIVE = 1, 408 BLK_MQ_S_SCHED_RESTART = 2, 409 410 /* hw queue is inactive after all its CPUs become offline */ 411 BLK_MQ_S_INACTIVE = 3, 412 413 BLK_MQ_MAX_DEPTH = 10240, 414 415 BLK_MQ_CPU_WORK_BATCH = 8, 416}; 417#define BLK_MQ_FLAG_TO_ALLOC_POLICY(flags) \ 418 ((flags >> BLK_MQ_F_ALLOC_POLICY_START_BIT) & \ 419 ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) 420#define BLK_ALLOC_POLICY_TO_MQ_FLAG(policy) \ 421 ((policy & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \ 422 << BLK_MQ_F_ALLOC_POLICY_START_BIT) 423 424struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *); 425struct request_queue 
*blk_mq_init_queue_data(struct blk_mq_tag_set *set, 426 void *queuedata); 427struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, 428 struct request_queue *q, 429 bool elevator_init); 430struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set, 431 const struct blk_mq_ops *ops, 432 unsigned int queue_depth, 433 unsigned int set_flags); 434void blk_mq_unregister_dev(struct device *, struct request_queue *); 435 436int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set); 437void blk_mq_free_tag_set(struct blk_mq_tag_set *set); 438 439void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); 440 441void blk_mq_free_request(struct request *rq); 442 443bool blk_mq_queue_inflight(struct request_queue *q); 444 445enum { 446 /* return when out of requests */ 447 BLK_MQ_REQ_NOWAIT = (__force blk_mq_req_flags_t)(1 << 0), 448 /* allocate from reserved pool */ 449 BLK_MQ_REQ_RESERVED = (__force blk_mq_req_flags_t)(1 << 1), 450 /* allocate internal/sched tag */ 451 BLK_MQ_REQ_INTERNAL = (__force blk_mq_req_flags_t)(1 << 2), 452 /* set RQF_PREEMPT */ 453 BLK_MQ_REQ_PREEMPT = (__force blk_mq_req_flags_t)(1 << 3), 454}; 455 456struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, 457 blk_mq_req_flags_t flags); 458struct request *blk_mq_alloc_request_hctx(struct request_queue *q, 459 unsigned int op, blk_mq_req_flags_t flags, 460 unsigned int hctx_idx); 461struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag); 462 463enum { 464 BLK_MQ_UNIQUE_TAG_BITS = 16, 465 BLK_MQ_UNIQUE_TAG_MASK = (1 << BLK_MQ_UNIQUE_TAG_BITS) - 1, 466}; 467 468u32 blk_mq_unique_tag(struct request *rq); 469 470static inline u16 blk_mq_unique_tag_to_hwq(u32 unique_tag) 471{ 472 return unique_tag >> BLK_MQ_UNIQUE_TAG_BITS; 473} 474 475static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag) 476{ 477 return unique_tag & BLK_MQ_UNIQUE_TAG_MASK; 478} 479 480/** 481 * blk_mq_rq_state() - read the current MQ_RQ_* state 
of a request 482 * @rq: target request. 483 */ 484static inline enum mq_rq_state blk_mq_rq_state(struct request *rq) 485{ 486 return READ_ONCE(rq->state); 487} 488 489static inline int blk_mq_request_started(struct request *rq) 490{ 491 return blk_mq_rq_state(rq) != MQ_RQ_IDLE; 492} 493 494static inline int blk_mq_request_completed(struct request *rq) 495{ 496 return blk_mq_rq_state(rq) == MQ_RQ_COMPLETE; 497} 498 499void blk_mq_start_request(struct request *rq); 500void blk_mq_end_request(struct request *rq, blk_status_t error); 501void __blk_mq_end_request(struct request *rq, blk_status_t error); 502 503void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list); 504void blk_mq_kick_requeue_list(struct request_queue *q); 505void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs); 506bool blk_mq_complete_request(struct request *rq); 507void blk_mq_force_complete_rq(struct request *rq); 508bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list, 509 struct bio *bio, unsigned int nr_segs); 510bool blk_mq_queue_stopped(struct request_queue *q); 511void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx); 512void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx); 513void blk_mq_stop_hw_queues(struct request_queue *q); 514void blk_mq_start_hw_queues(struct request_queue *q); 515void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); 516void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); 517void blk_mq_quiesce_queue(struct request_queue *q); 518void blk_mq_unquiesce_queue(struct request_queue *q); 519void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); 520void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); 521void blk_mq_run_hw_queues(struct request_queue *q, bool async); 522void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs); 523void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, 524 busy_tag_iter_fn 
*fn, void *priv); 525void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset); 526void blk_mq_freeze_queue(struct request_queue *q); 527void blk_mq_unfreeze_queue(struct request_queue *q); 528void blk_freeze_queue_start(struct request_queue *q); 529void blk_mq_freeze_queue_wait(struct request_queue *q); 530int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, 531 unsigned long timeout); 532 533int blk_mq_map_queues(struct blk_mq_queue_map *qmap); 534void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); 535 536void blk_mq_quiesce_queue_nowait(struct request_queue *q); 537 538unsigned int blk_mq_rq_cpu(struct request *rq); 539 540/** 541 * blk_mq_rq_from_pdu - cast a PDU to a request 542 * @pdu: the PDU (Protocol Data Unit) to be casted 543 * 544 * Return: request 545 * 546 * Driver command data is immediately after the request. So subtract request 547 * size to get back to the original request. 548 */ 549static inline struct request *blk_mq_rq_from_pdu(void *pdu) 550{ 551 return pdu - sizeof(struct request); 552} 553 554/** 555 * blk_mq_rq_to_pdu - cast a request to a PDU 556 * @rq: the request to be casted 557 * 558 * Return: pointer to the PDU 559 * 560 * Driver command data is immediately after the request. So add request to get 561 * the PDU. 
562 */ 563static inline void *blk_mq_rq_to_pdu(struct request *rq) 564{ 565 return rq + 1; 566} 567 568#define queue_for_each_hw_ctx(q, hctx, i) \ 569 for ((i) = 0; (i) < (q)->nr_hw_queues && \ 570 ({ hctx = (q)->queue_hw_ctx[i]; 1; }); (i)++) 571 572#define hctx_for_each_ctx(hctx, ctx, i) \ 573 for ((i) = 0; (i) < (hctx)->nr_ctx && \ 574 ({ ctx = (hctx)->ctxs[(i)]; 1; }); (i)++) 575 576static inline blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, 577 struct request *rq) 578{ 579 if (rq->tag != -1) 580 return rq->tag | (hctx->queue_num << BLK_QC_T_SHIFT); 581 582 return rq->internal_tag | (hctx->queue_num << BLK_QC_T_SHIFT) | 583 BLK_QC_T_INTERNAL; 584} 585 586static inline void blk_mq_cleanup_rq(struct request *rq) 587{ 588 if (rq->q->mq_ops->cleanup_rq) 589 rq->q->mq_ops->cleanup_rq(rq); 590} 591 592blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio); 593 594#endif