/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BLK_MQ_H
#define BLK_MQ_H

#include <linux/blkdev.h>
#include <linux/sbitmap.h>
#include <linux/srcu.h>

struct blk_mq_tags;
struct blk_flush_queue;

/**
 * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware
 * block device
 */
struct blk_mq_hw_ctx {
	struct {
		/** @lock: Protects the dispatch list. */
		spinlock_t lock;
		/**
		 * @dispatch: Used for requests that are ready to be
		 * dispatched to the hardware but for some reason (e.g. lack of
		 * resources) could not be sent to the hardware. As soon as the
		 * driver can send new requests, requests on this list will
		 * be sent first for a fairer dispatch.
		 */
		struct list_head dispatch;
		/**
		 * @state: BLK_MQ_S_* flags. Defines the state of the hw
		 * queue (active, scheduled to restart, stopped).
		 */
		unsigned long state;
	} ____cacheline_aligned_in_smp;

	/**
	 * @run_work: Used for scheduling a hardware queue run at a later time.
	 */
	struct delayed_work run_work;
	/** @cpumask: Map of available CPUs where this hctx can run. */
	cpumask_var_t cpumask;
	/**
	 * @next_cpu: Used by blk_mq_hctx_next_cpu() for round-robin CPU
	 * selection from @cpumask.
	 */
	int next_cpu;
	/**
	 * @next_cpu_batch: Counter of how many work items are left in the
	 * batch before switching to the next CPU.
	 */
	int next_cpu_batch;

	/** @flags: BLK_MQ_F_* flags. Defines the behaviour of the queue. */
	unsigned long flags;

	/**
	 * @sched_data: Pointer owned by the IO scheduler attached to a request
	 * queue. It's up to the IO scheduler how to use this pointer.
	 */
	void *sched_data;
	/**
	 * @queue: Pointer to the request queue that owns this hardware context.
	 */
	struct request_queue *queue;
	/** @fq: Queue of requests that need to perform a flush operation. */
	struct blk_flush_queue *fq;

	/**
	 * @driver_data: Pointer to data owned by the block driver that created
	 * this hctx.
	 */
	void *driver_data;

	/**
	 * @ctx_map: Bitmap for each software queue. If a bit is set, there is
	 * a pending request in that software queue.
	 */
	struct sbitmap ctx_map;

	/**
	 * @dispatch_from: Software queue to be used when no scheduler was
	 * selected.
	 */
	struct blk_mq_ctx *dispatch_from;
	/**
	 * @dispatch_busy: Number used by blk_mq_update_dispatch_busy() to
	 * decide if the hw queue is busy, using an Exponential Weighted Moving
	 * Average algorithm.
	 */
	unsigned int dispatch_busy;

	/** @type: HCTX_TYPE_* flags. Type of hardware queue. */
	unsigned short type;
	/** @nr_ctx: Number of software queues. */
	unsigned short nr_ctx;
	/** @ctxs: Array of software queues. */
	struct blk_mq_ctx **ctxs;

	/** @dispatch_wait_lock: Lock for dispatch_wait queue. */
	spinlock_t dispatch_wait_lock;
	/**
	 * @dispatch_wait: Waitqueue to put requests on when there is no tag
	 * available at the moment, so they can be retried in the future.
	 */
	wait_queue_entry_t dispatch_wait;

	/**
	 * @wait_index: Index of next available dispatch_wait queue to insert
	 * requests.
	 */
	atomic_t wait_index;

	/**
	 * @tags: Tags owned by the block driver. A tag in this set is only
	 * assigned when a request is dispatched from a hardware queue.
	 */
	struct blk_mq_tags *tags;
	/**
	 * @sched_tags: Tags owned by the I/O scheduler. If there is an I/O
	 * scheduler associated with a request queue, a tag is assigned when
	 * that request is allocated. Otherwise, this member is not used.
	 */
	struct blk_mq_tags *sched_tags;

	/** @queued: Number of queued requests. */
	unsigned long queued;
	/** @run: Number of dispatched requests. */
	unsigned long run;
#define BLK_MQ_MAX_DISPATCH_ORDER 7
	/** @dispatched: Number of dispatch requests by queue. */
	unsigned long dispatched[BLK_MQ_MAX_DISPATCH_ORDER];

	/** @numa_node: NUMA node the storage adapter has been connected to. */
	unsigned int numa_node;
	/** @queue_num: Index of this hardware queue. */
	unsigned int queue_num;

	/**
	 * @nr_active: Number of active requests. Only used when a tag set is
	 * shared across request queues.
	 */
	atomic_t nr_active;

	/** @cpuhp_online: List to store requests when a CPU is going offline. */
	struct hlist_node cpuhp_online;
	/** @cpuhp_dead: List to store requests when a CPU dies. */
	struct hlist_node cpuhp_dead;
	/** @kobj: Kernel object for sysfs. */
	struct kobject kobj;

	/** @poll_considered: Count of how many times blk_poll() was called. */
	unsigned long poll_considered;
	/** @poll_invoked: Count of how many requests blk_poll() polled. */
	unsigned long poll_invoked;
	/** @poll_success: Count of how many polled requests were completed. */
	unsigned long poll_success;

#ifdef CONFIG_BLK_DEBUG_FS
	/**
	 * @debugfs_dir: debugfs directory for this hardware queue. Named
	 * as cpu<cpu_number>.
	 */
	struct dentry *debugfs_dir;
	/** @sched_debugfs_dir: debugfs directory for the scheduler. */
	struct dentry *sched_debugfs_dir;
#endif

	/**
	 * @hctx_list: If this hctx is not in use, this is an entry in
	 * q->unused_hctx_list.
	 */
	struct list_head hctx_list;

	/**
	 * @srcu: Sleepable RCU. Used as a lock when the type of the hardware
	 * queue is blocking (BLK_MQ_F_BLOCKING). Must be the last member -
	 * see also blk_mq_hw_ctx_size().
	 */
	struct srcu_struct srcu[];
};

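/*
 * Illustrative sketch (hypothetical "my_dev" driver, not taken from an
 * in-tree driver): when .queue_rq returns BLK_STS_RESOURCE because the
 * device is temporarily out of resources, the block layer parks the request
 * on the dispatch list above. Once resources become available again, the
 * driver is expected to re-run the hardware queues so that the parked
 * requests are dispatched first:
 *
 *	static void my_dev_resources_available(struct my_dev *dev)
 *	{
 *		blk_mq_run_hw_queues(dev->queue, true);
 *	}
 */
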
/**
 * struct blk_mq_queue_map - Map software queues to hardware queues
 * @mq_map: CPU ID to hardware queue index map. This is an array
 *	with nr_cpu_ids elements. Each element has a value in the range
 *	[@queue_offset, @queue_offset + @nr_queues).
 * @nr_queues: Number of hardware queues to map CPU IDs onto.
 * @queue_offset: First hardware queue to map onto. Used by the PCIe NVMe
 *	driver to map each hardware queue type (enum hctx_type) onto a distinct
 *	set of hardware queues.
 */
struct blk_mq_queue_map {
	unsigned int *mq_map;
	unsigned int nr_queues;
	unsigned int queue_offset;
};

/**
 * enum hctx_type - Type of hardware queue
 * @HCTX_TYPE_DEFAULT: All I/O not otherwise accounted for.
 * @HCTX_TYPE_READ: Just for READ I/O.
 * @HCTX_TYPE_POLL: Polled I/O of any kind.
 * @HCTX_MAX_TYPES: Number of types of hctx.
 */
enum hctx_type {
	HCTX_TYPE_DEFAULT,
	HCTX_TYPE_READ,
	HCTX_TYPE_POLL,

	HCTX_MAX_TYPES,
};

/**
 * struct blk_mq_tag_set - tag set that can be shared between request queues
 * @map: One or more ctx -> hctx mappings. One map exists for each
 *	hardware queue type (enum hctx_type) that the driver wishes
 *	to support. There are no restrictions on maps being of the
 *	same size, and it's perfectly legal to share maps between
 *	types.
 * @nr_maps: Number of elements in the @map array. A number in the range
 *	[1, HCTX_MAX_TYPES].
 * @ops: Pointers to functions that implement block driver behavior.
 * @nr_hw_queues: Number of hardware queues supported by the block driver that
 *	owns this data structure.
 * @queue_depth: Number of tags per hardware queue, reserved tags included.
 * @reserved_tags: Number of tags to set aside for BLK_MQ_REQ_RESERVED tag
 *	allocations.
 * @cmd_size: Number of additional bytes to allocate per request. The block
 *	driver owns these additional bytes.
 * @numa_node: NUMA node the storage adapter has been connected to.
 * @timeout: Request processing timeout in jiffies.
 * @flags: Zero or more BLK_MQ_F_* flags.
 * @driver_data: Pointer to data owned by the block driver that created this
 *	tag set.
 * @tags: Tag sets. One tag set per hardware queue. Has @nr_hw_queues
 *	elements.
 * @tag_list_lock: Serializes tag_list accesses.
 * @tag_list: List of the request queues that use this tag set. See also
 *	request_queue.tag_set_list.
 */
struct blk_mq_tag_set {
	struct blk_mq_queue_map map[HCTX_MAX_TYPES];
	unsigned int nr_maps;
	const struct blk_mq_ops *ops;
	unsigned int nr_hw_queues;
	unsigned int queue_depth;
	unsigned int reserved_tags;
	unsigned int cmd_size;
	int numa_node;
	unsigned int timeout;
	unsigned int flags;
	void *driver_data;

	struct blk_mq_tags **tags;

	struct mutex tag_list_lock;
	struct list_head tag_list;
};

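/*
 * Illustrative sketch (hypothetical "my_mq_ops", "struct my_cmd" and "dev"
 * names): a driver typically fills in one blk_mq_tag_set per device,
 * registers it with blk_mq_alloc_tag_set() and then creates one or more
 * request queues on top of it:
 *
 *	dev->tag_set.ops = &my_mq_ops;
 *	dev->tag_set.nr_hw_queues = 1;
 *	dev->tag_set.queue_depth = 64;
 *	dev->tag_set.numa_node = NUMA_NO_NODE;
 *	dev->tag_set.cmd_size = sizeof(struct my_cmd);
 *	dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
 *
 *	ret = blk_mq_alloc_tag_set(&dev->tag_set);
 *	if (ret)
 *		return ret;
 *
 *	dev->queue = blk_mq_init_queue(&dev->tag_set);
 *	if (IS_ERR(dev->queue)) {
 *		blk_mq_free_tag_set(&dev->tag_set);
 *		return PTR_ERR(dev->queue);
 *	}
 */
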
/**
 * struct blk_mq_queue_data - Data about a request inserted in a queue
 *
 * @rq: Request pointer.
 * @last: True if this is the last request in the queue.
 */
struct blk_mq_queue_data {
	struct request *rq;
	bool last;
};

typedef bool (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *,
		bool);
typedef bool (busy_tag_iter_fn)(struct request *, void *, bool);

/**
 * struct blk_mq_ops - Callback functions that implement block driver
 * behaviour.
 */
struct blk_mq_ops {
	/**
	 * @queue_rq: Queue a new request from block IO.
	 */
	blk_status_t (*queue_rq)(struct blk_mq_hw_ctx *,
				 const struct blk_mq_queue_data *);

	/**
	 * @commit_rqs: If a driver uses bd->last to judge when to submit
	 * requests to hardware, it must define this function. In case of errors
	 * that make us stop issuing further requests, this hook serves the
	 * purpose of kicking the hardware (which the last request otherwise
	 * would have done).
	 */
	void (*commit_rqs)(struct blk_mq_hw_ctx *);

	/**
	 * @get_budget: Reserve a budget before queueing a request. Once
	 * .queue_rq is run, it is the driver's responsibility to release the
	 * reserved budget. The failure case of .get_budget also has to be
	 * handled to avoid I/O deadlock.
	 */
	bool (*get_budget)(struct request_queue *);

	/**
	 * @put_budget: Release the reserved budget.
	 */
	void (*put_budget)(struct request_queue *);

	/**
	 * @timeout: Called on request timeout.
	 */
	enum blk_eh_timer_return (*timeout)(struct request *, bool);

	/**
	 * @poll: Called to poll for completion of a specific tag.
	 */
	int (*poll)(struct blk_mq_hw_ctx *);

	/**
	 * @complete: Mark the request as complete.
	 */
	void (*complete)(struct request *);

	/**
	 * @init_hctx: Called when the block layer side of a hardware queue has
	 * been set up, allowing the driver to allocate/init matching
	 * structures.
	 */
	int (*init_hctx)(struct blk_mq_hw_ctx *, void *, unsigned int);
	/**
	 * @exit_hctx: Ditto for exit/teardown.
	 */
	void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int);

	/**
	 * @init_request: Called for every command allocated by the block layer
	 * to allow the driver to set up driver specific data.
	 *
	 * A tag greater than or equal to queue_depth is used for setting up
	 * the flush request.
	 */
	int (*init_request)(struct blk_mq_tag_set *set, struct request *,
			    unsigned int, unsigned int);
	/**
	 * @exit_request: Ditto for exit/teardown.
	 */
	void (*exit_request)(struct blk_mq_tag_set *set, struct request *,
			     unsigned int);

	/**
	 * @initialize_rq_fn: Called from inside blk_get_request().
	 */
	void (*initialize_rq_fn)(struct request *rq);

	/**
	 * @cleanup_rq: Called before freeing a request that has not completed
	 * yet; usually used for freeing the driver's private data.
	 */
	void (*cleanup_rq)(struct request *);

	/**
	 * @busy: If set, returns whether or not this queue is currently busy.
	 */
	bool (*busy)(struct request_queue *);

	/**
	 * @map_queues: This allows drivers to specify their own queue mapping
	 * by overriding the setup-time function that builds the mq_map.
	 */
	int (*map_queues)(struct blk_mq_tag_set *set);

#ifdef CONFIG_BLK_DEBUG_FS
	/**
	 * @show_rq: Used by the debugfs implementation to show driver-specific
	 * information about a request.
	 */
	void (*show_rq)(struct seq_file *m, struct request *rq);
#endif
};

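/*
 * Illustrative sketch (hypothetical "my_queue_rq", "struct my_cmd" and
 * my_dev_* helpers): a minimal .queue_rq implementation starts the request,
 * hands it to the hardware and reports the queueing status back to the
 * block layer:
 *
 *	static blk_status_t my_queue_rq(struct blk_mq_hw_ctx *hctx,
 *					const struct blk_mq_queue_data *bd)
 *	{
 *		struct request *rq = bd->rq;
 *		struct my_cmd *cmd = blk_mq_rq_to_pdu(rq);
 *
 *		if (!my_dev_has_resources(hctx->driver_data))
 *			return BLK_STS_RESOURCE;
 *
 *		blk_mq_start_request(rq);
 *		my_dev_submit(hctx->driver_data, cmd);
 *		return BLK_STS_OK;
 *	}
 *
 * The completion path later ends the request with blk_mq_complete_request()
 * or blk_mq_end_request().
 */
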
enum {
	BLK_MQ_F_SHOULD_MERGE = 1 << 0,
	BLK_MQ_F_TAG_SHARED = 1 << 1,
	/*
	 * Set when this device requires an underlying blk-mq device for
	 * completing IO.
	 */
	BLK_MQ_F_STACKING = 1 << 2,
	BLK_MQ_F_BLOCKING = 1 << 5,
	BLK_MQ_F_NO_SCHED = 1 << 6,
	BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
	BLK_MQ_F_ALLOC_POLICY_BITS = 1,

	BLK_MQ_S_STOPPED = 0,
	BLK_MQ_S_TAG_ACTIVE = 1,
	BLK_MQ_S_SCHED_RESTART = 2,

	/* hw queue is inactive after all its CPUs become offline */
	BLK_MQ_S_INACTIVE = 3,

	BLK_MQ_MAX_DEPTH = 10240,

	BLK_MQ_CPU_WORK_BATCH = 8,
};
#define BLK_MQ_FLAG_TO_ALLOC_POLICY(flags) \
	((flags >> BLK_MQ_F_ALLOC_POLICY_START_BIT) & \
		((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1))
#define BLK_ALLOC_POLICY_TO_MQ_FLAG(policy) \
	((policy & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \
		<< BLK_MQ_F_ALLOC_POLICY_START_BIT)

struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *);
struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
		void *queuedata);
struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
		struct request_queue *q, bool elevator_init);
struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set,
		const struct blk_mq_ops *ops, unsigned int queue_depth,
		unsigned int set_flags);
void blk_mq_unregister_dev(struct device *, struct request_queue *);

int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set);
void blk_mq_free_tag_set(struct blk_mq_tag_set *set);

void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);

void blk_mq_free_request(struct request *rq);

bool blk_mq_queue_inflight(struct request_queue *q);

enum {
	/* return when out of requests */
	BLK_MQ_REQ_NOWAIT = (__force blk_mq_req_flags_t)(1 << 0),
	/* allocate from reserved pool */
	BLK_MQ_REQ_RESERVED = (__force blk_mq_req_flags_t)(1 << 1),
	/* set RQF_PREEMPT */
	BLK_MQ_REQ_PREEMPT = (__force blk_mq_req_flags_t)(1 << 3),
};

struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
		blk_mq_req_flags_t flags);
struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
		unsigned int op, blk_mq_req_flags_t flags,
		unsigned int hctx_idx);
struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag);

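/*
 * Illustrative sketch (my_probe_device() and my_fill_and_execute() are
 * hypothetical stand-ins for issuing the request): allocating a
 * driver-internal passthrough request without sleeping on tag exhaustion:
 *
 *	static int my_probe_device(struct request_queue *q)
 *	{
 *		struct request *rq;
 *
 *		rq = blk_mq_alloc_request(q, REQ_OP_DRV_IN, BLK_MQ_REQ_NOWAIT);
 *		if (IS_ERR(rq))
 *			return PTR_ERR(rq);
 *
 *		my_fill_and_execute(rq);
 *		blk_mq_free_request(rq);
 *		return 0;
 *	}
 */
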
enum {
	BLK_MQ_UNIQUE_TAG_BITS = 16,
	BLK_MQ_UNIQUE_TAG_MASK = (1 << BLK_MQ_UNIQUE_TAG_BITS) - 1,
};

u32 blk_mq_unique_tag(struct request *rq);

static inline u16 blk_mq_unique_tag_to_hwq(u32 unique_tag)
{
	return unique_tag >> BLK_MQ_UNIQUE_TAG_BITS;
}

static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag)
{
	return unique_tag & BLK_MQ_UNIQUE_TAG_MASK;
}

/**
 * blk_mq_rq_state() - read the current MQ_RQ_* state of a request
 * @rq: target request.
 */
static inline enum mq_rq_state blk_mq_rq_state(struct request *rq)
{
	return READ_ONCE(rq->state);
}

static inline int blk_mq_request_started(struct request *rq)
{
	return blk_mq_rq_state(rq) != MQ_RQ_IDLE;
}

static inline int blk_mq_request_completed(struct request *rq)
{
	return blk_mq_rq_state(rq) == MQ_RQ_COMPLETE;
}

void blk_mq_start_request(struct request *rq);
void blk_mq_end_request(struct request *rq, blk_status_t error);
void __blk_mq_end_request(struct request *rq, blk_status_t error);

void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list);
void blk_mq_kick_requeue_list(struct request_queue *q);
void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs);
void blk_mq_complete_request(struct request *rq);
bool blk_mq_complete_request_remote(struct request *rq);
bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list,
			   struct bio *bio, unsigned int nr_segs);
bool blk_mq_queue_stopped(struct request_queue *q);
void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
void blk_mq_stop_hw_queues(struct request_queue *q);
void blk_mq_start_hw_queues(struct request_queue *q);
void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
void blk_mq_quiesce_queue(struct request_queue *q);
void blk_mq_unquiesce_queue(struct request_queue *q);
void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_run_hw_queues(struct request_queue *q, bool async);
void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs);
void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
		busy_tag_iter_fn *fn, void *priv);
void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset);
void blk_mq_freeze_queue(struct request_queue *q);
void blk_mq_unfreeze_queue(struct request_queue *q);
void blk_freeze_queue_start(struct request_queue *q);
void blk_mq_freeze_queue_wait(struct request_queue *q);
int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
		unsigned long timeout);

int blk_mq_map_queues(struct blk_mq_queue_map *qmap);
void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);

void blk_mq_quiesce_queue_nowait(struct request_queue *q);

unsigned int blk_mq_rq_cpu(struct request *rq);

bool __blk_should_fake_timeout(struct request_queue *q);
static inline bool blk_should_fake_timeout(struct request_queue *q)
{
	if (IS_ENABLED(CONFIG_FAIL_IO_TIMEOUT) &&
	    test_bit(QUEUE_FLAG_FAIL_IO, &q->queue_flags))
		return __blk_should_fake_timeout(q);
	return false;
}

/**
 * blk_mq_rq_from_pdu - cast a PDU to a request
 * @pdu: the PDU (Protocol Data Unit) to be cast
 *
 * Return: request
 *
 * Driver command data is immediately after the request. So subtract the
 * request size to get back to the original request.
 */
static inline struct request *blk_mq_rq_from_pdu(void *pdu)
{
	return pdu - sizeof(struct request);
}

/**
 * blk_mq_rq_to_pdu - cast a request to a PDU
 * @rq: the request to be cast
 *
 * Return: pointer to the PDU
 *
 * Driver command data is immediately after the request. So add the request
 * size to get the PDU.
 */
static inline void *blk_mq_rq_to_pdu(struct request *rq)
{
	return rq + 1;
}

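/*
 * Illustrative sketch (hypothetical "struct my_cmd"): with a tag set that
 * sets .cmd_size = sizeof(struct my_cmd), every request is followed by that
 * many driver-owned bytes, and the two helpers above convert between the
 * request and the driver command:
 *
 *	struct my_cmd *cmd = blk_mq_rq_to_pdu(rq);
 *
 * and, in a completion handler that only has the command at hand:
 *
 *	struct request *rq = blk_mq_rq_from_pdu(cmd);
 *
 *	blk_mq_end_request(rq, BLK_STS_OK);
 */
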
#define queue_for_each_hw_ctx(q, hctx, i) \
	for ((i) = 0; (i) < (q)->nr_hw_queues && \
	     ({ hctx = (q)->queue_hw_ctx[i]; 1; }); (i)++)

#define hctx_for_each_ctx(hctx, ctx, i) \
	for ((i) = 0; (i) < (hctx)->nr_ctx && \
	     ({ ctx = (hctx)->ctxs[(i)]; 1; }); (i)++)

static inline blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx,
		struct request *rq)
{
	if (rq->tag != -1)
		return rq->tag | (hctx->queue_num << BLK_QC_T_SHIFT);

	return rq->internal_tag | (hctx->queue_num << BLK_QC_T_SHIFT) |
			BLK_QC_T_INTERNAL;
}

static inline void blk_mq_cleanup_rq(struct request *rq)
{
	if (rq->q->mq_ops->cleanup_rq)
		rq->q->mq_ops->cleanup_rq(rq);
}

blk_qc_t blk_mq_submit_bio(struct bio *bio);

#endif