block/blk-mq.c at v6.5-rc6 · tjh.dev/kernel

tjh.dev / kernel
fork
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork
kernel / block / blk-mq.c
at v6.5-rc6 4851 lines 124 kB view raw
wrap content
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Block multiqueue core code
   4 *
   5 * Copyright (C) 2013-2014 Jens Axboe
   6 * Copyright (C) 2013-2014 Christoph Hellwig
   7 */
   8#include <linux/kernel.h>
   9#include <linux/module.h>
  10#include <linux/backing-dev.h>
  11#include <linux/bio.h>
  12#include <linux/blkdev.h>
  13#include <linux/blk-integrity.h>
  14#include <linux/kmemleak.h>
  15#include <linux/mm.h>
  16#include <linux/init.h>
  17#include <linux/slab.h>
  18#include <linux/workqueue.h>
  19#include <linux/smp.h>
  20#include <linux/interrupt.h>
  21#include <linux/llist.h>
  22#include <linux/cpu.h>
  23#include <linux/cache.h>
  24#include <linux/sched/sysctl.h>
  25#include <linux/sched/topology.h>
  26#include <linux/sched/signal.h>
  27#include <linux/delay.h>
  28#include <linux/crash_dump.h>
  29#include <linux/prefetch.h>
  30#include <linux/blk-crypto.h>
  31#include <linux/part_stat.h>
  32
  33#include <trace/events/block.h>
  34
  35#include <linux/t10-pi.h>
  36#include "blk.h"
  37#include "blk-mq.h"
  38#include "blk-mq-debugfs.h"
  39#include "blk-pm.h"
  40#include "blk-stat.h"
  41#include "blk-mq-sched.h"
  42#include "blk-rq-qos.h"
  43#include "blk-ioprio.h"
  44
  45static DEFINE_PER_CPU(struct llist_head, blk_cpu_done);
  46
  47static void blk_mq_insert_request(struct request *rq, blk_insert_t flags);
  48static void blk_mq_request_bypass_insert(struct request *rq,
  49		blk_insert_t flags);
  50static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
  51		struct list_head *list);
  52static int blk_hctx_poll(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
  53			 struct io_comp_batch *iob, unsigned int flags);
  54
  55/*
  56 * Check if any of the ctx, dispatch list or elevator
  57 * have pending work in this hardware queue.
  58 */
  59static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
  60{
  61	return !list_empty_careful(&hctx->dispatch) ||
  62		sbitmap_any_bit_set(&hctx->ctx_map) ||
  63			blk_mq_sched_has_work(hctx);
  64}
  65
  66/*
  67 * Mark this ctx as having pending work in this hardware queue
  68 */
  69static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
  70				     struct blk_mq_ctx *ctx)
  71{
  72	const int bit = ctx->index_hw[hctx->type];
  73
  74	if (!sbitmap_test_bit(&hctx->ctx_map, bit))
  75		sbitmap_set_bit(&hctx->ctx_map, bit);
  76}
  77
  78static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
  79				      struct blk_mq_ctx *ctx)
  80{
  81	const int bit = ctx->index_hw[hctx->type];
  82
  83	sbitmap_clear_bit(&hctx->ctx_map, bit);
  84}
  85
  86struct mq_inflight {
  87	struct block_device *part;
  88	unsigned int inflight[2];
  89};
  90
  91static bool blk_mq_check_inflight(struct request *rq, void *priv)
  92{
  93	struct mq_inflight *mi = priv;
  94
  95	if (rq->part && blk_do_io_stat(rq) &&
  96	    (!mi->part->bd_partno || rq->part == mi->part) &&
  97	    blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT)
  98		mi->inflight[rq_data_dir(rq)]++;
  99
 100	return true;
 101}
 102
 103unsigned int blk_mq_in_flight(struct request_queue *q,
 104		struct block_device *part)
 105{
 106	struct mq_inflight mi = { .part = part };
 107
 108	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
 109
 110	return mi.inflight[0] + mi.inflight[1];
 111}
 112
 113void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
 114		unsigned int inflight[2])
 115{
 116	struct mq_inflight mi = { .part = part };
 117
 118	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
 119	inflight[0] = mi.inflight[0];
 120	inflight[1] = mi.inflight[1];
 121}
 122
 123void blk_freeze_queue_start(struct request_queue *q)
 124{
 125	mutex_lock(&q->mq_freeze_lock);
 126	if (++q->mq_freeze_depth == 1) {
 127		percpu_ref_kill(&q->q_usage_counter);
 128		mutex_unlock(&q->mq_freeze_lock);
 129		if (queue_is_mq(q))
 130			blk_mq_run_hw_queues(q, false);
 131	} else {
 132		mutex_unlock(&q->mq_freeze_lock);
 133	}
 134}
 135EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
 136
 137void blk_mq_freeze_queue_wait(struct request_queue *q)
 138{
 139	wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
 140}
 141EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait);
 142
 143int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
 144				     unsigned long timeout)
 145{
 146	return wait_event_timeout(q->mq_freeze_wq,
 147					percpu_ref_is_zero(&q->q_usage_counter),
 148					timeout);
 149}
 150EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout);
 151
 152/*
 153 * Guarantee no request is in use, so we can change any data structure of
 154 * the queue afterward.
 155 */
 156void blk_freeze_queue(struct request_queue *q)
 157{
 158	/*
 159	 * In the !blk_mq case we are only calling this to kill the
 160	 * q_usage_counter, otherwise this increases the freeze depth
 161	 * and waits for it to return to zero.  For this reason there is
 162	 * no blk_unfreeze_queue(), and blk_freeze_queue() is not
 163	 * exported to drivers as the only user for unfreeze is blk_mq.
 164	 */
 165	blk_freeze_queue_start(q);
 166	blk_mq_freeze_queue_wait(q);
 167}
 168
 169void blk_mq_freeze_queue(struct request_queue *q)
 170{
 171	/*
 172	 * ...just an alias to keep freeze and unfreeze actions balanced
 173	 * in the blk_mq_* namespace
 174	 */
 175	blk_freeze_queue(q);
 176}
 177EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
 178
 179void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic)
 180{
 181	mutex_lock(&q->mq_freeze_lock);
 182	if (force_atomic)
 183		q->q_usage_counter.data->force_atomic = true;
 184	q->mq_freeze_depth--;
 185	WARN_ON_ONCE(q->mq_freeze_depth < 0);
 186	if (!q->mq_freeze_depth) {
 187		percpu_ref_resurrect(&q->q_usage_counter);
 188		wake_up_all(&q->mq_freeze_wq);
 189	}
 190	mutex_unlock(&q->mq_freeze_lock);
 191}
 192
 193void blk_mq_unfreeze_queue(struct request_queue *q)
 194{
 195	__blk_mq_unfreeze_queue(q, false);
 196}
 197EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
 198
 199/*
 200 * FIXME: replace the scsi_internal_device_*block_nowait() calls in the
 201 * mpt3sas driver such that this function can be removed.
 202 */
 203void blk_mq_quiesce_queue_nowait(struct request_queue *q)
 204{
 205	unsigned long flags;
 206
 207	spin_lock_irqsave(&q->queue_lock, flags);
 208	if (!q->quiesce_depth++)
 209		blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q);
 210	spin_unlock_irqrestore(&q->queue_lock, flags);
 211}
 212EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
 213
 214/**
 215 * blk_mq_wait_quiesce_done() - wait until in-progress quiesce is done
 216 * @set: tag_set to wait on
 217 *
 218 * Note: it is driver's responsibility for making sure that quiesce has
 219 * been started on or more of the request_queues of the tag_set.  This
 220 * function only waits for the quiesce on those request_queues that had
 221 * the quiesce flag set using blk_mq_quiesce_queue_nowait.
 222 */
 223void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set)
 224{
 225	if (set->flags & BLK_MQ_F_BLOCKING)
 226		synchronize_srcu(set->srcu);
 227	else
 228		synchronize_rcu();
 229}
 230EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done);
 231
 232/**
 233 * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished
 234 * @q: request queue.
 235 *
 236 * Note: this function does not prevent that the struct request end_io()
 237 * callback function is invoked. Once this function is returned, we make
 238 * sure no dispatch can happen until the queue is unquiesced via
 239 * blk_mq_unquiesce_queue().
 240 */
 241void blk_mq_quiesce_queue(struct request_queue *q)
 242{
 243	blk_mq_quiesce_queue_nowait(q);
 244	/* nothing to wait for non-mq queues */
 245	if (queue_is_mq(q))
 246		blk_mq_wait_quiesce_done(q->tag_set);
 247}
 248EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
 249
 250/*
 251 * blk_mq_unquiesce_queue() - counterpart of blk_mq_quiesce_queue()
 252 * @q: request queue.
 253 *
 254 * This function recovers queue into the state before quiescing
 255 * which is done by blk_mq_quiesce_queue.
 256 */
 257void blk_mq_unquiesce_queue(struct request_queue *q)
 258{
 259	unsigned long flags;
 260	bool run_queue = false;
 261
 262	spin_lock_irqsave(&q->queue_lock, flags);
 263	if (WARN_ON_ONCE(q->quiesce_depth <= 0)) {
 264		;
 265	} else if (!--q->quiesce_depth) {
 266		blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
 267		run_queue = true;
 268	}
 269	spin_unlock_irqrestore(&q->queue_lock, flags);
 270
 271	/* dispatch requests which are inserted during quiescing */
 272	if (run_queue)
 273		blk_mq_run_hw_queues(q, true);
 274}
 275EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);
 276
 277void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set)
 278{
 279	struct request_queue *q;
 280
 281	mutex_lock(&set->tag_list_lock);
 282	list_for_each_entry(q, &set->tag_list, tag_set_list) {
 283		if (!blk_queue_skip_tagset_quiesce(q))
 284			blk_mq_quiesce_queue_nowait(q);
 285	}
 286	blk_mq_wait_quiesce_done(set);
 287	mutex_unlock(&set->tag_list_lock);
 288}
 289EXPORT_SYMBOL_GPL(blk_mq_quiesce_tagset);
 290
 291void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set)
 292{
 293	struct request_queue *q;
 294
 295	mutex_lock(&set->tag_list_lock);
 296	list_for_each_entry(q, &set->tag_list, tag_set_list) {
 297		if (!blk_queue_skip_tagset_quiesce(q))
 298			blk_mq_unquiesce_queue(q);
 299	}
 300	mutex_unlock(&set->tag_list_lock);
 301}
 302EXPORT_SYMBOL_GPL(blk_mq_unquiesce_tagset);
 303
 304void blk_mq_wake_waiters(struct request_queue *q)
 305{
 306	struct blk_mq_hw_ctx *hctx;
 307	unsigned long i;
 308
 309	queue_for_each_hw_ctx(q, hctx, i)
 310		if (blk_mq_hw_queue_mapped(hctx))
 311			blk_mq_tag_wakeup_all(hctx->tags, true);
 312}
 313
 314void blk_rq_init(struct request_queue *q, struct request *rq)
 315{
 316	memset(rq, 0, sizeof(*rq));
 317
 318	INIT_LIST_HEAD(&rq->queuelist);
 319	rq->q = q;
 320	rq->__sector = (sector_t) -1;
 321	INIT_HLIST_NODE(&rq->hash);
 322	RB_CLEAR_NODE(&rq->rb_node);
 323	rq->tag = BLK_MQ_NO_TAG;
 324	rq->internal_tag = BLK_MQ_NO_TAG;
 325	rq->start_time_ns = ktime_get_ns();
 326	rq->part = NULL;
 327	blk_crypto_rq_set_defaults(rq);
 328}
 329EXPORT_SYMBOL(blk_rq_init);
 330
 331/* Set start and alloc time when the allocated request is actually used */
 332static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns)
 333{
 334	if (blk_mq_need_time_stamp(rq))
 335		rq->start_time_ns = ktime_get_ns();
 336	else
 337		rq->start_time_ns = 0;
 338
 339#ifdef CONFIG_BLK_RQ_ALLOC_TIME
 340	if (blk_queue_rq_alloc_time(rq->q))
 341		rq->alloc_time_ns = alloc_time_ns ?: rq->start_time_ns;
 342	else
 343		rq->alloc_time_ns = 0;
 344#endif
 345}
 346
 347static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 348		struct blk_mq_tags *tags, unsigned int tag)
 349{
 350	struct blk_mq_ctx *ctx = data->ctx;
 351	struct blk_mq_hw_ctx *hctx = data->hctx;
 352	struct request_queue *q = data->q;
 353	struct request *rq = tags->static_rqs[tag];
 354
 355	rq->q = q;
 356	rq->mq_ctx = ctx;
 357	rq->mq_hctx = hctx;
 358	rq->cmd_flags = data->cmd_flags;
 359
 360	if (data->flags & BLK_MQ_REQ_PM)
 361		data->rq_flags |= RQF_PM;
 362	if (blk_queue_io_stat(q))
 363		data->rq_flags |= RQF_IO_STAT;
 364	rq->rq_flags = data->rq_flags;
 365
 366	if (data->rq_flags & RQF_SCHED_TAGS) {
 367		rq->tag = BLK_MQ_NO_TAG;
 368		rq->internal_tag = tag;
 369	} else {
 370		rq->tag = tag;
 371		rq->internal_tag = BLK_MQ_NO_TAG;
 372	}
 373	rq->timeout = 0;
 374
 375	rq->part = NULL;
 376	rq->io_start_time_ns = 0;
 377	rq->stats_sectors = 0;
 378	rq->nr_phys_segments = 0;
 379#if defined(CONFIG_BLK_DEV_INTEGRITY)
 380	rq->nr_integrity_segments = 0;
 381#endif
 382	rq->end_io = NULL;
 383	rq->end_io_data = NULL;
 384
 385	blk_crypto_rq_set_defaults(rq);
 386	INIT_LIST_HEAD(&rq->queuelist);
 387	/* tag was already set */
 388	WRITE_ONCE(rq->deadline, 0);
 389	req_ref_set(rq, 1);
 390
 391	if (rq->rq_flags & RQF_USE_SCHED) {
 392		struct elevator_queue *e = data->q->elevator;
 393
 394		INIT_HLIST_NODE(&rq->hash);
 395		RB_CLEAR_NODE(&rq->rb_node);
 396
 397		if (e->type->ops.prepare_request)
 398			e->type->ops.prepare_request(rq);
 399	}
 400
 401	return rq;
 402}
 403
 404static inline struct request *
 405__blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data)
 406{
 407	unsigned int tag, tag_offset;
 408	struct blk_mq_tags *tags;
 409	struct request *rq;
 410	unsigned long tag_mask;
 411	int i, nr = 0;
 412
 413	tag_mask = blk_mq_get_tags(data, data->nr_tags, &tag_offset);
 414	if (unlikely(!tag_mask))
 415		return NULL;
 416
 417	tags = blk_mq_tags_from_data(data);
 418	for (i = 0; tag_mask; i++) {
 419		if (!(tag_mask & (1UL << i)))
 420			continue;
 421		tag = tag_offset + i;
 422		prefetch(tags->static_rqs[tag]);
 423		tag_mask &= ~(1UL << i);
 424		rq = blk_mq_rq_ctx_init(data, tags, tag);
 425		rq_list_add(data->cached_rq, rq);
 426		nr++;
 427	}
 428	/* caller already holds a reference, add for remainder */
 429	percpu_ref_get_many(&data->q->q_usage_counter, nr - 1);
 430	data->nr_tags -= nr;
 431
 432	return rq_list_pop(data->cached_rq);
 433}
 434
 435static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
 436{
 437	struct request_queue *q = data->q;
 438	u64 alloc_time_ns = 0;
 439	struct request *rq;
 440	unsigned int tag;
 441
 442	/* alloc_time includes depth and tag waits */
 443	if (blk_queue_rq_alloc_time(q))
 444		alloc_time_ns = ktime_get_ns();
 445
 446	if (data->cmd_flags & REQ_NOWAIT)
 447		data->flags |= BLK_MQ_REQ_NOWAIT;
 448
 449	if (q->elevator) {
 450		/*
 451		 * All requests use scheduler tags when an I/O scheduler is
 452		 * enabled for the queue.
 453		 */
 454		data->rq_flags |= RQF_SCHED_TAGS;
 455
 456		/*
 457		 * Flush/passthrough requests are special and go directly to the
 458		 * dispatch list.
 459		 */
 460		if ((data->cmd_flags & REQ_OP_MASK) != REQ_OP_FLUSH &&
 461		    !blk_op_is_passthrough(data->cmd_flags)) {
 462			struct elevator_mq_ops *ops = &q->elevator->type->ops;
 463
 464			WARN_ON_ONCE(data->flags & BLK_MQ_REQ_RESERVED);
 465
 466			data->rq_flags |= RQF_USE_SCHED;
 467			if (ops->limit_depth)
 468				ops->limit_depth(data->cmd_flags, data);
 469		}
 470	}
 471
 472retry:
 473	data->ctx = blk_mq_get_ctx(q);
 474	data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
 475	if (!(data->rq_flags & RQF_SCHED_TAGS))
 476		blk_mq_tag_busy(data->hctx);
 477
 478	if (data->flags & BLK_MQ_REQ_RESERVED)
 479		data->rq_flags |= RQF_RESV;
 480
 481	/*
 482	 * Try batched alloc if we want more than 1 tag.
 483	 */
 484	if (data->nr_tags > 1) {
 485		rq = __blk_mq_alloc_requests_batch(data);
 486		if (rq) {
 487			blk_mq_rq_time_init(rq, alloc_time_ns);
 488			return rq;
 489		}
 490		data->nr_tags = 1;
 491	}
 492
 493	/*
 494	 * Waiting allocations only fail because of an inactive hctx.  In that
 495	 * case just retry the hctx assignment and tag allocation as CPU hotplug
 496	 * should have migrated us to an online CPU by now.
 497	 */
 498	tag = blk_mq_get_tag(data);
 499	if (tag == BLK_MQ_NO_TAG) {
 500		if (data->flags & BLK_MQ_REQ_NOWAIT)
 501			return NULL;
 502		/*
 503		 * Give up the CPU and sleep for a random short time to
 504		 * ensure that thread using a realtime scheduling class
 505		 * are migrated off the CPU, and thus off the hctx that
 506		 * is going away.
 507		 */
 508		msleep(3);
 509		goto retry;
 510	}
 511
 512	rq = blk_mq_rq_ctx_init(data, blk_mq_tags_from_data(data), tag);
 513	blk_mq_rq_time_init(rq, alloc_time_ns);
 514	return rq;
 515}
 516
 517static struct request *blk_mq_rq_cache_fill(struct request_queue *q,
 518					    struct blk_plug *plug,
 519					    blk_opf_t opf,
 520					    blk_mq_req_flags_t flags)
 521{
 522	struct blk_mq_alloc_data data = {
 523		.q		= q,
 524		.flags		= flags,
 525		.cmd_flags	= opf,
 526		.nr_tags	= plug->nr_ios,
 527		.cached_rq	= &plug->cached_rq,
 528	};
 529	struct request *rq;
 530
 531	if (blk_queue_enter(q, flags))
 532		return NULL;
 533
 534	plug->nr_ios = 1;
 535
 536	rq = __blk_mq_alloc_requests(&data);
 537	if (unlikely(!rq))
 538		blk_queue_exit(q);
 539	return rq;
 540}
 541
 542static struct request *blk_mq_alloc_cached_request(struct request_queue *q,
 543						   blk_opf_t opf,
 544						   blk_mq_req_flags_t flags)
 545{
 546	struct blk_plug *plug = current->plug;
 547	struct request *rq;
 548
 549	if (!plug)
 550		return NULL;
 551
 552	if (rq_list_empty(plug->cached_rq)) {
 553		if (plug->nr_ios == 1)
 554			return NULL;
 555		rq = blk_mq_rq_cache_fill(q, plug, opf, flags);
 556		if (!rq)
 557			return NULL;
 558	} else {
 559		rq = rq_list_peek(&plug->cached_rq);
 560		if (!rq || rq->q != q)
 561			return NULL;
 562
 563		if (blk_mq_get_hctx_type(opf) != rq->mq_hctx->type)
 564			return NULL;
 565		if (op_is_flush(rq->cmd_flags) != op_is_flush(opf))
 566			return NULL;
 567
 568		plug->cached_rq = rq_list_next(rq);
 569		blk_mq_rq_time_init(rq, 0);
 570	}
 571
 572	rq->cmd_flags = opf;
 573	INIT_LIST_HEAD(&rq->queuelist);
 574	return rq;
 575}
 576
 577struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf,
 578		blk_mq_req_flags_t flags)
 579{
 580	struct request *rq;
 581
 582	rq = blk_mq_alloc_cached_request(q, opf, flags);
 583	if (!rq) {
 584		struct blk_mq_alloc_data data = {
 585			.q		= q,
 586			.flags		= flags,
 587			.cmd_flags	= opf,
 588			.nr_tags	= 1,
 589		};
 590		int ret;
 591
 592		ret = blk_queue_enter(q, flags);
 593		if (ret)
 594			return ERR_PTR(ret);
 595
 596		rq = __blk_mq_alloc_requests(&data);
 597		if (!rq)
 598			goto out_queue_exit;
 599	}
 600	rq->__data_len = 0;
 601	rq->__sector = (sector_t) -1;
 602	rq->bio = rq->biotail = NULL;
 603	return rq;
 604out_queue_exit:
 605	blk_queue_exit(q);
 606	return ERR_PTR(-EWOULDBLOCK);
 607}
 608EXPORT_SYMBOL(blk_mq_alloc_request);
 609
 610struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
 611	blk_opf_t opf, blk_mq_req_flags_t flags, unsigned int hctx_idx)
 612{
 613	struct blk_mq_alloc_data data = {
 614		.q		= q,
 615		.flags		= flags,
 616		.cmd_flags	= opf,
 617		.nr_tags	= 1,
 618	};
 619	u64 alloc_time_ns = 0;
 620	struct request *rq;
 621	unsigned int cpu;
 622	unsigned int tag;
 623	int ret;
 624
 625	/* alloc_time includes depth and tag waits */
 626	if (blk_queue_rq_alloc_time(q))
 627		alloc_time_ns = ktime_get_ns();
 628
 629	/*
 630	 * If the tag allocator sleeps we could get an allocation for a
 631	 * different hardware context.  No need to complicate the low level
 632	 * allocator for this for the rare use case of a command tied to
 633	 * a specific queue.
 634	 */
 635	if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)) ||
 636	    WARN_ON_ONCE(!(flags & BLK_MQ_REQ_RESERVED)))
 637		return ERR_PTR(-EINVAL);
 638
 639	if (hctx_idx >= q->nr_hw_queues)
 640		return ERR_PTR(-EIO);
 641
 642	ret = blk_queue_enter(q, flags);
 643	if (ret)
 644		return ERR_PTR(ret);
 645
 646	/*
 647	 * Check if the hardware context is actually mapped to anything.
 648	 * If not tell the caller that it should skip this queue.
 649	 */
 650	ret = -EXDEV;
 651	data.hctx = xa_load(&q->hctx_table, hctx_idx);
 652	if (!blk_mq_hw_queue_mapped(data.hctx))
 653		goto out_queue_exit;
 654	cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
 655	if (cpu >= nr_cpu_ids)
 656		goto out_queue_exit;
 657	data.ctx = __blk_mq_get_ctx(q, cpu);
 658
 659	if (q->elevator)
 660		data.rq_flags |= RQF_SCHED_TAGS;
 661	else
 662		blk_mq_tag_busy(data.hctx);
 663
 664	if (flags & BLK_MQ_REQ_RESERVED)
 665		data.rq_flags |= RQF_RESV;
 666
 667	ret = -EWOULDBLOCK;
 668	tag = blk_mq_get_tag(&data);
 669	if (tag == BLK_MQ_NO_TAG)
 670		goto out_queue_exit;
 671	rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag);
 672	blk_mq_rq_time_init(rq, alloc_time_ns);
 673	rq->__data_len = 0;
 674	rq->__sector = (sector_t) -1;
 675	rq->bio = rq->biotail = NULL;
 676	return rq;
 677
 678out_queue_exit:
 679	blk_queue_exit(q);
 680	return ERR_PTR(ret);
 681}
 682EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
 683
 684static void __blk_mq_free_request(struct request *rq)
 685{
 686	struct request_queue *q = rq->q;
 687	struct blk_mq_ctx *ctx = rq->mq_ctx;
 688	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
 689	const int sched_tag = rq->internal_tag;
 690
 691	blk_crypto_free_request(rq);
 692	blk_pm_mark_last_busy(rq);
 693	rq->mq_hctx = NULL;
 694
 695	if (rq->rq_flags & RQF_MQ_INFLIGHT)
 696		__blk_mq_dec_active_requests(hctx);
 697
 698	if (rq->tag != BLK_MQ_NO_TAG)
 699		blk_mq_put_tag(hctx->tags, ctx, rq->tag);
 700	if (sched_tag != BLK_MQ_NO_TAG)
 701		blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag);
 702	blk_mq_sched_restart(hctx);
 703	blk_queue_exit(q);
 704}
 705
 706void blk_mq_free_request(struct request *rq)
 707{
 708	struct request_queue *q = rq->q;
 709
 710	if ((rq->rq_flags & RQF_USE_SCHED) &&
 711	    q->elevator->type->ops.finish_request)
 712		q->elevator->type->ops.finish_request(rq);
 713
 714	if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
 715		laptop_io_completion(q->disk->bdi);
 716
 717	rq_qos_done(q, rq);
 718
 719	WRITE_ONCE(rq->state, MQ_RQ_IDLE);
 720	if (req_ref_put_and_test(rq))
 721		__blk_mq_free_request(rq);
 722}
 723EXPORT_SYMBOL_GPL(blk_mq_free_request);
 724
 725void blk_mq_free_plug_rqs(struct blk_plug *plug)
 726{
 727	struct request *rq;
 728
 729	while ((rq = rq_list_pop(&plug->cached_rq)) != NULL)
 730		blk_mq_free_request(rq);
 731}
 732
 733void blk_dump_rq_flags(struct request *rq, char *msg)
 734{
 735	printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg,
 736		rq->q->disk ? rq->q->disk->disk_name : "?",
 737		(__force unsigned long long) rq->cmd_flags);
 738
 739	printk(KERN_INFO "  sector %llu, nr/cnr %u/%u\n",
 740	       (unsigned long long)blk_rq_pos(rq),
 741	       blk_rq_sectors(rq), blk_rq_cur_sectors(rq));
 742	printk(KERN_INFO "  bio %p, biotail %p, len %u\n",
 743	       rq->bio, rq->biotail, blk_rq_bytes(rq));
 744}
 745EXPORT_SYMBOL(blk_dump_rq_flags);
 746
 747static void req_bio_endio(struct request *rq, struct bio *bio,
 748			  unsigned int nbytes, blk_status_t error)
 749{
 750	if (unlikely(error)) {
 751		bio->bi_status = error;
 752	} else if (req_op(rq) == REQ_OP_ZONE_APPEND) {
 753		/*
 754		 * Partial zone append completions cannot be supported as the
 755		 * BIO fragments may end up not being written sequentially.
 756		 */
 757		if (bio->bi_iter.bi_size != nbytes)
 758			bio->bi_status = BLK_STS_IOERR;
 759		else
 760			bio->bi_iter.bi_sector = rq->__sector;
 761	}
 762
 763	bio_advance(bio, nbytes);
 764
 765	if (unlikely(rq->rq_flags & RQF_QUIET))
 766		bio_set_flag(bio, BIO_QUIET);
 767	/* don't actually finish bio if it's part of flush sequence */
 768	if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ))
 769		bio_endio(bio);
 770}
 771
 772static void blk_account_io_completion(struct request *req, unsigned int bytes)
 773{
 774	if (req->part && blk_do_io_stat(req)) {
 775		const int sgrp = op_stat_group(req_op(req));
 776
 777		part_stat_lock();
 778		part_stat_add(req->part, sectors[sgrp], bytes >> 9);
 779		part_stat_unlock();
 780	}
 781}
 782
 783static void blk_print_req_error(struct request *req, blk_status_t status)
 784{
 785	printk_ratelimited(KERN_ERR
 786		"%s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x "
 787		"phys_seg %u prio class %u\n",
 788		blk_status_to_str(status),
 789		req->q->disk ? req->q->disk->disk_name : "?",
 790		blk_rq_pos(req), (__force u32)req_op(req),
 791		blk_op_str(req_op(req)),
 792		(__force u32)(req->cmd_flags & ~REQ_OP_MASK),
 793		req->nr_phys_segments,
 794		IOPRIO_PRIO_CLASS(req->ioprio));
 795}
 796
 797/*
 798 * Fully end IO on a request. Does not support partial completions, or
 799 * errors.
 800 */
 801static void blk_complete_request(struct request *req)
 802{
 803	const bool is_flush = (req->rq_flags & RQF_FLUSH_SEQ) != 0;
 804	int total_bytes = blk_rq_bytes(req);
 805	struct bio *bio = req->bio;
 806
 807	trace_block_rq_complete(req, BLK_STS_OK, total_bytes);
 808
 809	if (!bio)
 810		return;
 811
 812#ifdef CONFIG_BLK_DEV_INTEGRITY
 813	if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ)
 814		req->q->integrity.profile->complete_fn(req, total_bytes);
 815#endif
 816
 817	/*
 818	 * Upper layers may call blk_crypto_evict_key() anytime after the last
 819	 * bio_endio().  Therefore, the keyslot must be released before that.
 820	 */
 821	blk_crypto_rq_put_keyslot(req);
 822
 823	blk_account_io_completion(req, total_bytes);
 824
 825	do {
 826		struct bio *next = bio->bi_next;
 827
 828		/* Completion has already been traced */
 829		bio_clear_flag(bio, BIO_TRACE_COMPLETION);
 830
 831		if (req_op(req) == REQ_OP_ZONE_APPEND)
 832			bio->bi_iter.bi_sector = req->__sector;
 833
 834		if (!is_flush)
 835			bio_endio(bio);
 836		bio = next;
 837	} while (bio);
 838
 839	/*
 840	 * Reset counters so that the request stacking driver
 841	 * can find how many bytes remain in the request
 842	 * later.
 843	 */
 844	if (!req->end_io) {
 845		req->bio = NULL;
 846		req->__data_len = 0;
 847	}
 848}
 849
 850/**
 851 * blk_update_request - Complete multiple bytes without completing the request
 852 * @req:      the request being processed
 853 * @error:    block status code
 854 * @nr_bytes: number of bytes to complete for @req
 855 *
 856 * Description:
 857 *     Ends I/O on a number of bytes attached to @req, but doesn't complete
 858 *     the request structure even if @req doesn't have leftover.
 859 *     If @req has leftover, sets it up for the next range of segments.
 860 *
 861 *     Passing the result of blk_rq_bytes() as @nr_bytes guarantees
 862 *     %false return from this function.
 863 *
 864 * Note:
 865 *	The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in this function
 866 *      except in the consistency check at the end of this function.
 867 *
 868 * Return:
 869 *     %false - this request doesn't have any more data
 870 *     %true  - this request has more data
 871 **/
 872bool blk_update_request(struct request *req, blk_status_t error,
 873		unsigned int nr_bytes)
 874{
 875	int total_bytes;
 876
 877	trace_block_rq_complete(req, error, nr_bytes);
 878
 879	if (!req->bio)
 880		return false;
 881
 882#ifdef CONFIG_BLK_DEV_INTEGRITY
 883	if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ &&
 884	    error == BLK_STS_OK)
 885		req->q->integrity.profile->complete_fn(req, nr_bytes);
 886#endif
 887
 888	/*
 889	 * Upper layers may call blk_crypto_evict_key() anytime after the last
 890	 * bio_endio().  Therefore, the keyslot must be released before that.
 891	 */
 892	if (blk_crypto_rq_has_keyslot(req) && nr_bytes >= blk_rq_bytes(req))
 893		__blk_crypto_rq_put_keyslot(req);
 894
 895	if (unlikely(error && !blk_rq_is_passthrough(req) &&
 896		     !(req->rq_flags & RQF_QUIET)) &&
 897		     !test_bit(GD_DEAD, &req->q->disk->state)) {
 898		blk_print_req_error(req, error);
 899		trace_block_rq_error(req, error, nr_bytes);
 900	}
 901
 902	blk_account_io_completion(req, nr_bytes);
 903
 904	total_bytes = 0;
 905	while (req->bio) {
 906		struct bio *bio = req->bio;
 907		unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes);
 908
 909		if (bio_bytes == bio->bi_iter.bi_size)
 910			req->bio = bio->bi_next;
 911
 912		/* Completion has already been traced */
 913		bio_clear_flag(bio, BIO_TRACE_COMPLETION);
 914		req_bio_endio(req, bio, bio_bytes, error);
 915
 916		total_bytes += bio_bytes;
 917		nr_bytes -= bio_bytes;
 918
 919		if (!nr_bytes)
 920			break;
 921	}
 922
 923	/*
 924	 * completely done
 925	 */
 926	if (!req->bio) {
 927		/*
 928		 * Reset counters so that the request stacking driver
 929		 * can find how many bytes remain in the request
 930		 * later.
 931		 */
 932		req->__data_len = 0;
 933		return false;
 934	}
 935
 936	req->__data_len -= total_bytes;
 937
 938	/* update sector only for requests with clear definition of sector */
 939	if (!blk_rq_is_passthrough(req))
 940		req->__sector += total_bytes >> 9;
 941
 942	/* mixed attributes always follow the first bio */
 943	if (req->rq_flags & RQF_MIXED_MERGE) {
 944		req->cmd_flags &= ~REQ_FAILFAST_MASK;
 945		req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK;
 946	}
 947
 948	if (!(req->rq_flags & RQF_SPECIAL_PAYLOAD)) {
 949		/*
 950		 * If total number of sectors is less than the first segment
 951		 * size, something has gone terribly wrong.
 952		 */
 953		if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) {
 954			blk_dump_rq_flags(req, "request botched");
 955			req->__data_len = blk_rq_cur_bytes(req);
 956		}
 957
 958		/* recalculate the number of segments */
 959		req->nr_phys_segments = blk_recalc_rq_segments(req);
 960	}
 961
 962	return true;
 963}
 964EXPORT_SYMBOL_GPL(blk_update_request);
 965
 966static inline void blk_account_io_done(struct request *req, u64 now)
 967{
 968	trace_block_io_done(req);
 969
 970	/*
 971	 * Account IO completion.  flush_rq isn't accounted as a
 972	 * normal IO on queueing nor completion.  Accounting the
 973	 * containing request is enough.
 974	 */
 975	if (blk_do_io_stat(req) && req->part &&
 976	    !(req->rq_flags & RQF_FLUSH_SEQ)) {
 977		const int sgrp = op_stat_group(req_op(req));
 978
 979		part_stat_lock();
 980		update_io_ticks(req->part, jiffies, true);
 981		part_stat_inc(req->part, ios[sgrp]);
 982		part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns);
 983		part_stat_unlock();
 984	}
 985}
 986
 987static inline void blk_account_io_start(struct request *req)
 988{
 989	trace_block_io_start(req);
 990
 991	if (blk_do_io_stat(req)) {
 992		/*
 993		 * All non-passthrough requests are created from a bio with one
 994		 * exception: when a flush command that is part of a flush sequence
 995		 * generated by the state machine in blk-flush.c is cloned onto the
 996		 * lower device by dm-multipath we can get here without a bio.
 997		 */
 998		if (req->bio)
 999			req->part = req->bio->bi_bdev;
1000		else
1001			req->part = req->q->disk->part0;
1002
1003		part_stat_lock();
1004		update_io_ticks(req->part, jiffies, false);
1005		part_stat_unlock();
1006	}
1007}
1008
1009static inline void __blk_mq_end_request_acct(struct request *rq, u64 now)
1010{
1011	if (rq->rq_flags & RQF_STATS)
1012		blk_stat_add(rq, now);
1013
1014	blk_mq_sched_completed_request(rq, now);
1015	blk_account_io_done(rq, now);
1016}
1017
1018inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
1019{
1020	if (blk_mq_need_time_stamp(rq))
1021		__blk_mq_end_request_acct(rq, ktime_get_ns());
1022
1023	if (rq->end_io) {
1024		rq_qos_done(rq->q, rq);
1025		if (rq->end_io(rq, error) == RQ_END_IO_FREE)
1026			blk_mq_free_request(rq);
1027	} else {
1028		blk_mq_free_request(rq);
1029	}
1030}
1031EXPORT_SYMBOL(__blk_mq_end_request);
1032
1033void blk_mq_end_request(struct request *rq, blk_status_t error)
1034{
1035	if (blk_update_request(rq, error, blk_rq_bytes(rq)))
1036		BUG();
1037	__blk_mq_end_request(rq, error);
1038}
1039EXPORT_SYMBOL(blk_mq_end_request);
1040
1041#define TAG_COMP_BATCH		32
1042
1043static inline void blk_mq_flush_tag_batch(struct blk_mq_hw_ctx *hctx,
1044					  int *tag_array, int nr_tags)
1045{
1046	struct request_queue *q = hctx->queue;
1047
1048	/*
1049	 * All requests should have been marked as RQF_MQ_INFLIGHT, so
1050	 * update hctx->nr_active in batch
1051	 */
1052	if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
1053		__blk_mq_sub_active_requests(hctx, nr_tags);
1054
1055	blk_mq_put_tags(hctx->tags, tag_array, nr_tags);
1056	percpu_ref_put_many(&q->q_usage_counter, nr_tags);
1057}
1058
1059void blk_mq_end_request_batch(struct io_comp_batch *iob)
1060{
1061	int tags[TAG_COMP_BATCH], nr_tags = 0;
1062	struct blk_mq_hw_ctx *cur_hctx = NULL;
1063	struct request *rq;
1064	u64 now = 0;
1065
1066	if (iob->need_ts)
1067		now = ktime_get_ns();
1068
1069	while ((rq = rq_list_pop(&iob->req_list)) != NULL) {
1070		prefetch(rq->bio);
1071		prefetch(rq->rq_next);
1072
1073		blk_complete_request(rq);
1074		if (iob->need_ts)
1075			__blk_mq_end_request_acct(rq, now);
1076
1077		rq_qos_done(rq->q, rq);
1078
1079		/*
1080		 * If end_io handler returns NONE, then it still has
1081		 * ownership of the request.
1082		 */
1083		if (rq->end_io && rq->end_io(rq, 0) == RQ_END_IO_NONE)
1084			continue;
1085
1086		WRITE_ONCE(rq->state, MQ_RQ_IDLE);
1087		if (!req_ref_put_and_test(rq))
1088			continue;
1089
1090		blk_crypto_free_request(rq);
1091		blk_pm_mark_last_busy(rq);
1092
1093		if (nr_tags == TAG_COMP_BATCH || cur_hctx != rq->mq_hctx) {
1094			if (cur_hctx)
1095				blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags);
1096			nr_tags = 0;
1097			cur_hctx = rq->mq_hctx;
1098		}
1099		tags[nr_tags++] = rq->tag;
1100	}
1101
1102	if (nr_tags)
1103		blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags);
1104}
1105EXPORT_SYMBOL_GPL(blk_mq_end_request_batch);
1106
1107static void blk_complete_reqs(struct llist_head *list)
1108{
1109	struct llist_node *entry = llist_reverse_order(llist_del_all(list));
1110	struct request *rq, *next;
1111
1112	llist_for_each_entry_safe(rq, next, entry, ipi_list)
1113		rq->q->mq_ops->complete(rq);
1114}
1115
1116static __latent_entropy void blk_done_softirq(struct softirq_action *h)
1117{
1118	blk_complete_reqs(this_cpu_ptr(&blk_cpu_done));
1119}
1120
1121static int blk_softirq_cpu_dead(unsigned int cpu)
1122{
1123	blk_complete_reqs(&per_cpu(blk_cpu_done, cpu));
1124	return 0;
1125}
1126
1127static void __blk_mq_complete_request_remote(void *data)
1128{
1129	__raise_softirq_irqoff(BLOCK_SOFTIRQ);
1130}
1131
1132static inline bool blk_mq_complete_need_ipi(struct request *rq)
1133{
1134	int cpu = raw_smp_processor_id();
1135
1136	if (!IS_ENABLED(CONFIG_SMP) ||
1137	    !test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags))
1138		return false;
1139	/*
1140	 * With force threaded interrupts enabled, raising softirq from an SMP
1141	 * function call will always result in waking the ksoftirqd thread.
1142	 * This is probably worse than completing the request on a different
1143	 * cache domain.
1144	 */
1145	if (force_irqthreads())
1146		return false;
1147
1148	/* same CPU or cache domain?  Complete locally */
1149	if (cpu == rq->mq_ctx->cpu ||
1150	    (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) &&
1151	     cpus_share_cache(cpu, rq->mq_ctx->cpu)))
1152		return false;
1153
1154	/* don't try to IPI to an offline CPU */
1155	return cpu_online(rq->mq_ctx->cpu);
1156}
1157
1158static void blk_mq_complete_send_ipi(struct request *rq)
1159{
1160	struct llist_head *list;
1161	unsigned int cpu;
1162
1163	cpu = rq->mq_ctx->cpu;
1164	list = &per_cpu(blk_cpu_done, cpu);
1165	if (llist_add(&rq->ipi_list, list)) {
1166		INIT_CSD(&rq->csd, __blk_mq_complete_request_remote, rq);
1167		smp_call_function_single_async(cpu, &rq->csd);
1168	}
1169}
1170
1171static void blk_mq_raise_softirq(struct request *rq)
1172{
1173	struct llist_head *list;
1174
1175	preempt_disable();
1176	list = this_cpu_ptr(&blk_cpu_done);
1177	if (llist_add(&rq->ipi_list, list))
1178		raise_softirq(BLOCK_SOFTIRQ);
1179	preempt_enable();
1180}
1181
1182bool blk_mq_complete_request_remote(struct request *rq)
1183{
1184	WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
1185
1186	/*
1187	 * For request which hctx has only one ctx mapping,
1188	 * or a polled request, always complete locally,
1189	 * it's pointless to redirect the completion.
1190	 */
1191	if ((rq->mq_hctx->nr_ctx == 1 &&
1192	     rq->mq_ctx->cpu == raw_smp_processor_id()) ||
1193	     rq->cmd_flags & REQ_POLLED)
1194		return false;
1195
1196	if (blk_mq_complete_need_ipi(rq)) {
1197		blk_mq_complete_send_ipi(rq);
1198		return true;
1199	}
1200
1201	if (rq->q->nr_hw_queues == 1) {
1202		blk_mq_raise_softirq(rq);
1203		return true;
1204	}
1205	return false;
1206}
1207EXPORT_SYMBOL_GPL(blk_mq_complete_request_remote);
1208
1209/**
1210 * blk_mq_complete_request - end I/O on a request
1211 * @rq:		the request being processed
1212 *
1213 * Description:
1214 *	Complete a request by scheduling the ->complete_rq operation.
1215 **/
1216void blk_mq_complete_request(struct request *rq)
1217{
1218	if (!blk_mq_complete_request_remote(rq))
1219		rq->q->mq_ops->complete(rq);
1220}
1221EXPORT_SYMBOL(blk_mq_complete_request);
1222
1223/**
1224 * blk_mq_start_request - Start processing a request
1225 * @rq: Pointer to request to be started
1226 *
1227 * Function used by device drivers to notify the block layer that a request
1228 * is going to be processed now, so blk layer can do proper initializations
1229 * such as starting the timeout timer.
1230 */
1231void blk_mq_start_request(struct request *rq)
1232{
1233	struct request_queue *q = rq->q;
1234
1235	trace_block_rq_issue(rq);
1236
1237	if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
1238		rq->io_start_time_ns = ktime_get_ns();
1239		rq->stats_sectors = blk_rq_sectors(rq);
1240		rq->rq_flags |= RQF_STATS;
1241		rq_qos_issue(q, rq);
1242	}
1243
1244	WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);
1245
1246	blk_add_timer(rq);
1247	WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT);
1248
1249#ifdef CONFIG_BLK_DEV_INTEGRITY
1250	if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE)
1251		q->integrity.profile->prepare_fn(rq);
1252#endif
1253	if (rq->bio && rq->bio->bi_opf & REQ_POLLED)
1254	        WRITE_ONCE(rq->bio->bi_cookie, rq->mq_hctx->queue_num);
1255}
1256EXPORT_SYMBOL(blk_mq_start_request);
1257
1258/*
1259 * Allow 2x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple
1260 * queues. This is important for md arrays to benefit from merging
1261 * requests.
1262 */
1263static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug)
1264{
1265	if (plug->multiple_queues)
1266		return BLK_MAX_REQUEST_COUNT * 2;
1267	return BLK_MAX_REQUEST_COUNT;
1268}
1269
1270static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
1271{
1272	struct request *last = rq_list_peek(&plug->mq_list);
1273
1274	if (!plug->rq_count) {
1275		trace_block_plug(rq->q);
1276	} else if (plug->rq_count >= blk_plug_max_rq_count(plug) ||
1277		   (!blk_queue_nomerges(rq->q) &&
1278		    blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
1279		blk_mq_flush_plug_list(plug, false);
1280		last = NULL;
1281		trace_block_plug(rq->q);
1282	}
1283
1284	if (!plug->multiple_queues && last && last->q != rq->q)
1285		plug->multiple_queues = true;
1286	/*
1287	 * Any request allocated from sched tags can't be issued to
1288	 * ->queue_rqs() directly
1289	 */
1290	if (!plug->has_elevator && (rq->rq_flags & RQF_SCHED_TAGS))
1291		plug->has_elevator = true;
1292	rq->rq_next = NULL;
1293	rq_list_add(&plug->mq_list, rq);
1294	plug->rq_count++;
1295}
1296
1297/**
1298 * blk_execute_rq_nowait - insert a request to I/O scheduler for execution
1299 * @rq:		request to insert
1300 * @at_head:    insert request at head or tail of queue
1301 *
1302 * Description:
1303 *    Insert a fully prepared request at the back of the I/O scheduler queue
1304 *    for execution.  Don't wait for completion.
1305 *
1306 * Note:
1307 *    This function will invoke @done directly if the queue is dead.
1308 */
1309void blk_execute_rq_nowait(struct request *rq, bool at_head)
1310{
1311	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
1312
1313	WARN_ON(irqs_disabled());
1314	WARN_ON(!blk_rq_is_passthrough(rq));
1315
1316	blk_account_io_start(rq);
1317
1318	/*
1319	 * As plugging can be enabled for passthrough requests on a zoned
1320	 * device, directly accessing the plug instead of using blk_mq_plug()
1321	 * should not have any consequences.
1322	 */
1323	if (current->plug && !at_head) {
1324		blk_add_rq_to_plug(current->plug, rq);
1325		return;
1326	}
1327
1328	blk_mq_insert_request(rq, at_head ? BLK_MQ_INSERT_AT_HEAD : 0);
1329	blk_mq_run_hw_queue(hctx, false);
1330}
1331EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
1332
/*
 * On-stack wait context used by blk_execute_rq(); a pointer to it is stored
 * in rq->end_io_data and filled in by blk_end_sync_rq().
 */
struct blk_rq_wait {
	/* signalled by blk_end_sync_rq() when the request has ended */
	struct completion done;
	/* status passed to the end_io handler; returned by blk_execute_rq() */
	blk_status_t ret;
};
1337
1338static enum rq_end_io_ret blk_end_sync_rq(struct request *rq, blk_status_t ret)
1339{
1340	struct blk_rq_wait *wait = rq->end_io_data;
1341
1342	wait->ret = ret;
1343	complete(&wait->done);
1344	return RQ_END_IO_NONE;
1345}
1346
1347bool blk_rq_is_poll(struct request *rq)
1348{
1349	if (!rq->mq_hctx)
1350		return false;
1351	if (rq->mq_hctx->type != HCTX_TYPE_POLL)
1352		return false;
1353	return true;
1354}
1355EXPORT_SYMBOL_GPL(blk_rq_is_poll);
1356
1357static void blk_rq_poll_completion(struct request *rq, struct completion *wait)
1358{
1359	do {
1360		blk_hctx_poll(rq->q, rq->mq_hctx, NULL, 0);
1361		cond_resched();
1362	} while (!completion_done(wait));
1363}
1364
1365/**
1366 * blk_execute_rq - insert a request into queue for execution
1367 * @rq:		request to insert
1368 * @at_head:    insert request at head or tail of queue
1369 *
1370 * Description:
1371 *    Insert a fully prepared request at the back of the I/O scheduler queue
1372 *    for execution and wait for completion.
1373 * Return: The blk_status_t result provided to blk_mq_end_request().
1374 */
1375blk_status_t blk_execute_rq(struct request *rq, bool at_head)
1376{
1377	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
1378	struct blk_rq_wait wait = {
1379		.done = COMPLETION_INITIALIZER_ONSTACK(wait.done),
1380	};
1381
1382	WARN_ON(irqs_disabled());
1383	WARN_ON(!blk_rq_is_passthrough(rq));
1384
1385	rq->end_io_data = &wait;
1386	rq->end_io = blk_end_sync_rq;
1387
1388	blk_account_io_start(rq);
1389	blk_mq_insert_request(rq, at_head ? BLK_MQ_INSERT_AT_HEAD : 0);
1390	blk_mq_run_hw_queue(hctx, false);
1391
1392	if (blk_rq_is_poll(rq)) {
1393		blk_rq_poll_completion(rq, &wait.done);
1394	} else {
1395		/*
1396		 * Prevent hang_check timer from firing at us during very long
1397		 * I/O
1398		 */
1399		unsigned long hang_check = sysctl_hung_task_timeout_secs;
1400
1401		if (hang_check)
1402			while (!wait_for_completion_io_timeout(&wait.done,
1403					hang_check * (HZ/2)))
1404				;
1405		else
1406			wait_for_completion_io(&wait.done);
1407	}
1408
1409	return wait.ret;
1410}
1411EXPORT_SYMBOL(blk_execute_rq);
1412
1413static void __blk_mq_requeue_request(struct request *rq)
1414{
1415	struct request_queue *q = rq->q;
1416
1417	blk_mq_put_driver_tag(rq);
1418
1419	trace_block_rq_requeue(rq);
1420	rq_qos_requeue(q, rq);
1421
1422	if (blk_mq_request_started(rq)) {
1423		WRITE_ONCE(rq->state, MQ_RQ_IDLE);
1424		rq->rq_flags &= ~RQF_TIMED_OUT;
1425	}
1426}
1427
1428void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
1429{
1430	struct request_queue *q = rq->q;
1431	unsigned long flags;
1432
1433	__blk_mq_requeue_request(rq);
1434
1435	/* this request will be re-inserted to io scheduler queue */
1436	blk_mq_sched_requeue_request(rq);
1437
1438	spin_lock_irqsave(&q->requeue_lock, flags);
1439	list_add_tail(&rq->queuelist, &q->requeue_list);
1440	spin_unlock_irqrestore(&q->requeue_lock, flags);
1441
1442	if (kick_requeue_list)
1443		blk_mq_kick_requeue_list(q);
1444}
1445EXPORT_SYMBOL(blk_mq_requeue_request);
1446
1447static void blk_mq_requeue_work(struct work_struct *work)
1448{
1449	struct request_queue *q =
1450		container_of(work, struct request_queue, requeue_work.work);
1451	LIST_HEAD(rq_list);
1452	LIST_HEAD(flush_list);
1453	struct request *rq;
1454
1455	spin_lock_irq(&q->requeue_lock);
1456	list_splice_init(&q->requeue_list, &rq_list);
1457	list_splice_init(&q->flush_list, &flush_list);
1458	spin_unlock_irq(&q->requeue_lock);
1459
1460	while (!list_empty(&rq_list)) {
1461		rq = list_entry(rq_list.next, struct request, queuelist);
1462		/*
1463		 * If RQF_DONTPREP ist set, the request has been started by the
1464		 * driver already and might have driver-specific data allocated
1465		 * already.  Insert it into the hctx dispatch list to avoid
1466		 * block layer merges for the request.
1467		 */
1468		if (rq->rq_flags & RQF_DONTPREP) {
1469			list_del_init(&rq->queuelist);
1470			blk_mq_request_bypass_insert(rq, 0);
1471		} else {
1472			list_del_init(&rq->queuelist);
1473			blk_mq_insert_request(rq, BLK_MQ_INSERT_AT_HEAD);
1474		}
1475	}
1476
1477	while (!list_empty(&flush_list)) {
1478		rq = list_entry(flush_list.next, struct request, queuelist);
1479		list_del_init(&rq->queuelist);
1480		blk_mq_insert_request(rq, 0);
1481	}
1482
1483	blk_mq_run_hw_queues(q, false);
1484}
1485
1486void blk_mq_kick_requeue_list(struct request_queue *q)
1487{
1488	kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0);
1489}
1490EXPORT_SYMBOL(blk_mq_kick_requeue_list);
1491
1492void blk_mq_delay_kick_requeue_list(struct request_queue *q,
1493				    unsigned long msecs)
1494{
1495	kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work,
1496				    msecs_to_jiffies(msecs));
1497}
1498EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
1499
1500static bool blk_mq_rq_inflight(struct request *rq, void *priv)
1501{
1502	/*
1503	 * If we find a request that isn't idle we know the queue is busy
1504	 * as it's checked in the iter.
1505	 * Return false to stop the iteration.
1506	 */
1507	if (blk_mq_request_started(rq)) {
1508		bool *busy = priv;
1509
1510		*busy = true;
1511		return false;
1512	}
1513
1514	return true;
1515}
1516
1517bool blk_mq_queue_inflight(struct request_queue *q)
1518{
1519	bool busy = false;
1520
1521	blk_mq_queue_tag_busy_iter(q, blk_mq_rq_inflight, &busy);
1522	return busy;
1523}
1524EXPORT_SYMBOL_GPL(blk_mq_queue_inflight);
1525
1526static void blk_mq_rq_timed_out(struct request *req)
1527{
1528	req->rq_flags |= RQF_TIMED_OUT;
1529	if (req->q->mq_ops->timeout) {
1530		enum blk_eh_timer_return ret;
1531
1532		ret = req->q->mq_ops->timeout(req);
1533		if (ret == BLK_EH_DONE)
1534			return;
1535		WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER);
1536	}
1537
1538	blk_add_timer(req);
1539}
1540
/* State shared by the tag-iterator callbacks run from blk_mq_timeout_work(). */
struct blk_expired_data {
	/* set by blk_mq_check_expired() once an expired request is found */
	bool has_timedout_rq;
	/*
	 * earliest deadline seen among in-flight requests that have not
	 * expired yet; 0 means none was recorded
	 */
	unsigned long next;
	/* jiffies value sampled when the timeout work started */
	unsigned long timeout_start;
};
1546
1547static bool blk_mq_req_expired(struct request *rq, struct blk_expired_data *expired)
1548{
1549	unsigned long deadline;
1550
1551	if (blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT)
1552		return false;
1553	if (rq->rq_flags & RQF_TIMED_OUT)
1554		return false;
1555
1556	deadline = READ_ONCE(rq->deadline);
1557	if (time_after_eq(expired->timeout_start, deadline))
1558		return true;
1559
1560	if (expired->next == 0)
1561		expired->next = deadline;
1562	else if (time_after(expired->next, deadline))
1563		expired->next = deadline;
1564	return false;
1565}
1566
1567void blk_mq_put_rq_ref(struct request *rq)
1568{
1569	if (is_flush_rq(rq)) {
1570		if (rq->end_io(rq, 0) == RQ_END_IO_FREE)
1571			blk_mq_free_request(rq);
1572	} else if (req_ref_put_and_test(rq)) {
1573		__blk_mq_free_request(rq);
1574	}
1575}
1576
1577static bool blk_mq_check_expired(struct request *rq, void *priv)
1578{
1579	struct blk_expired_data *expired = priv;
1580
1581	/*
1582	 * blk_mq_queue_tag_busy_iter() has locked the request, so it cannot
1583	 * be reallocated underneath the timeout handler's processing, then
1584	 * the expire check is reliable. If the request is not expired, then
1585	 * it was completed and reallocated as a new request after returning
1586	 * from blk_mq_check_expired().
1587	 */
1588	if (blk_mq_req_expired(rq, expired)) {
1589		expired->has_timedout_rq = true;
1590		return false;
1591	}
1592	return true;
1593}
1594
1595static bool blk_mq_handle_expired(struct request *rq, void *priv)
1596{
1597	struct blk_expired_data *expired = priv;
1598
1599	if (blk_mq_req_expired(rq, expired))
1600		blk_mq_rq_timed_out(rq);
1601	return true;
1602}
1603
1604static void blk_mq_timeout_work(struct work_struct *work)
1605{
1606	struct request_queue *q =
1607		container_of(work, struct request_queue, timeout_work);
1608	struct blk_expired_data expired = {
1609		.timeout_start = jiffies,
1610	};
1611	struct blk_mq_hw_ctx *hctx;
1612	unsigned long i;
1613
1614	/* A deadlock might occur if a request is stuck requiring a
1615	 * timeout at the same time a queue freeze is waiting
1616	 * completion, since the timeout code would not be able to
1617	 * acquire the queue reference here.
1618	 *
1619	 * That's why we don't use blk_queue_enter here; instead, we use
1620	 * percpu_ref_tryget directly, because we need to be able to
1621	 * obtain a reference even in the short window between the queue
1622	 * starting to freeze, by dropping the first reference in
1623	 * blk_freeze_queue_start, and the moment the last request is
1624	 * consumed, marked by the instant q_usage_counter reaches
1625	 * zero.
1626	 */
1627	if (!percpu_ref_tryget(&q->q_usage_counter))
1628		return;
1629
1630	/* check if there is any timed-out request */
1631	blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &expired);
1632	if (expired.has_timedout_rq) {
1633		/*
1634		 * Before walking tags, we must ensure any submit started
1635		 * before the current time has finished. Since the submit
1636		 * uses srcu or rcu, wait for a synchronization point to
1637		 * ensure all running submits have finished
1638		 */
1639		blk_mq_wait_quiesce_done(q->tag_set);
1640
1641		expired.next = 0;
1642		blk_mq_queue_tag_busy_iter(q, blk_mq_handle_expired, &expired);
1643	}
1644
1645	if (expired.next != 0) {
1646		mod_timer(&q->timeout, expired.next);
1647	} else {
1648		/*
1649		 * Request timeouts are handled as a forward rolling timer. If
1650		 * we end up here it means that no requests are pending and
1651		 * also that no request has been pending for a while. Mark
1652		 * each hctx as idle.
1653		 */
1654		queue_for_each_hw_ctx(q, hctx, i) {
1655			/* the hctx may be unmapped, so check it here */
1656			if (blk_mq_hw_queue_mapped(hctx))
1657				blk_mq_tag_idle(hctx);
1658		}
1659	}
1660	blk_queue_exit(q);
1661}
1662
/* Argument bundle handed to flush_busy_ctx() through the sbitmap iterator. */
struct flush_busy_ctx_data {
	/* hardware queue whose software queues are being flushed */
	struct blk_mq_hw_ctx *hctx;
	/* destination list that the per-ctx requests are spliced onto */
	struct list_head *list;
};
1667
1668static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data)
1669{
1670	struct flush_busy_ctx_data *flush_data = data;
1671	struct blk_mq_hw_ctx *hctx = flush_data->hctx;
1672	struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
1673	enum hctx_type type = hctx->type;
1674
1675	spin_lock(&ctx->lock);
1676	list_splice_tail_init(&ctx->rq_lists[type], flush_data->list);
1677	sbitmap_clear_bit(sb, bitnr);
1678	spin_unlock(&ctx->lock);
1679	return true;
1680}
1681
1682/*
1683 * Process software queues that have been marked busy, splicing them
1684 * to the for-dispatch
1685 */
1686void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
1687{
1688	struct flush_busy_ctx_data data = {
1689		.hctx = hctx,
1690		.list = list,
1691	};
1692
1693	sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data);
1694}
1695EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs);
1696
/* Argument bundle handed to dispatch_rq_from_ctx() via the sbitmap iterator. */
struct dispatch_rq_data {
	/* hardware queue whose software queues are being scanned */
	struct blk_mq_hw_ctx *hctx;
	/* request that was dequeued, or NULL if none has been found yet */
	struct request *rq;
};
1701
1702static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr,
1703		void *data)
1704{
1705	struct dispatch_rq_data *dispatch_data = data;
1706	struct blk_mq_hw_ctx *hctx = dispatch_data->hctx;
1707	struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
1708	enum hctx_type type = hctx->type;
1709
1710	spin_lock(&ctx->lock);
1711	if (!list_empty(&ctx->rq_lists[type])) {
1712		dispatch_data->rq = list_entry_rq(ctx->rq_lists[type].next);
1713		list_del_init(&dispatch_data->rq->queuelist);
1714		if (list_empty(&ctx->rq_lists[type]))
1715			sbitmap_clear_bit(sb, bitnr);
1716	}
1717	spin_unlock(&ctx->lock);
1718
1719	return !dispatch_data->rq;
1720}
1721
1722struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
1723					struct blk_mq_ctx *start)
1724{
1725	unsigned off = start ? start->index_hw[hctx->type] : 0;
1726	struct dispatch_rq_data data = {
1727		.hctx = hctx,
1728		.rq   = NULL,
1729	};
1730
1731	__sbitmap_for_each_set(&hctx->ctx_map, off,
1732			       dispatch_rq_from_ctx, &data);
1733
1734	return data.rq;
1735}
1736
1737static bool __blk_mq_alloc_driver_tag(struct request *rq)
1738{
1739	struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags;
1740	unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
1741	int tag;
1742
1743	blk_mq_tag_busy(rq->mq_hctx);
1744
1745	if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
1746		bt = &rq->mq_hctx->tags->breserved_tags;
1747		tag_offset = 0;
1748	} else {
1749		if (!hctx_may_queue(rq->mq_hctx, bt))
1750			return false;
1751	}
1752
1753	tag = __sbitmap_queue_get(bt);
1754	if (tag == BLK_MQ_NO_TAG)
1755		return false;
1756
1757	rq->tag = tag + tag_offset;
1758	return true;
1759}
1760
1761bool __blk_mq_get_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq)
1762{
1763	if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_alloc_driver_tag(rq))
1764		return false;
1765
1766	if ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) &&
1767			!(rq->rq_flags & RQF_MQ_INFLIGHT)) {
1768		rq->rq_flags |= RQF_MQ_INFLIGHT;
1769		__blk_mq_inc_active_requests(hctx);
1770	}
1771	hctx->tags->rqs[rq->tag] = rq;
1772	return true;
1773}
1774
1775static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
1776				int flags, void *key)
1777{
1778	struct blk_mq_hw_ctx *hctx;
1779
1780	hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
1781
1782	spin_lock(&hctx->dispatch_wait_lock);
1783	if (!list_empty(&wait->entry)) {
1784		struct sbitmap_queue *sbq;
1785
1786		list_del_init(&wait->entry);
1787		sbq = &hctx->tags->bitmap_tags;
1788		atomic_dec(&sbq->ws_active);
1789	}
1790	spin_unlock(&hctx->dispatch_wait_lock);
1791
1792	blk_mq_run_hw_queue(hctx, true);
1793	return 1;
1794}
1795
1796/*
1797 * Mark us waiting for a tag. For shared tags, this involves hooking us into
1798 * the tag wakeups. For non-shared tags, we can simply mark us needing a
1799 * restart. For both cases, take care to check the condition again after
1800 * marking us as waiting.
1801 */
1802static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
1803				 struct request *rq)
1804{
1805	struct sbitmap_queue *sbq;
1806	struct wait_queue_head *wq;
1807	wait_queue_entry_t *wait;
1808	bool ret;
1809
1810	if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) &&
1811	    !(blk_mq_is_shared_tags(hctx->flags))) {
1812		blk_mq_sched_mark_restart_hctx(hctx);
1813
1814		/*
1815		 * It's possible that a tag was freed in the window between the
1816		 * allocation failure and adding the hardware queue to the wait
1817		 * queue.
1818		 *
1819		 * Don't clear RESTART here, someone else could have set it.
1820		 * At most this will cost an extra queue run.
1821		 */
1822		return blk_mq_get_driver_tag(rq);
1823	}
1824
1825	wait = &hctx->dispatch_wait;
1826	if (!list_empty_careful(&wait->entry))
1827		return false;
1828
1829	if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag))
1830		sbq = &hctx->tags->breserved_tags;
1831	else
1832		sbq = &hctx->tags->bitmap_tags;
1833	wq = &bt_wait_ptr(sbq, hctx)->wait;
1834
1835	spin_lock_irq(&wq->lock);
1836	spin_lock(&hctx->dispatch_wait_lock);
1837	if (!list_empty(&wait->entry)) {
1838		spin_unlock(&hctx->dispatch_wait_lock);
1839		spin_unlock_irq(&wq->lock);
1840		return false;
1841	}
1842
1843	atomic_inc(&sbq->ws_active);
1844	wait->flags &= ~WQ_FLAG_EXCLUSIVE;
1845	__add_wait_queue(wq, wait);
1846
1847	/*
1848	 * It's possible that a tag was freed in the window between the
1849	 * allocation failure and adding the hardware queue to the wait
1850	 * queue.
1851	 */
1852	ret = blk_mq_get_driver_tag(rq);
1853	if (!ret) {
1854		spin_unlock(&hctx->dispatch_wait_lock);
1855		spin_unlock_irq(&wq->lock);
1856		return false;
1857	}
1858
1859	/*
1860	 * We got a tag, remove ourselves from the wait queue to ensure
1861	 * someone else gets the wakeup.
1862	 */
1863	list_del_init(&wait->entry);
1864	atomic_dec(&sbq->ws_active);
1865	spin_unlock(&hctx->dispatch_wait_lock);
1866	spin_unlock_irq(&wq->lock);
1867
1868	return true;
1869}
1870
1871#define BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT  8
1872#define BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR  4
1873/*
1874 * Update dispatch busy with the Exponential Weighted Moving Average(EWMA):
1875 * - EWMA is one simple way to compute running average value
1876 * - weight(7/8 and 1/8) is applied so that it can decrease exponentially
1877 * - take 4 as factor for avoiding to get too small(0) result, and this
1878 *   factor doesn't matter because EWMA decreases exponentially
1879 */
1880static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
1881{
1882	unsigned int ewma;
1883
1884	ewma = hctx->dispatch_busy;
1885
1886	if (!ewma && !busy)
1887		return;
1888
1889	ewma *= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT - 1;
1890	if (busy)
1891		ewma += 1 << BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR;
1892	ewma /= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT;
1893
1894	hctx->dispatch_busy = ewma;
1895}
1896
1897#define BLK_MQ_RESOURCE_DELAY	3		/* ms units */
1898
1899static void blk_mq_handle_dev_resource(struct request *rq,
1900				       struct list_head *list)
1901{
1902	list_add(&rq->queuelist, list);
1903	__blk_mq_requeue_request(rq);
1904}
1905
1906static void blk_mq_handle_zone_resource(struct request *rq,
1907					struct list_head *zone_list)
1908{
1909	/*
1910	 * If we end up here it is because we cannot dispatch a request to a
1911	 * specific zone due to LLD level zone-write locking or other zone
1912	 * related resource not being available. In this case, set the request
1913	 * aside in zone_list for retrying it later.
1914	 */
1915	list_add(&rq->queuelist, zone_list);
1916	__blk_mq_requeue_request(rq);
1917}
1918
/* Result of blk_mq_prep_dispatch_rq(). */
enum prep_dispatch {
	/* the request is ready to be passed to the driver */
	PREP_DISPATCH_OK,
	/* no driver tag could be obtained for the request */
	PREP_DISPATCH_NO_TAG,
	/* a dispatch budget was required but could not be obtained */
	PREP_DISPATCH_NO_BUDGET,
};
1924
1925static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq,
1926						  bool need_budget)
1927{
1928	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
1929	int budget_token = -1;
1930
1931	if (need_budget) {
1932		budget_token = blk_mq_get_dispatch_budget(rq->q);
1933		if (budget_token < 0) {
1934			blk_mq_put_driver_tag(rq);
1935			return PREP_DISPATCH_NO_BUDGET;
1936		}
1937		blk_mq_set_rq_budget_token(rq, budget_token);
1938	}
1939
1940	if (!blk_mq_get_driver_tag(rq)) {
1941		/*
1942		 * The initial allocation attempt failed, so we need to
1943		 * rerun the hardware queue when a tag is freed. The
1944		 * waitqueue takes care of that. If the queue is run
1945		 * before we add this entry back on the dispatch list,
1946		 * we'll re-run it below.
1947		 */
1948		if (!blk_mq_mark_tag_wait(hctx, rq)) {
1949			/*
1950			 * All budgets not got from this function will be put
1951			 * together during handling partial dispatch
1952			 */
1953			if (need_budget)
1954				blk_mq_put_dispatch_budget(rq->q, budget_token);
1955			return PREP_DISPATCH_NO_TAG;
1956		}
1957	}
1958
1959	return PREP_DISPATCH_OK;
1960}
1961
1962/* release all allocated budgets before calling to blk_mq_dispatch_rq_list */
1963static void blk_mq_release_budgets(struct request_queue *q,
1964		struct list_head *list)
1965{
1966	struct request *rq;
1967
1968	list_for_each_entry(rq, list, queuelist) {
1969		int budget_token = blk_mq_get_rq_budget_token(rq);
1970
1971		if (budget_token >= 0)
1972			blk_mq_put_dispatch_budget(q, budget_token);
1973	}
1974}
1975
1976/*
1977 * blk_mq_commit_rqs will notify driver using bd->last that there is no
1978 * more requests. (See comment in struct blk_mq_ops for commit_rqs for
1979 * details)
1980 * Attention, we should explicitly call this in unusual cases:
1981 *  1) did not queue everything initially scheduled to queue
1982 *  2) the last attempt to queue a request failed
1983 */
1984static void blk_mq_commit_rqs(struct blk_mq_hw_ctx *hctx, int queued,
1985			      bool from_schedule)
1986{
1987	if (hctx->queue->mq_ops->commit_rqs && queued) {
1988		trace_block_unplug(hctx->queue, queued, !from_schedule);
1989		hctx->queue->mq_ops->commit_rqs(hctx);
1990	}
1991}
1992
/*
 * Pass the requests on @list to the driver of @hctx, one at a time.
 *
 * @hctx:       hardware queue the requests belong to
 * @list:       requests to dispatch; whatever cannot be dispatched is moved
 *              to hctx->dispatch before returning
 * @nr_budgets: while non-zero, no dispatch budget is acquired per request
 *              here (presumably the caller already holds that many budgets);
 *              it is decremented for every request passed to ->queue_rq()
 *
 * Returns true if we did some work AND can potentially do more, i.e. the
 * whole list was consumed.  Returns false if @list was empty on entry or if
 * some requests had to be left on hctx->dispatch.
 */
bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
			     unsigned int nr_budgets)
{
	enum prep_dispatch prep;
	struct request_queue *q = hctx->queue;
	struct request *rq;
	int queued;
	blk_status_t ret = BLK_STS_OK;
	LIST_HEAD(zone_list);
	bool needs_resource = false;

	if (list_empty(list))
		return false;

	/*
	 * Now process all the entries, sending them to the driver.
	 */
	queued = 0;
	do {
		struct blk_mq_queue_data bd;

		rq = list_first_entry(list, struct request, queuelist);

		WARN_ON_ONCE(hctx != rq->mq_hctx);
		/* Only acquire a budget here if none were handed in. */
		prep = blk_mq_prep_dispatch_rq(rq, !nr_budgets);
		if (prep != PREP_DISPATCH_OK)
			break;

		list_del_init(&rq->queuelist);

		bd.rq = rq;
		/* Tell the driver whether this is the last request for now. */
		bd.last = list_empty(list);

		/*
		 * once the request is queued to lld, no need to cover the
		 * budget any more
		 */
		if (nr_budgets)
			nr_budgets--;
		ret = q->mq_ops->queue_rq(hctx, &bd);
		switch (ret) {
		case BLK_STS_OK:
			queued++;
			break;
		case BLK_STS_RESOURCE:
			needs_resource = true;
			fallthrough;
		case BLK_STS_DEV_RESOURCE:
			/* Put the request back on @list and stop dispatching. */
			blk_mq_handle_dev_resource(rq, list);
			goto out;
		case BLK_STS_ZONE_RESOURCE:
			/*
			 * Move the request to zone_list and keep going through
			 * the dispatch list to find more requests the drive can
			 * accept.
			 */
			blk_mq_handle_zone_resource(rq, &zone_list);
			needs_resource = true;
			break;
		default:
			/* Any other status ends the request with that status. */
			blk_mq_end_request(rq, ret);
		}
	} while (!list_empty(list));
out:
	/* Requests set aside for zone resources rejoin the leftover list. */
	if (!list_empty(&zone_list))
		list_splice_tail_init(&zone_list, list);

	/* If we didn't flush the entire list, we could have told the driver
	 * there was more coming, but that turned out to be a lie.
	 */
	if (!list_empty(list) || ret != BLK_STS_OK)
		blk_mq_commit_rqs(hctx, queued, false);

	/*
	 * Any items that need requeuing? Stuff them into hctx->dispatch,
	 * that is where we will continue on next queue run.
	 */
	if (!list_empty(list)) {
		bool needs_restart;
		/* For non-shared tags, the RESTART check will suffice */
		bool no_tag = prep == PREP_DISPATCH_NO_TAG &&
			((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) ||
			blk_mq_is_shared_tags(hctx->flags));

		/* Give back budgets still attached to undispatched requests. */
		if (nr_budgets)
			blk_mq_release_budgets(q, list);

		spin_lock(&hctx->lock);
		list_splice_tail_init(list, &hctx->dispatch);
		spin_unlock(&hctx->lock);

		/*
		 * Order adding requests to hctx->dispatch and checking
		 * SCHED_RESTART flag. The pair of this smp_mb() is the one
		 * in blk_mq_sched_restart(). Avoid restart code path to
		 * miss the new added requests to hctx->dispatch, meantime
		 * SCHED_RESTART is observed here.
		 */
		smp_mb();

		/*
		 * If SCHED_RESTART was set by the caller of this function and
		 * it is no longer set that means that it was cleared by another
		 * thread and hence that a queue rerun is needed.
		 *
		 * If 'no_tag' is set, that means that we failed getting
		 * a driver tag with an I/O scheduler attached. If our dispatch
		 * waitqueue is no longer active, ensure that we run the queue
		 * AFTER adding our entries back to the list.
		 *
		 * If no I/O scheduler has been configured it is possible that
		 * the hardware queue got stopped and restarted before requests
		 * were pushed back onto the dispatch list. Rerun the queue to
		 * avoid starvation. Notes:
		 * - blk_mq_run_hw_queue() checks whether or not a queue has
		 *   been stopped before rerunning a queue.
		 * - Some but not all block drivers stop a queue before
		 *   returning BLK_STS_RESOURCE. Two exceptions are scsi-mq
		 *   and dm-rq.
		 *
		 * If driver returns BLK_STS_RESOURCE and SCHED_RESTART
		 * bit is set, run queue after a delay to avoid IO stalls
		 * that could otherwise occur if the queue is idle.  We'll do
		 * similar if we couldn't get budget or couldn't lock a zone
		 * and SCHED_RESTART is set.
		 */
		needs_restart = blk_mq_sched_needs_restart(hctx);
		if (prep == PREP_DISPATCH_NO_BUDGET)
			needs_resource = true;
		if (!needs_restart ||
		    (no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
			blk_mq_run_hw_queue(hctx, true);
		else if (needs_resource)
			blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);

		/* Leftovers count as a "busy" sample for the dispatch EWMA. */
		blk_mq_update_dispatch_busy(hctx, true);
		return false;
	}

	blk_mq_update_dispatch_busy(hctx, false);
	return true;
}
2138
2139static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx)
2140{
2141	int cpu = cpumask_first_and(hctx->cpumask, cpu_online_mask);
2142
2143	if (cpu >= nr_cpu_ids)
2144		cpu = cpumask_first(hctx->cpumask);
2145	return cpu;
2146}
2147
/*
 * It'd be great if the workqueue API had a way to pass
 * in a mask and had some smarts for more clever placement.
 * For now we just round-robin here, switching for every
 * BLK_MQ_CPU_WORK_BATCH queued items.
 *
 * Returns the CPU number to queue work on for @hctx, or WORK_CPU_UNBOUND
 * when the queue has exactly one hardware queue or when no online CPU could
 * be selected. Updates hctx->next_cpu and hctx->next_cpu_batch as a side
 * effect.
 */
static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
{
	/* Set once the selection has been retried via the select_cpu label. */
	bool tried = false;
	int next_cpu = hctx->next_cpu;

	if (hctx->queue->nr_hw_queues == 1)
		return WORK_CPU_UNBOUND;

	/* Batch counter exhausted: move on to another CPU and refill it. */
	if (--hctx->next_cpu_batch <= 0) {
select_cpu:
		next_cpu = cpumask_next_and(next_cpu, hctx->cpumask,
				cpu_online_mask);
		/* No further candidate after next_cpu: start over. */
		if (next_cpu >= nr_cpu_ids)
			next_cpu = blk_mq_first_mapped_cpu(hctx);
		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
	}

	/*
	 * Do unbound schedule if we can't find a online CPU for this hctx,
	 * and it should only happen in the path of handling CPU DEAD.
	 */
	if (!cpu_online(next_cpu)) {
		/* Give the selection exactly one more attempt. */
		if (!tried) {
			tried = true;
			goto select_cpu;
		}

		/*
		 * Make sure to re-select CPU next time once after CPUs
		 * in hctx->cpumask become online again.
		 */
		hctx->next_cpu = next_cpu;
		hctx->next_cpu_batch = 1;
		return WORK_CPU_UNBOUND;
	}

	hctx->next_cpu = next_cpu;
	return next_cpu;
}
2193
2194/**
2195 * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously.
2196 * @hctx: Pointer to the hardware queue to run.
2197 * @msecs: Milliseconds of delay to wait before running the queue.
2198 *
2199 * Run a hardware queue asynchronously with a delay of @msecs.
2200 */
2201void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
2202{
2203	if (unlikely(blk_mq_hctx_stopped(hctx)))
2204		return;
2205	kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work,
2206				    msecs_to_jiffies(msecs));
2207}
2208EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
2209
/**
 * blk_mq_run_hw_queue - Start to run a hardware queue.
 * @hctx: Pointer to the hardware queue to run.
 * @async: If we want to run the queue asynchronously.
 *
 * Check if the request queue is not in a quiesced state and if there are
 * pending requests to be sent. If this is true, run the queue to send requests
 * to hardware.
 *
 * The run is handed to blk_mq_delay_run_hw_queue() with a zero delay when
 * @async is true, when BLK_MQ_F_BLOCKING is set in @hctx->flags, or when the
 * current CPU is not in @hctx->cpumask. Otherwise requests are dispatched
 * inline from the calling context.
 */
void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
{
	bool need_run;

	/*
	 * We can't run the queue inline with interrupts disabled; warn once
	 * if a synchronous run is requested while in_interrupt() is true.
	 */
	WARN_ON_ONCE(!async && in_interrupt());

	/*
	 * When queue is quiesced, we may be switching io scheduler, or
	 * updating nr_hw_queues, or other things, and we can't run queue
	 * any more, even __blk_mq_hctx_has_pending() can't be called safely.
	 *
	 * And queue will be rerun in blk_mq_unquiesce_queue() if it is
	 * quiesced.
	 */
	__blk_mq_run_dispatch_ops(hctx->queue, false,
		need_run = !blk_queue_quiesced(hctx->queue) &&
		blk_mq_hctx_has_pending(hctx));

	/* Quiesced, or nothing pending: no run needed. */
	if (!need_run)
		return;

	/* Defer to the run_work item instead of dispatching from here. */
	if (async || (hctx->flags & BLK_MQ_F_BLOCKING) ||
	    !cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) {
		blk_mq_delay_run_hw_queue(hctx, 0);
		return;
	}

	/* Synchronous dispatch in the caller's context. */
	blk_mq_run_dispatch_ops(hctx->queue,
				blk_mq_sched_dispatch_requests(hctx));
}
EXPORT_SYMBOL(blk_mq_run_hw_queue);
2253
2254/*
2255 * Return prefered queue to dispatch from (if any) for non-mq aware IO
2256 * scheduler.
2257 */
2258static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q)
2259{
2260	struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
2261	/*
2262	 * If the IO scheduler does not respect hardware queues when
2263	 * dispatching, we just don't bother with multiple HW queues and
2264	 * dispatch from hctx for the current CPU since running multiple queues
2265	 * just causes lock contention inside the scheduler and pointless cache
2266	 * bouncing.
2267	 */
2268	struct blk_mq_hw_ctx *hctx = ctx->hctxs[HCTX_TYPE_DEFAULT];
2269
2270	if (!blk_mq_hctx_stopped(hctx))
2271		return hctx;
2272	return NULL;
2273}
2274
2275/**
2276 * blk_mq_run_hw_queues - Run all hardware queues in a request queue.
2277 * @q: Pointer to the request queue to run.
2278 * @async: If we want to run the queue asynchronously.
2279 */
2280void blk_mq_run_hw_queues(struct request_queue *q, bool async)
2281{
2282	struct blk_mq_hw_ctx *hctx, *sq_hctx;
2283	unsigned long i;
2284
2285	sq_hctx = NULL;
2286	if (blk_queue_sq_sched(q))
2287		sq_hctx = blk_mq_get_sq_hctx(q);
2288	queue_for_each_hw_ctx(q, hctx, i) {
2289		if (blk_mq_hctx_stopped(hctx))
2290			continue;
2291		/*
2292		 * Dispatch from this hctx either if there's no hctx preferred
2293		 * by IO scheduler or if it has requests that bypass the
2294		 * scheduler.
2295		 */
2296		if (!sq_hctx || sq_hctx == hctx ||
2297		    !list_empty_careful(&hctx->dispatch))
2298			blk_mq_run_hw_queue(hctx, async);
2299	}
2300}
2301EXPORT_SYMBOL(blk_mq_run_hw_queues);
2302
2303/**
2304 * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously.
2305 * @q: Pointer to the request queue to run.
2306 * @msecs: Milliseconds of delay to wait before running the queues.
2307 */
2308void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
2309{
2310	struct blk_mq_hw_ctx *hctx, *sq_hctx;
2311	unsigned long i;
2312
2313	sq_hctx = NULL;
2314	if (blk_queue_sq_sched(q))
2315		sq_hctx = blk_mq_get_sq_hctx(q);
2316	queue_for_each_hw_ctx(q, hctx, i) {
2317		if (blk_mq_hctx_stopped(hctx))
2318			continue;
2319		/*
2320		 * If there is already a run_work pending, leave the
2321		 * pending delay untouched. Otherwise, a hctx can stall
2322		 * if another hctx is re-delaying the other's work
2323		 * before the work executes.
2324		 */
2325		if (delayed_work_pending(&hctx->run_work))
2326			continue;
2327		/*
2328		 * Dispatch from this hctx either if there's no hctx preferred
2329		 * by IO scheduler or if it has requests that bypass the
2330		 * scheduler.
2331		 */
2332		if (!sq_hctx || sq_hctx == hctx ||
2333		    !list_empty_careful(&hctx->dispatch))
2334			blk_mq_delay_run_hw_queue(hctx, msecs);
2335	}
2336}
2337EXPORT_SYMBOL(blk_mq_delay_run_hw_queues);
2338
/*
 * Mark @hctx as stopped: cancel its run_work if pending and set
 * BLK_MQ_S_STOPPED in hctx->state.
 *
 * This function is often used for pausing .queue_rq() by driver when
 * there isn't enough resource or some conditions aren't satisfied, and
 * BLK_STS_RESOURCE is usually returned.
 *
 * We do not guarantee that dispatch can be drained or blocked
 * after blk_mq_stop_hw_queue() returns. Please use
 * blk_mq_quiesce_queue() for that requirement.
 */
void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
{
	cancel_delayed_work(&hctx->run_work);

	set_bit(BLK_MQ_S_STOPPED, &hctx->state);
}
EXPORT_SYMBOL(blk_mq_stop_hw_queue);
2355
2356/*
2357 * This function is often used for pausing .queue_rq() by driver when
2358 * there isn't enough resource or some conditions aren't satisfied, and
2359 * BLK_STS_RESOURCE is usually returned.
2360 *
2361 * We do not guarantee that dispatch can be drained or blocked
2362 * after blk_mq_stop_hw_queues() returns. Please use
2363 * blk_mq_quiesce_queue() for that requirement.
2364 */
2365void blk_mq_stop_hw_queues(struct request_queue *q)
2366{
2367	struct blk_mq_hw_ctx *hctx;
2368	unsigned long i;
2369
2370	queue_for_each_hw_ctx(q, hctx, i)
2371		blk_mq_stop_hw_queue(hctx);
2372}
2373EXPORT_SYMBOL(blk_mq_stop_hw_queues);
2374
/*
 * Clear BLK_MQ_S_STOPPED in hctx->state unconditionally, then request a
 * synchronous (async == false) run of the hardware queue.
 */
void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
{
	clear_bit(BLK_MQ_S_STOPPED, &hctx->state);

	blk_mq_run_hw_queue(hctx, false);
}
EXPORT_SYMBOL(blk_mq_start_hw_queue);
2382
2383void blk_mq_start_hw_queues(struct request_queue *q)
2384{
2385	struct blk_mq_hw_ctx *hctx;
2386	unsigned long i;
2387
2388	queue_for_each_hw_ctx(q, hctx, i)
2389		blk_mq_start_hw_queue(hctx);
2390}
2391EXPORT_SYMBOL(blk_mq_start_hw_queues);
2392
2393void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
2394{
2395	if (!blk_mq_hctx_stopped(hctx))
2396		return;
2397
2398	clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
2399	blk_mq_run_hw_queue(hctx, async);
2400}
2401EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue);
2402
2403void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
2404{
2405	struct blk_mq_hw_ctx *hctx;
2406	unsigned long i;
2407
2408	queue_for_each_hw_ctx(q, hctx, i)
2409		blk_mq_start_stopped_hw_queue(hctx, async);
2410}
2411EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
2412
2413static void blk_mq_run_work_fn(struct work_struct *work)
2414{
2415	struct blk_mq_hw_ctx *hctx =
2416		container_of(work, struct blk_mq_hw_ctx, run_work.work);
2417
2418	blk_mq_run_dispatch_ops(hctx->queue,
2419				blk_mq_sched_dispatch_requests(hctx));
2420}
2421
2422/**
2423 * blk_mq_request_bypass_insert - Insert a request at dispatch list.
2424 * @rq: Pointer to request to be inserted.
2425 * @flags: BLK_MQ_INSERT_*
2426 *
2427 * Should only be used carefully, when the caller knows we want to
2428 * bypass a potential IO scheduler on the target device.
2429 */
2430static void blk_mq_request_bypass_insert(struct request *rq, blk_insert_t flags)
2431{
2432	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
2433
2434	spin_lock(&hctx->lock);
2435	if (flags & BLK_MQ_INSERT_AT_HEAD)
2436		list_add(&rq->queuelist, &hctx->dispatch);
2437	else
2438		list_add_tail(&rq->queuelist, &hctx->dispatch);
2439	spin_unlock(&hctx->lock);
2440}
2441
2442static void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx,
2443		struct blk_mq_ctx *ctx, struct list_head *list,
2444		bool run_queue_async)
2445{
2446	struct request *rq;
2447	enum hctx_type type = hctx->type;
2448
2449	/*
2450	 * Try to issue requests directly if the hw queue isn't busy to save an
2451	 * extra enqueue & dequeue to the sw queue.
2452	 */
2453	if (!hctx->dispatch_busy && !run_queue_async) {
2454		blk_mq_run_dispatch_ops(hctx->queue,
2455			blk_mq_try_issue_list_directly(hctx, list));
2456		if (list_empty(list))
2457			goto out;
2458	}
2459
2460	/*
2461	 * preemption doesn't flush plug list, so it's possible ctx->cpu is
2462	 * offline now
2463	 */
2464	list_for_each_entry(rq, list, queuelist) {
2465		BUG_ON(rq->mq_ctx != ctx);
2466		trace_block_rq_insert(rq);
2467	}
2468
2469	spin_lock(&ctx->lock);
2470	list_splice_tail_init(list, &ctx->rq_lists[type]);
2471	blk_mq_hctx_mark_pending(hctx, ctx);
2472	spin_unlock(&ctx->lock);
2473out:
2474	blk_mq_run_hw_queue(hctx, run_queue_async);
2475}
2476
2477static void blk_mq_insert_request(struct request *rq, blk_insert_t flags)
2478{
2479	struct request_queue *q = rq->q;
2480	struct blk_mq_ctx *ctx = rq->mq_ctx;
2481	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
2482
2483	if (blk_rq_is_passthrough(rq)) {
2484		/*
2485		 * Passthrough request have to be added to hctx->dispatch
2486		 * directly.  The device may be in a situation where it can't
2487		 * handle FS request, and always returns BLK_STS_RESOURCE for
2488		 * them, which gets them added to hctx->dispatch.
2489		 *
2490		 * If a passthrough request is required to unblock the queues,
2491		 * and it is added to the scheduler queue, there is no chance to
2492		 * dispatch it given we prioritize requests in hctx->dispatch.
2493		 */
2494		blk_mq_request_bypass_insert(rq, flags);
2495	} else if (req_op(rq) == REQ_OP_FLUSH) {
2496		/*
2497		 * Firstly normal IO request is inserted to scheduler queue or
2498		 * sw queue, meantime we add flush request to dispatch queue(
2499		 * hctx->dispatch) directly and there is at most one in-flight
2500		 * flush request for each hw queue, so it doesn't matter to add
2501		 * flush request to tail or front of the dispatch queue.
2502		 *
2503		 * Secondly in case of NCQ, flush request belongs to non-NCQ
2504		 * command, and queueing it will fail when there is any
2505		 * in-flight normal IO request(NCQ command). When adding flush
2506		 * rq to the front of hctx->dispatch, it is easier to introduce
2507		 * extra time to flush rq's latency because of S_SCHED_RESTART
2508		 * compared with adding to the tail of dispatch queue, then
2509		 * chance of flush merge is increased, and less flush requests
2510		 * will be issued to controller. It is observed that ~10% time
2511		 * is saved in blktests block/004 on disk attached to AHCI/NCQ
2512		 * drive when adding flush rq to the front of hctx->dispatch.
2513		 *
2514		 * Simply queue flush rq to the front of hctx->dispatch so that
2515		 * intensive flush workloads can benefit in case of NCQ HW.
2516		 */
2517		blk_mq_request_bypass_insert(rq, BLK_MQ_INSERT_AT_HEAD);
2518	} else if (q->elevator) {
2519		LIST_HEAD(list);
2520
2521		WARN_ON_ONCE(rq->tag != BLK_MQ_NO_TAG);
2522
2523		list_add(&rq->queuelist, &list);
2524		q->elevator->type->ops.insert_requests(hctx, &list, flags);
2525	} else {
2526		trace_block_rq_insert(rq);
2527
2528		spin_lock(&ctx->lock);
2529		if (flags & BLK_MQ_INSERT_AT_HEAD)
2530			list_add(&rq->queuelist, &ctx->rq_lists[hctx->type]);
2531		else
2532			list_add_tail(&rq->queuelist,
2533				      &ctx->rq_lists[hctx->type]);
2534		blk_mq_hctx_mark_pending(hctx, ctx);
2535		spin_unlock(&ctx->lock);
2536	}
2537}
2538
2539static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
2540		unsigned int nr_segs)
2541{
2542	int err;
2543
2544	if (bio->bi_opf & REQ_RAHEAD)
2545		rq->cmd_flags |= REQ_FAILFAST_MASK;
2546
2547	rq->__sector = bio->bi_iter.bi_sector;
2548	blk_rq_bio_prep(rq, bio, nr_segs);
2549
2550	/* This can't fail, since GFP_NOIO includes __GFP_DIRECT_RECLAIM. */
2551	err = blk_crypto_rq_bio_prep(rq, bio, GFP_NOIO);
2552	WARN_ON_ONCE(err);
2553
2554	blk_account_io_start(rq);
2555}
2556
2557static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
2558					    struct request *rq, bool last)
2559{
2560	struct request_queue *q = rq->q;
2561	struct blk_mq_queue_data bd = {
2562		.rq = rq,
2563		.last = last,
2564	};
2565	blk_status_t ret;
2566
2567	/*
2568	 * For OK queue, we are done. For error, caller may kill it.
2569	 * Any other error (busy), just add it to our list as we
2570	 * previously would have done.
2571	 */
2572	ret = q->mq_ops->queue_rq(hctx, &bd);
2573	switch (ret) {
2574	case BLK_STS_OK:
2575		blk_mq_update_dispatch_busy(hctx, false);
2576		break;
2577	case BLK_STS_RESOURCE:
2578	case BLK_STS_DEV_RESOURCE:
2579		blk_mq_update_dispatch_busy(hctx, true);
2580		__blk_mq_requeue_request(rq);
2581		break;
2582	default:
2583		blk_mq_update_dispatch_busy(hctx, false);
2584		break;
2585	}
2586
2587	return ret;
2588}
2589
2590static bool blk_mq_get_budget_and_tag(struct request *rq)
2591{
2592	int budget_token;
2593
2594	budget_token = blk_mq_get_dispatch_budget(rq->q);
2595	if (budget_token < 0)
2596		return false;
2597	blk_mq_set_rq_budget_token(rq, budget_token);
2598	if (!blk_mq_get_driver_tag(rq)) {
2599		blk_mq_put_dispatch_budget(rq->q, budget_token);
2600		return false;
2601	}
2602	return true;
2603}
2604
2605/**
2606 * blk_mq_try_issue_directly - Try to send a request directly to device driver.
2607 * @hctx: Pointer of the associated hardware queue.
2608 * @rq: Pointer to request to be sent.
2609 *
2610 * If the device has enough resources to accept a new request now, send the
2611 * request directly to device driver. Else, insert at hctx->dispatch queue, so
2612 * we can try send it another time in the future. Requests inserted at this
2613 * queue have higher priority.
2614 */
2615static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
2616		struct request *rq)
2617{
2618	blk_status_t ret;
2619
2620	if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) {
2621		blk_mq_insert_request(rq, 0);
2622		return;
2623	}
2624
2625	if ((rq->rq_flags & RQF_USE_SCHED) || !blk_mq_get_budget_and_tag(rq)) {
2626		blk_mq_insert_request(rq, 0);
2627		blk_mq_run_hw_queue(hctx, false);
2628		return;
2629	}
2630
2631	ret = __blk_mq_issue_directly(hctx, rq, true);
2632	switch (ret) {
2633	case BLK_STS_OK:
2634		break;
2635	case BLK_STS_RESOURCE:
2636	case BLK_STS_DEV_RESOURCE:
2637		blk_mq_request_bypass_insert(rq, 0);
2638		blk_mq_run_hw_queue(hctx, false);
2639		break;
2640	default:
2641		blk_mq_end_request(rq, ret);
2642		break;
2643	}
2644}
2645
2646static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
2647{
2648	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
2649
2650	if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) {
2651		blk_mq_insert_request(rq, 0);
2652		return BLK_STS_OK;
2653	}
2654
2655	if (!blk_mq_get_budget_and_tag(rq))
2656		return BLK_STS_RESOURCE;
2657	return __blk_mq_issue_directly(hctx, rq, last);
2658}
2659
/*
 * Pop requests off plug->mq_list one at a time and issue each directly.
 *
 * 'queued' counts the requests issued with BLK_STS_OK on the current
 * hardware queue; it is passed to blk_mq_commit_rqs() whenever the hardware
 * queue changes between requests, and at the end if the last status was not
 * BLK_STS_OK. On a resource status the request is put on hctx->dispatch, the
 * hardware queue is run and the loop stops, leaving any remaining requests
 * on plug->mq_list. Any other error status ends the request.
 */
static void blk_mq_plug_issue_direct(struct blk_plug *plug)
{
	struct blk_mq_hw_ctx *hctx = NULL;
	struct request *rq;
	int queued = 0;
	blk_status_t ret = BLK_STS_OK;

	while ((rq = rq_list_pop(&plug->mq_list))) {
		/* True when rq was the final entry of the plug list. */
		bool last = rq_list_empty(plug->mq_list);

		/* Switching hardware queue: commit what the old one got. */
		if (hctx != rq->mq_hctx) {
			if (hctx) {
				blk_mq_commit_rqs(hctx, queued, false);
				queued = 0;
			}
			hctx = rq->mq_hctx;
		}

		ret = blk_mq_request_issue_directly(rq, last);
		switch (ret) {
		case BLK_STS_OK:
			queued++;
			break;
		case BLK_STS_RESOURCE:
		case BLK_STS_DEV_RESOURCE:
			/* Busy: park rq on hctx->dispatch and stop issuing. */
			blk_mq_request_bypass_insert(rq, 0);
			blk_mq_run_hw_queue(hctx, false);
			goto out;
		default:
			blk_mq_end_request(rq, ret);
			break;
		}
	}

out:
	/* The last issue attempt did not succeed: commit what was queued. */
	if (ret != BLK_STS_OK)
		blk_mq_commit_rqs(hctx, queued, false);
}
2698
2699static void __blk_mq_flush_plug_list(struct request_queue *q,
2700				     struct blk_plug *plug)
2701{
2702	if (blk_queue_quiesced(q))
2703		return;
2704	q->mq_ops->queue_rqs(&plug->mq_list);
2705}
2706
/*
 * Dispatch one batch of requests from plug->mq_list.
 *
 * The first request popped defines the batch: every request with the same
 * hctx, the same ctx and the same passthrough-ness joins it. All other
 * requests are collected on a requeue list that becomes the new
 * plug->mq_list, so the caller can loop until the plug list is empty.
 *
 * The first request is dereferenced without a NULL check, so plug->mq_list
 * must not be empty on entry.
 *
 * @from_sched is passed on as the 'async' argument when running the
 * hardware queue.
 */
static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
{
	struct blk_mq_hw_ctx *this_hctx = NULL;
	struct blk_mq_ctx *this_ctx = NULL;
	struct request *requeue_list = NULL;
	struct request **requeue_lastp = &requeue_list;
	unsigned int depth = 0;
	bool is_passthrough = false;
	LIST_HEAD(list);

	do {
		struct request *rq = rq_list_pop(&plug->mq_list);

		if (!this_hctx) {
			/* First request: it defines what this batch holds. */
			this_hctx = rq->mq_hctx;
			this_ctx = rq->mq_ctx;
			is_passthrough = blk_rq_is_passthrough(rq);
		} else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx ||
			   is_passthrough != blk_rq_is_passthrough(rq)) {
			/* Belongs to another batch: keep it for a later call. */
			rq_list_add_tail(&requeue_lastp, rq);
			continue;
		}
		list_add(&rq->queuelist, &list);
		depth++;
	} while (!rq_list_empty(plug->mq_list));

	/* Whatever did not match stays plugged. */
	plug->mq_list = requeue_list;
	trace_block_unplug(this_hctx->queue, depth, !from_sched);

	/* Hold a queue usage reference for the duration of the insert. */
	percpu_ref_get(&this_hctx->queue->q_usage_counter);
	/* passthrough requests should never be issued to the I/O scheduler */
	if (is_passthrough) {
		spin_lock(&this_hctx->lock);
		list_splice_tail_init(&list, &this_hctx->dispatch);
		spin_unlock(&this_hctx->lock);
		blk_mq_run_hw_queue(this_hctx, from_sched);
	} else if (this_hctx->queue->elevator) {
		this_hctx->queue->elevator->type->ops.insert_requests(this_hctx,
				&list, 0);
		blk_mq_run_hw_queue(this_hctx, from_sched);
	} else {
		blk_mq_insert_requests(this_hctx, this_ctx, &list, from_sched);
	}
	percpu_ref_put(&this_hctx->queue->q_usage_counter);
}
2752
/*
 * Flush the requests collected on plug->mq_list.
 *
 * When the plug holds requests for a single queue without an elevator and
 * the flush is not from schedule(), the requests are first handed to the
 * driver's ->queue_rqs() hook (if present and the tags are not queue-shared)
 * and then issued directly one by one. Anything still on the list, and every
 * plug that does not meet those conditions, is dispatched in batches through
 * blk_mq_dispatch_plug_list().
 */
void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
{
	struct request *rq;

	/*
	 * We may have been called recursively midway through handling
	 * plug->mq_list via a schedule() in the driver's queue_rq() callback.
	 * To avoid mq_list changing under our feet, clear rq_count early and
	 * bail out specifically if rq_count is 0 rather than checking
	 * whether the mq_list is empty.
	 */
	if (plug->rq_count == 0)
		return;
	plug->rq_count = 0;

	if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) {
		struct request_queue *q;

		rq = rq_list_peek(&plug->mq_list);
		q = rq->q;

		/*
		 * Peek first request and see if we have a ->queue_rqs() hook.
		 * If we do, we can dispatch the whole plug list in one go. We
		 * already know at this point that all requests belong to the
		 * same queue, caller must ensure that's the case.
		 *
		 * Since we pass off the full list to the driver at this point,
		 * we do not increment the active request count for the queue.
		 * Bypass shared tags for now because of that.
		 */
		if (q->mq_ops->queue_rqs &&
		    !(rq->mq_hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
			blk_mq_run_dispatch_ops(q,
				__blk_mq_flush_plug_list(q, plug));
			if (rq_list_empty(plug->mq_list))
				return;
		}

		/* Issue what is left on the list one request at a time. */
		blk_mq_run_dispatch_ops(q,
				blk_mq_plug_issue_direct(plug));
		if (rq_list_empty(plug->mq_list))
			return;
	}

	/* Generic path: dispatch batch by batch until the plug is empty. */
	do {
		blk_mq_dispatch_plug_list(plug, from_schedule);
	} while (!rq_list_empty(plug->mq_list));
}
2802
/*
 * Issue the requests on @list directly to the driver, front to back.
 *
 * Each request is unlinked from @list before it is issued. On a resource
 * status the request is put on hctx->dispatch and the loop stops, leaving
 * the remaining requests on @list for the caller; the hardware queue is run
 * only if that request was the last one on @list. Any other error status
 * ends the request. blk_mq_commit_rqs() is called with the number of
 * successfully issued requests if the last status was not BLK_STS_OK.
 */
static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
		struct list_head *list)
{
	int queued = 0;
	blk_status_t ret = BLK_STS_OK;

	while (!list_empty(list)) {
		struct request *rq = list_first_entry(list, struct request,
				queuelist);

		list_del_init(&rq->queuelist);
		/* 'last' is true when rq was the final entry of @list. */
		ret = blk_mq_request_issue_directly(rq, list_empty(list));
		switch (ret) {
		case BLK_STS_OK:
			queued++;
			break;
		case BLK_STS_RESOURCE:
		case BLK_STS_DEV_RESOURCE:
			blk_mq_request_bypass_insert(rq, 0);
			if (list_empty(list))
				blk_mq_run_hw_queue(hctx, false);
			goto out;
		default:
			blk_mq_end_request(rq, ret);
			break;
		}
	}

out:
	if (ret != BLK_STS_OK)
		blk_mq_commit_rqs(hctx, queued, false);
}
2835
2836static bool blk_mq_attempt_bio_merge(struct request_queue *q,
2837				     struct bio *bio, unsigned int nr_segs)
2838{
2839	if (!blk_queue_nomerges(q) && bio_mergeable(bio)) {
2840		if (blk_attempt_plug_merge(q, bio, nr_segs))
2841			return true;
2842		if (blk_mq_sched_bio_merge(q, bio, nr_segs))
2843			return true;
2844	}
2845	return false;
2846}
2847
/*
 * Allocate a new request for @bio on @q.
 *
 * Enters the queue via bio_queue_enter(), tries to merge the bio, applies
 * rq-qos throttling and then allocates. With a plug, plug->nr_ios requests
 * are asked for in one go (extras going to plug->cached_rq) and
 * plug->nr_ios is reset to 1.
 *
 * Returns the request on success. Returns NULL if the queue could not be
 * entered, if the bio was merged, or if allocation failed; in the latter two
 * cases the queue reference is dropped again, and on allocation failure a
 * REQ_NOWAIT bio is completed through bio_wouldblock_error().
 */
static struct request *blk_mq_get_new_requests(struct request_queue *q,
					       struct blk_plug *plug,
					       struct bio *bio,
					       unsigned int nsegs)
{
	struct blk_mq_alloc_data data = {
		.q		= q,
		.nr_tags	= 1,
		.cmd_flags	= bio->bi_opf,
	};
	struct request *rq;

	if (unlikely(bio_queue_enter(bio)))
		return NULL;

	/* Merged into an existing request: no new request is needed. */
	if (blk_mq_attempt_bio_merge(q, bio, nsegs))
		goto queue_exit;

	rq_qos_throttle(q, bio);

	if (plug) {
		data.nr_tags = plug->nr_ios;
		plug->nr_ios = 1;
		data.cached_rq = &plug->cached_rq;
	}

	rq = __blk_mq_alloc_requests(&data);
	if (rq)
		return rq;
	/* Allocation failed: undo the throttle accounting for this bio. */
	rq_qos_cleanup(q, bio);
	if (bio->bi_opf & REQ_NOWAIT)
		bio_wouldblock_error(bio);
queue_exit:
	blk_queue_exit(q);
	return NULL;
}
2884
/*
 * Try to reuse a request from plug->cached_rq for *@bio.
 *
 * Returns the cached request, removed from the plug's cache and
 * re-initialised for *@bio, when one is usable. Returns NULL otherwise:
 *  - with *@bio set to NULL if the bio was merged into an existing request,
 *    so the caller has nothing left to submit;
 *  - with *@bio untouched if there is no plug, no cached request, the cached
 *    request belongs to another queue, its hctx type does not fit the bio,
 *    or it differs from the bio in op_is_flush().
 */
static inline struct request *blk_mq_get_cached_request(struct request_queue *q,
		struct blk_plug *plug, struct bio **bio, unsigned int nsegs)
{
	struct request *rq;
	enum hctx_type type, hctx_type;

	if (!plug)
		return NULL;
	rq = rq_list_peek(&plug->cached_rq);
	if (!rq || rq->q != q)
		return NULL;

	if (blk_mq_attempt_bio_merge(q, *bio, nsegs)) {
		*bio = NULL;
		return NULL;
	}

	/*
	 * The cached request is usable when its hctx type equals the type
	 * the bio maps to; a HCTX_TYPE_READ bio may also use a
	 * HCTX_TYPE_DEFAULT request.
	 */
	type = blk_mq_get_hctx_type((*bio)->bi_opf);
	hctx_type = rq->mq_hctx->type;
	if (type != hctx_type &&
	    !(type == HCTX_TYPE_READ && hctx_type == HCTX_TYPE_DEFAULT))
		return NULL;
	if (op_is_flush(rq->cmd_flags) != op_is_flush((*bio)->bi_opf))
		return NULL;

	/*
	 * If any qos ->throttle() end up blocking, we will have flushed the
	 * plug and hence killed the cached_rq list as well. Pop this entry
	 * before we throttle.
	 */
	plug->cached_rq = rq_list_next(rq);
	rq_qos_throttle(q, *bio);

	/* Re-initialise the parts of the request that depend on the bio. */
	blk_mq_rq_time_init(rq, 0);
	rq->cmd_flags = (*bio)->bi_opf;
	INIT_LIST_HEAD(&rq->queuelist);
	return rq;
}
2923
2924static void bio_set_ioprio(struct bio *bio)
2925{
2926	/* Nobody set ioprio so far? Initialize it based on task's nice value */
2927	if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) == IOPRIO_CLASS_NONE)
2928		bio->bi_ioprio = get_current_ioprio();
2929	blkcg_set_ioprio(bio);
2930}
2931
/**
 * blk_mq_submit_bio - Create and send a request to block device.
 * @bio: Bio pointer.
 *
 * Builds up a request structure from @q and @bio and send to the device. The
 * request may not be queued directly to hardware if:
 * * This request can be merged with another one
 * * We want to place request at plug queue for possible future merging
 * * There is an IO scheduler active at this queue
 *
 * It will not queue the request if there is an error with the bio, or at the
 * request creation.
 */
void blk_mq_submit_bio(struct bio *bio)
{
	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
	struct blk_plug *plug = blk_mq_plug(bio);
	const int is_sync = op_is_sync(bio->bi_opf);
	struct blk_mq_hw_ctx *hctx;
	struct request *rq;
	unsigned int nr_segs = 1;
	blk_status_t ret;

	bio = blk_queue_bounce(bio, q);
	/* Split the bio if it may exceed the queue limits. */
	if (bio_may_exceed_limits(bio, &q->limits)) {
		bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
		if (!bio)
			return;
	}

	if (!bio_integrity_prep(bio))
		return;

	bio_set_ioprio(bio);

	/*
	 * Prefer a request cached on the plug. A NULL request with a NULL
	 * bio means the bio was merged and there is nothing left to do;
	 * otherwise fall back to allocating a new request.
	 */
	rq = blk_mq_get_cached_request(q, plug, &bio, nr_segs);
	if (!rq) {
		if (!bio)
			return;
		rq = blk_mq_get_new_requests(q, plug, bio, nr_segs);
		if (unlikely(!rq))
			return;
	}

	trace_block_getrq(bio);

	rq_qos_track(q, rq, bio);

	blk_mq_bio_to_request(rq, bio, nr_segs);

	/* No crypto keyslot: fail the bio with that status and free rq. */
	ret = blk_crypto_rq_get_keyslot(rq);
	if (ret != BLK_STS_OK) {
		bio->bi_status = ret;
		bio_endio(bio);
		blk_mq_free_request(rq);
		return;
	}

	/*
	 * NOTE(review): a true return from blk_insert_flush() presumably
	 * means the flush code has taken over the request -- confirm.
	 */
	if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq))
		return;

	/* With a plug, just collect the request; it is issued on flush. */
	if (plug) {
		blk_add_rq_to_plug(plug, rq);
		return;
	}

	/*
	 * Insert and run asynchronously when the request must go through the
	 * scheduler, or when the hardware queue is busy and either there is
	 * only one hardware queue or the IO is not sync. Otherwise try to
	 * issue directly.
	 */
	hctx = rq->mq_hctx;
	if ((rq->rq_flags & RQF_USE_SCHED) ||
	    (hctx->dispatch_busy && (q->nr_hw_queues == 1 || !is_sync))) {
		blk_mq_insert_request(rq, 0);
		blk_mq_run_hw_queue(hctx, true);
	} else {
		blk_mq_run_dispatch_ops(q, blk_mq_try_issue_directly(hctx, rq));
	}
}
3007
3008#ifdef CONFIG_BLK_MQ_STACKING
3009/**
3010 * blk_insert_cloned_request - Helper for stacking drivers to submit a request
3011 * @rq: the request being queued
3012 */
3013blk_status_t blk_insert_cloned_request(struct request *rq)
3014{
3015	struct request_queue *q = rq->q;
3016	unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq));
3017	unsigned int max_segments = blk_rq_get_max_segments(rq);
3018	blk_status_t ret;
3019
3020	if (blk_rq_sectors(rq) > max_sectors) {
3021		/*
3022		 * SCSI device does not have a good way to return if
3023		 * Write Same/Zero is actually supported. If a device rejects
3024		 * a non-read/write command (discard, write same,etc.) the
3025		 * low-level device driver will set the relevant queue limit to
3026		 * 0 to prevent blk-lib from issuing more of the offending
3027		 * operations. Commands queued prior to the queue limit being
3028		 * reset need to be completed with BLK_STS_NOTSUPP to avoid I/O
3029		 * errors being propagated to upper layers.
3030		 */
3031		if (max_sectors == 0)
3032			return BLK_STS_NOTSUPP;
3033
3034		printk(KERN_ERR "%s: over max size limit. (%u > %u)\n",
3035			__func__, blk_rq_sectors(rq), max_sectors);
3036		return BLK_STS_IOERR;
3037	}
3038
3039	/*
3040	 * The queue settings related to segment counting may differ from the
3041	 * original queue.
3042	 */
3043	rq->nr_phys_segments = blk_recalc_rq_segments(rq);
3044	if (rq->nr_phys_segments > max_segments) {
3045		printk(KERN_ERR "%s: over max segments limit. (%u > %u)\n",
3046			__func__, rq->nr_phys_segments, max_segments);
3047		return BLK_STS_IOERR;
3048	}
3049
3050	if (q->disk && should_fail_request(q->disk->part0, blk_rq_bytes(rq)))
3051		return BLK_STS_IOERR;
3052
3053	ret = blk_crypto_rq_get_keyslot(rq);
3054	if (ret != BLK_STS_OK)
3055		return ret;
3056
3057	blk_account_io_start(rq);
3058
3059	/*
3060	 * Since we have a scheduler attached on the top device,
3061	 * bypass a potential scheduler on the bottom device for
3062	 * insert.
3063	 */
3064	blk_mq_run_dispatch_ops(q,
3065			ret = blk_mq_request_issue_directly(rq, true));
3066	if (ret)
3067		blk_account_io_done(rq, ktime_get_ns());
3068	return ret;
3069}
3070EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
3071
3072/**
3073 * blk_rq_unprep_clone - Helper function to free all bios in a cloned request
3074 * @rq: the clone request to be cleaned up
3075 *
3076 * Description:
3077 *     Free all bios in @rq for a cloned request.
3078 */
3079void blk_rq_unprep_clone(struct request *rq)
3080{
3081	struct bio *bio;
3082
3083	while ((bio = rq->bio) != NULL) {
3084		rq->bio = bio->bi_next;
3085
3086		bio_put(bio);
3087	}
3088}
3089EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
3090
3091/**
3092 * blk_rq_prep_clone - Helper function to setup clone request
3093 * @rq: the request to be setup
3094 * @rq_src: original request to be cloned
3095 * @bs: bio_set that bios for clone are allocated from
3096 * @gfp_mask: memory allocation mask for bio
3097 * @bio_ctr: setup function to be called for each clone bio.
3098 *           Returns %0 for success, non %0 for failure.
3099 * @data: private data to be passed to @bio_ctr
3100 *
3101 * Description:
3102 *     Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq.
3103 *     Also, pages which the original bios are pointing to are not copied
3104 *     and the cloned bios just point same pages.
3105 *     So cloned bios must be completed before original bios, which means
3106 *     the caller must complete @rq before @rq_src.
3107 */
3108int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
3109		      struct bio_set *bs, gfp_t gfp_mask,
3110		      int (*bio_ctr)(struct bio *, struct bio *, void *),
3111		      void *data)
3112{
3113	struct bio *bio, *bio_src;
3114
3115	if (!bs)
3116		bs = &fs_bio_set;
3117
3118	__rq_for_each_bio(bio_src, rq_src) {
3119		bio = bio_alloc_clone(rq->q->disk->part0, bio_src, gfp_mask,
3120				      bs);
3121		if (!bio)
3122			goto free_and_out;
3123
3124		if (bio_ctr && bio_ctr(bio, bio_src, data))
3125			goto free_and_out;
3126
3127		if (rq->bio) {
3128			rq->biotail->bi_next = bio;
3129			rq->biotail = bio;
3130		} else {
3131			rq->bio = rq->biotail = bio;
3132		}
3133		bio = NULL;
3134	}
3135
3136	/* Copy attributes of the original request to the clone request. */
3137	rq->__sector = blk_rq_pos(rq_src);
3138	rq->__data_len = blk_rq_bytes(rq_src);
3139	if (rq_src->rq_flags & RQF_SPECIAL_PAYLOAD) {
3140		rq->rq_flags |= RQF_SPECIAL_PAYLOAD;
3141		rq->special_vec = rq_src->special_vec;
3142	}
3143	rq->nr_phys_segments = rq_src->nr_phys_segments;
3144	rq->ioprio = rq_src->ioprio;
3145
3146	if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0)
3147		goto free_and_out;
3148
3149	return 0;
3150
3151free_and_out:
3152	if (bio)
3153		bio_put(bio);
3154	blk_rq_unprep_clone(rq);
3155
3156	return -ENOMEM;
3157}
3158EXPORT_SYMBOL_GPL(blk_rq_prep_clone);
3159#endif /* CONFIG_BLK_MQ_STACKING */
3160
3161/*
3162 * Steal bios from a request and add them to a bio list.
3163 * The request must not have been partially completed before.
3164 */
3165void blk_steal_bios(struct bio_list *list, struct request *rq)
3166{
3167	if (rq->bio) {
3168		if (list->tail)
3169			list->tail->bi_next = rq->bio;
3170		else
3171			list->head = rq->bio;
3172		list->tail = rq->biotail;
3173
3174		rq->bio = NULL;
3175		rq->biotail = NULL;
3176	}
3177
3178	rq->__data_len = 0;
3179}
3180EXPORT_SYMBOL_GPL(blk_steal_bios);
3181
3182static size_t order_to_size(unsigned int order)
3183{
3184	return (size_t)PAGE_SIZE << order;
3185}
3186
/*
 * called before freeing request pool in @tags
 *
 * Clears every entry of @drv_tags->rqs[] that points into one of the pages
 * on @tags->page_list, then takes and releases @drv_tags->lock once.
 */
static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags,
				    struct blk_mq_tags *tags)
{
	struct page *page;
	unsigned long flags;

	/*
	 * There is no need to clear mapping if driver tags is not initialized
	 * or the mapping belongs to the driver tags.
	 */
	if (!drv_tags || drv_tags == tags)
		return;

	list_for_each_entry(page, &tags->page_list, lru) {
		/* [start, end) is the address range covered by this page block;
		 * page->private holds its allocation order. */
		unsigned long start = (unsigned long)page_address(page);
		unsigned long end = start + order_to_size(page->private);
		int i;

		for (i = 0; i < drv_tags->nr_tags; i++) {
			struct request *rq = drv_tags->rqs[i];
			unsigned long rq_addr = (unsigned long)rq;

			if (rq_addr >= start && rq_addr < end) {
				WARN_ON_ONCE(req_ref_read(rq) != 0);
				/* Only clear the slot if it still holds @rq. */
				cmpxchg(&drv_tags->rqs[i], rq, NULL);
			}
		}
	}

	/*
	 * Wait until all pending iteration is done.
	 *
	 * Request reference is cleared and it is guaranteed to be observed
	 * after the ->lock is released.
	 */
	spin_lock_irqsave(&drv_tags->lock, flags);
	spin_unlock_irqrestore(&drv_tags->lock, flags);
}
3226
3227void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
3228		     unsigned int hctx_idx)
3229{
3230	struct blk_mq_tags *drv_tags;
3231	struct page *page;
3232
3233	if (list_empty(&tags->page_list))
3234		return;
3235
3236	if (blk_mq_is_shared_tags(set->flags))
3237		drv_tags = set->shared_tags;
3238	else
3239		drv_tags = set->tags[hctx_idx];
3240
3241	if (tags->static_rqs && set->ops->exit_request) {
3242		int i;
3243
3244		for (i = 0; i < tags->nr_tags; i++) {
3245			struct request *rq = tags->static_rqs[i];
3246
3247			if (!rq)
3248				continue;
3249			set->ops->exit_request(set, rq, hctx_idx);
3250			tags->static_rqs[i] = NULL;
3251		}
3252	}
3253
3254	blk_mq_clear_rq_mapping(drv_tags, tags);
3255
3256	while (!list_empty(&tags->page_list)) {
3257		page = list_first_entry(&tags->page_list, struct page, lru);
3258		list_del_init(&page->lru);
3259		/*
3260		 * Remove kmemleak object previously allocated in
3261		 * blk_mq_alloc_rqs().
3262		 */
3263		kmemleak_free(page_address(page));
3264		__free_pages(page, page->private);
3265	}
3266}
3267
3268void blk_mq_free_rq_map(struct blk_mq_tags *tags)
3269{
3270	kfree(tags->rqs);
3271	tags->rqs = NULL;
3272	kfree(tags->static_rqs);
3273	tags->static_rqs = NULL;
3274
3275	blk_mq_free_tags(tags);
3276}
3277
3278static enum hctx_type hctx_idx_to_type(struct blk_mq_tag_set *set,
3279		unsigned int hctx_idx)
3280{
3281	int i;
3282
3283	for (i = 0; i < set->nr_maps; i++) {
3284		unsigned int start = set->map[i].queue_offset;
3285		unsigned int end = start + set->map[i].nr_queues;
3286
3287		if (hctx_idx >= start && hctx_idx < end)
3288			break;
3289	}
3290
3291	if (i >= set->nr_maps)
3292		i = HCTX_TYPE_DEFAULT;
3293
3294	return i;
3295}
3296
3297static int blk_mq_get_hctx_node(struct blk_mq_tag_set *set,
3298		unsigned int hctx_idx)
3299{
3300	enum hctx_type type = hctx_idx_to_type(set, hctx_idx);
3301
3302	return blk_mq_hw_queue_to_node(&set->map[type], hctx_idx);
3303}
3304
3305static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
3306					       unsigned int hctx_idx,
3307					       unsigned int nr_tags,
3308					       unsigned int reserved_tags)
3309{
3310	int node = blk_mq_get_hctx_node(set, hctx_idx);
3311	struct blk_mq_tags *tags;
3312
3313	if (node == NUMA_NO_NODE)
3314		node = set->numa_node;
3315
3316	tags = blk_mq_init_tags(nr_tags, reserved_tags, node,
3317				BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
3318	if (!tags)
3319		return NULL;
3320
3321	tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *),
3322				 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
3323				 node);
3324	if (!tags->rqs)
3325		goto err_free_tags;
3326
3327	tags->static_rqs = kcalloc_node(nr_tags, sizeof(struct request *),
3328					GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
3329					node);
3330	if (!tags->static_rqs)
3331		goto err_free_rqs;
3332
3333	return tags;
3334
3335err_free_rqs:
3336	kfree(tags->rqs);
3337err_free_tags:
3338	blk_mq_free_tags(tags);
3339	return NULL;
3340}
3341
3342static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
3343			       unsigned int hctx_idx, int node)
3344{
3345	int ret;
3346
3347	if (set->ops->init_request) {
3348		ret = set->ops->init_request(set, rq, hctx_idx, node);
3349		if (ret)
3350			return ret;
3351	}
3352
3353	WRITE_ONCE(rq->state, MQ_RQ_IDLE);
3354	return 0;
3355}
3356
/*
 * Allocate the backing pages for @depth requests of hardware queue @hctx_idx
 * and record each request in @tags->static_rqs[].
 *
 * Pages are requested zeroed on the queue's node (set->numa_node when the
 * queue has none), starting at order 4 and shrinking as needed.  Each page
 * block is linked on tags->page_list with its order stored in page->private,
 * and every request carved from it is passed to blk_mq_init_request().
 *
 * Returns 0 on success.  On failure blk_mq_free_rqs() is called and -ENOMEM
 * is returned.
 */
static int blk_mq_alloc_rqs(struct blk_mq_tag_set *set,
			    struct blk_mq_tags *tags,
			    unsigned int hctx_idx, unsigned int depth)
{
	unsigned int i, j, entries_per_page, max_order = 4;
	int node = blk_mq_get_hctx_node(set, hctx_idx);
	size_t rq_size, left;

	if (node == NUMA_NO_NODE)
		node = set->numa_node;

	INIT_LIST_HEAD(&tags->page_list);

	/*
	 * rq_size is the size of the request plus driver payload, rounded
	 * to the cacheline size
	 */
	rq_size = round_up(sizeof(struct request) + set->cmd_size,
				cache_line_size());
	/* Bytes still needed for the requests not yet placed. */
	left = rq_size * depth;

	/* i counts initialized requests; it advances in the inner loop. */
	for (i = 0; i < depth; ) {
		int this_order = max_order;
		struct page *page;
		int to_do;
		void *p;

		/* Shrink the order while the next lower one still exceeds
		 * what is left to allocate. */
		while (this_order && left < order_to_size(this_order - 1))
			this_order--;

		/*
		 * Retry with smaller orders until an allocation succeeds,
		 * order 0 has failed, or the next order could not hold even
		 * one request.
		 */
		do {
			page = alloc_pages_node(node,
				GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO,
				this_order);
			if (page)
				break;
			if (!this_order--)
				break;
			if (order_to_size(this_order) < rq_size)
				break;
		} while (1);

		if (!page)
			goto fail;

		/* Remember the order so the block can be freed later. */
		page->private = this_order;
		list_add_tail(&page->lru, &tags->page_list);

		p = page_address(page);
		/*
		 * Allow kmemleak to scan these pages as they contain pointers
		 * to additional allocations like via ops->init_request().
		 */
		kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO);
		entries_per_page = order_to_size(this_order) / rq_size;
		to_do = min(entries_per_page, depth - i);
		left -= to_do * rq_size;
		for (j = 0; j < to_do; j++) {
			struct request *rq = p;

			tags->static_rqs[i] = rq;
			if (blk_mq_init_request(set, rq, hctx_idx, node)) {
				/* Clear the slot before unwinding so the failed
				 * request is not seen via static_rqs[]. */
				tags->static_rqs[i] = NULL;
				goto fail;
			}

			p += rq_size;
			i++;
		}
	}
	return 0;

fail:
	blk_mq_free_rqs(set, tags, hctx_idx);
	return -ENOMEM;
}
3433
/* Cursor passed to blk_mq_has_request() through the tag iterator. */
struct rq_iter_data {
	struct blk_mq_hw_ctx *hctx;	/* hardware queue being searched for */
	bool has_rq;			/* set once a request on @hctx is seen */
};
3438
3439static bool blk_mq_has_request(struct request *rq, void *data)
3440{
3441	struct rq_iter_data *iter_data = data;
3442
3443	if (rq->mq_hctx != iter_data->hctx)
3444		return true;
3445	iter_data->has_rq = true;
3446	return false;
3447}
3448
3449static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx)
3450{
3451	struct blk_mq_tags *tags = hctx->sched_tags ?
3452			hctx->sched_tags : hctx->tags;
3453	struct rq_iter_data data = {
3454		.hctx	= hctx,
3455	};
3456
3457	blk_mq_all_tag_iter(tags, blk_mq_has_request, &data);
3458	return data.has_rq;
3459}
3460
3461static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu,
3462		struct blk_mq_hw_ctx *hctx)
3463{
3464	if (cpumask_first_and(hctx->cpumask, cpu_online_mask) != cpu)
3465		return false;
3466	if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids)
3467		return false;
3468	return true;
3469}
3470
/*
 * Hotplug callback for the instance embedded as hctx->cpuhp_online.
 *
 * When @cpu is the last online CPU mapped to the hctx, mark the hctx
 * BLK_MQ_S_INACTIVE and sleep-poll until no request remains on it.
 * Always returns 0.
 */
static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
{
	struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
			struct blk_mq_hw_ctx, cpuhp_online);

	/* Nothing to do unless @cpu is the final online CPU of this hctx. */
	if (!cpumask_test_cpu(cpu, hctx->cpumask) ||
	    !blk_mq_last_cpu_in_hctx(cpu, hctx))
		return 0;

	/*
	 * Prevent new request from being allocated on the current hctx.
	 *
	 * The smp_mb__after_atomic() Pairs with the implied barrier in
	 * test_and_set_bit_lock in sbitmap_get().  Ensures the inactive flag is
	 * seen once we return from the tag allocator.
	 */
	set_bit(BLK_MQ_S_INACTIVE, &hctx->state);
	smp_mb__after_atomic();

	/*
	 * Try to grab a reference to the queue and wait for any outstanding
	 * requests.  If we could not grab a reference the queue has been
	 * frozen and there are no requests.
	 */
	if (percpu_ref_tryget(&hctx->queue->q_usage_counter)) {
		/* Poll every 5 ms until the hctx has drained. */
		while (blk_mq_hctx_has_requests(hctx))
			msleep(5);
		percpu_ref_put(&hctx->queue->q_usage_counter);
	}

	return 0;
}
3503
3504static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
3505{
3506	struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
3507			struct blk_mq_hw_ctx, cpuhp_online);
3508
3509	if (cpumask_test_cpu(cpu, hctx->cpumask))
3510		clear_bit(BLK_MQ_S_INACTIVE, &hctx->state);
3511	return 0;
3512}
3513
3514/*
3515 * 'cpu' is going away. splice any existing rq_list entries from this
3516 * software queue to the hw queue dispatch list, and ensure that it
3517 * gets run.
3518 */
3519static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
3520{
3521	struct blk_mq_hw_ctx *hctx;
3522	struct blk_mq_ctx *ctx;
3523	LIST_HEAD(tmp);
3524	enum hctx_type type;
3525
3526	hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
3527	if (!cpumask_test_cpu(cpu, hctx->cpumask))
3528		return 0;
3529
3530	ctx = __blk_mq_get_ctx(hctx->queue, cpu);
3531	type = hctx->type;
3532
3533	spin_lock(&ctx->lock);
3534	if (!list_empty(&ctx->rq_lists[type])) {
3535		list_splice_init(&ctx->rq_lists[type], &tmp);
3536		blk_mq_hctx_clear_pending(hctx, ctx);
3537	}
3538	spin_unlock(&ctx->lock);
3539
3540	if (list_empty(&tmp))
3541		return 0;
3542
3543	spin_lock(&hctx->lock);
3544	list_splice_tail_init(&tmp, &hctx->dispatch);
3545	spin_unlock(&hctx->lock);
3546
3547	blk_mq_run_hw_queue(hctx, true);
3548	return 0;
3549}
3550
3551static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
3552{
3553	if (!(hctx->flags & BLK_MQ_F_STACKING))
3554		cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
3555						    &hctx->cpuhp_online);
3556	cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
3557					    &hctx->cpuhp_dead);
3558}
3559
3560/*
3561 * Before freeing hw queue, clearing the flush request reference in
3562 * tags->rqs[] for avoiding potential UAF.
3563 */
3564static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags,
3565		unsigned int queue_depth, struct request *flush_rq)
3566{
3567	int i;
3568	unsigned long flags;
3569
3570	/* The hw queue may not be mapped yet */
3571	if (!tags)
3572		return;
3573
3574	WARN_ON_ONCE(req_ref_read(flush_rq) != 0);
3575
3576	for (i = 0; i < queue_depth; i++)
3577		cmpxchg(&tags->rqs[i], flush_rq, NULL);
3578
3579	/*
3580	 * Wait until all pending iteration is done.
3581	 *
3582	 * Request reference is cleared and it is guaranteed to be observed
3583	 * after the ->lock is released.
3584	 */
3585	spin_lock_irqsave(&tags->lock, flags);
3586	spin_unlock_irqrestore(&tags->lock, flags);
3587}
3588
/*
 * Tear down @hctx (index @hctx_idx) and park it on q->unused_hctx_list.
 *
 * hctx->ctxs will be freed in queue's release handler
 */
static void blk_mq_exit_hctx(struct request_queue *q,
		struct blk_mq_tag_set *set,
		struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
	struct request *flush_rq = hctx->fq->flush_rq;

	if (blk_mq_hw_queue_mapped(hctx))
		blk_mq_tag_idle(hctx);

	/* Drop stale pointers to the flush request from the tag map before
	 * the driver's exit hook runs on it. */
	if (blk_queue_init_done(q))
		blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx],
				set->queue_depth, flush_rq);
	if (set->ops->exit_request)
		set->ops->exit_request(set, flush_rq, hctx_idx);

	if (set->ops->exit_hctx)
		set->ops->exit_hctx(hctx, hctx_idx);

	blk_mq_remove_cpuhp(hctx);

	xa_erase(&q->hctx_table, hctx_idx);

	/* Keep the hctx around on the unused list instead of freeing it. */
	spin_lock(&q->unused_hctx_lock);
	list_add(&hctx->hctx_list, &q->unused_hctx_list);
	spin_unlock(&q->unused_hctx_lock);
}
3616
3617static void blk_mq_exit_hw_queues(struct request_queue *q,
3618		struct blk_mq_tag_set *set, int nr_queue)
3619{
3620	struct blk_mq_hw_ctx *hctx;
3621	unsigned long i;
3622
3623	queue_for_each_hw_ctx(q, hctx, i) {
3624		if (i == nr_queue)
3625			break;
3626		blk_mq_exit_hctx(q, set, hctx, i);
3627	}
3628}
3629
/*
 * Bring @hctx up as hardware queue @hctx_idx of @q.
 *
 * Registers the CPU hotplug instances, points hctx->tags at
 * set->tags[hctx_idx], runs the driver's optional ->init_hctx() hook,
 * initializes the flush request and inserts the hctx into q->hctx_table.
 *
 * Returns 0 on success, -1 on failure with the completed steps undone.
 */
static int blk_mq_init_hctx(struct request_queue *q,
		struct blk_mq_tag_set *set,
		struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
{
	hctx->queue_num = hctx_idx;

	/* Mirrors blk_mq_remove_cpuhp(): no ONLINE instance when stacking. */
	if (!(hctx->flags & BLK_MQ_F_STACKING))
		cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
				&hctx->cpuhp_online);
	cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);

	hctx->tags = set->tags[hctx_idx];

	if (set->ops->init_hctx &&
	    set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
		goto unregister_cpu_notifier;

	if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx,
				hctx->numa_node))
		goto exit_hctx;

	if (xa_insert(&q->hctx_table, hctx_idx, hctx, GFP_KERNEL))
		goto exit_flush_rq;

	return 0;

	/* Unwind in reverse order of the setup steps above. */
 exit_flush_rq:
	if (set->ops->exit_request)
		set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx);
 exit_hctx:
	if (set->ops->exit_hctx)
		set->ops->exit_hctx(hctx, hctx_idx);
 unregister_cpu_notifier:
	blk_mq_remove_cpuhp(hctx);
	return -1;
}
3666
/*
 * Allocate and minimally initialize a hardware queue context for @q.
 *
 * Allocates the hctx, its cpumask, the ctxs[] array (nr_cpu_ids entries),
 * the ctx_map sbitmap and the flush queue, and initializes the locks, lists,
 * work item and kobject.  @node is used for the allocations; the stored
 * hctx->numa_node falls back to set->numa_node when @node is NUMA_NO_NODE.
 *
 * Returns the new hctx, or NULL if any allocation fails.
 */
static struct blk_mq_hw_ctx *
blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
		int node)
{
	struct blk_mq_hw_ctx *hctx;
	gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;

	hctx = kzalloc_node(sizeof(struct blk_mq_hw_ctx), gfp, node);
	if (!hctx)
		goto fail_alloc_hctx;

	if (!zalloc_cpumask_var_node(&hctx->cpumask, gfp, node))
		goto free_hctx;

	atomic_set(&hctx->nr_active, 0);
	if (node == NUMA_NO_NODE)
		node = set->numa_node;
	hctx->numa_node = node;

	INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
	spin_lock_init(&hctx->lock);
	INIT_LIST_HEAD(&hctx->dispatch);
	hctx->queue = q;
	/* The shared flag is tracked per hctx and starts out cleared. */
	hctx->flags = set->flags & ~BLK_MQ_F_TAG_QUEUE_SHARED;

	INIT_LIST_HEAD(&hctx->hctx_list);

	/*
	 * Allocate space for all possible cpus to avoid allocation at
	 * runtime
	 */
	hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *),
			gfp, node);
	if (!hctx->ctxs)
		goto free_cpumask;

	if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8),
				gfp, node, false, false))
		goto free_ctxs;
	hctx->nr_ctx = 0;

	spin_lock_init(&hctx->dispatch_wait_lock);
	init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
	INIT_LIST_HEAD(&hctx->dispatch_wait.entry);

	hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp);
	if (!hctx->fq)
		goto free_bitmap;

	blk_mq_hctx_kobj_init(hctx);

	return hctx;

	/* Unwind in reverse order of allocation. */
 free_bitmap:
	sbitmap_free(&hctx->ctx_map);
 free_ctxs:
	kfree(hctx->ctxs);
 free_cpumask:
	free_cpumask_var(hctx->cpumask);
 free_hctx:
	kfree(hctx);
 fail_alloc_hctx:
	return NULL;
}
3731
3732static void blk_mq_init_cpu_queues(struct request_queue *q,
3733				   unsigned int nr_hw_queues)
3734{
3735	struct blk_mq_tag_set *set = q->tag_set;
3736	unsigned int i, j;
3737
3738	for_each_possible_cpu(i) {
3739		struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
3740		struct blk_mq_hw_ctx *hctx;
3741		int k;
3742
3743		__ctx->cpu = i;
3744		spin_lock_init(&__ctx->lock);
3745		for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++)
3746			INIT_LIST_HEAD(&__ctx->rq_lists[k]);
3747
3748		__ctx->queue = q;
3749
3750		/*
3751		 * Set local node, IFF we have more than one hw queue. If
3752		 * not, we remain on the home node of the device
3753		 */
3754		for (j = 0; j < set->nr_maps; j++) {
3755			hctx = blk_mq_map_queue_type(q, j, i);
3756			if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
3757				hctx->numa_node = cpu_to_node(i);
3758		}
3759	}
3760}
3761
3762struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set,
3763					     unsigned int hctx_idx,
3764					     unsigned int depth)
3765{
3766	struct blk_mq_tags *tags;
3767	int ret;
3768
3769	tags = blk_mq_alloc_rq_map(set, hctx_idx, depth, set->reserved_tags);
3770	if (!tags)
3771		return NULL;
3772
3773	ret = blk_mq_alloc_rqs(set, tags, hctx_idx, depth);
3774	if (ret) {
3775		blk_mq_free_rq_map(tags);
3776		return NULL;
3777	}
3778
3779	return tags;
3780}
3781
3782static bool __blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set,
3783				       int hctx_idx)
3784{
3785	if (blk_mq_is_shared_tags(set->flags)) {
3786		set->tags[hctx_idx] = set->shared_tags;
3787
3788		return true;
3789	}
3790
3791	set->tags[hctx_idx] = blk_mq_alloc_map_and_rqs(set, hctx_idx,
3792						       set->queue_depth);
3793
3794	return set->tags[hctx_idx];
3795}
3796
/*
 * Free the requests of @tags and then the tag map itself.  A NULL @tags is
 * accepted and ignored.
 */
void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
			     struct blk_mq_tags *tags,
			     unsigned int hctx_idx)
{
	if (!tags)
		return;

	blk_mq_free_rqs(set, tags, hctx_idx);
	blk_mq_free_rq_map(tags);
}
3806
3807static void __blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
3808				      unsigned int hctx_idx)
3809{
3810	if (!blk_mq_is_shared_tags(set->flags))
3811		blk_mq_free_map_and_rqs(set, set->tags[hctx_idx], hctx_idx);
3812
3813	set->tags[hctx_idx] = NULL;
3814}
3815
/*
 * (Re)build the mapping between the per-CPU software queues and the hardware
 * queues of @q.
 *
 * Resets every hctx's cpumask and nr_ctx, walks all possible CPUs and queue
 * maps to attach each ctx to its hctx (allocating tags on demand), then makes
 * a final pass over the hctxs to release tags of unmapped queues and size the
 * ctx_map of mapped ones.
 */
static void blk_mq_map_swqueue(struct request_queue *q)
{
	unsigned int j, hctx_idx;
	unsigned long i;
	struct blk_mq_hw_ctx *hctx;
	struct blk_mq_ctx *ctx;
	struct blk_mq_tag_set *set = q->tag_set;

	/* Start from a clean slate for every hardware queue. */
	queue_for_each_hw_ctx(q, hctx, i) {
		cpumask_clear(hctx->cpumask);
		hctx->nr_ctx = 0;
		hctx->dispatch_from = NULL;
	}

	/*
	 * Map software to hardware queues.
	 *
	 * If the cpu isn't present, the cpu is mapped to first hctx.
	 */
	for_each_possible_cpu(i) {

		ctx = per_cpu_ptr(q->queue_ctx, i);
		for (j = 0; j < set->nr_maps; j++) {
			/* An empty map borrows the default-type mapping. */
			if (!set->map[j].nr_queues) {
				ctx->hctxs[j] = blk_mq_map_queue_type(q,
						HCTX_TYPE_DEFAULT, i);
				continue;
			}
			hctx_idx = set->map[j].mq_map[i];
			/* unmapped hw queue can be remapped after CPU topo changed */
			if (!set->tags[hctx_idx] &&
			    !__blk_mq_alloc_map_and_rqs(set, hctx_idx)) {
				/*
				 * If tags initialization fail for some hctx,
				 * that hctx won't be brought online.  In this
				 * case, remap the current ctx to hctx[0] which
				 * is guaranteed to always have tags allocated
				 */
				set->map[j].mq_map[i] = 0;
			}

			hctx = blk_mq_map_queue_type(q, j, i);
			ctx->hctxs[j] = hctx;
			/*
			 * If the CPU is already set in the mask, then we've
			 * mapped this one already. This can happen if
			 * devices share queues across queue maps.
			 */
			if (cpumask_test_cpu(i, hctx->cpumask))
				continue;

			cpumask_set_cpu(i, hctx->cpumask);
			hctx->type = j;
			/* Record the ctx's slot in hctx->ctxs[] before appending. */
			ctx->index_hw[hctx->type] = hctx->nr_ctx;
			hctx->ctxs[hctx->nr_ctx++] = ctx;

			/*
			 * If the nr_ctx type overflows, we have exceeded the
			 * amount of sw queues we can support.
			 */
			BUG_ON(!hctx->nr_ctx);
		}

		/* Types beyond set->nr_maps all use the default mapping. */
		for (; j < HCTX_MAX_TYPES; j++)
			ctx->hctxs[j] = blk_mq_map_queue_type(q,
					HCTX_TYPE_DEFAULT, i);
	}

	queue_for_each_hw_ctx(q, hctx, i) {
		/*
		 * If no software queues are mapped to this hardware queue,
		 * disable it and free the request entries.
		 */
		if (!hctx->nr_ctx) {
			/* Never unmap queue 0.  We need it as a
			 * fallback in case of a new remap fails
			 * allocation
			 */
			if (i)
				__blk_mq_free_map_and_rqs(set, i);

			hctx->tags = NULL;
			continue;
		}

		hctx->tags = set->tags[i];
		WARN_ON(!hctx->tags);

		/*
		 * Set the map size to the number of mapped software queues.
		 * This is more accurate and more efficient than looping
		 * over all possibly mapped software queues.
		 */
		sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx);

		/*
		 * Initialize batch roundrobin counts
		 */
		hctx->next_cpu = blk_mq_first_mapped_cpu(hctx);
		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
	}
}
3918
3919/*
3920 * Caller needs to ensure that we're either frozen/quiesced, or that
3921 * the queue isn't live yet.
3922 */
3923static void queue_set_hctx_shared(struct request_queue *q, bool shared)
3924{
3925	struct blk_mq_hw_ctx *hctx;
3926	unsigned long i;
3927
3928	queue_for_each_hw_ctx(q, hctx, i) {
3929		if (shared) {
3930			hctx->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
3931		} else {
3932			blk_mq_tag_idle(hctx);
3933			hctx->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
3934		}
3935	}
3936}
3937
3938static void blk_mq_update_tag_set_shared(struct blk_mq_tag_set *set,
3939					 bool shared)
3940{
3941	struct request_queue *q;
3942
3943	lockdep_assert_held(&set->tag_list_lock);
3944
3945	list_for_each_entry(q, &set->tag_list, tag_set_list) {
3946		blk_mq_freeze_queue(q);
3947		queue_set_hctx_shared(q, shared);
3948		blk_mq_unfreeze_queue(q);
3949	}
3950}
3951
3952static void blk_mq_del_queue_tag_set(struct request_queue *q)
3953{
3954	struct blk_mq_tag_set *set = q->tag_set;
3955
3956	mutex_lock(&set->tag_list_lock);
3957	list_del(&q->tag_set_list);
3958	if (list_is_singular(&set->tag_list)) {
3959		/* just transitioned to unshared */
3960		set->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
3961		/* update existing queue */
3962		blk_mq_update_tag_set_shared(set, false);
3963	}
3964	mutex_unlock(&set->tag_list_lock);
3965	INIT_LIST_HEAD(&q->tag_set_list);
3966}
3967
3968static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
3969				     struct request_queue *q)
3970{
3971	mutex_lock(&set->tag_list_lock);
3972
3973	/*
3974	 * Check to see if we're transitioning to shared (from 1 to 2 queues).
3975	 */
3976	if (!list_empty(&set->tag_list) &&
3977	    !(set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
3978		set->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
3979		/* update existing queue */
3980		blk_mq_update_tag_set_shared(set, true);
3981	}
3982	if (set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
3983		queue_set_hctx_shared(q, true);
3984	list_add_tail(&q->tag_set_list, &set->tag_list);
3985
3986	mutex_unlock(&set->tag_list_lock);
3987}
3988
3989/* All allocations will be freed in release handler of q->mq_kobj */
3990static int blk_mq_alloc_ctxs(struct request_queue *q)
3991{
3992	struct blk_mq_ctxs *ctxs;
3993	int cpu;
3994
3995	ctxs = kzalloc(sizeof(*ctxs), GFP_KERNEL);
3996	if (!ctxs)
3997		return -ENOMEM;
3998
3999	ctxs->queue_ctx = alloc_percpu(struct blk_mq_ctx);
4000	if (!ctxs->queue_ctx)
4001		goto fail;
4002
4003	for_each_possible_cpu(cpu) {
4004		struct blk_mq_ctx *ctx = per_cpu_ptr(ctxs->queue_ctx, cpu);
4005		ctx->ctxs = ctxs;
4006	}
4007
4008	q->mq_kobj = &ctxs->kobj;
4009	q->queue_ctx = ctxs->queue_ctx;
4010
4011	return 0;
4012 fail:
4013	kfree(ctxs);
4014	return -ENOMEM;
4015}
4016
4017/*
4018 * It is the actual release handler for mq, but we do it from
4019 * request queue's release handler for avoiding use-after-free
4020 * and headache because q->mq_kobj shouldn't have been introduced,
4021 * but we can't group ctx/kctx kobj without it.
4022 */
4023void blk_mq_release(struct request_queue *q)
4024{
4025	struct blk_mq_hw_ctx *hctx, *next;
4026	unsigned long i;
4027
4028	queue_for_each_hw_ctx(q, hctx, i)
4029		WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list));
4030
4031	/* all hctx are in .unused_hctx_list now */
4032	list_for_each_entry_safe(hctx, next, &q->unused_hctx_list, hctx_list) {
4033		list_del_init(&hctx->hctx_list);
4034		kobject_put(&hctx->kobj);
4035	}
4036
4037	xa_destroy(&q->hctx_table);
4038
4039	/*
4040	 * release .mq_kobj and sw queue's kobject now because
4041	 * both share lifetime with request queue.
4042	 */
4043	blk_mq_sysfs_deinit(q);
4044}
4045
4046static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
4047		void *queuedata)
4048{
4049	struct request_queue *q;
4050	int ret;
4051
4052	q = blk_alloc_queue(set->numa_node);
4053	if (!q)
4054		return ERR_PTR(-ENOMEM);
4055	q->queuedata = queuedata;
4056	ret = blk_mq_init_allocated_queue(set, q);
4057	if (ret) {
4058		blk_put_queue(q);
4059		return ERR_PTR(ret);
4060	}
4061	return q;
4062}
4063
/*
 * Allocate and initialize a request queue for @set with no queuedata.
 * Returns the queue or an ERR_PTR() on failure; see
 * blk_mq_init_queue_data().
 */
struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
{
	return blk_mq_init_queue_data(set, NULL);
}
EXPORT_SYMBOL(blk_mq_init_queue);
4069
/**
 * blk_mq_destroy_queue - shutdown a request queue
 * @q: request queue to shutdown
 *
 * This shuts down a request queue allocated by blk_mq_init_queue(). All future
 * requests will be failed with -ENODEV. The caller is responsible for dropping
 * the reference from blk_mq_init_queue() by calling blk_put_queue().
 *
 * Context: can sleep
 */
void blk_mq_destroy_queue(struct request_queue *q)
{
	/* Only valid for an mq queue that is no longer registered. */
	WARN_ON_ONCE(!queue_is_mq(q));
	WARN_ON_ONCE(blk_queue_registered(q));

	might_sleep();

	/* Flag the queue as dying, start the drain and wait for the freeze. */
	blk_queue_flag_set(QUEUE_FLAG_DYING, q);
	blk_queue_start_drain(q);
	blk_mq_freeze_queue_wait(q);

	/* Sync the queue and its pending work before tearing it down. */
	blk_sync_queue(q);
	blk_mq_cancel_work_sync(q);
	blk_mq_exit_queue(q);
}
EXPORT_SYMBOL(blk_mq_destroy_queue);
4096
4097struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata,
4098		struct lock_class_key *lkclass)
4099{
4100	struct request_queue *q;
4101	struct gendisk *disk;
4102
4103	q = blk_mq_init_queue_data(set, queuedata);
4104	if (IS_ERR(q))
4105		return ERR_CAST(q);
4106
4107	disk = __alloc_disk_node(q, set->numa_node, lkclass);
4108	if (!disk) {
4109		blk_mq_destroy_queue(q);
4110		blk_put_queue(q);
4111		return ERR_PTR(-ENOMEM);
4112	}
4113	set_bit(GD_OWNS_QUEUE, &disk->state);
4114	return disk;
4115}
4116EXPORT_SYMBOL(__blk_mq_alloc_disk);
4117
4118struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q,
4119		struct lock_class_key *lkclass)
4120{
4121	struct gendisk *disk;
4122
4123	if (!blk_get_queue(q))
4124		return NULL;
4125	disk = __alloc_disk_node(q, NUMA_NO_NODE, lkclass);
4126	if (!disk)
4127		blk_put_queue(q);
4128	return disk;
4129}
4130EXPORT_SYMBOL(blk_mq_alloc_disk_for_queue);
4131
4132static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
4133		struct blk_mq_tag_set *set, struct request_queue *q,
4134		int hctx_idx, int node)
4135{
4136	struct blk_mq_hw_ctx *hctx = NULL, *tmp;
4137
4138	/* reuse dead hctx first */
4139	spin_lock(&q->unused_hctx_lock);
4140	list_for_each_entry(tmp, &q->unused_hctx_list, hctx_list) {
4141		if (tmp->numa_node == node) {
4142			hctx = tmp;
4143			break;
4144		}
4145	}
4146	if (hctx)
4147		list_del_init(&hctx->hctx_list);
4148	spin_unlock(&q->unused_hctx_lock);
4149
4150	if (!hctx)
4151		hctx = blk_mq_alloc_hctx(q, set, node);
4152	if (!hctx)
4153		goto fail;
4154
4155	if (blk_mq_init_hctx(q, set, hctx, hctx_idx))
4156		goto free_hctx;
4157
4158	return hctx;
4159
4160 free_hctx:
4161	kobject_put(&hctx->kobj);
4162 fail:
4163	return NULL;
4164}
4165
/*
 * (Re)create the hardware queue contexts of @q so that indexes
 * 0 .. set->nr_hw_queues - 1 are populated, then exit any hctx left beyond
 * the resulting queue count.
 *
 * q->nr_hw_queues is only updated when every index was set up; otherwise the
 * previous value is kept.  Runs under q->sysfs_lock.
 */
static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
						struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned long i, j;

	/* protect against switching io scheduler  */
	mutex_lock(&q->sysfs_lock);
	for (i = 0; i < set->nr_hw_queues; i++) {
		int old_node;
		int node = blk_mq_get_hctx_node(set, i);
		struct blk_mq_hw_ctx *old_hctx = xa_load(&q->hctx_table, i);

		/* Exit the existing hctx at this index; remember its node for
		 * the fallback below. */
		if (old_hctx) {
			old_node = old_hctx->numa_node;
			blk_mq_exit_hctx(q, set, old_hctx, i);
		}

		if (!blk_mq_alloc_and_init_hctx(set, q, i, node)) {
			/* No previous hctx to fall back to: stop growing. */
			if (!old_hctx)
				break;
			pr_warn("Allocate new hctx on node %d fails, fallback to previous one on node %d\n",
					node, old_node);
			hctx = blk_mq_alloc_and_init_hctx(set, q, i, old_node);
			WARN_ON_ONCE(!hctx);
		}
	}
	/*
	 * Increasing nr_hw_queues fails. Free the newly allocated
	 * hctxs and keep the previous q->nr_hw_queues.
	 */
	if (i != set->nr_hw_queues) {
		j = q->nr_hw_queues;
	} else {
		j = i;
		q->nr_hw_queues = set->nr_hw_queues;
	}

	/* Exit every hctx whose index is at or above the final count. */
	xa_for_each_start(&q->hctx_table, j, hctx, j)
		blk_mq_exit_hctx(q, set, hctx, j);
	mutex_unlock(&q->sysfs_lock);
}
4208
4209static void blk_mq_update_poll_flag(struct request_queue *q)
4210{
4211	struct blk_mq_tag_set *set = q->tag_set;
4212
4213	if (set->nr_maps > HCTX_TYPE_POLL &&
4214	    set->map[HCTX_TYPE_POLL].nr_queues)
4215		blk_queue_flag_set(QUEUE_FLAG_POLL, q);
4216	else
4217		blk_queue_flag_clear(QUEUE_FLAG_POLL, q);
4218}
4219
/*
 * Initialize the already allocated queue @q as a blk-mq queue backed by @set.
 *
 * Sets up the software queues and their sysfs objects, creates the hardware
 * queue contexts, initializes work items, lists and defaults, adds the queue
 * to the tag set and maps software to hardware queues.
 *
 * Returns 0 on success.  On failure q->mq_ops is reset to NULL and -ENOMEM
 * is returned.
 */
int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
		struct request_queue *q)
{
	/* mark the queue as mq asap */
	q->mq_ops = set->ops;

	if (blk_mq_alloc_ctxs(q))
		goto err_exit;

	/* init q->mq_kobj and sw queues' kobjects */
	blk_mq_sysfs_init(q);

	INIT_LIST_HEAD(&q->unused_hctx_list);
	spin_lock_init(&q->unused_hctx_lock);

	xa_init(&q->hctx_table);

	/* q->nr_hw_queues stays 0 if no hctx could be set up. */
	blk_mq_realloc_hw_ctxs(set, q);
	if (!q->nr_hw_queues)
		goto err_hctxs;

	INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
	/* Default request timeout is 30 seconds unless the set overrides it. */
	blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);

	q->tag_set = set;

	q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
	blk_mq_update_poll_flag(q);

	INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
	INIT_LIST_HEAD(&q->flush_list);
	INIT_LIST_HEAD(&q->requeue_list);
	spin_lock_init(&q->requeue_lock);

	q->nr_requests = set->queue_depth;

	blk_mq_init_cpu_queues(q, set->nr_hw_queues);
	blk_mq_add_queue_tag_set(set, q);
	blk_mq_map_swqueue(q);
	return 0;

err_hctxs:
	blk_mq_release(q);
err_exit:
	q->mq_ops = NULL;
	return -ENOMEM;
}
EXPORT_SYMBOL(blk_mq_init_allocated_queue);
4268
/*
 * Exit all hardware queues of @q and remove @q from its tag set's queue list.
 * The hardware queues are exited first; the order of the two calls matters,
 * see the comments below.
 *
 * tags can _not_ be used after returning from blk_mq_exit_queue
 */
void blk_mq_exit_queue(struct request_queue *q)
{
	struct blk_mq_tag_set *set = q->tag_set;

	/* Checks hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED. */
	blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
	/* May clear BLK_MQ_F_TAG_QUEUE_SHARED in hctx->flags. */
	blk_mq_del_queue_tag_set(q);
}
4279
4280static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
4281{
4282	int i;
4283
4284	if (blk_mq_is_shared_tags(set->flags)) {
4285		set->shared_tags = blk_mq_alloc_map_and_rqs(set,
4286						BLK_MQ_NO_HCTX_IDX,
4287						set->queue_depth);
4288		if (!set->shared_tags)
4289			return -ENOMEM;
4290	}
4291
4292	for (i = 0; i < set->nr_hw_queues; i++) {
4293		if (!__blk_mq_alloc_map_and_rqs(set, i))
4294			goto out_unwind;
4295		cond_resched();
4296	}
4297
4298	return 0;
4299
4300out_unwind:
4301	while (--i >= 0)
4302		__blk_mq_free_map_and_rqs(set, i);
4303
4304	if (blk_mq_is_shared_tags(set->flags)) {
4305		blk_mq_free_map_and_rqs(set, set->shared_tags,
4306					BLK_MQ_NO_HCTX_IDX);
4307	}
4308
4309	return -ENOMEM;
4310}
4311
4312/*
4313 * Allocate the request maps associated with this tag_set. Note that this
4314 * may reduce the depth asked for, if memory is tight. set->queue_depth
4315 * will be updated to reflect the allocated depth.
4316 */
4317static int blk_mq_alloc_set_map_and_rqs(struct blk_mq_tag_set *set)
4318{
4319	unsigned int depth;
4320	int err;
4321
4322	depth = set->queue_depth;
4323	do {
4324		err = __blk_mq_alloc_rq_maps(set);
4325		if (!err)
4326			break;
4327
4328		set->queue_depth >>= 1;
4329		if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) {
4330			err = -ENOMEM;
4331			break;
4332		}
4333	} while (set->queue_depth);
4334
4335	if (!set->queue_depth || err) {
4336		pr_err("blk-mq: failed to allocate request map\n");
4337		return -ENOMEM;
4338	}
4339
4340	if (depth != set->queue_depth)
4341		pr_info("blk-mq: reduced tag depth (%u -> %u)\n",
4342						depth, set->queue_depth);
4343
4344	return 0;
4345}
4346
4347static void blk_mq_update_queue_map(struct blk_mq_tag_set *set)
4348{
4349	/*
4350	 * blk_mq_map_queues() and multiple .map_queues() implementations
4351	 * expect that set->map[HCTX_TYPE_DEFAULT].nr_queues is set to the
4352	 * number of hardware queues.
4353	 */
4354	if (set->nr_maps == 1)
4355		set->map[HCTX_TYPE_DEFAULT].nr_queues = set->nr_hw_queues;
4356
4357	if (set->ops->map_queues && !is_kdump_kernel()) {
4358		int i;
4359
4360		/*
4361		 * transport .map_queues is usually done in the following
4362		 * way:
4363		 *
4364		 * for (queue = 0; queue < set->nr_hw_queues; queue++) {
4365		 * 	mask = get_cpu_mask(queue)
4366		 * 	for_each_cpu(cpu, mask)
4367		 * 		set->map[x].mq_map[cpu] = queue;
4368		 * }
4369		 *
4370		 * When we need to remap, the table has to be cleared for
4371		 * killing stale mapping since one CPU may not be mapped
4372		 * to any hw queue.
4373		 */
4374		for (i = 0; i < set->nr_maps; i++)
4375			blk_mq_clear_mq_map(&set->map[i]);
4376
4377		set->ops->map_queues(set);
4378	} else {
4379		BUG_ON(set->nr_maps > 1);
4380		blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
4381	}
4382}
4383
4384static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set,
4385				       int new_nr_hw_queues)
4386{
4387	struct blk_mq_tags **new_tags;
4388
4389	if (set->nr_hw_queues >= new_nr_hw_queues)
4390		goto done;
4391
4392	new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *),
4393				GFP_KERNEL, set->numa_node);
4394	if (!new_tags)
4395		return -ENOMEM;
4396
4397	if (set->tags)
4398		memcpy(new_tags, set->tags, set->nr_hw_queues *
4399		       sizeof(*set->tags));
4400	kfree(set->tags);
4401	set->tags = new_tags;
4402done:
4403	set->nr_hw_queues = new_nr_hw_queues;
4404	return 0;
4405}
4406
4407/*
4408 * Alloc a tag set to be associated with one or more request queues.
4409 * May fail with EINVAL for various error conditions. May adjust the
4410 * requested depth down, if it's too large. In that case, the set
4411 * value will be stored in set->queue_depth.
4412 */
4413int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
4414{
4415	int i, ret;
4416
4417	BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);
4418
4419	if (!set->nr_hw_queues)
4420		return -EINVAL;
4421	if (!set->queue_depth)
4422		return -EINVAL;
4423	if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
4424		return -EINVAL;
4425
4426	if (!set->ops->queue_rq)
4427		return -EINVAL;
4428
4429	if (!set->ops->get_budget ^ !set->ops->put_budget)
4430		return -EINVAL;
4431
4432	if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
4433		pr_info("blk-mq: reduced tag depth to %u\n",
4434			BLK_MQ_MAX_DEPTH);
4435		set->queue_depth = BLK_MQ_MAX_DEPTH;
4436	}
4437
4438	if (!set->nr_maps)
4439		set->nr_maps = 1;
4440	else if (set->nr_maps > HCTX_MAX_TYPES)
4441		return -EINVAL;
4442
4443	/*
4444	 * If a crashdump is active, then we are potentially in a very
4445	 * memory constrained environment. Limit us to 1 queue and
4446	 * 64 tags to prevent using too much memory.
4447	 */
4448	if (is_kdump_kernel()) {
4449		set->nr_hw_queues = 1;
4450		set->nr_maps = 1;
4451		set->queue_depth = min(64U, set->queue_depth);
4452	}
4453	/*
4454	 * There is no use for more h/w queues than cpus if we just have
4455	 * a single map
4456	 */
4457	if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids)
4458		set->nr_hw_queues = nr_cpu_ids;
4459
4460	if (set->flags & BLK_MQ_F_BLOCKING) {
4461		set->srcu = kmalloc(sizeof(*set->srcu), GFP_KERNEL);
4462		if (!set->srcu)
4463			return -ENOMEM;
4464		ret = init_srcu_struct(set->srcu);
4465		if (ret)
4466			goto out_free_srcu;
4467	}
4468
4469	ret = -ENOMEM;
4470	set->tags = kcalloc_node(set->nr_hw_queues,
4471				 sizeof(struct blk_mq_tags *), GFP_KERNEL,
4472				 set->numa_node);
4473	if (!set->tags)
4474		goto out_cleanup_srcu;
4475
4476	for (i = 0; i < set->nr_maps; i++) {
4477		set->map[i].mq_map = kcalloc_node(nr_cpu_ids,
4478						  sizeof(set->map[i].mq_map[0]),
4479						  GFP_KERNEL, set->numa_node);
4480		if (!set->map[i].mq_map)
4481			goto out_free_mq_map;
4482		set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues;
4483	}
4484
4485	blk_mq_update_queue_map(set);
4486
4487	ret = blk_mq_alloc_set_map_and_rqs(set);
4488	if (ret)
4489		goto out_free_mq_map;
4490
4491	mutex_init(&set->tag_list_lock);
4492	INIT_LIST_HEAD(&set->tag_list);
4493
4494	return 0;
4495
4496out_free_mq_map:
4497	for (i = 0; i < set->nr_maps; i++) {
4498		kfree(set->map[i].mq_map);
4499		set->map[i].mq_map = NULL;
4500	}
4501	kfree(set->tags);
4502	set->tags = NULL;
4503out_cleanup_srcu:
4504	if (set->flags & BLK_MQ_F_BLOCKING)
4505		cleanup_srcu_struct(set->srcu);
4506out_free_srcu:
4507	if (set->flags & BLK_MQ_F_BLOCKING)
4508		kfree(set->srcu);
4509	return ret;
4510}
4511EXPORT_SYMBOL(blk_mq_alloc_tag_set);
4512
4513/* allocate and initialize a tagset for a simple single-queue device */
4514int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set,
4515		const struct blk_mq_ops *ops, unsigned int queue_depth,
4516		unsigned int set_flags)
4517{
4518	memset(set, 0, sizeof(*set));
4519	set->ops = ops;
4520	set->nr_hw_queues = 1;
4521	set->nr_maps = 1;
4522	set->queue_depth = queue_depth;
4523	set->numa_node = NUMA_NO_NODE;
4524	set->flags = set_flags;
4525	return blk_mq_alloc_tag_set(set);
4526}
4527EXPORT_SYMBOL_GPL(blk_mq_alloc_sq_tag_set);
4528
4529void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
4530{
4531	int i, j;
4532
4533	for (i = 0; i < set->nr_hw_queues; i++)
4534		__blk_mq_free_map_and_rqs(set, i);
4535
4536	if (blk_mq_is_shared_tags(set->flags)) {
4537		blk_mq_free_map_and_rqs(set, set->shared_tags,
4538					BLK_MQ_NO_HCTX_IDX);
4539	}
4540
4541	for (j = 0; j < set->nr_maps; j++) {
4542		kfree(set->map[j].mq_map);
4543		set->map[j].mq_map = NULL;
4544	}
4545
4546	kfree(set->tags);
4547	set->tags = NULL;
4548	if (set->flags & BLK_MQ_F_BLOCKING) {
4549		cleanup_srcu_struct(set->srcu);
4550		kfree(set->srcu);
4551	}
4552}
4553EXPORT_SYMBOL(blk_mq_free_tag_set);
4554
4555int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
4556{
4557	struct blk_mq_tag_set *set = q->tag_set;
4558	struct blk_mq_hw_ctx *hctx;
4559	int ret;
4560	unsigned long i;
4561
4562	if (!set)
4563		return -EINVAL;
4564
4565	if (q->nr_requests == nr)
4566		return 0;
4567
4568	blk_mq_freeze_queue(q);
4569	blk_mq_quiesce_queue(q);
4570
4571	ret = 0;
4572	queue_for_each_hw_ctx(q, hctx, i) {
4573		if (!hctx->tags)
4574			continue;
4575		/*
4576		 * If we're using an MQ scheduler, just update the scheduler
4577		 * queue depth. This is similar to what the old code would do.
4578		 */
4579		if (hctx->sched_tags) {
4580			ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
4581						      nr, true);
4582		} else {
4583			ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr,
4584						      false);
4585		}
4586		if (ret)
4587			break;
4588		if (q->elevator && q->elevator->type->ops.depth_updated)
4589			q->elevator->type->ops.depth_updated(hctx);
4590	}
4591	if (!ret) {
4592		q->nr_requests = nr;
4593		if (blk_mq_is_shared_tags(set->flags)) {
4594			if (q->elevator)
4595				blk_mq_tag_update_sched_shared_tags(q);
4596			else
4597				blk_mq_tag_resize_shared_tags(set, nr);
4598		}
4599	}
4600
4601	blk_mq_unquiesce_queue(q);
4602	blk_mq_unfreeze_queue(q);
4603
4604	return ret;
4605}
4606
4607/*
4608 * request_queue and elevator_type pair.
4609 * It is just used by __blk_mq_update_nr_hw_queues to cache
4610 * the elevator_type associated with a request_queue.
4611 */
4612struct blk_mq_qe_pair {
4613	struct list_head node;
4614	struct request_queue *q;
4615	struct elevator_type *type;
4616};
4617
4618/*
4619 * Cache the elevator_type in qe pair list and switch the
4620 * io scheduler to 'none'
4621 */
4622static bool blk_mq_elv_switch_none(struct list_head *head,
4623		struct request_queue *q)
4624{
4625	struct blk_mq_qe_pair *qe;
4626
4627	qe = kmalloc(sizeof(*qe), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY);
4628	if (!qe)
4629		return false;
4630
4631	/* q->elevator needs protection from ->sysfs_lock */
4632	mutex_lock(&q->sysfs_lock);
4633
4634	/* the check has to be done with holding sysfs_lock */
4635	if (!q->elevator) {
4636		kfree(qe);
4637		goto unlock;
4638	}
4639
4640	INIT_LIST_HEAD(&qe->node);
4641	qe->q = q;
4642	qe->type = q->elevator->type;
4643	/* keep a reference to the elevator module as we'll switch back */
4644	__elevator_get(qe->type);
4645	list_add(&qe->node, head);
4646	elevator_disable(q);
4647unlock:
4648	mutex_unlock(&q->sysfs_lock);
4649
4650	return true;
4651}
4652
4653static struct blk_mq_qe_pair *blk_lookup_qe_pair(struct list_head *head,
4654						struct request_queue *q)
4655{
4656	struct blk_mq_qe_pair *qe;
4657
4658	list_for_each_entry(qe, head, node)
4659		if (qe->q == q)
4660			return qe;
4661
4662	return NULL;
4663}
4664
4665static void blk_mq_elv_switch_back(struct list_head *head,
4666				  struct request_queue *q)
4667{
4668	struct blk_mq_qe_pair *qe;
4669	struct elevator_type *t;
4670
4671	qe = blk_lookup_qe_pair(head, q);
4672	if (!qe)
4673		return;
4674	t = qe->type;
4675	list_del(&qe->node);
4676	kfree(qe);
4677
4678	mutex_lock(&q->sysfs_lock);
4679	elevator_switch(q, t);
4680	/* drop the reference acquired in blk_mq_elv_switch_none */
4681	elevator_put(t);
4682	mutex_unlock(&q->sysfs_lock);
4683}
4684
4685static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
4686							int nr_hw_queues)
4687{
4688	struct request_queue *q;
4689	LIST_HEAD(head);
4690	int prev_nr_hw_queues;
4691
4692	lockdep_assert_held(&set->tag_list_lock);
4693
4694	if (set->nr_maps == 1 && nr_hw_queues > nr_cpu_ids)
4695		nr_hw_queues = nr_cpu_ids;
4696	if (nr_hw_queues < 1)
4697		return;
4698	if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues)
4699		return;
4700
4701	list_for_each_entry(q, &set->tag_list, tag_set_list)
4702		blk_mq_freeze_queue(q);
4703	/*
4704	 * Switch IO scheduler to 'none', cleaning up the data associated
4705	 * with the previous scheduler. We will switch back once we are done
4706	 * updating the new sw to hw queue mappings.
4707	 */
4708	list_for_each_entry(q, &set->tag_list, tag_set_list)
4709		if (!blk_mq_elv_switch_none(&head, q))
4710			goto switch_back;
4711
4712	list_for_each_entry(q, &set->tag_list, tag_set_list) {
4713		blk_mq_debugfs_unregister_hctxs(q);
4714		blk_mq_sysfs_unregister_hctxs(q);
4715	}
4716
4717	prev_nr_hw_queues = set->nr_hw_queues;
4718	if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0)
4719		goto reregister;
4720
4721fallback:
4722	blk_mq_update_queue_map(set);
4723	list_for_each_entry(q, &set->tag_list, tag_set_list) {
4724		blk_mq_realloc_hw_ctxs(set, q);
4725		blk_mq_update_poll_flag(q);
4726		if (q->nr_hw_queues != set->nr_hw_queues) {
4727			int i = prev_nr_hw_queues;
4728
4729			pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n",
4730					nr_hw_queues, prev_nr_hw_queues);
4731			for (; i < set->nr_hw_queues; i++)
4732				__blk_mq_free_map_and_rqs(set, i);
4733
4734			set->nr_hw_queues = prev_nr_hw_queues;
4735			blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
4736			goto fallback;
4737		}
4738		blk_mq_map_swqueue(q);
4739	}
4740
4741reregister:
4742	list_for_each_entry(q, &set->tag_list, tag_set_list) {
4743		blk_mq_sysfs_register_hctxs(q);
4744		blk_mq_debugfs_register_hctxs(q);
4745	}
4746
4747switch_back:
4748	list_for_each_entry(q, &set->tag_list, tag_set_list)
4749		blk_mq_elv_switch_back(&head, q);
4750
4751	list_for_each_entry(q, &set->tag_list, tag_set_list)
4752		blk_mq_unfreeze_queue(q);
4753}
4754
4755void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
4756{
4757	mutex_lock(&set->tag_list_lock);
4758	__blk_mq_update_nr_hw_queues(set, nr_hw_queues);
4759	mutex_unlock(&set->tag_list_lock);
4760}
4761EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
4762
4763static int blk_hctx_poll(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
4764			 struct io_comp_batch *iob, unsigned int flags)
4765{
4766	long state = get_current_state();
4767	int ret;
4768
4769	do {
4770		ret = q->mq_ops->poll(hctx, iob);
4771		if (ret > 0) {
4772			__set_current_state(TASK_RUNNING);
4773			return ret;
4774		}
4775
4776		if (signal_pending_state(state, current))
4777			__set_current_state(TASK_RUNNING);
4778		if (task_is_running(current))
4779			return 1;
4780
4781		if (ret < 0 || (flags & BLK_POLL_ONESHOT))
4782			break;
4783		cpu_relax();
4784	} while (!need_resched());
4785
4786	__set_current_state(TASK_RUNNING);
4787	return 0;
4788}
4789
4790int blk_mq_poll(struct request_queue *q, blk_qc_t cookie,
4791		struct io_comp_batch *iob, unsigned int flags)
4792{
4793	struct blk_mq_hw_ctx *hctx = xa_load(&q->hctx_table, cookie);
4794
4795	return blk_hctx_poll(q, hctx, iob, flags);
4796}
4797
4798int blk_rq_poll(struct request *rq, struct io_comp_batch *iob,
4799		unsigned int poll_flags)
4800{
4801	struct request_queue *q = rq->q;
4802	int ret;
4803
4804	if (!blk_rq_is_poll(rq))
4805		return 0;
4806	if (!percpu_ref_tryget(&q->q_usage_counter))
4807		return 0;
4808
4809	ret = blk_hctx_poll(q, rq->mq_hctx, iob, poll_flags);
4810	blk_queue_exit(q);
4811
4812	return ret;
4813}
4814EXPORT_SYMBOL_GPL(blk_rq_poll);
4815
4816unsigned int blk_mq_rq_cpu(struct request *rq)
4817{
4818	return rq->mq_ctx->cpu;
4819}
4820EXPORT_SYMBOL(blk_mq_rq_cpu);
4821
4822void blk_mq_cancel_work_sync(struct request_queue *q)
4823{
4824	struct blk_mq_hw_ctx *hctx;
4825	unsigned long i;
4826
4827	cancel_delayed_work_sync(&q->requeue_work);
4828
4829	queue_for_each_hw_ctx(q, hctx, i)
4830		cancel_delayed_work_sync(&hctx->run_work);
4831}
4832
4833static int __init blk_mq_init(void)
4834{
4835	int i;
4836
4837	for_each_possible_cpu(i)
4838		init_llist_head(&per_cpu(blk_cpu_done, i));
4839	open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);
4840
4841	cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD,
4842				  "block/softirq:dead", NULL,
4843				  blk_softirq_cpu_dead);
4844	cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
4845				blk_mq_hctx_notify_dead);
4846	cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online",
4847				blk_mq_hctx_notify_online,
4848				blk_mq_hctx_notify_offline);
4849	return 0;
4850}
4851subsys_initcall(blk_mq_init);
Configure Feed

Configure Feed