Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
// SPDX-License-Identifier: GPL-2.0
/*
 * Block multiqueue core code
 *
 * Copyright (C) 2013-2014 Jens Axboe
 * Copyright (C) 2013-2014 Christoph Hellwig
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/kmemleak.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/llist.h>
#include <linux/cpu.h>
#include <linux/cache.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/topology.h>
#include <linux/sched/signal.h>
#include <linux/delay.h>
#include <linux/crash_dump.h>
#include <linux/prefetch.h>
#include <linux/blk-crypto.h>
#include <linux/part_stat.h>

#include <trace/events/block.h>

#include <linux/blk-mq.h>
#include <linux/t10-pi.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-mq-tag.h"
#include "blk-pm.h"
#include "blk-stat.h"
#include "blk-mq-sched.h"
#include "blk-rq-qos.h"

static DEFINE_PER_CPU(struct llist_head, blk_cpu_done);

static void blk_mq_poll_stats_start(struct request_queue *q);
static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);

static int blk_mq_poll_stats_bkt(const struct request *rq)
{
	int ddir, sectors, bucket;

	ddir = rq_data_dir(rq);
	sectors = blk_rq_stats_sectors(rq);

	bucket = ddir + 2 * ilog2(sectors);

	if (bucket < 0)
		return -1;
	else if (bucket >= BLK_MQ_POLL_STATS_BKTS)
		return ddir + BLK_MQ_POLL_STATS_BKTS - 2;

	return bucket;
}
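
/*
 * Illustrative numbers (my annotation, not from the source): for a 4KiB
 * read, sectors = 8, so ilog2(sectors) = 3 and, with ddir = READ = 0,
 * bucket = 0 + 2 * 3 = 6. The matching 4KiB write (ddir = 1) lands in
 * bucket 7. Reads and writes of the same size thus occupy adjacent
 * buckets, and anything past BLK_MQ_POLL_STATS_BKTS is clamped into the
 * last read/write pair by the branch above.
 */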

#define BLK_QC_T_SHIFT		16
#define BLK_QC_T_INTERNAL	(1U << 31)

static inline struct blk_mq_hw_ctx *blk_qc_to_hctx(struct request_queue *q,
		blk_qc_t qc)
{
	return q->queue_hw_ctx[(qc & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT];
}

static inline struct request *blk_qc_to_rq(struct blk_mq_hw_ctx *hctx,
		blk_qc_t qc)
{
	unsigned int tag = qc & ((1U << BLK_QC_T_SHIFT) - 1);

	if (qc & BLK_QC_T_INTERNAL)
		return blk_mq_tag_to_rq(hctx->sched_tags, tag);
	return blk_mq_tag_to_rq(hctx->tags, tag);
}

static inline blk_qc_t blk_rq_to_qc(struct request *rq)
{
	return (rq->mq_hctx->queue_num << BLK_QC_T_SHIFT) |
		(rq->tag != -1 ?
		 rq->tag : (rq->internal_tag | BLK_QC_T_INTERNAL));
}
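
/*
 * Layout sketch of the resulting cookie (my annotation, not from the
 * source): bit 31 flags a scheduler (internal) tag, bits 30..16 hold
 * the hardware queue number, and bits 15..0 hold the tag itself. E.g.
 * driver tag 5 on hardware queue 2 encodes as (2 << 16) | 5 =
 * 0x00020005, which blk_qc_to_hctx()/blk_qc_to_rq() above decode.
 */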

/*
 * Check if any of the ctx, dispatch list or elevator
 * has pending work in this hardware queue.
 */
static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
{
	return !list_empty_careful(&hctx->dispatch) ||
		sbitmap_any_bit_set(&hctx->ctx_map) ||
			blk_mq_sched_has_work(hctx);
}

/*
 * Mark this ctx as having pending work in this hardware queue
 */
static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
				     struct blk_mq_ctx *ctx)
{
	const int bit = ctx->index_hw[hctx->type];

	if (!sbitmap_test_bit(&hctx->ctx_map, bit))
		sbitmap_set_bit(&hctx->ctx_map, bit);
}

static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
				      struct blk_mq_ctx *ctx)
{
	const int bit = ctx->index_hw[hctx->type];

	sbitmap_clear_bit(&hctx->ctx_map, bit);
}

struct mq_inflight {
	struct block_device *part;
	unsigned int inflight[2];
};

static bool blk_mq_check_inflight(struct request *rq, void *priv,
				  bool reserved)
{
	struct mq_inflight *mi = priv;

	if ((!mi->part->bd_partno || rq->part == mi->part) &&
	    blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT)
		mi->inflight[rq_data_dir(rq)]++;

	return true;
}

unsigned int blk_mq_in_flight(struct request_queue *q,
		struct block_device *part)
{
	struct mq_inflight mi = { .part = part };

	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);

	return mi.inflight[0] + mi.inflight[1];
}

void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
		unsigned int inflight[2])
{
	struct mq_inflight mi = { .part = part };

	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
	inflight[0] = mi.inflight[0];
	inflight[1] = mi.inflight[1];
}

void blk_freeze_queue_start(struct request_queue *q)
{
	mutex_lock(&q->mq_freeze_lock);
	if (++q->mq_freeze_depth == 1) {
		percpu_ref_kill(&q->q_usage_counter);
		mutex_unlock(&q->mq_freeze_lock);
		if (queue_is_mq(q))
			blk_mq_run_hw_queues(q, false);
	} else {
		mutex_unlock(&q->mq_freeze_lock);
	}
}
EXPORT_SYMBOL_GPL(blk_freeze_queue_start);

void blk_mq_freeze_queue_wait(struct request_queue *q)
{
	wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait);

int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
				     unsigned long timeout)
{
	return wait_event_timeout(q->mq_freeze_wq,
					percpu_ref_is_zero(&q->q_usage_counter),
					timeout);
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout);

/*
 * Guarantee no request is in use, so we can change any data structure of
 * the queue afterward.
 */
void blk_freeze_queue(struct request_queue *q)
{
	/*
	 * In the !blk_mq case we are only calling this to kill the
	 * q_usage_counter, otherwise this increases the freeze depth
	 * and waits for it to return to zero.  For this reason there is
	 * no blk_unfreeze_queue(), and blk_freeze_queue() is not
	 * exported to drivers as the only user for unfreeze is blk_mq.
	 */
	blk_freeze_queue_start(q);
	blk_mq_freeze_queue_wait(q);
}

void blk_mq_freeze_queue(struct request_queue *q)
{
	/*
	 * ...just an alias to keep freeze and unfreeze actions balanced
	 * in the blk_mq_* namespace
	 */
	blk_freeze_queue(q);
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);

void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic)
{
	mutex_lock(&q->mq_freeze_lock);
	if (force_atomic)
		q->q_usage_counter.data->force_atomic = true;
	q->mq_freeze_depth--;
	WARN_ON_ONCE(q->mq_freeze_depth < 0);
	if (!q->mq_freeze_depth) {
		percpu_ref_resurrect(&q->q_usage_counter);
		wake_up_all(&q->mq_freeze_wq);
	}
	mutex_unlock(&q->mq_freeze_lock);
}

void blk_mq_unfreeze_queue(struct request_queue *q)
{
	__blk_mq_unfreeze_queue(q, false);
}
EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
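
/*
 * Usage sketch (hypothetical driver code, not part of this file): the
 * freeze/unfreeze pair brackets updates that must not race with I/O,
 * e.g. resizing queue data structures:
 *
 *	blk_mq_freeze_queue(q);		// drain and block new requests
 *	...update queue state here...	// no request is in flight now
 *	blk_mq_unfreeze_queue(q);	// resume request processing
 *
 * Freezing nests via mq_freeze_depth, so paired calls from different
 * call chains compose safely.
 */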

/*
 * FIXME: replace the scsi_internal_device_*block_nowait() calls in the
 * mpt3sas driver such that this function can be removed.
 */
void blk_mq_quiesce_queue_nowait(struct request_queue *q)
{
	unsigned long flags;

	spin_lock_irqsave(&q->queue_lock, flags);
	if (!q->quiesce_depth++)
		blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q);
	spin_unlock_irqrestore(&q->queue_lock, flags);
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
/**
 * blk_mq_wait_quiesce_done() - wait until in-progress quiesce is done
 * @q: request queue.
 *
 * Note: it is the driver's responsibility to make sure that quiesce has
 * been started.
 */
void blk_mq_wait_quiesce_done(struct request_queue *q)
{
	if (blk_queue_has_srcu(q))
		synchronize_srcu(q->srcu);
	else
		synchronize_rcu();
}
EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done);

/**
 * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished
 * @q: request queue.
 *
 * Note: this function does not prevent the struct request end_io()
 * callback from being invoked. Once this function returns, no dispatch
 * can happen until the queue is unquiesced via
 * blk_mq_unquiesce_queue().
 */
void blk_mq_quiesce_queue(struct request_queue *q)
{
	blk_mq_quiesce_queue_nowait(q);
	blk_mq_wait_quiesce_done(q);
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);

/*
 * blk_mq_unquiesce_queue() - counterpart of blk_mq_quiesce_queue()
 * @q: request queue.
 *
 * This function restores the queue to the state it was in before
 * blk_mq_quiesce_queue() was called.
 */
void blk_mq_unquiesce_queue(struct request_queue *q)
{
	unsigned long flags;
	bool run_queue = false;

	spin_lock_irqsave(&q->queue_lock, flags);
	if (WARN_ON_ONCE(q->quiesce_depth <= 0)) {
		;
	} else if (!--q->quiesce_depth) {
		blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
		run_queue = true;
	}
	spin_unlock_irqrestore(&q->queue_lock, flags);

	/* dispatch requests which are inserted during quiescing */
	if (run_queue)
		blk_mq_run_hw_queues(q, true);
}
EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);
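
/*
 * Usage sketch (hypothetical driver code, not from this file): quiesce
 * is the lighter-weight sibling of freeze. It only stops dispatch; it
 * does not wait for already-dispatched requests to complete:
 *
 *	blk_mq_quiesce_queue(q);	// no new ->queue_rq() after this
 *	...swap dispatch-side state...
 *	blk_mq_unquiesce_queue(q);	// dispatch resumes, queues rerun
 */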

void blk_mq_wake_waiters(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned int i;

	queue_for_each_hw_ctx(q, hctx, i)
		if (blk_mq_hw_queue_mapped(hctx))
			blk_mq_tag_wakeup_all(hctx->tags, true);
}

void blk_rq_init(struct request_queue *q, struct request *rq)
{
	memset(rq, 0, sizeof(*rq));

	INIT_LIST_HEAD(&rq->queuelist);
	rq->q = q;
	rq->__sector = (sector_t) -1;
	INIT_HLIST_NODE(&rq->hash);
	RB_CLEAR_NODE(&rq->rb_node);
	rq->tag = BLK_MQ_NO_TAG;
	rq->internal_tag = BLK_MQ_NO_TAG;
	rq->start_time_ns = ktime_get_ns();
	rq->part = NULL;
	blk_crypto_rq_set_defaults(rq);
}
EXPORT_SYMBOL(blk_rq_init);

static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
		struct blk_mq_tags *tags, unsigned int tag, u64 alloc_time_ns)
{
	struct blk_mq_ctx *ctx = data->ctx;
	struct blk_mq_hw_ctx *hctx = data->hctx;
	struct request_queue *q = data->q;
	struct request *rq = tags->static_rqs[tag];

	rq->q = q;
	rq->mq_ctx = ctx;
	rq->mq_hctx = hctx;
	rq->cmd_flags = data->cmd_flags;

	if (data->flags & BLK_MQ_REQ_PM)
		data->rq_flags |= RQF_PM;
	if (blk_queue_io_stat(q))
		data->rq_flags |= RQF_IO_STAT;
	rq->rq_flags = data->rq_flags;

	if (!(data->rq_flags & RQF_ELV)) {
		rq->tag = tag;
		rq->internal_tag = BLK_MQ_NO_TAG;
	} else {
		rq->tag = BLK_MQ_NO_TAG;
		rq->internal_tag = tag;
	}
	rq->timeout = 0;

	if (blk_mq_need_time_stamp(rq))
		rq->start_time_ns = ktime_get_ns();
	else
		rq->start_time_ns = 0;
	rq->part = NULL;
#ifdef CONFIG_BLK_RQ_ALLOC_TIME
	rq->alloc_time_ns = alloc_time_ns;
#endif
	rq->io_start_time_ns = 0;
	rq->stats_sectors = 0;
	rq->nr_phys_segments = 0;
#if defined(CONFIG_BLK_DEV_INTEGRITY)
	rq->nr_integrity_segments = 0;
#endif
	rq->end_io = NULL;
	rq->end_io_data = NULL;

	blk_crypto_rq_set_defaults(rq);
	INIT_LIST_HEAD(&rq->queuelist);
	/* tag was already set */
	WRITE_ONCE(rq->deadline, 0);
	req_ref_set(rq, 1);

	if (rq->rq_flags & RQF_ELV) {
		struct elevator_queue *e = data->q->elevator;

		INIT_HLIST_NODE(&rq->hash);
		RB_CLEAR_NODE(&rq->rb_node);

		if (!op_is_flush(data->cmd_flags) &&
		    e->type->ops.prepare_request) {
			e->type->ops.prepare_request(rq);
			rq->rq_flags |= RQF_ELVPRIV;
		}
	}

	return rq;
}

static inline struct request *
__blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data,
		u64 alloc_time_ns)
{
	unsigned int tag, tag_offset;
	struct blk_mq_tags *tags;
	struct request *rq;
	unsigned long tag_mask;
	int i, nr = 0;

	tag_mask = blk_mq_get_tags(data, data->nr_tags, &tag_offset);
	if (unlikely(!tag_mask))
		return NULL;

	tags = blk_mq_tags_from_data(data);
	for (i = 0; tag_mask; i++) {
		if (!(tag_mask & (1UL << i)))
			continue;
		tag = tag_offset + i;
		prefetch(tags->static_rqs[tag]);
		tag_mask &= ~(1UL << i);
		rq = blk_mq_rq_ctx_init(data, tags, tag, alloc_time_ns);
		rq_list_add(data->cached_rq, rq);
		nr++;
	}
	/* caller already holds a reference, add for remainder */
	percpu_ref_get_many(&data->q->q_usage_counter, nr - 1);
	data->nr_tags -= nr;

	return rq_list_pop(data->cached_rq);
}

static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
{
	struct request_queue *q = data->q;
	u64 alloc_time_ns = 0;
	struct request *rq;
	unsigned int tag;

	/* alloc_time includes depth and tag waits */
	if (blk_queue_rq_alloc_time(q))
		alloc_time_ns = ktime_get_ns();

	if (data->cmd_flags & REQ_NOWAIT)
		data->flags |= BLK_MQ_REQ_NOWAIT;

	if (q->elevator) {
		struct elevator_queue *e = q->elevator;

		data->rq_flags |= RQF_ELV;

		/*
		 * Flush/passthrough requests are special and go directly to
		 * the dispatch list. Don't include reserved tags in the
		 * limiting, as it isn't useful.
		 */
		if (!op_is_flush(data->cmd_flags) &&
		    !blk_op_is_passthrough(data->cmd_flags) &&
		    e->type->ops.limit_depth &&
		    !(data->flags & BLK_MQ_REQ_RESERVED))
			e->type->ops.limit_depth(data->cmd_flags, data);
	}

retry:
	data->ctx = blk_mq_get_ctx(q);
	data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
	if (!(data->rq_flags & RQF_ELV))
		blk_mq_tag_busy(data->hctx);

	/*
	 * Try batched alloc if we want more than 1 tag.
	 */
	if (data->nr_tags > 1) {
		rq = __blk_mq_alloc_requests_batch(data, alloc_time_ns);
		if (rq)
			return rq;
		data->nr_tags = 1;
	}

	/*
	 * Waiting allocations only fail because of an inactive hctx.  In that
	 * case just retry the hctx assignment and tag allocation as CPU hotplug
	 * should have migrated us to an online CPU by now.
	 */
	tag = blk_mq_get_tag(data);
	if (tag == BLK_MQ_NO_TAG) {
		if (data->flags & BLK_MQ_REQ_NOWAIT)
			return NULL;
		/*
		 * Give up the CPU and sleep for a short time to ensure that
		 * threads using a realtime scheduling class are migrated
		 * off the CPU, and thus off the hctx that is going away.
		 */
		msleep(3);
		goto retry;
	}

	return blk_mq_rq_ctx_init(data, blk_mq_tags_from_data(data), tag,
					alloc_time_ns);
}

struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
		blk_mq_req_flags_t flags)
{
	struct blk_mq_alloc_data data = {
		.q		= q,
		.flags		= flags,
		.cmd_flags	= op,
		.nr_tags	= 1,
	};
	struct request *rq;
	int ret;

	ret = blk_queue_enter(q, flags);
	if (ret)
		return ERR_PTR(ret);

	rq = __blk_mq_alloc_requests(&data);
	if (!rq)
		goto out_queue_exit;
	rq->__data_len = 0;
	rq->__sector = (sector_t) -1;
	rq->bio = rq->biotail = NULL;
	return rq;
out_queue_exit:
	blk_queue_exit(q);
	return ERR_PTR(-EWOULDBLOCK);
}
EXPORT_SYMBOL(blk_mq_alloc_request);
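
/*
 * Usage sketch (hypothetical, simplified from typical passthrough
 * users of this API): allocate a request, attach data, execute it
 * synchronously, then free it. Error handling is abbreviated.
 *
 *	struct request *rq;
 *
 *	rq = blk_mq_alloc_request(q, REQ_OP_DRV_IN, 0);
 *	if (IS_ERR(rq))
 *		return PTR_ERR(rq);
 *	...set up rq, e.g. via blk_rq_map_kern()...
 *	blk_execute_rq(rq, false);
 *	blk_mq_free_request(rq);
 */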

struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
	unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx)
{
	struct blk_mq_alloc_data data = {
		.q		= q,
		.flags		= flags,
		.cmd_flags	= op,
		.nr_tags	= 1,
	};
	u64 alloc_time_ns = 0;
	unsigned int cpu;
	unsigned int tag;
	int ret;

	/* alloc_time includes depth and tag waits */
	if (blk_queue_rq_alloc_time(q))
		alloc_time_ns = ktime_get_ns();

	/*
	 * If the tag allocator sleeps we could get an allocation for a
	 * different hardware context.  No need to complicate the low level
	 * allocator for the rare use case of a command tied to a specific
	 * queue.
	 */
	if (WARN_ON_ONCE(!(flags & (BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED))))
		return ERR_PTR(-EINVAL);

	if (hctx_idx >= q->nr_hw_queues)
		return ERR_PTR(-EIO);

	ret = blk_queue_enter(q, flags);
	if (ret)
		return ERR_PTR(ret);

	/*
	 * Check if the hardware context is actually mapped to anything.
	 * If not tell the caller that it should skip this queue.
	 */
	ret = -EXDEV;
	data.hctx = q->queue_hw_ctx[hctx_idx];
	if (!blk_mq_hw_queue_mapped(data.hctx))
		goto out_queue_exit;
	cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
	data.ctx = __blk_mq_get_ctx(q, cpu);

	if (!q->elevator)
		blk_mq_tag_busy(data.hctx);
	else
		data.rq_flags |= RQF_ELV;

	ret = -EWOULDBLOCK;
	tag = blk_mq_get_tag(&data);
	if (tag == BLK_MQ_NO_TAG)
		goto out_queue_exit;
	return blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag,
					alloc_time_ns);

out_queue_exit:
	blk_queue_exit(q);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);

static void __blk_mq_free_request(struct request *rq)
{
	struct request_queue *q = rq->q;
	struct blk_mq_ctx *ctx = rq->mq_ctx;
	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
	const int sched_tag = rq->internal_tag;

	blk_crypto_free_request(rq);
	blk_pm_mark_last_busy(rq);
	rq->mq_hctx = NULL;
	if (rq->tag != BLK_MQ_NO_TAG)
		blk_mq_put_tag(hctx->tags, ctx, rq->tag);
	if (sched_tag != BLK_MQ_NO_TAG)
		blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag);
	blk_mq_sched_restart(hctx);
	blk_queue_exit(q);
}

void blk_mq_free_request(struct request *rq)
{
	struct request_queue *q = rq->q;
	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;

	if ((rq->rq_flags & RQF_ELVPRIV) &&
	    q->elevator->type->ops.finish_request)
		q->elevator->type->ops.finish_request(rq);

	if (rq->rq_flags & RQF_MQ_INFLIGHT)
		__blk_mq_dec_active_requests(hctx);

	if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
		laptop_io_completion(q->disk->bdi);

	rq_qos_done(q, rq);

	WRITE_ONCE(rq->state, MQ_RQ_IDLE);
	if (req_ref_put_and_test(rq))
		__blk_mq_free_request(rq);
}
EXPORT_SYMBOL_GPL(blk_mq_free_request);

void blk_mq_free_plug_rqs(struct blk_plug *plug)
{
	struct request *rq;

	while ((rq = rq_list_pop(&plug->cached_rq)) != NULL)
		blk_mq_free_request(rq);
}

void blk_dump_rq_flags(struct request *rq, char *msg)
{
	printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg,
		rq->q->disk ? rq->q->disk->disk_name : "?",
		(unsigned long long) rq->cmd_flags);

	printk(KERN_INFO "  sector %llu, nr/cnr %u/%u\n",
	       (unsigned long long)blk_rq_pos(rq),
	       blk_rq_sectors(rq), blk_rq_cur_sectors(rq));
	printk(KERN_INFO "  bio %p, biotail %p, len %u\n",
	       rq->bio, rq->biotail, blk_rq_bytes(rq));
}
EXPORT_SYMBOL(blk_dump_rq_flags);

static void req_bio_endio(struct request *rq, struct bio *bio,
			  unsigned int nbytes, blk_status_t error)
{
	if (unlikely(error)) {
		bio->bi_status = error;
	} else if (req_op(rq) == REQ_OP_ZONE_APPEND) {
		/*
		 * Partial zone append completions cannot be supported as the
		 * BIO fragments may end up not being written sequentially.
		 */
		if (bio->bi_iter.bi_size != nbytes)
			bio->bi_status = BLK_STS_IOERR;
		else
			bio->bi_iter.bi_sector = rq->__sector;
	}

	bio_advance(bio, nbytes);

	if (unlikely(rq->rq_flags & RQF_QUIET))
		bio_set_flag(bio, BIO_QUIET);
	/* don't actually finish bio if it's part of flush sequence */
	if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ))
		bio_endio(bio);
}

static void blk_account_io_completion(struct request *req, unsigned int bytes)
{
	if (req->part && blk_do_io_stat(req)) {
		const int sgrp = op_stat_group(req_op(req));

		part_stat_lock();
		part_stat_add(req->part, sectors[sgrp], bytes >> 9);
		part_stat_unlock();
	}
}

static void blk_print_req_error(struct request *req, blk_status_t status)
{
	printk_ratelimited(KERN_ERR
		"%s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x "
		"phys_seg %u prio class %u\n",
		blk_status_to_str(status),
		req->q->disk ? req->q->disk->disk_name : "?",
		blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)),
		req->cmd_flags & ~REQ_OP_MASK,
		req->nr_phys_segments,
		IOPRIO_PRIO_CLASS(req->ioprio));
}
/*
 * Fully end IO on a request. Does not support partial completions or
 * errors.
 */
static void blk_complete_request(struct request *req)
{
	const bool is_flush = (req->rq_flags & RQF_FLUSH_SEQ) != 0;
	int total_bytes = blk_rq_bytes(req);
	struct bio *bio = req->bio;

	trace_block_rq_complete(req, BLK_STS_OK, total_bytes);

	if (!bio)
		return;

#ifdef CONFIG_BLK_DEV_INTEGRITY
	if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ)
		req->q->integrity.profile->complete_fn(req, total_bytes);
#endif

	blk_account_io_completion(req, total_bytes);

	do {
		struct bio *next = bio->bi_next;

		/* Completion has already been traced */
		bio_clear_flag(bio, BIO_TRACE_COMPLETION);

		if (req_op(req) == REQ_OP_ZONE_APPEND)
			bio->bi_iter.bi_sector = req->__sector;

		if (!is_flush)
			bio_endio(bio);
		bio = next;
	} while (bio);

	/*
	 * Reset counters so that the request stacking driver
	 * can find how many bytes remain in the request
	 * later.
	 */
	req->bio = NULL;
	req->__data_len = 0;
}

/**
 * blk_update_request - Complete multiple bytes without completing the request
 * @req:      the request being processed
 * @error:    block status code
 * @nr_bytes: number of bytes to complete for @req
 *
 * Description:
 *     Ends I/O on a number of bytes attached to @req, but doesn't complete
 *     the request structure even if @req doesn't have leftover.
 *     If @req has leftover, sets it up for the next range of segments.
 *
 *     Passing the result of blk_rq_bytes() as @nr_bytes guarantees
 *     %false return from this function.
 *
 * Note:
 *	The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in this function
 *	except in the consistency check at the end of this function.
 *
 * Return:
 *     %false - this request doesn't have any more data
 *     %true  - this request has more data
 **/
bool blk_update_request(struct request *req, blk_status_t error,
		unsigned int nr_bytes)
{
	int total_bytes;

	trace_block_rq_complete(req, error, nr_bytes);

	if (!req->bio)
		return false;

#ifdef CONFIG_BLK_DEV_INTEGRITY
	if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ &&
	    error == BLK_STS_OK)
		req->q->integrity.profile->complete_fn(req, nr_bytes);
#endif

	if (unlikely(error && !blk_rq_is_passthrough(req) &&
		     !(req->rq_flags & RQF_QUIET)))
		blk_print_req_error(req, error);

	blk_account_io_completion(req, nr_bytes);

	total_bytes = 0;
	while (req->bio) {
		struct bio *bio = req->bio;
		unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes);

		if (bio_bytes == bio->bi_iter.bi_size)
			req->bio = bio->bi_next;

		/* Completion has already been traced */
		bio_clear_flag(bio, BIO_TRACE_COMPLETION);
		req_bio_endio(req, bio, bio_bytes, error);

		total_bytes += bio_bytes;
		nr_bytes -= bio_bytes;

		if (!nr_bytes)
			break;
	}

	/*
	 * completely done
	 */
	if (!req->bio) {
		/*
		 * Reset counters so that the request stacking driver
		 * can find how many bytes remain in the request
		 * later.
		 */
		req->__data_len = 0;
		return false;
	}

	req->__data_len -= total_bytes;

	/* update sector only for requests with clear definition of sector */
	if (!blk_rq_is_passthrough(req))
		req->__sector += total_bytes >> 9;

	/* mixed attributes always follow the first bio */
	if (req->rq_flags & RQF_MIXED_MERGE) {
		req->cmd_flags &= ~REQ_FAILFAST_MASK;
		req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK;
	}

	if (!(req->rq_flags & RQF_SPECIAL_PAYLOAD)) {
		/*
		 * If total number of sectors is less than the first segment
		 * size, something has gone terribly wrong.
		 */
		if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) {
			blk_dump_rq_flags(req, "request botched");
			req->__data_len = blk_rq_cur_bytes(req);
		}

		/* recalculate the number of segments */
		req->nr_phys_segments = blk_recalc_rq_segments(req);
	}

	return true;
}
EXPORT_SYMBOL_GPL(blk_update_request);

static void __blk_account_io_done(struct request *req, u64 now)
{
	const int sgrp = op_stat_group(req_op(req));

	part_stat_lock();
	update_io_ticks(req->part, jiffies, true);
	part_stat_inc(req->part, ios[sgrp]);
	part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns);
	part_stat_unlock();
}

static inline void blk_account_io_done(struct request *req, u64 now)
{
	/*
	 * Account IO completion.  flush_rq isn't accounted as normal
	 * IO on either queueing or completion.  Accounting the
	 * containing request is enough.
	 */
	if (blk_do_io_stat(req) && req->part &&
	    !(req->rq_flags & RQF_FLUSH_SEQ))
		__blk_account_io_done(req, now);
}

static void __blk_account_io_start(struct request *rq)
{
	/* passthrough requests can hold bios that do not have ->bi_bdev set */
	if (rq->bio && rq->bio->bi_bdev)
		rq->part = rq->bio->bi_bdev;
	else if (rq->q->disk)
		rq->part = rq->q->disk->part0;

	part_stat_lock();
	update_io_ticks(rq->part, jiffies, false);
	part_stat_unlock();
}

static inline void blk_account_io_start(struct request *req)
{
	if (blk_do_io_stat(req))
		__blk_account_io_start(req);
}

static inline void __blk_mq_end_request_acct(struct request *rq, u64 now)
{
	if (rq->rq_flags & RQF_STATS) {
		blk_mq_poll_stats_start(rq->q);
		blk_stat_add(rq, now);
	}

	blk_mq_sched_completed_request(rq, now);
	blk_account_io_done(rq, now);
}

inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
{
	if (blk_mq_need_time_stamp(rq))
		__blk_mq_end_request_acct(rq, ktime_get_ns());

	if (rq->end_io) {
		rq_qos_done(rq->q, rq);
		rq->end_io(rq, error);
	} else {
		blk_mq_free_request(rq);
	}
}
EXPORT_SYMBOL(__blk_mq_end_request);

void blk_mq_end_request(struct request *rq, blk_status_t error)
{
	if (blk_update_request(rq, error, blk_rq_bytes(rq)))
		BUG();
	__blk_mq_end_request(rq, error);
}
EXPORT_SYMBOL(blk_mq_end_request);

#define TAG_COMP_BATCH		32

static inline void blk_mq_flush_tag_batch(struct blk_mq_hw_ctx *hctx,
					  int *tag_array, int nr_tags)
{
	struct request_queue *q = hctx->queue;

	/*
	 * All requests should have been marked as RQF_MQ_INFLIGHT, so
	 * update hctx->nr_active in batch
	 */
	if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
		__blk_mq_sub_active_requests(hctx, nr_tags);

	blk_mq_put_tags(hctx->tags, tag_array, nr_tags);
	percpu_ref_put_many(&q->q_usage_counter, nr_tags);
}

void blk_mq_end_request_batch(struct io_comp_batch *iob)
{
	int tags[TAG_COMP_BATCH], nr_tags = 0;
	struct blk_mq_hw_ctx *cur_hctx = NULL;
	struct request *rq;
	u64 now = 0;

	if (iob->need_ts)
		now = ktime_get_ns();

	while ((rq = rq_list_pop(&iob->req_list)) != NULL) {
		prefetch(rq->bio);
		prefetch(rq->rq_next);

		blk_complete_request(rq);
		if (iob->need_ts)
			__blk_mq_end_request_acct(rq, now);

		rq_qos_done(rq->q, rq);

		WRITE_ONCE(rq->state, MQ_RQ_IDLE);
		if (!req_ref_put_and_test(rq))
			continue;

		blk_crypto_free_request(rq);
		blk_pm_mark_last_busy(rq);

		if (nr_tags == TAG_COMP_BATCH || cur_hctx != rq->mq_hctx) {
			if (cur_hctx)
				blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags);
			nr_tags = 0;
			cur_hctx = rq->mq_hctx;
		}
		tags[nr_tags++] = rq->tag;
	}

	if (nr_tags)
		blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags);
}
EXPORT_SYMBOL_GPL(blk_mq_end_request_batch);
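
/*
 * Usage sketch (hedged; modeled on how polled drivers of this era batch
 * completions, driver helper names below are hypothetical): a driver's
 * poll handler collects finished requests into the io_comp_batch it was
 * handed, falling back to per-request completion if batching is not
 * possible:
 *
 *	if (!blk_mq_add_to_batch(req, iob, error, my_complete_batch))
 *		blk_mq_end_request(req, my_error_to_blk_status(error));
 *
 * blk_mq_end_request_batch() above then runs once per batch instead of
 * once per request, amortizing tag freeing and accounting.
 */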

static void blk_complete_reqs(struct llist_head *list)
{
	struct llist_node *entry = llist_reverse_order(llist_del_all(list));
	struct request *rq, *next;

	llist_for_each_entry_safe(rq, next, entry, ipi_list)
		rq->q->mq_ops->complete(rq);
}

static __latent_entropy void blk_done_softirq(struct softirq_action *h)
{
	blk_complete_reqs(this_cpu_ptr(&blk_cpu_done));
}

static int blk_softirq_cpu_dead(unsigned int cpu)
{
	blk_complete_reqs(&per_cpu(blk_cpu_done, cpu));
	return 0;
}

static void __blk_mq_complete_request_remote(void *data)
{
	__raise_softirq_irqoff(BLOCK_SOFTIRQ);
}

static inline bool blk_mq_complete_need_ipi(struct request *rq)
{
	int cpu = raw_smp_processor_id();

	if (!IS_ENABLED(CONFIG_SMP) ||
	    !test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags))
		return false;
	/*
	 * With force threaded interrupts enabled, raising softirq from an SMP
	 * function call will always result in waking the ksoftirqd thread.
	 * This is probably worse than completing the request on a different
	 * cache domain.
	 */
	if (force_irqthreads())
		return false;

	/* same CPU or cache domain?  Complete locally */
	if (cpu == rq->mq_ctx->cpu ||
	    (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) &&
	     cpus_share_cache(cpu, rq->mq_ctx->cpu)))
		return false;

	/* don't try to IPI to an offline CPU */
	return cpu_online(rq->mq_ctx->cpu);
}

static void blk_mq_complete_send_ipi(struct request *rq)
{
	struct llist_head *list;
	unsigned int cpu;

	cpu = rq->mq_ctx->cpu;
	list = &per_cpu(blk_cpu_done, cpu);
	if (llist_add(&rq->ipi_list, list)) {
		INIT_CSD(&rq->csd, __blk_mq_complete_request_remote, rq);
		smp_call_function_single_async(cpu, &rq->csd);
	}
}

static void blk_mq_raise_softirq(struct request *rq)
{
	struct llist_head *list;

	preempt_disable();
	list = this_cpu_ptr(&blk_cpu_done);
	if (llist_add(&rq->ipi_list, list))
		raise_softirq(BLOCK_SOFTIRQ);
	preempt_enable();
}

bool blk_mq_complete_request_remote(struct request *rq)
{
	WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);

	/*
	 * For a polled request, always complete locally; it's pointless
	 * to redirect the completion.
	 */
	if (rq->cmd_flags & REQ_POLLED)
		return false;

	if (blk_mq_complete_need_ipi(rq)) {
		blk_mq_complete_send_ipi(rq);
		return true;
	}

	if (rq->q->nr_hw_queues == 1) {
		blk_mq_raise_softirq(rq);
		return true;
	}
	return false;
}
EXPORT_SYMBOL_GPL(blk_mq_complete_request_remote);

/**
 * blk_mq_complete_request - end I/O on a request
 * @rq:		the request being processed
 *
 * Description:
 *	Complete a request by scheduling the ->complete_rq operation.
 **/
void blk_mq_complete_request(struct request *rq)
{
	if (!blk_mq_complete_request_remote(rq))
		rq->q->mq_ops->complete(rq);
}
EXPORT_SYMBOL(blk_mq_complete_request);
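
/*
 * Usage sketch (hypothetical driver code): a driver typically calls
 * blk_mq_complete_request() from its completion interrupt, and does the
 * real end-of-request work in its ->complete() callback, which may run
 * via IPI or softirq on the submitting CPU as decided above:
 *
 *	static irqreturn_t my_irq_handler(int irq, void *data)
 *	{
 *		struct request *rq = my_fetch_completed(data);
 *
 *		blk_mq_complete_request(rq);
 *		return IRQ_HANDLED;
 *	}
 */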

/**
 * blk_mq_start_request - Start processing a request
 * @rq: Pointer to request to be started
 *
 * Function used by device drivers to notify the block layer that a request
 * is going to be processed now, so the block layer can do proper
 * initializations such as starting the timeout timer.
 */
void blk_mq_start_request(struct request *rq)
{
	struct request_queue *q = rq->q;

	trace_block_rq_issue(rq);

	if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
		u64 start_time;
#ifdef CONFIG_BLK_CGROUP
		if (rq->bio)
			start_time = bio_issue_time(&rq->bio->bi_issue);
		else
#endif
			start_time = ktime_get_ns();
		rq->io_start_time_ns = start_time;
		rq->stats_sectors = blk_rq_sectors(rq);
		rq->rq_flags |= RQF_STATS;
		rq_qos_issue(q, rq);
	}

	WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);

	blk_add_timer(rq);
	WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT);

#ifdef CONFIG_BLK_DEV_INTEGRITY
	if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE)
		q->integrity.profile->prepare_fn(rq);
#endif
	if (rq->bio && rq->bio->bi_opf & REQ_POLLED)
		WRITE_ONCE(rq->bio->bi_cookie, blk_rq_to_qc(rq));
}
EXPORT_SYMBOL(blk_mq_start_request);

/**
 * blk_end_sync_rq - executes a completion event on a request
 * @rq: request to complete
 * @error: end I/O status of the request
 */
static void blk_end_sync_rq(struct request *rq, blk_status_t error)
{
	struct completion *waiting = rq->end_io_data;

	rq->end_io_data = (void *)(uintptr_t)error;

	/*
	 * complete last, if this is a stack request the process (and thus
	 * the rq pointer) could be invalid right after this complete()
	 */
	complete(waiting);
}

/**
 * blk_execute_rq_nowait - insert a request to I/O scheduler for execution
 * @rq:		request to insert
 * @at_head:    insert request at head or tail of queue
 * @done:	I/O completion handler
 *
 * Description:
 *    Insert a fully prepared request at the back of the I/O scheduler queue
 *    for execution.  Don't wait for completion.
 *
 * Note:
 *    This function will invoke @done directly if the queue is dead.
 */
void blk_execute_rq_nowait(struct request *rq, bool at_head, rq_end_io_fn *done)
{
	WARN_ON(irqs_disabled());
	WARN_ON(!blk_rq_is_passthrough(rq));

	rq->end_io = done;

	blk_account_io_start(rq);

	/*
	 * don't check dying flag for MQ because the request won't
	 * be reused after dying flag is set
	 */
	blk_mq_sched_insert_request(rq, at_head, true, false);
}
EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);

static bool blk_rq_is_poll(struct request *rq)
{
	if (!rq->mq_hctx)
		return false;
	if (rq->mq_hctx->type != HCTX_TYPE_POLL)
		return false;
	if (WARN_ON_ONCE(!rq->bio))
		return false;
	return true;
}

static void blk_rq_poll_completion(struct request *rq, struct completion *wait)
{
	do {
		bio_poll(rq->bio, NULL, 0);
		cond_resched();
	} while (!completion_done(wait));
}

/**
 * blk_execute_rq - insert a request into queue for execution
 * @rq:		request to insert
 * @at_head:    insert request at head or tail of queue
 *
 * Description:
 *    Insert a fully prepared request at the back of the I/O scheduler queue
 *    for execution and wait for completion.
 * Return: The blk_status_t result provided to blk_mq_end_request().
 */
blk_status_t blk_execute_rq(struct request *rq, bool at_head)
{
	DECLARE_COMPLETION_ONSTACK(wait);
	unsigned long hang_check;

	rq->end_io_data = &wait;
	blk_execute_rq_nowait(rq, at_head, blk_end_sync_rq);

	/* Prevent hang_check timer from firing at us during very long I/O */
	hang_check = sysctl_hung_task_timeout_secs;

	if (blk_rq_is_poll(rq))
		blk_rq_poll_completion(rq, &wait);
	else if (hang_check)
		while (!wait_for_completion_io_timeout(&wait,
				hang_check * (HZ/2)))
			;
	else
		wait_for_completion_io(&wait);

	return (blk_status_t)(uintptr_t)rq->end_io_data;
}
EXPORT_SYMBOL(blk_execute_rq);
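
/*
 * Worked example (my arithmetic, not from the source): with
 * sysctl_hung_task_timeout_secs = 120 and HZ = 1000, the wait loop
 * above sleeps in slices of 120 * 500 jiffies = 60 s. Each timeout
 * wakes the task briefly before it sleeps again, so the hung-task
 * detector never observes a single uninterruptible sleep longer than
 * half its threshold, even for I/O that takes many minutes.
 */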

static void __blk_mq_requeue_request(struct request *rq)
{
	struct request_queue *q = rq->q;

	blk_mq_put_driver_tag(rq);

	trace_block_rq_requeue(rq);
	rq_qos_requeue(q, rq);

	if (blk_mq_request_started(rq)) {
		WRITE_ONCE(rq->state, MQ_RQ_IDLE);
		rq->rq_flags &= ~RQF_TIMED_OUT;
	}
}

void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
{
	__blk_mq_requeue_request(rq);

	/* this request will be re-inserted to io scheduler queue */
	blk_mq_sched_requeue_request(rq);

	blk_mq_add_to_requeue_list(rq, true, kick_requeue_list);
}
EXPORT_SYMBOL(blk_mq_requeue_request);

static void blk_mq_requeue_work(struct work_struct *work)
{
	struct request_queue *q =
		container_of(work, struct request_queue, requeue_work.work);
	LIST_HEAD(rq_list);
	struct request *rq, *next;

	spin_lock_irq(&q->requeue_lock);
	list_splice_init(&q->requeue_list, &rq_list);
	spin_unlock_irq(&q->requeue_lock);

	list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
		if (!(rq->rq_flags & (RQF_SOFTBARRIER | RQF_DONTPREP)))
			continue;

		rq->rq_flags &= ~RQF_SOFTBARRIER;
		list_del_init(&rq->queuelist);
		/*
		 * If RQF_DONTPREP is set, rq already contains driver-specific
		 * data, so insert it into the hctx dispatch list to avoid any
		 * merge.
		 */
		if (rq->rq_flags & RQF_DONTPREP)
			blk_mq_request_bypass_insert(rq, false, false);
		else
			blk_mq_sched_insert_request(rq, true, false, false);
	}

	while (!list_empty(&rq_list)) {
		rq = list_entry(rq_list.next, struct request, queuelist);
		list_del_init(&rq->queuelist);
		blk_mq_sched_insert_request(rq, false, false, false);
	}

	blk_mq_run_hw_queues(q, false);
}

void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
				bool kick_requeue_list)
{
	struct request_queue *q = rq->q;
	unsigned long flags;

	/*
	 * We abuse this flag that is otherwise used by the I/O scheduler to
	 * request head insertion from the workqueue.
	 */
	BUG_ON(rq->rq_flags & RQF_SOFTBARRIER);

	spin_lock_irqsave(&q->requeue_lock, flags);
	if (at_head) {
		rq->rq_flags |= RQF_SOFTBARRIER;
		list_add(&rq->queuelist, &q->requeue_list);
	} else {
		list_add_tail(&rq->queuelist, &q->requeue_list);
	}
	spin_unlock_irqrestore(&q->requeue_lock, flags);

	if (kick_requeue_list)
		blk_mq_kick_requeue_list(q);
}

void blk_mq_kick_requeue_list(struct request_queue *q)
{
	kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0);
}
EXPORT_SYMBOL(blk_mq_kick_requeue_list);

void blk_mq_delay_kick_requeue_list(struct request_queue *q,
				    unsigned long msecs)
{
	kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work,
				    msecs_to_jiffies(msecs));
}
EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);

static bool blk_mq_rq_inflight(struct request *rq, void *priv,
			       bool reserved)
{
	/*
	 * If we find a request that isn't idle we know the queue is busy
	 * as it's checked in the iter.
	 * Return false to stop the iteration.
	 */
	if (blk_mq_request_started(rq)) {
		bool *busy = priv;

		*busy = true;
		return false;
	}

	return true;
}

bool blk_mq_queue_inflight(struct request_queue *q)
{
	bool busy = false;

	blk_mq_queue_tag_busy_iter(q, blk_mq_rq_inflight, &busy);
	return busy;
}
EXPORT_SYMBOL_GPL(blk_mq_queue_inflight);

static void blk_mq_rq_timed_out(struct request *req, bool reserved)
{
	req->rq_flags |= RQF_TIMED_OUT;
	if (req->q->mq_ops->timeout) {
		enum blk_eh_timer_return ret;

		ret = req->q->mq_ops->timeout(req, reserved);
		if (ret == BLK_EH_DONE)
			return;
		WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER);
	}

	blk_add_timer(req);
}

static bool blk_mq_req_expired(struct request *rq, unsigned long *next)
{
	unsigned long deadline;

	if (blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT)
		return false;
	if (rq->rq_flags & RQF_TIMED_OUT)
		return false;

	deadline = READ_ONCE(rq->deadline);
	if (time_after_eq(jiffies, deadline))
		return true;

	if (*next == 0)
		*next = deadline;
	else if (time_after(*next, deadline))
		*next = deadline;
	return false;
}

void blk_mq_put_rq_ref(struct request *rq)
{
	if (is_flush_rq(rq))
		rq->end_io(rq, 0);
	else if (req_ref_put_and_test(rq))
		__blk_mq_free_request(rq);
}

static bool blk_mq_check_expired(struct request *rq, void *priv, bool reserved)
{
	unsigned long *next = priv;

	/*
	 * blk_mq_queue_tag_busy_iter() has locked the request, so it cannot
	 * be reallocated underneath the timeout handler's processing, and
	 * hence the expire check is reliable. If the request is not expired,
	 * then it was completed and reallocated as a new request after
	 * returning from blk_mq_check_expired().
	 */
	if (blk_mq_req_expired(rq, next))
		blk_mq_rq_timed_out(rq, reserved);
	return true;
}

static void blk_mq_timeout_work(struct work_struct *work)
{
	struct request_queue *q =
		container_of(work, struct request_queue, timeout_work);
	unsigned long next = 0;
	struct blk_mq_hw_ctx *hctx;
	int i;

	/*
	 * A deadlock might occur if a request is stuck requiring a
	 * timeout at the same time a queue freeze is waiting
	 * completion, since the timeout code would not be able to
	 * acquire the queue reference here.
	 *
	 * That's why we don't use blk_queue_enter here; instead, we use
	 * percpu_ref_tryget directly, because we need to be able to
	 * obtain a reference even in the short window between the queue
	 * starting to freeze, by dropping the first reference in
	 * blk_freeze_queue_start, and the moment the last request is
	 * consumed, marked by the instant q_usage_counter reaches
	 * zero.
	 */
	if (!percpu_ref_tryget(&q->q_usage_counter))
		return;

	blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next);

	if (next != 0) {
		mod_timer(&q->timeout, next);
	} else {
		/*
		 * Request timeouts are handled as a forward rolling timer. If
		 * we end up here it means that no requests are pending and
		 * also that no request has been pending for a while. Mark
		 * each hctx as idle.
		 */
		queue_for_each_hw_ctx(q, hctx, i) {
			/* the hctx may be unmapped, so check it here */
			if (blk_mq_hw_queue_mapped(hctx))
				blk_mq_tag_idle(hctx);
		}
	}
	blk_queue_exit(q);
}

struct flush_busy_ctx_data {
	struct blk_mq_hw_ctx *hctx;
	struct list_head *list;
};

static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data)
{
	struct flush_busy_ctx_data *flush_data = data;
	struct blk_mq_hw_ctx *hctx = flush_data->hctx;
	struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
	enum hctx_type type = hctx->type;

	spin_lock(&ctx->lock);
	list_splice_tail_init(&ctx->rq_lists[type], flush_data->list);
	sbitmap_clear_bit(sb, bitnr);
	spin_unlock(&ctx->lock);
	return true;
}

/*
 * Process software queues that have been marked busy, splicing them
 * to the for-dispatch list.
 */
void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
{
	struct flush_busy_ctx_data data = {
		.hctx = hctx,
		.list = list,
	};

	sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data);
}
EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs);

struct dispatch_rq_data {
	struct blk_mq_hw_ctx *hctx;
	struct request *rq;
};

static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr,
		void *data)
{
	struct dispatch_rq_data *dispatch_data = data;
	struct blk_mq_hw_ctx *hctx = dispatch_data->hctx;
	struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
	enum hctx_type type = hctx->type;

	spin_lock(&ctx->lock);
	if (!list_empty(&ctx->rq_lists[type])) {
		dispatch_data->rq = list_entry_rq(ctx->rq_lists[type].next);
		list_del_init(&dispatch_data->rq->queuelist);
		if (list_empty(&ctx->rq_lists[type]))
			sbitmap_clear_bit(sb, bitnr);
	}
	spin_unlock(&ctx->lock);

	return !dispatch_data->rq;
}

struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
					struct blk_mq_ctx *start)
{
	unsigned off = start ? start->index_hw[hctx->type] : 0;
	struct dispatch_rq_data data = {
		.hctx = hctx,
		.rq   = NULL,
	};

	__sbitmap_for_each_set(&hctx->ctx_map, off,
			       dispatch_rq_from_ctx, &data);

	return data.rq;
}

static bool __blk_mq_alloc_driver_tag(struct request *rq)
{
	struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags;
	unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
	int tag;

	blk_mq_tag_busy(rq->mq_hctx);

	if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
		bt = &rq->mq_hctx->tags->breserved_tags;
		tag_offset = 0;
	} else {
		if (!hctx_may_queue(rq->mq_hctx, bt))
			return false;
	}

	tag = __sbitmap_queue_get(bt);
	if (tag == BLK_MQ_NO_TAG)
		return false;

	rq->tag = tag + tag_offset;
	return true;
}

bool __blk_mq_get_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq)
{
	if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_alloc_driver_tag(rq))
		return false;

	if ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) &&
	    !(rq->rq_flags & RQF_MQ_INFLIGHT)) {
		rq->rq_flags |= RQF_MQ_INFLIGHT;
		__blk_mq_inc_active_requests(hctx);
	}
	hctx->tags->rqs[rq->tag] = rq;
	return true;
}

static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
				int flags, void *key)
{
	struct blk_mq_hw_ctx *hctx;

	hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);

	spin_lock(&hctx->dispatch_wait_lock);
	if (!list_empty(&wait->entry)) {
		struct sbitmap_queue *sbq;

		list_del_init(&wait->entry);
		sbq = &hctx->tags->bitmap_tags;
		atomic_dec(&sbq->ws_active);
	}
	spin_unlock(&hctx->dispatch_wait_lock);

	blk_mq_run_hw_queue(hctx, true);
	return 1;
}

/*
 * Mark us waiting for a tag. For shared tags, this involves hooking us into
 * the tag wakeups. For non-shared tags, we can simply mark us needing a
 * restart. For both cases, take care to check the condition again after
 * marking us as waiting.
 */
static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
				 struct request *rq)
{
	struct sbitmap_queue *sbq = &hctx->tags->bitmap_tags;
	struct wait_queue_head *wq;
	wait_queue_entry_t *wait;
	bool ret;

	if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
		blk_mq_sched_mark_restart_hctx(hctx);

		/*
		 * It's possible that a tag was freed in the window between the
		 * allocation failure and adding the hardware queue to the wait
		 * queue.
		 *
		 * Don't clear RESTART here, someone else could have set it.
		 * At most this will cost an extra queue run.
		 */
		return blk_mq_get_driver_tag(rq);
	}

	wait = &hctx->dispatch_wait;
	if (!list_empty_careful(&wait->entry))
		return false;

	wq = &bt_wait_ptr(sbq, hctx)->wait;

	spin_lock_irq(&wq->lock);
	spin_lock(&hctx->dispatch_wait_lock);
	if (!list_empty(&wait->entry)) {
		spin_unlock(&hctx->dispatch_wait_lock);
		spin_unlock_irq(&wq->lock);
		return false;
	}

	atomic_inc(&sbq->ws_active);
	wait->flags &= ~WQ_FLAG_EXCLUSIVE;
	__add_wait_queue(wq, wait);

	/*
	 * It's possible that a tag was freed in the window between the
	 * allocation failure and adding the hardware queue to the wait
	 * queue.
	 */
	ret = blk_mq_get_driver_tag(rq);
	if (!ret) {
		spin_unlock(&hctx->dispatch_wait_lock);
		spin_unlock_irq(&wq->lock);
		return false;
	}

	/*
	 * We got a tag, remove ourselves from the wait queue to ensure
	 * someone else gets the wakeup.
	 */
	list_del_init(&wait->entry);
	atomic_dec(&sbq->ws_active);
	spin_unlock(&hctx->dispatch_wait_lock);
	spin_unlock_irq(&wq->lock);

	return true;
}

#define BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT  8
#define BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR  4
/*
 * Update dispatch busy with the Exponential Weighted Moving Average (EWMA):
 * - EWMA is a simple way to compute a running average value
 * - weights of 7/8 and 1/8 are applied so that the value decays exponentially
 * - a factor of 4 is used to avoid the result getting rounded down to 0
 *   too easily; the exact factor doesn't matter much because EWMA decays
 *   exponentially anyway
 */
static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
{
	unsigned int ewma;

	ewma = hctx->dispatch_busy;

	if (!ewma && !busy)
		return;

	ewma *= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT - 1;
	if (busy)
		ewma += 1 << BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR;
	ewma /= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT;

	hctx->dispatch_busy = ewma;
}
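
/*
 * Worked example (my arithmetic, not from the source): starting from
 * dispatch_busy = 0, one busy update yields (0 * 7 + 16) / 8 = 2; a
 * second yields (2 * 7 + 16) / 8 = 3, converging toward 16 under
 * sustained pressure. From 16, idle updates give 16 * 7 / 8 = 14,
 * then 12, 10, ... decaying exponentially back toward 0.
 */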

#define BLK_MQ_RESOURCE_DELAY	3		/* ms units */

static void blk_mq_handle_dev_resource(struct request *rq,
				       struct list_head *list)
{
	struct request *next =
		list_first_entry_or_null(list, struct request, queuelist);

	/*
	 * If an I/O scheduler has been configured and we got a driver tag for
	 * the next request already, free it.
	 */
	if (next)
		blk_mq_put_driver_tag(next);

	list_add(&rq->queuelist, list);
	__blk_mq_requeue_request(rq);
}

static void blk_mq_handle_zone_resource(struct request *rq,
					struct list_head *zone_list)
{
	/*
	 * If we end up here it is because we cannot dispatch a request to a
	 * specific zone due to LLD level zone-write locking or other zone
	 * related resource not being available. In this case, set the request
	 * aside in zone_list for retrying it later.
	 */
	list_add(&rq->queuelist, zone_list);
	__blk_mq_requeue_request(rq);
}

enum prep_dispatch {
	PREP_DISPATCH_OK,
	PREP_DISPATCH_NO_TAG,
	PREP_DISPATCH_NO_BUDGET,
};

static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq,
						  bool need_budget)
{
	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
	int budget_token = -1;

	if (need_budget) {
		budget_token = blk_mq_get_dispatch_budget(rq->q);
		if (budget_token < 0) {
			blk_mq_put_driver_tag(rq);
			return PREP_DISPATCH_NO_BUDGET;
		}
		blk_mq_set_rq_budget_token(rq, budget_token);
	}

	if (!blk_mq_get_driver_tag(rq)) {
		/*
		 * The initial allocation attempt failed, so we need to
		 * rerun the hardware queue when a tag is freed. The
		 * waitqueue takes care of that. If the queue is run
		 * before we add this entry back on the dispatch list,
		 * we'll re-run it below.
		 */
		if (!blk_mq_mark_tag_wait(hctx, rq)) {
			/*
			 * All budgets not obtained from this function will be
			 * released together when handling a partial dispatch
			 */
			if (need_budget)
				blk_mq_put_dispatch_budget(rq->q, budget_token);
			return PREP_DISPATCH_NO_TAG;
		}
	}

	return PREP_DISPATCH_OK;
}

/* release all allocated budgets before calling to blk_mq_dispatch_rq_list */
static void blk_mq_release_budgets(struct request_queue *q,
		struct list_head *list)
{
	struct request *rq;

	list_for_each_entry(rq, list, queuelist) {
		int budget_token = blk_mq_get_rq_budget_token(rq);

		if (budget_token >= 0)
			blk_mq_put_dispatch_budget(q, budget_token);
	}
}

/*
 * Returns true if we did some work AND can potentially do more.
 */
bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
			     unsigned int nr_budgets)
{
	enum prep_dispatch prep;
	struct request_queue *q = hctx->queue;
	struct request *rq, *nxt;
	int errors, queued;
	blk_status_t ret = BLK_STS_OK;
	LIST_HEAD(zone_list);
	bool needs_resource = false;

	if (list_empty(list))
		return false;

	/*
	 * Now process all the entries, sending them to the driver.
	 */
	errors = queued = 0;
	do {
		struct blk_mq_queue_data bd;

		rq = list_first_entry(list, struct request, queuelist);

		WARN_ON_ONCE(hctx != rq->mq_hctx);
		prep = blk_mq_prep_dispatch_rq(rq, !nr_budgets);
		if (prep != PREP_DISPATCH_OK)
			break;

		list_del_init(&rq->queuelist);

		bd.rq = rq;

		/*
		 * Flag last if we have no more requests, or if we have more
		 * but can't assign a driver tag to it.
		 */
		if (list_empty(list))
			bd.last = true;
		else {
			nxt = list_first_entry(list, struct request, queuelist);
			bd.last = !blk_mq_get_driver_tag(nxt);
		}

		/*
		 * once the request is queued to lld, no need to cover the
		 * budget any more
		 */
		if (nr_budgets)
			nr_budgets--;
		ret = q->mq_ops->queue_rq(hctx, &bd);
		switch (ret) {
		case BLK_STS_OK:
			queued++;
			break;
		case BLK_STS_RESOURCE:
			needs_resource = true;
			fallthrough;
		case BLK_STS_DEV_RESOURCE:
			blk_mq_handle_dev_resource(rq, list);
			goto out;
		case BLK_STS_ZONE_RESOURCE:
			/*
			 * Move the request to zone_list and keep going through
			 * the dispatch list to find more requests the drive can
			 * accept.
			 */
			blk_mq_handle_zone_resource(rq, &zone_list);
			needs_resource = true;
			break;
		default:
			errors++;
			blk_mq_end_request(rq, ret);
		}
	} while (!list_empty(list));
out:
	if (!list_empty(&zone_list))
		list_splice_tail_init(&zone_list, list);

	/*
	 * If we didn't flush the entire list, we could have told the driver
	 * there was more coming, but that turned out to be a lie.
	 */
	if ((!list_empty(list) || errors) && q->mq_ops->commit_rqs && queued)
		q->mq_ops->commit_rqs(hctx);
	/*
	 * Any items that need requeuing? Stuff them into hctx->dispatch,
	 * that is where we will continue on next queue run.
	 */
	if (!list_empty(list)) {
		bool needs_restart;
		/* For non-shared tags, the RESTART check will suffice */
		bool no_tag = prep == PREP_DISPATCH_NO_TAG &&
			(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED);

		if (nr_budgets)
			blk_mq_release_budgets(q, list);

		spin_lock(&hctx->lock);
		list_splice_tail_init(list, &hctx->dispatch);
		spin_unlock(&hctx->lock);

		/*
		 * Order adding requests to hctx->dispatch and checking
		 * SCHED_RESTART flag. The pair of this smp_mb() is the one
		 * in blk_mq_sched_restart(). Avoid restart code path to
		 * miss the new added requests to hctx->dispatch, meantime
		 * SCHED_RESTART is observed here.
		 */
		smp_mb();

		/*
		 * If SCHED_RESTART was set by the caller of this function and
		 * it is no longer set that means that it was cleared by another
		 * thread and hence that a queue rerun is needed.
		 *
		 * If 'no_tag' is set, that means that we failed getting
		 * a driver tag with an I/O scheduler attached. If our dispatch
		 * waitqueue is no longer active, ensure that we run the queue
		 * AFTER adding our entries back to the list.
		 *
		 * If no I/O scheduler has been configured it is possible that
		 * the hardware queue got stopped and restarted before requests
		 * were pushed back onto the dispatch list. Rerun the queue to
		 * avoid starvation. Notes:
		 * - blk_mq_run_hw_queue() checks whether or not a queue has
		 *   been stopped before rerunning a queue.
		 * - Some but not all block drivers stop a queue before
		 *   returning BLK_STS_RESOURCE. Two exceptions are scsi-mq
		 *   and dm-rq.
		 *
		 * If driver returns BLK_STS_RESOURCE and SCHED_RESTART
		 * bit is set, run queue after a delay to avoid IO stalls
		 * that could otherwise occur if the queue is idle.  We'll do
		 * similar if we couldn't get budget or couldn't lock a zone
		 * and SCHED_RESTART is set.
		 */
		needs_restart = blk_mq_sched_needs_restart(hctx);
		if (prep == PREP_DISPATCH_NO_BUDGET)
			needs_resource = true;
		if (!needs_restart ||
		    (no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
			blk_mq_run_hw_queue(hctx, true);
		else if (needs_restart && needs_resource)
			blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);

		blk_mq_update_dispatch_busy(hctx, true);
		return false;
	} else
		blk_mq_update_dispatch_busy(hctx, false);

	return (queued + errors) != 0;
}

/**
 * __blk_mq_run_hw_queue - Run a hardware queue.
 * @hctx: Pointer to the hardware queue to run.
 *
 * Send pending requests to the hardware.
 */
static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
{
	/*
	 * We can't run the queue inline with ints disabled. Ensure that
	 * we catch bad users of this early.
	 */
	WARN_ON_ONCE(in_interrupt());

	blk_mq_run_dispatch_ops(hctx->queue,
			blk_mq_sched_dispatch_requests(hctx));
}

static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx)
{
	int cpu = cpumask_first_and(hctx->cpumask, cpu_online_mask);

	if (cpu >= nr_cpu_ids)
		cpu = cpumask_first(hctx->cpumask);
	return cpu;
}

/*
 * It'd be great if the workqueue API had a way to pass
 * in a mask and had some smarts for more clever placement.
 * For now we just round-robin here, switching for every
 * BLK_MQ_CPU_WORK_BATCH queued items.
 */
static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
{
	bool tried = false;
	int next_cpu = hctx->next_cpu;

	if (hctx->queue->nr_hw_queues == 1)
		return WORK_CPU_UNBOUND;

	if (--hctx->next_cpu_batch <= 0) {
select_cpu:
		next_cpu = cpumask_next_and(next_cpu, hctx->cpumask,
				cpu_online_mask);
		if (next_cpu >= nr_cpu_ids)
			next_cpu = blk_mq_first_mapped_cpu(hctx);
		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
	}

2008 /*
2009 * Do an unbound schedule if we can't find an online CPU for this
2010 * hctx, which should only happen while handling the CPU DEAD event.
2011 */
2012 if (!cpu_online(next_cpu)) {
2013 if (!tried) {
2014 tried = true;
2015 goto select_cpu;
2016 }
2017
2018 /*
2019 * Make sure to re-select the CPU next time once CPUs in
2020 * hctx->cpumask become online again.
2021 */
2022 hctx->next_cpu = next_cpu;
2023 hctx->next_cpu_batch = 1;
2024 return WORK_CPU_UNBOUND;
2025 }
2026
2027 hctx->next_cpu = next_cpu;
2028 return next_cpu;
2029}
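/*
 * Worked example of the round-robin above, assuming hctx->cpumask = {2, 5}
 * and BLK_MQ_CPU_WORK_BATCH == 8: the first eight queue runs are scheduled
 * on CPU 2, the next eight on CPU 5, and the selection then wraps back to
 * CPU 2.
 */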
2030
2031/**
2032 * __blk_mq_delay_run_hw_queue - Run (or schedule to run) a hardware queue.
2033 * @hctx: Pointer to the hardware queue to run.
2034 * @async: If we want to run the queue asynchronously.
2035 * @msecs: Milliseconds of delay to wait before running the queue.
2036 *
2037 * If !@async, try to run the queue now. Else, run the queue asynchronously
2038 * with a delay of @msecs.
2039 */
2040static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
2041 unsigned long msecs)
2042{
2043 if (unlikely(blk_mq_hctx_stopped(hctx)))
2044 return;
2045
2046 if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
2047 int cpu = get_cpu();
2048 if (cpumask_test_cpu(cpu, hctx->cpumask)) {
2049 __blk_mq_run_hw_queue(hctx);
2050 put_cpu();
2051 return;
2052 }
2053
2054 put_cpu();
2055 }
2056
2057 kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work,
2058 msecs_to_jiffies(msecs));
2059}
2060
2061/**
2062 * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously.
2063 * @hctx: Pointer to the hardware queue to run.
2064 * @msecs: Milliseconds of delay to wait before running the queue.
2065 *
2066 * Run a hardware queue asynchronously with a delay of @msecs.
2067 */
2068void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
2069{
2070 __blk_mq_delay_run_hw_queue(hctx, true, msecs);
2071}
2072EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
2073
2074/**
2075 * blk_mq_run_hw_queue - Start to run a hardware queue.
2076 * @hctx: Pointer to the hardware queue to run.
2077 * @async: If we want to run the queue asynchronously.
2078 *
2079 * Check if the request queue is not in a quiesced state and if there are
2080 * pending requests to be sent. If this is true, run the queue to send requests
2081 * to hardware.
2082 */
2083void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
2084{
2085 bool need_run;
2086
2087 /*
2088 * When the queue is quiesced, we may be switching the io scheduler,
2089 * updating nr_hw_queues, or doing other things that make it unsafe to
2090 * run the queue; even __blk_mq_hctx_has_pending() can't be called safely.
2091 *
2092 * If the queue is quiesced, it will be rerun by
2093 * blk_mq_unquiesce_queue().
2094 */
2095 __blk_mq_run_dispatch_ops(hctx->queue, false,
2096 need_run = !blk_queue_quiesced(hctx->queue) &&
2097 blk_mq_hctx_has_pending(hctx));
2098
2099 if (need_run)
2100 __blk_mq_delay_run_hw_queue(hctx, async, 0);
2101}
2102EXPORT_SYMBOL(blk_mq_run_hw_queue);
2103
2104/*
2105 * Is the request queue handled by an IO scheduler that does not respect
2106 * hardware queues when dispatching?
2107 */
2108static bool blk_mq_has_sqsched(struct request_queue *q)
2109{
2110 struct elevator_queue *e = q->elevator;
2111
2112 if (e && e->type->ops.dispatch_request &&
2113 !(e->type->elevator_features & ELEVATOR_F_MQ_AWARE))
2114 return true;
2115 return false;
2116}
2117
2118/*
2119 * Return the preferred queue to dispatch from (if any) for non-mq aware IO
2120 * scheduler.
2121 */
2122static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q)
2123{
2124 struct blk_mq_hw_ctx *hctx;
2125
2126 /*
2127 * If the IO scheduler does not respect hardware queues when
2128 * dispatching, we just don't bother with multiple HW queues and
2129 * dispatch from the hctx for the current CPU, since running multiple
2130 * queues just causes lock contention inside the scheduler and
2131 * pointless cache bouncing.
2132 */
2133 hctx = blk_mq_map_queue_type(q, HCTX_TYPE_DEFAULT,
2134 raw_smp_processor_id());
2135 if (!blk_mq_hctx_stopped(hctx))
2136 return hctx;
2137 return NULL;
2138}
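/*
 * For reference, an IO scheduler opts out of this single-queue fallback by
 * declaring itself multiqueue-aware, as in the hedged sketch below; the
 * example_* names are hypothetical, ELEVATOR_F_MQ_AWARE is the real flag.
 */
#if 0	/* illustrative sketch, not compiled */
static struct elevator_type example_sched = {
	.ops = {
		.dispatch_request	= example_dispatch_request,
		/* ... */
	},
	.elevator_features	= ELEVATOR_F_MQ_AWARE,
	.elevator_name		= "example",
	.elevator_owner		= THIS_MODULE,
};
#endif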
2139
2140/**
2141 * blk_mq_run_hw_queues - Run all hardware queues in a request queue.
2142 * @q: Pointer to the request queue to run.
2143 * @async: If we want to run the queue asynchronously.
2144 */
2145void blk_mq_run_hw_queues(struct request_queue *q, bool async)
2146{
2147 struct blk_mq_hw_ctx *hctx, *sq_hctx;
2148 int i;
2149
2150 sq_hctx = NULL;
2151 if (blk_mq_has_sqsched(q))
2152 sq_hctx = blk_mq_get_sq_hctx(q);
2153 queue_for_each_hw_ctx(q, hctx, i) {
2154 if (blk_mq_hctx_stopped(hctx))
2155 continue;
2156 /*
2157 * Dispatch from this hctx if there's no hctx preferred by the
2158 * IO scheduler, or if it has requests that bypass the
2159 * scheduler.
2160 */
2161 if (!sq_hctx || sq_hctx == hctx ||
2162 !list_empty_careful(&hctx->dispatch))
2163 blk_mq_run_hw_queue(hctx, async);
2164 }
2165}
2166EXPORT_SYMBOL(blk_mq_run_hw_queues);
2167
2168/**
2169 * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously.
2170 * @q: Pointer to the request queue to run.
2171 * @msecs: Milliseconds of delay to wait before running the queues.
2172 */
2173void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
2174{
2175 struct blk_mq_hw_ctx *hctx, *sq_hctx;
2176 int i;
2177
2178 sq_hctx = NULL;
2179 if (blk_mq_has_sqsched(q))
2180 sq_hctx = blk_mq_get_sq_hctx(q);
2181 queue_for_each_hw_ctx(q, hctx, i) {
2182 if (blk_mq_hctx_stopped(hctx))
2183 continue;
2184 /*
2185 * Dispatch from this hctx if there's no hctx preferred by the
2186 * IO scheduler, or if it has requests that bypass the
2187 * scheduler.
2188 */
2189 if (!sq_hctx || sq_hctx == hctx ||
2190 !list_empty_careful(&hctx->dispatch))
2191 blk_mq_delay_run_hw_queue(hctx, msecs);
2192 }
2193}
2194EXPORT_SYMBOL(blk_mq_delay_run_hw_queues);
2195
2196/**
2197 * blk_mq_queue_stopped() - check whether one or more hctxs have been stopped
2198 * @q: request queue.
2199 *
2200 * The caller is responsible for serializing this function against
2201 * blk_mq_{start,stop}_hw_queue().
2202 */
2203bool blk_mq_queue_stopped(struct request_queue *q)
2204{
2205 struct blk_mq_hw_ctx *hctx;
2206 int i;
2207
2208 queue_for_each_hw_ctx(q, hctx, i)
2209 if (blk_mq_hctx_stopped(hctx))
2210 return true;
2211
2212 return false;
2213}
2214EXPORT_SYMBOL(blk_mq_queue_stopped);
2215
2216/*
2217 * This function is often used by a driver to pause .queue_rq() when
2218 * there aren't enough resources or some condition isn't satisfied; in
2219 * that case BLK_STS_RESOURCE is usually returned.
2220 *
2221 * We do not guarantee that dispatch is drained or blocked after
2222 * blk_mq_stop_hw_queue() returns. Please use blk_mq_quiesce_queue()
2223 * if that guarantee is required.
2224 */
2224 */
2225void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
2226{
2227 cancel_delayed_work(&hctx->run_work);
2228
2229 set_bit(BLK_MQ_S_STOPPED, &hctx->state);
2230}
2231EXPORT_SYMBOL(blk_mq_stop_hw_queue);
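/*
 * A minimal sketch of the pause/resume pattern described above, as a driver
 * might implement it; the example_* names are hypothetical.
 */
#if 0	/* illustrative sketch, not compiled */
static blk_status_t example_busy_queue_rq(struct blk_mq_hw_ctx *hctx,
					  const struct blk_mq_queue_data *bd)
{
	if (example_out_of_resources(hctx->queue->queuedata)) {
		blk_mq_stop_hw_queue(hctx);
		return BLK_STS_RESOURCE;
	}
	/* ... submit bd->rq to the hardware ... */
	return BLK_STS_OK;
}

/* Called once the resource shortage is over, e.g. from an IRQ handler. */
static void example_resources_available(struct request_queue *q)
{
	blk_mq_start_stopped_hw_queues(q, true);
}
#endif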
2232
2233/*
2234 * This function is often used by a driver to pause .queue_rq() when
2235 * there aren't enough resources or some condition isn't satisfied; in
2236 * that case BLK_STS_RESOURCE is usually returned.
2237 *
2238 * We do not guarantee that dispatch is drained or blocked after
2239 * blk_mq_stop_hw_queues() returns. Please use blk_mq_quiesce_queue()
2240 * if that guarantee is required.
2241 */
2241 */
2242void blk_mq_stop_hw_queues(struct request_queue *q)
2243{
2244 struct blk_mq_hw_ctx *hctx;
2245 int i;
2246
2247 queue_for_each_hw_ctx(q, hctx, i)
2248 blk_mq_stop_hw_queue(hctx);
2249}
2250EXPORT_SYMBOL(blk_mq_stop_hw_queues);
2251
2252void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
2253{
2254 clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
2255
2256 blk_mq_run_hw_queue(hctx, false);
2257}
2258EXPORT_SYMBOL(blk_mq_start_hw_queue);
2259
2260void blk_mq_start_hw_queues(struct request_queue *q)
2261{
2262 struct blk_mq_hw_ctx *hctx;
2263 int i;
2264
2265 queue_for_each_hw_ctx(q, hctx, i)
2266 blk_mq_start_hw_queue(hctx);
2267}
2268EXPORT_SYMBOL(blk_mq_start_hw_queues);
2269
2270void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
2271{
2272 if (!blk_mq_hctx_stopped(hctx))
2273 return;
2274
2275 clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
2276 blk_mq_run_hw_queue(hctx, async);
2277}
2278EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue);
2279
2280void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
2281{
2282 struct blk_mq_hw_ctx *hctx;
2283 int i;
2284
2285 queue_for_each_hw_ctx(q, hctx, i)
2286 blk_mq_start_stopped_hw_queue(hctx, async);
2287}
2288EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
2289
2290static void blk_mq_run_work_fn(struct work_struct *work)
2291{
2292 struct blk_mq_hw_ctx *hctx;
2293
2294 hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);
2295
2296 /*
2297 * If we are stopped, don't run the queue.
2298 */
2299 if (blk_mq_hctx_stopped(hctx))
2300 return;
2301
2302 __blk_mq_run_hw_queue(hctx);
2303}
2304
2305static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx,
2306 struct request *rq,
2307 bool at_head)
2308{
2309 struct blk_mq_ctx *ctx = rq->mq_ctx;
2310 enum hctx_type type = hctx->type;
2311
2312 lockdep_assert_held(&ctx->lock);
2313
2314 trace_block_rq_insert(rq);
2315
2316 if (at_head)
2317 list_add(&rq->queuelist, &ctx->rq_lists[type]);
2318 else
2319 list_add_tail(&rq->queuelist, &ctx->rq_lists[type]);
2320}
2321
2322void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
2323 bool at_head)
2324{
2325 struct blk_mq_ctx *ctx = rq->mq_ctx;
2326
2327 lockdep_assert_held(&ctx->lock);
2328
2329 __blk_mq_insert_req_list(hctx, rq, at_head);
2330 blk_mq_hctx_mark_pending(hctx, ctx);
2331}
2332
2333/**
2334 * blk_mq_request_bypass_insert - Insert a request at dispatch list.
2335 * @rq: Pointer to request to be inserted.
2336 * @at_head: true if the request should be inserted at the head of the list.
2337 * @run_queue: If we should run the hardware queue after inserting the request.
2338 *
2339 * Should be used carefully: only when the caller knows we want to
2340 * bypass a potential IO scheduler on the target device.
2341 */
2342void blk_mq_request_bypass_insert(struct request *rq, bool at_head,
2343 bool run_queue)
2344{
2345 struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
2346
2347 spin_lock(&hctx->lock);
2348 if (at_head)
2349 list_add(&rq->queuelist, &hctx->dispatch);
2350 else
2351 list_add_tail(&rq->queuelist, &hctx->dispatch);
2352 spin_unlock(&hctx->lock);
2353
2354 if (run_queue)
2355 blk_mq_run_hw_queue(hctx, false);
2356}
2357
2358void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
2359 struct list_head *list)
2361{
2362 struct request *rq;
2363 enum hctx_type type = hctx->type;
2364
2365 /*
2366 * Preemption doesn't flush the plug list, so it's possible that
2367 * ctx->cpu is offline now.
2368 */
2369 list_for_each_entry(rq, list, queuelist) {
2370 BUG_ON(rq->mq_ctx != ctx);
2371 trace_block_rq_insert(rq);
2372 }
2373
2374 spin_lock(&ctx->lock);
2375 list_splice_tail_init(list, &ctx->rq_lists[type]);
2376 blk_mq_hctx_mark_pending(hctx, ctx);
2377 spin_unlock(&ctx->lock);
2378}
2379
2380static void blk_mq_commit_rqs(struct blk_mq_hw_ctx *hctx, int *queued,
2381 bool from_schedule)
2382{
2383 if (hctx->queue->mq_ops->commit_rqs) {
2384 trace_block_unplug(hctx->queue, *queued, !from_schedule);
2385 hctx->queue->mq_ops->commit_rqs(hctx);
2386 }
2387 *queued = 0;
2388}
2389
2390static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
2391 unsigned int nr_segs)
2392{
2393 int err;
2394
2395 if (bio->bi_opf & REQ_RAHEAD)
2396 rq->cmd_flags |= REQ_FAILFAST_MASK;
2397
2398 rq->__sector = bio->bi_iter.bi_sector;
2399 rq->write_hint = bio->bi_write_hint;
2400 blk_rq_bio_prep(rq, bio, nr_segs);
2401
2402 /* This can't fail, since GFP_NOIO includes __GFP_DIRECT_RECLAIM. */
2403 err = blk_crypto_rq_bio_prep(rq, bio, GFP_NOIO);
2404 WARN_ON_ONCE(err);
2405
2406 blk_account_io_start(rq);
2407}
2408
2409static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
2410 struct request *rq, bool last)
2411{
2412 struct request_queue *q = rq->q;
2413 struct blk_mq_queue_data bd = {
2414 .rq = rq,
2415 .last = last,
2416 };
2417 blk_status_t ret;
2418
2419 /*
2420 * If the driver returns OK, we are done. On a hard error, the caller
2421 * may kill the request. On a busy return, just add it back to our
2422 * list as we previously would have done.
2423 */
2424 ret = q->mq_ops->queue_rq(hctx, &bd);
2425 switch (ret) {
2426 case BLK_STS_OK:
2427 blk_mq_update_dispatch_busy(hctx, false);
2428 break;
2429 case BLK_STS_RESOURCE:
2430 case BLK_STS_DEV_RESOURCE:
2431 blk_mq_update_dispatch_busy(hctx, true);
2432 __blk_mq_requeue_request(rq);
2433 break;
2434 default:
2435 blk_mq_update_dispatch_busy(hctx, false);
2436 break;
2437 }
2438
2439 return ret;
2440}
2441
2442static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
2443 struct request *rq,
2444 bool bypass_insert, bool last)
2445{
2446 struct request_queue *q = rq->q;
2447 bool run_queue = true;
2448 int budget_token;
2449
2450 /*
2451 * An RCU or SRCU read lock is needed before checking the quiesced flag.
2452 *
2453 * When the queue is stopped or quiesced, ignore 'bypass_insert' from
2454 * blk_mq_request_issue_directly() and return BLK_STS_OK to the caller
2455 * so that the driver doesn't try to dispatch again.
2456 */
2457 if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) {
2458 run_queue = false;
2459 bypass_insert = false;
2460 goto insert;
2461 }
2462
2463 if ((rq->rq_flags & RQF_ELV) && !bypass_insert)
2464 goto insert;
2465
2466 budget_token = blk_mq_get_dispatch_budget(q);
2467 if (budget_token < 0)
2468 goto insert;
2469
2470 blk_mq_set_rq_budget_token(rq, budget_token);
2471
2472 if (!blk_mq_get_driver_tag(rq)) {
2473 blk_mq_put_dispatch_budget(q, budget_token);
2474 goto insert;
2475 }
2476
2477 return __blk_mq_issue_directly(hctx, rq, last);
2478insert:
2479 if (bypass_insert)
2480 return BLK_STS_RESOURCE;
2481
2482 blk_mq_sched_insert_request(rq, false, run_queue, false);
2483
2484 return BLK_STS_OK;
2485}
2486
2487/**
2488 * blk_mq_try_issue_directly - Try to send a request directly to device driver.
2489 * @hctx: Pointer of the associated hardware queue.
2490 * @rq: Pointer to request to be sent.
2491 *
2492 * If the device has enough resources to accept a new request now, send the
2493 * request directly to the device driver. Else, insert it into the
2494 * hctx->dispatch list so we can try to send it again in the future.
2495 * Requests inserted into this list have higher priority.
2496 */
2497static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
2498 struct request *rq)
2499{
2500 blk_status_t ret =
2501 __blk_mq_try_issue_directly(hctx, rq, false, true);
2502
2503 if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
2504 blk_mq_request_bypass_insert(rq, false, true);
2505 else if (ret != BLK_STS_OK)
2506 blk_mq_end_request(rq, ret);
2507}
2508
2509static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
2510{
2511 return __blk_mq_try_issue_directly(rq->mq_hctx, rq, true, last);
2512}
2513
2514static void blk_mq_plug_issue_direct(struct blk_plug *plug, bool from_schedule)
2515{
2516 struct blk_mq_hw_ctx *hctx = NULL;
2517 struct request *rq;
2518 int queued = 0;
2519 int errors = 0;
2520
2521 while ((rq = rq_list_pop(&plug->mq_list))) {
2522 bool last = rq_list_empty(plug->mq_list);
2523 blk_status_t ret;
2524
2525 if (hctx != rq->mq_hctx) {
2526 if (hctx)
2527 blk_mq_commit_rqs(hctx, &queued, from_schedule);
2528 hctx = rq->mq_hctx;
2529 }
2530
2531 ret = blk_mq_request_issue_directly(rq, last);
2532 switch (ret) {
2533 case BLK_STS_OK:
2534 queued++;
2535 break;
2536 case BLK_STS_RESOURCE:
2537 case BLK_STS_DEV_RESOURCE:
2538 blk_mq_request_bypass_insert(rq, false, last);
2539 blk_mq_commit_rqs(hctx, &queued, from_schedule);
2540 return;
2541 default:
2542 blk_mq_end_request(rq, ret);
2543 errors++;
2544 break;
2545 }
2546 }
2547
2548 /*
2549 * If we didn't flush the entire list, we could have told the driver
2550 * there was more coming, but that turned out to be a lie.
2551 */
2552 if (errors)
2553 blk_mq_commit_rqs(hctx, &queued, from_schedule);
2554}
2555
2556static void __blk_mq_flush_plug_list(struct request_queue *q,
2557 struct blk_plug *plug)
2558{
2559 if (blk_queue_quiesced(q))
2560 return;
2561 q->mq_ops->queue_rqs(&plug->mq_list);
2562}
2563
2564void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
2565{
2566 struct blk_mq_hw_ctx *this_hctx;
2567 struct blk_mq_ctx *this_ctx;
2568 struct request *rq;
2569 unsigned int depth;
2570 LIST_HEAD(list);
2571
2572 if (rq_list_empty(plug->mq_list))
2573 return;
2574 plug->rq_count = 0;
2575
2576 if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) {
2577 struct request_queue *q;
2578
2579 rq = rq_list_peek(&plug->mq_list);
2580 q = rq->q;
2581
2582 /*
2583 * Peek at the first request and see if we have a ->queue_rqs() hook.
2584 * If we do, we can dispatch the whole plug list in one go. We
2585 * already know at this point that all requests belong to the
2586 * same queue; the caller must ensure that's the case.
2587 *
2588 * Since we pass off the full list to the driver at this point,
2589 * we do not increment the active request count for the queue.
2590 * Bypass shared tags for now because of that.
2591 */
2592 if (q->mq_ops->queue_rqs &&
2593 !(rq->mq_hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
2594 blk_mq_run_dispatch_ops(q,
2595 __blk_mq_flush_plug_list(q, plug));
2596 if (rq_list_empty(plug->mq_list))
2597 return;
2598 }
2599
2600 blk_mq_run_dispatch_ops(q,
2601 blk_mq_plug_issue_direct(plug, false));
2602 if (rq_list_empty(plug->mq_list))
2603 return;
2604 }
2605
2606 this_hctx = NULL;
2607 this_ctx = NULL;
2608 depth = 0;
2609 do {
2610 rq = rq_list_pop(&plug->mq_list);
2611
2612 if (!this_hctx) {
2613 this_hctx = rq->mq_hctx;
2614 this_ctx = rq->mq_ctx;
2615 } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx) {
2616 trace_block_unplug(this_hctx->queue, depth,
2617 !from_schedule);
2618 blk_mq_sched_insert_requests(this_hctx, this_ctx,
2619 &list, from_schedule);
2620 depth = 0;
2621 this_hctx = rq->mq_hctx;
2622 this_ctx = rq->mq_ctx;
2623
2624 }
2625
2626 list_add(&rq->queuelist, &list);
2627 depth++;
2628 } while (!rq_list_empty(plug->mq_list));
2629
2630 if (!list_empty(&list)) {
2631 trace_block_unplug(this_hctx->queue, depth, !from_schedule);
2632 blk_mq_sched_insert_requests(this_hctx, this_ctx, &list,
2633 from_schedule);
2634 }
2635}
2636
2637void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
2638 struct list_head *list)
2639{
2640 int queued = 0;
2641 int errors = 0;
2642
2643 while (!list_empty(list)) {
2644 blk_status_t ret;
2645 struct request *rq = list_first_entry(list, struct request,
2646 queuelist);
2647
2648 list_del_init(&rq->queuelist);
2649 ret = blk_mq_request_issue_directly(rq, list_empty(list));
2650 if (ret != BLK_STS_OK) {
2651 if (ret == BLK_STS_RESOURCE ||
2652 ret == BLK_STS_DEV_RESOURCE) {
2653 blk_mq_request_bypass_insert(rq, false,
2654 list_empty(list));
2655 break;
2656 }
2657 blk_mq_end_request(rq, ret);
2658 errors++;
2659 } else
2660 queued++;
2661 }
2662
2663 /*
2664 * If we didn't flush the entire list, we could have told
2665 * the driver there was more coming, but that turned out to
2666 * be a lie.
2667 */
2668 if ((!list_empty(list) || errors) &&
2669 hctx->queue->mq_ops->commit_rqs && queued)
2670 hctx->queue->mq_ops->commit_rqs(hctx);
2671}
2672
2673/*
2674 * Allow 2x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple
2675 * queues. This is important for md arrays to benefit from merging
2676 * requests.
2677 */
2678static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug)
2679{
2680 if (plug->multiple_queues)
2681 return BLK_MAX_REQUEST_COUNT * 2;
2682 return BLK_MAX_REQUEST_COUNT;
2683}
2684
2685static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
2686{
2687 struct request *last = rq_list_peek(&plug->mq_list);
2688
2689 if (!plug->rq_count) {
2690 trace_block_plug(rq->q);
2691 } else if (plug->rq_count >= blk_plug_max_rq_count(plug) ||
2692 (!blk_queue_nomerges(rq->q) &&
2693 blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
2694 blk_mq_flush_plug_list(plug, false);
2695 trace_block_plug(rq->q);
2696 }
2697
2698 if (!plug->multiple_queues && last && last->q != rq->q)
2699 plug->multiple_queues = true;
2700 if (!plug->has_elevator && (rq->rq_flags & RQF_ELV))
2701 plug->has_elevator = true;
2702 rq->rq_next = NULL;
2703 rq_list_add(&plug->mq_list, rq);
2704 plug->rq_count++;
2705}
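/*
 * For context, requests reach plug->mq_list through the usual plugging
 * pattern in a submitter; a minimal sketch using the real plug API (the
 * example_* name is hypothetical):
 */
#if 0	/* illustrative sketch, not compiled */
static void example_submit_batch(struct bio **bios, int nr)
{
	struct blk_plug plug;
	int i;

	blk_start_plug(&plug);
	for (i = 0; i < nr; i++)
		submit_bio(bios[i]);	/* typically lands in blk_add_rq_to_plug() */
	blk_finish_plug(&plug);		/* ends up in blk_mq_flush_plug_list() */
}
#endif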
2706
2707static bool blk_mq_attempt_bio_merge(struct request_queue *q,
2708 struct bio *bio, unsigned int nr_segs)
2709{
2710 if (!blk_queue_nomerges(q) && bio_mergeable(bio)) {
2711 if (blk_attempt_plug_merge(q, bio, nr_segs))
2712 return true;
2713 if (blk_mq_sched_bio_merge(q, bio, nr_segs))
2714 return true;
2715 }
2716 return false;
2717}
2718
2719static struct request *blk_mq_get_new_requests(struct request_queue *q,
2720 struct blk_plug *plug,
2721 struct bio *bio)
2722{
2723 struct blk_mq_alloc_data data = {
2724 .q = q,
2725 .nr_tags = 1,
2726 .cmd_flags = bio->bi_opf,
2727 };
2728 struct request *rq;
2729
2730 if (unlikely(bio_queue_enter(bio)))
2731 return NULL;
2732
2733 if (plug) {
2734 data.nr_tags = plug->nr_ios;
2735 plug->nr_ios = 1;
2736 data.cached_rq = &plug->cached_rq;
2737 }
2738
2739 rq = __blk_mq_alloc_requests(&data);
2740 if (rq)
2741 return rq;
2742 rq_qos_cleanup(q, bio);
2743 if (bio->bi_opf & REQ_NOWAIT)
2744 bio_wouldblock_error(bio);
2745 blk_queue_exit(q);
2746 return NULL;
2747}
2748
2749static inline struct request *blk_mq_get_cached_request(struct request_queue *q,
2750 struct blk_plug *plug, struct bio *bio)
2751{
2752 struct request *rq;
2753
2754 if (!plug)
2755 return NULL;
2756 rq = rq_list_peek(&plug->cached_rq);
2757 if (!rq || rq->q != q)
2758 return NULL;
2759
2760 if (blk_mq_get_hctx_type(bio->bi_opf) != rq->mq_hctx->type)
2761 return NULL;
2762 if (op_is_flush(rq->cmd_flags) != op_is_flush(bio->bi_opf))
2763 return NULL;
2764
2765 rq->cmd_flags = bio->bi_opf;
2766 plug->cached_rq = rq_list_next(rq);
2767 INIT_LIST_HEAD(&rq->queuelist);
2768 return rq;
2769}
2770
2771/**
2772 * blk_mq_submit_bio - Create and send a request to block device.
2773 * @bio: Bio pointer.
2774 *
2775 * Builds up a request structure from @bio and sends it to the device.
2776 * The request may not be queued directly to hardware if:
2777 * * This request can be merged with another one
2778 * * We want to place the request on the plug queue for possible future merging
2779 * * There is an IO scheduler active on this queue
2780 *
2781 * It will not queue the request if there is an error with the bio or at
2782 * request creation.
2783 */
2784void blk_mq_submit_bio(struct bio *bio)
2785{
2786 struct request_queue *q = bdev_get_queue(bio->bi_bdev);
2787 struct blk_plug *plug = blk_mq_plug(q, bio);
2788 const int is_sync = op_is_sync(bio->bi_opf);
2789 struct request *rq;
2790 unsigned int nr_segs = 1;
2791 blk_status_t ret;
2792
2793 if (unlikely(!blk_crypto_bio_prep(&bio)))
2794 return;
2795
2796 blk_queue_bounce(q, &bio);
2797 if (blk_may_split(q, bio))
2798 __blk_queue_split(q, &bio, &nr_segs);
2799
2800 if (!bio_integrity_prep(bio))
2801 return;
2802
2803 if (blk_mq_attempt_bio_merge(q, bio, nr_segs))
2804 return;
2805
2806 rq_qos_throttle(q, bio);
2807
2808 rq = blk_mq_get_cached_request(q, plug, bio);
2809 if (!rq) {
2810 rq = blk_mq_get_new_requests(q, plug, bio);
2811 if (unlikely(!rq))
2812 return;
2813 }
2814
2815 trace_block_getrq(bio);
2816
2817 rq_qos_track(q, rq, bio);
2818
2819 blk_mq_bio_to_request(rq, bio, nr_segs);
2820
2821 ret = blk_crypto_init_request(rq);
2822 if (ret != BLK_STS_OK) {
2823 bio->bi_status = ret;
2824 bio_endio(bio);
2825 blk_mq_free_request(rq);
2826 return;
2827 }
2828
2829 if (op_is_flush(bio->bi_opf)) {
2830 blk_insert_flush(rq);
2831 return;
2832 }
2833
2834 if (plug)
2835 blk_add_rq_to_plug(plug, rq);
2836 else if ((rq->rq_flags & RQF_ELV) ||
2837 (rq->mq_hctx->dispatch_busy &&
2838 (q->nr_hw_queues == 1 || !is_sync)))
2839 blk_mq_sched_insert_request(rq, false, true, true);
2840 else
2841 blk_mq_run_dispatch_ops(rq->q,
2842 blk_mq_try_issue_directly(rq->mq_hctx, rq));
2843}
2844
2845/**
2846 * blk_cloned_rq_check_limits - Helper function to check a cloned request
2847 * for the new queue limits
2848 * @q: the queue
2849 * @rq: the request being checked
2850 *
2851 * Description:
2852 * @rq may have been made based on weaker limitations of upper-level queues
2853 * in request stacking drivers, and it may violate the limits of @q.
2854 * Since the block layer and the underlying device driver trust @rq
2855 * after it is inserted into @q, it should be checked against @q before
2856 * the insertion using this generic function.
2857 *
2858 * Request stacking drivers like request-based dm may change the queue
2859 * limits when retrying requests on other queues. Those requests need
2860 * to be checked against the new queue limits again during dispatch.
2861 */
2862static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q,
2863 struct request *rq)
2864{
2865 unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq));
2866
2867 if (blk_rq_sectors(rq) > max_sectors) {
2868 /*
2869 * A SCSI device has no good way to report whether
2870 * Write Same/Zero is actually supported. If a device rejects
2871 * a non-read/write command (discard, write same, etc.) the
2872 * low-level device driver will set the relevant queue limit to
2873 * 0 to prevent blk-lib from issuing more of the offending
2874 * operations. Commands queued prior to the queue limit being
2875 * reset need to be completed with BLK_STS_NOTSUPP to avoid I/O
2876 * errors being propagated to upper layers.
2877 */
2878 if (max_sectors == 0)
2879 return BLK_STS_NOTSUPP;
2880
2881 printk(KERN_ERR "%s: over max size limit. (%u > %u)\n",
2882 __func__, blk_rq_sectors(rq), max_sectors);
2883 return BLK_STS_IOERR;
2884 }
2885
2886 /*
2887 * The queue settings related to segment counting may differ from the
2888 * original queue.
2889 */
2890 rq->nr_phys_segments = blk_recalc_rq_segments(rq);
2891 if (rq->nr_phys_segments > queue_max_segments(q)) {
2892 printk(KERN_ERR "%s: over max segments limit. (%hu > %hu)\n",
2893 __func__, rq->nr_phys_segments, queue_max_segments(q));
2894 return BLK_STS_IOERR;
2895 }
2896
2897 return BLK_STS_OK;
2898}
2899
2900/**
2901 * blk_insert_cloned_request - Helper for stacking drivers to submit a request
2902 * @q: the queue to submit the request
2903 * @rq: the request being queued
2904 */
2905blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq)
2906{
2907 blk_status_t ret;
2908
2909 ret = blk_cloned_rq_check_limits(q, rq);
2910 if (ret != BLK_STS_OK)
2911 return ret;
2912
2913 if (rq->q->disk &&
2914 should_fail_request(rq->q->disk->part0, blk_rq_bytes(rq)))
2915 return BLK_STS_IOERR;
2916
2917 if (blk_crypto_insert_cloned_request(rq))
2918 return BLK_STS_IOERR;
2919
2920 blk_account_io_start(rq);
2921
2922 /*
2923 * Since we have a scheduler attached on the top device,
2924 * bypass a potential scheduler on the bottom device for
2925 * insert.
2926 */
2927 blk_mq_run_dispatch_ops(rq->q,
2928 ret = blk_mq_request_issue_directly(rq, true));
2929 if (ret)
2930 blk_account_io_done(rq, ktime_get_ns());
2931 return ret;
2932}
2933EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
2934
2935/**
2936 * blk_rq_unprep_clone - Helper function to free all bios in a cloned request
2937 * @rq: the clone request to be cleaned up
2938 *
2939 * Description:
2940 * Free all bios in @rq for a cloned request.
2941 */
2942void blk_rq_unprep_clone(struct request *rq)
2943{
2944 struct bio *bio;
2945
2946 while ((bio = rq->bio) != NULL) {
2947 rq->bio = bio->bi_next;
2948
2949 bio_put(bio);
2950 }
2951}
2952EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
2953
2954/**
2955 * blk_rq_prep_clone - Helper function to setup clone request
2956 * @rq: the request to be setup
2957 * @rq_src: original request to be cloned
2958 * @bs: bio_set that bios for clone are allocated from
2959 * @gfp_mask: memory allocation mask for bio
2960 * @bio_ctr: setup function to be called for each clone bio.
2961 * Returns %0 for success, non %0 for failure.
2962 * @data: private data to be passed to @bio_ctr
2963 *
2964 * Description:
2965 * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq.
2966 * Also, the pages which the original bios point to are not copied;
2967 * the cloned bios just point to the same pages.
2968 * So the cloned bios must be completed before the original bios, which
2969 * means the caller must complete @rq before @rq_src.
2970 */
2971int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
2972 struct bio_set *bs, gfp_t gfp_mask,
2973 int (*bio_ctr)(struct bio *, struct bio *, void *),
2974 void *data)
2975{
2976 struct bio *bio, *bio_src;
2977
2978 if (!bs)
2979 bs = &fs_bio_set;
2980
2981 __rq_for_each_bio(bio_src, rq_src) {
2982 bio = bio_clone_fast(bio_src, gfp_mask, bs);
2983 if (!bio)
2984 goto free_and_out;
2985 bio->bi_bdev = rq->q->disk->part0;
2986
2987 if (bio_ctr && bio_ctr(bio, bio_src, data))
2988 goto free_and_out;
2989
2990 if (rq->bio) {
2991 rq->biotail->bi_next = bio;
2992 rq->biotail = bio;
2993 } else {
2994 rq->bio = rq->biotail = bio;
2995 }
2996 bio = NULL;
2997 }
2998
2999 /* Copy attributes of the original request to the clone request. */
3000 rq->__sector = blk_rq_pos(rq_src);
3001 rq->__data_len = blk_rq_bytes(rq_src);
3002 if (rq_src->rq_flags & RQF_SPECIAL_PAYLOAD) {
3003 rq->rq_flags |= RQF_SPECIAL_PAYLOAD;
3004 rq->special_vec = rq_src->special_vec;
3005 }
3006 rq->nr_phys_segments = rq_src->nr_phys_segments;
3007 rq->ioprio = rq_src->ioprio;
3008
3009 if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0)
3010 goto free_and_out;
3011
3012 return 0;
3013
3014free_and_out:
3015 if (bio)
3016 bio_put(bio);
3017 blk_rq_unprep_clone(rq);
3018
3019 return -ENOMEM;
3020}
3021EXPORT_SYMBOL_GPL(blk_rq_prep_clone);
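/*
 * A hedged sketch of how a request-stacking driver (dm-rq style) typically
 * combines the clone helpers above with blk_insert_cloned_request(); the
 * example_* name is hypothetical.
 */
#if 0	/* illustrative sketch, not compiled */
static blk_status_t example_dispatch_clone(struct request *rq_src,
					   struct request *clone)
{
	blk_status_t ret;

	if (blk_rq_prep_clone(clone, rq_src, NULL, GFP_ATOMIC, NULL, NULL))
		return BLK_STS_RESOURCE;

	ret = blk_insert_cloned_request(clone->q, clone);
	if (ret != BLK_STS_OK)
		blk_rq_unprep_clone(clone);
	return ret;
}
#endif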
3022
3023/*
3024 * Steal bios from a request and add them to a bio list.
3025 * The request must not have been partially completed before.
3026 */
3027void blk_steal_bios(struct bio_list *list, struct request *rq)
3028{
3029 if (rq->bio) {
3030 if (list->tail)
3031 list->tail->bi_next = rq->bio;
3032 else
3033 list->head = rq->bio;
3034 list->tail = rq->biotail;
3035
3036 rq->bio = NULL;
3037 rq->biotail = NULL;
3038 }
3039
3040 rq->__data_len = 0;
3041}
3042EXPORT_SYMBOL_GPL(blk_steal_bios);
3043
3044static size_t order_to_size(unsigned int order)
3045{
3046 return (size_t)PAGE_SIZE << order;
3047}
3048
3049/* called before freeing request pool in @tags */
3050static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags,
3051 struct blk_mq_tags *tags)
3052{
3053 struct page *page;
3054 unsigned long flags;
3055
3056 /* There is no need to clear the driver tags' own mapping */
3057 if (drv_tags == tags)
3058 return;
3059
3060 list_for_each_entry(page, &tags->page_list, lru) {
3061 unsigned long start = (unsigned long)page_address(page);
3062 unsigned long end = start + order_to_size(page->private);
3063 int i;
3064
3065 for (i = 0; i < drv_tags->nr_tags; i++) {
3066 struct request *rq = drv_tags->rqs[i];
3067 unsigned long rq_addr = (unsigned long)rq;
3068
3069 if (rq_addr >= start && rq_addr < end) {
3070 WARN_ON_ONCE(req_ref_read(rq) != 0);
3071 cmpxchg(&drv_tags->rqs[i], rq, NULL);
3072 }
3073 }
3074 }
3075
3076 /*
3077 * Wait until all pending iterations are done.
3078 *
3079 * The request references have been cleared, and this is guaranteed to
3080 * be observed after the ->lock is released.
3081 */
3082 spin_lock_irqsave(&drv_tags->lock, flags);
3083 spin_unlock_irqrestore(&drv_tags->lock, flags);
3084}
3085
3086void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
3087 unsigned int hctx_idx)
3088{
3089 struct blk_mq_tags *drv_tags;
3090 struct page *page;
3091
3092 if (blk_mq_is_shared_tags(set->flags))
3093 drv_tags = set->shared_tags;
3094 else
3095 drv_tags = set->tags[hctx_idx];
3096
3097 if (tags->static_rqs && set->ops->exit_request) {
3098 int i;
3099
3100 for (i = 0; i < tags->nr_tags; i++) {
3101 struct request *rq = tags->static_rqs[i];
3102
3103 if (!rq)
3104 continue;
3105 set->ops->exit_request(set, rq, hctx_idx);
3106 tags->static_rqs[i] = NULL;
3107 }
3108 }
3109
3110 blk_mq_clear_rq_mapping(drv_tags, tags);
3111
3112 while (!list_empty(&tags->page_list)) {
3113 page = list_first_entry(&tags->page_list, struct page, lru);
3114 list_del_init(&page->lru);
3115 /*
3116 * Remove kmemleak object previously allocated in
3117 * blk_mq_alloc_rqs().
3118 */
3119 kmemleak_free(page_address(page));
3120 __free_pages(page, page->private);
3121 }
3122}
3123
3124void blk_mq_free_rq_map(struct blk_mq_tags *tags)
3125{
3126 kfree(tags->rqs);
3127 tags->rqs = NULL;
3128 kfree(tags->static_rqs);
3129 tags->static_rqs = NULL;
3130
3131 blk_mq_free_tags(tags);
3132}
3133
3134static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
3135 unsigned int hctx_idx,
3136 unsigned int nr_tags,
3137 unsigned int reserved_tags)
3138{
3139 struct blk_mq_tags *tags;
3140 int node;
3141
3142 node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
3143 if (node == NUMA_NO_NODE)
3144 node = set->numa_node;
3145
3146 tags = blk_mq_init_tags(nr_tags, reserved_tags, node,
3147 BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
3148 if (!tags)
3149 return NULL;
3150
3151 tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *),
3152 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
3153 node);
3154 if (!tags->rqs) {
3155 blk_mq_free_tags(tags);
3156 return NULL;
3157 }
3158
3159 tags->static_rqs = kcalloc_node(nr_tags, sizeof(struct request *),
3160 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
3161 node);
3162 if (!tags->static_rqs) {
3163 kfree(tags->rqs);
3164 blk_mq_free_tags(tags);
3165 return NULL;
3166 }
3167
3168 return tags;
3169}
3170
3171static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
3172 unsigned int hctx_idx, int node)
3173{
3174 int ret;
3175
3176 if (set->ops->init_request) {
3177 ret = set->ops->init_request(set, rq, hctx_idx, node);
3178 if (ret)
3179 return ret;
3180 }
3181
3182 WRITE_ONCE(rq->state, MQ_RQ_IDLE);
3183 return 0;
3184}
3185
3186static int blk_mq_alloc_rqs(struct blk_mq_tag_set *set,
3187 struct blk_mq_tags *tags,
3188 unsigned int hctx_idx, unsigned int depth)
3189{
3190 unsigned int i, j, entries_per_page, max_order = 4;
3191 size_t rq_size, left;
3192 int node;
3193
3194 node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
3195 if (node == NUMA_NO_NODE)
3196 node = set->numa_node;
3197
3198 INIT_LIST_HEAD(&tags->page_list);
3199
3200 /*
3201 * rq_size is the size of the request plus driver payload, rounded
3202 * to the cacheline size.
3203 */
3204 rq_size = round_up(sizeof(struct request) + set->cmd_size,
3205 cache_line_size());
3206 left = rq_size * depth;
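	/*
	 * Worked example with illustrative numbers: assuming 4 KiB pages,
	 * 64-byte cache lines and rq_size rounding up to 512 bytes, an
	 * order-0 page holds 8 requests, so a depth of 256 needs 128 KiB
	 * (32 pages' worth) spread over the allocations below.
	 */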
3207
3208 for (i = 0; i < depth; ) {
3209 int this_order = max_order;
3210 struct page *page;
3211 int to_do;
3212 void *p;
3213
3214 while (this_order && left < order_to_size(this_order - 1))
3215 this_order--;
3216
3217 do {
3218 page = alloc_pages_node(node,
3219 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO,
3220 this_order);
3221 if (page)
3222 break;
3223 if (!this_order--)
3224 break;
3225 if (order_to_size(this_order) < rq_size)
3226 break;
3227 } while (1);
3228
3229 if (!page)
3230 goto fail;
3231
3232 page->private = this_order;
3233 list_add_tail(&page->lru, &tags->page_list);
3234
3235 p = page_address(page);
3236 /*
3237 * Allow kmemleak to scan these pages as they contain pointers
3238 * to additional allocations made via ops->init_request().
3239 */
3240 kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO);
3241 entries_per_page = order_to_size(this_order) / rq_size;
3242 to_do = min(entries_per_page, depth - i);
3243 left -= to_do * rq_size;
3244 for (j = 0; j < to_do; j++) {
3245 struct request *rq = p;
3246
3247 tags->static_rqs[i] = rq;
3248 if (blk_mq_init_request(set, rq, hctx_idx, node)) {
3249 tags->static_rqs[i] = NULL;
3250 goto fail;
3251 }
3252
3253 p += rq_size;
3254 i++;
3255 }
3256 }
3257 return 0;
3258
3259fail:
3260 blk_mq_free_rqs(set, tags, hctx_idx);
3261 return -ENOMEM;
3262}
3263
3264struct rq_iter_data {
3265 struct blk_mq_hw_ctx *hctx;
3266 bool has_rq;
3267};
3268
3269static bool blk_mq_has_request(struct request *rq, void *data, bool reserved)
3270{
3271 struct rq_iter_data *iter_data = data;
3272
3273 if (rq->mq_hctx != iter_data->hctx)
3274 return true;
3275 iter_data->has_rq = true;
3276 return false;
3277}
3278
3279static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx)
3280{
3281 struct blk_mq_tags *tags = hctx->sched_tags ?
3282 hctx->sched_tags : hctx->tags;
3283 struct rq_iter_data data = {
3284 .hctx = hctx,
3285 };
3286
3287 blk_mq_all_tag_iter(tags, blk_mq_has_request, &data);
3288 return data.has_rq;
3289}
3290
3291static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu,
3292 struct blk_mq_hw_ctx *hctx)
3293{
3294 if (cpumask_first_and(hctx->cpumask, cpu_online_mask) != cpu)
3295 return false;
3296 if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids)
3297 return false;
3298 return true;
3299}
3300
3301static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
3302{
3303 struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
3304 struct blk_mq_hw_ctx, cpuhp_online);
3305
3306 if (!cpumask_test_cpu(cpu, hctx->cpumask) ||
3307 !blk_mq_last_cpu_in_hctx(cpu, hctx))
3308 return 0;
3309
3310 /*
3311 * Prevent new requests from being allocated on the current hctx.
3312 *
3313 * The smp_mb__after_atomic() pairs with the implied barrier in
3314 * test_and_set_bit_lock() in sbitmap_get(), and ensures the inactive
3315 * flag is seen once we return from the tag allocator.
3316 */
3317 set_bit(BLK_MQ_S_INACTIVE, &hctx->state);
3318 smp_mb__after_atomic();
3319
3320 /*
3321 * Try to grab a reference to the queue and wait for any outstanding
3322 * requests. If we could not grab a reference, the queue has been
3323 * frozen and there are no requests.
3324 */
3325 if (percpu_ref_tryget(&hctx->queue->q_usage_counter)) {
3326 while (blk_mq_hctx_has_requests(hctx))
3327 msleep(5);
3328 percpu_ref_put(&hctx->queue->q_usage_counter);
3329 }
3330
3331 return 0;
3332}
3333
3334static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
3335{
3336 struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
3337 struct blk_mq_hw_ctx, cpuhp_online);
3338
3339 if (cpumask_test_cpu(cpu, hctx->cpumask))
3340 clear_bit(BLK_MQ_S_INACTIVE, &hctx->state);
3341 return 0;
3342}
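/*
 * For reference, the two notifiers above and blk_mq_hctx_notify_dead() are
 * hooked up roughly like this at init time (see blk_mq_init()):
 *
 *	cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
 *				blk_mq_hctx_notify_dead);
 *	cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online",
 *				blk_mq_hctx_notify_online,
 *				blk_mq_hctx_notify_offline);
 */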
3343
3344/*
3345 * 'cpu' is going away. Splice any existing rq_list entries from this
3346 * software queue to the hw queue dispatch list, and ensure that it
3347 * gets run.
3348 */
3349static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
3350{
3351 struct blk_mq_hw_ctx *hctx;
3352 struct blk_mq_ctx *ctx;
3353 LIST_HEAD(tmp);
3354 enum hctx_type type;
3355
3356 hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
3357 if (!cpumask_test_cpu(cpu, hctx->cpumask))
3358 return 0;
3359
3360 ctx = __blk_mq_get_ctx(hctx->queue, cpu);
3361 type = hctx->type;
3362
3363 spin_lock(&ctx->lock);
3364 if (!list_empty(&ctx->rq_lists[type])) {
3365 list_splice_init(&ctx->rq_lists[type], &tmp);
3366 blk_mq_hctx_clear_pending(hctx, ctx);
3367 }
3368 spin_unlock(&ctx->lock);
3369
3370 if (list_empty(&tmp))
3371 return 0;
3372
3373 spin_lock(&hctx->lock);
3374 list_splice_tail_init(&tmp, &hctx->dispatch);
3375 spin_unlock(&hctx->lock);
3376
3377 blk_mq_run_hw_queue(hctx, true);
3378 return 0;
3379}
3380
3381static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
3382{
3383 if (!(hctx->flags & BLK_MQ_F_STACKING))
3384 cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
3385 &hctx->cpuhp_online);
3386 cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
3387 &hctx->cpuhp_dead);
3388}
3389
3390/*
3391 * Before freeing the hw queue, clear the flush request reference in
3392 * tags->rqs[] to avoid a potential use-after-free.
3393 */
3394static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags,
3395 unsigned int queue_depth, struct request *flush_rq)
3396{
3397 int i;
3398 unsigned long flags;
3399
3400 /* The hw queue may not be mapped yet */
3401 if (!tags)
3402 return;
3403
3404 WARN_ON_ONCE(req_ref_read(flush_rq) != 0);
3405
3406 for (i = 0; i < queue_depth; i++)
3407 cmpxchg(&tags->rqs[i], flush_rq, NULL);
3408
3409 /*
3410 * Wait until all pending iterations are done.
3411 *
3412 * The request references have been cleared, and this is guaranteed to
3413 * be observed after the ->lock is released.
3414 */
3415 spin_lock_irqsave(&tags->lock, flags);
3416 spin_unlock_irqrestore(&tags->lock, flags);
3417}
3418
3419/* hctx->ctxs will be freed in queue's release handler */
3420static void blk_mq_exit_hctx(struct request_queue *q,
3421 struct blk_mq_tag_set *set,
3422 struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
3423{
3424 struct request *flush_rq = hctx->fq->flush_rq;
3425
3426 if (blk_mq_hw_queue_mapped(hctx))
3427 blk_mq_tag_idle(hctx);
3428
3429 blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx],
3430 set->queue_depth, flush_rq);
3431 if (set->ops->exit_request)
3432 set->ops->exit_request(set, flush_rq, hctx_idx);
3433
3434 if (set->ops->exit_hctx)
3435 set->ops->exit_hctx(hctx, hctx_idx);
3436
3437 blk_mq_remove_cpuhp(hctx);
3438
3439 spin_lock(&q->unused_hctx_lock);
3440 list_add(&hctx->hctx_list, &q->unused_hctx_list);
3441 spin_unlock(&q->unused_hctx_lock);
3442}
3443
3444static void blk_mq_exit_hw_queues(struct request_queue *q,
3445 struct blk_mq_tag_set *set, int nr_queue)
3446{
3447 struct blk_mq_hw_ctx *hctx;
3448 unsigned int i;
3449
3450 queue_for_each_hw_ctx(q, hctx, i) {
3451 if (i == nr_queue)
3452 break;
3453 blk_mq_debugfs_unregister_hctx(hctx);
3454 blk_mq_exit_hctx(q, set, hctx, i);
3455 }
3456}
3457
3458static int blk_mq_init_hctx(struct request_queue *q,
3459 struct blk_mq_tag_set *set,
3460 struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
3461{
3462 hctx->queue_num = hctx_idx;
3463
3464 if (!(hctx->flags & BLK_MQ_F_STACKING))
3465 cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
3466 &hctx->cpuhp_online);
3467 cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
3468
3469 hctx->tags = set->tags[hctx_idx];
3470
3471 if (set->ops->init_hctx &&
3472 set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
3473 goto unregister_cpu_notifier;
3474
3475 if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx,
3476 hctx->numa_node))
3477 goto exit_hctx;
3478 return 0;
3479
3480 exit_hctx:
3481 if (set->ops->exit_hctx)
3482 set->ops->exit_hctx(hctx, hctx_idx);
3483 unregister_cpu_notifier:
3484 blk_mq_remove_cpuhp(hctx);
3485 return -1;
3486}
3487
3488static struct blk_mq_hw_ctx *
3489blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
3490 int node)
3491{
3492 struct blk_mq_hw_ctx *hctx;
3493 gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
3494
3495 hctx = kzalloc_node(sizeof(struct blk_mq_hw_ctx), gfp, node);
3496 if (!hctx)
3497 goto fail_alloc_hctx;
3498
3499 if (!zalloc_cpumask_var_node(&hctx->cpumask, gfp, node))
3500 goto free_hctx;
3501
3502 atomic_set(&hctx->nr_active, 0);
3503 if (node == NUMA_NO_NODE)
3504 node = set->numa_node;
3505 hctx->numa_node = node;
3506
3507 INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
3508 spin_lock_init(&hctx->lock);
3509 INIT_LIST_HEAD(&hctx->dispatch);
3510 hctx->queue = q;
3511 hctx->flags = set->flags & ~BLK_MQ_F_TAG_QUEUE_SHARED;
3512
3513 INIT_LIST_HEAD(&hctx->hctx_list);
3514
3515 /*
3516 * Allocate space for all possible cpus to avoid allocation at
3517 * runtime.
3518 */
3519 hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *),
3520 gfp, node);
3521 if (!hctx->ctxs)
3522 goto free_cpumask;
3523
3524 if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8),
3525 gfp, node, false, false))
3526 goto free_ctxs;
3527 hctx->nr_ctx = 0;
3528
3529 spin_lock_init(&hctx->dispatch_wait_lock);
3530 init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
3531 INIT_LIST_HEAD(&hctx->dispatch_wait.entry);
3532
3533 hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp);
3534 if (!hctx->fq)
3535 goto free_bitmap;
3536
3537 blk_mq_hctx_kobj_init(hctx);
3538
3539 return hctx;
3540
3541 free_bitmap:
3542 sbitmap_free(&hctx->ctx_map);
3543 free_ctxs:
3544 kfree(hctx->ctxs);
3545 free_cpumask:
3546 free_cpumask_var(hctx->cpumask);
3547 free_hctx:
3548 kfree(hctx);
3549 fail_alloc_hctx:
3550 return NULL;
3551}
3552
3553static void blk_mq_init_cpu_queues(struct request_queue *q,
3554 unsigned int nr_hw_queues)
3555{
3556 struct blk_mq_tag_set *set = q->tag_set;
3557 unsigned int i, j;
3558
3559 for_each_possible_cpu(i) {
3560 struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
3561 struct blk_mq_hw_ctx *hctx;
3562 int k;
3563
3564 __ctx->cpu = i;
3565 spin_lock_init(&__ctx->lock);
3566 for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++)
3567 INIT_LIST_HEAD(&__ctx->rq_lists[k]);
3568
3569 __ctx->queue = q;
3570
3571 /*
3572 * Set local node, IFF we have more than one hw queue. If
3573 * not, we remain on the home node of the device.
3574 */
3575 for (j = 0; j < set->nr_maps; j++) {
3576 hctx = blk_mq_map_queue_type(q, j, i);
3577 if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
3578 hctx->numa_node = cpu_to_node(i);
3579 }
3580 }
3581}
3582
3583struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set,
3584 unsigned int hctx_idx,
3585 unsigned int depth)
3586{
3587 struct blk_mq_tags *tags;
3588 int ret;
3589
3590 tags = blk_mq_alloc_rq_map(set, hctx_idx, depth, set->reserved_tags);
3591 if (!tags)
3592 return NULL;
3593
3594 ret = blk_mq_alloc_rqs(set, tags, hctx_idx, depth);
3595 if (ret) {
3596 blk_mq_free_rq_map(tags);
3597 return NULL;
3598 }
3599
3600 return tags;
3601}
3602
3603static bool __blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set,
3604 int hctx_idx)
3605{
3606 if (blk_mq_is_shared_tags(set->flags)) {
3607 set->tags[hctx_idx] = set->shared_tags;
3608
3609 return true;
3610 }
3611
3612 set->tags[hctx_idx] = blk_mq_alloc_map_and_rqs(set, hctx_idx,
3613 set->queue_depth);
3614
3615 return set->tags[hctx_idx];
3616}
3617
3618void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
3619 struct blk_mq_tags *tags,
3620 unsigned int hctx_idx)
3621{
3622 if (tags) {
3623 blk_mq_free_rqs(set, tags, hctx_idx);
3624 blk_mq_free_rq_map(tags);
3625 }
3626}
3627
3628static void __blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
3629 unsigned int hctx_idx)
3630{
3631 if (!blk_mq_is_shared_tags(set->flags))
3632 blk_mq_free_map_and_rqs(set, set->tags[hctx_idx], hctx_idx);
3633
3634 set->tags[hctx_idx] = NULL;
3635}
3636
3637static void blk_mq_map_swqueue(struct request_queue *q)
3638{
3639 unsigned int i, j, hctx_idx;
3640 struct blk_mq_hw_ctx *hctx;
3641 struct blk_mq_ctx *ctx;
3642 struct blk_mq_tag_set *set = q->tag_set;
3643
3644 queue_for_each_hw_ctx(q, hctx, i) {
3645 cpumask_clear(hctx->cpumask);
3646 hctx->nr_ctx = 0;
3647 hctx->dispatch_from = NULL;
3648 }
3649
3650 /*
3651 * Map software to hardware queues.
3652 *
3653 * If the cpu isn't present, the cpu is mapped to the first hctx.
3654 */
3655 for_each_possible_cpu(i) {
3656
3657 ctx = per_cpu_ptr(q->queue_ctx, i);
3658 for (j = 0; j < set->nr_maps; j++) {
3659 if (!set->map[j].nr_queues) {
3660 ctx->hctxs[j] = blk_mq_map_queue_type(q,
3661 HCTX_TYPE_DEFAULT, i);
3662 continue;
3663 }
3664 hctx_idx = set->map[j].mq_map[i];
3665 /* unmapped hw queue can be remapped after CPU topo changed */
3666 if (!set->tags[hctx_idx] &&
3667 !__blk_mq_alloc_map_and_rqs(set, hctx_idx)) {
3668 /*
3669 * If tags initialization fails for some hctx,
3670 * that hctx won't be brought online. In this
3671 * case, remap the current ctx to hctx[0], which
3672 * is guaranteed to always have tags allocated.
3673 */
3674 set->map[j].mq_map[i] = 0;
3675 }
3676
3677 hctx = blk_mq_map_queue_type(q, j, i);
3678 ctx->hctxs[j] = hctx;
3679 /*
3680 * If the CPU is already set in the mask, then we've
3681 * mapped this one already. This can happen if
3682 * devices share queues across queue maps.
3683 */
3684 if (cpumask_test_cpu(i, hctx->cpumask))
3685 continue;
3686
3687 cpumask_set_cpu(i, hctx->cpumask);
3688 hctx->type = j;
3689 ctx->index_hw[hctx->type] = hctx->nr_ctx;
3690 hctx->ctxs[hctx->nr_ctx++] = ctx;
3691
3692 /*
3693 * If the nr_ctx type overflows, we have exceeded the
3694 * amount of sw queues we can support.
3695 */
3696 BUG_ON(!hctx->nr_ctx);
3697 }
3698
3699 for (; j < HCTX_MAX_TYPES; j++)
3700 ctx->hctxs[j] = blk_mq_map_queue_type(q,
3701 HCTX_TYPE_DEFAULT, i);
3702 }
3703
3704 queue_for_each_hw_ctx(q, hctx, i) {
3705 /*
3706 * If no software queues are mapped to this hardware queue,
3707 * disable it and free the request entries.
3708 */
3709 if (!hctx->nr_ctx) {
3710 /* Never unmap queue 0. We need it as a
3711 * fallback in case a new remap fails
3712 * allocation.
3713 */
3714 if (i)
3715 __blk_mq_free_map_and_rqs(set, i);
3716
3717 hctx->tags = NULL;
3718 continue;
3719 }
3720
3721 hctx->tags = set->tags[i];
3722 WARN_ON(!hctx->tags);
3723
3724 /*
3725 * Set the map size to the number of mapped software queues.
3726 * This is more accurate and more efficient than looping
3727 * over all possibly mapped software queues.
3728 */
3729 sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx);
3730
3731 /*
3732 * Initialize batch roundrobin counts
3733 */
3734 hctx->next_cpu = blk_mq_first_mapped_cpu(hctx);
3735 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
3736 }
3737}
3738
3739/*
3740 * Caller needs to ensure that we're either frozen/quiesced, or that
3741 * the queue isn't live yet.
3742 */
3743static void queue_set_hctx_shared(struct request_queue *q, bool shared)
3744{
3745 struct blk_mq_hw_ctx *hctx;
3746 int i;
3747
3748 queue_for_each_hw_ctx(q, hctx, i) {
3749 if (shared) {
3750 hctx->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
3751 } else {
3752 blk_mq_tag_idle(hctx);
3753 hctx->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
3754 }
3755 }
3756}
3757
3758static void blk_mq_update_tag_set_shared(struct blk_mq_tag_set *set,
3759 bool shared)
3760{
3761 struct request_queue *q;
3762
3763 lockdep_assert_held(&set->tag_list_lock);
3764
3765 list_for_each_entry(q, &set->tag_list, tag_set_list) {
3766 blk_mq_freeze_queue(q);
3767 queue_set_hctx_shared(q, shared);
3768 blk_mq_unfreeze_queue(q);
3769 }
3770}
3771
3772static void blk_mq_del_queue_tag_set(struct request_queue *q)
3773{
3774 struct blk_mq_tag_set *set = q->tag_set;
3775
3776 mutex_lock(&set->tag_list_lock);
3777 list_del(&q->tag_set_list);
3778 if (list_is_singular(&set->tag_list)) {
3779 /* just transitioned to unshared */
3780 set->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
3781 /* update existing queue */
3782 blk_mq_update_tag_set_shared(set, false);
3783 }
3784 mutex_unlock(&set->tag_list_lock);
3785 INIT_LIST_HEAD(&q->tag_set_list);
3786}
3787
3788static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
3789 struct request_queue *q)
3790{
3791 mutex_lock(&set->tag_list_lock);
3792
3793 /*
3794 * Check to see if we're transitioning to shared (from 1 to 2 queues).
3795 */
3796 if (!list_empty(&set->tag_list) &&
3797 !(set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
3798 set->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
3799 /* update existing queue */
3800 blk_mq_update_tag_set_shared(set, true);
3801 }
3802 if (set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
3803 queue_set_hctx_shared(q, true);
3804 list_add_tail(&q->tag_set_list, &set->tag_list);
3805
3806 mutex_unlock(&set->tag_list_lock);
3807}
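/*
 * Example of when the shared transition above happens: a driver that
 * creates several request queues from one tag_set (e.g. one queue per NVMe
 * namespace) flips the set to BLK_MQ_F_TAG_QUEUE_SHARED as soon as the
 * second queue is added, which enables fair tag allocation across queues.
 */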
3808
3809/* All allocations will be freed in release handler of q->mq_kobj */
3810static int blk_mq_alloc_ctxs(struct request_queue *q)
3811{
3812 struct blk_mq_ctxs *ctxs;
3813 int cpu;
3814
3815 ctxs = kzalloc(sizeof(*ctxs), GFP_KERNEL);
3816 if (!ctxs)
3817 return -ENOMEM;
3818
3819 ctxs->queue_ctx = alloc_percpu(struct blk_mq_ctx);
3820 if (!ctxs->queue_ctx)
3821 goto fail;
3822
3823 for_each_possible_cpu(cpu) {
3824 struct blk_mq_ctx *ctx = per_cpu_ptr(ctxs->queue_ctx, cpu);
3825 ctx->ctxs = ctxs;
3826 }
3827
3828 q->mq_kobj = &ctxs->kobj;
3829 q->queue_ctx = ctxs->queue_ctx;
3830
3831 return 0;
3832 fail:
3833 kfree(ctxs);
3834 return -ENOMEM;
3835}
3836
3837/*
3838 * This is the actual release handler for mq, but we run it from the
3839 * request queue's release handler to avoid use-after-free (and
3840 * headaches): q->mq_kobj shouldn't have been introduced, but we
3841 * can't group the ctx/hctx kobjects without it.
3842 */
3843void blk_mq_release(struct request_queue *q)
3844{
3845 struct blk_mq_hw_ctx *hctx, *next;
3846 int i;
3847
3848 queue_for_each_hw_ctx(q, hctx, i)
3849 WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list));
3850
3851 /* all hctx are in .unused_hctx_list now */
3852 list_for_each_entry_safe(hctx, next, &q->unused_hctx_list, hctx_list) {
3853 list_del_init(&hctx->hctx_list);
3854 kobject_put(&hctx->kobj);
3855 }
3856
3857 kfree(q->queue_hw_ctx);
3858
3859 /*
3860 * Release .mq_kobj and the sw queues' kobjects now because
3861 * both share their lifetime with the request queue.
3862 */
3863 blk_mq_sysfs_deinit(q);
3864}
3865
3866static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
3867 void *queuedata)
3868{
3869 struct request_queue *q;
3870 int ret;
3871
3872 q = blk_alloc_queue(set->numa_node, set->flags & BLK_MQ_F_BLOCKING);
3873 if (!q)
3874 return ERR_PTR(-ENOMEM);
3875 q->queuedata = queuedata;
3876 ret = blk_mq_init_allocated_queue(set, q);
3877 if (ret) {
3878 blk_cleanup_queue(q);
3879 return ERR_PTR(ret);
3880 }
3881 return q;
3882}
3883
3884struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
3885{
3886 return blk_mq_init_queue_data(set, NULL);
3887}
3888EXPORT_SYMBOL(blk_mq_init_queue);
3889
3890struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata,
3891 struct lock_class_key *lkclass)
3892{
3893 struct request_queue *q;
3894 struct gendisk *disk;
3895
3896 q = blk_mq_init_queue_data(set, queuedata);
3897 if (IS_ERR(q))
3898 return ERR_CAST(q);
3899
3900 disk = __alloc_disk_node(q, set->numa_node, lkclass);
3901 if (!disk) {
3902 blk_cleanup_queue(q);
3903 return ERR_PTR(-ENOMEM);
3904 }
3905 return disk;
3906}
3907EXPORT_SYMBOL(__blk_mq_alloc_disk);
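/*
 * A hedged sketch of the usual driver-side setup that ends up here via the
 * blk_mq_alloc_disk() wrapper; the example_* names and the sizes are
 * hypothetical, everything else is the real tag_set API.
 */
#if 0	/* illustrative sketch, not compiled */
static int example_probe(struct example_dev *dev)
{
	int ret;

	memset(&dev->tag_set, 0, sizeof(dev->tag_set));
	dev->tag_set.ops		= &example_mq_ops;
	dev->tag_set.nr_hw_queues	= 1;
	dev->tag_set.queue_depth	= 128;
	dev->tag_set.numa_node		= NUMA_NO_NODE;
	dev->tag_set.cmd_size		= sizeof(struct example_cmd);
	dev->tag_set.flags		= BLK_MQ_F_SHOULD_MERGE;

	ret = blk_mq_alloc_tag_set(&dev->tag_set);
	if (ret)
		return ret;

	dev->disk = blk_mq_alloc_disk(&dev->tag_set, dev);
	if (IS_ERR(dev->disk)) {
		blk_mq_free_tag_set(&dev->tag_set);
		return PTR_ERR(dev->disk);
	}
	return 0;
}
#endif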
3908
3909static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
3910 struct blk_mq_tag_set *set, struct request_queue *q,
3911 int hctx_idx, int node)
3912{
3913 struct blk_mq_hw_ctx *hctx = NULL, *tmp;
3914
3915 /* reuse dead hctx first */
3916 spin_lock(&q->unused_hctx_lock);
3917 list_for_each_entry(tmp, &q->unused_hctx_list, hctx_list) {
3918 if (tmp->numa_node == node) {
3919 hctx = tmp;
3920 break;
3921 }
3922 }
3923 if (hctx)
3924 list_del_init(&hctx->hctx_list);
3925 spin_unlock(&q->unused_hctx_lock);
3926
3927 if (!hctx)
3928 hctx = blk_mq_alloc_hctx(q, set, node);
3929 if (!hctx)
3930 goto fail;
3931
3932 if (blk_mq_init_hctx(q, set, hctx, hctx_idx))
3933 goto free_hctx;
3934
3935 return hctx;
3936
3937 free_hctx:
3938 kobject_put(&hctx->kobj);
3939 fail:
3940 return NULL;
3941}
3942
3943static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
3944 struct request_queue *q)
3945{
3946 int i, j, end;
3947 struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;
3948
3949 if (q->nr_hw_queues < set->nr_hw_queues) {
3950 struct blk_mq_hw_ctx **new_hctxs;
3951
3952 new_hctxs = kcalloc_node(set->nr_hw_queues,
3953 sizeof(*new_hctxs), GFP_KERNEL,
3954 set->numa_node);
3955 if (!new_hctxs)
3956 return;
3957 if (hctxs)
3958 memcpy(new_hctxs, hctxs, q->nr_hw_queues *
3959 sizeof(*hctxs));
3960 q->queue_hw_ctx = new_hctxs;
3961 kfree(hctxs);
3962 hctxs = new_hctxs;
3963 }
3964
3965 /* protect against switching io scheduler */
3966 mutex_lock(&q->sysfs_lock);
3967 for (i = 0; i < set->nr_hw_queues; i++) {
3968 int node;
3969 struct blk_mq_hw_ctx *hctx;
3970
3971 node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], i);
3972		/*
3973		 * If the hw queue has been mapped to another numa node,
3974		 * we need to realloc the hctx. If allocation fails, fall
3975		 * back to the previous one.
3976		 */
3977 if (hctxs[i] && (hctxs[i]->numa_node == node))
3978 continue;
3979
3980 hctx = blk_mq_alloc_and_init_hctx(set, q, i, node);
3981 if (hctx) {
3982 if (hctxs[i])
3983 blk_mq_exit_hctx(q, set, hctxs[i], i);
3984 hctxs[i] = hctx;
3985 } else {
3986 if (hctxs[i])
3987				pr_warn("Allocate new hctx on node %d fails,"
3988					" fallback to previous one on node %d\n",
3989					node, hctxs[i]->numa_node);
3990 else
3991 break;
3992 }
3993 }
3994	/*
3995	 * If increasing nr_hw_queues failed, free the newly allocated
3996	 * hctxs and keep the previous q->nr_hw_queues.
3997	 */
3998 if (i != set->nr_hw_queues) {
3999 j = q->nr_hw_queues;
4000 end = i;
4001 } else {
4002 j = i;
4003 end = q->nr_hw_queues;
4004 q->nr_hw_queues = set->nr_hw_queues;
4005 }
4006
4007 for (; j < end; j++) {
4008 struct blk_mq_hw_ctx *hctx = hctxs[j];
4009
4010 if (hctx) {
4011 blk_mq_exit_hctx(q, set, hctx, j);
4012 hctxs[j] = NULL;
4013 }
4014 }
4015 mutex_unlock(&q->sysfs_lock);
4016}
4017
4018int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
4019 struct request_queue *q)
4020{
4021 WARN_ON_ONCE(blk_queue_has_srcu(q) !=
4022 !!(set->flags & BLK_MQ_F_BLOCKING));
4023
4024 /* mark the queue as mq asap */
4025 q->mq_ops = set->ops;
4026
4027 q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn,
4028 blk_mq_poll_stats_bkt,
4029 BLK_MQ_POLL_STATS_BKTS, q);
4030 if (!q->poll_cb)
4031 goto err_exit;
4032
4033 if (blk_mq_alloc_ctxs(q))
4034 goto err_poll;
4035
4036 /* init q->mq_kobj and sw queues' kobjects */
4037 blk_mq_sysfs_init(q);
4038
4039 INIT_LIST_HEAD(&q->unused_hctx_list);
4040 spin_lock_init(&q->unused_hctx_lock);
4041
4042 blk_mq_realloc_hw_ctxs(set, q);
4043 if (!q->nr_hw_queues)
4044 goto err_hctxs;
4045
4046 INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
4047 blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
4048
4049 q->tag_set = set;
4050
4051 q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
4052 if (set->nr_maps > HCTX_TYPE_POLL &&
4053 set->map[HCTX_TYPE_POLL].nr_queues)
4054 blk_queue_flag_set(QUEUE_FLAG_POLL, q);
4055
4056 INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
4057 INIT_LIST_HEAD(&q->requeue_list);
4058 spin_lock_init(&q->requeue_lock);
4059
4060 q->nr_requests = set->queue_depth;
4061
4062 /*
4063 * Default to classic polling
4064 */
4065 q->poll_nsec = BLK_MQ_POLL_CLASSIC;
4066
4067 blk_mq_init_cpu_queues(q, set->nr_hw_queues);
4068 blk_mq_add_queue_tag_set(set, q);
4069 blk_mq_map_swqueue(q);
4070 return 0;
4071
4072err_hctxs:
4073 kfree(q->queue_hw_ctx);
4074 q->nr_hw_queues = 0;
4075 blk_mq_sysfs_deinit(q);
4076err_poll:
4077 blk_stat_free_callback(q->poll_cb);
4078 q->poll_cb = NULL;
4079err_exit:
4080 q->mq_ops = NULL;
4081 return -ENOMEM;
4082}
4083EXPORT_SYMBOL(blk_mq_init_allocated_queue);
4084
4085/* tags can _not_ be used after returning from blk_mq_exit_queue */
4086void blk_mq_exit_queue(struct request_queue *q)
4087{
4088 struct blk_mq_tag_set *set = q->tag_set;
4089
4090 /* Checks hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED. */
4091 blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
4092 /* May clear BLK_MQ_F_TAG_QUEUE_SHARED in hctx->flags. */
4093 blk_mq_del_queue_tag_set(q);
4094}
4095
4096static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
4097{
4098 int i;
4099
4100 if (blk_mq_is_shared_tags(set->flags)) {
4101 set->shared_tags = blk_mq_alloc_map_and_rqs(set,
4102 BLK_MQ_NO_HCTX_IDX,
4103 set->queue_depth);
4104 if (!set->shared_tags)
4105 return -ENOMEM;
4106 }
4107
4108 for (i = 0; i < set->nr_hw_queues; i++) {
4109 if (!__blk_mq_alloc_map_and_rqs(set, i))
4110 goto out_unwind;
4111 cond_resched();
4112 }
4113
4114 return 0;
4115
4116out_unwind:
4117 while (--i >= 0)
4118 __blk_mq_free_map_and_rqs(set, i);
4119
4120 if (blk_mq_is_shared_tags(set->flags)) {
4121 blk_mq_free_map_and_rqs(set, set->shared_tags,
4122 BLK_MQ_NO_HCTX_IDX);
4123 }
4124
4125 return -ENOMEM;
4126}
4127
4128/*
4129 * Allocate the request maps associated with this tag_set. Note that this
4130 * may reduce the depth asked for, if memory is tight. set->queue_depth
4131 * will be updated to reflect the allocated depth.
4132 */
4133static int blk_mq_alloc_set_map_and_rqs(struct blk_mq_tag_set *set)
4134{
4135 unsigned int depth;
4136 int err;
4137
4138 depth = set->queue_depth;
4139 do {
4140 err = __blk_mq_alloc_rq_maps(set);
4141 if (!err)
4142 break;
4143
4144 set->queue_depth >>= 1;
4145 if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) {
4146 err = -ENOMEM;
4147 break;
4148 }
4149 } while (set->queue_depth);
4150
4151 if (!set->queue_depth || err) {
4152 pr_err("blk-mq: failed to allocate request map\n");
4153 return -ENOMEM;
4154 }
4155
4156 if (depth != set->queue_depth)
4157 pr_info("blk-mq: reduced tag depth (%u -> %u)\n",
4158 depth, set->queue_depth);
4159
4160 return 0;
4161}
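
/*
 * Worked example for the loop above: with set->queue_depth = 1024 and the
 * first two allocation attempts failing, the depth is halved to 512 and
 * then to 256 before a successful attempt, and "reduced tag depth
 * (1024 -> 256)" is logged. The loop gives up once the depth would drop
 * below set->reserved_tags + BLK_MQ_TAG_MIN.
 */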
4162
4163static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
4164{
4165 /*
4166 * blk_mq_map_queues() and multiple .map_queues() implementations
4167 * expect that set->map[HCTX_TYPE_DEFAULT].nr_queues is set to the
4168 * number of hardware queues.
4169 */
4170 if (set->nr_maps == 1)
4171 set->map[HCTX_TYPE_DEFAULT].nr_queues = set->nr_hw_queues;
4172
4173 if (set->ops->map_queues && !is_kdump_kernel()) {
4174 int i;
4175
4176 /*
4177 * transport .map_queues is usually done in the following
4178 * way:
4179 *
4180 * for (queue = 0; queue < set->nr_hw_queues; queue++) {
4181 * mask = get_cpu_mask(queue)
4182 * for_each_cpu(cpu, mask)
4183 * set->map[x].mq_map[cpu] = queue;
4184 * }
4185 *
4186		 * When we need to remap, the table has to be cleared to
4187		 * kill stale mappings, since a CPU may not be mapped
4188		 * to any hw queue.
4189 */
4190 for (i = 0; i < set->nr_maps; i++)
4191 blk_mq_clear_mq_map(&set->map[i]);
4192
4193 return set->ops->map_queues(set);
4194 } else {
4195 BUG_ON(set->nr_maps > 1);
4196 return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
4197 }
4198}
4199
4200static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set,
4201 int cur_nr_hw_queues, int new_nr_hw_queues)
4202{
4203 struct blk_mq_tags **new_tags;
4204
4205 if (cur_nr_hw_queues >= new_nr_hw_queues)
4206 return 0;
4207
4208 new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *),
4209 GFP_KERNEL, set->numa_node);
4210 if (!new_tags)
4211 return -ENOMEM;
4212
4213 if (set->tags)
4214 memcpy(new_tags, set->tags, cur_nr_hw_queues *
4215 sizeof(*set->tags));
4216 kfree(set->tags);
4217 set->tags = new_tags;
4218 set->nr_hw_queues = new_nr_hw_queues;
4219
4220 return 0;
4221}
4222
4223static int blk_mq_alloc_tag_set_tags(struct blk_mq_tag_set *set,
4224 int new_nr_hw_queues)
4225{
4226 return blk_mq_realloc_tag_set_tags(set, 0, new_nr_hw_queues);
4227}
4228
4229/*
4230 * Alloc a tag set to be associated with one or more request queues.
4231 * May fail with -EINVAL for various error conditions. May adjust the
4232 * requested depth down, if it's too large. In that case, the adjusted
4233 * value will be stored in set->queue_depth.
4234 */
4235int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
4236{
4237 int i, ret;
4238
4239 BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);
4240
4241 if (!set->nr_hw_queues)
4242 return -EINVAL;
4243 if (!set->queue_depth)
4244 return -EINVAL;
4245 if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
4246 return -EINVAL;
4247
4248 if (!set->ops->queue_rq)
4249 return -EINVAL;
4250
4251 if (!set->ops->get_budget ^ !set->ops->put_budget)
4252 return -EINVAL;
4253
4254 if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
4255 pr_info("blk-mq: reduced tag depth to %u\n",
4256 BLK_MQ_MAX_DEPTH);
4257 set->queue_depth = BLK_MQ_MAX_DEPTH;
4258 }
4259
4260 if (!set->nr_maps)
4261 set->nr_maps = 1;
4262 else if (set->nr_maps > HCTX_MAX_TYPES)
4263 return -EINVAL;
4264
4265 /*
4266 * If a crashdump is active, then we are potentially in a very
4267 * memory constrained environment. Limit us to 1 queue and
4268 * 64 tags to prevent using too much memory.
4269 */
4270 if (is_kdump_kernel()) {
4271 set->nr_hw_queues = 1;
4272 set->nr_maps = 1;
4273 set->queue_depth = min(64U, set->queue_depth);
4274 }
4275 /*
4276 * There is no use for more h/w queues than cpus if we just have
4277 * a single map.
4278 */
4279 if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids)
4280 set->nr_hw_queues = nr_cpu_ids;
4281
4282 if (blk_mq_alloc_tag_set_tags(set, set->nr_hw_queues) < 0)
4283 return -ENOMEM;
4284
4285 ret = -ENOMEM;
4286 for (i = 0; i < set->nr_maps; i++) {
4287 set->map[i].mq_map = kcalloc_node(nr_cpu_ids,
4288 sizeof(set->map[i].mq_map[0]),
4289 GFP_KERNEL, set->numa_node);
4290 if (!set->map[i].mq_map)
4291 goto out_free_mq_map;
4292 set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues;
4293 }
4294
4295 ret = blk_mq_update_queue_map(set);
4296 if (ret)
4297 goto out_free_mq_map;
4298
4299 ret = blk_mq_alloc_set_map_and_rqs(set);
4300 if (ret)
4301 goto out_free_mq_map;
4302
4303 mutex_init(&set->tag_list_lock);
4304 INIT_LIST_HEAD(&set->tag_list);
4305
4306 return 0;
4307
4308out_free_mq_map:
4309 for (i = 0; i < set->nr_maps; i++) {
4310 kfree(set->map[i].mq_map);
4311 set->map[i].mq_map = NULL;
4312 }
4313 kfree(set->tags);
4314 set->tags = NULL;
4315 return ret;
4316}
4317EXPORT_SYMBOL(blk_mq_alloc_tag_set);
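
/*
 * Setup sketch (hypothetical driver, not from this file): the caller fills
 * in the set before handing it over; .queue_rq is the only mandatory op:
 *
 *	memset(&my_set, 0, sizeof(my_set));
 *	my_set.ops = &my_mq_ops;
 *	my_set.nr_hw_queues = 4;
 *	my_set.queue_depth = 128;
 *	my_set.numa_node = NUMA_NO_NODE;
 *	my_set.flags = BLK_MQ_F_SHOULD_MERGE;
 *	ret = blk_mq_alloc_tag_set(&my_set);
 *
 * The checks above reject a zero queue depth, a depth smaller than
 * reserved_tags + BLK_MQ_TAG_MIN, and a missing .queue_rq.
 */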
4318
4319/* allocate and initialize a tagset for a simple single-queue device */
4320int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set,
4321 const struct blk_mq_ops *ops, unsigned int queue_depth,
4322 unsigned int set_flags)
4323{
4324 memset(set, 0, sizeof(*set));
4325 set->ops = ops;
4326 set->nr_hw_queues = 1;
4327 set->nr_maps = 1;
4328 set->queue_depth = queue_depth;
4329 set->numa_node = NUMA_NO_NODE;
4330 set->flags = set_flags;
4331 return blk_mq_alloc_tag_set(set);
4332}
4333EXPORT_SYMBOL_GPL(blk_mq_alloc_sq_tag_set);
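
/*
 * The same setup for a simple single-queue device collapses to one call
 * (sketch; "my_set" and "my_ops" are placeholders):
 *
 *	ret = blk_mq_alloc_sq_tag_set(&my_set, &my_ops, 2,
 *				      BLK_MQ_F_SHOULD_MERGE);
 */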
4334
4335void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
4336{
4337 int i, j;
4338
4339 for (i = 0; i < set->nr_hw_queues; i++)
4340 __blk_mq_free_map_and_rqs(set, i);
4341
4342 if (blk_mq_is_shared_tags(set->flags)) {
4343 blk_mq_free_map_and_rqs(set, set->shared_tags,
4344 BLK_MQ_NO_HCTX_IDX);
4345 }
4346
4347 for (j = 0; j < set->nr_maps; j++) {
4348 kfree(set->map[j].mq_map);
4349 set->map[j].mq_map = NULL;
4350 }
4351
4352 kfree(set->tags);
4353 set->tags = NULL;
4354}
4355EXPORT_SYMBOL(blk_mq_free_tag_set);
4356
4357int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
4358{
4359 struct blk_mq_tag_set *set = q->tag_set;
4360 struct blk_mq_hw_ctx *hctx;
4361 int i, ret;
4362
4363 if (!set)
4364 return -EINVAL;
4365
4366 if (q->nr_requests == nr)
4367 return 0;
4368
4369 blk_mq_freeze_queue(q);
4370 blk_mq_quiesce_queue(q);
4371
4372 ret = 0;
4373 queue_for_each_hw_ctx(q, hctx, i) {
4374 if (!hctx->tags)
4375 continue;
4376 /*
4377 * If we're using an MQ scheduler, just update the scheduler
4378 * queue depth. This is similar to what the old code would do.
4379 */
4380 if (hctx->sched_tags) {
4381 ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
4382 nr, true);
4383 } else {
4384 ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr,
4385 false);
4386 }
4387 if (ret)
4388 break;
4389 if (q->elevator && q->elevator->type->ops.depth_updated)
4390 q->elevator->type->ops.depth_updated(hctx);
4391 }
4392 if (!ret) {
4393 q->nr_requests = nr;
4394 if (blk_mq_is_shared_tags(set->flags)) {
4395 if (q->elevator)
4396 blk_mq_tag_update_sched_shared_tags(q);
4397 else
4398 blk_mq_tag_resize_shared_tags(set, nr);
4399 }
4400 }
4401
4402 blk_mq_unquiesce_queue(q);
4403 blk_mq_unfreeze_queue(q);
4404
4405 return ret;
4406}
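
/*
 * This is what ultimately runs when user space resizes a queue, e.g. via
 * "echo 64 > /sys/block/<dev>/queue/nr_requests". The freeze/quiesce pair
 * above means in-flight requests are drained before the depth changes.
 */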
4407
4408/*
4409 * request_queue and elevator_type pair.
4410 * It is just used by __blk_mq_update_nr_hw_queues to cache
4411 * the elevator_type associated with a request_queue.
4412 */
4413struct blk_mq_qe_pair {
4414 struct list_head node;
4415 struct request_queue *q;
4416 struct elevator_type *type;
4417};
4418
4419/*
4420 * Cache the elevator_type in the qe pair list and switch the
4421 * io scheduler to 'none'.
4422 */
4423static bool blk_mq_elv_switch_none(struct list_head *head,
4424 struct request_queue *q)
4425{
4426 struct blk_mq_qe_pair *qe;
4427
4428 if (!q->elevator)
4429 return true;
4430
4431 qe = kmalloc(sizeof(*qe), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY);
4432 if (!qe)
4433 return false;
4434
4435 INIT_LIST_HEAD(&qe->node);
4436 qe->q = q;
4437 qe->type = q->elevator->type;
4438 list_add(&qe->node, head);
4439
4440 mutex_lock(&q->sysfs_lock);
4441 /*
4442 * After elevator_switch_mq, the previous elevator_queue will be
4443	 * released by elevator_release. The reference to the io scheduler
4444	 * module taken by elevator_get will also be put. So we need to take
4445	 * a reference to the io scheduler module here to prevent it from
4446	 * being removed.
4447 */
4448 __module_get(qe->type->elevator_owner);
4449 elevator_switch_mq(q, NULL);
4450 mutex_unlock(&q->sysfs_lock);
4451
4452 return true;
4453}
4454
4455static void blk_mq_elv_switch_back(struct list_head *head,
4456 struct request_queue *q)
4457{
4458 struct blk_mq_qe_pair *qe;
4459 struct elevator_type *t = NULL;
4460
4461 list_for_each_entry(qe, head, node)
4462 if (qe->q == q) {
4463 t = qe->type;
4464 break;
4465 }
4466
4467 if (!t)
4468 return;
4469
4470 list_del(&qe->node);
4471 kfree(qe);
4472
4473 mutex_lock(&q->sysfs_lock);
4474 elevator_switch_mq(q, t);
4475 mutex_unlock(&q->sysfs_lock);
4476}
4477
4478static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
4479 int nr_hw_queues)
4480{
4481 struct request_queue *q;
4482 LIST_HEAD(head);
4483 int prev_nr_hw_queues;
4484
4485 lockdep_assert_held(&set->tag_list_lock);
4486
4487 if (set->nr_maps == 1 && nr_hw_queues > nr_cpu_ids)
4488 nr_hw_queues = nr_cpu_ids;
4489 if (nr_hw_queues < 1)
4490 return;
4491 if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues)
4492 return;
4493
4494 list_for_each_entry(q, &set->tag_list, tag_set_list)
4495 blk_mq_freeze_queue(q);
4496 /*
4497 * Switch IO scheduler to 'none', cleaning up the data associated
4498 * with the previous scheduler. We will switch back once we are done
4499 * updating the new sw to hw queue mappings.
4500 */
4501 list_for_each_entry(q, &set->tag_list, tag_set_list)
4502 if (!blk_mq_elv_switch_none(&head, q))
4503 goto switch_back;
4504
4505 list_for_each_entry(q, &set->tag_list, tag_set_list) {
4506 blk_mq_debugfs_unregister_hctxs(q);
4507 blk_mq_sysfs_unregister(q);
4508 }
4509
4510 prev_nr_hw_queues = set->nr_hw_queues;
4511 if (blk_mq_realloc_tag_set_tags(set, set->nr_hw_queues, nr_hw_queues) <
4512 0)
4513 goto reregister;
4514
4515 set->nr_hw_queues = nr_hw_queues;
4516fallback:
4517 blk_mq_update_queue_map(set);
4518 list_for_each_entry(q, &set->tag_list, tag_set_list) {
4519 blk_mq_realloc_hw_ctxs(set, q);
4520 if (q->nr_hw_queues != set->nr_hw_queues) {
4521 int i = prev_nr_hw_queues;
4522
4523 pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n",
4524 nr_hw_queues, prev_nr_hw_queues);
4525 for (; i < set->nr_hw_queues; i++)
4526 __blk_mq_free_map_and_rqs(set, i);
4527
4528 set->nr_hw_queues = prev_nr_hw_queues;
4529 blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
4530 goto fallback;
4531 }
4532 blk_mq_map_swqueue(q);
4533 }
4534
4535reregister:
4536 list_for_each_entry(q, &set->tag_list, tag_set_list) {
4537 blk_mq_sysfs_register(q);
4538 blk_mq_debugfs_register_hctxs(q);
4539 }
4540
4541switch_back:
4542 list_for_each_entry(q, &set->tag_list, tag_set_list)
4543 blk_mq_elv_switch_back(&head, q);
4544
4545 list_for_each_entry(q, &set->tag_list, tag_set_list)
4546 blk_mq_unfreeze_queue(q);
4547}
4548
4549void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
4550{
4551 mutex_lock(&set->tag_list_lock);
4552 __blk_mq_update_nr_hw_queues(set, nr_hw_queues);
4553 mutex_unlock(&set->tag_list_lock);
4554}
4555EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
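
/*
 * Typical caller sketch (hypothetical driver): after a controller reset
 * discovers a different queue count, it would do:
 *
 *	blk_mq_update_nr_hw_queues(&my_tag_set, new_nr);
 *
 * The locked helper above takes tag_list_lock itself, so this must not be
 * called with that lock already held.
 */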
4556
4557/* Enable polling stats and return whether they were already enabled. */
4558static bool blk_poll_stats_enable(struct request_queue *q)
4559{
4560 if (q->poll_stat)
4561 return true;
4562
4563 return blk_stats_alloc_enable(q);
4564}
4565
4566static void blk_mq_poll_stats_start(struct request_queue *q)
4567{
4568 /*
4569 * We don't arm the callback if polling stats are not enabled or the
4570 * callback is already active.
4571 */
4572 if (!q->poll_stat || blk_stat_is_active(q->poll_cb))
4573 return;
4574
4575 blk_stat_activate_msecs(q->poll_cb, 100);
4576}
4577
4578static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb)
4579{
4580 struct request_queue *q = cb->data;
4581 int bucket;
4582
4583 for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++) {
4584 if (cb->stat[bucket].nr_samples)
4585 q->poll_stat[bucket] = cb->stat[bucket];
4586 }
4587}
4588
4589static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
4590 struct request *rq)
4591{
4592 unsigned long ret = 0;
4593 int bucket;
4594
4595 /*
4596	 * If stats collection isn't on, don't sleep, but turn it on for
4597	 * future users.
4598 */
4599 if (!blk_poll_stats_enable(q))
4600 return 0;
4601
4602 /*
4603 * As an optimistic guess, use half of the mean service time
4604 * for this type of request. We can (and should) make this smarter.
4605 * For instance, if the completion latencies are tight, we can
4606 * get closer than just half the mean. This is especially
4607 * important on devices where the completion latencies are longer
4608	 * than ~10 usec. We do use the stats for the relevant IO size,
4609	 * if available, which does lead to better estimates.
4610 */
4611 bucket = blk_mq_poll_stats_bkt(rq);
4612 if (bucket < 0)
4613 return ret;
4614
4615 if (q->poll_stat[bucket].nr_samples)
4616 ret = (q->poll_stat[bucket].mean + 1) / 2;
4617
4618 return ret;
4619}
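
/*
 * Worked example for the estimate above: if the matching bucket has a mean
 * completion time of 20000 ns, the hybrid poll sleeps for
 * (20000 + 1) / 2 = 10000 ns before switching to busy polling.
 */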
4620
4621static bool blk_mq_poll_hybrid(struct request_queue *q, blk_qc_t qc)
4622{
4623 struct blk_mq_hw_ctx *hctx = blk_qc_to_hctx(q, qc);
4624 struct request *rq = blk_qc_to_rq(hctx, qc);
4625 struct hrtimer_sleeper hs;
4626 enum hrtimer_mode mode;
4627 unsigned int nsecs;
4628 ktime_t kt;
4629
4630 /*
4631	 * If a request has completed on a queue that uses an I/O scheduler, we
4632 * won't get back a request from blk_qc_to_rq.
4633 */
4634 if (!rq || (rq->rq_flags & RQF_MQ_POLL_SLEPT))
4635 return false;
4636
4637 /*
4638 * If we get here, hybrid polling is enabled. Hence poll_nsec can be:
4639 *
4640 * 0: use half of prev avg
4641 * >0: use this specific value
4642 */
4643 if (q->poll_nsec > 0)
4644 nsecs = q->poll_nsec;
4645 else
4646 nsecs = blk_mq_poll_nsecs(q, rq);
4647
4648 if (!nsecs)
4649 return false;
4650
4651 rq->rq_flags |= RQF_MQ_POLL_SLEPT;
4652
4653 /*
4654 * This will be replaced with the stats tracking code, using
4655 * 'avg_completion_time / 2' as the pre-sleep target.
4656 */
4657 kt = nsecs;
4658
4659 mode = HRTIMER_MODE_REL;
4660 hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode);
4661 hrtimer_set_expires(&hs.timer, kt);
4662
4663 do {
4664 if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE)
4665 break;
4666 set_current_state(TASK_UNINTERRUPTIBLE);
4667 hrtimer_sleeper_start_expires(&hs, mode);
4668 if (hs.task)
4669 io_schedule();
4670 hrtimer_cancel(&hs.timer);
4671 mode = HRTIMER_MODE_ABS;
4672 } while (hs.task && !signal_pending(current));
4673
4674 __set_current_state(TASK_RUNNING);
4675 destroy_hrtimer_on_stack(&hs.timer);
4676
4677 /*
4678 * If we sleep, have the caller restart the poll loop to reset the
4679 * state. Like for the other success return cases, the caller is
4680 * responsible for checking if the IO completed. If the IO isn't
4681 * complete, we'll get called again and will go straight to the busy
4682 * poll loop.
4683 */
4684 return true;
4685}
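
/*
 * The 0 vs >0 split above corresponds to the queue's "io_poll_delay" sysfs
 * attribute: -1 selects classic (pure busy) polling, 0 selects the adaptive
 * half-of-mean sleep computed by blk_mq_poll_nsecs(), and a positive value
 * is used as a fixed sleep time.
 */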
4686
4687static int blk_mq_poll_classic(struct request_queue *q, blk_qc_t cookie,
4688 struct io_comp_batch *iob, unsigned int flags)
4689{
4690 struct blk_mq_hw_ctx *hctx = blk_qc_to_hctx(q, cookie);
4691 long state = get_current_state();
4692 int ret;
4693
4694 do {
4695 ret = q->mq_ops->poll(hctx, iob);
4696 if (ret > 0) {
4697 __set_current_state(TASK_RUNNING);
4698 return ret;
4699 }
4700
4701 if (signal_pending_state(state, current))
4702 __set_current_state(TASK_RUNNING);
4703 if (task_is_running(current))
4704 return 1;
4705
4706 if (ret < 0 || (flags & BLK_POLL_ONESHOT))
4707 break;
4708 cpu_relax();
4709 } while (!need_resched());
4710
4711 __set_current_state(TASK_RUNNING);
4712 return 0;
4713}
4714
4715int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob,
4716 unsigned int flags)
4717{
4718 if (!(flags & BLK_POLL_NOSLEEP) &&
4719 q->poll_nsec != BLK_MQ_POLL_CLASSIC) {
4720 if (blk_mq_poll_hybrid(q, cookie))
4721 return 1;
4722 }
4723 return blk_mq_poll_classic(q, cookie, iob, flags);
4724}
4725
4726unsigned int blk_mq_rq_cpu(struct request *rq)
4727{
4728 return rq->mq_ctx->cpu;
4729}
4730EXPORT_SYMBOL(blk_mq_rq_cpu);
4731
4732void blk_mq_cancel_work_sync(struct request_queue *q)
4733{
4734 if (queue_is_mq(q)) {
4735 struct blk_mq_hw_ctx *hctx;
4736 int i;
4737
4738 cancel_delayed_work_sync(&q->requeue_work);
4739
4740 queue_for_each_hw_ctx(q, hctx, i)
4741 cancel_delayed_work_sync(&hctx->run_work);
4742 }
4743}
4744
4745static int __init blk_mq_init(void)
4746{
4747 int i;
4748
4749 for_each_possible_cpu(i)
4750 init_llist_head(&per_cpu(blk_cpu_done, i));
4751 open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);
4752
4753 cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD,
4754 "block/softirq:dead", NULL,
4755 blk_softirq_cpu_dead);
4756 cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
4757 blk_mq_hctx_notify_dead);
4758 cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online",
4759 blk_mq_hctx_notify_online,
4760 blk_mq_hctx_notify_offline);
4761 return 0;
4762}
4763subsys_initcall(blk_mq_init);