···1616#include <linux/backing-dev.h>1717#include <linux/bio.h>1818#include <linux/blkdev.h>1919+#include <linux/blk-mq.h>1920#include <linux/highmem.h>2021#include <linux/mm.h>2122#include <linux/kernel_stat.h>···4948/*5049 * For the allocated request tables5150 */5252-static struct kmem_cache *request_cachep;5151+struct kmem_cache *request_cachep = NULL;53525453/*5554 * For queue allocation···6059 * Controlling structure to kblockd6160 */6261static struct workqueue_struct *kblockd_workqueue;6363-6464-static void drive_stat_acct(struct request *rq, int new_io)6565-{6666- struct hd_struct *part;6767- int rw = rq_data_dir(rq);6868- int cpu;6969-7070- if (!blk_do_io_stat(rq))7171- return;7272-7373- cpu = part_stat_lock();7474-7575- if (!new_io) {7676- part = rq->part;7777- part_stat_inc(cpu, part, merges[rw]);7878- } else {7979- part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));8080- if (!hd_struct_try_get(part)) {8181- /*8282- * The partition is already being removed,8383- * the request will be accounted on the disk only8484- *8585- * We take a reference on disk->part0 although that8686- * partition will never be deleted, so we can treat8787- * it as any other partition.8888- */8989- part = &rq->rq_disk->part0;9090- hd_struct_get(part);9191- }9292- part_round_stats(cpu, part);9393- part_inc_in_flight(part, rw);9494- rq->part = part;9595- }9696-9797- part_stat_unlock();9898-}996210063void blk_queue_congestion_threshold(struct request_queue *q)10164{···110145 rq->cmd = rq->__cmd;111146 rq->cmd_len = BLK_MAX_CDB;112147 rq->tag = -1;113113- rq->ref_count = 1;114148 rq->start_time = jiffies;115149 set_start_time_ns(rq);116150 rq->part = NULL;···138174{139175 int bit;140176141141- printk(KERN_INFO "%s: dev %s: type=%x, flags=%x\n", msg,177177+ printk(KERN_INFO "%s: dev %s: type=%x, flags=%llx\n", msg,142178 rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type,143143- rq->cmd_flags);179179+ (unsigned long long) rq->cmd_flags);144180145181 printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n",146182 (unsigned long long)blk_rq_pos(rq),···559595 if (!q)560596 return NULL;561597598598+ if (percpu_counter_init(&q->mq_usage_counter, 0))599599+ goto fail_q;600600+562601 q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask);563602 if (q->id < 0)564564- goto fail_q;603603+ goto fail_c;565604566605 q->backing_dev_info.ra_pages =567606 (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;···611644 q->bypass_depth = 1;612645 __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);613646647647+ init_waitqueue_head(&q->mq_freeze_wq);648648+614649 if (blkcg_init_queue(q))615650 goto fail_bdi;616651···622653 bdi_destroy(&q->backing_dev_info);623654fail_id:624655 ida_simple_remove(&blk_queue_ida, q->id);656656+fail_c:657657+ percpu_counter_destroy(&q->mq_usage_counter);625658fail_q:626659 kmem_cache_free(blk_requestq_cachep, q);627660 return NULL;···10901119 goto retry;10911120}1092112110931093-struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)11221122+static struct request *blk_old_get_request(struct request_queue *q, int rw,11231123+ gfp_t gfp_mask)10941124{10951125 struct request *rq;10961126···11071135 /* q->queue_lock is unlocked at this point */1108113611091137 return rq;11381138+}11391139+11401140+struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)11411141+{11421142+ if (q->mq_ops)11431143+ return blk_mq_alloc_request(q, rw, gfp_mask, false);11441144+ else11451145+ return blk_old_get_request(q, rw, gfp_mask);11101146}11111147EXPORT_SYMBOL(blk_get_request);11121148···12011221static void add_acct_request(struct request_queue *q, struct request *rq,12021222 int where)12031223{12041204- drive_stat_acct(rq, 1);12241224+ blk_account_io_start(rq, true);12051225 __elv_add_request(q, rq, where);12061226}12071227···12621282{12631283 if (unlikely(!q))12641284 return;12651265- if (unlikely(--req->ref_count))12661266- return;1267128512681286 blk_pm_put_request(req);12691287···1290131212911313void blk_put_request(struct request *req)12921314{12931293- unsigned long flags;12941315 struct request_queue *q = req->q;1295131612961296- spin_lock_irqsave(q->queue_lock, flags);12971297- __blk_put_request(q, req);12981298- spin_unlock_irqrestore(q->queue_lock, flags);13171317+ if (q->mq_ops)13181318+ blk_mq_free_request(req);13191319+ else {13201320+ unsigned long flags;13211321+13221322+ spin_lock_irqsave(q->queue_lock, flags);13231323+ __blk_put_request(q, req);13241324+ spin_unlock_irqrestore(q->queue_lock, flags);13251325+ }12991326}13001327EXPORT_SYMBOL(blk_put_request);13011328···13361353}13371354EXPORT_SYMBOL_GPL(blk_add_request_payload);1338135513391339-static bool bio_attempt_back_merge(struct request_queue *q, struct request *req,13401340- struct bio *bio)13561356+bool bio_attempt_back_merge(struct request_queue *q, struct request *req,13571357+ struct bio *bio)13411358{13421359 const int ff = bio->bi_rw & REQ_FAILFAST_MASK;13431360···13541371 req->__data_len += bio->bi_size;13551372 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));1356137313571357- drive_stat_acct(req, 0);13741374+ blk_account_io_start(req, false);13581375 return true;13591376}1360137713611361-static bool bio_attempt_front_merge(struct request_queue *q,13621362- struct request *req, struct bio *bio)13781378+bool bio_attempt_front_merge(struct request_queue *q, struct request *req,13791379+ struct bio *bio)13631380{13641381 const int ff = bio->bi_rw & REQ_FAILFAST_MASK;13651382···13841401 req->__data_len += bio->bi_size;13851402 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));1386140313871387- drive_stat_acct(req, 0);14041404+ blk_account_io_start(req, false);13881405 return true;13891406}1390140713911408/**13921392- * attempt_plug_merge - try to merge with %current's plugged list14091409+ * blk_attempt_plug_merge - try to merge with %current's plugged list13931410 * @q: request_queue new bio is being queued at13941411 * @bio: new bio being queued13951412 * @request_count: out parameter for number of traversed plugged requests···14051422 * reliable access to the elevator outside queue lock. Only check basic14061423 * merging parameters without querying the elevator.14071424 */14081408-static bool attempt_plug_merge(struct request_queue *q, struct bio *bio,14091409- unsigned int *request_count)14251425+bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,14261426+ unsigned int *request_count)14101427{14111428 struct blk_plug *plug;14121429 struct request *rq;14131430 bool ret = false;14311431+ struct list_head *plug_list;1414143214151433 if (blk_queue_nomerges(q))14161434 goto out;···14211437 goto out;14221438 *request_count = 0;1423143914241424- list_for_each_entry_reverse(rq, &plug->list, queuelist) {14401440+ if (q->mq_ops)14411441+ plug_list = &plug->mq_list;14421442+ else14431443+ plug_list = &plug->list;14441444+14451445+ list_for_each_entry_reverse(rq, plug_list, queuelist) {14251446 int el_ret;1426144714271448 if (rq->q == q)···14941505 * Check if we can merge with the plugged list before grabbing14951506 * any locks.14961507 */14971497- if (attempt_plug_merge(q, bio, &request_count))15081508+ if (blk_attempt_plug_merge(q, bio, &request_count))14981509 return;1499151015001511 spin_lock_irq(q->queue_lock);···15621573 }15631574 }15641575 list_add_tail(&req->queuelist, &plug->list);15651565- drive_stat_acct(req, 1);15761576+ blk_account_io_start(req, true);15661577 } else {15671578 spin_lock_irq(q->queue_lock);15681579 add_acct_request(q, req, where);···20162027}20172028EXPORT_SYMBOL_GPL(blk_rq_err_bytes);2018202920192019-static void blk_account_io_completion(struct request *req, unsigned int bytes)20302030+void blk_account_io_completion(struct request *req, unsigned int bytes)20202031{20212032 if (blk_do_io_stat(req)) {20222033 const int rw = rq_data_dir(req);···20302041 }20312042}2032204320332033-static void blk_account_io_done(struct request *req)20442044+void blk_account_io_done(struct request *req)20342045{20352046 /*20362047 * Account IO completion. flush_rq isn't accounted as a···20772088 return rq;20782089}20792090#endif20912091+20922092+void blk_account_io_start(struct request *rq, bool new_io)20932093+{20942094+ struct hd_struct *part;20952095+ int rw = rq_data_dir(rq);20962096+ int cpu;20972097+20982098+ if (!blk_do_io_stat(rq))20992099+ return;21002100+21012101+ cpu = part_stat_lock();21022102+21032103+ if (!new_io) {21042104+ part = rq->part;21052105+ part_stat_inc(cpu, part, merges[rw]);21062106+ } else {21072107+ part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));21082108+ if (!hd_struct_try_get(part)) {21092109+ /*21102110+ * The partition is already being removed,21112111+ * the request will be accounted on the disk only21122112+ *21132113+ * We take a reference on disk->part0 although that21142114+ * partition will never be deleted, so we can treat21152115+ * it as any other partition.21162116+ */21172117+ part = &rq->rq_disk->part0;21182118+ hd_struct_get(part);21192119+ }21202120+ part_round_stats(cpu, part);21212121+ part_inc_in_flight(part, rw);21222122+ rq->part = part;21232123+ }21242124+21252125+ part_stat_unlock();21262126+}2080212720812128/**20822129 * blk_peek_request - peek at the top of a request queue···2489246424902465 if (req->cmd_flags & REQ_DONTPREP)24912466 blk_unprep_request(req);24922492-2493246724942468 blk_account_io_done(req);24952469···2911288729122888 plug->magic = PLUG_MAGIC;29132889 INIT_LIST_HEAD(&plug->list);28902890+ INIT_LIST_HEAD(&plug->mq_list);29142891 INIT_LIST_HEAD(&plug->cb_list);2915289229162893 /*···30092984 BUG_ON(plug->magic != PLUG_MAGIC);3010298530112986 flush_plug_callbacks(plug, from_schedule);29872987+29882988+ if (!list_empty(&plug->mq_list))29892989+ blk_mq_flush_plug_list(plug, from_schedule);29902990+30122991 if (list_empty(&plug->list))30132992 return;30142993
+7-7
block/blk-exec.c
···55#include <linux/module.h>66#include <linux/bio.h>77#include <linux/blkdev.h>88+#include <linux/blk-mq.h>89#include <linux/sched/sysctl.h>9101011#include "blk.h"···2524 struct completion *waiting = rq->end_io_data;26252726 rq->end_io_data = NULL;2828- __blk_put_request(rq->q, rq);29273028 /*3129 * complete last, if this is a stack request the process (and thus···59596060 rq->rq_disk = bd_disk;6161 rq->end_io = done;6262+6363+ if (q->mq_ops) {6464+ blk_mq_insert_request(q, rq, true);6565+ return;6666+ }6767+6268 /*6369 * need to check this before __blk_run_queue(), because rq can6470 * be freed before that returns.···108102 char sense[SCSI_SENSE_BUFFERSIZE];109103 int err = 0;110104 unsigned long hang_check;111111-112112- /*113113- * we need an extra reference to the request, so we can look at114114- * it after io completion115115- */116116- rq->ref_count++;117105118106 if (!rq->sense) {119107 memset(sense, 0, sizeof(sense));
+139-15
block/blk-flush.c
···6969#include <linux/bio.h>7070#include <linux/blkdev.h>7171#include <linux/gfp.h>7272+#include <linux/blk-mq.h>72737374#include "blk.h"7575+#include "blk-mq.h"74767577/* FLUSH/FUA sequences */7678enum {···126124 /* make @rq a normal request */127125 rq->cmd_flags &= ~REQ_FLUSH_SEQ;128126 rq->end_io = rq->flush.saved_end_io;127127+128128+ blk_clear_rq_complete(rq);129129+}130130+131131+static void mq_flush_data_run(struct work_struct *work)132132+{133133+ struct request *rq;134134+135135+ rq = container_of(work, struct request, mq_flush_data);136136+137137+ memset(&rq->csd, 0, sizeof(rq->csd));138138+ blk_mq_run_request(rq, true, false);139139+}140140+141141+static void blk_mq_flush_data_insert(struct request *rq)142142+{143143+ INIT_WORK(&rq->mq_flush_data, mq_flush_data_run);144144+ kblockd_schedule_work(rq->q, &rq->mq_flush_data);129145}130146131147/**···156136 * completion and trigger the next step.157137 *158138 * CONTEXT:159159- * spin_lock_irq(q->queue_lock)139139+ * spin_lock_irq(q->queue_lock or q->mq_flush_lock)160140 *161141 * RETURNS:162142 * %true if requests were added to the dispatch queue, %false otherwise.···166146{167147 struct request_queue *q = rq->q;168148 struct list_head *pending = &q->flush_queue[q->flush_pending_idx];169169- bool queued = false;149149+ bool queued = false, kicked;170150171151 BUG_ON(rq->flush.seq & seq);172152 rq->flush.seq |= seq;···187167188168 case REQ_FSEQ_DATA:189169 list_move_tail(&rq->flush.list, &q->flush_data_in_flight);190190- list_add(&rq->queuelist, &q->queue_head);191191- queued = true;170170+ if (q->mq_ops)171171+ blk_mq_flush_data_insert(rq);172172+ else {173173+ list_add(&rq->queuelist, &q->queue_head);174174+ queued = true;175175+ }192176 break;193177194178 case REQ_FSEQ_DONE:···205181 BUG_ON(!list_empty(&rq->queuelist));206182 list_del_init(&rq->flush.list);207183 blk_flush_restore_request(rq);208208- __blk_end_request_all(rq, error);184184+ if (q->mq_ops)185185+ blk_mq_end_io(rq, error);186186+ else187187+ __blk_end_request_all(rq, error);209188 break;210189211190 default:212191 BUG();213192 }214193215215- return blk_kick_flush(q) | queued;194194+ kicked = blk_kick_flush(q);195195+ /* blk_mq_run_flush will run queue */196196+ if (q->mq_ops)197197+ return queued;198198+ return kicked | queued;216199}217200218201static void flush_end_io(struct request *flush_rq, int error)219202{220203 struct request_queue *q = flush_rq->q;221221- struct list_head *running = &q->flush_queue[q->flush_running_idx];204204+ struct list_head *running;222205 bool queued = false;223206 struct request *rq, *n;207207+ unsigned long flags = 0;224208209209+ if (q->mq_ops) {210210+ blk_mq_free_request(flush_rq);211211+ spin_lock_irqsave(&q->mq_flush_lock, flags);212212+ }213213+ running = &q->flush_queue[q->flush_running_idx];225214 BUG_ON(q->flush_pending_idx == q->flush_running_idx);226215227216 /* account completion of the flush request */228217 q->flush_running_idx ^= 1;229229- elv_completed_request(q, flush_rq);218218+219219+ if (!q->mq_ops)220220+ elv_completed_request(q, flush_rq);230221231222 /* and push the waiting requests to the next stage */232223 list_for_each_entry_safe(rq, n, running, flush.list) {···262223 * directly into request_fn may confuse the driver. Always use263224 * kblockd.264225 */265265- if (queued || q->flush_queue_delayed)266266- blk_run_queue_async(q);226226+ if (queued || q->flush_queue_delayed) {227227+ if (!q->mq_ops)228228+ blk_run_queue_async(q);229229+ else230230+ /*231231+ * This can be optimized to only run queues with requests232232+ * queued if necessary.233233+ */234234+ blk_mq_run_queues(q, true);235235+ }267236 q->flush_queue_delayed = 0;237237+ if (q->mq_ops)238238+ spin_unlock_irqrestore(&q->mq_flush_lock, flags);239239+}240240+241241+static void mq_flush_work(struct work_struct *work)242242+{243243+ struct request_queue *q;244244+ struct request *rq;245245+246246+ q = container_of(work, struct request_queue, mq_flush_work);247247+248248+ /* We don't need set REQ_FLUSH_SEQ, it's for consistency */249249+ rq = blk_mq_alloc_request(q, WRITE_FLUSH|REQ_FLUSH_SEQ,250250+ __GFP_WAIT|GFP_ATOMIC, true);251251+ rq->cmd_type = REQ_TYPE_FS;252252+ rq->end_io = flush_end_io;253253+254254+ blk_mq_run_request(rq, true, false);255255+}256256+257257+/*258258+ * We can't directly use q->flush_rq, because it doesn't have tag and is not in259259+ * hctx->rqs[]. so we must allocate a new request, since we can't sleep here,260260+ * so offload the work to workqueue.261261+ *262262+ * Note: we assume a flush request finished in any hardware queue will flush263263+ * the whole disk cache.264264+ */265265+static void mq_run_flush(struct request_queue *q)266266+{267267+ kblockd_schedule_work(q, &q->mq_flush_work);268268}269269270270/**···314236 * Please read the comment at the top of this file for more info.315237 *316238 * CONTEXT:317317- * spin_lock_irq(q->queue_lock)239239+ * spin_lock_irq(q->queue_lock or q->mq_flush_lock)318240 *319241 * RETURNS:320242 * %true if flush was issued, %false otherwise.···339261 * Issue flush and toggle pending_idx. This makes pending_idx340262 * different from running_idx, which means flush is in flight.341263 */264264+ q->flush_pending_idx ^= 1;265265+ if (q->mq_ops) {266266+ mq_run_flush(q);267267+ return true;268268+ }269269+342270 blk_rq_init(q, &q->flush_rq);343271 q->flush_rq.cmd_type = REQ_TYPE_FS;344272 q->flush_rq.cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ;345273 q->flush_rq.rq_disk = first_rq->rq_disk;346274 q->flush_rq.end_io = flush_end_io;347275348348- q->flush_pending_idx ^= 1;349276 list_add_tail(&q->flush_rq.queuelist, &q->queue_head);350277 return true;351278}···367284 blk_run_queue_async(q);368285}369286287287+static void mq_flush_data_end_io(struct request *rq, int error)288288+{289289+ struct request_queue *q = rq->q;290290+ struct blk_mq_hw_ctx *hctx;291291+ struct blk_mq_ctx *ctx;292292+ unsigned long flags;293293+294294+ ctx = rq->mq_ctx;295295+ hctx = q->mq_ops->map_queue(q, ctx->cpu);296296+297297+ /*298298+ * After populating an empty queue, kick it to avoid stall. Read299299+ * the comment in flush_end_io().300300+ */301301+ spin_lock_irqsave(&q->mq_flush_lock, flags);302302+ if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error))303303+ blk_mq_run_hw_queue(hctx, true);304304+ spin_unlock_irqrestore(&q->mq_flush_lock, flags);305305+}306306+370307/**371308 * blk_insert_flush - insert a new FLUSH/FUA request372309 * @rq: request to insert373310 *374311 * To be called from __elv_add_request() for %ELEVATOR_INSERT_FLUSH insertions.312312+ * or __blk_mq_run_hw_queue() to dispatch request.375313 * @rq is being submitted. Analyze what needs to be done and put it on the376314 * right queue.377315 *378316 * CONTEXT:379379- * spin_lock_irq(q->queue_lock)317317+ * spin_lock_irq(q->queue_lock) in !mq case380318 */381319void blk_insert_flush(struct request *rq)382320{···420316 * complete the request.421317 */422318 if (!policy) {423423- __blk_end_bidi_request(rq, 0, 0, 0);319319+ if (q->mq_ops)320320+ blk_mq_end_io(rq, 0);321321+ else322322+ __blk_end_bidi_request(rq, 0, 0, 0);424323 return;425324 }426325···436329 */437330 if ((policy & REQ_FSEQ_DATA) &&438331 !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {439439- list_add_tail(&rq->queuelist, &q->queue_head);332332+ if (q->mq_ops) {333333+ blk_mq_run_request(rq, false, true);334334+ } else335335+ list_add_tail(&rq->queuelist, &q->queue_head);440336 return;441337 }442338···451341 INIT_LIST_HEAD(&rq->flush.list);452342 rq->cmd_flags |= REQ_FLUSH_SEQ;453343 rq->flush.saved_end_io = rq->end_io; /* Usually NULL */344344+ if (q->mq_ops) {345345+ rq->end_io = mq_flush_data_end_io;346346+347347+ spin_lock_irq(&q->mq_flush_lock);348348+ blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0);349349+ spin_unlock_irq(&q->mq_flush_lock);350350+ return;351351+ }454352 rq->end_io = flush_data_end_io;455353456354 blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0);···571453 return ret;572454}573455EXPORT_SYMBOL(blkdev_issue_flush);456456+457457+void blk_mq_init_flush(struct request_queue *q)458458+{459459+ spin_lock_init(&q->mq_flush_lock);460460+ INIT_WORK(&q->mq_flush_work, mq_flush_work);461461+}
+14-3
block/blk-merge.c
···308308 return ll_new_hw_segment(q, req, bio);309309}310310311311+/*312312+ * blk-mq uses req->special to carry normal driver per-request payload, it313313+ * does not indicate a prepared command that we cannot merge with.314314+ */315315+static bool req_no_special_merge(struct request *req)316316+{317317+ struct request_queue *q = req->q;318318+319319+ return !q->mq_ops && req->special;320320+}321321+311322static int ll_merge_requests_fn(struct request_queue *q, struct request *req,312323 struct request *next)313324{···330319 * First check if the either of the requests are re-queued331320 * requests. Can't merge them if they are.332321 */333333- if (req->special || next->special)322322+ if (req_no_special_merge(req) || req_no_special_merge(next))334323 return 0;335324336325 /*···427416428417 if (rq_data_dir(req) != rq_data_dir(next)429418 || req->rq_disk != next->rq_disk430430- || next->special)419419+ || req_no_special_merge(next))431420 return 0;432421433422 if (req->cmd_flags & REQ_WRITE_SAME &&···526515 return false;527516528517 /* must be same device and not a special request */529529- if (rq->rq_disk != bio->bi_bdev->bd_disk || rq->special)518518+ if (rq->rq_disk != bio->bi_bdev->bd_disk || req_no_special_merge(rq))530519 return false;531520532521 /* only merge integrity protected bio into ditto rq */
···11+#include <linux/kernel.h>22+#include <linux/threads.h>33+#include <linux/module.h>44+#include <linux/mm.h>55+#include <linux/smp.h>66+#include <linux/cpu.h>77+88+#include <linux/blk-mq.h>99+#include "blk.h"1010+#include "blk-mq.h"1111+1212+static void show_map(unsigned int *map, unsigned int nr)1313+{1414+ int i;1515+1616+ pr_info("blk-mq: CPU -> queue map\n");1717+ for_each_online_cpu(i)1818+ pr_info(" CPU%2u -> Queue %u\n", i, map[i]);1919+}2020+2121+static int cpu_to_queue_index(unsigned int nr_cpus, unsigned int nr_queues,2222+ const int cpu)2323+{2424+ return cpu / ((nr_cpus + nr_queues - 1) / nr_queues);2525+}2626+2727+static int get_first_sibling(unsigned int cpu)2828+{2929+ unsigned int ret;3030+3131+ ret = cpumask_first(topology_thread_cpumask(cpu));3232+ if (ret < nr_cpu_ids)3333+ return ret;3434+3535+ return cpu;3636+}3737+3838+int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)3939+{4040+ unsigned int i, nr_cpus, nr_uniq_cpus, queue, first_sibling;4141+ cpumask_var_t cpus;4242+4343+ if (!alloc_cpumask_var(&cpus, GFP_ATOMIC))4444+ return 1;4545+4646+ cpumask_clear(cpus);4747+ nr_cpus = nr_uniq_cpus = 0;4848+ for_each_online_cpu(i) {4949+ nr_cpus++;5050+ first_sibling = get_first_sibling(i);5151+ if (!cpumask_test_cpu(first_sibling, cpus))5252+ nr_uniq_cpus++;5353+ cpumask_set_cpu(i, cpus);5454+ }5555+5656+ queue = 0;5757+ for_each_possible_cpu(i) {5858+ if (!cpu_online(i)) {5959+ map[i] = 0;6060+ continue;6161+ }6262+6363+ /*6464+ * Easy case - we have equal or more hardware queues. Or6565+ * there are no thread siblings to take into account. Do6666+ * 1:1 if enough, or sequential mapping if less.6767+ */6868+ if (nr_queues >= nr_cpus || nr_cpus == nr_uniq_cpus) {6969+ map[i] = cpu_to_queue_index(nr_cpus, nr_queues, queue);7070+ queue++;7171+ continue;7272+ }7373+7474+ /*7575+ * Less then nr_cpus queues, and we have some number of7676+ * threads per cores. Map sibling threads to the same7777+ * queue.7878+ */7979+ first_sibling = get_first_sibling(i);8080+ if (first_sibling == i) {8181+ map[i] = cpu_to_queue_index(nr_uniq_cpus, nr_queues,8282+ queue);8383+ queue++;8484+ } else8585+ map[i] = map[first_sibling];8686+ }8787+8888+ show_map(map, nr_cpus);8989+ free_cpumask_var(cpus);9090+ return 0;9191+}9292+9393+unsigned int *blk_mq_make_queue_map(struct blk_mq_reg *reg)9494+{9595+ unsigned int *map;9696+9797+ /* If cpus are offline, map them to first hctx */9898+ map = kzalloc_node(sizeof(*map) * num_possible_cpus(), GFP_KERNEL,9999+ reg->numa_node);100100+ if (!map)101101+ return NULL;102102+103103+ if (!blk_mq_update_queue_map(map, reg->nr_hw_queues))104104+ return map;105105+106106+ kfree(map);107107+ return NULL;108108+}
···11+#include <linux/kernel.h>22+#include <linux/module.h>33+#include <linux/backing-dev.h>44+#include <linux/bio.h>55+#include <linux/blkdev.h>66+#include <linux/mm.h>77+#include <linux/init.h>88+#include <linux/slab.h>99+#include <linux/workqueue.h>1010+#include <linux/smp.h>1111+#include <linux/llist.h>1212+#include <linux/list_sort.h>1313+#include <linux/cpu.h>1414+#include <linux/cache.h>1515+#include <linux/sched/sysctl.h>1616+#include <linux/delay.h>1717+1818+#include <trace/events/block.h>1919+2020+#include <linux/blk-mq.h>2121+#include "blk.h"2222+#include "blk-mq.h"2323+#include "blk-mq-tag.h"2424+2525+static DEFINE_MUTEX(all_q_mutex);2626+static LIST_HEAD(all_q_list);2727+2828+static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx);2929+3030+DEFINE_PER_CPU(struct llist_head, ipi_lists);3131+3232+static struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,3333+ unsigned int cpu)3434+{3535+ return per_cpu_ptr(q->queue_ctx, cpu);3636+}3737+3838+/*3939+ * This assumes per-cpu software queueing queues. They could be per-node4040+ * as well, for instance. For now this is hardcoded as-is. Note that we don't4141+ * care about preemption, since we know the ctx's are persistent. This does4242+ * mean that we can't rely on ctx always matching the currently running CPU.4343+ */4444+static struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)4545+{4646+ return __blk_mq_get_ctx(q, get_cpu());4747+}4848+4949+static void blk_mq_put_ctx(struct blk_mq_ctx *ctx)5050+{5151+ put_cpu();5252+}5353+5454+/*5555+ * Check if any of the ctx's have pending work in this hardware queue5656+ */5757+static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)5858+{5959+ unsigned int i;6060+6161+ for (i = 0; i < hctx->nr_ctx_map; i++)6262+ if (hctx->ctx_map[i])6363+ return true;6464+6565+ return false;6666+}6767+6868+/*6969+ * Mark this ctx as having pending work in this hardware queue7070+ */7171+static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,7272+ struct blk_mq_ctx *ctx)7373+{7474+ if (!test_bit(ctx->index_hw, hctx->ctx_map))7575+ set_bit(ctx->index_hw, hctx->ctx_map);7676+}7777+7878+static struct request *blk_mq_alloc_rq(struct blk_mq_hw_ctx *hctx, gfp_t gfp,7979+ bool reserved)8080+{8181+ struct request *rq;8282+ unsigned int tag;8383+8484+ tag = blk_mq_get_tag(hctx->tags, gfp, reserved);8585+ if (tag != BLK_MQ_TAG_FAIL) {8686+ rq = hctx->rqs[tag];8787+ rq->tag = tag;8888+8989+ return rq;9090+ }9191+9292+ return NULL;9393+}9494+9595+static int blk_mq_queue_enter(struct request_queue *q)9696+{9797+ int ret;9898+9999+ __percpu_counter_add(&q->mq_usage_counter, 1, 1000000);100100+ smp_wmb();101101+ /* we have problems to freeze the queue if it's initializing */102102+ if (!blk_queue_bypass(q) || !blk_queue_init_done(q))103103+ return 0;104104+105105+ __percpu_counter_add(&q->mq_usage_counter, -1, 1000000);106106+107107+ spin_lock_irq(q->queue_lock);108108+ ret = wait_event_interruptible_lock_irq(q->mq_freeze_wq,109109+ !blk_queue_bypass(q), *q->queue_lock);110110+ /* inc usage with lock hold to avoid freeze_queue runs here */111111+ if (!ret)112112+ __percpu_counter_add(&q->mq_usage_counter, 1, 1000000);113113+ spin_unlock_irq(q->queue_lock);114114+115115+ return ret;116116+}117117+118118+static void blk_mq_queue_exit(struct request_queue *q)119119+{120120+ __percpu_counter_add(&q->mq_usage_counter, -1, 1000000);121121+}122122+123123+/*124124+ * Guarantee no request is in use, so we can change any data structure of125125+ * the queue afterward.126126+ */127127+static void blk_mq_freeze_queue(struct request_queue *q)128128+{129129+ bool drain;130130+131131+ spin_lock_irq(q->queue_lock);132132+ drain = !q->bypass_depth++;133133+ queue_flag_set(QUEUE_FLAG_BYPASS, q);134134+ spin_unlock_irq(q->queue_lock);135135+136136+ if (!drain)137137+ return;138138+139139+ while (true) {140140+ s64 count;141141+142142+ spin_lock_irq(q->queue_lock);143143+ count = percpu_counter_sum(&q->mq_usage_counter);144144+ spin_unlock_irq(q->queue_lock);145145+146146+ if (count == 0)147147+ break;148148+ blk_mq_run_queues(q, false);149149+ msleep(10);150150+ }151151+}152152+153153+static void blk_mq_unfreeze_queue(struct request_queue *q)154154+{155155+ bool wake = false;156156+157157+ spin_lock_irq(q->queue_lock);158158+ if (!--q->bypass_depth) {159159+ queue_flag_clear(QUEUE_FLAG_BYPASS, q);160160+ wake = true;161161+ }162162+ WARN_ON_ONCE(q->bypass_depth < 0);163163+ spin_unlock_irq(q->queue_lock);164164+ if (wake)165165+ wake_up_all(&q->mq_freeze_wq);166166+}167167+168168+bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)169169+{170170+ return blk_mq_has_free_tags(hctx->tags);171171+}172172+EXPORT_SYMBOL(blk_mq_can_queue);173173+174174+static void blk_mq_rq_ctx_init(struct blk_mq_ctx *ctx, struct request *rq,175175+ unsigned int rw_flags)176176+{177177+ rq->mq_ctx = ctx;178178+ rq->cmd_flags = rw_flags;179179+ ctx->rq_dispatched[rw_is_sync(rw_flags)]++;180180+}181181+182182+static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx,183183+ gfp_t gfp, bool reserved)184184+{185185+ return blk_mq_alloc_rq(hctx, gfp, reserved);186186+}187187+188188+static struct request *blk_mq_alloc_request_pinned(struct request_queue *q,189189+ int rw, gfp_t gfp,190190+ bool reserved)191191+{192192+ struct request *rq;193193+194194+ do {195195+ struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);196196+ struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu);197197+198198+ rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved);199199+ if (rq) {200200+ blk_mq_rq_ctx_init(ctx, rq, rw);201201+ break;202202+ } else if (!(gfp & __GFP_WAIT))203203+ break;204204+205205+ blk_mq_put_ctx(ctx);206206+ __blk_mq_run_hw_queue(hctx);207207+ blk_mq_wait_for_tags(hctx->tags);208208+ } while (1);209209+210210+ return rq;211211+}212212+213213+struct request *blk_mq_alloc_request(struct request_queue *q, int rw,214214+ gfp_t gfp, bool reserved)215215+{216216+ struct request *rq;217217+218218+ if (blk_mq_queue_enter(q))219219+ return NULL;220220+221221+ rq = blk_mq_alloc_request_pinned(q, rw, gfp, reserved);222222+ blk_mq_put_ctx(rq->mq_ctx);223223+ return rq;224224+}225225+226226+struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw,227227+ gfp_t gfp)228228+{229229+ struct request *rq;230230+231231+ if (blk_mq_queue_enter(q))232232+ return NULL;233233+234234+ rq = blk_mq_alloc_request_pinned(q, rw, gfp, true);235235+ blk_mq_put_ctx(rq->mq_ctx);236236+ return rq;237237+}238238+EXPORT_SYMBOL(blk_mq_alloc_reserved_request);239239+240240+/*241241+ * Re-init and set pdu, if we have it242242+ */243243+static void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq)244244+{245245+ blk_rq_init(hctx->queue, rq);246246+247247+ if (hctx->cmd_size)248248+ rq->special = blk_mq_rq_to_pdu(rq);249249+}250250+251251+static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,252252+ struct blk_mq_ctx *ctx, struct request *rq)253253+{254254+ const int tag = rq->tag;255255+ struct request_queue *q = rq->q;256256+257257+ blk_mq_rq_init(hctx, rq);258258+ blk_mq_put_tag(hctx->tags, tag);259259+260260+ blk_mq_queue_exit(q);261261+}262262+263263+void blk_mq_free_request(struct request *rq)264264+{265265+ struct blk_mq_ctx *ctx = rq->mq_ctx;266266+ struct blk_mq_hw_ctx *hctx;267267+ struct request_queue *q = rq->q;268268+269269+ ctx->rq_completed[rq_is_sync(rq)]++;270270+271271+ hctx = q->mq_ops->map_queue(q, ctx->cpu);272272+ __blk_mq_free_request(hctx, ctx, rq);273273+}274274+275275+static void blk_mq_bio_endio(struct request *rq, struct bio *bio, int error)276276+{277277+ if (error)278278+ clear_bit(BIO_UPTODATE, &bio->bi_flags);279279+ else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))280280+ error = -EIO;281281+282282+ if (unlikely(rq->cmd_flags & REQ_QUIET))283283+ set_bit(BIO_QUIET, &bio->bi_flags);284284+285285+ /* don't actually finish bio if it's part of flush sequence */286286+ if (!(rq->cmd_flags & REQ_FLUSH_SEQ))287287+ bio_endio(bio, error);288288+}289289+290290+void blk_mq_complete_request(struct request *rq, int error)291291+{292292+ struct bio *bio = rq->bio;293293+ unsigned int bytes = 0;294294+295295+ trace_block_rq_complete(rq->q, rq);296296+297297+ while (bio) {298298+ struct bio *next = bio->bi_next;299299+300300+ bio->bi_next = NULL;301301+ bytes += bio->bi_size;302302+ blk_mq_bio_endio(rq, bio, error);303303+ bio = next;304304+ }305305+306306+ blk_account_io_completion(rq, bytes);307307+308308+ if (rq->end_io)309309+ rq->end_io(rq, error);310310+ else311311+ blk_mq_free_request(rq);312312+313313+ blk_account_io_done(rq);314314+}315315+316316+void __blk_mq_end_io(struct request *rq, int error)317317+{318318+ if (!blk_mark_rq_complete(rq))319319+ blk_mq_complete_request(rq, error);320320+}321321+322322+#if defined(CONFIG_SMP) && defined(CONFIG_USE_GENERIC_SMP_HELPERS)323323+324324+/*325325+ * Called with interrupts disabled.326326+ */327327+static void ipi_end_io(void *data)328328+{329329+ struct llist_head *list = &per_cpu(ipi_lists, smp_processor_id());330330+ struct llist_node *entry, *next;331331+ struct request *rq;332332+333333+ entry = llist_del_all(list);334334+335335+ while (entry) {336336+ next = entry->next;337337+ rq = llist_entry(entry, struct request, ll_list);338338+ __blk_mq_end_io(rq, rq->errors);339339+ entry = next;340340+ }341341+}342342+343343+static int ipi_remote_cpu(struct blk_mq_ctx *ctx, const int cpu,344344+ struct request *rq, const int error)345345+{346346+ struct call_single_data *data = &rq->csd;347347+348348+ rq->errors = error;349349+ rq->ll_list.next = NULL;350350+351351+ /*352352+ * If the list is non-empty, an existing IPI must already353353+ * be "in flight". If that is the case, we need not schedule354354+ * a new one.355355+ */356356+ if (llist_add(&rq->ll_list, &per_cpu(ipi_lists, ctx->cpu))) {357357+ data->func = ipi_end_io;358358+ data->flags = 0;359359+ __smp_call_function_single(ctx->cpu, data, 0);360360+ }361361+362362+ return true;363363+}364364+#else /* CONFIG_SMP && CONFIG_USE_GENERIC_SMP_HELPERS */365365+static int ipi_remote_cpu(struct blk_mq_ctx *ctx, const int cpu,366366+ struct request *rq, const int error)367367+{368368+ return false;369369+}370370+#endif371371+372372+/*373373+ * End IO on this request on a multiqueue enabled driver. We'll either do374374+ * it directly inline, or punt to a local IPI handler on the matching375375+ * remote CPU.376376+ */377377+void blk_mq_end_io(struct request *rq, int error)378378+{379379+ struct blk_mq_ctx *ctx = rq->mq_ctx;380380+ int cpu;381381+382382+ if (!ctx->ipi_redirect)383383+ return __blk_mq_end_io(rq, error);384384+385385+ cpu = get_cpu();386386+387387+ if (cpu == ctx->cpu || !cpu_online(ctx->cpu) ||388388+ !ipi_remote_cpu(ctx, cpu, rq, error))389389+ __blk_mq_end_io(rq, error);390390+391391+ put_cpu();392392+}393393+EXPORT_SYMBOL(blk_mq_end_io);394394+395395+static void blk_mq_start_request(struct request *rq)396396+{397397+ struct request_queue *q = rq->q;398398+399399+ trace_block_rq_issue(q, rq);400400+401401+ /*402402+ * Just mark start time and set the started bit. Due to memory403403+ * ordering, we know we'll see the correct deadline as long as404404+ * REQ_ATOMIC_STARTED is seen.405405+ */406406+ rq->deadline = jiffies + q->rq_timeout;407407+ set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);408408+}409409+410410+static void blk_mq_requeue_request(struct request *rq)411411+{412412+ struct request_queue *q = rq->q;413413+414414+ trace_block_rq_requeue(q, rq);415415+ clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);416416+}417417+418418+struct blk_mq_timeout_data {419419+ struct blk_mq_hw_ctx *hctx;420420+ unsigned long *next;421421+ unsigned int *next_set;422422+};423423+424424+static void blk_mq_timeout_check(void *__data, unsigned long *free_tags)425425+{426426+ struct blk_mq_timeout_data *data = __data;427427+ struct blk_mq_hw_ctx *hctx = data->hctx;428428+ unsigned int tag;429429+430430+ /* It may not be in flight yet (this is where431431+ * the REQ_ATOMIC_STARTED flag comes in). The requests are432432+ * statically allocated, so we know it's always safe to access the433433+ * memory associated with a bit offset into ->rqs[].434434+ */435435+ tag = 0;436436+ do {437437+ struct request *rq;438438+439439+ tag = find_next_zero_bit(free_tags, hctx->queue_depth, tag);440440+ if (tag >= hctx->queue_depth)441441+ break;442442+443443+ rq = hctx->rqs[tag++];444444+445445+ if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))446446+ continue;447447+448448+ blk_rq_check_expired(rq, data->next, data->next_set);449449+ } while (1);450450+}451451+452452+static void blk_mq_hw_ctx_check_timeout(struct blk_mq_hw_ctx *hctx,453453+ unsigned long *next,454454+ unsigned int *next_set)455455+{456456+ struct blk_mq_timeout_data data = {457457+ .hctx = hctx,458458+ .next = next,459459+ .next_set = next_set,460460+ };461461+462462+ /*463463+ * Ask the tagging code to iterate busy requests, so we can464464+ * check them for timeout.465465+ */466466+ blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data);467467+}468468+469469+static void blk_mq_rq_timer(unsigned long data)470470+{471471+ struct request_queue *q = (struct request_queue *) data;472472+ struct blk_mq_hw_ctx *hctx;473473+ unsigned long next = 0;474474+ int i, next_set = 0;475475+476476+ queue_for_each_hw_ctx(q, hctx, i)477477+ blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set);478478+479479+ if (next_set)480480+ mod_timer(&q->timeout, round_jiffies_up(next));481481+}482482+483483+/*484484+ * Reverse check our software queue for entries that we could potentially485485+ * merge with. Currently includes a hand-wavy stop count of 8, to not spend486486+ * too much time checking for merges.487487+ */488488+static bool blk_mq_attempt_merge(struct request_queue *q,489489+ struct blk_mq_ctx *ctx, struct bio *bio)490490+{491491+ struct request *rq;492492+ int checked = 8;493493+494494+ list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {495495+ int el_ret;496496+497497+ if (!checked--)498498+ break;499499+500500+ if (!blk_rq_merge_ok(rq, bio))501501+ continue;502502+503503+ el_ret = blk_try_merge(rq, bio);504504+ if (el_ret == ELEVATOR_BACK_MERGE) {505505+ if (bio_attempt_back_merge(q, rq, bio)) {506506+ ctx->rq_merged++;507507+ return true;508508+ }509509+ break;510510+ } else if (el_ret == ELEVATOR_FRONT_MERGE) {511511+ if (bio_attempt_front_merge(q, rq, bio)) {512512+ ctx->rq_merged++;513513+ return true;514514+ }515515+ break;516516+ }517517+ }518518+519519+ return false;520520+}521521+522522+void blk_mq_add_timer(struct request *rq)523523+{524524+ __blk_add_timer(rq, NULL);525525+}526526+527527+/*528528+ * Run this hardware queue, pulling any software queues mapped to it in.529529+ * Note that this function currently has various problems around ordering530530+ * of IO. In particular, we'd like FIFO behaviour on handling existing531531+ * items on the hctx->dispatch list. Ignore that for now.532532+ */533533+static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)534534+{535535+ struct request_queue *q = hctx->queue;536536+ struct blk_mq_ctx *ctx;537537+ struct request *rq;538538+ LIST_HEAD(rq_list);539539+ int bit, queued;540540+541541+ if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->flags)))542542+ return;543543+544544+ hctx->run++;545545+546546+ /*547547+ * Touch any software queue that has pending entries.548548+ */549549+ for_each_set_bit(bit, hctx->ctx_map, hctx->nr_ctx) {550550+ clear_bit(bit, hctx->ctx_map);551551+ ctx = hctx->ctxs[bit];552552+ BUG_ON(bit != ctx->index_hw);553553+554554+ spin_lock(&ctx->lock);555555+ list_splice_tail_init(&ctx->rq_list, &rq_list);556556+ spin_unlock(&ctx->lock);557557+ }558558+559559+ /*560560+ * If we have previous entries on our dispatch list, grab them561561+ * and stuff them at the front for more fair dispatch.562562+ */563563+ if (!list_empty_careful(&hctx->dispatch)) {564564+ spin_lock(&hctx->lock);565565+ if (!list_empty(&hctx->dispatch))566566+ list_splice_init(&hctx->dispatch, &rq_list);567567+ spin_unlock(&hctx->lock);568568+ }569569+570570+ /*571571+ * Delete and return all entries from our dispatch list572572+ */573573+ queued = 0;574574+575575+ /*576576+ * Now process all the entries, sending them to the driver.577577+ */578578+ while (!list_empty(&rq_list)) {579579+ int ret;580580+581581+ rq = list_first_entry(&rq_list, struct request, queuelist);582582+ list_del_init(&rq->queuelist);583583+ blk_mq_start_request(rq);584584+585585+ /*586586+ * Last request in the series. Flag it as such, this587587+ * enables drivers to know when IO should be kicked off,588588+ * if they don't do it on a per-request basis.589589+ *590590+ * Note: the flag isn't the only condition drivers591591+ * should do kick off. If drive is busy, the last592592+ * request might not have the bit set.593593+ */594594+ if (list_empty(&rq_list))595595+ rq->cmd_flags |= REQ_END;596596+597597+ ret = q->mq_ops->queue_rq(hctx, rq);598598+ switch (ret) {599599+ case BLK_MQ_RQ_QUEUE_OK:600600+ queued++;601601+ continue;602602+ case BLK_MQ_RQ_QUEUE_BUSY:603603+ /*604604+ * FIXME: we should have a mechanism to stop the queue605605+ * like blk_stop_queue, otherwise we will waste cpu606606+ * time607607+ */608608+ list_add(&rq->queuelist, &rq_list);609609+ blk_mq_requeue_request(rq);610610+ break;611611+ default:612612+ pr_err("blk-mq: bad return on queue: %d\n", ret);613613+ rq->errors = -EIO;614614+ case BLK_MQ_RQ_QUEUE_ERROR:615615+ blk_mq_end_io(rq, rq->errors);616616+ break;617617+ }618618+619619+ if (ret == BLK_MQ_RQ_QUEUE_BUSY)620620+ break;621621+ }622622+623623+ if (!queued)624624+ hctx->dispatched[0]++;625625+ else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1)))626626+ hctx->dispatched[ilog2(queued) + 1]++;627627+628628+ /*629629+ * Any items that need requeuing? Stuff them into hctx->dispatch,630630+ * that is where we will continue on next queue run.631631+ */632632+ if (!list_empty(&rq_list)) {633633+ spin_lock(&hctx->lock);634634+ list_splice(&rq_list, &hctx->dispatch);635635+ spin_unlock(&hctx->lock);636636+ }637637+}638638+639639+void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)640640+{641641+ if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->flags)))642642+ return;643643+644644+ if (!async)645645+ __blk_mq_run_hw_queue(hctx);646646+ else {647647+ struct request_queue *q = hctx->queue;648648+649649+ kblockd_schedule_delayed_work(q, &hctx->delayed_work, 0);650650+ }651651+}652652+653653+void blk_mq_run_queues(struct request_queue *q, bool async)654654+{655655+ struct blk_mq_hw_ctx *hctx;656656+ int i;657657+658658+ queue_for_each_hw_ctx(q, hctx, i) {659659+ if ((!blk_mq_hctx_has_pending(hctx) &&660660+ list_empty_careful(&hctx->dispatch)) ||661661+ test_bit(BLK_MQ_S_STOPPED, &hctx->flags))662662+ continue;663663+664664+ blk_mq_run_hw_queue(hctx, async);665665+ }666666+}667667+EXPORT_SYMBOL(blk_mq_run_queues);668668+669669+void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)670670+{671671+ cancel_delayed_work(&hctx->delayed_work);672672+ set_bit(BLK_MQ_S_STOPPED, &hctx->state);673673+}674674+EXPORT_SYMBOL(blk_mq_stop_hw_queue);675675+676676+void blk_mq_stop_hw_queues(struct request_queue *q)677677+{678678+ struct blk_mq_hw_ctx *hctx;679679+ int i;680680+681681+ queue_for_each_hw_ctx(q, hctx, i)682682+ blk_mq_stop_hw_queue(hctx);683683+}684684+EXPORT_SYMBOL(blk_mq_stop_hw_queues);685685+686686+void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)687687+{688688+ clear_bit(BLK_MQ_S_STOPPED, &hctx->state);689689+ __blk_mq_run_hw_queue(hctx);690690+}691691+EXPORT_SYMBOL(blk_mq_start_hw_queue);692692+693693+void blk_mq_start_stopped_hw_queues(struct request_queue *q)694694+{695695+ struct blk_mq_hw_ctx *hctx;696696+ int i;697697+698698+ queue_for_each_hw_ctx(q, hctx, i) {699699+ if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state))700700+ continue;701701+702702+ clear_bit(BLK_MQ_S_STOPPED, &hctx->state);703703+ blk_mq_run_hw_queue(hctx, true);704704+ }705705+}706706+EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);707707+708708+static void blk_mq_work_fn(struct work_struct *work)709709+{710710+ struct blk_mq_hw_ctx *hctx;711711+712712+ hctx = container_of(work, struct blk_mq_hw_ctx, delayed_work.work);713713+ __blk_mq_run_hw_queue(hctx);714714+}715715+716716+static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,717717+ struct request *rq)718718+{719719+ struct blk_mq_ctx *ctx = rq->mq_ctx;720720+721721+ list_add_tail(&rq->queuelist, &ctx->rq_list);722722+ blk_mq_hctx_mark_pending(hctx, ctx);723723+724724+ /*725725+ * We do this early, to ensure we are on the right CPU.726726+ */727727+ blk_mq_add_timer(rq);728728+}729729+730730+void blk_mq_insert_request(struct request_queue *q, struct request *rq,731731+ bool run_queue)732732+{733733+ struct blk_mq_hw_ctx *hctx;734734+ struct blk_mq_ctx *ctx, *current_ctx;735735+736736+ ctx = rq->mq_ctx;737737+ hctx = q->mq_ops->map_queue(q, ctx->cpu);738738+739739+ if (rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) {740740+ blk_insert_flush(rq);741741+ } else {742742+ current_ctx = blk_mq_get_ctx(q);743743+744744+ if (!cpu_online(ctx->cpu)) {745745+ ctx = current_ctx;746746+ hctx = q->mq_ops->map_queue(q, ctx->cpu);747747+ rq->mq_ctx = ctx;748748+ }749749+ spin_lock(&ctx->lock);750750+ __blk_mq_insert_request(hctx, rq);751751+ spin_unlock(&ctx->lock);752752+753753+ blk_mq_put_ctx(current_ctx);754754+ }755755+756756+ if (run_queue)757757+ __blk_mq_run_hw_queue(hctx);758758+}759759+EXPORT_SYMBOL(blk_mq_insert_request);760760+761761+/*762762+ * This is a special version of blk_mq_insert_request to bypass FLUSH request763763+ * check. Should only be used internally.764764+ */765765+void blk_mq_run_request(struct request *rq, bool run_queue, bool async)766766+{767767+ struct request_queue *q = rq->q;768768+ struct blk_mq_hw_ctx *hctx;769769+ struct blk_mq_ctx *ctx, *current_ctx;770770+771771+ current_ctx = blk_mq_get_ctx(q);772772+773773+ ctx = rq->mq_ctx;774774+ if (!cpu_online(ctx->cpu)) {775775+ ctx = current_ctx;776776+ rq->mq_ctx = ctx;777777+ }778778+ hctx = q->mq_ops->map_queue(q, ctx->cpu);779779+780780+ /* ctx->cpu might be offline */781781+ spin_lock(&ctx->lock);782782+ __blk_mq_insert_request(hctx, rq);783783+ spin_unlock(&ctx->lock);784784+785785+ blk_mq_put_ctx(current_ctx);786786+787787+ if (run_queue)788788+ blk_mq_run_hw_queue(hctx, async);789789+}790790+791791+static void blk_mq_insert_requests(struct request_queue *q,792792+ struct blk_mq_ctx *ctx,793793+ struct list_head *list,794794+ int depth,795795+ bool from_schedule)796796+797797+{798798+ struct blk_mq_hw_ctx *hctx;799799+ struct blk_mq_ctx *current_ctx;800800+801801+ trace_block_unplug(q, depth, !from_schedule);802802+803803+ current_ctx = blk_mq_get_ctx(q);804804+805805+ if (!cpu_online(ctx->cpu))806806+ ctx = current_ctx;807807+ hctx = q->mq_ops->map_queue(q, ctx->cpu);808808+809809+ /*810810+ * preemption doesn't flush plug list, so it's possible ctx->cpu is811811+ * offline now812812+ */813813+ spin_lock(&ctx->lock);814814+ while (!list_empty(list)) {815815+ struct request *rq;816816+817817+ rq = list_first_entry(list, struct request, queuelist);818818+ list_del_init(&rq->queuelist);819819+ rq->mq_ctx = ctx;820820+ __blk_mq_insert_request(hctx, rq);821821+ }822822+ spin_unlock(&ctx->lock);823823+824824+ blk_mq_put_ctx(current_ctx);825825+826826+ blk_mq_run_hw_queue(hctx, from_schedule);827827+}828828+829829+static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)830830+{831831+ struct request *rqa = container_of(a, struct request, queuelist);832832+ struct request *rqb = container_of(b, struct request, queuelist);833833+834834+ return !(rqa->mq_ctx < rqb->mq_ctx ||835835+ (rqa->mq_ctx == rqb->mq_ctx &&836836+ blk_rq_pos(rqa) < blk_rq_pos(rqb)));837837+}838838+839839+void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)840840+{841841+ struct blk_mq_ctx *this_ctx;842842+ struct request_queue *this_q;843843+ struct request *rq;844844+ LIST_HEAD(list);845845+ LIST_HEAD(ctx_list);846846+ unsigned int depth;847847+848848+ list_splice_init(&plug->mq_list, &list);849849+850850+ list_sort(NULL, &list, plug_ctx_cmp);851851+852852+ this_q = NULL;853853+ this_ctx = NULL;854854+ depth = 0;855855+856856+ while (!list_empty(&list)) {857857+ rq = list_entry_rq(list.next);858858+ list_del_init(&rq->queuelist);859859+ BUG_ON(!rq->q);860860+ if (rq->mq_ctx != this_ctx) {861861+ if (this_ctx) {862862+ blk_mq_insert_requests(this_q, this_ctx,863863+ &ctx_list, depth,864864+ from_schedule);865865+ }866866+867867+ this_ctx = rq->mq_ctx;868868+ this_q = rq->q;869869+ depth = 0;870870+ }871871+872872+ depth++;873873+ list_add_tail(&rq->queuelist, &ctx_list);874874+ }875875+876876+ /*877877+ * If 'this_ctx' is set, we know we have entries to complete878878+ * on 'ctx_list'. Do those.879879+ */880880+ if (this_ctx) {881881+ blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth,882882+ from_schedule);883883+ }884884+}885885+886886+static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)887887+{888888+ init_request_from_bio(rq, bio);889889+ blk_account_io_start(rq, 1);890890+}891891+892892+static void blk_mq_make_request(struct request_queue *q, struct bio *bio)893893+{894894+ struct blk_mq_hw_ctx *hctx;895895+ struct blk_mq_ctx *ctx;896896+ const int is_sync = rw_is_sync(bio->bi_rw);897897+ const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);898898+ int rw = bio_data_dir(bio);899899+ struct request *rq;900900+ unsigned int use_plug, request_count = 0;901901+902902+ /*903903+ * If we have multiple hardware queues, just go directly to904904+ * one of those for sync IO.905905+ */906906+ use_plug = !is_flush_fua && ((q->nr_hw_queues == 1) || !is_sync);907907+908908+ blk_queue_bounce(q, &bio);909909+910910+ if (use_plug && blk_attempt_plug_merge(q, bio, &request_count))911911+ return;912912+913913+ if (blk_mq_queue_enter(q)) {914914+ bio_endio(bio, -EIO);915915+ return;916916+ }917917+918918+ ctx = blk_mq_get_ctx(q);919919+ hctx = q->mq_ops->map_queue(q, ctx->cpu);920920+921921+ trace_block_getrq(q, bio, rw);922922+ rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false);923923+ if (likely(rq))924924+ blk_mq_rq_ctx_init(ctx, rq, rw);925925+ else {926926+ blk_mq_put_ctx(ctx);927927+ trace_block_sleeprq(q, bio, rw);928928+ rq = blk_mq_alloc_request_pinned(q, rw, __GFP_WAIT|GFP_ATOMIC,929929+ false);930930+ ctx = rq->mq_ctx;931931+ hctx = q->mq_ops->map_queue(q, ctx->cpu);932932+ }933933+934934+ hctx->queued++;935935+936936+ if (unlikely(is_flush_fua)) {937937+ blk_mq_bio_to_request(rq, bio);938938+ blk_mq_put_ctx(ctx);939939+ blk_insert_flush(rq);940940+ goto run_queue;941941+ }942942+943943+ /*944944+ * A task plug currently exists. Since this is completely lockless,945945+ * utilize that to temporarily store requests until the task is946946+ * either done or scheduled away.947947+ */948948+ if (use_plug) {949949+ struct blk_plug *plug = current->plug;950950+951951+ if (plug) {952952+ blk_mq_bio_to_request(rq, bio);953953+ if (list_empty(&plug->mq_list))954954+ trace_block_plug(q);955955+ else if (request_count >= BLK_MAX_REQUEST_COUNT) {956956+ blk_flush_plug_list(plug, false);957957+ trace_block_plug(q);958958+ }959959+ list_add_tail(&rq->queuelist, &plug->mq_list);960960+ blk_mq_put_ctx(ctx);961961+ return;962962+ }963963+ }964964+965965+ spin_lock(&ctx->lock);966966+967967+ if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&968968+ blk_mq_attempt_merge(q, ctx, bio))969969+ __blk_mq_free_request(hctx, ctx, rq);970970+ else {971971+ blk_mq_bio_to_request(rq, bio);972972+ __blk_mq_insert_request(hctx, rq);973973+ }974974+975975+ spin_unlock(&ctx->lock);976976+ blk_mq_put_ctx(ctx);977977+978978+ /*979979+ * For a SYNC request, send it to the hardware immediately. For an980980+ * ASYNC request, just ensure that we run it later on. The latter981981+ * allows for merging opportunities and more efficient dispatching.982982+ */983983+run_queue:984984+ blk_mq_run_hw_queue(hctx, !is_sync || is_flush_fua);985985+}986986+987987+/*988988+ * Default mapping to a software queue, since we use one per CPU.989989+ */990990+struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu)991991+{992992+ return q->queue_hw_ctx[q->mq_map[cpu]];993993+}994994+EXPORT_SYMBOL(blk_mq_map_queue);995995+996996+struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_reg *reg,997997+ unsigned int hctx_index)998998+{999999+ return kmalloc_node(sizeof(struct blk_mq_hw_ctx),10001000+ GFP_KERNEL | __GFP_ZERO, reg->numa_node);10011001+}10021002+EXPORT_SYMBOL(blk_mq_alloc_single_hw_queue);10031003+10041004+void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *hctx,10051005+ unsigned int hctx_index)10061006+{10071007+ kfree(hctx);10081008+}10091009+EXPORT_SYMBOL(blk_mq_free_single_hw_queue);10101010+10111011+static void blk_mq_hctx_notify(void *data, unsigned long action,10121012+ unsigned int cpu)10131013+{10141014+ struct blk_mq_hw_ctx *hctx = data;10151015+ struct blk_mq_ctx *ctx;10161016+ LIST_HEAD(tmp);10171017+10181018+ if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)10191019+ return;10201020+10211021+ /*10221022+ * Move ctx entries to new CPU, if this one is going away.10231023+ */10241024+ ctx = __blk_mq_get_ctx(hctx->queue, cpu);10251025+10261026+ spin_lock(&ctx->lock);10271027+ if (!list_empty(&ctx->rq_list)) {10281028+ list_splice_init(&ctx->rq_list, &tmp);10291029+ clear_bit(ctx->index_hw, hctx->ctx_map);10301030+ }10311031+ spin_unlock(&ctx->lock);10321032+10331033+ if (list_empty(&tmp))10341034+ return;10351035+10361036+ ctx = blk_mq_get_ctx(hctx->queue);10371037+ spin_lock(&ctx->lock);10381038+10391039+ while (!list_empty(&tmp)) {10401040+ struct request *rq;10411041+10421042+ rq = list_first_entry(&tmp, struct request, queuelist);10431043+ rq->mq_ctx = ctx;10441044+ list_move_tail(&rq->queuelist, &ctx->rq_list);10451045+ }10461046+10471047+ blk_mq_hctx_mark_pending(hctx, ctx);10481048+10491049+ spin_unlock(&ctx->lock);10501050+ blk_mq_put_ctx(ctx);10511051+}10521052+10531053+static void blk_mq_init_hw_commands(struct blk_mq_hw_ctx *hctx,10541054+ void (*init)(void *, struct blk_mq_hw_ctx *,10551055+ struct request *, unsigned int),10561056+ void *data)10571057+{10581058+ unsigned int i;10591059+10601060+ for (i = 0; i < hctx->queue_depth; i++) {10611061+ struct request *rq = hctx->rqs[i];10621062+10631063+ init(data, hctx, rq, i);10641064+ }10651065+}10661066+10671067+void blk_mq_init_commands(struct request_queue *q,10681068+ void (*init)(void *, struct blk_mq_hw_ctx *,10691069+ struct request *, unsigned int),10701070+ void *data)10711071+{10721072+ struct blk_mq_hw_ctx *hctx;10731073+ unsigned int i;10741074+10751075+ queue_for_each_hw_ctx(q, hctx, i)10761076+ blk_mq_init_hw_commands(hctx, init, data);10771077+}10781078+EXPORT_SYMBOL(blk_mq_init_commands);10791079+10801080+static void blk_mq_free_rq_map(struct blk_mq_hw_ctx *hctx)10811081+{10821082+ struct page *page;10831083+10841084+ while (!list_empty(&hctx->page_list)) {10851085+ page = list_first_entry(&hctx->page_list, struct page, list);10861086+ list_del_init(&page->list);10871087+ __free_pages(page, page->private);10881088+ }10891089+10901090+ kfree(hctx->rqs);10911091+10921092+ if (hctx->tags)10931093+ blk_mq_free_tags(hctx->tags);10941094+}10951095+10961096+static size_t order_to_size(unsigned int order)10971097+{10981098+ size_t ret = PAGE_SIZE;10991099+11001100+ while (order--)11011101+ ret *= 2;11021102+11031103+ return ret;11041104+}11051105+11061106+static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx,11071107+ unsigned int reserved_tags, int node)11081108+{11091109+ unsigned int i, j, entries_per_page, max_order = 4;11101110+ size_t rq_size, left;11111111+11121112+ INIT_LIST_HEAD(&hctx->page_list);11131113+11141114+ hctx->rqs = kmalloc_node(hctx->queue_depth * sizeof(struct request *),11151115+ GFP_KERNEL, node);11161116+ if (!hctx->rqs)11171117+ return -ENOMEM;11181118+11191119+ /*11201120+ * rq_size is the size of the request plus driver payload, rounded11211121+ * to the cacheline size11221122+ */11231123+ rq_size = round_up(sizeof(struct request) + hctx->cmd_size,11241124+ cache_line_size());11251125+ left = rq_size * hctx->queue_depth;11261126+11271127+ for (i = 0; i < hctx->queue_depth;) {11281128+ int this_order = max_order;11291129+ struct page *page;11301130+ int to_do;11311131+ void *p;11321132+11331133+ while (left < order_to_size(this_order - 1) && this_order)11341134+ this_order--;11351135+11361136+ do {11371137+ page = alloc_pages_node(node, GFP_KERNEL, this_order);11381138+ if (page)11391139+ break;11401140+ if (!this_order--)11411141+ break;11421142+ if (order_to_size(this_order) < rq_size)11431143+ break;11441144+ } while (1);11451145+11461146+ if (!page)11471147+ break;11481148+11491149+ page->private = this_order;11501150+ list_add_tail(&page->list, &hctx->page_list);11511151+11521152+ p = page_address(page);11531153+ entries_per_page = order_to_size(this_order) / rq_size;11541154+ to_do = min(entries_per_page, hctx->queue_depth - i);11551155+ left -= to_do * rq_size;11561156+ for (j = 0; j < to_do; j++) {11571157+ hctx->rqs[i] = p;11581158+ blk_mq_rq_init(hctx, hctx->rqs[i]);11591159+ p += rq_size;11601160+ i++;11611161+ }11621162+ }11631163+11641164+ if (i < (reserved_tags + BLK_MQ_TAG_MIN))11651165+ goto err_rq_map;11661166+ else if (i != hctx->queue_depth) {11671167+ hctx->queue_depth = i;11681168+ pr_warn("%s: queue depth set to %u because of low memory\n",11691169+ __func__, i);11701170+ }11711171+11721172+ hctx->tags = blk_mq_init_tags(hctx->queue_depth, reserved_tags, node);11731173+ if (!hctx->tags) {11741174+err_rq_map:11751175+ blk_mq_free_rq_map(hctx);11761176+ return -ENOMEM;11771177+ }11781178+11791179+ return 0;11801180+}11811181+11821182+static int blk_mq_init_hw_queues(struct request_queue *q,11831183+ struct blk_mq_reg *reg, void *driver_data)11841184+{11851185+ struct blk_mq_hw_ctx *hctx;11861186+ unsigned int i, j;11871187+11881188+ /*11891189+ * Initialize hardware queues11901190+ */11911191+ queue_for_each_hw_ctx(q, hctx, i) {11921192+ unsigned int num_maps;11931193+ int node;11941194+11951195+ node = hctx->numa_node;11961196+ if (node == NUMA_NO_NODE)11971197+ node = hctx->numa_node = reg->numa_node;11981198+11991199+ INIT_DELAYED_WORK(&hctx->delayed_work, blk_mq_work_fn);12001200+ spin_lock_init(&hctx->lock);12011201+ INIT_LIST_HEAD(&hctx->dispatch);12021202+ hctx->queue = q;12031203+ hctx->queue_num = i;12041204+ hctx->flags = reg->flags;12051205+ hctx->queue_depth = reg->queue_depth;12061206+ hctx->cmd_size = reg->cmd_size;12071207+12081208+ blk_mq_init_cpu_notifier(&hctx->cpu_notifier,12091209+ blk_mq_hctx_notify, hctx);12101210+ blk_mq_register_cpu_notifier(&hctx->cpu_notifier);12111211+12121212+ if (blk_mq_init_rq_map(hctx, reg->reserved_tags, node))12131213+ break;12141214+12151215+ /*12161216+ * Allocate space for all possible cpus to avoid allocation in12171217+ * runtime12181218+ */12191219+ hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *),12201220+ GFP_KERNEL, node);12211221+ if (!hctx->ctxs)12221222+ break;12231223+12241224+ num_maps = ALIGN(nr_cpu_ids, BITS_PER_LONG) / BITS_PER_LONG;12251225+ hctx->ctx_map = kzalloc_node(num_maps * sizeof(unsigned long),12261226+ GFP_KERNEL, node);12271227+ if (!hctx->ctx_map)12281228+ break;12291229+12301230+ hctx->nr_ctx_map = num_maps;12311231+ hctx->nr_ctx = 0;12321232+12331233+ if (reg->ops->init_hctx &&12341234+ reg->ops->init_hctx(hctx, driver_data, i))12351235+ break;12361236+ }12371237+12381238+ if (i == q->nr_hw_queues)12391239+ return 0;12401240+12411241+ /*12421242+ * Init failed12431243+ */12441244+ queue_for_each_hw_ctx(q, hctx, j) {12451245+ if (i == j)12461246+ break;12471247+12481248+ if (reg->ops->exit_hctx)12491249+ reg->ops->exit_hctx(hctx, j);12501250+12511251+ blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);12521252+ blk_mq_free_rq_map(hctx);12531253+ kfree(hctx->ctxs);12541254+ }12551255+12561256+ return 1;12571257+}12581258+12591259+static void blk_mq_init_cpu_queues(struct request_queue *q,12601260+ unsigned int nr_hw_queues)12611261+{12621262+ unsigned int i;12631263+12641264+ for_each_possible_cpu(i) {12651265+ struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);12661266+ struct blk_mq_hw_ctx *hctx;12671267+12681268+ memset(__ctx, 0, sizeof(*__ctx));12691269+ __ctx->cpu = i;12701270+ spin_lock_init(&__ctx->lock);12711271+ INIT_LIST_HEAD(&__ctx->rq_list);12721272+ __ctx->queue = q;12731273+12741274+ /* If the cpu isn't online, the cpu is mapped to first hctx */12751275+ hctx = q->mq_ops->map_queue(q, i);12761276+ hctx->nr_ctx++;12771277+12781278+ if (!cpu_online(i))12791279+ continue;12801280+12811281+ /*12821282+ * Set local node, IFF we have more than one hw queue. If12831283+ * not, we remain on the home node of the device12841284+ */12851285+ if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)12861286+ hctx->numa_node = cpu_to_node(i);12871287+ }12881288+}12891289+12901290+static void blk_mq_map_swqueue(struct request_queue *q)12911291+{12921292+ unsigned int i;12931293+ struct blk_mq_hw_ctx *hctx;12941294+ struct blk_mq_ctx *ctx;12951295+12961296+ queue_for_each_hw_ctx(q, hctx, i) {12971297+ hctx->nr_ctx = 0;12981298+ }12991299+13001300+ /*13011301+ * Map software to hardware queues13021302+ */13031303+ queue_for_each_ctx(q, ctx, i) {13041304+ /* If the cpu isn't online, the cpu is mapped to first hctx */13051305+ hctx = q->mq_ops->map_queue(q, i);13061306+ ctx->index_hw = hctx->nr_ctx;13071307+ hctx->ctxs[hctx->nr_ctx++] = ctx;13081308+ }13091309+}13101310+13111311+struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg,13121312+ void *driver_data)13131313+{13141314+ struct blk_mq_hw_ctx **hctxs;13151315+ struct blk_mq_ctx *ctx;13161316+ struct request_queue *q;13171317+ int i;13181318+13191319+ if (!reg->nr_hw_queues ||13201320+ !reg->ops->queue_rq || !reg->ops->map_queue ||13211321+ !reg->ops->alloc_hctx || !reg->ops->free_hctx)13221322+ return ERR_PTR(-EINVAL);13231323+13241324+ if (!reg->queue_depth)13251325+ reg->queue_depth = BLK_MQ_MAX_DEPTH;13261326+ else if (reg->queue_depth > BLK_MQ_MAX_DEPTH) {13271327+ pr_err("blk-mq: queuedepth too large (%u)\n", reg->queue_depth);13281328+ reg->queue_depth = BLK_MQ_MAX_DEPTH;13291329+ }13301330+13311331+ /*13321332+ * Set aside a tag for flush requests. It will only be used while13331333+ * another flush request is in progress but outside the driver.13341334+ *13351335+ * TODO: only allocate if flushes are supported13361336+ */13371337+ reg->queue_depth++;13381338+ reg->reserved_tags++;13391339+13401340+ if (reg->queue_depth < (reg->reserved_tags + BLK_MQ_TAG_MIN))13411341+ return ERR_PTR(-EINVAL);13421342+13431343+ ctx = alloc_percpu(struct blk_mq_ctx);13441344+ if (!ctx)13451345+ return ERR_PTR(-ENOMEM);13461346+13471347+ hctxs = kmalloc_node(reg->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,13481348+ reg->numa_node);13491349+13501350+ if (!hctxs)13511351+ goto err_percpu;13521352+13531353+ for (i = 0; i < reg->nr_hw_queues; i++) {13541354+ hctxs[i] = reg->ops->alloc_hctx(reg, i);13551355+ if (!hctxs[i])13561356+ goto err_hctxs;13571357+13581358+ hctxs[i]->numa_node = NUMA_NO_NODE;13591359+ hctxs[i]->queue_num = i;13601360+ }13611361+13621362+ q = blk_alloc_queue_node(GFP_KERNEL, reg->numa_node);13631363+ if (!q)13641364+ goto err_hctxs;13651365+13661366+ q->mq_map = blk_mq_make_queue_map(reg);13671367+ if (!q->mq_map)13681368+ goto err_map;13691369+13701370+ setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);13711371+ blk_queue_rq_timeout(q, 30000);13721372+13731373+ q->nr_queues = nr_cpu_ids;13741374+ q->nr_hw_queues = reg->nr_hw_queues;13751375+13761376+ q->queue_ctx = ctx;13771377+ q->queue_hw_ctx = hctxs;13781378+13791379+ q->mq_ops = reg->ops;13801380+13811381+ blk_queue_make_request(q, blk_mq_make_request);13821382+ blk_queue_rq_timed_out(q, reg->ops->timeout);13831383+ if (reg->timeout)13841384+ blk_queue_rq_timeout(q, reg->timeout);13851385+13861386+ blk_mq_init_flush(q);13871387+ blk_mq_init_cpu_queues(q, reg->nr_hw_queues);13881388+13891389+ if (blk_mq_init_hw_queues(q, reg, driver_data))13901390+ goto err_hw;13911391+13921392+ blk_mq_map_swqueue(q);13931393+13941394+ mutex_lock(&all_q_mutex);13951395+ list_add_tail(&q->all_q_node, &all_q_list);13961396+ mutex_unlock(&all_q_mutex);13971397+13981398+ return q;13991399+err_hw:14001400+ kfree(q->mq_map);14011401+err_map:14021402+ blk_cleanup_queue(q);14031403+err_hctxs:14041404+ for (i = 0; i < reg->nr_hw_queues; i++) {14051405+ if (!hctxs[i])14061406+ break;14071407+ reg->ops->free_hctx(hctxs[i], i);14081408+ }14091409+ kfree(hctxs);14101410+err_percpu:14111411+ free_percpu(ctx);14121412+ return ERR_PTR(-ENOMEM);14131413+}14141414+EXPORT_SYMBOL(blk_mq_init_queue);14151415+14161416+void blk_mq_free_queue(struct request_queue *q)14171417+{14181418+ struct blk_mq_hw_ctx *hctx;14191419+ int i;14201420+14211421+ queue_for_each_hw_ctx(q, hctx, i) {14221422+ cancel_delayed_work_sync(&hctx->delayed_work);14231423+ kfree(hctx->ctx_map);14241424+ kfree(hctx->ctxs);14251425+ blk_mq_free_rq_map(hctx);14261426+ blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);14271427+ if (q->mq_ops->exit_hctx)14281428+ q->mq_ops->exit_hctx(hctx, i);14291429+ q->mq_ops->free_hctx(hctx, i);14301430+ }14311431+14321432+ free_percpu(q->queue_ctx);14331433+ kfree(q->queue_hw_ctx);14341434+ kfree(q->mq_map);14351435+14361436+ q->queue_ctx = NULL;14371437+ q->queue_hw_ctx = NULL;14381438+ q->mq_map = NULL;14391439+14401440+ mutex_lock(&all_q_mutex);14411441+ list_del_init(&q->all_q_node);14421442+ mutex_unlock(&all_q_mutex);14431443+}14441444+EXPORT_SYMBOL(blk_mq_free_queue);14451445+14461446+/* Basically redo blk_mq_init_queue with queue frozen */14471447+static void __cpuinit blk_mq_queue_reinit(struct request_queue *q)14481448+{14491449+ blk_mq_freeze_queue(q);14501450+14511451+ blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues);14521452+14531453+ /*14541454+ * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe14551455+ * we should change hctx numa_node according to new topology (this14561456+ * involves free and re-allocate memory, worthy doing?)14571457+ */14581458+14591459+ blk_mq_map_swqueue(q);14601460+14611461+ blk_mq_unfreeze_queue(q);14621462+}14631463+14641464+static int __cpuinit blk_mq_queue_reinit_notify(struct notifier_block *nb,14651465+ unsigned long action, void *hcpu)14661466+{14671467+ struct request_queue *q;14681468+14691469+ /*14701470+ * Before new mapping is established, hotadded cpu might already start14711471+ * handling requests. This doesn't break anything as we map offline14721472+ * CPUs to first hardware queue. We will re-init queue below to get14731473+ * optimal settings.14741474+ */14751475+ if (action != CPU_DEAD && action != CPU_DEAD_FROZEN &&14761476+ action != CPU_ONLINE && action != CPU_ONLINE_FROZEN)14771477+ return NOTIFY_OK;14781478+14791479+ mutex_lock(&all_q_mutex);14801480+ list_for_each_entry(q, &all_q_list, all_q_node)14811481+ blk_mq_queue_reinit(q);14821482+ mutex_unlock(&all_q_mutex);14831483+ return NOTIFY_OK;14841484+}14851485+14861486+static int __init blk_mq_init(void)14871487+{14881488+ unsigned int i;14891489+14901490+ for_each_possible_cpu(i)14911491+ init_llist_head(&per_cpu(ipi_lists, i));14921492+14931493+ blk_mq_cpu_init();14941494+14951495+ /* Must be called after percpu_counter_hotcpu_callback() */14961496+ hotcpu_notifier(blk_mq_queue_reinit_notify, -10);14971497+14981498+ return 0;14991499+}15001500+subsys_initcall(blk_mq_init);
+52
block/blk-mq.h
···11+#ifndef INT_BLK_MQ_H22+#define INT_BLK_MQ_H33+44+struct blk_mq_ctx {55+ struct {66+ spinlock_t lock;77+ struct list_head rq_list;88+ } ____cacheline_aligned_in_smp;99+1010+ unsigned int cpu;1111+ unsigned int index_hw;1212+ unsigned int ipi_redirect;1313+1414+ /* incremented at dispatch time */1515+ unsigned long rq_dispatched[2];1616+ unsigned long rq_merged;1717+1818+ /* incremented at completion time */1919+ unsigned long ____cacheline_aligned_in_smp rq_completed[2];2020+2121+ struct request_queue *queue;2222+ struct kobject kobj;2323+};2424+2525+void __blk_mq_end_io(struct request *rq, int error);2626+void blk_mq_complete_request(struct request *rq, int error);2727+void blk_mq_run_request(struct request *rq, bool run_queue, bool async);2828+void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);2929+void blk_mq_init_flush(struct request_queue *q);3030+3131+/*3232+ * CPU hotplug helpers3333+ */3434+struct blk_mq_cpu_notifier;3535+void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier,3636+ void (*fn)(void *, unsigned long, unsigned int),3737+ void *data);3838+void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier);3939+void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier);4040+void blk_mq_cpu_init(void);4141+DECLARE_PER_CPU(struct llist_head, ipi_lists);4242+4343+/*4444+ * CPU -> queue mappings4545+ */4646+struct blk_mq_reg;4747+extern unsigned int *blk_mq_make_queue_map(struct blk_mq_reg *reg);4848+extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues);4949+5050+void blk_mq_add_timer(struct request *rq);5151+5252+#endif
+13
block/blk-sysfs.c
···77#include <linux/bio.h>88#include <linux/blkdev.h>99#include <linux/blktrace_api.h>1010+#include <linux/blk-mq.h>10111112#include "blk.h"1213#include "blk-cgroup.h"···543542 if (q->queue_tags)544543 __blk_queue_free_tags(q);545544545545+ percpu_counter_destroy(&q->mq_usage_counter);546546+547547+ if (q->mq_ops)548548+ blk_mq_free_queue(q);549549+546550 blk_trace_shutdown(q);547551548552 bdi_destroy(&q->backing_dev_info);···581575 * bypass from queue allocation.582576 */583577 blk_queue_bypass_end(q);578578+ queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q);584579585580 ret = blk_trace_init_sysfs(dev);586581 if (ret)···594587 }595588596589 kobject_uevent(&q->kobj, KOBJ_ADD);590590+591591+ if (q->mq_ops)592592+ blk_mq_register_disk(disk);597593598594 if (!q->request_fn)599595 return 0;···619609620610 if (WARN_ON(!q))621611 return;612612+613613+ if (q->mq_ops)614614+ blk_mq_unregister_disk(disk);622615623616 if (q->request_fn)624617 elv_unregister_queue(q);
+47-27
block/blk-timeout.c
···77#include <linux/fault-inject.h>8899#include "blk.h"1010+#include "blk-mq.h"10111112#ifdef CONFIG_FAIL_IO_TIMEOUT1213···8988 ret = q->rq_timed_out_fn(req);9089 switch (ret) {9190 case BLK_EH_HANDLED:9292- __blk_complete_request(req);9191+ /* Can we use req->errors here? */9292+ if (q->mq_ops)9393+ blk_mq_complete_request(req, req->errors);9494+ else9595+ __blk_complete_request(req);9396 break;9497 case BLK_EH_RESET_TIMER:9595- blk_add_timer(req);9898+ if (q->mq_ops)9999+ blk_mq_add_timer(req);100100+ else101101+ blk_add_timer(req);102102+96103 blk_clear_rq_complete(req);97104 break;98105 case BLK_EH_NOT_HANDLED:···117108 }118109}119110111111+void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout,112112+ unsigned int *next_set)113113+{114114+ if (time_after_eq(jiffies, rq->deadline)) {115115+ list_del_init(&rq->timeout_list);116116+117117+ /*118118+ * Check if we raced with end io completion119119+ */120120+ if (!blk_mark_rq_complete(rq))121121+ blk_rq_timed_out(rq);122122+ } else if (!*next_set || time_after(*next_timeout, rq->deadline)) {123123+ *next_timeout = rq->deadline;124124+ *next_set = 1;125125+ }126126+}127127+120128void blk_rq_timed_out_timer(unsigned long data)121129{122130 struct request_queue *q = (struct request_queue *) data;···143117144118 spin_lock_irqsave(q->queue_lock, flags);145119146146- list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list) {147147- if (time_after_eq(jiffies, rq->deadline)) {148148- list_del_init(&rq->timeout_list);149149-150150- /*151151- * Check if we raced with end io completion152152- */153153- if (blk_mark_rq_complete(rq))154154- continue;155155- blk_rq_timed_out(rq);156156- } else if (!next_set || time_after(next, rq->deadline)) {157157- next = rq->deadline;158158- next_set = 1;159159- }160160- }120120+ list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list)121121+ blk_rq_check_expired(rq, &next, &next_set);161122162123 if (next_set)163124 mod_timer(&q->timeout, round_jiffies_up(next));···170157}171158EXPORT_SYMBOL_GPL(blk_abort_request);172159173173-/**174174- * blk_add_timer - Start timeout timer for a single request175175- * @req: request that is about to start running.176176- *177177- * Notes:178178- * Each request has its own timer, and as it is added to the queue, we179179- * set up the timer. When the request completes, we cancel the timer.180180- */181181-void blk_add_timer(struct request *req)160160+void __blk_add_timer(struct request *req, struct list_head *timeout_list)182161{183162 struct request_queue *q = req->q;184163 unsigned long expiry;···188183 req->timeout = q->rq_timeout;189184190185 req->deadline = jiffies + req->timeout;191191- list_add_tail(&req->timeout_list, &q->timeout_list);186186+ if (timeout_list)187187+ list_add_tail(&req->timeout_list, timeout_list);192188193189 /*194190 * If the timer isn't already pending or this timeout is earlier···201195 if (!timer_pending(&q->timeout) ||202196 time_before(expiry, q->timeout.expires))203197 mod_timer(&q->timeout, expiry);198198+199199+}200200+201201+/**202202+ * blk_add_timer - Start timeout timer for a single request203203+ * @req: request that is about to start running.204204+ *205205+ * Notes:206206+ * Each request has its own timer, and as it is added to the queue, we207207+ * set up the timer. When the request completes, we cancel the timer.208208+ */209209+void blk_add_timer(struct request *req)210210+{211211+ __blk_add_timer(req, &req->q->timeout_list);204212}205213
+17
block/blk.h
···1010#define BLK_BATCH_REQ 3211111212extern struct kmem_cache *blk_requestq_cachep;1313+extern struct kmem_cache *request_cachep;1314extern struct kobj_type blk_queue_ktype;1415extern struct ida blk_queue_ida;1516···3534 unsigned int nr_bytes, unsigned int bidi_bytes);36353736void blk_rq_timed_out_timer(unsigned long data);3737+void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout,3838+ unsigned int *next_set);3939+void __blk_add_timer(struct request *req, struct list_head *timeout_list);3840void blk_delete_timer(struct request *);3941void blk_add_timer(struct request *);4242+4343+4444+bool bio_attempt_front_merge(struct request_queue *q, struct request *req,4545+ struct bio *bio);4646+bool bio_attempt_back_merge(struct request_queue *q, struct request *req,4747+ struct bio *bio);4848+bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,4949+ unsigned int *request_count);5050+5151+void blk_account_io_start(struct request *req, bool new_io);5252+void blk_account_io_completion(struct request *req, unsigned int bytes);5353+void blk_account_io_done(struct request *req);40544155/*4256 * Internal atomic flags for request handling4357 */4458enum rq_atomic_flags {4559 REQ_ATOM_COMPLETE = 0,6060+ REQ_ATOM_STARTED,4661};47624863/*
+3
drivers/block/Kconfig
···15151616if BLK_DEV17171818+config BLK_DEV_NULL_BLK1919+ tristate "Null test block driver"2020+1821config BLK_DEV_FD1922 tristate "Normal floppy disk support"2023 depends on ARCH_MAY_HAVE_PC_FDC
···3030#include <linux/spinlock.h>3131#include <linux/percpu_ida.h>32323333-/*3434- * Number of tags we move between the percpu freelist and the global freelist at3535- * a time3636- */3737-#define IDA_PCPU_BATCH_MOVE 32U3838-3939-/* Max size of percpu freelist, */4040-#define IDA_PCPU_SIZE ((IDA_PCPU_BATCH_MOVE * 3) / 2)4141-4233struct percpu_ida_cpu {4334 /*4435 * Even though this is percpu, we need a lock for tag stealing by remote···6978 struct percpu_ida_cpu *remote;70797180 for (cpus_have_tags = cpumask_weight(&pool->cpus_have_tags);7272- cpus_have_tags * IDA_PCPU_SIZE > pool->nr_tags / 2;8181+ cpus_have_tags * pool->percpu_max_size > pool->nr_tags / 2;7382 cpus_have_tags--) {7483 cpu = cpumask_next(cpu, &pool->cpus_have_tags);7584···114123{115124 move_tags(tags->freelist, &tags->nr_free,116125 pool->freelist, &pool->nr_free,117117- min(pool->nr_free, IDA_PCPU_BATCH_MOVE));126126+ min(pool->nr_free, pool->percpu_batch_size));118127}119128120129static inline unsigned alloc_local_tag(struct percpu_ida *pool,···236245 wake_up(&pool->wait);237246 }238247239239- if (nr_free == IDA_PCPU_SIZE) {248248+ if (nr_free == pool->percpu_max_size) {240249 spin_lock(&pool->lock);241250242251 /*243252 * Global lock held and irqs disabled, don't need percpu244253 * lock245254 */246246- if (tags->nr_free == IDA_PCPU_SIZE) {255255+ if (tags->nr_free == pool->percpu_max_size) {247256 move_tags(pool->freelist, &pool->nr_free,248257 tags->freelist, &tags->nr_free,249249- IDA_PCPU_BATCH_MOVE);258258+ pool->percpu_batch_size);250259251260 wake_up(&pool->wait);252261 }···283292 * Allocation is percpu, but sharding is limited by nr_tags - for best284293 * performance, the workload should not span more cpus than nr_tags / 128.285294 */286286-int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags)295295+int __percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags,296296+ unsigned long max_size, unsigned long batch_size)287297{288298 unsigned i, cpu, order;289299···293301 init_waitqueue_head(&pool->wait);294302 spin_lock_init(&pool->lock);295303 pool->nr_tags = nr_tags;304304+ pool->percpu_max_size = max_size;305305+ pool->percpu_batch_size = batch_size;296306297307 /* Guard against overflow */298308 if (nr_tags > (unsigned) INT_MAX + 1) {···313319 pool->nr_free = nr_tags;314320315321 pool->tag_cpu = __alloc_percpu(sizeof(struct percpu_ida_cpu) +316316- IDA_PCPU_SIZE * sizeof(unsigned),322322+ pool->percpu_max_size * sizeof(unsigned),317323 sizeof(unsigned));318324 if (!pool->tag_cpu)319325 goto err;···326332 percpu_ida_destroy(pool);327333 return -ENOMEM;328334}329329-EXPORT_SYMBOL_GPL(percpu_ida_init);335335+EXPORT_SYMBOL_GPL(__percpu_ida_init);336336+337337+/**338338+ * percpu_ida_for_each_free - iterate free ids of a pool339339+ * @pool: pool to iterate340340+ * @fn: interate callback function341341+ * @data: parameter for @fn342342+ *343343+ * Note, this doesn't guarantee to iterate all free ids restrictly. Some free344344+ * ids might be missed, some might be iterated duplicated, and some might345345+ * be iterated and not free soon.346346+ */347347+int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn,348348+ void *data)349349+{350350+ unsigned long flags;351351+ struct percpu_ida_cpu *remote;352352+ unsigned cpu, i, err = 0;353353+354354+ local_irq_save(flags);355355+ for_each_possible_cpu(cpu) {356356+ remote = per_cpu_ptr(pool->tag_cpu, cpu);357357+ spin_lock(&remote->lock);358358+ for (i = 0; i < remote->nr_free; i++) {359359+ err = fn(remote->freelist[i], data);360360+ if (err)361361+ break;362362+ }363363+ spin_unlock(&remote->lock);364364+ if (err)365365+ goto out;366366+ }367367+368368+ spin_lock(&pool->lock);369369+ for (i = 0; i < pool->nr_free; i++) {370370+ err = fn(pool->freelist[i], data);371371+ if (err)372372+ break;373373+ }374374+ spin_unlock(&pool->lock);375375+out:376376+ local_irq_restore(flags);377377+ return err;378378+}379379+EXPORT_SYMBOL_GPL(percpu_ida_for_each_free);380380+381381+/**382382+ * percpu_ida_free_tags - return free tags number of a specific cpu or global pool383383+ * @pool: pool related384384+ * @cpu: specific cpu or global pool if @cpu == nr_cpu_ids385385+ *386386+ * Note: this just returns a snapshot of free tags number.387387+ */388388+unsigned percpu_ida_free_tags(struct percpu_ida *pool, int cpu)389389+{390390+ struct percpu_ida_cpu *remote;391391+ if (cpu == nr_cpu_ids)392392+ return pool->nr_free;393393+ remote = per_cpu_ptr(pool->tag_cpu, cpu);394394+ return remote->nr_free;395395+}396396+EXPORT_SYMBOL_GPL(percpu_ida_free_tags);