
blk-mq: new multi-queue block IO queueing mechanism

Linux currently has two models for block devices:

- The classic request_fn based approach, where drivers use struct
request units for IO. The block layer provides various helper
functionalities to let drivers share code, things like tag
management, timeout handling, queueing, etc.

- The "stacked" approach, where a driver squeezes in between the
block layer and IO submitter. Since this bypasses the IO stack,
driver generally have to manage everything themselves.

With drivers being written for new high IOPS devices, the classic
request_fn based driver doesn't work well enough. The design dates
back to when both SMP and high IOPS were rare. It has problems with
scaling to bigger machines, and runs into scaling issues even on
smaller machines when you have IOPS in the hundreds of thousands
per device.

The stacked approach is then most often selected as the model
for the driver. But this means that everybody has to re-invent
everything, and with that we get back all the problems that the
shared approach had solved.

This commit introduces blk-mq, block multi-queue support. The
design is centered around per-cpu queues for queueing IO, which
then funnel down into some number of hardware submission queues.
We might have a 1:1 mapping between the two, or it might be
an N:M mapping. That all depends on what the hardware supports.
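
For the common case where thread siblings need no special handling,
the per-cpu software queues are simply spread evenly over the
hardware queues. The helper below is lifted from the
block/blk-mq-cpumap.c hunk further down, with a worked example added
in the comment:

        /* From block/blk-mq-cpumap.c: which hardware queue serves a given CPU. */
        static int cpu_to_queue_index(unsigned int nr_cpus, unsigned int nr_queues,
                                      const int cpu)
        {
                /*
                 * Divide the CPUs into nr_queues contiguous groups. E.g. with
                 * nr_cpus = 8 and nr_queues = 2: CPUs 0-3 -> queue 0, CPUs 4-7
                 * -> queue 1. With nr_queues >= nr_cpus this degenerates to a
                 * 1:1 mapping.
                 */
                return cpu / ((nr_cpus + nr_queues - 1) / nr_queues);
        }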

blk-mq provides various helper functions, which include:

- Scalable support for request tagging. Most devices need to
be able to uniquely identify a request both in the driver and
to the hardware. The tagging uses per-cpu caches for freed
tags, to enable cache hot reuse.

- Timeout handling without tracking requests on a per-device
basis. Basically, the driver should be able to get a notification
if a request happens to fail.

- Optional support for non 1:1 mappings between issue and
submission queues. blk-mq can redirect IO completions to the
desired location.

- Support for per-request payloads. Drivers almost always need
to associate a request structure with some driver-private
command structure. Drivers can tell blk-mq this at init time,
and then any request handed to the driver will have the
required size of memory associated with it (a rough driver-side
sketch follows this list).

- Support for merging of IO, and plugging. The stacked model
gets neither of these. Even for high IOPS devices, merging
sequential IO reduces per-command overhead and thus
increases bandwidth.
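
As a rough driver-side sketch of how the per-request payload and
tagging come together in a ->queue_rq() handler (return codes and
helpers as introduced by this patch; the command layout and the
my_hw_issue() doorbell stub are purely illustrative):

        #include <linux/blkdev.h>
        #include <linux/blk-mq.h>

        /*
         * Illustrative driver-private command; its size is what the driver
         * passes to blk-mq as the per-request cmd_size at init time.
         */
        struct my_cmd {
                u32 hw_tag;
                u64 lba;
        };

        /*
         * Placeholder for ringing the hardware doorbell; returns false if
         * the device cannot take more work right now.
         */
        static bool my_hw_issue(struct my_cmd *cmd, struct request *rq)
        {
                return true;
        }

        static int my_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
        {
                /* blk-mq reserved cmd_size bytes of per-request memory for us */
                struct my_cmd *cmd = blk_mq_rq_to_pdu(rq);

                cmd->hw_tag = rq->tag;          /* tag handed out by blk-mq's allocator */
                cmd->lba = blk_rq_pos(rq);

                if (!my_hw_issue(cmd, rq))
                        return BLK_MQ_RQ_QUEUE_BUSY;    /* blk-mq requeues and retries */

                /* on completion the driver later calls blk_mq_end_io(rq, rq->errors) */
                return BLK_MQ_RQ_QUEUE_OK;
        }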

For now, this is provided as a potential 3rd queueing model, with
the hope being that, as it matures, it can replace both the classic
and stacked models. That would get us back to having just one real
model for block devices, leaving the stacked approach to dm/md
devices (as it was originally intended).
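
Because the classic entry points now branch on q->mq_ops (see the
blk-core.c and blk-exec.c hunks below), existing submitters keep
working unchanged on either model. A hedged sketch of such a caller;
the function name and the setup details are illustrative only:

        #include <linux/blkdev.h>
        #include <linux/blk-mq.h>

        /*
         * Allocate, execute and release a request. On a blk-mq queue,
         * blk_get_request()/blk_put_request() route to blk_mq_alloc_request()/
         * blk_mq_free_request(), and blk_execute_rq() (via its _nowait variant)
         * inserts the request with blk_mq_insert_request() instead of the
         * legacy queue_head list.
         */
        static int my_issue_simple(struct request_queue *q, struct gendisk *disk)
        {
                struct request *rq;
                int err;

                rq = blk_get_request(q, READ, GFP_KERNEL);
                if (!rq)
                        return -ENOMEM;

                /* ... set up cmd_type, data and timeout as the driver requires ... */

                err = blk_execute_rq(q, disk, rq, 0);   /* waits for completion */

                blk_put_request(rq);
                return err;
        }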

Contributions in this patch from the following people:

Shaohua Li <shli@fusionio.com>
Alexander Gordeev <agordeev@redhat.com>
Christoph Hellwig <hch@infradead.org>
Mike Christie <michaelc@cs.wisc.edu>
Matias Bjorling <m@bjorling.me>
Jeff Moyer <jmoyer@redhat.com>

Acked-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>

+2890 -109
+3 -2
block/Makefile
··· 5 5 obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ 6 6 blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ 7 7 blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ 8 - blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o \ 9 - partition-generic.o partitions/ 8 + blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \ 9 + blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \ 10 + genhd.o scsi_ioctl.o partition-generic.o partitions/ 10 11 11 12 obj-$(CONFIG_BLK_DEV_BSG) += bsg.o 12 13 obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o
+84 -58
block/blk-core.c
··· 16 16 #include <linux/backing-dev.h> 17 17 #include <linux/bio.h> 18 18 #include <linux/blkdev.h> 19 + #include <linux/blk-mq.h> 19 20 #include <linux/highmem.h> 20 21 #include <linux/mm.h> 21 22 #include <linux/kernel_stat.h> ··· 49 48 /* 50 49 * For the allocated request tables 51 50 */ 52 - static struct kmem_cache *request_cachep; 51 + struct kmem_cache *request_cachep = NULL; 53 52 54 53 /* 55 54 * For queue allocation ··· 60 59 * Controlling structure to kblockd 61 60 */ 62 61 static struct workqueue_struct *kblockd_workqueue; 63 - 64 - static void drive_stat_acct(struct request *rq, int new_io) 65 - { 66 - struct hd_struct *part; 67 - int rw = rq_data_dir(rq); 68 - int cpu; 69 - 70 - if (!blk_do_io_stat(rq)) 71 - return; 72 - 73 - cpu = part_stat_lock(); 74 - 75 - if (!new_io) { 76 - part = rq->part; 77 - part_stat_inc(cpu, part, merges[rw]); 78 - } else { 79 - part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); 80 - if (!hd_struct_try_get(part)) { 81 - /* 82 - * The partition is already being removed, 83 - * the request will be accounted on the disk only 84 - * 85 - * We take a reference on disk->part0 although that 86 - * partition will never be deleted, so we can treat 87 - * it as any other partition. 88 - */ 89 - part = &rq->rq_disk->part0; 90 - hd_struct_get(part); 91 - } 92 - part_round_stats(cpu, part); 93 - part_inc_in_flight(part, rw); 94 - rq->part = part; 95 - } 96 - 97 - part_stat_unlock(); 98 - } 99 62 100 63 void blk_queue_congestion_threshold(struct request_queue *q) 101 64 { ··· 559 594 if (!q) 560 595 return NULL; 561 596 597 + if (percpu_counter_init(&q->mq_usage_counter, 0)) 598 + goto fail_q; 599 + 562 600 q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask); 563 601 if (q->id < 0) 564 - goto fail_q; 602 + goto fail_c; 565 603 566 604 q->backing_dev_info.ra_pages = 567 605 (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; ··· 611 643 q->bypass_depth = 1; 612 644 __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags); 613 645 646 + init_waitqueue_head(&q->mq_freeze_wq); 647 + 614 648 if (blkcg_init_queue(q)) 615 649 goto fail_id; 616 650 ··· 620 650 621 651 fail_id: 622 652 ida_simple_remove(&blk_queue_ida, q->id); 653 + fail_c: 654 + percpu_counter_destroy(&q->mq_usage_counter); 623 655 fail_q: 624 656 kmem_cache_free(blk_requestq_cachep, q); 625 657 return NULL; ··· 1080 1108 goto retry; 1081 1109 } 1082 1110 1083 - struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) 1111 + static struct request *blk_old_get_request(struct request_queue *q, int rw, 1112 + gfp_t gfp_mask) 1084 1113 { 1085 1114 struct request *rq; 1086 1115 ··· 1097 1124 /* q->queue_lock is unlocked at this point */ 1098 1125 1099 1126 return rq; 1127 + } 1128 + 1129 + struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) 1130 + { 1131 + if (q->mq_ops) 1132 + return blk_mq_alloc_request(q, rw, gfp_mask); 1133 + else 1134 + return blk_old_get_request(q, rw, gfp_mask); 1100 1135 } 1101 1136 EXPORT_SYMBOL(blk_get_request); 1102 1137 ··· 1191 1210 static void add_acct_request(struct request_queue *q, struct request *rq, 1192 1211 int where) 1193 1212 { 1194 - drive_stat_acct(rq, 1); 1213 + blk_account_io_start(rq, true); 1195 1214 __elv_add_request(q, rq, where); 1196 1215 } 1197 1216 ··· 1280 1299 1281 1300 void blk_put_request(struct request *req) 1282 1301 { 1283 - unsigned long flags; 1284 1302 struct request_queue *q = req->q; 1285 1303 1286 - spin_lock_irqsave(q->queue_lock, flags); 1287 - __blk_put_request(q, req); 1288 - 
spin_unlock_irqrestore(q->queue_lock, flags); 1304 + if (q->mq_ops) 1305 + blk_mq_free_request(req); 1306 + else { 1307 + unsigned long flags; 1308 + 1309 + spin_lock_irqsave(q->queue_lock, flags); 1310 + __blk_put_request(q, req); 1311 + spin_unlock_irqrestore(q->queue_lock, flags); 1312 + } 1289 1313 } 1290 1314 EXPORT_SYMBOL(blk_put_request); 1291 1315 ··· 1326 1340 } 1327 1341 EXPORT_SYMBOL_GPL(blk_add_request_payload); 1328 1342 1329 - static bool bio_attempt_back_merge(struct request_queue *q, struct request *req, 1330 - struct bio *bio) 1343 + bool bio_attempt_back_merge(struct request_queue *q, struct request *req, 1344 + struct bio *bio) 1331 1345 { 1332 1346 const int ff = bio->bi_rw & REQ_FAILFAST_MASK; 1333 1347 ··· 1344 1358 req->__data_len += bio->bi_size; 1345 1359 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); 1346 1360 1347 - drive_stat_acct(req, 0); 1361 + blk_account_io_start(req, false); 1348 1362 return true; 1349 1363 } 1350 1364 1351 - static bool bio_attempt_front_merge(struct request_queue *q, 1352 - struct request *req, struct bio *bio) 1365 + bool bio_attempt_front_merge(struct request_queue *q, struct request *req, 1366 + struct bio *bio) 1353 1367 { 1354 1368 const int ff = bio->bi_rw & REQ_FAILFAST_MASK; 1355 1369 ··· 1374 1388 req->__data_len += bio->bi_size; 1375 1389 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); 1376 1390 1377 - drive_stat_acct(req, 0); 1391 + blk_account_io_start(req, false); 1378 1392 return true; 1379 1393 } 1380 1394 1381 1395 /** 1382 - * attempt_plug_merge - try to merge with %current's plugged list 1396 + * blk_attempt_plug_merge - try to merge with %current's plugged list 1383 1397 * @q: request_queue new bio is being queued at 1384 1398 * @bio: new bio being queued 1385 1399 * @request_count: out parameter for number of traversed plugged requests ··· 1395 1409 * reliable access to the elevator outside queue lock. Only check basic 1396 1410 * merging parameters without querying the elevator. 1397 1411 */ 1398 - static bool attempt_plug_merge(struct request_queue *q, struct bio *bio, 1399 - unsigned int *request_count) 1412 + bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, 1413 + unsigned int *request_count) 1400 1414 { 1401 1415 struct blk_plug *plug; 1402 1416 struct request *rq; ··· 1475 1489 * Check if we can merge with the plugged list before grabbing 1476 1490 * any locks. 1477 1491 */ 1478 - if (attempt_plug_merge(q, bio, &request_count)) 1492 + if (blk_attempt_plug_merge(q, bio, &request_count)) 1479 1493 return; 1480 1494 1481 1495 spin_lock_irq(q->queue_lock); ··· 1543 1557 } 1544 1558 } 1545 1559 list_add_tail(&req->queuelist, &plug->list); 1546 - drive_stat_acct(req, 1); 1560 + blk_account_io_start(req, true); 1547 1561 } else { 1548 1562 spin_lock_irq(q->queue_lock); 1549 1563 add_acct_request(q, req, where); ··· 1997 2011 } 1998 2012 EXPORT_SYMBOL_GPL(blk_rq_err_bytes); 1999 2013 2000 - static void blk_account_io_completion(struct request *req, unsigned int bytes) 2014 + void blk_account_io_completion(struct request *req, unsigned int bytes) 2001 2015 { 2002 2016 if (blk_do_io_stat(req)) { 2003 2017 const int rw = rq_data_dir(req); ··· 2011 2025 } 2012 2026 } 2013 2027 2014 - static void blk_account_io_done(struct request *req) 2028 + void blk_account_io_done(struct request *req) 2015 2029 { 2016 2030 /* 2017 2031 * Account IO completion. 
flush_rq isn't accounted as a ··· 2058 2072 return rq; 2059 2073 } 2060 2074 #endif 2075 + 2076 + void blk_account_io_start(struct request *rq, bool new_io) 2077 + { 2078 + struct hd_struct *part; 2079 + int rw = rq_data_dir(rq); 2080 + int cpu; 2081 + 2082 + if (!blk_do_io_stat(rq)) 2083 + return; 2084 + 2085 + cpu = part_stat_lock(); 2086 + 2087 + if (!new_io) { 2088 + part = rq->part; 2089 + part_stat_inc(cpu, part, merges[rw]); 2090 + } else { 2091 + part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); 2092 + if (!hd_struct_try_get(part)) { 2093 + /* 2094 + * The partition is already being removed, 2095 + * the request will be accounted on the disk only 2096 + * 2097 + * We take a reference on disk->part0 although that 2098 + * partition will never be deleted, so we can treat 2099 + * it as any other partition. 2100 + */ 2101 + part = &rq->rq_disk->part0; 2102 + hd_struct_get(part); 2103 + } 2104 + part_round_stats(cpu, part); 2105 + part_inc_in_flight(part, rw); 2106 + rq->part = part; 2107 + } 2108 + 2109 + part_stat_unlock(); 2110 + } 2061 2111 2062 2112 /** 2063 2113 * blk_peek_request - peek at the top of a request queue ··· 2469 2447 2470 2448 if (req->cmd_flags & REQ_DONTPREP) 2471 2449 blk_unprep_request(req); 2472 - 2473 2450 2474 2451 blk_account_io_done(req); 2475 2452 ··· 2891 2870 2892 2871 plug->magic = PLUG_MAGIC; 2893 2872 INIT_LIST_HEAD(&plug->list); 2873 + INIT_LIST_HEAD(&plug->mq_list); 2894 2874 INIT_LIST_HEAD(&plug->cb_list); 2895 2875 2896 2876 /* ··· 2989 2967 BUG_ON(plug->magic != PLUG_MAGIC); 2990 2968 2991 2969 flush_plug_callbacks(plug, from_schedule); 2970 + 2971 + if (!list_empty(&plug->mq_list)) 2972 + blk_mq_flush_plug_list(plug, from_schedule); 2973 + 2992 2974 if (list_empty(&plug->list)) 2993 2975 return; 2994 2976
+7
block/blk-exec.c
··· 5 5 #include <linux/module.h> 6 6 #include <linux/bio.h> 7 7 #include <linux/blkdev.h> 8 + #include <linux/blk-mq.h> 8 9 #include <linux/sched/sysctl.h> 9 10 10 11 #include "blk.h" ··· 59 58 60 59 rq->rq_disk = bd_disk; 61 60 rq->end_io = done; 61 + 62 + if (q->mq_ops) { 63 + blk_mq_insert_request(q, rq, true); 64 + return; 65 + } 66 + 62 67 /* 63 68 * need to check this before __blk_run_queue(), because rq can 64 69 * be freed before that returns.
+139 -15
block/blk-flush.c
··· 69 69 #include <linux/bio.h> 70 70 #include <linux/blkdev.h> 71 71 #include <linux/gfp.h> 72 + #include <linux/blk-mq.h> 72 73 73 74 #include "blk.h" 75 + #include "blk-mq.h" 74 76 75 77 /* FLUSH/FUA sequences */ 76 78 enum { ··· 126 124 /* make @rq a normal request */ 127 125 rq->cmd_flags &= ~REQ_FLUSH_SEQ; 128 126 rq->end_io = rq->flush.saved_end_io; 127 + 128 + blk_clear_rq_complete(rq); 129 + } 130 + 131 + static void mq_flush_data_run(struct work_struct *work) 132 + { 133 + struct request *rq; 134 + 135 + rq = container_of(work, struct request, mq_flush_data); 136 + 137 + memset(&rq->csd, 0, sizeof(rq->csd)); 138 + blk_mq_run_request(rq, true, false); 139 + } 140 + 141 + static void blk_mq_flush_data_insert(struct request *rq) 142 + { 143 + INIT_WORK(&rq->mq_flush_data, mq_flush_data_run); 144 + kblockd_schedule_work(rq->q, &rq->mq_flush_data); 129 145 } 130 146 131 147 /** ··· 156 136 * completion and trigger the next step. 157 137 * 158 138 * CONTEXT: 159 - * spin_lock_irq(q->queue_lock) 139 + * spin_lock_irq(q->queue_lock or q->mq_flush_lock) 160 140 * 161 141 * RETURNS: 162 142 * %true if requests were added to the dispatch queue, %false otherwise. ··· 166 146 { 167 147 struct request_queue *q = rq->q; 168 148 struct list_head *pending = &q->flush_queue[q->flush_pending_idx]; 169 - bool queued = false; 149 + bool queued = false, kicked; 170 150 171 151 BUG_ON(rq->flush.seq & seq); 172 152 rq->flush.seq |= seq; ··· 187 167 188 168 case REQ_FSEQ_DATA: 189 169 list_move_tail(&rq->flush.list, &q->flush_data_in_flight); 190 - list_add(&rq->queuelist, &q->queue_head); 191 - queued = true; 170 + if (q->mq_ops) 171 + blk_mq_flush_data_insert(rq); 172 + else { 173 + list_add(&rq->queuelist, &q->queue_head); 174 + queued = true; 175 + } 192 176 break; 193 177 194 178 case REQ_FSEQ_DONE: ··· 205 181 BUG_ON(!list_empty(&rq->queuelist)); 206 182 list_del_init(&rq->flush.list); 207 183 blk_flush_restore_request(rq); 208 - __blk_end_request_all(rq, error); 184 + if (q->mq_ops) 185 + blk_mq_end_io(rq, error); 186 + else 187 + __blk_end_request_all(rq, error); 209 188 break; 210 189 211 190 default: 212 191 BUG(); 213 192 } 214 193 215 - return blk_kick_flush(q) | queued; 194 + kicked = blk_kick_flush(q); 195 + /* blk_mq_run_flush will run queue */ 196 + if (q->mq_ops) 197 + return queued; 198 + return kicked | queued; 216 199 } 217 200 218 201 static void flush_end_io(struct request *flush_rq, int error) 219 202 { 220 203 struct request_queue *q = flush_rq->q; 221 - struct list_head *running = &q->flush_queue[q->flush_running_idx]; 204 + struct list_head *running; 222 205 bool queued = false; 223 206 struct request *rq, *n; 207 + unsigned long flags = 0; 224 208 209 + if (q->mq_ops) { 210 + blk_mq_free_request(flush_rq); 211 + spin_lock_irqsave(&q->mq_flush_lock, flags); 212 + } 213 + running = &q->flush_queue[q->flush_running_idx]; 225 214 BUG_ON(q->flush_pending_idx == q->flush_running_idx); 226 215 227 216 /* account completion of the flush request */ 228 217 q->flush_running_idx ^= 1; 229 - elv_completed_request(q, flush_rq); 218 + 219 + if (!q->mq_ops) 220 + elv_completed_request(q, flush_rq); 230 221 231 222 /* and push the waiting requests to the next stage */ 232 223 list_for_each_entry_safe(rq, n, running, flush.list) { ··· 262 223 * directly into request_fn may confuse the driver. Always use 263 224 * kblockd. 
264 225 */ 265 - if (queued || q->flush_queue_delayed) 266 - blk_run_queue_async(q); 226 + if (queued || q->flush_queue_delayed) { 227 + if (!q->mq_ops) 228 + blk_run_queue_async(q); 229 + else 230 + /* 231 + * This can be optimized to only run queues with requests 232 + * queued if necessary. 233 + */ 234 + blk_mq_run_queues(q, true); 235 + } 267 236 q->flush_queue_delayed = 0; 237 + if (q->mq_ops) 238 + spin_unlock_irqrestore(&q->mq_flush_lock, flags); 239 + } 240 + 241 + static void mq_flush_work(struct work_struct *work) 242 + { 243 + struct request_queue *q; 244 + struct request *rq; 245 + 246 + q = container_of(work, struct request_queue, mq_flush_work); 247 + 248 + /* We don't need set REQ_FLUSH_SEQ, it's for consistency */ 249 + rq = blk_mq_alloc_request(q, WRITE_FLUSH|REQ_FLUSH_SEQ, 250 + __GFP_WAIT|GFP_ATOMIC); 251 + rq->cmd_type = REQ_TYPE_FS; 252 + rq->end_io = flush_end_io; 253 + 254 + blk_mq_run_request(rq, true, false); 255 + } 256 + 257 + /* 258 + * We can't directly use q->flush_rq, because it doesn't have tag and is not in 259 + * hctx->rqs[]. so we must allocate a new request, since we can't sleep here, 260 + * so offload the work to workqueue. 261 + * 262 + * Note: we assume a flush request finished in any hardware queue will flush 263 + * the whole disk cache. 264 + */ 265 + static void mq_run_flush(struct request_queue *q) 266 + { 267 + kblockd_schedule_work(q, &q->mq_flush_work); 268 268 } 269 269 270 270 /** ··· 314 236 * Please read the comment at the top of this file for more info. 315 237 * 316 238 * CONTEXT: 317 - * spin_lock_irq(q->queue_lock) 239 + * spin_lock_irq(q->queue_lock or q->mq_flush_lock) 318 240 * 319 241 * RETURNS: 320 242 * %true if flush was issued, %false otherwise. ··· 339 261 * Issue flush and toggle pending_idx. This makes pending_idx 340 262 * different from running_idx, which means flush is in flight. 341 263 */ 264 + q->flush_pending_idx ^= 1; 265 + if (q->mq_ops) { 266 + mq_run_flush(q); 267 + return true; 268 + } 269 + 342 270 blk_rq_init(q, &q->flush_rq); 343 271 q->flush_rq.cmd_type = REQ_TYPE_FS; 344 272 q->flush_rq.cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ; 345 273 q->flush_rq.rq_disk = first_rq->rq_disk; 346 274 q->flush_rq.end_io = flush_end_io; 347 275 348 - q->flush_pending_idx ^= 1; 349 276 list_add_tail(&q->flush_rq.queuelist, &q->queue_head); 350 277 return true; 351 278 } ··· 367 284 blk_run_queue_async(q); 368 285 } 369 286 287 + static void mq_flush_data_end_io(struct request *rq, int error) 288 + { 289 + struct request_queue *q = rq->q; 290 + struct blk_mq_hw_ctx *hctx; 291 + struct blk_mq_ctx *ctx; 292 + unsigned long flags; 293 + 294 + ctx = rq->mq_ctx; 295 + hctx = q->mq_ops->map_queue(q, ctx->cpu); 296 + 297 + /* 298 + * After populating an empty queue, kick it to avoid stall. Read 299 + * the comment in flush_end_io(). 300 + */ 301 + spin_lock_irqsave(&q->mq_flush_lock, flags); 302 + if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error)) 303 + blk_mq_run_hw_queue(hctx, true); 304 + spin_unlock_irqrestore(&q->mq_flush_lock, flags); 305 + } 306 + 370 307 /** 371 308 * blk_insert_flush - insert a new FLUSH/FUA request 372 309 * @rq: request to insert 373 310 * 374 311 * To be called from __elv_add_request() for %ELEVATOR_INSERT_FLUSH insertions. 312 + * or __blk_mq_run_hw_queue() to dispatch request. 375 313 * @rq is being submitted. Analyze what needs to be done and put it on the 376 314 * right queue. 
377 315 * 378 316 * CONTEXT: 379 - * spin_lock_irq(q->queue_lock) 317 + * spin_lock_irq(q->queue_lock) in !mq case 380 318 */ 381 319 void blk_insert_flush(struct request *rq) 382 320 { ··· 420 316 * complete the request. 421 317 */ 422 318 if (!policy) { 423 - __blk_end_bidi_request(rq, 0, 0, 0); 319 + if (q->mq_ops) 320 + blk_mq_end_io(rq, 0); 321 + else 322 + __blk_end_bidi_request(rq, 0, 0, 0); 424 323 return; 425 324 } 426 325 ··· 436 329 */ 437 330 if ((policy & REQ_FSEQ_DATA) && 438 331 !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { 439 - list_add_tail(&rq->queuelist, &q->queue_head); 332 + if (q->mq_ops) { 333 + blk_mq_run_request(rq, false, true); 334 + } else 335 + list_add_tail(&rq->queuelist, &q->queue_head); 440 336 return; 441 337 } 442 338 ··· 451 341 INIT_LIST_HEAD(&rq->flush.list); 452 342 rq->cmd_flags |= REQ_FLUSH_SEQ; 453 343 rq->flush.saved_end_io = rq->end_io; /* Usually NULL */ 344 + if (q->mq_ops) { 345 + rq->end_io = mq_flush_data_end_io; 346 + 347 + spin_lock_irq(&q->mq_flush_lock); 348 + blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0); 349 + spin_unlock_irq(&q->mq_flush_lock); 350 + return; 351 + } 454 352 rq->end_io = flush_data_end_io; 455 353 456 354 blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0); ··· 571 453 return ret; 572 454 } 573 455 EXPORT_SYMBOL(blkdev_issue_flush); 456 + 457 + void blk_mq_init_flush(struct request_queue *q) 458 + { 459 + spin_lock_init(&q->mq_flush_lock); 460 + INIT_WORK(&q->mq_flush_work, mq_flush_work); 461 + }
+93
block/blk-mq-cpu.c
··· 1 + #include <linux/kernel.h> 2 + #include <linux/module.h> 3 + #include <linux/init.h> 4 + #include <linux/blkdev.h> 5 + #include <linux/list.h> 6 + #include <linux/llist.h> 7 + #include <linux/smp.h> 8 + #include <linux/cpu.h> 9 + 10 + #include <linux/blk-mq.h> 11 + #include "blk-mq.h" 12 + 13 + static LIST_HEAD(blk_mq_cpu_notify_list); 14 + static DEFINE_SPINLOCK(blk_mq_cpu_notify_lock); 15 + 16 + static int __cpuinit blk_mq_main_cpu_notify(struct notifier_block *self, 17 + unsigned long action, void *hcpu) 18 + { 19 + unsigned int cpu = (unsigned long) hcpu; 20 + struct blk_mq_cpu_notifier *notify; 21 + 22 + spin_lock(&blk_mq_cpu_notify_lock); 23 + 24 + list_for_each_entry(notify, &blk_mq_cpu_notify_list, list) 25 + notify->notify(notify->data, action, cpu); 26 + 27 + spin_unlock(&blk_mq_cpu_notify_lock); 28 + return NOTIFY_OK; 29 + } 30 + 31 + static void __cpuinit blk_mq_cpu_notify(void *data, unsigned long action, 32 + unsigned int cpu) 33 + { 34 + if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { 35 + /* 36 + * If the CPU goes away, ensure that we run any pending 37 + * completions. 38 + */ 39 + struct llist_node *node; 40 + struct request *rq; 41 + 42 + local_irq_disable(); 43 + 44 + node = llist_del_all(&per_cpu(ipi_lists, cpu)); 45 + while (node) { 46 + struct llist_node *next = node->next; 47 + 48 + rq = llist_entry(node, struct request, ll_list); 49 + __blk_mq_end_io(rq, rq->errors); 50 + node = next; 51 + } 52 + 53 + local_irq_enable(); 54 + } 55 + } 56 + 57 + static struct notifier_block __cpuinitdata blk_mq_main_cpu_notifier = { 58 + .notifier_call = blk_mq_main_cpu_notify, 59 + }; 60 + 61 + void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier) 62 + { 63 + BUG_ON(!notifier->notify); 64 + 65 + spin_lock(&blk_mq_cpu_notify_lock); 66 + list_add_tail(&notifier->list, &blk_mq_cpu_notify_list); 67 + spin_unlock(&blk_mq_cpu_notify_lock); 68 + } 69 + 70 + void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier) 71 + { 72 + spin_lock(&blk_mq_cpu_notify_lock); 73 + list_del(&notifier->list); 74 + spin_unlock(&blk_mq_cpu_notify_lock); 75 + } 76 + 77 + void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier, 78 + void (*fn)(void *, unsigned long, unsigned int), 79 + void *data) 80 + { 81 + notifier->notify = fn; 82 + notifier->data = data; 83 + } 84 + 85 + static struct blk_mq_cpu_notifier __cpuinitdata cpu_notifier = { 86 + .notify = blk_mq_cpu_notify, 87 + }; 88 + 89 + void __init blk_mq_cpu_init(void) 90 + { 91 + register_hotcpu_notifier(&blk_mq_main_cpu_notifier); 92 + blk_mq_register_cpu_notifier(&cpu_notifier); 93 + }
+108
block/blk-mq-cpumap.c
··· 1 + #include <linux/kernel.h> 2 + #include <linux/threads.h> 3 + #include <linux/module.h> 4 + #include <linux/mm.h> 5 + #include <linux/smp.h> 6 + #include <linux/cpu.h> 7 + 8 + #include <linux/blk-mq.h> 9 + #include "blk.h" 10 + #include "blk-mq.h" 11 + 12 + static void show_map(unsigned int *map, unsigned int nr) 13 + { 14 + int i; 15 + 16 + pr_info("blk-mq: CPU -> queue map\n"); 17 + for_each_online_cpu(i) 18 + pr_info(" CPU%2u -> Queue %u\n", i, map[i]); 19 + } 20 + 21 + static int cpu_to_queue_index(unsigned int nr_cpus, unsigned int nr_queues, 22 + const int cpu) 23 + { 24 + return cpu / ((nr_cpus + nr_queues - 1) / nr_queues); 25 + } 26 + 27 + static int get_first_sibling(unsigned int cpu) 28 + { 29 + unsigned int ret; 30 + 31 + ret = cpumask_first(topology_thread_cpumask(cpu)); 32 + if (ret < nr_cpu_ids) 33 + return ret; 34 + 35 + return cpu; 36 + } 37 + 38 + int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues) 39 + { 40 + unsigned int i, nr_cpus, nr_uniq_cpus, queue, first_sibling; 41 + cpumask_var_t cpus; 42 + 43 + if (!alloc_cpumask_var(&cpus, GFP_ATOMIC)) 44 + return 1; 45 + 46 + cpumask_clear(cpus); 47 + nr_cpus = nr_uniq_cpus = 0; 48 + for_each_online_cpu(i) { 49 + nr_cpus++; 50 + first_sibling = get_first_sibling(i); 51 + if (!cpumask_test_cpu(first_sibling, cpus)) 52 + nr_uniq_cpus++; 53 + cpumask_set_cpu(i, cpus); 54 + } 55 + 56 + queue = 0; 57 + for_each_possible_cpu(i) { 58 + if (!cpu_online(i)) { 59 + map[i] = 0; 60 + continue; 61 + } 62 + 63 + /* 64 + * Easy case - we have equal or more hardware queues. Or 65 + * there are no thread siblings to take into account. Do 66 + * 1:1 if enough, or sequential mapping if less. 67 + */ 68 + if (nr_queues >= nr_cpus || nr_cpus == nr_uniq_cpus) { 69 + map[i] = cpu_to_queue_index(nr_cpus, nr_queues, queue); 70 + queue++; 71 + continue; 72 + } 73 + 74 + /* 75 + * Less then nr_cpus queues, and we have some number of 76 + * threads per cores. Map sibling threads to the same 77 + * queue. 78 + */ 79 + first_sibling = get_first_sibling(i); 80 + if (first_sibling == i) { 81 + map[i] = cpu_to_queue_index(nr_uniq_cpus, nr_queues, 82 + queue); 83 + queue++; 84 + } else 85 + map[i] = map[first_sibling]; 86 + } 87 + 88 + show_map(map, nr_cpus); 89 + free_cpumask_var(cpus); 90 + return 0; 91 + } 92 + 93 + unsigned int *blk_mq_make_queue_map(struct blk_mq_reg *reg) 94 + { 95 + unsigned int *map; 96 + 97 + /* If cpus are offline, map them to first hctx */ 98 + map = kzalloc_node(sizeof(*map) * num_possible_cpus(), GFP_KERNEL, 99 + reg->numa_node); 100 + if (!map) 101 + return NULL; 102 + 103 + if (!blk_mq_update_queue_map(map, reg->nr_hw_queues)) 104 + return map; 105 + 106 + kfree(map); 107 + return NULL; 108 + }
+384
block/blk-mq-sysfs.c
··· 1 + #include <linux/kernel.h> 2 + #include <linux/module.h> 3 + #include <linux/backing-dev.h> 4 + #include <linux/bio.h> 5 + #include <linux/blkdev.h> 6 + #include <linux/mm.h> 7 + #include <linux/init.h> 8 + #include <linux/slab.h> 9 + #include <linux/workqueue.h> 10 + #include <linux/smp.h> 11 + 12 + #include <linux/blk-mq.h> 13 + #include "blk-mq.h" 14 + #include "blk-mq-tag.h" 15 + 16 + static void blk_mq_sysfs_release(struct kobject *kobj) 17 + { 18 + } 19 + 20 + struct blk_mq_ctx_sysfs_entry { 21 + struct attribute attr; 22 + ssize_t (*show)(struct blk_mq_ctx *, char *); 23 + ssize_t (*store)(struct blk_mq_ctx *, const char *, size_t); 24 + }; 25 + 26 + struct blk_mq_hw_ctx_sysfs_entry { 27 + struct attribute attr; 28 + ssize_t (*show)(struct blk_mq_hw_ctx *, char *); 29 + ssize_t (*store)(struct blk_mq_hw_ctx *, const char *, size_t); 30 + }; 31 + 32 + static ssize_t blk_mq_sysfs_show(struct kobject *kobj, struct attribute *attr, 33 + char *page) 34 + { 35 + struct blk_mq_ctx_sysfs_entry *entry; 36 + struct blk_mq_ctx *ctx; 37 + struct request_queue *q; 38 + ssize_t res; 39 + 40 + entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr); 41 + ctx = container_of(kobj, struct blk_mq_ctx, kobj); 42 + q = ctx->queue; 43 + 44 + if (!entry->show) 45 + return -EIO; 46 + 47 + res = -ENOENT; 48 + mutex_lock(&q->sysfs_lock); 49 + if (!blk_queue_dying(q)) 50 + res = entry->show(ctx, page); 51 + mutex_unlock(&q->sysfs_lock); 52 + return res; 53 + } 54 + 55 + static ssize_t blk_mq_sysfs_store(struct kobject *kobj, struct attribute *attr, 56 + const char *page, size_t length) 57 + { 58 + struct blk_mq_ctx_sysfs_entry *entry; 59 + struct blk_mq_ctx *ctx; 60 + struct request_queue *q; 61 + ssize_t res; 62 + 63 + entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr); 64 + ctx = container_of(kobj, struct blk_mq_ctx, kobj); 65 + q = ctx->queue; 66 + 67 + if (!entry->store) 68 + return -EIO; 69 + 70 + res = -ENOENT; 71 + mutex_lock(&q->sysfs_lock); 72 + if (!blk_queue_dying(q)) 73 + res = entry->store(ctx, page, length); 74 + mutex_unlock(&q->sysfs_lock); 75 + return res; 76 + } 77 + 78 + static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj, 79 + struct attribute *attr, char *page) 80 + { 81 + struct blk_mq_hw_ctx_sysfs_entry *entry; 82 + struct blk_mq_hw_ctx *hctx; 83 + struct request_queue *q; 84 + ssize_t res; 85 + 86 + entry = container_of(attr, struct blk_mq_hw_ctx_sysfs_entry, attr); 87 + hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj); 88 + q = hctx->queue; 89 + 90 + if (!entry->show) 91 + return -EIO; 92 + 93 + res = -ENOENT; 94 + mutex_lock(&q->sysfs_lock); 95 + if (!blk_queue_dying(q)) 96 + res = entry->show(hctx, page); 97 + mutex_unlock(&q->sysfs_lock); 98 + return res; 99 + } 100 + 101 + static ssize_t blk_mq_hw_sysfs_store(struct kobject *kobj, 102 + struct attribute *attr, const char *page, 103 + size_t length) 104 + { 105 + struct blk_mq_hw_ctx_sysfs_entry *entry; 106 + struct blk_mq_hw_ctx *hctx; 107 + struct request_queue *q; 108 + ssize_t res; 109 + 110 + entry = container_of(attr, struct blk_mq_hw_ctx_sysfs_entry, attr); 111 + hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj); 112 + q = hctx->queue; 113 + 114 + if (!entry->store) 115 + return -EIO; 116 + 117 + res = -ENOENT; 118 + mutex_lock(&q->sysfs_lock); 119 + if (!blk_queue_dying(q)) 120 + res = entry->store(hctx, page, length); 121 + mutex_unlock(&q->sysfs_lock); 122 + return res; 123 + } 124 + 125 + static ssize_t blk_mq_sysfs_dispatched_show(struct blk_mq_ctx *ctx, char *page) 126 + { 127 
+ return sprintf(page, "%lu %lu\n", ctx->rq_dispatched[1], 128 + ctx->rq_dispatched[0]); 129 + } 130 + 131 + static ssize_t blk_mq_sysfs_merged_show(struct blk_mq_ctx *ctx, char *page) 132 + { 133 + return sprintf(page, "%lu\n", ctx->rq_merged); 134 + } 135 + 136 + static ssize_t blk_mq_sysfs_completed_show(struct blk_mq_ctx *ctx, char *page) 137 + { 138 + return sprintf(page, "%lu %lu\n", ctx->rq_completed[1], 139 + ctx->rq_completed[0]); 140 + } 141 + 142 + static ssize_t sysfs_list_show(char *page, struct list_head *list, char *msg) 143 + { 144 + char *start_page = page; 145 + struct request *rq; 146 + 147 + page += sprintf(page, "%s:\n", msg); 148 + 149 + list_for_each_entry(rq, list, queuelist) 150 + page += sprintf(page, "\t%p\n", rq); 151 + 152 + return page - start_page; 153 + } 154 + 155 + static ssize_t blk_mq_sysfs_rq_list_show(struct blk_mq_ctx *ctx, char *page) 156 + { 157 + ssize_t ret; 158 + 159 + spin_lock(&ctx->lock); 160 + ret = sysfs_list_show(page, &ctx->rq_list, "CTX pending"); 161 + spin_unlock(&ctx->lock); 162 + 163 + return ret; 164 + } 165 + 166 + static ssize_t blk_mq_hw_sysfs_queued_show(struct blk_mq_hw_ctx *hctx, 167 + char *page) 168 + { 169 + return sprintf(page, "%lu\n", hctx->queued); 170 + } 171 + 172 + static ssize_t blk_mq_hw_sysfs_run_show(struct blk_mq_hw_ctx *hctx, char *page) 173 + { 174 + return sprintf(page, "%lu\n", hctx->run); 175 + } 176 + 177 + static ssize_t blk_mq_hw_sysfs_dispatched_show(struct blk_mq_hw_ctx *hctx, 178 + char *page) 179 + { 180 + char *start_page = page; 181 + int i; 182 + 183 + page += sprintf(page, "%8u\t%lu\n", 0U, hctx->dispatched[0]); 184 + 185 + for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER; i++) { 186 + unsigned long d = 1U << (i - 1); 187 + 188 + page += sprintf(page, "%8lu\t%lu\n", d, hctx->dispatched[i]); 189 + } 190 + 191 + return page - start_page; 192 + } 193 + 194 + static ssize_t blk_mq_hw_sysfs_rq_list_show(struct blk_mq_hw_ctx *hctx, 195 + char *page) 196 + { 197 + ssize_t ret; 198 + 199 + spin_lock(&hctx->lock); 200 + ret = sysfs_list_show(page, &hctx->dispatch, "HCTX pending"); 201 + spin_unlock(&hctx->lock); 202 + 203 + return ret; 204 + } 205 + 206 + static ssize_t blk_mq_hw_sysfs_ipi_show(struct blk_mq_hw_ctx *hctx, char *page) 207 + { 208 + ssize_t ret; 209 + 210 + spin_lock(&hctx->lock); 211 + ret = sprintf(page, "%u\n", !!(hctx->flags & BLK_MQ_F_SHOULD_IPI)); 212 + spin_unlock(&hctx->lock); 213 + 214 + return ret; 215 + } 216 + 217 + static ssize_t blk_mq_hw_sysfs_ipi_store(struct blk_mq_hw_ctx *hctx, 218 + const char *page, size_t len) 219 + { 220 + struct blk_mq_ctx *ctx; 221 + unsigned long ret; 222 + unsigned int i; 223 + 224 + if (kstrtoul(page, 10, &ret)) { 225 + pr_err("blk-mq-sysfs: invalid input '%s'\n", page); 226 + return -EINVAL; 227 + } 228 + 229 + spin_lock(&hctx->lock); 230 + if (ret) 231 + hctx->flags |= BLK_MQ_F_SHOULD_IPI; 232 + else 233 + hctx->flags &= ~BLK_MQ_F_SHOULD_IPI; 234 + spin_unlock(&hctx->lock); 235 + 236 + hctx_for_each_ctx(hctx, ctx, i) 237 + ctx->ipi_redirect = !!ret; 238 + 239 + return len; 240 + } 241 + 242 + static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page) 243 + { 244 + return blk_mq_tag_sysfs_show(hctx->tags, page); 245 + } 246 + 247 + static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_dispatched = { 248 + .attr = {.name = "dispatched", .mode = S_IRUGO }, 249 + .show = blk_mq_sysfs_dispatched_show, 250 + }; 251 + static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_merged = { 252 + .attr = {.name = "merged", .mode = S_IRUGO }, 253 + .show = 
blk_mq_sysfs_merged_show, 254 + }; 255 + static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_completed = { 256 + .attr = {.name = "completed", .mode = S_IRUGO }, 257 + .show = blk_mq_sysfs_completed_show, 258 + }; 259 + static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_rq_list = { 260 + .attr = {.name = "rq_list", .mode = S_IRUGO }, 261 + .show = blk_mq_sysfs_rq_list_show, 262 + }; 263 + 264 + static struct attribute *default_ctx_attrs[] = { 265 + &blk_mq_sysfs_dispatched.attr, 266 + &blk_mq_sysfs_merged.attr, 267 + &blk_mq_sysfs_completed.attr, 268 + &blk_mq_sysfs_rq_list.attr, 269 + NULL, 270 + }; 271 + 272 + static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_queued = { 273 + .attr = {.name = "queued", .mode = S_IRUGO }, 274 + .show = blk_mq_hw_sysfs_queued_show, 275 + }; 276 + static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_run = { 277 + .attr = {.name = "run", .mode = S_IRUGO }, 278 + .show = blk_mq_hw_sysfs_run_show, 279 + }; 280 + static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_dispatched = { 281 + .attr = {.name = "dispatched", .mode = S_IRUGO }, 282 + .show = blk_mq_hw_sysfs_dispatched_show, 283 + }; 284 + static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = { 285 + .attr = {.name = "pending", .mode = S_IRUGO }, 286 + .show = blk_mq_hw_sysfs_rq_list_show, 287 + }; 288 + static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_ipi = { 289 + .attr = {.name = "ipi_redirect", .mode = S_IRUGO | S_IWUSR}, 290 + .show = blk_mq_hw_sysfs_ipi_show, 291 + .store = blk_mq_hw_sysfs_ipi_store, 292 + }; 293 + static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_tags = { 294 + .attr = {.name = "tags", .mode = S_IRUGO }, 295 + .show = blk_mq_hw_sysfs_tags_show, 296 + }; 297 + 298 + static struct attribute *default_hw_ctx_attrs[] = { 299 + &blk_mq_hw_sysfs_queued.attr, 300 + &blk_mq_hw_sysfs_run.attr, 301 + &blk_mq_hw_sysfs_dispatched.attr, 302 + &blk_mq_hw_sysfs_pending.attr, 303 + &blk_mq_hw_sysfs_ipi.attr, 304 + &blk_mq_hw_sysfs_tags.attr, 305 + NULL, 306 + }; 307 + 308 + static const struct sysfs_ops blk_mq_sysfs_ops = { 309 + .show = blk_mq_sysfs_show, 310 + .store = blk_mq_sysfs_store, 311 + }; 312 + 313 + static const struct sysfs_ops blk_mq_hw_sysfs_ops = { 314 + .show = blk_mq_hw_sysfs_show, 315 + .store = blk_mq_hw_sysfs_store, 316 + }; 317 + 318 + static struct kobj_type blk_mq_ktype = { 319 + .sysfs_ops = &blk_mq_sysfs_ops, 320 + .release = blk_mq_sysfs_release, 321 + }; 322 + 323 + static struct kobj_type blk_mq_ctx_ktype = { 324 + .sysfs_ops = &blk_mq_sysfs_ops, 325 + .default_attrs = default_ctx_attrs, 326 + .release = blk_mq_sysfs_release, 327 + }; 328 + 329 + static struct kobj_type blk_mq_hw_ktype = { 330 + .sysfs_ops = &blk_mq_hw_sysfs_ops, 331 + .default_attrs = default_hw_ctx_attrs, 332 + .release = blk_mq_sysfs_release, 333 + }; 334 + 335 + void blk_mq_unregister_disk(struct gendisk *disk) 336 + { 337 + struct request_queue *q = disk->queue; 338 + 339 + kobject_uevent(&q->mq_kobj, KOBJ_REMOVE); 340 + kobject_del(&q->mq_kobj); 341 + 342 + kobject_put(&disk_to_dev(disk)->kobj); 343 + } 344 + 345 + int blk_mq_register_disk(struct gendisk *disk) 346 + { 347 + struct device *dev = disk_to_dev(disk); 348 + struct request_queue *q = disk->queue; 349 + struct blk_mq_hw_ctx *hctx; 350 + struct blk_mq_ctx *ctx; 351 + int ret, i, j; 352 + 353 + kobject_init(&q->mq_kobj, &blk_mq_ktype); 354 + 355 + ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq"); 356 + if (ret < 0) 357 + return ret; 358 + 359 + kobject_uevent(&q->mq_kobj, KOBJ_ADD); 360 
+ 361 + queue_for_each_hw_ctx(q, hctx, i) { 362 + kobject_init(&hctx->kobj, &blk_mq_hw_ktype); 363 + ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", i); 364 + if (ret) 365 + break; 366 + 367 + if (!hctx->nr_ctx) 368 + continue; 369 + 370 + hctx_for_each_ctx(hctx, ctx, j) { 371 + kobject_init(&ctx->kobj, &blk_mq_ctx_ktype); 372 + ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu); 373 + if (ret) 374 + break; 375 + } 376 + } 377 + 378 + if (ret) { 379 + blk_mq_unregister_disk(disk); 380 + return ret; 381 + } 382 + 383 + return 0; 384 + }
+204
block/blk-mq-tag.c
··· 1 + #include <linux/kernel.h> 2 + #include <linux/module.h> 3 + #include <linux/percpu_ida.h> 4 + 5 + #include <linux/blk-mq.h> 6 + #include "blk.h" 7 + #include "blk-mq.h" 8 + #include "blk-mq-tag.h" 9 + 10 + /* 11 + * Per tagged queue (tag address space) map 12 + */ 13 + struct blk_mq_tags { 14 + unsigned int nr_tags; 15 + unsigned int nr_reserved_tags; 16 + unsigned int nr_batch_move; 17 + unsigned int nr_max_cache; 18 + 19 + struct percpu_ida free_tags; 20 + struct percpu_ida reserved_tags; 21 + }; 22 + 23 + void blk_mq_wait_for_tags(struct blk_mq_tags *tags) 24 + { 25 + int tag = blk_mq_get_tag(tags, __GFP_WAIT, false); 26 + blk_mq_put_tag(tags, tag); 27 + } 28 + 29 + bool blk_mq_has_free_tags(struct blk_mq_tags *tags) 30 + { 31 + return !tags || 32 + percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids) != 0; 33 + } 34 + 35 + static unsigned int __blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp) 36 + { 37 + int tag; 38 + 39 + tag = percpu_ida_alloc(&tags->free_tags, gfp); 40 + if (tag < 0) 41 + return BLK_MQ_TAG_FAIL; 42 + return tag + tags->nr_reserved_tags; 43 + } 44 + 45 + static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_tags *tags, 46 + gfp_t gfp) 47 + { 48 + int tag; 49 + 50 + if (unlikely(!tags->nr_reserved_tags)) { 51 + WARN_ON_ONCE(1); 52 + return BLK_MQ_TAG_FAIL; 53 + } 54 + 55 + tag = percpu_ida_alloc(&tags->reserved_tags, gfp); 56 + if (tag < 0) 57 + return BLK_MQ_TAG_FAIL; 58 + return tag; 59 + } 60 + 61 + unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp, bool reserved) 62 + { 63 + if (!reserved) 64 + return __blk_mq_get_tag(tags, gfp); 65 + 66 + return __blk_mq_get_reserved_tag(tags, gfp); 67 + } 68 + 69 + static void __blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag) 70 + { 71 + BUG_ON(tag >= tags->nr_tags); 72 + 73 + percpu_ida_free(&tags->free_tags, tag - tags->nr_reserved_tags); 74 + } 75 + 76 + static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags, 77 + unsigned int tag) 78 + { 79 + BUG_ON(tag >= tags->nr_reserved_tags); 80 + 81 + percpu_ida_free(&tags->reserved_tags, tag); 82 + } 83 + 84 + void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag) 85 + { 86 + if (tag >= tags->nr_reserved_tags) 87 + __blk_mq_put_tag(tags, tag); 88 + else 89 + __blk_mq_put_reserved_tag(tags, tag); 90 + } 91 + 92 + static int __blk_mq_tag_iter(unsigned id, void *data) 93 + { 94 + unsigned long *tag_map = data; 95 + __set_bit(id, tag_map); 96 + return 0; 97 + } 98 + 99 + void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, 100 + void (*fn)(void *, unsigned long *), void *data) 101 + { 102 + unsigned long *tag_map; 103 + size_t map_size; 104 + 105 + map_size = ALIGN(tags->nr_tags, BITS_PER_LONG) / BITS_PER_LONG; 106 + tag_map = kzalloc(map_size * sizeof(unsigned long), GFP_ATOMIC); 107 + if (!tag_map) 108 + return; 109 + 110 + percpu_ida_for_each_free(&tags->free_tags, __blk_mq_tag_iter, tag_map); 111 + if (tags->nr_reserved_tags) 112 + percpu_ida_for_each_free(&tags->reserved_tags, __blk_mq_tag_iter, 113 + tag_map); 114 + 115 + fn(data, tag_map); 116 + kfree(tag_map); 117 + } 118 + 119 + struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, 120 + unsigned int reserved_tags, int node) 121 + { 122 + unsigned int nr_tags, nr_cache; 123 + struct blk_mq_tags *tags; 124 + int ret; 125 + 126 + if (total_tags > BLK_MQ_TAG_MAX) { 127 + pr_err("blk-mq: tag depth too large\n"); 128 + return NULL; 129 + } 130 + 131 + tags = kzalloc_node(sizeof(*tags), GFP_KERNEL, node); 132 + if (!tags) 133 + return NULL; 134 + 135 + nr_tags = total_tags 
- reserved_tags; 136 + nr_cache = nr_tags / num_possible_cpus(); 137 + 138 + if (nr_cache < BLK_MQ_TAG_CACHE_MIN) 139 + nr_cache = BLK_MQ_TAG_CACHE_MIN; 140 + else if (nr_cache > BLK_MQ_TAG_CACHE_MAX) 141 + nr_cache = BLK_MQ_TAG_CACHE_MAX; 142 + 143 + tags->nr_tags = total_tags; 144 + tags->nr_reserved_tags = reserved_tags; 145 + tags->nr_max_cache = nr_cache; 146 + tags->nr_batch_move = max(1u, nr_cache / 2); 147 + 148 + ret = __percpu_ida_init(&tags->free_tags, tags->nr_tags - 149 + tags->nr_reserved_tags, 150 + tags->nr_max_cache, 151 + tags->nr_batch_move); 152 + if (ret) 153 + goto err_free_tags; 154 + 155 + if (reserved_tags) { 156 + /* 157 + * With max_cahe and batch set to 1, the allocator fallbacks to 158 + * no cached. It's fine reserved tags allocation is slow. 159 + */ 160 + ret = __percpu_ida_init(&tags->reserved_tags, reserved_tags, 161 + 1, 1); 162 + if (ret) 163 + goto err_reserved_tags; 164 + } 165 + 166 + return tags; 167 + 168 + err_reserved_tags: 169 + percpu_ida_destroy(&tags->free_tags); 170 + err_free_tags: 171 + kfree(tags); 172 + return NULL; 173 + } 174 + 175 + void blk_mq_free_tags(struct blk_mq_tags *tags) 176 + { 177 + percpu_ida_destroy(&tags->free_tags); 178 + percpu_ida_destroy(&tags->reserved_tags); 179 + kfree(tags); 180 + } 181 + 182 + ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page) 183 + { 184 + char *orig_page = page; 185 + int cpu; 186 + 187 + if (!tags) 188 + return 0; 189 + 190 + page += sprintf(page, "nr_tags=%u, reserved_tags=%u, batch_move=%u," 191 + " max_cache=%u\n", tags->nr_tags, tags->nr_reserved_tags, 192 + tags->nr_batch_move, tags->nr_max_cache); 193 + 194 + page += sprintf(page, "nr_free=%u, nr_reserved=%u\n", 195 + percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids), 196 + percpu_ida_free_tags(&tags->reserved_tags, nr_cpu_ids)); 197 + 198 + for_each_possible_cpu(cpu) { 199 + page += sprintf(page, " cpu%02u: nr_free=%u\n", cpu, 200 + percpu_ida_free_tags(&tags->free_tags, cpu)); 201 + } 202 + 203 + return page - orig_page; 204 + }
+27
block/blk-mq-tag.h
··· 1 + #ifndef INT_BLK_MQ_TAG_H 2 + #define INT_BLK_MQ_TAG_H 3 + 4 + struct blk_mq_tags; 5 + 6 + extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node); 7 + extern void blk_mq_free_tags(struct blk_mq_tags *tags); 8 + 9 + extern unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp, bool reserved); 10 + extern void blk_mq_wait_for_tags(struct blk_mq_tags *tags); 11 + extern void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag); 12 + extern void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data); 13 + extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags); 14 + extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page); 15 + 16 + enum { 17 + BLK_MQ_TAG_CACHE_MIN = 1, 18 + BLK_MQ_TAG_CACHE_MAX = 64, 19 + }; 20 + 21 + enum { 22 + BLK_MQ_TAG_FAIL = -1U, 23 + BLK_MQ_TAG_MIN = BLK_MQ_TAG_CACHE_MIN, 24 + BLK_MQ_TAG_MAX = BLK_MQ_TAG_FAIL - 1, 25 + }; 26 + 27 + #endif
+1480
block/blk-mq.c
··· 1 + #include <linux/kernel.h> 2 + #include <linux/module.h> 3 + #include <linux/backing-dev.h> 4 + #include <linux/bio.h> 5 + #include <linux/blkdev.h> 6 + #include <linux/mm.h> 7 + #include <linux/init.h> 8 + #include <linux/slab.h> 9 + #include <linux/workqueue.h> 10 + #include <linux/smp.h> 11 + #include <linux/llist.h> 12 + #include <linux/list_sort.h> 13 + #include <linux/cpu.h> 14 + #include <linux/cache.h> 15 + #include <linux/sched/sysctl.h> 16 + #include <linux/delay.h> 17 + 18 + #include <trace/events/block.h> 19 + 20 + #include <linux/blk-mq.h> 21 + #include "blk.h" 22 + #include "blk-mq.h" 23 + #include "blk-mq-tag.h" 24 + 25 + static DEFINE_MUTEX(all_q_mutex); 26 + static LIST_HEAD(all_q_list); 27 + 28 + static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx); 29 + 30 + DEFINE_PER_CPU(struct llist_head, ipi_lists); 31 + 32 + static struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, 33 + unsigned int cpu) 34 + { 35 + return per_cpu_ptr(q->queue_ctx, cpu); 36 + } 37 + 38 + /* 39 + * This assumes per-cpu software queueing queues. They could be per-node 40 + * as well, for instance. For now this is hardcoded as-is. Note that we don't 41 + * care about preemption, since we know the ctx's are persistent. This does 42 + * mean that we can't rely on ctx always matching the currently running CPU. 43 + */ 44 + static struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q) 45 + { 46 + return __blk_mq_get_ctx(q, get_cpu()); 47 + } 48 + 49 + static void blk_mq_put_ctx(struct blk_mq_ctx *ctx) 50 + { 51 + put_cpu(); 52 + } 53 + 54 + /* 55 + * Check if any of the ctx's have pending work in this hardware queue 56 + */ 57 + static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) 58 + { 59 + unsigned int i; 60 + 61 + for (i = 0; i < hctx->nr_ctx_map; i++) 62 + if (hctx->ctx_map[i]) 63 + return true; 64 + 65 + return false; 66 + } 67 + 68 + /* 69 + * Mark this ctx as having pending work in this hardware queue 70 + */ 71 + static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, 72 + struct blk_mq_ctx *ctx) 73 + { 74 + if (!test_bit(ctx->index_hw, hctx->ctx_map)) 75 + set_bit(ctx->index_hw, hctx->ctx_map); 76 + } 77 + 78 + static struct request *blk_mq_alloc_rq(struct blk_mq_hw_ctx *hctx, gfp_t gfp, 79 + bool reserved) 80 + { 81 + struct request *rq; 82 + unsigned int tag; 83 + 84 + tag = blk_mq_get_tag(hctx->tags, gfp, reserved); 85 + if (tag != BLK_MQ_TAG_FAIL) { 86 + rq = hctx->rqs[tag]; 87 + rq->tag = tag; 88 + 89 + return rq; 90 + } 91 + 92 + return NULL; 93 + } 94 + 95 + static int blk_mq_queue_enter(struct request_queue *q) 96 + { 97 + int ret; 98 + 99 + __percpu_counter_add(&q->mq_usage_counter, 1, 1000000); 100 + smp_wmb(); 101 + /* we have problems to freeze the queue if it's initializing */ 102 + if (!blk_queue_bypass(q) || !blk_queue_init_done(q)) 103 + return 0; 104 + 105 + __percpu_counter_add(&q->mq_usage_counter, -1, 1000000); 106 + 107 + spin_lock_irq(q->queue_lock); 108 + ret = wait_event_interruptible_lock_irq(q->mq_freeze_wq, 109 + !blk_queue_bypass(q), *q->queue_lock); 110 + /* inc usage with lock hold to avoid freeze_queue runs here */ 111 + if (!ret) 112 + __percpu_counter_add(&q->mq_usage_counter, 1, 1000000); 113 + spin_unlock_irq(q->queue_lock); 114 + 115 + return ret; 116 + } 117 + 118 + static void blk_mq_queue_exit(struct request_queue *q) 119 + { 120 + __percpu_counter_add(&q->mq_usage_counter, -1, 1000000); 121 + } 122 + 123 + /* 124 + * Guarantee no request is in use, so we can change any data structure of 125 + * the queue 
afterward. 126 + */ 127 + static void blk_mq_freeze_queue(struct request_queue *q) 128 + { 129 + bool drain; 130 + 131 + spin_lock_irq(q->queue_lock); 132 + drain = !q->bypass_depth++; 133 + queue_flag_set(QUEUE_FLAG_BYPASS, q); 134 + spin_unlock_irq(q->queue_lock); 135 + 136 + if (!drain) 137 + return; 138 + 139 + while (true) { 140 + s64 count; 141 + 142 + spin_lock_irq(q->queue_lock); 143 + count = percpu_counter_sum(&q->mq_usage_counter); 144 + spin_unlock_irq(q->queue_lock); 145 + 146 + if (count == 0) 147 + break; 148 + blk_mq_run_queues(q, false); 149 + msleep(10); 150 + } 151 + } 152 + 153 + static void blk_mq_unfreeze_queue(struct request_queue *q) 154 + { 155 + bool wake = false; 156 + 157 + spin_lock_irq(q->queue_lock); 158 + if (!--q->bypass_depth) { 159 + queue_flag_clear(QUEUE_FLAG_BYPASS, q); 160 + wake = true; 161 + } 162 + WARN_ON_ONCE(q->bypass_depth < 0); 163 + spin_unlock_irq(q->queue_lock); 164 + if (wake) 165 + wake_up_all(&q->mq_freeze_wq); 166 + } 167 + 168 + bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) 169 + { 170 + return blk_mq_has_free_tags(hctx->tags); 171 + } 172 + EXPORT_SYMBOL(blk_mq_can_queue); 173 + 174 + static void blk_mq_rq_ctx_init(struct blk_mq_ctx *ctx, struct request *rq, 175 + unsigned int rw_flags) 176 + { 177 + rq->mq_ctx = ctx; 178 + rq->cmd_flags = rw_flags; 179 + ctx->rq_dispatched[rw_is_sync(rw_flags)]++; 180 + } 181 + 182 + static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx, 183 + gfp_t gfp, bool reserved) 184 + { 185 + return blk_mq_alloc_rq(hctx, gfp, reserved); 186 + } 187 + 188 + static struct request *blk_mq_alloc_request_pinned(struct request_queue *q, 189 + int rw, gfp_t gfp, 190 + bool reserved) 191 + { 192 + struct request *rq; 193 + 194 + do { 195 + struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); 196 + struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu); 197 + 198 + rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved); 199 + if (rq) { 200 + blk_mq_rq_ctx_init(ctx, rq, rw); 201 + break; 202 + } else if (!(gfp & __GFP_WAIT)) 203 + break; 204 + 205 + blk_mq_put_ctx(ctx); 206 + __blk_mq_run_hw_queue(hctx); 207 + blk_mq_wait_for_tags(hctx->tags); 208 + } while (1); 209 + 210 + return rq; 211 + } 212 + 213 + struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp) 214 + { 215 + struct request *rq; 216 + 217 + if (blk_mq_queue_enter(q)) 218 + return NULL; 219 + 220 + rq = blk_mq_alloc_request_pinned(q, rw, gfp, false); 221 + blk_mq_put_ctx(rq->mq_ctx); 222 + return rq; 223 + } 224 + 225 + struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw, 226 + gfp_t gfp) 227 + { 228 + struct request *rq; 229 + 230 + if (blk_mq_queue_enter(q)) 231 + return NULL; 232 + 233 + rq = blk_mq_alloc_request_pinned(q, rw, gfp, true); 234 + blk_mq_put_ctx(rq->mq_ctx); 235 + return rq; 236 + } 237 + EXPORT_SYMBOL(blk_mq_alloc_reserved_request); 238 + 239 + /* 240 + * Re-init and set pdu, if we have it 241 + */ 242 + static void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq) 243 + { 244 + blk_rq_init(hctx->queue, rq); 245 + 246 + if (hctx->cmd_size) 247 + rq->special = blk_mq_rq_to_pdu(rq); 248 + } 249 + 250 + static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, 251 + struct blk_mq_ctx *ctx, struct request *rq) 252 + { 253 + const int tag = rq->tag; 254 + struct request_queue *q = rq->q; 255 + 256 + blk_mq_rq_init(hctx, rq); 257 + blk_mq_put_tag(hctx->tags, tag); 258 + 259 + blk_mq_queue_exit(q); 260 + } 261 + 262 + void blk_mq_free_request(struct 
request *rq) 263 + { 264 + struct blk_mq_ctx *ctx = rq->mq_ctx; 265 + struct blk_mq_hw_ctx *hctx; 266 + struct request_queue *q = rq->q; 267 + 268 + ctx->rq_completed[rq_is_sync(rq)]++; 269 + 270 + hctx = q->mq_ops->map_queue(q, ctx->cpu); 271 + __blk_mq_free_request(hctx, ctx, rq); 272 + } 273 + 274 + static void blk_mq_bio_endio(struct request *rq, struct bio *bio, int error) 275 + { 276 + if (error) 277 + clear_bit(BIO_UPTODATE, &bio->bi_flags); 278 + else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) 279 + error = -EIO; 280 + 281 + if (unlikely(rq->cmd_flags & REQ_QUIET)) 282 + set_bit(BIO_QUIET, &bio->bi_flags); 283 + 284 + /* don't actually finish bio if it's part of flush sequence */ 285 + if (!(rq->cmd_flags & REQ_FLUSH_SEQ)) 286 + bio_endio(bio, error); 287 + } 288 + 289 + void blk_mq_complete_request(struct request *rq, int error) 290 + { 291 + struct bio *bio = rq->bio; 292 + unsigned int bytes = 0; 293 + 294 + trace_block_rq_complete(rq->q, rq); 295 + 296 + while (bio) { 297 + struct bio *next = bio->bi_next; 298 + 299 + bio->bi_next = NULL; 300 + bytes += bio->bi_size; 301 + blk_mq_bio_endio(rq, bio, error); 302 + bio = next; 303 + } 304 + 305 + blk_account_io_completion(rq, bytes); 306 + 307 + if (rq->end_io) 308 + rq->end_io(rq, error); 309 + else 310 + blk_mq_free_request(rq); 311 + 312 + blk_account_io_done(rq); 313 + } 314 + 315 + void __blk_mq_end_io(struct request *rq, int error) 316 + { 317 + if (!blk_mark_rq_complete(rq)) 318 + blk_mq_complete_request(rq, error); 319 + } 320 + 321 + #if defined(CONFIG_SMP) && defined(CONFIG_USE_GENERIC_SMP_HELPERS) 322 + 323 + /* 324 + * Called with interrupts disabled. 325 + */ 326 + static void ipi_end_io(void *data) 327 + { 328 + struct llist_head *list = &per_cpu(ipi_lists, smp_processor_id()); 329 + struct llist_node *entry, *next; 330 + struct request *rq; 331 + 332 + entry = llist_del_all(list); 333 + 334 + while (entry) { 335 + next = entry->next; 336 + rq = llist_entry(entry, struct request, ll_list); 337 + __blk_mq_end_io(rq, rq->errors); 338 + entry = next; 339 + } 340 + } 341 + 342 + static int ipi_remote_cpu(struct blk_mq_ctx *ctx, const int cpu, 343 + struct request *rq, const int error) 344 + { 345 + struct call_single_data *data = &rq->csd; 346 + 347 + rq->errors = error; 348 + rq->ll_list.next = NULL; 349 + 350 + /* 351 + * If the list is non-empty, an existing IPI must already 352 + * be "in flight". If that is the case, we need not schedule 353 + * a new one. 354 + */ 355 + if (llist_add(&rq->ll_list, &per_cpu(ipi_lists, ctx->cpu))) { 356 + data->func = ipi_end_io; 357 + data->flags = 0; 358 + __smp_call_function_single(ctx->cpu, data, 0); 359 + } 360 + 361 + return true; 362 + } 363 + #else /* CONFIG_SMP && CONFIG_USE_GENERIC_SMP_HELPERS */ 364 + static int ipi_remote_cpu(struct blk_mq_ctx *ctx, const int cpu, 365 + struct request *rq, const int error) 366 + { 367 + return false; 368 + } 369 + #endif 370 + 371 + /* 372 + * End IO on this request on a multiqueue enabled driver. We'll either do 373 + * it directly inline, or punt to a local IPI handler on the matching 374 + * remote CPU. 
375 + */ 376 + void blk_mq_end_io(struct request *rq, int error) 377 + { 378 + struct blk_mq_ctx *ctx = rq->mq_ctx; 379 + int cpu; 380 + 381 + if (!ctx->ipi_redirect) 382 + return __blk_mq_end_io(rq, error); 383 + 384 + cpu = get_cpu(); 385 + 386 + if (cpu == ctx->cpu || !cpu_online(ctx->cpu) || 387 + !ipi_remote_cpu(ctx, cpu, rq, error)) 388 + __blk_mq_end_io(rq, error); 389 + 390 + put_cpu(); 391 + } 392 + EXPORT_SYMBOL(blk_mq_end_io); 393 + 394 + static void blk_mq_start_request(struct request *rq) 395 + { 396 + struct request_queue *q = rq->q; 397 + 398 + trace_block_rq_issue(q, rq); 399 + 400 + /* 401 + * Just mark start time and set the started bit. Due to memory 402 + * ordering, we know we'll see the correct deadline as long as 403 + * REQ_ATOMIC_STARTED is seen. 404 + */ 405 + rq->deadline = jiffies + q->rq_timeout; 406 + set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); 407 + } 408 + 409 + static void blk_mq_requeue_request(struct request *rq) 410 + { 411 + struct request_queue *q = rq->q; 412 + 413 + trace_block_rq_requeue(q, rq); 414 + clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); 415 + } 416 + 417 + struct blk_mq_timeout_data { 418 + struct blk_mq_hw_ctx *hctx; 419 + unsigned long *next; 420 + unsigned int *next_set; 421 + }; 422 + 423 + static void blk_mq_timeout_check(void *__data, unsigned long *free_tags) 424 + { 425 + struct blk_mq_timeout_data *data = __data; 426 + struct blk_mq_hw_ctx *hctx = data->hctx; 427 + unsigned int tag; 428 + 429 + /* It may not be in flight yet (this is where 430 + * the REQ_ATOMIC_STARTED flag comes in). The requests are 431 + * statically allocated, so we know it's always safe to access the 432 + * memory associated with a bit offset into ->rqs[]. 433 + */ 434 + tag = 0; 435 + do { 436 + struct request *rq; 437 + 438 + tag = find_next_zero_bit(free_tags, hctx->queue_depth, tag); 439 + if (tag >= hctx->queue_depth) 440 + break; 441 + 442 + rq = hctx->rqs[tag++]; 443 + 444 + if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) 445 + continue; 446 + 447 + blk_rq_check_expired(rq, data->next, data->next_set); 448 + } while (1); 449 + } 450 + 451 + static void blk_mq_hw_ctx_check_timeout(struct blk_mq_hw_ctx *hctx, 452 + unsigned long *next, 453 + unsigned int *next_set) 454 + { 455 + struct blk_mq_timeout_data data = { 456 + .hctx = hctx, 457 + .next = next, 458 + .next_set = next_set, 459 + }; 460 + 461 + /* 462 + * Ask the tagging code to iterate busy requests, so we can 463 + * check them for timeout. 464 + */ 465 + blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data); 466 + } 467 + 468 + static void blk_mq_rq_timer(unsigned long data) 469 + { 470 + struct request_queue *q = (struct request_queue *) data; 471 + struct blk_mq_hw_ctx *hctx; 472 + unsigned long next = 0; 473 + int i, next_set = 0; 474 + 475 + queue_for_each_hw_ctx(q, hctx, i) 476 + blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set); 477 + 478 + if (next_set) 479 + mod_timer(&q->timeout, round_jiffies_up(next)); 480 + } 481 + 482 + /* 483 + * Reverse check our software queue for entries that we could potentially 484 + * merge with. Currently includes a hand-wavy stop count of 8, to not spend 485 + * too much time checking for merges. 
486 + */ 487 + static bool blk_mq_attempt_merge(struct request_queue *q, 488 + struct blk_mq_ctx *ctx, struct bio *bio) 489 + { 490 + struct request *rq; 491 + int checked = 8; 492 + 493 + list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) { 494 + int el_ret; 495 + 496 + if (!checked--) 497 + break; 498 + 499 + if (!blk_rq_merge_ok(rq, bio)) 500 + continue; 501 + 502 + el_ret = blk_try_merge(rq, bio); 503 + if (el_ret == ELEVATOR_BACK_MERGE) { 504 + if (bio_attempt_back_merge(q, rq, bio)) { 505 + ctx->rq_merged++; 506 + return true; 507 + } 508 + break; 509 + } else if (el_ret == ELEVATOR_FRONT_MERGE) { 510 + if (bio_attempt_front_merge(q, rq, bio)) { 511 + ctx->rq_merged++; 512 + return true; 513 + } 514 + break; 515 + } 516 + } 517 + 518 + return false; 519 + } 520 + 521 + void blk_mq_add_timer(struct request *rq) 522 + { 523 + __blk_add_timer(rq, NULL); 524 + } 525 + 526 + /* 527 + * Run this hardware queue, pulling any software queues mapped to it in. 528 + * Note that this function currently has various problems around ordering 529 + * of IO. In particular, we'd like FIFO behaviour on handling existing 530 + * items on the hctx->dispatch list. Ignore that for now. 531 + */ 532 + static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) 533 + { 534 + struct request_queue *q = hctx->queue; 535 + struct blk_mq_ctx *ctx; 536 + struct request *rq; 537 + LIST_HEAD(rq_list); 538 + int bit, queued; 539 + 540 + if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->flags))) 541 + return; 542 + 543 + hctx->run++; 544 + 545 + /* 546 + * Touch any software queue that has pending entries. 547 + */ 548 + for_each_set_bit(bit, hctx->ctx_map, hctx->nr_ctx) { 549 + clear_bit(bit, hctx->ctx_map); 550 + ctx = hctx->ctxs[bit]; 551 + BUG_ON(bit != ctx->index_hw); 552 + 553 + spin_lock(&ctx->lock); 554 + list_splice_tail_init(&ctx->rq_list, &rq_list); 555 + spin_unlock(&ctx->lock); 556 + } 557 + 558 + /* 559 + * If we have previous entries on our dispatch list, grab them 560 + * and stuff them at the front for more fair dispatch. 561 + */ 562 + if (!list_empty_careful(&hctx->dispatch)) { 563 + spin_lock(&hctx->lock); 564 + if (!list_empty(&hctx->dispatch)) 565 + list_splice_init(&hctx->dispatch, &rq_list); 566 + spin_unlock(&hctx->lock); 567 + } 568 + 569 + /* 570 + * Delete and return all entries from our dispatch list 571 + */ 572 + queued = 0; 573 + 574 + /* 575 + * Now process all the entries, sending them to the driver. 576 + */ 577 + while (!list_empty(&rq_list)) { 578 + int ret; 579 + 580 + rq = list_first_entry(&rq_list, struct request, queuelist); 581 + list_del_init(&rq->queuelist); 582 + blk_mq_start_request(rq); 583 + 584 + /* 585 + * Last request in the series. Flag it as such, this 586 + * enables drivers to know when IO should be kicked off, 587 + * if they don't do it on a per-request basis. 588 + * 589 + * Note: the flag isn't the only condition drivers 590 + * should do kick off. If drive is busy, the last 591 + * request might not have the bit set. 
592 + */ 593 + if (list_empty(&rq_list)) 594 + rq->cmd_flags |= REQ_END; 595 + 596 + ret = q->mq_ops->queue_rq(hctx, rq); 597 + switch (ret) { 598 + case BLK_MQ_RQ_QUEUE_OK: 599 + queued++; 600 + continue; 601 + case BLK_MQ_RQ_QUEUE_BUSY: 602 + /* 603 + * FIXME: we should have a mechanism to stop the queue 604 + * like blk_stop_queue, otherwise we will waste cpu 605 + * time 606 + */ 607 + list_add(&rq->queuelist, &rq_list); 608 + blk_mq_requeue_request(rq); 609 + break; 610 + default: 611 + pr_err("blk-mq: bad return on queue: %d\n", ret); 612 + rq->errors = -EIO; 613 + case BLK_MQ_RQ_QUEUE_ERROR: 614 + blk_mq_end_io(rq, rq->errors); 615 + break; 616 + } 617 + 618 + if (ret == BLK_MQ_RQ_QUEUE_BUSY) 619 + break; 620 + } 621 + 622 + if (!queued) 623 + hctx->dispatched[0]++; 624 + else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1))) 625 + hctx->dispatched[ilog2(queued) + 1]++; 626 + 627 + /* 628 + * Any items that need requeuing? Stuff them into hctx->dispatch, 629 + * that is where we will continue on next queue run. 630 + */ 631 + if (!list_empty(&rq_list)) { 632 + spin_lock(&hctx->lock); 633 + list_splice(&rq_list, &hctx->dispatch); 634 + spin_unlock(&hctx->lock); 635 + } 636 + } 637 + 638 + void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) 639 + { 640 + if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->flags))) 641 + return; 642 + 643 + if (!async) 644 + __blk_mq_run_hw_queue(hctx); 645 + else { 646 + struct request_queue *q = hctx->queue; 647 + 648 + kblockd_schedule_delayed_work(q, &hctx->delayed_work, 0); 649 + } 650 + } 651 + 652 + void blk_mq_run_queues(struct request_queue *q, bool async) 653 + { 654 + struct blk_mq_hw_ctx *hctx; 655 + int i; 656 + 657 + queue_for_each_hw_ctx(q, hctx, i) { 658 + if ((!blk_mq_hctx_has_pending(hctx) && 659 + list_empty_careful(&hctx->dispatch)) || 660 + test_bit(BLK_MQ_S_STOPPED, &hctx->flags)) 661 + continue; 662 + 663 + blk_mq_run_hw_queue(hctx, async); 664 + } 665 + } 666 + EXPORT_SYMBOL(blk_mq_run_queues); 667 + 668 + void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) 669 + { 670 + cancel_delayed_work(&hctx->delayed_work); 671 + set_bit(BLK_MQ_S_STOPPED, &hctx->state); 672 + } 673 + EXPORT_SYMBOL(blk_mq_stop_hw_queue); 674 + 675 + void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) 676 + { 677 + clear_bit(BLK_MQ_S_STOPPED, &hctx->state); 678 + __blk_mq_run_hw_queue(hctx); 679 + } 680 + EXPORT_SYMBOL(blk_mq_start_hw_queue); 681 + 682 + void blk_mq_start_stopped_hw_queues(struct request_queue *q) 683 + { 684 + struct blk_mq_hw_ctx *hctx; 685 + int i; 686 + 687 + queue_for_each_hw_ctx(q, hctx, i) { 688 + if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state)) 689 + continue; 690 + 691 + clear_bit(BLK_MQ_S_STOPPED, &hctx->state); 692 + blk_mq_run_hw_queue(hctx, true); 693 + } 694 + } 695 + EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); 696 + 697 + static void blk_mq_work_fn(struct work_struct *work) 698 + { 699 + struct blk_mq_hw_ctx *hctx; 700 + 701 + hctx = container_of(work, struct blk_mq_hw_ctx, delayed_work.work); 702 + __blk_mq_run_hw_queue(hctx); 703 + } 704 + 705 + static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, 706 + struct request *rq) 707 + { 708 + struct blk_mq_ctx *ctx = rq->mq_ctx; 709 + 710 + list_add_tail(&rq->queuelist, &ctx->rq_list); 711 + blk_mq_hctx_mark_pending(hctx, ctx); 712 + 713 + /* 714 + * We do this early, to ensure we are on the right CPU. 
715 + */ 716 + blk_mq_add_timer(rq); 717 + } 718 + 719 + void blk_mq_insert_request(struct request_queue *q, struct request *rq, 720 + bool run_queue) 721 + { 722 + struct blk_mq_hw_ctx *hctx; 723 + struct blk_mq_ctx *ctx, *current_ctx; 724 + 725 + ctx = rq->mq_ctx; 726 + hctx = q->mq_ops->map_queue(q, ctx->cpu); 727 + 728 + if (rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) { 729 + blk_insert_flush(rq); 730 + } else { 731 + current_ctx = blk_mq_get_ctx(q); 732 + 733 + if (!cpu_online(ctx->cpu)) { 734 + ctx = current_ctx; 735 + hctx = q->mq_ops->map_queue(q, ctx->cpu); 736 + rq->mq_ctx = ctx; 737 + } 738 + spin_lock(&ctx->lock); 739 + __blk_mq_insert_request(hctx, rq); 740 + spin_unlock(&ctx->lock); 741 + 742 + blk_mq_put_ctx(current_ctx); 743 + } 744 + 745 + if (run_queue) 746 + __blk_mq_run_hw_queue(hctx); 747 + } 748 + EXPORT_SYMBOL(blk_mq_insert_request); 749 + 750 + /* 751 + * This is a special version of blk_mq_insert_request to bypass FLUSH request 752 + * check. Should only be used internally. 753 + */ 754 + void blk_mq_run_request(struct request *rq, bool run_queue, bool async) 755 + { 756 + struct request_queue *q = rq->q; 757 + struct blk_mq_hw_ctx *hctx; 758 + struct blk_mq_ctx *ctx, *current_ctx; 759 + 760 + current_ctx = blk_mq_get_ctx(q); 761 + 762 + ctx = rq->mq_ctx; 763 + if (!cpu_online(ctx->cpu)) { 764 + ctx = current_ctx; 765 + rq->mq_ctx = ctx; 766 + } 767 + hctx = q->mq_ops->map_queue(q, ctx->cpu); 768 + 769 + /* ctx->cpu might be offline */ 770 + spin_lock(&ctx->lock); 771 + __blk_mq_insert_request(hctx, rq); 772 + spin_unlock(&ctx->lock); 773 + 774 + blk_mq_put_ctx(current_ctx); 775 + 776 + if (run_queue) 777 + blk_mq_run_hw_queue(hctx, async); 778 + } 779 + 780 + static void blk_mq_insert_requests(struct request_queue *q, 781 + struct blk_mq_ctx *ctx, 782 + struct list_head *list, 783 + int depth, 784 + bool from_schedule) 785 + 786 + { 787 + struct blk_mq_hw_ctx *hctx; 788 + struct blk_mq_ctx *current_ctx; 789 + 790 + trace_block_unplug(q, depth, !from_schedule); 791 + 792 + current_ctx = blk_mq_get_ctx(q); 793 + 794 + if (!cpu_online(ctx->cpu)) 795 + ctx = current_ctx; 796 + hctx = q->mq_ops->map_queue(q, ctx->cpu); 797 + 798 + /* 799 + * preemption doesn't flush plug list, so it's possible ctx->cpu is 800 + * offline now 801 + */ 802 + spin_lock(&ctx->lock); 803 + while (!list_empty(list)) { 804 + struct request *rq; 805 + 806 + rq = list_first_entry(list, struct request, queuelist); 807 + list_del_init(&rq->queuelist); 808 + rq->mq_ctx = ctx; 809 + __blk_mq_insert_request(hctx, rq); 810 + } 811 + spin_unlock(&ctx->lock); 812 + 813 + blk_mq_put_ctx(current_ctx); 814 + 815 + blk_mq_run_hw_queue(hctx, from_schedule); 816 + } 817 + 818 + static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b) 819 + { 820 + struct request *rqa = container_of(a, struct request, queuelist); 821 + struct request *rqb = container_of(b, struct request, queuelist); 822 + 823 + return !(rqa->mq_ctx < rqb->mq_ctx || 824 + (rqa->mq_ctx == rqb->mq_ctx && 825 + blk_rq_pos(rqa) < blk_rq_pos(rqb))); 826 + } 827 + 828 + void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) 829 + { 830 + struct blk_mq_ctx *this_ctx; 831 + struct request_queue *this_q; 832 + struct request *rq; 833 + LIST_HEAD(list); 834 + LIST_HEAD(ctx_list); 835 + unsigned int depth; 836 + 837 + list_splice_init(&plug->mq_list, &list); 838 + 839 + list_sort(NULL, &list, plug_ctx_cmp); 840 + 841 + this_q = NULL; 842 + this_ctx = NULL; 843 + depth = 0; 844 + 845 + while (!list_empty(&list)) { 846 + rq 
= list_entry_rq(list.next); 847 + list_del_init(&rq->queuelist); 848 + BUG_ON(!rq->q); 849 + if (rq->mq_ctx != this_ctx) { 850 + if (this_ctx) { 851 + blk_mq_insert_requests(this_q, this_ctx, 852 + &ctx_list, depth, 853 + from_schedule); 854 + } 855 + 856 + this_ctx = rq->mq_ctx; 857 + this_q = rq->q; 858 + depth = 0; 859 + } 860 + 861 + depth++; 862 + list_add_tail(&rq->queuelist, &ctx_list); 863 + } 864 + 865 + /* 866 + * If 'this_ctx' is set, we know we have entries to complete 867 + * on 'ctx_list'. Do those. 868 + */ 869 + if (this_ctx) { 870 + blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth, 871 + from_schedule); 872 + } 873 + } 874 + 875 + static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) 876 + { 877 + init_request_from_bio(rq, bio); 878 + blk_account_io_start(rq, 1); 879 + } 880 + 881 + static void blk_mq_make_request(struct request_queue *q, struct bio *bio) 882 + { 883 + struct blk_mq_hw_ctx *hctx; 884 + struct blk_mq_ctx *ctx; 885 + const int is_sync = rw_is_sync(bio->bi_rw); 886 + const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); 887 + int rw = bio_data_dir(bio); 888 + struct request *rq; 889 + unsigned int use_plug, request_count = 0; 890 + 891 + /* 892 + * If we have multiple hardware queues, just go directly to 893 + * one of those for sync IO. 894 + */ 895 + use_plug = !is_flush_fua && ((q->nr_hw_queues == 1) || !is_sync); 896 + 897 + blk_queue_bounce(q, &bio); 898 + 899 + if (use_plug && blk_attempt_plug_merge(q, bio, &request_count)) 900 + return; 901 + 902 + if (blk_mq_queue_enter(q)) { 903 + bio_endio(bio, -EIO); 904 + return; 905 + } 906 + 907 + ctx = blk_mq_get_ctx(q); 908 + hctx = q->mq_ops->map_queue(q, ctx->cpu); 909 + 910 + trace_block_getrq(q, bio, rw); 911 + rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false); 912 + if (likely(rq)) 913 + blk_mq_rq_ctx_init(ctx, rq, rw); 914 + else { 915 + blk_mq_put_ctx(ctx); 916 + trace_block_sleeprq(q, bio, rw); 917 + rq = blk_mq_alloc_request_pinned(q, rw, __GFP_WAIT|GFP_ATOMIC, 918 + false); 919 + ctx = rq->mq_ctx; 920 + hctx = q->mq_ops->map_queue(q, ctx->cpu); 921 + } 922 + 923 + hctx->queued++; 924 + 925 + if (unlikely(is_flush_fua)) { 926 + blk_mq_bio_to_request(rq, bio); 927 + blk_mq_put_ctx(ctx); 928 + blk_insert_flush(rq); 929 + goto run_queue; 930 + } 931 + 932 + /* 933 + * A task plug currently exists. Since this is completely lockless, 934 + * utilize that to temporarily store requests until the task is 935 + * either done or scheduled away. 936 + */ 937 + if (use_plug) { 938 + struct blk_plug *plug = current->plug; 939 + 940 + if (plug) { 941 + blk_mq_bio_to_request(rq, bio); 942 + if (list_empty(&plug->list)) 943 + trace_block_plug(q); 944 + else if (request_count >= BLK_MAX_REQUEST_COUNT) { 945 + blk_flush_plug_list(plug, false); 946 + trace_block_plug(q); 947 + } 948 + list_add_tail(&rq->queuelist, &plug->mq_list); 949 + blk_mq_put_ctx(ctx); 950 + return; 951 + } 952 + } 953 + 954 + spin_lock(&ctx->lock); 955 + 956 + if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) && 957 + blk_mq_attempt_merge(q, ctx, bio)) 958 + __blk_mq_free_request(hctx, ctx, rq); 959 + else { 960 + blk_mq_bio_to_request(rq, bio); 961 + __blk_mq_insert_request(hctx, rq); 962 + } 963 + 964 + spin_unlock(&ctx->lock); 965 + blk_mq_put_ctx(ctx); 966 + 967 + /* 968 + * For a SYNC request, send it to the hardware immediately. For an 969 + * ASYNC request, just ensure that we run it later on. The latter 970 + * allows for merging opportunities and more efficient dispatching. 
971 + */ 972 + run_queue: 973 + blk_mq_run_hw_queue(hctx, !is_sync || is_flush_fua); 974 + } 975 + 976 + /* 977 + * Default mapping to a software queue, since we use one per CPU. 978 + */ 979 + struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu) 980 + { 981 + return q->queue_hw_ctx[q->mq_map[cpu]]; 982 + } 983 + EXPORT_SYMBOL(blk_mq_map_queue); 984 + 985 + struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_reg *reg, 986 + unsigned int hctx_index) 987 + { 988 + return kmalloc_node(sizeof(struct blk_mq_hw_ctx), 989 + GFP_KERNEL | __GFP_ZERO, reg->numa_node); 990 + } 991 + EXPORT_SYMBOL(blk_mq_alloc_single_hw_queue); 992 + 993 + void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *hctx, 994 + unsigned int hctx_index) 995 + { 996 + kfree(hctx); 997 + } 998 + EXPORT_SYMBOL(blk_mq_free_single_hw_queue); 999 + 1000 + static void blk_mq_hctx_notify(void *data, unsigned long action, 1001 + unsigned int cpu) 1002 + { 1003 + struct blk_mq_hw_ctx *hctx = data; 1004 + struct blk_mq_ctx *ctx; 1005 + LIST_HEAD(tmp); 1006 + 1007 + if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) 1008 + return; 1009 + 1010 + /* 1011 + * Move ctx entries to new CPU, if this one is going away. 1012 + */ 1013 + ctx = __blk_mq_get_ctx(hctx->queue, cpu); 1014 + 1015 + spin_lock(&ctx->lock); 1016 + if (!list_empty(&ctx->rq_list)) { 1017 + list_splice_init(&ctx->rq_list, &tmp); 1018 + clear_bit(ctx->index_hw, hctx->ctx_map); 1019 + } 1020 + spin_unlock(&ctx->lock); 1021 + 1022 + if (list_empty(&tmp)) 1023 + return; 1024 + 1025 + ctx = blk_mq_get_ctx(hctx->queue); 1026 + spin_lock(&ctx->lock); 1027 + 1028 + while (!list_empty(&tmp)) { 1029 + struct request *rq; 1030 + 1031 + rq = list_first_entry(&tmp, struct request, queuelist); 1032 + rq->mq_ctx = ctx; 1033 + list_move_tail(&rq->queuelist, &ctx->rq_list); 1034 + } 1035 + 1036 + blk_mq_hctx_mark_pending(hctx, ctx); 1037 + 1038 + spin_unlock(&ctx->lock); 1039 + blk_mq_put_ctx(ctx); 1040 + } 1041 + 1042 + static void blk_mq_init_hw_commands(struct blk_mq_hw_ctx *hctx, 1043 + void (*init)(void *, struct blk_mq_hw_ctx *, 1044 + struct request *, unsigned int), 1045 + void *data) 1046 + { 1047 + unsigned int i; 1048 + 1049 + for (i = 0; i < hctx->queue_depth; i++) { 1050 + struct request *rq = hctx->rqs[i]; 1051 + 1052 + init(data, hctx, rq, i); 1053 + } 1054 + } 1055 + 1056 + void blk_mq_init_commands(struct request_queue *q, 1057 + void (*init)(void *, struct blk_mq_hw_ctx *, 1058 + struct request *, unsigned int), 1059 + void *data) 1060 + { 1061 + struct blk_mq_hw_ctx *hctx; 1062 + unsigned int i; 1063 + 1064 + queue_for_each_hw_ctx(q, hctx, i) 1065 + blk_mq_init_hw_commands(hctx, init, data); 1066 + } 1067 + EXPORT_SYMBOL(blk_mq_init_commands); 1068 + 1069 + static void blk_mq_free_rq_map(struct blk_mq_hw_ctx *hctx) 1070 + { 1071 + struct page *page; 1072 + 1073 + while (!list_empty(&hctx->page_list)) { 1074 + page = list_first_entry(&hctx->page_list, struct page, list); 1075 + list_del_init(&page->list); 1076 + __free_pages(page, page->private); 1077 + } 1078 + 1079 + kfree(hctx->rqs); 1080 + 1081 + if (hctx->tags) 1082 + blk_mq_free_tags(hctx->tags); 1083 + } 1084 + 1085 + static size_t order_to_size(unsigned int order) 1086 + { 1087 + size_t ret = PAGE_SIZE; 1088 + 1089 + while (order--) 1090 + ret *= 2; 1091 + 1092 + return ret; 1093 + } 1094 + 1095 + static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx, 1096 + unsigned int reserved_tags, int node) 1097 + { 1098 + unsigned int i, j, entries_per_page, max_order = 4; 1099 + 
size_t rq_size, left; 1100 + 1101 + INIT_LIST_HEAD(&hctx->page_list); 1102 + 1103 + hctx->rqs = kmalloc_node(hctx->queue_depth * sizeof(struct request *), 1104 + GFP_KERNEL, node); 1105 + if (!hctx->rqs) 1106 + return -ENOMEM; 1107 + 1108 + /* 1109 + * rq_size is the size of the request plus driver payload, rounded 1110 + * to the cacheline size 1111 + */ 1112 + rq_size = round_up(sizeof(struct request) + hctx->cmd_size, 1113 + cache_line_size()); 1114 + left = rq_size * hctx->queue_depth; 1115 + 1116 + for (i = 0; i < hctx->queue_depth;) { 1117 + int this_order = max_order; 1118 + struct page *page; 1119 + int to_do; 1120 + void *p; 1121 + 1122 + while (left < order_to_size(this_order - 1) && this_order) 1123 + this_order--; 1124 + 1125 + do { 1126 + page = alloc_pages_node(node, GFP_KERNEL, this_order); 1127 + if (page) 1128 + break; 1129 + if (!this_order--) 1130 + break; 1131 + if (order_to_size(this_order) < rq_size) 1132 + break; 1133 + } while (1); 1134 + 1135 + if (!page) 1136 + break; 1137 + 1138 + page->private = this_order; 1139 + list_add_tail(&page->list, &hctx->page_list); 1140 + 1141 + p = page_address(page); 1142 + entries_per_page = order_to_size(this_order) / rq_size; 1143 + to_do = min(entries_per_page, hctx->queue_depth - i); 1144 + left -= to_do * rq_size; 1145 + for (j = 0; j < to_do; j++) { 1146 + hctx->rqs[i] = p; 1147 + blk_mq_rq_init(hctx, hctx->rqs[i]); 1148 + p += rq_size; 1149 + i++; 1150 + } 1151 + } 1152 + 1153 + if (i < (reserved_tags + BLK_MQ_TAG_MIN)) 1154 + goto err_rq_map; 1155 + else if (i != hctx->queue_depth) { 1156 + hctx->queue_depth = i; 1157 + pr_warn("%s: queue depth set to %u because of low memory\n", 1158 + __func__, i); 1159 + } 1160 + 1161 + hctx->tags = blk_mq_init_tags(hctx->queue_depth, reserved_tags, node); 1162 + if (!hctx->tags) { 1163 + err_rq_map: 1164 + blk_mq_free_rq_map(hctx); 1165 + return -ENOMEM; 1166 + } 1167 + 1168 + return 0; 1169 + } 1170 + 1171 + static int blk_mq_init_hw_queues(struct request_queue *q, 1172 + struct blk_mq_reg *reg, void *driver_data) 1173 + { 1174 + struct blk_mq_hw_ctx *hctx; 1175 + unsigned int i, j; 1176 + 1177 + /* 1178 + * Initialize hardware queues 1179 + */ 1180 + queue_for_each_hw_ctx(q, hctx, i) { 1181 + unsigned int num_maps; 1182 + int node; 1183 + 1184 + node = hctx->numa_node; 1185 + if (node == NUMA_NO_NODE) 1186 + node = hctx->numa_node = reg->numa_node; 1187 + 1188 + INIT_DELAYED_WORK(&hctx->delayed_work, blk_mq_work_fn); 1189 + spin_lock_init(&hctx->lock); 1190 + INIT_LIST_HEAD(&hctx->dispatch); 1191 + hctx->queue = q; 1192 + hctx->queue_num = i; 1193 + hctx->flags = reg->flags; 1194 + hctx->queue_depth = reg->queue_depth; 1195 + hctx->cmd_size = reg->cmd_size; 1196 + 1197 + blk_mq_init_cpu_notifier(&hctx->cpu_notifier, 1198 + blk_mq_hctx_notify, hctx); 1199 + blk_mq_register_cpu_notifier(&hctx->cpu_notifier); 1200 + 1201 + if (blk_mq_init_rq_map(hctx, reg->reserved_tags, node)) 1202 + break; 1203 + 1204 + /* 1205 + * Allocate space for all possible cpus to avoid allocation in 1206 + * runtime 1207 + */ 1208 + hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *), 1209 + GFP_KERNEL, node); 1210 + if (!hctx->ctxs) 1211 + break; 1212 + 1213 + num_maps = ALIGN(nr_cpu_ids, BITS_PER_LONG) / BITS_PER_LONG; 1214 + hctx->ctx_map = kzalloc_node(num_maps * sizeof(unsigned long), 1215 + GFP_KERNEL, node); 1216 + if (!hctx->ctx_map) 1217 + break; 1218 + 1219 + hctx->nr_ctx_map = num_maps; 1220 + hctx->nr_ctx = 0; 1221 + 1222 + if (reg->ops->init_hctx && 1223 + reg->ops->init_hctx(hctx, driver_data, 
i)) 1224 + break; 1225 + } 1226 + 1227 + if (i == q->nr_hw_queues) 1228 + return 0; 1229 + 1230 + /* 1231 + * Init failed 1232 + */ 1233 + queue_for_each_hw_ctx(q, hctx, j) { 1234 + if (i == j) 1235 + break; 1236 + 1237 + if (reg->ops->exit_hctx) 1238 + reg->ops->exit_hctx(hctx, j); 1239 + 1240 + blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); 1241 + blk_mq_free_rq_map(hctx); 1242 + kfree(hctx->ctxs); 1243 + } 1244 + 1245 + return 1; 1246 + } 1247 + 1248 + static void blk_mq_init_cpu_queues(struct request_queue *q, 1249 + unsigned int nr_hw_queues) 1250 + { 1251 + unsigned int i; 1252 + 1253 + for_each_possible_cpu(i) { 1254 + struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i); 1255 + struct blk_mq_hw_ctx *hctx; 1256 + 1257 + memset(__ctx, 0, sizeof(*__ctx)); 1258 + __ctx->cpu = i; 1259 + spin_lock_init(&__ctx->lock); 1260 + INIT_LIST_HEAD(&__ctx->rq_list); 1261 + __ctx->queue = q; 1262 + 1263 + /* If the cpu isn't online, the cpu is mapped to first hctx */ 1264 + hctx = q->mq_ops->map_queue(q, i); 1265 + hctx->nr_ctx++; 1266 + 1267 + if (!cpu_online(i)) 1268 + continue; 1269 + 1270 + /* 1271 + * Set local node, IFF we have more than one hw queue. If 1272 + * not, we remain on the home node of the device 1273 + */ 1274 + if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) 1275 + hctx->numa_node = cpu_to_node(i); 1276 + } 1277 + } 1278 + 1279 + static void blk_mq_map_swqueue(struct request_queue *q) 1280 + { 1281 + unsigned int i; 1282 + struct blk_mq_hw_ctx *hctx; 1283 + struct blk_mq_ctx *ctx; 1284 + 1285 + queue_for_each_hw_ctx(q, hctx, i) { 1286 + hctx->nr_ctx = 0; 1287 + } 1288 + 1289 + /* 1290 + * Map software to hardware queues 1291 + */ 1292 + queue_for_each_ctx(q, ctx, i) { 1293 + /* If the cpu isn't online, the cpu is mapped to first hctx */ 1294 + hctx = q->mq_ops->map_queue(q, i); 1295 + ctx->index_hw = hctx->nr_ctx; 1296 + hctx->ctxs[hctx->nr_ctx++] = ctx; 1297 + } 1298 + } 1299 + 1300 + struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg, 1301 + void *driver_data) 1302 + { 1303 + struct blk_mq_hw_ctx **hctxs; 1304 + struct blk_mq_ctx *ctx; 1305 + struct request_queue *q; 1306 + int i; 1307 + 1308 + if (!reg->nr_hw_queues || 1309 + !reg->ops->queue_rq || !reg->ops->map_queue || 1310 + !reg->ops->alloc_hctx || !reg->ops->free_hctx) 1311 + return ERR_PTR(-EINVAL); 1312 + 1313 + if (!reg->queue_depth) 1314 + reg->queue_depth = BLK_MQ_MAX_DEPTH; 1315 + else if (reg->queue_depth > BLK_MQ_MAX_DEPTH) { 1316 + pr_err("blk-mq: queuedepth too large (%u)\n", reg->queue_depth); 1317 + reg->queue_depth = BLK_MQ_MAX_DEPTH; 1318 + } 1319 + 1320 + if (reg->queue_depth < (reg->reserved_tags + BLK_MQ_TAG_MIN)) 1321 + return ERR_PTR(-EINVAL); 1322 + 1323 + ctx = alloc_percpu(struct blk_mq_ctx); 1324 + if (!ctx) 1325 + return ERR_PTR(-ENOMEM); 1326 + 1327 + hctxs = kmalloc_node(reg->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL, 1328 + reg->numa_node); 1329 + 1330 + if (!hctxs) 1331 + goto err_percpu; 1332 + 1333 + for (i = 0; i < reg->nr_hw_queues; i++) { 1334 + hctxs[i] = reg->ops->alloc_hctx(reg, i); 1335 + if (!hctxs[i]) 1336 + goto err_hctxs; 1337 + 1338 + hctxs[i]->numa_node = NUMA_NO_NODE; 1339 + hctxs[i]->queue_num = i; 1340 + } 1341 + 1342 + q = blk_alloc_queue_node(GFP_KERNEL, reg->numa_node); 1343 + if (!q) 1344 + goto err_hctxs; 1345 + 1346 + q->mq_map = blk_mq_make_queue_map(reg); 1347 + if (!q->mq_map) 1348 + goto err_map; 1349 + 1350 + setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); 1351 + blk_queue_rq_timeout(q, 30000); 1352 + 1353 + q->nr_queues 
= nr_cpu_ids; 1354 + q->nr_hw_queues = reg->nr_hw_queues; 1355 + 1356 + q->queue_ctx = ctx; 1357 + q->queue_hw_ctx = hctxs; 1358 + 1359 + q->mq_ops = reg->ops; 1360 + 1361 + blk_queue_make_request(q, blk_mq_make_request); 1362 + blk_queue_rq_timed_out(q, reg->ops->timeout); 1363 + if (reg->timeout) 1364 + blk_queue_rq_timeout(q, reg->timeout); 1365 + 1366 + blk_mq_init_flush(q); 1367 + blk_mq_init_cpu_queues(q, reg->nr_hw_queues); 1368 + 1369 + if (blk_mq_init_hw_queues(q, reg, driver_data)) 1370 + goto err_hw; 1371 + 1372 + blk_mq_map_swqueue(q); 1373 + 1374 + mutex_lock(&all_q_mutex); 1375 + list_add_tail(&q->all_q_node, &all_q_list); 1376 + mutex_unlock(&all_q_mutex); 1377 + 1378 + return q; 1379 + err_hw: 1380 + kfree(q->mq_map); 1381 + err_map: 1382 + blk_cleanup_queue(q); 1383 + err_hctxs: 1384 + for (i = 0; i < reg->nr_hw_queues; i++) { 1385 + if (!hctxs[i]) 1386 + break; 1387 + reg->ops->free_hctx(hctxs[i], i); 1388 + } 1389 + kfree(hctxs); 1390 + err_percpu: 1391 + free_percpu(ctx); 1392 + return ERR_PTR(-ENOMEM); 1393 + } 1394 + EXPORT_SYMBOL(blk_mq_init_queue); 1395 + 1396 + void blk_mq_free_queue(struct request_queue *q) 1397 + { 1398 + struct blk_mq_hw_ctx *hctx; 1399 + int i; 1400 + 1401 + queue_for_each_hw_ctx(q, hctx, i) { 1402 + cancel_delayed_work_sync(&hctx->delayed_work); 1403 + kfree(hctx->ctx_map); 1404 + kfree(hctx->ctxs); 1405 + blk_mq_free_rq_map(hctx); 1406 + blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); 1407 + if (q->mq_ops->exit_hctx) 1408 + q->mq_ops->exit_hctx(hctx, i); 1409 + q->mq_ops->free_hctx(hctx, i); 1410 + } 1411 + 1412 + free_percpu(q->queue_ctx); 1413 + kfree(q->queue_hw_ctx); 1414 + kfree(q->mq_map); 1415 + 1416 + q->queue_ctx = NULL; 1417 + q->queue_hw_ctx = NULL; 1418 + q->mq_map = NULL; 1419 + 1420 + mutex_lock(&all_q_mutex); 1421 + list_del_init(&q->all_q_node); 1422 + mutex_unlock(&all_q_mutex); 1423 + } 1424 + EXPORT_SYMBOL(blk_mq_free_queue); 1425 + 1426 + /* Basically redo blk_mq_init_queue with queue frozen */ 1427 + static void __cpuinit blk_mq_queue_reinit(struct request_queue *q) 1428 + { 1429 + blk_mq_freeze_queue(q); 1430 + 1431 + blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues); 1432 + 1433 + /* 1434 + * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe 1435 + * we should change hctx numa_node according to new topology (this 1436 + * involves free and re-allocate memory, worthy doing?) 1437 + */ 1438 + 1439 + blk_mq_map_swqueue(q); 1440 + 1441 + blk_mq_unfreeze_queue(q); 1442 + } 1443 + 1444 + static int __cpuinit blk_mq_queue_reinit_notify(struct notifier_block *nb, 1445 + unsigned long action, void *hcpu) 1446 + { 1447 + struct request_queue *q; 1448 + 1449 + /* 1450 + * Before new mapping is established, hotadded cpu might already start 1451 + * handling requests. This doesn't break anything as we map offline 1452 + * CPUs to first hardware queue. We will re-init queue below to get 1453 + * optimal settings. 
1454 + */ 1455 + if (action != CPU_DEAD && action != CPU_DEAD_FROZEN && 1456 + action != CPU_ONLINE && action != CPU_ONLINE_FROZEN) 1457 + return NOTIFY_OK; 1458 + 1459 + mutex_lock(&all_q_mutex); 1460 + list_for_each_entry(q, &all_q_list, all_q_node) 1461 + blk_mq_queue_reinit(q); 1462 + mutex_unlock(&all_q_mutex); 1463 + return NOTIFY_OK; 1464 + } 1465 + 1466 + static int __init blk_mq_init(void) 1467 + { 1468 + unsigned int i; 1469 + 1470 + for_each_possible_cpu(i) 1471 + init_llist_head(&per_cpu(ipi_lists, i)); 1472 + 1473 + blk_mq_cpu_init(); 1474 + 1475 + /* Must be called after percpu_counter_hotcpu_callback() */ 1476 + hotcpu_notifier(blk_mq_queue_reinit_notify, -10); 1477 + 1478 + return 0; 1479 + } 1480 + subsys_initcall(blk_mq_init);
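Note on the request preallocation above: blk_mq_init_rq_map() packs the preallocated requests, each sized to sizeof(struct request) plus the driver's cmd_size and rounded up to a cache line, into the largest page allocations it can get (order 4 and falling back downwards). A standalone sketch of that arithmetic follows; the request size, cmd_size, page size and cache line below are example assumptions, not values from a real build.

#include <stdio.h>
#include <stddef.h>

/* Mirrors round_up() and order_to_size() as used by blk_mq_init_rq_map(). */
static size_t round_up_to(size_t v, size_t align)
{
	return (v + align - 1) & ~(align - 1);
}

static size_t order_to_size(unsigned int order, size_t page_size)
{
	return page_size << order;
}

int main(void)
{
	size_t page_size = 4096, cache_line = 64;
	size_t sizeof_request = 376;	/* assumed; depends on kernel config */
	size_t cmd_size = 64;		/* driver payload from blk_mq_reg */
	unsigned int order = 4;		/* max_order used above */

	size_t rq_size = round_up_to(sizeof_request + cmd_size, cache_line);
	size_t chunk = order_to_size(order, page_size);

	printf("rq_size=%zu, %zu requests per order-%u allocation\n",
	       rq_size, chunk / rq_size, order);
	return 0;
}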
+52
block/blk-mq.h
··· 1 + #ifndef INT_BLK_MQ_H 2 + #define INT_BLK_MQ_H 3 + 4 + struct blk_mq_ctx { 5 + struct { 6 + spinlock_t lock; 7 + struct list_head rq_list; 8 + } ____cacheline_aligned_in_smp; 9 + 10 + unsigned int cpu; 11 + unsigned int index_hw; 12 + unsigned int ipi_redirect; 13 + 14 + /* incremented at dispatch time */ 15 + unsigned long rq_dispatched[2]; 16 + unsigned long rq_merged; 17 + 18 + /* incremented at completion time */ 19 + unsigned long ____cacheline_aligned_in_smp rq_completed[2]; 20 + 21 + struct request_queue *queue; 22 + struct kobject kobj; 23 + }; 24 + 25 + void __blk_mq_end_io(struct request *rq, int error); 26 + void blk_mq_complete_request(struct request *rq, int error); 27 + void blk_mq_run_request(struct request *rq, bool run_queue, bool async); 28 + void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); 29 + void blk_mq_init_flush(struct request_queue *q); 30 + 31 + /* 32 + * CPU hotplug helpers 33 + */ 34 + struct blk_mq_cpu_notifier; 35 + void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier, 36 + void (*fn)(void *, unsigned long, unsigned int), 37 + void *data); 38 + void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier); 39 + void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier); 40 + void blk_mq_cpu_init(void); 41 + DECLARE_PER_CPU(struct llist_head, ipi_lists); 42 + 43 + /* 44 + * CPU -> queue mappings 45 + */ 46 + struct blk_mq_reg; 47 + extern unsigned int *blk_mq_make_queue_map(struct blk_mq_reg *reg); 48 + extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues); 49 + 50 + void blk_mq_add_timer(struct request *rq); 51 + 52 + #endif
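struct blk_mq_ctx keeps the submission-side fields (lock, rq_list, dispatch counters) and the completion-side counter on separate cache lines, and ipi_redirect decides whether completions are bounced back to the submitting CPU. The lookup helpers that hand out the current CPU's software queue live earlier in blk-mq.c and are not part of this excerpt; based on how blk_mq_init_cpu_queues() uses per_cpu_ptr() on q->queue_ctx, they presumably look roughly like the hedged sketch below.

/* Hedged sketch only; the real helpers are defined in blk-mq.c. */
static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
						  unsigned int cpu)
{
	return per_cpu_ptr(q->queue_ctx, cpu);
}

static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
{
	return __blk_mq_get_ctx(q, get_cpu());	/* paired with blk_mq_put_ctx() */
}

static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
{
	put_cpu();
}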
+13
block/blk-sysfs.c
··· 7 7 #include <linux/bio.h> 8 8 #include <linux/blkdev.h> 9 9 #include <linux/blktrace_api.h> 10 + #include <linux/blk-mq.h> 10 11 11 12 #include "blk.h" 12 13 #include "blk-cgroup.h" ··· 543 542 if (q->queue_tags) 544 543 __blk_queue_free_tags(q); 545 544 545 + percpu_counter_destroy(&q->mq_usage_counter); 546 + 547 + if (q->mq_ops) 548 + blk_mq_free_queue(q); 549 + 546 550 blk_trace_shutdown(q); 547 551 548 552 bdi_destroy(&q->backing_dev_info); ··· 581 575 * bypass from queue allocation. 582 576 */ 583 577 blk_queue_bypass_end(q); 578 + queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q); 584 579 585 580 ret = blk_trace_init_sysfs(dev); 586 581 if (ret) ··· 594 587 } 595 588 596 589 kobject_uevent(&q->kobj, KOBJ_ADD); 590 + 591 + if (q->mq_ops) 592 + blk_mq_register_disk(disk); 597 593 598 594 if (!q->request_fn) 599 595 return 0; ··· 619 609 620 610 if (WARN_ON(!q)) 621 611 return; 612 + 613 + if (q->mq_ops) 614 + blk_mq_unregister_disk(disk); 622 615 623 616 if (q->request_fn) 624 617 elv_unregister_queue(q);
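With the hooks above, an mq driver keeps the standard teardown pattern: del_gendisk() reaches blk_unregister_queue() and thus blk_mq_unregister_disk(), and once the last queue reference is dropped blk_release_queue() destroys mq_usage_counter and calls blk_mq_free_queue(). A minimal sketch, with a hypothetical mydrv structure:

#include <linux/genhd.h>
#include <linux/blkdev.h>

struct mydrv {				/* hypothetical driver state */
	struct gendisk *disk;
	struct request_queue *queue;
};

static void mydrv_remove(struct mydrv *dev)
{
	del_gendisk(dev->disk);		/* unregisters mq sysfs via blk_mq_unregister_disk() */
	blk_cleanup_queue(dev->queue);	/* last ref -> blk_release_queue() -> blk_mq_free_queue() */
	put_disk(dev->disk);
}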
+46 -27
block/blk-timeout.c
··· 7 7 #include <linux/fault-inject.h> 8 8 9 9 #include "blk.h" 10 + #include "blk-mq.h" 10 11 11 12 #ifdef CONFIG_FAIL_IO_TIMEOUT 12 13 ··· 89 88 ret = q->rq_timed_out_fn(req); 90 89 switch (ret) { 91 90 case BLK_EH_HANDLED: 92 - __blk_complete_request(req); 91 + /* Can we use req->errors here? */ 92 + if (q->mq_ops) 93 + blk_mq_complete_request(req, req->errors); 94 + else 95 + __blk_complete_request(req); 93 96 break; 94 97 case BLK_EH_RESET_TIMER: 95 98 blk_clear_rq_complete(req); 96 - blk_add_timer(req); 99 + if (q->mq_ops) 100 + blk_mq_add_timer(req); 101 + else 102 + blk_add_timer(req); 97 103 break; 98 104 case BLK_EH_NOT_HANDLED: 99 105 /* ··· 116 108 } 117 109 } 118 110 111 + void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, 112 + unsigned int *next_set) 113 + { 114 + if (time_after_eq(jiffies, rq->deadline)) { 115 + list_del_init(&rq->timeout_list); 116 + 117 + /* 118 + * Check if we raced with end io completion 119 + */ 120 + if (!blk_mark_rq_complete(rq)) 121 + blk_rq_timed_out(rq); 122 + } else if (!*next_set || time_after(*next_timeout, rq->deadline)) { 123 + *next_timeout = rq->deadline; 124 + *next_set = 1; 125 + } 126 + } 127 + 119 128 void blk_rq_timed_out_timer(unsigned long data) 120 129 { 121 130 struct request_queue *q = (struct request_queue *) data; ··· 142 117 143 118 spin_lock_irqsave(q->queue_lock, flags); 144 119 145 - list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list) { 146 - if (time_after_eq(jiffies, rq->deadline)) { 147 - list_del_init(&rq->timeout_list); 148 - 149 - /* 150 - * Check if we raced with end io completion 151 - */ 152 - if (blk_mark_rq_complete(rq)) 153 - continue; 154 - blk_rq_timed_out(rq); 155 - } else if (!next_set || time_after(next, rq->deadline)) { 156 - next = rq->deadline; 157 - next_set = 1; 158 - } 159 - } 120 + list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list) 121 + blk_rq_check_expired(rq, &next, &next_set); 160 122 161 123 if (next_set) 162 124 mod_timer(&q->timeout, round_jiffies_up(next)); ··· 169 157 } 170 158 EXPORT_SYMBOL_GPL(blk_abort_request); 171 159 172 - /** 173 - * blk_add_timer - Start timeout timer for a single request 174 - * @req: request that is about to start running. 175 - * 176 - * Notes: 177 - * Each request has its own timer, and as it is added to the queue, we 178 - * set up the timer. When the request completes, we cancel the timer. 179 - */ 180 - void blk_add_timer(struct request *req) 160 + void __blk_add_timer(struct request *req, struct list_head *timeout_list) 181 161 { 182 162 struct request_queue *q = req->q; 183 163 unsigned long expiry; ··· 188 184 req->timeout = q->rq_timeout; 189 185 190 186 req->deadline = jiffies + req->timeout; 191 - list_add_tail(&req->timeout_list, &q->timeout_list); 187 + if (timeout_list) 188 + list_add_tail(&req->timeout_list, timeout_list); 192 189 193 190 /* 194 191 * If the timer isn't already pending or this timeout is earlier ··· 201 196 if (!timer_pending(&q->timeout) || 202 197 time_before(expiry, q->timeout.expires)) 203 198 mod_timer(&q->timeout, expiry); 199 + 200 + } 201 + 202 + /** 203 + * blk_add_timer - Start timeout timer for a single request 204 + * @req: request that is about to start running. 205 + * 206 + * Notes: 207 + * Each request has its own timer, and as it is added to the queue, we 208 + * set up the timer. When the request completes, we cancel the timer. 209 + */ 210 + void blk_add_timer(struct request *req) 211 + { 212 + __blk_add_timer(req, &req->q->timeout_list); 204 213 } 205 214
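The timeout handler a driver supplies through blk_mq_reg->ops->timeout (installed by blk_queue_rq_timed_out() in blk_mq_init_queue()) plugs into the switch above: BLK_EH_HANDLED lets the core complete via blk_mq_complete_request(), BLK_EH_RESET_TIMER re-arms via blk_mq_add_timer(). A hedged sketch, where the abort primitive is hypothetical:

#include <linux/blkdev.h>
#include <linux/blk-mq.h>

/* mydrv_try_abort() is a hypothetical per-device abort primitive. */
extern int mydrv_try_abort(struct request *rq);

static enum blk_eh_timer_return mydrv_timeout(struct request *rq)
{
	if (mydrv_try_abort(rq) == 0) {
		rq->errors = -ETIMEDOUT;
		return BLK_EH_HANDLED;	/* core completes via blk_mq_complete_request() */
	}

	return BLK_EH_RESET_TIMER;	/* core re-arms via blk_mq_add_timer() */
}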
+17
block/blk.h
··· 10 10 #define BLK_BATCH_REQ 32 11 11 12 12 extern struct kmem_cache *blk_requestq_cachep; 13 + extern struct kmem_cache *request_cachep; 13 14 extern struct kobj_type blk_queue_ktype; 14 15 extern struct ida blk_queue_ida; 15 16 ··· 35 34 unsigned int nr_bytes, unsigned int bidi_bytes); 36 35 37 36 void blk_rq_timed_out_timer(unsigned long data); 37 + void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, 38 + unsigned int *next_set); 39 + void __blk_add_timer(struct request *req, struct list_head *timeout_list); 38 40 void blk_delete_timer(struct request *); 39 41 void blk_add_timer(struct request *); 42 + 43 + 44 + bool bio_attempt_front_merge(struct request_queue *q, struct request *req, 45 + struct bio *bio); 46 + bool bio_attempt_back_merge(struct request_queue *q, struct request *req, 47 + struct bio *bio); 48 + bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, 49 + unsigned int *request_count); 50 + 51 + void blk_account_io_start(struct request *req, bool new_io); 52 + void blk_account_io_completion(struct request *req, unsigned int bytes); 53 + void blk_account_io_done(struct request *req); 40 54 41 55 /* 42 56 * Internal atomic flags for request handling 43 57 */ 44 58 enum rq_atomic_flags { 45 59 REQ_ATOM_COMPLETE = 0, 60 + REQ_ATOM_STARTED, 46 61 }; 47 62 48 63 /*
+2
include/linux/bio.h
··· 419 419 bl->head = bl->tail = NULL; 420 420 } 421 421 422 + #define BIO_EMPTY_LIST { NULL, NULL } 423 + 422 424 #define bio_list_for_each(bio, bl) \ 423 425 for (bio = (bl)->head; bio; bio = bio->bi_next) 424 426
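BIO_EMPTY_LIST matches what bio_list_init() sets up, so a bio_list can now be initialized at definition time instead of at runtime; for example (the variable name is illustrative):

#include <linux/bio.h>

static struct bio_list mydrv_deferred = BIO_EMPTY_LIST;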
+182
include/linux/blk-mq.h
··· 1 + #ifndef BLK_MQ_H 2 + #define BLK_MQ_H 3 + 4 + #include <linux/blkdev.h> 5 + 6 + struct blk_mq_tags; 7 + 8 + struct blk_mq_cpu_notifier { 9 + struct list_head list; 10 + void *data; 11 + void (*notify)(void *data, unsigned long action, unsigned int cpu); 12 + }; 13 + 14 + struct blk_mq_hw_ctx { 15 + struct { 16 + spinlock_t lock; 17 + struct list_head dispatch; 18 + } ____cacheline_aligned_in_smp; 19 + 20 + unsigned long state; /* BLK_MQ_S_* flags */ 21 + struct delayed_work delayed_work; 22 + 23 + unsigned long flags; /* BLK_MQ_F_* flags */ 24 + 25 + struct request_queue *queue; 26 + unsigned int queue_num; 27 + 28 + void *driver_data; 29 + 30 + unsigned int nr_ctx; 31 + struct blk_mq_ctx **ctxs; 32 + unsigned int nr_ctx_map; 33 + unsigned long *ctx_map; 34 + 35 + struct request **rqs; 36 + struct list_head page_list; 37 + struct blk_mq_tags *tags; 38 + 39 + unsigned long queued; 40 + unsigned long run; 41 + #define BLK_MQ_MAX_DISPATCH_ORDER 10 42 + unsigned long dispatched[BLK_MQ_MAX_DISPATCH_ORDER]; 43 + 44 + unsigned int queue_depth; 45 + unsigned int numa_node; 46 + unsigned int cmd_size; /* per-request extra data */ 47 + 48 + struct blk_mq_cpu_notifier cpu_notifier; 49 + struct kobject kobj; 50 + }; 51 + 52 + struct blk_mq_reg { 53 + struct blk_mq_ops *ops; 54 + unsigned int nr_hw_queues; 55 + unsigned int queue_depth; 56 + unsigned int reserved_tags; 57 + unsigned int cmd_size; /* per-request extra data */ 58 + int numa_node; 59 + unsigned int timeout; 60 + unsigned int flags; /* BLK_MQ_F_* */ 61 + }; 62 + 63 + typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *); 64 + typedef struct blk_mq_hw_ctx *(map_queue_fn)(struct request_queue *, const int); 65 + typedef struct blk_mq_hw_ctx *(alloc_hctx_fn)(struct blk_mq_reg *,unsigned int); 66 + typedef void (free_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int); 67 + typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int); 68 + typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int); 69 + 70 + struct blk_mq_ops { 71 + /* 72 + * Queue request 73 + */ 74 + queue_rq_fn *queue_rq; 75 + 76 + /* 77 + * Map to specific hardware queue 78 + */ 79 + map_queue_fn *map_queue; 80 + 81 + /* 82 + * Called on request timeout 83 + */ 84 + rq_timed_out_fn *timeout; 85 + 86 + /* 87 + * Override for hctx allocations (should probably go) 88 + */ 89 + alloc_hctx_fn *alloc_hctx; 90 + free_hctx_fn *free_hctx; 91 + 92 + /* 93 + * Called when the block layer side of a hardware queue has been 94 + * set up, allowing the driver to allocate/init matching structures. 95 + * Ditto for exit/teardown. 
96 + */ 97 + init_hctx_fn *init_hctx; 98 + exit_hctx_fn *exit_hctx; 99 + }; 100 + 101 + enum { 102 + BLK_MQ_RQ_QUEUE_OK = 0, /* queued fine */ 103 + BLK_MQ_RQ_QUEUE_BUSY = 1, /* requeue IO for later */ 104 + BLK_MQ_RQ_QUEUE_ERROR = 2, /* end IO with error */ 105 + 106 + BLK_MQ_F_SHOULD_MERGE = 1 << 0, 107 + BLK_MQ_F_SHOULD_SORT = 1 << 1, 108 + BLK_MQ_F_SHOULD_IPI = 1 << 2, 109 + 110 + BLK_MQ_S_STOPPED = 1 << 0, 111 + 112 + BLK_MQ_MAX_DEPTH = 2048, 113 + }; 114 + 115 + struct request_queue *blk_mq_init_queue(struct blk_mq_reg *, void *); 116 + void blk_mq_free_queue(struct request_queue *); 117 + int blk_mq_register_disk(struct gendisk *); 118 + void blk_mq_unregister_disk(struct gendisk *); 119 + void blk_mq_init_commands(struct request_queue *, void (*init)(void *data, struct blk_mq_hw_ctx *, struct request *, unsigned int), void *data); 120 + 121 + void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); 122 + 123 + void blk_mq_insert_request(struct request_queue *, struct request *, bool); 124 + void blk_mq_run_queues(struct request_queue *q, bool async); 125 + void blk_mq_free_request(struct request *rq); 126 + bool blk_mq_can_queue(struct blk_mq_hw_ctx *); 127 + struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp); 128 + struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw, gfp_t gfp); 129 + struct request *blk_mq_rq_from_tag(struct request_queue *q, unsigned int tag); 130 + 131 + struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index); 132 + struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_reg *, unsigned int); 133 + void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *, unsigned int); 134 + 135 + void blk_mq_end_io(struct request *rq, int error); 136 + 137 + void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx); 138 + void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx); 139 + void blk_mq_start_stopped_hw_queues(struct request_queue *q); 140 + 141 + /* 142 + * Driver command data is immediately after the request. So subtract request 143 + * size to get back to the original request. 144 + */ 145 + static inline struct request *blk_mq_rq_from_pdu(void *pdu) 146 + { 147 + return pdu - sizeof(struct request); 148 + } 149 + static inline void *blk_mq_rq_to_pdu(struct request *rq) 150 + { 151 + return (void *) rq + sizeof(*rq); 152 + } 153 + 154 + static inline struct request *blk_mq_tag_to_rq(struct blk_mq_hw_ctx *hctx, 155 + unsigned int tag) 156 + { 157 + return hctx->rqs[tag]; 158 + } 159 + 160 + #define queue_for_each_hw_ctx(q, hctx, i) \ 161 + for ((i) = 0, hctx = (q)->queue_hw_ctx[0]; \ 162 + (i) < (q)->nr_hw_queues; (i)++, hctx = (q)->queue_hw_ctx[i]) 163 + 164 + #define queue_for_each_ctx(q, ctx, i) \ 165 + for ((i) = 0, ctx = per_cpu_ptr((q)->queue_ctx, 0); \ 166 + (i) < (q)->nr_queues; (i)++, ctx = per_cpu_ptr(q->queue_ctx, (i))) 167 + 168 + #define hctx_for_each_ctx(hctx, ctx, i) \ 169 + for ((i) = 0, ctx = (hctx)->ctxs[0]; \ 170 + (i) < (hctx)->nr_ctx; (i)++, ctx = (hctx)->ctxs[(i)]) 171 + 172 + #define blk_ctx_sum(q, sum) \ 173 + ({ \ 174 + struct blk_mq_ctx *__x; \ 175 + unsigned int __ret = 0, __i; \ 176 + \ 177 + queue_for_each_ctx((q), __x, __i) \ 178 + __ret += sum; \ 179 + __ret; \ 180 + }) 181 + 182 + #endif
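Putting the exported pieces together, a minimal consumer fills in blk_mq_ops and blk_mq_reg, calls blk_mq_init_queue(), and gets its per-request payload via blk_mq_rq_to_pdu(). The sketch below uses the default blk_mq_map_queue()/blk_mq_alloc_single_hw_queue()/blk_mq_free_single_hw_queue() helpers; everything prefixed mydrv_ is hypothetical driver code, not part of this patch.

#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/err.h>

struct mydrv {				/* hypothetical driver state */
	struct request_queue *queue;
};

struct mydrv_cmd {			/* per-request payload, see cmd_size */
	u32 hw_tag;
};

extern bool mydrv_submit(struct mydrv *dev, struct request *rq,
			 struct mydrv_cmd *cmd);

static int mydrv_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
{
	struct mydrv_cmd *cmd = blk_mq_rq_to_pdu(rq);

	if (!mydrv_submit(hctx->driver_data, rq, cmd))
		return BLK_MQ_RQ_QUEUE_BUSY;	/* device full, retried later */

	return BLK_MQ_RQ_QUEUE_OK;
}

static int mydrv_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
			   unsigned int index)
{
	hctx->driver_data = data;	/* pointer handed to blk_mq_init_queue() */
	return 0;
}

static struct blk_mq_ops mydrv_mq_ops = {
	.queue_rq	= mydrv_queue_rq,
	.map_queue	= blk_mq_map_queue,		/* default per-cpu mapping */
	.alloc_hctx	= blk_mq_alloc_single_hw_queue,
	.free_hctx	= blk_mq_free_single_hw_queue,
	.init_hctx	= mydrv_init_hctx,
};

static struct blk_mq_reg mydrv_mq_reg = {
	.ops		= &mydrv_mq_ops,
	.nr_hw_queues	= 1,
	.queue_depth	= 64,
	.cmd_size	= sizeof(struct mydrv_cmd),
	.numa_node	= NUMA_NO_NODE,
	.flags		= BLK_MQ_F_SHOULD_MERGE,
};

static int mydrv_init_queue(struct mydrv *dev)
{
	dev->queue = blk_mq_init_queue(&mydrv_mq_reg, dev);
	return IS_ERR(dev->queue) ? PTR_ERR(dev->queue) : 0;
}

/* From the device's completion path, e.g. an IRQ handler: */
static void mydrv_complete(struct request *rq, int error)
{
	blk_mq_end_io(rq, error);	/* inline or IPI-redirected completion */
}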
+2
include/linux/blk_types.h
··· 178 178 __REQ_MIXED_MERGE, /* merge of different types, fail separately */ 179 179 __REQ_KERNEL, /* direct IO to kernel pages */ 180 180 __REQ_PM, /* runtime pm request */ 181 + __REQ_END, /* last of chain of requests */ 181 182 __REQ_NR_BITS, /* stops here */ 182 183 }; 183 184 ··· 230 229 #define REQ_SECURE (1ULL << __REQ_SECURE) 231 230 #define REQ_KERNEL (1ULL << __REQ_KERNEL) 232 231 #define REQ_PM (1ULL << __REQ_PM) 232 + #define REQ_END (1ULL << __REQ_END) 233 233 234 234 #endif /* __LINUX_BLK_TYPES_H */
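REQ_END is set by the dispatch loop in __blk_mq_run_hw_queue() on the last request of a run, so a driver can batch its hardware notification; as the comment there notes, a busy device may leave the flag unseen, so it is a hint rather than the only kick condition. A hedged sketch with hypothetical mydrv_ hooks:

#include <linux/blkdev.h>
#include <linux/blk-mq.h>

/* mydrv_enqueue()/mydrv_ring_doorbell() are hypothetical device hooks. */
extern void mydrv_enqueue(struct blk_mq_hw_ctx *hctx, struct request *rq);
extern void mydrv_ring_doorbell(struct blk_mq_hw_ctx *hctx);

static int mydrv_queue_rq_batched(struct blk_mq_hw_ctx *hctx,
				  struct request *rq)
{
	mydrv_enqueue(hctx, rq);

	/* Last request of this dispatch run: flush the batch to hardware. */
	if (rq->cmd_flags & REQ_END)
		mydrv_ring_doorbell(hctx);

	return BLK_MQ_RQ_QUEUE_OK;
}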
+47 -7
include/linux/blkdev.h
··· 8 8 #include <linux/major.h> 9 9 #include <linux/genhd.h> 10 10 #include <linux/list.h> 11 + #include <linux/llist.h> 11 12 #include <linux/timer.h> 12 13 #include <linux/workqueue.h> 13 14 #include <linux/pagemap.h> ··· 95 94 * as well! 96 95 */ 97 96 struct request { 98 - struct list_head queuelist; 99 - struct call_single_data csd; 97 + union { 98 + struct list_head queuelist; 99 + struct llist_node ll_list; 100 + }; 101 + union { 102 + struct call_single_data csd; 103 + struct work_struct mq_flush_data; 104 + }; 100 105 101 106 struct request_queue *q; 107 + struct blk_mq_ctx *mq_ctx; 102 108 103 109 u64 cmd_flags; 104 110 enum rq_cmd_type_bits cmd_type; ··· 221 213 222 214 #include <linux/elevator.h> 223 215 216 + struct blk_queue_ctx; 217 + 224 218 typedef void (request_fn_proc) (struct request_queue *q); 225 219 typedef void (make_request_fn) (struct request_queue *q, struct bio *bio); 226 220 typedef int (prep_rq_fn) (struct request_queue *, struct request *); ··· 321 311 dma_drain_needed_fn *dma_drain_needed; 322 312 lld_busy_fn *lld_busy_fn; 323 313 314 + struct blk_mq_ops *mq_ops; 315 + 316 + unsigned int *mq_map; 317 + 318 + /* sw queues */ 319 + struct blk_mq_ctx *queue_ctx; 320 + unsigned int nr_queues; 321 + 322 + /* hw dispatch queues */ 323 + struct blk_mq_hw_ctx **queue_hw_ctx; 324 + unsigned int nr_hw_queues; 325 + 324 326 /* 325 327 * Dispatch queue sorting 326 328 */ ··· 380 358 * queue kobject 381 359 */ 382 360 struct kobject kobj; 361 + 362 + /* 363 + * mq queue kobject 364 + */ 365 + struct kobject mq_kobj; 383 366 384 367 #ifdef CONFIG_PM_RUNTIME 385 368 struct device *dev; ··· 450 423 unsigned long flush_pending_since; 451 424 struct list_head flush_queue[2]; 452 425 struct list_head flush_data_in_flight; 453 - struct request flush_rq; 426 + union { 427 + struct request flush_rq; 428 + struct { 429 + spinlock_t mq_flush_lock; 430 + struct work_struct mq_flush_work; 431 + }; 432 + }; 454 433 455 434 struct mutex sysfs_lock; 456 435 ··· 468 435 struct bsg_class_device bsg_dev; 469 436 #endif 470 437 471 - #ifdef CONFIG_BLK_CGROUP 472 - struct list_head all_q_node; 473 - #endif 474 438 #ifdef CONFIG_BLK_DEV_THROTTLING 475 439 /* Throttle data */ 476 440 struct throtl_data *td; 477 441 #endif 478 442 struct rcu_head rcu_head; 443 + wait_queue_head_t mq_freeze_wq; 444 + struct percpu_counter mq_usage_counter; 445 + struct list_head all_q_node; 479 446 }; 480 447 481 448 #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */ ··· 498 465 #define QUEUE_FLAG_SECDISCARD 17 /* supports SECDISCARD */ 499 466 #define QUEUE_FLAG_SAME_FORCE 18 /* force complete on same CPU */ 500 467 #define QUEUE_FLAG_DEAD 19 /* queue tear-down finished */ 468 + #define QUEUE_FLAG_INIT_DONE 20 /* queue is initialized */ 501 469 502 470 #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ 503 471 (1 << QUEUE_FLAG_STACKABLE) | \ ··· 571 537 #define blk_queue_dying(q) test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags) 572 538 #define blk_queue_dead(q) test_bit(QUEUE_FLAG_DEAD, &(q)->queue_flags) 573 539 #define blk_queue_bypass(q) test_bit(QUEUE_FLAG_BYPASS, &(q)->queue_flags) 540 + #define blk_queue_init_done(q) test_bit(QUEUE_FLAG_INIT_DONE, &(q)->queue_flags) 574 541 #define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags) 575 542 #define blk_queue_noxmerges(q) \ 576 543 test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags) ··· 1046 1011 struct blk_plug { 1047 1012 unsigned long magic; /* detect uninitialized use-cases */ 1048 1013 struct list_head list; /* requests 
*/ 1014 + struct list_head mq_list; /* blk-mq requests */ 1049 1015 struct list_head cb_list; /* md requires an unplug callback */ 1050 1016 }; 1051 1017 #define BLK_MAX_REQUEST_COUNT 16 ··· 1084 1048 { 1085 1049 struct blk_plug *plug = tsk->plug; 1086 1050 1087 - return plug && (!list_empty(&plug->list) || !list_empty(&plug->cb_list)); 1051 + return plug && 1052 + (!list_empty(&plug->list) || 1053 + !list_empty(&plug->mq_list) || 1054 + !list_empty(&plug->cb_list)); 1088 1055 } 1089 1056 1090 1057 /* ··· 1362 1323 1363 1324 struct work_struct; 1364 1325 int kblockd_schedule_work(struct request_queue *q, struct work_struct *work); 1326 + int kblockd_schedule_delayed_work(struct request_queue *q, struct delayed_work *dwork, unsigned long delay); 1365 1327 1366 1328 #ifdef CONFIG_BLK_CGROUP 1367 1329 /*
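The new queue_ctx/nr_queues and queue_hw_ctx/nr_hw_queues fields in struct request_queue are what the iteration macros in blk-mq.h walk, and plugged mq requests now sit on plug->mq_list until blk_mq_flush_plug_list() drains them. As an example of consuming the per-cpu software queues, block-layer code that also includes the internal block/blk-mq.h (where struct blk_mq_ctx is defined) can total a counter with blk_ctx_sum(); the helper name below is hypothetical, and __x is the per-ctx iterator the macro declares.

#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include "blk-mq.h"		/* struct blk_mq_ctx, block-layer internal */

static unsigned int blk_mq_total_merged(struct request_queue *q)
{
	return blk_ctx_sum(q, __x->rq_merged);
}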