Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

scsi: add support for a blk-mq based I/O path.

This patch adds support for an alternate I/O path in the scsi midlayer
which uses the blk-mq infrastructure instead of the legacy request code.

Use of blk-mq is fully transparent to drivers, although for now a host
template field is provided to opt out of blk-mq usage in case any unforeseen
incompatibilities arise.

In general replacing the legacy request code with blk-mq is a simple and
mostly mechanical transformation. The biggest exception is the new code
that deals with the fact that I/O submissions in blk-mq must happen from
process context, which slightly complicates the I/O completion handler.
The second biggest difference is that blk-mq is built around the concept
of preallocated requests that also include driver specific data, which
in SCSI context means the scsi_cmnd structure. This completely avoids
dynamic memory allocations for the fast path through I/O submission.

Due to the preallocated requests, the MQ code path exclusively uses the
host-wide shared tag allocator instead of a per-LUN one. This only
affects drivers actually using the block layer provided tag allocator
instead of their own. Unlike the old path blk-mq always provides a tag,
although drivers don't have to use it.

For now the blk-mq path is disabled by default and must be enabled using
the "use_blk_mq" module parameter. Once the remaining work in the block
layer to make blk-mq more suitable for slow devices is complete I hope
to make it the default and eventually even remove the old code path.

Based on the earlier scsi-mq prototype by Nicholas Bellinger.

Thanks to Bart Van Assche and Robert Elliott for testing, benchmarking and
various suggestions and code contributions.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Webb Scales <webbnh@hp.com>
Acked-by: Jens Axboe <axboe@kernel.dk>
Tested-by: Bart Van Assche <bvanassche@acm.org>
Tested-by: Robert Elliott <elliott@hp.com>

+488 -72
+30 -5
drivers/scsi/hosts.c
··· 213 213 goto fail; 214 214 } 215 215 216 + if (shost_use_blk_mq(shost)) { 217 + error = scsi_mq_setup_tags(shost); 218 + if (error) 219 + goto fail; 220 + } 221 + 222 + /* 223 + * Note that we allocate the freelist even for the MQ case for now, 224 + * as we need a command set aside for scsi_reset_provider. Having 225 + * the full host freelist and one command available for that is a 226 + * little heavy-handed, but avoids introducing a special allocator 227 + * just for this. Eventually the structure of scsi_reset_provider 228 + * will need a major overhaul. 229 + */ 216 230 error = scsi_setup_command_freelist(shost); 217 231 if (error) 218 - goto fail; 232 + goto out_destroy_tags; 233 + 219 234 220 235 if (!shost->shost_gendev.parent) 221 236 shost->shost_gendev.parent = dev ? dev : &platform_bus; ··· 241 226 242 227 error = device_add(&shost->shost_gendev); 243 228 if (error) 244 - goto out; 229 + goto out_destroy_freelist; 245 230 246 231 pm_runtime_set_active(&shost->shost_gendev); 247 232 pm_runtime_enable(&shost->shost_gendev); ··· 294 279 device_del(&shost->shost_dev); 295 280 out_del_gendev: 296 281 device_del(&shost->shost_gendev); 297 - out: 282 + out_destroy_freelist: 298 283 scsi_destroy_command_freelist(shost); 284 + out_destroy_tags: 285 + if (shost_use_blk_mq(shost)) 286 + scsi_mq_destroy_tags(shost); 299 287 fail: 300 288 return error; 301 289 } ··· 327 309 } 328 310 329 311 scsi_destroy_command_freelist(shost); 330 - if (shost->bqt) 331 - blk_free_tags(shost->bqt); 312 + if (shost_use_blk_mq(shost)) { 313 + if (shost->tag_set.tags) 314 + scsi_mq_destroy_tags(shost); 315 + } else { 316 + if (shost->bqt) 317 + blk_free_tags(shost->bqt); 318 + } 332 319 333 320 kfree(shost->shost_data); 334 321 ··· 458 435 shost->dma_boundary = sht->dma_boundary; 459 436 else 460 437 shost->dma_boundary = 0xffffffff; 438 + 439 + shost->use_blk_mq = scsi_use_blk_mq && !shost->hostt->disable_blk_mq; 461 440 462 441 device_initialize(&shost->shost_gendev); 463 442 
dev_set_name(&shost->shost_gendev, "host%d", shost->host_no);
+4 -1
drivers/scsi/scsi.c
··· 805 805 * is more IO than the LLD's can_queue (so there are not enuogh 806 806 * tags) request_fn's host queue ready check will handle it. 807 807 */ 808 - if (!sdev->host->bqt) { 808 + if (!shost_use_blk_mq(sdev->host) && !sdev->host->bqt) { 809 809 if (blk_queue_tagged(sdev->request_queue) && 810 810 blk_queue_resize_tags(sdev->request_queue, tags) != 0) 811 811 goto out; ··· 1360 1360 1361 1361 module_param(scsi_logging_level, int, S_IRUGO|S_IWUSR); 1362 1362 MODULE_PARM_DESC(scsi_logging_level, "a bit mask of logging levels"); 1363 + 1364 + bool scsi_use_blk_mq = false; 1365 + module_param_named(use_blk_mq, scsi_use_blk_mq, bool, S_IWUSR | S_IRUGO); 1363 1366 1364 1367 static int __init init_scsi(void) 1365 1368 {
+403 -61
drivers/scsi/scsi_lib.c
··· 1 1 /* 2 - * scsi_lib.c Copyright (C) 1999 Eric Youngdale 2 + * Copyright (C) 1999 Eric Youngdale 3 + * Copyright (C) 2014 Christoph Hellwig 3 4 * 4 5 * SCSI queueing library. 5 6 * Initial versions: Eric Youngdale (eric@andante.org). ··· 21 20 #include <linux/delay.h> 22 21 #include <linux/hardirq.h> 23 22 #include <linux/scatterlist.h> 23 + #include <linux/blk-mq.h> 24 24 25 25 #include <scsi/scsi.h> 26 26 #include <scsi/scsi_cmnd.h> ··· 115 113 } 116 114 } 117 115 116 + static void scsi_mq_requeue_cmd(struct scsi_cmnd *cmd) 117 + { 118 + struct scsi_device *sdev = cmd->device; 119 + struct request_queue *q = cmd->request->q; 120 + 121 + blk_mq_requeue_request(cmd->request); 122 + blk_mq_kick_requeue_list(q); 123 + put_device(&sdev->sdev_gendev); 124 + } 125 + 118 126 /** 119 127 * __scsi_queue_insert - private queue insertion 120 128 * @cmd: The SCSI command being requeued ··· 162 150 * before blk_cleanup_queue() finishes. 163 151 */ 164 152 cmd->result = 0; 153 + if (q->mq_ops) { 154 + scsi_mq_requeue_cmd(cmd); 155 + return; 156 + } 165 157 spin_lock_irqsave(q->queue_lock, flags); 166 158 blk_requeue_request(q, cmd->request); 167 159 kblockd_schedule_work(&device->requeue_work); ··· 324 308 atomic_dec(&sdev->device_busy); 325 309 } 326 310 311 + static void scsi_kick_queue(struct request_queue *q) 312 + { 313 + if (q->mq_ops) 314 + blk_mq_start_hw_queues(q); 315 + else 316 + blk_run_queue(q); 317 + } 318 + 327 319 /* 328 320 * Called for single_lun devices on IO completion. Clear starget_sdev_user, 329 321 * and call blk_run_queue for all the scsi_devices on the target - ··· 356 332 * but in most cases, we will be first. Ideally, each LU on the 357 333 * target would get some limited time or requests on the target. 
358 334 */ 359 - blk_run_queue(current_sdev->request_queue); 335 + scsi_kick_queue(current_sdev->request_queue); 360 336 361 337 spin_lock_irqsave(shost->host_lock, flags); 362 338 if (starget->starget_sdev_user) ··· 369 345 continue; 370 346 371 347 spin_unlock_irqrestore(shost->host_lock, flags); 372 - blk_run_queue(sdev->request_queue); 348 + scsi_kick_queue(sdev->request_queue); 373 349 spin_lock_irqsave(shost->host_lock, flags); 374 350 375 351 scsi_device_put(sdev); ··· 459 435 continue; 460 436 spin_unlock_irqrestore(shost->host_lock, flags); 461 437 462 - blk_run_queue(slq); 438 + scsi_kick_queue(slq); 463 439 blk_put_queue(slq); 464 440 465 441 spin_lock_irqsave(shost->host_lock, flags); ··· 490 466 if (!list_empty(&sdev->host->starved_list)) 491 467 scsi_starved_list_run(sdev->host); 492 468 493 - blk_run_queue(q); 469 + if (q->mq_ops) 470 + blk_mq_start_stopped_hw_queues(q, false); 471 + else 472 + blk_run_queue(q); 494 473 } 495 474 496 475 void scsi_requeue_run_queue(struct work_struct *work) ··· 591 564 return mempool_alloc(sgp->pool, gfp_mask); 592 565 } 593 566 594 - static void scsi_free_sgtable(struct scsi_data_buffer *sdb) 567 + static void scsi_free_sgtable(struct scsi_data_buffer *sdb, bool mq) 595 568 { 596 - __sg_free_table(&sdb->table, SCSI_MAX_SG_SEGMENTS, false, scsi_sg_free); 569 + if (mq && sdb->table.nents <= SCSI_MAX_SG_SEGMENTS) 570 + return; 571 + __sg_free_table(&sdb->table, SCSI_MAX_SG_SEGMENTS, mq, scsi_sg_free); 597 572 } 598 573 599 574 static int scsi_alloc_sgtable(struct scsi_data_buffer *sdb, int nents, 600 - gfp_t gfp_mask) 575 + gfp_t gfp_mask, bool mq) 601 576 { 577 + struct scatterlist *first_chunk = NULL; 602 578 int ret; 603 579 604 580 BUG_ON(!nents); 605 581 582 + if (mq) { 583 + if (nents <= SCSI_MAX_SG_SEGMENTS) { 584 + sdb->table.nents = nents; 585 + sg_init_table(sdb->table.sgl, sdb->table.nents); 586 + return 0; 587 + } 588 + first_chunk = sdb->table.sgl; 589 + } 590 + 606 591 ret = __sg_alloc_table(&sdb->table, 
nents, SCSI_MAX_SG_SEGMENTS, 607 - NULL, gfp_mask, scsi_sg_alloc); 592 + first_chunk, gfp_mask, scsi_sg_alloc); 608 593 if (unlikely(ret)) 609 - scsi_free_sgtable(sdb); 594 + scsi_free_sgtable(sdb, mq); 610 595 return ret; 596 + } 597 + 598 + static void scsi_uninit_cmd(struct scsi_cmnd *cmd) 599 + { 600 + if (cmd->request->cmd_type == REQ_TYPE_FS) { 601 + struct scsi_driver *drv = scsi_cmd_to_driver(cmd); 602 + 603 + if (drv->uninit_command) 604 + drv->uninit_command(cmd); 605 + } 606 + } 607 + 608 + static void scsi_mq_free_sgtables(struct scsi_cmnd *cmd) 609 + { 610 + if (cmd->sdb.table.nents) 611 + scsi_free_sgtable(&cmd->sdb, true); 612 + if (cmd->request->next_rq && cmd->request->next_rq->special) 613 + scsi_free_sgtable(cmd->request->next_rq->special, true); 614 + if (scsi_prot_sg_count(cmd)) 615 + scsi_free_sgtable(cmd->prot_sdb, true); 616 + } 617 + 618 + static void scsi_mq_uninit_cmd(struct scsi_cmnd *cmd) 619 + { 620 + struct scsi_device *sdev = cmd->device; 621 + unsigned long flags; 622 + 623 + BUG_ON(list_empty(&cmd->list)); 624 + 625 + scsi_mq_free_sgtables(cmd); 626 + scsi_uninit_cmd(cmd); 627 + 628 + spin_lock_irqsave(&sdev->list_lock, flags); 629 + list_del_init(&cmd->list); 630 + spin_unlock_irqrestore(&sdev->list_lock, flags); 611 631 } 612 632 613 633 /* ··· 676 602 static void scsi_release_buffers(struct scsi_cmnd *cmd) 677 603 { 678 604 if (cmd->sdb.table.nents) 679 - scsi_free_sgtable(&cmd->sdb); 605 + scsi_free_sgtable(&cmd->sdb, false); 680 606 681 607 memset(&cmd->sdb, 0, sizeof(cmd->sdb)); 682 608 683 609 if (scsi_prot_sg_count(cmd)) 684 - scsi_free_sgtable(cmd->prot_sdb); 610 + scsi_free_sgtable(cmd->prot_sdb, false); 685 611 } 686 612 687 613 static void scsi_release_bidi_buffers(struct scsi_cmnd *cmd) 688 614 { 689 615 struct scsi_data_buffer *bidi_sdb = cmd->request->next_rq->special; 690 616 691 - scsi_free_sgtable(bidi_sdb); 617 + scsi_free_sgtable(bidi_sdb, false); 692 618 kmem_cache_free(scsi_sdb_cache, bidi_sdb); 693 619 
cmd->request->next_rq->special = NULL; 694 620 } ··· 699 625 struct scsi_cmnd *cmd = req->special; 700 626 struct scsi_device *sdev = cmd->device; 701 627 struct request_queue *q = sdev->request_queue; 702 - unsigned long flags; 703 - 704 628 705 629 if (blk_update_request(req, error, bytes)) 706 630 return true; ··· 711 639 if (blk_queue_add_random(q)) 712 640 add_disk_randomness(req->rq_disk); 713 641 714 - spin_lock_irqsave(q->queue_lock, flags); 715 - blk_finish_request(req, error); 716 - spin_unlock_irqrestore(q->queue_lock, flags); 642 + if (req->mq_ctx) { 643 + /* 644 + * In the MQ case the command gets freed by __blk_mq_end_io, 645 + * so we have to do all cleanup that depends on it earlier. 646 + * 647 + * We also can't kick the queues from irq context, so we 648 + * will have to defer it to a workqueue. 649 + */ 650 + scsi_mq_uninit_cmd(cmd); 717 651 718 - if (bidi_bytes) 719 - scsi_release_bidi_buffers(cmd); 720 - scsi_release_buffers(cmd); 721 - scsi_next_command(cmd); 652 + __blk_mq_end_io(req, error); 653 + 654 + if (scsi_target(sdev)->single_lun || 655 + !list_empty(&sdev->host->starved_list)) 656 + kblockd_schedule_work(&sdev->requeue_work); 657 + else 658 + blk_mq_start_stopped_hw_queues(q, true); 659 + 660 + put_device(&sdev->sdev_gendev); 661 + } else { 662 + unsigned long flags; 663 + 664 + spin_lock_irqsave(q->queue_lock, flags); 665 + blk_finish_request(req, error); 666 + spin_unlock_irqrestore(q->queue_lock, flags); 667 + 668 + if (bidi_bytes) 669 + scsi_release_bidi_buffers(cmd); 670 + scsi_release_buffers(cmd); 671 + scsi_next_command(cmd); 672 + } 673 + 722 674 return false; 723 675 } 724 676 ··· 1049 953 /* Unprep the request and put it back at the head of the queue. 1050 954 * A new command will be prepared and issued. 
1051 955 */ 1052 - scsi_release_buffers(cmd); 1053 - scsi_requeue_command(q, cmd); 956 + if (q->mq_ops) { 957 + cmd->request->cmd_flags &= ~REQ_DONTPREP; 958 + scsi_mq_uninit_cmd(cmd); 959 + scsi_mq_requeue_cmd(cmd); 960 + } else { 961 + scsi_release_buffers(cmd); 962 + scsi_requeue_command(q, cmd); 963 + } 1054 964 break; 1055 965 case ACTION_RETRY: 1056 966 /* Retry the same command immediately */ ··· 1078 976 * If sg table allocation fails, requeue request later. 1079 977 */ 1080 978 if (unlikely(scsi_alloc_sgtable(sdb, req->nr_phys_segments, 1081 - gfp_mask))) { 979 + gfp_mask, req->mq_ctx != NULL))) 1082 980 return BLKPREP_DEFER; 1083 - } 1084 981 1085 982 /* 1086 983 * Next, walk the list, and fill in the addresses and sizes of ··· 1107 1006 { 1108 1007 struct scsi_device *sdev = cmd->device; 1109 1008 struct request *rq = cmd->request; 1009 + bool is_mq = (rq->mq_ctx != NULL); 1110 1010 int error; 1111 1011 1112 1012 BUG_ON(!rq->nr_phys_segments); ··· 1117 1015 goto err_exit; 1118 1016 1119 1017 if (blk_bidi_rq(rq)) { 1120 - struct scsi_data_buffer *bidi_sdb = kmem_cache_zalloc( 1121 - scsi_sdb_cache, GFP_ATOMIC); 1122 - if (!bidi_sdb) { 1123 - error = BLKPREP_DEFER; 1124 - goto err_exit; 1018 + if (!rq->q->mq_ops) { 1019 + struct scsi_data_buffer *bidi_sdb = 1020 + kmem_cache_zalloc(scsi_sdb_cache, GFP_ATOMIC); 1021 + if (!bidi_sdb) { 1022 + error = BLKPREP_DEFER; 1023 + goto err_exit; 1024 + } 1025 + 1026 + rq->next_rq->special = bidi_sdb; 1125 1027 } 1126 1028 1127 - rq->next_rq->special = bidi_sdb; 1128 - error = scsi_init_sgtable(rq->next_rq, bidi_sdb, GFP_ATOMIC); 1029 + error = scsi_init_sgtable(rq->next_rq, rq->next_rq->special, 1030 + GFP_ATOMIC); 1129 1031 if (error) 1130 1032 goto err_exit; 1131 1033 } ··· 1141 1035 BUG_ON(prot_sdb == NULL); 1142 1036 ivecs = blk_rq_count_integrity_sg(rq->q, rq->bio); 1143 1037 1144 - if (scsi_alloc_sgtable(prot_sdb, ivecs, gfp_mask)) { 1038 + if (scsi_alloc_sgtable(prot_sdb, ivecs, gfp_mask, is_mq)) { 1145 1039 
error = BLKPREP_DEFER; 1146 1040 goto err_exit; 1147 1041 } ··· 1155 1049 cmd->prot_sdb->table.nents = count; 1156 1050 } 1157 1051 1158 - return BLKPREP_OK ; 1159 - 1052 + return BLKPREP_OK; 1160 1053 err_exit: 1161 - scsi_release_buffers(cmd); 1162 - cmd->request->special = NULL; 1163 - scsi_put_command(cmd); 1164 - put_device(&sdev->sdev_gendev); 1054 + if (is_mq) { 1055 + scsi_mq_free_sgtables(cmd); 1056 + } else { 1057 + scsi_release_buffers(cmd); 1058 + cmd->request->special = NULL; 1059 + scsi_put_command(cmd); 1060 + put_device(&sdev->sdev_gendev); 1061 + } 1165 1062 return error; 1166 1063 } 1167 1064 EXPORT_SYMBOL(scsi_init_io); ··· 1375 1266 1376 1267 static void scsi_unprep_fn(struct request_queue *q, struct request *req) 1377 1268 { 1378 - if (req->cmd_type == REQ_TYPE_FS) { 1379 - struct scsi_cmnd *cmd = req->special; 1380 - struct scsi_driver *drv = scsi_cmd_to_driver(cmd); 1381 - 1382 - if (drv->uninit_command) 1383 - drv->uninit_command(cmd); 1384 - } 1269 + scsi_uninit_cmd(req->special); 1385 1270 } 1386 1271 1387 1272 /* ··· 1398 1295 * unblock after device_blocked iterates to zero 1399 1296 */ 1400 1297 if (atomic_dec_return(&sdev->device_blocked) > 0) { 1401 - blk_delay_queue(q, SCSI_QUEUE_DELAY); 1298 + /* 1299 + * For the MQ case we take care of this in the caller. 
1300 + */ 1301 + if (!q->mq_ops) 1302 + blk_delay_queue(q, SCSI_QUEUE_DELAY); 1402 1303 goto out_dec; 1403 1304 } 1404 1305 SCSI_LOG_MLQUEUE(3, sdev_printk(KERN_INFO, sdev, ··· 1778 1671 blk_delay_queue(q, SCSI_QUEUE_DELAY); 1779 1672 } 1780 1673 1674 + static inline int prep_to_mq(int ret) 1675 + { 1676 + switch (ret) { 1677 + case BLKPREP_OK: 1678 + return 0; 1679 + case BLKPREP_DEFER: 1680 + return BLK_MQ_RQ_QUEUE_BUSY; 1681 + default: 1682 + return BLK_MQ_RQ_QUEUE_ERROR; 1683 + } 1684 + } 1685 + 1686 + static int scsi_mq_prep_fn(struct request *req) 1687 + { 1688 + struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req); 1689 + struct scsi_device *sdev = req->q->queuedata; 1690 + struct Scsi_Host *shost = sdev->host; 1691 + unsigned char *sense_buf = cmd->sense_buffer; 1692 + struct scatterlist *sg; 1693 + 1694 + memset(cmd, 0, sizeof(struct scsi_cmnd)); 1695 + 1696 + req->special = cmd; 1697 + 1698 + cmd->request = req; 1699 + cmd->device = sdev; 1700 + cmd->sense_buffer = sense_buf; 1701 + 1702 + cmd->tag = req->tag; 1703 + 1704 + req->cmd = req->__cmd; 1705 + cmd->cmnd = req->cmd; 1706 + cmd->prot_op = SCSI_PROT_NORMAL; 1707 + 1708 + INIT_LIST_HEAD(&cmd->list); 1709 + INIT_DELAYED_WORK(&cmd->abort_work, scmd_eh_abort_handler); 1710 + cmd->jiffies_at_alloc = jiffies; 1711 + 1712 + /* 1713 + * XXX: cmd_list lookups are only used by two drivers, try to get 1714 + * rid of this list in common code. 
1715 + */ 1716 + spin_lock_irq(&sdev->list_lock); 1717 + list_add_tail(&cmd->list, &sdev->cmd_list); 1718 + spin_unlock_irq(&sdev->list_lock); 1719 + 1720 + sg = (void *)cmd + sizeof(struct scsi_cmnd) + shost->hostt->cmd_size; 1721 + cmd->sdb.table.sgl = sg; 1722 + 1723 + if (scsi_host_get_prot(shost)) { 1724 + cmd->prot_sdb = (void *)sg + 1725 + shost->sg_tablesize * sizeof(struct scatterlist); 1726 + memset(cmd->prot_sdb, 0, sizeof(struct scsi_data_buffer)); 1727 + 1728 + cmd->prot_sdb->table.sgl = 1729 + (struct scatterlist *)(cmd->prot_sdb + 1); 1730 + } 1731 + 1732 + if (blk_bidi_rq(req)) { 1733 + struct request *next_rq = req->next_rq; 1734 + struct scsi_data_buffer *bidi_sdb = blk_mq_rq_to_pdu(next_rq); 1735 + 1736 + memset(bidi_sdb, 0, sizeof(struct scsi_data_buffer)); 1737 + bidi_sdb->table.sgl = 1738 + (struct scatterlist *)(bidi_sdb + 1); 1739 + 1740 + next_rq->special = bidi_sdb; 1741 + } 1742 + 1743 + return scsi_setup_cmnd(sdev, req); 1744 + } 1745 + 1746 + static void scsi_mq_done(struct scsi_cmnd *cmd) 1747 + { 1748 + trace_scsi_dispatch_cmd_done(cmd); 1749 + blk_mq_complete_request(cmd->request); 1750 + } 1751 + 1752 + static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req) 1753 + { 1754 + struct request_queue *q = req->q; 1755 + struct scsi_device *sdev = q->queuedata; 1756 + struct Scsi_Host *shost = sdev->host; 1757 + struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req); 1758 + int ret; 1759 + int reason; 1760 + 1761 + ret = prep_to_mq(scsi_prep_state_check(sdev, req)); 1762 + if (ret) 1763 + goto out; 1764 + 1765 + ret = BLK_MQ_RQ_QUEUE_BUSY; 1766 + if (!get_device(&sdev->sdev_gendev)) 1767 + goto out; 1768 + 1769 + if (!scsi_dev_queue_ready(q, sdev)) 1770 + goto out_put_device; 1771 + if (!scsi_target_queue_ready(shost, sdev)) 1772 + goto out_dec_device_busy; 1773 + if (!scsi_host_queue_ready(q, shost, sdev)) 1774 + goto out_dec_target_busy; 1775 + 1776 + if (!(req->cmd_flags & REQ_DONTPREP)) { 1777 + ret = 
prep_to_mq(scsi_mq_prep_fn(req)); 1778 + if (ret) 1779 + goto out_dec_host_busy; 1780 + req->cmd_flags |= REQ_DONTPREP; 1781 + } 1782 + 1783 + scsi_init_cmd_errh(cmd); 1784 + cmd->scsi_done = scsi_mq_done; 1785 + 1786 + reason = scsi_dispatch_cmd(cmd); 1787 + if (reason) { 1788 + scsi_set_blocked(cmd, reason); 1789 + ret = BLK_MQ_RQ_QUEUE_BUSY; 1790 + goto out_dec_host_busy; 1791 + } 1792 + 1793 + return BLK_MQ_RQ_QUEUE_OK; 1794 + 1795 + out_dec_host_busy: 1796 + atomic_dec(&shost->host_busy); 1797 + out_dec_target_busy: 1798 + if (scsi_target(sdev)->can_queue > 0) 1799 + atomic_dec(&scsi_target(sdev)->target_busy); 1800 + out_dec_device_busy: 1801 + atomic_dec(&sdev->device_busy); 1802 + out_put_device: 1803 + put_device(&sdev->sdev_gendev); 1804 + out: 1805 + switch (ret) { 1806 + case BLK_MQ_RQ_QUEUE_BUSY: 1807 + blk_mq_stop_hw_queue(hctx); 1808 + if (atomic_read(&sdev->device_busy) == 0 && 1809 + !scsi_device_blocked(sdev)) 1810 + blk_mq_delay_queue(hctx, SCSI_QUEUE_DELAY); 1811 + break; 1812 + case BLK_MQ_RQ_QUEUE_ERROR: 1813 + /* 1814 + * Make sure to release all allocated ressources when 1815 + * we hit an error, as we will never see this command 1816 + * again. 
1817 + */ 1818 + if (req->cmd_flags & REQ_DONTPREP) 1819 + scsi_mq_uninit_cmd(cmd); 1820 + break; 1821 + default: 1822 + break; 1823 + } 1824 + return ret; 1825 + } 1826 + 1827 + static int scsi_init_request(void *data, struct request *rq, 1828 + unsigned int hctx_idx, unsigned int request_idx, 1829 + unsigned int numa_node) 1830 + { 1831 + struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(rq); 1832 + 1833 + cmd->sense_buffer = kzalloc_node(SCSI_SENSE_BUFFERSIZE, GFP_KERNEL, 1834 + numa_node); 1835 + if (!cmd->sense_buffer) 1836 + return -ENOMEM; 1837 + return 0; 1838 + } 1839 + 1840 + static void scsi_exit_request(void *data, struct request *rq, 1841 + unsigned int hctx_idx, unsigned int request_idx) 1842 + { 1843 + struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(rq); 1844 + 1845 + kfree(cmd->sense_buffer); 1846 + } 1847 + 1781 1848 static u64 scsi_calculate_bounce_limit(struct Scsi_Host *shost) 1782 1849 { 1783 1850 struct device *host_dev; ··· 1973 1692 return bounce_limit; 1974 1693 } 1975 1694 1976 - struct request_queue *__scsi_alloc_queue(struct Scsi_Host *shost, 1977 - request_fn_proc *request_fn) 1695 + static void __scsi_init_queue(struct Scsi_Host *shost, struct request_queue *q) 1978 1696 { 1979 - struct request_queue *q; 1980 1697 struct device *dev = shost->dma_dev; 1981 - 1982 - q = blk_init_queue(request_fn, NULL); 1983 - if (!q) 1984 - return NULL; 1985 1698 1986 1699 /* 1987 1700 * this limit is imposed by hardware restrictions ··· 2007 1732 * blk_queue_update_dma_alignment() later. 
2008 1733 */ 2009 1734 blk_queue_dma_alignment(q, 0x03); 1735 + } 2010 1736 1737 + struct request_queue *__scsi_alloc_queue(struct Scsi_Host *shost, 1738 + request_fn_proc *request_fn) 1739 + { 1740 + struct request_queue *q; 1741 + 1742 + q = blk_init_queue(request_fn, NULL); 1743 + if (!q) 1744 + return NULL; 1745 + __scsi_init_queue(shost, q); 2011 1746 return q; 2012 1747 } 2013 1748 EXPORT_SYMBOL(__scsi_alloc_queue); ··· 2036 1751 blk_queue_rq_timed_out(q, scsi_times_out); 2037 1752 blk_queue_lld_busy(q, scsi_lld_busy); 2038 1753 return q; 1754 + } 1755 + 1756 + static struct blk_mq_ops scsi_mq_ops = { 1757 + .map_queue = blk_mq_map_queue, 1758 + .queue_rq = scsi_queue_rq, 1759 + .complete = scsi_softirq_done, 1760 + .timeout = scsi_times_out, 1761 + .init_request = scsi_init_request, 1762 + .exit_request = scsi_exit_request, 1763 + }; 1764 + 1765 + struct request_queue *scsi_mq_alloc_queue(struct scsi_device *sdev) 1766 + { 1767 + sdev->request_queue = blk_mq_init_queue(&sdev->host->tag_set); 1768 + if (IS_ERR(sdev->request_queue)) 1769 + return NULL; 1770 + 1771 + sdev->request_queue->queuedata = sdev; 1772 + __scsi_init_queue(sdev->host, sdev->request_queue); 1773 + return sdev->request_queue; 1774 + } 1775 + 1776 + int scsi_mq_setup_tags(struct Scsi_Host *shost) 1777 + { 1778 + unsigned int cmd_size, sgl_size, tbl_size; 1779 + 1780 + tbl_size = shost->sg_tablesize; 1781 + if (tbl_size > SCSI_MAX_SG_SEGMENTS) 1782 + tbl_size = SCSI_MAX_SG_SEGMENTS; 1783 + sgl_size = tbl_size * sizeof(struct scatterlist); 1784 + cmd_size = sizeof(struct scsi_cmnd) + shost->hostt->cmd_size + sgl_size; 1785 + if (scsi_host_get_prot(shost)) 1786 + cmd_size += sizeof(struct scsi_data_buffer) + sgl_size; 1787 + 1788 + memset(&shost->tag_set, 0, sizeof(shost->tag_set)); 1789 + shost->tag_set.ops = &scsi_mq_ops; 1790 + shost->tag_set.nr_hw_queues = 1; 1791 + shost->tag_set.queue_depth = shost->can_queue; 1792 + shost->tag_set.cmd_size = cmd_size; 1793 + shost->tag_set.numa_node = 
NUMA_NO_NODE; 1794 + shost->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; 1795 + shost->tag_set.driver_data = shost; 1796 + 1797 + return blk_mq_alloc_tag_set(&shost->tag_set); 1798 + } 1799 + 1800 + void scsi_mq_destroy_tags(struct Scsi_Host *shost) 1801 + { 1802 + blk_mq_free_tag_set(&shost->tag_set); 2039 1803 } 2040 1804 2041 1805 /* ··· 2832 2498 * block layer from calling the midlayer with this device's 2833 2499 * request queue. 2834 2500 */ 2835 - spin_lock_irqsave(q->queue_lock, flags); 2836 - blk_stop_queue(q); 2837 - spin_unlock_irqrestore(q->queue_lock, flags); 2501 + if (q->mq_ops) { 2502 + blk_mq_stop_hw_queues(q); 2503 + } else { 2504 + spin_lock_irqsave(q->queue_lock, flags); 2505 + blk_stop_queue(q); 2506 + spin_unlock_irqrestore(q->queue_lock, flags); 2507 + } 2838 2508 2839 2509 return 0; 2840 2510 } ··· 2884 2546 sdev->sdev_state != SDEV_OFFLINE) 2885 2547 return -EINVAL; 2886 2548 2887 - spin_lock_irqsave(q->queue_lock, flags); 2888 - blk_start_queue(q); 2889 - spin_unlock_irqrestore(q->queue_lock, flags); 2549 + if (q->mq_ops) { 2550 + blk_mq_start_stopped_hw_queues(q, false); 2551 + } else { 2552 + spin_lock_irqsave(q->queue_lock, flags); 2553 + blk_start_queue(q); 2554 + spin_unlock_irqrestore(q->queue_lock, flags); 2555 + } 2890 2556 2891 2557 return 0; 2892 2558 }
+3
drivers/scsi/scsi_priv.h
··· 88 88 extern void scsi_io_completion(struct scsi_cmnd *, unsigned int); 89 89 extern void scsi_run_host_queues(struct Scsi_Host *shost); 90 90 extern struct request_queue *scsi_alloc_queue(struct scsi_device *sdev); 91 + extern struct request_queue *scsi_mq_alloc_queue(struct scsi_device *sdev); 92 + extern int scsi_mq_setup_tags(struct Scsi_Host *shost); 93 + extern void scsi_mq_destroy_tags(struct Scsi_Host *shost); 91 94 extern int scsi_init_queue(void); 92 95 extern void scsi_exit_queue(void); 93 96 struct request_queue;
+4 -1
drivers/scsi/scsi_scan.c
··· 273 273 */ 274 274 sdev->borken = 1; 275 275 276 - sdev->request_queue = scsi_alloc_queue(sdev); 276 + if (shost_use_blk_mq(shost)) 277 + sdev->request_queue = scsi_mq_alloc_queue(sdev); 278 + else 279 + sdev->request_queue = scsi_alloc_queue(sdev); 277 280 if (!sdev->request_queue) { 278 281 /* release fn is set up in scsi_sysfs_device_initialise, so 279 282 * have to free and put manually here */
+2
drivers/scsi/scsi_sysfs.c
··· 333 333 334 334 static DEVICE_ATTR(eh_deadline, S_IRUGO | S_IWUSR, show_shost_eh_deadline, store_shost_eh_deadline); 335 335 336 + shost_rd_attr(use_blk_mq, "%d\n"); 336 337 shost_rd_attr(unique_id, "%u\n"); 337 338 shost_rd_attr(cmd_per_lun, "%hd\n"); 338 339 shost_rd_attr(can_queue, "%hd\n"); ··· 353 352 static DEVICE_ATTR(host_busy, S_IRUGO, show_host_busy, NULL); 354 353 355 354 static struct attribute *scsi_sysfs_shost_attrs[] = { 355 + &dev_attr_use_blk_mq.attr, 356 356 &dev_attr_unique_id.attr, 357 357 &dev_attr_host_busy.attr, 358 358 &dev_attr_cmd_per_lun.attr,
+17 -1
include/scsi/scsi_host.h
··· 7 7 #include <linux/workqueue.h> 8 8 #include <linux/mutex.h> 9 9 #include <linux/seq_file.h> 10 + #include <linux/blk-mq.h> 10 11 #include <scsi/scsi.h> 11 12 12 13 struct request_queue; ··· 511 510 */ 512 511 unsigned int cmd_size; 513 512 struct scsi_host_cmd_pool *cmd_pool; 513 + 514 + /* temporary flag to disable blk-mq I/O path */ 515 + bool disable_blk_mq; 514 516 }; 515 517 516 518 /* ··· 584 580 * Area to keep a shared tag map (if needed, will be 585 581 * NULL if not). 586 582 */ 587 - struct blk_queue_tag *bqt; 583 + union { 584 + struct blk_queue_tag *bqt; 585 + struct blk_mq_tag_set tag_set; 586 + }; 588 587 589 588 atomic_t host_busy; /* commands actually active on low-level */ 590 589 atomic_t host_blocked; ··· 678 671 679 672 /* The controller does not support WRITE SAME */ 680 673 unsigned no_write_same:1; 674 + 675 + unsigned use_blk_mq:1; 681 676 682 677 /* 683 678 * Optional work queue to be utilized by the transport ··· 779 770 shost->shost_state == SHOST_CANCEL_RECOVERY || 780 771 shost->shost_state == SHOST_DEL_RECOVERY || 781 772 shost->tmf_in_progress; 773 + } 774 + 775 + extern bool scsi_use_blk_mq; 776 + 777 + static inline bool shost_use_blk_mq(struct Scsi_Host *shost) 778 + { 779 + return shost->use_blk_mq; 782 780 } 783 781 784 782 extern int scsi_queue_work(struct Scsi_Host *, struct work_struct *);
+25 -3
include/scsi/scsi_tcq.h
··· 67 67 if (!sdev->tagged_supported) 68 68 return; 69 69 70 - if (!blk_queue_tagged(sdev->request_queue)) 70 + if (!shost_use_blk_mq(sdev->host) && 71 + blk_queue_tagged(sdev->request_queue)) 71 72 blk_queue_init_tags(sdev->request_queue, depth, 72 73 sdev->host->bqt); 73 74 ··· 81 80 **/ 82 81 static inline void scsi_deactivate_tcq(struct scsi_device *sdev, int depth) 83 82 { 84 - if (blk_queue_tagged(sdev->request_queue)) 83 + if (!shost_use_blk_mq(sdev->host) && 84 + blk_queue_tagged(sdev->request_queue)) 85 85 blk_queue_free_tags(sdev->request_queue); 86 86 scsi_adjust_queue_depth(sdev, 0, depth); 87 87 } ··· 110 108 return 0; 111 109 } 112 110 111 + static inline struct scsi_cmnd *scsi_mq_find_tag(struct Scsi_Host *shost, 112 + unsigned int hw_ctx, int tag) 113 + { 114 + struct request *req; 115 + 116 + req = blk_mq_tag_to_rq(shost->tag_set.tags[hw_ctx], tag); 117 + return req ? (struct scsi_cmnd *)req->special : NULL; 118 + } 119 + 113 120 /** 114 121 * scsi_find_tag - find a tagged command by device 115 122 * @SDpnt: pointer to the ScSI device ··· 129 118 **/ 130 119 static inline struct scsi_cmnd *scsi_find_tag(struct scsi_device *sdev, int tag) 131 120 { 132 - 133 121 struct request *req; 134 122 135 123 if (tag != SCSI_NO_TAG) { 124 + if (shost_use_blk_mq(sdev->host)) 125 + return scsi_mq_find_tag(sdev->host, 0, tag); 126 + 136 127 req = blk_queue_find_tag(sdev->request_queue, tag); 137 128 return req ? (struct scsi_cmnd *)req->special : NULL; 138 129 } ··· 143 130 return sdev->current_cmnd; 144 131 } 145 132 133 + 146 134 /** 147 135 * scsi_init_shared_tag_map - create a shared tag map 148 136 * @shost: the host to share the tag map among all devices ··· 151 137 */ 152 138 static inline int scsi_init_shared_tag_map(struct Scsi_Host *shost, int depth) 153 139 { 140 + /* 141 + * We always have a shared tag map around when using blk-mq. 
142 + */ 143 + if (shost_use_blk_mq(shost)) 144 + return 0; 145 + 154 146 /* 155 147 * If the shared tag map isn't already initialized, do it now. 156 148 * This saves callers from having to check ->bqt when setting up ··· 185 165 struct request *req; 186 166 187 167 if (tag != SCSI_NO_TAG) { 168 + if (shost_use_blk_mq(shost)) 169 + return scsi_mq_find_tag(shost, 0, tag); 188 170 req = blk_map_queue_find_tag(shost->bqt, tag); 189 171 return req ? (struct scsi_cmnd *)req->special : NULL; 190 172 }