Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'nvme-5.4' of git://git.infradead.org/nvme into for-5.4/block

Pull NVMe changes from Sagi:

"The nvme updates include:
- ana log parse fix from Anton
- nvme quirks support for Apple devices from Ben
- fix missing bio completion tracing for multipath stack devices from
Hannes and Mikhail
- IP TOS settings for nvme rdma and tcp transports from Israel
- rq_dma_dir cleanups from Israel
- tracing for Get LBA Status command from Minwoo
- Some nvme-tcp cleanups from Minwoo, Potnuri and myself
- Some consolidation between the fabrics transports for handling the CAP
register
- reset race with ns scanning fix for fabrics (move fabrics commands to
a dedicated request queue with a different lifetime from the admin
request queue)."

* 'nvme-5.4' of git://git.infradead.org/nvme: (30 commits)
nvme-rdma: Use rq_dma_dir macro
nvme-fc: Use rq_dma_dir macro
nvme-pci: Tidy up nvme_unmap_data
nvme: make fabrics command run on a separate request queue
nvme-pci: Support shared tags across queues for Apple 2018 controllers
nvme-pci: Add support for Apple 2018+ models
nvme-pci: Add support for variable IO SQ element size
nvme-pci: Pass the queue to SQ_SIZE/CQ_SIZE macros
nvme: trace bio completion
nvme-multipath: fix ana log nsid lookup when nsid is not found
nvmet-tcp: Add TOS for tcp transport
nvme-tcp: Add TOS for tcp transport
nvme-tcp: Use struct nvme_ctrl directly
nvme-rdma: Add TOS for rdma transport
nvme-fabrics: Add type of service (TOS) configuration
nvmet-tcp: fix possible memory leak
nvmet-tcp: fix possible NULL deref
nvmet: trace: parse Get LBA Status command in detail
nvme: trace: parse Get LBA Status command in detail
nvme: trace: support for Get LBA Status opcode parsed
...

+379 -157
+1
drivers/nvme/host/Kconfig
··· 64 64 depends on INET 65 65 depends on BLK_DEV_NVME 66 66 select NVME_FABRICS 67 + select CRYPTO_CRC32C 67 68 help 68 69 This provides support for the NVMe over Fabrics protocol using 69 70 the TCP transport. This allows you to use remote block devices
+20 -17
drivers/nvme/host/core.c
··· 22 22 #include <linux/pm_qos.h> 23 23 #include <asm/unaligned.h> 24 24 25 - #define CREATE_TRACE_POINTS 26 - #include "trace.h" 27 - 28 25 #include "nvme.h" 29 26 #include "fabrics.h" 27 + 28 + #define CREATE_TRACE_POINTS 29 + #include "trace.h" 30 30 31 31 #define NVME_MINORS (1U << MINORBITS) 32 32 ··· 279 279 return; 280 280 } 281 281 } 282 + 283 + nvme_trace_bio_complete(req, status); 282 284 blk_mq_end_request(req, status); 283 285 } 284 286 EXPORT_SYMBOL_GPL(nvme_complete_rq); ··· 1952 1950 * bits', but doing so may cause the device to complete commands to the 1953 1951 * admin queue ... and we don't know what memory that might be pointing at! 1954 1952 */ 1955 - int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap) 1953 + int nvme_disable_ctrl(struct nvme_ctrl *ctrl) 1956 1954 { 1957 1955 int ret; 1958 1956 ··· 1966 1964 if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY) 1967 1965 msleep(NVME_QUIRK_DELAY_AMOUNT); 1968 1966 1969 - return nvme_wait_ready(ctrl, cap, false); 1967 + return nvme_wait_ready(ctrl, ctrl->cap, false); 1970 1968 } 1971 1969 EXPORT_SYMBOL_GPL(nvme_disable_ctrl); 1972 1970 1973 - int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap) 1971 + int nvme_enable_ctrl(struct nvme_ctrl *ctrl) 1974 1972 { 1975 1973 /* 1976 1974 * Default to a 4K page size, with the intention to update this 1977 1975 * path in the future to accomodate architectures with differing 1978 1976 * kernel and IO page sizes. 
1979 1977 */ 1980 - unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12, page_shift = 12; 1978 + unsigned dev_page_min, page_shift = 12; 1981 1979 int ret; 1980 + 1981 + ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap); 1982 + if (ret) { 1983 + dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret); 1984 + return ret; 1985 + } 1986 + dev_page_min = NVME_CAP_MPSMIN(ctrl->cap) + 12; 1982 1987 1983 1988 if (page_shift < dev_page_min) { 1984 1989 dev_err(ctrl->device, ··· 2005 1996 ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); 2006 1997 if (ret) 2007 1998 return ret; 2008 - return nvme_wait_ready(ctrl, cap, true); 1999 + return nvme_wait_ready(ctrl, ctrl->cap, true); 2009 2000 } 2010 2001 EXPORT_SYMBOL_GPL(nvme_enable_ctrl); 2011 2002 ··· 2571 2562 int nvme_init_identify(struct nvme_ctrl *ctrl) 2572 2563 { 2573 2564 struct nvme_id_ctrl *id; 2574 - u64 cap; 2575 2565 int ret, page_shift; 2576 2566 u32 max_hw_sectors; 2577 2567 bool prev_apst_enabled; ··· 2580 2572 dev_err(ctrl->device, "Reading VS failed (%d)\n", ret); 2581 2573 return ret; 2582 2574 } 2583 - 2584 - ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &cap); 2585 - if (ret) { 2586 - dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret); 2587 - return ret; 2588 - } 2589 - page_shift = NVME_CAP_MPSMIN(cap) + 12; 2575 + page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12; 2576 + ctrl->sqsize = min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize); 2590 2577 2591 2578 if (ctrl->vs >= NVME_VS(1, 1, 0)) 2592 - ctrl->subsystem = NVME_CAP_NSSRC(cap); 2579 + ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap); 2593 2580 2594 2581 ret = nvme_identify_ctrl(ctrl, &id); 2595 2582 if (ret) {
+22 -4
drivers/nvme/host/fabrics.c
··· 150 150 cmd.prop_get.fctype = nvme_fabrics_type_property_get; 151 151 cmd.prop_get.offset = cpu_to_le32(off); 152 152 153 - ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res, NULL, 0, 0, 153 + ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0, 0, 154 154 NVME_QID_ANY, 0, 0, false); 155 155 156 156 if (ret >= 0) ··· 197 197 cmd.prop_get.attrib = 1; 198 198 cmd.prop_get.offset = cpu_to_le32(off); 199 199 200 - ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res, NULL, 0, 0, 200 + ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0, 0, 201 201 NVME_QID_ANY, 0, 0, false); 202 202 203 203 if (ret >= 0) ··· 243 243 cmd.prop_set.offset = cpu_to_le32(off); 244 244 cmd.prop_set.value = cpu_to_le64(val); 245 245 246 - ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, NULL, 0, 0, 246 + ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, NULL, NULL, 0, 0, 247 247 NVME_QID_ANY, 0, 0, false); 248 248 if (unlikely(ret)) 249 249 dev_err(ctrl->device, ··· 396 396 strncpy(data->subsysnqn, ctrl->opts->subsysnqn, NVMF_NQN_SIZE); 397 397 strncpy(data->hostnqn, ctrl->opts->host->nqn, NVMF_NQN_SIZE); 398 398 399 - ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res, 399 + ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, 400 400 data, sizeof(*data), 0, NVME_QID_ANY, 1, 401 401 BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT, false); 402 402 if (ret) { ··· 611 611 { NVMF_OPT_DATA_DIGEST, "data_digest" }, 612 612 { NVMF_OPT_NR_WRITE_QUEUES, "nr_write_queues=%d" }, 613 613 { NVMF_OPT_NR_POLL_QUEUES, "nr_poll_queues=%d" }, 614 + { NVMF_OPT_TOS, "tos=%d" }, 614 615 { NVMF_OPT_ERR, NULL } 615 616 }; 616 617 ··· 633 632 opts->duplicate_connect = false; 634 633 opts->hdr_digest = false; 635 634 opts->data_digest = false; 635 + opts->tos = -1; /* < 0 == use transport default */ 636 636 637 637 options = o = kstrdup(buf, GFP_KERNEL); 638 638 if (!options) ··· 857 855 goto out; 858 856 } 859 857 opts->nr_poll_queues = token; 858 + break; 859 + case 
NVMF_OPT_TOS: 860 + if (match_int(args, &token)) { 861 + ret = -EINVAL; 862 + goto out; 863 + } 864 + if (token < 0) { 865 + pr_err("Invalid type of service %d\n", token); 866 + ret = -EINVAL; 867 + goto out; 868 + } 869 + if (token > 255) { 870 + pr_warn("Clamping type of service to 255\n"); 871 + token = 255; 872 + } 873 + opts->tos = token; 860 874 break; 861 875 default: 862 876 pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n",
+3
drivers/nvme/host/fabrics.h
··· 55 55 NVMF_OPT_DATA_DIGEST = 1 << 16, 56 56 NVMF_OPT_NR_WRITE_QUEUES = 1 << 17, 57 57 NVMF_OPT_NR_POLL_QUEUES = 1 << 18, 58 + NVMF_OPT_TOS = 1 << 19, 58 59 }; 59 60 60 61 /** ··· 88 87 * @data_digest: generate/verify data digest (TCP) 89 88 * @nr_write_queues: number of queues for write I/O 90 89 * @nr_poll_queues: number of queues for polling I/O 90 + * @tos: type of service 91 91 */ 92 92 struct nvmf_ctrl_options { 93 93 unsigned mask; ··· 110 108 bool data_digest; 111 109 unsigned int nr_write_queues; 112 110 unsigned int nr_poll_queues; 111 + int tos; 113 112 }; 114 113 115 114 /*
+15 -19
drivers/nvme/host/fc.c
··· 2006 2006 2007 2007 blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); 2008 2008 blk_cleanup_queue(ctrl->ctrl.admin_q); 2009 + blk_cleanup_queue(ctrl->ctrl.fabrics_q); 2009 2010 blk_mq_free_tag_set(&ctrl->admin_tag_set); 2010 2011 2011 2012 kfree(ctrl->queues); ··· 2108 2107 struct nvme_fc_fcp_op *op) 2109 2108 { 2110 2109 struct nvmefc_fcp_req *freq = &op->fcp_req; 2111 - enum dma_data_direction dir; 2112 2110 int ret; 2113 2111 2114 2112 freq->sg_cnt = 0; ··· 2124 2124 2125 2125 op->nents = blk_rq_map_sg(rq->q, rq, freq->sg_table.sgl); 2126 2126 WARN_ON(op->nents > blk_rq_nr_phys_segments(rq)); 2127 - dir = (rq_data_dir(rq) == WRITE) ? DMA_TO_DEVICE : DMA_FROM_DEVICE; 2128 2127 freq->sg_cnt = fc_dma_map_sg(ctrl->lport->dev, freq->sg_table.sgl, 2129 - op->nents, dir); 2128 + op->nents, rq_dma_dir(rq)); 2130 2129 if (unlikely(freq->sg_cnt <= 0)) { 2131 2130 sg_free_table_chained(&freq->sg_table, SG_CHUNK_SIZE); 2132 2131 freq->sg_cnt = 0; ··· 2148 2149 return; 2149 2150 2150 2151 fc_dma_unmap_sg(ctrl->lport->dev, freq->sg_table.sgl, op->nents, 2151 - ((rq_data_dir(rq) == WRITE) ? 
2152 - DMA_TO_DEVICE : DMA_FROM_DEVICE)); 2152 + rq_dma_dir(rq)); 2153 2153 2154 2154 nvme_cleanup_cmd(rq); 2155 2155 ··· 2631 2633 if (ret) 2632 2634 goto out_delete_hw_queue; 2633 2635 2634 - blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); 2635 - 2636 2636 ret = nvmf_connect_admin_queue(&ctrl->ctrl); 2637 2637 if (ret) 2638 2638 goto out_disconnect_admin_queue; ··· 2644 2648 * prior connection values 2645 2649 */ 2646 2650 2647 - ret = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->ctrl.cap); 2648 - if (ret) { 2649 - dev_err(ctrl->ctrl.device, 2650 - "prop_get NVME_REG_CAP failed\n"); 2651 - goto out_disconnect_admin_queue; 2652 - } 2653 - 2654 - ctrl->ctrl.sqsize = 2655 - min_t(int, NVME_CAP_MQES(ctrl->ctrl.cap), ctrl->ctrl.sqsize); 2656 - 2657 - ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap); 2651 + ret = nvme_enable_ctrl(&ctrl->ctrl); 2658 2652 if (ret) 2659 2653 goto out_disconnect_admin_queue; 2660 2654 2661 2655 ctrl->ctrl.max_hw_sectors = 2662 2656 (ctrl->lport->ops->max_sgl_segments - 1) << (PAGE_SHIFT - 9); 2657 + 2658 + blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); 2663 2659 2664 2660 ret = nvme_init_identify(&ctrl->ctrl); 2665 2661 if (ret) ··· 3099 3111 goto out_free_queues; 3100 3112 ctrl->ctrl.admin_tagset = &ctrl->admin_tag_set; 3101 3113 3114 + ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set); 3115 + if (IS_ERR(ctrl->ctrl.fabrics_q)) { 3116 + ret = PTR_ERR(ctrl->ctrl.fabrics_q); 3117 + goto out_free_admin_tag_set; 3118 + } 3119 + 3102 3120 ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set); 3103 3121 if (IS_ERR(ctrl->ctrl.admin_q)) { 3104 3122 ret = PTR_ERR(ctrl->ctrl.admin_q); 3105 - goto out_free_admin_tag_set; 3123 + goto out_cleanup_fabrics_q; 3106 3124 } 3107 3125 3108 3126 /* ··· 3180 3186 3181 3187 out_cleanup_admin_q: 3182 3188 blk_cleanup_queue(ctrl->ctrl.admin_q); 3189 + out_cleanup_fabrics_q: 3190 + blk_cleanup_queue(ctrl->ctrl.fabrics_q); 3183 3191 out_free_admin_tag_set: 3184 3192 
blk_mq_free_tag_set(&ctrl->admin_tag_set); 3185 3193 out_free_queues:
+5 -3
drivers/nvme/host/multipath.c
··· 444 444 445 445 down_write(&ctrl->namespaces_rwsem); 446 446 list_for_each_entry(ns, &ctrl->namespaces, list) { 447 - if (ns->head->ns_id != le32_to_cpu(desc->nsids[n])) 447 + unsigned nsid = le32_to_cpu(desc->nsids[n]); 448 + 449 + if (ns->head->ns_id < nsid) 448 450 continue; 449 - nvme_update_ns_ana_state(desc, ns); 451 + if (ns->head->ns_id == nsid) 452 + nvme_update_ns_ana_state(desc, ns); 450 453 if (++n == nr_nsids) 451 454 break; 452 455 } 453 456 up_write(&ctrl->namespaces_rwsem); 454 - WARN_ON_ONCE(n < nr_nsids); 455 457 return 0; 456 458 } 457 459
+34 -2
drivers/nvme/host/nvme.h
··· 16 16 #include <linux/fault-inject.h> 17 17 #include <linux/rcupdate.h> 18 18 19 + #include <trace/events/block.h> 20 + 19 21 extern unsigned int nvme_io_timeout; 20 22 #define NVME_IO_TIMEOUT (nvme_io_timeout * HZ) 21 23 ··· 94 92 * Broken Write Zeroes. 95 93 */ 96 94 NVME_QUIRK_DISABLE_WRITE_ZEROES = (1 << 9), 95 + 96 + /* 97 + * Use only one interrupt vector for all queues 98 + */ 99 + NVME_QUIRK_SINGLE_VECTOR = (1 << 10), 100 + 101 + /* 102 + * Use non-standard 128 bytes SQEs. 103 + */ 104 + NVME_QUIRK_128_BYTES_SQES = (1 << 11), 105 + 106 + /* 107 + * Prevent tag overlap between queues 108 + */ 109 + NVME_QUIRK_SHARED_TAGS = (1 << 12), 97 110 }; 98 111 99 112 /* ··· 181 164 const struct nvme_ctrl_ops *ops; 182 165 struct request_queue *admin_q; 183 166 struct request_queue *connect_q; 167 + struct request_queue *fabrics_q; 184 168 struct device *dev; 185 169 int instance; 186 170 int numa_node; ··· 444 426 bool nvme_cancel_request(struct request *req, void *data, bool reserved); 445 427 bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, 446 428 enum nvme_ctrl_state new_state); 447 - int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap); 448 - int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap); 429 + int nvme_disable_ctrl(struct nvme_ctrl *ctrl); 430 + int nvme_enable_ctrl(struct nvme_ctrl *ctrl); 449 431 int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl); 450 432 int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, 451 433 const struct nvme_ctrl_ops *ops, unsigned long quirks); ··· 529 511 kblockd_schedule_work(&head->requeue_work); 530 512 } 531 513 514 + static inline void nvme_trace_bio_complete(struct request *req, 515 + blk_status_t status) 516 + { 517 + struct nvme_ns *ns = req->q->queuedata; 518 + 519 + if (req->cmd_flags & REQ_NVME_MPATH) 520 + trace_block_bio_complete(ns->head->disk->queue, 521 + req->bio, status); 522 + } 523 + 532 524 extern struct device_attribute dev_attr_ana_grpid; 533 525 extern struct device_attribute 
dev_attr_ana_state; 534 526 extern struct device_attribute subsys_attr_iopolicy; ··· 580 552 { 581 553 } 582 554 static inline void nvme_mpath_check_last_path(struct nvme_ns *ns) 555 + { 556 + } 557 + static inline void nvme_trace_bio_complete(struct request *req, 558 + blk_status_t status) 583 559 { 584 560 } 585 561 static inline int nvme_mpath_init(struct nvme_ctrl *ctrl,
+76 -23
drivers/nvme/host/pci.c
··· 28 28 #include "trace.h" 29 29 #include "nvme.h" 30 30 31 - #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) 32 - #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) 31 + #define SQ_SIZE(q) ((q)->q_depth << (q)->sqes) 32 + #define CQ_SIZE(q) ((q)->q_depth * sizeof(struct nvme_completion)) 33 33 34 34 #define SGES_PER_PAGE (PAGE_SIZE / sizeof(struct nvme_sgl_desc)) 35 35 ··· 100 100 unsigned io_queues[HCTX_MAX_TYPES]; 101 101 unsigned int num_vecs; 102 102 int q_depth; 103 + int io_sqes; 103 104 u32 db_stride; 104 105 void __iomem *bar; 105 106 unsigned long bar_mapped_size; ··· 163 162 struct nvme_queue { 164 163 struct nvme_dev *dev; 165 164 spinlock_t sq_lock; 166 - struct nvme_command *sq_cmds; 165 + void *sq_cmds; 167 166 /* only used for poll queues: */ 168 167 spinlock_t cq_poll_lock ____cacheline_aligned_in_smp; 169 168 volatile struct nvme_completion *cqes; ··· 179 178 u16 last_cq_head; 180 179 u16 qid; 181 180 u8 cq_phase; 181 + u8 sqes; 182 182 unsigned long flags; 183 183 #define NVMEQ_ENABLED 0 184 184 #define NVMEQ_SQ_CMB 1 ··· 490 488 bool write_sq) 491 489 { 492 490 spin_lock(&nvmeq->sq_lock); 493 - memcpy(&nvmeq->sq_cmds[nvmeq->sq_tail], cmd, sizeof(*cmd)); 491 + memcpy(nvmeq->sq_cmds + (nvmeq->sq_tail << nvmeq->sqes), 492 + cmd, sizeof(*cmd)); 494 493 if (++nvmeq->sq_tail == nvmeq->q_depth) 495 494 nvmeq->sq_tail = 0; 496 495 nvme_write_sq_db(nvmeq, write_sq); ··· 537 534 static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) 538 535 { 539 536 struct nvme_iod *iod = blk_mq_rq_to_pdu(req); 540 - enum dma_data_direction dma_dir = rq_data_dir(req) ? 
541 - DMA_TO_DEVICE : DMA_FROM_DEVICE; 542 537 const int last_prp = dev->ctrl.page_size / sizeof(__le64) - 1; 543 538 dma_addr_t dma_addr = iod->first_dma, next_dma_addr; 544 539 int i; 545 540 546 541 if (iod->dma_len) { 547 - dma_unmap_page(dev->dev, dma_addr, iod->dma_len, dma_dir); 542 + dma_unmap_page(dev->dev, dma_addr, iod->dma_len, 543 + rq_dma_dir(req)); 548 544 return; 549 545 } 550 546 ··· 1346 1344 1347 1345 static void nvme_free_queue(struct nvme_queue *nvmeq) 1348 1346 { 1349 - dma_free_coherent(nvmeq->dev->dev, CQ_SIZE(nvmeq->q_depth), 1347 + dma_free_coherent(nvmeq->dev->dev, CQ_SIZE(nvmeq), 1350 1348 (void *)nvmeq->cqes, nvmeq->cq_dma_addr); 1351 1349 if (!nvmeq->sq_cmds) 1352 1350 return; 1353 1351 1354 1352 if (test_and_clear_bit(NVMEQ_SQ_CMB, &nvmeq->flags)) { 1355 1353 pci_free_p2pmem(to_pci_dev(nvmeq->dev->dev), 1356 - nvmeq->sq_cmds, SQ_SIZE(nvmeq->q_depth)); 1354 + nvmeq->sq_cmds, SQ_SIZE(nvmeq)); 1357 1355 } else { 1358 - dma_free_coherent(nvmeq->dev->dev, SQ_SIZE(nvmeq->q_depth), 1356 + dma_free_coherent(nvmeq->dev->dev, SQ_SIZE(nvmeq), 1359 1357 nvmeq->sq_cmds, nvmeq->sq_dma_addr); 1360 1358 } 1361 1359 } ··· 1405 1403 if (shutdown) 1406 1404 nvme_shutdown_ctrl(&dev->ctrl); 1407 1405 else 1408 - nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap); 1406 + nvme_disable_ctrl(&dev->ctrl); 1409 1407 1410 1408 nvme_poll_irqdisable(nvmeq, -1); 1411 1409 } ··· 1435 1433 } 1436 1434 1437 1435 static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, 1438 - int qid, int depth) 1436 + int qid) 1439 1437 { 1440 1438 struct pci_dev *pdev = to_pci_dev(dev->dev); 1441 1439 1442 1440 if (qid && dev->cmb_use_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) { 1443 - nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(depth)); 1441 + nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(nvmeq)); 1444 1442 if (nvmeq->sq_cmds) { 1445 1443 nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev, 1446 1444 nvmeq->sq_cmds); ··· 1449 1447 return 0; 1450 1448 } 1451 1449 1452 - 
pci_free_p2pmem(pdev, nvmeq->sq_cmds, SQ_SIZE(depth)); 1450 + pci_free_p2pmem(pdev, nvmeq->sq_cmds, SQ_SIZE(nvmeq)); 1453 1451 } 1454 1452 } 1455 1453 1456 - nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth), 1454 + nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(nvmeq), 1457 1455 &nvmeq->sq_dma_addr, GFP_KERNEL); 1458 1456 if (!nvmeq->sq_cmds) 1459 1457 return -ENOMEM; ··· 1467 1465 if (dev->ctrl.queue_count > qid) 1468 1466 return 0; 1469 1467 1470 - nvmeq->cqes = dma_alloc_coherent(dev->dev, CQ_SIZE(depth), 1468 + nvmeq->sqes = qid ? dev->io_sqes : NVME_ADM_SQES; 1469 + nvmeq->q_depth = depth; 1470 + nvmeq->cqes = dma_alloc_coherent(dev->dev, CQ_SIZE(nvmeq), 1471 1471 &nvmeq->cq_dma_addr, GFP_KERNEL); 1472 1472 if (!nvmeq->cqes) 1473 1473 goto free_nvmeq; 1474 1474 1475 - if (nvme_alloc_sq_cmds(dev, nvmeq, qid, depth)) 1475 + if (nvme_alloc_sq_cmds(dev, nvmeq, qid)) 1476 1476 goto free_cqdma; 1477 1477 1478 1478 nvmeq->dev = dev; ··· 1483 1479 nvmeq->cq_head = 0; 1484 1480 nvmeq->cq_phase = 1; 1485 1481 nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; 1486 - nvmeq->q_depth = depth; 1487 1482 nvmeq->qid = qid; 1488 1483 dev->ctrl.queue_count++; 1489 1484 1490 1485 return 0; 1491 1486 1492 1487 free_cqdma: 1493 - dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes, 1494 - nvmeq->cq_dma_addr); 1488 + dma_free_coherent(dev->dev, CQ_SIZE(nvmeq), (void *)nvmeq->cqes, 1489 + nvmeq->cq_dma_addr); 1495 1490 free_nvmeq: 1496 1491 return -ENOMEM; 1497 1492 } ··· 1518 1515 nvmeq->cq_head = 0; 1519 1516 nvmeq->cq_phase = 1; 1520 1517 nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; 1521 - memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth)); 1518 + memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq)); 1522 1519 nvme_dbbuf_init(dev, nvmeq, qid); 1523 1520 dev->online_queues++; 1524 1521 wmb(); /* ensure the first interrupt sees the initialization */ ··· 1682 1679 (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO)) 1683 1680 
writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS); 1684 1681 1685 - result = nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap); 1682 + result = nvme_disable_ctrl(&dev->ctrl); 1686 1683 if (result < 0) 1687 1684 return result; 1688 1685 ··· 1698 1695 lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ); 1699 1696 lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ); 1700 1697 1701 - result = nvme_enable_ctrl(&dev->ctrl, dev->ctrl.cap); 1698 + result = nvme_enable_ctrl(&dev->ctrl); 1702 1699 if (result) 1703 1700 return result; 1704 1701 ··· 2080 2077 dev->io_queues[HCTX_TYPE_DEFAULT] = 1; 2081 2078 dev->io_queues[HCTX_TYPE_READ] = 0; 2082 2079 2080 + /* 2081 + * Some Apple controllers require all queues to use the 2082 + * first vector. 2083 + */ 2084 + if (dev->ctrl.quirks & NVME_QUIRK_SINGLE_VECTOR) 2085 + irq_queues = 1; 2086 + 2083 2087 return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues, 2084 2088 PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd); 2085 2089 } ··· 2105 2095 unsigned long size; 2106 2096 2107 2097 nr_io_queues = max_io_queues(); 2098 + 2099 + /* 2100 + * If tags are shared with admin queue (Apple bug), then 2101 + * make sure we only use one IO queue. 2102 + */ 2103 + if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) 2104 + nr_io_queues = 1; 2105 + 2108 2106 result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues); 2109 2107 if (result < 0) 2110 2108 return result; ··· 2283 2265 dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE; 2284 2266 dev->tagset.driver_data = dev; 2285 2267 2268 + /* 2269 + * Some Apple controllers requires tags to be unique 2270 + * across admin and IO queue, so reserve the first 32 2271 + * tags of the IO queue. 
2272 + */ 2273 + if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) 2274 + dev->tagset.reserved_tags = NVME_AQ_DEPTH; 2275 + 2286 2276 ret = blk_mq_alloc_tag_set(&dev->tagset); 2287 2277 if (ret) { 2288 2278 dev_warn(dev->ctrl.device, ··· 2340 2314 2341 2315 dev->q_depth = min_t(int, NVME_CAP_MQES(dev->ctrl.cap) + 1, 2342 2316 io_queue_depth); 2317 + dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */ 2343 2318 dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap); 2344 2319 dev->dbs = dev->bar + 4096; 2320 + 2321 + /* 2322 + * Some Apple controllers require a non-standard SQE size. 2323 + * Interestingly they also seem to ignore the CC:IOSQES register 2324 + * so we don't bother updating it here. 2325 + */ 2326 + if (dev->ctrl.quirks & NVME_QUIRK_128_BYTES_SQES) 2327 + dev->io_sqes = 7; 2328 + else 2329 + dev->io_sqes = NVME_NVM_IOSQES; 2345 2330 2346 2331 /* 2347 2332 * Temporary fix for the Apple controller found in the MacBook8,1 and ··· 2370 2333 dev_err(dev->ctrl.device, "detected PM1725 NVMe controller, " 2371 2334 "set queue depth=%u\n", dev->q_depth); 2372 2335 } 2336 + 2337 + /* 2338 + * Controllers with the shared tags quirk need the IO queue to be 2339 + * big enough so that we get 32 tags for the admin queue 2340 + */ 2341 + if ((dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) && 2342 + (dev->q_depth < (NVME_AQ_DEPTH + 2))) { 2343 + dev->q_depth = NVME_AQ_DEPTH + 2; 2344 + dev_warn(dev->ctrl.device, "IO queue depth clamped to %d\n", 2345 + dev->q_depth); 2346 + } 2347 + 2373 2348 2374 2349 nvme_map_cmb(dev); 2375 2350 ··· 3083 3034 { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, 3084 3035 { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) }, 3085 3036 { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) }, 3037 + { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2005), 3038 + .driver_data = NVME_QUIRK_SINGLE_VECTOR | 3039 + NVME_QUIRK_128_BYTES_SQES | 3040 + NVME_QUIRK_SHARED_TAGS }, 3086 3041 { 0, } 3087 3042 }; 3088 3043 MODULE_DEVICE_TABLE(pci, nvme_id_table);
+28 -25
drivers/nvme/host/rdma.c
··· 751 751 { 752 752 if (remove) { 753 753 blk_cleanup_queue(ctrl->ctrl.admin_q); 754 + blk_cleanup_queue(ctrl->ctrl.fabrics_q); 754 755 blk_mq_free_tag_set(ctrl->ctrl.admin_tagset); 755 756 } 756 757 if (ctrl->async_event_sqe.data) { ··· 793 792 goto out_free_async_qe; 794 793 } 795 794 795 + ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set); 796 + if (IS_ERR(ctrl->ctrl.fabrics_q)) { 797 + error = PTR_ERR(ctrl->ctrl.fabrics_q); 798 + goto out_free_tagset; 799 + } 800 + 796 801 ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set); 797 802 if (IS_ERR(ctrl->ctrl.admin_q)) { 798 803 error = PTR_ERR(ctrl->ctrl.admin_q); 799 - goto out_free_tagset; 804 + goto out_cleanup_fabrics_q; 800 805 } 801 806 } 802 807 ··· 810 803 if (error) 811 804 goto out_cleanup_queue; 812 805 813 - error = ctrl->ctrl.ops->reg_read64(&ctrl->ctrl, NVME_REG_CAP, 814 - &ctrl->ctrl.cap); 815 - if (error) { 816 - dev_err(ctrl->ctrl.device, 817 - "prop_get NVME_REG_CAP failed\n"); 818 - goto out_stop_queue; 819 - } 820 - 821 - ctrl->ctrl.sqsize = 822 - min_t(int, NVME_CAP_MQES(ctrl->ctrl.cap), ctrl->ctrl.sqsize); 823 - 824 - error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap); 806 + error = nvme_enable_ctrl(&ctrl->ctrl); 825 807 if (error) 826 808 goto out_stop_queue; 827 809 828 810 ctrl->ctrl.max_hw_sectors = 829 811 (ctrl->max_fr_pages - 1) << (ilog2(SZ_4K) - 9); 812 + 813 + blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); 830 814 831 815 error = nvme_init_identify(&ctrl->ctrl); 832 816 if (error) ··· 830 832 out_cleanup_queue: 831 833 if (new) 832 834 blk_cleanup_queue(ctrl->ctrl.admin_q); 835 + out_cleanup_fabrics_q: 836 + if (new) 837 + blk_cleanup_queue(ctrl->ctrl.fabrics_q); 833 838 out_free_tagset: 834 839 if (new) 835 840 blk_mq_free_tag_set(ctrl->ctrl.admin_tagset); ··· 907 906 nvme_cancel_request, &ctrl->ctrl); 908 907 blk_mq_tagset_wait_completed_request(ctrl->ctrl.admin_tagset); 909 908 } 910 - blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); 909 + if (remove) 910 + 
blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); 911 911 nvme_rdma_destroy_admin_queue(ctrl, remove); 912 912 } 913 913 ··· 1059 1057 nvme_rdma_teardown_io_queues(ctrl, false); 1060 1058 nvme_start_queues(&ctrl->ctrl); 1061 1059 nvme_rdma_teardown_admin_queue(ctrl, false); 1060 + blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); 1062 1061 1063 1062 if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { 1064 1063 /* state change failure is ok if we're in DELETING state */ ··· 1146 1143 req->mr = NULL; 1147 1144 } 1148 1145 1149 - ib_dma_unmap_sg(ibdev, req->sg_table.sgl, 1150 - req->nents, rq_data_dir(rq) == 1151 - WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE); 1146 + ib_dma_unmap_sg(ibdev, req->sg_table.sgl, req->nents, rq_dma_dir(rq)); 1152 1147 1153 1148 nvme_cleanup_cmd(rq); 1154 1149 sg_free_table_chained(&req->sg_table, SG_CHUNK_SIZE); ··· 1272 1271 req->nents = blk_rq_map_sg(rq->q, rq, req->sg_table.sgl); 1273 1272 1274 1273 count = ib_dma_map_sg(ibdev, req->sg_table.sgl, req->nents, 1275 - rq_data_dir(rq) == WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE); 1274 + rq_dma_dir(rq)); 1276 1275 if (unlikely(count <= 0)) { 1277 1276 ret = -EIO; 1278 1277 goto out_free_table; ··· 1301 1300 return 0; 1302 1301 1303 1302 out_unmap_sg: 1304 - ib_dma_unmap_sg(ibdev, req->sg_table.sgl, 1305 - req->nents, rq_data_dir(rq) == 1306 - WRITE ? 
DMA_TO_DEVICE : DMA_FROM_DEVICE); 1303 + ib_dma_unmap_sg(ibdev, req->sg_table.sgl, req->nents, rq_dma_dir(rq)); 1307 1304 out_free_table: 1308 1305 sg_free_table_chained(&req->sg_table, SG_CHUNK_SIZE); 1309 1306 return ret; ··· 1544 1545 1545 1546 static int nvme_rdma_addr_resolved(struct nvme_rdma_queue *queue) 1546 1547 { 1548 + struct nvme_ctrl *ctrl = &queue->ctrl->ctrl; 1547 1549 int ret; 1548 1550 1549 1551 ret = nvme_rdma_create_queue_ib(queue); 1550 1552 if (ret) 1551 1553 return ret; 1552 1554 1555 + if (ctrl->opts->tos >= 0) 1556 + rdma_set_service_type(queue->cm_id, ctrl->opts->tos); 1553 1557 ret = rdma_resolve_route(queue->cm_id, NVME_RDMA_CONNECT_TIMEOUT_MS); 1554 1558 if (ret) { 1555 - dev_err(queue->ctrl->ctrl.device, 1556 - "rdma_resolve_route failed (%d).\n", 1559 + dev_err(ctrl->device, "rdma_resolve_route failed (%d).\n", 1557 1560 queue->cm_error); 1558 1561 goto out_destroy_queue; 1559 1562 } ··· 1868 1867 cancel_delayed_work_sync(&ctrl->reconnect_work); 1869 1868 1870 1869 nvme_rdma_teardown_io_queues(ctrl, shutdown); 1870 + blk_mq_quiesce_queue(ctrl->ctrl.admin_q); 1871 1871 if (shutdown) 1872 1872 nvme_shutdown_ctrl(&ctrl->ctrl); 1873 1873 else 1874 - nvme_disable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap); 1874 + nvme_disable_ctrl(&ctrl->ctrl); 1875 1875 nvme_rdma_teardown_admin_queue(ctrl, shutdown); 1876 1876 } 1877 1877 ··· 2051 2049 .required_opts = NVMF_OPT_TRADDR, 2052 2050 .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY | 2053 2051 NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO | 2054 - NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES, 2052 + NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES | 2053 + NVMF_OPT_TOS, 2055 2054 .create_ctrl = nvme_rdma_create_ctrl, 2056 2055 }; 2057 2056
+93 -39
drivers/nvme/host/tcp.c
··· 13 13 #include <net/tcp.h> 14 14 #include <linux/blk-mq.h> 15 15 #include <crypto/hash.h> 16 + #include <net/busy_poll.h> 16 17 17 18 #include "nvme.h" 18 19 #include "fabrics.h" ··· 73 72 int pdu_offset; 74 73 size_t data_remaining; 75 74 size_t ddgst_remaining; 75 + unsigned int nr_cqe; 76 76 77 77 /* send state */ 78 78 struct nvme_tcp_request *request; ··· 440 438 } 441 439 442 440 nvme_end_request(rq, cqe->status, cqe->result); 441 + queue->nr_cqe++; 443 442 444 443 return 0; 445 444 } ··· 611 608 612 609 switch (hdr->type) { 613 610 case nvme_tcp_c2h_data: 614 - ret = nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu); 615 - break; 611 + return nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu); 616 612 case nvme_tcp_rsp: 617 613 nvme_tcp_init_recv_ctx(queue); 618 - ret = nvme_tcp_handle_comp(queue, (void *)queue->pdu); 619 - break; 614 + return nvme_tcp_handle_comp(queue, (void *)queue->pdu); 620 615 case nvme_tcp_r2t: 621 616 nvme_tcp_init_recv_ctx(queue); 622 - ret = nvme_tcp_handle_r2t(queue, (void *)queue->pdu); 623 - break; 617 + return nvme_tcp_handle_r2t(queue, (void *)queue->pdu); 624 618 default: 625 619 dev_err(queue->ctrl->ctrl.device, 626 620 "unsupported pdu type (%d)\n", hdr->type); 627 621 return -EINVAL; 628 622 } 629 - 630 - return ret; 631 623 } 632 624 633 625 static inline void nvme_tcp_end_request(struct request *rq, u16 status) ··· 699 701 nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst); 700 702 queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH; 701 703 } else { 702 - if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) 704 + if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) { 703 705 nvme_tcp_end_request(rq, NVME_SC_SUCCESS); 706 + queue->nr_cqe++; 707 + } 704 708 nvme_tcp_init_recv_ctx(queue); 705 709 } 706 710 } ··· 742 742 pdu->command_id); 743 743 744 744 nvme_tcp_end_request(rq, NVME_SC_SUCCESS); 745 + queue->nr_cqe++; 745 746 } 746 747 747 748 nvme_tcp_init_recv_ctx(queue); ··· 1024 1023 1025 1024 static int 
nvme_tcp_try_recv(struct nvme_tcp_queue *queue) 1026 1025 { 1027 - struct sock *sk = queue->sock->sk; 1026 + struct socket *sock = queue->sock; 1027 + struct sock *sk = sock->sk; 1028 1028 read_descriptor_t rd_desc; 1029 1029 int consumed; 1030 1030 1031 1031 rd_desc.arg.data = queue; 1032 1032 rd_desc.count = 1; 1033 1033 lock_sock(sk); 1034 - consumed = tcp_read_sock(sk, &rd_desc, nvme_tcp_recv_skb); 1034 + queue->nr_cqe = 0; 1035 + consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb); 1035 1036 release_sock(sk); 1036 1037 return consumed; 1037 1038 } ··· 1258 1255 queue->queue_size = queue_size; 1259 1256 1260 1257 if (qid > 0) 1261 - queue->cmnd_capsule_len = ctrl->ctrl.ioccsz * 16; 1258 + queue->cmnd_capsule_len = nctrl->ioccsz * 16; 1262 1259 else 1263 1260 queue->cmnd_capsule_len = sizeof(struct nvme_command) + 1264 1261 NVME_TCP_ADMIN_CCSZ; ··· 1266 1263 ret = sock_create(ctrl->addr.ss_family, SOCK_STREAM, 1267 1264 IPPROTO_TCP, &queue->sock); 1268 1265 if (ret) { 1269 - dev_err(ctrl->ctrl.device, 1266 + dev_err(nctrl->device, 1270 1267 "failed to create socket: %d\n", ret); 1271 1268 return ret; 1272 1269 } ··· 1276 1273 ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, TCP_SYNCNT, 1277 1274 (char *)&opt, sizeof(opt)); 1278 1275 if (ret) { 1279 - dev_err(ctrl->ctrl.device, 1276 + dev_err(nctrl->device, 1280 1277 "failed to set TCP_SYNCNT sock opt %d\n", ret); 1281 1278 goto err_sock; 1282 1279 } ··· 1286 1283 ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, 1287 1284 TCP_NODELAY, (char *)&opt, sizeof(opt)); 1288 1285 if (ret) { 1289 - dev_err(ctrl->ctrl.device, 1286 + dev_err(nctrl->device, 1290 1287 "failed to set TCP_NODELAY sock opt %d\n", ret); 1291 1288 goto err_sock; 1292 1289 } ··· 1299 1296 ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_LINGER, 1300 1297 (char *)&sol, sizeof(sol)); 1301 1298 if (ret) { 1302 - dev_err(ctrl->ctrl.device, 1299 + dev_err(nctrl->device, 1303 1300 "failed to set SO_LINGER sock opt %d\n", ret); 1304 1301 
goto err_sock; 1302 + } 1303 + 1304 + /* Set socket type of service */ 1305 + if (nctrl->opts->tos >= 0) { 1306 + opt = nctrl->opts->tos; 1307 + ret = kernel_setsockopt(queue->sock, SOL_IP, IP_TOS, 1308 + (char *)&opt, sizeof(opt)); 1309 + if (ret) { 1310 + dev_err(nctrl->device, 1311 + "failed to set IP_TOS sock opt %d\n", ret); 1312 + goto err_sock; 1313 + } 1305 1314 } 1306 1315 1307 1316 queue->sock->sk->sk_allocation = GFP_ATOMIC; ··· 1329 1314 queue->pdu_offset = 0; 1330 1315 sk_set_memalloc(queue->sock->sk); 1331 1316 1332 - if (ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR) { 1317 + if (nctrl->opts->mask & NVMF_OPT_HOST_TRADDR) { 1333 1318 ret = kernel_bind(queue->sock, (struct sockaddr *)&ctrl->src_addr, 1334 1319 sizeof(ctrl->src_addr)); 1335 1320 if (ret) { 1336 - dev_err(ctrl->ctrl.device, 1321 + dev_err(nctrl->device, 1337 1322 "failed to bind queue %d socket %d\n", 1338 1323 qid, ret); 1339 1324 goto err_sock; ··· 1345 1330 if (queue->hdr_digest || queue->data_digest) { 1346 1331 ret = nvme_tcp_alloc_crypto(queue); 1347 1332 if (ret) { 1348 - dev_err(ctrl->ctrl.device, 1333 + dev_err(nctrl->device, 1349 1334 "failed to allocate queue %d crypto\n", qid); 1350 1335 goto err_sock; 1351 1336 } ··· 1359 1344 goto err_crypto; 1360 1345 } 1361 1346 1362 - dev_dbg(ctrl->ctrl.device, "connecting queue %d\n", 1347 + dev_dbg(nctrl->device, "connecting queue %d\n", 1363 1348 nvme_tcp_queue_id(queue)); 1364 1349 1365 1350 ret = kernel_connect(queue->sock, (struct sockaddr *)&ctrl->addr, 1366 1351 sizeof(ctrl->addr), 0); 1367 1352 if (ret) { 1368 - dev_err(ctrl->ctrl.device, 1353 + dev_err(nctrl->device, 1369 1354 "failed to connect socket: %d\n", ret); 1370 1355 goto err_rcv_pdu; 1371 1356 } ··· 1386 1371 queue->sock->sk->sk_data_ready = nvme_tcp_data_ready; 1387 1372 queue->sock->sk->sk_state_change = nvme_tcp_state_change; 1388 1373 queue->sock->sk->sk_write_space = nvme_tcp_write_space; 1374 + queue->sock->sk->sk_ll_usec = 1; 1389 1375 
write_unlock_bh(&queue->sock->sk->sk_callback_lock); 1390 1376 1391 1377 return 0; ··· 1485 1469 set->driver_data = ctrl; 1486 1470 set->nr_hw_queues = nctrl->queue_count - 1; 1487 1471 set->timeout = NVME_IO_TIMEOUT; 1488 - set->nr_maps = 2 /* default + read */; 1472 + set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2; 1489 1473 } 1490 1474 1491 1475 ret = blk_mq_alloc_tag_set(set); ··· 1584 1568 1585 1569 nr_io_queues = min(ctrl->opts->nr_io_queues, num_online_cpus()); 1586 1570 nr_io_queues += min(ctrl->opts->nr_write_queues, num_online_cpus()); 1571 + nr_io_queues += min(ctrl->opts->nr_poll_queues, num_online_cpus()); 1587 1572 1588 1573 return nr_io_queues; 1589 1574 } ··· 1615 1598 ctrl->io_queues[HCTX_TYPE_DEFAULT] = 1616 1599 min(opts->nr_io_queues, nr_io_queues); 1617 1600 nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT]; 1601 + } 1602 + 1603 + if (opts->nr_poll_queues && nr_io_queues) { 1604 + /* map dedicated poll queues only if we have queues left */ 1605 + ctrl->io_queues[HCTX_TYPE_POLL] = 1606 + min(opts->nr_poll_queues, nr_io_queues); 1618 1607 } 1619 1608 } 1620 1609 ··· 1703 1680 nvme_tcp_stop_queue(ctrl, 0); 1704 1681 if (remove) { 1705 1682 blk_cleanup_queue(ctrl->admin_q); 1683 + blk_cleanup_queue(ctrl->fabrics_q); 1706 1684 blk_mq_free_tag_set(ctrl->admin_tagset); 1707 1685 } 1708 1686 nvme_tcp_free_admin_queue(ctrl); ··· 1724 1700 goto out_free_queue; 1725 1701 } 1726 1702 1703 + ctrl->fabrics_q = blk_mq_init_queue(ctrl->admin_tagset); 1704 + if (IS_ERR(ctrl->fabrics_q)) { 1705 + error = PTR_ERR(ctrl->fabrics_q); 1706 + goto out_free_tagset; 1707 + } 1708 + 1727 1709 ctrl->admin_q = blk_mq_init_queue(ctrl->admin_tagset); 1728 1710 if (IS_ERR(ctrl->admin_q)) { 1729 1711 error = PTR_ERR(ctrl->admin_q); 1730 - goto out_free_tagset; 1712 + goto out_cleanup_fabrics_q; 1731 1713 } 1732 1714 } 1733 1715 ··· 1741 1711 if (error) 1742 1712 goto out_cleanup_queue; 1743 1713 1744 - error = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, 
&ctrl->cap); 1745 - if (error) { 1746 - dev_err(ctrl->device, 1747 - "prop_get NVME_REG_CAP failed\n"); 1748 - goto out_stop_queue; 1749 - } 1750 - 1751 - ctrl->sqsize = min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize); 1752 - 1753 - error = nvme_enable_ctrl(ctrl, ctrl->cap); 1714 + error = nvme_enable_ctrl(ctrl); 1754 1715 if (error) 1755 1716 goto out_stop_queue; 1717 + 1718 + blk_mq_unquiesce_queue(ctrl->admin_q); 1756 1719 1757 1720 error = nvme_init_identify(ctrl); 1758 1721 if (error) ··· 1758 1735 out_cleanup_queue: 1759 1736 if (new) 1760 1737 blk_cleanup_queue(ctrl->admin_q); 1738 + out_cleanup_fabrics_q: 1739 + if (new) 1740 + blk_cleanup_queue(ctrl->fabrics_q); 1761 1741 out_free_tagset: 1762 1742 if (new) 1763 1743 blk_mq_free_tag_set(ctrl->admin_tagset); ··· 1779 1753 nvme_cancel_request, ctrl); 1780 1754 blk_mq_tagset_wait_completed_request(ctrl->admin_tagset); 1781 1755 } 1782 - blk_mq_unquiesce_queue(ctrl->admin_q); 1756 + if (remove) 1757 + blk_mq_unquiesce_queue(ctrl->admin_q); 1783 1758 nvme_tcp_destroy_admin_queue(ctrl, remove); 1784 1759 } 1785 1760 ··· 1907 1880 /* unquiesce to fail fast pending requests */ 1908 1881 nvme_start_queues(ctrl); 1909 1882 nvme_tcp_teardown_admin_queue(ctrl, false); 1883 + blk_mq_unquiesce_queue(ctrl->admin_q); 1910 1884 1911 1885 if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) { 1912 1886 /* state change failure is ok if we're in DELETING state */ ··· 1924 1896 cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work); 1925 1897 1926 1898 nvme_tcp_teardown_io_queues(ctrl, shutdown); 1899 + blk_mq_quiesce_queue(ctrl->admin_q); 1927 1900 if (shutdown) 1928 1901 nvme_shutdown_ctrl(ctrl); 1929 1902 else 1930 - nvme_disable_ctrl(ctrl, ctrl->cap); 1903 + nvme_disable_ctrl(ctrl); 1931 1904 nvme_tcp_teardown_admin_queue(ctrl, shutdown); 1932 1905 } 1933 1906 ··· 2184 2155 blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); 2185 2156 blk_mq_map_queues(&set->map[HCTX_TYPE_READ]); 2186 2157 2158 + if 
(opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) { 2159 + /* map dedicated poll queues only if we have queues left */ 2160 + set->map[HCTX_TYPE_POLL].nr_queues = 2161 + ctrl->io_queues[HCTX_TYPE_POLL]; 2162 + set->map[HCTX_TYPE_POLL].queue_offset = 2163 + ctrl->io_queues[HCTX_TYPE_DEFAULT] + 2164 + ctrl->io_queues[HCTX_TYPE_READ]; 2165 + blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]); 2166 + } 2167 + 2187 2168 dev_info(ctrl->ctrl.device, 2188 - "mapped %d/%d default/read queues.\n", 2169 + "mapped %d/%d/%d default/read/poll queues.\n", 2189 2170 ctrl->io_queues[HCTX_TYPE_DEFAULT], 2190 - ctrl->io_queues[HCTX_TYPE_READ]); 2171 + ctrl->io_queues[HCTX_TYPE_READ], 2172 + ctrl->io_queues[HCTX_TYPE_POLL]); 2191 2173 2192 2174 return 0; 2175 + } 2176 + 2177 + static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx) 2178 + { 2179 + struct nvme_tcp_queue *queue = hctx->driver_data; 2180 + struct sock *sk = queue->sock->sk; 2181 + 2182 + if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue)) 2183 + sk_busy_loop(sk, true); 2184 + nvme_tcp_try_recv(queue); 2185 + return queue->nr_cqe; 2193 2186 } 2194 2187 2195 2188 static struct blk_mq_ops nvme_tcp_mq_ops = { ··· 2222 2171 .init_hctx = nvme_tcp_init_hctx, 2223 2172 .timeout = nvme_tcp_timeout, 2224 2173 .map_queues = nvme_tcp_map_queues, 2174 + .poll = nvme_tcp_poll, 2225 2175 }; 2226 2176 2227 2177 static struct blk_mq_ops nvme_tcp_admin_mq_ops = { ··· 2276 2224 2277 2225 INIT_LIST_HEAD(&ctrl->list); 2278 2226 ctrl->ctrl.opts = opts; 2279 - ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues + 1; 2227 + ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues + 2228 + opts->nr_poll_queues + 1; 2280 2229 ctrl->ctrl.sqsize = opts->queue_size - 1; 2281 2230 ctrl->ctrl.kato = opts->kato; 2282 2231 ··· 2371 2318 .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY | 2372 2319 NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO | 2373 2320 NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST | 
2374 - NVMF_OPT_NR_WRITE_QUEUES, 2321 + NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES | 2322 + NVMF_OPT_TOS, 2375 2323 .create_ctrl = nvme_tcp_create_ctrl, 2376 2324 }; 2377 2325
+18
drivers/nvme/host/trace.c
··· 86 86 return ret; 87 87 } 88 88 89 + static const char *nvme_trace_get_lba_status(struct trace_seq *p, 90 + u8 *cdw10) 91 + { 92 + const char *ret = trace_seq_buffer_ptr(p); 93 + u64 slba = get_unaligned_le64(cdw10); 94 + u32 mndw = get_unaligned_le32(cdw10 + 8); 95 + u16 rl = get_unaligned_le16(cdw10 + 12); 96 + u8 atype = cdw10[15]; 97 + 98 + trace_seq_printf(p, "slba=0x%llx, mndw=0x%x, rl=0x%x, atype=%u", 99 + slba, mndw, rl, atype); 100 + trace_seq_putc(p, 0); 101 + 102 + return ret; 103 + } 104 + 89 105 static const char *nvme_trace_read_write(struct trace_seq *p, u8 *cdw10) 90 106 { 91 107 const char *ret = trace_seq_buffer_ptr(p); ··· 157 141 return nvme_trace_admin_identify(p, cdw10); 158 142 case nvme_admin_get_features: 159 143 return nvme_trace_admin_get_features(p, cdw10); 144 + case nvme_admin_get_lba_status: 145 + return nvme_trace_get_lba_status(p, cdw10); 160 146 default: 161 147 return nvme_trace_common(p, cdw10); 162 148 }
+8 -6
drivers/nvme/target/admin-cmd.c
··· 81 81 goto out; 82 82 83 83 host_reads = part_stat_read(ns->bdev->bd_part, ios[READ]); 84 - data_units_read = part_stat_read(ns->bdev->bd_part, sectors[READ]); 84 + data_units_read = DIV_ROUND_UP(part_stat_read(ns->bdev->bd_part, 85 + sectors[READ]), 1000); 85 86 host_writes = part_stat_read(ns->bdev->bd_part, ios[WRITE]); 86 - data_units_written = part_stat_read(ns->bdev->bd_part, sectors[WRITE]); 87 + data_units_written = DIV_ROUND_UP(part_stat_read(ns->bdev->bd_part, 88 + sectors[WRITE]), 1000); 87 89 88 90 put_unaligned_le64(host_reads, &slog->host_reads[0]); 89 91 put_unaligned_le64(data_units_read, &slog->data_units_read[0]); ··· 113 111 if (!ns->bdev) 114 112 continue; 115 113 host_reads += part_stat_read(ns->bdev->bd_part, ios[READ]); 116 - data_units_read += 117 - part_stat_read(ns->bdev->bd_part, sectors[READ]); 114 + data_units_read += DIV_ROUND_UP( 115 + part_stat_read(ns->bdev->bd_part, sectors[READ]), 1000); 118 116 host_writes += part_stat_read(ns->bdev->bd_part, ios[WRITE]); 119 - data_units_written += 120 - part_stat_read(ns->bdev->bd_part, sectors[WRITE]); 117 + data_units_written += DIV_ROUND_UP( 118 + part_stat_read(ns->bdev->bd_part, sectors[WRITE]), 1000); 121 119 122 120 } 123 121 rcu_read_unlock();
+14 -14
drivers/nvme/target/loop.c
··· 253 253 clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags); 254 254 nvmet_sq_destroy(&ctrl->queues[0].nvme_sq); 255 255 blk_cleanup_queue(ctrl->ctrl.admin_q); 256 + blk_cleanup_queue(ctrl->ctrl.fabrics_q); 256 257 blk_mq_free_tag_set(&ctrl->admin_tag_set); 257 258 } 258 259 ··· 358 357 goto out_free_sq; 359 358 ctrl->ctrl.admin_tagset = &ctrl->admin_tag_set; 360 359 360 + ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set); 361 + if (IS_ERR(ctrl->ctrl.fabrics_q)) { 362 + error = PTR_ERR(ctrl->ctrl.fabrics_q); 363 + goto out_free_tagset; 364 + } 365 + 361 366 ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set); 362 367 if (IS_ERR(ctrl->ctrl.admin_q)) { 363 368 error = PTR_ERR(ctrl->ctrl.admin_q); 364 - goto out_free_tagset; 369 + goto out_cleanup_fabrics_q; 365 370 } 366 371 367 372 error = nvmf_connect_admin_queue(&ctrl->ctrl); ··· 376 369 377 370 set_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags); 378 371 379 - error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->ctrl.cap); 380 - if (error) { 381 - dev_err(ctrl->ctrl.device, 382 - "prop_get NVME_REG_CAP failed\n"); 383 - goto out_cleanup_queue; 384 - } 385 - 386 - ctrl->ctrl.sqsize = 387 - min_t(int, NVME_CAP_MQES(ctrl->ctrl.cap), ctrl->ctrl.sqsize); 388 - 389 - error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap); 372 + error = nvme_enable_ctrl(&ctrl->ctrl); 390 373 if (error) 391 374 goto out_cleanup_queue; 392 375 393 376 ctrl->ctrl.max_hw_sectors = 394 377 (NVME_LOOP_MAX_SEGMENTS - 1) << (PAGE_SHIFT - 9); 378 + 379 + blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); 395 380 396 381 error = nvme_init_identify(&ctrl->ctrl); 397 382 if (error) ··· 393 394 394 395 out_cleanup_queue: 395 396 blk_cleanup_queue(ctrl->ctrl.admin_q); 397 + out_cleanup_fabrics_q: 398 + blk_cleanup_queue(ctrl->ctrl.fabrics_q); 396 399 out_free_tagset: 397 400 blk_mq_free_tag_set(&ctrl->admin_tag_set); 398 401 out_free_sq: ··· 412 411 nvme_loop_destroy_io_queues(ctrl); 413 412 } 414 413 414 + 
blk_mq_quiesce_queue(ctrl->ctrl.admin_q); 415 415 if (ctrl->ctrl.state == NVME_CTRL_LIVE) 416 416 nvme_shutdown_ctrl(&ctrl->ctrl); 417 417 418 - blk_mq_quiesce_queue(ctrl->ctrl.admin_q); 419 418 blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, 420 419 nvme_cancel_request, &ctrl->ctrl); 421 420 blk_mq_tagset_wait_completed_request(&ctrl->admin_tag_set); 422 - blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); 423 421 nvme_loop_destroy_admin_queue(ctrl); 424 422 } 425 423
+20 -4
drivers/nvme/target/tcp.c
··· 348 348 349 349 return 0; 350 350 err: 351 - sgl_free(cmd->req.sg); 351 + if (cmd->req.sg_cnt) 352 + sgl_free(cmd->req.sg); 352 353 return NVME_SC_INTERNAL; 353 354 } 354 355 ··· 554 553 555 554 if (queue->nvme_sq.sqhd_disabled) { 556 555 kfree(cmd->iov); 557 - sgl_free(cmd->req.sg); 556 + if (cmd->req.sg_cnt) 557 + sgl_free(cmd->req.sg); 558 558 } 559 559 560 560 return 1; ··· 586 584 return -EAGAIN; 587 585 588 586 kfree(cmd->iov); 589 - sgl_free(cmd->req.sg); 587 + if (cmd->req.sg_cnt) 588 + sgl_free(cmd->req.sg); 590 589 cmd->queue->snd_cmd = NULL; 591 590 nvmet_tcp_put_cmd(cmd); 592 591 return 1; ··· 1309 1306 { 1310 1307 nvmet_req_uninit(&cmd->req); 1311 1308 nvmet_tcp_unmap_pdu_iovec(cmd); 1312 - sgl_free(cmd->req.sg); 1309 + kfree(cmd->iov); 1310 + if (cmd->req.sg_cnt) 1311 + sgl_free(cmd->req.sg); 1313 1312 } 1314 1313 1315 1314 static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue) ··· 1415 1410 static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue) 1416 1411 { 1417 1412 struct socket *sock = queue->sock; 1413 + struct inet_sock *inet = inet_sk(sock->sk); 1418 1414 struct linger sol = { .l_onoff = 1, .l_linger = 0 }; 1419 1415 int ret; 1420 1416 ··· 1438 1432 (char *)&sol, sizeof(sol)); 1439 1433 if (ret) 1440 1434 return ret; 1435 + 1436 + /* Set socket type of service */ 1437 + if (inet->rcv_tos > 0) { 1438 + int tos = inet->rcv_tos; 1439 + 1440 + ret = kernel_setsockopt(sock, SOL_IP, IP_TOS, 1441 + (char *)&tos, sizeof(tos)); 1442 + if (ret) 1443 + return ret; 1444 + } 1441 1445 1442 1446 write_lock_bh(&sock->sk->sk_callback_lock); 1443 1447 sock->sk->sk_user_data = queue;
+18
drivers/nvme/target/trace.c
··· 33 33 return ret; 34 34 } 35 35 36 + static const char *nvmet_trace_get_lba_status(struct trace_seq *p, 37 + u8 *cdw10) 38 + { 39 + const char *ret = trace_seq_buffer_ptr(p); 40 + u64 slba = get_unaligned_le64(cdw10); 41 + u32 mndw = get_unaligned_le32(cdw10 + 8); 42 + u16 rl = get_unaligned_le16(cdw10 + 12); 43 + u8 atype = cdw10[15]; 44 + 45 + trace_seq_printf(p, "slba=0x%llx, mndw=0x%x, rl=0x%x, atype=%u", 46 + slba, mndw, rl, atype); 47 + trace_seq_putc(p, 0); 48 + 49 + return ret; 50 + } 51 + 36 52 static const char *nvmet_trace_read_write(struct trace_seq *p, u8 *cdw10) 37 53 { 38 54 const char *ret = trace_seq_buffer_ptr(p); ··· 96 80 return nvmet_trace_admin_identify(p, cdw10); 97 81 case nvme_admin_get_features: 98 82 return nvmet_trace_admin_get_features(p, cdw10); 83 + case nvme_admin_get_lba_status: 84 + return nvmet_trace_get_lba_status(p, cdw10); 99 85 default: 100 86 return nvmet_trace_common(p, cdw10); 101 87 }
+4 -1
include/linux/nvme.h
··· 140 140 * Submission and Completion Queue Entry Sizes for the NVM command set. 141 141 * (In bytes and specified as a power of two (2^n)). 142 142 */ 143 + #define NVME_ADM_SQES 6 143 144 #define NVME_NVM_IOSQES 6 144 145 #define NVME_NVM_IOCQES 4 145 146 ··· 815 814 nvme_admin_security_send = 0x81, 816 815 nvme_admin_security_recv = 0x82, 817 816 nvme_admin_sanitize_nvm = 0x84, 817 + nvme_admin_get_lba_status = 0x86, 818 818 }; 819 819 820 820 #define nvme_admin_opcode_name(opcode) { opcode, #opcode } ··· 842 840 nvme_admin_opcode_name(nvme_admin_format_nvm), \ 843 841 nvme_admin_opcode_name(nvme_admin_security_send), \ 844 842 nvme_admin_opcode_name(nvme_admin_security_recv), \ 845 - nvme_admin_opcode_name(nvme_admin_sanitize_nvm)) 843 + nvme_admin_opcode_name(nvme_admin_sanitize_nvm), \ 844 + nvme_admin_opcode_name(nvme_admin_get_lba_status)) 846 845 847 846 enum { 848 847 NVME_QUEUE_PHYS_CONTIG = (1 << 0),