Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'block-6.15-20250403' of git://git.kernel.dk/linux

Pull more block updates from Jens Axboe:

- NVMe pull request via Keith:
      - PCI endpoint target cleanup (Damien)
      - Early import for uring_cmd fixed buffer (Caleb)
      - Multipath documentation and notification improvements (John)
      - Invalid pci sq doorbell write fix (Maurizio)

- Queue init locking fix

- Remove dead nsegs parameter from blk_mq_get_new_requests()

* tag 'block-6.15-20250403' of git://git.kernel.dk/linux:
block: don't grab elevator lock during queue initialization
nvme-pci: skip nvme_write_sq_db on empty rqlist
nvme-multipath: change the NVME_MULTIPATH config option
nvme: update the multipath warning in nvme_init_ns_head
nvme/ioctl: move fixed buffer lookup to nvme_uring_cmd_io()
nvme/ioctl: move blk_mq_free_request() out of nvme_map_user_request()
nvme/ioctl: don't warn on vectorized uring_cmd with fixed buffer
nvmet: pci-epf: Keep completion queues mapped
block: remove unused nseg parameter

+94 -84
+19 -10
block/blk-mq.c
··· 2965 2965 2966 2966 static struct request *blk_mq_get_new_requests(struct request_queue *q, 2967 2967 struct blk_plug *plug, 2968 - struct bio *bio, 2969 - unsigned int nsegs) 2968 + struct bio *bio) 2970 2969 { 2971 2970 struct blk_mq_alloc_data data = { 2972 2971 .q = q, ··· 3124 3125 if (rq) { 3125 3126 blk_mq_use_cached_rq(rq, plug, bio); 3126 3127 } else { 3127 - rq = blk_mq_get_new_requests(q, plug, bio, nr_segs); 3128 + rq = blk_mq_get_new_requests(q, plug, bio); 3128 3129 if (unlikely(!rq)) { 3129 3130 if (bio->bi_opf & REQ_NOWAIT) 3130 3131 bio_wouldblock_error(bio); ··· 4464 4465 return NULL; 4465 4466 } 4466 4467 4467 - static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, 4468 - struct request_queue *q) 4468 + static void __blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, 4469 + struct request_queue *q) 4469 4470 { 4470 4471 struct blk_mq_hw_ctx *hctx; 4471 4472 unsigned long i, j; 4472 4473 4473 - /* protect against switching io scheduler */ 4474 - mutex_lock(&q->elevator_lock); 4475 4474 for (i = 0; i < set->nr_hw_queues; i++) { 4476 4475 int old_node; 4477 4476 int node = blk_mq_get_hctx_node(set, i); ··· 4502 4505 4503 4506 xa_for_each_start(&q->hctx_table, j, hctx, j) 4504 4507 blk_mq_exit_hctx(q, set, hctx, j); 4505 - mutex_unlock(&q->elevator_lock); 4508 + } 4509 + 4510 + static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, 4511 + struct request_queue *q, bool lock) 4512 + { 4513 + if (lock) { 4514 + /* protect against switching io scheduler */ 4515 + mutex_lock(&q->elevator_lock); 4516 + __blk_mq_realloc_hw_ctxs(set, q); 4517 + mutex_unlock(&q->elevator_lock); 4518 + } else { 4519 + __blk_mq_realloc_hw_ctxs(set, q); 4520 + } 4506 4521 4507 4522 /* unregister cpuhp callbacks for exited hctxs */ 4508 4523 blk_mq_remove_hw_queues_cpuhp(q); ··· 4546 4537 4547 4538 xa_init(&q->hctx_table); 4548 4539 4549 - blk_mq_realloc_hw_ctxs(set, q); 4540 + blk_mq_realloc_hw_ctxs(set, q, false); 4550 4541 if (!q->nr_hw_queues) 4551 4542 
goto err_hctxs; 4552 4543 ··· 5042 5033 fallback: 5043 5034 blk_mq_update_queue_map(set); 5044 5035 list_for_each_entry(q, &set->tag_list, tag_set_list) { 5045 - blk_mq_realloc_hw_ctxs(set, q); 5036 + blk_mq_realloc_hw_ctxs(set, q, true); 5046 5037 5047 5038 if (q->nr_hw_queues != set->nr_hw_queues) { 5048 5039 int i = prev_nr_hw_queues;
+9 -4
drivers/nvme/host/Kconfig
··· 18 18 bool "NVMe multipath support" 19 19 depends on NVME_CORE 20 20 help 21 - This option enables support for multipath access to NVMe 22 - subsystems. If this option is enabled only a single 23 - /dev/nvmeXnY device will show up for each NVMe namespace, 24 - even if it is accessible through multiple controllers. 21 + This option controls support for multipath access to NVMe 22 + subsystems. If this option is enabled support for NVMe multipath 23 + access is included in the kernel. If this option is disabled support 24 + for NVMe multipath access is excluded from the kernel. When this 25 + option is disabled each controller/namespace receives its 26 + own /dev/nvmeXnY device entry and NVMe multipath access is 27 + not supported. 28 + 29 + If unsure, say Y. 25 30 26 31 config NVME_VERBOSE_ERRORS 27 32 bool "NVMe verbose error reporting"
+1 -1
drivers/nvme/host/core.c
··· 3822 3822 "Found shared namespace %d, but multipathing not supported.\n", 3823 3823 info->nsid); 3824 3824 dev_warn_once(ctrl->device, 3825 - "Support for shared namespaces without CONFIG_NVME_MULTIPATH is deprecated and will be removed in Linux 6.0.\n"); 3825 + "Shared namespace support requires core_nvme.multipath=Y.\n"); 3826 3826 } 3827 3827 } 3828 3828
+37 -31
drivers/nvme/host/ioctl.c
··· 114 114 115 115 static int nvme_map_user_request(struct request *req, u64 ubuffer, 116 116 unsigned bufflen, void __user *meta_buffer, unsigned meta_len, 117 - struct io_uring_cmd *ioucmd, unsigned int flags, 118 - unsigned int iou_issue_flags) 117 + struct iov_iter *iter, unsigned int flags) 119 118 { 120 119 struct request_queue *q = req->q; 121 120 struct nvme_ns *ns = q->queuedata; ··· 128 129 if (!nvme_ctrl_sgl_supported(ctrl)) 129 130 dev_warn_once(ctrl->device, "using unchecked data buffer\n"); 130 131 if (has_metadata) { 131 - if (!supports_metadata) { 132 - ret = -EINVAL; 133 - goto out; 134 - } 132 + if (!supports_metadata) 133 + return -EINVAL; 134 + 135 135 if (!nvme_ctrl_meta_sgl_supported(ctrl)) 136 136 dev_warn_once(ctrl->device, 137 137 "using unchecked metadata buffer\n"); 138 138 } 139 139 140 - if (ioucmd && (ioucmd->flags & IORING_URING_CMD_FIXED)) { 141 - struct iov_iter iter; 142 - 143 - /* fixedbufs is only for non-vectored io */ 144 - if (WARN_ON_ONCE(flags & NVME_IOCTL_VEC)) { 145 - ret = -EINVAL; 146 - goto out; 147 - } 148 - ret = io_uring_cmd_import_fixed(ubuffer, bufflen, 149 - rq_data_dir(req), &iter, ioucmd, 150 - iou_issue_flags); 151 - if (ret < 0) 152 - goto out; 153 - ret = blk_rq_map_user_iov(q, req, NULL, &iter, GFP_KERNEL); 154 - } else { 140 + if (iter) 141 + ret = blk_rq_map_user_iov(q, req, NULL, iter, GFP_KERNEL); 142 + else 155 143 ret = blk_rq_map_user_io(req, NULL, nvme_to_user_ptr(ubuffer), 156 144 bufflen, GFP_KERNEL, flags & NVME_IOCTL_VEC, 0, 157 145 0, rq_data_dir(req)); 158 - } 159 146 160 147 if (ret) 161 - goto out; 148 + return ret; 162 149 163 150 bio = req->bio; 164 151 if (bdev) ··· 161 176 out_unmap: 162 177 if (bio) 163 178 blk_rq_unmap_user(bio); 164 - out: 165 - blk_mq_free_request(req); 166 179 return ret; 167 180 } 168 181 ··· 183 200 req->timeout = timeout; 184 201 if (ubuffer && bufflen) { 185 202 ret = nvme_map_user_request(req, ubuffer, bufflen, meta_buffer, 186 - meta_len, NULL, flags, 0); 203 
+ meta_len, NULL, flags); 187 204 if (ret) 188 - return ret; 205 + goto out_free_req; 189 206 } 190 207 191 208 bio = req->bio; ··· 201 218 202 219 if (effects) 203 220 nvme_passthru_end(ctrl, ns, effects, cmd, ret); 221 + return ret; 204 222 223 + out_free_req: 224 + blk_mq_free_request(req); 205 225 return ret; 206 226 } 207 227 ··· 455 469 struct request_queue *q = ns ? ns->queue : ctrl->admin_q; 456 470 struct nvme_uring_data d; 457 471 struct nvme_command c; 472 + struct iov_iter iter; 473 + struct iov_iter *map_iter = NULL; 458 474 struct request *req; 459 475 blk_opf_t rq_flags = REQ_ALLOC_CACHE; 460 476 blk_mq_req_flags_t blk_flags = 0; ··· 492 504 d.metadata_len = READ_ONCE(cmd->metadata_len); 493 505 d.timeout_ms = READ_ONCE(cmd->timeout_ms); 494 506 507 + if (d.data_len && (ioucmd->flags & IORING_URING_CMD_FIXED)) { 508 + /* fixedbufs is only for non-vectored io */ 509 + if (vec) 510 + return -EINVAL; 511 + 512 + ret = io_uring_cmd_import_fixed(d.addr, d.data_len, 513 + nvme_is_write(&c) ? WRITE : READ, &iter, ioucmd, 514 + issue_flags); 515 + if (ret < 0) 516 + return ret; 517 + 518 + map_iter = &iter; 519 + } 520 + 495 521 if (issue_flags & IO_URING_F_NONBLOCK) { 496 522 rq_flags |= REQ_NOWAIT; 497 523 blk_flags = BLK_MQ_REQ_NOWAIT; ··· 519 517 req->timeout = d.timeout_ms ? 
msecs_to_jiffies(d.timeout_ms) : 0; 520 518 521 519 if (d.data_len) { 522 - ret = nvme_map_user_request(req, d.addr, 523 - d.data_len, nvme_to_user_ptr(d.metadata), 524 - d.metadata_len, ioucmd, vec, issue_flags); 520 + ret = nvme_map_user_request(req, d.addr, d.data_len, 521 + nvme_to_user_ptr(d.metadata), d.metadata_len, 522 + map_iter, vec); 525 523 if (ret) 526 - return ret; 524 + goto out_free_req; 527 525 } 528 526 529 527 /* to free bio on completion, as req->bio will be null at that time */ ··· 533 531 req->end_io = nvme_uring_cmd_end_io; 534 532 blk_execute_rq_nowait(req, false); 535 533 return -EIOCBQUEUED; 534 + 535 + out_free_req: 536 + blk_mq_free_request(req); 537 + return ret; 536 538 } 537 539 538 540 static bool is_ctrl_ioctl(unsigned int cmd)
+3
drivers/nvme/host/pci.c
··· 986 986 { 987 987 struct request *req; 988 988 989 + if (rq_list_empty(rqlist)) 990 + return; 991 + 989 992 spin_lock(&nvmeq->sq_lock); 990 993 while ((req = rq_list_pop(rqlist))) { 991 994 struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+25 -38
drivers/nvme/target/pci-epf.c
··· 1264 1264 struct nvmet_pci_epf_ctrl *ctrl = tctrl->drvdata; 1265 1265 struct nvmet_pci_epf_queue *cq = &ctrl->cq[cqid]; 1266 1266 u16 status; 1267 + int ret; 1267 1268 1268 1269 if (test_bit(NVMET_PCI_EPF_Q_LIVE, &cq->flags)) 1269 1270 return NVME_SC_QID_INVALID | NVME_STATUS_DNR; ··· 1299 1298 if (status != NVME_SC_SUCCESS) 1300 1299 goto err; 1301 1300 1301 + /* 1302 + * Map the CQ PCI address space and since PCI endpoint controllers may 1303 + * return a partial mapping, check that the mapping is large enough. 1304 + */ 1305 + ret = nvmet_pci_epf_mem_map(ctrl->nvme_epf, cq->pci_addr, cq->pci_size, 1306 + &cq->pci_map); 1307 + if (ret) { 1308 + dev_err(ctrl->dev, "Failed to map CQ %u (err=%d)\n", 1309 + cq->qid, ret); 1310 + goto err_internal; 1311 + } 1312 + 1313 + if (cq->pci_map.pci_size < cq->pci_size) { 1314 + dev_err(ctrl->dev, "Invalid partial mapping of queue %u\n", 1315 + cq->qid); 1316 + goto err_unmap_queue; 1317 + } 1318 + 1302 1319 set_bit(NVMET_PCI_EPF_Q_LIVE, &cq->flags); 1303 1320 1304 1321 dev_dbg(ctrl->dev, "CQ[%u]: %u entries of %zu B, IRQ vector %u\n", ··· 1324 1305 1325 1306 return NVME_SC_SUCCESS; 1326 1307 1308 + err_unmap_queue: 1309 + nvmet_pci_epf_mem_unmap(ctrl->nvme_epf, &cq->pci_map); 1310 + err_internal: 1311 + status = NVME_SC_INTERNAL | NVME_STATUS_DNR; 1327 1312 err: 1328 1313 if (test_and_clear_bit(NVMET_PCI_EPF_Q_IRQ_ENABLED, &cq->flags)) 1329 1314 nvmet_pci_epf_remove_irq_vector(ctrl, cq->vector); ··· 1345 1322 cancel_delayed_work_sync(&cq->work); 1346 1323 nvmet_pci_epf_drain_queue(cq); 1347 1324 nvmet_pci_epf_remove_irq_vector(ctrl, cq->vector); 1325 + nvmet_pci_epf_mem_unmap(ctrl->nvme_epf, &cq->pci_map); 1348 1326 1349 1327 return NVME_SC_SUCCESS; 1350 1328 } ··· 1577 1553 ctrl->cq = NULL; 1578 1554 } 1579 1555 1580 - static int nvmet_pci_epf_map_queue(struct nvmet_pci_epf_ctrl *ctrl, 1581 - struct nvmet_pci_epf_queue *queue) 1582 - { 1583 - struct nvmet_pci_epf *nvme_epf = ctrl->nvme_epf; 1584 - int ret; 1585 - 1586 - 
ret = nvmet_pci_epf_mem_map(nvme_epf, queue->pci_addr, 1587 - queue->pci_size, &queue->pci_map); 1588 - if (ret) { 1589 - dev_err(ctrl->dev, "Failed to map queue %u (err=%d)\n", 1590 - queue->qid, ret); 1591 - return ret; 1592 - } 1593 - 1594 - if (queue->pci_map.pci_size < queue->pci_size) { 1595 - dev_err(ctrl->dev, "Invalid partial mapping of queue %u\n", 1596 - queue->qid); 1597 - nvmet_pci_epf_mem_unmap(nvme_epf, &queue->pci_map); 1598 - return -ENOMEM; 1599 - } 1600 - 1601 - return 0; 1602 - } 1603 - 1604 - static inline void nvmet_pci_epf_unmap_queue(struct nvmet_pci_epf_ctrl *ctrl, 1605 - struct nvmet_pci_epf_queue *queue) 1606 - { 1607 - nvmet_pci_epf_mem_unmap(ctrl->nvme_epf, &queue->pci_map); 1608 - } 1609 - 1610 1556 static void nvmet_pci_epf_exec_iod_work(struct work_struct *work) 1611 1557 { 1612 1558 struct nvmet_pci_epf_iod *iod = ··· 1740 1746 struct nvme_completion *cqe; 1741 1747 struct nvmet_pci_epf_iod *iod; 1742 1748 unsigned long flags; 1743 - int ret, n = 0; 1744 - 1745 - ret = nvmet_pci_epf_map_queue(ctrl, cq); 1746 - if (ret) 1747 - goto again; 1749 + int ret = 0, n = 0; 1748 1750 1749 1751 while (test_bit(NVMET_PCI_EPF_Q_LIVE, &cq->flags) && ctrl->link_up) { 1750 1752 ··· 1787 1797 n++; 1788 1798 } 1789 1799 1790 - nvmet_pci_epf_unmap_queue(ctrl, cq); 1791 - 1792 1800 /* 1793 1801 * We do not support precise IRQ coalescing time (100ns units as per 1794 1802 * NVMe specifications). So if we have posted completion entries without ··· 1795 1807 if (n) 1796 1808 nvmet_pci_epf_raise_irq(ctrl, cq, true); 1797 1809 1798 - again: 1799 1810 if (ret < 0) 1800 1811 queue_delayed_work(system_highpri_wq, &cq->work, 1801 1812 NVMET_PCI_EPF_CQ_RETRY_INTERVAL);