Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'block-5.9-2020-08-23' of git://git.kernel.dk/linux-block

Pull block fixes from Jens Axboe:

- NVMe pull request from Sagi:
- nvme completion rework from Christoph and Chao that mostly came
from a bit of divergence of how we classify errors related to
pathing/retry etc.
- nvmet passthru fixes from Chaitanya
- minor nvmet fixes from Amit and me
- mpath round-robin path selection fix from Martin
- ignore noiob for zoned devices from Keith
- minor nvme-fc fix from Tianjia"

- BFQ cgroup leak fix (Dmitry)

- block layer MAINTAINERS addition (Geert)

- fix null_blk FUA checking (Hou)

- get_max_io_size() size fix (Keith)

- fix block page_is_mergeable() for compound pages (Matthew)

- discard granularity fixes (Ming)

- IO scheduler ordering fix (Ming)

- misc fixes

* tag 'block-5.9-2020-08-23' of git://git.kernel.dk/linux-block: (31 commits)
null_blk: fix passing of REQ_FUA flag in null_handle_rq
nvmet: Disable keep-alive timer when kato is cleared to 0h
nvme: redirect commands on dying queue
nvme: just check the status code type in nvme_is_path_error
nvme: refactor command completion
nvme: rename and document nvme_end_request
nvme: skip noiob for zoned devices
nvme-pci: fix PRP pool size
nvme-pci: Use u32 for nvme_dev.q_depth and nvme_queue.q_depth
nvme: Use spin_lock_irq() when taking the ctrl->lock
nvmet: call blk_mq_free_request() directly
nvmet: fix oops in pt cmd execution
nvmet: add ns tear down label for pt-cmd handling
nvme: multipath: round-robin: eliminate "fallback" variable
nvme: multipath: round-robin: fix single non-optimized path case
nvme-fc: Fix wrong return value in __nvme_fc_init_request()
nvmet-passthru: Reject commands with non-sgl flags set
nvmet: fix a memory leak
blkcg: fix memleak for iolatency
MAINTAINERS: Add missing header files to BLOCK LAYER section
...

+239 -154
+1 -1
Documentation/fault-injection/nvme-fault-injection.rst
··· 3 3 Linux's fault injection framework provides a systematic way to support 4 4 error injection via debugfs in the /sys/kernel/debug directory. When 5 5 enabled, the default NVME_SC_INVALID_OPCODE with no retry will be 6 - injected into the nvme_end_request. Users can change the default status 6 + injected into the nvme_try_complete_req. Users can change the default status 7 7 code and no retry flag via the debugfs. The list of Generic Command 8 8 Status can be found in include/linux/nvme.h 9 9
+1
MAINTAINERS
··· 3205 3205 T: git git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git 3206 3206 F: block/ 3207 3207 F: drivers/block/ 3208 + F: include/linux/blk* 3208 3209 F: kernel/trace/blktrace.c 3209 3210 F: lib/sbitmap.c 3210 3211
+1 -1
block/bfq-cgroup.c
··· 332 332 kfree(bfqg); 333 333 } 334 334 335 - void bfqg_and_blkg_get(struct bfq_group *bfqg) 335 + static void bfqg_and_blkg_get(struct bfq_group *bfqg) 336 336 { 337 337 /* see comments in bfq_bic_update_cgroup for why refcounting bfqg */ 338 338 bfqg_get(bfqg);
-1
block/bfq-iosched.h
··· 986 986 struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); 987 987 struct bfq_group *bfqq_group(struct bfq_queue *bfqq); 988 988 struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node); 989 - void bfqg_and_blkg_get(struct bfq_group *bfqg); 990 989 void bfqg_and_blkg_put(struct bfq_group *bfqg); 991 990 992 991 #ifdef CONFIG_BFQ_GROUP_IOSCHED
+2 -10
block/bfq-wf2q.c
··· 533 533 bfqq->ref++; 534 534 bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", 535 535 bfqq, bfqq->ref); 536 - } else 537 - bfqg_and_blkg_get(container_of(entity, struct bfq_group, 538 - entity)); 536 + } 539 537 } 540 538 541 539 /** ··· 647 649 648 650 entity->on_st_or_in_serv = false; 649 651 st->wsum -= entity->weight; 650 - if (is_in_service) 651 - return; 652 - 653 - if (bfqq) 652 + if (bfqq && !is_in_service) 654 653 bfq_put_queue(bfqq); 655 - else 656 - bfqg_and_blkg_put(container_of(entity, struct bfq_group, 657 - entity)); 658 654 } 659 655 660 656 /**
+5 -5
block/bio.c
··· 740 740 struct page *page, unsigned int len, unsigned int off, 741 741 bool *same_page) 742 742 { 743 - phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) + 744 - bv->bv_offset + bv->bv_len - 1; 743 + size_t bv_end = bv->bv_offset + bv->bv_len; 744 + phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) + bv_end - 1; 745 745 phys_addr_t page_addr = page_to_phys(page); 746 746 747 747 if (vec_end_addr + 1 != page_addr + off) ··· 750 750 return false; 751 751 752 752 *same_page = ((vec_end_addr & PAGE_MASK) == page_addr); 753 - if (!*same_page && pfn_to_page(PFN_DOWN(vec_end_addr)) + 1 != page) 754 - return false; 755 - return true; 753 + if (*same_page) 754 + return true; 755 + return (bv->bv_page + bv_end / PAGE_SIZE) == (page + off / PAGE_SIZE); 756 756 } 757 757 758 758 /*
+6 -4
block/blk-cgroup.c
··· 1152 1152 if (preloaded) 1153 1153 radix_tree_preload_end(); 1154 1154 1155 - ret = blk_iolatency_init(q); 1156 - if (ret) 1157 - goto err_destroy_all; 1158 - 1159 1155 ret = blk_throtl_init(q); 1160 1156 if (ret) 1161 1157 goto err_destroy_all; 1158 + 1159 + ret = blk_iolatency_init(q); 1160 + if (ret) { 1161 + blk_throtl_exit(q); 1162 + goto err_destroy_all; 1163 + } 1162 1164 return 0; 1163 1165 1164 1166 err_destroy_all:
+10 -3
block/blk-merge.c
··· 154 154 if (max_sectors > start_offset) 155 155 return max_sectors - start_offset; 156 156 157 - return sectors & (lbs - 1); 157 + return sectors & ~(lbs - 1); 158 158 } 159 159 160 160 static inline unsigned get_max_segment_size(const struct request_queue *q, ··· 533 533 } 534 534 EXPORT_SYMBOL(__blk_rq_map_sg); 535 535 536 + static inline unsigned int blk_rq_get_max_segments(struct request *rq) 537 + { 538 + if (req_op(rq) == REQ_OP_DISCARD) 539 + return queue_max_discard_segments(rq->q); 540 + return queue_max_segments(rq->q); 541 + } 542 + 536 543 static inline int ll_new_hw_segment(struct request *req, struct bio *bio, 537 544 unsigned int nr_phys_segs) 538 545 { 539 - if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(req->q)) 546 + if (req->nr_phys_segments + nr_phys_segs > blk_rq_get_max_segments(req)) 540 547 goto no_merge; 541 548 542 549 if (blk_integrity_merge_bio(req->q, req, bio) == false) ··· 631 624 return 0; 632 625 633 626 total_phys_segments = req->nr_phys_segments + next->nr_phys_segments; 634 - if (total_phys_segments > queue_max_segments(q)) 627 + if (total_phys_segments > blk_rq_get_max_segments(req)) 635 628 return 0; 636 629 637 630 if (blk_integrity_merge_rq(q, req, next) == false)
+9
block/blk-mq-sched.c
··· 78 78 return; 79 79 clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); 80 80 81 + /* 82 + * Order clearing SCHED_RESTART and list_empty_careful(&hctx->dispatch) 83 + * in blk_mq_run_hw_queue(). Its pair is the barrier in 84 + * blk_mq_dispatch_rq_list(). So dispatch code won't see SCHED_RESTART, 85 + * meantime new request added to hctx->dispatch is missed to check in 86 + * blk_mq_run_hw_queue(). 87 + */ 88 + smp_mb(); 89 + 81 90 blk_mq_run_hw_queue(hctx, true); 82 91 } 83 92
+12 -1
block/blk-mq.c
··· 1438 1438 spin_unlock(&hctx->lock); 1439 1439 1440 1440 /* 1441 + * Order adding requests to hctx->dispatch and checking 1442 + * SCHED_RESTART flag. The pair of this smp_mb() is the one 1443 + * in blk_mq_sched_restart(). Avoid restart code path to 1444 + * miss the new added requests to hctx->dispatch, meantime 1445 + * SCHED_RESTART is observed here. 1446 + */ 1447 + smp_mb(); 1448 + 1449 + /* 1441 1450 * If SCHED_RESTART was set by the caller of this function and 1442 1451 * it is no longer set that means that it was cleared by another 1443 1452 * thread and hence that a queue rerun is needed. ··· 1843 1834 /** 1844 1835 * blk_mq_request_bypass_insert - Insert a request at dispatch list. 1845 1836 * @rq: Pointer to request to be inserted. 1837 + * @at_head: true if the request should be inserted at the head of the list. 1846 1838 * @run_queue: If we should run the hardware queue after inserting the request. 1847 1839 * 1848 1840 * Should only be used carefully, when the caller knows we want to ··· 2026 2016 if (bypass_insert) 2027 2017 return BLK_STS_RESOURCE; 2028 2018 2029 - blk_mq_request_bypass_insert(rq, false, run_queue); 2019 + blk_mq_sched_insert_request(rq, false, run_queue, false); 2020 + 2030 2021 return BLK_STS_OK; 2031 2022 } 2032 2023
+1 -1
block/bsg-lib.c
··· 378 378 bset->timeout_fn = timeout; 379 379 380 380 set = &bset->tag_set; 381 - set->ops = &bsg_mq_ops, 381 + set->ops = &bsg_mq_ops; 382 382 set->nr_hw_queues = 1; 383 383 set->queue_depth = 128; 384 384 set->numa_node = NUMA_NO_NODE;
+18 -15
drivers/block/loop.c
··· 878 878 struct file *file = lo->lo_backing_file; 879 879 struct inode *inode = file->f_mapping->host; 880 880 struct request_queue *q = lo->lo_queue; 881 + u32 granularity, max_discard_sectors; 881 882 882 883 /* 883 884 * If the backing device is a block device, mirror its zeroing ··· 891 890 struct request_queue *backingq; 892 891 893 892 backingq = bdev_get_queue(inode->i_bdev); 894 - blk_queue_max_discard_sectors(q, 895 - backingq->limits.max_write_zeroes_sectors); 896 893 897 - blk_queue_max_write_zeroes_sectors(q, 898 - backingq->limits.max_write_zeroes_sectors); 894 + max_discard_sectors = backingq->limits.max_write_zeroes_sectors; 895 + granularity = backingq->limits.discard_granularity ?: 896 + queue_physical_block_size(backingq); 899 897 900 898 /* 901 899 * We use punch hole to reclaim the free space used by the ··· 903 903 * useful information. 904 904 */ 905 905 } else if (!file->f_op->fallocate || lo->lo_encrypt_key_size) { 906 - q->limits.discard_granularity = 0; 907 - q->limits.discard_alignment = 0; 908 - blk_queue_max_discard_sectors(q, 0); 909 - blk_queue_max_write_zeroes_sectors(q, 0); 906 + max_discard_sectors = 0; 907 + granularity = 0; 910 908 911 909 } else { 912 - q->limits.discard_granularity = inode->i_sb->s_blocksize; 913 - q->limits.discard_alignment = 0; 914 - 915 - blk_queue_max_discard_sectors(q, UINT_MAX >> 9); 916 - blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9); 910 + max_discard_sectors = UINT_MAX >> 9; 911 + granularity = inode->i_sb->s_blocksize; 917 912 } 918 913 919 - if (q->limits.max_write_zeroes_sectors) 914 + if (max_discard_sectors) { 915 + q->limits.discard_granularity = granularity; 916 + blk_queue_max_discard_sectors(q, max_discard_sectors); 917 + blk_queue_max_write_zeroes_sectors(q, max_discard_sectors); 920 918 blk_queue_flag_set(QUEUE_FLAG_DISCARD, q); 921 - else 919 + } else { 920 + q->limits.discard_granularity = 0; 921 + blk_queue_max_discard_sectors(q, 0); 922 + blk_queue_max_write_zeroes_sectors(q, 
0); 922 923 blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q); 924 + } 925 + q->limits.discard_alignment = 0; 923 926 } 924 927 925 928 static void loop_unprepare_queue(struct loop_device *lo)
+1 -1
drivers/block/null_blk_main.c
··· 1147 1147 len = bvec.bv_len; 1148 1148 err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset, 1149 1149 op_is_write(req_op(rq)), sector, 1150 - req_op(rq) & REQ_FUA); 1150 + rq->cmd_flags & REQ_FUA); 1151 1151 if (err) { 1152 1152 spin_unlock_irq(&nullb->lock); 1153 1153 return err;
+2 -1
drivers/block/rnbd/rnbd-srv.c
··· 148 148 /* Generate bio with pages pointing to the rdma buffer */ 149 149 bio = rnbd_bio_map_kern(data, sess_dev->rnbd_dev->ibd_bio_set, datalen, GFP_KERNEL); 150 150 if (IS_ERR(bio)) { 151 - rnbd_srv_err(sess_dev, "Failed to generate bio, err: %ld\n", PTR_ERR(bio)); 151 + err = PTR_ERR(bio); 152 + rnbd_srv_err(sess_dev, "Failed to generate bio, err: %d\n", err); 152 153 goto sess_dev_put; 153 154 } 154 155
+22 -7
drivers/block/virtio_blk.c
··· 126 126 if (!range) 127 127 return -ENOMEM; 128 128 129 - __rq_for_each_bio(bio, req) { 130 - u64 sector = bio->bi_iter.bi_sector; 131 - u32 num_sectors = bio->bi_iter.bi_size >> SECTOR_SHIFT; 129 + /* 130 + * Single max discard segment means multi-range discard isn't 131 + * supported, and block layer only runs contiguity merge like 132 + * normal RW request. So we can't reply on bio for retrieving 133 + * each range info. 134 + */ 135 + if (queue_max_discard_segments(req->q) == 1) { 136 + range[0].flags = cpu_to_le32(flags); 137 + range[0].num_sectors = cpu_to_le32(blk_rq_sectors(req)); 138 + range[0].sector = cpu_to_le64(blk_rq_pos(req)); 139 + n = 1; 140 + } else { 141 + __rq_for_each_bio(bio, req) { 142 + u64 sector = bio->bi_iter.bi_sector; 143 + u32 num_sectors = bio->bi_iter.bi_size >> SECTOR_SHIFT; 132 144 133 - range[n].flags = cpu_to_le32(flags); 134 - range[n].num_sectors = cpu_to_le32(num_sectors); 135 - range[n].sector = cpu_to_le64(sector); 136 - n++; 145 + range[n].flags = cpu_to_le32(flags); 146 + range[n].num_sectors = cpu_to_le32(num_sectors); 147 + range[n].sector = cpu_to_le64(sector); 148 + n++; 149 + } 137 150 } 151 + 152 + WARN_ON_ONCE(n != segments); 138 153 139 154 req->special_vec.bv_page = virt_to_page(range); 140 155 req->special_vec.bv_offset = offset_in_page(range);
+55 -33
drivers/nvme/host/core.c
··· 241 241 } 242 242 } 243 243 244 - static inline bool nvme_req_needs_retry(struct request *req) 245 - { 246 - if (blk_noretry_request(req)) 247 - return false; 248 - if (nvme_req(req)->status & NVME_SC_DNR) 249 - return false; 250 - if (nvme_req(req)->retries >= nvme_max_retries) 251 - return false; 252 - return true; 253 - } 254 - 255 244 static void nvme_retry_req(struct request *req) 256 245 { 257 246 struct nvme_ns *ns = req->q->queuedata; ··· 257 268 blk_mq_delay_kick_requeue_list(req->q, delay); 258 269 } 259 270 260 - void nvme_complete_rq(struct request *req) 271 + enum nvme_disposition { 272 + COMPLETE, 273 + RETRY, 274 + FAILOVER, 275 + }; 276 + 277 + static inline enum nvme_disposition nvme_decide_disposition(struct request *req) 278 + { 279 + if (likely(nvme_req(req)->status == 0)) 280 + return COMPLETE; 281 + 282 + if (blk_noretry_request(req) || 283 + (nvme_req(req)->status & NVME_SC_DNR) || 284 + nvme_req(req)->retries >= nvme_max_retries) 285 + return COMPLETE; 286 + 287 + if (req->cmd_flags & REQ_NVME_MPATH) { 288 + if (nvme_is_path_error(nvme_req(req)->status) || 289 + blk_queue_dying(req->q)) 290 + return FAILOVER; 291 + } else { 292 + if (blk_queue_dying(req->q)) 293 + return COMPLETE; 294 + } 295 + 296 + return RETRY; 297 + } 298 + 299 + static inline void nvme_end_req(struct request *req) 261 300 { 262 301 blk_status_t status = nvme_error_status(nvme_req(req)->status); 263 302 264 - trace_nvme_complete_rq(req); 303 + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && 304 + req_op(req) == REQ_OP_ZONE_APPEND) 305 + req->__sector = nvme_lba_to_sect(req->q->queuedata, 306 + le64_to_cpu(nvme_req(req)->result.u64)); 265 307 308 + nvme_trace_bio_complete(req, status); 309 + blk_mq_end_request(req, status); 310 + } 311 + 312 + void nvme_complete_rq(struct request *req) 313 + { 314 + trace_nvme_complete_rq(req); 266 315 nvme_cleanup_cmd(req); 267 316 268 317 if (nvme_req(req)->ctrl->kas) 269 318 nvme_req(req)->ctrl->comp_seen = true; 270 319 271 - if 
(unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) { 272 - if ((req->cmd_flags & REQ_NVME_MPATH) && nvme_failover_req(req)) 273 - return; 274 - 275 - if (!blk_queue_dying(req->q)) { 276 - nvme_retry_req(req); 277 - return; 278 - } 279 - } else if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && 280 - req_op(req) == REQ_OP_ZONE_APPEND) { 281 - req->__sector = nvme_lba_to_sect(req->q->queuedata, 282 - le64_to_cpu(nvme_req(req)->result.u64)); 320 + switch (nvme_decide_disposition(req)) { 321 + case COMPLETE: 322 + nvme_end_req(req); 323 + return; 324 + case RETRY: 325 + nvme_retry_req(req); 326 + return; 327 + case FAILOVER: 328 + nvme_failover_req(req); 329 + return; 283 330 } 284 - 285 - nvme_trace_bio_complete(req, status); 286 - blk_mq_end_request(req, status); 287 331 } 288 332 EXPORT_SYMBOL_GPL(nvme_complete_rq); 289 333 ··· 2097 2075 } 2098 2076 } 2099 2077 2100 - if (iob) 2078 + if (iob && !blk_queue_is_zoned(ns->queue)) 2101 2079 blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(iob)); 2102 2080 nvme_update_disk_info(disk, ns, id); 2103 2081 #ifdef CONFIG_NVME_MULTIPATH ··· 2987 2965 { 2988 2966 struct nvme_cel *cel, *ret = NULL; 2989 2967 2990 - spin_lock(&ctrl->lock); 2968 + spin_lock_irq(&ctrl->lock); 2991 2969 list_for_each_entry(cel, &ctrl->cels, entry) { 2992 2970 if (cel->csi == csi) { 2993 2971 ret = cel; 2994 2972 break; 2995 2973 } 2996 2974 } 2997 - spin_unlock(&ctrl->lock); 2975 + spin_unlock_irq(&ctrl->lock); 2998 2976 2999 2977 return ret; 3000 2978 } ··· 3021 2999 3022 3000 cel->csi = csi; 3023 3001 3024 - spin_lock(&ctrl->lock); 3002 + spin_lock_irq(&ctrl->lock); 3025 3003 list_add_tail(&cel->entry, &ctrl->cels); 3026 - spin_unlock(&ctrl->lock); 3004 + spin_unlock_irq(&ctrl->lock); 3027 3005 out: 3028 3006 *log = &cel->log; 3029 3007 return 0;
+3 -3
drivers/nvme/host/fc.c
··· 2035 2035 } 2036 2036 2037 2037 __nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate); 2038 - if (!nvme_end_request(rq, status, result)) 2038 + if (!nvme_try_complete_req(rq, status, result)) 2039 2039 nvme_fc_complete_rq(rq); 2040 2040 2041 2041 check_error: ··· 2078 2078 if (fc_dma_mapping_error(ctrl->lport->dev, op->fcp_req.cmddma)) { 2079 2079 dev_err(ctrl->dev, 2080 2080 "FCP Op failed - cmdiu dma mapping failed.\n"); 2081 - ret = EFAULT; 2081 + ret = -EFAULT; 2082 2082 goto out_on_error; 2083 2083 } 2084 2084 ··· 2088 2088 if (fc_dma_mapping_error(ctrl->lport->dev, op->fcp_req.rspdma)) { 2089 2089 dev_err(ctrl->dev, 2090 2090 "FCP Op failed - rspiu dma mapping failed.\n"); 2091 - ret = EFAULT; 2091 + ret = -EFAULT; 2092 2092 } 2093 2093 2094 2094 atomic_set(&op->state, FCPOP_STATE_IDLE);
+26 -43
drivers/nvme/host/multipath.c
··· 65 65 } 66 66 } 67 67 68 - bool nvme_failover_req(struct request *req) 68 + void nvme_failover_req(struct request *req) 69 69 { 70 70 struct nvme_ns *ns = req->q->queuedata; 71 - u16 status = nvme_req(req)->status; 71 + u16 status = nvme_req(req)->status & 0x7ff; 72 72 unsigned long flags; 73 73 74 - switch (status & 0x7ff) { 75 - case NVME_SC_ANA_TRANSITION: 76 - case NVME_SC_ANA_INACCESSIBLE: 77 - case NVME_SC_ANA_PERSISTENT_LOSS: 78 - /* 79 - * If we got back an ANA error we know the controller is alive, 80 - * but not ready to serve this namespaces. The spec suggests 81 - * we should update our general state here, but due to the fact 82 - * that the admin and I/O queues are not serialized that is 83 - * fundamentally racy. So instead just clear the current path, 84 - * mark the the path as pending and kick of a re-read of the ANA 85 - * log page ASAP. 86 - */ 87 - nvme_mpath_clear_current_path(ns); 88 - if (ns->ctrl->ana_log_buf) { 89 - set_bit(NVME_NS_ANA_PENDING, &ns->flags); 90 - queue_work(nvme_wq, &ns->ctrl->ana_work); 91 - } 92 - break; 93 - case NVME_SC_HOST_PATH_ERROR: 94 - case NVME_SC_HOST_ABORTED_CMD: 95 - /* 96 - * Temporary transport disruption in talking to the controller. 97 - * Try to send on a new path. 98 - */ 99 - nvme_mpath_clear_current_path(ns); 100 - break; 101 - default: 102 - /* This was a non-ANA error so follow the normal error path. */ 103 - return false; 74 + nvme_mpath_clear_current_path(ns); 75 + 76 + /* 77 + * If we got back an ANA error, we know the controller is alive but not 78 + * ready to serve this namespace. Kick of a re-read of the ANA 79 + * information page, and just try any other available path for now. 
80 + */ 81 + if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) { 82 + set_bit(NVME_NS_ANA_PENDING, &ns->flags); 83 + queue_work(nvme_wq, &ns->ctrl->ana_work); 104 84 } 105 85 106 86 spin_lock_irqsave(&ns->head->requeue_lock, flags); 107 87 blk_steal_bios(&ns->head->requeue_list, req); 108 88 spin_unlock_irqrestore(&ns->head->requeue_lock, flags); 109 - blk_mq_end_request(req, 0); 110 89 90 + blk_mq_end_request(req, 0); 111 91 kblockd_schedule_work(&ns->head->requeue_work); 112 - return true; 113 92 } 114 93 115 94 void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) ··· 212 233 static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head, 213 234 int node, struct nvme_ns *old) 214 235 { 215 - struct nvme_ns *ns, *found, *fallback = NULL; 236 + struct nvme_ns *ns, *found = NULL; 216 237 217 238 if (list_is_singular(&head->list)) { 218 239 if (nvme_path_is_disabled(old)) ··· 231 252 goto out; 232 253 } 233 254 if (ns->ana_state == NVME_ANA_NONOPTIMIZED) 234 - fallback = ns; 255 + found = ns; 235 256 } 236 257 237 - /* No optimized path found, re-check the current path */ 258 + /* 259 + * The loop above skips the current path for round-robin semantics. 260 + * Fall back to the current path if either: 261 + * - no other optimized path found and current is optimized, 262 + * - no other usable path found and current is usable. 263 + */ 238 264 if (!nvme_path_is_disabled(old) && 239 - old->ana_state == NVME_ANA_OPTIMIZED) { 240 - found = old; 241 - goto out; 242 - } 243 - if (!fallback) 265 + (old->ana_state == NVME_ANA_OPTIMIZED || 266 + (!found && old->ana_state == NVME_ANA_NONOPTIMIZED))) 267 + return old; 268 + 269 + if (!found) 244 270 return NULL; 245 - found = fallback; 246 271 out: 247 272 rcu_assign_pointer(head->current_path[node], found); 248 273 return found;
+27 -4
drivers/nvme/host/nvme.h
··· 523 523 return (len >> 2) - 1; 524 524 } 525 525 526 - static inline bool nvme_end_request(struct request *req, __le16 status, 526 + static inline bool nvme_is_ana_error(u16 status) 527 + { 528 + switch (status & 0x7ff) { 529 + case NVME_SC_ANA_TRANSITION: 530 + case NVME_SC_ANA_INACCESSIBLE: 531 + case NVME_SC_ANA_PERSISTENT_LOSS: 532 + return true; 533 + default: 534 + return false; 535 + } 536 + } 537 + 538 + static inline bool nvme_is_path_error(u16 status) 539 + { 540 + /* check for a status code type of 'path related status' */ 541 + return (status & 0x700) == 0x300; 542 + } 543 + 544 + /* 545 + * Fill in the status and result information from the CQE, and then figure out 546 + * if blk-mq will need to use IPI magic to complete the request, and if yes do 547 + * so. If not let the caller complete the request without an indirect function 548 + * call. 549 + */ 550 + static inline bool nvme_try_complete_req(struct request *req, __le16 status, 527 551 union nvme_result result) 528 552 { 529 553 struct nvme_request *rq = nvme_req(req); ··· 653 629 void nvme_mpath_start_freeze(struct nvme_subsystem *subsys); 654 630 void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, 655 631 struct nvme_ctrl *ctrl, int *flags); 656 - bool nvme_failover_req(struct request *req); 632 + void nvme_failover_req(struct request *req); 657 633 void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl); 658 634 int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head); 659 635 void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id); ··· 712 688 sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance); 713 689 } 714 690 715 - static inline bool nvme_failover_req(struct request *req) 691 + static inline void nvme_failover_req(struct request *req) 716 692 { 717 - return false; 718 693 } 719 694 static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) 720 695 {
+9 -8
drivers/nvme/host/pci.c
··· 120 120 unsigned max_qid; 121 121 unsigned io_queues[HCTX_MAX_TYPES]; 122 122 unsigned int num_vecs; 123 - u16 q_depth; 123 + u32 q_depth; 124 124 int io_sqes; 125 125 u32 db_stride; 126 126 void __iomem *bar; ··· 157 157 static int io_queue_depth_set(const char *val, const struct kernel_param *kp) 158 158 { 159 159 int ret; 160 - u16 n; 160 + u32 n; 161 161 162 - ret = kstrtou16(val, 10, &n); 162 + ret = kstrtou32(val, 10, &n); 163 163 if (ret != 0 || n < 2) 164 164 return -EINVAL; 165 165 166 - return param_set_ushort(val, kp); 166 + return param_set_uint(val, kp); 167 167 } 168 168 169 169 static inline unsigned int sq_idx(unsigned int qid, u32 stride) ··· 195 195 dma_addr_t sq_dma_addr; 196 196 dma_addr_t cq_dma_addr; 197 197 u32 __iomem *q_db; 198 - u16 q_depth; 198 + u32 q_depth; 199 199 u16 cq_vector; 200 200 u16 sq_tail; 201 201 u16 cq_head; ··· 961 961 962 962 req = blk_mq_tag_to_rq(nvme_queue_tagset(nvmeq), cqe->command_id); 963 963 trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail); 964 - if (!nvme_end_request(req, cqe->status, cqe->result)) 964 + if (!nvme_try_complete_req(req, cqe->status, cqe->result)) 965 965 nvme_pci_complete_rq(req); 966 966 } 967 967 ··· 2320 2320 2321 2321 dev->ctrl.cap = lo_hi_readq(dev->bar + NVME_REG_CAP); 2322 2322 2323 - dev->q_depth = min_t(u16, NVME_CAP_MQES(dev->ctrl.cap) + 1, 2323 + dev->q_depth = min_t(u32, NVME_CAP_MQES(dev->ctrl.cap) + 1, 2324 2324 io_queue_depth); 2325 2325 dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */ 2326 2326 dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap); ··· 2460 2460 static int nvme_setup_prp_pools(struct nvme_dev *dev) 2461 2461 { 2462 2462 dev->prp_page_pool = dma_pool_create("prp list page", dev->dev, 2463 - PAGE_SIZE, PAGE_SIZE, 0); 2463 + NVME_CTRL_PAGE_SIZE, 2464 + NVME_CTRL_PAGE_SIZE, 0); 2464 2465 if (!dev->prp_page_pool) 2465 2466 return -ENOMEM; 2466 2467
+1 -1
drivers/nvme/host/rdma.c
··· 1189 1189 1190 1190 if (!refcount_dec_and_test(&req->ref)) 1191 1191 return; 1192 - if (!nvme_end_request(rq, req->status, req->result)) 1192 + if (!nvme_try_complete_req(rq, req->status, req->result)) 1193 1193 nvme_rdma_complete_rq(rq); 1194 1194 } 1195 1195
+2 -2
drivers/nvme/host/tcp.c
··· 481 481 return -EINVAL; 482 482 } 483 483 484 - if (!nvme_end_request(rq, cqe->status, cqe->result)) 484 + if (!nvme_try_complete_req(rq, cqe->status, cqe->result)) 485 485 nvme_complete_rq(rq); 486 486 queue->nr_cqe++; 487 487 ··· 672 672 { 673 673 union nvme_result res = {}; 674 674 675 - if (!nvme_end_request(rq, cpu_to_le16(status << 1), res)) 675 + if (!nvme_try_complete_req(rq, cpu_to_le16(status << 1), res)) 676 676 nvme_complete_rq(rq); 677 677 } 678 678
+1
drivers/nvme/target/configfs.c
··· 1136 1136 up_write(&nvmet_config_sem); 1137 1137 1138 1138 kfree_rcu(new_model, rcuhead); 1139 + kfree(new_model_number); 1139 1140 1140 1141 return count; 1141 1142 }
+6
drivers/nvme/target/core.c
··· 397 397 398 398 static void nvmet_start_keep_alive_timer(struct nvmet_ctrl *ctrl) 399 399 { 400 + if (unlikely(ctrl->kato == 0)) 401 + return; 402 + 400 403 pr_debug("ctrl %d start keep-alive timer for %d secs\n", 401 404 ctrl->cntlid, ctrl->kato); 402 405 ··· 409 406 410 407 static void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl) 411 408 { 409 + if (unlikely(ctrl->kato == 0)) 410 + return; 411 + 412 412 pr_debug("ctrl %d stop keep-alive\n", ctrl->cntlid); 413 413 414 414 cancel_delayed_work_sync(&ctrl->ka_work);
+1 -1
drivers/nvme/target/loop.c
··· 115 115 return; 116 116 } 117 117 118 - if (!nvme_end_request(rq, cqe->status, cqe->result)) 118 + if (!nvme_try_complete_req(rq, cqe->status, cqe->result)) 119 119 nvme_loop_complete_rq(rq); 120 120 } 121 121 }
+17 -8
drivers/nvme/target/passthru.c
··· 165 165 166 166 req->cqe->result = nvme_req(rq)->result; 167 167 nvmet_req_complete(req, status); 168 - blk_put_request(rq); 168 + blk_mq_free_request(rq); 169 169 } 170 170 171 171 static void nvmet_passthru_req_done(struct request *rq, ··· 175 175 176 176 req->cqe->result = nvme_req(rq)->result; 177 177 nvmet_req_complete(req, nvme_req(rq)->status); 178 - blk_put_request(rq); 178 + blk_mq_free_request(rq); 179 179 } 180 180 181 181 static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq) ··· 230 230 if (unlikely(!ns)) { 231 231 pr_err("failed to get passthru ns nsid:%u\n", nsid); 232 232 status = NVME_SC_INVALID_NS | NVME_SC_DNR; 233 - goto fail_out; 233 + goto out; 234 234 } 235 235 236 236 q = ns->queue; ··· 238 238 239 239 rq = nvme_alloc_request(q, req->cmd, BLK_MQ_REQ_NOWAIT, NVME_QID_ANY); 240 240 if (IS_ERR(rq)) { 241 - rq = NULL; 242 241 status = NVME_SC_INTERNAL; 243 - goto fail_out; 242 + goto out_put_ns; 244 243 } 245 244 246 245 if (req->sg_cnt) { 247 246 ret = nvmet_passthru_map_sg(req, rq); 248 247 if (unlikely(ret)) { 249 248 status = NVME_SC_INTERNAL; 250 - goto fail_out; 249 + goto out_put_req; 251 250 } 252 251 } 253 252 ··· 273 274 274 275 return; 275 276 276 - fail_out: 277 + out_put_req: 278 + blk_mq_free_request(rq); 279 + out_put_ns: 277 280 if (ns) 278 281 nvme_put_ns(ns); 282 + out: 279 283 nvmet_req_complete(req, status); 280 - blk_put_request(rq); 281 284 } 282 285 283 286 /* ··· 327 326 328 327 u16 nvmet_parse_passthru_io_cmd(struct nvmet_req *req) 329 328 { 329 + /* Reject any commands with non-sgl flags set (ie. fused commands) */ 330 + if (req->cmd->common.flags & ~NVME_CMD_SGL_ALL) 331 + return NVME_SC_INVALID_FIELD; 332 + 330 333 switch (req->cmd->common.opcode) { 331 334 case nvme_cmd_resv_register: 332 335 case nvme_cmd_resv_report: ··· 401 396 402 397 u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req) 403 398 { 399 + /* Reject any commands with non-sgl flags set (ie. 
fused commands) */ 400 + if (req->cmd->common.flags & ~NVME_CMD_SGL_ALL) 401 + return NVME_SC_INVALID_FIELD; 402 + 404 403 /* 405 404 * Passthru all vendor specific commands 406 405 */