Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'block-6.5-2023-07-14' of git://git.kernel.dk/linux

Pull block fixes from Jens Axboe:

- NVMe pull request via Keith:
- Don't require quirk to use duplicate namespace identifiers
(Christoph, Sagi)
- One more BOGUS_NID quirk (Pankaj)
- IO timeout and error handling fixes for PCI (Keith)
- Enhanced metadata format mask fix (Ankit)
- Association race condition fix for fibre channel (Michael)
- Correct debugfs error checks (Minjie)
- Use PAGE_SECTORS_SHIFT where needed (Damien)
- Reduce kernel logs for legacy nguid attribute (Keith)
- Use correct dma direction when unmapping metadata (Ming)

- Fix for a flush handling regression in this release (Christoph)

- Fix for batched request time stamping (Chengming)

- Fix for a regression in the mq-deadline position calculation (Bart)

- Lockdep fix for blk-crypto (Eric)

- Fix for a regression in the Amiga partition handling changes
(Michael)

* tag 'block-6.5-2023-07-14' of git://git.kernel.dk/linux:
block: queue data commands from the flush state machine at the head
blk-mq: fix start_time_ns and alloc_time_ns for pre-allocated rq
nvme-pci: fix DMA direction of unmapping integrity data
nvme: don't reject probe due to duplicate IDs for single-ported PCIe devices
block/mq-deadline: Fix a bug in deadline_from_pos()
nvme: ensure disabling pairs with unquiesce
nvme-fc: fix race between error recovery and creating association
nvme-fc: return non-zero status code when fails to create association
nvme: fix parameter check in nvme_fault_inject_init()
nvme: warn only once for legacy uuid attribute
block: remove dead struc request->completion_data field
nvme: fix the NVME_ID_NS_NVM_STS_MASK definition
nvmet: use PAGE_SECTORS_SHIFT
nvme: add BOGUS_NID quirk for Samsung SM953
blk-crypto: use dynamic lock class for blk_crypto_profile::lock
block/partition: fix signedness issue for Amiga partitions

+136 -50
+10 -2
block/blk-crypto-profile.c
··· 79 79 unsigned int slot_hashtable_size; 80 80 81 81 memset(profile, 0, sizeof(*profile)); 82 - init_rwsem(&profile->lock); 82 + 83 + /* 84 + * profile->lock of an underlying device can nest inside profile->lock 85 + * of a device-mapper device, so use a dynamic lock class to avoid 86 + * false-positive lockdep reports. 87 + */ 88 + lockdep_register_key(&profile->lockdep_key); 89 + __init_rwsem(&profile->lock, "&profile->lock", &profile->lockdep_key); 83 90 84 91 if (num_slots == 0) 85 92 return 0; ··· 96 89 profile->slots = kvcalloc(num_slots, sizeof(profile->slots[0]), 97 90 GFP_KERNEL); 98 91 if (!profile->slots) 99 - return -ENOMEM; 92 + goto err_destroy; 100 93 101 94 profile->num_slots = num_slots; 102 95 ··· 442 435 { 443 436 if (!profile) 444 437 return; 438 + lockdep_unregister_key(&profile->lockdep_key); 445 439 kvfree(profile->slot_hashtable); 446 440 kvfree_sensitive(profile->slots, 447 441 sizeof(profile->slots[0]) * profile->num_slots);
+1 -1
block/blk-flush.c
··· 189 189 case REQ_FSEQ_DATA: 190 190 list_move_tail(&rq->flush.list, &fq->flush_data_in_flight); 191 191 spin_lock(&q->requeue_lock); 192 - list_add_tail(&rq->queuelist, &q->flush_list); 192 + list_add(&rq->queuelist, &q->requeue_list); 193 193 spin_unlock(&q->requeue_lock); 194 194 blk_mq_kick_requeue_list(q); 195 195 break;
+30 -17
block/blk-mq.c
··· 328 328 } 329 329 EXPORT_SYMBOL(blk_rq_init); 330 330 331 + /* Set start and alloc time when the allocated request is actually used */ 332 + static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns) 333 + { 334 + if (blk_mq_need_time_stamp(rq)) 335 + rq->start_time_ns = ktime_get_ns(); 336 + else 337 + rq->start_time_ns = 0; 338 + 339 + #ifdef CONFIG_BLK_RQ_ALLOC_TIME 340 + if (blk_queue_rq_alloc_time(rq->q)) 341 + rq->alloc_time_ns = alloc_time_ns ?: rq->start_time_ns; 342 + else 343 + rq->alloc_time_ns = 0; 344 + #endif 345 + } 346 + 331 347 static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, 332 - struct blk_mq_tags *tags, unsigned int tag, u64 alloc_time_ns) 348 + struct blk_mq_tags *tags, unsigned int tag) 333 349 { 334 350 struct blk_mq_ctx *ctx = data->ctx; 335 351 struct blk_mq_hw_ctx *hctx = data->hctx; ··· 372 356 } 373 357 rq->timeout = 0; 374 358 375 - if (blk_mq_need_time_stamp(rq)) 376 - rq->start_time_ns = ktime_get_ns(); 377 - else 378 - rq->start_time_ns = 0; 379 359 rq->part = NULL; 380 - #ifdef CONFIG_BLK_RQ_ALLOC_TIME 381 - rq->alloc_time_ns = alloc_time_ns; 382 - #endif 383 360 rq->io_start_time_ns = 0; 384 361 rq->stats_sectors = 0; 385 362 rq->nr_phys_segments = 0; ··· 402 393 } 403 394 404 395 static inline struct request * 405 - __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data, 406 - u64 alloc_time_ns) 396 + __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data) 407 397 { 408 398 unsigned int tag, tag_offset; 409 399 struct blk_mq_tags *tags; ··· 421 413 tag = tag_offset + i; 422 414 prefetch(tags->static_rqs[tag]); 423 415 tag_mask &= ~(1UL << i); 424 - rq = blk_mq_rq_ctx_init(data, tags, tag, alloc_time_ns); 416 + rq = blk_mq_rq_ctx_init(data, tags, tag); 425 417 rq_list_add(data->cached_rq, rq); 426 418 nr++; 427 419 } ··· 482 474 * Try batched alloc if we want more than 1 tag. 
483 475 */ 484 476 if (data->nr_tags > 1) { 485 - rq = __blk_mq_alloc_requests_batch(data, alloc_time_ns); 486 - if (rq) 477 + rq = __blk_mq_alloc_requests_batch(data); 478 + if (rq) { 479 + blk_mq_rq_time_init(rq, alloc_time_ns); 487 480 return rq; 481 + } 488 482 data->nr_tags = 1; 489 483 } 490 484 ··· 509 499 goto retry; 510 500 } 511 501 512 - return blk_mq_rq_ctx_init(data, blk_mq_tags_from_data(data), tag, 513 - alloc_time_ns); 502 + rq = blk_mq_rq_ctx_init(data, blk_mq_tags_from_data(data), tag); 503 + blk_mq_rq_time_init(rq, alloc_time_ns); 504 + return rq; 514 505 } 515 506 516 507 static struct request *blk_mq_rq_cache_fill(struct request_queue *q, ··· 566 555 return NULL; 567 556 568 557 plug->cached_rq = rq_list_next(rq); 558 + blk_mq_rq_time_init(rq, 0); 569 559 } 570 560 571 561 rq->cmd_flags = opf; ··· 668 656 tag = blk_mq_get_tag(&data); 669 657 if (tag == BLK_MQ_NO_TAG) 670 658 goto out_queue_exit; 671 - rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag, 672 - alloc_time_ns); 659 + rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag); 660 + blk_mq_rq_time_init(rq, alloc_time_ns); 673 661 rq->__data_len = 0; 674 662 rq->__sector = (sector_t) -1; 675 663 rq->bio = rq->biotail = NULL; ··· 2908 2896 plug->cached_rq = rq_list_next(rq); 2909 2897 rq_qos_throttle(q, *bio); 2910 2898 2899 + blk_mq_rq_time_init(rq, 0); 2911 2900 rq->cmd_flags = (*bio)->bi_opf; 2912 2901 INIT_LIST_HEAD(&rq->queuelist); 2913 2902 return rq;
+1 -1
block/mq-deadline.c
··· 176 176 * zoned writes, start searching from the start of a zone. 177 177 */ 178 178 if (blk_rq_is_seq_zoned_write(rq)) 179 - pos -= round_down(pos, rq->q->limits.chunk_sectors); 179 + pos = round_down(pos, rq->q->limits.chunk_sectors); 180 180 181 181 while (node) { 182 182 rq = rb_entry_rq(node);
+1 -1
block/partitions/amiga.c
··· 90 90 } 91 91 blk = be32_to_cpu(rdb->rdb_PartitionList); 92 92 put_dev_sector(sect); 93 - for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) { 93 + for (part = 1; (s32) blk>0 && part<=16; part++, put_dev_sector(sect)) { 94 94 /* Read in terms partition table understands */ 95 95 if (check_mul_overflow(blk, (sector_t) blksize, &blk)) { 96 96 pr_err("Dev %s: overflow calculating partition block %llu! Skipping partitions %u and beyond\n",
+33 -3
drivers/nvme/host/core.c
··· 3431 3431 3432 3432 ret = nvme_global_check_duplicate_ids(ctrl->subsys, &info->ids); 3433 3433 if (ret) { 3434 - dev_err(ctrl->device, 3435 - "globally duplicate IDs for nsid %d\n", info->nsid); 3434 + /* 3435 + * We've found two different namespaces on two different 3436 + * subsystems that report the same ID. This is pretty nasty 3437 + * for anything that actually requires unique device 3438 + * identification. In the kernel we need this for multipathing, 3439 + * and in user space the /dev/disk/by-id/ links rely on it. 3440 + * 3441 + * If the device also claims to be multi-path capable back off 3442 + * here now and refuse the probe the second device as this is a 3443 + * recipe for data corruption. If not this is probably a 3444 + * cheap consumer device if on the PCIe bus, so let the user 3445 + * proceed and use the shiny toy, but warn that with changing 3446 + * probing order (which due to our async probing could just be 3447 + * device taking longer to startup) the other device could show 3448 + * up at any time. 3449 + */ 3436 3450 nvme_print_device_info(ctrl); 3437 - return ret; 3451 + if ((ns->ctrl->ops->flags & NVME_F_FABRICS) || /* !PCIe */ 3452 + ((ns->ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) && 3453 + info->is_shared)) { 3454 + dev_err(ctrl->device, 3455 + "ignoring nsid %d because of duplicate IDs\n", 3456 + info->nsid); 3457 + return ret; 3458 + } 3459 + 3460 + dev_err(ctrl->device, 3461 + "clearing duplicate IDs for nsid %d\n", info->nsid); 3462 + dev_err(ctrl->device, 3463 + "use of /dev/disk/by-id/ may cause data corruption\n"); 3464 + memset(&info->ids.nguid, 0, sizeof(info->ids.nguid)); 3465 + memset(&info->ids.uuid, 0, sizeof(info->ids.uuid)); 3466 + memset(&info->ids.eui64, 0, sizeof(info->ids.eui64)); 3467 + ctrl->quirks |= NVME_QUIRK_BOGUS_NID; 3438 3468 } 3439 3469 3440 3470 mutex_lock(&ctrl->subsys->lock);
+1 -1
drivers/nvme/host/fault_inject.c
··· 27 27 28 28 /* create debugfs directory and attribute */ 29 29 parent = debugfs_create_dir(dev_name, NULL); 30 - if (!parent) { 30 + if (IS_ERR(parent)) { 31 31 pr_warn("%s: failed to create debugfs directory\n", dev_name); 32 32 return; 33 33 }
+30 -7
drivers/nvme/host/fc.c
··· 2548 2548 * the controller. Abort any ios on the association and let the 2549 2549 * create_association error path resolve things. 2550 2550 */ 2551 - if (ctrl->ctrl.state == NVME_CTRL_CONNECTING) { 2552 - __nvme_fc_abort_outstanding_ios(ctrl, true); 2551 + enum nvme_ctrl_state state; 2552 + unsigned long flags; 2553 + 2554 + spin_lock_irqsave(&ctrl->lock, flags); 2555 + state = ctrl->ctrl.state; 2556 + if (state == NVME_CTRL_CONNECTING) { 2553 2557 set_bit(ASSOC_FAILED, &ctrl->flags); 2558 + spin_unlock_irqrestore(&ctrl->lock, flags); 2559 + __nvme_fc_abort_outstanding_ios(ctrl, true); 2560 + dev_warn(ctrl->ctrl.device, 2561 + "NVME-FC{%d}: transport error during (re)connect\n", 2562 + ctrl->cnum); 2554 2563 return; 2555 2564 } 2565 + spin_unlock_irqrestore(&ctrl->lock, flags); 2556 2566 2557 2567 /* Otherwise, only proceed if in LIVE state - e.g. on first error */ 2558 - if (ctrl->ctrl.state != NVME_CTRL_LIVE) 2568 + if (state != NVME_CTRL_LIVE) 2559 2569 return; 2560 2570 2561 2571 dev_warn(ctrl->ctrl.device, ··· 3120 3110 */ 3121 3111 3122 3112 ret = nvme_enable_ctrl(&ctrl->ctrl); 3123 - if (ret || test_bit(ASSOC_FAILED, &ctrl->flags)) 3113 + if (!ret && test_bit(ASSOC_FAILED, &ctrl->flags)) 3114 + ret = -EIO; 3115 + if (ret) 3124 3116 goto out_disconnect_admin_queue; 3125 3117 3126 3118 ctrl->ctrl.max_segments = ctrl->lport->ops->max_sgl_segments; ··· 3132 3120 nvme_unquiesce_admin_queue(&ctrl->ctrl); 3133 3121 3134 3122 ret = nvme_init_ctrl_finish(&ctrl->ctrl, false); 3135 - if (ret || test_bit(ASSOC_FAILED, &ctrl->flags)) 3123 + if (!ret && test_bit(ASSOC_FAILED, &ctrl->flags)) 3124 + ret = -EIO; 3125 + if (ret) 3136 3126 goto out_disconnect_admin_queue; 3137 3127 3138 3128 /* sanity checks */ ··· 3179 3165 else 3180 3166 ret = nvme_fc_recreate_io_queues(ctrl); 3181 3167 } 3182 - if (ret || test_bit(ASSOC_FAILED, &ctrl->flags)) 3183 - goto out_term_aen_ops; 3184 3168 3169 + spin_lock_irqsave(&ctrl->lock, flags); 3170 + if (!ret && test_bit(ASSOC_FAILED, 
&ctrl->flags)) 3171 + ret = -EIO; 3172 + if (ret) { 3173 + spin_unlock_irqrestore(&ctrl->lock, flags); 3174 + goto out_term_aen_ops; 3175 + } 3185 3176 changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); 3177 + spin_unlock_irqrestore(&ctrl->lock, flags); 3186 3178 3187 3179 ctrl->ctrl.nr_reconnects = 0; 3188 3180 ··· 3200 3180 out_term_aen_ops: 3201 3181 nvme_fc_term_aen_ops(ctrl); 3202 3182 out_disconnect_admin_queue: 3183 + dev_warn(ctrl->ctrl.device, 3184 + "NVME-FC{%d}: create_assoc failed, assoc_id %llx ret %d\n", 3185 + ctrl->cnum, ctrl->association_id, ret); 3203 3186 /* send a Disconnect(association) LS to fc-nvme target */ 3204 3187 nvme_fc_xmt_disconnect_assoc(ctrl); 3205 3188 spin_lock_irqsave(&ctrl->lock, flags);
+20 -9
drivers/nvme/host/pci.c
··· 967 967 struct nvme_iod *iod = blk_mq_rq_to_pdu(req); 968 968 969 969 dma_unmap_page(dev->dev, iod->meta_dma, 970 - rq_integrity_vec(req)->bv_len, rq_data_dir(req)); 970 + rq_integrity_vec(req)->bv_len, rq_dma_dir(req)); 971 971 } 972 972 973 973 if (blk_rq_nr_phys_segments(req)) ··· 1298 1298 */ 1299 1299 if (nvme_should_reset(dev, csts)) { 1300 1300 nvme_warn_reset(dev, csts); 1301 - nvme_dev_disable(dev, false); 1302 - nvme_reset_ctrl(&dev->ctrl); 1303 - return BLK_EH_DONE; 1301 + goto disable; 1304 1302 } 1305 1303 1306 1304 /* ··· 1349 1351 "I/O %d QID %d timeout, reset controller\n", 1350 1352 req->tag, nvmeq->qid); 1351 1353 nvme_req(req)->flags |= NVME_REQ_CANCELLED; 1352 - nvme_dev_disable(dev, false); 1353 - nvme_reset_ctrl(&dev->ctrl); 1354 - 1355 - return BLK_EH_DONE; 1354 + goto disable; 1356 1355 } 1357 1356 1358 1357 if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) { ··· 1386 1391 * as the device then is in a faulty state. 1387 1392 */ 1388 1393 return BLK_EH_RESET_TIMER; 1394 + 1395 + disable: 1396 + if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING)) 1397 + return BLK_EH_DONE; 1398 + 1399 + nvme_dev_disable(dev, false); 1400 + if (nvme_try_sched_reset(&dev->ctrl)) 1401 + nvme_unquiesce_io_queues(&dev->ctrl); 1402 + return BLK_EH_DONE; 1389 1403 } 1390 1404 1391 1405 static void nvme_free_queue(struct nvme_queue *nvmeq) ··· 3282 3278 case pci_channel_io_frozen: 3283 3279 dev_warn(dev->ctrl.device, 3284 3280 "frozen state error detected, reset controller\n"); 3281 + if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING)) { 3282 + nvme_dev_disable(dev, true); 3283 + return PCI_ERS_RESULT_DISCONNECT; 3284 + } 3285 3285 nvme_dev_disable(dev, false); 3286 3286 return PCI_ERS_RESULT_NEED_RESET; 3287 3287 case pci_channel_io_perm_failure: ··· 3302 3294 3303 3295 dev_info(dev->ctrl.device, "restart after slot reset\n"); 3304 3296 pci_restore_state(pdev); 3305 - nvme_reset_ctrl(&dev->ctrl); 3297 + if (!nvme_try_sched_reset(&dev->ctrl)) 
3298 + nvme_unquiesce_io_queues(&dev->ctrl); 3306 3299 return PCI_ERS_RESULT_RECOVERED; 3307 3300 } 3308 3301 ··· 3405 3396 .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, 3406 3397 { PCI_DEVICE(0x144d, 0xa809), /* Samsung MZALQ256HBJD 256G */ 3407 3398 .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, 3399 + { PCI_DEVICE(0x144d, 0xa802), /* Samsung SM953 */ 3400 + .driver_data = NVME_QUIRK_BOGUS_NID, }, 3408 3401 { PCI_DEVICE(0x1cc4, 0x6303), /* UMIS RPJTJ512MGE1QDY 512G */ 3409 3402 .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, 3410 3403 { PCI_DEVICE(0x1cc4, 0x6302), /* UMIS RPJTJ256MGE1QDY 256G */
+1 -1
drivers/nvme/host/sysfs.c
··· 92 92 * we have no UUID set 93 93 */ 94 94 if (uuid_is_null(&ids->uuid)) { 95 - dev_warn_ratelimited(dev, 95 + dev_warn_once(dev, 96 96 "No UUID available providing old NGUID\n"); 97 97 return sysfs_emit(buf, "%pU\n", ids->nguid); 98 98 }
+1 -1
drivers/nvme/target/loop.c
··· 373 373 goto out_cleanup_tagset; 374 374 375 375 ctrl->ctrl.max_hw_sectors = 376 - (NVME_LOOP_MAX_SEGMENTS - 1) << (PAGE_SHIFT - 9); 376 + (NVME_LOOP_MAX_SEGMENTS - 1) << PAGE_SECTORS_SHIFT; 377 377 378 378 nvme_unquiesce_admin_queue(&ctrl->ctrl); 379 379
+2 -2
drivers/nvme/target/passthru.c
··· 102 102 * which depends on the host's memory fragementation. To solve this, 103 103 * ensure mdts is limited to the pages equal to the number of segments. 104 104 */ 105 - max_hw_sectors = min_not_zero(pctrl->max_segments << (PAGE_SHIFT - 9), 105 + max_hw_sectors = min_not_zero(pctrl->max_segments << PAGE_SECTORS_SHIFT, 106 106 pctrl->max_hw_sectors); 107 107 108 108 /* 109 109 * nvmet_passthru_map_sg is limitted to using a single bio so limit 110 110 * the mdts based on BIO_MAX_VECS as well 111 111 */ 112 - max_hw_sectors = min_not_zero(BIO_MAX_VECS << (PAGE_SHIFT - 9), 112 + max_hw_sectors = min_not_zero(BIO_MAX_VECS << PAGE_SECTORS_SHIFT, 113 113 max_hw_sectors); 114 114 115 115 page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12;
+1
include/linux/blk-crypto-profile.h
··· 111 111 * keyslots while ensuring that they can't be changed concurrently. 112 112 */ 113 113 struct rw_semaphore lock; 114 + struct lock_class_key lockdep_key; 114 115 115 116 /* List of idle slots, with least recently used slot at front */ 116 117 wait_queue_head_t idle_slots_wait_queue;
+3 -3
include/linux/blk-mq.h
··· 158 158 159 159 /* 160 160 * The rb_node is only used inside the io scheduler, requests 161 - * are pruned when moved to the dispatch queue. So let the 162 - * completion_data share space with the rb_node. 161 + * are pruned when moved to the dispatch queue. special_vec must 162 + * only be used if RQF_SPECIAL_PAYLOAD is set, and those cannot be 163 + * insert into an IO scheduler. 163 164 */ 164 165 union { 165 166 struct rb_node rb_node; /* sort/lookup */ 166 167 struct bio_vec special_vec; 167 - void *completion_data; 168 168 }; 169 169 170 170 /*
+1 -1
include/linux/nvme.h
··· 473 473 }; 474 474 475 475 enum { 476 - NVME_ID_NS_NVM_STS_MASK = 0x3f, 476 + NVME_ID_NS_NVM_STS_MASK = 0x7f, 477 477 NVME_ID_NS_NVM_GUARD_SHIFT = 7, 478 478 NVME_ID_NS_NVM_GUARD_MASK = 0x3, 479 479 };