Merge tag 'block-5.15-2021-09-25' of git://git.kernel.dk/linux-block

Pull block fixes from Jens Axboe:

- NVMe pull request via Christoph:
      - keep ctrl->namespaces ordered (Christoph Hellwig)
      - fix incorrect h2cdata pdu offset accounting in nvme-tcp (Sagi Grimberg)
      - handle updated hw_queues in nvme-fc more carefully (Daniel Wagner, James Smart)

- md lock order fix (Christoph)

- fallocate locking fix (Ming)

- blktrace UAF fix (Zhihao)

- rq-qos bio tracking fix (Ming)

* tag 'block-5.15-2021-09-25' of git://git.kernel.dk/linux-block:
block: hold ->invalidate_lock in blkdev_fallocate
blktrace: Fix uaf in blk_trace access after removing by sysfs
block: don't call rq_qos_ops->done_bio if the bio isn't tracked
md: fix a lock order reversal in md_alloc
nvme: keep ctrl->namespaces ordered
nvme-tcp: fix incorrect h2cdata pdu offset accounting
nvme-fc: remove freeze/unfreeze around update_nr_hw_queues
nvme-fc: avoid race between time out and tear down
nvme-fc: update hardware queues before using them

7 files changed, 55 insertions(+), 45 deletions(-)
block/bio.c (+1 -1)
···
         if (!bio_integrity_endio(bio))
                 return;
 
-        if (bio->bi_bdev)
+        if (bio->bi_bdev && bio_flagged(bio, BIO_TRACKED))
                 rq_qos_done_bio(bio->bi_bdev->bd_disk->queue, bio);
 
         if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
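
The one-line change above gates the rq-qos completion hook on the same flag the
submission side sets: rq_qos_throttle() marks a bio BIO_TRACKED before charging
it, so bio_endio() should only call rq_qos_done_bio() for bios that were
actually tracked. A minimal userspace sketch of that pairing, with invented
names rather than the kernel's bio/rq-qos API:

/* Sketch of "only undo accounting that was actually charged"; the
 * struct and helpers are stand-ins, not the kernel API. */
#include <stdbool.h>
#include <stdio.h>

struct fake_bio {
        bool tracked;           /* set when accounting is charged at submit */
        const char *name;
};

/* submission side: charge accounting and mark the bio as tracked */
static void fake_throttle(struct fake_bio *bio)
{
        bio->tracked = true;
        printf("%s: charged\n", bio->name);
}

/* completion side: only release the charge if one was made */
static void fake_endio(struct fake_bio *bio)
{
        if (bio->tracked)
                printf("%s: released\n", bio->name);
        else
                printf("%s: never tracked, nothing to release\n", bio->name);
}

int main(void)
{
        struct fake_bio a = { .name = "tracked-bio" };
        struct fake_bio b = { .name = "untracked-bio" };

        fake_throttle(&a);      /* went through the throttling path */
        fake_endio(&a);
        fake_endio(&b);         /* completed without ever being throttled */
        return 0;
}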
block/fops.c (+10 -11)
···
 #include <linux/task_io_accounting_ops.h>
 #include <linux/falloc.h>
 #include <linux/suspend.h>
+#include <linux/fs.h>
 #include "blk.h"
 
 static struct inode *bdev_file_inode(struct file *file)
···
 static long blkdev_fallocate(struct file *file, int mode, loff_t start,
                              loff_t len)
 {
-        struct block_device *bdev = I_BDEV(bdev_file_inode(file));
+        struct inode *inode = bdev_file_inode(file);
+        struct block_device *bdev = I_BDEV(inode);
         loff_t end = start + len - 1;
         loff_t isize;
         int error;
···
         if ((start | len) & (bdev_logical_block_size(bdev) - 1))
                 return -EINVAL;
 
+        filemap_invalidate_lock(inode->i_mapping);
+
         /* Invalidate the page cache, including dirty pages. */
         error = truncate_bdev_range(bdev, file->f_mode, start, end);
         if (error)
-                return error;
+                goto fail;
 
         switch (mode) {
         case FALLOC_FL_ZERO_RANGE:
···
                                              GFP_KERNEL, 0);
                 break;
         default:
-                return -EOPNOTSUPP;
+                error = -EOPNOTSUPP;
         }
-        if (error)
-                return error;
 
-        /*
-         * Invalidate the page cache again; if someone wandered in and dirtied
-         * a page, we just discard it - userspace has no way of knowing whether
-         * the write happened before or after discard completing...
-         */
-        return truncate_bdev_range(bdev, file->f_mode, start, end);
+ fail:
+        filemap_invalidate_unlock(inode->i_mapping);
+        return error;
 }
 
 const struct file_operations def_blk_fops = {
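
The reworked blkdev_fallocate() above holds the mapping's invalidate_lock across
both the page-cache invalidation and the block operation, and funnels every exit
through a single unlock. A compilable userspace sketch of that "lock once, goto
fail, unlock once" shape, using a pthread mutex and invented helpers in place of
filemap_invalidate_lock() and truncate_bdev_range():

/* Hypothetical model of the locking/error-flow pattern; none of these
 * names correspond to real kernel functions. */
#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t invalidate_lock = PTHREAD_MUTEX_INITIALIZER;

/* pretend page-cache invalidation that can fail */
static int invalidate_range(long start, long end)
{
        return (start > end) ? -EINVAL : 0;
}

static int fallocate_like(long start, long len)
{
        long end = start + len - 1;
        int error;

        pthread_mutex_lock(&invalidate_lock);

        error = invalidate_range(start, end);
        if (error)
                goto fail;              /* still unlocks below */

        /* ... mode-specific work would go here ... */
        error = 0;

fail:
        pthread_mutex_unlock(&invalidate_lock);
        return error;
}

int main(void)
{
        printf("ok path:  %d\n", fallocate_like(0, 4096));
        printf("err path: %d\n", fallocate_like(4096, -8192));
        return 0;
}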
drivers/md/md.c (-5)
···
         disk->flags |= GENHD_FL_EXT_DEVT;
         disk->events |= DISK_EVENT_MEDIA_CHANGE;
         mddev->gendisk = disk;
-        /* As soon as we call add_disk(), another thread could get
-         * through to md_open, so make sure it doesn't get too far
-         */
-        mutex_lock(&mddev->open_mutex);
         add_disk(disk);
 
         error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md");
···
         if (mddev->kobj.sd &&
             sysfs_create_group(&mddev->kobj, &md_bitmap_group))
                 pr_debug("pointless warning\n");
-        mutex_unlock(&mddev->open_mutex);
  abort:
         mutex_unlock(&disks_mutex);
         if (!error && mddev->kobj.sd) {
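
The md change simply stops holding open_mutex across add_disk() and the sysfs
registration, which is what resolves the reported lock order reversal. As a
generic illustration of the bug class (not the actual md/block locks), the
hypothetical sketch below takes two locks in opposite orders from two threads,
with a trylock standing in for lockdep's report:

/* Generic ABBA lock-order reversal demo; lock_a/lock_b are invented
 * and unrelated to md's mutexes. Build with: cc abba.c -lpthread */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock_a = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t lock_b = PTHREAD_MUTEX_INITIALIZER;

static void *thread_ab(void *arg)
{
        pthread_mutex_lock(&lock_a);
        usleep(1000);                   /* widen the race window */
        pthread_mutex_lock(&lock_b);    /* A then B */
        pthread_mutex_unlock(&lock_b);
        pthread_mutex_unlock(&lock_a);
        return NULL;
}

static void *thread_ba(void *arg)
{
        pthread_mutex_lock(&lock_b);
        usleep(1000);
        /* B then A: if thread_ab already holds A and waits for B, a
         * plain lock here would deadlock; trylock just reports it */
        if (pthread_mutex_trylock(&lock_a) != 0)
                puts("lock order reversal: would deadlock here");
        else
                pthread_mutex_unlock(&lock_a);
        pthread_mutex_unlock(&lock_b);
        return NULL;
}

int main(void)
{
        pthread_t t1, t2;

        pthread_create(&t1, NULL, thread_ab, NULL);
        pthread_create(&t2, NULL, thread_ba, NULL);
        pthread_join(t1, NULL);
        pthread_join(t2, NULL);
        return 0;
}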
drivers/nvme/host/core.c (+17 -16)
···
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/backing-dev.h>
-#include <linux/list_sort.h>
 #include <linux/slab.h>
 #include <linux/types.h>
 #include <linux/pr.h>
···
         return ret;
 }
 
-static int ns_cmp(void *priv, const struct list_head *a,
-                const struct list_head *b)
-{
-        struct nvme_ns *nsa = container_of(a, struct nvme_ns, list);
-        struct nvme_ns *nsb = container_of(b, struct nvme_ns, list);
-
-        return nsa->head->ns_id - nsb->head->ns_id;
-}
-
 struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 {
         struct nvme_ns *ns, *ret = NULL;
···
         return ret;
 }
 EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU);
+
+/*
+ * Add the namespace to the controller list while keeping the list ordered.
+ */
+static void nvme_ns_add_to_ctrl_list(struct nvme_ns *ns)
+{
+        struct nvme_ns *tmp;
+
+        list_for_each_entry_reverse(tmp, &ns->ctrl->namespaces, list) {
+                if (tmp->head->ns_id < ns->head->ns_id) {
+                        list_add(&ns->list, &tmp->list);
+                        return;
+                }
+        }
+        list_add(&ns->list, &ns->ctrl->namespaces);
+}
 
 static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid,
                 struct nvme_ns_ids *ids)
···
                 goto out_unlink_ns;
 
         down_write(&ctrl->namespaces_rwsem);
-        list_add_tail(&ns->list, &ctrl->namespaces);
+        nvme_ns_add_to_ctrl_list(ns);
         up_write(&ctrl->namespaces_rwsem);
-
         nvme_get_ctrl(ctrl);
 
         if (device_add_disk(ctrl->device, ns->disk, nvme_ns_id_attr_groups))
···
         if (nvme_scan_ns_list(ctrl) != 0)
                 nvme_scan_ns_sequential(ctrl);
         mutex_unlock(&ctrl->scan_lock);
-
-        down_write(&ctrl->namespaces_rwsem);
-        list_sort(NULL, &ctrl->namespaces, ns_cmp);
-        up_write(&ctrl->namespaces_rwsem);
 }
 
 /*
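
nvme_ns_add_to_ctrl_list() keeps ctrl->namespaces sorted at insertion time
instead of re-sorting the whole list after every scan, walking from the tail so
the common in-order append stays cheap. The same reverse-walk insert can be
modeled with a plain doubly linked list in userspace; struct node and its
helpers below are simplified stand-ins for the kernel's list_head machinery:

/* Userspace model of tail-first ordered insertion into a circular
 * doubly linked list, mirroring nvme_ns_add_to_ctrl_list(). */
#include <stdio.h>
#include <stdlib.h>

struct node {
        int ns_id;
        struct node *prev, *next;
};

/* sentinel head, like the kernel's list_head */
static struct node head = { .ns_id = -1, .prev = &head, .next = &head };

static void insert_after(struct node *pos, struct node *new)
{
        new->prev = pos;
        new->next = pos->next;
        pos->next->prev = new;
        pos->next = new;
}

/* walk backwards and insert after the first entry with a smaller id */
static void add_ordered(struct node *new)
{
        struct node *tmp;

        for (tmp = head.prev; tmp != &head; tmp = tmp->prev) {
                if (tmp->ns_id < new->ns_id) {
                        insert_after(tmp, new);
                        return;
                }
        }
        insert_after(&head, new);       /* smallest id so far: goes first */
}

int main(void)
{
        int ids[] = { 4, 1, 3, 2 };
        struct node *n;

        for (unsigned int i = 0; i < sizeof(ids) / sizeof(ids[0]); i++) {
                n = calloc(1, sizeof(*n));
                n->ns_id = ids[i];
                add_ordered(n);
        }
        for (n = head.next; n != &head; n = n->next)
                printf("%d ", n->ns_id);        /* prints: 1 2 3 4 */
        printf("\n");
        return 0;
}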
drivers/nvme/host/fc.c (+9 -9)
···
          */
         if (ctrl->ctrl.queue_count > 1) {
                 nvme_stop_queues(&ctrl->ctrl);
+                nvme_sync_io_queues(&ctrl->ctrl);
                 blk_mq_tagset_busy_iter(&ctrl->tag_set,
                                 nvme_fc_terminate_exchange, &ctrl->ctrl);
                 blk_mq_tagset_wait_completed_request(&ctrl->tag_set);
···
          * clean up the admin queue. Same thing as above.
          */
         blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
+        blk_sync_queue(ctrl->ctrl.admin_q);
         blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
                                 nvme_fc_terminate_exchange, &ctrl->ctrl);
         blk_mq_tagset_wait_completed_request(&ctrl->admin_tag_set);
···
         if (ctrl->ctrl.queue_count == 1)
                 return 0;
 
+        if (prior_ioq_cnt != nr_io_queues) {
+                dev_info(ctrl->ctrl.device,
+                        "reconnect: revising io queue count from %d to %d\n",
+                        prior_ioq_cnt, nr_io_queues);
+                blk_mq_update_nr_hw_queues(&ctrl->tag_set, nr_io_queues);
+        }
+
         ret = nvme_fc_create_hw_io_queues(ctrl, ctrl->ctrl.sqsize + 1);
         if (ret)
                 goto out_free_io_queues;
···
         ret = nvme_fc_connect_io_queues(ctrl, ctrl->ctrl.sqsize + 1);
         if (ret)
                 goto out_delete_hw_queues;
-
-        if (prior_ioq_cnt != nr_io_queues) {
-                dev_info(ctrl->ctrl.device,
-                        "reconnect: revising io queue count from %d to %d\n",
-                        prior_ioq_cnt, nr_io_queues);
-                nvme_wait_freeze(&ctrl->ctrl);
-                blk_mq_update_nr_hw_queues(&ctrl->tag_set, nr_io_queues);
-                nvme_unfreeze(&ctrl->ctrl);
-        }
 
         return 0;
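
Two things happen in fc.c: teardown now flushes pending timeout work
(nvme_sync_io_queues()/blk_sync_queue()) before cancelling outstanding requests,
and the hardware queue count is updated before the queues are created and
connected, dropping the freeze/unfreeze around it. The teardown half boils down
to "wait for the timeout handler before freeing what it references"; a
hypothetical userspace sketch with a thread standing in for the block layer's
timeout work:

/* Invented example: flush ("sync") pending timeout work before freeing
 * the context it may still dereference. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

struct io_ctx {
        int id;
};

static void *timeout_handler(void *arg)
{
        struct io_ctx *ctx = arg;

        usleep(1000);                   /* fires "later" */
        printf("timeout fired for io %d\n", ctx->id);
        return NULL;
}

int main(void)
{
        struct io_ctx *ctx = calloc(1, sizeof(*ctx));
        pthread_t timeout;

        ctx->id = 1;
        pthread_create(&timeout, NULL, timeout_handler, ctx);

        /* teardown: wait for the pending timeout to finish before
         * freeing the context; freeing first would be a use-after-free */
        pthread_join(timeout, NULL);
        free(ctx);
        return 0;
}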
drivers/nvme/host/tcp.c (+10 -3)
···
                 cpu_to_le32(data->hdr.hlen + hdgst + req->pdu_len + ddgst);
         data->ttag = pdu->ttag;
         data->command_id = nvme_cid(rq);
-        data->data_offset = cpu_to_le32(req->data_sent);
+        data->data_offset = pdu->r2t_offset;
         data->data_length = cpu_to_le32(req->pdu_len);
         return 0;
 }
···
                         nvme_tcp_ddgst_update(queue->snd_hash, page,
                                         offset, ret);
 
-                /* fully successful last write*/
+                /*
+                 * update the request iterator except for the last payload send
+                 * in the request where we don't want to modify it as we may
+                 * compete with the RX path completing the request.
+                 */
+                if (req->data_sent + ret < req->data_len)
+                        nvme_tcp_advance_req(req, ret);
+
+                /* fully successful last send in current PDU */
                 if (last && ret == len) {
                         if (queue->data_digest) {
                                 nvme_tcp_ddgst_final(queue->snd_hash,
···
                         }
                         return 1;
                 }
-                nvme_tcp_advance_req(req, ret);
         }
         return -EAGAIN;
 }
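
The tcp.c change is about ownership: once the final byte of payload has been
handed to the socket, the RX path may complete and recycle the request at any
moment, so the TX path must not advance req->data_sent afterwards (and the
h2cdata data_offset now comes from the r2t_offset the controller requested
rather than from local send accounting). A hypothetical userspace sketch of the
"only update bookkeeping while you still own the request" rule:

/* Invented model: per-request progress may only be updated before the
 * final handoff that lets the other side complete (and free) it. */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct request {
        size_t data_len;        /* total payload */
        size_t data_sent;       /* progress, owned by the sender */
};

/* pretend to hand 'count' bytes to the wire; after the last byte the
 * request may be completed and freed by the receive side */
static void hand_to_wire(struct request *req, size_t count)
{
        bool last = (req->data_sent + count == req->data_len);

        if (!last) {
                req->data_sent += count;        /* safe: still ours */
                return;
        }
        /* last send: do not touch *req after this point */
        printf("request complete, ownership passed\n");
        free(req);
}

int main(void)
{
        struct request *req = malloc(sizeof(*req));

        req->data_len = 8192;
        req->data_sent = 0;
        hand_to_wire(req, 4096);
        hand_to_wire(req, 4096);        /* final chunk; req is gone after this */
        return 0;
}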
kernel/trace/blktrace.c (+8)
···
         if (bt == NULL)
                 return -EINVAL;
 
+        if (bt->trace_state == Blktrace_running) {
+                bt->trace_state = Blktrace_stopped;
+                spin_lock_irq(&running_trace_lock);
+                list_del_init(&bt->running_list);
+                spin_unlock_irq(&running_trace_lock);
+                relay_flush(bt->rchan);
+        }
+
         put_probe_ref();
         synchronize_rcu();
         blk_trace_free(bt);
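
The blktrace fix applies the usual teardown ordering for an object that may
still be on the global running list: take it off the list and flush it first,
only then drop the probe reference and free it. A small userspace model of that
"unlink before free" ordering, with an invented trace struct and a plain mutex
in place of running_trace_lock:

/* Invented model: an object must be unlinked from the shared list
 * (under the lock) before it is freed, or list walkers hit a UAF. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct trace {
        char name[16];
        struct trace *next;             /* singly linked running list */
};

static pthread_mutex_t running_lock = PTHREAD_MUTEX_INITIALIZER;
static struct trace *running_list;

static void start_trace(struct trace *t)
{
        pthread_mutex_lock(&running_lock);
        t->next = running_list;
        running_list = t;
        pthread_mutex_unlock(&running_lock);
}

/* teardown: unlink under the lock first, free only afterwards */
static void remove_trace(struct trace *t)
{
        struct trace **p;

        pthread_mutex_lock(&running_lock);
        for (p = &running_list; *p; p = &(*p)->next) {
                if (*p == t) {
                        *p = t->next;
                        break;
                }
        }
        pthread_mutex_unlock(&running_lock);
        free(t);                        /* no longer reachable from the list */
}

int main(void)
{
        struct trace *t = calloc(1, sizeof(*t));

        strcpy(t->name, "sda");
        start_trace(t);
        remove_trace(t);                /* safe: unlinked before freed */
        return 0;
}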