Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

virtio-blk: fix to match virtio spec

The merged patch series to support zoned block devices in virtio-blk
is not the most up-to-date version. The merged patch can be found at

https://lore.kernel.org/linux-block/20221016034127.330942-3-dmitry.fomichev@wdc.com/

but the latest and reviewed version is

https://lore.kernel.org/linux-block/20221110053952.3378990-3-dmitry.fomichev@wdc.com/

The reason is apparently that the correct mailing lists and
maintainers were not copied.

The differences between the two are mostly cleanups, but there is one
change that is very important in terms of compatibility with the
approved virtio-zbd specification.

Before it was approved, the OASIS virtio spec had a change in the
VIRTIO_BLK_T_ZONE_APPEND request layout that is not reflected in the
current virtio-blk driver code. In the running code, the status is
the first byte of the in-header; it is followed by some pad bytes
and by the u64 that carries back to the driver the sector at which
the data has been written to the zone, aka the append sector.

This layout turned out to be problematic to implement in QEMU, and
the request status byte was eventually made the last byte of the
in-header. The current code doesn't expect that, and this causes the
append sector value to always come back to the block layer as zero.
This needs to be fixed ASAP.

Fixes: 95bfec41bd3d ("virtio-blk: add support for zoned block devices")
Cc: stable@vger.kernel.org
Signed-off-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Message-Id: <20230330214953.1088216-2-dmitry.fomichev@wdc.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

authored by

Dmitry Fomichev and committed by
Michael S. Tsirkin
f1ba4e67 7e364e56

+166 -90
+157 -81
drivers/block/virtio_blk.c
··· 96 96 97 97 /* 98 98 * The zone append command has an extended in header. 99 - * The status field in zone_append_in_hdr must have 100 - * the same offset in virtblk_req as the non-zoned 101 - * status field above. 99 + * The status field in zone_append_in_hdr must always 100 + * be the last byte. 102 101 */ 103 102 struct { 103 + __virtio64 sector; 104 104 u8 status; 105 - u8 reserved[7]; 106 - __le64 append_sector; 107 - } zone_append_in_hdr; 108 - }; 105 + } zone_append; 106 + } in_hdr; 109 107 110 108 size_t in_hdr_len; 111 109 ··· 152 154 sgs[num_out + num_in++] = vbr->sg_table.sgl; 153 155 } 154 156 155 - sg_init_one(&in_hdr, &vbr->status, vbr->in_hdr_len); 157 + sg_init_one(&in_hdr, &vbr->in_hdr.status, vbr->in_hdr_len); 156 158 sgs[num_out + num_in++] = &in_hdr; 157 159 158 160 return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC); ··· 240 242 struct request *req, 241 243 struct virtblk_req *vbr) 242 244 { 243 - size_t in_hdr_len = sizeof(vbr->status); 245 + size_t in_hdr_len = sizeof(vbr->in_hdr.status); 244 246 bool unmap = false; 245 247 u32 type; 246 248 u64 sector = 0; 249 + 250 + if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) && op_is_zone_mgmt(req_op(req))) 251 + return BLK_STS_NOTSUPP; 247 252 248 253 /* Set fields for all request types */ 249 254 vbr->out_hdr.ioprio = cpu_to_virtio32(vdev, req_get_ioprio(req)); ··· 288 287 case REQ_OP_ZONE_APPEND: 289 288 type = VIRTIO_BLK_T_ZONE_APPEND; 290 289 sector = blk_rq_pos(req); 291 - in_hdr_len = sizeof(vbr->zone_append_in_hdr); 290 + in_hdr_len = sizeof(vbr->in_hdr.zone_append); 292 291 break; 293 292 case REQ_OP_ZONE_RESET: 294 293 type = VIRTIO_BLK_T_ZONE_RESET; ··· 298 297 type = VIRTIO_BLK_T_ZONE_RESET_ALL; 299 298 break; 300 299 case REQ_OP_DRV_IN: 301 - /* Out header already filled in, nothing to do */ 300 + /* 301 + * Out header has already been prepared by the caller (virtblk_get_id() 302 + * or virtblk_submit_zone_report()), nothing to do here. 
303 + */ 302 304 return 0; 303 305 default: 304 306 WARN_ON_ONCE(1); ··· 322 318 return 0; 323 319 } 324 320 321 + /* 322 + * The status byte is always the last byte of the virtblk request 323 + * in-header. This helper fetches its value for all in-header formats 324 + * that are currently defined. 325 + */ 326 + static inline u8 virtblk_vbr_status(struct virtblk_req *vbr) 327 + { 328 + return *((u8 *)&vbr->in_hdr + vbr->in_hdr_len - 1); 329 + } 330 + 325 331 static inline void virtblk_request_done(struct request *req) 326 332 { 327 333 struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); 328 - blk_status_t status = virtblk_result(vbr->status); 334 + blk_status_t status = virtblk_result(virtblk_vbr_status(vbr)); 335 + struct virtio_blk *vblk = req->mq_hctx->queue->queuedata; 329 336 330 337 virtblk_unmap_data(req, vbr); 331 338 virtblk_cleanup_cmd(req); 332 339 333 340 if (req_op(req) == REQ_OP_ZONE_APPEND) 334 - req->__sector = le64_to_cpu(vbr->zone_append_in_hdr.append_sector); 341 + req->__sector = virtio64_to_cpu(vblk->vdev, 342 + vbr->in_hdr.zone_append.sector); 335 343 336 344 blk_mq_end_request(req, status); 337 345 } ··· 371 355 372 356 if (likely(!blk_should_fake_timeout(req->q)) && 373 357 !blk_mq_complete_request_remote(req) && 374 - !blk_mq_add_to_batch(req, iob, vbr->status, 358 + !blk_mq_add_to_batch(req, iob, virtblk_vbr_status(vbr), 375 359 virtblk_complete_batch)) 376 360 virtblk_request_done(req); 377 361 req_done++; ··· 566 550 #ifdef CONFIG_BLK_DEV_ZONED 567 551 static void *virtblk_alloc_report_buffer(struct virtio_blk *vblk, 568 552 unsigned int nr_zones, 569 - unsigned int zone_sectors, 570 553 size_t *buflen) 571 554 { 572 555 struct request_queue *q = vblk->disk->queue; ··· 573 558 void *buf; 574 559 575 560 nr_zones = min_t(unsigned int, nr_zones, 576 - get_capacity(vblk->disk) >> ilog2(zone_sectors)); 561 + get_capacity(vblk->disk) >> ilog2(vblk->zone_sectors)); 577 562 578 563 bufsize = sizeof(struct virtio_blk_zone_report) + 579 564 
nr_zones * sizeof(struct virtio_blk_zone_descriptor); ··· 607 592 return PTR_ERR(req); 608 593 609 594 vbr = blk_mq_rq_to_pdu(req); 610 - vbr->in_hdr_len = sizeof(vbr->status); 595 + vbr->in_hdr_len = sizeof(vbr->in_hdr.status); 611 596 vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_ZONE_REPORT); 612 597 vbr->out_hdr.sector = cpu_to_virtio64(vblk->vdev, sector); 613 598 ··· 616 601 goto out; 617 602 618 603 blk_execute_rq(req, false); 619 - err = blk_status_to_errno(virtblk_result(vbr->status)); 604 + err = blk_status_to_errno(virtblk_result(vbr->in_hdr.status)); 620 605 out: 621 606 blk_mq_free_request(req); 622 607 return err; ··· 624 609 625 610 static int virtblk_parse_zone(struct virtio_blk *vblk, 626 611 struct virtio_blk_zone_descriptor *entry, 627 - unsigned int idx, unsigned int zone_sectors, 628 - report_zones_cb cb, void *data) 612 + unsigned int idx, report_zones_cb cb, void *data) 629 613 { 630 614 struct blk_zone zone = { }; 631 615 632 - if (entry->z_type != VIRTIO_BLK_ZT_SWR && 633 - entry->z_type != VIRTIO_BLK_ZT_SWP && 634 - entry->z_type != VIRTIO_BLK_ZT_CONV) { 635 - dev_err(&vblk->vdev->dev, "invalid zone type %#x\n", 636 - entry->z_type); 637 - return -EINVAL; 616 + zone.start = virtio64_to_cpu(vblk->vdev, entry->z_start); 617 + if (zone.start + vblk->zone_sectors <= get_capacity(vblk->disk)) 618 + zone.len = vblk->zone_sectors; 619 + else 620 + zone.len = get_capacity(vblk->disk) - zone.start; 621 + zone.capacity = virtio64_to_cpu(vblk->vdev, entry->z_cap); 622 + zone.wp = virtio64_to_cpu(vblk->vdev, entry->z_wp); 623 + 624 + switch (entry->z_type) { 625 + case VIRTIO_BLK_ZT_SWR: 626 + zone.type = BLK_ZONE_TYPE_SEQWRITE_REQ; 627 + break; 628 + case VIRTIO_BLK_ZT_SWP: 629 + zone.type = BLK_ZONE_TYPE_SEQWRITE_PREF; 630 + break; 631 + case VIRTIO_BLK_ZT_CONV: 632 + zone.type = BLK_ZONE_TYPE_CONVENTIONAL; 633 + break; 634 + default: 635 + dev_err(&vblk->vdev->dev, "zone %llu: invalid type %#x\n", 636 + zone.start, entry->z_type); 
637 + return -EIO; 638 638 } 639 639 640 - zone.type = entry->z_type; 641 - zone.cond = entry->z_state; 642 - zone.len = zone_sectors; 643 - zone.capacity = le64_to_cpu(entry->z_cap); 644 - zone.start = le64_to_cpu(entry->z_start); 645 - if (zone.cond == BLK_ZONE_COND_FULL) 640 + switch (entry->z_state) { 641 + case VIRTIO_BLK_ZS_EMPTY: 642 + zone.cond = BLK_ZONE_COND_EMPTY; 643 + break; 644 + case VIRTIO_BLK_ZS_CLOSED: 645 + zone.cond = BLK_ZONE_COND_CLOSED; 646 + break; 647 + case VIRTIO_BLK_ZS_FULL: 648 + zone.cond = BLK_ZONE_COND_FULL; 646 649 zone.wp = zone.start + zone.len; 647 - else 648 - zone.wp = le64_to_cpu(entry->z_wp); 650 + break; 651 + case VIRTIO_BLK_ZS_EOPEN: 652 + zone.cond = BLK_ZONE_COND_EXP_OPEN; 653 + break; 654 + case VIRTIO_BLK_ZS_IOPEN: 655 + zone.cond = BLK_ZONE_COND_IMP_OPEN; 656 + break; 657 + case VIRTIO_BLK_ZS_NOT_WP: 658 + zone.cond = BLK_ZONE_COND_NOT_WP; 659 + break; 660 + case VIRTIO_BLK_ZS_RDONLY: 661 + zone.cond = BLK_ZONE_COND_READONLY; 662 + zone.wp = ULONG_MAX; 663 + break; 664 + case VIRTIO_BLK_ZS_OFFLINE: 665 + zone.cond = BLK_ZONE_COND_OFFLINE; 666 + zone.wp = ULONG_MAX; 667 + break; 668 + default: 669 + dev_err(&vblk->vdev->dev, "zone %llu: invalid condition %#x\n", 670 + zone.start, entry->z_state); 671 + return -EIO; 672 + } 649 673 674 + /* 675 + * The callback below checks the validity of the reported 676 + * entry data, no need to further validate it here. 
677 + */ 650 678 return cb(&zone, idx, data); 651 679 } 652 680 ··· 699 641 { 700 642 struct virtio_blk *vblk = disk->private_data; 701 643 struct virtio_blk_zone_report *report; 702 - unsigned int zone_sectors = vblk->zone_sectors; 703 - unsigned int nz, i; 704 - int ret, zone_idx = 0; 644 + unsigned long long nz, i; 705 645 size_t buflen; 646 + unsigned int zone_idx = 0; 647 + int ret; 706 648 707 649 if (WARN_ON_ONCE(!vblk->zone_sectors)) 708 650 return -EOPNOTSUPP; 709 651 710 - report = virtblk_alloc_report_buffer(vblk, nr_zones, 711 - zone_sectors, &buflen); 652 + report = virtblk_alloc_report_buffer(vblk, nr_zones, &buflen); 712 653 if (!report) 713 654 return -ENOMEM; 655 + 656 + mutex_lock(&vblk->vdev_mutex); 657 + 658 + if (!vblk->vdev) { 659 + ret = -ENXIO; 660 + goto fail_report; 661 + } 714 662 715 663 while (zone_idx < nr_zones && sector < get_capacity(vblk->disk)) { 716 664 memset(report, 0, buflen); 717 665 718 666 ret = virtblk_submit_zone_report(vblk, (char *)report, 719 667 buflen, sector); 720 - if (ret) { 721 - if (ret > 0) 722 - ret = -EIO; 723 - goto out_free; 724 - } 725 - nz = min((unsigned int)le64_to_cpu(report->nr_zones), nr_zones); 668 + if (ret) 669 + goto fail_report; 670 + 671 + nz = min_t(u64, virtio64_to_cpu(vblk->vdev, report->nr_zones), 672 + nr_zones); 726 673 if (!nz) 727 674 break; 728 675 729 676 for (i = 0; i < nz && zone_idx < nr_zones; i++) { 730 677 ret = virtblk_parse_zone(vblk, &report->zones[i], 731 - zone_idx, zone_sectors, cb, data); 678 + zone_idx, cb, data); 732 679 if (ret) 733 - goto out_free; 734 - sector = le64_to_cpu(report->zones[i].z_start) + zone_sectors; 680 + goto fail_report; 681 + 682 + sector = virtio64_to_cpu(vblk->vdev, 683 + report->zones[i].z_start) + 684 + vblk->zone_sectors; 735 685 zone_idx++; 736 686 } 737 687 } ··· 748 682 ret = zone_idx; 749 683 else 750 684 ret = -EINVAL; 751 - out_free: 685 + fail_report: 686 + mutex_unlock(&vblk->vdev_mutex); 752 687 kvfree(report); 753 688 return ret; 754 
689 } ··· 758 691 { 759 692 u8 model; 760 693 761 - if (!vblk->zone_sectors) 762 - return; 763 - 764 694 virtio_cread(vblk->vdev, struct virtio_blk_config, 765 695 zoned.model, &model); 766 - if (!blk_revalidate_disk_zones(vblk->disk, NULL)) 767 - set_capacity_and_notify(vblk->disk, 0); 696 + switch (model) { 697 + default: 698 + dev_err(&vblk->vdev->dev, "unknown zone model %d\n", model); 699 + fallthrough; 700 + case VIRTIO_BLK_Z_NONE: 701 + case VIRTIO_BLK_Z_HA: 702 + disk_set_zoned(vblk->disk, BLK_ZONED_NONE); 703 + return; 704 + case VIRTIO_BLK_Z_HM: 705 + WARN_ON_ONCE(!vblk->zone_sectors); 706 + if (!blk_revalidate_disk_zones(vblk->disk, NULL)) 707 + set_capacity_and_notify(vblk->disk, 0); 708 + } 768 709 } 769 710 770 711 static int virtblk_probe_zoned_device(struct virtio_device *vdev, 771 712 struct virtio_blk *vblk, 772 713 struct request_queue *q) 773 714 { 774 - u32 v; 715 + u32 v, wg; 775 716 u8 model; 776 717 int ret; 777 718 ··· 788 713 789 714 switch (model) { 790 715 case VIRTIO_BLK_Z_NONE: 716 + case VIRTIO_BLK_Z_HA: 717 + /* Present the host-aware device as non-zoned */ 791 718 return 0; 792 719 case VIRTIO_BLK_Z_HM: 793 720 break; 794 - case VIRTIO_BLK_Z_HA: 795 - /* 796 - * Present the host-aware device as a regular drive. 797 - * TODO It is possible to add an option to make it appear 798 - * in the system as a zoned drive. 
799 - */ 800 - return 0; 801 721 default: 802 722 dev_err(&vdev->dev, "unsupported zone model %d\n", model); 803 723 return -EINVAL; ··· 805 735 806 736 virtio_cread(vdev, struct virtio_blk_config, 807 737 zoned.max_open_zones, &v); 808 - disk_set_max_open_zones(vblk->disk, le32_to_cpu(v)); 809 - 810 - dev_dbg(&vdev->dev, "max open zones = %u\n", le32_to_cpu(v)); 738 + disk_set_max_open_zones(vblk->disk, v); 739 + dev_dbg(&vdev->dev, "max open zones = %u\n", v); 811 740 812 741 virtio_cread(vdev, struct virtio_blk_config, 813 742 zoned.max_active_zones, &v); 814 - disk_set_max_active_zones(vblk->disk, le32_to_cpu(v)); 815 - dev_dbg(&vdev->dev, "max active zones = %u\n", le32_to_cpu(v)); 743 + disk_set_max_active_zones(vblk->disk, v); 744 + dev_dbg(&vdev->dev, "max active zones = %u\n", v); 816 745 817 746 virtio_cread(vdev, struct virtio_blk_config, 818 - zoned.write_granularity, &v); 819 - if (!v) { 747 + zoned.write_granularity, &wg); 748 + if (!wg) { 820 749 dev_warn(&vdev->dev, "zero write granularity reported\n"); 821 750 return -ENODEV; 822 751 } 823 - blk_queue_physical_block_size(q, le32_to_cpu(v)); 824 - blk_queue_io_min(q, le32_to_cpu(v)); 752 + blk_queue_physical_block_size(q, wg); 753 + blk_queue_io_min(q, wg); 825 754 826 - dev_dbg(&vdev->dev, "write granularity = %u\n", le32_to_cpu(v)); 755 + dev_dbg(&vdev->dev, "write granularity = %u\n", wg); 827 756 828 757 /* 829 758 * virtio ZBD specification doesn't require zones to be a power of 830 759 * two sectors in size, but the code in this driver expects that. 
831 760 */ 832 - virtio_cread(vdev, struct virtio_blk_config, zoned.zone_sectors, &v); 833 - vblk->zone_sectors = le32_to_cpu(v); 761 + virtio_cread(vdev, struct virtio_blk_config, zoned.zone_sectors, 762 + &vblk->zone_sectors); 834 763 if (vblk->zone_sectors == 0 || !is_power_of_2(vblk->zone_sectors)) { 835 764 dev_err(&vdev->dev, 836 765 "zoned device with non power of two zone size %u\n", ··· 852 783 dev_warn(&vdev->dev, "zero max_append_sectors reported\n"); 853 784 return -ENODEV; 854 785 } 855 - blk_queue_max_zone_append_sectors(q, le32_to_cpu(v)); 856 - dev_dbg(&vdev->dev, "max append sectors = %u\n", le32_to_cpu(v)); 786 + if ((v << SECTOR_SHIFT) < wg) { 787 + dev_err(&vdev->dev, 788 + "write granularity %u exceeds max_append_sectors %u limit\n", 789 + wg, v); 790 + return -ENODEV; 791 + } 792 + 793 + blk_queue_max_zone_append_sectors(q, v); 794 + dev_dbg(&vdev->dev, "max append sectors = %u\n", v); 857 795 } 858 796 859 797 return ret; ··· 870 794 { 871 795 return virtio_has_feature(vdev, VIRTIO_BLK_F_ZONED); 872 796 } 797 + 873 798 #else 874 799 875 800 /* ··· 878 801 * We only need to define a few symbols to avoid compilation errors. 
879 802 */ 880 803 #define virtblk_report_zones NULL 804 + 881 805 static inline void virtblk_revalidate_zones(struct virtio_blk *vblk) 882 806 { 883 807 } 808 + 884 809 static inline int virtblk_probe_zoned_device(struct virtio_device *vdev, 885 810 struct virtio_blk *vblk, struct request_queue *q) 886 811 { ··· 910 831 return PTR_ERR(req); 911 832 912 833 vbr = blk_mq_rq_to_pdu(req); 913 - vbr->in_hdr_len = sizeof(vbr->status); 834 + vbr->in_hdr_len = sizeof(vbr->in_hdr.status); 914 835 vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_GET_ID); 915 836 vbr->out_hdr.sector = 0; 916 837 ··· 919 840 goto out; 920 841 921 842 blk_execute_rq(req, false); 922 - err = blk_status_to_errno(virtblk_result(vbr->status)); 843 + err = blk_status_to_errno(virtblk_result(vbr->in_hdr.status)); 923 844 out: 924 845 blk_mq_free_request(req); 925 846 return err; ··· 1582 1503 if (err) 1583 1504 goto out_cleanup_disk; 1584 1505 } 1585 - 1586 - dev_info(&vdev->dev, "blk config size: %zu\n", 1587 - sizeof(struct virtio_blk_config)); 1588 1506 1589 1507 err = device_add_disk(&vdev->dev, vblk->disk, virtblk_attr_groups); 1590 1508 if (err)
+9 -9
include/uapi/linux/virtio_blk.h
··· 140 140 141 141 /* Zoned block device characteristics (if VIRTIO_BLK_F_ZONED) */ 142 142 struct virtio_blk_zoned_characteristics { 143 - __le32 zone_sectors; 144 - __le32 max_open_zones; 145 - __le32 max_active_zones; 146 - __le32 max_append_sectors; 147 - __le32 write_granularity; 143 + __virtio32 zone_sectors; 144 + __virtio32 max_open_zones; 145 + __virtio32 max_active_zones; 146 + __virtio32 max_append_sectors; 147 + __virtio32 write_granularity; 148 148 __u8 model; 149 149 __u8 unused2[3]; 150 150 } zoned; ··· 241 241 */ 242 242 struct virtio_blk_zone_descriptor { 243 243 /* Zone capacity */ 244 - __le64 z_cap; 244 + __virtio64 z_cap; 245 245 /* The starting sector of the zone */ 246 - __le64 z_start; 246 + __virtio64 z_start; 247 247 /* Zone write pointer position in sectors */ 248 - __le64 z_wp; 248 + __virtio64 z_wp; 249 249 /* Zone type */ 250 250 __u8 z_type; 251 251 /* Zone state */ ··· 254 254 }; 255 255 256 256 struct virtio_blk_zone_report { 257 - __le64 nr_zones; 257 + __virtio64 nr_zones; 258 258 __u8 reserved[56]; 259 259 struct virtio_blk_zone_descriptor zones[]; 260 260 };