Merge tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost

+9 -8

drivers/block/virtio_blk.c

··· 21 21 #define VQ_NAME_LEN 16 22 22 #define MAX_DISCARD_SEGMENTS 256u 23 23 24 + /* The maximum number of sg elements that fit into a virtqueue */ 25 + #define VIRTIO_BLK_MAX_SG_ELEMS 32768 26 + 24 27 static int major; 25 28 static DEFINE_IDA(vd_index_ida); 26 29 ··· 450 447 /* Host must always specify the capacity. */ 451 448 virtio_cread(vdev, struct virtio_blk_config, capacity, &capacity); 452 449 453 - /* If capacity is too big, truncate with warning. */ 454 - if ((sector_t)capacity != capacity) { 455 - dev_warn(&vdev->dev, "Capacity %llu too large: truncating\n", 456 - (unsigned long long)capacity); 457 - capacity = (sector_t)-1; 458 - } 459 - 460 450 nblocks = DIV_ROUND_UP_ULL(capacity, queue_logical_block_size(q) >> 9); 461 451 462 452 string_get_size(nblocks, queue_logical_block_size(q), ··· 724 728 if (err || !sg_elems) 725 729 sg_elems = 1; 726 730 727 - /* We need an extra sg elements at head and tail. */ 731 + /* Prevent integer overflows and honor max vq size */ 732 + sg_elems = min_t(u32, sg_elems, VIRTIO_BLK_MAX_SG_ELEMS - 2); 733 + 734 + /* We need extra sg elements at head and tail. */ 728 735 sg_elems += 2; 729 736 vdev->priv = vblk = kmalloc(sizeof(*vblk), GFP_KERNEL); 730 737 if (!vblk) { ··· 935 936 blk_mq_quiesce_queue(vblk->disk->queue); 936 937 937 938 vdev->config->del_vqs(vdev); 939 + kfree(vblk->vqs); 940 + 938 941 return 0; 939 942 } 940 943

+2 -2

drivers/char/virtio_console.c

··· 475 475 476 476 buf = virtqueue_get_buf(port->in_vq, &len); 477 477 if (buf) { 478 - buf->len = len; 478 + buf->len = min_t(size_t, len, buf->size); 479 479 buf->offset = 0; 480 480 port->stats.bytes_received += len; 481 481 } ··· 1709 1709 while ((buf = virtqueue_get_buf(vq, &len))) { 1710 1710 spin_unlock(&portdev->c_ivq_lock); 1711 1711 1712 - buf->len = len; 1712 + buf->len = min_t(size_t, len, buf->size); 1713 1713 buf->offset = 0; 1714 1714 1715 1715 handle_control_message(vq->vdev, portdev, buf);

+42 -11

drivers/net/virtio_net.c

··· 1516 1516 return; 1517 1517 1518 1518 if (__netif_tx_trylock(txq)) { 1519 - free_old_xmit_skbs(sq, true); 1519 + do { 1520 + virtqueue_disable_cb(sq->vq); 1521 + free_old_xmit_skbs(sq, true); 1522 + } while (unlikely(!virtqueue_enable_cb_delayed(sq->vq))); 1523 + 1524 + if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) 1525 + netif_tx_wake_queue(txq); 1526 + 1520 1527 __netif_tx_unlock(txq); 1521 1528 } 1522 - 1523 - if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) 1524 - netif_tx_wake_queue(txq); 1525 1529 } 1526 1530 1527 1531 static int virtnet_poll(struct napi_struct *napi, int budget) ··· 1596 1592 struct virtnet_info *vi = sq->vq->vdev->priv; 1597 1593 unsigned int index = vq2txq(sq->vq); 1598 1594 struct netdev_queue *txq; 1595 + int opaque; 1596 + bool done; 1599 1597 1600 1598 if (unlikely(is_xdp_raw_buffer_queue(vi, index))) { 1601 1599 /* We don't need to enable cb for XDP */ ··· 1607 1601 1608 1602 txq = netdev_get_tx_queue(vi->dev, index); 1609 1603 __netif_tx_lock(txq, raw_smp_processor_id()); 1604 + virtqueue_disable_cb(sq->vq); 1610 1605 free_old_xmit_skbs(sq, true); 1611 - __netif_tx_unlock(txq); 1612 - 1613 - virtqueue_napi_complete(napi, sq->vq, 0); 1614 1606 1615 1607 if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) 1616 1608 netif_tx_wake_queue(txq); 1609 + 1610 + opaque = virtqueue_enable_cb_prepare(sq->vq); 1611 + 1612 + done = napi_complete_done(napi, 0); 1613 + 1614 + if (!done) 1615 + virtqueue_disable_cb(sq->vq); 1616 + 1617 + __netif_tx_unlock(txq); 1618 + 1619 + if (done) { 1620 + if (unlikely(virtqueue_poll(sq->vq, opaque))) { 1621 + if (napi_schedule_prep(napi)) { 1622 + __netif_tx_lock(txq, raw_smp_processor_id()); 1623 + virtqueue_disable_cb(sq->vq); 1624 + __netif_tx_unlock(txq); 1625 + __napi_schedule(napi); 1626 + } 1627 + } 1628 + } 1617 1629 1618 1630 return 0; 1619 1631 } ··· 1694 1670 bool use_napi = sq->napi.weight; 1695 1671 1696 1672 /* Free up any pending old buffers before queueing new ones. */ 1697 - free_old_xmit_skbs(sq, false); 1673 + do { 1674 + if (use_napi) 1675 + virtqueue_disable_cb(sq->vq); 1698 1676 1699 - if (use_napi && kick) 1700 - virtqueue_enable_cb_delayed(sq->vq); 1677 + free_old_xmit_skbs(sq, false); 1678 + 1679 + } while (use_napi && kick && 1680 + unlikely(!virtqueue_enable_cb_delayed(sq->vq))); 1701 1681 1702 1682 /* timestamp packet in software */ 1703 1683 skb_tx_timestamp(skb); ··· 3338 3310 virtnet_set_queues(vi, vi->curr_queue_pairs); 3339 3311 3340 3312 err = virtnet_cpu_notif_add(vi); 3341 - if (err) 3313 + if (err) { 3314 + virtnet_freeze_down(vdev); 3315 + remove_vq_common(vi); 3342 3316 return err; 3317 + } 3343 3318 3344 3319 return 0; 3345 3320 }

+4

drivers/vdpa/ifcvf/ifcvf_base.c

··· 133 133 &hw->notify_off_multiplier); 134 134 hw->notify_bar = cap.bar; 135 135 hw->notify_base = get_cap_addr(hw, &cap); 136 + hw->notify_base_pa = pci_resource_start(pdev, cap.bar) + 137 + le32_to_cpu(cap.offset); 136 138 IFCVF_DBG(pdev, "hw->notify_base = %p\n", 137 139 hw->notify_base); 138 140 break; ··· 162 160 ifc_iowrite16(i, &hw->common_cfg->queue_select); 163 161 notify_off = ifc_ioread16(&hw->common_cfg->queue_notify_off); 164 162 hw->vring[i].notify_addr = hw->notify_base + 163 + notify_off * hw->notify_off_multiplier; 164 + hw->vring[i].notify_pa = hw->notify_base_pa + 165 165 notify_off * hw->notify_off_multiplier; 166 166 } 167 167

+2 -12

drivers/vdpa/ifcvf/ifcvf_base.h

··· 19 19 #include <uapi/linux/virtio_config.h> 20 20 #include <uapi/linux/virtio_pci.h> 21 21 22 - #define N3000_VENDOR_ID 0x1AF4 23 22 #define N3000_DEVICE_ID 0x1041 24 - #define N3000_SUBSYS_VENDOR_ID 0x8086 25 23 #define N3000_SUBSYS_DEVICE_ID 0x001A 26 - 27 - #define C5000X_PL_VENDOR_ID 0x1AF4 28 - #define C5000X_PL_DEVICE_ID 0x1000 29 - #define C5000X_PL_SUBSYS_VENDOR_ID 0x8086 30 - #define C5000X_PL_SUBSYS_DEVICE_ID 0x0001 31 - 32 - #define C5000X_PL_BLK_VENDOR_ID 0x1AF4 33 - #define C5000X_PL_BLK_DEVICE_ID 0x1001 34 - #define C5000X_PL_BLK_SUBSYS_VENDOR_ID 0x8086 35 - #define C5000X_PL_BLK_SUBSYS_DEVICE_ID 0x0002 36 24 37 25 #define IFCVF_NET_SUPPORTED_FEATURES \ 38 26 ((1ULL << VIRTIO_NET_F_MAC) | \ ··· 61 73 u16 last_avail_idx; 62 74 bool ready; 63 75 void __iomem *notify_addr; 76 + phys_addr_t notify_pa; 64 77 u32 irq; 65 78 struct vdpa_callback cb; 66 79 char msix_name[256]; ··· 76 87 u8 notify_bar; 77 88 /* Notificaiton bar address */ 78 89 void __iomem *notify_base; 90 + phys_addr_t notify_base_pa; 79 91 u32 notify_off_multiplier; 80 92 u64 req_features; 81 93 u64 hw_features;

+31 -12

drivers/vdpa/ifcvf/ifcvf_main.c

··· 264 264 { 265 265 struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev); 266 266 267 - state->avail_index = ifcvf_get_vq_state(vf, qid); 267 + state->split.avail_index = ifcvf_get_vq_state(vf, qid); 268 268 return 0; 269 269 } 270 270 ··· 273 273 { 274 274 struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev); 275 275 276 - return ifcvf_set_vq_state(vf, qid, state->avail_index); 276 + return ifcvf_set_vq_state(vf, qid, state->split.avail_index); 277 277 } 278 278 279 279 static void ifcvf_vdpa_set_vq_cb(struct vdpa_device *vdpa_dev, u16 qid, ··· 413 413 return vf->vring[qid].irq; 414 414 } 415 415 416 + static struct vdpa_notification_area ifcvf_get_vq_notification(struct vdpa_device *vdpa_dev, 417 + u16 idx) 418 + { 419 + struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev); 420 + struct vdpa_notification_area area; 421 + 422 + area.addr = vf->vring[idx].notify_pa; 423 + if (!vf->notify_off_multiplier) 424 + area.size = PAGE_SIZE; 425 + else 426 + area.size = vf->notify_off_multiplier; 427 + 428 + return area; 429 + } 430 + 416 431 /* 417 432 * IFCVF currently does't have on-chip IOMMU, so not 418 433 * implemented set_map()/dma_map()/dma_unmap() ··· 455 440 .get_config = ifcvf_vdpa_get_config, 456 441 .set_config = ifcvf_vdpa_set_config, 457 442 .set_config_cb = ifcvf_vdpa_set_config_cb, 443 + .get_vq_notification = ifcvf_get_vq_notification, 458 444 }; 459 445 460 446 static int ifcvf_probe(struct pci_dev *pdev, const struct pci_device_id *id) ··· 552 536 } 553 537 554 538 static struct pci_device_id ifcvf_pci_ids[] = { 555 - { PCI_DEVICE_SUB(N3000_VENDOR_ID, 539 + /* N3000 network device */ 540 + { PCI_DEVICE_SUB(PCI_VENDOR_ID_REDHAT_QUMRANET, 556 541 N3000_DEVICE_ID, 557 - N3000_SUBSYS_VENDOR_ID, 542 + PCI_VENDOR_ID_INTEL, 558 543 N3000_SUBSYS_DEVICE_ID) }, 559 - { PCI_DEVICE_SUB(C5000X_PL_VENDOR_ID, 560 - C5000X_PL_DEVICE_ID, 561 - C5000X_PL_SUBSYS_VENDOR_ID, 562 - C5000X_PL_SUBSYS_DEVICE_ID) }, 563 - { PCI_DEVICE_SUB(C5000X_PL_BLK_VENDOR_ID, 564 - C5000X_PL_BLK_DEVICE_ID, 565 - C5000X_PL_BLK_SUBSYS_VENDOR_ID, 566 - C5000X_PL_BLK_SUBSYS_DEVICE_ID) }, 544 + /* C5000X-PL network device */ 545 + { PCI_DEVICE_SUB(PCI_VENDOR_ID_REDHAT_QUMRANET, 546 + VIRTIO_TRANS_ID_NET, 547 + PCI_VENDOR_ID_INTEL, 548 + VIRTIO_ID_NET) }, 549 + /* C5000X-PL block device */ 550 + { PCI_DEVICE_SUB(PCI_VENDOR_ID_REDHAT_QUMRANET, 551 + VIRTIO_TRANS_ID_BLOCK, 552 + PCI_VENDOR_ID_INTEL, 553 + VIRTIO_ID_BLOCK) }, 567 554 568 555 { 0 }, 569 556 };

+2

drivers/vdpa/mlx5/core/mlx5_vdpa.h

··· 35 35 36 36 /* serialize mkey creation and destruction */ 37 37 struct mutex mkey_mtx; 38 + bool user_mr; 38 39 }; 39 40 40 41 struct mlx5_vdpa_resources { 41 42 u32 pdn; 42 43 struct mlx5_uars_page *uar; 43 44 void __iomem *kick_addr; 45 + u64 phys_kick_addr; 44 46 u16 uid; 45 47 u32 null_mkey; 46 48 bool valid;

+75 -22

drivers/vdpa/mlx5/core/mr.c

··· 219 219 mlx5_vdpa_destroy_mkey(mvdev, &mkey->mkey); 220 220 } 221 221 222 - static struct device *get_dma_device(struct mlx5_vdpa_dev *mvdev) 223 - { 224 - return &mvdev->mdev->pdev->dev; 225 - } 226 - 227 222 static int map_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct_mr *mr, 228 223 struct vhost_iotlb *iotlb) 229 224 { ··· 234 239 u64 pa; 235 240 u64 paend; 236 241 struct scatterlist *sg; 237 - struct device *dma = get_dma_device(mvdev); 242 + struct device *dma = mvdev->vdev.dma_dev; 238 243 239 244 for (map = vhost_iotlb_itree_first(iotlb, mr->start, mr->end - 1); 240 245 map; map = vhost_iotlb_itree_next(map, start, mr->end - 1)) { ··· 293 298 294 299 static void unmap_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct_mr *mr) 295 300 { 296 - struct device *dma = get_dma_device(mvdev); 301 + struct device *dma = mvdev->vdev.dma_dev; 297 302 298 303 destroy_direct_mr(mvdev, mr); 299 304 dma_unmap_sg_attrs(dma, mr->sg_head.sgl, mr->nsg, DMA_BIDIRECTIONAL, 0); ··· 355 360 * indirect memory key that provides access to the enitre address space given 356 361 * by iotlb. 357 362 */ 358 - static int _mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb) 363 + static int create_user_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb) 359 364 { 360 365 struct mlx5_vdpa_mr *mr = &mvdev->mr; 361 366 struct mlx5_vdpa_direct_mr *dmr; ··· 368 373 u64 start = 0; 369 374 int err = 0; 370 375 int nnuls; 371 - 372 - if (mr->initialized) 373 - return 0; 374 376 375 377 INIT_LIST_HEAD(&mr->head); 376 378 for (map = vhost_iotlb_itree_first(iotlb, start, last); map; ··· 406 414 if (err) 407 415 goto err_chain; 408 416 409 - mr->initialized = true; 417 + mr->user_mr = true; 410 418 return 0; 411 419 412 420 err_chain: ··· 418 426 return err; 419 427 } 420 428 421 - int mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb) 429 + static int create_dma_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mr) 430 + { 431 + int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 432 + void *mkc; 433 + u32 *in; 434 + int err; 435 + 436 + in = kzalloc(inlen, GFP_KERNEL); 437 + if (!in) 438 + return -ENOMEM; 439 + 440 + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 441 + 442 + MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA); 443 + MLX5_SET(mkc, mkc, length64, 1); 444 + MLX5_SET(mkc, mkc, lw, 1); 445 + MLX5_SET(mkc, mkc, lr, 1); 446 + MLX5_SET(mkc, mkc, pd, mvdev->res.pdn); 447 + MLX5_SET(mkc, mkc, qpn, 0xffffff); 448 + 449 + err = mlx5_vdpa_create_mkey(mvdev, &mr->mkey, in, inlen); 450 + if (!err) 451 + mr->user_mr = false; 452 + 453 + kfree(in); 454 + return err; 455 + } 456 + 457 + static void destroy_dma_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mr) 458 + { 459 + mlx5_vdpa_destroy_mkey(mvdev, &mr->mkey); 460 + } 461 + 462 + static int _mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb) 422 463 { 423 464 struct mlx5_vdpa_mr *mr = &mvdev->mr; 424 465 int err; 425 466 426 - mutex_lock(&mr->mkey_mtx); 427 - err = _mlx5_vdpa_create_mr(mvdev, iotlb); 428 - mutex_unlock(&mr->mkey_mtx); 467 + if (mr->initialized) 468 + return 0; 469 + 470 + if (iotlb) 471 + err = create_user_mr(mvdev, iotlb); 472 + else 473 + err = create_dma_mr(mvdev, mr); 474 + 475 + if (!err) 476 + mr->initialized = true; 477 + 429 478 return err; 430 479 } 431 480 432 - void mlx5_vdpa_destroy_mr(struct mlx5_vdpa_dev *mvdev) 481 + int mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb) 433 482 { 434 - struct mlx5_vdpa_mr *mr = &mvdev->mr; 483 + int err; 484 + 485 + mutex_lock(&mvdev->mr.mkey_mtx); 486 + err = _mlx5_vdpa_create_mr(mvdev, iotlb); 487 + mutex_unlock(&mvdev->mr.mkey_mtx); 488 + return err; 489 + } 490 + 491 + static void destroy_user_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mr) 492 + { 435 493 struct mlx5_vdpa_direct_mr *dmr; 436 494 struct mlx5_vdpa_direct_mr *n; 437 - 438 - mutex_lock(&mr->mkey_mtx); 439 - if (!mr->initialized) 440 - goto out; 441 495 442 496 destroy_indirect_key(mvdev, mr); 443 497 list_for_each_entry_safe_reverse(dmr, n, &mr->head, list) { ··· 491 453 unmap_direct_mr(mvdev, dmr); 492 454 kfree(dmr); 493 455 } 456 + } 457 + 458 + void mlx5_vdpa_destroy_mr(struct mlx5_vdpa_dev *mvdev) 459 + { 460 + struct mlx5_vdpa_mr *mr = &mvdev->mr; 461 + 462 + mutex_lock(&mr->mkey_mtx); 463 + if (!mr->initialized) 464 + goto out; 465 + 466 + if (mr->user_mr) 467 + destroy_user_mr(mvdev, mr); 468 + else 469 + destroy_dma_mr(mvdev, mr); 470 + 494 471 memset(mr, 0, sizeof(*mr)); 495 472 mr->initialized = false; 496 473 out:

+7

drivers/vdpa/mlx5/core/resources.c

··· 54 54 void *in; 55 55 int err; 56 56 57 + if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) 58 + return 0; 59 + 57 60 /* 0 means not supported */ 58 61 if (!MLX5_CAP_GEN(mvdev->mdev, log_max_uctx)) 59 62 return -EOPNOTSUPP; ··· 81 78 { 82 79 u32 out[MLX5_ST_SZ_DW(destroy_uctx_out)] = {}; 83 80 u32 in[MLX5_ST_SZ_DW(destroy_uctx_in)] = {}; 81 + 82 + if (!uid) 83 + return; 84 84 85 85 MLX5_SET(destroy_uctx_in, in, opcode, MLX5_CMD_OP_DESTROY_UCTX); 86 86 MLX5_SET(destroy_uctx_in, in, uid, uid); ··· 253 247 goto err_key; 254 248 255 249 kick_addr = mdev->bar_addr + offset; 250 + res->phys_kick_addr = kick_addr; 256 251 257 252 res->kick_addr = ioremap(kick_addr, PAGE_SIZE); 258 253 if (!res->kick_addr) {

+49 -18

drivers/vdpa/mlx5/net/mlx5_vnet.c

··· 611 611 mlx5_db_free(ndev->mvdev.mdev, &vcq->db); 612 612 } 613 613 614 - static int umem_size(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num, 615 - struct mlx5_vdpa_umem **umemp) 614 + static void set_umem_size(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num, 615 + struct mlx5_vdpa_umem **umemp) 616 616 { 617 617 struct mlx5_core_dev *mdev = ndev->mvdev.mdev; 618 618 int p_a; ··· 635 635 *umemp = &mvq->umem3; 636 636 break; 637 637 } 638 - return p_a * mvq->num_ent + p_b; 638 + (*umemp)->size = p_a * mvq->num_ent + p_b; 639 639 } 640 640 641 641 static void umem_frag_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_umem *umem) ··· 651 651 void *in; 652 652 int err; 653 653 __be64 *pas; 654 - int size; 655 654 struct mlx5_vdpa_umem *umem; 656 655 657 - size = umem_size(ndev, mvq, num, &umem); 658 - if (size < 0) 659 - return size; 660 - 661 - umem->size = size; 662 - err = umem_frag_buf_alloc(ndev, umem, size); 656 + set_umem_size(ndev, mvq, num, &umem); 657 + err = umem_frag_buf_alloc(ndev, umem, umem->size); 663 658 if (err) 664 659 return err; 665 660 ··· 824 829 MLX5_SET(virtio_q, vq_ctx, umem_1_id, mvq->umem1.id); 825 830 MLX5_SET(virtio_q, vq_ctx, umem_1_size, mvq->umem1.size); 826 831 MLX5_SET(virtio_q, vq_ctx, umem_2_id, mvq->umem2.id); 827 - MLX5_SET(virtio_q, vq_ctx, umem_2_size, mvq->umem1.size); 832 + MLX5_SET(virtio_q, vq_ctx, umem_2_size, mvq->umem2.size); 828 833 MLX5_SET(virtio_q, vq_ctx, umem_3_id, mvq->umem3.id); 829 - MLX5_SET(virtio_q, vq_ctx, umem_3_size, mvq->umem1.size); 834 + MLX5_SET(virtio_q, vq_ctx, umem_3_size, mvq->umem3.size); 830 835 MLX5_SET(virtio_q, vq_ctx, pd, ndev->mvdev.res.pdn); 831 836 if (MLX5_CAP_DEV_VDPA_EMULATION(ndev->mvdev.mdev, eth_frame_offload_type)) 832 837 MLX5_SET(virtio_q, vq_ctx, virtio_version_1_0, 1); ··· 1423 1428 return -EINVAL; 1424 1429 } 1425 1430 1426 - mvq->used_idx = state->avail_index; 1427 - mvq->avail_idx = state->avail_index; 1431 + mvq->used_idx = state->split.avail_index; 1432 + mvq->avail_idx = state->split.avail_index; 1428 1433 return 0; 1429 1434 } 1430 1435 ··· 1445 1450 * Since both values should be identical, we take the value of 1446 1451 * used_idx which is reported correctly. 1447 1452 */ 1448 - state->avail_index = mvq->used_idx; 1453 + state->split.avail_index = mvq->used_idx; 1449 1454 return 0; 1450 1455 } 1451 1456 ··· 1454 1459 mlx5_vdpa_warn(mvdev, "failed to query virtqueue\n"); 1455 1460 return err; 1456 1461 } 1457 - state->avail_index = attr.used_index; 1462 + state->split.avail_index = attr.used_index; 1458 1463 return 0; 1459 1464 } 1460 1465 ··· 1767 1772 mutex_unlock(&ndev->reslock); 1768 1773 } 1769 1774 1775 + static void clear_vqs_ready(struct mlx5_vdpa_net *ndev) 1776 + { 1777 + int i; 1778 + 1779 + for (i = 0; i < ndev->mvdev.max_vqs; i++) 1780 + ndev->vqs[i].ready = false; 1781 + } 1782 + 1770 1783 static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status) 1771 1784 { 1772 1785 struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); ··· 1785 1782 if (!status) { 1786 1783 mlx5_vdpa_info(mvdev, "performing device reset\n"); 1787 1784 teardown_driver(ndev); 1785 + clear_vqs_ready(ndev); 1788 1786 mlx5_vdpa_destroy_mr(&ndev->mvdev); 1789 1787 ndev->mvdev.status = 0; 1790 1788 ndev->mvdev.mlx_features = 0; 1791 1789 ++mvdev->generation; 1790 + if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) { 1791 + if (mlx5_vdpa_create_mr(mvdev, NULL)) 1792 + mlx5_vdpa_warn(mvdev, "create MR failed\n"); 1793 + } 1792 1794 return; 1793 1795 } 1794 1796 ··· 1874 1866 ndev = to_mlx5_vdpa_ndev(mvdev); 1875 1867 1876 1868 free_resources(ndev); 1869 + mlx5_vdpa_destroy_mr(mvdev); 1877 1870 if (!is_zero_ether_addr(ndev->config.mac)) { 1878 1871 pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev)); 1879 1872 mlx5_mpfs_del_mac(pfmdev, ndev->config.mac); ··· 1885 1876 1886 1877 static struct vdpa_notification_area mlx5_get_vq_notification(struct vdpa_device *vdev, u16 idx) 1887 1878 { 1879 + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); 1888 1880 struct vdpa_notification_area ret = {}; 1881 + struct mlx5_vdpa_net *ndev; 1882 + phys_addr_t addr; 1889 1883 1884 + /* If SF BAR size is smaller than PAGE_SIZE, do not use direct 1885 + * notification to avoid the risk of mapping pages that contain BAR of more 1886 + * than one SF 1887 + */ 1888 + if (MLX5_CAP_GEN(mvdev->mdev, log_min_sf_size) + 12 < PAGE_SHIFT) 1889 + return ret; 1890 + 1891 + ndev = to_mlx5_vdpa_ndev(mvdev); 1892 + addr = (phys_addr_t)ndev->mvdev.res.phys_kick_addr; 1893 + ret.addr = addr; 1894 + ret.size = PAGE_SIZE; 1890 1895 return ret; 1891 1896 } 1892 1897 ··· 2060 2037 goto err_mtu; 2061 2038 } 2062 2039 2063 - mvdev->vdev.dma_dev = mdev->device; 2040 + mvdev->vdev.dma_dev = &mdev->pdev->dev; 2064 2041 err = mlx5_vdpa_alloc_resources(&ndev->mvdev); 2065 2042 if (err) 2066 2043 goto err_mpfs; 2067 2044 2045 + if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) { 2046 + err = mlx5_vdpa_create_mr(mvdev, NULL); 2047 + if (err) 2048 + goto err_res; 2049 + } 2050 + 2068 2051 err = alloc_resources(ndev); 2069 2052 if (err) 2070 - goto err_res; 2053 + goto err_mr; 2071 2054 2072 2055 mvdev->vdev.mdev = &mgtdev->mgtdev; 2073 2056 err = _vdpa_register_device(&mvdev->vdev, 2 * mlx5_vdpa_max_qps(max_vqs)); ··· 2085 2056 2086 2057 err_reg: 2087 2058 free_resources(ndev); 2059 + err_mr: 2060 + mlx5_vdpa_destroy_mr(mvdev); 2088 2061 err_res: 2089 2062 mlx5_vdpa_free_resources(&ndev->mvdev); 2090 2063 err_mpfs:

+2 -2

drivers/vdpa/vdpa_sim/vdpa_sim.c

··· 374 374 struct vringh *vrh = &vq->vring; 375 375 376 376 spin_lock(&vdpasim->lock); 377 - vrh->last_avail_idx = state->avail_index; 377 + vrh->last_avail_idx = state->split.avail_index; 378 378 spin_unlock(&vdpasim->lock); 379 379 380 380 return 0; ··· 387 387 struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx]; 388 388 struct vringh *vrh = &vq->vring; 389 389 390 - state->avail_index = vrh->last_avail_idx; 390 + state->split.avail_index = vrh->last_avail_idx; 391 391 return 0; 392 392 } 393 393

-1

drivers/vdpa/vdpa_sim/vdpa_sim_blk.c

··· 15 15 #include <linux/blkdev.h> 16 16 #include <linux/vringh.h> 17 17 #include <linux/vdpa.h> 18 - #include <linux/blkdev.h> 19 18 #include <uapi/linux/virtio_blk.h> 20 19 21 20 #include "vdpa_sim.h"

+40 -3

drivers/vdpa/virtio_pci/vp_vdpa.c

··· 210 210 return -EOPNOTSUPP; 211 211 } 212 212 213 + static int vp_vdpa_set_vq_state_split(struct vdpa_device *vdpa, 214 + const struct vdpa_vq_state *state) 215 + { 216 + const struct vdpa_vq_state_split *split = &state->split; 217 + 218 + if (split->avail_index == 0) 219 + return 0; 220 + 221 + return -EOPNOTSUPP; 222 + } 223 + 224 + static int vp_vdpa_set_vq_state_packed(struct vdpa_device *vdpa, 225 + const struct vdpa_vq_state *state) 226 + { 227 + const struct vdpa_vq_state_packed *packed = &state->packed; 228 + 229 + if (packed->last_avail_counter == 1 && 230 + packed->last_avail_idx == 0 && 231 + packed->last_used_counter == 1 && 232 + packed->last_used_idx == 0) 233 + return 0; 234 + 235 + return -EOPNOTSUPP; 236 + } 237 + 213 238 static int vp_vdpa_set_vq_state(struct vdpa_device *vdpa, u16 qid, 214 239 const struct vdpa_vq_state *state) 215 240 { 216 - /* Note that this is not supported by virtio specification, so 217 - * we return -ENOPOTSUPP here. This means we can't support live 218 - * migration, vhost device start/stop. 241 + struct virtio_pci_modern_device *mdev = vdpa_to_mdev(vdpa); 242 + 243 + /* Note that this is not supported by virtio specification. 244 + * But if the state is by chance equal to the device initial 245 + * state, we can let it go. 219 246 */ 247 + if ((vp_modern_get_status(mdev) & VIRTIO_CONFIG_S_FEATURES_OK) && 248 + !vp_modern_get_queue_enable(mdev, qid)) { 249 + if (vp_modern_get_driver_features(mdev) & 250 + BIT_ULL(VIRTIO_F_RING_PACKED)) 251 + return vp_vdpa_set_vq_state_packed(vdpa, state); 252 + else 253 + return vp_vdpa_set_vq_state_split(vdpa, state); 254 + } 255 + 220 256 return -EOPNOTSUPP; 221 257 } 222 258 ··· 478 442 vp_modern_map_vq_notify(mdev, i, 479 443 &vp_vdpa->vring[i].notify_pa); 480 444 if (!vp_vdpa->vring[i].notify) { 445 + ret = -EINVAL; 481 446 dev_warn(&pdev->dev, "Fail to map vq notify %d\n", i); 482 447 goto err; 483 448 }

+1 -1

drivers/vhost/iotlb.c

··· 83 83 EXPORT_SYMBOL_GPL(vhost_iotlb_add_range); 84 84 85 85 /** 86 - * vring_iotlb_del_range - delete overlapped ranges from vhost IOTLB 86 + * vhost_iotlb_del_range - delete overlapped ranges from vhost IOTLB 87 87 * @iotlb: the IOTLB 88 88 * @start: start of the IOVA range 89 89 * @last: last of IOVA range

+7 -14

drivers/vhost/scsi.c

··· 1430 1430 vhost_scsi_handle_vq(vs, vq); 1431 1431 } 1432 1432 1433 - static void vhost_scsi_flush_vq(struct vhost_scsi *vs, int index) 1434 - { 1435 - vhost_poll_flush(&vs->vqs[index].vq.poll); 1436 - } 1437 - 1438 1433 /* Callers must hold dev mutex */ 1439 1434 static void vhost_scsi_flush(struct vhost_scsi *vs) 1440 1435 { ··· 1448 1453 kref_put(&old_inflight[i]->kref, vhost_scsi_done_inflight); 1449 1454 1450 1455 /* Flush both the vhost poll and vhost work */ 1451 - for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) 1452 - vhost_scsi_flush_vq(vs, i); 1453 - vhost_work_flush(&vs->dev, &vs->vs_completion_work); 1454 - vhost_work_flush(&vs->dev, &vs->vs_event_work); 1456 + vhost_work_dev_flush(&vs->dev); 1455 1457 1456 1458 /* Wait for all reqs issued before the flush to be finished */ 1457 1459 for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) ··· 1732 1740 mutex_lock(&vq->mutex); 1733 1741 vhost_vq_set_backend(vq, NULL); 1734 1742 mutex_unlock(&vq->mutex); 1735 - /* 1736 - * Make sure cmds are not running before tearing them 1737 - * down. 1738 - */ 1739 - vhost_scsi_flush(vs); 1743 + } 1744 + /* Make sure cmds are not running before tearing them down. */ 1745 + vhost_scsi_flush(vs); 1746 + 1747 + for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) { 1748 + vq = &vs->vqs[i].vq; 1740 1749 vhost_scsi_destroy_vq_cmds(vq); 1741 1750 } 1742 1751 }

+2 -2

drivers/vhost/vdpa.c

··· 383 383 if (r) 384 384 return r; 385 385 386 - vq->last_avail_idx = vq_state.avail_index; 386 + vq->last_avail_idx = vq_state.split.avail_index; 387 387 break; 388 388 } 389 389 ··· 401 401 break; 402 402 403 403 case VHOST_SET_VRING_BASE: 404 - vq_state.avail_index = vq->last_avail_idx; 404 + vq_state.split.avail_index = vq->last_avail_idx; 405 405 if (ops->set_vq_state(vdpa, idx, &vq_state)) 406 406 r = -EINVAL; 407 407 break;

+4 -4

drivers/vhost/vhost.c

··· 231 231 } 232 232 EXPORT_SYMBOL_GPL(vhost_poll_stop); 233 233 234 - void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work) 234 + void vhost_work_dev_flush(struct vhost_dev *dev) 235 235 { 236 236 struct vhost_flush_struct flush; 237 237 ··· 243 243 wait_for_completion(&flush.wait_event); 244 244 } 245 245 } 246 - EXPORT_SYMBOL_GPL(vhost_work_flush); 246 + EXPORT_SYMBOL_GPL(vhost_work_dev_flush); 247 247 248 248 /* Flush any work that has been scheduled. When calling this, don't hold any 249 249 * locks that are also used by the callback. */ 250 250 void vhost_poll_flush(struct vhost_poll *poll) 251 251 { 252 - vhost_work_flush(poll->dev, &poll->work); 252 + vhost_work_dev_flush(poll->dev); 253 253 } 254 254 EXPORT_SYMBOL_GPL(vhost_poll_flush); 255 255 ··· 538 538 attach.owner = current; 539 539 vhost_work_init(&attach.work, vhost_attach_cgroups_work); 540 540 vhost_work_queue(dev, &attach.work); 541 - vhost_work_flush(dev, &attach.work); 541 + vhost_work_dev_flush(dev); 542 542 return attach.ret; 543 543 } 544 544

+10 -11

drivers/vhost/vhost.h

··· 20 20 21 21 #define VHOST_WORK_QUEUED 1 22 22 struct vhost_work { 23 - struct llist_node node; 24 - vhost_work_fn_t fn; 25 - unsigned long flags; 23 + struct llist_node node; 24 + vhost_work_fn_t fn; 25 + unsigned long flags; 26 26 }; 27 27 28 28 /* Poll a file (eventfd or socket) */ 29 29 /* Note: there's nothing vhost specific about this structure. */ 30 30 struct vhost_poll { 31 - poll_table table; 32 - wait_queue_head_t *wqh; 33 - wait_queue_entry_t wait; 34 - struct vhost_work work; 35 - __poll_t mask; 36 - struct vhost_dev *dev; 31 + poll_table table; 32 + wait_queue_head_t *wqh; 33 + wait_queue_entry_t wait; 34 + struct vhost_work work; 35 + __poll_t mask; 36 + struct vhost_dev *dev; 37 37 }; 38 38 39 39 void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn); ··· 46 46 void vhost_poll_stop(struct vhost_poll *poll); 47 47 void vhost_poll_flush(struct vhost_poll *poll); 48 48 void vhost_poll_queue(struct vhost_poll *poll); 49 - void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work); 50 - long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp); 49 + void vhost_work_dev_flush(struct vhost_dev *dev); 51 50 52 51 struct vhost_log { 53 52 u64 addr;

+1 -1

drivers/vhost/vsock.c

··· 708 708 for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) 709 709 if (vsock->vqs[i].handle_kick) 710 710 vhost_poll_flush(&vsock->vqs[i].poll); 711 - vhost_work_flush(&vsock->dev, &vsock->send_pkt_work); 711 + vhost_work_dev_flush(&vsock->dev); 712 712 } 713 713 714 714 static void vhost_vsock_reset_orphans(struct sock *sk)

+178 -168

drivers/virtio/virtio_mem.c

··· 75 75 VIRTIO_MEM_SBM_MB_OFFLINE, 76 76 /* Partially plugged, fully added to Linux, offline. */ 77 77 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL, 78 - /* Fully plugged, fully added to Linux, online. */ 79 - VIRTIO_MEM_SBM_MB_ONLINE, 80 - /* Partially plugged, fully added to Linux, online. */ 81 - VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL, 78 + /* Fully plugged, fully added to Linux, onlined to a kernel zone. */ 79 + VIRTIO_MEM_SBM_MB_KERNEL, 80 + /* Partially plugged, fully added to Linux, online to a kernel zone */ 81 + VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL, 82 + /* Fully plugged, fully added to Linux, onlined to ZONE_MOVABLE. */ 83 + VIRTIO_MEM_SBM_MB_MOVABLE, 84 + /* Partially plugged, fully added to Linux, onlined to ZONE_MOVABLE. */ 85 + VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL, 82 86 VIRTIO_MEM_SBM_MB_COUNT 83 87 }; 84 88 ··· 703 699 } 704 700 705 701 /* 706 - * See virtio_mem_remove_memory(): Try to remove all Linux memory blocks covered 707 - * by the big block. 708 - */ 709 - static int virtio_mem_bbm_remove_bb(struct virtio_mem *vm, unsigned long bb_id) 710 - { 711 - const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); 712 - const uint64_t size = vm->bbm.bb_size; 713 - 714 - return virtio_mem_remove_memory(vm, addr, size); 715 - } 716 - 717 - /* 718 702 * Try offlining and removing memory from Linux. 719 703 * 720 704 * Must not be called with the vm->hotplug_mutex held (possible deadlock with ··· 824 832 unsigned long mb_id) 825 833 { 826 834 switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) { 827 - case VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL: 835 + case VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL: 836 + case VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL: 828 837 virtio_mem_sbm_set_mb_state(vm, mb_id, 829 838 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL); 830 839 break; 831 - case VIRTIO_MEM_SBM_MB_ONLINE: 840 + case VIRTIO_MEM_SBM_MB_KERNEL: 841 + case VIRTIO_MEM_SBM_MB_MOVABLE: 832 842 virtio_mem_sbm_set_mb_state(vm, mb_id, 833 843 VIRTIO_MEM_SBM_MB_OFFLINE); 834 844 break; ··· 841 847 } 842 848 843 849 static void virtio_mem_sbm_notify_online(struct virtio_mem *vm, 844 - unsigned long mb_id) 850 + unsigned long mb_id, 851 + unsigned long start_pfn) 845 852 { 853 + const bool is_movable = page_zonenum(pfn_to_page(start_pfn)) == 854 + ZONE_MOVABLE; 855 + int new_state; 856 + 846 857 switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) { 847 858 case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL: 848 - virtio_mem_sbm_set_mb_state(vm, mb_id, 849 - VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL); 859 + new_state = VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL; 860 + if (is_movable) 861 + new_state = VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL; 850 862 break; 851 863 case VIRTIO_MEM_SBM_MB_OFFLINE: 852 - virtio_mem_sbm_set_mb_state(vm, mb_id, 853 - VIRTIO_MEM_SBM_MB_ONLINE); 864 + new_state = VIRTIO_MEM_SBM_MB_KERNEL; 865 + if (is_movable) 866 + new_state = VIRTIO_MEM_SBM_MB_MOVABLE; 854 867 break; 855 868 default: 856 869 BUG(); 857 870 break; 858 871 } 872 + virtio_mem_sbm_set_mb_state(vm, mb_id, new_state); 859 873 } 860 874 861 875 static void virtio_mem_sbm_notify_going_offline(struct virtio_mem *vm, ··· 1017 1015 break; 1018 1016 case MEM_ONLINE: 1019 1017 if (vm->in_sbm) 1020 - virtio_mem_sbm_notify_online(vm, id); 1018 + virtio_mem_sbm_notify_online(vm, id, mhp->start_pfn); 1021 1019 1022 1020 atomic64_sub(size, &vm->offline_size); 1023 1021 /* ··· 1139 1137 */ 1140 1138 static int virtio_mem_fake_offline(unsigned long pfn, unsigned long nr_pages) 1141 1139 { 1142 - const bool is_movable = zone_idx(page_zone(pfn_to_page(pfn))) == 1140 + const bool is_movable = page_zonenum(pfn_to_page(pfn)) == 1143 1141 ZONE_MOVABLE; 1144 1142 int rc, retry_count; 1145 1143 ··· 1457 1455 * 1458 1456 * Note: can fail after some subblocks were unplugged. 1459 1457 */ 1460 - static int virtio_mem_sbm_unplug_any_sb(struct virtio_mem *vm, 1461 - unsigned long mb_id, uint64_t *nb_sb) 1458 + static int virtio_mem_sbm_unplug_any_sb_raw(struct virtio_mem *vm, 1459 + unsigned long mb_id, uint64_t *nb_sb) 1462 1460 { 1463 1461 int sb_id, count; 1464 1462 int rc; ··· 1500 1498 { 1501 1499 uint64_t nb_sb = vm->sbm.sbs_per_mb; 1502 1500 1503 - return virtio_mem_sbm_unplug_any_sb(vm, mb_id, &nb_sb); 1501 + return virtio_mem_sbm_unplug_any_sb_raw(vm, mb_id, &nb_sb); 1504 1502 } 1505 1503 1506 1504 /* ··· 1587 1585 * Note: Can fail after some subblocks were successfully plugged. 1588 1586 */ 1589 1587 static int virtio_mem_sbm_plug_any_sb(struct virtio_mem *vm, 1590 - unsigned long mb_id, uint64_t *nb_sb, 1591 - bool online) 1588 + unsigned long mb_id, uint64_t *nb_sb) 1592 1589 { 1590 + const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id); 1593 1591 unsigned long pfn, nr_pages; 1594 1592 int sb_id, count; 1595 1593 int rc; ··· 1611 1609 if (rc) 1612 1610 return rc; 1613 1611 *nb_sb -= count; 1614 - if (!online) 1612 + if (old_state == VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) 1615 1613 continue; 1616 1614 1617 1615 /* fake-online the pages if the memory block is online */ ··· 1621 1619 virtio_mem_fake_online(pfn, nr_pages); 1622 1620 } 1623 1621 1624 - if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) { 1625 - if (online) 1626 - virtio_mem_sbm_set_mb_state(vm, mb_id, 1627 - VIRTIO_MEM_SBM_MB_ONLINE); 1628 - else 1629 - virtio_mem_sbm_set_mb_state(vm, mb_id, 1630 - VIRTIO_MEM_SBM_MB_OFFLINE); 1631 - } 1622 + if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) 1623 + virtio_mem_sbm_set_mb_state(vm, mb_id, old_state - 1); 1632 1624 1633 1625 return 0; 1634 1626 } 1635 1627 1636 1628 static int virtio_mem_sbm_plug_request(struct virtio_mem *vm, uint64_t diff) 1637 1629 { 1630 + const int mb_states[] = { 1631 + VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL, 1632 + VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL, 1633 + VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL, 1634 + }; 1638 1635 uint64_t nb_sb = diff / vm->sbm.sb_size; 1639 1636 unsigned long mb_id; 1640 - int rc; 1637 + int rc, i; 1641 1638 1642 1639 if (!nb_sb) 1643 1640 return 0; ··· 1644 1643 /* Don't race with onlining/offlining */ 1645 1644 mutex_lock(&vm->hotplug_mutex); 1646 1645 1647 - /* Try to plug subblocks of partially plugged online blocks. */ 1648 - virtio_mem_sbm_for_each_mb(vm, mb_id, 1649 - VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL) { 1650 - rc = virtio_mem_sbm_plug_any_sb(vm, mb_id, &nb_sb, true); 1651 - if (rc || !nb_sb) 1652 - goto out_unlock; 1653 - cond_resched(); 1654 - } 1655 - 1656 - /* Try to plug subblocks of partially plugged offline blocks. */ 1657 - virtio_mem_sbm_for_each_mb(vm, mb_id, 1658 - VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) { 1659 - rc = virtio_mem_sbm_plug_any_sb(vm, mb_id, &nb_sb, false); 1660 - if (rc || !nb_sb) 1661 - goto out_unlock; 1662 - cond_resched(); 1646 + for (i = 0; i < ARRAY_SIZE(mb_states); i++) { 1647 + virtio_mem_sbm_for_each_mb(vm, mb_id, mb_states[i]) { 1648 + rc = virtio_mem_sbm_plug_any_sb(vm, mb_id, &nb_sb); 1649 + if (rc || !nb_sb) 1650 + goto out_unlock; 1651 + cond_resched(); 1652 + } 1663 1653 } 1664 1654 1665 1655 /* ··· 1811 1819 { 1812 1820 int rc; 1813 1821 1814 - rc = virtio_mem_sbm_unplug_any_sb(vm, mb_id, nb_sb); 1822 + rc = virtio_mem_sbm_unplug_any_sb_raw(vm, mb_id, nb_sb); 1815 1823 1816 1824 /* some subblocks might have been unplugged even on failure */ 1817 1825 if (!virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) ··· 1848 1856 int count) 1849 1857 { 1850 1858 const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size) * count; 1859 + const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id); 1851 1860 unsigned long start_pfn; 1852 1861 int rc; 1853 1862 ··· 1867 1874 return rc; 1868 1875 } 1869 1876 1870 - virtio_mem_sbm_set_mb_state(vm, mb_id, 1871 - VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL); 1877 + switch (old_state) { 1878 + case VIRTIO_MEM_SBM_MB_KERNEL: 1879 + virtio_mem_sbm_set_mb_state(vm, mb_id, 1880 + VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL); 1881 + break; 1882 + case VIRTIO_MEM_SBM_MB_MOVABLE: 1883 + virtio_mem_sbm_set_mb_state(vm, mb_id, 1884 + VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL); 1885 + break; 1886 + } 1887 + 1872 1888 return 0; 1873 1889 } 1874 1890 ··· 1944 1942 return 0; 1945 1943 } 1946 1944 1945 + /* 1946 + * Unplug the desired number of plugged subblocks of a memory block that is 1947 + * already added to Linux. Will skip subblock of online memory blocks that are 1948 + * busy (by the OS). Will fail if any subblock that's not busy cannot get 1949 + * unplugged. 1950 + * 1951 + * Will modify the state of the memory block. Might temporarily drop the 1952 + * hotplug_mutex. 1953 + * 1954 + * Note: Can fail after some subblocks were successfully unplugged. Can 1955 + * return 0 even if subblocks were busy and could not get unplugged. 1956 + */ 1957 + static int virtio_mem_sbm_unplug_any_sb(struct virtio_mem *vm, 1958 + unsigned long mb_id, 1959 + uint64_t *nb_sb) 1960 + { 1961 + const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id); 1962 + 1963 + switch (old_state) { 1964 + case VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL: 1965 + case VIRTIO_MEM_SBM_MB_KERNEL: 1966 + case VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL: 1967 + case VIRTIO_MEM_SBM_MB_MOVABLE: 1968 + return virtio_mem_sbm_unplug_any_sb_online(vm, mb_id, nb_sb); 1969 + case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL: 1970 + case VIRTIO_MEM_SBM_MB_OFFLINE: 1971 + return virtio_mem_sbm_unplug_any_sb_offline(vm, mb_id, nb_sb); 1972 + } 1973 + return -EINVAL; 1974 + } 1975 + 1947 1976 static int virtio_mem_sbm_unplug_request(struct virtio_mem *vm, uint64_t diff) 1948 1977 { 1978 + const int mb_states[] = { 1979 + VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL, 1980 + VIRTIO_MEM_SBM_MB_OFFLINE, 1981 + VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL, 1982 + VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL, 1983 + VIRTIO_MEM_SBM_MB_MOVABLE, 1984 + VIRTIO_MEM_SBM_MB_KERNEL, 1985 + }; 1949 1986 uint64_t nb_sb = diff / vm->sbm.sb_size; 1950 1987 unsigned long mb_id; 1951 - int rc; 1988 + int rc, i; 1952 1989 1953 1990 if (!nb_sb) 1954 1991 return 0; ··· 1999 1958 */ 2000 1959 mutex_lock(&vm->hotplug_mutex); 2001 1960 2002 - /* Try to unplug subblocks of partially plugged offline blocks. */ 2003 - virtio_mem_sbm_for_each_mb_rev(vm, mb_id, 2004 - VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) { 2005 - rc = virtio_mem_sbm_unplug_any_sb_offline(vm, mb_id, &nb_sb); 2006 - if (rc || !nb_sb) 2007 - goto out_unlock; 2008 - cond_resched(); 2009 - } 2010 - 2011 - /* Try to unplug subblocks of plugged offline blocks. */ 2012 - virtio_mem_sbm_for_each_mb_rev(vm, mb_id, VIRTIO_MEM_SBM_MB_OFFLINE) { 2013 - rc = virtio_mem_sbm_unplug_any_sb_offline(vm, mb_id, &nb_sb); 2014 - if (rc || !nb_sb) 2015 - goto out_unlock; 2016 - cond_resched(); 2017 - } 2018 - 2019 - if (!unplug_online) { 2020 - mutex_unlock(&vm->hotplug_mutex); 2021 - return 0; 2022 - } 2023 - 2024 - /* Try to unplug subblocks of partially plugged online blocks. */ 2025 - virtio_mem_sbm_for_each_mb_rev(vm, mb_id, 2026 - VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL) { 2027 - rc = virtio_mem_sbm_unplug_any_sb_online(vm, mb_id, &nb_sb); 2028 - if (rc || !nb_sb) 2029 - goto out_unlock; 2030 - mutex_unlock(&vm->hotplug_mutex); 2031 - cond_resched(); 2032 - mutex_lock(&vm->hotplug_mutex); 2033 - } 2034 - 2035 - /* Try to unplug subblocks of plugged online blocks. */ 2036 - virtio_mem_sbm_for_each_mb_rev(vm, mb_id, VIRTIO_MEM_SBM_MB_ONLINE) { 2037 - rc = virtio_mem_sbm_unplug_any_sb_online(vm, mb_id, &nb_sb); 2038 - if (rc || !nb_sb) 2039 - goto out_unlock; 2040 - mutex_unlock(&vm->hotplug_mutex); 2041 - cond_resched(); 2042 - mutex_lock(&vm->hotplug_mutex); 1961 + /* 1962 + * We try unplug from partially plugged blocks first, to try removing 1963 + * whole memory blocks along with metadata. We prioritize ZONE_MOVABLE 1964 + * as it's more reliable to unplug memory and remove whole memory 1965 + * blocks, and we don't want to trigger a zone imbalances by 1966 + * accidentially removing too much kernel memory. 1967 + */ 1968 + for (i = 0; i < ARRAY_SIZE(mb_states); i++) { 1969 + virtio_mem_sbm_for_each_mb_rev(vm, mb_id, mb_states[i]) { 1970 + rc = virtio_mem_sbm_unplug_any_sb(vm, mb_id, &nb_sb); 1971 + if (rc || !nb_sb) 1972 + goto out_unlock; 1973 + mutex_unlock(&vm->hotplug_mutex); 1974 + cond_resched(); 1975 + mutex_lock(&vm->hotplug_mutex); 1976 + } 1977 + if (!unplug_online && i == 1) { 1978 + mutex_unlock(&vm->hotplug_mutex); 1979 + return 0; 1980 + } 2043 1981 } 2044 1982 2045 1983 mutex_unlock(&vm->hotplug_mutex); ··· 2105 2085 } 2106 2086 2107 2087 /* 2108 - * Try to remove a big block from Linux and unplug it. Will fail with 2109 - * -EBUSY if some memory is online. 2110 - * 2111 - * Will modify the state of the memory block. 2112 - */ 2113 - static int virtio_mem_bbm_remove_and_unplug_bb(struct virtio_mem *vm, 2114 - unsigned long bb_id) 2115 - { 2116 - int rc; 2117 - 2118 - if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) != 2119 - VIRTIO_MEM_BBM_BB_ADDED)) 2120 - return -EINVAL; 2121 - 2122 - rc = virtio_mem_bbm_remove_bb(vm, bb_id); 2123 - if (rc) 2124 - return -EBUSY; 2125 - 2126 - rc = virtio_mem_bbm_unplug_bb(vm, bb_id); 2127 - if (rc) 2128 - virtio_mem_bbm_set_bb_state(vm, bb_id, 2129 - VIRTIO_MEM_BBM_BB_PLUGGED); 2130 - else 2131 - virtio_mem_bbm_set_bb_state(vm, bb_id, 2132 - VIRTIO_MEM_BBM_BB_UNUSED); 2133 - return rc; 2134 - } 2135 - 2136 - /* 2137 2088 * Test if a big block is completely offline. 2138 2089 */ 2139 2090 static bool virtio_mem_bbm_bb_is_offline(struct virtio_mem *vm, ··· 2123 2132 return true; 2124 2133 } 2125 2134 2135 + /* 2136 + * Test if a big block is completely onlined to ZONE_MOVABLE (or offline). 2137 + */ 2138 + static bool virtio_mem_bbm_bb_is_movable(struct virtio_mem *vm, 2139 + unsigned long bb_id) 2140 + { 2141 + const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id)); 2142 + const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size); 2143 + struct page *page; 2144 + unsigned long pfn; 2145 + 2146 + for (pfn = start_pfn; pfn < start_pfn + nr_pages; 2147 + pfn += PAGES_PER_SECTION) { 2148 + page = pfn_to_online_page(pfn); 2149 + if (!page) 2150 + continue; 2151 + if (page_zonenum(page) != ZONE_MOVABLE) 2152 + return false; 2153 + } 2154 + 2155 + return true; 2156 + } 2157 + 2126 2158 static int virtio_mem_bbm_unplug_request(struct virtio_mem *vm, uint64_t diff) 2127 2159 { 2128 2160 uint64_t nb_bb = diff / vm->bbm.bb_size; 2129 2161 uint64_t bb_id; 2130 - int rc; 2162 + int rc, i; 2131 2163 2132 2164 if (!nb_bb) 2133 2165 return 0; 2134 2166 2135 - /* Try to unplug completely offline big blocks first. */ 2136 - virtio_mem_bbm_for_each_bb_rev(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED) { 2137 - cond_resched(); 2138 - /* 2139 - * As we're holding no locks, this check is racy as memory 2140 - * can get onlined in the meantime - but we'll fail gracefully. 2141 - */ 2142 - if (!virtio_mem_bbm_bb_is_offline(vm, bb_id)) 2143 - continue; 2144 - rc = virtio_mem_bbm_remove_and_unplug_bb(vm, bb_id); 2145 - if (rc == -EBUSY) 2146 - continue; 2147 - if (!rc) 2148 - nb_bb--; 2149 - if (rc || !nb_bb) 2150 - return rc; 2151 - } 2167 + /* 2168 + * Try to unplug big blocks. Similar to SBM, start with offline 2169 + * big blocks. 2170 + */ 2171 + for (i = 0; i < 3; i++) { 2172 + virtio_mem_bbm_for_each_bb_rev(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED) { 2173 + cond_resched(); 2152 2174 2153 - if (!unplug_online) 2154 - return 0; 2155 - 2156 - /* Try to unplug any big blocks. */ 2157 - virtio_mem_bbm_for_each_bb_rev(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED) { 2158 - cond_resched(); 2159 - rc = virtio_mem_bbm_offline_remove_and_unplug_bb(vm, bb_id); 2160 - if (rc == -EBUSY) 2161 - continue; 2162 - if (!rc) 2163 - nb_bb--; 2164 - if (rc || !nb_bb) 2165 - return rc; 2175 + /* 2176 + * As we're holding no locks, these checks are racy, 2177 + * but we don't care. 2178 + */ 2179 + if (i == 0 && !virtio_mem_bbm_bb_is_offline(vm, bb_id)) 2180 + continue; 2181 + if (i == 1 && !virtio_mem_bbm_bb_is_movable(vm, bb_id)) 2182 + continue; 2183 + rc = virtio_mem_bbm_offline_remove_and_unplug_bb(vm, bb_id); 2184 + if (rc == -EBUSY) 2185 + continue; 2186 + if (!rc) 2187 + nb_bb--; 2188 + if (rc || !nb_bb) 2189 + return rc; 2190 + } 2191 + if (i == 0 && !unplug_online) 2192 + return 0; 2166 2193 } 2167 2194 2168 2195 return nb_bb ? -EBUSY : 0; ··· 2431 2422 dev_warn(&vm->vdev->dev, 2432 2423 "Some device memory is not addressable/pluggable. This can make some memory unusable.\n"); 2433 2424 2425 + /* Prepare the offline threshold - make sure we can add two blocks. */ 2426 + vm->offline_threshold = max_t(uint64_t, 2 * memory_block_size_bytes(), 2427 + VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD); 2428 + 2434 2429 /* 2435 2430 * We want subblocks to span at least MAX_ORDER_NR_PAGES and 2436 2431 * pageblock_nr_pages pages. This: ··· 2481 2468 vm->bbm.bb_size - 1; 2482 2469 vm->bbm.first_bb_id = virtio_mem_phys_to_bb_id(vm, addr); 2483 2470 vm->bbm.next_bb_id = vm->bbm.first_bb_id; 2484 - } 2485 2471 2486 - /* Prepare the offline threshold - make sure we can add two blocks. */ 2487 - vm->offline_threshold = max_t(uint64_t, 2 * memory_block_size_bytes(), 2488 - VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD); 2489 - /* In BBM, we also want at least two big blocks. */ 2490 - vm->offline_threshold = max_t(uint64_t, 2 * vm->bbm.bb_size, 2491 - vm->offline_threshold); 2472 + /* Make sure we can add two big blocks. */ 2473 + vm->offline_threshold = max_t(uint64_t, 2 * vm->bbm.bb_size, 2474 + vm->offline_threshold); 2475 + } 2492 2476 2493 2477 dev_info(&vm->vdev->dev, "start address: 0x%llx", vm->addr); 2494 2478 dev_info(&vm->vdev->dev, "region size: 0x%llx", vm->region_size);

+21

drivers/virtio/virtio_pci_modern_dev.c

··· 384 384 EXPORT_SYMBOL_GPL(vp_modern_get_features); 385 385 386 386 /* 387 + * vp_modern_get_driver_features - get driver features from device 388 + * @mdev: the modern virtio-pci device 389 + * 390 + * Returns the driver features read from the device 391 + */ 392 + u64 vp_modern_get_driver_features(struct virtio_pci_modern_device *mdev) 393 + { 394 + struct virtio_pci_common_cfg __iomem *cfg = mdev->common; 395 + 396 + u64 features; 397 + 398 + vp_iowrite32(0, &cfg->guest_feature_select); 399 + features = vp_ioread32(&cfg->guest_feature); 400 + vp_iowrite32(1, &cfg->guest_feature_select); 401 + features |= ((u64)vp_ioread32(&cfg->guest_feature) << 32); 402 + 403 + return features; 404 + } 405 + EXPORT_SYMBOL_GPL(vp_modern_get_driver_features); 406 + 407 + /* 387 408 * vp_modern_set_features - set features to device 388 409 * @mdev: the modern virtio-pci device 389 410 * @features: the features set to device

+170 -59

drivers/virtio/virtio_ring.c

··· 74 74 void *data; /* Data for callback. */ 75 75 struct vring_packed_desc *indir_desc; /* Indirect descriptor, if any. */ 76 76 u16 num; /* Descriptor list length. */ 77 - u16 next; /* The next desc state in a list. */ 78 77 u16 last; /* The last desc state in a list. */ 79 78 }; 80 79 81 - struct vring_desc_extra_packed { 80 + struct vring_desc_extra { 82 81 dma_addr_t addr; /* Buffer DMA addr. */ 83 82 u32 len; /* Buffer length. */ 84 83 u16 flags; /* Descriptor flags. */ 84 + u16 next; /* The next desc state in a list. */ 85 85 }; 86 86 87 87 struct vring_virtqueue { ··· 113 113 /* Last used index we've seen. */ 114 114 u16 last_used_idx; 115 115 116 + /* Hint for event idx: already triggered no need to disable. */ 117 + bool event_triggered; 118 + 116 119 union { 117 120 /* Available for split ring */ 118 121 struct { ··· 133 130 134 131 /* Per-descriptor state. */ 135 132 struct vring_desc_state_split *desc_state; 133 + struct vring_desc_extra *desc_extra; 136 134 137 135 /* DMA address and size information */ 138 136 dma_addr_t queue_dma_addr; ··· 170 166 171 167 /* Per-descriptor state. */ 172 168 struct vring_desc_state_packed *desc_state; 173 - struct vring_desc_extra_packed *desc_extra; 169 + struct vring_desc_extra *desc_extra; 174 170 175 171 /* DMA address and size information */ 176 172 dma_addr_t ring_dma_addr; ··· 368 364 * Split ring specific functions - *_split(). 369 365 */ 370 366 371 - static void vring_unmap_one_split(const struct vring_virtqueue *vq, 372 - struct vring_desc *desc) 367 + static void vring_unmap_one_split_indirect(const struct vring_virtqueue *vq, 368 + struct vring_desc *desc) 373 369 { 374 370 u16 flags; 375 371 ··· 393 389 } 394 390 } 395 391 392 + static unsigned int vring_unmap_one_split(const struct vring_virtqueue *vq, 393 + unsigned int i) 394 + { 395 + struct vring_desc_extra *extra = vq->split.desc_extra; 396 + u16 flags; 397 + 398 + if (!vq->use_dma_api) 399 + goto out; 400 + 401 + flags = extra[i].flags; 402 + 403 + if (flags & VRING_DESC_F_INDIRECT) { 404 + dma_unmap_single(vring_dma_dev(vq), 405 + extra[i].addr, 406 + extra[i].len, 407 + (flags & VRING_DESC_F_WRITE) ? 408 + DMA_FROM_DEVICE : DMA_TO_DEVICE); 409 + } else { 410 + dma_unmap_page(vring_dma_dev(vq), 411 + extra[i].addr, 412 + extra[i].len, 413 + (flags & VRING_DESC_F_WRITE) ? 414 + DMA_FROM_DEVICE : DMA_TO_DEVICE); 415 + } 416 + 417 + out: 418 + return extra[i].next; 419 + } 420 + 396 421 static struct vring_desc *alloc_indirect_split(struct virtqueue *_vq, 397 422 unsigned int total_sg, 398 423 gfp_t gfp) ··· 443 410 for (i = 0; i < total_sg; i++) 444 411 desc[i].next = cpu_to_virtio16(_vq->vdev, i + 1); 445 412 return desc; 413 + } 414 + 415 + static inline unsigned int virtqueue_add_desc_split(struct virtqueue *vq, 416 + struct vring_desc *desc, 417 + unsigned int i, 418 + dma_addr_t addr, 419 + unsigned int len, 420 + u16 flags, 421 + bool indirect) 422 + { 423 + struct vring_virtqueue *vring = to_vvq(vq); 424 + struct vring_desc_extra *extra = vring->split.desc_extra; 425 + u16 next; 426 + 427 + desc[i].flags = cpu_to_virtio16(vq->vdev, flags); 428 + desc[i].addr = cpu_to_virtio64(vq->vdev, addr); 429 + desc[i].len = cpu_to_virtio32(vq->vdev, len); 430 + 431 + if (!indirect) { 432 + next = extra[i].next; 433 + desc[i].next = cpu_to_virtio16(vq->vdev, next); 434 + 435 + extra[i].addr = addr; 436 + extra[i].len = len; 437 + extra[i].flags = flags; 438 + } else 439 + next = virtio16_to_cpu(vq->vdev, desc[i].next); 440 + 441 + return next; 446 442 } 447 443 448 444 static inline int virtqueue_add_split(struct virtqueue *_vq, ··· 546 484 if (vring_mapping_error(vq, addr)) 547 485 goto unmap_release; 548 486 549 - desc[i].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT); 550 - desc[i].addr = cpu_to_virtio64(_vq->vdev, addr); 551 - desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length); 552 487 prev = i; 553 - i = virtio16_to_cpu(_vq->vdev, desc[i].next); 488 + /* Note that we trust indirect descriptor 489 + * table since it use stream DMA mapping. 490 + */ 491 + i = virtqueue_add_desc_split(_vq, desc, i, addr, sg->length, 492 + VRING_DESC_F_NEXT, 493 + indirect); 554 494 } 555 495 } 556 496 for (; n < (out_sgs + in_sgs); n++) { ··· 561 497 if (vring_mapping_error(vq, addr)) 562 498 goto unmap_release; 563 499 564 - desc[i].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT | VRING_DESC_F_WRITE); 565 - desc[i].addr = cpu_to_virtio64(_vq->vdev, addr); 566 - desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length); 567 500 prev = i; 568 - i = virtio16_to_cpu(_vq->vdev, desc[i].next); 501 + /* Note that we trust indirect descriptor 502 + * table since it use stream DMA mapping. 503 + */ 504 + i = virtqueue_add_desc_split(_vq, desc, i, addr, 505 + sg->length, 506 + VRING_DESC_F_NEXT | 507 + VRING_DESC_F_WRITE, 508 + indirect); 569 509 } 570 510 } 571 511 /* Last one doesn't continue. */ 572 512 desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT); 513 + if (!indirect && vq->use_dma_api) 514 + vq->split.desc_extra[prev & (vq->split.vring.num - 1)].flags = 515 + ~VRING_DESC_F_NEXT; 573 516 574 517 if (indirect) { 575 518 /* Now that the indirect table is filled in, map it. */ ··· 586 515 if (vring_mapping_error(vq, addr)) 587 516 goto unmap_release; 588 517 589 - vq->split.vring.desc[head].flags = cpu_to_virtio16(_vq->vdev, 590 - VRING_DESC_F_INDIRECT); 591 - vq->split.vring.desc[head].addr = cpu_to_virtio64(_vq->vdev, 592 - addr); 593 - 594 - vq->split.vring.desc[head].len = cpu_to_virtio32(_vq->vdev, 595 - total_sg * sizeof(struct vring_desc)); 518 + virtqueue_add_desc_split(_vq, vq->split.vring.desc, 519 + head, addr, 520 + total_sg * sizeof(struct vring_desc), 521 + VRING_DESC_F_INDIRECT, 522 + false); 596 523 } 597 524 598 525 /* We're using some buffers from the free list. */ ··· 598 529 599 530 /* Update free pointer */ 600 531 if (indirect) 601 - vq->free_head = virtio16_to_cpu(_vq->vdev, 602 - vq->split.vring.desc[head].next); 532 + vq->free_head = vq->split.desc_extra[head].next; 603 533 else 604 534 vq->free_head = i; 605 535 ··· 643 575 for (n = 0; n < total_sg; n++) { 644 576 if (i == err_idx) 645 577 break; 646 - vring_unmap_one_split(vq, &desc[i]); 647 - i = virtio16_to_cpu(_vq->vdev, desc[i].next); 578 + if (indirect) { 579 + vring_unmap_one_split_indirect(vq, &desc[i]); 580 + i = virtio16_to_cpu(_vq->vdev, desc[i].next); 581 + } else 582 + i = vring_unmap_one_split(vq, i); 648 583 } 649 584 650 585 if (indirect) ··· 701 630 i = head; 702 631 703 632 while (vq->split.vring.desc[i].flags & nextflag) { 704 - vring_unmap_one_split(vq, &vq->split.vring.desc[i]); 705 - i = virtio16_to_cpu(vq->vq.vdev, vq->split.vring.desc[i].next); 633 + vring_unmap_one_split(vq, i); 634 + i = vq->split.desc_extra[i].next; 706 635 vq->vq.num_free++; 707 636 } 708 637 709 - vring_unmap_one_split(vq, &vq->split.vring.desc[i]); 710 - vq->split.vring.desc[i].next = cpu_to_virtio16(vq->vq.vdev, 711 - vq->free_head); 638 + vring_unmap_one_split(vq, i); 639 + vq->split.desc_extra[i].next = vq->free_head; 712 640 vq->free_head = head; 713 641 714 642 /* Plus final descriptor */ ··· 722 652 if (!indir_desc) 723 653 return; 724 654 725 - len = virtio32_to_cpu(vq->vq.vdev, 726 - vq->split.vring.desc[head].len); 655 + len = vq->split.desc_extra[head].len; 727 656 728 - BUG_ON(!(vq->split.vring.desc[head].flags & 729 - cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT))); 657 + BUG_ON(!(vq->split.desc_extra[head].flags & 658 + VRING_DESC_F_INDIRECT)); 730 659 BUG_ON(len == 0 || len % sizeof(struct vring_desc)); 731 660 732 661 for (j = 0; j < len / sizeof(struct vring_desc); j++) 733 - vring_unmap_one_split(vq, &indir_desc[j]); 662 + vring_unmap_one_split_indirect(vq, &indir_desc[j]); 734 663 735 664 kfree(indir_desc); 736 665 vq->split.desc_state[head].indir_desc = NULL; ··· 808 739 809 740 if (!(vq->split.avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) { 810 741 vq->split.avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT; 811 - if (!vq->event) 742 + if (vq->event) 743 + /* TODO: this is a hack. Figure out a cleaner value to write. */ 744 + vring_used_event(&vq->split.vring) = 0x0; 745 + else 812 746 vq->split.vring.avail->flags = 813 747 cpu_to_virtio16(_vq->vdev, 814 748 vq->split.avail_flags_shadow); ··· 984 912 */ 985 913 986 914 static void vring_unmap_state_packed(const struct vring_virtqueue *vq, 987 - struct vring_desc_extra_packed *state) 915 + struct vring_desc_extra *state) 988 916 { 989 917 u16 flags; 990 918 ··· 1133 1061 1 << VRING_PACKED_DESC_F_USED; 1134 1062 } 1135 1063 vq->packed.next_avail_idx = n; 1136 - vq->free_head = vq->packed.desc_state[id].next; 1064 + vq->free_head = vq->packed.desc_extra[id].next; 1137 1065 1138 1066 /* Store token and indirect buffer state. */ 1139 1067 vq->packed.desc_state[id].num = 1; ··· 1241 1169 le16_to_cpu(flags); 1242 1170 } 1243 1171 prev = curr; 1244 - curr = vq->packed.desc_state[curr].next; 1172 + curr = vq->packed.desc_extra[curr].next; 1245 1173 1246 1174 if ((unlikely(++i >= vq->packed.vring.num))) { 1247 1175 i = 0; ··· 1285 1213 unmap_release: 1286 1214 err_idx = i; 1287 1215 i = head; 1216 + curr = vq->free_head; 1288 1217 1289 1218 vq->packed.avail_used_flags = avail_used_flags; 1290 1219 1291 1220 for (n = 0; n < total_sg; n++) { 1292 1221 if (i == err_idx) 1293 1222 break; 1294 - vring_unmap_desc_packed(vq, &desc[i]); 1223 + vring_unmap_state_packed(vq, 1224 + &vq->packed.desc_extra[curr]); 1225 + curr = vq->packed.desc_extra[curr].next; 1295 1226 i++; 1296 1227 if (i >= vq->packed.vring.num) 1297 1228 i = 0; ··· 1365 1290 /* Clear data ptr. */ 1366 1291 state->data = NULL; 1367 1292 1368 - vq->packed.desc_state[state->last].next = vq->free_head; 1293 + vq->packed.desc_extra[state->last].next = vq->free_head; 1369 1294 vq->free_head = id; 1370 1295 vq->vq.num_free += state->num; 1371 1296 ··· 1374 1299 for (i = 0; i < state->num; i++) { 1375 1300 vring_unmap_state_packed(vq, 1376 1301 &vq->packed.desc_extra[curr]); 1377 - curr = vq->packed.desc_state[curr].next; 1302 + curr = vq->packed.desc_extra[curr].next; 1378 1303 } 1379 1304 } 1380 1305 ··· 1625 1550 return NULL; 1626 1551 } 1627 1552 1553 + static struct vring_desc_extra *vring_alloc_desc_extra(struct vring_virtqueue *vq, 1554 + unsigned int num) 1555 + { 1556 + struct vring_desc_extra *desc_extra; 1557 + unsigned int i; 1558 + 1559 + desc_extra = kmalloc_array(num, sizeof(struct vring_desc_extra), 1560 + GFP_KERNEL); 1561 + if (!desc_extra) 1562 + return NULL; 1563 + 1564 + memset(desc_extra, 0, num * sizeof(struct vring_desc_extra)); 1565 + 1566 + for (i = 0; i < num - 1; i++) 1567 + desc_extra[i].next = i + 1; 1568 + 1569 + return desc_extra; 1570 + } 1571 + 1628 1572 static struct virtqueue *vring_create_virtqueue_packed( 1629 1573 unsigned int index, 1630 1574 unsigned int num, ··· 1661 1567 struct vring_packed_desc_event *driver, *device; 1662 1568 dma_addr_t ring_dma_addr, driver_event_dma_addr, device_event_dma_addr; 1663 1569 size_t ring_size_in_bytes, event_size_in_bytes; 1664 - unsigned int i; 1665 1570 1666 1571 ring_size_in_bytes = num * sizeof(struct vring_packed_desc); 1667 1572 ··· 1698 1605 vq->weak_barriers = weak_barriers; 1699 1606 vq->broken = false; 1700 1607 vq->last_used_idx = 0; 1608 + vq->event_triggered = false; 1701 1609 vq->num_added = 0; 1702 1610 vq->packed_ring = true; 1703 1611 vq->use_dma_api = vring_use_dma_api(vdev); ··· 1743 1649 1744 1650 /* Put everything in free lists. */ 1745 1651 vq->free_head = 0; 1746 - for (i = 0; i < num-1; i++) 1747 - vq->packed.desc_state[i].next = i + 1; 1748 1652 1749 - vq->packed.desc_extra = kmalloc_array(num, 1750 - sizeof(struct vring_desc_extra_packed), 1751 - GFP_KERNEL); 1653 + vq->packed.desc_extra = vring_alloc_desc_extra(vq, num); 1752 1654 if (!vq->packed.desc_extra) 1753 1655 goto err_desc_extra; 1754 - 1755 - memset(vq->packed.desc_extra, 0, 1756 - num * sizeof(struct vring_desc_extra_packed)); 1757 1656 1758 1657 /* No callback? Tell other side not to bother us. */ 1759 1658 if (!callback) { ··· 1962 1875 EXPORT_SYMBOL_GPL(virtqueue_kick); 1963 1876 1964 1877 /** 1965 - * virtqueue_get_buf - get the next used buffer 1878 + * virtqueue_get_buf_ctx - get the next used buffer 1966 1879 * @_vq: the struct virtqueue we're talking about. 1967 1880 * @len: the length written into the buffer 1968 1881 * @ctx: extra context for the token ··· 2006 1919 { 2007 1920 struct vring_virtqueue *vq = to_vvq(_vq); 2008 1921 1922 + /* If device triggered an event already it won't trigger one again: 1923 + * no need to disable. 1924 + */ 1925 + if (vq->event_triggered) 1926 + return; 1927 + 2009 1928 if (vq->packed_ring) 2010 1929 virtqueue_disable_cb_packed(_vq); 2011 1930 else ··· 2034 1941 unsigned virtqueue_enable_cb_prepare(struct virtqueue *_vq) 2035 1942 { 2036 1943 struct vring_virtqueue *vq = to_vvq(_vq); 1944 + 1945 + if (vq->event_triggered) 1946 + vq->event_triggered = false; 2037 1947 2038 1948 return vq->packed_ring ? virtqueue_enable_cb_prepare_packed(_vq) : 2039 1949 virtqueue_enable_cb_prepare_split(_vq); ··· 2101 2005 { 2102 2006 struct vring_virtqueue *vq = to_vvq(_vq); 2103 2007 2008 + if (vq->event_triggered) 2009 + vq->event_triggered = false; 2010 + 2104 2011 return vq->packed_ring ? virtqueue_enable_cb_delayed_packed(_vq) : 2105 2012 virtqueue_enable_cb_delayed_split(_vq); 2106 2013 } ··· 2143 2044 if (unlikely(vq->broken)) 2144 2045 return IRQ_HANDLED; 2145 2046 2047 + /* Just a hint for performance: so it's ok that this can be racy! */ 2048 + if (vq->event) 2049 + vq->event_triggered = true; 2050 + 2146 2051 pr_debug("virtqueue callback for %p (%p)\n", vq, vq->vq.callback); 2147 2052 if (vq->vq.callback) 2148 2053 vq->vq.callback(&vq->vq); ··· 2165 2062 void (*callback)(struct virtqueue *), 2166 2063 const char *name) 2167 2064 { 2168 - unsigned int i; 2169 2065 struct vring_virtqueue *vq; 2170 2066 2171 2067 if (virtio_has_feature(vdev, VIRTIO_F_RING_PACKED)) ··· 2185 2083 vq->weak_barriers = weak_barriers; 2186 2084 vq->broken = false; 2187 2085 vq->last_used_idx = 0; 2086 + vq->event_triggered = false; 2188 2087 vq->num_added = 0; 2189 2088 vq->use_dma_api = vring_use_dma_api(vdev); 2190 2089 #ifdef DEBUG ··· 2217 2114 2218 2115 vq->split.desc_state = kmalloc_array(vring.num, 2219 2116 sizeof(struct vring_desc_state_split), GFP_KERNEL); 2220 - if (!vq->split.desc_state) { 2221 - kfree(vq); 2222 - return NULL; 2223 - } 2117 + if (!vq->split.desc_state) 2118 + goto err_state; 2119 + 2120 + vq->split.desc_extra = vring_alloc_desc_extra(vq, vring.num); 2121 + if (!vq->split.desc_extra) 2122 + goto err_extra; 2224 2123 2225 2124 /* Put everything in free lists. */ 2226 2125 vq->free_head = 0; 2227 - for (i = 0; i < vring.num-1; i++) 2228 - vq->split.vring.desc[i].next = cpu_to_virtio16(vdev, i + 1); 2229 2126 memset(vq->split.desc_state, 0, vring.num * 2230 2127 sizeof(struct vring_desc_state_split)); 2231 2128 2232 2129 list_add_tail(&vq->vq.list, &vdev->vqs); 2233 2130 return &vq->vq; 2131 + 2132 + err_extra: 2133 + kfree(vq->split.desc_state); 2134 + err_state: 2135 + kfree(vq); 2136 + return NULL; 2234 2137 } 2235 2138 EXPORT_SYMBOL_GPL(__vring_new_virtqueue); 2236 2139 ··· 2317 2208 vq->split.queue_dma_addr); 2318 2209 } 2319 2210 } 2320 - if (!vq->packed_ring) 2211 + if (!vq->packed_ring) { 2321 2212 kfree(vq->split.desc_state); 2213 + kfree(vq->split.desc_extra); 2214 + } 2322 2215 list_del(&_vq->list); 2323 2216 kfree(vq); 2324 2217 }

+15

drivers/virtio/virtio_vdpa.c

··· 142 142 struct vdpa_callback cb; 143 143 struct virtqueue *vq; 144 144 u64 desc_addr, driver_addr, device_addr; 145 + /* Assume split virtqueue, switch to packed if necessary */ 146 + struct vdpa_vq_state state = {0}; 145 147 unsigned long flags; 146 148 u32 align, num; 147 149 int err; ··· 192 190 err = -EINVAL; 193 191 goto err_vq; 194 192 } 193 + 194 + /* reset virtqueue state index */ 195 + if (virtio_has_feature(vdev, VIRTIO_F_RING_PACKED)) { 196 + struct vdpa_vq_state_packed *s = &state.packed; 197 + 198 + s->last_avail_counter = 1; 199 + s->last_avail_idx = 0; 200 + s->last_used_counter = 1; 201 + s->last_used_idx = 0; 202 + } 203 + err = ops->set_vq_state(vdpa, index, &state); 204 + if (err) 205 + goto err_vq; 195 206 196 207 ops->set_vq_ready(vdpa, index, 1); 197 208

+3 -1

include/linux/mlx5/mlx5_ifc.h

··· 1512 1512 u8 uar_4k[0x1]; 1513 1513 u8 reserved_at_241[0x9]; 1514 1514 u8 uar_sz[0x6]; 1515 - u8 reserved_at_250[0x8]; 1515 + u8 reserved_at_248[0x2]; 1516 + u8 umem_uid_0[0x1]; 1517 + u8 reserved_at_250[0x5]; 1516 1518 u8 log_pg_sz[0x8]; 1517 1519 1518 1520 u8 bf[0x1];

+23 -2

include/linux/vdpa.h

··· 28 28 }; 29 29 30 30 /** 31 - * struct vdpa_vq_state - vDPA vq_state definition 31 + * struct vdpa_vq_state_split - vDPA split virtqueue state 32 32 * @avail_index: available index 33 33 */ 34 - struct vdpa_vq_state { 34 + struct vdpa_vq_state_split { 35 35 u16 avail_index; 36 + }; 37 + 38 + /** 39 + * struct vdpa_vq_state_packed - vDPA packed virtqueue state 40 + * @last_avail_counter: last driver ring wrap counter observed by device 41 + * @last_avail_idx: device available index 42 + * @last_used_counter: device ring wrap counter 43 + * @last_used_idx: used index 44 + */ 45 + struct vdpa_vq_state_packed { 46 + u16 last_avail_counter:1; 47 + u16 last_avail_idx:15; 48 + u16 last_used_counter:1; 49 + u16 last_used_idx:15; 50 + }; 51 + 52 + struct vdpa_vq_state { 53 + union { 54 + struct vdpa_vq_state_split split; 55 + struct vdpa_vq_state_packed packed; 56 + }; 36 57 }; 37 58 38 59 struct vdpa_mgmt_dev;

+1

include/linux/virtio_pci_modern.h

··· 79 79 } 80 80 81 81 u64 vp_modern_get_features(struct virtio_pci_modern_device *mdev); 82 + u64 vp_modern_get_driver_features(struct virtio_pci_modern_device *mdev); 82 83 void vp_modern_set_features(struct virtio_pci_modern_device *mdev, 83 84 u64 features); 84 85 u32 vp_modern_generation(struct virtio_pci_modern_device *mdev);

+12

include/uapi/linux/virtio_ids.h

··· 57 57 #define VIRTIO_ID_MAC80211_HWSIM 29 /* virtio mac80211-hwsim */ 58 58 #define VIRTIO_ID_BT 40 /* virtio bluetooth */ 59 59 60 + /* 61 + * Virtio Transitional IDs 62 + */ 63 + 64 + #define VIRTIO_TRANS_ID_NET 1000 /* transitional virtio net */ 65 + #define VIRTIO_TRANS_ID_BLOCK 1001 /* transitional virtio block */ 66 + #define VIRTIO_TRANS_ID_BALLOON 1002 /* transitional virtio balloon */ 67 + #define VIRTIO_TRANS_ID_CONSOLE 1003 /* transitional virtio console */ 68 + #define VIRTIO_TRANS_ID_SCSI 1004 /* transitional virtio SCSI */ 69 + #define VIRTIO_TRANS_ID_RNG 1005 /* transitional virtio rng */ 70 + #define VIRTIO_TRANS_ID_9P 1009 /* transitional virtio 9p console */ 71 + 60 72 #endif /* _LINUX_VIRTIO_IDS_H */