Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost

Pull virtio updates from Michael Tsirkin:

- A huge patchset supporting vq resize using the new vq reset
capability

- Features, fixes, and cleanups all over the place

* tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost: (88 commits)
vdpa/mlx5: Fix possible uninitialized return value
vdpa_sim_blk: add support for discard and write-zeroes
vdpa_sim_blk: add support for VIRTIO_BLK_T_FLUSH
vdpa_sim_blk: make vdpasim_blk_check_range usable by other requests
vdpa_sim_blk: check if sector is 0 for commands other than read or write
vdpa_sim: Implement suspend vdpa op
vhost-vdpa: uAPI to suspend the device
vhost-vdpa: introduce SUSPEND backend feature bit
vdpa: Add suspend operation
virtio-blk: Avoid use-after-free on suspend/resume
virtio_vdpa: support the arg sizes of find_vqs()
vhost-vdpa: Call ida_simple_remove() when failed
vDPA: fix 'cast to restricted le16' warnings in vdpa.c
vDPA: !FEATURES_OK should not block querying device config space
vDPA/ifcvf: support userspace to query features and MQ of a management device
vDPA/ifcvf: get_config_size should return a value no greater than dev implementation
vhost scsi: Allow user to control num virtqueues
vhost-scsi: Fix max number of virtqueues
vdpa/mlx5: Support different address spaces for control and data
vdpa/mlx5: Implement susupend virtqueue callback
...

+2171 -556
+4
Documentation/devicetree/bindings/virtio/mmio.yaml
··· 33 33 description: Required for devices making accesses thru an IOMMU. 34 34 maxItems: 1 35 35 36 + wakeup-source: 37 + type: boolean 38 + description: Required for setting irq of a virtio_mmio device as wakeup source. 39 + 36 40 required: 37 41 - compatible 38 42 - reg
+2 -1
arch/um/drivers/virtio_uml.c
··· 958 958 goto error_create; 959 959 } 960 960 vq->priv = info; 961 + vq->num_max = num; 961 962 num = virtqueue_get_vring_size(vq); 962 963 963 964 if (vu_dev->protocol_features & ··· 1011 1010 1012 1011 static int vu_find_vqs(struct virtio_device *vdev, unsigned nvqs, 1013 1012 struct virtqueue *vqs[], vq_callback_t *callbacks[], 1014 - const char * const names[], const bool *ctx, 1013 + const char * const names[], u32 sizes[], const bool *ctx, 1015 1014 struct irq_affinity *desc) 1016 1015 { 1017 1016 struct virtio_uml_device *vu_dev = to_virtio_uml_device(vdev);
+10 -14
drivers/block/virtio_blk.c
··· 101 101 } 102 102 } 103 103 104 + static inline struct virtio_blk_vq *get_virtio_blk_vq(struct blk_mq_hw_ctx *hctx) 105 + { 106 + struct virtio_blk *vblk = hctx->queue->queuedata; 107 + struct virtio_blk_vq *vq = &vblk->vqs[hctx->queue_num]; 108 + 109 + return vq; 110 + } 111 + 104 112 static int virtblk_add_req(struct virtqueue *vq, struct virtblk_req *vbr) 105 113 { 106 114 struct scatterlist hdr, status, *sgs[3]; ··· 424 416 struct request *requeue_list = NULL; 425 417 426 418 rq_list_for_each_safe(rqlist, req, next) { 427 - struct virtio_blk_vq *vq = req->mq_hctx->driver_data; 419 + struct virtio_blk_vq *vq = get_virtio_blk_vq(req->mq_hctx); 428 420 bool kick; 429 421 430 422 if (!virtblk_prep_rq_batch(req)) { ··· 845 837 static int virtblk_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) 846 838 { 847 839 struct virtio_blk *vblk = hctx->queue->queuedata; 848 - struct virtio_blk_vq *vq = hctx->driver_data; 840 + struct virtio_blk_vq *vq = get_virtio_blk_vq(hctx); 849 841 struct virtblk_req *vbr; 850 842 unsigned long flags; 851 843 unsigned int len; ··· 870 862 return found; 871 863 } 872 864 873 - static int virtblk_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, 874 - unsigned int hctx_idx) 875 - { 876 - struct virtio_blk *vblk = data; 877 - struct virtio_blk_vq *vq = &vblk->vqs[hctx_idx]; 878 - 879 - WARN_ON(vblk->tag_set.tags[hctx_idx] != hctx->tags); 880 - hctx->driver_data = vq; 881 - return 0; 882 - } 883 - 884 865 static const struct blk_mq_ops virtio_mq_ops = { 885 866 .queue_rq = virtio_queue_rq, 886 867 .queue_rqs = virtio_queue_rqs, 887 868 .commit_rqs = virtio_commit_rqs, 888 - .init_hctx = virtblk_init_hctx, 889 869 .complete = virtblk_request_done, 890 870 .map_queues = virtblk_map_queues, 891 871 .poll = virtblk_poll,
+286 -39
drivers/net/virtio_net.c
··· 135 135 struct virtnet_sq_stats stats; 136 136 137 137 struct napi_struct napi; 138 + 139 + /* Record whether sq is in reset state. */ 140 + bool reset; 138 141 }; 139 142 140 143 /* Internal representation of a receive virtqueue */ ··· 270 267 u8 duplex; 271 268 u32 speed; 272 269 270 + /* Interrupt coalescing settings */ 271 + u32 tx_usecs; 272 + u32 rx_usecs; 273 + u32 tx_max_packets; 274 + u32 rx_max_packets; 275 + 273 276 unsigned long guest_offloads; 274 277 unsigned long guest_offloads_capable; 275 278 ··· 292 283 */ 293 284 char padding[12]; 294 285 }; 286 + 287 + static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf); 288 + static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf); 295 289 296 290 static bool is_xdp_frame(void *ptr) 297 291 { ··· 1640 1628 return; 1641 1629 1642 1630 if (__netif_tx_trylock(txq)) { 1631 + if (sq->reset) { 1632 + __netif_tx_unlock(txq); 1633 + return; 1634 + } 1635 + 1643 1636 do { 1644 1637 virtqueue_disable_cb(sq->vq); 1645 1638 free_old_xmit_skbs(sq, true); ··· 1890 1873 } 1891 1874 1892 1875 return NETDEV_TX_OK; 1876 + } 1877 + 1878 + static int virtnet_rx_resize(struct virtnet_info *vi, 1879 + struct receive_queue *rq, u32 ring_num) 1880 + { 1881 + bool running = netif_running(vi->dev); 1882 + int err, qindex; 1883 + 1884 + qindex = rq - vi->rq; 1885 + 1886 + if (running) 1887 + napi_disable(&rq->napi); 1888 + 1889 + err = virtqueue_resize(rq->vq, ring_num, virtnet_rq_free_unused_buf); 1890 + if (err) 1891 + netdev_err(vi->dev, "resize rx fail: rx queue index: %d err: %d\n", qindex, err); 1892 + 1893 + if (!try_fill_recv(vi, rq, GFP_KERNEL)) 1894 + schedule_delayed_work(&vi->refill, 0); 1895 + 1896 + if (running) 1897 + virtnet_napi_enable(rq->vq, &rq->napi); 1898 + return err; 1899 + } 1900 + 1901 + static int virtnet_tx_resize(struct virtnet_info *vi, 1902 + struct send_queue *sq, u32 ring_num) 1903 + { 1904 + bool running = netif_running(vi->dev); 1905 + struct netdev_queue *txq; 
1906 + int err, qindex; 1907 + 1908 + qindex = sq - vi->sq; 1909 + 1910 + if (running) 1911 + virtnet_napi_tx_disable(&sq->napi); 1912 + 1913 + txq = netdev_get_tx_queue(vi->dev, qindex); 1914 + 1915 + /* 1. wait all ximt complete 1916 + * 2. fix the race of netif_stop_subqueue() vs netif_start_subqueue() 1917 + */ 1918 + __netif_tx_lock_bh(txq); 1919 + 1920 + /* Prevent rx poll from accessing sq. */ 1921 + sq->reset = true; 1922 + 1923 + /* Prevent the upper layer from trying to send packets. */ 1924 + netif_stop_subqueue(vi->dev, qindex); 1925 + 1926 + __netif_tx_unlock_bh(txq); 1927 + 1928 + err = virtqueue_resize(sq->vq, ring_num, virtnet_sq_free_unused_buf); 1929 + if (err) 1930 + netdev_err(vi->dev, "resize tx fail: tx queue index: %d err: %d\n", qindex, err); 1931 + 1932 + __netif_tx_lock_bh(txq); 1933 + sq->reset = false; 1934 + netif_tx_wake_queue(txq); 1935 + __netif_tx_unlock_bh(txq); 1936 + 1937 + if (running) 1938 + virtnet_napi_tx_enable(vi, sq->vq, &sq->napi); 1939 + return err; 1893 1940 } 1894 1941 1895 1942 /* ··· 2366 2285 { 2367 2286 struct virtnet_info *vi = netdev_priv(dev); 2368 2287 2369 - ring->rx_max_pending = virtqueue_get_vring_size(vi->rq[0].vq); 2370 - ring->tx_max_pending = virtqueue_get_vring_size(vi->sq[0].vq); 2371 - ring->rx_pending = ring->rx_max_pending; 2372 - ring->tx_pending = ring->tx_max_pending; 2288 + ring->rx_max_pending = vi->rq[0].vq->num_max; 2289 + ring->tx_max_pending = vi->sq[0].vq->num_max; 2290 + ring->rx_pending = virtqueue_get_vring_size(vi->rq[0].vq); 2291 + ring->tx_pending = virtqueue_get_vring_size(vi->sq[0].vq); 2292 + } 2293 + 2294 + static int virtnet_set_ringparam(struct net_device *dev, 2295 + struct ethtool_ringparam *ring, 2296 + struct kernel_ethtool_ringparam *kernel_ring, 2297 + struct netlink_ext_ack *extack) 2298 + { 2299 + struct virtnet_info *vi = netdev_priv(dev); 2300 + u32 rx_pending, tx_pending; 2301 + struct receive_queue *rq; 2302 + struct send_queue *sq; 2303 + int i, err; 2304 + 2305 + 
if (ring->rx_mini_pending || ring->rx_jumbo_pending) 2306 + return -EINVAL; 2307 + 2308 + rx_pending = virtqueue_get_vring_size(vi->rq[0].vq); 2309 + tx_pending = virtqueue_get_vring_size(vi->sq[0].vq); 2310 + 2311 + if (ring->rx_pending == rx_pending && 2312 + ring->tx_pending == tx_pending) 2313 + return 0; 2314 + 2315 + if (ring->rx_pending > vi->rq[0].vq->num_max) 2316 + return -EINVAL; 2317 + 2318 + if (ring->tx_pending > vi->sq[0].vq->num_max) 2319 + return -EINVAL; 2320 + 2321 + for (i = 0; i < vi->max_queue_pairs; i++) { 2322 + rq = vi->rq + i; 2323 + sq = vi->sq + i; 2324 + 2325 + if (ring->tx_pending != tx_pending) { 2326 + err = virtnet_tx_resize(vi, sq, ring->tx_pending); 2327 + if (err) 2328 + return err; 2329 + } 2330 + 2331 + if (ring->rx_pending != rx_pending) { 2332 + err = virtnet_rx_resize(vi, rq, ring->rx_pending); 2333 + if (err) 2334 + return err; 2335 + } 2336 + } 2337 + 2338 + return 0; 2373 2339 } 2374 2340 2375 2341 static bool virtnet_commit_rss_command(struct virtnet_info *vi) ··· 2746 2618 return 0; 2747 2619 } 2748 2620 2621 + static int virtnet_send_notf_coal_cmds(struct virtnet_info *vi, 2622 + struct ethtool_coalesce *ec) 2623 + { 2624 + struct scatterlist sgs_tx, sgs_rx; 2625 + struct virtio_net_ctrl_coal_tx coal_tx; 2626 + struct virtio_net_ctrl_coal_rx coal_rx; 2627 + 2628 + coal_tx.tx_usecs = cpu_to_le32(ec->tx_coalesce_usecs); 2629 + coal_tx.tx_max_packets = cpu_to_le32(ec->tx_max_coalesced_frames); 2630 + sg_init_one(&sgs_tx, &coal_tx, sizeof(coal_tx)); 2631 + 2632 + if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_NOTF_COAL, 2633 + VIRTIO_NET_CTRL_NOTF_COAL_TX_SET, 2634 + &sgs_tx)) 2635 + return -EINVAL; 2636 + 2637 + /* Save parameters */ 2638 + vi->tx_usecs = ec->tx_coalesce_usecs; 2639 + vi->tx_max_packets = ec->tx_max_coalesced_frames; 2640 + 2641 + coal_rx.rx_usecs = cpu_to_le32(ec->rx_coalesce_usecs); 2642 + coal_rx.rx_max_packets = cpu_to_le32(ec->rx_max_coalesced_frames); 2643 + sg_init_one(&sgs_rx, &coal_rx, 
sizeof(coal_rx)); 2644 + 2645 + if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_NOTF_COAL, 2646 + VIRTIO_NET_CTRL_NOTF_COAL_RX_SET, 2647 + &sgs_rx)) 2648 + return -EINVAL; 2649 + 2650 + /* Save parameters */ 2651 + vi->rx_usecs = ec->rx_coalesce_usecs; 2652 + vi->rx_max_packets = ec->rx_max_coalesced_frames; 2653 + 2654 + return 0; 2655 + } 2656 + 2657 + static int virtnet_coal_params_supported(struct ethtool_coalesce *ec) 2658 + { 2659 + /* usecs coalescing is supported only if VIRTIO_NET_F_NOTF_COAL 2660 + * feature is negotiated. 2661 + */ 2662 + if (ec->rx_coalesce_usecs || ec->tx_coalesce_usecs) 2663 + return -EOPNOTSUPP; 2664 + 2665 + if (ec->tx_max_coalesced_frames > 1 || 2666 + ec->rx_max_coalesced_frames != 1) 2667 + return -EINVAL; 2668 + 2669 + return 0; 2670 + } 2671 + 2749 2672 static int virtnet_set_coalesce(struct net_device *dev, 2750 2673 struct ethtool_coalesce *ec, 2751 2674 struct kernel_ethtool_coalesce *kernel_coal, 2752 2675 struct netlink_ext_ack *extack) 2753 2676 { 2754 2677 struct virtnet_info *vi = netdev_priv(dev); 2755 - int i, napi_weight; 2678 + int ret, i, napi_weight; 2679 + bool update_napi = false; 2756 2680 2757 - if (ec->tx_max_coalesced_frames > 1 || 2758 - ec->rx_max_coalesced_frames != 1) 2759 - return -EINVAL; 2760 - 2681 + /* Can't change NAPI weight if the link is up */ 2761 2682 napi_weight = ec->tx_max_coalesced_frames ? 
NAPI_POLL_WEIGHT : 0; 2762 2683 if (napi_weight ^ vi->sq[0].napi.weight) { 2763 2684 if (dev->flags & IFF_UP) 2764 2685 return -EBUSY; 2686 + else 2687 + update_napi = true; 2688 + } 2689 + 2690 + if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_NOTF_COAL)) 2691 + ret = virtnet_send_notf_coal_cmds(vi, ec); 2692 + else 2693 + ret = virtnet_coal_params_supported(ec); 2694 + 2695 + if (ret) 2696 + return ret; 2697 + 2698 + if (update_napi) { 2765 2699 for (i = 0; i < vi->max_queue_pairs; i++) 2766 2700 vi->sq[i].napi.weight = napi_weight; 2767 2701 } 2768 2702 2769 - return 0; 2703 + return ret; 2770 2704 } 2771 2705 2772 2706 static int virtnet_get_coalesce(struct net_device *dev, ··· 2836 2646 struct kernel_ethtool_coalesce *kernel_coal, 2837 2647 struct netlink_ext_ack *extack) 2838 2648 { 2839 - struct ethtool_coalesce ec_default = { 2840 - .cmd = ETHTOOL_GCOALESCE, 2841 - .rx_max_coalesced_frames = 1, 2842 - }; 2843 2649 struct virtnet_info *vi = netdev_priv(dev); 2844 2650 2845 - memcpy(ec, &ec_default, sizeof(ec_default)); 2651 + if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_NOTF_COAL)) { 2652 + ec->rx_coalesce_usecs = vi->rx_usecs; 2653 + ec->tx_coalesce_usecs = vi->tx_usecs; 2654 + ec->tx_max_coalesced_frames = vi->tx_max_packets; 2655 + ec->rx_max_coalesced_frames = vi->rx_max_packets; 2656 + } else { 2657 + ec->rx_max_coalesced_frames = 1; 2846 2658 2847 - if (vi->sq[0].napi.weight) 2848 - ec->tx_max_coalesced_frames = 1; 2659 + if (vi->sq[0].napi.weight) 2660 + ec->tx_max_coalesced_frames = 1; 2661 + } 2849 2662 2850 2663 return 0; 2851 2664 } ··· 2967 2774 } 2968 2775 2969 2776 static const struct ethtool_ops virtnet_ethtool_ops = { 2970 - .supported_coalesce_params = ETHTOOL_COALESCE_MAX_FRAMES, 2777 + .supported_coalesce_params = ETHTOOL_COALESCE_MAX_FRAMES | 2778 + ETHTOOL_COALESCE_USECS, 2971 2779 .get_drvinfo = virtnet_get_drvinfo, 2972 2780 .get_link = ethtool_op_get_link, 2973 2781 .get_ringparam = virtnet_get_ringparam, 2782 + .set_ringparam = 
virtnet_set_ringparam, 2974 2783 .get_strings = virtnet_get_strings, 2975 2784 .get_sset_count = virtnet_get_sset_count, 2976 2785 .get_ethtool_stats = virtnet_get_ethtool_stats, ··· 3366 3171 put_page(vi->rq[i].alloc_frag.page); 3367 3172 } 3368 3173 3174 + static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf) 3175 + { 3176 + if (!is_xdp_frame(buf)) 3177 + dev_kfree_skb(buf); 3178 + else 3179 + xdp_return_frame(ptr_to_xdp(buf)); 3180 + } 3181 + 3182 + static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf) 3183 + { 3184 + struct virtnet_info *vi = vq->vdev->priv; 3185 + int i = vq2rxq(vq); 3186 + 3187 + if (vi->mergeable_rx_bufs) 3188 + put_page(virt_to_head_page(buf)); 3189 + else if (vi->big_packets) 3190 + give_pages(&vi->rq[i], buf); 3191 + else 3192 + put_page(virt_to_head_page(buf)); 3193 + } 3194 + 3369 3195 static void free_unused_bufs(struct virtnet_info *vi) 3370 3196 { 3371 3197 void *buf; ··· 3394 3178 3395 3179 for (i = 0; i < vi->max_queue_pairs; i++) { 3396 3180 struct virtqueue *vq = vi->sq[i].vq; 3397 - while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) { 3398 - if (!is_xdp_frame(buf)) 3399 - dev_kfree_skb(buf); 3400 - else 3401 - xdp_return_frame(ptr_to_xdp(buf)); 3402 - } 3181 + while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) 3182 + virtnet_sq_free_unused_buf(vq, buf); 3403 3183 } 3404 3184 3405 3185 for (i = 0; i < vi->max_queue_pairs; i++) { 3406 3186 struct virtqueue *vq = vi->rq[i].vq; 3407 - 3408 - while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) { 3409 - if (vi->mergeable_rx_bufs) { 3410 - put_page(virt_to_head_page(buf)); 3411 - } else if (vi->big_packets) { 3412 - give_pages(&vi->rq[i], buf); 3413 - } else { 3414 - put_page(virt_to_head_page(buf)); 3415 - } 3416 - } 3187 + while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) 3188 + virtnet_rq_free_unused_buf(vq, buf); 3417 3189 } 3418 3190 } 3419 3191 ··· 3432 3228 (unsigned int)GOOD_PACKET_LEN); 3433 3229 } 3434 3230 3231 + static 
void virtnet_config_sizes(struct virtnet_info *vi, u32 *sizes) 3232 + { 3233 + u32 i, rx_size, tx_size; 3234 + 3235 + if (vi->speed == SPEED_UNKNOWN || vi->speed < SPEED_10000) { 3236 + rx_size = 1024; 3237 + tx_size = 1024; 3238 + 3239 + } else if (vi->speed < SPEED_40000) { 3240 + rx_size = 1024 * 4; 3241 + tx_size = 1024 * 4; 3242 + 3243 + } else { 3244 + rx_size = 1024 * 8; 3245 + tx_size = 1024 * 8; 3246 + } 3247 + 3248 + for (i = 0; i < vi->max_queue_pairs; i++) { 3249 + sizes[rxq2vq(i)] = rx_size; 3250 + sizes[txq2vq(i)] = tx_size; 3251 + } 3252 + } 3253 + 3435 3254 static int virtnet_find_vqs(struct virtnet_info *vi) 3436 3255 { 3437 3256 vq_callback_t **callbacks; ··· 3462 3235 int ret = -ENOMEM; 3463 3236 int i, total_vqs; 3464 3237 const char **names; 3238 + u32 *sizes; 3465 3239 bool *ctx; 3466 3240 3467 3241 /* We expect 1 RX virtqueue followed by 1 TX virtqueue, followed by ··· 3490 3262 ctx = NULL; 3491 3263 } 3492 3264 3265 + sizes = kmalloc_array(total_vqs, sizeof(*sizes), GFP_KERNEL); 3266 + if (!sizes) 3267 + goto err_sizes; 3268 + 3493 3269 /* Parameters for control virtqueue, if any */ 3494 3270 if (vi->has_cvq) { 3495 3271 callbacks[total_vqs - 1] = NULL; 3496 3272 names[total_vqs - 1] = "control"; 3273 + sizes[total_vqs - 1] = 64; 3497 3274 } 3498 3275 3499 3276 /* Allocate/initialize parameters for send/receive virtqueues */ ··· 3513 3280 ctx[rxq2vq(i)] = true; 3514 3281 } 3515 3282 3516 - ret = virtio_find_vqs_ctx(vi->vdev, total_vqs, vqs, callbacks, 3517 - names, ctx, NULL); 3283 + virtnet_config_sizes(vi, sizes); 3284 + 3285 + ret = virtio_find_vqs_ctx_size(vi->vdev, total_vqs, vqs, callbacks, 3286 + names, sizes, ctx, NULL); 3518 3287 if (ret) 3519 3288 goto err_find; 3520 3289 ··· 3536 3301 3537 3302 3538 3303 err_find: 3304 + kfree(sizes); 3305 + err_sizes: 3539 3306 kfree(ctx); 3540 3307 err_ctx: 3541 3308 kfree(names); ··· 3681 3444 VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_RSS, 3682 3445 "VIRTIO_NET_F_CTRL_VQ") || 3683 3446 
VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_HASH_REPORT, 3447 + "VIRTIO_NET_F_CTRL_VQ") || 3448 + VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_NOTF_COAL, 3684 3449 "VIRTIO_NET_F_CTRL_VQ"))) { 3685 3450 return false; 3686 3451 } ··· 3819 3580 if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) 3820 3581 vi->mergeable_rx_bufs = true; 3821 3582 3583 + if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_NOTF_COAL)) { 3584 + vi->rx_usecs = 0; 3585 + vi->tx_usecs = 0; 3586 + vi->tx_max_packets = 0; 3587 + vi->rx_max_packets = 0; 3588 + } 3589 + 3822 3590 if (virtio_has_feature(vdev, VIRTIO_NET_F_HASH_REPORT)) 3823 3591 vi->has_rss_hash_report = true; 3824 3592 ··· 3897 3651 vi->curr_queue_pairs = num_online_cpus(); 3898 3652 vi->max_queue_pairs = max_queue_pairs; 3899 3653 3654 + virtnet_init_settings(dev); 3655 + virtnet_update_settings(vi); 3656 + 3900 3657 /* Allocate/initialize the rx/tx queues, and invoke find_vqs */ 3901 3658 err = init_vqs(vi); 3902 3659 if (err) ··· 3911 3662 #endif 3912 3663 netif_set_real_num_tx_queues(dev, vi->curr_queue_pairs); 3913 3664 netif_set_real_num_rx_queues(dev, vi->curr_queue_pairs); 3914 - 3915 - virtnet_init_settings(dev); 3916 3665 3917 3666 if (virtio_has_feature(vdev, VIRTIO_NET_F_STANDBY)) { 3918 3667 vi->failover = net_failover_create(vi->dev); ··· 4061 3814 VIRTIO_NET_F_CTRL_MAC_ADDR, \ 4062 3815 VIRTIO_NET_F_MTU, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS, \ 4063 3816 VIRTIO_NET_F_SPEED_DUPLEX, VIRTIO_NET_F_STANDBY, \ 4064 - VIRTIO_NET_F_RSS, VIRTIO_NET_F_HASH_REPORT 3817 + VIRTIO_NET_F_RSS, VIRTIO_NET_F_HASH_REPORT, VIRTIO_NET_F_NOTF_COAL 4065 3818 4066 3819 static unsigned int features[] = { 4067 3820 VIRTNET_FEATURES,
+8 -1
drivers/nvdimm/virtio_pmem.c
··· 81 81 ndr_desc.res = &res; 82 82 ndr_desc.numa_node = nid; 83 83 ndr_desc.flush = async_pmem_flush; 84 + ndr_desc.provider_data = vdev; 84 85 set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags); 85 86 set_bit(ND_REGION_ASYNC, &ndr_desc.flags); 87 + /* 88 + * The NVDIMM region could be available before the 89 + * virtio_device_ready() that is called by 90 + * virtio_dev_probe(), so we set device ready here. 91 + */ 92 + virtio_device_ready(vdev); 86 93 nd_region = nvdimm_pmem_region_create(vpmem->nvdimm_bus, &ndr_desc); 87 94 if (!nd_region) { 88 95 dev_err(&vdev->dev, "failed to create nvdimm region\n"); 89 96 err = -ENXIO; 90 97 goto out_nd; 91 98 } 92 - nd_region->provider_data = dev_to_virtio(nd_region->dev.parent->parent); 93 99 return 0; 94 100 out_nd: 101 + virtio_reset_device(vdev); 95 102 nvdimm_bus_unregister(vpmem->nvdimm_bus); 96 103 out_vq: 97 104 vdev->config->del_vqs(vdev);
+3
drivers/platform/mellanox/mlxbf-tmfifo.c
··· 928 928 struct virtqueue *vqs[], 929 929 vq_callback_t *callbacks[], 930 930 const char * const names[], 931 + u32 sizes[], 931 932 const bool *ctx, 932 933 struct irq_affinity *desc) 933 934 { ··· 959 958 ret = -ENOMEM; 960 959 goto error; 961 960 } 961 + 962 + vq->num_max = vring->num; 962 963 963 964 vqs[i] = vq; 964 965 vring->vq = vq;
+2 -2
drivers/remoteproc/remoteproc_core.c
··· 335 335 size_t size; 336 336 337 337 /* actual size of vring (in bytes) */ 338 - size = PAGE_ALIGN(vring_size(rvring->len, rvring->align)); 338 + size = PAGE_ALIGN(vring_size(rvring->num, rvring->align)); 339 339 340 340 rsc = (void *)rproc->table_ptr + rvdev->rsc_offset; 341 341 ··· 402 402 return -EINVAL; 403 403 } 404 404 405 - rvring->len = vring->num; 405 + rvring->num = vring->num; 406 406 rvring->align = vring->align; 407 407 rvring->rvdev = rvdev; 408 408
+8 -5
drivers/remoteproc/remoteproc_virtio.c
··· 87 87 struct fw_rsc_vdev *rsc; 88 88 struct virtqueue *vq; 89 89 void *addr; 90 - int len, size; 90 + int num, size; 91 91 92 92 /* we're temporarily limited to two virtqueues per rvdev */ 93 93 if (id >= ARRAY_SIZE(rvdev->vring)) ··· 104 104 105 105 rvring = &rvdev->vring[id]; 106 106 addr = mem->va; 107 - len = rvring->len; 107 + num = rvring->num; 108 108 109 109 /* zero vring */ 110 - size = vring_size(len, rvring->align); 110 + size = vring_size(num, rvring->align); 111 111 memset(addr, 0, size); 112 112 113 113 dev_dbg(dev, "vring%d: va %pK qsz %d notifyid %d\n", 114 - id, addr, len, rvring->notifyid); 114 + id, addr, num, rvring->notifyid); 115 115 116 116 /* 117 117 * Create the new vq, and tell virtio we're not interested in 118 118 * the 'weak' smp barriers, since we're talking with a real device. 119 119 */ 120 - vq = vring_new_virtqueue(id, len, rvring->align, vdev, false, ctx, 120 + vq = vring_new_virtqueue(id, num, rvring->align, vdev, false, ctx, 121 121 addr, rproc_virtio_notify, callback, name); 122 122 if (!vq) { 123 123 dev_err(dev, "vring_new_virtqueue %s failed\n", name); 124 124 rproc_free_vring(rvring); 125 125 return ERR_PTR(-ENOMEM); 126 126 } 127 + 128 + vq->num_max = num; 127 129 128 130 rvring->vq = vq; 129 131 vq->priv = rvring; ··· 158 156 struct virtqueue *vqs[], 159 157 vq_callback_t *callbacks[], 160 158 const char * const names[], 159 + u32 sizes[], 161 160 const bool * ctx, 162 161 struct irq_affinity *desc) 163 162 {
+4
drivers/s390/virtio/virtio_ccw.c
··· 532 532 err = -ENOMEM; 533 533 goto out_err; 534 534 } 535 + 536 + vq->num_max = info->num; 537 + 535 538 /* it may have been reduced */ 536 539 info->num = virtqueue_get_vring_size(vq); 537 540 ··· 637 634 struct virtqueue *vqs[], 638 635 vq_callback_t *callbacks[], 639 636 const char * const names[], 637 + u32 sizes[], 640 638 const bool *ctx, 641 639 struct irq_affinity *desc) 642 640 {
+11 -3
drivers/vdpa/ifcvf/ifcvf_base.c
··· 29 29 { 30 30 struct virtio_pci_common_cfg __iomem *cfg = hw->common_cfg; 31 31 32 - cfg = hw->common_cfg; 33 32 vp_iowrite16(vector, &cfg->msix_config); 34 33 35 34 return vp_ioread16(&cfg->msix_config); ··· 127 128 break; 128 129 case VIRTIO_PCI_CAP_DEVICE_CFG: 129 130 hw->dev_cfg = get_cap_addr(hw, &cap); 131 + hw->cap_dev_config_size = le32_to_cpu(cap.length); 130 132 IFCVF_DBG(pdev, "hw->dev_cfg = %p\n", hw->dev_cfg); 131 133 break; 132 134 } ··· 233 233 u32 ifcvf_get_config_size(struct ifcvf_hw *hw) 234 234 { 235 235 struct ifcvf_adapter *adapter; 236 + u32 net_config_size = sizeof(struct virtio_net_config); 237 + u32 blk_config_size = sizeof(struct virtio_blk_config); 238 + u32 cap_size = hw->cap_dev_config_size; 236 239 u32 config_size; 237 240 238 241 adapter = vf_to_adapter(hw); 242 + /* If the onboard device config space size is greater than 243 + * the size of struct virtio_net/blk_config, only the spec 244 + * implementing contents size is returned, this is very 245 + * unlikely, defensive programming. 246 + */ 239 247 switch (hw->dev_type) { 240 248 case VIRTIO_ID_NET: 241 - config_size = sizeof(struct virtio_net_config); 249 + config_size = min(cap_size, net_config_size); 242 250 break; 243 251 case VIRTIO_ID_BLOCK: 244 - config_size = sizeof(struct virtio_blk_config); 252 + config_size = min(cap_size, blk_config_size); 245 253 break; 246 254 default: 247 255 config_size = 0;
+2
drivers/vdpa/ifcvf/ifcvf_base.h
··· 87 87 int config_irq; 88 88 int vqs_reused_irq; 89 89 u16 nr_vring; 90 + /* VIRTIO_PCI_CAP_DEVICE_CFG size */ 91 + u32 cap_dev_config_size; 90 92 }; 91 93 92 94 struct ifcvf_adapter {
+77 -67
drivers/vdpa/ifcvf/ifcvf_main.c
··· 685 685 } 686 686 687 687 /* 688 - * IFCVF currently does't have on-chip IOMMU, so not 688 + * IFCVF currently doesn't have on-chip IOMMU, so not 689 689 * implemented set_map()/dma_map()/dma_unmap() 690 690 */ 691 691 static const struct vdpa_config_ops ifc_vdpa_ops = { ··· 752 752 { 753 753 struct ifcvf_vdpa_mgmt_dev *ifcvf_mgmt_dev; 754 754 struct ifcvf_adapter *adapter; 755 + struct vdpa_device *vdpa_dev; 755 756 struct pci_dev *pdev; 756 757 struct ifcvf_hw *vf; 757 - struct device *dev; 758 - int ret, i; 758 + int ret; 759 759 760 760 ifcvf_mgmt_dev = container_of(mdev, struct ifcvf_vdpa_mgmt_dev, mdev); 761 - if (ifcvf_mgmt_dev->adapter) 761 + if (!ifcvf_mgmt_dev->adapter) 762 762 return -EOPNOTSUPP; 763 763 764 - pdev = ifcvf_mgmt_dev->pdev; 765 - dev = &pdev->dev; 766 - adapter = vdpa_alloc_device(struct ifcvf_adapter, vdpa, 767 - dev, &ifc_vdpa_ops, 1, 1, name, false); 768 - if (IS_ERR(adapter)) { 769 - IFCVF_ERR(pdev, "Failed to allocate vDPA structure"); 770 - return PTR_ERR(adapter); 771 - } 772 - 773 - ifcvf_mgmt_dev->adapter = adapter; 774 - 764 + adapter = ifcvf_mgmt_dev->adapter; 775 765 vf = &adapter->vf; 776 - vf->dev_type = get_dev_type(pdev); 777 - vf->base = pcim_iomap_table(pdev); 766 + pdev = adapter->pdev; 767 + vdpa_dev = &adapter->vdpa; 778 768 779 - adapter->pdev = pdev; 780 - adapter->vdpa.dma_dev = &pdev->dev; 769 + if (name) 770 + ret = dev_set_name(&vdpa_dev->dev, "%s", name); 771 + else 772 + ret = dev_set_name(&vdpa_dev->dev, "vdpa%u", vdpa_dev->index); 781 773 782 - ret = ifcvf_init_hw(vf, pdev); 783 - if (ret) { 784 - IFCVF_ERR(pdev, "Failed to init IFCVF hw\n"); 785 - goto err; 786 - } 787 - 788 - for (i = 0; i < vf->nr_vring; i++) 789 - vf->vring[i].irq = -EINVAL; 790 - 791 - vf->hw_features = ifcvf_get_hw_features(vf); 792 - vf->config_size = ifcvf_get_config_size(vf); 793 - 794 - adapter->vdpa.mdev = &ifcvf_mgmt_dev->mdev; 795 774 ret = _vdpa_register_device(&adapter->vdpa, vf->nr_vring); 796 775 if (ret) { 776 + 
put_device(&adapter->vdpa.dev); 797 777 IFCVF_ERR(pdev, "Failed to register to vDPA bus"); 798 - goto err; 778 + return ret; 799 779 } 800 780 801 781 return 0; 802 - 803 - err: 804 - put_device(&adapter->vdpa.dev); 805 - return ret; 806 782 } 783 + 807 784 808 785 static void ifcvf_vdpa_dev_del(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev) 809 786 { ··· 800 823 { 801 824 struct ifcvf_vdpa_mgmt_dev *ifcvf_mgmt_dev; 802 825 struct device *dev = &pdev->dev; 826 + struct ifcvf_adapter *adapter; 827 + struct ifcvf_hw *vf; 803 828 u32 dev_type; 804 - int ret; 829 + int ret, i; 830 + 831 + ret = pcim_enable_device(pdev); 832 + if (ret) { 833 + IFCVF_ERR(pdev, "Failed to enable device\n"); 834 + return ret; 835 + } 836 + ret = pcim_iomap_regions(pdev, BIT(0) | BIT(2) | BIT(4), 837 + IFCVF_DRIVER_NAME); 838 + if (ret) { 839 + IFCVF_ERR(pdev, "Failed to request MMIO region\n"); 840 + return ret; 841 + } 842 + 843 + ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64)); 844 + if (ret) { 845 + IFCVF_ERR(pdev, "No usable DMA configuration\n"); 846 + return ret; 847 + } 848 + 849 + ret = devm_add_action_or_reset(dev, ifcvf_free_irq_vectors, pdev); 850 + if (ret) { 851 + IFCVF_ERR(pdev, 852 + "Failed for adding devres for freeing irq vectors\n"); 853 + return ret; 854 + } 855 + 856 + pci_set_master(pdev); 857 + 858 + adapter = vdpa_alloc_device(struct ifcvf_adapter, vdpa, 859 + dev, &ifc_vdpa_ops, 1, 1, NULL, false); 860 + if (IS_ERR(adapter)) { 861 + IFCVF_ERR(pdev, "Failed to allocate vDPA structure"); 862 + return PTR_ERR(adapter); 863 + } 864 + 865 + vf = &adapter->vf; 866 + vf->dev_type = get_dev_type(pdev); 867 + vf->base = pcim_iomap_table(pdev); 868 + 869 + adapter->pdev = pdev; 870 + adapter->vdpa.dma_dev = &pdev->dev; 871 + 872 + ret = ifcvf_init_hw(vf, pdev); 873 + if (ret) { 874 + IFCVF_ERR(pdev, "Failed to init IFCVF hw\n"); 875 + return ret; 876 + } 877 + 878 + for (i = 0; i < vf->nr_vring; i++) 879 + vf->vring[i].irq = -EINVAL; 880 + 881 + vf->hw_features 
= ifcvf_get_hw_features(vf); 882 + vf->config_size = ifcvf_get_config_size(vf); 805 883 806 884 ifcvf_mgmt_dev = kzalloc(sizeof(struct ifcvf_vdpa_mgmt_dev), GFP_KERNEL); 807 885 if (!ifcvf_mgmt_dev) { 808 886 IFCVF_ERR(pdev, "Failed to alloc memory for the vDPA management device\n"); 809 887 return -ENOMEM; 810 888 } 889 + 890 + ifcvf_mgmt_dev->mdev.ops = &ifcvf_vdpa_mgmt_dev_ops; 891 + ifcvf_mgmt_dev->mdev.device = dev; 892 + ifcvf_mgmt_dev->adapter = adapter; 811 893 812 894 dev_type = get_dev_type(pdev); 813 895 switch (dev_type) { ··· 882 846 goto err; 883 847 } 884 848 885 - ifcvf_mgmt_dev->mdev.ops = &ifcvf_vdpa_mgmt_dev_ops; 886 - ifcvf_mgmt_dev->mdev.device = dev; 887 - ifcvf_mgmt_dev->pdev = pdev; 849 + ifcvf_mgmt_dev->mdev.max_supported_vqs = vf->nr_vring; 850 + ifcvf_mgmt_dev->mdev.supported_features = vf->hw_features; 888 851 889 - ret = pcim_enable_device(pdev); 890 - if (ret) { 891 - IFCVF_ERR(pdev, "Failed to enable device\n"); 892 - goto err; 893 - } 852 + adapter->vdpa.mdev = &ifcvf_mgmt_dev->mdev; 894 853 895 - ret = pcim_iomap_regions(pdev, BIT(0) | BIT(2) | BIT(4), 896 - IFCVF_DRIVER_NAME); 897 - if (ret) { 898 - IFCVF_ERR(pdev, "Failed to request MMIO region\n"); 899 - goto err; 900 - } 901 - 902 - ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64)); 903 - if (ret) { 904 - IFCVF_ERR(pdev, "No usable DMA configuration\n"); 905 - goto err; 906 - } 907 - 908 - ret = devm_add_action_or_reset(dev, ifcvf_free_irq_vectors, pdev); 909 - if (ret) { 910 - IFCVF_ERR(pdev, 911 - "Failed for adding devres for freeing irq vectors\n"); 912 - goto err; 913 - } 914 - 915 - pci_set_master(pdev); 916 854 917 855 ret = vdpa_mgmtdev_register(&ifcvf_mgmt_dev->mdev); 918 856 if (ret) {
+11
drivers/vdpa/mlx5/core/mlx5_vdpa.h
··· 70 70 struct mlx5_vdpa_dev *mvdev; 71 71 }; 72 72 73 + enum { 74 + MLX5_VDPA_DATAVQ_GROUP, 75 + MLX5_VDPA_CVQ_GROUP, 76 + MLX5_VDPA_NUMVQ_GROUPS 77 + }; 78 + 79 + enum { 80 + MLX5_VDPA_NUM_AS = MLX5_VDPA_NUMVQ_GROUPS 81 + }; 82 + 73 83 struct mlx5_vdpa_dev { 74 84 struct vdpa_device vdev; 75 85 struct mlx5_core_dev *mdev; ··· 95 85 struct mlx5_vdpa_mr mr; 96 86 struct mlx5_control_vq cvq; 97 87 struct workqueue_struct *wq; 88 + unsigned int group2asid[MLX5_VDPA_NUMVQ_GROUPS]; 98 89 }; 99 90 100 91 int mlx5_vdpa_alloc_pd(struct mlx5_vdpa_dev *dev, u32 *pdn, u16 uid);
+159 -16
drivers/vdpa/mlx5/net/mlx5_vnet.c
··· 164 164 bool setup; 165 165 u32 cur_num_vqs; 166 166 u32 rqt_size; 167 + bool nb_registered; 167 168 struct notifier_block nb; 168 169 struct vdpa_callback config_cb; 169 170 struct mlx5_vdpa_wq_ent cvq_ent; ··· 896 895 if (err) 897 896 goto err_cmd; 898 897 898 + mvq->fw_state = MLX5_VIRTIO_NET_Q_OBJECT_STATE_INIT; 899 899 kfree(in); 900 900 mvq->virtq_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id); 901 901 ··· 924 922 mlx5_vdpa_warn(&ndev->mvdev, "destroy virtqueue 0x%x\n", mvq->virtq_id); 925 923 return; 926 924 } 925 + mvq->fw_state = MLX5_VIRTIO_NET_Q_OBJECT_NONE; 927 926 umems_destroy(ndev, mvq); 928 927 } 929 928 ··· 1124 1121 return err; 1125 1122 } 1126 1123 1124 + static bool is_valid_state_change(int oldstate, int newstate) 1125 + { 1126 + switch (oldstate) { 1127 + case MLX5_VIRTIO_NET_Q_OBJECT_STATE_INIT: 1128 + return newstate == MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY; 1129 + case MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY: 1130 + return newstate == MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND; 1131 + case MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND: 1132 + case MLX5_VIRTIO_NET_Q_OBJECT_STATE_ERR: 1133 + default: 1134 + return false; 1135 + } 1136 + } 1137 + 1127 1138 static int modify_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int state) 1128 1139 { 1129 1140 int inlen = MLX5_ST_SZ_BYTES(modify_virtio_net_q_in); ··· 1146 1129 void *cmd_hdr; 1147 1130 void *in; 1148 1131 int err; 1132 + 1133 + if (mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_NONE) 1134 + return 0; 1135 + 1136 + if (!is_valid_state_change(mvq->fw_state, state)) 1137 + return -EINVAL; 1149 1138 1150 1139 in = kzalloc(inlen, GFP_KERNEL); 1151 1140 if (!in) ··· 1463 1440 headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, outer_headers); 1464 1441 dmac_c = MLX5_ADDR_OF(fte_match_param, headers_c, outer_headers.dmac_47_16); 1465 1442 dmac_v = MLX5_ADDR_OF(fte_match_param, headers_v, outer_headers.dmac_47_16); 1466 - memset(dmac_c, 0xff, ETH_ALEN); 1443 + 
eth_broadcast_addr(dmac_c); 1467 1444 ether_addr_copy(dmac_v, mac); 1468 1445 MLX5_SET(fte_match_set_lyr_2_4, headers_c, cvlan_tag, 1); 1469 1446 if (tagged) { ··· 2015 1992 struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); 2016 1993 struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); 2017 1994 struct mlx5_vdpa_virtqueue *mvq; 1995 + int err; 2018 1996 2019 1997 if (!mvdev->actual_features) 2020 1998 return; ··· 2029 2005 } 2030 2006 2031 2007 mvq = &ndev->vqs[idx]; 2032 - if (!ready) 2008 + if (!ready) { 2033 2009 suspend_vq(ndev, mvq); 2010 + } else { 2011 + err = modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY); 2012 + if (err) { 2013 + mlx5_vdpa_warn(mvdev, "modify VQ %d to ready failed (%d)\n", idx, err); 2014 + ready = false; 2015 + } 2016 + } 2017 + 2034 2018 2035 2019 mvq->ready = ready; 2036 2020 } ··· 2127 2095 return PAGE_SIZE; 2128 2096 } 2129 2097 2130 - static u32 mlx5_vdpa_get_vq_group(struct vdpa_device *vdpa, u16 idx) 2098 + static u32 mlx5_vdpa_get_vq_group(struct vdpa_device *vdev, u16 idx) 2131 2099 { 2132 - return 0; 2100 + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); 2101 + 2102 + if (is_ctrl_vq_idx(mvdev, idx)) 2103 + return MLX5_VDPA_CVQ_GROUP; 2104 + 2105 + return MLX5_VDPA_DATAVQ_GROUP; 2133 2106 } 2134 2107 2135 2108 enum { MLX5_VIRTIO_NET_F_GUEST_CSUM = 1 << 9, ··· 2548 2511 up_write(&ndev->reslock); 2549 2512 } 2550 2513 2514 + static void init_group_to_asid_map(struct mlx5_vdpa_dev *mvdev) 2515 + { 2516 + int i; 2517 + 2518 + /* default mapping all groups are mapped to asid 0 */ 2519 + for (i = 0; i < MLX5_VDPA_NUMVQ_GROUPS; i++) 2520 + mvdev->group2asid[i] = 0; 2521 + } 2522 + 2551 2523 static int mlx5_vdpa_reset(struct vdpa_device *vdev) 2552 2524 { 2553 2525 struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); ··· 2575 2529 ndev->mvdev.cvq.completed_desc = 0; 2576 2530 memset(ndev->event_cbs, 0, sizeof(*ndev->event_cbs) * (mvdev->max_vqs + 1)); 2577 2531 ndev->mvdev.actual_features = 0; 2532 + init_group_to_asid_map(mvdev); 
2578 2533 ++mvdev->generation; 2534 + 2579 2535 if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) { 2580 2536 if (mlx5_vdpa_create_mr(mvdev, NULL)) 2581 2537 mlx5_vdpa_warn(mvdev, "create MR failed\n"); ··· 2615 2567 return mvdev->generation; 2616 2568 } 2617 2569 2618 - static int mlx5_vdpa_set_map(struct vdpa_device *vdev, unsigned int asid, 2619 - struct vhost_iotlb *iotlb) 2570 + static int set_map_control(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb) 2620 2571 { 2621 - struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); 2622 - struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); 2572 + u64 start = 0ULL, last = 0ULL - 1; 2573 + struct vhost_iotlb_map *map; 2574 + int err = 0; 2575 + 2576 + spin_lock(&mvdev->cvq.iommu_lock); 2577 + vhost_iotlb_reset(mvdev->cvq.iotlb); 2578 + 2579 + for (map = vhost_iotlb_itree_first(iotlb, start, last); map; 2580 + map = vhost_iotlb_itree_next(map, start, last)) { 2581 + err = vhost_iotlb_add_range(mvdev->cvq.iotlb, map->start, 2582 + map->last, map->addr, map->perm); 2583 + if (err) 2584 + goto out; 2585 + } 2586 + 2587 + out: 2588 + spin_unlock(&mvdev->cvq.iommu_lock); 2589 + return err; 2590 + } 2591 + 2592 + static int set_map_data(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb) 2593 + { 2623 2594 bool change_map; 2624 2595 int err; 2625 - 2626 - down_write(&ndev->reslock); 2627 2596 2628 2597 err = mlx5_vdpa_handle_set_map(mvdev, iotlb, &change_map); 2629 2598 if (err) { 2630 2599 mlx5_vdpa_warn(mvdev, "set map failed(%d)\n", err); 2631 - goto err; 2600 + return err; 2632 2601 } 2633 2602 2634 2603 if (change_map) 2635 2604 err = mlx5_vdpa_change_map(mvdev, iotlb); 2636 2605 2637 - err: 2606 + return err; 2607 + } 2608 + 2609 + static int mlx5_vdpa_set_map(struct vdpa_device *vdev, unsigned int asid, 2610 + struct vhost_iotlb *iotlb) 2611 + { 2612 + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); 2613 + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); 2614 + int err = -EINVAL; 2615 + 2616 + 
down_write(&ndev->reslock); 2617 + if (mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP] == asid) { 2618 + err = set_map_data(mvdev, iotlb); 2619 + if (err) 2620 + goto out; 2621 + } 2622 + 2623 + if (mvdev->group2asid[MLX5_VDPA_CVQ_GROUP] == asid) 2624 + err = set_map_control(mvdev, iotlb); 2625 + 2626 + out: 2638 2627 up_write(&ndev->reslock); 2639 2628 return err; 2640 2629 } ··· 2818 2733 return err; 2819 2734 } 2820 2735 2736 + static void mlx5_vdpa_cvq_suspend(struct mlx5_vdpa_dev *mvdev) 2737 + { 2738 + struct mlx5_control_vq *cvq; 2739 + 2740 + if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ))) 2741 + return; 2742 + 2743 + cvq = &mvdev->cvq; 2744 + cvq->ready = false; 2745 + } 2746 + 2747 + static int mlx5_vdpa_suspend(struct vdpa_device *vdev) 2748 + { 2749 + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); 2750 + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); 2751 + struct mlx5_vdpa_virtqueue *mvq; 2752 + int i; 2753 + 2754 + down_write(&ndev->reslock); 2755 + mlx5_notifier_unregister(mvdev->mdev, &ndev->nb); 2756 + ndev->nb_registered = false; 2757 + flush_workqueue(ndev->mvdev.wq); 2758 + for (i = 0; i < ndev->cur_num_vqs; i++) { 2759 + mvq = &ndev->vqs[i]; 2760 + suspend_vq(ndev, mvq); 2761 + } 2762 + mlx5_vdpa_cvq_suspend(mvdev); 2763 + up_write(&ndev->reslock); 2764 + return 0; 2765 + } 2766 + 2767 + static int mlx5_set_group_asid(struct vdpa_device *vdev, u32 group, 2768 + unsigned int asid) 2769 + { 2770 + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); 2771 + 2772 + if (group >= MLX5_VDPA_NUMVQ_GROUPS) 2773 + return -EINVAL; 2774 + 2775 + mvdev->group2asid[group] = asid; 2776 + return 0; 2777 + } 2778 + 2821 2779 static const struct vdpa_config_ops mlx5_vdpa_ops = { 2822 2780 .set_vq_address = mlx5_vdpa_set_vq_address, 2823 2781 .set_vq_num = mlx5_vdpa_set_vq_num, ··· 2890 2762 .set_config = mlx5_vdpa_set_config, 2891 2763 .get_generation = mlx5_vdpa_get_generation, 2892 2764 .set_map = mlx5_vdpa_set_map, 2765 + .set_group_asid = 
mlx5_set_group_asid, 2893 2766 .free = mlx5_vdpa_free, 2767 + .suspend = mlx5_vdpa_suspend, 2894 2768 }; 2895 2769 2896 2770 static int query_mtu(struct mlx5_core_dev *mdev, u16 *mtu) ··· 2958 2828 mvq->index = i; 2959 2829 mvq->ndev = ndev; 2960 2830 mvq->fwqp.fw = true; 2831 + mvq->fw_state = MLX5_VIRTIO_NET_Q_OBJECT_NONE; 2961 2832 } 2962 2833 for (; i < ndev->mvdev.max_vqs; i++) { 2963 2834 mvq = &ndev->vqs[i]; ··· 3033 2902 switch (eqe->sub_type) { 3034 2903 case MLX5_PORT_CHANGE_SUBTYPE_DOWN: 3035 2904 case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE: 3036 - wqent = kzalloc(sizeof(*wqent), GFP_ATOMIC); 3037 - if (!wqent) 2905 + down_read(&ndev->reslock); 2906 + if (!ndev->nb_registered) { 2907 + up_read(&ndev->reslock); 3038 2908 return NOTIFY_DONE; 2909 + } 2910 + wqent = kzalloc(sizeof(*wqent), GFP_ATOMIC); 2911 + if (!wqent) { 2912 + up_read(&ndev->reslock); 2913 + return NOTIFY_DONE; 2914 + } 3039 2915 3040 2916 wqent->mvdev = &ndev->mvdev; 3041 2917 INIT_WORK(&wqent->work, update_carrier); 3042 2918 queue_work(ndev->mvdev.wq, &wqent->work); 2919 + up_read(&ndev->reslock); 3043 2920 ret = NOTIFY_OK; 3044 2921 break; 3045 2922 default: ··· 3121 2982 } 3122 2983 3123 2984 ndev = vdpa_alloc_device(struct mlx5_vdpa_net, mvdev.vdev, mdev->device, &mlx5_vdpa_ops, 3124 - 1, 1, name, false); 2985 + MLX5_VDPA_NUMVQ_GROUPS, MLX5_VDPA_NUM_AS, name, false); 3125 2986 if (IS_ERR(ndev)) 3126 2987 return PTR_ERR(ndev); 3127 2988 ··· 3201 3062 3202 3063 ndev->nb.notifier_call = event_handler; 3203 3064 mlx5_notifier_register(mdev, &ndev->nb); 3065 + ndev->nb_registered = true; 3204 3066 mvdev->vdev.mdev = &mgtdev->mgtdev; 3205 3067 err = _vdpa_register_device(&mvdev->vdev, max_vqs + 1); 3206 3068 if (err) ··· 3233 3093 struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); 3234 3094 struct workqueue_struct *wq; 3235 3095 3236 - mlx5_notifier_unregister(mvdev->mdev, &ndev->nb); 3096 + if (ndev->nb_registered) { 3097 + mlx5_notifier_unregister(mvdev->mdev, &ndev->nb); 3098 + 
ndev->nb_registered = false; 3099 + } 3237 3100 wq = mvdev->wq; 3238 3101 mvdev->wq = NULL; 3239 3102 destroy_workqueue(wq);
+3 -11
drivers/vdpa/vdpa.c
··· 824 824 config.mac)) 825 825 return -EMSGSIZE; 826 826 827 - val_u16 = le16_to_cpu(config.status); 827 + val_u16 = __virtio16_to_cpu(true, config.status); 828 828 if (nla_put_u16(msg, VDPA_ATTR_DEV_NET_STATUS, val_u16)) 829 829 return -EMSGSIZE; 830 830 831 - val_u16 = le16_to_cpu(config.mtu); 831 + val_u16 = __virtio16_to_cpu(true, config.mtu); 832 832 if (nla_put_u16(msg, VDPA_ATTR_DEV_NET_CFG_MTU, val_u16)) 833 833 return -EMSGSIZE; 834 834 ··· 846 846 { 847 847 u32 device_id; 848 848 void *hdr; 849 - u8 status; 850 849 int err; 851 850 852 851 down_read(&vdev->cf_lock); 853 - status = vdev->config->get_status(vdev); 854 - if (!(status & VIRTIO_CONFIG_S_FEATURES_OK)) { 855 - NL_SET_ERR_MSG_MOD(extack, "Features negotiation not completed"); 856 - err = -EAGAIN; 857 - goto out; 858 - } 859 - 860 852 hdr = genlmsg_put(msg, portid, seq, &vdpa_nl_family, flags, 861 853 VDPA_CMD_DEV_CONFIG_GET); 862 854 if (!hdr) { ··· 905 913 } 906 914 vdpa_get_config_unlocked(vdev, 0, &config, sizeof(config)); 907 915 908 - max_vqp = le16_to_cpu(config.max_virtqueue_pairs); 916 + max_vqp = __virtio16_to_cpu(true, config.max_virtqueue_pairs); 909 917 if (nla_put_u16(msg, VDPA_ATTR_DEV_NET_CFG_MAX_VQP, max_vqp)) 910 918 return -EMSGSIZE; 911 919
+16 -2
drivers/vdpa/vdpa_sim/vdpa_sim.c
··· 33 33 static int max_iotlb_entries = 2048; 34 34 module_param(max_iotlb_entries, int, 0444); 35 35 MODULE_PARM_DESC(max_iotlb_entries, 36 - "Maximum number of iotlb entries. 0 means unlimited. (default: 2048)"); 36 + "Maximum number of iotlb entries for each address space. 0 means unlimited. (default: 2048)"); 37 37 38 38 #define VDPASIM_QUEUE_ALIGN PAGE_SIZE 39 39 #define VDPASIM_QUEUE_MAX 256 ··· 107 107 for (i = 0; i < vdpasim->dev_attr.nas; i++) 108 108 vhost_iotlb_reset(&vdpasim->iommu[i]); 109 109 110 + vdpasim->running = true; 110 111 spin_unlock(&vdpasim->iommu_lock); 111 112 112 113 vdpasim->features = 0; ··· 292 291 goto err_iommu; 293 292 294 293 for (i = 0; i < vdpasim->dev_attr.nas; i++) 295 - vhost_iotlb_init(&vdpasim->iommu[i], 0, 0); 294 + vhost_iotlb_init(&vdpasim->iommu[i], max_iotlb_entries, 0); 296 295 297 296 vdpasim->buffer = kvmalloc(dev_attr->buffer_size, GFP_KERNEL); 298 297 if (!vdpasim->buffer) ··· 506 505 return 0; 507 506 } 508 507 508 + static int vdpasim_suspend(struct vdpa_device *vdpa) 509 + { 510 + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); 511 + 512 + spin_lock(&vdpasim->lock); 513 + vdpasim->running = false; 514 + spin_unlock(&vdpasim->lock); 515 + 516 + return 0; 517 + } 518 + 509 519 static size_t vdpasim_get_config_size(struct vdpa_device *vdpa) 510 520 { 511 521 struct vdpasim *vdpasim = vdpa_to_sim(vdpa); ··· 706 694 .get_status = vdpasim_get_status, 707 695 .set_status = vdpasim_set_status, 708 696 .reset = vdpasim_reset, 697 + .suspend = vdpasim_suspend, 709 698 .get_config_size = vdpasim_get_config_size, 710 699 .get_config = vdpasim_get_config, 711 700 .set_config = vdpasim_set_config, ··· 739 726 .get_status = vdpasim_get_status, 740 727 .set_status = vdpasim_set_status, 741 728 .reset = vdpasim_reset, 729 + .suspend = vdpasim_suspend, 742 730 .get_config_size = vdpasim_get_config_size, 743 731 .get_config = vdpasim_get_config, 744 732 .set_config = vdpasim_set_config,
+1
drivers/vdpa/vdpa_sim/vdpa_sim.h
··· 66 66 u32 generation; 67 67 u64 features; 68 68 u32 groups; 69 + bool running; 69 70 /* spinlock to synchronize iommu table */ 70 71 spinlock_t iommu_lock; 71 72 };
+147 -29
drivers/vdpa/vdpa_sim/vdpa_sim_blk.c
··· 25 25 #define DRV_LICENSE "GPL v2" 26 26 27 27 #define VDPASIM_BLK_FEATURES (VDPASIM_FEATURES | \ 28 + (1ULL << VIRTIO_BLK_F_FLUSH) | \ 28 29 (1ULL << VIRTIO_BLK_F_SIZE_MAX) | \ 29 30 (1ULL << VIRTIO_BLK_F_SEG_MAX) | \ 30 31 (1ULL << VIRTIO_BLK_F_BLK_SIZE) | \ 31 32 (1ULL << VIRTIO_BLK_F_TOPOLOGY) | \ 32 - (1ULL << VIRTIO_BLK_F_MQ)) 33 + (1ULL << VIRTIO_BLK_F_MQ) | \ 34 + (1ULL << VIRTIO_BLK_F_DISCARD) | \ 35 + (1ULL << VIRTIO_BLK_F_WRITE_ZEROES)) 33 36 34 37 #define VDPASIM_BLK_CAPACITY 0x40000 35 38 #define VDPASIM_BLK_SIZE_MAX 0x1000 36 39 #define VDPASIM_BLK_SEG_MAX 32 40 + #define VDPASIM_BLK_DWZ_MAX_SECTORS UINT_MAX 41 + 42 + /* 1 virtqueue, 1 address space, 1 virtqueue group */ 37 43 #define VDPASIM_BLK_VQ_NUM 1 44 + #define VDPASIM_BLK_AS_NUM 1 45 + #define VDPASIM_BLK_GROUP_NUM 1 38 46 39 47 static char vdpasim_blk_id[VIRTIO_BLK_ID_BYTES] = "vdpa_blk_sim"; 40 48 41 - static bool vdpasim_blk_check_range(u64 start_sector, size_t range_size) 49 + static bool vdpasim_blk_check_range(struct vdpasim *vdpasim, u64 start_sector, 50 + u64 num_sectors, u64 max_sectors) 42 51 { 43 - u64 range_sectors = range_size >> SECTOR_SHIFT; 52 + if (start_sector > VDPASIM_BLK_CAPACITY) { 53 + dev_dbg(&vdpasim->vdpa.dev, 54 + "starting sector exceeds the capacity - start: 0x%llx capacity: 0x%x\n", 55 + start_sector, VDPASIM_BLK_CAPACITY); 56 + } 44 57 45 - if (range_size > VDPASIM_BLK_SIZE_MAX * VDPASIM_BLK_SEG_MAX) 58 + if (num_sectors > max_sectors) { 59 + dev_dbg(&vdpasim->vdpa.dev, 60 + "number of sectors exceeds the max allowed in a request - num: 0x%llx max: 0x%llx\n", 61 + num_sectors, max_sectors); 46 62 return false; 63 + } 47 64 48 - if (start_sector > VDPASIM_BLK_CAPACITY) 65 + if (num_sectors > VDPASIM_BLK_CAPACITY - start_sector) { 66 + dev_dbg(&vdpasim->vdpa.dev, 67 + "request exceeds the capacity - start: 0x%llx num: 0x%llx capacity: 0x%x\n", 68 + start_sector, num_sectors, VDPASIM_BLK_CAPACITY); 49 69 return false; 50 - 51 - if (range_sectors > 
VDPASIM_BLK_CAPACITY - start_sector) 52 - return false; 70 + } 53 71 54 72 return true; 55 73 } ··· 81 63 { 82 64 size_t pushed = 0, to_pull, to_push; 83 65 struct virtio_blk_outhdr hdr; 66 + bool handled = false; 84 67 ssize_t bytes; 85 68 loff_t offset; 86 69 u64 sector; ··· 95 76 return false; 96 77 97 78 if (vq->out_iov.used < 1 || vq->in_iov.used < 1) { 98 - dev_err(&vdpasim->vdpa.dev, "missing headers - out_iov: %u in_iov %u\n", 79 + dev_dbg(&vdpasim->vdpa.dev, "missing headers - out_iov: %u in_iov %u\n", 99 80 vq->out_iov.used, vq->in_iov.used); 100 - return false; 81 + goto err; 101 82 } 102 83 103 84 if (vq->in_iov.iov[vq->in_iov.used - 1].iov_len < 1) { 104 - dev_err(&vdpasim->vdpa.dev, "request in header too short\n"); 105 - return false; 85 + dev_dbg(&vdpasim->vdpa.dev, "request in header too short\n"); 86 + goto err; 106 87 } 107 88 108 89 /* The last byte is the status and we checked if the last iov has ··· 115 96 bytes = vringh_iov_pull_iotlb(&vq->vring, &vq->out_iov, &hdr, 116 97 sizeof(hdr)); 117 98 if (bytes != sizeof(hdr)) { 118 - dev_err(&vdpasim->vdpa.dev, "request out header too short\n"); 119 - return false; 99 + dev_dbg(&vdpasim->vdpa.dev, "request out header too short\n"); 100 + goto err; 120 101 } 121 102 122 103 to_pull -= bytes; ··· 126 107 offset = sector << SECTOR_SHIFT; 127 108 status = VIRTIO_BLK_S_OK; 128 109 110 + if (type != VIRTIO_BLK_T_IN && type != VIRTIO_BLK_T_OUT && 111 + sector != 0) { 112 + dev_dbg(&vdpasim->vdpa.dev, 113 + "sector must be 0 for %u request - sector: 0x%llx\n", 114 + type, sector); 115 + status = VIRTIO_BLK_S_IOERR; 116 + goto err_status; 117 + } 118 + 129 119 switch (type) { 130 120 case VIRTIO_BLK_T_IN: 131 - if (!vdpasim_blk_check_range(sector, to_push)) { 132 - dev_err(&vdpasim->vdpa.dev, 133 - "reading over the capacity - offset: 0x%llx len: 0x%zx\n", 134 - offset, to_push); 121 + if (!vdpasim_blk_check_range(vdpasim, sector, 122 + to_push >> SECTOR_SHIFT, 123 + VDPASIM_BLK_SIZE_MAX * 
VDPASIM_BLK_SEG_MAX)) { 135 124 status = VIRTIO_BLK_S_IOERR; 136 125 break; 137 126 } ··· 148 121 vdpasim->buffer + offset, 149 122 to_push); 150 123 if (bytes < 0) { 151 - dev_err(&vdpasim->vdpa.dev, 124 + dev_dbg(&vdpasim->vdpa.dev, 152 125 "vringh_iov_push_iotlb() error: %zd offset: 0x%llx len: 0x%zx\n", 153 126 bytes, offset, to_push); 154 127 status = VIRTIO_BLK_S_IOERR; ··· 159 132 break; 160 133 161 134 case VIRTIO_BLK_T_OUT: 162 - if (!vdpasim_blk_check_range(sector, to_pull)) { 163 - dev_err(&vdpasim->vdpa.dev, 164 - "writing over the capacity - offset: 0x%llx len: 0x%zx\n", 165 - offset, to_pull); 135 + if (!vdpasim_blk_check_range(vdpasim, sector, 136 + to_pull >> SECTOR_SHIFT, 137 + VDPASIM_BLK_SIZE_MAX * VDPASIM_BLK_SEG_MAX)) { 166 138 status = VIRTIO_BLK_S_IOERR; 167 139 break; 168 140 } ··· 170 144 vdpasim->buffer + offset, 171 145 to_pull); 172 146 if (bytes < 0) { 173 - dev_err(&vdpasim->vdpa.dev, 147 + dev_dbg(&vdpasim->vdpa.dev, 174 148 "vringh_iov_pull_iotlb() error: %zd offset: 0x%llx len: 0x%zx\n", 175 149 bytes, offset, to_pull); 176 150 status = VIRTIO_BLK_S_IOERR; ··· 183 157 vdpasim_blk_id, 184 158 VIRTIO_BLK_ID_BYTES); 185 159 if (bytes < 0) { 186 - dev_err(&vdpasim->vdpa.dev, 160 + dev_dbg(&vdpasim->vdpa.dev, 187 161 "vringh_iov_push_iotlb() error: %zd\n", bytes); 188 162 status = VIRTIO_BLK_S_IOERR; 189 163 break; ··· 192 166 pushed += bytes; 193 167 break; 194 168 169 + case VIRTIO_BLK_T_FLUSH: 170 + /* nothing to do */ 171 + break; 172 + 173 + case VIRTIO_BLK_T_DISCARD: 174 + case VIRTIO_BLK_T_WRITE_ZEROES: { 175 + struct virtio_blk_discard_write_zeroes range; 176 + u32 num_sectors, flags; 177 + 178 + if (to_pull != sizeof(range)) { 179 + dev_dbg(&vdpasim->vdpa.dev, 180 + "discard/write_zeroes header len: 0x%zx [expected: 0x%zx]\n", 181 + to_pull, sizeof(range)); 182 + status = VIRTIO_BLK_S_IOERR; 183 + break; 184 + } 185 + 186 + bytes = vringh_iov_pull_iotlb(&vq->vring, &vq->out_iov, &range, 187 + to_pull); 188 + if (bytes < 0) { 189 
+ dev_dbg(&vdpasim->vdpa.dev, 190 + "vringh_iov_pull_iotlb() error: %zd offset: 0x%llx len: 0x%zx\n", 191 + bytes, offset, to_pull); 192 + status = VIRTIO_BLK_S_IOERR; 193 + break; 194 + } 195 + 196 + sector = le64_to_cpu(range.sector); 197 + offset = sector << SECTOR_SHIFT; 198 + num_sectors = le32_to_cpu(range.num_sectors); 199 + flags = le32_to_cpu(range.flags); 200 + 201 + if (type == VIRTIO_BLK_T_DISCARD && flags != 0) { 202 + dev_dbg(&vdpasim->vdpa.dev, 203 + "discard unexpected flags set - flags: 0x%x\n", 204 + flags); 205 + status = VIRTIO_BLK_S_UNSUPP; 206 + break; 207 + } 208 + 209 + if (type == VIRTIO_BLK_T_WRITE_ZEROES && 210 + flags & ~VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) { 211 + dev_dbg(&vdpasim->vdpa.dev, 212 + "write_zeroes unexpected flags set - flags: 0x%x\n", 213 + flags); 214 + status = VIRTIO_BLK_S_UNSUPP; 215 + break; 216 + } 217 + 218 + if (!vdpasim_blk_check_range(vdpasim, sector, num_sectors, 219 + VDPASIM_BLK_DWZ_MAX_SECTORS)) { 220 + status = VIRTIO_BLK_S_IOERR; 221 + break; 222 + } 223 + 224 + if (type == VIRTIO_BLK_T_WRITE_ZEROES) { 225 + memset(vdpasim->buffer + offset, 0, 226 + num_sectors << SECTOR_SHIFT); 227 + } 228 + 229 + break; 230 + } 195 231 default: 196 - dev_warn(&vdpasim->vdpa.dev, 197 - "Unsupported request type %d\n", type); 232 + dev_dbg(&vdpasim->vdpa.dev, 233 + "Unsupported request type %d\n", type); 198 234 status = VIRTIO_BLK_S_IOERR; 199 235 break; 200 236 } 201 237 238 + err_status: 202 239 /* If some operations fail, we need to skip the remaining bytes 203 240 * to put the status in the last byte 204 241 */ ··· 271 182 /* Last byte is the status */ 272 183 bytes = vringh_iov_push_iotlb(&vq->vring, &vq->in_iov, &status, 1); 273 184 if (bytes != 1) 274 - return false; 185 + goto err; 275 186 276 187 pushed += bytes; 277 188 278 189 /* Make sure data is wrote before advancing index */ 279 190 smp_wmb(); 280 191 192 + handled = true; 193 + 194 + err: 281 195 vringh_complete_iotlb(&vq->vring, vq->head, pushed); 282 196 
283 - return true; 197 + return handled; 284 198 } 285 199 286 200 static void vdpasim_blk_work(struct work_struct *work) 287 201 { 288 202 struct vdpasim *vdpasim = container_of(work, struct vdpasim, work); 203 + bool reschedule = false; 289 204 int i; 290 205 291 206 spin_lock(&vdpasim->lock); ··· 297 204 if (!(vdpasim->status & VIRTIO_CONFIG_S_DRIVER_OK)) 298 205 goto out; 299 206 207 + if (!vdpasim->running) 208 + goto out; 209 + 300 210 for (i = 0; i < VDPASIM_BLK_VQ_NUM; i++) { 301 211 struct vdpasim_virtqueue *vq = &vdpasim->vqs[i]; 212 + int reqs = 0; 302 213 303 214 if (!vq->ready) 304 215 continue; ··· 315 218 if (vringh_need_notify_iotlb(&vq->vring) > 0) 316 219 vringh_notify(&vq->vring); 317 220 local_bh_enable(); 221 + 222 + if (++reqs > 4) { 223 + reschedule = true; 224 + break; 225 + } 318 226 } 319 227 } 320 228 out: 321 229 spin_unlock(&vdpasim->lock); 230 + 231 + if (reschedule) 232 + schedule_work(&vdpasim->work); 322 233 } 323 234 324 235 static void vdpasim_blk_get_config(struct vdpasim *vdpasim, void *config) ··· 342 237 blk_config->min_io_size = cpu_to_vdpasim16(vdpasim, 1); 343 238 blk_config->opt_io_size = cpu_to_vdpasim32(vdpasim, 1); 344 239 blk_config->blk_size = cpu_to_vdpasim32(vdpasim, SECTOR_SIZE); 240 + /* VIRTIO_BLK_F_DISCARD */ 241 + blk_config->discard_sector_alignment = 242 + cpu_to_vdpasim32(vdpasim, SECTOR_SIZE); 243 + blk_config->max_discard_sectors = 244 + cpu_to_vdpasim32(vdpasim, VDPASIM_BLK_DWZ_MAX_SECTORS); 245 + blk_config->max_discard_seg = cpu_to_vdpasim32(vdpasim, 1); 246 + /* VIRTIO_BLK_F_WRITE_ZEROES */ 247 + blk_config->max_write_zeroes_sectors = 248 + cpu_to_vdpasim32(vdpasim, VDPASIM_BLK_DWZ_MAX_SECTORS); 249 + blk_config->max_write_zeroes_seg = cpu_to_vdpasim32(vdpasim, 1); 250 + 345 251 } 346 252 347 253 static void vdpasim_blk_mgmtdev_release(struct device *dev) ··· 376 260 dev_attr.id = VIRTIO_ID_BLOCK; 377 261 dev_attr.supported_features = VDPASIM_BLK_FEATURES; 378 262 dev_attr.nvqs = VDPASIM_BLK_VQ_NUM; 
263 + dev_attr.ngroups = VDPASIM_BLK_GROUP_NUM; 264 + dev_attr.nas = VDPASIM_BLK_AS_NUM; 379 265 dev_attr.config_size = sizeof(struct virtio_blk_config); 380 266 dev_attr.get_config = vdpasim_blk_get_config; 381 267 dev_attr.work_fn = vdpasim_blk_work;
+3
drivers/vdpa/vdpa_sim/vdpa_sim_net.c
··· 154 154 155 155 spin_lock(&vdpasim->lock); 156 156 157 + if (!vdpasim->running) 158 + goto out; 159 + 157 160 if (!(vdpasim->status & VIRTIO_CONFIG_S_DRIVER_OK)) 158 161 goto out; 159 162
+89 -13
drivers/vdpa/vdpa_user/iova_domain.c
··· 138 138 { 139 139 unsigned long pfn = PFN_DOWN(orig); 140 140 unsigned int offset = offset_in_page(orig); 141 - char *buffer; 141 + struct page *page; 142 142 unsigned int sz = 0; 143 143 144 144 while (size) { 145 145 sz = min_t(size_t, PAGE_SIZE - offset, size); 146 146 147 - buffer = kmap_atomic(pfn_to_page(pfn)); 147 + page = pfn_to_page(pfn); 148 148 if (dir == DMA_TO_DEVICE) 149 - memcpy(addr, buffer + offset, sz); 149 + memcpy_from_page(addr, page, offset, sz); 150 150 else 151 - memcpy(buffer + offset, addr, sz); 152 - kunmap_atomic(buffer); 151 + memcpy_to_page(page, offset, addr, sz); 153 152 154 153 size -= sz; 155 154 pfn++; ··· 178 179 map->orig_phys == INVALID_PHYS_ADDR)) 179 180 return; 180 181 181 - addr = page_address(map->bounce_page) + offset; 182 - do_bounce(map->orig_phys + offset, addr, sz, dir); 182 + addr = kmap_local_page(map->bounce_page); 183 + do_bounce(map->orig_phys + offset, addr + offset, sz, dir); 184 + kunmap_local(addr); 183 185 size -= sz; 184 186 iova += sz; 185 187 } ··· 213 213 struct vduse_bounce_map *map; 214 214 struct page *page = NULL; 215 215 216 - spin_lock(&domain->iotlb_lock); 216 + read_lock(&domain->bounce_lock); 217 217 map = &domain->bounce_maps[iova >> PAGE_SHIFT]; 218 - if (!map->bounce_page) 218 + if (domain->user_bounce_pages || !map->bounce_page) 219 219 goto out; 220 220 221 221 page = map->bounce_page; 222 222 get_page(page); 223 223 out: 224 - spin_unlock(&domain->iotlb_lock); 224 + read_unlock(&domain->bounce_lock); 225 225 226 226 return page; 227 227 } 228 228 229 229 static void 230 - vduse_domain_free_bounce_pages(struct vduse_iova_domain *domain) 230 + vduse_domain_free_kernel_bounce_pages(struct vduse_iova_domain *domain) 231 231 { 232 232 struct vduse_bounce_map *map; 233 233 unsigned long pfn, bounce_pfns; ··· 245 245 __free_page(map->bounce_page); 246 246 map->bounce_page = NULL; 247 247 } 248 + } 249 + 250 + int vduse_domain_add_user_bounce_pages(struct vduse_iova_domain *domain, 251 + 
struct page **pages, int count) 252 + { 253 + struct vduse_bounce_map *map; 254 + int i, ret; 255 + 256 + /* Now we don't support partial mapping */ 257 + if (count != (domain->bounce_size >> PAGE_SHIFT)) 258 + return -EINVAL; 259 + 260 + write_lock(&domain->bounce_lock); 261 + ret = -EEXIST; 262 + if (domain->user_bounce_pages) 263 + goto out; 264 + 265 + for (i = 0; i < count; i++) { 266 + map = &domain->bounce_maps[i]; 267 + if (map->bounce_page) { 268 + /* Copy kernel page to user page if it's in use */ 269 + if (map->orig_phys != INVALID_PHYS_ADDR) 270 + memcpy_to_page(pages[i], 0, 271 + page_address(map->bounce_page), 272 + PAGE_SIZE); 273 + __free_page(map->bounce_page); 274 + } 275 + map->bounce_page = pages[i]; 276 + get_page(pages[i]); 277 + } 278 + domain->user_bounce_pages = true; 279 + ret = 0; 280 + out: 281 + write_unlock(&domain->bounce_lock); 282 + 283 + return ret; 284 + } 285 + 286 + void vduse_domain_remove_user_bounce_pages(struct vduse_iova_domain *domain) 287 + { 288 + struct vduse_bounce_map *map; 289 + unsigned long i, count; 290 + 291 + write_lock(&domain->bounce_lock); 292 + if (!domain->user_bounce_pages) 293 + goto out; 294 + 295 + count = domain->bounce_size >> PAGE_SHIFT; 296 + for (i = 0; i < count; i++) { 297 + struct page *page = NULL; 298 + 299 + map = &domain->bounce_maps[i]; 300 + if (WARN_ON(!map->bounce_page)) 301 + continue; 302 + 303 + /* Copy user page to kernel page if it's in use */ 304 + if (map->orig_phys != INVALID_PHYS_ADDR) { 305 + page = alloc_page(GFP_ATOMIC | __GFP_NOFAIL); 306 + memcpy_from_page(page_address(page), 307 + map->bounce_page, 0, PAGE_SIZE); 308 + } 309 + put_page(map->bounce_page); 310 + map->bounce_page = page; 311 + } 312 + domain->user_bounce_pages = false; 313 + out: 314 + write_unlock(&domain->bounce_lock); 248 315 } 249 316 250 317 void vduse_domain_reset_bounce_map(struct vduse_iova_domain *domain) ··· 389 322 if (vduse_domain_init_bounce_map(domain)) 390 323 goto err; 391 324 325 + 
read_lock(&domain->bounce_lock); 392 326 if (vduse_domain_map_bounce_page(domain, (u64)iova, (u64)size, pa)) 393 - goto err; 327 + goto err_unlock; 394 328 395 329 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) 396 330 vduse_domain_bounce(domain, iova, size, DMA_TO_DEVICE); 397 331 332 + read_unlock(&domain->bounce_lock); 333 + 398 334 return iova; 335 + err_unlock: 336 + read_unlock(&domain->bounce_lock); 399 337 err: 400 338 vduse_domain_free_iova(iovad, iova, size); 401 339 return DMA_MAPPING_ERROR; ··· 412 340 { 413 341 struct iova_domain *iovad = &domain->stream_iovad; 414 342 343 + read_lock(&domain->bounce_lock); 415 344 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) 416 345 vduse_domain_bounce(domain, dma_addr, size, DMA_FROM_DEVICE); 417 346 418 347 vduse_domain_unmap_bounce_page(domain, (u64)dma_addr, (u64)size); 348 + read_unlock(&domain->bounce_lock); 419 349 vduse_domain_free_iova(iovad, dma_addr, size); 420 350 } 421 351 ··· 525 451 526 452 spin_lock(&domain->iotlb_lock); 527 453 vduse_iotlb_del_range(domain, 0, ULLONG_MAX); 528 - vduse_domain_free_bounce_pages(domain); 454 + vduse_domain_remove_user_bounce_pages(domain); 455 + vduse_domain_free_kernel_bounce_pages(domain); 529 456 spin_unlock(&domain->iotlb_lock); 530 457 put_iova_domain(&domain->stream_iovad); 531 458 put_iova_domain(&domain->consistent_iovad); ··· 586 511 goto err_file; 587 512 588 513 domain->file = file; 514 + rwlock_init(&domain->bounce_lock); 589 515 spin_lock_init(&domain->iotlb_lock); 590 516 init_iova_domain(&domain->stream_iovad, 591 517 PAGE_SIZE, IOVA_START_PFN);
+8
drivers/vdpa/vdpa_user/iova_domain.h
··· 14 14 #include <linux/iova.h> 15 15 #include <linux/dma-mapping.h> 16 16 #include <linux/vhost_iotlb.h> 17 + #include <linux/rwlock.h> 17 18 18 19 #define IOVA_START_PFN 1 19 20 ··· 35 34 struct vhost_iotlb *iotlb; 36 35 spinlock_t iotlb_lock; 37 36 struct file *file; 37 + bool user_bounce_pages; 38 + rwlock_t bounce_lock; 38 39 }; 39 40 40 41 int vduse_domain_set_map(struct vduse_iova_domain *domain, ··· 63 60 unsigned long attrs); 64 61 65 62 void vduse_domain_reset_bounce_map(struct vduse_iova_domain *domain); 63 + 64 + int vduse_domain_add_user_bounce_pages(struct vduse_iova_domain *domain, 65 + struct page **pages, int count); 66 + 67 + void vduse_domain_remove_user_bounce_pages(struct vduse_iova_domain *domain); 66 68 67 69 void vduse_domain_destroy(struct vduse_iova_domain *domain); 68 70
+180
drivers/vdpa/vdpa_user/vduse_dev.c
··· 21 21 #include <linux/uio.h> 22 22 #include <linux/vdpa.h> 23 23 #include <linux/nospec.h> 24 + #include <linux/vmalloc.h> 25 + #include <linux/sched/mm.h> 24 26 #include <uapi/linux/vduse.h> 25 27 #include <uapi/linux/vdpa.h> 26 28 #include <uapi/linux/virtio_config.h> ··· 66 64 struct vduse_dev *dev; 67 65 }; 68 66 67 + struct vduse_umem { 68 + unsigned long iova; 69 + unsigned long npages; 70 + struct page **pages; 71 + struct mm_struct *mm; 72 + }; 73 + 69 74 struct vduse_dev { 70 75 struct vduse_vdpa *vdev; 71 76 struct device *dev; ··· 104 95 u8 status; 105 96 u32 vq_num; 106 97 u32 vq_align; 98 + struct vduse_umem *umem; 99 + struct mutex mem_lock; 107 100 }; 108 101 109 102 struct vduse_dev_msg { ··· 928 917 return ret; 929 918 } 930 919 920 + static int vduse_dev_dereg_umem(struct vduse_dev *dev, 921 + u64 iova, u64 size) 922 + { 923 + int ret; 924 + 925 + mutex_lock(&dev->mem_lock); 926 + ret = -ENOENT; 927 + if (!dev->umem) 928 + goto unlock; 929 + 930 + ret = -EINVAL; 931 + if (dev->umem->iova != iova || size != dev->domain->bounce_size) 932 + goto unlock; 933 + 934 + vduse_domain_remove_user_bounce_pages(dev->domain); 935 + unpin_user_pages_dirty_lock(dev->umem->pages, 936 + dev->umem->npages, true); 937 + atomic64_sub(dev->umem->npages, &dev->umem->mm->pinned_vm); 938 + mmdrop(dev->umem->mm); 939 + vfree(dev->umem->pages); 940 + kfree(dev->umem); 941 + dev->umem = NULL; 942 + ret = 0; 943 + unlock: 944 + mutex_unlock(&dev->mem_lock); 945 + return ret; 946 + } 947 + 948 + static int vduse_dev_reg_umem(struct vduse_dev *dev, 949 + u64 iova, u64 uaddr, u64 size) 950 + { 951 + struct page **page_list = NULL; 952 + struct vduse_umem *umem = NULL; 953 + long pinned = 0; 954 + unsigned long npages, lock_limit; 955 + int ret; 956 + 957 + if (!dev->domain->bounce_map || 958 + size != dev->domain->bounce_size || 959 + iova != 0 || uaddr & ~PAGE_MASK) 960 + return -EINVAL; 961 + 962 + mutex_lock(&dev->mem_lock); 963 + ret = -EEXIST; 964 + if (dev->umem) 965 
+ goto unlock; 966 + 967 + ret = -ENOMEM; 968 + npages = size >> PAGE_SHIFT; 969 + page_list = __vmalloc(array_size(npages, sizeof(struct page *)), 970 + GFP_KERNEL_ACCOUNT); 971 + umem = kzalloc(sizeof(*umem), GFP_KERNEL); 972 + if (!page_list || !umem) 973 + goto unlock; 974 + 975 + mmap_read_lock(current->mm); 976 + 977 + lock_limit = PFN_DOWN(rlimit(RLIMIT_MEMLOCK)); 978 + if (npages + atomic64_read(&current->mm->pinned_vm) > lock_limit) 979 + goto out; 980 + 981 + pinned = pin_user_pages(uaddr, npages, FOLL_LONGTERM | FOLL_WRITE, 982 + page_list, NULL); 983 + if (pinned != npages) { 984 + ret = pinned < 0 ? pinned : -ENOMEM; 985 + goto out; 986 + } 987 + 988 + ret = vduse_domain_add_user_bounce_pages(dev->domain, 989 + page_list, pinned); 990 + if (ret) 991 + goto out; 992 + 993 + atomic64_add(npages, &current->mm->pinned_vm); 994 + 995 + umem->pages = page_list; 996 + umem->npages = pinned; 997 + umem->iova = iova; 998 + umem->mm = current->mm; 999 + mmgrab(current->mm); 1000 + 1001 + dev->umem = umem; 1002 + out: 1003 + if (ret && pinned > 0) 1004 + unpin_user_pages(page_list, pinned); 1005 + 1006 + mmap_read_unlock(current->mm); 1007 + unlock: 1008 + if (ret) { 1009 + vfree(page_list); 1010 + kfree(umem); 1011 + } 1012 + mutex_unlock(&dev->mem_lock); 1013 + return ret; 1014 + } 1015 + 931 1016 static long vduse_dev_ioctl(struct file *file, unsigned int cmd, 932 1017 unsigned long arg) 933 1018 { ··· 1196 1089 ret = vduse_dev_queue_irq_work(dev, &dev->vqs[index].inject); 1197 1090 break; 1198 1091 } 1092 + case VDUSE_IOTLB_REG_UMEM: { 1093 + struct vduse_iova_umem umem; 1094 + 1095 + ret = -EFAULT; 1096 + if (copy_from_user(&umem, argp, sizeof(umem))) 1097 + break; 1098 + 1099 + ret = -EINVAL; 1100 + if (!is_mem_zero((const char *)umem.reserved, 1101 + sizeof(umem.reserved))) 1102 + break; 1103 + 1104 + ret = vduse_dev_reg_umem(dev, umem.iova, 1105 + umem.uaddr, umem.size); 1106 + break; 1107 + } 1108 + case VDUSE_IOTLB_DEREG_UMEM: { 1109 + struct 
vduse_iova_umem umem; 1110 + 1111 + ret = -EFAULT; 1112 + if (copy_from_user(&umem, argp, sizeof(umem))) 1113 + break; 1114 + 1115 + ret = -EINVAL; 1116 + if (!is_mem_zero((const char *)umem.reserved, 1117 + sizeof(umem.reserved))) 1118 + break; 1119 + 1120 + ret = vduse_dev_dereg_umem(dev, umem.iova, 1121 + umem.size); 1122 + break; 1123 + } 1124 + case VDUSE_IOTLB_GET_INFO: { 1125 + struct vduse_iova_info info; 1126 + struct vhost_iotlb_map *map; 1127 + struct vduse_iova_domain *domain = dev->domain; 1128 + 1129 + ret = -EFAULT; 1130 + if (copy_from_user(&info, argp, sizeof(info))) 1131 + break; 1132 + 1133 + ret = -EINVAL; 1134 + if (info.start > info.last) 1135 + break; 1136 + 1137 + if (!is_mem_zero((const char *)info.reserved, 1138 + sizeof(info.reserved))) 1139 + break; 1140 + 1141 + spin_lock(&domain->iotlb_lock); 1142 + map = vhost_iotlb_itree_first(domain->iotlb, 1143 + info.start, info.last); 1144 + if (map) { 1145 + info.start = map->start; 1146 + info.last = map->last; 1147 + info.capability = 0; 1148 + if (domain->bounce_map && map->start == 0 && 1149 + map->last == domain->bounce_size - 1) 1150 + info.capability |= VDUSE_IOVA_CAP_UMEM; 1151 + } 1152 + spin_unlock(&domain->iotlb_lock); 1153 + if (!map) 1154 + break; 1155 + 1156 + ret = -EFAULT; 1157 + if (copy_to_user(argp, &info, sizeof(info))) 1158 + break; 1159 + 1160 + ret = 0; 1161 + break; 1162 + } 1199 1163 default: 1200 1164 ret = -ENOIOCTLCMD; 1201 1165 break; ··· 1279 1101 { 1280 1102 struct vduse_dev *dev = file->private_data; 1281 1103 1104 + vduse_dev_dereg_umem(dev, 0, dev->domain->bounce_size); 1282 1105 spin_lock(&dev->msg_lock); 1283 1106 /* Make sure the inflight messages can processed after reconncection */ 1284 1107 list_splice_init(&dev->recv_list, &dev->send_list); ··· 1342 1163 return NULL; 1343 1164 1344 1165 mutex_init(&dev->lock); 1166 + mutex_init(&dev->mem_lock); 1345 1167 spin_lock_init(&dev->msg_lock); 1346 1168 INIT_LIST_HEAD(&dev->send_list); 1347 1169 
INIT_LIST_HEAD(&dev->recv_list);
+61 -24
drivers/vhost/scsi.c
··· 159 159 }; 160 160 161 161 #define VHOST_SCSI_MAX_TARGET 256 162 - #define VHOST_SCSI_MAX_VQ 128 162 + #define VHOST_SCSI_MAX_IO_VQ 1024 163 163 #define VHOST_SCSI_MAX_EVENT 128 164 + 165 + static unsigned vhost_scsi_max_io_vqs = 128; 166 + module_param_named(max_io_vqs, vhost_scsi_max_io_vqs, uint, 0644); 167 + MODULE_PARM_DESC(max_io_vqs, "Set the max number of IO virtqueues a vhost scsi device can support. The default is 128. The max is 1024."); 164 168 165 169 struct vhost_scsi_virtqueue { 166 170 struct vhost_virtqueue vq; ··· 190 186 char vs_vhost_wwpn[TRANSPORT_IQN_LEN]; 191 187 192 188 struct vhost_dev dev; 193 - struct vhost_scsi_virtqueue vqs[VHOST_SCSI_MAX_VQ]; 189 + struct vhost_scsi_virtqueue *vqs; 190 + unsigned long *compl_bitmap; 191 + struct vhost_scsi_inflight **old_inflight; 194 192 195 193 struct vhost_work vs_completion_work; /* cmd completion work item */ 196 194 struct llist_head vs_completion_list; /* cmd completion queue */ ··· 251 245 struct vhost_virtqueue *vq; 252 246 int idx, i; 253 247 254 - for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) { 248 + for (i = 0; i < vs->dev.nvqs; i++) { 255 249 vq = &vs->vqs[i].vq; 256 250 257 251 mutex_lock(&vq->mutex); ··· 539 533 { 540 534 struct vhost_scsi *vs = container_of(work, struct vhost_scsi, 541 535 vs_completion_work); 542 - DECLARE_BITMAP(signal, VHOST_SCSI_MAX_VQ); 543 536 struct virtio_scsi_cmd_resp v_rsp; 544 537 struct vhost_scsi_cmd *cmd, *t; 545 538 struct llist_node *llnode; ··· 546 541 struct iov_iter iov_iter; 547 542 int ret, vq; 548 543 549 - bitmap_zero(signal, VHOST_SCSI_MAX_VQ); 544 + bitmap_zero(vs->compl_bitmap, vs->dev.nvqs); 550 545 llnode = llist_del_all(&vs->vs_completion_list); 551 546 llist_for_each_entry_safe(cmd, t, llnode, tvc_completion_list) { 552 547 se_cmd = &cmd->tvc_se_cmd; ··· 571 566 vhost_add_used(cmd->tvc_vq, cmd->tvc_vq_desc, 0); 572 567 q = container_of(cmd->tvc_vq, struct vhost_scsi_virtqueue, vq); 573 568 vq = q - vs->vqs; 574 - __set_bit(vq, signal); 569 + 
__set_bit(vq, vs->compl_bitmap); 575 570 } else 576 571 pr_err("Faulted on virtio_scsi_cmd_resp\n"); 577 572 ··· 579 574 } 580 575 581 576 vq = -1; 582 - while ((vq = find_next_bit(signal, VHOST_SCSI_MAX_VQ, vq + 1)) 583 - < VHOST_SCSI_MAX_VQ) 577 + while ((vq = find_next_bit(vs->compl_bitmap, vs->dev.nvqs, vq + 1)) 578 + < vs->dev.nvqs) 584 579 vhost_signal(&vs->dev, &vs->vqs[vq].vq); 585 580 } 586 581 ··· 1424 1419 /* Callers must hold dev mutex */ 1425 1420 static void vhost_scsi_flush(struct vhost_scsi *vs) 1426 1421 { 1427 - struct vhost_scsi_inflight *old_inflight[VHOST_SCSI_MAX_VQ]; 1428 1422 int i; 1429 1423 1430 1424 /* Init new inflight and remember the old inflight */ 1431 - vhost_scsi_init_inflight(vs, old_inflight); 1425 + vhost_scsi_init_inflight(vs, vs->old_inflight); 1432 1426 1433 1427 /* 1434 1428 * The inflight->kref was initialized to 1. We decrement it here to 1435 1429 * indicate the start of the flush operation so that it will reach 0 1436 1430 * when all the reqs are finished. 
1437 1431 */ 1438 - for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) 1439 - kref_put(&old_inflight[i]->kref, vhost_scsi_done_inflight); 1432 + for (i = 0; i < vs->dev.nvqs; i++) 1433 + kref_put(&vs->old_inflight[i]->kref, vhost_scsi_done_inflight); 1440 1434 1441 1435 /* Flush both the vhost poll and vhost work */ 1442 1436 vhost_dev_flush(&vs->dev); 1443 1437 1444 1438 /* Wait for all reqs issued before the flush to be finished */ 1445 - for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) 1446 - wait_for_completion(&old_inflight[i]->comp); 1439 + for (i = 0; i < vs->dev.nvqs; i++) 1440 + wait_for_completion(&vs->old_inflight[i]->comp); 1447 1441 } 1448 1442 1449 1443 static void vhost_scsi_destroy_vq_cmds(struct vhost_virtqueue *vq) ··· 1605 1601 memcpy(vs->vs_vhost_wwpn, t->vhost_wwpn, 1606 1602 sizeof(vs->vs_vhost_wwpn)); 1607 1603 1608 - for (i = VHOST_SCSI_VQ_IO; i < VHOST_SCSI_MAX_VQ; i++) { 1604 + for (i = VHOST_SCSI_VQ_IO; i < vs->dev.nvqs; i++) { 1609 1605 vq = &vs->vqs[i].vq; 1610 1606 if (!vhost_vq_is_setup(vq)) 1611 1607 continue; ··· 1615 1611 goto destroy_vq_cmds; 1616 1612 } 1617 1613 1618 - for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) { 1614 + for (i = 0; i < vs->dev.nvqs; i++) { 1619 1615 vq = &vs->vqs[i].vq; 1620 1616 mutex_lock(&vq->mutex); 1621 1617 vhost_vq_set_backend(vq, vs_tpg); ··· 1717 1713 target_undepend_item(&se_tpg->tpg_group.cg_item); 1718 1714 } 1719 1715 if (match) { 1720 - for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) { 1716 + for (i = 0; i < vs->dev.nvqs; i++) { 1721 1717 vq = &vs->vqs[i].vq; 1722 1718 mutex_lock(&vq->mutex); 1723 1719 vhost_vq_set_backend(vq, NULL); ··· 1726 1722 /* Make sure cmds are not running before tearing them down. 
*/ 1727 1723 vhost_scsi_flush(vs); 1728 1724 1729 - for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) { 1725 + for (i = 0; i < vs->dev.nvqs; i++) { 1730 1726 vq = &vs->vqs[i].vq; 1731 1727 vhost_scsi_destroy_vq_cmds(vq); 1732 1728 } ··· 1766 1762 return -EFAULT; 1767 1763 } 1768 1764 1769 - for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) { 1765 + for (i = 0; i < vs->dev.nvqs; i++) { 1770 1766 vq = &vs->vqs[i].vq; 1771 1767 mutex_lock(&vq->mutex); 1772 1768 vq->acked_features = features; ··· 1780 1776 { 1781 1777 struct vhost_scsi *vs; 1782 1778 struct vhost_virtqueue **vqs; 1783 - int r = -ENOMEM, i; 1779 + int r = -ENOMEM, i, nvqs = vhost_scsi_max_io_vqs; 1784 1780 1785 1781 vs = kvzalloc(sizeof(*vs), GFP_KERNEL); 1786 1782 if (!vs) 1787 1783 goto err_vs; 1788 1784 1789 - vqs = kmalloc_array(VHOST_SCSI_MAX_VQ, sizeof(*vqs), GFP_KERNEL); 1790 - if (!vqs) 1785 + if (nvqs > VHOST_SCSI_MAX_IO_VQ) { 1786 + pr_err("Invalid max_io_vqs of %d. Using %d.\n", nvqs, 1787 + VHOST_SCSI_MAX_IO_VQ); 1788 + nvqs = VHOST_SCSI_MAX_IO_VQ; 1789 + } else if (nvqs == 0) { 1790 + pr_err("Invalid max_io_vqs of %d. 
Using 1.\n", nvqs); 1791 + nvqs = 1; 1792 + } 1793 + nvqs += VHOST_SCSI_VQ_IO; 1794 + 1795 + vs->compl_bitmap = bitmap_alloc(nvqs, GFP_KERNEL); 1796 + if (!vs->compl_bitmap) 1797 + goto err_compl_bitmap; 1798 + 1799 + vs->old_inflight = kmalloc_array(nvqs, sizeof(*vs->old_inflight), 1800 + GFP_KERNEL | __GFP_ZERO); 1801 + if (!vs->old_inflight) 1802 + goto err_inflight; 1803 + 1804 + vs->vqs = kmalloc_array(nvqs, sizeof(*vs->vqs), 1805 + GFP_KERNEL | __GFP_ZERO); 1806 + if (!vs->vqs) 1791 1807 goto err_vqs; 1808 + 1809 + vqs = kmalloc_array(nvqs, sizeof(*vqs), GFP_KERNEL); 1810 + if (!vqs) 1811 + goto err_local_vqs; 1792 1812 1793 1813 vhost_work_init(&vs->vs_completion_work, vhost_scsi_complete_cmd_work); 1794 1814 vhost_work_init(&vs->vs_event_work, vhost_scsi_evt_work); ··· 1824 1796 vqs[VHOST_SCSI_VQ_EVT] = &vs->vqs[VHOST_SCSI_VQ_EVT].vq; 1825 1797 vs->vqs[VHOST_SCSI_VQ_CTL].vq.handle_kick = vhost_scsi_ctl_handle_kick; 1826 1798 vs->vqs[VHOST_SCSI_VQ_EVT].vq.handle_kick = vhost_scsi_evt_handle_kick; 1827 - for (i = VHOST_SCSI_VQ_IO; i < VHOST_SCSI_MAX_VQ; i++) { 1799 + for (i = VHOST_SCSI_VQ_IO; i < nvqs; i++) { 1828 1800 vqs[i] = &vs->vqs[i].vq; 1829 1801 vs->vqs[i].vq.handle_kick = vhost_scsi_handle_kick; 1830 1802 } 1831 - vhost_dev_init(&vs->dev, vqs, VHOST_SCSI_MAX_VQ, UIO_MAXIOV, 1803 + vhost_dev_init(&vs->dev, vqs, nvqs, UIO_MAXIOV, 1832 1804 VHOST_SCSI_WEIGHT, 0, true, NULL); 1833 1805 1834 1806 vhost_scsi_init_inflight(vs, NULL); ··· 1836 1808 f->private_data = vs; 1837 1809 return 0; 1838 1810 1811 + err_local_vqs: 1812 + kfree(vs->vqs); 1839 1813 err_vqs: 1814 + kfree(vs->old_inflight); 1815 + err_inflight: 1816 + bitmap_free(vs->compl_bitmap); 1817 + err_compl_bitmap: 1840 1818 kvfree(vs); 1841 1819 err_vs: 1842 1820 return r; ··· 1860 1826 vhost_dev_stop(&vs->dev); 1861 1827 vhost_dev_cleanup(&vs->dev); 1862 1828 kfree(vs->dev.vqs); 1829 + kfree(vs->vqs); 1830 + kfree(vs->old_inflight); 1831 + bitmap_free(vs->compl_bitmap); 1863 1832 kvfree(vs); 
1864 1833 return 0; 1865 1834 }
+36 -2
drivers/vhost/vdpa.c
··· 347 347 return 0; 348 348 } 349 349 350 + static bool vhost_vdpa_can_suspend(const struct vhost_vdpa *v) 351 + { 352 + struct vdpa_device *vdpa = v->vdpa; 353 + const struct vdpa_config_ops *ops = vdpa->config; 354 + 355 + return ops->suspend; 356 + } 357 + 350 358 static long vhost_vdpa_get_features(struct vhost_vdpa *v, u64 __user *featurep) 351 359 { 352 360 struct vdpa_device *vdpa = v->vdpa; ··· 478 470 return 0; 479 471 } 480 472 473 + /* After a successful return of ioctl the device must not process more 474 + * virtqueue descriptors. The device can answer to read or writes of config 475 + * fields as if it were not suspended. In particular, writing to "queue_enable" 476 + * with a value of 1 will not make the device start processing buffers. 477 + */ 478 + static long vhost_vdpa_suspend(struct vhost_vdpa *v) 479 + { 480 + struct vdpa_device *vdpa = v->vdpa; 481 + const struct vdpa_config_ops *ops = vdpa->config; 482 + 483 + if (!ops->suspend) 484 + return -EOPNOTSUPP; 485 + 486 + return ops->suspend(vdpa); 487 + } 488 + 481 489 static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd, 482 490 void __user *argp) 483 491 { ··· 601 577 if (cmd == VHOST_SET_BACKEND_FEATURES) { 602 578 if (copy_from_user(&features, featurep, sizeof(features))) 603 579 return -EFAULT; 604 - if (features & ~VHOST_VDPA_BACKEND_FEATURES) 580 + if (features & ~(VHOST_VDPA_BACKEND_FEATURES | 581 + BIT_ULL(VHOST_BACKEND_F_SUSPEND))) 582 + return -EOPNOTSUPP; 583 + if ((features & BIT_ULL(VHOST_BACKEND_F_SUSPEND)) && 584 + !vhost_vdpa_can_suspend(v)) 605 585 return -EOPNOTSUPP; 606 586 vhost_set_backend_features(&v->vdev, features); 607 587 return 0; ··· 656 628 break; 657 629 case VHOST_GET_BACKEND_FEATURES: 658 630 features = VHOST_VDPA_BACKEND_FEATURES; 631 + if (vhost_vdpa_can_suspend(v)) 632 + features |= BIT_ULL(VHOST_BACKEND_F_SUSPEND); 659 633 if (copy_to_user(featurep, &features, sizeof(features))) 660 634 r = -EFAULT; 661 635 break; ··· 669 639 break; 670 
640 case VHOST_VDPA_GET_VQS_COUNT: 671 641 r = vhost_vdpa_get_vqs_count(v, argp); 642 + break; 643 + case VHOST_VDPA_SUSPEND: 644 + r = vhost_vdpa_suspend(v); 672 645 break; 673 646 default: 674 647 r = vhost_dev_ioctl(&v->vdev, cmd, argp); ··· 1109 1076 if (!bus) 1110 1077 return -EFAULT; 1111 1078 1112 - if (!iommu_capable(bus, IOMMU_CAP_CACHE_COHERENCY)) 1079 + if (!device_iommu_capable(dma_dev, IOMMU_CAP_CACHE_COHERENCY)) 1113 1080 return -ENOTSUPP; 1114 1081 1115 1082 v->domain = iommu_domain_alloc(bus); ··· 1396 1363 1397 1364 err: 1398 1365 put_device(&v->dev); 1366 + ida_simple_remove(&vhost_vdpa_ida, v->minor); 1399 1367 return r; 1400 1368 } 1401 1369
+56 -22
drivers/vhost/vringh.c
··· 1095 1095 #if IS_REACHABLE(CONFIG_VHOST_IOTLB) 1096 1096 1097 1097 static int iotlb_translate(const struct vringh *vrh, 1098 - u64 addr, u64 len, struct bio_vec iov[], 1098 + u64 addr, u64 len, u64 *translated, 1099 + struct bio_vec iov[], 1099 1100 int iov_size, u32 perm) 1100 1101 { 1101 1102 struct vhost_iotlb_map *map; ··· 1137 1136 1138 1137 spin_unlock(vrh->iotlb_lock); 1139 1138 1139 + if (translated) 1140 + *translated = min(len, s); 1141 + 1140 1142 return ret; 1141 1143 } 1142 1144 1143 1145 static inline int copy_from_iotlb(const struct vringh *vrh, void *dst, 1144 1146 void *src, size_t len) 1145 1147 { 1146 - struct iov_iter iter; 1147 - struct bio_vec iov[16]; 1148 - int ret; 1148 + u64 total_translated = 0; 1149 1149 1150 - ret = iotlb_translate(vrh, (u64)(uintptr_t)src, 1151 - len, iov, 16, VHOST_MAP_RO); 1152 - if (ret < 0) 1153 - return ret; 1150 + while (total_translated < len) { 1151 + struct bio_vec iov[16]; 1152 + struct iov_iter iter; 1153 + u64 translated; 1154 + int ret; 1154 1155 1155 - iov_iter_bvec(&iter, READ, iov, ret, len); 1156 + ret = iotlb_translate(vrh, (u64)(uintptr_t)src, 1157 + len - total_translated, &translated, 1158 + iov, ARRAY_SIZE(iov), VHOST_MAP_RO); 1159 + if (ret == -ENOBUFS) 1160 + ret = ARRAY_SIZE(iov); 1161 + else if (ret < 0) 1162 + return ret; 1156 1163 1157 - ret = copy_from_iter(dst, len, &iter); 1164 + iov_iter_bvec(&iter, READ, iov, ret, translated); 1158 1165 1159 - return ret; 1166 + ret = copy_from_iter(dst, translated, &iter); 1167 + if (ret < 0) 1168 + return ret; 1169 + 1170 + src += translated; 1171 + dst += translated; 1172 + total_translated += translated; 1173 + } 1174 + 1175 + return total_translated; 1160 1176 } 1161 1177 1162 1178 static inline int copy_to_iotlb(const struct vringh *vrh, void *dst, 1163 1179 void *src, size_t len) 1164 1180 { 1165 - struct iov_iter iter; 1166 - struct bio_vec iov[16]; 1167 - int ret; 1181 + u64 total_translated = 0; 1168 1182 1169 - ret = iotlb_translate(vrh, 
(u64)(uintptr_t)dst, 1170 - len, iov, 16, VHOST_MAP_WO); 1171 - if (ret < 0) 1172 - return ret; 1183 + while (total_translated < len) { 1184 + struct bio_vec iov[16]; 1185 + struct iov_iter iter; 1186 + u64 translated; 1187 + int ret; 1173 1188 1174 - iov_iter_bvec(&iter, WRITE, iov, ret, len); 1189 + ret = iotlb_translate(vrh, (u64)(uintptr_t)dst, 1190 + len - total_translated, &translated, 1191 + iov, ARRAY_SIZE(iov), VHOST_MAP_WO); 1192 + if (ret == -ENOBUFS) 1193 + ret = ARRAY_SIZE(iov); 1194 + else if (ret < 0) 1195 + return ret; 1175 1196 1176 - return copy_to_iter(src, len, &iter); 1197 + iov_iter_bvec(&iter, WRITE, iov, ret, translated); 1198 + 1199 + ret = copy_to_iter(src, translated, &iter); 1200 + if (ret < 0) 1201 + return ret; 1202 + 1203 + src += translated; 1204 + dst += translated; 1205 + total_translated += translated; 1206 + } 1207 + 1208 + return total_translated; 1177 1209 } 1178 1210 1179 1211 static inline int getu16_iotlb(const struct vringh *vrh, ··· 1217 1183 int ret; 1218 1184 1219 1185 /* Atomic read is needed for getu16 */ 1220 - ret = iotlb_translate(vrh, (u64)(uintptr_t)p, sizeof(*p), 1186 + ret = iotlb_translate(vrh, (u64)(uintptr_t)p, sizeof(*p), NULL, 1221 1187 &iov, 1, VHOST_MAP_RO); 1222 1188 if (ret < 0) 1223 1189 return ret; ··· 1238 1204 int ret; 1239 1205 1240 1206 /* Atomic write is needed for putu16 */ 1241 - ret = iotlb_translate(vrh, (u64)(uintptr_t)p, sizeof(*p), 1207 + ret = iotlb_translate(vrh, (u64)(uintptr_t)p, sizeof(*p), NULL, 1242 1208 &iov, 1, VHOST_MAP_WO); 1243 1209 if (ret < 0) 1244 1210 return ret;
+7 -4
drivers/virtio/Kconfig
··· 35 35 36 36 config VIRTIO_HARDEN_NOTIFICATION 37 37 bool "Harden virtio notification" 38 + depends on BROKEN 38 39 help 39 40 Enable this to harden the device notifications and suppress 40 41 those that happen at a time where notifications are illegal. 41 42 42 - Experimental: Note that several drivers still have bugs that 43 + Experimental: Note that several drivers still have issues that 43 44 may cause crashes or hangs when correct handling of 44 45 notifications is enforced; depending on the subset of 45 46 drivers and devices you use, this may or may not work. ··· 127 126 This driver provides access to virtio-mem paravirtualized memory 128 127 devices, allowing to hotplug and hotunplug memory. 129 128 130 - This driver was only tested under x86-64 and arm64, but should 131 - theoretically work on all architectures that support memory hotplug 132 - and hotremove. 129 + This driver currently only supports x86-64 and arm64. Although it 130 + should compile on other architectures that implement memory 131 + hot(un)plug, architecture-specific and/or common 132 + code changes may be required for virtio-mem, kdump and kexec to work as 133 + expected. 133 134 134 135 If unsure, say M. 135 136
+3 -1
drivers/virtio/virtio.c
··· 428 428 goto out; 429 429 430 430 dev->index = err; 431 - dev_set_name(&dev->dev, "virtio%u", dev->index); 431 + err = dev_set_name(&dev->dev, "virtio%u", dev->index); 432 + if (err) 433 + goto out_ida_remove; 432 434 433 435 err = virtio_device_of_init(dev); 434 436 if (err)
+12 -2
drivers/virtio/virtio_mmio.c
··· 360 360 361 361 static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned int index, 362 362 void (*callback)(struct virtqueue *vq), 363 - const char *name, bool ctx) 363 + const char *name, u32 size, bool ctx) 364 364 { 365 365 struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); 366 366 struct virtio_mmio_vq_info *info; ··· 395 395 goto error_new_virtqueue; 396 396 } 397 397 398 + if (!size || size > num) 399 + size = num; 400 + 398 401 /* Create the vring */ 399 - vq = vring_create_virtqueue(index, num, VIRTIO_MMIO_VRING_ALIGN, vdev, 402 + vq = vring_create_virtqueue(index, size, VIRTIO_MMIO_VRING_ALIGN, vdev, 400 403 true, true, ctx, vm_notify, callback, name); 401 404 if (!vq) { 402 405 err = -ENOMEM; 403 406 goto error_new_virtqueue; 404 407 } 408 + 409 + vq->num_max = num; 405 410 406 411 /* Activate the queue */ 407 412 writel(virtqueue_get_vring_size(vq), vm_dev->base + VIRTIO_MMIO_QUEUE_NUM); ··· 477 472 struct virtqueue *vqs[], 478 473 vq_callback_t *callbacks[], 479 474 const char * const names[], 475 + u32 sizes[], 480 476 const bool *ctx, 481 477 struct irq_affinity *desc) 482 478 { ··· 493 487 if (err) 494 488 return err; 495 489 490 + if (of_property_read_bool(vm_dev->pdev->dev.of_node, "wakeup-source")) 491 + enable_irq_wake(irq); 492 + 496 493 for (i = 0; i < nvqs; ++i) { 497 494 if (!names[i]) { 498 495 vqs[i] = NULL; ··· 503 494 } 504 495 505 496 vqs[i] = vm_setup_vq(vdev, queue_idx++, callbacks[i], names[i], 497 + sizes ? sizes[i] : 0, 506 498 ctx ? ctx[i] : false); 507 499 if (IS_ERR(vqs[i])) { 508 500 vm_del_vqs(vdev);
+20 -12
drivers/virtio/virtio_pci_common.c
··· 174 174 static struct virtqueue *vp_setup_vq(struct virtio_device *vdev, unsigned int index, 175 175 void (*callback)(struct virtqueue *vq), 176 176 const char *name, 177 + u32 size, 177 178 bool ctx, 178 179 u16 msix_vec) 179 180 { ··· 187 186 if (!info) 188 187 return ERR_PTR(-ENOMEM); 189 188 190 - vq = vp_dev->setup_vq(vp_dev, info, index, callback, name, ctx, 189 + vq = vp_dev->setup_vq(vp_dev, info, index, callback, name, size, ctx, 191 190 msix_vec); 192 191 if (IS_ERR(vq)) 193 192 goto out_info; ··· 215 214 struct virtio_pci_vq_info *info = vp_dev->vqs[vq->index]; 216 215 unsigned long flags; 217 216 218 - spin_lock_irqsave(&vp_dev->lock, flags); 219 - list_del(&info->node); 220 - spin_unlock_irqrestore(&vp_dev->lock, flags); 217 + /* 218 + * If it fails during re-enable reset vq. This way we won't rejoin 219 + * info->node to the queue. Prevent unexpected irqs. 220 + */ 221 + if (!vq->reset) { 222 + spin_lock_irqsave(&vp_dev->lock, flags); 223 + list_del(&info->node); 224 + spin_unlock_irqrestore(&vp_dev->lock, flags); 225 + } 221 226 222 227 vp_dev->del_vq(info); 223 228 kfree(info); ··· 284 277 285 278 static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned int nvqs, 286 279 struct virtqueue *vqs[], vq_callback_t *callbacks[], 287 - const char * const names[], bool per_vq_vectors, 280 + const char * const names[], u32 sizes[], bool per_vq_vectors, 288 281 const bool *ctx, 289 282 struct irq_affinity *desc) 290 283 { ··· 327 320 else 328 321 msix_vec = VP_MSIX_VQ_VECTOR; 329 322 vqs[i] = vp_setup_vq(vdev, queue_idx++, callbacks[i], names[i], 330 - ctx ? ctx[i] : false, 331 - msix_vec); 323 + sizes ? sizes[i] : 0, 324 + ctx ? 
ctx[i] : false, msix_vec); 332 325 if (IS_ERR(vqs[i])) { 333 326 err = PTR_ERR(vqs[i]); 334 327 goto error_find; ··· 358 351 359 352 static int vp_find_vqs_intx(struct virtio_device *vdev, unsigned int nvqs, 360 353 struct virtqueue *vqs[], vq_callback_t *callbacks[], 361 - const char * const names[], const bool *ctx) 354 + const char * const names[], u32 sizes[], const bool *ctx) 362 355 { 363 356 struct virtio_pci_device *vp_dev = to_vp_device(vdev); 364 357 int i, err, queue_idx = 0; ··· 380 373 continue; 381 374 } 382 375 vqs[i] = vp_setup_vq(vdev, queue_idx++, callbacks[i], names[i], 376 + sizes ? sizes[i] : 0, 383 377 ctx ? ctx[i] : false, 384 378 VIRTIO_MSI_NO_VECTOR); 385 379 if (IS_ERR(vqs[i])) { ··· 398 390 /* the config->find_vqs() implementation */ 399 391 int vp_find_vqs(struct virtio_device *vdev, unsigned int nvqs, 400 392 struct virtqueue *vqs[], vq_callback_t *callbacks[], 401 - const char * const names[], const bool *ctx, 393 + const char * const names[], u32 sizes[], const bool *ctx, 402 394 struct irq_affinity *desc) 403 395 { 404 396 int err; 405 397 406 398 /* Try MSI-X with one vector per queue. */ 407 - err = vp_find_vqs_msix(vdev, nvqs, vqs, callbacks, names, true, ctx, desc); 399 + err = vp_find_vqs_msix(vdev, nvqs, vqs, callbacks, names, sizes, true, ctx, desc); 408 400 if (!err) 409 401 return 0; 410 402 /* Fallback: MSI-X with one vector for config, one shared for queues. */ 411 - err = vp_find_vqs_msix(vdev, nvqs, vqs, callbacks, names, false, ctx, desc); 403 + err = vp_find_vqs_msix(vdev, nvqs, vqs, callbacks, names, sizes, false, ctx, desc); 412 404 if (!err) 413 405 return 0; 414 406 /* Finally fall back to regular interrupts. */ 415 - return vp_find_vqs_intx(vdev, nvqs, vqs, callbacks, names, ctx); 407 + return vp_find_vqs_intx(vdev, nvqs, vqs, callbacks, names, sizes, ctx); 416 408 } 417 409 418 410 const char *vp_bus_name(struct virtio_device *vdev)
+2 -1
drivers/virtio/virtio_pci_common.h
··· 80 80 unsigned int idx, 81 81 void (*callback)(struct virtqueue *vq), 82 82 const char *name, 83 + u32 size, 83 84 bool ctx, 84 85 u16 msix_vec); 85 86 void (*del_vq)(struct virtio_pci_vq_info *info); ··· 111 110 /* the config->find_vqs() implementation */ 112 111 int vp_find_vqs(struct virtio_device *vdev, unsigned int nvqs, 113 112 struct virtqueue *vqs[], vq_callback_t *callbacks[], 114 - const char * const names[], const bool *ctx, 113 + const char * const names[], u32 sizes[], const bool *ctx, 115 114 struct irq_affinity *desc); 116 115 const char *vp_bus_name(struct virtio_device *vdev); 117 116
+7 -1
drivers/virtio/virtio_pci_legacy.c
··· 112 112 unsigned int index, 113 113 void (*callback)(struct virtqueue *vq), 114 114 const char *name, 115 + u32 size, 115 116 bool ctx, 116 117 u16 msix_vec) 117 118 { ··· 126 125 if (!num || vp_legacy_get_queue_enable(&vp_dev->ldev, index)) 127 126 return ERR_PTR(-ENOENT); 128 127 128 + if (!size || size > num) 129 + size = num; 130 + 129 131 info->msix_vector = msix_vec; 130 132 131 133 /* create the vring */ 132 - vq = vring_create_virtqueue(index, num, 134 + vq = vring_create_virtqueue(index, size, 133 135 VIRTIO_PCI_VRING_ALIGN, &vp_dev->vdev, 134 136 true, false, ctx, 135 137 vp_notify, callback, name); 136 138 if (!vq) 137 139 return ERR_PTR(-ENOMEM); 140 + 141 + vq->num_max = num; 138 142 139 143 q_pfn = virtqueue_get_desc_addr(vq) >> VIRTIO_PCI_QUEUE_ADDR_SHIFT; 140 144 if (q_pfn >> 32) {
+130 -23
drivers/virtio/virtio_pci_modern.c
··· 34 34 if ((features & BIT_ULL(VIRTIO_F_SR_IOV)) && 35 35 pci_find_ext_capability(pci_dev, PCI_EXT_CAP_ID_SRIOV)) 36 36 __virtio_set_bit(vdev, VIRTIO_F_SR_IOV); 37 + 38 + if (features & BIT_ULL(VIRTIO_F_RING_RESET)) 39 + __virtio_set_bit(vdev, VIRTIO_F_RING_RESET); 37 40 } 38 41 39 42 /* virtio config->finalize_features() implementation */ ··· 179 176 vp_synchronize_vectors(vdev); 180 177 } 181 178 179 + static int vp_active_vq(struct virtqueue *vq, u16 msix_vec) 180 + { 181 + struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); 182 + struct virtio_pci_modern_device *mdev = &vp_dev->mdev; 183 + unsigned long index; 184 + 185 + index = vq->index; 186 + 187 + /* activate the queue */ 188 + vp_modern_set_queue_size(mdev, index, virtqueue_get_vring_size(vq)); 189 + vp_modern_queue_address(mdev, index, virtqueue_get_desc_addr(vq), 190 + virtqueue_get_avail_addr(vq), 191 + virtqueue_get_used_addr(vq)); 192 + 193 + if (msix_vec != VIRTIO_MSI_NO_VECTOR) { 194 + msix_vec = vp_modern_queue_vector(mdev, index, msix_vec); 195 + if (msix_vec == VIRTIO_MSI_NO_VECTOR) 196 + return -EBUSY; 197 + } 198 + 199 + return 0; 200 + } 201 + 202 + static int vp_modern_disable_vq_and_reset(struct virtqueue *vq) 203 + { 204 + struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); 205 + struct virtio_pci_modern_device *mdev = &vp_dev->mdev; 206 + struct virtio_pci_vq_info *info; 207 + unsigned long flags; 208 + 209 + if (!virtio_has_feature(vq->vdev, VIRTIO_F_RING_RESET)) 210 + return -ENOENT; 211 + 212 + vp_modern_set_queue_reset(mdev, vq->index); 213 + 214 + info = vp_dev->vqs[vq->index]; 215 + 216 + /* delete vq from irq handler */ 217 + spin_lock_irqsave(&vp_dev->lock, flags); 218 + list_del(&info->node); 219 + spin_unlock_irqrestore(&vp_dev->lock, flags); 220 + 221 + INIT_LIST_HEAD(&info->node); 222 + 223 + #ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION 224 + __virtqueue_break(vq); 225 + #endif 226 + 227 + /* For the case where vq has an exclusive irq, call synchronize_irq() to 
228 + * wait for completion. 229 + * 230 + * note: We can't use disable_irq() since it conflicts with the affinity 231 + * managed IRQ that is used by some drivers. 232 + */ 233 + if (vp_dev->per_vq_vectors && info->msix_vector != VIRTIO_MSI_NO_VECTOR) 234 + synchronize_irq(pci_irq_vector(vp_dev->pci_dev, info->msix_vector)); 235 + 236 + vq->reset = true; 237 + 238 + return 0; 239 + } 240 + 241 + static int vp_modern_enable_vq_after_reset(struct virtqueue *vq) 242 + { 243 + struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); 244 + struct virtio_pci_modern_device *mdev = &vp_dev->mdev; 245 + struct virtio_pci_vq_info *info; 246 + unsigned long flags, index; 247 + int err; 248 + 249 + if (!vq->reset) 250 + return -EBUSY; 251 + 252 + index = vq->index; 253 + info = vp_dev->vqs[index]; 254 + 255 + if (vp_modern_get_queue_reset(mdev, index)) 256 + return -EBUSY; 257 + 258 + if (vp_modern_get_queue_enable(mdev, index)) 259 + return -EBUSY; 260 + 261 + err = vp_active_vq(vq, info->msix_vector); 262 + if (err) 263 + return err; 264 + 265 + if (vq->callback) { 266 + spin_lock_irqsave(&vp_dev->lock, flags); 267 + list_add(&info->node, &vp_dev->virtqueues); 268 + spin_unlock_irqrestore(&vp_dev->lock, flags); 269 + } else { 270 + INIT_LIST_HEAD(&info->node); 271 + } 272 + 273 + #ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION 274 + __virtqueue_unbreak(vq); 275 + #endif 276 + 277 + vp_modern_set_queue_enable(&vp_dev->mdev, index, true); 278 + vq->reset = false; 279 + 280 + return 0; 281 + } 282 + 182 283 static u16 vp_config_vector(struct virtio_pci_device *vp_dev, u16 vector) 183 284 { 184 285 return vp_modern_config_vector(&vp_dev->mdev, vector); ··· 293 186 unsigned int index, 294 187 void (*callback)(struct virtqueue *vq), 295 188 const char *name, 189 + u32 size, 296 190 bool ctx, 297 191 u16 msix_vec) 298 192 { ··· 311 203 if (!num || vp_modern_get_queue_enable(mdev, index)) 312 204 return ERR_PTR(-ENOENT); 313 205 314 - if (num & (num - 1)) { 315 - 
dev_warn(&vp_dev->pci_dev->dev, "bad queue size %u", num); 206 + if (!size || size > num) 207 + size = num; 208 + 209 + if (size & (size - 1)) { 210 + dev_warn(&vp_dev->pci_dev->dev, "bad queue size %u", size); 316 211 return ERR_PTR(-EINVAL); 317 212 } 318 213 319 214 info->msix_vector = msix_vec; 320 215 321 216 /* create the vring */ 322 - vq = vring_create_virtqueue(index, num, 217 + vq = vring_create_virtqueue(index, size, 323 218 SMP_CACHE_BYTES, &vp_dev->vdev, 324 219 true, true, ctx, 325 220 vp_notify, callback, name); 326 221 if (!vq) 327 222 return ERR_PTR(-ENOMEM); 328 223 329 - /* activate the queue */ 330 - vp_modern_set_queue_size(mdev, index, virtqueue_get_vring_size(vq)); 331 - vp_modern_queue_address(mdev, index, virtqueue_get_desc_addr(vq), 332 - virtqueue_get_avail_addr(vq), 333 - virtqueue_get_used_addr(vq)); 224 + vq->num_max = num; 225 + 226 + err = vp_active_vq(vq, msix_vec); 227 + if (err) 228 + goto err; 334 229 335 230 vq->priv = (void __force *)vp_modern_map_vq_notify(mdev, index, NULL); 336 231 if (!vq->priv) { 337 232 err = -ENOMEM; 338 - goto err_map_notify; 339 - } 340 - 341 - if (msix_vec != VIRTIO_MSI_NO_VECTOR) { 342 - msix_vec = vp_modern_queue_vector(mdev, index, msix_vec); 343 - if (msix_vec == VIRTIO_MSI_NO_VECTOR) { 344 - err = -EBUSY; 345 - goto err_assign_vector; 346 - } 233 + goto err; 347 234 } 348 235 349 236 return vq; 350 237 351 - err_assign_vector: 352 - if (!mdev->notify_base) 353 - pci_iounmap(mdev->pci_dev, (void __iomem __force *)vq->priv); 354 - err_map_notify: 238 + err: 355 239 vring_del_virtqueue(vq); 356 240 return ERR_PTR(err); 357 241 } ··· 351 251 static int vp_modern_find_vqs(struct virtio_device *vdev, unsigned int nvqs, 352 252 struct virtqueue *vqs[], 353 253 vq_callback_t *callbacks[], 354 - const char * const names[], const bool *ctx, 254 + const char * const names[], 255 + u32 sizes[], 256 + const bool *ctx, 355 257 struct irq_affinity *desc) 356 258 { 357 259 struct virtio_pci_device *vp_dev = 
to_vp_device(vdev); 358 260 struct virtqueue *vq; 359 - int rc = vp_find_vqs(vdev, nvqs, vqs, callbacks, names, ctx, desc); 261 + int rc = vp_find_vqs(vdev, nvqs, vqs, callbacks, names, sizes, ctx, 262 + desc); 360 263 361 264 if (rc) 362 265 return rc; ··· 504 401 .set_vq_affinity = vp_set_vq_affinity, 505 402 .get_vq_affinity = vp_get_vq_affinity, 506 403 .get_shm_region = vp_get_shm_region, 404 + .disable_vq_and_reset = vp_modern_disable_vq_and_reset, 405 + .enable_vq_after_reset = vp_modern_enable_vq_after_reset, 507 406 }; 508 407 509 408 static const struct virtio_config_ops virtio_pci_config_ops = { ··· 524 419 .set_vq_affinity = vp_set_vq_affinity, 525 420 .get_vq_affinity = vp_get_vq_affinity, 526 421 .get_shm_region = vp_get_shm_region, 422 + .disable_vq_and_reset = vp_modern_disable_vq_and_reset, 423 + .enable_vq_after_reset = vp_modern_enable_vq_after_reset, 527 424 }; 528 425 529 426 /* the PCI probing function */
+39
drivers/virtio/virtio_pci_modern_dev.c
··· 3 3 #include <linux/virtio_pci_modern.h> 4 4 #include <linux/module.h> 5 5 #include <linux/pci.h> 6 + #include <linux/delay.h> 6 7 7 8 /* 8 9 * vp_modern_map_capability - map a part of virtio pci capability ··· 474 473 vp_iowrite8(status, &cfg->device_status); 475 474 } 476 475 EXPORT_SYMBOL_GPL(vp_modern_set_status); 476 + 477 + /* 478 + * vp_modern_get_queue_reset - get the queue reset status 479 + * @mdev: the modern virtio-pci device 480 + * @index: queue index 481 + */ 482 + int vp_modern_get_queue_reset(struct virtio_pci_modern_device *mdev, u16 index) 483 + { 484 + struct virtio_pci_modern_common_cfg __iomem *cfg; 485 + 486 + cfg = (struct virtio_pci_modern_common_cfg __iomem *)mdev->common; 487 + 488 + vp_iowrite16(index, &cfg->cfg.queue_select); 489 + return vp_ioread16(&cfg->queue_reset); 490 + } 491 + EXPORT_SYMBOL_GPL(vp_modern_get_queue_reset); 492 + 493 + /* 494 + * vp_modern_set_queue_reset - reset the queue 495 + * @mdev: the modern virtio-pci device 496 + * @index: queue index 497 + */ 498 + void vp_modern_set_queue_reset(struct virtio_pci_modern_device *mdev, u16 index) 499 + { 500 + struct virtio_pci_modern_common_cfg __iomem *cfg; 501 + 502 + cfg = (struct virtio_pci_modern_common_cfg __iomem *)mdev->common; 503 + 504 + vp_iowrite16(index, &cfg->cfg.queue_select); 505 + vp_iowrite16(1, &cfg->queue_reset); 506 + 507 + while (vp_ioread16(&cfg->queue_reset)) 508 + msleep(1); 509 + 510 + while (vp_ioread16(&cfg->cfg.queue_enable)) 511 + msleep(1); 512 + } 513 + EXPORT_SYMBOL_GPL(vp_modern_set_queue_reset); 477 514 478 515 /* 479 516 * vp_modern_queue_vector - set the MSIX vector for a specific virtqueue
+581 -233
drivers/virtio/virtio_ring.c
··· 85 85 u16 next; /* The next desc state in a list. */ 86 86 }; 87 87 88 + struct vring_virtqueue_split { 89 + /* Actual memory layout for this queue. */ 90 + struct vring vring; 91 + 92 + /* Last written value to avail->flags */ 93 + u16 avail_flags_shadow; 94 + 95 + /* 96 + * Last written value to avail->idx in 97 + * guest byte order. 98 + */ 99 + u16 avail_idx_shadow; 100 + 101 + /* Per-descriptor state. */ 102 + struct vring_desc_state_split *desc_state; 103 + struct vring_desc_extra *desc_extra; 104 + 105 + /* DMA address and size information */ 106 + dma_addr_t queue_dma_addr; 107 + size_t queue_size_in_bytes; 108 + 109 + /* 110 + * The parameters for creating vrings are reserved for creating new 111 + * vring. 112 + */ 113 + u32 vring_align; 114 + bool may_reduce_num; 115 + }; 116 + 117 + struct vring_virtqueue_packed { 118 + /* Actual memory layout for this queue. */ 119 + struct { 120 + unsigned int num; 121 + struct vring_packed_desc *desc; 122 + struct vring_packed_desc_event *driver; 123 + struct vring_packed_desc_event *device; 124 + } vring; 125 + 126 + /* Driver ring wrap counter. */ 127 + bool avail_wrap_counter; 128 + 129 + /* Avail used flags. */ 130 + u16 avail_used_flags; 131 + 132 + /* Index of the next avail descriptor. */ 133 + u16 next_avail_idx; 134 + 135 + /* 136 + * Last written value to driver->flags in 137 + * guest byte order. 138 + */ 139 + u16 event_flags_shadow; 140 + 141 + /* Per-descriptor state. */ 142 + struct vring_desc_state_packed *desc_state; 143 + struct vring_desc_extra *desc_extra; 144 + 145 + /* DMA address and size information */ 146 + dma_addr_t ring_dma_addr; 147 + dma_addr_t driver_event_dma_addr; 148 + dma_addr_t device_event_dma_addr; 149 + size_t ring_size_in_bytes; 150 + size_t event_size_in_bytes; 151 + }; 152 + 88 153 struct vring_virtqueue { 89 154 struct virtqueue vq; 90 155 ··· 189 124 190 125 union { 191 126 /* Available for split ring */ 192 - struct { 193 - /* Actual memory layout for this queue. 
*/ 194 - struct vring vring; 195 - 196 - /* Last written value to avail->flags */ 197 - u16 avail_flags_shadow; 198 - 199 - /* 200 - * Last written value to avail->idx in 201 - * guest byte order. 202 - */ 203 - u16 avail_idx_shadow; 204 - 205 - /* Per-descriptor state. */ 206 - struct vring_desc_state_split *desc_state; 207 - struct vring_desc_extra *desc_extra; 208 - 209 - /* DMA address and size information */ 210 - dma_addr_t queue_dma_addr; 211 - size_t queue_size_in_bytes; 212 - } split; 127 + struct vring_virtqueue_split split; 213 128 214 129 /* Available for packed ring */ 215 - struct { 216 - /* Actual memory layout for this queue. */ 217 - struct { 218 - unsigned int num; 219 - struct vring_packed_desc *desc; 220 - struct vring_packed_desc_event *driver; 221 - struct vring_packed_desc_event *device; 222 - } vring; 223 - 224 - /* Driver ring wrap counter. */ 225 - bool avail_wrap_counter; 226 - 227 - /* Avail used flags. */ 228 - u16 avail_used_flags; 229 - 230 - /* Index of the next avail descriptor. */ 231 - u16 next_avail_idx; 232 - 233 - /* 234 - * Last written value to driver->flags in 235 - * guest byte order. 236 - */ 237 - u16 event_flags_shadow; 238 - 239 - /* Per-descriptor state. */ 240 - struct vring_desc_state_packed *desc_state; 241 - struct vring_desc_extra *desc_extra; 242 - 243 - /* DMA address and size information */ 244 - dma_addr_t ring_dma_addr; 245 - dma_addr_t driver_event_dma_addr; 246 - dma_addr_t device_event_dma_addr; 247 - size_t ring_size_in_bytes; 248 - size_t event_size_in_bytes; 249 - } packed; 130 + struct vring_virtqueue_packed packed; 250 131 }; 251 132 252 133 /* How to notify other side. FIXME: commonalize hcalls! 
*/ ··· 211 200 #endif 212 201 }; 213 202 203 + static struct virtqueue *__vring_new_virtqueue(unsigned int index, 204 + struct vring_virtqueue_split *vring_split, 205 + struct virtio_device *vdev, 206 + bool weak_barriers, 207 + bool context, 208 + bool (*notify)(struct virtqueue *), 209 + void (*callback)(struct virtqueue *), 210 + const char *name); 211 + static struct vring_desc_extra *vring_alloc_desc_extra(unsigned int num); 212 + static void vring_free(struct virtqueue *_vq); 214 213 215 214 /* 216 215 * Helpers. ··· 383 362 return 0; 384 363 385 364 return dma_mapping_error(vring_dma_dev(vq), addr); 365 + } 366 + 367 + static void virtqueue_init(struct vring_virtqueue *vq, u32 num) 368 + { 369 + vq->vq.num_free = num; 370 + 371 + if (vq->packed_ring) 372 + vq->last_used_idx = 0 | (1 << VRING_PACKED_EVENT_F_WRAP_CTR); 373 + else 374 + vq->last_used_idx = 0; 375 + 376 + vq->event_triggered = false; 377 + vq->num_added = 0; 378 + 379 + #ifdef DEBUG 380 + vq->in_use = false; 381 + vq->last_add_time_valid = false; 382 + #endif 386 383 } 387 384 388 385 ··· 946 907 return NULL; 947 908 } 948 909 910 + static void virtqueue_vring_init_split(struct vring_virtqueue_split *vring_split, 911 + struct vring_virtqueue *vq) 912 + { 913 + struct virtio_device *vdev; 914 + 915 + vdev = vq->vq.vdev; 916 + 917 + vring_split->avail_flags_shadow = 0; 918 + vring_split->avail_idx_shadow = 0; 919 + 920 + /* No callback? Tell other side not to bother us. 
*/ 921 + if (!vq->vq.callback) { 922 + vring_split->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT; 923 + if (!vq->event) 924 + vring_split->vring.avail->flags = cpu_to_virtio16(vdev, 925 + vring_split->avail_flags_shadow); 926 + } 927 + } 928 + 929 + static void virtqueue_reinit_split(struct vring_virtqueue *vq) 930 + { 931 + int num; 932 + 933 + num = vq->split.vring.num; 934 + 935 + vq->split.vring.avail->flags = 0; 936 + vq->split.vring.avail->idx = 0; 937 + 938 + /* reset avail event */ 939 + vq->split.vring.avail->ring[num] = 0; 940 + 941 + vq->split.vring.used->flags = 0; 942 + vq->split.vring.used->idx = 0; 943 + 944 + /* reset used event */ 945 + *(__virtio16 *)&(vq->split.vring.used->ring[num]) = 0; 946 + 947 + virtqueue_init(vq, num); 948 + 949 + virtqueue_vring_init_split(&vq->split, vq); 950 + } 951 + 952 + static void virtqueue_vring_attach_split(struct vring_virtqueue *vq, 953 + struct vring_virtqueue_split *vring_split) 954 + { 955 + vq->split = *vring_split; 956 + 957 + /* Put everything in free lists. 
*/ 958 + vq->free_head = 0; 959 + } 960 + 961 + static int vring_alloc_state_extra_split(struct vring_virtqueue_split *vring_split) 962 + { 963 + struct vring_desc_state_split *state; 964 + struct vring_desc_extra *extra; 965 + u32 num = vring_split->vring.num; 966 + 967 + state = kmalloc_array(num, sizeof(struct vring_desc_state_split), GFP_KERNEL); 968 + if (!state) 969 + goto err_state; 970 + 971 + extra = vring_alloc_desc_extra(num); 972 + if (!extra) 973 + goto err_extra; 974 + 975 + memset(state, 0, num * sizeof(struct vring_desc_state_split)); 976 + 977 + vring_split->desc_state = state; 978 + vring_split->desc_extra = extra; 979 + return 0; 980 + 981 + err_extra: 982 + kfree(state); 983 + err_state: 984 + return -ENOMEM; 985 + } 986 + 987 + static void vring_free_split(struct vring_virtqueue_split *vring_split, 988 + struct virtio_device *vdev) 989 + { 990 + vring_free_queue(vdev, vring_split->queue_size_in_bytes, 991 + vring_split->vring.desc, 992 + vring_split->queue_dma_addr); 993 + 994 + kfree(vring_split->desc_state); 995 + kfree(vring_split->desc_extra); 996 + } 997 + 998 + static int vring_alloc_queue_split(struct vring_virtqueue_split *vring_split, 999 + struct virtio_device *vdev, 1000 + u32 num, 1001 + unsigned int vring_align, 1002 + bool may_reduce_num) 1003 + { 1004 + void *queue = NULL; 1005 + dma_addr_t dma_addr; 1006 + 1007 + /* We assume num is a power of 2. 
*/ 1008 + if (num & (num - 1)) { 1009 + dev_warn(&vdev->dev, "Bad virtqueue length %u\n", num); 1010 + return -EINVAL; 1011 + } 1012 + 1013 + /* TODO: allocate each queue chunk individually */ 1014 + for (; num && vring_size(num, vring_align) > PAGE_SIZE; num /= 2) { 1015 + queue = vring_alloc_queue(vdev, vring_size(num, vring_align), 1016 + &dma_addr, 1017 + GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO); 1018 + if (queue) 1019 + break; 1020 + if (!may_reduce_num) 1021 + return -ENOMEM; 1022 + } 1023 + 1024 + if (!num) 1025 + return -ENOMEM; 1026 + 1027 + if (!queue) { 1028 + /* Try to get a single page. You are my only hope! */ 1029 + queue = vring_alloc_queue(vdev, vring_size(num, vring_align), 1030 + &dma_addr, GFP_KERNEL|__GFP_ZERO); 1031 + } 1032 + if (!queue) 1033 + return -ENOMEM; 1034 + 1035 + vring_init(&vring_split->vring, num, queue, vring_align); 1036 + 1037 + vring_split->queue_dma_addr = dma_addr; 1038 + vring_split->queue_size_in_bytes = vring_size(num, vring_align); 1039 + 1040 + vring_split->vring_align = vring_align; 1041 + vring_split->may_reduce_num = may_reduce_num; 1042 + 1043 + return 0; 1044 + } 1045 + 949 1046 static struct virtqueue *vring_create_virtqueue_split( 950 1047 unsigned int index, 951 1048 unsigned int num, ··· 1094 919 void (*callback)(struct virtqueue *), 1095 920 const char *name) 1096 921 { 922 + struct vring_virtqueue_split vring_split = {}; 1097 923 struct virtqueue *vq; 1098 - void *queue = NULL; 1099 - dma_addr_t dma_addr; 1100 - size_t queue_size_in_bytes; 1101 - struct vring vring; 924 + int err; 1102 925 1103 - /* We assume num is a power of 2. 
*/ 1104 - if (num & (num - 1)) { 1105 - dev_warn(&vdev->dev, "Bad virtqueue length %u\n", num); 1106 - return NULL; 1107 - } 1108 - 1109 - /* TODO: allocate each queue chunk individually */ 1110 - for (; num && vring_size(num, vring_align) > PAGE_SIZE; num /= 2) { 1111 - queue = vring_alloc_queue(vdev, vring_size(num, vring_align), 1112 - &dma_addr, 1113 - GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO); 1114 - if (queue) 1115 - break; 1116 - if (!may_reduce_num) 1117 - return NULL; 1118 - } 1119 - 1120 - if (!num) 926 + err = vring_alloc_queue_split(&vring_split, vdev, num, vring_align, 927 + may_reduce_num); 928 + if (err) 1121 929 return NULL; 1122 930 1123 - if (!queue) { 1124 - /* Try to get a single page. You are my only hope! */ 1125 - queue = vring_alloc_queue(vdev, vring_size(num, vring_align), 1126 - &dma_addr, GFP_KERNEL|__GFP_ZERO); 1127 - } 1128 - if (!queue) 1129 - return NULL; 1130 - 1131 - queue_size_in_bytes = vring_size(num, vring_align); 1132 - vring_init(&vring, num, queue, vring_align); 1133 - 1134 - vq = __vring_new_virtqueue(index, vring, vdev, weak_barriers, context, 1135 - notify, callback, name); 931 + vq = __vring_new_virtqueue(index, &vring_split, vdev, weak_barriers, 932 + context, notify, callback, name); 1136 933 if (!vq) { 1137 - vring_free_queue(vdev, queue_size_in_bytes, queue, 1138 - dma_addr); 934 + vring_free_split(&vring_split, vdev); 1139 935 return NULL; 1140 936 } 1141 937 1142 - to_vvq(vq)->split.queue_dma_addr = dma_addr; 1143 - to_vvq(vq)->split.queue_size_in_bytes = queue_size_in_bytes; 1144 938 to_vvq(vq)->we_own_ring = true; 1145 939 1146 940 return vq; 941 + } 942 + 943 + static int virtqueue_resize_split(struct virtqueue *_vq, u32 num) 944 + { 945 + struct vring_virtqueue_split vring_split = {}; 946 + struct vring_virtqueue *vq = to_vvq(_vq); 947 + struct virtio_device *vdev = _vq->vdev; 948 + int err; 949 + 950 + err = vring_alloc_queue_split(&vring_split, vdev, num, 951 + vq->split.vring_align, 952 + 
vq->split.may_reduce_num); 953 + if (err) 954 + goto err; 955 + 956 + err = vring_alloc_state_extra_split(&vring_split); 957 + if (err) 958 + goto err_state_extra; 959 + 960 + vring_free(&vq->vq); 961 + 962 + virtqueue_vring_init_split(&vring_split, vq); 963 + 964 + virtqueue_init(vq, vring_split.vring.num); 965 + virtqueue_vring_attach_split(vq, &vring_split); 966 + 967 + return 0; 968 + 969 + err_state_extra: 970 + vring_free_split(&vring_split, vdev); 971 + err: 972 + virtqueue_reinit_split(vq); 973 + return -ENOMEM; 1147 974 } 1148 975 1149 976 ··· 1814 1637 return NULL; 1815 1638 } 1816 1639 1817 - static struct vring_desc_extra *vring_alloc_desc_extra(struct vring_virtqueue *vq, 1818 - unsigned int num) 1640 + static struct vring_desc_extra *vring_alloc_desc_extra(unsigned int num) 1819 1641 { 1820 1642 struct vring_desc_extra *desc_extra; 1821 1643 unsigned int i; ··· 1832 1656 return desc_extra; 1833 1657 } 1834 1658 1659 + static void vring_free_packed(struct vring_virtqueue_packed *vring_packed, 1660 + struct virtio_device *vdev) 1661 + { 1662 + if (vring_packed->vring.desc) 1663 + vring_free_queue(vdev, vring_packed->ring_size_in_bytes, 1664 + vring_packed->vring.desc, 1665 + vring_packed->ring_dma_addr); 1666 + 1667 + if (vring_packed->vring.driver) 1668 + vring_free_queue(vdev, vring_packed->event_size_in_bytes, 1669 + vring_packed->vring.driver, 1670 + vring_packed->driver_event_dma_addr); 1671 + 1672 + if (vring_packed->vring.device) 1673 + vring_free_queue(vdev, vring_packed->event_size_in_bytes, 1674 + vring_packed->vring.device, 1675 + vring_packed->device_event_dma_addr); 1676 + 1677 + kfree(vring_packed->desc_state); 1678 + kfree(vring_packed->desc_extra); 1679 + } 1680 + 1681 + static int vring_alloc_queue_packed(struct vring_virtqueue_packed *vring_packed, 1682 + struct virtio_device *vdev, 1683 + u32 num) 1684 + { 1685 + struct vring_packed_desc *ring; 1686 + struct vring_packed_desc_event *driver, *device; 1687 + dma_addr_t ring_dma_addr, 
driver_event_dma_addr, device_event_dma_addr; 1688 + size_t ring_size_in_bytes, event_size_in_bytes; 1689 + 1690 + ring_size_in_bytes = num * sizeof(struct vring_packed_desc); 1691 + 1692 + ring = vring_alloc_queue(vdev, ring_size_in_bytes, 1693 + &ring_dma_addr, 1694 + GFP_KERNEL|__GFP_NOWARN|__GFP_ZERO); 1695 + if (!ring) 1696 + goto err; 1697 + 1698 + vring_packed->vring.desc = ring; 1699 + vring_packed->ring_dma_addr = ring_dma_addr; 1700 + vring_packed->ring_size_in_bytes = ring_size_in_bytes; 1701 + 1702 + event_size_in_bytes = sizeof(struct vring_packed_desc_event); 1703 + 1704 + driver = vring_alloc_queue(vdev, event_size_in_bytes, 1705 + &driver_event_dma_addr, 1706 + GFP_KERNEL|__GFP_NOWARN|__GFP_ZERO); 1707 + if (!driver) 1708 + goto err; 1709 + 1710 + vring_packed->vring.driver = driver; 1711 + vring_packed->event_size_in_bytes = event_size_in_bytes; 1712 + vring_packed->driver_event_dma_addr = driver_event_dma_addr; 1713 + 1714 + device = vring_alloc_queue(vdev, event_size_in_bytes, 1715 + &device_event_dma_addr, 1716 + GFP_KERNEL|__GFP_NOWARN|__GFP_ZERO); 1717 + if (!device) 1718 + goto err; 1719 + 1720 + vring_packed->vring.device = device; 1721 + vring_packed->device_event_dma_addr = device_event_dma_addr; 1722 + 1723 + vring_packed->vring.num = num; 1724 + 1725 + return 0; 1726 + 1727 + err: 1728 + vring_free_packed(vring_packed, vdev); 1729 + return -ENOMEM; 1730 + } 1731 + 1732 + static int vring_alloc_state_extra_packed(struct vring_virtqueue_packed *vring_packed) 1733 + { 1734 + struct vring_desc_state_packed *state; 1735 + struct vring_desc_extra *extra; 1736 + u32 num = vring_packed->vring.num; 1737 + 1738 + state = kmalloc_array(num, sizeof(struct vring_desc_state_packed), GFP_KERNEL); 1739 + if (!state) 1740 + goto err_desc_state; 1741 + 1742 + memset(state, 0, num * sizeof(struct vring_desc_state_packed)); 1743 + 1744 + extra = vring_alloc_desc_extra(num); 1745 + if (!extra) 1746 + goto err_desc_extra; 1747 + 1748 + 
vring_packed->desc_state = state; 1749 + vring_packed->desc_extra = extra; 1750 + 1751 + return 0; 1752 + 1753 + err_desc_extra: 1754 + kfree(state); 1755 + err_desc_state: 1756 + return -ENOMEM; 1757 + } 1758 + 1759 + static void virtqueue_vring_init_packed(struct vring_virtqueue_packed *vring_packed, 1760 + bool callback) 1761 + { 1762 + vring_packed->next_avail_idx = 0; 1763 + vring_packed->avail_wrap_counter = 1; 1764 + vring_packed->event_flags_shadow = 0; 1765 + vring_packed->avail_used_flags = 1 << VRING_PACKED_DESC_F_AVAIL; 1766 + 1767 + /* No callback? Tell other side not to bother us. */ 1768 + if (!callback) { 1769 + vring_packed->event_flags_shadow = VRING_PACKED_EVENT_FLAG_DISABLE; 1770 + vring_packed->vring.driver->flags = 1771 + cpu_to_le16(vring_packed->event_flags_shadow); 1772 + } 1773 + } 1774 + 1775 + static void virtqueue_vring_attach_packed(struct vring_virtqueue *vq, 1776 + struct vring_virtqueue_packed *vring_packed) 1777 + { 1778 + vq->packed = *vring_packed; 1779 + 1780 + /* Put everything in free lists. */ 1781 + vq->free_head = 0; 1782 + } 1783 + 1784 + static void virtqueue_reinit_packed(struct vring_virtqueue *vq) 1785 + { 1786 + memset(vq->packed.vring.device, 0, vq->packed.event_size_in_bytes); 1787 + memset(vq->packed.vring.driver, 0, vq->packed.event_size_in_bytes); 1788 + 1789 + /* we need to reset the desc.flags. 
For more, see is_used_desc_packed() */ 1790 + memset(vq->packed.vring.desc, 0, vq->packed.ring_size_in_bytes); 1791 + 1792 + virtqueue_init(vq, vq->packed.vring.num); 1793 + virtqueue_vring_init_packed(&vq->packed, !!vq->vq.callback); 1794 + } 1795 + 1835 1796 static struct virtqueue *vring_create_virtqueue_packed( 1836 1797 unsigned int index, 1837 1798 unsigned int num, ··· 1981 1668 void (*callback)(struct virtqueue *), 1982 1669 const char *name) 1983 1670 { 1671 + struct vring_virtqueue_packed vring_packed = {}; 1984 1672 struct vring_virtqueue *vq; 1985 - struct vring_packed_desc *ring; 1986 - struct vring_packed_desc_event *driver, *device; 1987 - dma_addr_t ring_dma_addr, driver_event_dma_addr, device_event_dma_addr; 1988 - size_t ring_size_in_bytes, event_size_in_bytes; 1673 + int err; 1989 1674 1990 - ring_size_in_bytes = num * sizeof(struct vring_packed_desc); 1991 - 1992 - ring = vring_alloc_queue(vdev, ring_size_in_bytes, 1993 - &ring_dma_addr, 1994 - GFP_KERNEL|__GFP_NOWARN|__GFP_ZERO); 1995 - if (!ring) 1675 + if (vring_alloc_queue_packed(&vring_packed, vdev, num)) 1996 1676 goto err_ring; 1997 - 1998 - event_size_in_bytes = sizeof(struct vring_packed_desc_event); 1999 - 2000 - driver = vring_alloc_queue(vdev, event_size_in_bytes, 2001 - &driver_event_dma_addr, 2002 - GFP_KERNEL|__GFP_NOWARN|__GFP_ZERO); 2003 - if (!driver) 2004 - goto err_driver; 2005 - 2006 - device = vring_alloc_queue(vdev, event_size_in_bytes, 2007 - &device_event_dma_addr, 2008 - GFP_KERNEL|__GFP_NOWARN|__GFP_ZERO); 2009 - if (!device) 2010 - goto err_device; 2011 1677 2012 1678 vq = kmalloc(sizeof(*vq), GFP_KERNEL); 2013 1679 if (!vq) ··· 1995 1703 vq->vq.callback = callback; 1996 1704 vq->vq.vdev = vdev; 1997 1705 vq->vq.name = name; 1998 - vq->vq.num_free = num; 1999 1706 vq->vq.index = index; 1707 + vq->vq.reset = false; 2000 1708 vq->we_own_ring = true; 2001 1709 vq->notify = notify; 2002 1710 vq->weak_barriers = weak_barriers; ··· 2005 1713 #else 2006 1714 vq->broken = 
false; 2007 1715 #endif 2008 - vq->last_used_idx = 0 | (1 << VRING_PACKED_EVENT_F_WRAP_CTR); 2009 - vq->event_triggered = false; 2010 - vq->num_added = 0; 2011 1716 vq->packed_ring = true; 2012 1717 vq->use_dma_api = vring_use_dma_api(vdev); 2013 - #ifdef DEBUG 2014 - vq->in_use = false; 2015 - vq->last_add_time_valid = false; 2016 - #endif 2017 1718 2018 1719 vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) && 2019 1720 !context; ··· 2015 1730 if (virtio_has_feature(vdev, VIRTIO_F_ORDER_PLATFORM)) 2016 1731 vq->weak_barriers = false; 2017 1732 2018 - vq->packed.ring_dma_addr = ring_dma_addr; 2019 - vq->packed.driver_event_dma_addr = driver_event_dma_addr; 2020 - vq->packed.device_event_dma_addr = device_event_dma_addr; 1733 + err = vring_alloc_state_extra_packed(&vring_packed); 1734 + if (err) 1735 + goto err_state_extra; 2021 1736 2022 - vq->packed.ring_size_in_bytes = ring_size_in_bytes; 2023 - vq->packed.event_size_in_bytes = event_size_in_bytes; 1737 + virtqueue_vring_init_packed(&vring_packed, !!callback); 2024 1738 2025 - vq->packed.vring.num = num; 2026 - vq->packed.vring.desc = ring; 2027 - vq->packed.vring.driver = driver; 2028 - vq->packed.vring.device = device; 2029 - 2030 - vq->packed.next_avail_idx = 0; 2031 - vq->packed.avail_wrap_counter = 1; 2032 - vq->packed.event_flags_shadow = 0; 2033 - vq->packed.avail_used_flags = 1 << VRING_PACKED_DESC_F_AVAIL; 2034 - 2035 - vq->packed.desc_state = kmalloc_array(num, 2036 - sizeof(struct vring_desc_state_packed), 2037 - GFP_KERNEL); 2038 - if (!vq->packed.desc_state) 2039 - goto err_desc_state; 2040 - 2041 - memset(vq->packed.desc_state, 0, 2042 - num * sizeof(struct vring_desc_state_packed)); 2043 - 2044 - /* Put everything in free lists. */ 2045 - vq->free_head = 0; 2046 - 2047 - vq->packed.desc_extra = vring_alloc_desc_extra(vq, num); 2048 - if (!vq->packed.desc_extra) 2049 - goto err_desc_extra; 2050 - 2051 - /* No callback? Tell other side not to bother us. 
*/ 2052 - if (!callback) { 2053 - vq->packed.event_flags_shadow = VRING_PACKED_EVENT_FLAG_DISABLE; 2054 - vq->packed.vring.driver->flags = 2055 - cpu_to_le16(vq->packed.event_flags_shadow); 2056 - } 1739 + virtqueue_init(vq, num); 1740 + virtqueue_vring_attach_packed(vq, &vring_packed); 2057 1741 2058 1742 spin_lock(&vdev->vqs_list_lock); 2059 1743 list_add_tail(&vq->vq.list, &vdev->vqs); 2060 1744 spin_unlock(&vdev->vqs_list_lock); 2061 1745 return &vq->vq; 2062 1746 2063 - err_desc_extra: 2064 - kfree(vq->packed.desc_state); 2065 - err_desc_state: 1747 + err_state_extra: 2066 1748 kfree(vq); 2067 1749 err_vq: 2068 - vring_free_queue(vdev, event_size_in_bytes, device, device_event_dma_addr); 2069 - err_device: 2070 - vring_free_queue(vdev, event_size_in_bytes, driver, driver_event_dma_addr); 2071 - err_driver: 2072 - vring_free_queue(vdev, ring_size_in_bytes, ring, ring_dma_addr); 1750 + vring_free_packed(&vring_packed, vdev); 2073 1751 err_ring: 2074 1752 return NULL; 1753 + } 1754 + 1755 + static int virtqueue_resize_packed(struct virtqueue *_vq, u32 num) 1756 + { 1757 + struct vring_virtqueue_packed vring_packed = {}; 1758 + struct vring_virtqueue *vq = to_vvq(_vq); 1759 + struct virtio_device *vdev = _vq->vdev; 1760 + int err; 1761 + 1762 + if (vring_alloc_queue_packed(&vring_packed, vdev, num)) 1763 + goto err_ring; 1764 + 1765 + err = vring_alloc_state_extra_packed(&vring_packed); 1766 + if (err) 1767 + goto err_state_extra; 1768 + 1769 + vring_free(&vq->vq); 1770 + 1771 + virtqueue_vring_init_packed(&vring_packed, !!vq->vq.callback); 1772 + 1773 + virtqueue_init(vq, vring_packed.vring.num); 1774 + virtqueue_vring_attach_packed(vq, &vring_packed); 1775 + 1776 + return 0; 1777 + 1778 + err_state_extra: 1779 + vring_free_packed(&vring_packed, vdev); 1780 + err_ring: 1781 + virtqueue_reinit_packed(vq); 1782 + return -ENOMEM; 2075 1783 } 2076 1784 2077 1785 ··· 2409 2131 * @_vq: the struct virtqueue we're talking about. 
2410 2132 * 2411 2133 * Returns NULL or the "data" token handed to virtqueue_add_*(). 2412 - * This is not valid on an active queue; it is useful only for device 2413 - * shutdown. 2134 + * This is not valid on an active queue; it is useful for device 2135 + * shutdown or the reset queue. 2414 2136 */ 2415 2137 void *virtqueue_detach_unused_buf(struct virtqueue *_vq) 2416 2138 { ··· 2458 2180 EXPORT_SYMBOL_GPL(vring_interrupt); 2459 2181 2460 2182 /* Only available for split ring */ 2461 - struct virtqueue *__vring_new_virtqueue(unsigned int index, 2462 - struct vring vring, 2463 - struct virtio_device *vdev, 2464 - bool weak_barriers, 2465 - bool context, 2466 - bool (*notify)(struct virtqueue *), 2467 - void (*callback)(struct virtqueue *), 2468 - const char *name) 2183 + static struct virtqueue *__vring_new_virtqueue(unsigned int index, 2184 + struct vring_virtqueue_split *vring_split, 2185 + struct virtio_device *vdev, 2186 + bool weak_barriers, 2187 + bool context, 2188 + bool (*notify)(struct virtqueue *), 2189 + void (*callback)(struct virtqueue *), 2190 + const char *name) 2469 2191 { 2470 2192 struct vring_virtqueue *vq; 2193 + int err; 2471 2194 2472 2195 if (virtio_has_feature(vdev, VIRTIO_F_RING_PACKED)) 2473 2196 return NULL; ··· 2481 2202 vq->vq.callback = callback; 2482 2203 vq->vq.vdev = vdev; 2483 2204 vq->vq.name = name; 2484 - vq->vq.num_free = vring.num; 2485 2205 vq->vq.index = index; 2206 + vq->vq.reset = false; 2486 2207 vq->we_own_ring = false; 2487 2208 vq->notify = notify; 2488 2209 vq->weak_barriers = weak_barriers; ··· 2491 2212 #else 2492 2213 vq->broken = false; 2493 2214 #endif 2494 - vq->last_used_idx = 0; 2495 - vq->event_triggered = false; 2496 - vq->num_added = 0; 2497 2215 vq->use_dma_api = vring_use_dma_api(vdev); 2498 - #ifdef DEBUG 2499 - vq->in_use = false; 2500 - vq->last_add_time_valid = false; 2501 - #endif 2502 2216 2503 2217 vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) && 2504 2218 !context; ··· 
2500 2228 if (virtio_has_feature(vdev, VIRTIO_F_ORDER_PLATFORM)) 2501 2229 vq->weak_barriers = false; 2502 2230 2503 - vq->split.queue_dma_addr = 0; 2504 - vq->split.queue_size_in_bytes = 0; 2505 - 2506 - vq->split.vring = vring; 2507 - vq->split.avail_flags_shadow = 0; 2508 - vq->split.avail_idx_shadow = 0; 2509 - 2510 - /* No callback? Tell other side not to bother us. */ 2511 - if (!callback) { 2512 - vq->split.avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT; 2513 - if (!vq->event) 2514 - vq->split.vring.avail->flags = cpu_to_virtio16(vdev, 2515 - vq->split.avail_flags_shadow); 2231 + err = vring_alloc_state_extra_split(vring_split); 2232 + if (err) { 2233 + kfree(vq); 2234 + return NULL; 2516 2235 } 2517 2236 2518 - vq->split.desc_state = kmalloc_array(vring.num, 2519 - sizeof(struct vring_desc_state_split), GFP_KERNEL); 2520 - if (!vq->split.desc_state) 2521 - goto err_state; 2237 + virtqueue_vring_init_split(vring_split, vq); 2522 2238 2523 - vq->split.desc_extra = vring_alloc_desc_extra(vq, vring.num); 2524 - if (!vq->split.desc_extra) 2525 - goto err_extra; 2526 - 2527 - /* Put everything in free lists. 
*/ 2528 - vq->free_head = 0; 2529 - memset(vq->split.desc_state, 0, vring.num * 2530 - sizeof(struct vring_desc_state_split)); 2239 + virtqueue_init(vq, vring_split->vring.num); 2240 + virtqueue_vring_attach_split(vq, vring_split); 2531 2241 2532 2242 spin_lock(&vdev->vqs_list_lock); 2533 2243 list_add_tail(&vq->vq.list, &vdev->vqs); 2534 2244 spin_unlock(&vdev->vqs_list_lock); 2535 2245 return &vq->vq; 2536 - 2537 - err_extra: 2538 - kfree(vq->split.desc_state); 2539 - err_state: 2540 - kfree(vq); 2541 - return NULL; 2542 2246 } 2543 - EXPORT_SYMBOL_GPL(__vring_new_virtqueue); 2544 2247 2545 2248 struct virtqueue *vring_create_virtqueue( 2546 2249 unsigned int index, ··· 2541 2294 } 2542 2295 EXPORT_SYMBOL_GPL(vring_create_virtqueue); 2543 2296 2297 + /** 2298 + * virtqueue_resize - resize the vring of vq 2299 + * @_vq: the struct virtqueue we're talking about. 2300 + * @num: new ring num 2301 + * @recycle: callback for recycle the useless buffer 2302 + * 2303 + * When it is really necessary to create a new vring, it will set the current vq 2304 + * into the reset state. Then call the passed callback to recycle the buffer 2305 + * that is no longer used. Only after the new vring is successfully created, the 2306 + * old vring will be released. 2307 + * 2308 + * Caller must ensure we don't call this with other virtqueue operations 2309 + * at the same time (except where noted). 2310 + * 2311 + * Returns zero or a negative error. 2312 + * 0: success. 2313 + * -ENOMEM: Failed to allocate a new ring, fall back to the original ring size. 
2314 + * vq can still work normally 2315 + * -EBUSY: Failed to sync with device, vq may not work properly 2316 + * -ENOENT: Transport or device not supported 2317 + * -E2BIG/-EINVAL: num error 2318 + * -EPERM: Operation not permitted 2319 + * 2320 + */ 2321 + int virtqueue_resize(struct virtqueue *_vq, u32 num, 2322 + void (*recycle)(struct virtqueue *vq, void *buf)) 2323 + { 2324 + struct vring_virtqueue *vq = to_vvq(_vq); 2325 + struct virtio_device *vdev = vq->vq.vdev; 2326 + void *buf; 2327 + int err; 2328 + 2329 + if (!vq->we_own_ring) 2330 + return -EPERM; 2331 + 2332 + if (num > vq->vq.num_max) 2333 + return -E2BIG; 2334 + 2335 + if (!num) 2336 + return -EINVAL; 2337 + 2338 + if ((vq->packed_ring ? vq->packed.vring.num : vq->split.vring.num) == num) 2339 + return 0; 2340 + 2341 + if (!vdev->config->disable_vq_and_reset) 2342 + return -ENOENT; 2343 + 2344 + if (!vdev->config->enable_vq_after_reset) 2345 + return -ENOENT; 2346 + 2347 + err = vdev->config->disable_vq_and_reset(_vq); 2348 + if (err) 2349 + return err; 2350 + 2351 + while ((buf = virtqueue_detach_unused_buf(_vq)) != NULL) 2352 + recycle(_vq, buf); 2353 + 2354 + if (vq->packed_ring) 2355 + err = virtqueue_resize_packed(_vq, num); 2356 + else 2357 + err = virtqueue_resize_split(_vq, num); 2358 + 2359 + if (vdev->config->enable_vq_after_reset(_vq)) 2360 + return -EBUSY; 2361 + 2362 + return err; 2363 + } 2364 + EXPORT_SYMBOL_GPL(virtqueue_resize); 2365 + 2544 2366 /* Only available for split ring */ 2545 2367 struct virtqueue *vring_new_virtqueue(unsigned int index, 2546 2368 unsigned int num, ··· 2622 2306 void (*callback)(struct virtqueue *vq), 2623 2307 const char *name) 2624 2308 { 2625 - struct vring vring; 2309 + struct vring_virtqueue_split vring_split = {}; 2626 2310 2627 2311 if (virtio_has_feature(vdev, VIRTIO_F_RING_PACKED)) 2628 2312 return NULL; 2629 2313 2630 - vring_init(&vring, num, pages, vring_align); 2631 - return __vring_new_virtqueue(index, vring, vdev, weak_barriers, context, 
2632 - notify, callback, name); 2314 + vring_init(&vring_split.vring, num, pages, vring_align); 2315 + return __vring_new_virtqueue(index, &vring_split, vdev, weak_barriers, 2316 + context, notify, callback, name); 2633 2317 } 2634 2318 EXPORT_SYMBOL_GPL(vring_new_virtqueue); 2635 2319 2636 - void vring_del_virtqueue(struct virtqueue *_vq) 2320 + static void vring_free(struct virtqueue *_vq) 2637 2321 { 2638 2322 struct vring_virtqueue *vq = to_vvq(_vq); 2639 - 2640 - spin_lock(&vq->vq.vdev->vqs_list_lock); 2641 - list_del(&_vq->list); 2642 - spin_unlock(&vq->vq.vdev->vqs_list_lock); 2643 2323 2644 2324 if (vq->we_own_ring) { 2645 2325 if (vq->packed_ring) { ··· 2667 2355 kfree(vq->split.desc_state); 2668 2356 kfree(vq->split.desc_extra); 2669 2357 } 2358 + } 2359 + 2360 + void vring_del_virtqueue(struct virtqueue *_vq) 2361 + { 2362 + struct vring_virtqueue *vq = to_vvq(_vq); 2363 + 2364 + spin_lock(&vq->vq.vdev->vqs_list_lock); 2365 + list_del(&_vq->list); 2366 + spin_unlock(&vq->vq.vdev->vqs_list_lock); 2367 + 2368 + vring_free(_vq); 2369 + 2670 2370 kfree(vq); 2671 2371 } 2672 2372 EXPORT_SYMBOL_GPL(vring_del_virtqueue); ··· 2725 2401 return vq->packed_ring ? vq->packed.vring.num : vq->split.vring.num; 2726 2402 } 2727 2403 EXPORT_SYMBOL_GPL(virtqueue_get_vring_size); 2404 + 2405 + /* 2406 + * This function should only be called by the core, not directly by the driver. 2407 + */ 2408 + void __virtqueue_break(struct virtqueue *_vq) 2409 + { 2410 + struct vring_virtqueue *vq = to_vvq(_vq); 2411 + 2412 + /* Pairs with READ_ONCE() in virtqueue_is_broken(). */ 2413 + WRITE_ONCE(vq->broken, true); 2414 + } 2415 + EXPORT_SYMBOL_GPL(__virtqueue_break); 2416 + 2417 + /* 2418 + * This function should only be called by the core, not directly by the driver. 2419 + */ 2420 + void __virtqueue_unbreak(struct virtqueue *_vq) 2421 + { 2422 + struct vring_virtqueue *vq = to_vvq(_vq); 2423 + 2424 + /* Pairs with READ_ONCE() in virtqueue_is_broken(). 
*/ 2425 + WRITE_ONCE(vq->broken, false); 2426 + } 2427 + EXPORT_SYMBOL_GPL(__virtqueue_unbreak); 2728 2428 2729 2429 bool virtqueue_is_broken(struct virtqueue *_vq) 2730 2430 {
+12 -6
drivers/virtio/virtio_vdpa.c
··· 131 131 static struct virtqueue * 132 132 virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned int index, 133 133 void (*callback)(struct virtqueue *vq), 134 - const char *name, bool ctx) 134 + const char *name, u32 size, bool ctx) 135 135 { 136 136 struct virtio_vdpa_device *vd_dev = to_virtio_vdpa_device(vdev); 137 137 struct vdpa_device *vdpa = vd_get_vdpa(vdev); ··· 168 168 goto error_new_virtqueue; 169 169 } 170 170 171 + if (!size || size > max_num) 172 + size = max_num; 173 + 171 174 if (ops->get_vq_num_min) 172 175 min_num = ops->get_vq_num_min(vdpa); 173 176 174 - may_reduce_num = (max_num == min_num) ? false : true; 177 + may_reduce_num = (size == min_num) ? false : true; 175 178 176 179 /* Create the vring */ 177 180 align = ops->get_vq_align(vdpa); 178 - vq = vring_create_virtqueue(index, max_num, align, vdev, 181 + vq = vring_create_virtqueue(index, size, align, vdev, 179 182 true, may_reduce_num, ctx, 180 183 virtio_vdpa_notify, callback, name); 181 184 if (!vq) { 182 185 err = -ENOMEM; 183 186 goto error_new_virtqueue; 184 187 } 188 + 189 + vq->num_max = max_num; 185 190 186 191 /* Setup virtqueue callback */ 187 192 cb.callback = callback ? virtio_vdpa_virtqueue_cb : NULL; ··· 272 267 struct virtqueue *vqs[], 273 268 vq_callback_t *callbacks[], 274 269 const char * const names[], 270 + u32 sizes[], 275 271 const bool *ctx, 276 272 struct irq_affinity *desc) 277 273 { ··· 288 282 continue; 289 283 } 290 284 291 - vqs[i] = virtio_vdpa_setup_vq(vdev, queue_idx++, 292 - callbacks[i], names[i], ctx ? 293 - ctx[i] : false); 285 + vqs[i] = virtio_vdpa_setup_vq(vdev, queue_idx++, callbacks[i], 286 + names[i], sizes ? sizes[i] : 0, 287 + ctx ? ctx[i] : false); 294 288 if (IS_ERR(vqs[i])) { 295 289 err = PTR_ERR(vqs[i]); 296 290 goto err_setup_vq;
+8
include/linux/mlx5/mlx5_ifc_vdpa.h
··· 150 150 MLX5_VIRTIO_NET_Q_OBJECT_STATE_ERR = 0x3, 151 151 }; 152 152 153 + /* This indicates that the object was not created or has already 154 + * been destroyed. It is very safe to assume that this object will never 155 + * have so many states 156 + */ 157 + enum { 158 + MLX5_VIRTIO_NET_Q_OBJECT_NONE = 0xffffffff 159 + }; 160 + 153 161 enum { 154 162 MLX5_RQTC_LIST_Q_TYPE_RQ = 0x0, 155 163 MLX5_RQTC_LIST_Q_TYPE_VIRTIO_NET_Q = 0x1,
+2 -2
include/linux/remoteproc.h
··· 597 597 /** 598 598 * struct rproc_vring - remoteproc vring state 599 599 * @va: virtual address 600 - * @len: length, in bytes 600 + * @num: vring size 601 601 * @da: device address 602 602 * @align: vring alignment 603 603 * @notifyid: rproc-specific unique vring index ··· 606 606 */ 607 607 struct rproc_vring { 608 608 void *va; 609 - int len; 609 + int num; 610 610 u32 da; 611 611 u32 align; 612 612 int notifyid;
+4
include/linux/vdpa.h
··· 218 218 * @reset: Reset device 219 219 * @vdev: vdpa device 220 220 * Returns integer: success (0) or error (< 0) 221 + * @suspend: Suspend or resume the device (optional) 222 + * @vdev: vdpa device 223 + * Returns integer: success (0) or error (< 0) 221 224 * @get_config_size: Get the size of the configuration space includes 222 225 * fields that are conditional on feature bits. 223 226 * @vdev: vdpa device ··· 322 319 u8 (*get_status)(struct vdpa_device *vdev); 323 320 void (*set_status)(struct vdpa_device *vdev, u8 status); 324 321 int (*reset)(struct vdpa_device *vdev); 322 + int (*suspend)(struct vdpa_device *vdev); 325 323 size_t (*get_config_size)(struct vdpa_device *vdev); 326 324 void (*get_config)(struct vdpa_device *vdev, unsigned int offset, 327 325 void *buf, unsigned int len);
+10
include/linux/virtio.h
··· 19 19 * @priv: a pointer for the virtqueue implementation to use. 20 20 * @index: the zero-based ordinal number for this queue. 21 21 * @num_free: number of elements we expect to be able to fit. 22 + * @num_max: the maximum number of elements supported by the device. 23 + * @reset: vq is in reset state or not. 22 24 * 23 25 * A note on @num_free: with indirect buffers, each buffer needs one 24 26 * element in the queue, otherwise a buffer will need one element per ··· 33 31 struct virtio_device *vdev; 34 32 unsigned int index; 35 33 unsigned int num_free; 34 + unsigned int num_max; 36 35 void *priv; 36 + bool reset; 37 37 }; 38 38 39 39 int virtqueue_add_outbuf(struct virtqueue *vq, ··· 93 89 dma_addr_t virtqueue_get_avail_addr(struct virtqueue *vq); 94 90 dma_addr_t virtqueue_get_used_addr(struct virtqueue *vq); 95 91 92 + int virtqueue_resize(struct virtqueue *vq, u32 num, 93 + void (*recycle)(struct virtqueue *vq, void *buf)); 94 + 96 95 /** 97 96 * virtio_device - representation of a device using virtio 98 97 * @index: unique position on the virtio bus ··· 139 132 140 133 void virtio_break_device(struct virtio_device *dev); 141 134 void __virtio_unbreak_device(struct virtio_device *dev); 135 + 136 + void __virtqueue_break(struct virtqueue *_vq); 137 + void __virtqueue_unbreak(struct virtqueue *_vq); 142 138 143 139 void virtio_config_changed(struct virtio_device *dev); 144 140 #ifdef CONFIG_PM_SLEEP
+35 -5
include/linux/virtio_config.h
··· 55 55 * include a NULL entry for vqs that do not need a callback 56 56 * names: array of virtqueue names (mainly for debugging) 57 57 * include a NULL entry for vqs unused by driver 58 + * sizes: array of virtqueue sizes 58 59 * Returns 0 on success or error status 59 60 * @del_vqs: free virtqueues found by find_vqs(). 60 61 * @synchronize_cbs: synchronize with the virtqueue callbacks (optional) ··· 79 78 * @set_vq_affinity: set the affinity for a virtqueue (optional). 80 79 * @get_vq_affinity: get the affinity for a virtqueue (optional). 81 80 * @get_shm_region: get a shared memory region based on the index. 81 + * @disable_vq_and_reset: reset a queue individually (optional). 82 + * vq: the virtqueue 83 + * Returns 0 on success or error status 84 + * disable_vq_and_reset will guarantee that the callbacks are disabled and 85 + * synchronized. 86 + * Except for the callback, the caller should guarantee that the vring is 87 + * not accessed by any functions of virtqueue. 88 + * @enable_vq_after_reset: enable a reset queue 89 + * vq: the virtqueue 90 + * Returns 0 on success or error status 91 + * If disable_vq_and_reset is set, then enable_vq_after_reset must also be 92 + * set. 
82 93 */ 83 94 typedef void vq_callback_t(struct virtqueue *); 84 95 struct virtio_config_ops { ··· 104 91 void (*reset)(struct virtio_device *vdev); 105 92 int (*find_vqs)(struct virtio_device *, unsigned nvqs, 106 93 struct virtqueue *vqs[], vq_callback_t *callbacks[], 107 - const char * const names[], const bool *ctx, 94 + const char * const names[], 95 + u32 sizes[], 96 + const bool *ctx, 108 97 struct irq_affinity *desc); 109 98 void (*del_vqs)(struct virtio_device *); 110 99 void (*synchronize_cbs)(struct virtio_device *); ··· 119 104 int index); 120 105 bool (*get_shm_region)(struct virtio_device *vdev, 121 106 struct virtio_shm_region *region, u8 id); 107 + int (*disable_vq_and_reset)(struct virtqueue *vq); 108 + int (*enable_vq_after_reset)(struct virtqueue *vq); 122 109 }; 123 110 124 111 /* If driver didn't advertise the feature, it will never appear. */ ··· 215 198 const char *names[] = { n }; 216 199 struct virtqueue *vq; 217 200 int err = vdev->config->find_vqs(vdev, 1, &vq, callbacks, names, NULL, 218 - NULL); 201 + NULL, NULL); 219 202 if (err < 0) 220 203 return ERR_PTR(err); 221 204 return vq; ··· 227 210 const char * const names[], 228 211 struct irq_affinity *desc) 229 212 { 230 - return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names, NULL, desc); 213 + return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names, NULL, 214 + NULL, desc); 231 215 } 232 216 233 217 static inline ··· 237 219 const char * const names[], const bool *ctx, 238 220 struct irq_affinity *desc) 239 221 { 240 - return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names, ctx, 241 - desc); 222 + return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names, NULL, 223 + ctx, desc); 224 + } 225 + 226 + static inline 227 + int virtio_find_vqs_ctx_size(struct virtio_device *vdev, u32 nvqs, 228 + struct virtqueue *vqs[], 229 + vq_callback_t *callbacks[], 230 + const char * const names[], 231 + u32 sizes[], 232 + const bool *ctx, struct irq_affinity *desc) 233 
+ { 234 + return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names, sizes, 235 + ctx, desc); 242 236 } 243 237 244 238 /**
+9
include/linux/virtio_pci_modern.h
··· 5 5 #include <linux/pci.h> 6 6 #include <linux/virtio_pci.h> 7 7 8 + struct virtio_pci_modern_common_cfg { 9 + struct virtio_pci_common_cfg cfg; 10 + 11 + __le16 queue_notify_data; /* read-write */ 12 + __le16 queue_reset; /* read-write */ 13 + }; 14 + 8 15 struct virtio_pci_modern_device { 9 16 struct pci_dev *pci_dev; 10 17 ··· 113 106 u16 index, resource_size_t *pa); 114 107 int vp_modern_probe(struct virtio_pci_modern_device *mdev); 115 108 void vp_modern_remove(struct virtio_pci_modern_device *mdev); 109 + int vp_modern_get_queue_reset(struct virtio_pci_modern_device *mdev, u16 index); 110 + void vp_modern_set_queue_reset(struct virtio_pci_modern_device *mdev, u16 index); 116 111 #endif
-10
include/linux/virtio_ring.h
··· 76 76 void (*callback)(struct virtqueue *vq), 77 77 const char *name); 78 78 79 - /* Creates a virtqueue with a custom layout. */ 80 - struct virtqueue *__vring_new_virtqueue(unsigned int index, 81 - struct vring vring, 82 - struct virtio_device *vdev, 83 - bool weak_barriers, 84 - bool ctx, 85 - bool (*notify)(struct virtqueue *), 86 - void (*callback)(struct virtqueue *), 87 - const char *name); 88 - 89 79 /* 90 80 * Creates a virtqueue with a standard layout but a caller-allocated 91 81 * ring.
+47
include/uapi/linux/vduse.h
··· 210 210 */ 211 211 #define VDUSE_VQ_INJECT_IRQ _IOW(VDUSE_BASE, 0x17, __u32) 212 212 213 + /** 214 + * struct vduse_iova_umem - userspace memory configuration for one IOVA region 215 + * @uaddr: start address of userspace memory, it must be aligned to page size 216 + * @iova: start of the IOVA region 217 + * @size: size of the IOVA region 218 + * @reserved: for future use, needs to be initialized to zero 219 + * 220 + * Structure used by VDUSE_IOTLB_REG_UMEM and VDUSE_IOTLB_DEREG_UMEM 221 + * ioctls to register/de-register userspace memory for IOVA regions 222 + */ 223 + struct vduse_iova_umem { 224 + __u64 uaddr; 225 + __u64 iova; 226 + __u64 size; 227 + __u64 reserved[3]; 228 + }; 229 + 230 + /* Register userspace memory for IOVA regions */ 231 + #define VDUSE_IOTLB_REG_UMEM _IOW(VDUSE_BASE, 0x18, struct vduse_iova_umem) 232 + 233 + /* De-register the userspace memory. Caller should set iova and size field. */ 234 + #define VDUSE_IOTLB_DEREG_UMEM _IOW(VDUSE_BASE, 0x19, struct vduse_iova_umem) 235 + 236 + /** 237 + * struct vduse_iova_info - information of one IOVA region 238 + * @start: start of the IOVA region 239 + * @last: last of the IOVA region 240 + * @capability: capability of the IOVA region 241 + * @reserved: for future use, needs to be initialized to zero 242 + * 243 + * Structure used by VDUSE_IOTLB_GET_INFO ioctl to get information of 244 + * one IOVA region. 245 + */ 246 + struct vduse_iova_info { 247 + __u64 start; 248 + __u64 last; 249 + #define VDUSE_IOVA_CAP_UMEM (1 << 0) 250 + __u64 capability; 251 + __u64 reserved[3]; 252 + }; 253 + 254 + /* 255 + * Find the first IOVA region that overlaps with the range [start, last] 256 + * and return some information on it. Caller should set start and last fields. 257 + */ 258 + #define VDUSE_IOTLB_GET_INFO _IOWR(VDUSE_BASE, 0x1a, struct vduse_iova_info) 259 + 213 260 /* The control messages definition for read(2)/write(2) on /dev/vduse/$NAME */ 214 261 215 262 /**
+9
include/uapi/linux/vhost.h
··· 171 171 #define VHOST_VDPA_SET_GROUP_ASID _IOW(VHOST_VIRTIO, 0x7C, \ 172 172 struct vhost_vring_state) 173 173 174 + /* Suspend a device so it does not process virtqueue requests anymore 175 + * 176 + * After the return of ioctl the device must preserve all the necessary state 177 + * (the virtqueue vring base plus the possible device specific states) that is 178 + * required for restoring in the future. The device must not change its 179 + * configuration after that point. 180 + */ 181 + #define VHOST_VDPA_SUSPEND _IO(VHOST_VIRTIO, 0x7D) 182 + 174 183 #endif
+2
include/uapi/linux/vhost_types.h
··· 161 161 * message 162 162 */ 163 163 #define VHOST_BACKEND_F_IOTLB_ASID 0x3 164 + /* Device can be suspended */ 165 + #define VHOST_BACKEND_F_SUSPEND 0x4 164 166 165 167 #endif
+6 -1
include/uapi/linux/virtio_config.h
··· 52 52 * rest are per-device feature bits. 53 53 */ 54 54 #define VIRTIO_TRANSPORT_F_START 28 55 - #define VIRTIO_TRANSPORT_F_END 38 55 + #define VIRTIO_TRANSPORT_F_END 41 56 56 57 57 #ifndef VIRTIO_CONFIG_NO_LEGACY 58 58 /* Do we get callbacks when the ring is completely used, even if we've ··· 98 98 * Does the device support Single Root I/O Virtualization? 99 99 */ 100 100 #define VIRTIO_F_SR_IOV 37 101 + 102 + /* 103 + * This feature indicates that the driver can reset a queue individually. 104 + */ 105 + #define VIRTIO_F_RING_RESET 40 101 106 #endif /* _UAPI_LINUX_VIRTIO_CONFIG_H */
+33 -1
include/uapi/linux/virtio_net.h
··· 56 56 #define VIRTIO_NET_F_MQ 22 /* Device supports Receive Flow 57 57 * Steering */ 58 58 #define VIRTIO_NET_F_CTRL_MAC_ADDR 23 /* Set MAC address */ 59 - 59 + #define VIRTIO_NET_F_NOTF_COAL 53 /* Guest can handle notifications coalescing */ 60 60 #define VIRTIO_NET_F_HASH_REPORT 57 /* Supports hash report */ 61 61 #define VIRTIO_NET_F_RSS 60 /* Supports RSS RX steering */ 62 62 #define VIRTIO_NET_F_RSC_EXT 61 /* extended coalescing info */ ··· 354 354 */ 355 355 #define VIRTIO_NET_CTRL_GUEST_OFFLOADS 5 356 356 #define VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET 0 357 + 358 + /* 359 + * Control notifications coalescing. 360 + * 361 + * Request the device to change the notifications coalescing parameters. 362 + * 363 + * Available with the VIRTIO_NET_F_NOTF_COAL feature bit. 364 + */ 365 + #define VIRTIO_NET_CTRL_NOTF_COAL 6 366 + /* 367 + * Set the tx-usecs/tx-max-packets parameters. 368 + * tx-usecs - Maximum number of usecs to delay a TX notification. 369 + * tx-max-packets - Maximum number of packets to send before a TX notification. 370 + */ 371 + struct virtio_net_ctrl_coal_tx { 372 + __le32 tx_max_packets; 373 + __le32 tx_usecs; 374 + }; 375 + 376 + #define VIRTIO_NET_CTRL_NOTF_COAL_TX_SET 0 377 + 378 + /* 379 + * Set the rx-usecs/rx-max-packets parameters. 380 + * rx-usecs - Maximum number of usecs to delay a RX notification. 381 + * rx-max-frames - Maximum number of packets to receive before a RX notification. 382 + */ 383 + struct virtio_net_ctrl_coal_rx { 384 + __le32 rx_max_packets; 385 + __le32 rx_usecs; 386 + }; 387 + 388 + #define VIRTIO_NET_CTRL_NOTF_COAL_RX_SET 1 357 389 358 390 #endif /* _UAPI_LINUX_VIRTIO_NET_H */
+2
include/uapi/linux/virtio_pci.h
··· 202 202 #define VIRTIO_PCI_COMMON_Q_AVAILHI 44 203 203 #define VIRTIO_PCI_COMMON_Q_USEDLO 48 204 204 #define VIRTIO_PCI_COMMON_Q_USEDHI 52 205 + #define VIRTIO_PCI_COMMON_Q_NDATA 56 206 + #define VIRTIO_PCI_COMMON_Q_RESET 58 205 207 206 208 #endif /* VIRTIO_PCI_NO_MODERN */ 207 209
+1 -1
tools/virtio/linux/kernel.h
··· 29 29 #define READ 0 30 30 #define WRITE 1 31 31 32 - typedef unsigned long long phys_addr_t; 33 32 typedef unsigned long long dma_addr_t; 34 33 typedef size_t __kernel_size_t; 35 34 typedef unsigned int __wsum; ··· 135 136 #endif 136 137 #define dev_err(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__) 137 138 #define dev_warn(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__) 139 + #define dev_warn_once(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__) 138 140 139 141 #define min(x, y) ({ \ 140 142 typeof(x) _min1 = (x); \
+1
tools/virtio/linux/vringh.h
··· 1 + #include <limits.h> 1 2 #include "../../../include/linux/vringh.h"
+2 -2
tools/virtio/virtio_test.c
··· 102 102 103 103 memset(info->ring, 0, vring_size(num, 4096)); 104 104 vring_init(&info->vring, num, info->ring, 4096); 105 - info->vq = __vring_new_virtqueue(info->idx, info->vring, vdev, true, 106 - false, vq_notify, vq_callback, "test"); 105 + info->vq = vring_new_virtqueue(info->idx, num, 4096, vdev, true, false, 106 + info->ring, vq_notify, vq_callback, "test"); 107 107 assert(info->vq); 108 108 info->vq->priv = info; 109 109 }