Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'vhost-net-next' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost

Michael S. Tsirkin says:

--------------------
There are mostly bugfixes here.
I hope to merge some more patches by 3.5; in particular,
vlan support fixes are waiting for Eric's ack,
and a version of the tracepoint patch might be
ready in time, but let's merge what's ready so it's testable.

This includes a ton of zerocopy fixes by Jason -
good stuff, but too intrusive for 3.4, and zerocopy is experimental
anyway.

virtio has supported delayed interrupts for a while now,
so adding support for them to the virtio tool made sense.
--------------------

Signed-off-by: David S. Miller <davem@davemloft.net>

+69 -23
+40 -17
drivers/net/macvtap.c
··· 505 505 if (copy > size) { 506 506 ++from; 507 507 --count; 508 - } 508 + offset = 0; 509 + } else 510 + offset += size; 509 511 copy -= size; 510 512 offset1 += size; 511 - offset = 0; 512 513 } 513 514 514 515 if (len == offset1) ··· 519 518 struct page *page[MAX_SKB_FRAGS]; 520 519 int num_pages; 521 520 unsigned long base; 521 + unsigned long truesize; 522 522 523 - len = from->iov_len - offset1; 523 + len = from->iov_len - offset; 524 524 if (!len) { 525 - offset1 = 0; 525 + offset = 0; 526 526 ++from; 527 527 continue; 528 528 } 529 - base = (unsigned long)from->iov_base + offset1; 529 + base = (unsigned long)from->iov_base + offset; 530 530 size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT; 531 + if (i + size > MAX_SKB_FRAGS) 532 + return -EMSGSIZE; 531 533 num_pages = get_user_pages_fast(base, size, 0, &page[i]); 532 - if ((num_pages != size) || 533 - (num_pages > MAX_SKB_FRAGS - skb_shinfo(skb)->nr_frags)) 534 - /* put_page is in skb free */ 534 + if (num_pages != size) { 535 + for (i = 0; i < num_pages; i++) 536 + put_page(page[i]); 535 537 return -EFAULT; 538 + } 539 + truesize = size * PAGE_SIZE; 536 540 skb->data_len += len; 537 541 skb->len += len; 538 - skb->truesize += len; 539 - atomic_add(len, &skb->sk->sk_wmem_alloc); 542 + skb->truesize += truesize; 543 + atomic_add(truesize, &skb->sk->sk_wmem_alloc); 540 544 while (len) { 541 545 int off = base & ~PAGE_MASK; 542 546 int size = min_t(int, len, PAGE_SIZE - off); ··· 552 546 len -= size; 553 547 i++; 554 548 } 555 - offset1 = 0; 549 + offset = 0; 556 550 ++from; 557 551 } 558 552 return 0; ··· 652 646 int err; 653 647 struct virtio_net_hdr vnet_hdr = { 0 }; 654 648 int vnet_hdr_len = 0; 655 - int copylen; 649 + int copylen = 0; 656 650 bool zerocopy = false; 657 651 658 652 if (q->flags & IFF_VNET_HDR) { ··· 681 675 if (unlikely(len < ETH_HLEN)) 682 676 goto err; 683 677 678 + err = -EMSGSIZE; 679 + if (unlikely(count > UIO_MAXIOV)) 680 + goto err; 681 + 684 682 if (m && 
m->msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) 685 683 zerocopy = true; 686 684 687 685 if (zerocopy) { 686 + /* Userspace may produce vectors with count greater than 687 + * MAX_SKB_FRAGS, so we need to linearize parts of the skb 688 + * to let the rest of data to be fit in the frags. 689 + */ 690 + if (count > MAX_SKB_FRAGS) { 691 + copylen = iov_length(iv, count - MAX_SKB_FRAGS); 692 + if (copylen < vnet_hdr_len) 693 + copylen = 0; 694 + else 695 + copylen -= vnet_hdr_len; 696 + } 688 697 /* There are 256 bytes to be copied in skb, so there is enough 689 698 * room for skb expand head in case it is used. 690 699 * The rest buffer is mapped from userspace. 691 700 */ 692 - copylen = vnet_hdr.hdr_len; 701 + if (copylen < vnet_hdr.hdr_len) 702 + copylen = vnet_hdr.hdr_len; 693 703 if (!copylen) 694 704 copylen = GOODCOPY_LEN; 695 705 } else ··· 716 694 if (!skb) 717 695 goto err; 718 696 719 - if (zerocopy) { 697 + if (zerocopy) 720 698 err = zerocopy_sg_from_iovec(skb, iv, vnet_hdr_len, count); 721 - skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY; 722 - } else 699 + else 723 700 err = skb_copy_datagram_from_iovec(skb, 0, iv, vnet_hdr_len, 724 701 len); 725 702 if (err) ··· 737 716 rcu_read_lock_bh(); 738 717 vlan = rcu_dereference_bh(q->vlan); 739 718 /* copy skb_ubuf_info for callback when skb has no error */ 740 - if (zerocopy) 719 + if (zerocopy) { 741 720 skb_shinfo(skb)->destructor_arg = m->msg_control; 721 + skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY; 722 + } 742 723 if (vlan) 743 724 macvlan_start_xmit(skb, vlan->dev); 744 725 else
+5 -2
drivers/vhost/net.c
··· 166 166 if (wmem < sock->sk->sk_sndbuf / 2) 167 167 tx_poll_stop(net); 168 168 hdr_size = vq->vhost_hlen; 169 - zcopy = vhost_sock_zcopy(sock); 169 + zcopy = vq->ubufs; 170 170 171 171 for (;;) { 172 172 /* Release DMAs done buffers first */ ··· 257 257 UIO_MAXIOV; 258 258 } 259 259 vhost_discard_vq_desc(vq, 1); 260 - tx_poll_start(net, sock); 260 + if (err == -EAGAIN || err == -ENOBUFS) 261 + tx_poll_start(net, sock); 261 262 break; 262 263 } 263 264 if (err != len) ··· 266 265 " len %d != %zd\n", err, len); 267 266 if (!zcopy) 268 267 vhost_add_used_and_signal(&net->dev, vq, head, 0); 268 + else 269 + vhost_zerocopy_signal_used(vq); 269 270 total_len += len; 270 271 if (unlikely(total_len >= VHOST_NET_WEIGHT)) { 271 272 vhost_poll_queue(&vq->poll);
+1
drivers/vhost/vhost.c
··· 1603 1603 struct vhost_ubuf_ref *ubufs = ubuf->ctx; 1604 1604 struct vhost_virtqueue *vq = ubufs->vq; 1605 1605 1606 + vhost_poll_queue(&vq->poll); 1606 1607 /* set len = 1 to mark this desc buffers done DMA */ 1607 1608 vq->heads[ubuf->desc].len = VHOST_DMA_DONE_LEN; 1608 1609 kref_put(&ubufs->kref, vhost_zerocopy_done_signal);
+1
tools/virtio/linux/virtio.h
··· 203 203 void virtqueue_disable_cb(struct virtqueue *vq); 204 204 205 205 bool virtqueue_enable_cb(struct virtqueue *vq); 206 + bool virtqueue_enable_cb_delayed(struct virtqueue *vq); 206 207 207 208 void *virtqueue_detach_unused_buf(struct virtqueue *vq); 208 209 struct virtqueue *vring_new_virtqueue(unsigned int num,
+22 -4
tools/virtio/virtio_test.c
··· 144 144 } 145 145 } 146 146 147 - static void run_test(struct vdev_info *dev, struct vq_info *vq, int bufs) 147 + static void run_test(struct vdev_info *dev, struct vq_info *vq, 148 + bool delayed, int bufs) 148 149 { 149 150 struct scatterlist sl; 150 151 long started = 0, completed = 0; ··· 184 183 assert(started <= bufs); 185 184 if (completed == bufs) 186 185 break; 187 - if (virtqueue_enable_cb(vq->vq)) { 188 - wait_for_interrupt(dev); 186 + if (delayed) { 187 + if (virtqueue_enable_cb_delayed(vq->vq)) 188 + wait_for_interrupt(dev); 189 + } else { 190 + if (virtqueue_enable_cb(vq->vq)) 191 + wait_for_interrupt(dev); 189 192 } 190 193 } 191 194 test = 0; ··· 221 216 .val = 'i', 222 217 }, 223 218 { 219 + .name = "delayed-interrupt", 220 + .val = 'D', 221 + }, 222 + { 223 + .name = "no-delayed-interrupt", 224 + .val = 'd', 225 + }, 226 + { 224 227 } 225 228 }; 226 229 ··· 237 224 fprintf(stderr, "Usage: virtio_test [--help]" 238 225 " [--no-indirect]" 239 226 " [--no-event-idx]" 227 + " [--delayed-interrupt]" 240 228 "\n"); 241 229 } 242 230 ··· 247 233 unsigned long long features = (1ULL << VIRTIO_RING_F_INDIRECT_DESC) | 248 234 (1ULL << VIRTIO_RING_F_EVENT_IDX); 249 235 int o; 236 + bool delayed = false; 250 237 251 238 for (;;) { 252 239 o = getopt_long(argc, argv, optstring, longopts, NULL); ··· 266 251 case 'i': 267 252 features &= ~(1ULL << VIRTIO_RING_F_INDIRECT_DESC); 268 253 break; 254 + case 'D': 255 + delayed = true; 256 + break; 269 257 default: 270 258 assert(0); 271 259 break; ··· 278 260 done: 279 261 vdev_info_init(&dev, features); 280 262 vq_info_add(&dev, 256); 281 - run_test(&dev, &dev.vqs[0], 0x100000); 263 + run_test(&dev, &dev.vqs[0], delayed, 0x100000); 282 264 return 0; 283 265 }