Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at v2.6.34-rc1 669 lines 16 kB view raw
1/* Copyright (C) 2009 Red Hat, Inc. 2 * Author: Michael S. Tsirkin <mst@redhat.com> 3 * 4 * This work is licensed under the terms of the GNU GPL, version 2. 5 * 6 * virtio-net server in host kernel. 7 */ 8 9#include <linux/compat.h> 10#include <linux/eventfd.h> 11#include <linux/vhost.h> 12#include <linux/virtio_net.h> 13#include <linux/mmu_context.h> 14#include <linux/miscdevice.h> 15#include <linux/module.h> 16#include <linux/mutex.h> 17#include <linux/workqueue.h> 18#include <linux/rcupdate.h> 19#include <linux/file.h> 20 21#include <linux/net.h> 22#include <linux/if_packet.h> 23#include <linux/if_arp.h> 24#include <linux/if_tun.h> 25#include <linux/if_macvlan.h> 26 27#include <net/sock.h> 28 29#include "vhost.h" 30 31/* Max number of bytes transferred before requeueing the job. 32 * Using this limit prevents one virtqueue from starving others. */ 33#define VHOST_NET_WEIGHT 0x80000 34 35enum { 36 VHOST_NET_VQ_RX = 0, 37 VHOST_NET_VQ_TX = 1, 38 VHOST_NET_VQ_MAX = 2, 39}; 40 41enum vhost_net_poll_state { 42 VHOST_NET_POLL_DISABLED = 0, 43 VHOST_NET_POLL_STARTED = 1, 44 VHOST_NET_POLL_STOPPED = 2, 45}; 46 47struct vhost_net { 48 struct vhost_dev dev; 49 struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX]; 50 struct vhost_poll poll[VHOST_NET_VQ_MAX]; 51 /* Tells us whether we are polling a socket for TX. 52 * We only do this when socket buffer fills up. 53 * Protected by tx vq lock. */ 54 enum vhost_net_poll_state tx_poll_state; 55}; 56 57/* Pop first len bytes from iovec. Return number of segments used. */ 58static int move_iovec_hdr(struct iovec *from, struct iovec *to, 59 size_t len, int iov_count) 60{ 61 int seg = 0; 62 size_t size; 63 while (len && seg < iov_count) { 64 size = min(from->iov_len, len); 65 to->iov_base = from->iov_base; 66 to->iov_len = size; 67 from->iov_len -= size; 68 from->iov_base += size; 69 len -= size; 70 ++from; 71 ++to; 72 ++seg; 73 } 74 return seg; 75} 76 77/* Caller must have TX VQ lock */ 78static void tx_poll_stop(struct vhost_net *net) 79{ 80 if (likely(net->tx_poll_state != VHOST_NET_POLL_STARTED)) 81 return; 82 vhost_poll_stop(net->poll + VHOST_NET_VQ_TX); 83 net->tx_poll_state = VHOST_NET_POLL_STOPPED; 84} 85 86/* Caller must have TX VQ lock */ 87static void tx_poll_start(struct vhost_net *net, struct socket *sock) 88{ 89 if (unlikely(net->tx_poll_state != VHOST_NET_POLL_STOPPED)) 90 return; 91 vhost_poll_start(net->poll + VHOST_NET_VQ_TX, sock->file); 92 net->tx_poll_state = VHOST_NET_POLL_STARTED; 93} 94 95/* Expects to be always run from workqueue - which acts as 96 * read-size critical section for our kind of RCU. */ 97static void handle_tx(struct vhost_net *net) 98{ 99 struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX]; 100 unsigned head, out, in, s; 101 struct msghdr msg = { 102 .msg_name = NULL, 103 .msg_namelen = 0, 104 .msg_control = NULL, 105 .msg_controllen = 0, 106 .msg_iov = vq->iov, 107 .msg_flags = MSG_DONTWAIT, 108 }; 109 size_t len, total_len = 0; 110 int err, wmem; 111 size_t hdr_size; 112 struct socket *sock = rcu_dereference(vq->private_data); 113 if (!sock) 114 return; 115 116 wmem = atomic_read(&sock->sk->sk_wmem_alloc); 117 if (wmem >= sock->sk->sk_sndbuf) { 118 mutex_lock(&vq->mutex); 119 tx_poll_start(net, sock); 120 mutex_unlock(&vq->mutex); 121 return; 122 } 123 124 use_mm(net->dev.mm); 125 mutex_lock(&vq->mutex); 126 vhost_disable_notify(vq); 127 128 if (wmem < sock->sk->sk_sndbuf * 2) 129 tx_poll_stop(net); 130 hdr_size = vq->hdr_size; 131 132 for (;;) { 133 head = vhost_get_vq_desc(&net->dev, vq, vq->iov, 134 ARRAY_SIZE(vq->iov), 135 &out, &in, 136 NULL, NULL); 137 /* Nothing new? Wait for eventfd to tell us they refilled. */ 138 if (head == vq->num) { 139 wmem = atomic_read(&sock->sk->sk_wmem_alloc); 140 if (wmem >= sock->sk->sk_sndbuf * 3 / 4) { 141 tx_poll_start(net, sock); 142 set_bit(SOCK_ASYNC_NOSPACE, &sock->flags); 143 break; 144 } 145 if (unlikely(vhost_enable_notify(vq))) { 146 vhost_disable_notify(vq); 147 continue; 148 } 149 break; 150 } 151 if (in) { 152 vq_err(vq, "Unexpected descriptor format for TX: " 153 "out %d, int %d\n", out, in); 154 break; 155 } 156 /* Skip header. TODO: support TSO. */ 157 s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out); 158 msg.msg_iovlen = out; 159 len = iov_length(vq->iov, out); 160 /* Sanity check */ 161 if (!len) { 162 vq_err(vq, "Unexpected header len for TX: " 163 "%zd expected %zd\n", 164 iov_length(vq->hdr, s), hdr_size); 165 break; 166 } 167 /* TODO: Check specific error and bomb out unless ENOBUFS? */ 168 err = sock->ops->sendmsg(NULL, sock, &msg, len); 169 if (unlikely(err < 0)) { 170 vhost_discard_vq_desc(vq); 171 tx_poll_start(net, sock); 172 break; 173 } 174 if (err != len) 175 pr_err("Truncated TX packet: " 176 " len %d != %zd\n", err, len); 177 vhost_add_used_and_signal(&net->dev, vq, head, 0); 178 total_len += len; 179 if (unlikely(total_len >= VHOST_NET_WEIGHT)) { 180 vhost_poll_queue(&vq->poll); 181 break; 182 } 183 } 184 185 mutex_unlock(&vq->mutex); 186 unuse_mm(net->dev.mm); 187} 188 189/* Expects to be always run from workqueue - which acts as 190 * read-size critical section for our kind of RCU. */ 191static void handle_rx(struct vhost_net *net) 192{ 193 struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX]; 194 unsigned head, out, in, log, s; 195 struct vhost_log *vq_log; 196 struct msghdr msg = { 197 .msg_name = NULL, 198 .msg_namelen = 0, 199 .msg_control = NULL, /* FIXME: get and handle RX aux data. */ 200 .msg_controllen = 0, 201 .msg_iov = vq->iov, 202 .msg_flags = MSG_DONTWAIT, 203 }; 204 205 struct virtio_net_hdr hdr = { 206 .flags = 0, 207 .gso_type = VIRTIO_NET_HDR_GSO_NONE 208 }; 209 210 size_t len, total_len = 0; 211 int err; 212 size_t hdr_size; 213 struct socket *sock = rcu_dereference(vq->private_data); 214 if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue)) 215 return; 216 217 use_mm(net->dev.mm); 218 mutex_lock(&vq->mutex); 219 vhost_disable_notify(vq); 220 hdr_size = vq->hdr_size; 221 222 vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ? 223 vq->log : NULL; 224 225 for (;;) { 226 head = vhost_get_vq_desc(&net->dev, vq, vq->iov, 227 ARRAY_SIZE(vq->iov), 228 &out, &in, 229 vq_log, &log); 230 /* OK, now we need to know about added descriptors. */ 231 if (head == vq->num) { 232 if (unlikely(vhost_enable_notify(vq))) { 233 /* They have slipped one in as we were 234 * doing that: check again. */ 235 vhost_disable_notify(vq); 236 continue; 237 } 238 /* Nothing new? Wait for eventfd to tell us 239 * they refilled. */ 240 break; 241 } 242 /* We don't need to be notified again. */ 243 if (out) { 244 vq_err(vq, "Unexpected descriptor format for RX: " 245 "out %d, int %d\n", 246 out, in); 247 break; 248 } 249 /* Skip header. TODO: support TSO/mergeable rx buffers. */ 250 s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in); 251 msg.msg_iovlen = in; 252 len = iov_length(vq->iov, in); 253 /* Sanity check */ 254 if (!len) { 255 vq_err(vq, "Unexpected header len for RX: " 256 "%zd expected %zd\n", 257 iov_length(vq->hdr, s), hdr_size); 258 break; 259 } 260 err = sock->ops->recvmsg(NULL, sock, &msg, 261 len, MSG_DONTWAIT | MSG_TRUNC); 262 /* TODO: Check specific error and bomb out unless EAGAIN? */ 263 if (err < 0) { 264 vhost_discard_vq_desc(vq); 265 break; 266 } 267 /* TODO: Should check and handle checksum. */ 268 if (err > len) { 269 pr_err("Discarded truncated rx packet: " 270 " len %d > %zd\n", err, len); 271 vhost_discard_vq_desc(vq); 272 continue; 273 } 274 len = err; 275 err = memcpy_toiovec(vq->hdr, (unsigned char *)&hdr, hdr_size); 276 if (err) { 277 vq_err(vq, "Unable to write vnet_hdr at addr %p: %d\n", 278 vq->iov->iov_base, err); 279 break; 280 } 281 len += hdr_size; 282 vhost_add_used_and_signal(&net->dev, vq, head, len); 283 if (unlikely(vq_log)) 284 vhost_log_write(vq, vq_log, log, len); 285 total_len += len; 286 if (unlikely(total_len >= VHOST_NET_WEIGHT)) { 287 vhost_poll_queue(&vq->poll); 288 break; 289 } 290 } 291 292 mutex_unlock(&vq->mutex); 293 unuse_mm(net->dev.mm); 294} 295 296static void handle_tx_kick(struct work_struct *work) 297{ 298 struct vhost_virtqueue *vq; 299 struct vhost_net *net; 300 vq = container_of(work, struct vhost_virtqueue, poll.work); 301 net = container_of(vq->dev, struct vhost_net, dev); 302 handle_tx(net); 303} 304 305static void handle_rx_kick(struct work_struct *work) 306{ 307 struct vhost_virtqueue *vq; 308 struct vhost_net *net; 309 vq = container_of(work, struct vhost_virtqueue, poll.work); 310 net = container_of(vq->dev, struct vhost_net, dev); 311 handle_rx(net); 312} 313 314static void handle_tx_net(struct work_struct *work) 315{ 316 struct vhost_net *net; 317 net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_TX].work); 318 handle_tx(net); 319} 320 321static void handle_rx_net(struct work_struct *work) 322{ 323 struct vhost_net *net; 324 net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_RX].work); 325 handle_rx(net); 326} 327 328static int vhost_net_open(struct inode *inode, struct file *f) 329{ 330 struct vhost_net *n = kmalloc(sizeof *n, GFP_KERNEL); 331 int r; 332 if (!n) 333 return -ENOMEM; 334 n->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick; 335 n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick; 336 r = vhost_dev_init(&n->dev, n->vqs, VHOST_NET_VQ_MAX); 337 if (r < 0) { 338 kfree(n); 339 return r; 340 } 341 342 vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT); 343 vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN); 344 n->tx_poll_state = VHOST_NET_POLL_DISABLED; 345 346 f->private_data = n; 347 348 return 0; 349} 350 351static void vhost_net_disable_vq(struct vhost_net *n, 352 struct vhost_virtqueue *vq) 353{ 354 if (!vq->private_data) 355 return; 356 if (vq == n->vqs + VHOST_NET_VQ_TX) { 357 tx_poll_stop(n); 358 n->tx_poll_state = VHOST_NET_POLL_DISABLED; 359 } else 360 vhost_poll_stop(n->poll + VHOST_NET_VQ_RX); 361} 362 363static void vhost_net_enable_vq(struct vhost_net *n, 364 struct vhost_virtqueue *vq) 365{ 366 struct socket *sock = vq->private_data; 367 if (!sock) 368 return; 369 if (vq == n->vqs + VHOST_NET_VQ_TX) { 370 n->tx_poll_state = VHOST_NET_POLL_STOPPED; 371 tx_poll_start(n, sock); 372 } else 373 vhost_poll_start(n->poll + VHOST_NET_VQ_RX, sock->file); 374} 375 376static struct socket *vhost_net_stop_vq(struct vhost_net *n, 377 struct vhost_virtqueue *vq) 378{ 379 struct socket *sock; 380 381 mutex_lock(&vq->mutex); 382 sock = vq->private_data; 383 vhost_net_disable_vq(n, vq); 384 rcu_assign_pointer(vq->private_data, NULL); 385 mutex_unlock(&vq->mutex); 386 return sock; 387} 388 389static void vhost_net_stop(struct vhost_net *n, struct socket **tx_sock, 390 struct socket **rx_sock) 391{ 392 *tx_sock = vhost_net_stop_vq(n, n->vqs + VHOST_NET_VQ_TX); 393 *rx_sock = vhost_net_stop_vq(n, n->vqs + VHOST_NET_VQ_RX); 394} 395 396static void vhost_net_flush_vq(struct vhost_net *n, int index) 397{ 398 vhost_poll_flush(n->poll + index); 399 vhost_poll_flush(&n->dev.vqs[index].poll); 400} 401 402static void vhost_net_flush(struct vhost_net *n) 403{ 404 vhost_net_flush_vq(n, VHOST_NET_VQ_TX); 405 vhost_net_flush_vq(n, VHOST_NET_VQ_RX); 406} 407 408static int vhost_net_release(struct inode *inode, struct file *f) 409{ 410 struct vhost_net *n = f->private_data; 411 struct socket *tx_sock; 412 struct socket *rx_sock; 413 414 vhost_net_stop(n, &tx_sock, &rx_sock); 415 vhost_net_flush(n); 416 vhost_dev_cleanup(&n->dev); 417 if (tx_sock) 418 fput(tx_sock->file); 419 if (rx_sock) 420 fput(rx_sock->file); 421 /* We do an extra flush before freeing memory, 422 * since jobs can re-queue themselves. */ 423 vhost_net_flush(n); 424 kfree(n); 425 return 0; 426} 427 428static struct socket *get_raw_socket(int fd) 429{ 430 struct { 431 struct sockaddr_ll sa; 432 char buf[MAX_ADDR_LEN]; 433 } uaddr; 434 int uaddr_len = sizeof uaddr, r; 435 struct socket *sock = sockfd_lookup(fd, &r); 436 if (!sock) 437 return ERR_PTR(-ENOTSOCK); 438 439 /* Parameter checking */ 440 if (sock->sk->sk_type != SOCK_RAW) { 441 r = -ESOCKTNOSUPPORT; 442 goto err; 443 } 444 445 r = sock->ops->getname(sock, (struct sockaddr *)&uaddr.sa, 446 &uaddr_len, 0); 447 if (r) 448 goto err; 449 450 if (uaddr.sa.sll_family != AF_PACKET) { 451 r = -EPFNOSUPPORT; 452 goto err; 453 } 454 return sock; 455err: 456 fput(sock->file); 457 return ERR_PTR(r); 458} 459 460static struct socket *get_tap_socket(int fd) 461{ 462 struct file *file = fget(fd); 463 struct socket *sock; 464 if (!file) 465 return ERR_PTR(-EBADF); 466 sock = tun_get_socket(file); 467 if (!IS_ERR(sock)) 468 return sock; 469 sock = macvtap_get_socket(file); 470 if (IS_ERR(sock)) 471 fput(file); 472 return sock; 473} 474 475static struct socket *get_socket(int fd) 476{ 477 struct socket *sock; 478 /* special case to disable backend */ 479 if (fd == -1) 480 return NULL; 481 sock = get_raw_socket(fd); 482 if (!IS_ERR(sock)) 483 return sock; 484 sock = get_tap_socket(fd); 485 if (!IS_ERR(sock)) 486 return sock; 487 return ERR_PTR(-ENOTSOCK); 488} 489 490static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd) 491{ 492 struct socket *sock, *oldsock; 493 struct vhost_virtqueue *vq; 494 int r; 495 496 mutex_lock(&n->dev.mutex); 497 r = vhost_dev_check_owner(&n->dev); 498 if (r) 499 goto err; 500 501 if (index >= VHOST_NET_VQ_MAX) { 502 r = -ENOBUFS; 503 goto err; 504 } 505 vq = n->vqs + index; 506 mutex_lock(&vq->mutex); 507 508 /* Verify that ring has been setup correctly. */ 509 if (!vhost_vq_access_ok(vq)) { 510 r = -EFAULT; 511 goto err; 512 } 513 sock = get_socket(fd); 514 if (IS_ERR(sock)) { 515 r = PTR_ERR(sock); 516 goto err; 517 } 518 519 /* start polling new socket */ 520 oldsock = vq->private_data; 521 if (sock == oldsock) 522 goto done; 523 524 vhost_net_disable_vq(n, vq); 525 rcu_assign_pointer(vq->private_data, sock); 526 vhost_net_enable_vq(n, vq); 527 mutex_unlock(&vq->mutex); 528done: 529 if (oldsock) { 530 vhost_net_flush_vq(n, index); 531 fput(oldsock->file); 532 } 533err: 534 mutex_unlock(&n->dev.mutex); 535 return r; 536} 537 538static long vhost_net_reset_owner(struct vhost_net *n) 539{ 540 struct socket *tx_sock = NULL; 541 struct socket *rx_sock = NULL; 542 long err; 543 mutex_lock(&n->dev.mutex); 544 err = vhost_dev_check_owner(&n->dev); 545 if (err) 546 goto done; 547 vhost_net_stop(n, &tx_sock, &rx_sock); 548 vhost_net_flush(n); 549 err = vhost_dev_reset_owner(&n->dev); 550done: 551 mutex_unlock(&n->dev.mutex); 552 if (tx_sock) 553 fput(tx_sock->file); 554 if (rx_sock) 555 fput(rx_sock->file); 556 return err; 557} 558 559static int vhost_net_set_features(struct vhost_net *n, u64 features) 560{ 561 size_t hdr_size = features & (1 << VHOST_NET_F_VIRTIO_NET_HDR) ? 562 sizeof(struct virtio_net_hdr) : 0; 563 int i; 564 mutex_lock(&n->dev.mutex); 565 if ((features & (1 << VHOST_F_LOG_ALL)) && 566 !vhost_log_access_ok(&n->dev)) { 567 mutex_unlock(&n->dev.mutex); 568 return -EFAULT; 569 } 570 n->dev.acked_features = features; 571 smp_wmb(); 572 for (i = 0; i < VHOST_NET_VQ_MAX; ++i) { 573 mutex_lock(&n->vqs[i].mutex); 574 n->vqs[i].hdr_size = hdr_size; 575 mutex_unlock(&n->vqs[i].mutex); 576 } 577 vhost_net_flush(n); 578 mutex_unlock(&n->dev.mutex); 579 return 0; 580} 581 582static long vhost_net_ioctl(struct file *f, unsigned int ioctl, 583 unsigned long arg) 584{ 585 struct vhost_net *n = f->private_data; 586 void __user *argp = (void __user *)arg; 587 u64 __user *featurep = argp; 588 struct vhost_vring_file backend; 589 u64 features; 590 int r; 591 switch (ioctl) { 592 case VHOST_NET_SET_BACKEND: 593 r = copy_from_user(&backend, argp, sizeof backend); 594 if (r < 0) 595 return r; 596 return vhost_net_set_backend(n, backend.index, backend.fd); 597 case VHOST_GET_FEATURES: 598 features = VHOST_FEATURES; 599 return copy_to_user(featurep, &features, sizeof features); 600 case VHOST_SET_FEATURES: 601 r = copy_from_user(&features, featurep, sizeof features); 602 if (r < 0) 603 return r; 604 if (features & ~VHOST_FEATURES) 605 return -EOPNOTSUPP; 606 return vhost_net_set_features(n, features); 607 case VHOST_RESET_OWNER: 608 return vhost_net_reset_owner(n); 609 default: 610 mutex_lock(&n->dev.mutex); 611 r = vhost_dev_ioctl(&n->dev, ioctl, arg); 612 vhost_net_flush(n); 613 mutex_unlock(&n->dev.mutex); 614 return r; 615 } 616} 617 618#ifdef CONFIG_COMPAT 619static long vhost_net_compat_ioctl(struct file *f, unsigned int ioctl, 620 unsigned long arg) 621{ 622 return vhost_net_ioctl(f, ioctl, (unsigned long)compat_ptr(arg)); 623} 624#endif 625 626const static struct file_operations vhost_net_fops = { 627 .owner = THIS_MODULE, 628 .release = vhost_net_release, 629 .unlocked_ioctl = vhost_net_ioctl, 630#ifdef CONFIG_COMPAT 631 .compat_ioctl = vhost_net_compat_ioctl, 632#endif 633 .open = vhost_net_open, 634}; 635 636static struct miscdevice vhost_net_misc = { 637 VHOST_NET_MINOR, 638 "vhost-net", 639 &vhost_net_fops, 640}; 641 642int vhost_net_init(void) 643{ 644 int r = vhost_init(); 645 if (r) 646 goto err_init; 647 r = misc_register(&vhost_net_misc); 648 if (r) 649 goto err_reg; 650 return 0; 651err_reg: 652 vhost_cleanup(); 653err_init: 654 return r; 655 656} 657module_init(vhost_net_init); 658 659void vhost_net_exit(void) 660{ 661 misc_deregister(&vhost_net_misc); 662 vhost_cleanup(); 663} 664module_exit(vhost_net_exit); 665 666MODULE_VERSION("0.0.1"); 667MODULE_LICENSE("GPL v2"); 668MODULE_AUTHOR("Michael S. Tsirkin"); 669MODULE_DESCRIPTION("Host kernel accelerator for virtio net");