drivers/vhost/net.c at v2.6.34-rc1

tjh.dev / kernel
fork
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork
kernel / drivers / vhost / net.c
at v2.6.34-rc1 669 lines 16 kB view raw
wrap content
  1/* Copyright (C) 2009 Red Hat, Inc.
  2 * Author: Michael S. Tsirkin <mst@redhat.com>
  3 *
  4 * This work is licensed under the terms of the GNU GPL, version 2.
  5 *
  6 * virtio-net server in host kernel.
  7 */
  8
  9#include <linux/compat.h>
 10#include <linux/eventfd.h>
 11#include <linux/vhost.h>
 12#include <linux/virtio_net.h>
 13#include <linux/mmu_context.h>
 14#include <linux/miscdevice.h>
 15#include <linux/module.h>
 16#include <linux/mutex.h>
 17#include <linux/workqueue.h>
 18#include <linux/rcupdate.h>
 19#include <linux/file.h>
 20
 21#include <linux/net.h>
 22#include <linux/if_packet.h>
 23#include <linux/if_arp.h>
 24#include <linux/if_tun.h>
 25#include <linux/if_macvlan.h>
 26
 27#include <net/sock.h>
 28
 29#include "vhost.h"
 30
 31/* Max number of bytes transferred before requeueing the job.
 32 * Using this limit prevents one virtqueue from starving others. */
 33#define VHOST_NET_WEIGHT 0x80000
 34
 35enum {
 36	VHOST_NET_VQ_RX = 0,
 37	VHOST_NET_VQ_TX = 1,
 38	VHOST_NET_VQ_MAX = 2,
 39};
 40
 41enum vhost_net_poll_state {
 42	VHOST_NET_POLL_DISABLED = 0,
 43	VHOST_NET_POLL_STARTED = 1,
 44	VHOST_NET_POLL_STOPPED = 2,
 45};
 46
 47struct vhost_net {
 48	struct vhost_dev dev;
 49	struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
 50	struct vhost_poll poll[VHOST_NET_VQ_MAX];
 51	/* Tells us whether we are polling a socket for TX.
 52	 * We only do this when socket buffer fills up.
 53	 * Protected by tx vq lock. */
 54	enum vhost_net_poll_state tx_poll_state;
 55};
 56
 57/* Pop first len bytes from iovec. Return number of segments used. */
 58static int move_iovec_hdr(struct iovec *from, struct iovec *to,
 59			  size_t len, int iov_count)
 60{
 61	int seg = 0;
 62	size_t size;
 63	while (len && seg < iov_count) {
 64		size = min(from->iov_len, len);
 65		to->iov_base = from->iov_base;
 66		to->iov_len = size;
 67		from->iov_len -= size;
 68		from->iov_base += size;
 69		len -= size;
 70		++from;
 71		++to;
 72		++seg;
 73	}
 74	return seg;
 75}
 76
 77/* Caller must have TX VQ lock */
 78static void tx_poll_stop(struct vhost_net *net)
 79{
 80	if (likely(net->tx_poll_state != VHOST_NET_POLL_STARTED))
 81		return;
 82	vhost_poll_stop(net->poll + VHOST_NET_VQ_TX);
 83	net->tx_poll_state = VHOST_NET_POLL_STOPPED;
 84}
 85
 86/* Caller must have TX VQ lock */
 87static void tx_poll_start(struct vhost_net *net, struct socket *sock)
 88{
 89	if (unlikely(net->tx_poll_state != VHOST_NET_POLL_STOPPED))
 90		return;
 91	vhost_poll_start(net->poll + VHOST_NET_VQ_TX, sock->file);
 92	net->tx_poll_state = VHOST_NET_POLL_STARTED;
 93}
 94
 95/* Expects to be always run from workqueue - which acts as
 96 * read-size critical section for our kind of RCU. */
 97static void handle_tx(struct vhost_net *net)
 98{
 99	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX];
100	unsigned head, out, in, s;
101	struct msghdr msg = {
102		.msg_name = NULL,
103		.msg_namelen = 0,
104		.msg_control = NULL,
105		.msg_controllen = 0,
106		.msg_iov = vq->iov,
107		.msg_flags = MSG_DONTWAIT,
108	};
109	size_t len, total_len = 0;
110	int err, wmem;
111	size_t hdr_size;
112	struct socket *sock = rcu_dereference(vq->private_data);
113	if (!sock)
114		return;
115
116	wmem = atomic_read(&sock->sk->sk_wmem_alloc);
117	if (wmem >= sock->sk->sk_sndbuf) {
118		mutex_lock(&vq->mutex);
119		tx_poll_start(net, sock);
120		mutex_unlock(&vq->mutex);
121		return;
122	}
123
124	use_mm(net->dev.mm);
125	mutex_lock(&vq->mutex);
126	vhost_disable_notify(vq);
127
128	if (wmem < sock->sk->sk_sndbuf * 2)
129		tx_poll_stop(net);
130	hdr_size = vq->hdr_size;
131
132	for (;;) {
133		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
134					 ARRAY_SIZE(vq->iov),
135					 &out, &in,
136					 NULL, NULL);
137		/* Nothing new?  Wait for eventfd to tell us they refilled. */
138		if (head == vq->num) {
139			wmem = atomic_read(&sock->sk->sk_wmem_alloc);
140			if (wmem >= sock->sk->sk_sndbuf * 3 / 4) {
141				tx_poll_start(net, sock);
142				set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
143				break;
144			}
145			if (unlikely(vhost_enable_notify(vq))) {
146				vhost_disable_notify(vq);
147				continue;
148			}
149			break;
150		}
151		if (in) {
152			vq_err(vq, "Unexpected descriptor format for TX: "
153			       "out %d, int %d\n", out, in);
154			break;
155		}
156		/* Skip header. TODO: support TSO. */
157		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out);
158		msg.msg_iovlen = out;
159		len = iov_length(vq->iov, out);
160		/* Sanity check */
161		if (!len) {
162			vq_err(vq, "Unexpected header len for TX: "
163			       "%zd expected %zd\n",
164			       iov_length(vq->hdr, s), hdr_size);
165			break;
166		}
167		/* TODO: Check specific error and bomb out unless ENOBUFS? */
168		err = sock->ops->sendmsg(NULL, sock, &msg, len);
169		if (unlikely(err < 0)) {
170			vhost_discard_vq_desc(vq);
171			tx_poll_start(net, sock);
172			break;
173		}
174		if (err != len)
175			pr_err("Truncated TX packet: "
176			       " len %d != %zd\n", err, len);
177		vhost_add_used_and_signal(&net->dev, vq, head, 0);
178		total_len += len;
179		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
180			vhost_poll_queue(&vq->poll);
181			break;
182		}
183	}
184
185	mutex_unlock(&vq->mutex);
186	unuse_mm(net->dev.mm);
187}
188
189/* Expects to be always run from workqueue - which acts as
190 * read-size critical section for our kind of RCU. */
191static void handle_rx(struct vhost_net *net)
192{
193	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
194	unsigned head, out, in, log, s;
195	struct vhost_log *vq_log;
196	struct msghdr msg = {
197		.msg_name = NULL,
198		.msg_namelen = 0,
199		.msg_control = NULL, /* FIXME: get and handle RX aux data. */
200		.msg_controllen = 0,
201		.msg_iov = vq->iov,
202		.msg_flags = MSG_DONTWAIT,
203	};
204
205	struct virtio_net_hdr hdr = {
206		.flags = 0,
207		.gso_type = VIRTIO_NET_HDR_GSO_NONE
208	};
209
210	size_t len, total_len = 0;
211	int err;
212	size_t hdr_size;
213	struct socket *sock = rcu_dereference(vq->private_data);
214	if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
215		return;
216
217	use_mm(net->dev.mm);
218	mutex_lock(&vq->mutex);
219	vhost_disable_notify(vq);
220	hdr_size = vq->hdr_size;
221
222	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
223		vq->log : NULL;
224
225	for (;;) {
226		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
227					 ARRAY_SIZE(vq->iov),
228					 &out, &in,
229					 vq_log, &log);
230		/* OK, now we need to know about added descriptors. */
231		if (head == vq->num) {
232			if (unlikely(vhost_enable_notify(vq))) {
233				/* They have slipped one in as we were
234				 * doing that: check again. */
235				vhost_disable_notify(vq);
236				continue;
237			}
238			/* Nothing new?  Wait for eventfd to tell us
239			 * they refilled. */
240			break;
241		}
242		/* We don't need to be notified again. */
243		if (out) {
244			vq_err(vq, "Unexpected descriptor format for RX: "
245			       "out %d, int %d\n",
246			       out, in);
247			break;
248		}
249		/* Skip header. TODO: support TSO/mergeable rx buffers. */
250		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
251		msg.msg_iovlen = in;
252		len = iov_length(vq->iov, in);
253		/* Sanity check */
254		if (!len) {
255			vq_err(vq, "Unexpected header len for RX: "
256			       "%zd expected %zd\n",
257			       iov_length(vq->hdr, s), hdr_size);
258			break;
259		}
260		err = sock->ops->recvmsg(NULL, sock, &msg,
261					 len, MSG_DONTWAIT | MSG_TRUNC);
262		/* TODO: Check specific error and bomb out unless EAGAIN? */
263		if (err < 0) {
264			vhost_discard_vq_desc(vq);
265			break;
266		}
267		/* TODO: Should check and handle checksum. */
268		if (err > len) {
269			pr_err("Discarded truncated rx packet: "
270			       " len %d > %zd\n", err, len);
271			vhost_discard_vq_desc(vq);
272			continue;
273		}
274		len = err;
275		err = memcpy_toiovec(vq->hdr, (unsigned char *)&hdr, hdr_size);
276		if (err) {
277			vq_err(vq, "Unable to write vnet_hdr at addr %p: %d\n",
278			       vq->iov->iov_base, err);
279			break;
280		}
281		len += hdr_size;
282		vhost_add_used_and_signal(&net->dev, vq, head, len);
283		if (unlikely(vq_log))
284			vhost_log_write(vq, vq_log, log, len);
285		total_len += len;
286		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
287			vhost_poll_queue(&vq->poll);
288			break;
289		}
290	}
291
292	mutex_unlock(&vq->mutex);
293	unuse_mm(net->dev.mm);
294}
295
296static void handle_tx_kick(struct work_struct *work)
297{
298	struct vhost_virtqueue *vq;
299	struct vhost_net *net;
300	vq = container_of(work, struct vhost_virtqueue, poll.work);
301	net = container_of(vq->dev, struct vhost_net, dev);
302	handle_tx(net);
303}
304
305static void handle_rx_kick(struct work_struct *work)
306{
307	struct vhost_virtqueue *vq;
308	struct vhost_net *net;
309	vq = container_of(work, struct vhost_virtqueue, poll.work);
310	net = container_of(vq->dev, struct vhost_net, dev);
311	handle_rx(net);
312}
313
314static void handle_tx_net(struct work_struct *work)
315{
316	struct vhost_net *net;
317	net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_TX].work);
318	handle_tx(net);
319}
320
321static void handle_rx_net(struct work_struct *work)
322{
323	struct vhost_net *net;
324	net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_RX].work);
325	handle_rx(net);
326}
327
328static int vhost_net_open(struct inode *inode, struct file *f)
329{
330	struct vhost_net *n = kmalloc(sizeof *n, GFP_KERNEL);
331	int r;
332	if (!n)
333		return -ENOMEM;
334	n->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick;
335	n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick;
336	r = vhost_dev_init(&n->dev, n->vqs, VHOST_NET_VQ_MAX);
337	if (r < 0) {
338		kfree(n);
339		return r;
340	}
341
342	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT);
343	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN);
344	n->tx_poll_state = VHOST_NET_POLL_DISABLED;
345
346	f->private_data = n;
347
348	return 0;
349}
350
351static void vhost_net_disable_vq(struct vhost_net *n,
352				 struct vhost_virtqueue *vq)
353{
354	if (!vq->private_data)
355		return;
356	if (vq == n->vqs + VHOST_NET_VQ_TX) {
357		tx_poll_stop(n);
358		n->tx_poll_state = VHOST_NET_POLL_DISABLED;
359	} else
360		vhost_poll_stop(n->poll + VHOST_NET_VQ_RX);
361}
362
363static void vhost_net_enable_vq(struct vhost_net *n,
364				struct vhost_virtqueue *vq)
365{
366	struct socket *sock = vq->private_data;
367	if (!sock)
368		return;
369	if (vq == n->vqs + VHOST_NET_VQ_TX) {
370		n->tx_poll_state = VHOST_NET_POLL_STOPPED;
371		tx_poll_start(n, sock);
372	} else
373		vhost_poll_start(n->poll + VHOST_NET_VQ_RX, sock->file);
374}
375
376static struct socket *vhost_net_stop_vq(struct vhost_net *n,
377					struct vhost_virtqueue *vq)
378{
379	struct socket *sock;
380
381	mutex_lock(&vq->mutex);
382	sock = vq->private_data;
383	vhost_net_disable_vq(n, vq);
384	rcu_assign_pointer(vq->private_data, NULL);
385	mutex_unlock(&vq->mutex);
386	return sock;
387}
388
389static void vhost_net_stop(struct vhost_net *n, struct socket **tx_sock,
390			   struct socket **rx_sock)
391{
392	*tx_sock = vhost_net_stop_vq(n, n->vqs + VHOST_NET_VQ_TX);
393	*rx_sock = vhost_net_stop_vq(n, n->vqs + VHOST_NET_VQ_RX);
394}
395
396static void vhost_net_flush_vq(struct vhost_net *n, int index)
397{
398	vhost_poll_flush(n->poll + index);
399	vhost_poll_flush(&n->dev.vqs[index].poll);
400}
401
402static void vhost_net_flush(struct vhost_net *n)
403{
404	vhost_net_flush_vq(n, VHOST_NET_VQ_TX);
405	vhost_net_flush_vq(n, VHOST_NET_VQ_RX);
406}
407
408static int vhost_net_release(struct inode *inode, struct file *f)
409{
410	struct vhost_net *n = f->private_data;
411	struct socket *tx_sock;
412	struct socket *rx_sock;
413
414	vhost_net_stop(n, &tx_sock, &rx_sock);
415	vhost_net_flush(n);
416	vhost_dev_cleanup(&n->dev);
417	if (tx_sock)
418		fput(tx_sock->file);
419	if (rx_sock)
420		fput(rx_sock->file);
421	/* We do an extra flush before freeing memory,
422	 * since jobs can re-queue themselves. */
423	vhost_net_flush(n);
424	kfree(n);
425	return 0;
426}
427
428static struct socket *get_raw_socket(int fd)
429{
430	struct {
431		struct sockaddr_ll sa;
432		char  buf[MAX_ADDR_LEN];
433	} uaddr;
434	int uaddr_len = sizeof uaddr, r;
435	struct socket *sock = sockfd_lookup(fd, &r);
436	if (!sock)
437		return ERR_PTR(-ENOTSOCK);
438
439	/* Parameter checking */
440	if (sock->sk->sk_type != SOCK_RAW) {
441		r = -ESOCKTNOSUPPORT;
442		goto err;
443	}
444
445	r = sock->ops->getname(sock, (struct sockaddr *)&uaddr.sa,
446			       &uaddr_len, 0);
447	if (r)
448		goto err;
449
450	if (uaddr.sa.sll_family != AF_PACKET) {
451		r = -EPFNOSUPPORT;
452		goto err;
453	}
454	return sock;
455err:
456	fput(sock->file);
457	return ERR_PTR(r);
458}
459
460static struct socket *get_tap_socket(int fd)
461{
462	struct file *file = fget(fd);
463	struct socket *sock;
464	if (!file)
465		return ERR_PTR(-EBADF);
466	sock = tun_get_socket(file);
467	if (!IS_ERR(sock))
468		return sock;
469	sock = macvtap_get_socket(file);
470	if (IS_ERR(sock))
471		fput(file);
472	return sock;
473}
474
475static struct socket *get_socket(int fd)
476{
477	struct socket *sock;
478	/* special case to disable backend */
479	if (fd == -1)
480		return NULL;
481	sock = get_raw_socket(fd);
482	if (!IS_ERR(sock))
483		return sock;
484	sock = get_tap_socket(fd);
485	if (!IS_ERR(sock))
486		return sock;
487	return ERR_PTR(-ENOTSOCK);
488}
489
490static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
491{
492	struct socket *sock, *oldsock;
493	struct vhost_virtqueue *vq;
494	int r;
495
496	mutex_lock(&n->dev.mutex);
497	r = vhost_dev_check_owner(&n->dev);
498	if (r)
499		goto err;
500
501	if (index >= VHOST_NET_VQ_MAX) {
502		r = -ENOBUFS;
503		goto err;
504	}
505	vq = n->vqs + index;
506	mutex_lock(&vq->mutex);
507
508	/* Verify that ring has been setup correctly. */
509	if (!vhost_vq_access_ok(vq)) {
510		r = -EFAULT;
511		goto err;
512	}
513	sock = get_socket(fd);
514	if (IS_ERR(sock)) {
515		r = PTR_ERR(sock);
516		goto err;
517	}
518
519	/* start polling new socket */
520	oldsock = vq->private_data;
521	if (sock == oldsock)
522		goto done;
523
524	vhost_net_disable_vq(n, vq);
525	rcu_assign_pointer(vq->private_data, sock);
526	vhost_net_enable_vq(n, vq);
527	mutex_unlock(&vq->mutex);
528done:
529	if (oldsock) {
530		vhost_net_flush_vq(n, index);
531		fput(oldsock->file);
532	}
533err:
534	mutex_unlock(&n->dev.mutex);
535	return r;
536}
537
538static long vhost_net_reset_owner(struct vhost_net *n)
539{
540	struct socket *tx_sock = NULL;
541	struct socket *rx_sock = NULL;
542	long err;
543	mutex_lock(&n->dev.mutex);
544	err = vhost_dev_check_owner(&n->dev);
545	if (err)
546		goto done;
547	vhost_net_stop(n, &tx_sock, &rx_sock);
548	vhost_net_flush(n);
549	err = vhost_dev_reset_owner(&n->dev);
550done:
551	mutex_unlock(&n->dev.mutex);
552	if (tx_sock)
553		fput(tx_sock->file);
554	if (rx_sock)
555		fput(rx_sock->file);
556	return err;
557}
558
559static int vhost_net_set_features(struct vhost_net *n, u64 features)
560{
561	size_t hdr_size = features & (1 << VHOST_NET_F_VIRTIO_NET_HDR) ?
562		sizeof(struct virtio_net_hdr) : 0;
563	int i;
564	mutex_lock(&n->dev.mutex);
565	if ((features & (1 << VHOST_F_LOG_ALL)) &&
566	    !vhost_log_access_ok(&n->dev)) {
567		mutex_unlock(&n->dev.mutex);
568		return -EFAULT;
569	}
570	n->dev.acked_features = features;
571	smp_wmb();
572	for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
573		mutex_lock(&n->vqs[i].mutex);
574		n->vqs[i].hdr_size = hdr_size;
575		mutex_unlock(&n->vqs[i].mutex);
576	}
577	vhost_net_flush(n);
578	mutex_unlock(&n->dev.mutex);
579	return 0;
580}
581
582static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
583			    unsigned long arg)
584{
585	struct vhost_net *n = f->private_data;
586	void __user *argp = (void __user *)arg;
587	u64 __user *featurep = argp;
588	struct vhost_vring_file backend;
589	u64 features;
590	int r;
591	switch (ioctl) {
592	case VHOST_NET_SET_BACKEND:
593		r = copy_from_user(&backend, argp, sizeof backend);
594		if (r < 0)
595			return r;
596		return vhost_net_set_backend(n, backend.index, backend.fd);
597	case VHOST_GET_FEATURES:
598		features = VHOST_FEATURES;
599		return copy_to_user(featurep, &features, sizeof features);
600	case VHOST_SET_FEATURES:
601		r = copy_from_user(&features, featurep, sizeof features);
602		if (r < 0)
603			return r;
604		if (features & ~VHOST_FEATURES)
605			return -EOPNOTSUPP;
606		return vhost_net_set_features(n, features);
607	case VHOST_RESET_OWNER:
608		return vhost_net_reset_owner(n);
609	default:
610		mutex_lock(&n->dev.mutex);
611		r = vhost_dev_ioctl(&n->dev, ioctl, arg);
612		vhost_net_flush(n);
613		mutex_unlock(&n->dev.mutex);
614		return r;
615	}
616}
617
#ifdef CONFIG_COMPAT
/* compat ioctl() handler: convert the compat argument with
 * compat_ptr() and forward to the native handler. */
static long vhost_net_compat_ioctl(struct file *f, unsigned int ioctl,
				   unsigned long arg)
{
	unsigned long native_arg = (unsigned long)compat_ptr(arg);

	return vhost_net_ioctl(f, ioctl, native_arg);
}
#endif
625
626const static struct file_operations vhost_net_fops = {
627	.owner          = THIS_MODULE,
628	.release        = vhost_net_release,
629	.unlocked_ioctl = vhost_net_ioctl,
630#ifdef CONFIG_COMPAT
631	.compat_ioctl   = vhost_net_compat_ioctl,
632#endif
633	.open           = vhost_net_open,
634};
635
636static struct miscdevice vhost_net_misc = {
637	VHOST_NET_MINOR,
638	"vhost-net",
639	&vhost_net_fops,
640};
641
642int vhost_net_init(void)
643{
644	int r = vhost_init();
645	if (r)
646		goto err_init;
647	r = misc_register(&vhost_net_misc);
648	if (r)
649		goto err_reg;
650	return 0;
651err_reg:
652	vhost_cleanup();
653err_init:
654	return r;
655
656}
657module_init(vhost_net_init);
658
659void vhost_net_exit(void)
660{
661	misc_deregister(&vhost_net_misc);
662	vhost_cleanup();
663}
664module_exit(vhost_net_exit);
665
666MODULE_VERSION("0.0.1");
667MODULE_LICENSE("GPL v2");
668MODULE_AUTHOR("Michael S. Tsirkin");
669MODULE_DESCRIPTION("Host kernel accelerator for virtio net");
Configure Feed

Configure Feed