lguest: Tell Guest net not to notify us on every packet xmit

virtio_ring has the ability to suppress notifications. Using it avoids
a Guest exit for every packet transmitted, but it means we must set a
timer whenever we receive a packet from the Guest, so that we re-check
the ring for any packets queued while notifications were suppressed.

Here are the times for 1G TCP Guest->Host with different timeout
settings (it matters because the TCP window doesn't grow big enough to
fill the entire buffer):

Timeout value    Seconds    Xmit/Recv/Timeout
None (before)    25.3784    xmit 7750233 recv 1
2500 usec        62.5119    xmit 207020 recv 2 timeout 207020
1000 usec        34.5379    xmit 207003 recv 2 timeout 207003
750 usec         29.2305    xmit 207002 recv 1 timeout 207002
500 usec         19.1887    xmit 561141 recv 1 timeout 559657
250 usec         20.0465    xmit 214128 recv 2 timeout 214110
100 usec         19.2583    xmit 561621 recv 1 timeout 560153

(Note that these values are sensitive to the GSO patches which come
later, and probably to other traffic-related variables, so take them
with a large grain of salt.)

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>

+93 -13
+93 -13
Documentation/lguest/lguest.c
··· 36 36 #include <sched.h> 37 37 #include <limits.h> 38 38 #include <stddef.h> 39 + #include <signal.h> 39 40 #include "linux/lguest_launcher.h" 40 41 #include "linux/virtio_config.h" 41 42 #include "linux/virtio_net.h" ··· 82 81 static void *guest_base; 83 82 /* The maximum guest physical address allowed, and maximum possible. */ 84 83 static unsigned long guest_limit, guest_max; 84 + /* The pipe for signal hander to write to. */ 85 + static int timeoutpipe[2]; 85 86 86 87 /* a per-cpu variable indicating whose vcpu is currently running */ 87 88 static unsigned int __thread cpu_id; ··· 159 156 /* Last available index we saw. */ 160 157 u16 last_avail_idx; 161 158 162 - /* The routine to call when the Guest pings us. */ 163 - void (*handle_output)(int fd, struct virtqueue *me); 159 + /* The routine to call when the Guest pings us, or timeout. */ 160 + void (*handle_output)(int fd, struct virtqueue *me, bool timeout); 164 161 165 162 /* Outstanding buffers */ 166 163 unsigned int inflight; 164 + 165 + /* Is this blocked awaiting a timer? */ 166 + bool blocked; 167 167 }; 168 168 169 169 /* Remember the arguments to the program so we can "reboot" */ ··· 880 874 881 875 /* Handling output for console is simple: we just get all the output buffers 882 876 * and write them to stdout. 
*/ 883 - static void handle_console_output(int fd, struct virtqueue *vq) 877 + static void handle_console_output(int fd, struct virtqueue *vq, bool timeout) 884 878 { 885 879 unsigned int head, out, in; 886 880 int len; ··· 895 889 } 896 890 } 897 891 892 + static void block_vq(struct virtqueue *vq) 893 + { 894 + struct itimerval itm; 895 + 896 + vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; 897 + vq->blocked = true; 898 + 899 + itm.it_interval.tv_sec = 0; 900 + itm.it_interval.tv_usec = 0; 901 + itm.it_value.tv_sec = 0; 902 + itm.it_value.tv_usec = 500; 903 + 904 + setitimer(ITIMER_REAL, &itm, NULL); 905 + } 906 + 898 907 /* 899 908 * The Network 900 909 * ··· 917 896 * and write them (ignoring the first element) to this device's file descriptor 918 897 * (/dev/net/tun). 919 898 */ 920 - static void handle_net_output(int fd, struct virtqueue *vq) 899 + static void handle_net_output(int fd, struct virtqueue *vq, bool timeout) 921 900 { 922 - unsigned int head, out, in; 901 + unsigned int head, out, in, num = 0; 923 902 int len; 924 903 struct iovec iov[vq->vring.num]; 925 904 ··· 933 912 (void)convert(&iov[0], struct virtio_net_hdr); 934 913 len = writev(vq->dev->fd, iov+1, out-1); 935 914 add_used_and_trigger(fd, vq, head, len); 915 + num++; 936 916 } 917 + 918 + /* Block further kicks and set up a timer if we saw anything. */ 919 + if (!timeout && num) 920 + block_vq(vq); 937 921 } 938 922 939 923 /* This is where we handle a packet coming in from the tun device to our ··· 993 967 /*L:215 This is the callback attached to the network and console input 994 968 * virtqueues: it ensures we try again, in case we stopped console or net 995 969 * delivery because Guest didn't have any buffers. 
*/ 996 - static void enable_fd(int fd, struct virtqueue *vq) 970 + static void enable_fd(int fd, struct virtqueue *vq, bool timeout) 997 971 { 998 972 add_device_fd(vq->dev->fd); 999 973 /* Tell waker to listen to it again */ 1000 974 write(waker_fd, &vq->dev->fd, sizeof(vq->dev->fd)); 1001 975 } 1002 976 1003 - static void net_enable_fd(int fd, struct virtqueue *vq) 977 + static void net_enable_fd(int fd, struct virtqueue *vq, bool timeout) 1004 978 { 1005 979 /* We don't need to know again when Guest refills receive buffer. */ 1006 980 vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; 1007 - enable_fd(fd, vq); 981 + enable_fd(fd, vq, timeout); 1008 982 } 1009 983 1010 984 /* When the Guest tells us they updated the status field, we handle it. */ ··· 1073 1047 if (strcmp(vq->dev->name, "console") != 0) 1074 1048 verbose("Output to %s\n", vq->dev->name); 1075 1049 if (vq->handle_output) 1076 - vq->handle_output(fd, vq); 1050 + vq->handle_output(fd, vq, false); 1077 1051 return; 1078 1052 } 1079 1053 } ··· 1087 1061 strnlen(from_guest_phys(addr), guest_limit - addr)); 1088 1062 } 1089 1063 1064 + static void handle_timeout(int fd) 1065 + { 1066 + char buf[32]; 1067 + struct device *i; 1068 + struct virtqueue *vq; 1069 + 1070 + /* Clear the pipe */ 1071 + read(timeoutpipe[0], buf, sizeof(buf)); 1072 + 1073 + /* Check each device and virtqueue: flush blocked ones. */ 1074 + for (i = devices.dev; i; i = i->next) { 1075 + for (vq = i->vq; vq; vq = vq->next) { 1076 + if (!vq->blocked) 1077 + continue; 1078 + 1079 + vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY; 1080 + vq->blocked = false; 1081 + if (vq->handle_output) 1082 + vq->handle_output(fd, vq, true); 1083 + } 1084 + } 1085 + } 1086 + 1090 1087 /* This is called when the Waker wakes us up: check for incoming file 1091 1088 * descriptors. 
*/ 1092 1089 static void handle_input(int fd) ··· 1120 1071 for (;;) { 1121 1072 struct device *i; 1122 1073 fd_set fds = devices.infds; 1074 + int num; 1123 1075 1076 + num = select(devices.max_infd+1, &fds, NULL, NULL, &poll); 1077 + /* Could get interrupted */ 1078 + if (num < 0) 1079 + continue; 1124 1080 /* If nothing is ready, we're done. */ 1125 - if (select(devices.max_infd+1, &fds, NULL, NULL, &poll) == 0) 1081 + if (num == 0) 1126 1082 break; 1127 1083 1128 1084 /* Otherwise, call the device(s) which have readable file ··· 1151 1097 write(waker_fd, &dev_fd, sizeof(dev_fd)); 1152 1098 } 1153 1099 } 1100 + 1101 + /* Is this the timeout fd? */ 1102 + if (FD_ISSET(timeoutpipe[0], &fds)) 1103 + handle_timeout(fd); 1154 1104 } 1155 1105 } 1156 1106 ··· 1203 1145 /* Each device descriptor is followed by the description of its virtqueues. We 1204 1146 * specify how many descriptors the virtqueue is to have. */ 1205 1147 static void add_virtqueue(struct device *dev, unsigned int num_descs, 1206 - void (*handle_output)(int fd, struct virtqueue *me)) 1148 + void (*handle_output)(int, struct virtqueue *, bool)) 1207 1149 { 1208 1150 unsigned int pages; 1209 1151 struct virtqueue **i, *vq = malloc(sizeof(*vq)); ··· 1219 1161 vq->last_avail_idx = 0; 1220 1162 vq->dev = dev; 1221 1163 vq->inflight = 0; 1164 + vq->blocked = false; 1222 1165 1223 1166 /* Initialize the configuration. 
*/ 1224 1167 vq->config.num = num_descs; ··· 1351 1292 verbose("device %u: console\n", devices.device_num++); 1352 1293 } 1353 1294 /*:*/ 1295 + 1296 + static void timeout_alarm(int sig) 1297 + { 1298 + write(timeoutpipe[1], "", 1); 1299 + } 1300 + 1301 + static void setup_timeout(void) 1302 + { 1303 + if (pipe(timeoutpipe) != 0) 1304 + err(1, "Creating timeout pipe"); 1305 + 1306 + if (fcntl(timeoutpipe[1], F_SETFL, 1307 + fcntl(timeoutpipe[1], F_GETFL) | O_NONBLOCK) != 0) 1308 + err(1, "Making timeout pipe nonblocking"); 1309 + 1310 + add_device_fd(timeoutpipe[0]); 1311 + signal(SIGALRM, timeout_alarm); 1312 + } 1354 1313 1355 1314 /*M:010 Inter-guest networking is an interesting area. Simplest is to have a 1356 1315 * --sharenet=<name> option which opens or creates a named pipe. This can be ··· 1730 1653 } 1731 1654 1732 1655 /* When the Guest submits some I/O, we just need to wake the I/O thread. */ 1733 - static void handle_virtblk_output(int fd, struct virtqueue *vq) 1656 + static void handle_virtblk_output(int fd, struct virtqueue *vq, bool timeout) 1734 1657 { 1735 1658 struct vblk_info *vblk = vq->dev->priv; 1736 1659 char c = 0; ··· 1901 1824 /* ERESTART means that we need to reboot the guest */ 1902 1825 } else if (errno == ERESTART) { 1903 1826 restart_guest(); 1904 - /* EAGAIN means the Waker wanted us to look at some input. 1827 + /* EAGAIN means a signal (timeout). 1905 1828 * Anything else means a bug or incompatible change. */ 1906 1829 } else if (errno != EAGAIN) 1907 1830 err(1, "Running guest failed"); ··· 2024 1947 2025 1948 /* We always have a console device */ 2026 1949 setup_console(); 1950 + 1951 + /* We can timeout waiting for Guest network transmit. */ 1952 + setup_timeout(); 2027 1953 2028 1954 /* Now we load the kernel */ 2029 1955 start = load_kernel(open_or_die(argv[optind+1], O_RDONLY));