VSOCK: Introduce VM Sockets · tjh.dev/kernel@d021c34

+3 -1

include/linux/socket.h

··· 178 178 #define AF_CAIF 37 /* CAIF sockets */ 179 179 #define AF_ALG 38 /* Algorithm sockets */ 180 180 #define AF_NFC 39 /* NFC sockets */ 181 - #define AF_MAX 40 /* For now.. */ 181 + #define AF_VSOCK 40 /* vSockets */ 182 + #define AF_MAX 41 /* For now.. */ 182 183 183 184 /* Protocol families, same as address families. */ 184 185 #define PF_UNSPEC AF_UNSPEC ··· 222 221 #define PF_CAIF AF_CAIF 223 222 #define PF_ALG AF_ALG 224 223 #define PF_NFC AF_NFC 224 + #define PF_VSOCK AF_VSOCK 225 225 #define PF_MAX AF_MAX 226 226 227 227 /* Maximum queue length specifiable by listen. */

+171

include/uapi/linux/vm_sockets.h

··· 1 + /* 2 + * VMware vSockets Driver 3 + * 4 + * Copyright (C) 2007-2013 VMware, Inc. All rights reserved. 5 + * 6 + * This program is free software; you can redistribute it and/or modify it 7 + * under the terms of the GNU General Public License as published by the Free 8 + * Software Foundation version 2 and no later version. 9 + * 10 + * This program is distributed in the hope that it will be useful, but WITHOUT 11 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 13 + * more details. 14 + */ 15 + 16 + #ifndef _VM_SOCKETS_H_ 17 + #define _VM_SOCKETS_H_ 18 + 19 + #if !defined(__KERNEL__) 20 + #include <sys/socket.h> 21 + #endif 22 + 23 + /* Option name for STREAM socket buffer size. Use as the option name in 24 + * setsockopt(3) or getsockopt(3) to set or get an unsigned long long that 25 + * specifies the size of the buffer underlying a vSockets STREAM socket. 26 + * Value is clamped to the MIN and MAX. 27 + */ 28 + 29 + #define SO_VM_SOCKETS_BUFFER_SIZE 0 30 + 31 + /* Option name for STREAM socket minimum buffer size. Use as the option name 32 + * in setsockopt(3) or getsockopt(3) to set or get an unsigned long long that 33 + * specifies the minimum size allowed for the buffer underlying a vSockets 34 + * STREAM socket. 35 + */ 36 + 37 + #define SO_VM_SOCKETS_BUFFER_MIN_SIZE 1 38 + 39 + /* Option name for STREAM socket maximum buffer size. Use as the option name 40 + * in setsockopt(3) or getsockopt(3) to set or get an unsigned long long 41 + * that specifies the maximum size allowed for the buffer underlying a 42 + * vSockets STREAM socket. 43 + */ 44 + 45 + #define SO_VM_SOCKETS_BUFFER_MAX_SIZE 2 46 + 47 + /* Option name for socket peer's host-specific VM ID. Use as the option name 48 + * in getsockopt(3) to get a host-specific identifier for the peer endpoint's 49 + * VM. The identifier is a signed integer. 
50 + * Only available for hypervisor endpoints. 51 + */ 52 + 53 + #define SO_VM_SOCKETS_PEER_HOST_VM_ID 3 54 + 55 + /* Option name for socket's service label. Use as the option name in 56 + * setsockopt(3) or getsockopt(3) to set or get the service label for a socket. 57 + * The service label is a C-style NUL-terminated string. Only available for 58 + * hypervisor endpoints. 59 + */ 60 + 61 + #define SO_VM_SOCKETS_SERVICE_LABEL 4 62 + 63 + /* Option name for determining if a socket is trusted. Use as the option name 64 + * in getsockopt(3) to determine if a socket is trusted. The value is a 65 + * signed integer. 66 + */ 67 + 68 + #define SO_VM_SOCKETS_TRUSTED 5 69 + 70 + /* Option name for STREAM socket connection timeout. Use as the option name 71 + * in setsockopt(3) or getsockopt(3) to set or get the connection 72 + * timeout for a STREAM socket. 73 + */ 74 + 75 + #define SO_VM_SOCKETS_CONNECT_TIMEOUT 6 76 + 77 + /* Option name for using non-blocking send/receive. Use as the option name 78 + * for setsockopt(3) or getsockopt(3) to set or get the non-blocking 79 + * transmit/receive flag for a STREAM socket. This flag determines whether 80 + * send() and recv() can be called in non-blocking contexts for the given 81 + * socket. The value is a signed integer. 82 + * 83 + * This option is only relevant to kernel endpoints, where descheduling the 84 + * thread of execution is not allowed, for example, while holding a spinlock. 85 + * It is not to be confused with conventional non-blocking socket operations. 86 + * 87 + * Only available for hypervisor endpoints. 88 + */ 89 + 90 + #define SO_VM_SOCKETS_NONBLOCK_TXRX 7 91 + 92 + /* The vSocket equivalent of INADDR_ANY. This works for the svm_cid field of 93 + * sockaddr_vm and indicates the context ID of the current endpoint. 94 + */ 95 + 96 + #define VMADDR_CID_ANY -1U 97 + 98 + /* Bind to any available port. Works for the svm_port field of 99 + * sockaddr_vm. 
100 + */ 101 + 102 + #define VMADDR_PORT_ANY -1U 103 + 104 + /* Use this as the destination CID in an address when referring to the 105 + * hypervisor. VMCI relies on it being 0, but this would be useful for other 106 + * transports too. 107 + */ 108 + 109 + #define VMADDR_CID_HYPERVISOR 0 110 + 111 + /* This CID is specific to VMCI and can be considered reserved (even VMCI 112 + * doesn't use it anymore, it's a legacy value from an older release). 113 + */ 114 + 115 + #define VMADDR_CID_RESERVED 1 116 + 117 + /* Use this as the destination CID in an address when referring to the host 118 + * (any process other than the hypervisor). VMCI relies on it being 2, but 119 + * this would be useful for other transports too. 120 + */ 121 + 122 + #define VMADDR_CID_HOST 2 123 + 124 + /* Invalid vSockets version. */ 125 + 126 + #define VM_SOCKETS_INVALID_VERSION -1U 127 + 128 + /* The epoch (first) component of the vSockets version. A single byte 129 + * representing the epoch component of the vSockets version. 130 + */ 131 + 132 + #define VM_SOCKETS_VERSION_EPOCH(_v) (((_v) & 0xFF000000) >> 24) 133 + 134 + /* The major (second) component of the vSockets version. A single byte 135 + * representing the major component of the vSockets version. Typically 136 + * changes for every major release of a product. 137 + */ 138 + 139 + #define VM_SOCKETS_VERSION_MAJOR(_v) (((_v) & 0x00FF0000) >> 16) 140 + 141 + /* The minor (third) component of the vSockets version. Two bytes representing 142 + * the minor component of the vSockets version. 143 + */ 144 + 145 + #define VM_SOCKETS_VERSION_MINOR(_v) (((_v) & 0x0000FFFF)) 146 + 147 + /* Address structure for vSockets. The address family should be set to 148 + * AF_VSOCK. The structure members should 149 + * all align on their natural boundaries without resorting to compiler packing 150 + * directives. The total size of this structure should be exactly the same as 151 + * that of struct sockaddr. 
152 + */ 153 + 154 + struct sockaddr_vm { 155 + sa_family_t svm_family; 156 + unsigned short svm_reserved1; 157 + unsigned int svm_port; 158 + unsigned int svm_cid; 159 + unsigned char svm_zero[sizeof(struct sockaddr) - 160 + sizeof(sa_family_t) - 161 + sizeof(unsigned short) - 162 + sizeof(unsigned int) - sizeof(unsigned int)]; 163 + }; 164 + 165 + #define IOCTL_VM_SOCKETS_GET_LOCAL_CID _IO(7, 0xb9) 166 + 167 + #if defined(__KERNEL__) 168 + int vm_sockets_get_local_cid(void); 169 + #endif 170 + 171 + #endif

+1

net/Kconfig

··· 217 217 source "net/dns_resolver/Kconfig" 218 218 source "net/batman-adv/Kconfig" 219 219 source "net/openvswitch/Kconfig" 220 + source "net/vmw_vsock/Kconfig" 220 221 221 222 config RPS 222 223 boolean

+1

net/Makefile

··· 69 69 obj-$(CONFIG_BATMAN_ADV) += batman-adv/ 70 70 obj-$(CONFIG_NFC) += nfc/ 71 71 obj-$(CONFIG_OPENVSWITCH) += openvswitch/ 72 + obj-$(CONFIG_VSOCKETS) += vmw_vsock/

+28

net/vmw_vsock/Kconfig

··· 1 + # 2 + # Vsock protocol 3 + # 4 + 5 + config VSOCKETS 6 + tristate "Virtual Socket protocol" 7 + help 8 + Virtual Socket Protocol is a socket protocol similar to TCP/IP 9 + allowing communication between Virtual Machines and hypervisor 10 + or host. 11 + 12 + You should also select one or more hypervisor-specific transports 13 + below. 14 + 15 + To compile this driver as a module, choose M here: the module 16 + will be called vsock. If unsure, say N. 17 + 18 + config VMWARE_VMCI_VSOCKETS 19 + tristate "VMware VMCI transport for Virtual Sockets" 20 + depends on VSOCKETS && VMWARE_VMCI 21 + help 22 + This module implements a VMCI transport for Virtual Sockets. 23 + 24 + Enable this transport if your Virtual Machine runs on a VMware 25 + hypervisor. 26 + 27 + To compile this driver as a module, choose M here: the module 28 + will be called vmw_vsock_vmci_transport. If unsure, say N.

+7

net/vmw_vsock/Makefile

··· 1 + obj-$(CONFIG_VSOCKETS) += vsock.o 2 + obj-$(CONFIG_VMWARE_VMCI_VSOCKETS) += vmw_vsock_vmci_transport.o 3 + 4 + vsock-y += af_vsock.o vsock_addr.o 5 + 6 + vmw_vsock_vmci_transport-y += vmci_transport.o vmci_transport_notify.o \ 7 + vmci_transport_notify_qstate.o

+2015

net/vmw_vsock/af_vsock.c

··· 1 + /* 2 + * VMware vSockets Driver 3 + * 4 + * Copyright (C) 2007-2013 VMware, Inc. All rights reserved. 5 + * 6 + * This program is free software; you can redistribute it and/or modify it 7 + * under the terms of the GNU General Public License as published by the Free 8 + * Software Foundation version 2 and no later version. 9 + * 10 + * This program is distributed in the hope that it will be useful, but WITHOUT 11 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 13 + * more details. 14 + */ 15 + 16 + /* Implementation notes: 17 + * 18 + * - There are two kinds of sockets: those created by user action (such as 19 + * calling socket(2)) and those created by incoming connection request packets. 20 + * 21 + * - There are two "global" tables, one for bound sockets (sockets that have 22 + * specified an address that they are responsible for) and one for connected 23 + * sockets (sockets that have established a connection with another socket). 24 + * These tables are "global" in that all sockets on the system are placed 25 + * within them. - Note, though, that the bound table contains an extra entry 26 + * for a list of unbound sockets and SOCK_DGRAM sockets will always remain in 27 + * that list. The bound table is used solely for lookup of sockets when packets 28 + * are received and that's not necessary for SOCK_DGRAM sockets since we create 29 + * a datagram handle for each and need not perform a lookup. Keeping SOCK_DGRAM 30 + * sockets out of the bound hash buckets will reduce the chance of collisions 31 + * when looking for SOCK_STREAM sockets and prevents us from having to check the 32 + * socket type in the hash table lookups. 
33 + * 34 + * - Sockets created by user action will either be "client" sockets that 35 + * initiate a connection or "server" sockets that listen for connections; we do 36 + * not support simultaneous connects (two "client" sockets connecting). 37 + * 38 + * - "Server" sockets are referred to as listener sockets throughout this 39 + * implementation because they are in the SS_LISTEN state. When a connection 40 + * request is received (the second kind of socket mentioned above), we create a 41 + * new socket and refer to it as a pending socket. These pending sockets are 42 + * placed on the pending connection list of the listener socket. When future 43 + * packets are received for the address the listener socket is bound to, we 44 + * check if the source of the packet is from one that has an existing pending 45 + * connection. If it does, we process the packet for the pending socket. When 46 + * that socket reaches the connected state, it is removed from the listener 47 + * socket's pending list and enqueued in the listener socket's accept queue. 48 + * Callers of accept(2) will accept connected sockets from the listener socket's 49 + * accept queue. If the socket cannot be accepted for some reason then it is 50 + * marked rejected. Once the connection is accepted, it is owned by the user 51 + * process and the responsibility for cleanup falls with that user process. 52 + * 53 + * - It is possible that these pending sockets will never reach the connected 54 + * state; in fact, we may never receive another packet after the connection 55 + * request. Because of this, we must schedule a cleanup function to run in the 56 + * future, after some amount of time passes where a connection should have been 57 + * established. This function ensures that the socket is off all lists so it 58 + * cannot be retrieved, then drops all references to the socket so it is cleaned 59 + * up (sock_put() -> sk_free() -> our sk_destruct implementation). 
Note this 60 + * function will also cleanup rejected sockets, those that reach the connected 61 + * state but leave it before they have been accepted. 62 + * 63 + * - Sockets created by user action will be cleaned up when the user process 64 + * calls close(2), causing our release implementation to be called. Our release 65 + * implementation will perform some cleanup then drop the last reference so our 66 + * sk_destruct implementation is invoked. Our sk_destruct implementation will 67 + * perform additional cleanup that's common for both types of sockets. 68 + * 69 + * - A socket's reference count is what ensures that the structure won't be 70 + * freed. Each entry in a list (such as the "global" bound and connected tables 71 + * and the listener socket's pending list and connected queue) ensures a 72 + * reference. When we defer work until process context and pass a socket as our 73 + * argument, we must ensure the reference count is increased to ensure the 74 + * socket isn't freed before the function is run; the deferred function will 75 + * then drop the reference. 
76 + */ 77 + 78 + #include <linux/types.h> 79 + 80 + #define EXPORT_SYMTAB 81 + #include <linux/bitops.h> 82 + #include <linux/cred.h> 83 + #include <linux/init.h> 84 + #include <linux/io.h> 85 + #include <linux/kernel.h> 86 + #include <linux/kmod.h> 87 + #include <linux/list.h> 88 + #include <linux/miscdevice.h> 89 + #include <linux/module.h> 90 + #include <linux/mutex.h> 91 + #include <linux/net.h> 92 + #include <linux/poll.h> 93 + #include <linux/skbuff.h> 94 + #include <linux/smp.h> 95 + #include <linux/socket.h> 96 + #include <linux/stddef.h> 97 + #include <linux/unistd.h> 98 + #include <linux/wait.h> 99 + #include <linux/workqueue.h> 100 + #include <net/sock.h> 101 + 102 + #include "af_vsock.h" 103 + #include "vsock_version.h" 104 + 105 + static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr); 106 + static void vsock_sk_destruct(struct sock *sk); 107 + static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); 108 + 109 + /* Protocol family. */ 110 + static struct proto vsock_proto = { 111 + .name = "AF_VSOCK", 112 + .owner = THIS_MODULE, 113 + .obj_size = sizeof(struct vsock_sock), 114 + }; 115 + 116 + /* The default peer timeout indicates how long we will wait for a peer response 117 + * to a control message. 118 + */ 119 + #define VSOCK_DEFAULT_CONNECT_TIMEOUT (2 * HZ) 120 + 121 + #define SS_LISTEN 255 122 + 123 + static const struct vsock_transport *transport; 124 + static DEFINE_MUTEX(vsock_register_mutex); 125 + 126 + /**** EXPORTS ****/ 127 + 128 + /* Get the ID of the local context. This is transport dependent. */ 129 + 130 + int vm_sockets_get_local_cid(void) 131 + { 132 + return transport->get_local_cid(); 133 + } 134 + EXPORT_SYMBOL_GPL(vm_sockets_get_local_cid); 135 + 136 + /**** UTILS ****/ 137 + 138 + /* Each bound VSocket is stored in the bind hash table and each connected 139 + * VSocket is stored in the connected hash table. 
140 + * 141 + * Unbound sockets are all put on the same list attached to the end of the hash 142 + * table (vsock_unbound_sockets). Bound sockets are added to the hash table in 143 + * the bucket that their local address hashes to (vsock_bound_sockets(addr) 144 + * represents the list that addr hashes to). 145 + * 146 + * Specifically, we initialize the vsock_bind_table array to a size of 147 + * VSOCK_HASH_SIZE + 1 so that vsock_bind_table[0] through 148 + * vsock_bind_table[VSOCK_HASH_SIZE - 1] are for bound sockets and 149 + * vsock_bind_table[VSOCK_HASH_SIZE] is for unbound sockets. The hash function 150 + * mods with VSOCK_HASH_SIZE - 1 to ensure this. 151 + */ 152 + #define VSOCK_HASH_SIZE 251 153 + #define MAX_PORT_RETRIES 24 154 + 155 + #define VSOCK_HASH(addr) ((addr)->svm_port % (VSOCK_HASH_SIZE - 1)) 156 + #define vsock_bound_sockets(addr) (&vsock_bind_table[VSOCK_HASH(addr)]) 157 + #define vsock_unbound_sockets (&vsock_bind_table[VSOCK_HASH_SIZE]) 158 + 159 + /* XXX This can probably be implemented in a better way. 
*/ 160 + #define VSOCK_CONN_HASH(src, dst) \ 161 + (((src)->svm_cid ^ (dst)->svm_port) % (VSOCK_HASH_SIZE - 1)) 162 + #define vsock_connected_sockets(src, dst) \ 163 + (&vsock_connected_table[VSOCK_CONN_HASH(src, dst)]) 164 + #define vsock_connected_sockets_vsk(vsk) \ 165 + vsock_connected_sockets(&(vsk)->remote_addr, &(vsk)->local_addr) 166 + 167 + static struct list_head vsock_bind_table[VSOCK_HASH_SIZE + 1]; 168 + static struct list_head vsock_connected_table[VSOCK_HASH_SIZE]; 169 + static DEFINE_SPINLOCK(vsock_table_lock); 170 + 171 + static __init void vsock_init_tables(void) 172 + { 173 + int i; 174 + 175 + for (i = 0; i < ARRAY_SIZE(vsock_bind_table); i++) 176 + INIT_LIST_HEAD(&vsock_bind_table[i]); 177 + 178 + for (i = 0; i < ARRAY_SIZE(vsock_connected_table); i++) 179 + INIT_LIST_HEAD(&vsock_connected_table[i]); 180 + } 181 + 182 + static void __vsock_insert_bound(struct list_head *list, 183 + struct vsock_sock *vsk) 184 + { 185 + sock_hold(&vsk->sk); 186 + list_add(&vsk->bound_table, list); 187 + } 188 + 189 + static void __vsock_insert_connected(struct list_head *list, 190 + struct vsock_sock *vsk) 191 + { 192 + sock_hold(&vsk->sk); 193 + list_add(&vsk->connected_table, list); 194 + } 195 + 196 + static void __vsock_remove_bound(struct vsock_sock *vsk) 197 + { 198 + list_del_init(&vsk->bound_table); 199 + sock_put(&vsk->sk); 200 + } 201 + 202 + static void __vsock_remove_connected(struct vsock_sock *vsk) 203 + { 204 + list_del_init(&vsk->connected_table); 205 + sock_put(&vsk->sk); 206 + } 207 + 208 + static struct sock *__vsock_find_bound_socket(struct sockaddr_vm *addr) 209 + { 210 + struct vsock_sock *vsk; 211 + 212 + list_for_each_entry(vsk, vsock_bound_sockets(addr), bound_table) 213 + if (vsock_addr_equals_addr_any(addr, &vsk->local_addr)) 214 + return sk_vsock(vsk); 215 + 216 + return NULL; 217 + } 218 + 219 + static struct sock *__vsock_find_connected_socket(struct sockaddr_vm *src, 220 + struct sockaddr_vm *dst) 221 + { 222 + struct vsock_sock 
*vsk; 223 + 224 + list_for_each_entry(vsk, vsock_connected_sockets(src, dst), 225 + connected_table) { 226 + if (vsock_addr_equals_addr(src, &vsk->remote_addr) 227 + && vsock_addr_equals_addr(dst, &vsk->local_addr)) { 228 + return sk_vsock(vsk); 229 + } 230 + } 231 + 232 + return NULL; 233 + } 234 + 235 + static bool __vsock_in_bound_table(struct vsock_sock *vsk) 236 + { 237 + return !list_empty(&vsk->bound_table); 238 + } 239 + 240 + static bool __vsock_in_connected_table(struct vsock_sock *vsk) 241 + { 242 + return !list_empty(&vsk->connected_table); 243 + } 244 + 245 + static void vsock_insert_unbound(struct vsock_sock *vsk) 246 + { 247 + spin_lock_bh(&vsock_table_lock); 248 + __vsock_insert_bound(vsock_unbound_sockets, vsk); 249 + spin_unlock_bh(&vsock_table_lock); 250 + } 251 + 252 + void vsock_insert_connected(struct vsock_sock *vsk) 253 + { 254 + struct list_head *list = vsock_connected_sockets( 255 + &vsk->remote_addr, &vsk->local_addr); 256 + 257 + spin_lock_bh(&vsock_table_lock); 258 + __vsock_insert_connected(list, vsk); 259 + spin_unlock_bh(&vsock_table_lock); 260 + } 261 + EXPORT_SYMBOL_GPL(vsock_insert_connected); 262 + 263 + void vsock_remove_bound(struct vsock_sock *vsk) 264 + { 265 + spin_lock_bh(&vsock_table_lock); 266 + __vsock_remove_bound(vsk); 267 + spin_unlock_bh(&vsock_table_lock); 268 + } 269 + EXPORT_SYMBOL_GPL(vsock_remove_bound); 270 + 271 + void vsock_remove_connected(struct vsock_sock *vsk) 272 + { 273 + spin_lock_bh(&vsock_table_lock); 274 + __vsock_remove_connected(vsk); 275 + spin_unlock_bh(&vsock_table_lock); 276 + } 277 + EXPORT_SYMBOL_GPL(vsock_remove_connected); 278 + 279 + struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr) 280 + { 281 + struct sock *sk; 282 + 283 + spin_lock_bh(&vsock_table_lock); 284 + sk = __vsock_find_bound_socket(addr); 285 + if (sk) 286 + sock_hold(sk); 287 + 288 + spin_unlock_bh(&vsock_table_lock); 289 + 290 + return sk; 291 + } 292 + EXPORT_SYMBOL_GPL(vsock_find_bound_socket); 293 + 294 + 
struct sock *vsock_find_connected_socket(struct sockaddr_vm *src, 295 + struct sockaddr_vm *dst) 296 + { 297 + struct sock *sk; 298 + 299 + spin_lock_bh(&vsock_table_lock); 300 + sk = __vsock_find_connected_socket(src, dst); 301 + if (sk) 302 + sock_hold(sk); 303 + 304 + spin_unlock_bh(&vsock_table_lock); 305 + 306 + return sk; 307 + } 308 + EXPORT_SYMBOL_GPL(vsock_find_connected_socket); 309 + 310 + static bool vsock_in_bound_table(struct vsock_sock *vsk) 311 + { 312 + bool ret; 313 + 314 + spin_lock_bh(&vsock_table_lock); 315 + ret = __vsock_in_bound_table(vsk); 316 + spin_unlock_bh(&vsock_table_lock); 317 + 318 + return ret; 319 + } 320 + 321 + static bool vsock_in_connected_table(struct vsock_sock *vsk) 322 + { 323 + bool ret; 324 + 325 + spin_lock_bh(&vsock_table_lock); 326 + ret = __vsock_in_connected_table(vsk); 327 + spin_unlock_bh(&vsock_table_lock); 328 + 329 + return ret; 330 + } 331 + /* Call fn once for every socket linked into any bucket of vsock_connected_table. vsock_table_lock is held for the whole walk, so fn is invoked with that spinlock taken. */ 332 + void vsock_for_each_connected_socket(void (*fn)(struct sock *sk)) 333 + { 334 + int i; 335 + 336 + spin_lock_bh(&vsock_table_lock); 337 + 338 + for (i = 0; i < ARRAY_SIZE(vsock_connected_table); i++) { 339 + struct vsock_sock *vsk; 340 + list_for_each_entry(vsk, &vsock_connected_table[i], 341 + connected_table) 342 + fn(sk_vsock(vsk)); 343 + } 344 + 345 + spin_unlock_bh(&vsock_table_lock); 346 + } 347 + EXPORT_SYMBOL_GPL(vsock_for_each_connected_socket); 348 + 349 + void vsock_add_pending(struct sock *listener, struct sock *pending) 350 + { 351 + struct vsock_sock *vlistener; 352 + struct vsock_sock *vpending; 353 + 354 + vlistener = vsock_sk(listener); 355 + vpending = vsock_sk(pending); 356 + 357 + sock_hold(pending); 358 + sock_hold(listener); 359 + list_add_tail(&vpending->pending_links, &vlistener->pending_links); 360 + } 361 + EXPORT_SYMBOL_GPL(vsock_add_pending); 362 + 363 + void vsock_remove_pending(struct sock *listener, struct sock *pending) 364 + { 365 + struct vsock_sock *vpending = vsock_sk(pending); 366 + 367 + list_del_init(&vpending->pending_links); 
368 + sock_put(listener); 369 + sock_put(pending); 370 + } 371 + EXPORT_SYMBOL_GPL(vsock_remove_pending); 372 + 373 + void vsock_enqueue_accept(struct sock *listener, struct sock *connected) 374 + { 375 + struct vsock_sock *vlistener; 376 + struct vsock_sock *vconnected; 377 + 378 + vlistener = vsock_sk(listener); 379 + vconnected = vsock_sk(connected); 380 + 381 + sock_hold(connected); 382 + sock_hold(listener); 383 + list_add_tail(&vconnected->accept_queue, &vlistener->accept_queue); 384 + } 385 + EXPORT_SYMBOL_GPL(vsock_enqueue_accept); 386 + 387 + static struct sock *vsock_dequeue_accept(struct sock *listener) 388 + { 389 + struct vsock_sock *vlistener; 390 + struct vsock_sock *vconnected; 391 + 392 + vlistener = vsock_sk(listener); 393 + 394 + if (list_empty(&vlistener->accept_queue)) 395 + return NULL; 396 + 397 + vconnected = list_entry(vlistener->accept_queue.next, 398 + struct vsock_sock, accept_queue); 399 + 400 + list_del_init(&vconnected->accept_queue); 401 + sock_put(listener); 402 + /* The caller will need a reference on the connected socket so we let 403 + * it call sock_put(). 
404 + */ 405 + 406 + return sk_vsock(vconnected); 407 + } 408 + 409 + static bool vsock_is_accept_queue_empty(struct sock *sk) 410 + { 411 + struct vsock_sock *vsk = vsock_sk(sk); 412 + return list_empty(&vsk->accept_queue); 413 + } 414 + 415 + static bool vsock_is_pending(struct sock *sk) 416 + { 417 + struct vsock_sock *vsk = vsock_sk(sk); 418 + return !list_empty(&vsk->pending_links); 419 + } 420 + 421 + static int vsock_send_shutdown(struct sock *sk, int mode) 422 + { 423 + return transport->shutdown(vsock_sk(sk), mode); 424 + } 425 + 426 + void vsock_pending_work(struct work_struct *work) 427 + { 428 + struct sock *sk; 429 + struct sock *listener; 430 + struct vsock_sock *vsk; 431 + bool cleanup; 432 + 433 + vsk = container_of(work, struct vsock_sock, dwork.work); 434 + sk = sk_vsock(vsk); 435 + listener = vsk->listener; 436 + cleanup = true; 437 + 438 + lock_sock(listener); 439 + lock_sock(sk); 440 + 441 + if (vsock_is_pending(sk)) { 442 + vsock_remove_pending(listener, sk); 443 + } else if (!vsk->rejected) { 444 + /* We are not on the pending list and accept() did not reject 445 + * us, so we must have been accepted by our user process. We 446 + * just need to drop our references to the sockets and be on 447 + * our way. 448 + */ 449 + cleanup = false; 450 + goto out; 451 + } 452 + 453 + listener->sk_ack_backlog--; 454 + 455 + /* We need to remove ourself from the global connected sockets list so 456 + * incoming packets can't find this socket, and to reduce the reference 457 + * count. 
458 + */ 459 + if (vsock_in_connected_table(vsk)) 460 + vsock_remove_connected(vsk); 461 + 462 + sk->sk_state = SS_FREE; 463 + 464 + out: 465 + release_sock(sk); 466 + release_sock(listener); 467 + if (cleanup) 468 + sock_put(sk); 469 + 470 + sock_put(sk); 471 + sock_put(listener); 472 + } 473 + EXPORT_SYMBOL_GPL(vsock_pending_work); 474 + 475 + /**** SOCKET OPERATIONS ****/ 476 + 477 + static int __vsock_bind_stream(struct vsock_sock *vsk, 478 + struct sockaddr_vm *addr) 479 + { 480 + static u32 port = LAST_RESERVED_PORT + 1; 481 + struct sockaddr_vm new_addr; 482 + 483 + vsock_addr_init(&new_addr, addr->svm_cid, addr->svm_port); 484 + 485 + if (addr->svm_port == VMADDR_PORT_ANY) { 486 + bool found = false; 487 + unsigned int i; 488 + 489 + for (i = 0; i < MAX_PORT_RETRIES; i++) { 490 + if (port <= LAST_RESERVED_PORT) 491 + port = LAST_RESERVED_PORT + 1; 492 + 493 + new_addr.svm_port = port++; 494 + 495 + if (!__vsock_find_bound_socket(&new_addr)) { 496 + found = true; 497 + break; 498 + } 499 + } 500 + 501 + if (!found) 502 + return -EADDRNOTAVAIL; 503 + } else { 504 + /* If port is in reserved range, ensure caller 505 + * has necessary privileges. 506 + */ 507 + if (addr->svm_port <= LAST_RESERVED_PORT && 508 + !capable(CAP_NET_BIND_SERVICE)) { 509 + return -EACCES; 510 + } 511 + 512 + if (__vsock_find_bound_socket(&new_addr)) 513 + return -EADDRINUSE; 514 + } 515 + 516 + vsock_addr_init(&vsk->local_addr, new_addr.svm_cid, new_addr.svm_port); 517 + 518 + /* Remove stream sockets from the unbound list and add them to the hash 519 + * table for easy lookup by its address. The unbound list is simply an 520 + * extra entry at the end of the hash table, a trick used by AF_UNIX. 
521 + */ 522 + __vsock_remove_bound(vsk); 523 + __vsock_insert_bound(vsock_bound_sockets(&vsk->local_addr), vsk); 524 + 525 + return 0; 526 + } 527 + 528 + static int __vsock_bind_dgram(struct vsock_sock *vsk, 529 + struct sockaddr_vm *addr) 530 + { 531 + return transport->dgram_bind(vsk, addr); 532 + } 533 + 534 + static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr) 535 + { 536 + struct vsock_sock *vsk = vsock_sk(sk); 537 + u32 cid; 538 + int retval; 539 + 540 + /* First ensure this socket isn't already bound. */ 541 + if (vsock_addr_bound(&vsk->local_addr)) 542 + return -EINVAL; 543 + 544 + /* Now bind to the provided address or select appropriate values if 545 + * none are provided (VMADDR_CID_ANY and VMADDR_PORT_ANY). Note that 546 + * like AF_INET prevents binding to a non-local IP address (in most 547 + * cases), we only allow binding to the local CID. 548 + */ 549 + cid = transport->get_local_cid(); 550 + if (addr->svm_cid != cid && addr->svm_cid != VMADDR_CID_ANY) 551 + return -EADDRNOTAVAIL; 552 + 553 + switch (sk->sk_socket->type) { 554 + case SOCK_STREAM: 555 + spin_lock_bh(&vsock_table_lock); 556 + retval = __vsock_bind_stream(vsk, addr); 557 + spin_unlock_bh(&vsock_table_lock); 558 + break; 559 + 560 + case SOCK_DGRAM: 561 + retval = __vsock_bind_dgram(vsk, addr); 562 + break; 563 + 564 + default: 565 + retval = -EINVAL; 566 + break; 567 + } 568 + 569 + return retval; 570 + } 571 + 572 + struct sock *__vsock_create(struct net *net, 573 + struct socket *sock, 574 + struct sock *parent, 575 + gfp_t priority, 576 + unsigned short type) 577 + { 578 + struct sock *sk; 579 + struct vsock_sock *psk; 580 + struct vsock_sock *vsk; 581 + 582 + sk = sk_alloc(net, AF_VSOCK, priority, &vsock_proto); 583 + if (!sk) 584 + return NULL; 585 + 586 + sock_init_data(sock, sk); 587 + 588 + /* sk->sk_type is normally set in sock_init_data, but only if sock is 589 + * non-NULL. 
We make sure that our sockets always have a type by 590 + * setting it here if needed. 591 + */ 592 + if (!sock) 593 + sk->sk_type = type; 594 + 595 + vsk = vsock_sk(sk); 596 + vsock_addr_init(&vsk->local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); 597 + vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); 598 + 599 + sk->sk_destruct = vsock_sk_destruct; 600 + sk->sk_backlog_rcv = vsock_queue_rcv_skb; 601 + sk->sk_state = 0; 602 + sock_reset_flag(sk, SOCK_DONE); 603 + 604 + INIT_LIST_HEAD(&vsk->bound_table); 605 + INIT_LIST_HEAD(&vsk->connected_table); 606 + vsk->listener = NULL; 607 + INIT_LIST_HEAD(&vsk->pending_links); 608 + INIT_LIST_HEAD(&vsk->accept_queue); 609 + vsk->rejected = false; 610 + vsk->sent_request = false; 611 + vsk->ignore_connecting_rst = false; 612 + vsk->peer_shutdown = 0; 613 + 614 + psk = parent ? vsock_sk(parent) : NULL; 615 + if (parent) { 616 + vsk->trusted = psk->trusted; 617 + vsk->owner = get_cred(psk->owner); 618 + vsk->connect_timeout = psk->connect_timeout; 619 + } else { 620 + vsk->trusted = capable(CAP_NET_ADMIN); 621 + vsk->owner = get_current_cred(); 622 + vsk->connect_timeout = VSOCK_DEFAULT_CONNECT_TIMEOUT; 623 + } 624 + 625 + if (transport->init(vsk, psk) < 0) { 626 + sk_free(sk); 627 + return NULL; 628 + } 629 + 630 + if (sock) 631 + vsock_insert_unbound(vsk); 632 + 633 + return sk; 634 + } 635 + EXPORT_SYMBOL_GPL(__vsock_create); 636 + 637 + static void __vsock_release(struct sock *sk) 638 + { 639 + if (sk) { 640 + struct sk_buff *skb; 641 + struct sock *pending; 642 + struct vsock_sock *vsk; 643 + 644 + vsk = vsock_sk(sk); 645 + pending = NULL; /* Compiler warning. 
*/ 646 + 647 + if (vsock_in_bound_table(vsk)) 648 + vsock_remove_bound(vsk); 649 + 650 + if (vsock_in_connected_table(vsk)) 651 + vsock_remove_connected(vsk); 652 + 653 + transport->release(vsk); 654 + 655 + lock_sock(sk); 656 + sock_orphan(sk); 657 + sk->sk_shutdown = SHUTDOWN_MASK; 658 + 659 + while ((skb = skb_dequeue(&sk->sk_receive_queue))) 660 + kfree_skb(skb); 661 + 662 + /* Clean up any sockets that never were accepted. */ 663 + while ((pending = vsock_dequeue_accept(sk)) != NULL) { 664 + __vsock_release(pending); 665 + sock_put(pending); 666 + } 667 + 668 + release_sock(sk); 669 + sock_put(sk); 670 + } 671 + } 672 + 673 + static void vsock_sk_destruct(struct sock *sk) 674 + { 675 + struct vsock_sock *vsk = vsock_sk(sk); 676 + 677 + transport->destruct(vsk); 678 + 679 + /* When clearing these addresses, there's no need to set the family and 680 + * possibly register the address family with the kernel. 681 + */ 682 + vsock_addr_init(&vsk->local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); 683 + vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); 684 + 685 + put_cred(vsk->owner); 686 + } 687 + 688 + static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) 689 + { 690 + int err; 691 + 692 + err = sock_queue_rcv_skb(sk, skb); 693 + if (err) 694 + kfree_skb(skb); 695 + 696 + return err; 697 + } 698 + 699 + s64 vsock_stream_has_data(struct vsock_sock *vsk) 700 + { 701 + return transport->stream_has_data(vsk); 702 + } 703 + EXPORT_SYMBOL_GPL(vsock_stream_has_data); 704 + 705 + s64 vsock_stream_has_space(struct vsock_sock *vsk) 706 + { 707 + return transport->stream_has_space(vsk); 708 + } 709 + EXPORT_SYMBOL_GPL(vsock_stream_has_space); 710 + 711 + static int vsock_release(struct socket *sock) 712 + { 713 + __vsock_release(sock->sk); 714 + sock->sk = NULL; 715 + sock->state = SS_FREE; 716 + 717 + return 0; 718 + } 719 + 720 + static int 721 + vsock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) 722 + { 723 + int err; 724 + 
struct sock *sk; 725 + struct sockaddr_vm *vm_addr; 726 + 727 + sk = sock->sk; 728 + 729 + if (vsock_addr_cast(addr, addr_len, &vm_addr) != 0) 730 + return -EINVAL; 731 + 732 + lock_sock(sk); 733 + err = __vsock_bind(sk, vm_addr); 734 + release_sock(sk); 735 + 736 + return err; 737 + } 738 + 739 + static int vsock_getname(struct socket *sock, 740 + struct sockaddr *addr, int *addr_len, int peer) 741 + { 742 + int err; 743 + struct sock *sk; 744 + struct vsock_sock *vsk; 745 + struct sockaddr_vm *vm_addr; 746 + 747 + sk = sock->sk; 748 + vsk = vsock_sk(sk); 749 + err = 0; 750 + 751 + lock_sock(sk); 752 + 753 + if (peer) { 754 + if (sock->state != SS_CONNECTED) { 755 + err = -ENOTCONN; 756 + goto out; 757 + } 758 + vm_addr = &vsk->remote_addr; 759 + } else { 760 + vm_addr = &vsk->local_addr; 761 + } 762 + 763 + if (!vm_addr) { 764 + err = -EINVAL; 765 + goto out; 766 + } 767 + 768 + /* sys_getsockname() and sys_getpeername() pass us a 769 + * MAX_SOCK_ADDR-sized buffer and don't set addr_len. Unfortunately 770 + * that macro is defined in socket.c instead of .h, so we hardcode its 771 + * value here. 772 + */ 773 + BUILD_BUG_ON(sizeof(*vm_addr) > 128); 774 + memcpy(addr, vm_addr, sizeof(*vm_addr)); 775 + *addr_len = sizeof(*vm_addr); 776 + 777 + out: 778 + release_sock(sk); 779 + return err; 780 + } 781 + 782 + static int vsock_shutdown(struct socket *sock, int mode) 783 + { 784 + int err; 785 + struct sock *sk; 786 + 787 + /* User level uses SHUT_RD (0) and SHUT_WR (1), but the kernel uses 788 + * RCV_SHUTDOWN (1) and SEND_SHUTDOWN (2), so we must increment mode 789 + * here like the other address families do. Note also that the 790 + * increment makes SHUT_RDWR (2) into RCV_SHUTDOWN | SEND_SHUTDOWN (3), 791 + * which is what we want. 792 + */ 793 + mode++; 794 + 795 + if ((mode & ~SHUTDOWN_MASK) || !mode) 796 + return -EINVAL; 797 + 798 + /* If this is a STREAM socket and it is not connected then bail out 799 + * immediately. 
If it is a DGRAM socket then we must first kick the 800 + * socket so that it wakes up from any sleeping calls, for example 801 + * recv(), and then afterwards return the error. 802 + */ 803 + 804 + sk = sock->sk; 805 + if (sock->state == SS_UNCONNECTED) { 806 + err = -ENOTCONN; 807 + if (sk->sk_type == SOCK_STREAM) 808 + return err; 809 + } else { 810 + sock->state = SS_DISCONNECTING; 811 + err = 0; 812 + } 813 + 814 + /* Receive and send shutdowns are treated alike. */ 815 + mode = mode & (RCV_SHUTDOWN | SEND_SHUTDOWN); 816 + if (mode) { 817 + lock_sock(sk); 818 + sk->sk_shutdown |= mode; 819 + sk->sk_state_change(sk); 820 + release_sock(sk); 821 + 822 + if (sk->sk_type == SOCK_STREAM) { 823 + sock_reset_flag(sk, SOCK_DONE); 824 + vsock_send_shutdown(sk, mode); 825 + } 826 + } 827 + 828 + return err; 829 + } 830 + 831 + static unsigned int vsock_poll(struct file *file, struct socket *sock, 832 + poll_table *wait) 833 + { 834 + struct sock *sk; 835 + unsigned int mask; 836 + struct vsock_sock *vsk; 837 + 838 + sk = sock->sk; 839 + vsk = vsock_sk(sk); 840 + 841 + poll_wait(file, sk_sleep(sk), wait); 842 + mask = 0; 843 + 844 + if (sk->sk_err) 845 + /* Signify that there has been an error on this socket. */ 846 + mask |= POLLERR; 847 + 848 + /* INET sockets treat local write shutdown and peer write shutdown as a 849 + * case of POLLHUP set. 850 + */ 851 + if ((sk->sk_shutdown == SHUTDOWN_MASK) || 852 + ((sk->sk_shutdown & SEND_SHUTDOWN) && 853 + (vsk->peer_shutdown & SEND_SHUTDOWN))) { 854 + mask |= POLLHUP; 855 + } 856 + 857 + if (sk->sk_shutdown & RCV_SHUTDOWN || 858 + vsk->peer_shutdown & SEND_SHUTDOWN) { 859 + mask |= POLLRDHUP; 860 + } 861 + 862 + if (sock->type == SOCK_DGRAM) { 863 + /* For datagram sockets we can read if there is something in 864 + * the queue and write as long as the socket isn't shutdown for 865 + * sending. 
866 + */ 867 + if (!skb_queue_empty(&sk->sk_receive_queue) || 868 + (sk->sk_shutdown & RCV_SHUTDOWN)) { 869 + mask |= POLLIN | POLLRDNORM; 870 + } 871 + 872 + if (!(sk->sk_shutdown & SEND_SHUTDOWN)) 873 + mask |= POLLOUT | POLLWRNORM | POLLWRBAND; 874 + 875 + } else if (sock->type == SOCK_STREAM) { 876 + lock_sock(sk); 877 + 878 + /* Listening sockets that have connections in their accept 879 + * queue can be read. 880 + */ 881 + if (sk->sk_state == SS_LISTEN 882 + && !vsock_is_accept_queue_empty(sk)) 883 + mask |= POLLIN | POLLRDNORM; 884 + 885 + /* If there is something in the queue then we can read. */ 886 + if (transport->stream_is_active(vsk) && 887 + !(sk->sk_shutdown & RCV_SHUTDOWN)) { 888 + bool data_ready_now = false; 889 + int ret = transport->notify_poll_in( 890 + vsk, 1, &data_ready_now); 891 + if (ret < 0) { 892 + mask |= POLLERR; 893 + } else { 894 + if (data_ready_now) 895 + mask |= POLLIN | POLLRDNORM; 896 + 897 + } 898 + } 899 + 900 + /* Sockets whose connections have been closed, reset, or 901 + * terminated should also be considered read, and we check the 902 + * shutdown flag for that. 903 + */ 904 + if (sk->sk_shutdown & RCV_SHUTDOWN || 905 + vsk->peer_shutdown & SEND_SHUTDOWN) { 906 + mask |= POLLIN | POLLRDNORM; 907 + } 908 + 909 + /* Connected sockets that can produce data can be written. */ 910 + if (sk->sk_state == SS_CONNECTED) { 911 + if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { 912 + bool space_avail_now = false; 913 + int ret = transport->notify_poll_out( 914 + vsk, 1, &space_avail_now); 915 + if (ret < 0) { 916 + mask |= POLLERR; 917 + } else { 918 + if (space_avail_now) 919 + /* Remove POLLWRBAND since INET 920 + * sockets are not setting it. 921 + */ 922 + mask |= POLLOUT | POLLWRNORM; 923 + 924 + } 925 + } 926 + } 927 + 928 + /* Simulate INET socket poll behaviors, which sets 929 + * POLLOUT|POLLWRNORM when peer is closed and nothing to read, 930 + * but local send is not shutdown. 
931 + */ 932 + if (sk->sk_state == SS_UNCONNECTED) { 933 + if (!(sk->sk_shutdown & SEND_SHUTDOWN)) 934 + mask |= POLLOUT | POLLWRNORM; 935 + 936 + } 937 + 938 + release_sock(sk); 939 + } 940 + 941 + return mask; 942 + } 943 + 944 + static int vsock_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, 945 + struct msghdr *msg, size_t len) 946 + { 947 + int err; 948 + struct sock *sk; 949 + struct vsock_sock *vsk; 950 + struct sockaddr_vm *remote_addr; 951 + 952 + if (msg->msg_flags & MSG_OOB) 953 + return -EOPNOTSUPP; 954 + 955 + /* For now, MSG_DONTWAIT is always assumed... */ 956 + err = 0; 957 + sk = sock->sk; 958 + vsk = vsock_sk(sk); 959 + 960 + lock_sock(sk); 961 + 962 + if (!vsock_addr_bound(&vsk->local_addr)) { 963 + struct sockaddr_vm local_addr; 964 + 965 + vsock_addr_init(&local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); 966 + err = __vsock_bind(sk, &local_addr); 967 + if (err != 0) 968 + goto out; 969 + 970 + } 971 + 972 + /* If the provided message contains an address, use that. Otherwise 973 + * fall back on the socket's remote handle (if it has been connected). 974 + */ 975 + if (msg->msg_name && 976 + vsock_addr_cast(msg->msg_name, msg->msg_namelen, 977 + &remote_addr) == 0) { 978 + /* Ensure this address is of the right type and is a valid 979 + * destination. 980 + */ 981 + 982 + if (remote_addr->svm_cid == VMADDR_CID_ANY) 983 + remote_addr->svm_cid = transport->get_local_cid(); 984 + 985 + if (!vsock_addr_bound(remote_addr)) { 986 + err = -EINVAL; 987 + goto out; 988 + } 989 + } else if (sock->state == SS_CONNECTED) { 990 + remote_addr = &vsk->remote_addr; 991 + 992 + if (remote_addr->svm_cid == VMADDR_CID_ANY) 993 + remote_addr->svm_cid = transport->get_local_cid(); 994 + 995 + /* XXX Should connect() or this function ensure remote_addr is 996 + * bound? 
997 + */ 998 + if (!vsock_addr_bound(&vsk->remote_addr)) { 999 + err = -EINVAL; 1000 + goto out; 1001 + } 1002 + } else { 1003 + err = -EINVAL; 1004 + goto out; 1005 + } 1006 + 1007 + if (!transport->dgram_allow(remote_addr->svm_cid, 1008 + remote_addr->svm_port)) { 1009 + err = -EINVAL; 1010 + goto out; 1011 + } 1012 + 1013 + err = transport->dgram_enqueue(vsk, remote_addr, msg->msg_iov, len); 1014 + 1015 + out: 1016 + release_sock(sk); 1017 + return err; 1018 + } 1019 + 1020 + static int vsock_dgram_connect(struct socket *sock, 1021 + struct sockaddr *addr, int addr_len, int flags) 1022 + { 1023 + int err; 1024 + struct sock *sk; 1025 + struct vsock_sock *vsk; 1026 + struct sockaddr_vm *remote_addr; 1027 + 1028 + sk = sock->sk; 1029 + vsk = vsock_sk(sk); 1030 + 1031 + err = vsock_addr_cast(addr, addr_len, &remote_addr); 1032 + if (err == -EAFNOSUPPORT && remote_addr->svm_family == AF_UNSPEC) { 1033 + lock_sock(sk); 1034 + vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY, 1035 + VMADDR_PORT_ANY); 1036 + sock->state = SS_UNCONNECTED; 1037 + release_sock(sk); 1038 + return 0; 1039 + } else if (err != 0) 1040 + return -EINVAL; 1041 + 1042 + lock_sock(sk); 1043 + 1044 + if (!vsock_addr_bound(&vsk->local_addr)) { 1045 + struct sockaddr_vm local_addr; 1046 + 1047 + vsock_addr_init(&local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); 1048 + err = __vsock_bind(sk, &local_addr); 1049 + if (err != 0) 1050 + goto out; 1051 + 1052 + } 1053 + 1054 + if (!transport->dgram_allow(remote_addr->svm_cid, 1055 + remote_addr->svm_port)) { 1056 + err = -EINVAL; 1057 + goto out; 1058 + } 1059 + 1060 + memcpy(&vsk->remote_addr, remote_addr, sizeof(vsk->remote_addr)); 1061 + sock->state = SS_CONNECTED; 1062 + 1063 + out: 1064 + release_sock(sk); 1065 + return err; 1066 + } 1067 + 1068 + static int vsock_dgram_recvmsg(struct kiocb *kiocb, struct socket *sock, 1069 + struct msghdr *msg, size_t len, int flags) 1070 + { 1071 + return transport->dgram_dequeue(kiocb, vsock_sk(sock->sk), msg, len, 
1072 + flags); 1073 + } 1074 + 1075 + static const struct proto_ops vsock_dgram_ops = { 1076 + .family = PF_VSOCK, 1077 + .owner = THIS_MODULE, 1078 + .release = vsock_release, 1079 + .bind = vsock_bind, 1080 + .connect = vsock_dgram_connect, 1081 + .socketpair = sock_no_socketpair, 1082 + .accept = sock_no_accept, 1083 + .getname = vsock_getname, 1084 + .poll = vsock_poll, 1085 + .ioctl = sock_no_ioctl, 1086 + .listen = sock_no_listen, 1087 + .shutdown = vsock_shutdown, 1088 + .setsockopt = sock_no_setsockopt, 1089 + .getsockopt = sock_no_getsockopt, 1090 + .sendmsg = vsock_dgram_sendmsg, 1091 + .recvmsg = vsock_dgram_recvmsg, 1092 + .mmap = sock_no_mmap, 1093 + .sendpage = sock_no_sendpage, 1094 + }; 1095 + 1096 + static void vsock_connect_timeout(struct work_struct *work) 1097 + { 1098 + struct sock *sk; 1099 + struct vsock_sock *vsk; 1100 + 1101 + vsk = container_of(work, struct vsock_sock, dwork.work); 1102 + sk = sk_vsock(vsk); 1103 + 1104 + lock_sock(sk); 1105 + if (sk->sk_state == SS_CONNECTING && 1106 + (sk->sk_shutdown != SHUTDOWN_MASK)) { 1107 + sk->sk_state = SS_UNCONNECTED; 1108 + sk->sk_err = ETIMEDOUT; 1109 + sk->sk_error_report(sk); 1110 + } 1111 + release_sock(sk); 1112 + 1113 + sock_put(sk); 1114 + } 1115 + 1116 + static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, 1117 + int addr_len, int flags) 1118 + { 1119 + int err; 1120 + struct sock *sk; 1121 + struct vsock_sock *vsk; 1122 + struct sockaddr_vm *remote_addr; 1123 + long timeout; 1124 + DEFINE_WAIT(wait); 1125 + 1126 + err = 0; 1127 + sk = sock->sk; 1128 + vsk = vsock_sk(sk); 1129 + 1130 + lock_sock(sk); 1131 + 1132 + /* XXX AF_UNSPEC should make us disconnect like AF_INET. 
*/ 1133 + switch (sock->state) { 1134 + case SS_CONNECTED: 1135 + err = -EISCONN; 1136 + goto out; 1137 + case SS_DISCONNECTING: 1138 + err = -EINVAL; 1139 + goto out; 1140 + case SS_CONNECTING: 1141 + /* This continues on so we can move sock into the SS_CONNECTED 1142 + * state once the connection has completed (at which point err 1143 + * will be set to zero also). Otherwise, we will either wait 1144 + * for the connection or return -EALREADY should this be a 1145 + * non-blocking call. 1146 + */ 1147 + err = -EALREADY; 1148 + break; 1149 + default: 1150 + if ((sk->sk_state == SS_LISTEN) || 1151 + vsock_addr_cast(addr, addr_len, &remote_addr) != 0) { 1152 + err = -EINVAL; 1153 + goto out; 1154 + } 1155 + 1156 + /* The hypervisor and well-known contexts do not have socket 1157 + * endpoints. 1158 + */ 1159 + if (!transport->stream_allow(remote_addr->svm_cid, 1160 + remote_addr->svm_port)) { 1161 + err = -ENETUNREACH; 1162 + goto out; 1163 + } 1164 + 1165 + /* Set the remote address that we are connecting to. */ 1166 + memcpy(&vsk->remote_addr, remote_addr, 1167 + sizeof(vsk->remote_addr)); 1168 + 1169 + /* Autobind this socket to the local address if necessary. */ 1170 + if (!vsock_addr_bound(&vsk->local_addr)) { 1171 + struct sockaddr_vm local_addr; 1172 + 1173 + vsock_addr_init(&local_addr, VMADDR_CID_ANY, 1174 + VMADDR_PORT_ANY); 1175 + err = __vsock_bind(sk, &local_addr); 1176 + if (err != 0) 1177 + goto out; 1178 + 1179 + } 1180 + 1181 + sk->sk_state = SS_CONNECTING; 1182 + 1183 + err = transport->connect(vsk); 1184 + if (err < 0) 1185 + goto out; 1186 + 1187 + /* Mark sock as connecting and set the error code to in 1188 + * progress in case this is a non-blocking connect. 1189 + */ 1190 + sock->state = SS_CONNECTING; 1191 + err = -EINPROGRESS; 1192 + } 1193 + 1194 + /* The receive path will handle all communication until we are able to 1195 + * enter the connected state. Here we wait for the connection to be 1196 + * completed or a notification of an error. 
1197 + */ 1198 + timeout = vsk->connect_timeout; 1199 + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 1200 + 1201 + while (sk->sk_state != SS_CONNECTED && sk->sk_err == 0) { 1202 + if (flags & O_NONBLOCK) { 1203 + /* If we're not going to block, we schedule a timeout 1204 + * function to generate a timeout on the connection 1205 + * attempt, in case the peer doesn't respond in a 1206 + * timely manner. We hold on to the socket until the 1207 + * timeout fires. 1208 + */ 1209 + sock_hold(sk); 1210 + INIT_DELAYED_WORK(&vsk->dwork, 1211 + vsock_connect_timeout); 1212 + schedule_delayed_work(&vsk->dwork, timeout); 1213 + 1214 + /* Skip ahead to preserve error code set above. */ 1215 + goto out_wait; 1216 + } 1217 + 1218 + release_sock(sk); 1219 + timeout = schedule_timeout(timeout); 1220 + lock_sock(sk); 1221 + 1222 + if (signal_pending(current)) { 1223 + err = sock_intr_errno(timeout); 1224 + goto out_wait_error; 1225 + } else if (timeout == 0) { 1226 + err = -ETIMEDOUT; 1227 + goto out_wait_error; 1228 + } 1229 + 1230 + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 1231 + } 1232 + 1233 + if (sk->sk_err) { 1234 + err = -sk->sk_err; 1235 + goto out_wait_error; 1236 + } else 1237 + err = 0; 1238 + 1239 + out_wait: 1240 + finish_wait(sk_sleep(sk), &wait); 1241 + out: 1242 + release_sock(sk); 1243 + return err; 1244 + 1245 + out_wait_error: 1246 + sk->sk_state = SS_UNCONNECTED; 1247 + sock->state = SS_UNCONNECTED; 1248 + goto out_wait; 1249 + } 1250 + 1251 + static int vsock_accept(struct socket *sock, struct socket *newsock, int flags) 1252 + { 1253 + struct sock *listener; 1254 + int err; 1255 + struct sock *connected; 1256 + struct vsock_sock *vconnected; 1257 + long timeout; 1258 + DEFINE_WAIT(wait); 1259 + 1260 + err = 0; 1261 + listener = sock->sk; 1262 + 1263 + lock_sock(listener); 1264 + 1265 + if (sock->type != SOCK_STREAM) { 1266 + err = -EOPNOTSUPP; 1267 + goto out; 1268 + } 1269 + 1270 + if (listener->sk_state != SS_LISTEN) { 1271 + err 
= -EINVAL; 1272 + goto out; 1273 + } 1274 + 1275 + /* Wait for children sockets to appear; these are the new sockets 1276 + * created upon connection establishment. 1277 + */ 1278 + timeout = sock_sndtimeo(listener, flags & O_NONBLOCK); 1279 + prepare_to_wait(sk_sleep(listener), &wait, TASK_INTERRUPTIBLE); 1280 + 1281 + while ((connected = vsock_dequeue_accept(listener)) == NULL && 1282 + listener->sk_err == 0) { 1283 + release_sock(listener); 1284 + timeout = schedule_timeout(timeout); 1285 + lock_sock(listener); 1286 + 1287 + if (signal_pending(current)) { 1288 + err = sock_intr_errno(timeout); 1289 + goto out_wait; 1290 + } else if (timeout == 0) { 1291 + err = -EAGAIN; 1292 + goto out_wait; 1293 + } 1294 + 1295 + prepare_to_wait(sk_sleep(listener), &wait, TASK_INTERRUPTIBLE); 1296 + } 1297 + 1298 + if (listener->sk_err) 1299 + err = -listener->sk_err; 1300 + 1301 + if (connected) { 1302 + listener->sk_ack_backlog--; 1303 + 1304 + lock_sock(connected); 1305 + vconnected = vsock_sk(connected); 1306 + 1307 + /* If the listener socket has received an error, then we should 1308 + * reject this socket and return. Note that we simply mark the 1309 + * socket rejected, drop our reference, and let the cleanup 1310 + * function handle the cleanup; the fact that we found it in 1311 + * the listener's accept queue guarantees that the cleanup 1312 + * function hasn't run yet. 
1313 + */ 1314 + if (err) { 1315 + vconnected->rejected = true; 1316 + release_sock(connected); 1317 + sock_put(connected); 1318 + goto out_wait; 1319 + } 1320 + 1321 + newsock->state = SS_CONNECTED; 1322 + sock_graft(connected, newsock); 1323 + release_sock(connected); 1324 + sock_put(connected); 1325 + } 1326 + 1327 + out_wait: 1328 + finish_wait(sk_sleep(listener), &wait); 1329 + out: 1330 + release_sock(listener); 1331 + return err; 1332 + } 1333 + 1334 + static int vsock_listen(struct socket *sock, int backlog) 1335 + { 1336 + int err; 1337 + struct sock *sk; 1338 + struct vsock_sock *vsk; 1339 + 1340 + sk = sock->sk; 1341 + 1342 + lock_sock(sk); 1343 + 1344 + if (sock->type != SOCK_STREAM) { 1345 + err = -EOPNOTSUPP; 1346 + goto out; 1347 + } 1348 + 1349 + if (sock->state != SS_UNCONNECTED) { 1350 + err = -EINVAL; 1351 + goto out; 1352 + } 1353 + 1354 + vsk = vsock_sk(sk); 1355 + 1356 + if (!vsock_addr_bound(&vsk->local_addr)) { 1357 + err = -EINVAL; 1358 + goto out; 1359 + } 1360 + 1361 + sk->sk_max_ack_backlog = backlog; 1362 + sk->sk_state = SS_LISTEN; 1363 + 1364 + err = 0; 1365 + 1366 + out: 1367 + release_sock(sk); 1368 + return err; 1369 + } 1370 + 1371 + static int vsock_stream_setsockopt(struct socket *sock, 1372 + int level, 1373 + int optname, 1374 + char __user *optval, 1375 + unsigned int optlen) 1376 + { 1377 + int err; 1378 + struct sock *sk; 1379 + struct vsock_sock *vsk; 1380 + u64 val; 1381 + 1382 + if (level != AF_VSOCK) 1383 + return -ENOPROTOOPT; 1384 + 1385 + #define COPY_IN(_v) \ 1386 + do { \ 1387 + if (optlen < sizeof(_v)) { \ 1388 + err = -EINVAL; \ 1389 + goto exit; \ 1390 + } \ 1391 + if (copy_from_user(&_v, optval, sizeof(_v)) != 0) { \ 1392 + err = -EFAULT; \ 1393 + goto exit; \ 1394 + } \ 1395 + } while (0) 1396 + 1397 + err = 0; 1398 + sk = sock->sk; 1399 + vsk = vsock_sk(sk); 1400 + 1401 + lock_sock(sk); 1402 + 1403 + switch (optname) { 1404 + case SO_VM_SOCKETS_BUFFER_SIZE: 1405 + COPY_IN(val); 1406 + 
transport->set_buffer_size(vsk, val); 1407 + break; 1408 + 1409 + case SO_VM_SOCKETS_BUFFER_MAX_SIZE: 1410 + COPY_IN(val); 1411 + transport->set_max_buffer_size(vsk, val); 1412 + break; 1413 + 1414 + case SO_VM_SOCKETS_BUFFER_MIN_SIZE: 1415 + COPY_IN(val); 1416 + transport->set_min_buffer_size(vsk, val); 1417 + break; 1418 + 1419 + case SO_VM_SOCKETS_CONNECT_TIMEOUT: { 1420 + struct timeval tv; 1421 + COPY_IN(tv); 1422 + if (tv.tv_sec >= 0 && tv.tv_usec < USEC_PER_SEC && 1423 + tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)) { 1424 + vsk->connect_timeout = tv.tv_sec * HZ + 1425 + DIV_ROUND_UP(tv.tv_usec, (1000000 / HZ)); 1426 + if (vsk->connect_timeout == 0) 1427 + vsk->connect_timeout = 1428 + VSOCK_DEFAULT_CONNECT_TIMEOUT; 1429 + 1430 + } else { 1431 + err = -ERANGE; 1432 + } 1433 + break; 1434 + } 1435 + 1436 + default: 1437 + err = -ENOPROTOOPT; 1438 + break; 1439 + } 1440 + 1441 + #undef COPY_IN 1442 + 1443 + exit: 1444 + release_sock(sk); 1445 + return err; 1446 + } 1447 + 1448 + static int vsock_stream_getsockopt(struct socket *sock, 1449 + int level, int optname, 1450 + char __user *optval, 1451 + int __user *optlen) 1452 + { 1453 + int err; 1454 + int len; 1455 + struct sock *sk; 1456 + struct vsock_sock *vsk; 1457 + u64 val; 1458 + 1459 + if (level != AF_VSOCK) 1460 + return -ENOPROTOOPT; 1461 + 1462 + err = get_user(len, optlen); 1463 + if (err != 0) 1464 + return err; 1465 + 1466 + #define COPY_OUT(_v) \ 1467 + do { \ 1468 + if (len < sizeof(_v)) \ 1469 + return -EINVAL; \ 1470 + \ 1471 + len = sizeof(_v); \ 1472 + if (copy_to_user(optval, &_v, len) != 0) \ 1473 + return -EFAULT; \ 1474 + \ 1475 + } while (0) 1476 + 1477 + err = 0; 1478 + sk = sock->sk; 1479 + vsk = vsock_sk(sk); 1480 + 1481 + switch (optname) { 1482 + case SO_VM_SOCKETS_BUFFER_SIZE: 1483 + val = transport->get_buffer_size(vsk); 1484 + COPY_OUT(val); 1485 + break; 1486 + 1487 + case SO_VM_SOCKETS_BUFFER_MAX_SIZE: 1488 + val = transport->get_max_buffer_size(vsk); 1489 + COPY_OUT(val); 1490 
+ break; 1491 + 1492 + case SO_VM_SOCKETS_BUFFER_MIN_SIZE: 1493 + val = transport->get_min_buffer_size(vsk); 1494 + COPY_OUT(val); 1495 + break; 1496 + 1497 + case SO_VM_SOCKETS_CONNECT_TIMEOUT: { 1498 + struct timeval tv; 1499 + tv.tv_sec = vsk->connect_timeout / HZ; 1500 + tv.tv_usec = 1501 + (vsk->connect_timeout - 1502 + tv.tv_sec * HZ) * (1000000 / HZ); 1503 + COPY_OUT(tv); 1504 + break; 1505 + } 1506 + default: 1507 + return -ENOPROTOOPT; 1508 + } 1509 + 1510 + err = put_user(len, optlen); 1511 + if (err != 0) 1512 + return -EFAULT; 1513 + 1514 + #undef COPY_OUT 1515 + 1516 + return 0; 1517 + } 1518 + 1519 + static int vsock_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, 1520 + struct msghdr *msg, size_t len) 1521 + { 1522 + struct sock *sk; 1523 + struct vsock_sock *vsk; 1524 + ssize_t total_written; 1525 + long timeout; 1526 + int err; 1527 + struct vsock_transport_send_notify_data send_data; 1528 + 1529 + DEFINE_WAIT(wait); 1530 + 1531 + sk = sock->sk; 1532 + vsk = vsock_sk(sk); 1533 + total_written = 0; 1534 + err = 0; 1535 + 1536 + if (msg->msg_flags & MSG_OOB) 1537 + return -EOPNOTSUPP; 1538 + 1539 + lock_sock(sk); 1540 + 1541 + /* Callers should not provide a destination with stream sockets. */ 1542 + if (msg->msg_namelen) { 1543 + err = sk->sk_state == SS_CONNECTED ? -EISCONN : -EOPNOTSUPP; 1544 + goto out; 1545 + } 1546 + 1547 + /* Send data only if both sides are not shutdown in the direction. */ 1548 + if (sk->sk_shutdown & SEND_SHUTDOWN || 1549 + vsk->peer_shutdown & RCV_SHUTDOWN) { 1550 + err = -EPIPE; 1551 + goto out; 1552 + } 1553 + 1554 + if (sk->sk_state != SS_CONNECTED || 1555 + !vsock_addr_bound(&vsk->local_addr)) { 1556 + err = -ENOTCONN; 1557 + goto out; 1558 + } 1559 + 1560 + if (!vsock_addr_bound(&vsk->remote_addr)) { 1561 + err = -EDESTADDRREQ; 1562 + goto out; 1563 + } 1564 + 1565 + /* Wait for room in the produce queue to enqueue our user's data. 
*/ 1566 + timeout = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); 1567 + 1568 + err = transport->notify_send_init(vsk, &send_data); 1569 + if (err < 0) 1570 + goto out; 1571 + 1572 + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 1573 + 1574 + while (total_written < len) { 1575 + ssize_t written; 1576 + 1577 + while (vsock_stream_has_space(vsk) == 0 && 1578 + sk->sk_err == 0 && 1579 + !(sk->sk_shutdown & SEND_SHUTDOWN) && 1580 + !(vsk->peer_shutdown & RCV_SHUTDOWN)) { 1581 + 1582 + /* Don't wait for non-blocking sockets. */ 1583 + if (timeout == 0) { 1584 + err = -EAGAIN; 1585 + goto out_wait; 1586 + } 1587 + 1588 + err = transport->notify_send_pre_block(vsk, &send_data); 1589 + if (err < 0) 1590 + goto out_wait; 1591 + 1592 + release_sock(sk); 1593 + timeout = schedule_timeout(timeout); 1594 + lock_sock(sk); 1595 + if (signal_pending(current)) { 1596 + err = sock_intr_errno(timeout); 1597 + goto out_wait; 1598 + } else if (timeout == 0) { 1599 + err = -EAGAIN; 1600 + goto out_wait; 1601 + } 1602 + 1603 + prepare_to_wait(sk_sleep(sk), &wait, 1604 + TASK_INTERRUPTIBLE); 1605 + } 1606 + 1607 + /* These checks occur both as part of and after the loop 1608 + * conditional since we need to check before and after 1609 + * sleeping. 1610 + */ 1611 + if (sk->sk_err) { 1612 + err = -sk->sk_err; 1613 + goto out_wait; 1614 + } else if ((sk->sk_shutdown & SEND_SHUTDOWN) || 1615 + (vsk->peer_shutdown & RCV_SHUTDOWN)) { 1616 + err = -EPIPE; 1617 + goto out_wait; 1618 + } 1619 + 1620 + err = transport->notify_send_pre_enqueue(vsk, &send_data); 1621 + if (err < 0) 1622 + goto out_wait; 1623 + 1624 + /* Note that enqueue will only write as many bytes as are free 1625 + * in the produce queue, so we don't need to ensure len is 1626 + * smaller than the queue size. It is the caller's 1627 + * responsibility to check how many bytes we were able to send. 
1628 + */ 1629 + 1630 + written = transport->stream_enqueue( 1631 + vsk, msg->msg_iov, 1632 + len - total_written); 1633 + if (written < 0) { 1634 + err = -ENOMEM; 1635 + goto out_wait; 1636 + } 1637 + 1638 + total_written += written; 1639 + 1640 + err = transport->notify_send_post_enqueue( 1641 + vsk, written, &send_data); 1642 + if (err < 0) 1643 + goto out_wait; 1644 + 1645 + } 1646 + 1647 + out_wait: 1648 + if (total_written > 0) 1649 + err = total_written; 1650 + finish_wait(sk_sleep(sk), &wait); 1651 + out: 1652 + release_sock(sk); 1653 + return err; 1654 + } 1655 + 1656 + 1657 + static int 1658 + vsock_stream_recvmsg(struct kiocb *kiocb, 1659 + struct socket *sock, 1660 + struct msghdr *msg, size_t len, int flags) 1661 + { 1662 + struct sock *sk; 1663 + struct vsock_sock *vsk; 1664 + int err; 1665 + size_t target; 1666 + ssize_t copied; 1667 + long timeout; 1668 + struct vsock_transport_recv_notify_data recv_data; 1669 + 1670 + DEFINE_WAIT(wait); 1671 + 1672 + sk = sock->sk; 1673 + vsk = vsock_sk(sk); 1674 + err = 0; 1675 + 1676 + lock_sock(sk); 1677 + 1678 + if (sk->sk_state != SS_CONNECTED) { 1679 + /* Recvmsg is supposed to return 0 if a peer performs an 1680 + * orderly shutdown. Differentiate between that case and when a 1681 + * peer has not connected or a local shutdown occured with the 1682 + * SOCK_DONE flag. 1683 + */ 1684 + if (sock_flag(sk, SOCK_DONE)) 1685 + err = 0; 1686 + else 1687 + err = -ENOTCONN; 1688 + 1689 + goto out; 1690 + } 1691 + 1692 + if (flags & MSG_OOB) { 1693 + err = -EOPNOTSUPP; 1694 + goto out; 1695 + } 1696 + 1697 + /* We don't check peer_shutdown flag here since peer may actually shut 1698 + * down, but there can be data in the queue that a local socket can 1699 + * receive. 1700 + */ 1701 + if (sk->sk_shutdown & RCV_SHUTDOWN) { 1702 + err = 0; 1703 + goto out; 1704 + } 1705 + 1706 + /* It is valid on Linux to pass in a zero-length receive buffer. This 1707 + * is not an error. We may as well bail out now. 
1708 + */ 1709 + if (!len) { 1710 + err = 0; 1711 + goto out; 1712 + } 1713 + 1714 + /* We must not copy less than target bytes into the user's buffer 1715 + * before returning successfully, so we wait for the consume queue to 1716 + * have that much data to consume before dequeueing. Note that this 1717 + * makes it impossible to handle cases where target is greater than the 1718 + * queue size. 1719 + */ 1720 + target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); 1721 + if (target >= transport->stream_rcvhiwat(vsk)) { 1722 + err = -ENOMEM; 1723 + goto out; 1724 + } 1725 + timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); 1726 + copied = 0; 1727 + 1728 + err = transport->notify_recv_init(vsk, target, &recv_data); 1729 + if (err < 0) 1730 + goto out; 1731 + 1732 + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 1733 + 1734 + while (1) { 1735 + s64 ready = vsock_stream_has_data(vsk); 1736 + 1737 + if (ready < 0) { 1738 + /* Invalid queue pair content. XXX This should be 1739 + * changed to a connection reset in a later change. 1740 + */ 1741 + 1742 + err = -ENOMEM; 1743 + goto out_wait; 1744 + } else if (ready > 0) { 1745 + ssize_t read; 1746 + 1747 + err = transport->notify_recv_pre_dequeue( 1748 + vsk, target, &recv_data); 1749 + if (err < 0) 1750 + break; 1751 + 1752 + read = transport->stream_dequeue( 1753 + vsk, msg->msg_iov, 1754 + len - copied, flags); 1755 + if (read < 0) { 1756 + err = -ENOMEM; 1757 + break; 1758 + } 1759 + 1760 + copied += read; 1761 + 1762 + err = transport->notify_recv_post_dequeue( 1763 + vsk, target, read, 1764 + !(flags & MSG_PEEK), &recv_data); 1765 + if (err < 0) 1766 + goto out_wait; 1767 + 1768 + if (read >= target || flags & MSG_PEEK) 1769 + break; 1770 + 1771 + target -= read; 1772 + } else { 1773 + if (sk->sk_err != 0 || (sk->sk_shutdown & RCV_SHUTDOWN) 1774 + || (vsk->peer_shutdown & SEND_SHUTDOWN)) { 1775 + break; 1776 + } 1777 + /* Don't wait for non-blocking sockets. 
*/ 1778 + if (timeout == 0) { 1779 + err = -EAGAIN; 1780 + break; 1781 + } 1782 + 1783 + err = transport->notify_recv_pre_block( 1784 + vsk, target, &recv_data); 1785 + if (err < 0) 1786 + break; 1787 + 1788 + release_sock(sk); 1789 + timeout = schedule_timeout(timeout); 1790 + lock_sock(sk); 1791 + 1792 + if (signal_pending(current)) { 1793 + err = sock_intr_errno(timeout); 1794 + break; 1795 + } else if (timeout == 0) { 1796 + err = -EAGAIN; 1797 + break; 1798 + } 1799 + 1800 + prepare_to_wait(sk_sleep(sk), &wait, 1801 + TASK_INTERRUPTIBLE); 1802 + } 1803 + } 1804 + 1805 + if (sk->sk_err) 1806 + err = -sk->sk_err; 1807 + else if (sk->sk_shutdown & RCV_SHUTDOWN) 1808 + err = 0; 1809 + 1810 + if (copied > 0) { 1811 + /* We only do these additional bookkeeping/notification steps 1812 + * if we actually copied something out of the queue pair 1813 + * instead of just peeking ahead. 1814 + */ 1815 + 1816 + if (!(flags & MSG_PEEK)) { 1817 + /* If the other side has shutdown for sending and there 1818 + * is nothing more to read, then modify the socket 1819 + * state. 
1820 + */ 1821 + if (vsk->peer_shutdown & SEND_SHUTDOWN) { 1822 + if (vsock_stream_has_data(vsk) <= 0) { 1823 + sk->sk_state = SS_UNCONNECTED; 1824 + sock_set_flag(sk, SOCK_DONE); 1825 + sk->sk_state_change(sk); 1826 + } 1827 + } 1828 + } 1829 + err = copied; 1830 + } 1831 + 1832 + out_wait: 1833 + finish_wait(sk_sleep(sk), &wait); 1834 + out: 1835 + release_sock(sk); 1836 + return err; 1837 + } 1838 + 1839 + static const struct proto_ops vsock_stream_ops = { 1840 + .family = PF_VSOCK, 1841 + .owner = THIS_MODULE, 1842 + .release = vsock_release, 1843 + .bind = vsock_bind, 1844 + .connect = vsock_stream_connect, 1845 + .socketpair = sock_no_socketpair, 1846 + .accept = vsock_accept, 1847 + .getname = vsock_getname, 1848 + .poll = vsock_poll, 1849 + .ioctl = sock_no_ioctl, 1850 + .listen = vsock_listen, 1851 + .shutdown = vsock_shutdown, 1852 + .setsockopt = vsock_stream_setsockopt, 1853 + .getsockopt = vsock_stream_getsockopt, 1854 + .sendmsg = vsock_stream_sendmsg, 1855 + .recvmsg = vsock_stream_recvmsg, 1856 + .mmap = sock_no_mmap, 1857 + .sendpage = sock_no_sendpage, 1858 + }; 1859 + 1860 + static int vsock_create(struct net *net, struct socket *sock, 1861 + int protocol, int kern) 1862 + { 1863 + if (!sock) 1864 + return -EINVAL; 1865 + 1866 + if (protocol) 1867 + return -EPROTONOSUPPORT; 1868 + 1869 + switch (sock->type) { 1870 + case SOCK_DGRAM: 1871 + sock->ops = &vsock_dgram_ops; 1872 + break; 1873 + case SOCK_STREAM: 1874 + sock->ops = &vsock_stream_ops; 1875 + break; 1876 + default: 1877 + return -ESOCKTNOSUPPORT; 1878 + } 1879 + 1880 + sock->state = SS_UNCONNECTED; 1881 + 1882 + return __vsock_create(net, sock, NULL, GFP_KERNEL, 0) ? 
0 : -ENOMEM; 1883 + } 1884 + 1885 + static const struct net_proto_family vsock_family_ops = { 1886 + .family = AF_VSOCK, 1887 + .create = vsock_create, 1888 + .owner = THIS_MODULE, 1889 + }; 1890 + 1891 + static long vsock_dev_do_ioctl(struct file *filp, 1892 + unsigned int cmd, void __user *ptr) 1893 + { 1894 + u32 __user *p = ptr; 1895 + int retval = 0; 1896 + 1897 + switch (cmd) { 1898 + case IOCTL_VM_SOCKETS_GET_LOCAL_CID: 1899 + if (put_user(transport->get_local_cid(), p) != 0) 1900 + retval = -EFAULT; 1901 + break; 1902 + 1903 + default: 1904 + pr_err("Unknown ioctl %d\n", cmd); 1905 + retval = -EINVAL; 1906 + } 1907 + 1908 + return retval; 1909 + } 1910 + 1911 + static long vsock_dev_ioctl(struct file *filp, 1912 + unsigned int cmd, unsigned long arg) 1913 + { 1914 + return vsock_dev_do_ioctl(filp, cmd, (void __user *)arg); 1915 + } 1916 + 1917 + #ifdef CONFIG_COMPAT 1918 + static long vsock_dev_compat_ioctl(struct file *filp, 1919 + unsigned int cmd, unsigned long arg) 1920 + { 1921 + return vsock_dev_do_ioctl(filp, cmd, compat_ptr(arg)); 1922 + } 1923 + #endif 1924 + 1925 + static const struct file_operations vsock_device_ops = { 1926 + .owner = THIS_MODULE, 1927 + .unlocked_ioctl = vsock_dev_ioctl, 1928 + #ifdef CONFIG_COMPAT 1929 + .compat_ioctl = vsock_dev_compat_ioctl, 1930 + #endif 1931 + .open = nonseekable_open, 1932 + }; 1933 + 1934 + static struct miscdevice vsock_device = { 1935 + .name = "vsock", 1936 + .minor = MISC_DYNAMIC_MINOR, 1937 + .fops = &vsock_device_ops, 1938 + }; 1939 + 1940 + static int __vsock_core_init(void) 1941 + { 1942 + int err; 1943 + 1944 + vsock_init_tables(); 1945 + 1946 + err = misc_register(&vsock_device); 1947 + if (err) { 1948 + pr_err("Failed to register misc device\n"); 1949 + return -ENOENT; 1950 + } 1951 + 1952 + err = proto_register(&vsock_proto, 1); /* we want our slab */ 1953 + if (err) { 1954 + pr_err("Cannot register vsock protocol\n"); 1955 + goto err_misc_deregister; 1956 + } 1957 + 1958 + err = 
sock_register(&vsock_family_ops); 1959 + if (err) { 1960 + pr_err("could not register af_vsock (%d) address family: %d\n", 1961 + AF_VSOCK, err); 1962 + goto err_unregister_proto; 1963 + } 1964 + 1965 + return 0; 1966 + 1967 + err_unregister_proto: 1968 + proto_unregister(&vsock_proto); 1969 + err_misc_deregister: 1970 + misc_deregister(&vsock_device); 1971 + return err; 1972 + } 1973 + 1974 + int vsock_core_init(const struct vsock_transport *t) 1975 + { 1976 + int retval = mutex_lock_interruptible(&vsock_register_mutex); 1977 + if (retval) 1978 + return retval; 1979 + 1980 + if (transport) { 1981 + retval = -EBUSY; 1982 + goto out; 1983 + } 1984 + 1985 + transport = t; 1986 + retval = __vsock_core_init(); 1987 + if (retval) 1988 + transport = NULL; 1989 + 1990 + out: 1991 + mutex_unlock(&vsock_register_mutex); 1992 + return retval; 1993 + } 1994 + EXPORT_SYMBOL_GPL(vsock_core_init); 1995 + 1996 + void vsock_core_exit(void) 1997 + { 1998 + mutex_lock(&vsock_register_mutex); 1999 + 2000 + misc_deregister(&vsock_device); 2001 + sock_unregister(AF_VSOCK); 2002 + proto_unregister(&vsock_proto); 2003 + 2004 + /* We do not want the assignment below re-ordered. */ 2005 + mb(); 2006 + transport = NULL; 2007 + 2008 + mutex_unlock(&vsock_register_mutex); 2009 + } 2010 + EXPORT_SYMBOL_GPL(vsock_core_exit); 2011 + 2012 + MODULE_AUTHOR("VMware, Inc."); 2013 + MODULE_DESCRIPTION("VMware Virtual Socket Family"); 2014 + MODULE_VERSION(VSOCK_DRIVER_VERSION_STRING); 2015 + MODULE_LICENSE("GPL v2");

+175

net/vmw_vsock/af_vsock.h

··· 1 + /* 2 + * VMware vSockets Driver 3 + * 4 + * Copyright (C) 2007-2013 VMware, Inc. All rights reserved. 5 + * 6 + * This program is free software; you can redistribute it and/or modify it 7 + * under the terms of the GNU General Public License as published by the Free 8 + * Software Foundation version 2 and no later version. 9 + * 10 + * This program is distributed in the hope that it will be useful, but WITHOUT 11 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 13 + * more details. 14 + */ 15 + 16 + #ifndef __AF_VSOCK_H__ 17 + #define __AF_VSOCK_H__ 18 + 19 + #include <linux/kernel.h> 20 + #include <linux/workqueue.h> 21 + #include <linux/vm_sockets.h> 22 + 23 + #include "vsock_addr.h" 24 + 25 + #define LAST_RESERVED_PORT 1023 26 + 27 + #define vsock_sk(__sk) ((struct vsock_sock *)__sk) 28 + #define sk_vsock(__vsk) (&(__vsk)->sk) 29 + 30 + struct vsock_sock { 31 + /* sk must be the first member. */ 32 + struct sock sk; 33 + struct sockaddr_vm local_addr; 34 + struct sockaddr_vm remote_addr; 35 + /* Links for the global tables of bound and connected sockets. */ 36 + struct list_head bound_table; 37 + struct list_head connected_table; 38 + /* Accessed without the socket lock held. This means it can never be 39 + * modified outsided of socket create or destruct. 40 + */ 41 + bool trusted; 42 + bool cached_peer_allow_dgram; /* Dgram communication allowed to 43 + * cached peer? 44 + */ 45 + u32 cached_peer; /* Context ID of last dgram destination check. */ 46 + const struct cred *owner; 47 + /* Rest are SOCK_STREAM only. */ 48 + long connect_timeout; 49 + /* Listening socket that this came from. */ 50 + struct sock *listener; 51 + /* Used for pending list and accept queue during connection handshake. 52 + * The listening socket is the head for both lists. 
Sockets created 53 + * for connection requests are placed in the pending list until they 54 + * are connected, at which point they are put in the accept queue list 55 + * so they can be accepted in accept(). If accept() cannot accept the 56 + * connection, it is marked as rejected so the cleanup function knows 57 + * to clean up the socket. 58 + */ 59 + struct list_head pending_links; 60 + struct list_head accept_queue; 61 + bool rejected; 62 + struct delayed_work dwork; 63 + u32 peer_shutdown; 64 + bool sent_request; 65 + bool ignore_connecting_rst; 66 + 67 + /* Private to transport. */ 68 + void *trans; 69 + }; 70 + 71 + s64 vsock_stream_has_data(struct vsock_sock *vsk); 72 + s64 vsock_stream_has_space(struct vsock_sock *vsk); 73 + void vsock_pending_work(struct work_struct *work); 74 + struct sock *__vsock_create(struct net *net, 75 + struct socket *sock, 76 + struct sock *parent, 77 + gfp_t priority, unsigned short type); 78 + 79 + /**** TRANSPORT ****/ 80 + 81 + struct vsock_transport_recv_notify_data { 82 + u64 data1; /* Transport-defined. */ 83 + u64 data2; /* Transport-defined. */ 84 + bool notify_on_block; 85 + }; 86 + 87 + struct vsock_transport_send_notify_data { 88 + u64 data1; /* Transport-defined. */ 89 + u64 data2; /* Transport-defined. */ 90 + }; 91 + 92 + struct vsock_transport { 93 + /* Initialize/tear-down socket. */ 94 + int (*init)(struct vsock_sock *, struct vsock_sock *); 95 + void (*destruct)(struct vsock_sock *); 96 + void (*release)(struct vsock_sock *); 97 + 98 + /* Connections. */ 99 + int (*connect)(struct vsock_sock *); 100 + 101 + /* DGRAM. */ 102 + int (*dgram_bind)(struct vsock_sock *, struct sockaddr_vm *); 103 + int (*dgram_dequeue)(struct kiocb *kiocb, struct vsock_sock *vsk, 104 + struct msghdr *msg, size_t len, int flags); 105 + int (*dgram_enqueue)(struct vsock_sock *, struct sockaddr_vm *, 106 + struct iovec *, size_t len); 107 + bool (*dgram_allow)(u32 cid, u32 port); 108 + 109 + /* STREAM. 
*/ 110 + /* TODO: stream_bind() */ 111 + ssize_t (*stream_dequeue)(struct vsock_sock *, struct iovec *, 112 + size_t len, int flags); 113 + ssize_t (*stream_enqueue)(struct vsock_sock *, struct iovec *, 114 + size_t len); 115 + s64 (*stream_has_data)(struct vsock_sock *); 116 + s64 (*stream_has_space)(struct vsock_sock *); 117 + u64 (*stream_rcvhiwat)(struct vsock_sock *); 118 + bool (*stream_is_active)(struct vsock_sock *); 119 + bool (*stream_allow)(u32 cid, u32 port); 120 + 121 + /* Notification. */ 122 + int (*notify_poll_in)(struct vsock_sock *, size_t, bool *); 123 + int (*notify_poll_out)(struct vsock_sock *, size_t, bool *); 124 + int (*notify_recv_init)(struct vsock_sock *, size_t, 125 + struct vsock_transport_recv_notify_data *); 126 + int (*notify_recv_pre_block)(struct vsock_sock *, size_t, 127 + struct vsock_transport_recv_notify_data *); 128 + int (*notify_recv_pre_dequeue)(struct vsock_sock *, size_t, 129 + struct vsock_transport_recv_notify_data *); 130 + int (*notify_recv_post_dequeue)(struct vsock_sock *, size_t, 131 + ssize_t, bool, struct vsock_transport_recv_notify_data *); 132 + int (*notify_send_init)(struct vsock_sock *, 133 + struct vsock_transport_send_notify_data *); 134 + int (*notify_send_pre_block)(struct vsock_sock *, 135 + struct vsock_transport_send_notify_data *); 136 + int (*notify_send_pre_enqueue)(struct vsock_sock *, 137 + struct vsock_transport_send_notify_data *); 138 + int (*notify_send_post_enqueue)(struct vsock_sock *, ssize_t, 139 + struct vsock_transport_send_notify_data *); 140 + 141 + /* Shutdown. */ 142 + int (*shutdown)(struct vsock_sock *, int); 143 + 144 + /* Buffer sizes. 
*/ 145 + void (*set_buffer_size)(struct vsock_sock *, u64); 146 + void (*set_min_buffer_size)(struct vsock_sock *, u64); 147 + void (*set_max_buffer_size)(struct vsock_sock *, u64); 148 + u64 (*get_buffer_size)(struct vsock_sock *); 149 + u64 (*get_min_buffer_size)(struct vsock_sock *); 150 + u64 (*get_max_buffer_size)(struct vsock_sock *); 151 + 152 + /* Addressing. */ 153 + u32 (*get_local_cid)(void); 154 + }; 155 + 156 + /**** CORE ****/ 157 + 158 + int vsock_core_init(const struct vsock_transport *t); 159 + void vsock_core_exit(void); 160 + 161 + /**** UTILS ****/ 162 + 163 + void vsock_release_pending(struct sock *pending); 164 + void vsock_add_pending(struct sock *listener, struct sock *pending); 165 + void vsock_remove_pending(struct sock *listener, struct sock *pending); 166 + void vsock_enqueue_accept(struct sock *listener, struct sock *connected); 167 + void vsock_insert_connected(struct vsock_sock *vsk); 168 + void vsock_remove_bound(struct vsock_sock *vsk); 169 + void vsock_remove_connected(struct vsock_sock *vsk); 170 + struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr); 171 + struct sock *vsock_find_connected_socket(struct sockaddr_vm *src, 172 + struct sockaddr_vm *dst); 173 + void vsock_for_each_connected_socket(void (*fn)(struct sock *sk)); 174 + 175 + #endif /* __AF_VSOCK_H__ */

+2157

net/vmw_vsock/vmci_transport.c

··· 1 + /* 2 + * VMware vSockets Driver 3 + * 4 + * Copyright (C) 2007-2013 VMware, Inc. All rights reserved. 5 + * 6 + * This program is free software; you can redistribute it and/or modify it 7 + * under the terms of the GNU General Public License as published by the Free 8 + * Software Foundation version 2 and no later version. 9 + * 10 + * This program is distributed in the hope that it will be useful, but WITHOUT 11 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 13 + * more details. 14 + */ 15 + 16 + #include <linux/types.h> 17 + 18 + #define EXPORT_SYMTAB 19 + #include <linux/bitops.h> 20 + #include <linux/cred.h> 21 + #include <linux/init.h> 22 + #include <linux/io.h> 23 + #include <linux/kernel.h> 24 + #include <linux/kmod.h> 25 + #include <linux/list.h> 26 + #include <linux/miscdevice.h> 27 + #include <linux/module.h> 28 + #include <linux/mutex.h> 29 + #include <linux/net.h> 30 + #include <linux/poll.h> 31 + #include <linux/skbuff.h> 32 + #include <linux/smp.h> 33 + #include <linux/socket.h> 34 + #include <linux/stddef.h> 35 + #include <linux/unistd.h> 36 + #include <linux/wait.h> 37 + #include <linux/workqueue.h> 38 + #include <net/sock.h> 39 + 40 + #include "af_vsock.h" 41 + #include "vmci_transport_notify.h" 42 + 43 + static int vmci_transport_recv_dgram_cb(void *data, struct vmci_datagram *dg); 44 + static int vmci_transport_recv_stream_cb(void *data, struct vmci_datagram *dg); 45 + static void vmci_transport_peer_attach_cb(u32 sub_id, 46 + const struct vmci_event_data *ed, 47 + void *client_data); 48 + static void vmci_transport_peer_detach_cb(u32 sub_id, 49 + const struct vmci_event_data *ed, 50 + void *client_data); 51 + static void vmci_transport_recv_pkt_work(struct work_struct *work); 52 + static int vmci_transport_recv_listen(struct sock *sk, 53 + struct vmci_transport_packet *pkt); 54 + static int vmci_transport_recv_connecting_server( 55 
+ struct sock *sk, 56 + struct sock *pending, 57 + struct vmci_transport_packet *pkt); 58 + static int vmci_transport_recv_connecting_client( 59 + struct sock *sk, 60 + struct vmci_transport_packet *pkt); 61 + static int vmci_transport_recv_connecting_client_negotiate( 62 + struct sock *sk, 63 + struct vmci_transport_packet *pkt); 64 + static int vmci_transport_recv_connecting_client_invalid( 65 + struct sock *sk, 66 + struct vmci_transport_packet *pkt); 67 + static int vmci_transport_recv_connected(struct sock *sk, 68 + struct vmci_transport_packet *pkt); 69 + static bool vmci_transport_old_proto_override(bool *old_pkt_proto); 70 + static u16 vmci_transport_new_proto_supported_versions(void); 71 + static bool vmci_transport_proto_to_notify_struct(struct sock *sk, u16 *proto, 72 + bool old_pkt_proto); 73 + 74 + struct vmci_transport_recv_pkt_info { 75 + struct work_struct work; 76 + struct sock *sk; 77 + struct vmci_transport_packet pkt; 78 + }; 79 + 80 + static struct vmci_handle vmci_transport_stream_handle = { VMCI_INVALID_ID, 81 + VMCI_INVALID_ID }; 82 + static u32 vmci_transport_qp_resumed_sub_id = VMCI_INVALID_ID; 83 + 84 + static int PROTOCOL_OVERRIDE = -1; 85 + 86 + #define VMCI_TRANSPORT_DEFAULT_QP_SIZE_MIN 128 87 + #define VMCI_TRANSPORT_DEFAULT_QP_SIZE 262144 88 + #define VMCI_TRANSPORT_DEFAULT_QP_SIZE_MAX 262144 89 + 90 + /* The default peer timeout indicates how long we will wait for a peer response 91 + * to a control message. 92 + */ 93 + #define VSOCK_DEFAULT_CONNECT_TIMEOUT (2 * HZ) 94 + 95 + #define SS_LISTEN 255 96 + 97 + /* Helper function to convert from a VMCI error code to a VSock error code. 
*/ 98 + 99 + static s32 vmci_transport_error_to_vsock_error(s32 vmci_error) 100 + { 101 + int err; 102 + 103 + switch (vmci_error) { 104 + case VMCI_ERROR_NO_MEM: 105 + err = ENOMEM; 106 + break; 107 + case VMCI_ERROR_DUPLICATE_ENTRY: 108 + case VMCI_ERROR_ALREADY_EXISTS: 109 + err = EADDRINUSE; 110 + break; 111 + case VMCI_ERROR_NO_ACCESS: 112 + err = EPERM; 113 + break; 114 + case VMCI_ERROR_NO_RESOURCES: 115 + err = ENOBUFS; 116 + break; 117 + case VMCI_ERROR_INVALID_RESOURCE: 118 + err = EHOSTUNREACH; 119 + break; 120 + case VMCI_ERROR_INVALID_ARGS: 121 + default: 122 + err = EINVAL; 123 + } 124 + 125 + return err > 0 ? -err : err; 126 + } 127 + 128 + static inline void 129 + vmci_transport_packet_init(struct vmci_transport_packet *pkt, 130 + struct sockaddr_vm *src, 131 + struct sockaddr_vm *dst, 132 + u8 type, 133 + u64 size, 134 + u64 mode, 135 + struct vmci_transport_waiting_info *wait, 136 + u16 proto, 137 + struct vmci_handle handle) 138 + { 139 + /* We register the stream control handler as an any cid handle so we 140 + * must always send from a source address of VMADDR_CID_ANY 141 + */ 142 + pkt->dg.src = vmci_make_handle(VMADDR_CID_ANY, 143 + VMCI_TRANSPORT_PACKET_RID); 144 + pkt->dg.dst = vmci_make_handle(dst->svm_cid, 145 + VMCI_TRANSPORT_PACKET_RID); 146 + pkt->dg.payload_size = sizeof(*pkt) - sizeof(pkt->dg); 147 + pkt->version = VMCI_TRANSPORT_PACKET_VERSION; 148 + pkt->type = type; 149 + pkt->src_port = src->svm_port; 150 + pkt->dst_port = dst->svm_port; 151 + memset(&pkt->proto, 0, sizeof(pkt->proto)); 152 + memset(&pkt->_reserved2, 0, sizeof(pkt->_reserved2)); 153 + 154 + switch (pkt->type) { 155 + case VMCI_TRANSPORT_PACKET_TYPE_INVALID: 156 + pkt->u.size = 0; 157 + break; 158 + 159 + case VMCI_TRANSPORT_PACKET_TYPE_REQUEST: 160 + case VMCI_TRANSPORT_PACKET_TYPE_NEGOTIATE: 161 + pkt->u.size = size; 162 + break; 163 + 164 + case VMCI_TRANSPORT_PACKET_TYPE_OFFER: 165 + case VMCI_TRANSPORT_PACKET_TYPE_ATTACH: 166 + pkt->u.handle = handle; 167 + 
break; 168 + 169 + case VMCI_TRANSPORT_PACKET_TYPE_WROTE: 170 + case VMCI_TRANSPORT_PACKET_TYPE_READ: 171 + case VMCI_TRANSPORT_PACKET_TYPE_RST: 172 + pkt->u.size = 0; 173 + break; 174 + 175 + case VMCI_TRANSPORT_PACKET_TYPE_SHUTDOWN: 176 + pkt->u.mode = mode; 177 + break; 178 + 179 + case VMCI_TRANSPORT_PACKET_TYPE_WAITING_READ: 180 + case VMCI_TRANSPORT_PACKET_TYPE_WAITING_WRITE: 181 + memcpy(&pkt->u.wait, wait, sizeof(pkt->u.wait)); 182 + break; 183 + 184 + case VMCI_TRANSPORT_PACKET_TYPE_REQUEST2: 185 + case VMCI_TRANSPORT_PACKET_TYPE_NEGOTIATE2: 186 + pkt->u.size = size; 187 + pkt->proto = proto; 188 + break; 189 + } 190 + } 191 + 192 + static inline void 193 + vmci_transport_packet_get_addresses(struct vmci_transport_packet *pkt, 194 + struct sockaddr_vm *local, 195 + struct sockaddr_vm *remote) 196 + { 197 + vsock_addr_init(local, pkt->dg.dst.context, pkt->dst_port); 198 + vsock_addr_init(remote, pkt->dg.src.context, pkt->src_port); 199 + } 200 + 201 + static int 202 + __vmci_transport_send_control_pkt(struct vmci_transport_packet *pkt, 203 + struct sockaddr_vm *src, 204 + struct sockaddr_vm *dst, 205 + enum vmci_transport_packet_type type, 206 + u64 size, 207 + u64 mode, 208 + struct vmci_transport_waiting_info *wait, 209 + u16 proto, 210 + struct vmci_handle handle, 211 + bool convert_error) 212 + { 213 + int err; 214 + 215 + vmci_transport_packet_init(pkt, src, dst, type, size, mode, wait, 216 + proto, handle); 217 + err = vmci_datagram_send(&pkt->dg); 218 + if (convert_error && (err < 0)) 219 + return vmci_transport_error_to_vsock_error(err); 220 + 221 + return err; 222 + } 223 + 224 + static int 225 + vmci_transport_reply_control_pkt_fast(struct vmci_transport_packet *pkt, 226 + enum vmci_transport_packet_type type, 227 + u64 size, 228 + u64 mode, 229 + struct vmci_transport_waiting_info *wait, 230 + struct vmci_handle handle) 231 + { 232 + struct vmci_transport_packet reply; 233 + struct sockaddr_vm src, dst; 234 + 235 + if (pkt->type == 
VMCI_TRANSPORT_PACKET_TYPE_RST) { 236 + return 0; 237 + } else { 238 + vmci_transport_packet_get_addresses(pkt, &src, &dst); 239 + return __vmci_transport_send_control_pkt(&reply, &src, &dst, 240 + type, 241 + size, mode, wait, 242 + VSOCK_PROTO_INVALID, 243 + handle, true); 244 + } 245 + } 246 + 247 + static int 248 + vmci_transport_send_control_pkt_bh(struct sockaddr_vm *src, 249 + struct sockaddr_vm *dst, 250 + enum vmci_transport_packet_type type, 251 + u64 size, 252 + u64 mode, 253 + struct vmci_transport_waiting_info *wait, 254 + struct vmci_handle handle) 255 + { 256 + /* Note that it is safe to use a single packet across all CPUs since 257 + * two tasklets of the same type are guaranteed to not ever run 258 + * simultaneously. If that ever changes, or VMCI stops using tasklets, 259 + * we can use per-cpu packets. 260 + */ 261 + static struct vmci_transport_packet pkt; 262 + 263 + return __vmci_transport_send_control_pkt(&pkt, src, dst, type, 264 + size, mode, wait, 265 + VSOCK_PROTO_INVALID, handle, 266 + false); 267 + } 268 + 269 + static int 270 + vmci_transport_send_control_pkt(struct sock *sk, 271 + enum vmci_transport_packet_type type, 272 + u64 size, 273 + u64 mode, 274 + struct vmci_transport_waiting_info *wait, 275 + u16 proto, 276 + struct vmci_handle handle) 277 + { 278 + struct vmci_transport_packet *pkt; 279 + struct vsock_sock *vsk; 280 + int err; 281 + 282 + vsk = vsock_sk(sk); 283 + 284 + if (!vsock_addr_bound(&vsk->local_addr)) 285 + return -EINVAL; 286 + 287 + if (!vsock_addr_bound(&vsk->remote_addr)) 288 + return -EINVAL; 289 + 290 + pkt = kmalloc(sizeof(*pkt), GFP_KERNEL); 291 + if (!pkt) 292 + return -ENOMEM; 293 + 294 + err = __vmci_transport_send_control_pkt(pkt, &vsk->local_addr, 295 + &vsk->remote_addr, type, size, 296 + mode, wait, proto, handle, 297 + true); 298 + kfree(pkt); 299 + 300 + return err; 301 + } 302 + 303 + static int vmci_transport_send_reset_bh(struct sockaddr_vm *dst, 304 + struct sockaddr_vm *src, 305 + struct 
vmci_transport_packet *pkt) 306 + { 307 + if (pkt->type == VMCI_TRANSPORT_PACKET_TYPE_RST) 308 + return 0; 309 + return vmci_transport_send_control_pkt_bh( 310 + dst, src, 311 + VMCI_TRANSPORT_PACKET_TYPE_RST, 0, 312 + 0, NULL, VMCI_INVALID_HANDLE); 313 + } 314 + 315 + static int vmci_transport_send_reset(struct sock *sk, 316 + struct vmci_transport_packet *pkt) 317 + { 318 + if (pkt->type == VMCI_TRANSPORT_PACKET_TYPE_RST) 319 + return 0; 320 + return vmci_transport_send_control_pkt(sk, 321 + VMCI_TRANSPORT_PACKET_TYPE_RST, 322 + 0, 0, NULL, VSOCK_PROTO_INVALID, 323 + VMCI_INVALID_HANDLE); 324 + } 325 + 326 + static int vmci_transport_send_negotiate(struct sock *sk, size_t size) 327 + { 328 + return vmci_transport_send_control_pkt( 329 + sk, 330 + VMCI_TRANSPORT_PACKET_TYPE_NEGOTIATE, 331 + size, 0, NULL, 332 + VSOCK_PROTO_INVALID, 333 + VMCI_INVALID_HANDLE); 334 + } 335 + 336 + static int vmci_transport_send_negotiate2(struct sock *sk, size_t size, 337 + u16 version) 338 + { 339 + return vmci_transport_send_control_pkt( 340 + sk, 341 + VMCI_TRANSPORT_PACKET_TYPE_NEGOTIATE2, 342 + size, 0, NULL, version, 343 + VMCI_INVALID_HANDLE); 344 + } 345 + 346 + static int vmci_transport_send_qp_offer(struct sock *sk, 347 + struct vmci_handle handle) 348 + { 349 + return vmci_transport_send_control_pkt( 350 + sk, VMCI_TRANSPORT_PACKET_TYPE_OFFER, 0, 351 + 0, NULL, 352 + VSOCK_PROTO_INVALID, handle); 353 + } 354 + 355 + static int vmci_transport_send_attach(struct sock *sk, 356 + struct vmci_handle handle) 357 + { 358 + return vmci_transport_send_control_pkt( 359 + sk, VMCI_TRANSPORT_PACKET_TYPE_ATTACH, 360 + 0, 0, NULL, VSOCK_PROTO_INVALID, 361 + handle); 362 + } 363 + 364 + static int vmci_transport_reply_reset(struct vmci_transport_packet *pkt) 365 + { 366 + return vmci_transport_reply_control_pkt_fast( 367 + pkt, 368 + VMCI_TRANSPORT_PACKET_TYPE_RST, 369 + 0, 0, NULL, 370 + VMCI_INVALID_HANDLE); 371 + } 372 + 373 + static int vmci_transport_send_invalid_bh(struct 
sockaddr_vm *dst, 374 + struct sockaddr_vm *src) 375 + { 376 + return vmci_transport_send_control_pkt_bh( 377 + dst, src, 378 + VMCI_TRANSPORT_PACKET_TYPE_INVALID, 379 + 0, 0, NULL, VMCI_INVALID_HANDLE); 380 + } 381 + 382 + int vmci_transport_send_wrote_bh(struct sockaddr_vm *dst, 383 + struct sockaddr_vm *src) 384 + { 385 + return vmci_transport_send_control_pkt_bh( 386 + dst, src, 387 + VMCI_TRANSPORT_PACKET_TYPE_WROTE, 0, 388 + 0, NULL, VMCI_INVALID_HANDLE); 389 + } 390 + 391 + int vmci_transport_send_read_bh(struct sockaddr_vm *dst, 392 + struct sockaddr_vm *src) 393 + { 394 + return vmci_transport_send_control_pkt_bh( 395 + dst, src, 396 + VMCI_TRANSPORT_PACKET_TYPE_READ, 0, 397 + 0, NULL, VMCI_INVALID_HANDLE); 398 + } 399 + 400 + int vmci_transport_send_wrote(struct sock *sk) 401 + { 402 + return vmci_transport_send_control_pkt( 403 + sk, VMCI_TRANSPORT_PACKET_TYPE_WROTE, 0, 404 + 0, NULL, VSOCK_PROTO_INVALID, 405 + VMCI_INVALID_HANDLE); 406 + } 407 + 408 + int vmci_transport_send_read(struct sock *sk) 409 + { 410 + return vmci_transport_send_control_pkt( 411 + sk, VMCI_TRANSPORT_PACKET_TYPE_READ, 0, 412 + 0, NULL, VSOCK_PROTO_INVALID, 413 + VMCI_INVALID_HANDLE); 414 + } 415 + 416 + int vmci_transport_send_waiting_write(struct sock *sk, 417 + struct vmci_transport_waiting_info *wait) 418 + { 419 + return vmci_transport_send_control_pkt( 420 + sk, VMCI_TRANSPORT_PACKET_TYPE_WAITING_WRITE, 421 + 0, 0, wait, VSOCK_PROTO_INVALID, 422 + VMCI_INVALID_HANDLE); 423 + } 424 + 425 + int vmci_transport_send_waiting_read(struct sock *sk, 426 + struct vmci_transport_waiting_info *wait) 427 + { 428 + return vmci_transport_send_control_pkt( 429 + sk, VMCI_TRANSPORT_PACKET_TYPE_WAITING_READ, 430 + 0, 0, wait, VSOCK_PROTO_INVALID, 431 + VMCI_INVALID_HANDLE); 432 + } 433 + 434 + static int vmci_transport_shutdown(struct vsock_sock *vsk, int mode) 435 + { 436 + return vmci_transport_send_control_pkt( 437 + &vsk->sk, 438 + VMCI_TRANSPORT_PACKET_TYPE_SHUTDOWN, 439 + 0, mode, 
NULL, 440 + VSOCK_PROTO_INVALID, 441 + VMCI_INVALID_HANDLE); 442 + } 443 + 444 + static int vmci_transport_send_conn_request(struct sock *sk, size_t size) 445 + { 446 + return vmci_transport_send_control_pkt(sk, 447 + VMCI_TRANSPORT_PACKET_TYPE_REQUEST, 448 + size, 0, NULL, 449 + VSOCK_PROTO_INVALID, 450 + VMCI_INVALID_HANDLE); 451 + } 452 + 453 + static int vmci_transport_send_conn_request2(struct sock *sk, size_t size, 454 + u16 version) 455 + { 456 + return vmci_transport_send_control_pkt( 457 + sk, VMCI_TRANSPORT_PACKET_TYPE_REQUEST2, 458 + size, 0, NULL, version, 459 + VMCI_INVALID_HANDLE); 460 + } 461 + 462 + static struct sock *vmci_transport_get_pending( 463 + struct sock *listener, 464 + struct vmci_transport_packet *pkt) 465 + { 466 + struct vsock_sock *vlistener; 467 + struct vsock_sock *vpending; 468 + struct sock *pending; 469 + 470 + vlistener = vsock_sk(listener); 471 + 472 + list_for_each_entry(vpending, &vlistener->pending_links, 473 + pending_links) { 474 + struct sockaddr_vm src; 475 + struct sockaddr_vm dst; 476 + 477 + vsock_addr_init(&src, pkt->dg.src.context, pkt->src_port); 478 + vsock_addr_init(&dst, pkt->dg.dst.context, pkt->dst_port); 479 + 480 + if (vsock_addr_equals_addr(&src, &vpending->remote_addr) && 481 + vsock_addr_equals_addr(&dst, &vpending->local_addr)) { 482 + pending = sk_vsock(vpending); 483 + sock_hold(pending); 484 + goto found; 485 + } 486 + } 487 + 488 + pending = NULL; 489 + found: 490 + return pending; 491 + 492 + } 493 + 494 + static void vmci_transport_release_pending(struct sock *pending) 495 + { 496 + sock_put(pending); 497 + } 498 + 499 + /* We allow two kinds of sockets to communicate with a restricted VM: 1) 500 + * trusted sockets 2) sockets from applications running as the same user as the 501 + * VM (this is only true for the host side and only when using hosted products) 502 + */ 503 + 504 + static bool vmci_transport_is_trusted(struct vsock_sock *vsock, u32 peer_cid) 505 + { 506 + return vsock->trusted || 
507 + vmci_is_context_owner(peer_cid, vsock->owner->uid); 508 + } 509 + 510 + /* We allow sending datagrams to and receiving datagrams from a restricted VM 511 + * only if it is trusted as described in vmci_transport_is_trusted. 512 + */ 513 + 514 + static bool vmci_transport_allow_dgram(struct vsock_sock *vsock, u32 peer_cid) 515 + { 516 + if (vsock->cached_peer != peer_cid) { 517 + vsock->cached_peer = peer_cid; 518 + if (!vmci_transport_is_trusted(vsock, peer_cid) && 519 + (vmci_context_get_priv_flags(peer_cid) & 520 + VMCI_PRIVILEGE_FLAG_RESTRICTED)) { 521 + vsock->cached_peer_allow_dgram = false; 522 + } else { 523 + vsock->cached_peer_allow_dgram = true; 524 + } 525 + } 526 + 527 + return vsock->cached_peer_allow_dgram; 528 + } 529 + 530 + static int 531 + vmci_transport_queue_pair_alloc(struct vmci_qp **qpair, 532 + struct vmci_handle *handle, 533 + u64 produce_size, 534 + u64 consume_size, 535 + u32 peer, u32 flags, bool trusted) 536 + { 537 + int err = 0; 538 + 539 + if (trusted) { 540 + /* Try to allocate our queue pair as trusted. This will only 541 + * work if vsock is running in the host. 542 + */ 543 + 544 + err = vmci_qpair_alloc(qpair, handle, produce_size, 545 + consume_size, 546 + peer, flags, 547 + VMCI_PRIVILEGE_FLAG_TRUSTED); 548 + if (err != VMCI_ERROR_NO_ACCESS) 549 + goto out; 550 + 551 + } 552 + 553 + err = vmci_qpair_alloc(qpair, handle, produce_size, consume_size, 554 + peer, flags, VMCI_NO_PRIVILEGE_FLAGS); 555 + out: 556 + if (err < 0) { 557 + pr_err("Could not attach to queue pair with %d\n", 558 + err); 559 + err = vmci_transport_error_to_vsock_error(err); 560 + } 561 + 562 + return err; 563 + } 564 + 565 + static int 566 + vmci_transport_datagram_create_hnd(u32 resource_id, 567 + u32 flags, 568 + vmci_datagram_recv_cb recv_cb, 569 + void *client_data, 570 + struct vmci_handle *out_handle) 571 + { 572 + int err = 0; 573 + 574 + /* Try to allocate our datagram handler as trusted. 
This will only work 575 + * if vsock is running in the host. 576 + */ 577 + 578 + err = vmci_datagram_create_handle_priv(resource_id, flags, 579 + VMCI_PRIVILEGE_FLAG_TRUSTED, 580 + recv_cb, 581 + client_data, out_handle); 582 + 583 + if (err == VMCI_ERROR_NO_ACCESS) 584 + err = vmci_datagram_create_handle(resource_id, flags, 585 + recv_cb, client_data, 586 + out_handle); 587 + 588 + return err; 589 + } 590 + 591 + /* This is invoked as part of a tasklet that's scheduled when the VMCI 592 + * interrupt fires. This is run in bottom-half context and if it ever needs to 593 + * sleep it should defer that work to a work queue. 594 + */ 595 + 596 + static int vmci_transport_recv_dgram_cb(void *data, struct vmci_datagram *dg) 597 + { 598 + struct sock *sk; 599 + size_t size; 600 + struct sk_buff *skb; 601 + struct vsock_sock *vsk; 602 + 603 + sk = (struct sock *)data; 604 + 605 + /* This handler is privileged when this module is running on the host. 606 + * We will get datagrams from all endpoints (even VMs that are in a 607 + * restricted context). If we get one from a restricted context then 608 + * the destination socket must be trusted. 609 + * 610 + * NOTE: We access the socket struct without holding the lock here. 611 + * This is ok because the field we are interested is never modified 612 + * outside of the create and destruct socket functions. 613 + */ 614 + vsk = vsock_sk(sk); 615 + if (!vmci_transport_allow_dgram(vsk, dg->src.context)) 616 + return VMCI_ERROR_NO_ACCESS; 617 + 618 + size = VMCI_DG_SIZE(dg); 619 + 620 + /* Attach the packet to the socket's receive queue as an sk_buff. */ 621 + skb = alloc_skb(size, GFP_ATOMIC); 622 + if (skb) { 623 + /* sk_receive_skb() will do a sock_put(), so hold here. 
*/ 624 + sock_hold(sk); 625 + skb_put(skb, size); 626 + memcpy(skb->data, dg, size); 627 + sk_receive_skb(sk, skb, 0); 628 + } 629 + 630 + return VMCI_SUCCESS; 631 + } 632 + 633 + static bool vmci_transport_stream_allow(u32 cid, u32 port) 634 + { 635 + static const u32 non_socket_contexts[] = { 636 + VMADDR_CID_HYPERVISOR, 637 + VMADDR_CID_RESERVED, 638 + }; 639 + int i; 640 + 641 + BUILD_BUG_ON(sizeof(cid) != sizeof(*non_socket_contexts)); 642 + 643 + for (i = 0; i < ARRAY_SIZE(non_socket_contexts); i++) { 644 + if (cid == non_socket_contexts[i]) 645 + return false; 646 + } 647 + 648 + return true; 649 + } 650 + 651 + /* This is invoked as part of a tasklet that's scheduled when the VMCI 652 + * interrupt fires. This is run in bottom-half context but it defers most of 653 + * its work to the packet handling work queue. 654 + */ 655 + 656 + static int vmci_transport_recv_stream_cb(void *data, struct vmci_datagram *dg) 657 + { 658 + struct sock *sk; 659 + struct sockaddr_vm dst; 660 + struct sockaddr_vm src; 661 + struct vmci_transport_packet *pkt; 662 + struct vsock_sock *vsk; 663 + bool bh_process_pkt; 664 + int err; 665 + 666 + sk = NULL; 667 + err = VMCI_SUCCESS; 668 + bh_process_pkt = false; 669 + 670 + /* Ignore incoming packets from contexts without sockets, or resources 671 + * that aren't vsock implementations. 672 + */ 673 + 674 + if (!vmci_transport_stream_allow(dg->src.context, -1) 675 + || VMCI_TRANSPORT_PACKET_RID != dg->src.resource) 676 + return VMCI_ERROR_NO_ACCESS; 677 + 678 + if (VMCI_DG_SIZE(dg) < sizeof(*pkt)) 679 + /* Drop datagrams that do not contain full VSock packets. */ 680 + return VMCI_ERROR_INVALID_ARGS; 681 + 682 + pkt = (struct vmci_transport_packet *)dg; 683 + 684 + /* Find the socket that should handle this packet. First we look for a 685 + * connected socket and if there is none we look for a socket bound to 686 + * the destintation address. 
687 + */ 688 + vsock_addr_init(&src, pkt->dg.src.context, pkt->src_port); 689 + vsock_addr_init(&dst, pkt->dg.dst.context, pkt->dst_port); 690 + 691 + sk = vsock_find_connected_socket(&src, &dst); 692 + if (!sk) { 693 + sk = vsock_find_bound_socket(&dst); 694 + if (!sk) { 695 + /* We could not find a socket for this specified 696 + * address. If this packet is a RST, we just drop it. 697 + * If it is another packet, we send a RST. Note that 698 + * we do not send a RST reply to RSTs so that we do not 699 + * continually send RSTs between two endpoints. 700 + * 701 + * Note that since this is a reply, dst is src and src 702 + * is dst. 703 + */ 704 + if (vmci_transport_send_reset_bh(&dst, &src, pkt) < 0) 705 + pr_err("unable to send reset\n"); 706 + 707 + err = VMCI_ERROR_NOT_FOUND; 708 + goto out; 709 + } 710 + } 711 + 712 + /* If the received packet type is beyond all types known to this 713 + * implementation, reply with an invalid message. Hopefully this will 714 + * help when implementing backwards compatibility in the future. 715 + */ 716 + if (pkt->type >= VMCI_TRANSPORT_PACKET_TYPE_MAX) { 717 + vmci_transport_send_invalid_bh(&dst, &src); 718 + err = VMCI_ERROR_INVALID_ARGS; 719 + goto out; 720 + } 721 + 722 + /* This handler is privileged when this module is running on the host. 723 + * We will get datagram connect requests from all endpoints (even VMs 724 + * that are in a restricted context). If we get one from a restricted 725 + * context then the destination socket must be trusted. 726 + * 727 + * NOTE: We access the socket struct without holding the lock here. 728 + * This is ok because the field we are interested is never modified 729 + * outside of the create and destruct socket functions. 
730 + */ 731 + vsk = vsock_sk(sk); 732 + if (!vmci_transport_allow_dgram(vsk, pkt->dg.src.context)) { 733 + err = VMCI_ERROR_NO_ACCESS; 734 + goto out; 735 + } 736 + 737 + /* We do most everything in a work queue, but let's fast path the 738 + * notification of reads and writes to help data transfer performance. 739 + * We can only do this if there is no process context code executing 740 + * for this socket since that may change the state. 741 + */ 742 + bh_lock_sock(sk); 743 + 744 + if (!sock_owned_by_user(sk) && sk->sk_state == SS_CONNECTED) 745 + vmci_trans(vsk)->notify_ops->handle_notify_pkt( 746 + sk, pkt, true, &dst, &src, 747 + &bh_process_pkt); 748 + 749 + bh_unlock_sock(sk); 750 + 751 + if (!bh_process_pkt) { 752 + struct vmci_transport_recv_pkt_info *recv_pkt_info; 753 + 754 + recv_pkt_info = kmalloc(sizeof(*recv_pkt_info), GFP_ATOMIC); 755 + if (!recv_pkt_info) { 756 + if (vmci_transport_send_reset_bh(&dst, &src, pkt) < 0) 757 + pr_err("unable to send reset\n"); 758 + 759 + err = VMCI_ERROR_NO_MEM; 760 + goto out; 761 + } 762 + 763 + recv_pkt_info->sk = sk; 764 + memcpy(&recv_pkt_info->pkt, pkt, sizeof(recv_pkt_info->pkt)); 765 + INIT_WORK(&recv_pkt_info->work, vmci_transport_recv_pkt_work); 766 + 767 + schedule_work(&recv_pkt_info->work); 768 + /* Clear sk so that the reference count incremented by one of 769 + * the Find functions above is not decremented below. We need 770 + * that reference count for the packet handler we've scheduled 771 + * to run. 
772 + */ 773 + sk = NULL; 774 + } 775 + 776 + out: 777 + if (sk) 778 + sock_put(sk); 779 + 780 + return err; 781 + } 782 + 783 + static void vmci_transport_peer_attach_cb(u32 sub_id, 784 + const struct vmci_event_data *e_data, 785 + void *client_data) 786 + { 787 + struct sock *sk = client_data; 788 + const struct vmci_event_payload_qp *e_payload; 789 + struct vsock_sock *vsk; 790 + 791 + e_payload = vmci_event_data_const_payload(e_data); 792 + 793 + vsk = vsock_sk(sk); 794 + 795 + /* We don't ask for delayed CBs when we subscribe to this event (we 796 + * pass 0 as flags to vmci_event_subscribe()). VMCI makes no 797 + * guarantees in that case about what context we might be running in, 798 + * so it could be BH or process, blockable or non-blockable. So we 799 + * need to account for all possible contexts here. 800 + */ 801 + local_bh_disable(); 802 + bh_lock_sock(sk); 803 + 804 + /* XXX This is lame, we should provide a way to lookup sockets by 805 + * qp_handle. 806 + */ 807 + if (vmci_handle_is_equal(vmci_trans(vsk)->qp_handle, 808 + e_payload->handle)) { 809 + /* XXX This doesn't do anything, but in the future we may want 810 + * to set a flag here to verify the attach really did occur and 811 + * we weren't just sent a datagram claiming it was. 812 + */ 813 + goto out; 814 + } 815 + 816 + out: 817 + bh_unlock_sock(sk); 818 + local_bh_enable(); 819 + } 820 + 821 + static void vmci_transport_handle_detach(struct sock *sk) 822 + { 823 + struct vsock_sock *vsk; 824 + 825 + vsk = vsock_sk(sk); 826 + if (!vmci_handle_is_invalid(vmci_trans(vsk)->qp_handle)) { 827 + sock_set_flag(sk, SOCK_DONE); 828 + 829 + /* On a detach the peer will not be sending or receiving 830 + * anymore. 831 + */ 832 + vsk->peer_shutdown = SHUTDOWN_MASK; 833 + 834 + /* We should not be sending anymore since the peer won't be 835 + * there to receive, but we can still receive if there is data 836 + * left in our consume queue. 
837 + */ 838 + if (vsock_stream_has_data(vsk) <= 0) { 839 + if (sk->sk_state == SS_CONNECTING) { 840 + /* The peer may detach from a queue pair while 841 + * we are still in the connecting state, i.e., 842 + * if the peer VM is killed after attaching to 843 + * a queue pair, but before we complete the 844 + * handshake. In that case, we treat the detach 845 + * event like a reset. 846 + */ 847 + 848 + sk->sk_state = SS_UNCONNECTED; 849 + sk->sk_err = ECONNRESET; 850 + sk->sk_error_report(sk); 851 + return; 852 + } 853 + sk->sk_state = SS_UNCONNECTED; 854 + } 855 + sk->sk_state_change(sk); 856 + } 857 + } 858 + 859 + static void vmci_transport_peer_detach_cb(u32 sub_id, 860 + const struct vmci_event_data *e_data, 861 + void *client_data) 862 + { 863 + struct sock *sk = client_data; 864 + const struct vmci_event_payload_qp *e_payload; 865 + struct vsock_sock *vsk; 866 + 867 + e_payload = vmci_event_data_const_payload(e_data); 868 + vsk = vsock_sk(sk); 869 + if (vmci_handle_is_invalid(e_payload->handle)) 870 + return; 871 + 872 + /* Same rules for locking as for peer_attach_cb(). */ 873 + local_bh_disable(); 874 + bh_lock_sock(sk); 875 + 876 + /* XXX This is lame, we should provide a way to lookup sockets by 877 + * qp_handle. 
878 + */ 879 + if (vmci_handle_is_equal(vmci_trans(vsk)->qp_handle, 880 + e_payload->handle)) 881 + vmci_transport_handle_detach(sk); 882 + 883 + bh_unlock_sock(sk); 884 + local_bh_enable(); 885 + } 886 + 887 + static void vmci_transport_qp_resumed_cb(u32 sub_id, 888 + const struct vmci_event_data *e_data, 889 + void *client_data) 890 + { 891 + vsock_for_each_connected_socket(vmci_transport_handle_detach); 892 + } 893 + 894 + static void vmci_transport_recv_pkt_work(struct work_struct *work) 895 + { 896 + struct vmci_transport_recv_pkt_info *recv_pkt_info; 897 + struct vmci_transport_packet *pkt; 898 + struct sock *sk; 899 + 900 + recv_pkt_info = 901 + container_of(work, struct vmci_transport_recv_pkt_info, work); 902 + sk = recv_pkt_info->sk; 903 + pkt = &recv_pkt_info->pkt; 904 + 905 + lock_sock(sk); 906 + 907 + switch (sk->sk_state) { 908 + case SS_LISTEN: 909 + vmci_transport_recv_listen(sk, pkt); 910 + break; 911 + case SS_CONNECTING: 912 + /* Processing of pending connections for servers goes through 913 + * the listening socket, so see vmci_transport_recv_listen() 914 + * for that path. 915 + */ 916 + vmci_transport_recv_connecting_client(sk, pkt); 917 + break; 918 + case SS_CONNECTED: 919 + vmci_transport_recv_connected(sk, pkt); 920 + break; 921 + default: 922 + /* Because this function does not run in the same context as 923 + * vmci_transport_recv_stream_cb it is possible that the 924 + * socket has closed. We need to let the other side know or it 925 + * could be sitting in a connect and hang forever. Send a 926 + * reset to prevent that. 927 + */ 928 + vmci_transport_send_reset(sk, pkt); 929 + goto out; 930 + } 931 + 932 + out: 933 + release_sock(sk); 934 + kfree(recv_pkt_info); 935 + /* Release reference obtained in the stream callback when we fetched 936 + * this socket out of the bound or connected list. 
937 + */ 938 + sock_put(sk); 939 + } 940 + 941 + static int vmci_transport_recv_listen(struct sock *sk, 942 + struct vmci_transport_packet *pkt) 943 + { 944 + struct sock *pending; 945 + struct vsock_sock *vpending; 946 + int err; 947 + u64 qp_size; 948 + bool old_request = false; 949 + bool old_pkt_proto = false; 950 + 951 + err = 0; 952 + 953 + /* Because we are in the listen state, we could be receiving a packet 954 + * for ourself or any previous connection requests that we received. 955 + * If it's the latter, we try to find a socket in our list of pending 956 + * connections and, if we do, call the appropriate handler for the 957 + * state that that socket is in. Otherwise we try to service the 958 + * connection request. 959 + */ 960 + pending = vmci_transport_get_pending(sk, pkt); 961 + if (pending) { 962 + lock_sock(pending); 963 + switch (pending->sk_state) { 964 + case SS_CONNECTING: 965 + err = vmci_transport_recv_connecting_server(sk, 966 + pending, 967 + pkt); 968 + break; 969 + default: 970 + vmci_transport_send_reset(pending, pkt); 971 + err = -EINVAL; 972 + } 973 + 974 + if (err < 0) 975 + vsock_remove_pending(sk, pending); 976 + 977 + release_sock(pending); 978 + vmci_transport_release_pending(pending); 979 + 980 + return err; 981 + } 982 + 983 + /* The listen state only accepts connection requests. Reply with a 984 + * reset unless we received a reset. 985 + */ 986 + 987 + if (!(pkt->type == VMCI_TRANSPORT_PACKET_TYPE_REQUEST || 988 + pkt->type == VMCI_TRANSPORT_PACKET_TYPE_REQUEST2)) { 989 + vmci_transport_reply_reset(pkt); 990 + return -EINVAL; 991 + } 992 + 993 + if (pkt->u.size == 0) { 994 + vmci_transport_reply_reset(pkt); 995 + return -EINVAL; 996 + } 997 + 998 + /* If this socket can't accommodate this connection request, we send a 999 + * reset. Otherwise we create and initialize a child socket and reply 1000 + * with a connection negotiation. 
1001 + */ 1002 + if (sk->sk_ack_backlog >= sk->sk_max_ack_backlog) { 1003 + vmci_transport_reply_reset(pkt); 1004 + return -ECONNREFUSED; 1005 + } 1006 + 1007 + pending = __vsock_create(sock_net(sk), NULL, sk, GFP_KERNEL, 1008 + sk->sk_type); 1009 + if (!pending) { 1010 + vmci_transport_send_reset(sk, pkt); 1011 + return -ENOMEM; 1012 + } 1013 + 1014 + vpending = vsock_sk(pending); 1015 + 1016 + vsock_addr_init(&vpending->local_addr, pkt->dg.dst.context, 1017 + pkt->dst_port); 1018 + vsock_addr_init(&vpending->remote_addr, pkt->dg.src.context, 1019 + pkt->src_port); 1020 + 1021 + /* If the proposed size fits within our min/max, accept it. Otherwise 1022 + * propose our own size. 1023 + */ 1024 + if (pkt->u.size >= vmci_trans(vpending)->queue_pair_min_size && 1025 + pkt->u.size <= vmci_trans(vpending)->queue_pair_max_size) { 1026 + qp_size = pkt->u.size; 1027 + } else { 1028 + qp_size = vmci_trans(vpending)->queue_pair_size; 1029 + } 1030 + 1031 + /* Figure out if we are using old or new requests based on the 1032 + * overrides pkt types sent by our peer. 1033 + */ 1034 + if (vmci_transport_old_proto_override(&old_pkt_proto)) { 1035 + old_request = old_pkt_proto; 1036 + } else { 1037 + if (pkt->type == VMCI_TRANSPORT_PACKET_TYPE_REQUEST) 1038 + old_request = true; 1039 + else if (pkt->type == VMCI_TRANSPORT_PACKET_TYPE_REQUEST2) 1040 + old_request = false; 1041 + 1042 + } 1043 + 1044 + if (old_request) { 1045 + /* Handle a REQUEST (or override) */ 1046 + u16 version = VSOCK_PROTO_INVALID; 1047 + if (vmci_transport_proto_to_notify_struct( 1048 + pending, &version, true)) 1049 + err = vmci_transport_send_negotiate(pending, qp_size); 1050 + else 1051 + err = -EINVAL; 1052 + 1053 + } else { 1054 + /* Handle a REQUEST2 (or override) */ 1055 + int proto_int = pkt->proto; 1056 + int pos; 1057 + u16 active_proto_version = 0; 1058 + 1059 + /* The list of possible protocols is the intersection of all 1060 + * protocols the client supports ... 
plus all the protocols we 1061 + * support. 1062 + */ 1063 + proto_int &= vmci_transport_new_proto_supported_versions(); 1064 + 1065 + /* We choose the highest possible protocol version and use that 1066 + * one. 1067 + */ 1068 + pos = fls(proto_int); 1069 + if (pos) { 1070 + active_proto_version = (1 << (pos - 1)); 1071 + if (vmci_transport_proto_to_notify_struct( 1072 + pending, &active_proto_version, false)) 1073 + err = vmci_transport_send_negotiate2(pending, 1074 + qp_size, 1075 + active_proto_version); 1076 + else 1077 + err = -EINVAL; 1078 + 1079 + } else { 1080 + err = -EINVAL; 1081 + } 1082 + } 1083 + 1084 + if (err < 0) { 1085 + vmci_transport_send_reset(sk, pkt); 1086 + sock_put(pending); 1087 + err = vmci_transport_error_to_vsock_error(err); 1088 + goto out; 1089 + } 1090 + 1091 + vsock_add_pending(sk, pending); 1092 + sk->sk_ack_backlog++; 1093 + 1094 + pending->sk_state = SS_CONNECTING; 1095 + vmci_trans(vpending)->produce_size = 1096 + vmci_trans(vpending)->consume_size = qp_size; 1097 + vmci_trans(vpending)->queue_pair_size = qp_size; 1098 + 1099 + vmci_trans(vpending)->notify_ops->process_request(pending); 1100 + 1101 + /* We might never receive another message for this socket and it's not 1102 + * connected to any process, so we have to ensure it gets cleaned up 1103 + * ourself. Our delayed work function will take care of that. Note 1104 + * that we do not ever cancel this function since we have few 1105 + * guarantees about its state when calling cancel_delayed_work(). 1106 + * Instead we hold a reference on the socket for that function and make 1107 + * it capable of handling cases where it needs to do nothing but 1108 + * release that reference. 
1109 + */ 1110 + vpending->listener = sk; 1111 + sock_hold(sk); 1112 + sock_hold(pending); 1113 + INIT_DELAYED_WORK(&vpending->dwork, vsock_pending_work); 1114 + schedule_delayed_work(&vpending->dwork, HZ); 1115 + 1116 + out: 1117 + return err; 1118 + } 1119 + 1120 + static int 1121 + vmci_transport_recv_connecting_server(struct sock *listener, 1122 + struct sock *pending, 1123 + struct vmci_transport_packet *pkt) 1124 + { 1125 + struct vsock_sock *vpending; 1126 + struct vmci_handle handle; 1127 + struct vmci_qp *qpair; 1128 + bool is_local; 1129 + u32 flags; 1130 + u32 detach_sub_id; 1131 + int err; 1132 + int skerr; 1133 + 1134 + vpending = vsock_sk(pending); 1135 + detach_sub_id = VMCI_INVALID_ID; 1136 + 1137 + switch (pkt->type) { 1138 + case VMCI_TRANSPORT_PACKET_TYPE_OFFER: 1139 + if (vmci_handle_is_invalid(pkt->u.handle)) { 1140 + vmci_transport_send_reset(pending, pkt); 1141 + skerr = EPROTO; 1142 + err = -EINVAL; 1143 + goto destroy; 1144 + } 1145 + break; 1146 + default: 1147 + /* Close and cleanup the connection. */ 1148 + vmci_transport_send_reset(pending, pkt); 1149 + skerr = EPROTO; 1150 + err = pkt->type == VMCI_TRANSPORT_PACKET_TYPE_RST ? 0 : -EINVAL; 1151 + goto destroy; 1152 + } 1153 + 1154 + /* In order to complete the connection we need to attach to the offered 1155 + * queue pair and send an attach notification. We also subscribe to the 1156 + * detach event so we know when our peer goes away, and we do that 1157 + * before attaching so we don't miss an event. If all this succeeds, 1158 + * we update our state and wakeup anything waiting in accept() for a 1159 + * connection. 1160 + */ 1161 + 1162 + /* We don't care about attach since we ensure the other side has 1163 + * attached by specifying the ATTACH_ONLY flag below. 
1164 + */ 1165 + err = vmci_event_subscribe(VMCI_EVENT_QP_PEER_DETACH, 1166 + vmci_transport_peer_detach_cb, 1167 + pending, &detach_sub_id); 1168 + if (err < VMCI_SUCCESS) { 1169 + vmci_transport_send_reset(pending, pkt); 1170 + err = vmci_transport_error_to_vsock_error(err); 1171 + skerr = -err; 1172 + goto destroy; 1173 + } 1174 + 1175 + vmci_trans(vpending)->detach_sub_id = detach_sub_id; 1176 + 1177 + /* Now attach to the queue pair the client created. */ 1178 + handle = pkt->u.handle; 1179 + 1180 + /* vpending->local_addr always has a context id so we do not need to 1181 + * worry about VMADDR_CID_ANY in this case. 1182 + */ 1183 + is_local = 1184 + vpending->remote_addr.svm_cid == vpending->local_addr.svm_cid; 1185 + flags = VMCI_QPFLAG_ATTACH_ONLY; 1186 + flags |= is_local ? VMCI_QPFLAG_LOCAL : 0; 1187 + 1188 + err = vmci_transport_queue_pair_alloc( 1189 + &qpair, 1190 + &handle, 1191 + vmci_trans(vpending)->produce_size, 1192 + vmci_trans(vpending)->consume_size, 1193 + pkt->dg.src.context, 1194 + flags, 1195 + vmci_transport_is_trusted( 1196 + vpending, 1197 + vpending->remote_addr.svm_cid)); 1198 + if (err < 0) { 1199 + vmci_transport_send_reset(pending, pkt); 1200 + skerr = -err; 1201 + goto destroy; 1202 + } 1203 + 1204 + vmci_trans(vpending)->qp_handle = handle; 1205 + vmci_trans(vpending)->qpair = qpair; 1206 + 1207 + /* When we send the attach message, we must be ready to handle incoming 1208 + * control messages on the newly connected socket. So we move the 1209 + * pending socket to the connected state before sending the attach 1210 + * message. Otherwise, an incoming packet triggered by the attach being 1211 + * received by the peer may be processed concurrently with what happens 1212 + * below after sending the attach message, and that incoming packet 1213 + * will find the listening socket instead of the (currently) pending 1214 + * socket. 
Note that enqueueing the socket increments the reference 1215 + * count, so even if a reset comes before the connection is accepted, 1216 + * the socket will be valid until it is removed from the queue. 1217 + * 1218 + * If we fail sending the attach below, we remove the socket from the 1219 + * connected list and move the socket to SS_UNCONNECTED before 1220 + * releasing the lock, so a pending slow path processing of an incoming 1221 + * packet will not see the socket in the connected state in that case. 1222 + */ 1223 + pending->sk_state = SS_CONNECTED; 1224 + 1225 + vsock_insert_connected(vpending); 1226 + 1227 + /* Notify our peer of our attach. */ 1228 + err = vmci_transport_send_attach(pending, handle); 1229 + if (err < 0) { 1230 + vsock_remove_connected(vpending); 1231 + pr_err("Could not send attach\n"); 1232 + vmci_transport_send_reset(pending, pkt); 1233 + err = vmci_transport_error_to_vsock_error(err); 1234 + skerr = -err; 1235 + goto destroy; 1236 + } 1237 + 1238 + /* We have a connection. Move the now connected socket from the 1239 + * listener's pending list to the accept queue so callers of accept() 1240 + * can find it. 1241 + */ 1242 + vsock_remove_pending(listener, pending); 1243 + vsock_enqueue_accept(listener, pending); 1244 + 1245 + /* Callers of accept() will be waiting on the listening socket, not 1246 + * the pending socket. 1247 + */ 1248 + listener->sk_state_change(listener); 1249 + 1250 + return 0; 1251 + 1252 + destroy: 1253 + pending->sk_err = skerr; 1254 + pending->sk_state = SS_UNCONNECTED; 1255 + /* As long as we drop our reference, all necessary cleanup will happen 1256 + * when the cleanup function drops its reference and our destruct 1257 + * implementation is called. Note that since the listen handler will 1258 + * remove pending from the pending list upon our failure, the cleanup 1259 + * function won't drop the additional reference, which is why we do it 1260 + * here. 
1261 + */ 1262 + sock_put(pending); 1263 + 1264 + return err; 1265 + } 1266 + 1267 + static int 1268 + vmci_transport_recv_connecting_client(struct sock *sk, 1269 + struct vmci_transport_packet *pkt) 1270 + { 1271 + struct vsock_sock *vsk; 1272 + int err; 1273 + int skerr; 1274 + 1275 + vsk = vsock_sk(sk); 1276 + 1277 + switch (pkt->type) { 1278 + case VMCI_TRANSPORT_PACKET_TYPE_ATTACH: 1279 + if (vmci_handle_is_invalid(pkt->u.handle) || 1280 + !vmci_handle_is_equal(pkt->u.handle, 1281 + vmci_trans(vsk)->qp_handle)) { 1282 + skerr = EPROTO; 1283 + err = -EINVAL; 1284 + goto destroy; 1285 + } 1286 + 1287 + /* Signify the socket is connected and wakeup the waiter in 1288 + * connect(). Also place the socket in the connected table for 1289 + * accounting (it can already be found since it's in the bound 1290 + * table). 1291 + */ 1292 + sk->sk_state = SS_CONNECTED; 1293 + sk->sk_socket->state = SS_CONNECTED; 1294 + vsock_insert_connected(vsk); 1295 + sk->sk_state_change(sk); 1296 + 1297 + break; 1298 + case VMCI_TRANSPORT_PACKET_TYPE_NEGOTIATE: 1299 + case VMCI_TRANSPORT_PACKET_TYPE_NEGOTIATE2: 1300 + if (pkt->u.size == 0 1301 + || pkt->dg.src.context != vsk->remote_addr.svm_cid 1302 + || pkt->src_port != vsk->remote_addr.svm_port 1303 + || !vmci_handle_is_invalid(vmci_trans(vsk)->qp_handle) 1304 + || vmci_trans(vsk)->qpair 1305 + || vmci_trans(vsk)->produce_size != 0 1306 + || vmci_trans(vsk)->consume_size != 0 1307 + || vmci_trans(vsk)->attach_sub_id != VMCI_INVALID_ID 1308 + || vmci_trans(vsk)->detach_sub_id != VMCI_INVALID_ID) { 1309 + skerr = EPROTO; 1310 + err = -EINVAL; 1311 + 1312 + goto destroy; 1313 + } 1314 + 1315 + err = vmci_transport_recv_connecting_client_negotiate(sk, pkt); 1316 + if (err) { 1317 + skerr = -err; 1318 + goto destroy; 1319 + } 1320 + 1321 + break; 1322 + case VMCI_TRANSPORT_PACKET_TYPE_INVALID: 1323 + err = vmci_transport_recv_connecting_client_invalid(sk, pkt); 1324 + if (err) { 1325 + skerr = -err; 1326 + goto destroy; 1327 + } 1328 + 
1329 + break; 1330 + case VMCI_TRANSPORT_PACKET_TYPE_RST: 1331 + /* Older versions of the linux code (WS 6.5 / ESX 4.0) used to 1332 + * continue processing here after they sent an INVALID packet. 1333 + * This meant that we got a RST after the INVALID. We ignore a 1334 + * RST after an INVALID. The common code doesn't send the RST 1335 + * ... so we can hang if an old version of the common code 1336 + * fails between getting a REQUEST and sending an OFFER back. 1337 + * Not much we can do about it... except hope that it doesn't 1338 + * happen. 1339 + */ 1340 + if (vsk->ignore_connecting_rst) { 1341 + vsk->ignore_connecting_rst = false; 1342 + } else { 1343 + skerr = ECONNRESET; 1344 + err = 0; 1345 + goto destroy; 1346 + } 1347 + 1348 + break; 1349 + default: 1350 + /* Close and cleanup the connection. */ 1351 + skerr = EPROTO; 1352 + err = -EINVAL; 1353 + goto destroy; 1354 + } 1355 + 1356 + return 0; 1357 + 1358 + destroy: 1359 + vmci_transport_send_reset(sk, pkt); 1360 + 1361 + sk->sk_state = SS_UNCONNECTED; 1362 + sk->sk_err = skerr; 1363 + sk->sk_error_report(sk); 1364 + return err; 1365 + } 1366 + 1367 + static int vmci_transport_recv_connecting_client_negotiate( 1368 + struct sock *sk, 1369 + struct vmci_transport_packet *pkt) 1370 + { 1371 + int err; 1372 + struct vsock_sock *vsk; 1373 + struct vmci_handle handle; 1374 + struct vmci_qp *qpair; 1375 + u32 attach_sub_id; 1376 + u32 detach_sub_id; 1377 + bool is_local; 1378 + u32 flags; 1379 + bool old_proto = true; 1380 + bool old_pkt_proto; 1381 + u16 version; 1382 + 1383 + vsk = vsock_sk(sk); 1384 + handle = VMCI_INVALID_HANDLE; 1385 + attach_sub_id = VMCI_INVALID_ID; 1386 + detach_sub_id = VMCI_INVALID_ID; 1387 + 1388 + /* If we have gotten here then we should be past the point where old 1389 + * linux vsock could have sent the bogus rst. 
1390 + */ 1391 + vsk->sent_request = false; 1392 + vsk->ignore_connecting_rst = false; 1393 + 1394 + /* Verify that we're OK with the proposed queue pair size */ 1395 + if (pkt->u.size < vmci_trans(vsk)->queue_pair_min_size || 1396 + pkt->u.size > vmci_trans(vsk)->queue_pair_max_size) { 1397 + err = -EINVAL; 1398 + goto destroy; 1399 + } 1400 + 1401 + /* At this point we know the CID the peer is using to talk to us. */ 1402 + 1403 + if (vsk->local_addr.svm_cid == VMADDR_CID_ANY) 1404 + vsk->local_addr.svm_cid = pkt->dg.dst.context; 1405 + 1406 + /* Setup the notify ops to be the highest supported version that both 1407 + * the server and the client support. 1408 + */ 1409 + 1410 + if (vmci_transport_old_proto_override(&old_pkt_proto)) { 1411 + old_proto = old_pkt_proto; 1412 + } else { 1413 + if (pkt->type == VMCI_TRANSPORT_PACKET_TYPE_NEGOTIATE) 1414 + old_proto = true; 1415 + else if (pkt->type == VMCI_TRANSPORT_PACKET_TYPE_NEGOTIATE2) 1416 + old_proto = false; 1417 + 1418 + } 1419 + 1420 + if (old_proto) 1421 + version = VSOCK_PROTO_INVALID; 1422 + else 1423 + version = pkt->proto; 1424 + 1425 + if (!vmci_transport_proto_to_notify_struct(sk, &version, old_proto)) { 1426 + err = -EINVAL; 1427 + goto destroy; 1428 + } 1429 + 1430 + /* Subscribe to attach and detach events first. 1431 + * 1432 + * XXX We attach once for each queue pair created for now so it is easy 1433 + * to find the socket (it's provided), but later we should only 1434 + * subscribe once and add a way to lookup sockets by queue pair handle. 
1435 + */ 1436 + err = vmci_event_subscribe(VMCI_EVENT_QP_PEER_ATTACH, 1437 + vmci_transport_peer_attach_cb, 1438 + sk, &attach_sub_id); 1439 + if (err < VMCI_SUCCESS) { 1440 + err = vmci_transport_error_to_vsock_error(err); 1441 + goto destroy; 1442 + } 1443 + 1444 + err = vmci_event_subscribe(VMCI_EVENT_QP_PEER_DETACH, 1445 + vmci_transport_peer_detach_cb, 1446 + sk, &detach_sub_id); 1447 + if (err < VMCI_SUCCESS) { 1448 + err = vmci_transport_error_to_vsock_error(err); 1449 + goto destroy; 1450 + } 1451 + 1452 + /* Make VMCI select the handle for us. */ 1453 + handle = VMCI_INVALID_HANDLE; 1454 + is_local = vsk->remote_addr.svm_cid == vsk->local_addr.svm_cid; 1455 + flags = is_local ? VMCI_QPFLAG_LOCAL : 0; 1456 + 1457 + err = vmci_transport_queue_pair_alloc(&qpair, 1458 + &handle, 1459 + pkt->u.size, 1460 + pkt->u.size, 1461 + vsk->remote_addr.svm_cid, 1462 + flags, 1463 + vmci_transport_is_trusted( 1464 + vsk, 1465 + vsk-> 1466 + remote_addr.svm_cid)); 1467 + if (err < 0) 1468 + goto destroy; 1469 + 1470 + err = vmci_transport_send_qp_offer(sk, handle); 1471 + if (err < 0) { 1472 + err = vmci_transport_error_to_vsock_error(err); 1473 + goto destroy; 1474 + } 1475 + 1476 + vmci_trans(vsk)->qp_handle = handle; 1477 + vmci_trans(vsk)->qpair = qpair; 1478 + 1479 + vmci_trans(vsk)->produce_size = vmci_trans(vsk)->consume_size = 1480 + pkt->u.size; 1481 + 1482 + vmci_trans(vsk)->attach_sub_id = attach_sub_id; 1483 + vmci_trans(vsk)->detach_sub_id = detach_sub_id; 1484 + 1485 + vmci_trans(vsk)->notify_ops->process_negotiate(sk); 1486 + 1487 + return 0; 1488 + 1489 + destroy: 1490 + if (attach_sub_id != VMCI_INVALID_ID) 1491 + vmci_event_unsubscribe(attach_sub_id); 1492 + 1493 + if (detach_sub_id != VMCI_INVALID_ID) 1494 + vmci_event_unsubscribe(detach_sub_id); 1495 + 1496 + if (!vmci_handle_is_invalid(handle)) 1497 + vmci_qpair_detach(&qpair); 1498 + 1499 + return err; 1500 + } 1501 + 1502 + static int 1503 + vmci_transport_recv_connecting_client_invalid(struct sock 
*sk, 1504 + struct vmci_transport_packet *pkt) 1505 + { 1506 + int err = 0; 1507 + struct vsock_sock *vsk = vsock_sk(sk); 1508 + 1509 + if (vsk->sent_request) { 1510 + vsk->sent_request = false; 1511 + vsk->ignore_connecting_rst = true; 1512 + 1513 + err = vmci_transport_send_conn_request( 1514 + sk, vmci_trans(vsk)->queue_pair_size); 1515 + if (err < 0) 1516 + err = vmci_transport_error_to_vsock_error(err); 1517 + else 1518 + err = 0; 1519 + 1520 + } 1521 + 1522 + return err; 1523 + } 1524 + 1525 + static int vmci_transport_recv_connected(struct sock *sk, 1526 + struct vmci_transport_packet *pkt) 1527 + { 1528 + struct vsock_sock *vsk; 1529 + bool pkt_processed = false; 1530 + 1531 + /* In cases where we are closing the connection, it's sufficient to 1532 + * mark the state change (and maybe error) and wake up any waiting 1533 + * threads. Since this is a connected socket, it's owned by a user 1534 + * process and will be cleaned up when the failure is passed back on 1535 + * the current or next system call. Our system call implementations 1536 + * must therefore check for error and state changes on entry and when 1537 + * being awoken. 1538 + */ 1539 + switch (pkt->type) { 1540 + case VMCI_TRANSPORT_PACKET_TYPE_SHUTDOWN: 1541 + if (pkt->u.mode) { 1542 + vsk = vsock_sk(sk); 1543 + 1544 + vsk->peer_shutdown |= pkt->u.mode; 1545 + sk->sk_state_change(sk); 1546 + } 1547 + break; 1548 + 1549 + case VMCI_TRANSPORT_PACKET_TYPE_RST: 1550 + vsk = vsock_sk(sk); 1551 + /* It is possible that we sent our peer a message (e.g a 1552 + * WAITING_READ) right before we got notified that the peer had 1553 + * detached. If that happens then we can get a RST pkt back 1554 + * from our peer even though there is data available for us to 1555 + * read. In that case, don't shutdown the socket completely but 1556 + * instead allow the local client to finish reading data off 1557 + * the queuepair. Always treat a RST pkt in connected mode like 1558 + * a clean shutdown. 
1559 + */ 1560 + sock_set_flag(sk, SOCK_DONE); 1561 + vsk->peer_shutdown = SHUTDOWN_MASK; 1562 + if (vsock_stream_has_data(vsk) <= 0) 1563 + sk->sk_state = SS_DISCONNECTING; 1564 + 1565 + sk->sk_state_change(sk); 1566 + break; 1567 + 1568 + default: 1569 + vsk = vsock_sk(sk); 1570 + vmci_trans(vsk)->notify_ops->handle_notify_pkt( 1571 + sk, pkt, false, NULL, NULL, 1572 + &pkt_processed); 1573 + if (!pkt_processed) 1574 + return -EINVAL; 1575 + 1576 + break; 1577 + } 1578 + 1579 + return 0; 1580 + } 1581 + 1582 + static int vmci_transport_socket_init(struct vsock_sock *vsk, 1583 + struct vsock_sock *psk) 1584 + { 1585 + vsk->trans = kmalloc(sizeof(struct vmci_transport), GFP_KERNEL); 1586 + if (!vsk->trans) 1587 + return -ENOMEM; 1588 + 1589 + vmci_trans(vsk)->dg_handle = VMCI_INVALID_HANDLE; 1590 + vmci_trans(vsk)->qp_handle = VMCI_INVALID_HANDLE; 1591 + vmci_trans(vsk)->qpair = NULL; 1592 + vmci_trans(vsk)->produce_size = vmci_trans(vsk)->consume_size = 0; 1593 + vmci_trans(vsk)->attach_sub_id = vmci_trans(vsk)->detach_sub_id = 1594 + VMCI_INVALID_ID; 1595 + vmci_trans(vsk)->notify_ops = NULL; 1596 + if (psk) { 1597 + vmci_trans(vsk)->queue_pair_size = 1598 + vmci_trans(psk)->queue_pair_size; 1599 + vmci_trans(vsk)->queue_pair_min_size = 1600 + vmci_trans(psk)->queue_pair_min_size; 1601 + vmci_trans(vsk)->queue_pair_max_size = 1602 + vmci_trans(psk)->queue_pair_max_size; 1603 + } else { 1604 + vmci_trans(vsk)->queue_pair_size = 1605 + VMCI_TRANSPORT_DEFAULT_QP_SIZE; 1606 + vmci_trans(vsk)->queue_pair_min_size = 1607 + VMCI_TRANSPORT_DEFAULT_QP_SIZE_MIN; 1608 + vmci_trans(vsk)->queue_pair_max_size = 1609 + VMCI_TRANSPORT_DEFAULT_QP_SIZE_MAX; 1610 + } 1611 + 1612 + return 0; 1613 + } 1614 + 1615 + static void vmci_transport_destruct(struct vsock_sock *vsk) 1616 + { 1617 + if (vmci_trans(vsk)->attach_sub_id != VMCI_INVALID_ID) { 1618 + vmci_event_unsubscribe(vmci_trans(vsk)->attach_sub_id); 1619 + vmci_trans(vsk)->attach_sub_id = VMCI_INVALID_ID; 1620 + } 1621 + 1622 
+ if (vmci_trans(vsk)->detach_sub_id != VMCI_INVALID_ID) { 1623 + vmci_event_unsubscribe(vmci_trans(vsk)->detach_sub_id); 1624 + vmci_trans(vsk)->detach_sub_id = VMCI_INVALID_ID; 1625 + } 1626 + 1627 + if (!vmci_handle_is_invalid(vmci_trans(vsk)->qp_handle)) { 1628 + vmci_qpair_detach(&vmci_trans(vsk)->qpair); 1629 + vmci_trans(vsk)->qp_handle = VMCI_INVALID_HANDLE; 1630 + vmci_trans(vsk)->produce_size = 0; 1631 + vmci_trans(vsk)->consume_size = 0; 1632 + } 1633 + 1634 + if (vmci_trans(vsk)->notify_ops) 1635 + vmci_trans(vsk)->notify_ops->socket_destruct(vsk); 1636 + 1637 + kfree(vsk->trans); 1638 + vsk->trans = NULL; 1639 + } 1640 + 1641 + static void vmci_transport_release(struct vsock_sock *vsk) 1642 + { 1643 + if (!vmci_handle_is_invalid(vmci_trans(vsk)->dg_handle)) { 1644 + vmci_datagram_destroy_handle(vmci_trans(vsk)->dg_handle); 1645 + vmci_trans(vsk)->dg_handle = VMCI_INVALID_HANDLE; 1646 + } 1647 + } 1648 + 1649 + static int vmci_transport_dgram_bind(struct vsock_sock *vsk, 1650 + struct sockaddr_vm *addr) 1651 + { 1652 + u32 port; 1653 + u32 flags; 1654 + int err; 1655 + 1656 + /* VMCI will select a resource ID for us if we provide 1657 + * VMCI_INVALID_ID. 1658 + */ 1659 + port = addr->svm_port == VMADDR_PORT_ANY ? 1660 + VMCI_INVALID_ID : addr->svm_port; 1661 + 1662 + if (port <= LAST_RESERVED_PORT && !capable(CAP_NET_BIND_SERVICE)) 1663 + return -EACCES; 1664 + 1665 + flags = addr->svm_cid == VMADDR_CID_ANY ? 
1666 + VMCI_FLAG_ANYCID_DG_HND : 0; 1667 + 1668 + err = vmci_transport_datagram_create_hnd(port, flags, 1669 + vmci_transport_recv_dgram_cb, 1670 + &vsk->sk, 1671 + &vmci_trans(vsk)->dg_handle); 1672 + if (err < VMCI_SUCCESS) 1673 + return vmci_transport_error_to_vsock_error(err); 1674 + vsock_addr_init(&vsk->local_addr, addr->svm_cid, 1675 + vmci_trans(vsk)->dg_handle.resource); 1676 + 1677 + return 0; 1678 + } 1679 + 1680 + static int vmci_transport_dgram_enqueue( 1681 + struct vsock_sock *vsk, 1682 + struct sockaddr_vm *remote_addr, 1683 + struct iovec *iov, 1684 + size_t len) 1685 + { 1686 + int err; 1687 + struct vmci_datagram *dg; 1688 + 1689 + if (len > VMCI_MAX_DG_PAYLOAD_SIZE) 1690 + return -EMSGSIZE; 1691 + 1692 + if (!vmci_transport_allow_dgram(vsk, remote_addr->svm_cid)) 1693 + return -EPERM; 1694 + 1695 + /* Allocate a buffer for the user's message and our packet header. */ 1696 + dg = kmalloc(len + sizeof(*dg), GFP_KERNEL); 1697 + if (!dg) 1698 + return -ENOMEM; 1699 + 1700 + memcpy_fromiovec(VMCI_DG_PAYLOAD(dg), iov, len); 1701 + 1702 + dg->dst = vmci_make_handle(remote_addr->svm_cid, 1703 + remote_addr->svm_port); 1704 + dg->src = vmci_make_handle(vsk->local_addr.svm_cid, 1705 + vsk->local_addr.svm_port); 1706 + dg->payload_size = len; 1707 + 1708 + err = vmci_datagram_send(dg); 1709 + kfree(dg); 1710 + if (err < 0) 1711 + return vmci_transport_error_to_vsock_error(err); 1712 + 1713 + return err - sizeof(*dg); 1714 + } 1715 + 1716 + static int vmci_transport_dgram_dequeue(struct kiocb *kiocb, 1717 + struct vsock_sock *vsk, 1718 + struct msghdr *msg, size_t len, 1719 + int flags) 1720 + { 1721 + int err; 1722 + int noblock; 1723 + struct vmci_datagram *dg; 1724 + size_t payload_len; 1725 + struct sk_buff *skb; 1726 + 1727 + noblock = flags & MSG_DONTWAIT; 1728 + 1729 + if (flags & MSG_OOB || flags & MSG_ERRQUEUE) 1730 + return -EOPNOTSUPP; 1731 + 1732 + /* Retrieve the head sk_buff from the socket's receive queue. 
*/ 1733 + err = 0; 1734 + skb = skb_recv_datagram(&vsk->sk, flags, noblock, &err); 1735 + if (err) 1736 + return err; 1737 + 1738 + if (!skb) 1739 + return -EAGAIN; 1740 + 1741 + dg = (struct vmci_datagram *)skb->data; 1742 + if (!dg) 1743 + /* err is 0, meaning we read zero bytes. */ 1744 + goto out; 1745 + 1746 + payload_len = dg->payload_size; 1747 + /* Ensure the sk_buff matches the payload size claimed in the packet. */ 1748 + if (payload_len != skb->len - sizeof(*dg)) { 1749 + err = -EINVAL; 1750 + goto out; 1751 + } 1752 + 1753 + if (payload_len > len) { 1754 + payload_len = len; 1755 + msg->msg_flags |= MSG_TRUNC; 1756 + } 1757 + 1758 + /* Place the datagram payload in the user's iovec. */ 1759 + err = skb_copy_datagram_iovec(skb, sizeof(*dg), msg->msg_iov, 1760 + payload_len); 1761 + if (err) 1762 + goto out; 1763 + 1764 + msg->msg_namelen = 0; 1765 + if (msg->msg_name) { 1766 + struct sockaddr_vm *vm_addr; 1767 + 1768 + /* Provide the address of the sender. */ 1769 + vm_addr = (struct sockaddr_vm *)msg->msg_name; 1770 + vsock_addr_init(vm_addr, dg->src.context, dg->src.resource); 1771 + msg->msg_namelen = sizeof(*vm_addr); 1772 + } 1773 + err = payload_len; 1774 + 1775 + out: 1776 + skb_free_datagram(&vsk->sk, skb); 1777 + return err; 1778 + } 1779 + 1780 + static bool vmci_transport_dgram_allow(u32 cid, u32 port) 1781 + { 1782 + if (cid == VMADDR_CID_HYPERVISOR) { 1783 + /* Registrations of PBRPC Servers do not modify VMX/Hypervisor 1784 + * state and are allowed. 
1785 + */ 1786 + return port == VMCI_UNITY_PBRPC_REGISTER; 1787 + } 1788 + 1789 + return true; 1790 + } 1791 + 1792 + static int vmci_transport_connect(struct vsock_sock *vsk) 1793 + { 1794 + int err; 1795 + bool old_pkt_proto = false; 1796 + struct sock *sk = &vsk->sk; 1797 + 1798 + if (vmci_transport_old_proto_override(&old_pkt_proto) && 1799 + old_pkt_proto) { 1800 + err = vmci_transport_send_conn_request( 1801 + sk, vmci_trans(vsk)->queue_pair_size); 1802 + if (err < 0) { 1803 + sk->sk_state = SS_UNCONNECTED; 1804 + return err; 1805 + } 1806 + } else { 1807 + int supported_proto_versions = 1808 + vmci_transport_new_proto_supported_versions(); 1809 + err = vmci_transport_send_conn_request2( 1810 + sk, vmci_trans(vsk)->queue_pair_size, 1811 + supported_proto_versions); 1812 + if (err < 0) { 1813 + sk->sk_state = SS_UNCONNECTED; 1814 + return err; 1815 + } 1816 + 1817 + vsk->sent_request = true; 1818 + } 1819 + 1820 + return err; 1821 + } 1822 + 1823 + static ssize_t vmci_transport_stream_dequeue( 1824 + struct vsock_sock *vsk, 1825 + struct iovec *iov, 1826 + size_t len, 1827 + int flags) 1828 + { 1829 + if (flags & MSG_PEEK) 1830 + return vmci_qpair_peekv(vmci_trans(vsk)->qpair, iov, len, 0); 1831 + else 1832 + return vmci_qpair_dequev(vmci_trans(vsk)->qpair, iov, len, 0); 1833 + } 1834 + 1835 + static ssize_t vmci_transport_stream_enqueue( 1836 + struct vsock_sock *vsk, 1837 + struct iovec *iov, 1838 + size_t len) 1839 + { 1840 + return vmci_qpair_enquev(vmci_trans(vsk)->qpair, iov, len, 0); 1841 + } 1842 + 1843 + static s64 vmci_transport_stream_has_data(struct vsock_sock *vsk) 1844 + { 1845 + return vmci_qpair_consume_buf_ready(vmci_trans(vsk)->qpair); 1846 + } 1847 + 1848 + static s64 vmci_transport_stream_has_space(struct vsock_sock *vsk) 1849 + { 1850 + return vmci_qpair_produce_free_space(vmci_trans(vsk)->qpair); 1851 + } 1852 + 1853 + static u64 vmci_transport_stream_rcvhiwat(struct vsock_sock *vsk) 1854 + { 1855 + return vmci_trans(vsk)->consume_size; 
1856 + } 1857 + 1858 + static bool vmci_transport_stream_is_active(struct vsock_sock *vsk) 1859 + { 1860 + return !vmci_handle_is_invalid(vmci_trans(vsk)->qp_handle); 1861 + } 1862 + 1863 + static u64 vmci_transport_get_buffer_size(struct vsock_sock *vsk) 1864 + { 1865 + return vmci_trans(vsk)->queue_pair_size; 1866 + } 1867 + 1868 + static u64 vmci_transport_get_min_buffer_size(struct vsock_sock *vsk) 1869 + { 1870 + return vmci_trans(vsk)->queue_pair_min_size; 1871 + } 1872 + 1873 + static u64 vmci_transport_get_max_buffer_size(struct vsock_sock *vsk) 1874 + { 1875 + return vmci_trans(vsk)->queue_pair_max_size; 1876 + } 1877 + 1878 + static void vmci_transport_set_buffer_size(struct vsock_sock *vsk, u64 val) 1879 + { 1880 + if (val < vmci_trans(vsk)->queue_pair_min_size) 1881 + vmci_trans(vsk)->queue_pair_min_size = val; 1882 + if (val > vmci_trans(vsk)->queue_pair_max_size) 1883 + vmci_trans(vsk)->queue_pair_max_size = val; 1884 + vmci_trans(vsk)->queue_pair_size = val; 1885 + } 1886 + 1887 + static void vmci_transport_set_min_buffer_size(struct vsock_sock *vsk, 1888 + u64 val) 1889 + { 1890 + if (val > vmci_trans(vsk)->queue_pair_size) 1891 + vmci_trans(vsk)->queue_pair_size = val; 1892 + vmci_trans(vsk)->queue_pair_min_size = val; 1893 + } 1894 + 1895 + static void vmci_transport_set_max_buffer_size(struct vsock_sock *vsk, 1896 + u64 val) 1897 + { 1898 + if (val < vmci_trans(vsk)->queue_pair_size) 1899 + vmci_trans(vsk)->queue_pair_size = val; 1900 + vmci_trans(vsk)->queue_pair_max_size = val; 1901 + } 1902 + 1903 + static int vmci_transport_notify_poll_in( 1904 + struct vsock_sock *vsk, 1905 + size_t target, 1906 + bool *data_ready_now) 1907 + { 1908 + return vmci_trans(vsk)->notify_ops->poll_in( 1909 + &vsk->sk, target, data_ready_now); 1910 + } 1911 + 1912 + static int vmci_transport_notify_poll_out( 1913 + struct vsock_sock *vsk, 1914 + size_t target, 1915 + bool *space_available_now) 1916 + { 1917 + return vmci_trans(vsk)->notify_ops->poll_out( 1918 + 
&vsk->sk, target, space_available_now); 1919 + } 1920 + 1921 + static int vmci_transport_notify_recv_init( 1922 + struct vsock_sock *vsk, 1923 + size_t target, 1924 + struct vsock_transport_recv_notify_data *data) 1925 + { 1926 + return vmci_trans(vsk)->notify_ops->recv_init( 1927 + &vsk->sk, target, 1928 + (struct vmci_transport_recv_notify_data *)data); 1929 + } 1930 + 1931 + static int vmci_transport_notify_recv_pre_block( 1932 + struct vsock_sock *vsk, 1933 + size_t target, 1934 + struct vsock_transport_recv_notify_data *data) 1935 + { 1936 + return vmci_trans(vsk)->notify_ops->recv_pre_block( 1937 + &vsk->sk, target, 1938 + (struct vmci_transport_recv_notify_data *)data); 1939 + } 1940 + 1941 + static int vmci_transport_notify_recv_pre_dequeue( 1942 + struct vsock_sock *vsk, 1943 + size_t target, 1944 + struct vsock_transport_recv_notify_data *data) 1945 + { 1946 + return vmci_trans(vsk)->notify_ops->recv_pre_dequeue( 1947 + &vsk->sk, target, 1948 + (struct vmci_transport_recv_notify_data *)data); 1949 + } 1950 + 1951 + static int vmci_transport_notify_recv_post_dequeue( 1952 + struct vsock_sock *vsk, 1953 + size_t target, 1954 + ssize_t copied, 1955 + bool data_read, 1956 + struct vsock_transport_recv_notify_data *data) 1957 + { 1958 + return vmci_trans(vsk)->notify_ops->recv_post_dequeue( 1959 + &vsk->sk, target, copied, data_read, 1960 + (struct vmci_transport_recv_notify_data *)data); 1961 + } 1962 + 1963 + static int vmci_transport_notify_send_init( 1964 + struct vsock_sock *vsk, 1965 + struct vsock_transport_send_notify_data *data) 1966 + { 1967 + return vmci_trans(vsk)->notify_ops->send_init( 1968 + &vsk->sk, 1969 + (struct vmci_transport_send_notify_data *)data); 1970 + } 1971 + 1972 + static int vmci_transport_notify_send_pre_block( 1973 + struct vsock_sock *vsk, 1974 + struct vsock_transport_send_notify_data *data) 1975 + { 1976 + return vmci_trans(vsk)->notify_ops->send_pre_block( 1977 + &vsk->sk, 1978 + (struct vmci_transport_send_notify_data 
*)data); 1979 + } 1980 + 1981 + static int vmci_transport_notify_send_pre_enqueue( 1982 + struct vsock_sock *vsk, 1983 + struct vsock_transport_send_notify_data *data) 1984 + { 1985 + return vmci_trans(vsk)->notify_ops->send_pre_enqueue( 1986 + &vsk->sk, 1987 + (struct vmci_transport_send_notify_data *)data); 1988 + } 1989 + 1990 + static int vmci_transport_notify_send_post_enqueue( 1991 + struct vsock_sock *vsk, 1992 + ssize_t written, 1993 + struct vsock_transport_send_notify_data *data) 1994 + { 1995 + return vmci_trans(vsk)->notify_ops->send_post_enqueue( 1996 + &vsk->sk, written, 1997 + (struct vmci_transport_send_notify_data *)data); 1998 + } 1999 + 2000 + static bool vmci_transport_old_proto_override(bool *old_pkt_proto) 2001 + { 2002 + if (PROTOCOL_OVERRIDE != -1) { 2003 + if (PROTOCOL_OVERRIDE == 0) 2004 + *old_pkt_proto = true; 2005 + else 2006 + *old_pkt_proto = false; 2007 + 2008 + pr_info("Proto override in use\n"); 2009 + return true; 2010 + } 2011 + 2012 + return false; 2013 + } 2014 + 2015 + static bool vmci_transport_proto_to_notify_struct(struct sock *sk, 2016 + u16 *proto, 2017 + bool old_pkt_proto) 2018 + { 2019 + struct vsock_sock *vsk = vsock_sk(sk); 2020 + 2021 + if (old_pkt_proto) { 2022 + if (*proto != VSOCK_PROTO_INVALID) { 2023 + pr_err("Can't set both an old and new protocol\n"); 2024 + return false; 2025 + } 2026 + vmci_trans(vsk)->notify_ops = &vmci_transport_notify_pkt_ops; 2027 + goto exit; 2028 + } 2029 + 2030 + switch (*proto) { 2031 + case VSOCK_PROTO_PKT_ON_NOTIFY: 2032 + vmci_trans(vsk)->notify_ops = 2033 + &vmci_transport_notify_pkt_q_state_ops; 2034 + break; 2035 + default: 2036 + pr_err("Unknown notify protocol version\n"); 2037 + return false; 2038 + } 2039 + 2040 + exit: 2041 + vmci_trans(vsk)->notify_ops->socket_init(sk); 2042 + return true; 2043 + } 2044 + 2045 + static u16 vmci_transport_new_proto_supported_versions(void) 2046 + { 2047 + if (PROTOCOL_OVERRIDE != -1) 2048 + return PROTOCOL_OVERRIDE; 2049 + 2050 + return 
VSOCK_PROTO_ALL_SUPPORTED; 2051 + } 2052 + 2053 + static u32 vmci_transport_get_local_cid(void) 2054 + { 2055 + return vmci_get_context_id(); 2056 + } 2057 + 2058 + static struct vsock_transport vmci_transport = { 2059 + .init = vmci_transport_socket_init, 2060 + .destruct = vmci_transport_destruct, 2061 + .release = vmci_transport_release, 2062 + .connect = vmci_transport_connect, 2063 + .dgram_bind = vmci_transport_dgram_bind, 2064 + .dgram_dequeue = vmci_transport_dgram_dequeue, 2065 + .dgram_enqueue = vmci_transport_dgram_enqueue, 2066 + .dgram_allow = vmci_transport_dgram_allow, 2067 + .stream_dequeue = vmci_transport_stream_dequeue, 2068 + .stream_enqueue = vmci_transport_stream_enqueue, 2069 + .stream_has_data = vmci_transport_stream_has_data, 2070 + .stream_has_space = vmci_transport_stream_has_space, 2071 + .stream_rcvhiwat = vmci_transport_stream_rcvhiwat, 2072 + .stream_is_active = vmci_transport_stream_is_active, 2073 + .stream_allow = vmci_transport_stream_allow, 2074 + .notify_poll_in = vmci_transport_notify_poll_in, 2075 + .notify_poll_out = vmci_transport_notify_poll_out, 2076 + .notify_recv_init = vmci_transport_notify_recv_init, 2077 + .notify_recv_pre_block = vmci_transport_notify_recv_pre_block, 2078 + .notify_recv_pre_dequeue = vmci_transport_notify_recv_pre_dequeue, 2079 + .notify_recv_post_dequeue = vmci_transport_notify_recv_post_dequeue, 2080 + .notify_send_init = vmci_transport_notify_send_init, 2081 + .notify_send_pre_block = vmci_transport_notify_send_pre_block, 2082 + .notify_send_pre_enqueue = vmci_transport_notify_send_pre_enqueue, 2083 + .notify_send_post_enqueue = vmci_transport_notify_send_post_enqueue, 2084 + .shutdown = vmci_transport_shutdown, 2085 + .set_buffer_size = vmci_transport_set_buffer_size, 2086 + .set_min_buffer_size = vmci_transport_set_min_buffer_size, 2087 + .set_max_buffer_size = vmci_transport_set_max_buffer_size, 2088 + .get_buffer_size = vmci_transport_get_buffer_size, 2089 + .get_min_buffer_size = 
vmci_transport_get_min_buffer_size, 2090 + .get_max_buffer_size = vmci_transport_get_max_buffer_size, 2091 + .get_local_cid = vmci_transport_get_local_cid, 2092 + }; 2093 + 2094 + static int __init vmci_transport_init(void) 2095 + { 2096 + int err; 2097 + 2098 + /* Create the datagram handle that we will use to send and receive all 2099 + * VSocket control messages for this context. 2100 + */ 2101 + err = vmci_transport_datagram_create_hnd(VMCI_TRANSPORT_PACKET_RID, 2102 + VMCI_FLAG_ANYCID_DG_HND, 2103 + vmci_transport_recv_stream_cb, 2104 + NULL, 2105 + &vmci_transport_stream_handle); 2106 + if (err < VMCI_SUCCESS) { 2107 + pr_err("Unable to create datagram handle. (%d)\n", err); 2108 + return vmci_transport_error_to_vsock_error(err); 2109 + } 2110 + 2111 + err = vmci_event_subscribe(VMCI_EVENT_QP_RESUMED, 2112 + vmci_transport_qp_resumed_cb, 2113 + NULL, &vmci_transport_qp_resumed_sub_id); 2114 + if (err < VMCI_SUCCESS) { 2115 + pr_err("Unable to subscribe to resumed event. (%d)\n", err); 2116 + err = vmci_transport_error_to_vsock_error(err); 2117 + vmci_transport_qp_resumed_sub_id = VMCI_INVALID_ID; 2118 + goto err_destroy_stream_handle; 2119 + } 2120 + 2121 + err = vsock_core_init(&vmci_transport); 2122 + if (err < 0) 2123 + goto err_unsubscribe; 2124 + 2125 + return 0; 2126 + 2127 + err_unsubscribe: 2128 + vmci_event_unsubscribe(vmci_transport_qp_resumed_sub_id); 2129 + err_destroy_stream_handle: 2130 + vmci_datagram_destroy_handle(vmci_transport_stream_handle); 2131 + return err; 2132 + } 2133 + module_init(vmci_transport_init); 2134 + 2135 + static void __exit vmci_transport_exit(void) 2136 + { 2137 + if (!vmci_handle_is_invalid(vmci_transport_stream_handle)) { 2138 + if (vmci_datagram_destroy_handle( 2139 + vmci_transport_stream_handle) != VMCI_SUCCESS) 2140 + pr_err("Couldn't destroy datagram handle\n"); 2141 + vmci_transport_stream_handle = VMCI_INVALID_HANDLE; 2142 + } 2143 + 2144 + if (vmci_transport_qp_resumed_sub_id != VMCI_INVALID_ID) { 2145 + 
vmci_event_unsubscribe(vmci_transport_qp_resumed_sub_id); 2146 + vmci_transport_qp_resumed_sub_id = VMCI_INVALID_ID; 2147 + } 2148 + 2149 + vsock_core_exit(); 2150 + } 2151 + module_exit(vmci_transport_exit); 2152 + 2153 + MODULE_AUTHOR("VMware, Inc."); 2154 + MODULE_DESCRIPTION("VMCI transport for Virtual Sockets"); 2155 + MODULE_LICENSE("GPL v2"); 2156 + MODULE_ALIAS("vmware_vsock"); 2157 + MODULE_ALIAS_NETPROTO(PF_VSOCK);

+139

net/vmw_vsock/vmci_transport.h

··· 1 + /* 2 + * VMware vSockets Driver 3 + * 4 + * Copyright (C) 2013 VMware, Inc. All rights reserved. 5 + * 6 + * This program is free software; you can redistribute it and/or modify it 7 + * under the terms of the GNU General Public License as published by the Free 8 + * Software Foundation version 2 and no later version. 9 + * 10 + * This program is distributed in the hope that it will be useful, but WITHOUT 11 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 13 + * more details. 14 + */ 15 + 16 + #ifndef _VMCI_TRANSPORT_H_ 17 + #define _VMCI_TRANSPORT_H_ 18 + 19 + #include <linux/vmw_vmci_defs.h> 20 + #include <linux/vmw_vmci_api.h> 21 + 22 + #include "vsock_addr.h" 23 + #include "af_vsock.h" 24 + 25 + /* If the packet format changes in a release then this should change too. */ 26 + #define VMCI_TRANSPORT_PACKET_VERSION 1 27 + 28 + /* The resource ID on which control packets are sent. 
*/ 29 + #define VMCI_TRANSPORT_PACKET_RID 1 30 + 31 + #define VSOCK_PROTO_INVALID 0 32 + #define VSOCK_PROTO_PKT_ON_NOTIFY (1 << 0) 33 + #define VSOCK_PROTO_ALL_SUPPORTED (VSOCK_PROTO_PKT_ON_NOTIFY) 34 + 35 + #define vmci_trans(_vsk) ((struct vmci_transport *)((_vsk)->trans)) 36 + 37 + enum vmci_transport_packet_type { 38 + VMCI_TRANSPORT_PACKET_TYPE_INVALID = 0, 39 + VMCI_TRANSPORT_PACKET_TYPE_REQUEST, 40 + VMCI_TRANSPORT_PACKET_TYPE_NEGOTIATE, 41 + VMCI_TRANSPORT_PACKET_TYPE_OFFER, 42 + VMCI_TRANSPORT_PACKET_TYPE_ATTACH, 43 + VMCI_TRANSPORT_PACKET_TYPE_WROTE, 44 + VMCI_TRANSPORT_PACKET_TYPE_READ, 45 + VMCI_TRANSPORT_PACKET_TYPE_RST, 46 + VMCI_TRANSPORT_PACKET_TYPE_SHUTDOWN, 47 + VMCI_TRANSPORT_PACKET_TYPE_WAITING_WRITE, 48 + VMCI_TRANSPORT_PACKET_TYPE_WAITING_READ, 49 + VMCI_TRANSPORT_PACKET_TYPE_REQUEST2, 50 + VMCI_TRANSPORT_PACKET_TYPE_NEGOTIATE2, 51 + VMCI_TRANSPORT_PACKET_TYPE_MAX 52 + }; 53 + 54 + struct vmci_transport_waiting_info { 55 + u64 generation; 56 + u64 offset; 57 + }; 58 + 59 + /* Control packet type for STREAM sockets. DGRAMs have no control packets nor 60 + * special packet header for data packets, they are just raw VMCI DGRAM 61 + * messages. For STREAMs, control packets are sent over the control channel 62 + * while data is written and read directly from queue pairs with no packet 63 + * format. 
64 + */ 65 + struct vmci_transport_packet { 66 + struct vmci_datagram dg; 67 + u8 version; 68 + u8 type; 69 + u16 proto; 70 + u32 src_port; 71 + u32 dst_port; 72 + u32 _reserved2; 73 + union { 74 + u64 size; 75 + u64 mode; 76 + struct vmci_handle handle; 77 + struct vmci_transport_waiting_info wait; 78 + } u; 79 + }; 80 + 81 + struct vmci_transport_notify_pkt { 82 + u64 write_notify_window; 83 + u64 write_notify_min_window; 84 + bool peer_waiting_read; 85 + bool peer_waiting_write; 86 + bool peer_waiting_write_detected; 87 + bool sent_waiting_read; 88 + bool sent_waiting_write; 89 + struct vmci_transport_waiting_info peer_waiting_read_info; 90 + struct vmci_transport_waiting_info peer_waiting_write_info; 91 + u64 produce_q_generation; 92 + u64 consume_q_generation; 93 + }; 94 + 95 + struct vmci_transport_notify_pkt_q_state { 96 + u64 write_notify_window; 97 + u64 write_notify_min_window; 98 + bool peer_waiting_write; 99 + bool peer_waiting_write_detected; 100 + }; 101 + 102 + union vmci_transport_notify { 103 + struct vmci_transport_notify_pkt pkt; 104 + struct vmci_transport_notify_pkt_q_state pkt_q_state; 105 + }; 106 + 107 + /* Our transport-specific data. */ 108 + struct vmci_transport { 109 + /* For DGRAMs. */ 110 + struct vmci_handle dg_handle; 111 + /* For STREAMs. 
*/ 112 + struct vmci_handle qp_handle; 113 + struct vmci_qp *qpair; 114 + u64 produce_size; 115 + u64 consume_size; 116 + u64 queue_pair_size; 117 + u64 queue_pair_min_size; 118 + u64 queue_pair_max_size; 119 + u32 attach_sub_id; 120 + u32 detach_sub_id; 121 + union vmci_transport_notify notify; 122 + struct vmci_transport_notify_ops *notify_ops; 123 + }; 124 + 125 + int vmci_transport_register(void); 126 + void vmci_transport_unregister(void); 127 + 128 + int vmci_transport_send_wrote_bh(struct sockaddr_vm *dst, 129 + struct sockaddr_vm *src); 130 + int vmci_transport_send_read_bh(struct sockaddr_vm *dst, 131 + struct sockaddr_vm *src); 132 + int vmci_transport_send_wrote(struct sock *sk); 133 + int vmci_transport_send_read(struct sock *sk); 134 + int vmci_transport_send_waiting_write(struct sock *sk, 135 + struct vmci_transport_waiting_info *wait); 136 + int vmci_transport_send_waiting_read(struct sock *sk, 137 + struct vmci_transport_waiting_info *wait); 138 + 139 + #endif

+680

net/vmw_vsock/vmci_transport_notify.c

··· 1 + /* 2 + * VMware vSockets Driver 3 + * 4 + * Copyright (C) 2009-2013 VMware, Inc. All rights reserved. 5 + * 6 + * This program is free software; you can redistribute it and/or modify it 7 + * under the terms of the GNU General Public License as published by the Free 8 + * Software Foundation version 2 and no later version. 9 + * 10 + * This program is distributed in the hope that it will be useful, but WITHOUT 11 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 13 + * more details. 14 + */ 15 + 16 + #include <linux/types.h> 17 + #include <linux/socket.h> 18 + #include <linux/stddef.h> 19 + #include <net/sock.h> 20 + 21 + #include "vmci_transport_notify.h" 22 + 23 + #define PKT_FIELD(vsk, field_name) (vmci_trans(vsk)->notify.pkt.field_name) 24 + 25 + static bool vmci_transport_notify_waiting_write(struct vsock_sock *vsk) 26 + { 27 + #if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY) 28 + bool retval; 29 + u64 notify_limit; 30 + 31 + if (!PKT_FIELD(vsk, peer_waiting_write)) 32 + return false; 33 + 34 + #ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL 35 + /* When the sender blocks, we take that as a sign that the sender is 36 + * faster than the receiver. To reduce the transmit rate of the sender, 37 + * we delay the sending of the read notification by decreasing the 38 + * write_notify_window. The notification is delayed until the number of 39 + * bytes used in the queue drops below the write_notify_window. 
40 + */ 41 + 42 + if (!PKT_FIELD(vsk, peer_waiting_write_detected)) { 43 + PKT_FIELD(vsk, peer_waiting_write_detected) = true; 44 + if (PKT_FIELD(vsk, write_notify_window) < PAGE_SIZE) { 45 + PKT_FIELD(vsk, write_notify_window) = 46 + PKT_FIELD(vsk, write_notify_min_window); 47 + } else { 48 + PKT_FIELD(vsk, write_notify_window) -= PAGE_SIZE; 49 + if (PKT_FIELD(vsk, write_notify_window) < 50 + PKT_FIELD(vsk, write_notify_min_window)) 51 + PKT_FIELD(vsk, write_notify_window) = 52 + PKT_FIELD(vsk, write_notify_min_window); 53 + 54 + } 55 + } 56 + notify_limit = vmci_trans(vsk)->consume_size - 57 + PKT_FIELD(vsk, write_notify_window); 58 + #else 59 + notify_limit = 0; 60 + #endif 61 + 62 + /* For now we ignore the wait information and just see if the free 63 + * space exceeds the notify limit. Note that improving this function 64 + * to be more intelligent will not require a protocol change and will 65 + * retain compatibility between endpoints with mixed versions of this 66 + * function. 67 + * 68 + * The notify_limit is used to delay notifications in the case where 69 + * flow control is enabled. Below the test is expressed in terms of 70 + * free space in the queue: if free_space > ConsumeSize - 71 + * write_notify_window then notify An alternate way of expressing this 72 + * is to rewrite the expression to use the data ready in the receive 73 + * queue: if write_notify_window > bufferReady then notify as 74 + * free_space == ConsumeSize - bufferReady. 75 + */ 76 + retval = vmci_qpair_consume_free_space(vmci_trans(vsk)->qpair) > 77 + notify_limit; 78 + #ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL 79 + if (retval) { 80 + /* 81 + * Once we notify the peer, we reset the detected flag so the 82 + * next wait will again cause a decrease in the window size. 
83 + */ 84 + 85 + PKT_FIELD(vsk, peer_waiting_write_detected) = false; 86 + } 87 + #endif 88 + return retval; 89 + #else 90 + return true; 91 + #endif 92 + } 93 + 94 + static bool vmci_transport_notify_waiting_read(struct vsock_sock *vsk) 95 + { 96 + #if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY) 97 + if (!PKT_FIELD(vsk, peer_waiting_read)) 98 + return false; 99 + 100 + /* For now we ignore the wait information and just see if there is any 101 + * data for our peer to read. Note that improving this function to be 102 + * more intelligent will not require a protocol change and will retain 103 + * compatibility between endpoints with mixed versions of this 104 + * function. 105 + */ 106 + return vmci_qpair_produce_buf_ready(vmci_trans(vsk)->qpair) > 0; 107 + #else 108 + return true; 109 + #endif 110 + } 111 + 112 + static void 113 + vmci_transport_handle_waiting_read(struct sock *sk, 114 + struct vmci_transport_packet *pkt, 115 + bool bottom_half, 116 + struct sockaddr_vm *dst, 117 + struct sockaddr_vm *src) 118 + { 119 + #if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY) 120 + struct vsock_sock *vsk; 121 + 122 + vsk = vsock_sk(sk); 123 + 124 + PKT_FIELD(vsk, peer_waiting_read) = true; 125 + memcpy(&PKT_FIELD(vsk, peer_waiting_read_info), &pkt->u.wait, 126 + sizeof(PKT_FIELD(vsk, peer_waiting_read_info))); 127 + 128 + if (vmci_transport_notify_waiting_read(vsk)) { 129 + bool sent; 130 + 131 + if (bottom_half) 132 + sent = vmci_transport_send_wrote_bh(dst, src) > 0; 133 + else 134 + sent = vmci_transport_send_wrote(sk) > 0; 135 + 136 + if (sent) 137 + PKT_FIELD(vsk, peer_waiting_read) = false; 138 + } 139 + #endif 140 + } 141 + 142 + static void 143 + vmci_transport_handle_waiting_write(struct sock *sk, 144 + struct vmci_transport_packet *pkt, 145 + bool bottom_half, 146 + struct sockaddr_vm *dst, 147 + struct sockaddr_vm *src) 148 + { 149 + #if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY) 150 + struct vsock_sock *vsk; 151 + 152 + vsk = vsock_sk(sk); 153 + 154 + 
PKT_FIELD(vsk, peer_waiting_write) = true; 155 + memcpy(&PKT_FIELD(vsk, peer_waiting_write_info), &pkt->u.wait, 156 + sizeof(PKT_FIELD(vsk, peer_waiting_write_info))); 157 + 158 + if (vmci_transport_notify_waiting_write(vsk)) { 159 + bool sent; 160 + 161 + if (bottom_half) 162 + sent = vmci_transport_send_read_bh(dst, src) > 0; 163 + else 164 + sent = vmci_transport_send_read(sk) > 0; 165 + 166 + if (sent) 167 + PKT_FIELD(vsk, peer_waiting_write) = false; 168 + } 169 + #endif 170 + } 171 + 172 + static void 173 + vmci_transport_handle_read(struct sock *sk, 174 + struct vmci_transport_packet *pkt, 175 + bool bottom_half, 176 + struct sockaddr_vm *dst, struct sockaddr_vm *src) 177 + { 178 + #if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY) 179 + struct vsock_sock *vsk; 180 + 181 + vsk = vsock_sk(sk); 182 + PKT_FIELD(vsk, sent_waiting_write) = false; 183 + #endif 184 + 185 + sk->sk_write_space(sk); 186 + } 187 + 188 + static bool send_waiting_read(struct sock *sk, u64 room_needed) 189 + { 190 + #if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY) 191 + struct vsock_sock *vsk; 192 + struct vmci_transport_waiting_info waiting_info; 193 + u64 tail; 194 + u64 head; 195 + u64 room_left; 196 + bool ret; 197 + 198 + vsk = vsock_sk(sk); 199 + 200 + if (PKT_FIELD(vsk, sent_waiting_read)) 201 + return true; 202 + 203 + if (PKT_FIELD(vsk, write_notify_window) < 204 + vmci_trans(vsk)->consume_size) 205 + PKT_FIELD(vsk, write_notify_window) = 206 + min(PKT_FIELD(vsk, write_notify_window) + PAGE_SIZE, 207 + vmci_trans(vsk)->consume_size); 208 + 209 + vmci_qpair_get_consume_indexes(vmci_trans(vsk)->qpair, &tail, &head); 210 + room_left = vmci_trans(vsk)->consume_size - head; 211 + if (room_needed >= room_left) { 212 + waiting_info.offset = room_needed - room_left; 213 + waiting_info.generation = 214 + PKT_FIELD(vsk, consume_q_generation) + 1; 215 + } else { 216 + waiting_info.offset = head + room_needed; 217 + waiting_info.generation = PKT_FIELD(vsk, consume_q_generation); 218 + } 219 + 220 
+ ret = vmci_transport_send_waiting_read(sk, &waiting_info) > 0; 221 + if (ret) 222 + PKT_FIELD(vsk, sent_waiting_read) = true; 223 + 224 + return ret; 225 + #else 226 + return true; 227 + #endif 228 + } 229 + 230 + static bool send_waiting_write(struct sock *sk, u64 room_needed) 231 + { 232 + #if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY) 233 + struct vsock_sock *vsk; 234 + struct vmci_transport_waiting_info waiting_info; 235 + u64 tail; 236 + u64 head; 237 + u64 room_left; 238 + bool ret; 239 + 240 + vsk = vsock_sk(sk); 241 + 242 + if (PKT_FIELD(vsk, sent_waiting_write)) 243 + return true; 244 + 245 + vmci_qpair_get_produce_indexes(vmci_trans(vsk)->qpair, &tail, &head); 246 + room_left = vmci_trans(vsk)->produce_size - tail; 247 + if (room_needed + 1 >= room_left) { 248 + /* Wraps around to current generation. */ 249 + waiting_info.offset = room_needed + 1 - room_left; 250 + waiting_info.generation = PKT_FIELD(vsk, produce_q_generation); 251 + } else { 252 + waiting_info.offset = tail + room_needed + 1; 253 + waiting_info.generation = 254 + PKT_FIELD(vsk, produce_q_generation) - 1; 255 + } 256 + 257 + ret = vmci_transport_send_waiting_write(sk, &waiting_info) > 0; 258 + if (ret) 259 + PKT_FIELD(vsk, sent_waiting_write) = true; 260 + 261 + return ret; 262 + #else 263 + return true; 264 + #endif 265 + } 266 + 267 + static int vmci_transport_send_read_notification(struct sock *sk) 268 + { 269 + struct vsock_sock *vsk; 270 + bool sent_read; 271 + unsigned int retries; 272 + int err; 273 + 274 + vsk = vsock_sk(sk); 275 + sent_read = false; 276 + retries = 0; 277 + err = 0; 278 + 279 + if (vmci_transport_notify_waiting_write(vsk)) { 280 + /* Notify the peer that we have read, retrying the send on 281 + * failure up to our maximum value. XXX For now we just log 282 + * the failure, but later we should schedule a work item to 283 + * handle the resend until it succeeds. 
That would require 284 + * keeping track of work items in the vsk and cleaning them up 285 + * upon socket close. 286 + */ 287 + while (!(vsk->peer_shutdown & RCV_SHUTDOWN) && 288 + !sent_read && 289 + retries < VMCI_TRANSPORT_MAX_DGRAM_RESENDS) { 290 + err = vmci_transport_send_read(sk); 291 + if (err >= 0) 292 + sent_read = true; 293 + 294 + retries++; 295 + } 296 + 297 + if (retries >= VMCI_TRANSPORT_MAX_DGRAM_RESENDS) 298 + pr_err("%p unable to send read notify to peer\n", sk); 299 + else 300 + #if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY) 301 + PKT_FIELD(vsk, peer_waiting_write) = false; 302 + #endif 303 + 304 + } 305 + return err; 306 + } 307 + 308 + static void 309 + vmci_transport_handle_wrote(struct sock *sk, 310 + struct vmci_transport_packet *pkt, 311 + bool bottom_half, 312 + struct sockaddr_vm *dst, struct sockaddr_vm *src) 313 + { 314 + #if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY) 315 + struct vsock_sock *vsk = vsock_sk(sk); 316 + PKT_FIELD(vsk, sent_waiting_read) = false; 317 + #endif 318 + sk->sk_data_ready(sk, 0); 319 + } 320 + 321 + static void vmci_transport_notify_pkt_socket_init(struct sock *sk) 322 + { 323 + struct vsock_sock *vsk = vsock_sk(sk); 324 + 325 + PKT_FIELD(vsk, write_notify_window) = PAGE_SIZE; 326 + PKT_FIELD(vsk, write_notify_min_window) = PAGE_SIZE; 327 + PKT_FIELD(vsk, peer_waiting_read) = false; 328 + PKT_FIELD(vsk, peer_waiting_write) = false; 329 + PKT_FIELD(vsk, peer_waiting_write_detected) = false; 330 + PKT_FIELD(vsk, sent_waiting_read) = false; 331 + PKT_FIELD(vsk, sent_waiting_write) = false; 332 + PKT_FIELD(vsk, produce_q_generation) = 0; 333 + PKT_FIELD(vsk, consume_q_generation) = 0; 334 + 335 + memset(&PKT_FIELD(vsk, peer_waiting_read_info), 0, 336 + sizeof(PKT_FIELD(vsk, peer_waiting_read_info))); 337 + memset(&PKT_FIELD(vsk, peer_waiting_write_info), 0, 338 + sizeof(PKT_FIELD(vsk, peer_waiting_write_info))); 339 + } 340 + 341 + static void vmci_transport_notify_pkt_socket_destruct(struct vsock_sock *vsk) 342 + 
{ 343 + } 344 + 345 + static int 346 + vmci_transport_notify_pkt_poll_in(struct sock *sk, 347 + size_t target, bool *data_ready_now) 348 + { 349 + struct vsock_sock *vsk = vsock_sk(sk); 350 + 351 + if (vsock_stream_has_data(vsk)) { 352 + *data_ready_now = true; 353 + } else { 354 + /* We can't read right now because there is nothing in the 355 + * queue. Ask for notifications when there is something to 356 + * read. 357 + */ 358 + if (sk->sk_state == SS_CONNECTED) { 359 + if (!send_waiting_read(sk, 1)) 360 + return -1; 361 + 362 + } 363 + *data_ready_now = false; 364 + } 365 + 366 + return 0; 367 + } 368 + 369 + static int 370 + vmci_transport_notify_pkt_poll_out(struct sock *sk, 371 + size_t target, bool *space_avail_now) 372 + { 373 + s64 produce_q_free_space; 374 + struct vsock_sock *vsk = vsock_sk(sk); 375 + 376 + produce_q_free_space = vsock_stream_has_space(vsk); 377 + if (produce_q_free_space > 0) { 378 + *space_avail_now = true; 379 + return 0; 380 + } else if (produce_q_free_space == 0) { 381 + /* This is a connected socket but we can't currently send data. 382 + * Notify the peer that we are waiting if the queue is full. We 383 + * only send a waiting write if the queue is full because 384 + * otherwise we end up in an infinite WAITING_WRITE, READ, 385 + * WAITING_WRITE, READ, etc. loop. Treat failing to send the 386 + * notification as a socket error, passing that back through 387 + * the mask. 
388 + */ 389 + if (!send_waiting_write(sk, 1)) 390 + return -1; 391 + 392 + *space_avail_now = false; 393 + } 394 + 395 + return 0; 396 + } 397 + 398 + static int 399 + vmci_transport_notify_pkt_recv_init( 400 + struct sock *sk, 401 + size_t target, 402 + struct vmci_transport_recv_notify_data *data) 403 + { 404 + struct vsock_sock *vsk = vsock_sk(sk); 405 + 406 + #ifdef VSOCK_OPTIMIZATION_WAITING_NOTIFY 407 + data->consume_head = 0; 408 + data->produce_tail = 0; 409 + #ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL 410 + data->notify_on_block = false; 411 + 412 + if (PKT_FIELD(vsk, write_notify_min_window) < target + 1) { 413 + PKT_FIELD(vsk, write_notify_min_window) = target + 1; 414 + if (PKT_FIELD(vsk, write_notify_window) < 415 + PKT_FIELD(vsk, write_notify_min_window)) { 416 + /* If the current window is smaller than the new 417 + * minimal window size, we need to reevaluate whether 418 + * we need to notify the sender. If the number of ready 419 + * bytes are smaller than the new window, we need to 420 + * send a notification to the sender before we block. 421 + */ 422 + 423 + PKT_FIELD(vsk, write_notify_window) = 424 + PKT_FIELD(vsk, write_notify_min_window); 425 + data->notify_on_block = true; 426 + } 427 + } 428 + #endif 429 + #endif 430 + 431 + return 0; 432 + } 433 + 434 + static int 435 + vmci_transport_notify_pkt_recv_pre_block( 436 + struct sock *sk, 437 + size_t target, 438 + struct vmci_transport_recv_notify_data *data) 439 + { 440 + int err = 0; 441 + 442 + /* Notify our peer that we are waiting for data to read. 
*/ 443 + if (!send_waiting_read(sk, target)) { 444 + err = -EHOSTUNREACH; 445 + return err; 446 + } 447 + #ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL 448 + if (data->notify_on_block) { 449 + err = vmci_transport_send_read_notification(sk); 450 + if (err < 0) 451 + return err; 452 + 453 + data->notify_on_block = false; 454 + } 455 + #endif 456 + 457 + return err; 458 + } 459 + 460 + static int 461 + vmci_transport_notify_pkt_recv_pre_dequeue( 462 + struct sock *sk, 463 + size_t target, 464 + struct vmci_transport_recv_notify_data *data) 465 + { 466 + struct vsock_sock *vsk = vsock_sk(sk); 467 + 468 + /* Now consume up to len bytes from the queue. Note that since we have 469 + * the socket locked we should copy at least ready bytes. 470 + */ 471 + #if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY) 472 + vmci_qpair_get_consume_indexes(vmci_trans(vsk)->qpair, 473 + &data->produce_tail, 474 + &data->consume_head); 475 + #endif 476 + 477 + return 0; 478 + } 479 + 480 + static int 481 + vmci_transport_notify_pkt_recv_post_dequeue( 482 + struct sock *sk, 483 + size_t target, 484 + ssize_t copied, 485 + bool data_read, 486 + struct vmci_transport_recv_notify_data *data) 487 + { 488 + struct vsock_sock *vsk; 489 + int err; 490 + 491 + vsk = vsock_sk(sk); 492 + err = 0; 493 + 494 + if (data_read) { 495 + #if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY) 496 + /* Detect a wrap-around to maintain queue generation. Note 497 + * that this is safe since we hold the socket lock across the 498 + * two queue pair operations. 
499 + */ 500 + if (copied >= 501 + vmci_trans(vsk)->consume_size - data->consume_head) 502 + PKT_FIELD(vsk, consume_q_generation)++; 503 + #endif 504 + 505 + err = vmci_transport_send_read_notification(sk); 506 + if (err < 0) 507 + return err; 508 + 509 + } 510 + return err; 511 + } 512 + 513 + static int 514 + vmci_transport_notify_pkt_send_init( 515 + struct sock *sk, 516 + struct vmci_transport_send_notify_data *data) 517 + { 518 + #ifdef VSOCK_OPTIMIZATION_WAITING_NOTIFY 519 + data->consume_head = 0; 520 + data->produce_tail = 0; 521 + #endif 522 + 523 + return 0; 524 + } 525 + 526 + static int 527 + vmci_transport_notify_pkt_send_pre_block( 528 + struct sock *sk, 529 + struct vmci_transport_send_notify_data *data) 530 + { 531 + /* Notify our peer that we are waiting for room to write. */ 532 + if (!send_waiting_write(sk, 1)) 533 + return -EHOSTUNREACH; 534 + 535 + return 0; 536 + } 537 + 538 + static int 539 + vmci_transport_notify_pkt_send_pre_enqueue( 540 + struct sock *sk, 541 + struct vmci_transport_send_notify_data *data) 542 + { 543 + struct vsock_sock *vsk = vsock_sk(sk); 544 + 545 + #if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY) 546 + vmci_qpair_get_produce_indexes(vmci_trans(vsk)->qpair, 547 + &data->produce_tail, 548 + &data->consume_head); 549 + #endif 550 + 551 + return 0; 552 + } 553 + 554 + static int 555 + vmci_transport_notify_pkt_send_post_enqueue( 556 + struct sock *sk, 557 + ssize_t written, 558 + struct vmci_transport_send_notify_data *data) 559 + { 560 + int err = 0; 561 + struct vsock_sock *vsk; 562 + bool sent_wrote = false; 563 + int retries = 0; 564 + 565 + vsk = vsock_sk(sk); 566 + 567 + #if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY) 568 + /* Detect a wrap-around to maintain queue generation. Note that this 569 + * is safe since we hold the socket lock across the two queue pair 570 + * operations. 
571 + */ 572 + if (written >= vmci_trans(vsk)->produce_size - data->produce_tail) 573 + PKT_FIELD(vsk, produce_q_generation)++; 574 + 575 + #endif 576 + 577 + if (vmci_transport_notify_waiting_read(vsk)) { 578 + /* Notify the peer that we have written, retrying the send on 579 + * failure up to our maximum value. See the XXX comment for the 580 + * corresponding piece of code in StreamRecvmsg() for potential 581 + * improvements. 582 + */ 583 + while (!(vsk->peer_shutdown & RCV_SHUTDOWN) && 584 + !sent_wrote && 585 + retries < VMCI_TRANSPORT_MAX_DGRAM_RESENDS) { 586 + err = vmci_transport_send_wrote(sk); 587 + if (err >= 0) 588 + sent_wrote = true; 589 + 590 + retries++; 591 + } 592 + 593 + if (retries >= VMCI_TRANSPORT_MAX_DGRAM_RESENDS) { 594 + pr_err("%p unable to send wrote notify to peer\n", sk); 595 + return err; 596 + } else { 597 + #if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY) 598 + PKT_FIELD(vsk, peer_waiting_read) = false; 599 + #endif 600 + } 601 + } 602 + return err; 603 + } 604 + 605 + static void 606 + vmci_transport_notify_pkt_handle_pkt( 607 + struct sock *sk, 608 + struct vmci_transport_packet *pkt, 609 + bool bottom_half, 610 + struct sockaddr_vm *dst, 611 + struct sockaddr_vm *src, bool *pkt_processed) 612 + { 613 + bool processed = false; 614 + 615 + switch (pkt->type) { 616 + case VMCI_TRANSPORT_PACKET_TYPE_WROTE: 617 + vmci_transport_handle_wrote(sk, pkt, bottom_half, dst, src); 618 + processed = true; 619 + break; 620 + case VMCI_TRANSPORT_PACKET_TYPE_READ: 621 + vmci_transport_handle_read(sk, pkt, bottom_half, dst, src); 622 + processed = true; 623 + break; 624 + case VMCI_TRANSPORT_PACKET_TYPE_WAITING_WRITE: 625 + vmci_transport_handle_waiting_write(sk, pkt, bottom_half, 626 + dst, src); 627 + processed = true; 628 + break; 629 + 630 + case VMCI_TRANSPORT_PACKET_TYPE_WAITING_READ: 631 + vmci_transport_handle_waiting_read(sk, pkt, bottom_half, 632 + dst, src); 633 + processed = true; 634 + break; 635 + } 636 + 637 + if (pkt_processed) 638 + 
*pkt_processed = processed; 639 + } 640 + 641 + static void vmci_transport_notify_pkt_process_request(struct sock *sk) 642 + { 643 + struct vsock_sock *vsk = vsock_sk(sk); 644 + 645 + PKT_FIELD(vsk, write_notify_window) = vmci_trans(vsk)->consume_size; 646 + if (vmci_trans(vsk)->consume_size < 647 + PKT_FIELD(vsk, write_notify_min_window)) 648 + PKT_FIELD(vsk, write_notify_min_window) = 649 + vmci_trans(vsk)->consume_size; 650 + } 651 + 652 + static void vmci_transport_notify_pkt_process_negotiate(struct sock *sk) 653 + { 654 + struct vsock_sock *vsk = vsock_sk(sk); 655 + 656 + PKT_FIELD(vsk, write_notify_window) = vmci_trans(vsk)->consume_size; 657 + if (vmci_trans(vsk)->consume_size < 658 + PKT_FIELD(vsk, write_notify_min_window)) 659 + PKT_FIELD(vsk, write_notify_min_window) = 660 + vmci_trans(vsk)->consume_size; 661 + } 662 + 663 + /* Socket control packet based operations. */ 664 + struct vmci_transport_notify_ops vmci_transport_notify_pkt_ops = { 665 + vmci_transport_notify_pkt_socket_init, 666 + vmci_transport_notify_pkt_socket_destruct, 667 + vmci_transport_notify_pkt_poll_in, 668 + vmci_transport_notify_pkt_poll_out, 669 + vmci_transport_notify_pkt_handle_pkt, 670 + vmci_transport_notify_pkt_recv_init, 671 + vmci_transport_notify_pkt_recv_pre_block, 672 + vmci_transport_notify_pkt_recv_pre_dequeue, 673 + vmci_transport_notify_pkt_recv_post_dequeue, 674 + vmci_transport_notify_pkt_send_init, 675 + vmci_transport_notify_pkt_send_pre_block, 676 + vmci_transport_notify_pkt_send_pre_enqueue, 677 + vmci_transport_notify_pkt_send_post_enqueue, 678 + vmci_transport_notify_pkt_process_request, 679 + vmci_transport_notify_pkt_process_negotiate, 680 + };

+83

net/vmw_vsock/vmci_transport_notify.h

··· 1 + /* 2 + * VMware vSockets Driver 3 + * 4 + * Copyright (C) 2009-2013 VMware, Inc. All rights reserved. 5 + * 6 + * This program is free software; you can redistribute it and/or modify it 7 + * under the terms of the GNU General Public License as published by the Free 8 + * Software Foundation version 2 and no later version. 9 + * 10 + * This program is distributed in the hope that it will be useful, but WITHOUT 11 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 13 + * more details. 14 + */ 15 + 16 + #ifndef __VMCI_TRANSPORT_NOTIFY_H__ 17 + #define __VMCI_TRANSPORT_NOTIFY_H__ 18 + 19 + #include <linux/types.h> 20 + #include <linux/vmw_vmci_defs.h> 21 + #include <linux/vmw_vmci_api.h> 22 + #include <linux/vm_sockets.h> 23 + 24 + #include "vmci_transport.h" 25 + 26 + /* Comment this out to compare with old protocol. */ 27 + #define VSOCK_OPTIMIZATION_WAITING_NOTIFY 1 28 + #if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY) 29 + /* Comment this out to remove flow control for "new" protocol */ 30 + #define VSOCK_OPTIMIZATION_FLOW_CONTROL 1 31 + #endif 32 + 33 + #define VMCI_TRANSPORT_MAX_DGRAM_RESENDS 10 34 + 35 + struct vmci_transport_recv_notify_data { 36 + u64 consume_head; 37 + u64 produce_tail; 38 + bool notify_on_block; 39 + }; 40 + 41 + struct vmci_transport_send_notify_data { 42 + u64 consume_head; 43 + u64 produce_tail; 44 + }; 45 + 46 + /* Socket notification callbacks. 
*/ 47 + struct vmci_transport_notify_ops { 48 + void (*socket_init) (struct sock *sk); 49 + void (*socket_destruct) (struct vsock_sock *vsk); 50 + int (*poll_in) (struct sock *sk, size_t target, 51 + bool *data_ready_now); 52 + int (*poll_out) (struct sock *sk, size_t target, 53 + bool *space_avail_now); 54 + void (*handle_notify_pkt) (struct sock *sk, 55 + struct vmci_transport_packet *pkt, 56 + bool bottom_half, struct sockaddr_vm *dst, 57 + struct sockaddr_vm *src, 58 + bool *pkt_processed); 59 + int (*recv_init) (struct sock *sk, size_t target, 60 + struct vmci_transport_recv_notify_data *data); 61 + int (*recv_pre_block) (struct sock *sk, size_t target, 62 + struct vmci_transport_recv_notify_data *data); 63 + int (*recv_pre_dequeue) (struct sock *sk, size_t target, 64 + struct vmci_transport_recv_notify_data *data); 65 + int (*recv_post_dequeue) (struct sock *sk, size_t target, 66 + ssize_t copied, bool data_read, 67 + struct vmci_transport_recv_notify_data *data); 68 + int (*send_init) (struct sock *sk, 69 + struct vmci_transport_send_notify_data *data); 70 + int (*send_pre_block) (struct sock *sk, 71 + struct vmci_transport_send_notify_data *data); 72 + int (*send_pre_enqueue) (struct sock *sk, 73 + struct vmci_transport_send_notify_data *data); 74 + int (*send_post_enqueue) (struct sock *sk, ssize_t written, 75 + struct vmci_transport_send_notify_data *data); 76 + void (*process_request) (struct sock *sk); 77 + void (*process_negotiate) (struct sock *sk); 78 + }; 79 + 80 + extern struct vmci_transport_notify_ops vmci_transport_notify_pkt_ops; 81 + extern struct vmci_transport_notify_ops vmci_transport_notify_pkt_q_state_ops; 82 + 83 + #endif /* __VMCI_TRANSPORT_NOTIFY_H__ */

+438

net/vmw_vsock/vmci_transport_notify_qstate.c

··· 1 + /* 2 + * VMware vSockets Driver 3 + * 4 + * Copyright (C) 2009-2013 VMware, Inc. All rights reserved. 5 + * 6 + * This program is free software; you can redistribute it and/or modify it 7 + * under the terms of the GNU General Public License as published by the Free 8 + * Software Foundation version 2 and no later version. 9 + * 10 + * This program is distributed in the hope that it will be useful, but WITHOUT 11 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 13 + * more details. 14 + */ 15 + 16 + #include <linux/types.h> 17 + #include <linux/socket.h> 18 + #include <linux/stddef.h> 19 + #include <net/sock.h> 20 + 21 + #include "vmci_transport_notify.h" 22 + 23 + #define PKT_FIELD(vsk, field_name) \ 24 + (vmci_trans(vsk)->notify.pkt_q_state.field_name) 25 + 26 + static bool vmci_transport_notify_waiting_write(struct vsock_sock *vsk) 27 + { 28 + bool retval; 29 + u64 notify_limit; 30 + 31 + if (!PKT_FIELD(vsk, peer_waiting_write)) 32 + return false; 33 + 34 + /* When the sender blocks, we take that as a sign that the sender is 35 + * faster than the receiver. To reduce the transmit rate of the sender, 36 + * we delay the sending of the read notification by decreasing the 37 + * write_notify_window. The notification is delayed until the number of 38 + * bytes used in the queue drops below the write_notify_window. 
39 + */ 40 + 41 + if (!PKT_FIELD(vsk, peer_waiting_write_detected)) { 42 + PKT_FIELD(vsk, peer_waiting_write_detected) = true; 43 + if (PKT_FIELD(vsk, write_notify_window) < PAGE_SIZE) { 44 + PKT_FIELD(vsk, write_notify_window) = 45 + PKT_FIELD(vsk, write_notify_min_window); 46 + } else { 47 + PKT_FIELD(vsk, write_notify_window) -= PAGE_SIZE; 48 + if (PKT_FIELD(vsk, write_notify_window) < 49 + PKT_FIELD(vsk, write_notify_min_window)) 50 + PKT_FIELD(vsk, write_notify_window) = 51 + PKT_FIELD(vsk, write_notify_min_window); 52 + 53 + } 54 + } 55 + notify_limit = vmci_trans(vsk)->consume_size - 56 + PKT_FIELD(vsk, write_notify_window); 57 + 58 + /* The notify_limit is used to delay notifications in the case where 59 + * flow control is enabled. Below the test is expressed in terms of 60 + * free space in the queue: if free_space > ConsumeSize - 61 + * write_notify_window then notify An alternate way of expressing this 62 + * is to rewrite the expression to use the data ready in the receive 63 + * queue: if write_notify_window > bufferReady then notify as 64 + * free_space == ConsumeSize - bufferReady. 65 + */ 66 + 67 + retval = vmci_qpair_consume_free_space(vmci_trans(vsk)->qpair) > 68 + notify_limit; 69 + 70 + if (retval) { 71 + /* Once we notify the peer, we reset the detected flag so the 72 + * next wait will again cause a decrease in the window size. 
73 + */ 74 + 75 + PKT_FIELD(vsk, peer_waiting_write_detected) = false; 76 + } 77 + return retval; 78 + } 79 + 80 + static void 81 + vmci_transport_handle_read(struct sock *sk, 82 + struct vmci_transport_packet *pkt, 83 + bool bottom_half, 84 + struct sockaddr_vm *dst, struct sockaddr_vm *src) 85 + { 86 + sk->sk_write_space(sk); 87 + } 88 + 89 + static void 90 + vmci_transport_handle_wrote(struct sock *sk, 91 + struct vmci_transport_packet *pkt, 92 + bool bottom_half, 93 + struct sockaddr_vm *dst, struct sockaddr_vm *src) 94 + { 95 + sk->sk_data_ready(sk, 0); 96 + } 97 + 98 + static void vsock_block_update_write_window(struct sock *sk) 99 + { 100 + struct vsock_sock *vsk = vsock_sk(sk); 101 + 102 + if (PKT_FIELD(vsk, write_notify_window) < vmci_trans(vsk)->consume_size) 103 + PKT_FIELD(vsk, write_notify_window) = 104 + min(PKT_FIELD(vsk, write_notify_window) + PAGE_SIZE, 105 + vmci_trans(vsk)->consume_size); 106 + } 107 + 108 + static int vmci_transport_send_read_notification(struct sock *sk) 109 + { 110 + struct vsock_sock *vsk; 111 + bool sent_read; 112 + unsigned int retries; 113 + int err; 114 + 115 + vsk = vsock_sk(sk); 116 + sent_read = false; 117 + retries = 0; 118 + err = 0; 119 + 120 + if (vmci_transport_notify_waiting_write(vsk)) { 121 + /* Notify the peer that we have read, retrying the send on 122 + * failure up to our maximum value. XXX For now we just log 123 + * the failure, but later we should schedule a work item to 124 + * handle the resend until it succeeds. That would require 125 + * keeping track of work items in the vsk and cleaning them up 126 + * upon socket close. 
127 + */ 128 + while (!(vsk->peer_shutdown & RCV_SHUTDOWN) && 129 + !sent_read && 130 + retries < VMCI_TRANSPORT_MAX_DGRAM_RESENDS) { 131 + err = vmci_transport_send_read(sk); 132 + if (err >= 0) 133 + sent_read = true; 134 + 135 + retries++; 136 + } 137 + 138 + if (retries >= VMCI_TRANSPORT_MAX_DGRAM_RESENDS && !sent_read) 139 + pr_err("%p unable to send read notification to peer\n", 140 + sk); 141 + else 142 + PKT_FIELD(vsk, peer_waiting_write) = false; 143 + 144 + } 145 + return err; 146 + } 147 + 148 + static void vmci_transport_notify_pkt_socket_init(struct sock *sk) 149 + { 150 + struct vsock_sock *vsk = vsock_sk(sk); 151 + 152 + PKT_FIELD(vsk, write_notify_window) = PAGE_SIZE; 153 + PKT_FIELD(vsk, write_notify_min_window) = PAGE_SIZE; 154 + PKT_FIELD(vsk, peer_waiting_write) = false; 155 + PKT_FIELD(vsk, peer_waiting_write_detected) = false; 156 + } 157 + 158 + static void vmci_transport_notify_pkt_socket_destruct(struct vsock_sock *vsk) 159 + { 160 + PKT_FIELD(vsk, write_notify_window) = PAGE_SIZE; 161 + PKT_FIELD(vsk, write_notify_min_window) = PAGE_SIZE; 162 + PKT_FIELD(vsk, peer_waiting_write) = false; 163 + PKT_FIELD(vsk, peer_waiting_write_detected) = false; 164 + } 165 + 166 + static int 167 + vmci_transport_notify_pkt_poll_in(struct sock *sk, 168 + size_t target, bool *data_ready_now) 169 + { 170 + struct vsock_sock *vsk = vsock_sk(sk); 171 + 172 + if (vsock_stream_has_data(vsk)) { 173 + *data_ready_now = true; 174 + } else { 175 + /* We can't read right now because there is nothing in the 176 + * queue. Ask for notifications when there is something to 177 + * read. 
178 + */ 179 + if (sk->sk_state == SS_CONNECTED) 180 + vsock_block_update_write_window(sk); 181 + *data_ready_now = false; 182 + } 183 + 184 + return 0; 185 + } 186 + 187 + static int 188 + vmci_transport_notify_pkt_poll_out(struct sock *sk, 189 + size_t target, bool *space_avail_now) 190 + { 191 + s64 produce_q_free_space; 192 + struct vsock_sock *vsk = vsock_sk(sk); 193 + 194 + produce_q_free_space = vsock_stream_has_space(vsk); 195 + if (produce_q_free_space > 0) { 196 + *space_avail_now = true; 197 + return 0; 198 + } else if (produce_q_free_space == 0) { 199 + /* This is a connected socket but we can't currently send data. 200 + * Nothing else to do. 201 + */ 202 + *space_avail_now = false; 203 + } 204 + 205 + return 0; 206 + } 207 + 208 + static int 209 + vmci_transport_notify_pkt_recv_init( 210 + struct sock *sk, 211 + size_t target, 212 + struct vmci_transport_recv_notify_data *data) 213 + { 214 + struct vsock_sock *vsk = vsock_sk(sk); 215 + 216 + data->consume_head = 0; 217 + data->produce_tail = 0; 218 + data->notify_on_block = false; 219 + 220 + if (PKT_FIELD(vsk, write_notify_min_window) < target + 1) { 221 + PKT_FIELD(vsk, write_notify_min_window) = target + 1; 222 + if (PKT_FIELD(vsk, write_notify_window) < 223 + PKT_FIELD(vsk, write_notify_min_window)) { 224 + /* If the current window is smaller than the new 225 + * minimal window size, we need to reevaluate whether 226 + * we need to notify the sender. If the number of ready 227 + * bytes are smaller than the new window, we need to 228 + * send a notification to the sender before we block. 
229 + */ 230 + 231 + PKT_FIELD(vsk, write_notify_window) = 232 + PKT_FIELD(vsk, write_notify_min_window); 233 + data->notify_on_block = true; 234 + } 235 + } 236 + 237 + return 0; 238 + } 239 + 240 + static int 241 + vmci_transport_notify_pkt_recv_pre_block( 242 + struct sock *sk, 243 + size_t target, 244 + struct vmci_transport_recv_notify_data *data) 245 + { 246 + int err = 0; 247 + 248 + vsock_block_update_write_window(sk); 249 + 250 + if (data->notify_on_block) { 251 + err = vmci_transport_send_read_notification(sk); 252 + if (err < 0) 253 + return err; 254 + data->notify_on_block = false; 255 + } 256 + 257 + return err; 258 + } 259 + 260 + static int 261 + vmci_transport_notify_pkt_recv_post_dequeue( 262 + struct sock *sk, 263 + size_t target, 264 + ssize_t copied, 265 + bool data_read, 266 + struct vmci_transport_recv_notify_data *data) 267 + { 268 + struct vsock_sock *vsk; 269 + int err; 270 + bool was_full = false; 271 + u64 free_space; 272 + 273 + vsk = vsock_sk(sk); 274 + err = 0; 275 + 276 + if (data_read) { 277 + smp_mb(); 278 + 279 + free_space = 280 + vmci_qpair_consume_free_space(vmci_trans(vsk)->qpair); 281 + was_full = free_space == copied; 282 + 283 + if (was_full) 284 + PKT_FIELD(vsk, peer_waiting_write) = true; 285 + 286 + err = vmci_transport_send_read_notification(sk); 287 + if (err < 0) 288 + return err; 289 + 290 + /* See the comment in 291 + * vmci_transport_notify_pkt_send_post_enqueue(). 
292 + */ 293 + sk->sk_data_ready(sk, 0); 294 + } 295 + 296 + return err; 297 + } 298 + 299 + static int 300 + vmci_transport_notify_pkt_send_init( 301 + struct sock *sk, 302 + struct vmci_transport_send_notify_data *data) 303 + { 304 + data->consume_head = 0; 305 + data->produce_tail = 0; 306 + 307 + return 0; 308 + } 309 + 310 + static int 311 + vmci_transport_notify_pkt_send_post_enqueue( 312 + struct sock *sk, 313 + ssize_t written, 314 + struct vmci_transport_send_notify_data *data) 315 + { 316 + int err = 0; 317 + struct vsock_sock *vsk; 318 + bool sent_wrote = false; 319 + bool was_empty; 320 + int retries = 0; 321 + 322 + vsk = vsock_sk(sk); 323 + 324 + smp_mb(); 325 + 326 + was_empty = 327 + vmci_qpair_produce_buf_ready(vmci_trans(vsk)->qpair) == written; 328 + if (was_empty) { 329 + while (!(vsk->peer_shutdown & RCV_SHUTDOWN) && 330 + !sent_wrote && 331 + retries < VMCI_TRANSPORT_MAX_DGRAM_RESENDS) { 332 + err = vmci_transport_send_wrote(sk); 333 + if (err >= 0) 334 + sent_wrote = true; 335 + 336 + retries++; 337 + } 338 + } 339 + 340 + if (retries >= VMCI_TRANSPORT_MAX_DGRAM_RESENDS && !sent_wrote) { 341 + pr_err("%p unable to send wrote notification to peer\n", 342 + sk); 343 + return err; 344 + } 345 + 346 + return err; 347 + } 348 + 349 + static void 350 + vmci_transport_notify_pkt_handle_pkt( 351 + struct sock *sk, 352 + struct vmci_transport_packet *pkt, 353 + bool bottom_half, 354 + struct sockaddr_vm *dst, 355 + struct sockaddr_vm *src, bool *pkt_processed) 356 + { 357 + bool processed = false; 358 + 359 + switch (pkt->type) { 360 + case VMCI_TRANSPORT_PACKET_TYPE_WROTE: 361 + vmci_transport_handle_wrote(sk, pkt, bottom_half, dst, src); 362 + processed = true; 363 + break; 364 + case VMCI_TRANSPORT_PACKET_TYPE_READ: 365 + vmci_transport_handle_read(sk, pkt, bottom_half, dst, src); 366 + processed = true; 367 + break; 368 + } 369 + 370 + if (pkt_processed) 371 + *pkt_processed = processed; 372 + } 373 + 374 + static void 
vmci_transport_notify_pkt_process_request(struct sock *sk) 375 + { 376 + struct vsock_sock *vsk = vsock_sk(sk); 377 + 378 + PKT_FIELD(vsk, write_notify_window) = vmci_trans(vsk)->consume_size; 379 + if (vmci_trans(vsk)->consume_size < 380 + PKT_FIELD(vsk, write_notify_min_window)) 381 + PKT_FIELD(vsk, write_notify_min_window) = 382 + vmci_trans(vsk)->consume_size; 383 + } 384 + 385 + static void vmci_transport_notify_pkt_process_negotiate(struct sock *sk) 386 + { 387 + struct vsock_sock *vsk = vsock_sk(sk); 388 + 389 + PKT_FIELD(vsk, write_notify_window) = vmci_trans(vsk)->consume_size; 390 + if (vmci_trans(vsk)->consume_size < 391 + PKT_FIELD(vsk, write_notify_min_window)) 392 + PKT_FIELD(vsk, write_notify_min_window) = 393 + vmci_trans(vsk)->consume_size; 394 + } 395 + 396 + static int 397 + vmci_transport_notify_pkt_recv_pre_dequeue( 398 + struct sock *sk, 399 + size_t target, 400 + struct vmci_transport_recv_notify_data *data) 401 + { 402 + return 0; /* NOP for QState. */ 403 + } 404 + 405 + static int 406 + vmci_transport_notify_pkt_send_pre_block( 407 + struct sock *sk, 408 + struct vmci_transport_send_notify_data *data) 409 + { 410 + return 0; /* NOP for QState. */ 411 + } 412 + 413 + static int 414 + vmci_transport_notify_pkt_send_pre_enqueue( 415 + struct sock *sk, 416 + struct vmci_transport_send_notify_data *data) 417 + { 418 + return 0; /* NOP for QState. */ 419 + } 420 + 421 + /* Socket always on control packet based operations. 
*/ 422 + struct vmci_transport_notify_ops vmci_transport_notify_pkt_q_state_ops = { 423 + vmci_transport_notify_pkt_socket_init, 424 + vmci_transport_notify_pkt_socket_destruct, 425 + vmci_transport_notify_pkt_poll_in, 426 + vmci_transport_notify_pkt_poll_out, 427 + vmci_transport_notify_pkt_handle_pkt, 428 + vmci_transport_notify_pkt_recv_init, 429 + vmci_transport_notify_pkt_recv_pre_block, 430 + vmci_transport_notify_pkt_recv_pre_dequeue, 431 + vmci_transport_notify_pkt_recv_post_dequeue, 432 + vmci_transport_notify_pkt_send_init, 433 + vmci_transport_notify_pkt_send_pre_block, 434 + vmci_transport_notify_pkt_send_pre_enqueue, 435 + vmci_transport_notify_pkt_send_post_enqueue, 436 + vmci_transport_notify_pkt_process_request, 437 + vmci_transport_notify_pkt_process_negotiate, 438 + };

+86

net/vmw_vsock/vsock_addr.c

··· 1 + /* 2 + * VMware vSockets Driver 3 + * 4 + * Copyright (C) 2007-2012 VMware, Inc. All rights reserved. 5 + * 6 + * This program is free software; you can redistribute it and/or modify it 7 + * under the terms of the GNU General Public License as published by the Free 8 + * Software Foundation version 2 and no later version. 9 + * 10 + * This program is distributed in the hope that it will be useful, but WITHOUT 11 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 13 + * more details. 14 + */ 15 + 16 + #include <linux/types.h> 17 + #include <linux/socket.h> 18 + #include <linux/stddef.h> 19 + #include <net/sock.h> 20 + 21 + #include "vsock_addr.h" 22 + 23 + void vsock_addr_init(struct sockaddr_vm *addr, u32 cid, u32 port) 24 + { 25 + memset(addr, 0, sizeof(*addr)); 26 + addr->svm_family = AF_VSOCK; 27 + addr->svm_cid = cid; 28 + addr->svm_port = port; 29 + } 30 + EXPORT_SYMBOL_GPL(vsock_addr_init); 31 + 32 + int vsock_addr_validate(const struct sockaddr_vm *addr) 33 + { 34 + if (!addr) 35 + return -EFAULT; 36 + 37 + if (addr->svm_family != AF_VSOCK) 38 + return -EAFNOSUPPORT; 39 + 40 + if (addr->svm_zero[0] != 0) 41 + return -EINVAL; 42 + 43 + return 0; 44 + } 45 + EXPORT_SYMBOL_GPL(vsock_addr_validate); 46 + 47 + bool vsock_addr_bound(const struct sockaddr_vm *addr) 48 + { 49 + return addr->svm_port != VMADDR_PORT_ANY; 50 + } 51 + EXPORT_SYMBOL_GPL(vsock_addr_bound); 52 + 53 + void vsock_addr_unbind(struct sockaddr_vm *addr) 54 + { 55 + vsock_addr_init(addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); 56 + } 57 + EXPORT_SYMBOL_GPL(vsock_addr_unbind); 58 + 59 + bool vsock_addr_equals_addr(const struct sockaddr_vm *addr, 60 + const struct sockaddr_vm *other) 61 + { 62 + return addr->svm_cid == other->svm_cid && 63 + addr->svm_port == other->svm_port; 64 + } 65 + EXPORT_SYMBOL_GPL(vsock_addr_equals_addr); 66 + 67 + bool vsock_addr_equals_addr_any(const struct 
sockaddr_vm *addr, 68 + const struct sockaddr_vm *other) 69 + { 70 + return (addr->svm_cid == VMADDR_CID_ANY || 71 + other->svm_cid == VMADDR_CID_ANY || 72 + addr->svm_cid == other->svm_cid) && 73 + addr->svm_port == other->svm_port; 74 + } 75 + EXPORT_SYMBOL_GPL(vsock_addr_equals_addr_any); 76 + 77 + int vsock_addr_cast(const struct sockaddr *addr, 78 + size_t len, struct sockaddr_vm **out_addr) 79 + { 80 + if (len < sizeof(**out_addr)) 81 + return -EFAULT; 82 + 83 + *out_addr = (struct sockaddr_vm *)addr; 84 + return vsock_addr_validate(*out_addr); 85 + } 86 + EXPORT_SYMBOL_GPL(vsock_addr_cast);

+32

net/vmw_vsock/vsock_addr.h

··· 1 + /* 2 + * VMware vSockets Driver 3 + * 4 + * Copyright (C) 2007-2013 VMware, Inc. All rights reserved. 5 + * 6 + * This program is free software; you can redistribute it and/or modify it 7 + * under the terms of the GNU General Public License as published by the Free 8 + * Software Foundation version 2 and no later version. 9 + * 10 + * This program is distributed in the hope that it will be useful, but WITHOUT 11 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 13 + * more details. 14 + */ 15 + 16 + #ifndef _VSOCK_ADDR_H_ 17 + #define _VSOCK_ADDR_H_ 18 + 19 + #include <linux/vm_sockets.h> 20 + 21 + void vsock_addr_init(struct sockaddr_vm *addr, u32 cid, u32 port); 22 + int vsock_addr_validate(const struct sockaddr_vm *addr); 23 + bool vsock_addr_bound(const struct sockaddr_vm *addr); 24 + void vsock_addr_unbind(struct sockaddr_vm *addr); 25 + bool vsock_addr_equals_addr(const struct sockaddr_vm *addr, 26 + const struct sockaddr_vm *other); 27 + bool vsock_addr_equals_addr_any(const struct sockaddr_vm *addr, 28 + const struct sockaddr_vm *other); 29 + int vsock_addr_cast(const struct sockaddr *addr, size_t len, 30 + struct sockaddr_vm **out_addr); 31 + 32 + #endif

+22

net/vmw_vsock/vsock_version.h

/*
 * VMware vSockets Driver
 *
 * Copyright (C) 2011-2012 VMware, Inc. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation version 2 and no later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#ifndef _VSOCK_VERSION_H_
#define _VSOCK_VERSION_H_

/* Driver version as a brace-enclosed list of its four numeric components. */
#define VSOCK_DRIVER_VERSION_PARTS	{ 1, 0, 0, 0 }

/* The same version in dotted string form, carrying a "-k" suffix. */
#define VSOCK_DRIVER_VERSION_STRING	"1.0.0.0-k"

#endif /* _VSOCK_VERSION_H_ */