Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'vsock-virtio-fix-tx-credit-handling'

Stefano Garzarella says:

====================
vsock/virtio: fix TX credit handling

The original series was posted by Melbin K Mathew <mlbnkm1@gmail.com> up to v4.
Since it's a real issue and the original author seems busy, I'm sending
a new version that addresses my review comments while keeping the original
authorship (and restoring mine on patch 2, as reported on v4).

v5: https://lore.kernel.org/netdev/20260116201517.273302-1-sgarzare@redhat.com/
v4: https://lore.kernel.org/netdev/20251217181206.3681159-1-mlbnkm1@gmail.com/

From Melbin K Mathew <mlbnkm1@gmail.com>:

This series fixes TX credit handling in virtio-vsock:

Patch 1: Fix potential underflow in get_credit() using s64 arithmetic
Patch 2: Fix vsock_test seqpacket bounds test
Patch 3: Cap TX credit to local buffer size (security hardening)
Patch 4: Add stream TX credit bounds regression test

The core issue is that a malicious guest can advertise a huge buffer
size via SO_VM_SOCKETS_BUFFER_SIZE, causing the host to allocate
excessive sk_buff memory when sending data to that guest.

On an unpatched Ubuntu 22.04 host (~64 GiB RAM), running a PoC with
32 guest vsock connections advertising 2 GiB each and reading slowly
drove Slab/SUnreclaim from ~0.5 GiB to ~57 GiB; the system only
recovered after killing the QEMU process.

With this series applied, the same PoC shows only ~35 MiB increase in
Slab/SUnreclaim, no host OOM, and the guest remains responsive.
====================

Link: https://patch.msgid.link/20260121093628.9941-1-sgarzare@redhat.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>

+133 -9
+21 -9
net/vmw_vsock/virtio_transport_common.c
··· 28 28 29 29 static void virtio_transport_cancel_close_work(struct vsock_sock *vsk, 30 30 bool cancel_timeout); 31 + static s64 virtio_transport_has_space(struct virtio_vsock_sock *vvs); 31 32 32 33 static const struct virtio_transport * 33 34 virtio_transport_get_ops(struct vsock_sock *vsk) ··· 500 499 return 0; 501 500 502 501 spin_lock_bh(&vvs->tx_lock); 503 - ret = vvs->peer_buf_alloc - (vvs->tx_cnt - vvs->peer_fwd_cnt); 504 - if (ret > credit) 505 - ret = credit; 502 + ret = min_t(u32, credit, virtio_transport_has_space(vvs)); 506 503 vvs->tx_cnt += ret; 507 504 vvs->bytes_unsent += ret; 508 505 spin_unlock_bh(&vvs->tx_lock); ··· 821 822 } 822 823 EXPORT_SYMBOL_GPL(virtio_transport_seqpacket_dequeue); 823 824 825 + static u32 virtio_transport_tx_buf_size(struct virtio_vsock_sock *vvs) 826 + { 827 + /* The peer advertises its receive buffer via peer_buf_alloc, but we 828 + * cap it to our local buf_alloc so a remote peer cannot force us to 829 + * queue more data than our own buffer configuration allows. 
830 + */ 831 + return min(vvs->peer_buf_alloc, vvs->buf_alloc); 832 + } 833 + 824 834 int 825 835 virtio_transport_seqpacket_enqueue(struct vsock_sock *vsk, 826 836 struct msghdr *msg, ··· 839 831 840 832 spin_lock_bh(&vvs->tx_lock); 841 833 842 - if (len > vvs->peer_buf_alloc) { 834 + if (len > virtio_transport_tx_buf_size(vvs)) { 843 835 spin_unlock_bh(&vvs->tx_lock); 844 836 return -EMSGSIZE; 845 837 } ··· 885 877 } 886 878 EXPORT_SYMBOL_GPL(virtio_transport_seqpacket_has_data); 887 879 888 - static s64 virtio_transport_has_space(struct vsock_sock *vsk) 880 + static s64 virtio_transport_has_space(struct virtio_vsock_sock *vvs) 889 881 { 890 - struct virtio_vsock_sock *vvs = vsk->trans; 891 882 s64 bytes; 892 883 893 - bytes = (s64)vvs->peer_buf_alloc - (vvs->tx_cnt - vvs->peer_fwd_cnt); 884 + /* Use s64 arithmetic so if the peer shrinks peer_buf_alloc while 885 + * we have bytes in flight (tx_cnt - peer_fwd_cnt), the subtraction 886 + * does not underflow. 887 + */ 888 + bytes = (s64)virtio_transport_tx_buf_size(vvs) - 889 + (vvs->tx_cnt - vvs->peer_fwd_cnt); 894 890 if (bytes < 0) 895 891 bytes = 0; 896 892 ··· 907 895 s64 bytes; 908 896 909 897 spin_lock_bh(&vvs->tx_lock); 910 - bytes = virtio_transport_has_space(vsk); 898 + bytes = virtio_transport_has_space(vvs); 911 899 spin_unlock_bh(&vvs->tx_lock); 912 900 913 901 return bytes; ··· 1504 1492 spin_lock_bh(&vvs->tx_lock); 1505 1493 vvs->peer_buf_alloc = le32_to_cpu(hdr->buf_alloc); 1506 1494 vvs->peer_fwd_cnt = le32_to_cpu(hdr->fwd_cnt); 1507 - space_available = virtio_transport_has_space(vsk); 1495 + space_available = virtio_transport_has_space(vvs); 1508 1496 spin_unlock_bh(&vvs->tx_lock); 1509 1497 return space_available; 1510 1498 }
+112
tools/testing/vsock/vsock_test.c
··· 347 347 } 348 348 349 349 #define SOCK_BUF_SIZE (2 * 1024 * 1024) 350 + #define SOCK_BUF_SIZE_SMALL (64 * 1024) 350 351 #define MAX_MSG_PAGES 4 351 352 352 353 static void test_seqpacket_msg_bounds_client(const struct test_opts *opts) 353 354 { 355 + unsigned long long sock_buf_size; 354 356 unsigned long curr_hash; 355 357 size_t max_msg_size; 356 358 int page_size; ··· 364 362 perror("connect"); 365 363 exit(EXIT_FAILURE); 366 364 } 365 + 366 + sock_buf_size = SOCK_BUF_SIZE; 367 + 368 + setsockopt_ull_check(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_MAX_SIZE, 369 + sock_buf_size, 370 + "setsockopt(SO_VM_SOCKETS_BUFFER_MAX_SIZE)"); 371 + 372 + setsockopt_ull_check(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE, 373 + sock_buf_size, 374 + "setsockopt(SO_VM_SOCKETS_BUFFER_SIZE)"); 367 375 368 376 /* Wait, until receiver sets buffer size. */ 369 377 control_expectln("SRVREADY"); ··· 2231 2219 close(fd); 2232 2220 } 2233 2221 2222 + static void test_stream_tx_credit_bounds_client(const struct test_opts *opts) 2223 + { 2224 + unsigned long long sock_buf_size; 2225 + size_t total = 0; 2226 + char buf[4096]; 2227 + int fd; 2228 + 2229 + memset(buf, 'A', sizeof(buf)); 2230 + 2231 + fd = vsock_stream_connect(opts->peer_cid, opts->peer_port); 2232 + if (fd < 0) { 2233 + perror("connect"); 2234 + exit(EXIT_FAILURE); 2235 + } 2236 + 2237 + sock_buf_size = SOCK_BUF_SIZE_SMALL; 2238 + 2239 + setsockopt_ull_check(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_MAX_SIZE, 2240 + sock_buf_size, 2241 + "setsockopt(SO_VM_SOCKETS_BUFFER_MAX_SIZE)"); 2242 + 2243 + setsockopt_ull_check(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE, 2244 + sock_buf_size, 2245 + "setsockopt(SO_VM_SOCKETS_BUFFER_SIZE)"); 2246 + 2247 + if (fcntl(fd, F_SETFL, fcntl(fd, F_GETFL, 0) | O_NONBLOCK) < 0) { 2248 + perror("fcntl(F_SETFL)"); 2249 + exit(EXIT_FAILURE); 2250 + } 2251 + 2252 + control_expectln("SRVREADY"); 2253 + 2254 + for (;;) { 2255 + ssize_t sent = send(fd, buf, sizeof(buf), 0); 2256 + 2257 + if (sent == 0) { 2258 + 
fprintf(stderr, "unexpected EOF while sending bytes\n"); 2259 + exit(EXIT_FAILURE); 2260 + } 2261 + 2262 + if (sent < 0) { 2263 + if (errno == EINTR) 2264 + continue; 2265 + 2266 + if (errno == EAGAIN || errno == EWOULDBLOCK) 2267 + break; 2268 + 2269 + perror("send"); 2270 + exit(EXIT_FAILURE); 2271 + } 2272 + 2273 + total += sent; 2274 + } 2275 + 2276 + control_writeln("CLIDONE"); 2277 + close(fd); 2278 + 2279 + /* We should not be able to send more bytes than the value set as 2280 + * local buffer size. 2281 + */ 2282 + if (total > sock_buf_size) { 2283 + fprintf(stderr, 2284 + "TX credit too large: queued %zu bytes (expected <= %llu)\n", 2285 + total, sock_buf_size); 2286 + exit(EXIT_FAILURE); 2287 + } 2288 + } 2289 + 2290 + static void test_stream_tx_credit_bounds_server(const struct test_opts *opts) 2291 + { 2292 + unsigned long long sock_buf_size; 2293 + int fd; 2294 + 2295 + fd = vsock_stream_accept(VMADDR_CID_ANY, opts->peer_port, NULL); 2296 + if (fd < 0) { 2297 + perror("accept"); 2298 + exit(EXIT_FAILURE); 2299 + } 2300 + 2301 + sock_buf_size = SOCK_BUF_SIZE; 2302 + 2303 + setsockopt_ull_check(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_MAX_SIZE, 2304 + sock_buf_size, 2305 + "setsockopt(SO_VM_SOCKETS_BUFFER_MAX_SIZE)"); 2306 + 2307 + setsockopt_ull_check(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE, 2308 + sock_buf_size, 2309 + "setsockopt(SO_VM_SOCKETS_BUFFER_SIZE)"); 2310 + 2311 + control_writeln("SRVREADY"); 2312 + control_expectln("CLIDONE"); 2313 + 2314 + close(fd); 2315 + } 2316 + 2234 2317 static struct test_case test_cases[] = { 2235 2318 { 2236 2319 .name = "SOCK_STREAM connection reset", ··· 2514 2407 .name = "SOCK_STREAM virtio MSG_ZEROCOPY coalescence corruption", 2515 2408 .run_client = test_stream_msgzcopy_mangle_client, 2516 2409 .run_server = test_stream_msgzcopy_mangle_server, 2410 + }, 2411 + { 2412 + .name = "SOCK_STREAM TX credit bounds", 2413 + .run_client = test_stream_tx_credit_bounds_client, 2414 + .run_server = 
test_stream_tx_credit_bounds_server, 2517 2415 }, 2518 2416 {}, 2519 2417 };