wireguard: queueing: get rid of per-peer ring buffers

Having two ring buffers per-peer means that every peer results in two
massive ring allocations. On an 8-core x86_64 machine, this commit
reduces the per-peer allocation from 18,688 bytes to 1,856 bytes, which
is a 90% reduction. Ninety percent! With some single-machine
deployments approaching 500,000 peers, we're talking about a reduction
from 7 gigs of memory down to 700 megs of memory.

In order to get rid of these per-peer allocations, this commit switches
to using a list-based queueing approach. Currently GSO fragments are
chained together using the skb->next pointer (the skb_list_* singly
linked list approach), so we form the per-peer queue around the unused
skb->prev pointer (which sort of makes sense because the links are
pointing backwards). Use of skb_queue_* is not possible here, because
that is based on doubly linked lists and spinlocks. Multiple cores can
write into the queue at any given time, because its writes occur in the
start_xmit path or in the udp_recv path. But reads happen in a single
workqueue item per-peer, amounting to a multi-producer, single-consumer
paradigm.

The MPSC queue is implemented locklessly and never blocks. However, it
is not linearizable (though it is serializable), with a very tight and
unlikely race on writes, which, when hit (some tiny fraction of the
0.15% of partial adds on a fully loaded 16-core x86_64 system), causes
the queue reader to terminate early. However, because every packet sent
queues up the same workqueue item after it is fully added, the worker
resumes again, and stopping early isn't actually a problem, since at
that point the packet wouldn't have yet been added to the encryption
queue. These properties allow us to avoid disabling interrupts or
spinning. The design is based on Dmitry Vyukov's algorithm [1].
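
For readers who don't want to chase the reference, here is a minimal
userspace sketch of that intrusive MPSC queue, using C11 atomics in place
of the kernel's xchg_release()/smp_load_acquire(). The names here (struct
node, queue_push, queue_pop) are illustrative rather than taken from this
patch:

#include <stdatomic.h>
#include <stddef.h>

struct node {
	_Atomic(struct node *) next;
};

struct queue {
	_Atomic(struct node *) head; /* producers swing this with an exchange */
	struct node *tail;           /* owned by the single consumer */
	struct node stub;            /* same role as prev_queue's empty stub */
};

static void queue_init(struct queue *q)
{
	atomic_store(&q->stub.next, NULL);
	atomic_store(&q->head, &q->stub);
	q->tail = &q->stub;
}

/* Multi-producer enqueue: one exchange plus one release store; never blocks. */
static void queue_push(struct queue *q, struct node *n)
{
	struct node *prev;

	atomic_store_explicit(&n->next, NULL, memory_order_relaxed);
	prev = atomic_exchange_explicit(&q->head, n, memory_order_acq_rel);
	/* A consumer running between the exchange above and the store below
	 * sees a half-linked list and bails out early. */
	atomic_store_explicit(&prev->next, n, memory_order_release);
}

/* Single-consumer dequeue: returns NULL when empty or on the race above. */
static struct node *queue_pop(struct queue *q)
{
	struct node *tail = q->tail;
	struct node *next = atomic_load_explicit(&tail->next, memory_order_acquire);

	if (tail == &q->stub) {		/* skip over the stub node */
		if (!next)
			return NULL;	/* queue is empty */
		q->tail = next;
		tail = next;
		next = atomic_load_explicit(&tail->next, memory_order_acquire);
	}
	if (next) {
		q->tail = next;
		return tail;
	}
	if (tail != atomic_load_explicit(&q->head, memory_order_acquire))
		return NULL;		/* producer mid-insert: stop early */
	queue_push(q, &q->stub);	/* re-insert stub so tail is never last */
	next = atomic_load_explicit(&tail->next, memory_order_acquire);
	if (!next)
		return NULL;
	q->tail = next;
	return tail;
}

The early NULL return when tail != head is exactly the "terminate early"
case described above; the patch tolerates it because the fully added
packet re-queues the same work item, which calls the consumer again.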

Performance-wise, ordinarily list-based queues aren't preferable to
ringbuffers, because of cache misses when following pointers around.
However, we *already* have to follow the adjacent pointers when working
through fragments, so there shouldn't actually be any change there. A
potential downside is that dequeueing is a bit more complicated, but the
ptr_ring structure used prior had a spinlock when dequeueing, so all in
all the difference appears to be a wash.

Actually, from profiling, the biggest performance hit, by far, of this
commit winds up being atomic_add_unless(count, 1, max) and
atomic_dec(count), which account for the majority of CPU time, according to
perf. In that sense, the previous ring buffer was superior in that it
could check if it was full by head==tail, which the list-based approach
cannot do.
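
To make the contrast concrete, here is a hypothetical illustration (none
of these helpers are code from the patch): a ring derives fullness from
free-running indices its producer and consumer already maintain, while
the intrusive list has no indices and must pay for a shared counter on
every enqueue and dequeue:

/* Ring: with a power-of-two size, occupancy falls out of the indices. */
static inline bool ring_is_full(unsigned int head, unsigned int tail,
				unsigned int size)
{
	return head - tail >= size; /* no extra shared counter required */
}

/* List: bounding requires the contended pair that perf blames. */
static inline bool list_enqueue_charge(atomic_t *count)
{
	return atomic_add_unless(count, 1, MAX_QUEUED_PACKETS);
}

static inline void list_dequeue_uncharge(atomic_t *count)
{
	atomic_dec(count);
}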

But all in all, this enables us to get massive memory savings, allowing
WireGuard to scale for real world deployments, without taking much of a
performance hit.

[1] http://www.1024cores.net/home/lock-free-algorithms/queues/intrusive-mpsc-node-based-queue

Reviewed-by: Dmitry Vyukov <dvyukov@google.com>
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Fixes: e7096c131e51 ("net: WireGuard secure network tunnel")
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

Authored by Jason A. Donenfeld, committed by Jakub Kicinski
8b5553ac 99fff526

+144 -93 (8 files)

drivers/net/wireguard/device.c (+6 -6)
···
	destroy_workqueue(wg->handshake_receive_wq);
	destroy_workqueue(wg->handshake_send_wq);
	destroy_workqueue(wg->packet_crypt_wq);
-	wg_packet_queue_free(&wg->decrypt_queue, true);
-	wg_packet_queue_free(&wg->encrypt_queue, true);
+	wg_packet_queue_free(&wg->decrypt_queue);
+	wg_packet_queue_free(&wg->encrypt_queue);
	rcu_barrier(); /* Wait for all the peers to be actually freed. */
	wg_ratelimiter_uninit();
	memzero_explicit(&wg->static_identity, sizeof(wg->static_identity));
···
		goto err_destroy_handshake_send;

	ret = wg_packet_queue_init(&wg->encrypt_queue, wg_packet_encrypt_worker,
-				   true, MAX_QUEUED_PACKETS);
+				   MAX_QUEUED_PACKETS);
	if (ret < 0)
		goto err_destroy_packet_crypt;

	ret = wg_packet_queue_init(&wg->decrypt_queue, wg_packet_decrypt_worker,
-				   true, MAX_QUEUED_PACKETS);
+				   MAX_QUEUED_PACKETS);
	if (ret < 0)
		goto err_free_encrypt_queue;
···
err_uninit_ratelimiter:
	wg_ratelimiter_uninit();
err_free_decrypt_queue:
-	wg_packet_queue_free(&wg->decrypt_queue, true);
+	wg_packet_queue_free(&wg->decrypt_queue);
err_free_encrypt_queue:
-	wg_packet_queue_free(&wg->encrypt_queue, true);
+	wg_packet_queue_free(&wg->encrypt_queue);
err_destroy_packet_crypt:
	destroy_workqueue(wg->packet_crypt_wq);
err_destroy_handshake_send:

drivers/net/wireguard/device.h (+8 -7)
···
struct crypt_queue {
	struct ptr_ring ring;
-	union {
-		struct {
-			struct multicore_worker __percpu *worker;
-			int last_cpu;
-		};
-		struct work_struct work;
-	};
+	struct multicore_worker __percpu *worker;
+	int last_cpu;
+};
+
+struct prev_queue {
+	struct sk_buff *head, *tail, *peeked;
+	struct { struct sk_buff *next, *prev; } empty; // Match first 2 members of struct sk_buff.
+	atomic_t count;
};

struct wg_device {

drivers/net/wireguard/peer.c (+9 -19)
···
	peer = kzalloc(sizeof(*peer), GFP_KERNEL);
	if (unlikely(!peer))
		return ERR_PTR(ret);
-	peer->device = wg;
+	if (dst_cache_init(&peer->endpoint_cache, GFP_KERNEL))
+		goto err;

+	peer->device = wg;
	wg_noise_handshake_init(&peer->handshake, &wg->static_identity,
				public_key, preshared_key, peer);
-	if (dst_cache_init(&peer->endpoint_cache, GFP_KERNEL))
-		goto err_1;
-	if (wg_packet_queue_init(&peer->tx_queue, wg_packet_tx_worker, false,
-				 MAX_QUEUED_PACKETS))
-		goto err_2;
-	if (wg_packet_queue_init(&peer->rx_queue, NULL, false,
-				 MAX_QUEUED_PACKETS))
-		goto err_3;
-
	peer->internal_id = atomic64_inc_return(&peer_counter);
	peer->serial_work_cpu = nr_cpumask_bits;
	wg_cookie_init(&peer->latest_cookie);
	wg_timers_init(peer);
	wg_cookie_checker_precompute_peer_keys(peer);
	spin_lock_init(&peer->keypairs.keypair_update_lock);
-	INIT_WORK(&peer->transmit_handshake_work,
-		  wg_packet_handshake_send_worker);
+	INIT_WORK(&peer->transmit_handshake_work, wg_packet_handshake_send_worker);
+	INIT_WORK(&peer->transmit_packet_work, wg_packet_tx_worker);
+	wg_prev_queue_init(&peer->tx_queue);
+	wg_prev_queue_init(&peer->rx_queue);
	rwlock_init(&peer->endpoint_lock);
	kref_init(&peer->refcount);
	skb_queue_head_init(&peer->staged_packet_queue);
···
	pr_debug("%s: Peer %llu created\n", wg->dev->name, peer->internal_id);
	return peer;

-err_3:
-	wg_packet_queue_free(&peer->tx_queue, false);
-err_2:
-	dst_cache_destroy(&peer->endpoint_cache);
-err_1:
+err:
	kfree(peer);
	return ERR_PTR(ret);
}
···
	struct wg_peer *peer = container_of(rcu, struct wg_peer, rcu);

	dst_cache_destroy(&peer->endpoint_cache);
-	wg_packet_queue_free(&peer->rx_queue, false);
-	wg_packet_queue_free(&peer->tx_queue, false);
+	WARN_ON(wg_prev_queue_peek(&peer->tx_queue) || wg_prev_queue_peek(&peer->rx_queue));

	/* The final zeroing takes care of clearing any remaining handshake key
	 * material and other potentially sensitive information.

drivers/net/wireguard/peer.h (+2 -2)
···
struct wg_peer {
	struct wg_device *device;
-	struct crypt_queue tx_queue, rx_queue;
+	struct prev_queue tx_queue, rx_queue;
	struct sk_buff_head staged_packet_queue;
	int serial_work_cpu;
	bool is_dead;
···
	rwlock_t endpoint_lock;
	struct noise_handshake handshake;
	atomic64_t last_sent_handshake;
-	struct work_struct transmit_handshake_work, clear_peer_work;
+	struct work_struct transmit_handshake_work, clear_peer_work, transmit_packet_work;
	struct cookie latest_cookie;
	struct hlist_node pubkey_hash;
	u64 rx_bytes, tx_bytes;

drivers/net/wireguard/queueing.c (+69 -17)
···
wg_packet_percpu_multicore_worker_alloc(work_func_t function, void *ptr)
{
	int cpu;
-	struct multicore_worker __percpu *worker =
-		alloc_percpu(struct multicore_worker);
+	struct multicore_worker __percpu *worker = alloc_percpu(struct multicore_worker);

	if (!worker)
		return NULL;
···
}

int wg_packet_queue_init(struct crypt_queue *queue, work_func_t function,
-			 bool multicore, unsigned int len)
+			 unsigned int len)
{
	int ret;
···
	ret = ptr_ring_init(&queue->ring, len, GFP_KERNEL);
	if (ret)
		return ret;
-	if (function) {
-		if (multicore) {
-			queue->worker = wg_packet_percpu_multicore_worker_alloc(
-				function, queue);
-			if (!queue->worker) {
-				ptr_ring_cleanup(&queue->ring, NULL);
-				return -ENOMEM;
-			}
-		} else {
-			INIT_WORK(&queue->work, function);
-		}
+	queue->worker = wg_packet_percpu_multicore_worker_alloc(function, queue);
+	if (!queue->worker) {
+		ptr_ring_cleanup(&queue->ring, NULL);
+		return -ENOMEM;
	}
	return 0;
}

-void wg_packet_queue_free(struct crypt_queue *queue, bool multicore)
+void wg_packet_queue_free(struct crypt_queue *queue)
{
-	if (multicore)
-		free_percpu(queue->worker);
+	free_percpu(queue->worker);
	WARN_ON(!__ptr_ring_empty(&queue->ring));
	ptr_ring_cleanup(&queue->ring, NULL);
}
+
+#define NEXT(skb) ((skb)->prev)
+#define STUB(queue) ((struct sk_buff *)&queue->empty)
+
+void wg_prev_queue_init(struct prev_queue *queue)
+{
+	NEXT(STUB(queue)) = NULL;
+	queue->head = queue->tail = STUB(queue);
+	queue->peeked = NULL;
+	atomic_set(&queue->count, 0);
+	BUILD_BUG_ON(
+		offsetof(struct sk_buff, next) != offsetof(struct prev_queue, empty.next) -
+							offsetof(struct prev_queue, empty) ||
+		offsetof(struct sk_buff, prev) != offsetof(struct prev_queue, empty.prev) -
+							offsetof(struct prev_queue, empty));
+}
+
+static void __wg_prev_queue_enqueue(struct prev_queue *queue, struct sk_buff *skb)
+{
+	WRITE_ONCE(NEXT(skb), NULL);
+	WRITE_ONCE(NEXT(xchg_release(&queue->head, skb)), skb);
+}
+
+bool wg_prev_queue_enqueue(struct prev_queue *queue, struct sk_buff *skb)
+{
+	if (!atomic_add_unless(&queue->count, 1, MAX_QUEUED_PACKETS))
+		return false;
+	__wg_prev_queue_enqueue(queue, skb);
+	return true;
+}
+
+struct sk_buff *wg_prev_queue_dequeue(struct prev_queue *queue)
+{
+	struct sk_buff *tail = queue->tail, *next = smp_load_acquire(&NEXT(tail));
+
+	if (tail == STUB(queue)) {
+		if (!next)
+			return NULL;
+		queue->tail = next;
+		tail = next;
+		next = smp_load_acquire(&NEXT(next));
+	}
+	if (next) {
+		queue->tail = next;
+		atomic_dec(&queue->count);
+		return tail;
+	}
+	if (tail != READ_ONCE(queue->head))
+		return NULL;
+	__wg_prev_queue_enqueue(queue, STUB(queue));
+	next = smp_load_acquire(&NEXT(tail));
+	if (next) {
+		queue->tail = next;
+		atomic_dec(&queue->count);
+		return tail;
+	}
+	return NULL;
+}
+
+#undef NEXT
+#undef STUB

drivers/net/wireguard/queueing.h (+33 -12)
···
struct wg_peer;
struct multicore_worker;
struct crypt_queue;
+struct prev_queue;
struct sk_buff;

/* queueing.c APIs: */
int wg_packet_queue_init(struct crypt_queue *queue, work_func_t function,
-			 bool multicore, unsigned int len);
-void wg_packet_queue_free(struct crypt_queue *queue, bool multicore);
+			 unsigned int len);
+void wg_packet_queue_free(struct crypt_queue *queue);
struct multicore_worker __percpu *
wg_packet_percpu_multicore_worker_alloc(work_func_t function, void *ptr);
···
	return cpu;
}

+void wg_prev_queue_init(struct prev_queue *queue);
+
+/* Multi producer */
+bool wg_prev_queue_enqueue(struct prev_queue *queue, struct sk_buff *skb);
+
+/* Single consumer */
+struct sk_buff *wg_prev_queue_dequeue(struct prev_queue *queue);
+
+/* Single consumer */
+static inline struct sk_buff *wg_prev_queue_peek(struct prev_queue *queue)
+{
+	if (queue->peeked)
+		return queue->peeked;
+	queue->peeked = wg_prev_queue_dequeue(queue);
+	return queue->peeked;
+}
+
+/* Single consumer */
+static inline void wg_prev_queue_drop_peeked(struct prev_queue *queue)
+{
+	queue->peeked = NULL;
+}
+
static inline int wg_queue_enqueue_per_device_and_peer(
-	struct crypt_queue *device_queue, struct crypt_queue *peer_queue,
+	struct crypt_queue *device_queue, struct prev_queue *peer_queue,
	struct sk_buff *skb, struct workqueue_struct *wq, int *next_cpu)
{
	int cpu;
···
	/* We first queue this up for the peer ingestion, but the consumer
	 * will wait for the state to change to CRYPTED or DEAD before.
	 */
-	if (unlikely(ptr_ring_produce_bh(&peer_queue->ring, skb)))
+	if (unlikely(!wg_prev_queue_enqueue(peer_queue, skb)))
		return -ENOSPC;
+
	/* Then we queue it up in the device queue, which consumes the
	 * packet as soon as it can.
	 */
···
	return 0;
}

-static inline void wg_queue_enqueue_per_peer(struct crypt_queue *queue,
-					     struct sk_buff *skb,
-					     enum packet_state state)
+static inline void wg_queue_enqueue_per_peer_tx(struct sk_buff *skb, enum packet_state state)
{
	/* We take a reference, because as soon as we call atomic_set, the
	 * peer can be freed from below us.
···
	struct wg_peer *peer = wg_peer_get(PACKET_PEER(skb));

	atomic_set_release(&PACKET_CB(skb)->state, state);
-	queue_work_on(wg_cpumask_choose_online(&peer->serial_work_cpu,
-					       peer->internal_id),
-		      peer->device->packet_crypt_wq, &queue->work);
+	queue_work_on(wg_cpumask_choose_online(&peer->serial_work_cpu, peer->internal_id),
+		      peer->device->packet_crypt_wq, &peer->transmit_packet_work);
	wg_peer_put(peer);
}

-static inline void wg_queue_enqueue_per_peer_napi(struct sk_buff *skb,
-						  enum packet_state state)
+static inline void wg_queue_enqueue_per_peer_rx(struct sk_buff *skb, enum packet_state state)
{
	/* We take a reference, because as soon as we call atomic_set, the
	 * peer can be freed from below us.

drivers/net/wireguard/receive.c (+6 -10)
···
int wg_packet_rx_poll(struct napi_struct *napi, int budget)
{
	struct wg_peer *peer = container_of(napi, struct wg_peer, napi);
-	struct crypt_queue *queue = &peer->rx_queue;
	struct noise_keypair *keypair;
	struct endpoint endpoint;
	enum packet_state state;
···
	if (unlikely(budget <= 0))
		return 0;

-	while ((skb = __ptr_ring_peek(&queue->ring)) != NULL &&
+	while ((skb = wg_prev_queue_peek(&peer->rx_queue)) != NULL &&
	       (state = atomic_read_acquire(&PACKET_CB(skb)->state)) !=
		       PACKET_STATE_UNCRYPTED) {
-		__ptr_ring_discard_one(&queue->ring);
-		peer = PACKET_PEER(skb);
+		wg_prev_queue_drop_peeked(&peer->rx_queue);
		keypair = PACKET_CB(skb)->keypair;
		free = true;
···
		enum packet_state state =
			likely(decrypt_packet(skb, PACKET_CB(skb)->keypair)) ?
				PACKET_STATE_CRYPTED : PACKET_STATE_DEAD;
-		wg_queue_enqueue_per_peer_napi(skb, state);
+		wg_queue_enqueue_per_peer_rx(skb, state);
		if (need_resched())
			cond_resched();
	}
···
	if (unlikely(READ_ONCE(peer->is_dead)))
		goto err;

-	ret = wg_queue_enqueue_per_device_and_peer(&wg->decrypt_queue,
-						   &peer->rx_queue, skb,
-						   wg->packet_crypt_wq,
-						   &wg->decrypt_queue.last_cpu);
+	ret = wg_queue_enqueue_per_device_and_peer(&wg->decrypt_queue, &peer->rx_queue, skb,
+						   wg->packet_crypt_wq, &wg->decrypt_queue.last_cpu);
	if (unlikely(ret == -EPIPE))
-		wg_queue_enqueue_per_peer_napi(skb, PACKET_STATE_DEAD);
+		wg_queue_enqueue_per_peer_rx(skb, PACKET_STATE_DEAD);
	if (likely(!ret || ret == -EPIPE)) {
		rcu_read_unlock_bh();
		return;

drivers/net/wireguard/send.c (+11 -20)
···
	wg_packet_send_staged_packets(peer);
}

-static void wg_packet_create_data_done(struct sk_buff *first,
-				       struct wg_peer *peer)
+static void wg_packet_create_data_done(struct wg_peer *peer, struct sk_buff *first)
{
	struct sk_buff *skb, *next;
	bool is_keepalive, data_sent = false;
···
void wg_packet_tx_worker(struct work_struct *work)
{
-	struct crypt_queue *queue = container_of(work, struct crypt_queue,
-						 work);
+	struct wg_peer *peer = container_of(work, struct wg_peer, transmit_packet_work);
	struct noise_keypair *keypair;
	enum packet_state state;
	struct sk_buff *first;
-	struct wg_peer *peer;

-	while ((first = __ptr_ring_peek(&queue->ring)) != NULL &&
+	while ((first = wg_prev_queue_peek(&peer->tx_queue)) != NULL &&
	       (state = atomic_read_acquire(&PACKET_CB(first)->state)) !=
		       PACKET_STATE_UNCRYPTED) {
-		__ptr_ring_discard_one(&queue->ring);
-		peer = PACKET_PEER(first);
+		wg_prev_queue_drop_peeked(&peer->tx_queue);
		keypair = PACKET_CB(first)->keypair;

		if (likely(state == PACKET_STATE_CRYPTED))
-			wg_packet_create_data_done(first, peer);
+			wg_packet_create_data_done(peer, first);
		else
			kfree_skb_list(first);
···
				break;
			}
		}
-		wg_queue_enqueue_per_peer(&PACKET_PEER(first)->tx_queue, first,
-					  state);
+		wg_queue_enqueue_per_peer_tx(first, state);
		if (need_resched())
			cond_resched();
	}
}

-static void wg_packet_create_data(struct sk_buff *first)
+static void wg_packet_create_data(struct wg_peer *peer, struct sk_buff *first)
{
-	struct wg_peer *peer = PACKET_PEER(first);
	struct wg_device *wg = peer->device;
	int ret = -EINVAL;
···
	if (unlikely(READ_ONCE(peer->is_dead)))
		goto err;

-	ret = wg_queue_enqueue_per_device_and_peer(&wg->encrypt_queue,
-						   &peer->tx_queue, first,
-						   wg->packet_crypt_wq,
-						   &wg->encrypt_queue.last_cpu);
+	ret = wg_queue_enqueue_per_device_and_peer(&wg->encrypt_queue, &peer->tx_queue, first,
+						   wg->packet_crypt_wq, &wg->encrypt_queue.last_cpu);
	if (unlikely(ret == -EPIPE))
-		wg_queue_enqueue_per_peer(&peer->tx_queue, first,
-					  PACKET_STATE_DEAD);
+		wg_queue_enqueue_per_peer_tx(first, PACKET_STATE_DEAD);
err:
	rcu_read_unlock_bh();
	if (likely(!ret || ret == -EPIPE))
···
	packets.prev->next = NULL;
	wg_peer_get(keypair->entry.peer);
	PACKET_CB(packets.next)->keypair = keypair;
-	wg_packet_create_data(packets.next);
+	wg_packet_create_data(peer, packets.next);
	return;

out_invalid: