Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

udp: remove busylock and add per NUMA queues

busylock was protecting UDP sockets against packet floods,
but unfortunately was not protecting the host itself.

Under stress, many cpus could spin while acquiring the busylock,
and NIC had to drop packets. Or packets would be dropped
in cpu backlog if RPS/RFS were in place.

This patch replaces the busylock by intermediate
lockless queues. (One queue per NUMA node).

This means that fewer cpus have to acquire
the UDP receive queue lock.

Most of the cpus can either:
- immediately drop the packet.
- or queue it in their NUMA aware lockless queue.

Then one of the cpus is chosen to process this lockless queue
in a batch.

The batch only contains packets that were cooked on the same
NUMA node, thus with very limited latency impact.

Tested:

DDOS targeting a victim UDP socket, on a platform with 6 NUMA nodes
(Intel(R) Xeon(R) 6985P-C)

Before:

nstat -n ; sleep 1 ; nstat | grep Udp
Udp6InDatagrams 1004179 0.0
Udp6InErrors 3117 0.0
Udp6RcvbufErrors 3117 0.0

After:
nstat -n ; sleep 1 ; nstat | grep Udp
Udp6InDatagrams 1116633 0.0
Udp6InErrors 14197275 0.0
Udp6RcvbufErrors 14197275 0.0

We can see this host can now process 14.2 M more packets per second
while under attack, and the victim socket can receive 11 % more
packets.

I used a small bpftrace program measuring time (in us) spent in
__udp_enqueue_schedule_skb().

Before:

@udp_enqueue_us[398]:
[0] 24901 |@@@ |
[1] 63512 |@@@@@@@@@ |
[2, 4) 344827 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[4, 8) 244673 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ |
[8, 16) 54022 |@@@@@@@@ |
[16, 32) 222134 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ |
[32, 64) 232042 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ |
[64, 128) 4219 | |
[128, 256) 188 | |

After:

@udp_enqueue_us[398]:
[0] 5608855 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[1] 1111277 |@@@@@@@@@@ |
[2, 4) 501439 |@@@@ |
[4, 8) 102921 | |
[8, 16) 29895 | |
[16, 32) 43500 | |
[32, 64) 31552 | |
[64, 128) 979 | |
[128, 256) 13 | |

Note that the remaining bottleneck for this platform is in
udp_drops_inc() because we limited struct numa_drop_counters
to only two nodes so far.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20250922104240.2182559-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

authored by

Eric Dumazet and committed by
Jakub Kicinski
b650bf09 df152675

+91 -51
+8 -1
include/linux/udp.h
··· 44 44 UDP_FLAGS_UDPLITE_RECV_CC, /* set via udplite setsockopt */ 45 45 }; 46 46 47 + /* per NUMA structure for lockless producer usage. */ 48 + struct udp_prod_queue { 49 + struct llist_head ll_root ____cacheline_aligned_in_smp; 50 + atomic_t rmem_alloc; 51 + }; 52 + 47 53 struct udp_sock { 48 54 /* inet_sock has to be the first member */ 49 55 struct inet_sock inet; ··· 96 90 struct sk_buff *skb, 97 91 int nhoff); 98 92 93 + struct udp_prod_queue *udp_prod_queue; 94 + 99 95 /* udp_recvmsg try to use this before splicing sk_receive_queue */ 100 96 struct sk_buff_head reader_queue ____cacheline_aligned_in_smp; 101 97 ··· 117 109 */ 118 110 struct hlist_node tunnel_list; 119 111 struct numa_drop_counters drop_counters; 120 - spinlock_t busylock ____cacheline_aligned_in_smp; 121 112 }; 122 113 123 114 #define udp_test_bit(nr, sk) \
+9 -2
include/net/udp.h
··· 284 284 struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, 285 285 netdev_features_t features, bool is_ipv6); 286 286 287 - static inline void udp_lib_init_sock(struct sock *sk) 287 + static inline int udp_lib_init_sock(struct sock *sk) 288 288 { 289 289 struct udp_sock *up = udp_sk(sk); 290 290 291 291 sk->sk_drop_counters = &up->drop_counters; 292 - spin_lock_init(&up->busylock); 293 292 skb_queue_head_init(&up->reader_queue); 294 293 INIT_HLIST_NODE(&up->tunnel_list); 295 294 up->forward_threshold = sk->sk_rcvbuf >> 2; 296 295 set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags); 296 + 297 + up->udp_prod_queue = kcalloc(nr_node_ids, sizeof(*up->udp_prod_queue), 298 + GFP_KERNEL); 299 + if (!up->udp_prod_queue) 300 + return -ENOMEM; 301 + for (int i = 0; i < nr_node_ids; i++) 302 + init_llist_head(&up->udp_prod_queue[i].ll_root); 303 + return 0; 297 304 } 298 305 299 306 static inline void udp_drops_inc(struct sock *sk)
+71 -46
net/ipv4/udp.c
··· 1685 1685 udp_rmem_release(sk, udp_skb_truesize(skb), 1, true); 1686 1686 } 1687 1687 1688 - /* Idea of busylocks is to let producers grab an extra spinlock 1689 - * to relieve pressure on the receive_queue spinlock shared by consumer. 1690 - * Under flood, this means that only one producer can be in line 1691 - * trying to acquire the receive_queue spinlock. 1692 - */ 1693 - static spinlock_t *busylock_acquire(struct sock *sk) 1694 - { 1695 - spinlock_t *busy = &udp_sk(sk)->busylock; 1696 - 1697 - spin_lock(busy); 1698 - return busy; 1699 - } 1700 - 1701 - static void busylock_release(spinlock_t *busy) 1702 - { 1703 - if (busy) 1704 - spin_unlock(busy); 1705 - } 1706 - 1707 1688 static int udp_rmem_schedule(struct sock *sk, int size) 1708 1689 { 1709 1690 int delta; ··· 1699 1718 int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) 1700 1719 { 1701 1720 struct sk_buff_head *list = &sk->sk_receive_queue; 1721 + struct udp_prod_queue *udp_prod_queue; 1722 + struct sk_buff *next, *to_drop = NULL; 1723 + struct llist_node *ll_list; 1702 1724 unsigned int rmem, rcvbuf; 1703 - spinlock_t *busy = NULL; 1704 1725 int size, err = -ENOMEM; 1726 + int total_size = 0; 1727 + int q_size = 0; 1728 + int dropcount; 1729 + int nb = 0; 1705 1730 1706 1731 rmem = atomic_read(&sk->sk_rmem_alloc); 1707 1732 rcvbuf = READ_ONCE(sk->sk_rcvbuf); 1708 1733 size = skb->truesize; 1734 + 1735 + udp_prod_queue = &udp_sk(sk)->udp_prod_queue[numa_node_id()]; 1736 + 1737 + rmem += atomic_read(&udp_prod_queue->rmem_alloc); 1709 1738 1710 1739 /* Immediately drop when the receive queue is full. 1711 1740 * Cast to unsigned int performs the boundary check for INT_MAX. 
··· 1738 1747 if (rmem > (rcvbuf >> 1)) { 1739 1748 skb_condense(skb); 1740 1749 size = skb->truesize; 1741 - rmem = atomic_add_return(size, &sk->sk_rmem_alloc); 1742 - if (rmem > rcvbuf) 1743 - goto uncharge_drop; 1744 - busy = busylock_acquire(sk); 1745 - } else { 1746 - atomic_add(size, &sk->sk_rmem_alloc); 1747 1750 } 1748 1751 1749 1752 udp_set_dev_scratch(skb); 1750 1753 1754 + atomic_add(size, &udp_prod_queue->rmem_alloc); 1755 + 1756 + if (!llist_add(&skb->ll_node, &udp_prod_queue->ll_root)) 1757 + return 0; 1758 + 1759 + dropcount = sock_flag(sk, SOCK_RXQ_OVFL) ? sk_drops_read(sk) : 0; 1760 + 1751 1761 spin_lock(&list->lock); 1752 - err = udp_rmem_schedule(sk, size); 1753 - if (err) { 1754 - spin_unlock(&list->lock); 1755 - goto uncharge_drop; 1762 + 1763 + ll_list = llist_del_all(&udp_prod_queue->ll_root); 1764 + 1765 + ll_list = llist_reverse_order(ll_list); 1766 + 1767 + llist_for_each_entry_safe(skb, next, ll_list, ll_node) { 1768 + size = udp_skb_truesize(skb); 1769 + total_size += size; 1770 + err = udp_rmem_schedule(sk, size); 1771 + if (unlikely(err)) { 1772 + /* Free the skbs outside of locked section. 
*/ 1773 + skb->next = to_drop; 1774 + to_drop = skb; 1775 + continue; 1776 + } 1777 + 1778 + q_size += size; 1779 + sk_forward_alloc_add(sk, -size); 1780 + 1781 + /* no need to setup a destructor, we will explicitly release the 1782 + * forward allocated memory on dequeue 1783 + */ 1784 + SOCK_SKB_CB(skb)->dropcount = dropcount; 1785 + nb++; 1786 + __skb_queue_tail(list, skb); 1756 1787 } 1757 1788 1758 - sk_forward_alloc_add(sk, -size); 1789 + atomic_add(q_size, &sk->sk_rmem_alloc); 1759 1790 1760 - /* no need to setup a destructor, we will explicitly release the 1761 - * forward allocated memory on dequeue 1762 - */ 1763 - sock_skb_set_dropcount(sk, skb); 1764 - 1765 - __skb_queue_tail(list, skb); 1766 1791 spin_unlock(&list->lock); 1767 1792 1768 - if (!sock_flag(sk, SOCK_DEAD)) 1769 - INDIRECT_CALL_1(sk->sk_data_ready, sock_def_readable, sk); 1793 + if (!sock_flag(sk, SOCK_DEAD)) { 1794 + /* Multiple threads might be blocked in recvmsg(), 1795 + * using prepare_to_wait_exclusive(). 1796 + */ 1797 + while (nb) { 1798 + INDIRECT_CALL_1(sk->sk_data_ready, 1799 + sock_def_readable, sk); 1800 + nb--; 1801 + } 1802 + } 1770 1803 1771 - busylock_release(busy); 1804 + if (unlikely(to_drop)) { 1805 + for (nb = 0; to_drop != NULL; nb++) { 1806 + skb = to_drop; 1807 + to_drop = skb->next; 1808 + skb_mark_not_on_list(skb); 1809 + /* TODO: update SNMP values. 
*/ 1810 + sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_PROTO_MEM); 1811 + } 1812 + numa_drop_add(&udp_sk(sk)->drop_counters, nb); 1813 + } 1814 + 1815 + atomic_sub(total_size, &udp_prod_queue->rmem_alloc); 1816 + 1772 1817 return 0; 1773 - 1774 - uncharge_drop: 1775 - atomic_sub(skb->truesize, &sk->sk_rmem_alloc); 1776 1818 1777 1819 drop: 1778 1820 udp_drops_inc(sk); 1779 - busylock_release(busy); 1780 1821 return err; 1781 1822 } 1782 1823 EXPORT_IPV6_MOD_GPL(__udp_enqueue_schedule_skb); ··· 1826 1803 kfree_skb(skb); 1827 1804 } 1828 1805 udp_rmem_release(sk, total, 0, true); 1806 + kfree(up->udp_prod_queue); 1829 1807 } 1830 1808 EXPORT_IPV6_MOD_GPL(udp_destruct_common); 1831 1809 ··· 1838 1814 1839 1815 int udp_init_sock(struct sock *sk) 1840 1816 { 1841 - udp_lib_init_sock(sk); 1817 + int res = udp_lib_init_sock(sk); 1818 + 1842 1819 sk->sk_destruct = udp_destruct_sock; 1843 1820 set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); 1844 - return 0; 1821 + return res; 1845 1822 } 1846 1823 1847 1824 void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len)
+3 -2
net/ipv6/udp.c
··· 67 67 68 68 int udpv6_init_sock(struct sock *sk) 69 69 { 70 - udp_lib_init_sock(sk); 70 + int res = udp_lib_init_sock(sk); 71 + 71 72 sk->sk_destruct = udpv6_destruct_sock; 72 73 set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); 73 - return 0; 74 + return res; 74 75 } 75 76 76 77 INDIRECT_CALLABLE_SCOPE