Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

icmp: add a global rate limitation

Current ICMP rate limiting uses the inetpeer cache, which is an RBL tree
protected by a lock, meaning that hosts can be stuck hard if all CPUs
want to check ICMP limits.

When, say, a DNS or NTP server process is restarted, the inetpeer tree
grows quickly and the machine is brought to its knees.

iptables cannot help because the bottleneck occurs before ICMP
messages are even cooked and sent.

This patch adds a new global limitation, using a token bucket filter,
controlled by two new sysctls:

icmp_msgs_per_sec - INTEGER
Limit maximal number of ICMP packets sent per second from this host.
Only messages whose type matches icmp_ratemask are
controlled by this limit.
Default: 1000

icmp_msgs_burst - INTEGER
icmp_msgs_per_sec controls number of ICMP packets sent per second,
while icmp_msgs_burst controls the burst size of these packets.
Default: 50

Note that if we really want to send millions of ICMP messages per
second, we might extend the idea and infrastructure added in commit
04ca6973f7c1a ("ip: make IP identifiers less predictable"):
add a token bucket in the ip_idents hash and no longer rely on inetpeer.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Eric Dumazet and committed by
David S. Miller
4cdf507d e8b56d55

+105 -12
+13
Documentation/networking/ip-sysctl.txt
··· 769 769 icmp_ratemask (see below) to specific targets. 770 770 0 to disable any limiting, 771 771 otherwise the minimal space between responses in milliseconds. 772 + Note that another sysctl, icmp_msgs_per_sec limits the number 773 + of ICMP packets sent on all targets. 772 774 Default: 1000 775 + 776 + icmp_msgs_per_sec - INTEGER 777 + Limit maximal number of ICMP packets sent per second from this host. 778 + Only messages whose type matches icmp_ratemask (see below) are 779 + controlled by this limit. 780 + Default: 1000 781 + 782 + icmp_msgs_burst - INTEGER 783 + icmp_msgs_per_sec controls number of ICMP packets sent per second, 784 + while icmp_msgs_burst controls the burst size of these packets. 785 + Default: 50 773 786 774 787 icmp_ratemask - INTEGER 775 788 Mask made of ICMP types for which rates are being limited.
+4
include/net/ip.h
··· 548 548 void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 dport, 549 549 u32 info); 550 550 551 + bool icmp_global_allow(void); 552 + extern int sysctl_icmp_msgs_per_sec; 553 + extern int sysctl_icmp_msgs_burst; 554 + 551 555 #ifdef CONFIG_PROC_FS 552 556 int ip_misc_proc_init(void); 553 557 #endif
+60 -4
net/ipv4/icmp.c
··· 231 231 spin_unlock_bh(&sk->sk_lock.slock); 232 232 } 233 233 234 + int sysctl_icmp_msgs_per_sec __read_mostly = 1000; 235 + int sysctl_icmp_msgs_burst __read_mostly = 50; 236 + 237 + static struct { 238 + spinlock_t lock; 239 + u32 credit; 240 + u32 stamp; 241 + } icmp_global = { 242 + .lock = __SPIN_LOCK_UNLOCKED(icmp_global.lock), 243 + }; 244 + 245 + /** 246 + * icmp_global_allow - Are we allowed to send one more ICMP message ? 247 + * 248 + * Uses a token bucket to limit our ICMP messages to sysctl_icmp_msgs_per_sec. 249 + * Returns false if we reached the limit and can not send another packet. 250 + * Note: called with BH disabled 251 + */ 252 + bool icmp_global_allow(void) 253 + { 254 + u32 credit, delta, incr = 0, now = (u32)jiffies; 255 + bool rc = false; 256 + 257 + /* Check if token bucket is empty and cannot be refilled 258 + * without taking the spinlock. 259 + */ 260 + if (!icmp_global.credit) { 261 + delta = min_t(u32, now - icmp_global.stamp, HZ); 262 + if (delta < HZ / 50) 263 + return false; 264 + } 265 + 266 + spin_lock(&icmp_global.lock); 267 + delta = min_t(u32, now - icmp_global.stamp, HZ); 268 + if (delta >= HZ / 50) { 269 + incr = sysctl_icmp_msgs_per_sec * delta / HZ ; 270 + if (incr) 271 + icmp_global.stamp = now; 272 + } 273 + credit = min_t(u32, icmp_global.credit + incr, sysctl_icmp_msgs_burst); 274 + if (credit) { 275 + credit--; 276 + rc = true; 277 + } 278 + icmp_global.credit = credit; 279 + spin_unlock(&icmp_global.lock); 280 + return rc; 281 + } 282 + EXPORT_SYMBOL(icmp_global_allow); 283 + 234 284 /* 235 285 * Send an ICMP frame. 
236 286 */ 237 287 238 - static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, 239 - struct flowi4 *fl4, int type, int code) 288 + static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, 289 + struct flowi4 *fl4, int type, int code) 240 290 { 241 291 struct dst_entry *dst = &rt->dst; 242 292 bool rc = true; ··· 303 253 goto out; 304 254 305 255 /* Limit if icmp type is enabled in ratemask. */ 306 - if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) { 307 - struct inet_peer *peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1); 256 + if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask)) 257 + goto out; 258 + 259 + rc = false; 260 + if (icmp_global_allow()) { 261 + struct inet_peer *peer; 262 + 263 + peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1); 308 264 rc = inet_peer_xrlim_allow(peer, 309 265 net->ipv4.sysctl_icmp_ratelimit); 310 266 if (peer)
+16
net/ipv4/sysctl_net_ipv4.c
··· 731 731 .extra2 = &one, 732 732 }, 733 733 { 734 + .procname = "icmp_msgs_per_sec", 735 + .data = &sysctl_icmp_msgs_per_sec, 736 + .maxlen = sizeof(int), 737 + .mode = 0644, 738 + .proc_handler = proc_dointvec_minmax, 739 + .extra1 = &zero, 740 + }, 741 + { 742 + .procname = "icmp_msgs_burst", 743 + .data = &sysctl_icmp_msgs_burst, 744 + .maxlen = sizeof(int), 745 + .mode = 0644, 746 + .proc_handler = proc_dointvec_minmax, 747 + .extra1 = &zero, 748 + }, 749 + { 734 750 .procname = "udp_mem", 735 751 .data = &sysctl_udp_mem, 736 752 .maxlen = sizeof(sysctl_udp_mem),
+12 -8
net/ipv6/icmp.c
··· 170 170 /* 171 171 * Check the ICMP output rate limit 172 172 */ 173 - static inline bool icmpv6_xrlim_allow(struct sock *sk, u8 type, 174 - struct flowi6 *fl6) 173 + static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, 174 + struct flowi6 *fl6) 175 175 { 176 - struct dst_entry *dst; 177 176 struct net *net = sock_net(sk); 177 + struct dst_entry *dst; 178 178 bool res = false; 179 179 180 180 /* Informational messages are not limited. */ ··· 199 199 } else { 200 200 struct rt6_info *rt = (struct rt6_info *)dst; 201 201 int tmo = net->ipv6.sysctl.icmpv6_time; 202 - struct inet_peer *peer; 203 202 204 203 /* Give more bandwidth to wider prefixes. */ 205 204 if (rt->rt6i_dst.plen < 128) 206 205 tmo >>= ((128 - rt->rt6i_dst.plen)>>5); 207 206 208 - peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1); 209 - res = inet_peer_xrlim_allow(peer, tmo); 210 - if (peer) 211 - inet_putpeer(peer); 207 + if (icmp_global_allow()) { 208 + struct inet_peer *peer; 209 + 210 + peer = inet_getpeer_v6(net->ipv6.peers, 211 + &rt->rt6i_dst.addr, 1); 212 + res = inet_peer_xrlim_allow(peer, tmo); 213 + if (peer) 214 + inet_putpeer(peer); 215 + } 212 216 } 213 217 dst_release(dst); 214 218 return res;