Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

netfilter: snat: evict closing tcp entries on reply tuple collision

When all tried source tuples are in use, the connection request (skb)
and the new conntrack will be dropped in nf_confirm() due to the
non-recoverable clash.

Make it so that the last 32 attempts are allowed to evict a colliding
entry if this connection is already closing and the new sequence number
has advanced past the old one.

Such "all tuples taken" scenario can happen with tcp-rpc workloads where
same dst:dport gets queried repeatedly.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>

authored by

Florian Westphal and committed by
Pablo Neira Ayuso
45897255 96b2ef9b

+88 -4
+88 -4
net/netfilter/nf_nat_core.c
··· 27 27 28 28 #include "nf_internals.h" 29 29 30 + #define NF_NAT_MAX_ATTEMPTS 128 31 + #define NF_NAT_HARDER_THRESH (NF_NAT_MAX_ATTEMPTS / 4) 32 + 30 33 static spinlock_t nf_nat_locks[CONNTRACK_LOCKS]; 31 34 32 35 static DEFINE_MUTEX(nf_nat_proto_mutex); ··· 198 195 199 196 nf_ct_invert_tuple(&reply, tuple); 200 197 return nf_conntrack_tuple_taken(&reply, ignored_conntrack); 198 + } 199 + 200 + static bool nf_nat_may_kill(struct nf_conn *ct, unsigned long flags) 201 + { 202 + static const unsigned long flags_refuse = IPS_FIXED_TIMEOUT | 203 + IPS_DYING; 204 + static const unsigned long flags_needed = IPS_SRC_NAT; 205 + enum tcp_conntrack old_state; 206 + 207 + old_state = READ_ONCE(ct->proto.tcp.state); 208 + if (old_state < TCP_CONNTRACK_TIME_WAIT) 209 + return false; 210 + 211 + if (flags & flags_refuse) 212 + return false; 213 + 214 + return (flags & flags_needed) == flags_needed; 215 + } 216 + 217 + /* reverse direction will send packets to new source, so 218 + * make sure such packets are invalid. 
219 + */ 220 + static bool nf_seq_has_advanced(const struct nf_conn *old, const struct nf_conn *new) 221 + { 222 + return (__s32)(new->proto.tcp.seen[0].td_end - 223 + old->proto.tcp.seen[0].td_end) > 0; 224 + } 225 + 226 + static int 227 + nf_nat_used_tuple_harder(const struct nf_conntrack_tuple *tuple, 228 + const struct nf_conn *ignored_conntrack, 229 + unsigned int attempts_left) 230 + { 231 + static const unsigned long flags_offload = IPS_OFFLOAD | IPS_HW_OFFLOAD; 232 + struct nf_conntrack_tuple_hash *thash; 233 + const struct nf_conntrack_zone *zone; 234 + struct nf_conntrack_tuple reply; 235 + unsigned long flags; 236 + struct nf_conn *ct; 237 + bool taken = true; 238 + struct net *net; 239 + 240 + nf_ct_invert_tuple(&reply, tuple); 241 + 242 + if (attempts_left > NF_NAT_HARDER_THRESH || 243 + tuple->dst.protonum != IPPROTO_TCP || 244 + ignored_conntrack->proto.tcp.state != TCP_CONNTRACK_SYN_SENT) 245 + return nf_conntrack_tuple_taken(&reply, ignored_conntrack); 246 + 247 + /* Last few attempts to find a free tcp port. Destructive 248 + * action: evict colliding if its in timewait state and the 249 + * tcp sequence number has advanced past the one used by the 250 + * old entry. 251 + */ 252 + net = nf_ct_net(ignored_conntrack); 253 + zone = nf_ct_zone(ignored_conntrack); 254 + 255 + thash = nf_conntrack_find_get(net, zone, &reply); 256 + if (!thash) 257 + return false; 258 + 259 + ct = nf_ct_tuplehash_to_ctrack(thash); 260 + 261 + if (thash->tuple.dst.dir == IP_CT_DIR_ORIGINAL) 262 + goto out; 263 + 264 + if (WARN_ON_ONCE(ct == ignored_conntrack)) 265 + goto out; 266 + 267 + flags = READ_ONCE(ct->status); 268 + if (!nf_nat_may_kill(ct, flags)) 269 + goto out; 270 + 271 + if (!nf_seq_has_advanced(ct, ignored_conntrack)) 272 + goto out; 273 + 274 + /* Even if we can evict do not reuse if entry is offloaded. 
*/ 275 + if (nf_ct_kill(ct)) 276 + taken = flags & flags_offload; 277 + out: 278 + nf_ct_put(ct); 279 + return taken; 201 280 } 202 281 203 282 static bool nf_nat_inet_in_range(const struct nf_conntrack_tuple *t, ··· 470 385 unsigned int range_size, min, max, i, attempts; 471 386 __be16 *keyptr; 472 387 u16 off; 473 - static const unsigned int max_attempts = 128; 474 388 475 389 switch (tuple->dst.protonum) { 476 390 case IPPROTO_ICMP: ··· 555 471 off = get_random_u16(); 556 472 557 473 attempts = range_size; 558 - if (attempts > max_attempts) 559 - attempts = max_attempts; 474 + if (attempts > NF_NAT_MAX_ATTEMPTS) 475 + attempts = NF_NAT_MAX_ATTEMPTS; 560 476 561 477 /* We are in softirq; doing a search of the entire range risks 562 478 * soft lockup when all tuples are already used. ··· 567 483 another_round: 568 484 for (i = 0; i < attempts; i++, off++) { 569 485 *keyptr = htons(min + off % range_size); 570 - if (!nf_nat_used_tuple(tuple, ct)) 486 + if (!nf_nat_used_tuple_harder(tuple, ct, attempts - i)) 571 487 return; 572 488 } 573 489