net/ipv4/inet_timewait_sock.c at master · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / net / ipv4 / inet_timewait_sock.c
at master 11 kB view raw
  1// SPDX-License-Identifier: GPL-2.0-only
  2/*
  3 * INET		An implementation of the TCP/IP protocol suite for the LINUX
  4 *		operating system.  INET is implemented using the  BSD Socket
  5 *		interface as the means of communication with the user level.
  6 *
  7 *		Generic TIME_WAIT sockets functions
  8 *
  9 *		From code originally in TCP
 10 */
 11
 12#include <linux/kernel.h>
 13#include <linux/slab.h>
 14#include <linux/module.h>
 15#include <net/inet_hashtables.h>
 16#include <net/inet_timewait_sock.h>
 17#include <net/ip.h>
 18#include <net/tcp.h>
 19#include <net/psp.h>
 20
 21/**
 22 *	inet_twsk_bind_unhash - unhash a timewait socket from bind hash
 23 *	@tw: timewait socket
 24 *	@hashinfo: hashinfo pointer
 25 *
 26 *	unhash a timewait socket from bind hash, if hashed.
 27 *	bind hash lock must be held by caller.
 28 *	Returns 1 if caller should call inet_twsk_put() after lock release.
 29 */
 30void inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
 31			  struct inet_hashinfo *hashinfo)
 32{
 33	struct inet_bind2_bucket *tb2 = tw->tw_tb2;
 34	struct inet_bind_bucket *tb = tw->tw_tb;
 35
 36	if (!tb)
 37		return;
 38
 39	__sk_del_bind_node((struct sock *)tw);
 40	tw->tw_tb = NULL;
 41	tw->tw_tb2 = NULL;
 42	inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2);
 43	inet_bind_bucket_destroy(tb);
 44
 45	__sock_put((struct sock *)tw);
 46}
 47
 48/* Must be called with locally disabled BHs. */
 49static void inet_twsk_kill(struct inet_timewait_sock *tw)
 50{
 51	struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo;
 52	spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
 53	struct inet_bind_hashbucket *bhead, *bhead2;
 54
 55	spin_lock(lock);
 56	sk_nulls_del_node_init_rcu((struct sock *)tw);
 57	spin_unlock(lock);
 58
 59	/* Disassociate with bind bucket. */
 60	bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num,
 61			hashinfo->bhash_size)];
 62	bhead2 = inet_bhashfn_portaddr(hashinfo, (struct sock *)tw,
 63				       twsk_net(tw), tw->tw_num);
 64
 65	spin_lock(&bhead->lock);
 66	spin_lock(&bhead2->lock);
 67	inet_twsk_bind_unhash(tw, hashinfo);
 68	spin_unlock(&bhead2->lock);
 69	spin_unlock(&bhead->lock);
 70
 71	refcount_dec(&tw->tw_dr->tw_refcount);
 72	inet_twsk_put(tw);
 73}
 74
 75void inet_twsk_free(struct inet_timewait_sock *tw)
 76{
 77	struct module *owner = tw->tw_prot->owner;
 78
 79	tcp_twsk_destructor((struct sock *)tw);
 80	kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw);
 81	module_put(owner);
 82}
 83
 84void inet_twsk_put(struct inet_timewait_sock *tw)
 85{
 86	if (refcount_dec_and_test(&tw->tw_refcnt))
 87		inet_twsk_free(tw);
 88}
 89EXPORT_SYMBOL_GPL(inet_twsk_put);
 90
 91static void inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo)
 92{
 93	__inet_twsk_schedule(tw, timeo, false);
 94}
 95
 96/*
 97 * Enter the time wait state.
 98 * Essentially we whip up a timewait bucket, copy the relevant info into it
 99 * from the SK, and mess with hash chains and list linkage.
100 *
101 * The caller must not access @tw anymore after this function returns.
102 */
103void inet_twsk_hashdance_schedule(struct inet_timewait_sock *tw,
104				  struct sock *sk,
105				  struct inet_hashinfo *hashinfo,
106				  int timeo)
107{
108	const struct inet_sock *inet = inet_sk(sk);
109	const struct inet_connection_sock *icsk = inet_csk(sk);
110	spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
111	struct inet_bind_hashbucket *bhead, *bhead2;
112
113	/* Put TW into bind hash. Original socket stays there too.
114	 * Note, that any socket with inet->num != 0 MUST be bound in
115	 * binding cache, even if it is closed.
116	 */
117	bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num,
118			hashinfo->bhash_size)];
119	bhead2 = inet_bhashfn_portaddr(hashinfo, sk, twsk_net(tw), inet->inet_num);
120
121	local_bh_disable();
122	spin_lock(&bhead->lock);
123	spin_lock(&bhead2->lock);
124
125	tw->tw_tb = icsk->icsk_bind_hash;
126	WARN_ON(!icsk->icsk_bind_hash);
127
128	tw->tw_tb2 = icsk->icsk_bind2_hash;
129	WARN_ON(!icsk->icsk_bind2_hash);
130	sk_add_bind_node((struct sock *)tw, &tw->tw_tb2->owners);
131
132	spin_unlock(&bhead2->lock);
133	spin_unlock(&bhead->lock);
134
135	spin_lock(lock);
136
137	/* tw_refcnt is set to 3 because we have :
138	 * - one reference for bhash chain.
139	 * - one reference for ehash chain.
140	 * - one reference for timer.
141	 * Also note that after this point, we lost our implicit reference
142	 * so we are not allowed to use tw anymore.
143	 */
144	refcount_set(&tw->tw_refcnt, 3);
145
146	/* Ensure tw_refcnt has been set before tw is published.
147	 * smp_wmb() provides the necessary memory barrier to enforce this
148	 * ordering.
149	 */
150	smp_wmb();
151
152	hlist_nulls_replace_init_rcu(&sk->sk_nulls_node, &tw->tw_node);
153	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
154
155	inet_twsk_schedule(tw, timeo);
156
157	spin_unlock(lock);
158	local_bh_enable();
159}
160
161static void tw_timer_handler(struct timer_list *t)
162{
163	struct inet_timewait_sock *tw = timer_container_of(tw, t, tw_timer);
164
165	inet_twsk_kill(tw);
166}
167
168struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
169					   struct inet_timewait_death_row *dr,
170					   const int state)
171{
172	struct inet_timewait_sock *tw;
173
174	if (refcount_read(&dr->tw_refcount) - 1 >=
175	    READ_ONCE(dr->sysctl_max_tw_buckets))
176		return NULL;
177
178	tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab,
179			      GFP_ATOMIC);
180	if (tw) {
181		const struct inet_sock *inet = inet_sk(sk);
182
183		tw->tw_dr	    = dr;
184		/* Give us an identity. */
185		tw->tw_daddr	    = inet->inet_daddr;
186		tw->tw_rcv_saddr    = inet->inet_rcv_saddr;
187		tw->tw_bound_dev_if = sk->sk_bound_dev_if;
188		tw->tw_tos	    = inet->tos;
189		tw->tw_num	    = inet->inet_num;
190		tw->tw_state	    = TCP_TIME_WAIT;
191		tw->tw_substate	    = state;
192		tw->tw_sport	    = inet->inet_sport;
193		tw->tw_dport	    = inet->inet_dport;
194		tw->tw_family	    = sk->sk_family;
195		tw->tw_reuse	    = sk->sk_reuse;
196		tw->tw_reuseport    = sk->sk_reuseport;
197		tw->tw_hash	    = sk->sk_hash;
198		tw->tw_ipv6only	    = 0;
199		tw->tw_transparent  = inet_test_bit(TRANSPARENT, sk);
200		tw->tw_connect_bind = !!(sk->sk_userlocks & SOCK_CONNECT_BIND);
201		tw->tw_prot	    = sk->sk_prot_creator;
202		atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie));
203		twsk_net_set(tw, sock_net(sk));
204		timer_setup(&tw->tw_timer, tw_timer_handler, 0);
205#ifdef CONFIG_SOCK_VALIDATE_XMIT
206		tw->tw_validate_xmit_skb = NULL;
207#endif
208		/*
209		 * Because we use RCU lookups, we should not set tw_refcnt
210		 * to a non null value before everything is setup for this
211		 * timewait socket.
212		 */
213		refcount_set(&tw->tw_refcnt, 0);
214
215		__module_get(tw->tw_prot->owner);
216		psp_twsk_init(tw, sk);
217	}
218
219	return tw;
220}
221
222/* These are always called from BH context.  See callers in
223 * tcp_input.c to verify this.
224 */
225
226/* This is for handling early-kills of TIME_WAIT sockets.
227 * Warning : consume reference.
228 * Caller should not access tw anymore.
229 */
230void inet_twsk_deschedule_put(struct inet_timewait_sock *tw)
231{
232	struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo;
233	spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
234
235	/* inet_twsk_purge() walks over all sockets, including tw ones,
236	 * and removes them via inet_twsk_deschedule_put() after a
237	 * refcount_inc_not_zero().
238	 *
239	 * inet_twsk_hashdance_schedule() must (re)init the refcount before
240	 * arming the timer, i.e. inet_twsk_purge can obtain a reference to
241	 * a twsk that did not yet schedule the timer.
242	 *
243	 * The ehash lock synchronizes these two:
244	 * After acquiring the lock, the timer is always scheduled (else
245	 * timer_shutdown returns false), because hashdance_schedule releases
246	 * the ehash lock only after completing the timer initialization.
247	 *
248	 * Without grabbing the ehash lock, we get:
249	 * 1) cpu x sets twsk refcount to 3
250	 * 2) cpu y bumps refcount to 4
251	 * 3) cpu y calls inet_twsk_deschedule_put() and shuts timer down
252	 * 4) cpu x tries to start timer, but mod_timer is a noop post-shutdown
253	 * -> timer refcount is never decremented.
254	 */
255	spin_lock(lock);
256	/*  Makes sure hashdance_schedule() has completed */
257	spin_unlock(lock);
258
259	if (timer_shutdown_sync(&tw->tw_timer))
260		inet_twsk_kill(tw);
261	inet_twsk_put(tw);
262}
263EXPORT_SYMBOL(inet_twsk_deschedule_put);
264
265void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm)
266{
267	/* timeout := RTO * 3.5
268	 *
269	 * 3.5 = 1+2+0.5 to wait for two retransmits.
270	 *
271	 * RATIONALE: if FIN arrived and we entered TIME-WAIT state,
272	 * our ACK acking that FIN can be lost. If N subsequent retransmitted
273	 * FINs (or previous seqments) are lost (probability of such event
274	 * is p^(N+1), where p is probability to lose single packet and
275	 * time to detect the loss is about RTO*(2^N - 1) with exponential
276	 * backoff). Normal timewait length is calculated so, that we
277	 * waited at least for one retransmitted FIN (maximal RTO is 120sec).
278	 * [ BTW Linux. following BSD, violates this requirement waiting
279	 *   only for 60sec, we should wait at least for 240 secs.
280	 *   Well, 240 consumes too much of resources 8)
281	 * ]
282	 * This interval is not reduced to catch old duplicate and
283	 * responces to our wandering segments living for two MSLs.
284	 * However, if we use PAWS to detect
285	 * old duplicates, we can reduce the interval to bounds required
286	 * by RTO, rather than MSL. So, if peer understands PAWS, we
287	 * kill tw bucket after 3.5*RTO (it is important that this number
288	 * is greater than TS tick!) and detect old duplicates with help
289	 * of PAWS.
290	 */
291
292	if (!rearm) {
293		bool kill = timeo <= 4*HZ;
294
295		__NET_INC_STATS(twsk_net(tw), kill ? LINUX_MIB_TIMEWAITKILLED :
296						     LINUX_MIB_TIMEWAITED);
297		BUG_ON(mod_timer(&tw->tw_timer, jiffies + timeo));
298		refcount_inc(&tw->tw_dr->tw_refcount);
299	} else {
300		mod_timer_pending(&tw->tw_timer, jiffies + timeo);
301	}
302}
303
304/* Remove all non full sockets (TIME_WAIT and NEW_SYN_RECV) for dead netns */
305void inet_twsk_purge(struct inet_hashinfo *hashinfo)
306{
307	struct inet_ehash_bucket *head = &hashinfo->ehash[0];
308	unsigned int ehash_mask = hashinfo->ehash_mask;
309	struct hlist_nulls_node *node;
310	unsigned int slot;
311	struct sock *sk;
312
313	for (slot = 0; slot <= ehash_mask; slot++, head++) {
314		if (hlist_nulls_empty(&head->chain))
315			continue;
316
317restart_rcu:
318		cond_resched();
319		rcu_read_lock();
320restart:
321		sk_nulls_for_each_rcu(sk, node, &head->chain) {
322			int state = inet_sk_state_load(sk);
323
324			if ((1 << state) & ~(TCPF_TIME_WAIT |
325					     TCPF_NEW_SYN_RECV))
326				continue;
327
328			if (check_net(sock_net(sk)))
329				continue;
330
331			if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
332				continue;
333
334			if (check_net(sock_net(sk))) {
335				sock_gen_put(sk);
336				goto restart;
337			}
338
339			rcu_read_unlock();
340			local_bh_disable();
341			if (state == TCP_TIME_WAIT) {
342				inet_twsk_deschedule_put(inet_twsk(sk));
343			} else {
344				struct request_sock *req = inet_reqsk(sk);
345
346				inet_csk_reqsk_queue_drop_and_put(req->rsk_listener,
347								  req);
348			}
349			local_bh_enable();
350			goto restart_rcu;
351		}
352		/* If the nulls value we got at the end of this lookup is
353		 * not the expected one, we must restart lookup.
354		 * We probably met an item that was moved to another chain.
355		 */
356		if (get_nulls_value(node) != slot)
357			goto restart;
358		rcu_read_unlock();
359	}
360}