Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

[INET]: Generalise the TCP sock ID lookup routines

And also some TIME_WAIT functions.

[acme@toy net-2.6.14]$ grep built-in /tmp/before.size /tmp/after.size
/tmp/before.size: 282955 13122 9312 305389 4a8ed net/ipv4/built-in.o
/tmp/after.size: 281566 13122 9312 304000 4a380 net/ipv4/built-in.o
[acme@toy net-2.6.14]$

I kept them still inlined, will uninline at some point to see what
would be the performance difference.

Signed-off-by: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Signed-off-by: David S. Miller <davem@davemloft.net>

Authored by Arnaldo Carvalho de Melo; committed by David S. Miller.

Commit hashes: e48c414e 8feaf0c0

+188 -165
+72 -5
include/net/inet_hashtables.h
··· 30 30 #include <net/tcp_states.h> 31 31 32 32 #include <asm/atomic.h> 33 + #include <asm/byteorder.h> 33 34 34 35 /* This is for all connections with a full identity, no wildcards. 35 36 * New scheme, half the table is for TIME_WAIT, the other half is ··· 286 285 const int dif); 287 286 288 287 /* Optimize the common listener case. */ 289 - static inline struct sock *inet_lookup_listener(struct inet_hashinfo *hashinfo, 290 - const u32 daddr, 291 - const unsigned short hnum, 292 - const int dif) 288 + static inline struct sock * 289 + inet_lookup_listener(struct inet_hashinfo *hashinfo, 290 + const u32 daddr, 291 + const unsigned short hnum, const int dif) 293 292 { 294 293 struct sock *sk = NULL; 295 - struct hlist_head *head; 294 + const struct hlist_head *head; 296 295 297 296 read_lock(&hashinfo->lhash_lock); 298 297 head = &hashinfo->listening_hash[inet_lhashfn(hnum)]; ··· 352 351 ((*((__u32 *)&(inet_twsk(__sk)->tw_dport))) == (__ports)) && \ 353 352 (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) 354 353 #endif /* 64-bit arch */ 354 + 355 + /* 356 + * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so we need 357 + * not check it for lookups anymore, thanks Alexey. -DaveM 358 + * 359 + * Local BH must be disabled here. 360 + */ 361 + static inline struct sock * 362 + __inet_lookup_established(struct inet_hashinfo *hashinfo, 363 + const u32 saddr, const u16 sport, 364 + const u32 daddr, const u16 hnum, 365 + const int dif) 366 + { 367 + INET_ADDR_COOKIE(acookie, saddr, daddr) 368 + const __u32 ports = INET_COMBINED_PORTS(sport, hnum); 369 + struct sock *sk; 370 + const struct hlist_node *node; 371 + /* Optimize here for direct hit, only listening connections can 372 + * have wildcards anyways. 
373 + */ 374 + const int hash = inet_ehashfn(daddr, hnum, saddr, sport, hashinfo->ehash_size); 375 + struct inet_ehash_bucket *head = &hashinfo->ehash[hash]; 376 + 377 + read_lock(&head->lock); 378 + sk_for_each(sk, node, &head->chain) { 379 + if (INET_MATCH(sk, acookie, saddr, daddr, ports, dif)) 380 + goto hit; /* You sunk my battleship! */ 381 + } 382 + 383 + /* Must check for a TIME_WAIT'er before going to listener hash. */ 384 + sk_for_each(sk, node, &(head + hashinfo->ehash_size)->chain) { 385 + if (INET_TW_MATCH(sk, acookie, saddr, daddr, ports, dif)) 386 + goto hit; 387 + } 388 + sk = NULL; 389 + out: 390 + read_unlock(&head->lock); 391 + return sk; 392 + hit: 393 + sock_hold(sk); 394 + goto out; 395 + } 396 + 397 + static inline struct sock *__inet_lookup(struct inet_hashinfo *hashinfo, 398 + const u32 saddr, const u16 sport, 399 + const u32 daddr, const u16 hnum, 400 + const int dif) 401 + { 402 + struct sock *sk = __inet_lookup_established(hashinfo, saddr, sport, daddr, 403 + hnum, dif); 404 + return sk ? : inet_lookup_listener(hashinfo, daddr, hnum, dif); 405 + } 406 + 407 + static inline struct sock *inet_lookup(struct inet_hashinfo *hashinfo, 408 + const u32 saddr, const u16 sport, 409 + const u32 daddr, const u16 dport, 410 + const int dif) 411 + { 412 + struct sock *sk; 413 + 414 + local_bh_disable(); 415 + sk = __inet_lookup(hashinfo, saddr, sport, daddr, ntohs(dport), dif); 416 + local_bh_enable(); 417 + 418 + return sk; 419 + } 355 420 #endif /* _INET_HASHTABLES_H */
+9
include/net/inet_timewait_sock.h
··· 17 17 18 18 #include <linux/config.h> 19 19 20 + #include <linux/ip.h> 20 21 #include <linux/list.h> 21 22 #include <linux/types.h> 22 23 ··· 33 32 #endif 34 33 35 34 struct inet_bind_bucket; 35 + struct inet_hashinfo; 36 36 37 37 /* 38 38 * This is a TIME_WAIT sock. It works around the memory consumption ··· 141 139 kmem_cache_free(tw->tw_prot->twsk_slab, tw); 142 140 } 143 141 } 142 + 143 + extern void __inet_twsk_kill(struct inet_timewait_sock *tw, 144 + struct inet_hashinfo *hashinfo); 145 + 146 + extern void __inet_twsk_hashdance(struct inet_timewait_sock *tw, 147 + struct sock *sk, 148 + struct inet_hashinfo *hashinfo); 144 149 #endif /* _INET_TIMEWAIT_SOCK_ */
+6 -6
include/net/sock.h
··· 255 255 /* 256 256 * Hashed lists helper routines 257 257 */ 258 - static inline struct sock *__sk_head(struct hlist_head *head) 258 + static inline struct sock *__sk_head(const struct hlist_head *head) 259 259 { 260 260 return hlist_entry(head->first, struct sock, sk_node); 261 261 } 262 262 263 - static inline struct sock *sk_head(struct hlist_head *head) 263 + static inline struct sock *sk_head(const struct hlist_head *head) 264 264 { 265 265 return hlist_empty(head) ? NULL : __sk_head(head); 266 266 } 267 267 268 - static inline struct sock *sk_next(struct sock *sk) 268 + static inline struct sock *sk_next(const struct sock *sk) 269 269 { 270 270 return sk->sk_node.next ? 271 271 hlist_entry(sk->sk_node.next, struct sock, sk_node) : NULL; 272 272 } 273 273 274 - static inline int sk_unhashed(struct sock *sk) 274 + static inline int sk_unhashed(const struct sock *sk) 275 275 { 276 276 return hlist_unhashed(&sk->sk_node); 277 277 } 278 278 279 - static inline int sk_hashed(struct sock *sk) 279 + static inline int sk_hashed(const struct sock *sk) 280 280 { 281 281 return sk->sk_node.pprev != NULL; 282 282 } ··· 494 494 struct request_sock_ops; 495 495 496 496 /* Here is the right place to enable sock refcounting debugging */ 497 - #define SOCK_REFCNT_DEBUG 497 + //#define SOCK_REFCNT_DEBUG 498 498 499 499 /* Networking protocol blocks we attach to sockets. 500 500 * socket layer -> transport layer interface
+1
net/ipv4/Makefile
··· 5 5 obj-y := route.o inetpeer.o protocol.o \ 6 6 ip_input.o ip_fragment.o ip_forward.o ip_options.o \ 7 7 ip_output.o ip_sockglue.o inet_hashtables.o \ 8 + inet_timewait_sock.o \ 8 9 tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ 9 10 tcp_minisocks.o tcp_cong.o \ 10 11 datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \
+2
net/ipv4/inet_hashtables.c
··· 162 162 } 163 163 return result; 164 164 } 165 + 166 + EXPORT_SYMBOL_GPL(__inet_lookup_listener);
+83
net/ipv4/inet_timewait_sock.c
··· 1 + /* 2 + * INET An implementation of the TCP/IP protocol suite for the LINUX 3 + * operating system. INET is implemented using the BSD Socket 4 + * interface as the means of communication with the user level. 5 + * 6 + * Generic TIME_WAIT sockets functions 7 + * 8 + * From code originally in TCP 9 + */ 10 + 11 + #include <linux/config.h> 12 + 13 + #include <net/inet_hashtables.h> 14 + #include <net/inet_timewait_sock.h> 15 + 16 + /* Must be called with locally disabled BHs. */ 17 + void __inet_twsk_kill(struct inet_timewait_sock *tw, struct inet_hashinfo *hashinfo) 18 + { 19 + struct inet_bind_hashbucket *bhead; 20 + struct inet_bind_bucket *tb; 21 + /* Unlink from established hashes. */ 22 + struct inet_ehash_bucket *ehead = &hashinfo->ehash[tw->tw_hashent]; 23 + 24 + write_lock(&ehead->lock); 25 + if (hlist_unhashed(&tw->tw_node)) { 26 + write_unlock(&ehead->lock); 27 + return; 28 + } 29 + __hlist_del(&tw->tw_node); 30 + sk_node_init(&tw->tw_node); 31 + write_unlock(&ehead->lock); 32 + 33 + /* Disassociate with bind bucket. */ 34 + bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num, hashinfo->bhash_size)]; 35 + spin_lock(&bhead->lock); 36 + tb = tw->tw_tb; 37 + __hlist_del(&tw->tw_bind_node); 38 + tw->tw_tb = NULL; 39 + inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); 40 + spin_unlock(&bhead->lock); 41 + #ifdef SOCK_REFCNT_DEBUG 42 + if (atomic_read(&tw->tw_refcnt) != 1) { 43 + printk(KERN_DEBUG "%s timewait_sock %p refcnt=%d\n", 44 + tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt)); 45 + } 46 + #endif 47 + inet_twsk_put(tw); 48 + } 49 + 50 + /* 51 + * Enter the time wait state. This is called with locally disabled BH. 52 + * Essentially we whip up a timewait bucket, copy the relevant info into it 53 + * from the SK, and mess with hash chains and list linkage. 
54 + */ 55 + void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, 56 + struct inet_hashinfo *hashinfo) 57 + { 58 + const struct inet_sock *inet = inet_sk(sk); 59 + struct inet_ehash_bucket *ehead = &hashinfo->ehash[sk->sk_hashent]; 60 + struct inet_bind_hashbucket *bhead; 61 + /* Step 1: Put TW into bind hash. Original socket stays there too. 62 + Note, that any socket with inet->num != 0 MUST be bound in 63 + binding cache, even if it is closed. 64 + */ 65 + bhead = &hashinfo->bhash[inet_bhashfn(inet->num, hashinfo->bhash_size)]; 66 + spin_lock(&bhead->lock); 67 + tw->tw_tb = inet->bind_hash; 68 + BUG_TRAP(inet->bind_hash); 69 + inet_twsk_add_bind_node(tw, &tw->tw_tb->owners); 70 + spin_unlock(&bhead->lock); 71 + 72 + write_lock(&ehead->lock); 73 + 74 + /* Step 2: Remove SK from established hash. */ 75 + if (__sk_del_node_init(sk)) 76 + sock_prot_dec_use(sk->sk_prot); 77 + 78 + /* Step 3: Hash TW into TIMEWAIT half of established hash table. */ 79 + inet_twsk_add_node(tw, &(ehead + hashinfo->ehash_size)->chain); 80 + atomic_inc(&tw->tw_refcnt); 81 + 82 + write_unlock(&ehead->lock); 83 + }
+3 -5
net/ipv4/tcp_diag.c
··· 174 174 return -1; 175 175 } 176 176 177 - extern struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, 178 - int dif); 179 177 #ifdef CONFIG_IP_TCPDIAG_IPV6 180 178 extern struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport, 181 179 struct in6_addr *daddr, u16 dport, ··· 195 197 struct sk_buff *rep; 196 198 197 199 if (req->tcpdiag_family == AF_INET) { 198 - sk = tcp_v4_lookup(req->id.tcpdiag_dst[0], req->id.tcpdiag_dport, 199 - req->id.tcpdiag_src[0], req->id.tcpdiag_sport, 200 - req->id.tcpdiag_if); 200 + sk = inet_lookup(&tcp_hashinfo, req->id.tcpdiag_dst[0], 201 + req->id.tcpdiag_dport, req->id.tcpdiag_src[0], 202 + req->id.tcpdiag_sport, req->id.tcpdiag_if); 201 203 } 202 204 #ifdef CONFIG_IP_TCPDIAG_IPV6 203 205 else if (req->tcpdiag_family == AF_INET6) {
+8 -75
net/ipv4/tcp_ipv4.c
··· 238 238 inet_unhash(&tcp_hashinfo, sk); 239 239 } 240 240 241 - /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so 242 - * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM 243 - * 244 - * Local BH must be disabled here. 245 - */ 246 - 247 - static inline struct sock *__tcp_v4_lookup_established(const u32 saddr, 248 - const u16 sport, 249 - const u32 daddr, 250 - const u16 hnum, 251 - const int dif) 252 - { 253 - struct inet_ehash_bucket *head; 254 - INET_ADDR_COOKIE(acookie, saddr, daddr) 255 - const __u32 ports = INET_COMBINED_PORTS(sport, hnum); 256 - struct sock *sk; 257 - const struct hlist_node *node; 258 - /* Optimize here for direct hit, only listening connections can 259 - * have wildcards anyways. 260 - */ 261 - const int hash = inet_ehashfn(daddr, hnum, saddr, sport, tcp_hashinfo.ehash_size); 262 - head = &tcp_hashinfo.ehash[hash]; 263 - read_lock(&head->lock); 264 - sk_for_each(sk, node, &head->chain) { 265 - if (INET_MATCH(sk, acookie, saddr, daddr, ports, dif)) 266 - goto hit; /* You sunk my battleship! */ 267 - } 268 - 269 - /* Must check for a TIME_WAIT'er before going to listener hash. */ 270 - sk_for_each(sk, node, &(head + tcp_hashinfo.ehash_size)->chain) { 271 - if (INET_TW_MATCH(sk, acookie, saddr, daddr, ports, dif)) 272 - goto hit; 273 - } 274 - sk = NULL; 275 - out: 276 - read_unlock(&head->lock); 277 - return sk; 278 - hit: 279 - sock_hold(sk); 280 - goto out; 281 - } 282 - 283 - static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport, 284 - u32 daddr, u16 hnum, int dif) 285 - { 286 - struct sock *sk = __tcp_v4_lookup_established(saddr, sport, 287 - daddr, hnum, dif); 288 - 289 - return sk ? 
: inet_lookup_listener(&tcp_hashinfo, daddr, hnum, dif); 290 - } 291 - 292 - inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, 293 - u16 dport, int dif) 294 - { 295 - struct sock *sk; 296 - 297 - local_bh_disable(); 298 - sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif); 299 - local_bh_enable(); 300 - 301 - return sk; 302 - } 303 - 304 - EXPORT_SYMBOL_GPL(tcp_v4_lookup); 305 - 306 241 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb) 307 242 { 308 243 return secure_tcp_sequence_number(skb->nh.iph->daddr, ··· 686 751 return; 687 752 } 688 753 689 - sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, 690 - th->source, tcp_v4_iif(skb)); 754 + sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr, 755 + th->source, tcp_v4_iif(skb)); 691 756 if (!sk) { 692 757 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); 693 758 return; ··· 1294 1359 if (req) 1295 1360 return tcp_check_req(sk, skb, req, prev); 1296 1361 1297 - nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr, 1298 - th->source, 1299 - skb->nh.iph->daddr, 1300 - ntohs(th->dest), 1301 - tcp_v4_iif(skb)); 1362 + nsk = __inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr, 1363 + th->source, skb->nh.iph->daddr, 1364 + ntohs(th->dest), tcp_v4_iif(skb)); 1302 1365 1303 1366 if (nsk) { 1304 1367 if (nsk->sk_state != TCP_TIME_WAIT) { ··· 1438 1505 TCP_SKB_CB(skb)->flags = skb->nh.iph->tos; 1439 1506 TCP_SKB_CB(skb)->sacked = 0; 1440 1507 1441 - sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source, 1442 - skb->nh.iph->daddr, ntohs(th->dest), 1443 - tcp_v4_iif(skb)); 1508 + sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source, 1509 + skb->nh.iph->daddr, ntohs(th->dest), 1510 + tcp_v4_iif(skb)); 1444 1511 1445 1512 if (!sk) 1446 1513 goto no_tcp_socket;
+4 -74
net/ipv4/tcp_minisocks.c
··· 56 56 57 57 int tcp_tw_count; 58 58 59 - 60 - /* Must be called with locally disabled BHs. */ 61 - static void tcp_timewait_kill(struct inet_timewait_sock *tw) 62 - { 63 - struct inet_bind_hashbucket *bhead; 64 - struct inet_bind_bucket *tb; 65 - /* Unlink from established hashes. */ 66 - struct inet_ehash_bucket *ehead = &tcp_hashinfo.ehash[tw->tw_hashent]; 67 - 68 - write_lock(&ehead->lock); 69 - if (hlist_unhashed(&tw->tw_node)) { 70 - write_unlock(&ehead->lock); 71 - return; 72 - } 73 - __hlist_del(&tw->tw_node); 74 - sk_node_init(&tw->tw_node); 75 - write_unlock(&ehead->lock); 76 - 77 - /* Disassociate with bind bucket. */ 78 - bhead = &tcp_hashinfo.bhash[inet_bhashfn(tw->tw_num, tcp_hashinfo.bhash_size)]; 79 - spin_lock(&bhead->lock); 80 - tb = tw->tw_tb; 81 - __hlist_del(&tw->tw_bind_node); 82 - tw->tw_tb = NULL; 83 - inet_bind_bucket_destroy(tcp_hashinfo.bind_bucket_cachep, tb); 84 - spin_unlock(&bhead->lock); 85 - 86 - #ifdef SOCK_REFCNT_DEBUG 87 - if (atomic_read(&tw->tw_refcnt) != 1) { 88 - printk(KERN_DEBUG "%s timewait_sock %p refcnt=%d\n", 89 - tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt)); 90 - } 91 - #endif 92 - inet_twsk_put(tw); 93 - } 94 - 95 59 /* 96 60 * * Main purpose of TIME-WAIT state is to close connection gracefully, 97 61 * when one of ends sits in LAST-ACK or CLOSING retransmitting FIN ··· 254 290 return TCP_TW_SUCCESS; 255 291 } 256 292 257 - /* Enter the time wait state. This is called with locally disabled BH. 258 - * Essentially we whip up a timewait bucket, copy the 259 - * relevant info into it from the SK, and mess with hash chains 260 - * and list linkage. 261 - */ 262 - static void __tcp_tw_hashdance(struct sock *sk, struct inet_timewait_sock *tw) 263 - { 264 - const struct inet_sock *inet = inet_sk(sk); 265 - struct inet_ehash_bucket *ehead = &tcp_hashinfo.ehash[sk->sk_hashent]; 266 - struct inet_bind_hashbucket *bhead; 267 - /* Step 1: Put TW into bind hash. Original socket stays there too. 
268 - Note, that any socket with inet->num != 0 MUST be bound in 269 - binding cache, even if it is closed. 270 - */ 271 - bhead = &tcp_hashinfo.bhash[inet_bhashfn(inet->num, tcp_hashinfo.bhash_size)]; 272 - spin_lock(&bhead->lock); 273 - tw->tw_tb = inet->bind_hash; 274 - BUG_TRAP(inet->bind_hash); 275 - inet_twsk_add_bind_node(tw, &tw->tw_tb->owners); 276 - spin_unlock(&bhead->lock); 277 - 278 - write_lock(&ehead->lock); 279 - 280 - /* Step 2: Remove SK from established hash. */ 281 - if (__sk_del_node_init(sk)) 282 - sock_prot_dec_use(sk->sk_prot); 283 - 284 - /* Step 3: Hash TW into TIMEWAIT half of established hash table. */ 285 - inet_twsk_add_node(tw, &(ehead + tcp_hashinfo.ehash_size)->chain); 286 - atomic_inc(&tw->tw_refcnt); 287 - 288 - write_unlock(&ehead->lock); 289 - } 290 - 291 293 /* 292 294 * Move a socket to time-wait or dead fin-wait-2 state. 293 295 */ ··· 311 381 tw->tw_ipv6only = 0; 312 382 #endif 313 383 /* Linkage updates. */ 314 - __tcp_tw_hashdance(sk, tw); 384 + __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); 315 385 316 386 /* Get the TIME_WAIT timeout firing. */ 317 387 if (timeo < rto) ··· 378 448 inet_twsk_for_each_inmate(tw, node, &tcp_tw_death_row[slot]) { 379 449 __inet_twsk_del_dead_node(tw); 380 450 spin_unlock(&tw_death_lock); 381 - tcp_timewait_kill(tw); 451 + __inet_twsk_kill(tw, &tcp_hashinfo); 382 452 inet_twsk_put(tw); 383 453 killed++; 384 454 spin_lock(&tw_death_lock); ··· 474 544 del_timer(&tcp_tw_timer); 475 545 } 476 546 spin_unlock(&tw_death_lock); 477 - tcp_timewait_kill(tw); 547 + __inet_twsk_kill(tw, &tcp_hashinfo); 478 548 } 479 549 480 550 /* Short-time timewait calendar */ ··· 583 653 inet_twsk_for_each_inmate_safe(tw, node, safe, 584 654 &tcp_twcal_row[slot]) { 585 655 __inet_twsk_del_dead_node(tw); 586 - tcp_timewait_kill(tw); 656 + __inet_twsk_kill(tw, &tcp_hashinfo); 587 657 inet_twsk_put(tw); 588 658 killed++; 589 659 }