/*
 * Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
 * kernel os linux
 * 1
 * fork
 *
 * Configure Feed
 *
 * Select the types of activity you want to include in your feed.
 *
 * at v2.6.13 903 lines 22 kB view raw
 */
1/* 2 * IPVS An implementation of the IP virtual server support for the 3 * LINUX operating system. IPVS is now implemented as a module 4 * over the Netfilter framework. IPVS can be used to build a 5 * high-performance and highly available server based on a 6 * cluster of servers. 7 * 8 * Version: $Id: ip_vs_conn.c,v 1.31 2003/04/18 09:03:16 wensong Exp $ 9 * 10 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> 11 * Peter Kese <peter.kese@ijs.si> 12 * Julian Anastasov <ja@ssi.bg> 13 * 14 * This program is free software; you can redistribute it and/or 15 * modify it under the terms of the GNU General Public License 16 * as published by the Free Software Foundation; either version 17 * 2 of the License, or (at your option) any later version. 18 * 19 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, 20 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms 21 * and others. Many code here is taken from IP MASQ code of kernel 2.2. 22 * 23 * Changes: 24 * 25 */ 26 27#include <linux/kernel.h> 28#include <linux/vmalloc.h> 29#include <linux/proc_fs.h> /* for proc_net_* */ 30#include <linux/seq_file.h> 31#include <linux/jhash.h> 32#include <linux/random.h> 33 34#include <net/ip_vs.h> 35 36 37/* 38 * Connection hash table: for input and output packets lookups of IPVS 39 */ 40static struct list_head *ip_vs_conn_tab; 41 42/* SLAB cache for IPVS connections */ 43static kmem_cache_t *ip_vs_conn_cachep; 44 45/* counter for current IPVS connections */ 46static atomic_t ip_vs_conn_count = ATOMIC_INIT(0); 47 48/* counter for no client port connections */ 49static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0); 50 51/* random value for IPVS connection hash */ 52static unsigned int ip_vs_conn_rnd; 53 54/* 55 * Fine locking granularity for big connection hash table 56 */ 57#define CT_LOCKARRAY_BITS 4 58#define CT_LOCKARRAY_SIZE (1<<CT_LOCKARRAY_BITS) 59#define CT_LOCKARRAY_MASK (CT_LOCKARRAY_SIZE-1) 60 61struct ip_vs_aligned_lock 62{ 63 
rwlock_t l; 64} __attribute__((__aligned__(SMP_CACHE_BYTES))); 65 66/* lock array for conn table */ 67static struct ip_vs_aligned_lock 68__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned; 69 70static inline void ct_read_lock(unsigned key) 71{ 72 read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 73} 74 75static inline void ct_read_unlock(unsigned key) 76{ 77 read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 78} 79 80static inline void ct_write_lock(unsigned key) 81{ 82 write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 83} 84 85static inline void ct_write_unlock(unsigned key) 86{ 87 write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 88} 89 90static inline void ct_read_lock_bh(unsigned key) 91{ 92 read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 93} 94 95static inline void ct_read_unlock_bh(unsigned key) 96{ 97 read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 98} 99 100static inline void ct_write_lock_bh(unsigned key) 101{ 102 write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 103} 104 105static inline void ct_write_unlock_bh(unsigned key) 106{ 107 write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 108} 109 110 111/* 112 * Returns hash value for IPVS connection entry 113 */ 114static unsigned int ip_vs_conn_hashkey(unsigned proto, __u32 addr, __u16 port) 115{ 116 return jhash_3words(addr, port, proto, ip_vs_conn_rnd) 117 & IP_VS_CONN_TAB_MASK; 118} 119 120 121/* 122 * Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port. 123 * returns bool success. 
124 */ 125static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) 126{ 127 unsigned hash; 128 int ret; 129 130 /* Hash by protocol, client address and port */ 131 hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport); 132 133 ct_write_lock(hash); 134 135 if (!(cp->flags & IP_VS_CONN_F_HASHED)) { 136 list_add(&cp->c_list, &ip_vs_conn_tab[hash]); 137 cp->flags |= IP_VS_CONN_F_HASHED; 138 atomic_inc(&cp->refcnt); 139 ret = 1; 140 } else { 141 IP_VS_ERR("ip_vs_conn_hash(): request for already hashed, " 142 "called from %p\n", __builtin_return_address(0)); 143 ret = 0; 144 } 145 146 ct_write_unlock(hash); 147 148 return ret; 149} 150 151 152/* 153 * UNhashes ip_vs_conn from ip_vs_conn_tab. 154 * returns bool success. 155 */ 156static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) 157{ 158 unsigned hash; 159 int ret; 160 161 /* unhash it and decrease its reference counter */ 162 hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport); 163 164 ct_write_lock(hash); 165 166 if (cp->flags & IP_VS_CONN_F_HASHED) { 167 list_del(&cp->c_list); 168 cp->flags &= ~IP_VS_CONN_F_HASHED; 169 atomic_dec(&cp->refcnt); 170 ret = 1; 171 } else 172 ret = 0; 173 174 ct_write_unlock(hash); 175 176 return ret; 177} 178 179 180/* 181 * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. 182 * Called for pkts coming from OUTside-to-INside. 
183 * s_addr, s_port: pkt source address (foreign host) 184 * d_addr, d_port: pkt dest address (load balancer) 185 */ 186static inline struct ip_vs_conn *__ip_vs_conn_in_get 187(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port) 188{ 189 unsigned hash; 190 struct ip_vs_conn *cp; 191 192 hash = ip_vs_conn_hashkey(protocol, s_addr, s_port); 193 194 ct_read_lock(hash); 195 196 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { 197 if (s_addr==cp->caddr && s_port==cp->cport && 198 d_port==cp->vport && d_addr==cp->vaddr && 199 protocol==cp->protocol) { 200 /* HIT */ 201 atomic_inc(&cp->refcnt); 202 ct_read_unlock(hash); 203 return cp; 204 } 205 } 206 207 ct_read_unlock(hash); 208 209 return NULL; 210} 211 212struct ip_vs_conn *ip_vs_conn_in_get 213(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port) 214{ 215 struct ip_vs_conn *cp; 216 217 cp = __ip_vs_conn_in_get(protocol, s_addr, s_port, d_addr, d_port); 218 if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) 219 cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port); 220 221 IP_VS_DBG(7, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", 222 ip_vs_proto_name(protocol), 223 NIPQUAD(s_addr), ntohs(s_port), 224 NIPQUAD(d_addr), ntohs(d_port), 225 cp?"hit":"not hit"); 226 227 return cp; 228} 229 230 231/* 232 * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. 233 * Called for pkts coming from inside-to-OUTside. 
234 * s_addr, s_port: pkt source address (inside host) 235 * d_addr, d_port: pkt dest address (foreign host) 236 */ 237struct ip_vs_conn *ip_vs_conn_out_get 238(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port) 239{ 240 unsigned hash; 241 struct ip_vs_conn *cp, *ret=NULL; 242 243 /* 244 * Check for "full" addressed entries 245 */ 246 hash = ip_vs_conn_hashkey(protocol, d_addr, d_port); 247 248 ct_read_lock(hash); 249 250 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { 251 if (d_addr == cp->caddr && d_port == cp->cport && 252 s_port == cp->dport && s_addr == cp->daddr && 253 protocol == cp->protocol) { 254 /* HIT */ 255 atomic_inc(&cp->refcnt); 256 ret = cp; 257 break; 258 } 259 } 260 261 ct_read_unlock(hash); 262 263 IP_VS_DBG(7, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", 264 ip_vs_proto_name(protocol), 265 NIPQUAD(s_addr), ntohs(s_port), 266 NIPQUAD(d_addr), ntohs(d_port), 267 ret?"hit":"not hit"); 268 269 return ret; 270} 271 272 273/* 274 * Put back the conn and restart its timer with its timeout 275 */ 276void ip_vs_conn_put(struct ip_vs_conn *cp) 277{ 278 /* reset it expire in its timeout */ 279 mod_timer(&cp->timer, jiffies+cp->timeout); 280 281 __ip_vs_conn_put(cp); 282} 283 284 285/* 286 * Fill a no_client_port connection with a client port number 287 */ 288void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __u16 cport) 289{ 290 if (ip_vs_conn_unhash(cp)) { 291 spin_lock(&cp->lock); 292 if (cp->flags & IP_VS_CONN_F_NO_CPORT) { 293 atomic_dec(&ip_vs_conn_no_cport_cnt); 294 cp->flags &= ~IP_VS_CONN_F_NO_CPORT; 295 cp->cport = cport; 296 } 297 spin_unlock(&cp->lock); 298 299 /* hash on new dport */ 300 ip_vs_conn_hash(cp); 301 } 302} 303 304 305/* 306 * Bind a connection entry with the corresponding packet_xmit. 307 * Called by ip_vs_conn_new. 
308 */ 309static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp) 310{ 311 switch (IP_VS_FWD_METHOD(cp)) { 312 case IP_VS_CONN_F_MASQ: 313 cp->packet_xmit = ip_vs_nat_xmit; 314 break; 315 316 case IP_VS_CONN_F_TUNNEL: 317 cp->packet_xmit = ip_vs_tunnel_xmit; 318 break; 319 320 case IP_VS_CONN_F_DROUTE: 321 cp->packet_xmit = ip_vs_dr_xmit; 322 break; 323 324 case IP_VS_CONN_F_LOCALNODE: 325 cp->packet_xmit = ip_vs_null_xmit; 326 break; 327 328 case IP_VS_CONN_F_BYPASS: 329 cp->packet_xmit = ip_vs_bypass_xmit; 330 break; 331 } 332} 333 334 335static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest) 336{ 337 return atomic_read(&dest->activeconns) 338 + atomic_read(&dest->inactconns); 339} 340 341/* 342 * Bind a connection entry with a virtual service destination 343 * Called just after a new connection entry is created. 344 */ 345static inline void 346ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) 347{ 348 /* if dest is NULL, then return directly */ 349 if (!dest) 350 return; 351 352 /* Increase the refcnt counter of the dest */ 353 atomic_inc(&dest->refcnt); 354 355 /* Bind with the destination and its corresponding transmitter */ 356 cp->flags |= atomic_read(&dest->conn_flags); 357 cp->dest = dest; 358 359 IP_VS_DBG(9, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " 360 "d:%u.%u.%u.%u:%d fwd:%c s:%u flg:%X cnt:%d destcnt:%d\n", 361 ip_vs_proto_name(cp->protocol), 362 NIPQUAD(cp->caddr), ntohs(cp->cport), 363 NIPQUAD(cp->vaddr), ntohs(cp->vport), 364 NIPQUAD(cp->daddr), ntohs(cp->dport), 365 ip_vs_fwd_tag(cp), cp->state, 366 cp->flags, atomic_read(&cp->refcnt), 367 atomic_read(&dest->refcnt)); 368 369 /* Update the connection counters */ 370 if (cp->cport || (cp->flags & IP_VS_CONN_F_NO_CPORT)) { 371 /* It is a normal connection, so increase the inactive 372 connection counter because it is in TCP SYNRECV 373 state (inactive) or other protocol inacive state */ 374 atomic_inc(&dest->inactconns); 375 } else { 376 /* It is a persistent 
connection/template, so increase 377 the peristent connection counter */ 378 atomic_inc(&dest->persistconns); 379 } 380 381 if (dest->u_threshold != 0 && 382 ip_vs_dest_totalconns(dest) >= dest->u_threshold) 383 dest->flags |= IP_VS_DEST_F_OVERLOAD; 384} 385 386 387/* 388 * Unbind a connection entry with its VS destination 389 * Called by the ip_vs_conn_expire function. 390 */ 391static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp) 392{ 393 struct ip_vs_dest *dest = cp->dest; 394 395 if (!dest) 396 return; 397 398 IP_VS_DBG(9, "Unbind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " 399 "d:%u.%u.%u.%u:%d fwd:%c s:%u flg:%X cnt:%d destcnt:%d\n", 400 ip_vs_proto_name(cp->protocol), 401 NIPQUAD(cp->caddr), ntohs(cp->cport), 402 NIPQUAD(cp->vaddr), ntohs(cp->vport), 403 NIPQUAD(cp->daddr), ntohs(cp->dport), 404 ip_vs_fwd_tag(cp), cp->state, 405 cp->flags, atomic_read(&cp->refcnt), 406 atomic_read(&dest->refcnt)); 407 408 /* Update the connection counters */ 409 if (cp->cport || (cp->flags & IP_VS_CONN_F_NO_CPORT)) { 410 /* It is a normal connection, so decrease the inactconns 411 or activeconns counter */ 412 if (cp->flags & IP_VS_CONN_F_INACTIVE) { 413 atomic_dec(&dest->inactconns); 414 } else { 415 atomic_dec(&dest->activeconns); 416 } 417 } else { 418 /* It is a persistent connection/template, so decrease 419 the peristent connection counter */ 420 atomic_dec(&dest->persistconns); 421 } 422 423 if (dest->l_threshold != 0) { 424 if (ip_vs_dest_totalconns(dest) < dest->l_threshold) 425 dest->flags &= ~IP_VS_DEST_F_OVERLOAD; 426 } else if (dest->u_threshold != 0) { 427 if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3) 428 dest->flags &= ~IP_VS_DEST_F_OVERLOAD; 429 } else { 430 if (dest->flags & IP_VS_DEST_F_OVERLOAD) 431 dest->flags &= ~IP_VS_DEST_F_OVERLOAD; 432 } 433 434 /* 435 * Simply decrease the refcnt of the dest, because the 436 * dest will be either in service's destination list 437 * or in the trash. 
438 */ 439 atomic_dec(&dest->refcnt); 440} 441 442 443/* 444 * Checking if the destination of a connection template is available. 445 * If available, return 1, otherwise invalidate this connection 446 * template and return 0. 447 */ 448int ip_vs_check_template(struct ip_vs_conn *ct) 449{ 450 struct ip_vs_dest *dest = ct->dest; 451 452 /* 453 * Checking the dest server status. 454 */ 455 if ((dest == NULL) || 456 !(dest->flags & IP_VS_DEST_F_AVAILABLE) || 457 (sysctl_ip_vs_expire_quiescent_template && 458 (atomic_read(&dest->weight) == 0))) { 459 IP_VS_DBG(9, "check_template: dest not available for " 460 "protocol %s s:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " 461 "-> d:%u.%u.%u.%u:%d\n", 462 ip_vs_proto_name(ct->protocol), 463 NIPQUAD(ct->caddr), ntohs(ct->cport), 464 NIPQUAD(ct->vaddr), ntohs(ct->vport), 465 NIPQUAD(ct->daddr), ntohs(ct->dport)); 466 467 /* 468 * Invalidate the connection template 469 */ 470 if (ct->cport) { 471 if (ip_vs_conn_unhash(ct)) { 472 ct->dport = 65535; 473 ct->vport = 65535; 474 ct->cport = 0; 475 ip_vs_conn_hash(ct); 476 } 477 } 478 479 /* 480 * Simply decrease the refcnt of the template, 481 * don't restart its timer. 482 */ 483 atomic_dec(&ct->refcnt); 484 return 0; 485 } 486 return 1; 487} 488 489static void ip_vs_conn_expire(unsigned long data) 490{ 491 struct ip_vs_conn *cp = (struct ip_vs_conn *)data; 492 493 cp->timeout = 60*HZ; 494 495 /* 496 * hey, I'm using it 497 */ 498 atomic_inc(&cp->refcnt); 499 500 /* 501 * do I control anybody? 502 */ 503 if (atomic_read(&cp->n_control)) 504 goto expire_later; 505 506 /* 507 * unhash it if it is hashed in the conn table 508 */ 509 if (!ip_vs_conn_unhash(cp)) 510 goto expire_later; 511 512 /* 513 * refcnt==1 implies I'm the only one referrer 514 */ 515 if (likely(atomic_read(&cp->refcnt) == 1)) { 516 /* delete the timer if it is activated by other users */ 517 if (timer_pending(&cp->timer)) 518 del_timer(&cp->timer); 519 520 /* does anybody control me? 
*/ 521 if (cp->control) 522 ip_vs_control_del(cp); 523 524 if (unlikely(cp->app != NULL)) 525 ip_vs_unbind_app(cp); 526 ip_vs_unbind_dest(cp); 527 if (cp->flags & IP_VS_CONN_F_NO_CPORT) 528 atomic_dec(&ip_vs_conn_no_cport_cnt); 529 atomic_dec(&ip_vs_conn_count); 530 531 kmem_cache_free(ip_vs_conn_cachep, cp); 532 return; 533 } 534 535 /* hash it back to the table */ 536 ip_vs_conn_hash(cp); 537 538 expire_later: 539 IP_VS_DBG(7, "delayed: refcnt-1=%d conn.n_control=%d\n", 540 atomic_read(&cp->refcnt)-1, 541 atomic_read(&cp->n_control)); 542 543 ip_vs_conn_put(cp); 544} 545 546 547void ip_vs_conn_expire_now(struct ip_vs_conn *cp) 548{ 549 if (del_timer(&cp->timer)) 550 mod_timer(&cp->timer, jiffies); 551} 552 553 554/* 555 * Create a new connection entry and hash it into the ip_vs_conn_tab 556 */ 557struct ip_vs_conn * 558ip_vs_conn_new(int proto, __u32 caddr, __u16 cport, __u32 vaddr, __u16 vport, 559 __u32 daddr, __u16 dport, unsigned flags, 560 struct ip_vs_dest *dest) 561{ 562 struct ip_vs_conn *cp; 563 struct ip_vs_protocol *pp = ip_vs_proto_get(proto); 564 565 cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC); 566 if (cp == NULL) { 567 IP_VS_ERR_RL("ip_vs_conn_new: no memory available.\n"); 568 return NULL; 569 } 570 571 memset(cp, 0, sizeof(*cp)); 572 INIT_LIST_HEAD(&cp->c_list); 573 init_timer(&cp->timer); 574 cp->timer.data = (unsigned long)cp; 575 cp->timer.function = ip_vs_conn_expire; 576 cp->protocol = proto; 577 cp->caddr = caddr; 578 cp->cport = cport; 579 cp->vaddr = vaddr; 580 cp->vport = vport; 581 cp->daddr = daddr; 582 cp->dport = dport; 583 cp->flags = flags; 584 spin_lock_init(&cp->lock); 585 586 /* 587 * Set the entry is referenced by the current thread before hashing 588 * it in the table, so that other thread run ip_vs_random_dropentry 589 * but cannot drop this entry. 
590 */ 591 atomic_set(&cp->refcnt, 1); 592 593 atomic_set(&cp->n_control, 0); 594 atomic_set(&cp->in_pkts, 0); 595 596 atomic_inc(&ip_vs_conn_count); 597 if (flags & IP_VS_CONN_F_NO_CPORT) 598 atomic_inc(&ip_vs_conn_no_cport_cnt); 599 600 /* Bind the connection with a destination server */ 601 ip_vs_bind_dest(cp, dest); 602 603 /* Set its state and timeout */ 604 cp->state = 0; 605 cp->timeout = 3*HZ; 606 607 /* Bind its packet transmitter */ 608 ip_vs_bind_xmit(cp); 609 610 if (unlikely(pp && atomic_read(&pp->appcnt))) 611 ip_vs_bind_app(cp, pp); 612 613 /* Hash it in the ip_vs_conn_tab finally */ 614 ip_vs_conn_hash(cp); 615 616 return cp; 617} 618 619 620/* 621 * /proc/net/ip_vs_conn entries 622 */ 623#ifdef CONFIG_PROC_FS 624 625static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos) 626{ 627 int idx; 628 struct ip_vs_conn *cp; 629 630 for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { 631 ct_read_lock_bh(idx); 632 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { 633 if (pos-- == 0) { 634 seq->private = &ip_vs_conn_tab[idx]; 635 return cp; 636 } 637 } 638 ct_read_unlock_bh(idx); 639 } 640 641 return NULL; 642} 643 644static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos) 645{ 646 seq->private = NULL; 647 return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN; 648} 649 650static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) 651{ 652 struct ip_vs_conn *cp = v; 653 struct list_head *e, *l = seq->private; 654 int idx; 655 656 ++*pos; 657 if (v == SEQ_START_TOKEN) 658 return ip_vs_conn_array(seq, 0); 659 660 /* more on same hash chain? 
*/ 661 if ((e = cp->c_list.next) != l) 662 return list_entry(e, struct ip_vs_conn, c_list); 663 664 idx = l - ip_vs_conn_tab; 665 ct_read_unlock_bh(idx); 666 667 while (++idx < IP_VS_CONN_TAB_SIZE) { 668 ct_read_lock_bh(idx); 669 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { 670 seq->private = &ip_vs_conn_tab[idx]; 671 return cp; 672 } 673 ct_read_unlock_bh(idx); 674 } 675 seq->private = NULL; 676 return NULL; 677} 678 679static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v) 680{ 681 struct list_head *l = seq->private; 682 683 if (l) 684 ct_read_unlock_bh(l - ip_vs_conn_tab); 685} 686 687static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) 688{ 689 690 if (v == SEQ_START_TOKEN) 691 seq_puts(seq, 692 "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires\n"); 693 else { 694 const struct ip_vs_conn *cp = v; 695 696 seq_printf(seq, 697 "%-3s %08X %04X %08X %04X %08X %04X %-11s %7lu\n", 698 ip_vs_proto_name(cp->protocol), 699 ntohl(cp->caddr), ntohs(cp->cport), 700 ntohl(cp->vaddr), ntohs(cp->vport), 701 ntohl(cp->daddr), ntohs(cp->dport), 702 ip_vs_state_name(cp->protocol, cp->state), 703 (cp->timer.expires-jiffies)/HZ); 704 } 705 return 0; 706} 707 708static struct seq_operations ip_vs_conn_seq_ops = { 709 .start = ip_vs_conn_seq_start, 710 .next = ip_vs_conn_seq_next, 711 .stop = ip_vs_conn_seq_stop, 712 .show = ip_vs_conn_seq_show, 713}; 714 715static int ip_vs_conn_open(struct inode *inode, struct file *file) 716{ 717 return seq_open(file, &ip_vs_conn_seq_ops); 718} 719 720static struct file_operations ip_vs_conn_fops = { 721 .owner = THIS_MODULE, 722 .open = ip_vs_conn_open, 723 .read = seq_read, 724 .llseek = seq_lseek, 725 .release = seq_release, 726}; 727#endif 728 729 730/* 731 * Randomly drop connection entries before running out of memory 732 */ 733static inline int todrop_entry(struct ip_vs_conn *cp) 734{ 735 /* 736 * The drop rate array needs tuning for real environments. 
737 * Called from timer bh only => no locking 738 */ 739 static char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; 740 static char todrop_counter[9] = {0}; 741 int i; 742 743 /* if the conn entry hasn't lasted for 60 seconds, don't drop it. 744 This will leave enough time for normal connection to get 745 through. */ 746 if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ)) 747 return 0; 748 749 /* Don't drop the entry if its number of incoming packets is not 750 located in [0, 8] */ 751 i = atomic_read(&cp->in_pkts); 752 if (i > 8 || i < 0) return 0; 753 754 if (!todrop_rate[i]) return 0; 755 if (--todrop_counter[i] > 0) return 0; 756 757 todrop_counter[i] = todrop_rate[i]; 758 return 1; 759} 760 761/* Called from keventd and must protect itself from softirqs */ 762void ip_vs_random_dropentry(void) 763{ 764 int idx; 765 struct ip_vs_conn *cp; 766 767 /* 768 * Randomly scan 1/32 of the whole table every second 769 */ 770 for (idx = 0; idx < (IP_VS_CONN_TAB_SIZE>>5); idx++) { 771 unsigned hash = net_random() & IP_VS_CONN_TAB_MASK; 772 773 /* 774 * Lock is actually needed in this loop. 
775 */ 776 ct_write_lock_bh(hash); 777 778 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { 779 if (!cp->cport && !(cp->flags & IP_VS_CONN_F_NO_CPORT)) 780 /* connection template */ 781 continue; 782 783 if (cp->protocol == IPPROTO_TCP) { 784 switch(cp->state) { 785 case IP_VS_TCP_S_SYN_RECV: 786 case IP_VS_TCP_S_SYNACK: 787 break; 788 789 case IP_VS_TCP_S_ESTABLISHED: 790 if (todrop_entry(cp)) 791 break; 792 continue; 793 794 default: 795 continue; 796 } 797 } else { 798 if (!todrop_entry(cp)) 799 continue; 800 } 801 802 IP_VS_DBG(4, "del connection\n"); 803 ip_vs_conn_expire_now(cp); 804 if (cp->control) { 805 IP_VS_DBG(4, "del conn template\n"); 806 ip_vs_conn_expire_now(cp->control); 807 } 808 } 809 ct_write_unlock_bh(hash); 810 } 811} 812 813 814/* 815 * Flush all the connection entries in the ip_vs_conn_tab 816 */ 817static void ip_vs_conn_flush(void) 818{ 819 int idx; 820 struct ip_vs_conn *cp; 821 822 flush_again: 823 for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) { 824 /* 825 * Lock is actually needed in this loop. 
826 */ 827 ct_write_lock_bh(idx); 828 829 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { 830 831 IP_VS_DBG(4, "del connection\n"); 832 ip_vs_conn_expire_now(cp); 833 if (cp->control) { 834 IP_VS_DBG(4, "del conn template\n"); 835 ip_vs_conn_expire_now(cp->control); 836 } 837 } 838 ct_write_unlock_bh(idx); 839 } 840 841 /* the counter may be not NULL, because maybe some conn entries 842 are run by slow timer handler or unhashed but still referred */ 843 if (atomic_read(&ip_vs_conn_count) != 0) { 844 schedule(); 845 goto flush_again; 846 } 847} 848 849 850int ip_vs_conn_init(void) 851{ 852 int idx; 853 854 /* 855 * Allocate the connection hash table and initialize its list heads 856 */ 857 ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head)); 858 if (!ip_vs_conn_tab) 859 return -ENOMEM; 860 861 /* Allocate ip_vs_conn slab cache */ 862 ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn", 863 sizeof(struct ip_vs_conn), 0, 864 SLAB_HWCACHE_ALIGN, NULL, NULL); 865 if (!ip_vs_conn_cachep) { 866 vfree(ip_vs_conn_tab); 867 return -ENOMEM; 868 } 869 870 IP_VS_INFO("Connection hash table configured " 871 "(size=%d, memory=%ldKbytes)\n", 872 IP_VS_CONN_TAB_SIZE, 873 (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024); 874 IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n", 875 sizeof(struct ip_vs_conn)); 876 877 for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { 878 INIT_LIST_HEAD(&ip_vs_conn_tab[idx]); 879 } 880 881 for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) { 882 rwlock_init(&__ip_vs_conntbl_lock_array[idx].l); 883 } 884 885 proc_net_fops_create("ip_vs_conn", 0, &ip_vs_conn_fops); 886 887 /* calculate the random value for connection hash */ 888 get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd)); 889 890 return 0; 891} 892 893 894void ip_vs_conn_cleanup(void) 895{ 896 /* flush all the connection entries first */ 897 ip_vs_conn_flush(); 898 899 /* Release the empty cache */ 900 kmem_cache_destroy(ip_vs_conn_cachep); 901 
proc_net_remove("ip_vs_conn"); 902 vfree(ip_vs_conn_tab); 903}