Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at v2.6.19 944 lines 23 kB view raw
1/* 2 * IPVS An implementation of the IP virtual server support for the 3 * LINUX operating system. IPVS is now implemented as a module 4 * over the Netfilter framework. IPVS can be used to build a 5 * high-performance and highly available server based on a 6 * cluster of servers. 7 * 8 * Version: $Id: ip_vs_conn.c,v 1.31 2003/04/18 09:03:16 wensong Exp $ 9 * 10 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> 11 * Peter Kese <peter.kese@ijs.si> 12 * Julian Anastasov <ja@ssi.bg> 13 * 14 * This program is free software; you can redistribute it and/or 15 * modify it under the terms of the GNU General Public License 16 * as published by the Free Software Foundation; either version 17 * 2 of the License, or (at your option) any later version. 18 * 19 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, 20 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms 21 * and others. Many code here is taken from IP MASQ code of kernel 2.2. 22 * 23 * Changes: 24 * 25 */ 26 27#include <linux/interrupt.h> 28#include <linux/in.h> 29#include <linux/net.h> 30#include <linux/kernel.h> 31#include <linux/module.h> 32#include <linux/vmalloc.h> 33#include <linux/proc_fs.h> /* for proc_net_* */ 34#include <linux/seq_file.h> 35#include <linux/jhash.h> 36#include <linux/random.h> 37 38#include <net/ip_vs.h> 39 40 41/* 42 * Connection hash table: for input and output packets lookups of IPVS 43 */ 44static struct list_head *ip_vs_conn_tab; 45 46/* SLAB cache for IPVS connections */ 47static kmem_cache_t *ip_vs_conn_cachep __read_mostly; 48 49/* counter for current IPVS connections */ 50static atomic_t ip_vs_conn_count = ATOMIC_INIT(0); 51 52/* counter for no client port connections */ 53static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0); 54 55/* random value for IPVS connection hash */ 56static unsigned int ip_vs_conn_rnd; 57 58/* 59 * Fine locking granularity for big connection hash table 60 */ 61#define CT_LOCKARRAY_BITS 4 62#define CT_LOCKARRAY_SIZE (1<<CT_LOCKARRAY_BITS) 63#define CT_LOCKARRAY_MASK (CT_LOCKARRAY_SIZE-1) 64 65struct ip_vs_aligned_lock 66{ 67 rwlock_t l; 68} __attribute__((__aligned__(SMP_CACHE_BYTES))); 69 70/* lock array for conn table */ 71static struct ip_vs_aligned_lock 72__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned; 73 74static inline void ct_read_lock(unsigned key) 75{ 76 read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 77} 78 79static inline void ct_read_unlock(unsigned key) 80{ 81 read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 82} 83 84static inline void ct_write_lock(unsigned key) 85{ 86 write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 87} 88 89static inline void ct_write_unlock(unsigned key) 90{ 91 write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 92} 93 94static inline void ct_read_lock_bh(unsigned key) 95{ 96 read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 97} 98 99static inline void ct_read_unlock_bh(unsigned key) 100{ 101 read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 102} 103 104static inline void ct_write_lock_bh(unsigned key) 105{ 106 write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 107} 108 109static inline void ct_write_unlock_bh(unsigned key) 110{ 111 write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 112} 113 114 115/* 116 * Returns hash value for IPVS connection entry 117 */ 118static unsigned int ip_vs_conn_hashkey(unsigned proto, __be32 addr, __be16 port) 119{ 120 return jhash_3words((__force u32)addr, (__force u32)port, proto, ip_vs_conn_rnd) 121 & IP_VS_CONN_TAB_MASK; 122} 123 124 125/* 126 * Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port. 127 * returns bool success. 128 */ 129static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) 130{ 131 unsigned hash; 132 int ret; 133 134 /* Hash by protocol, client address and port */ 135 hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport); 136 137 ct_write_lock(hash); 138 139 if (!(cp->flags & IP_VS_CONN_F_HASHED)) { 140 list_add(&cp->c_list, &ip_vs_conn_tab[hash]); 141 cp->flags |= IP_VS_CONN_F_HASHED; 142 atomic_inc(&cp->refcnt); 143 ret = 1; 144 } else { 145 IP_VS_ERR("ip_vs_conn_hash(): request for already hashed, " 146 "called from %p\n", __builtin_return_address(0)); 147 ret = 0; 148 } 149 150 ct_write_unlock(hash); 151 152 return ret; 153} 154 155 156/* 157 * UNhashes ip_vs_conn from ip_vs_conn_tab. 158 * returns bool success. 159 */ 160static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) 161{ 162 unsigned hash; 163 int ret; 164 165 /* unhash it and decrease its reference counter */ 166 hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport); 167 168 ct_write_lock(hash); 169 170 if (cp->flags & IP_VS_CONN_F_HASHED) { 171 list_del(&cp->c_list); 172 cp->flags &= ~IP_VS_CONN_F_HASHED; 173 atomic_dec(&cp->refcnt); 174 ret = 1; 175 } else 176 ret = 0; 177 178 ct_write_unlock(hash); 179 180 return ret; 181} 182 183 184/* 185 * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. 186 * Called for pkts coming from OUTside-to-INside. 187 * s_addr, s_port: pkt source address (foreign host) 188 * d_addr, d_port: pkt dest address (load balancer) 189 */ 190static inline struct ip_vs_conn *__ip_vs_conn_in_get 191(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port) 192{ 193 unsigned hash; 194 struct ip_vs_conn *cp; 195 196 hash = ip_vs_conn_hashkey(protocol, s_addr, s_port); 197 198 ct_read_lock(hash); 199 200 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { 201 if (s_addr==cp->caddr && s_port==cp->cport && 202 d_port==cp->vport && d_addr==cp->vaddr && 203 ((!s_port) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && 204 protocol==cp->protocol) { 205 /* HIT */ 206 atomic_inc(&cp->refcnt); 207 ct_read_unlock(hash); 208 return cp; 209 } 210 } 211 212 ct_read_unlock(hash); 213 214 return NULL; 215} 216 217struct ip_vs_conn *ip_vs_conn_in_get 218(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port) 219{ 220 struct ip_vs_conn *cp; 221 222 cp = __ip_vs_conn_in_get(protocol, s_addr, s_port, d_addr, d_port); 223 if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) 224 cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port); 225 226 IP_VS_DBG(9, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", 227 ip_vs_proto_name(protocol), 228 NIPQUAD(s_addr), ntohs(s_port), 229 NIPQUAD(d_addr), ntohs(d_port), 230 cp?"hit":"not hit"); 231 232 return cp; 233} 234 235/* Get reference to connection template */ 236struct ip_vs_conn *ip_vs_ct_in_get 237(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port) 238{ 239 unsigned hash; 240 struct ip_vs_conn *cp; 241 242 hash = ip_vs_conn_hashkey(protocol, s_addr, s_port); 243 244 ct_read_lock(hash); 245 246 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { 247 if (s_addr==cp->caddr && s_port==cp->cport && 248 d_port==cp->vport && d_addr==cp->vaddr && 249 cp->flags & IP_VS_CONN_F_TEMPLATE && 250 protocol==cp->protocol) { 251 /* HIT */ 252 atomic_inc(&cp->refcnt); 253 goto out; 254 } 255 } 256 cp = NULL; 257 258 out: 259 ct_read_unlock(hash); 260 261 IP_VS_DBG(9, "template lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", 262 ip_vs_proto_name(protocol), 263 NIPQUAD(s_addr), ntohs(s_port), 264 NIPQUAD(d_addr), ntohs(d_port), 265 cp?"hit":"not hit"); 266 267 return cp; 268} 269 270/* 271 * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. 272 * Called for pkts coming from inside-to-OUTside. 273 * s_addr, s_port: pkt source address (inside host) 274 * d_addr, d_port: pkt dest address (foreign host) 275 */ 276struct ip_vs_conn *ip_vs_conn_out_get 277(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port) 278{ 279 unsigned hash; 280 struct ip_vs_conn *cp, *ret=NULL; 281 282 /* 283 * Check for "full" addressed entries 284 */ 285 hash = ip_vs_conn_hashkey(protocol, d_addr, d_port); 286 287 ct_read_lock(hash); 288 289 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { 290 if (d_addr == cp->caddr && d_port == cp->cport && 291 s_port == cp->dport && s_addr == cp->daddr && 292 protocol == cp->protocol) { 293 /* HIT */ 294 atomic_inc(&cp->refcnt); 295 ret = cp; 296 break; 297 } 298 } 299 300 ct_read_unlock(hash); 301 302 IP_VS_DBG(9, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", 303 ip_vs_proto_name(protocol), 304 NIPQUAD(s_addr), ntohs(s_port), 305 NIPQUAD(d_addr), ntohs(d_port), 306 ret?"hit":"not hit"); 307 308 return ret; 309} 310 311 312/* 313 * Put back the conn and restart its timer with its timeout 314 */ 315void ip_vs_conn_put(struct ip_vs_conn *cp) 316{ 317 /* reset it expire in its timeout */ 318 mod_timer(&cp->timer, jiffies+cp->timeout); 319 320 __ip_vs_conn_put(cp); 321} 322 323 324/* 325 * Fill a no_client_port connection with a client port number 326 */ 327void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) 328{ 329 if (ip_vs_conn_unhash(cp)) { 330 spin_lock(&cp->lock); 331 if (cp->flags & IP_VS_CONN_F_NO_CPORT) { 332 atomic_dec(&ip_vs_conn_no_cport_cnt); 333 cp->flags &= ~IP_VS_CONN_F_NO_CPORT; 334 cp->cport = cport; 335 } 336 spin_unlock(&cp->lock); 337 338 /* hash on new dport */ 339 ip_vs_conn_hash(cp); 340 } 341} 342 343 344/* 345 * Bind a connection entry with the corresponding packet_xmit. 346 * Called by ip_vs_conn_new. 347 */ 348static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp) 349{ 350 switch (IP_VS_FWD_METHOD(cp)) { 351 case IP_VS_CONN_F_MASQ: 352 cp->packet_xmit = ip_vs_nat_xmit; 353 break; 354 355 case IP_VS_CONN_F_TUNNEL: 356 cp->packet_xmit = ip_vs_tunnel_xmit; 357 break; 358 359 case IP_VS_CONN_F_DROUTE: 360 cp->packet_xmit = ip_vs_dr_xmit; 361 break; 362 363 case IP_VS_CONN_F_LOCALNODE: 364 cp->packet_xmit = ip_vs_null_xmit; 365 break; 366 367 case IP_VS_CONN_F_BYPASS: 368 cp->packet_xmit = ip_vs_bypass_xmit; 369 break; 370 } 371} 372 373 374static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest) 375{ 376 return atomic_read(&dest->activeconns) 377 + atomic_read(&dest->inactconns); 378} 379 380/* 381 * Bind a connection entry with a virtual service destination 382 * Called just after a new connection entry is created. 383 */ 384static inline void 385ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) 386{ 387 /* if dest is NULL, then return directly */ 388 if (!dest) 389 return; 390 391 /* Increase the refcnt counter of the dest */ 392 atomic_inc(&dest->refcnt); 393 394 /* Bind with the destination and its corresponding transmitter */ 395 cp->flags |= atomic_read(&dest->conn_flags); 396 cp->dest = dest; 397 398 IP_VS_DBG(7, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " 399 "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " 400 "dest->refcnt:%d\n", 401 ip_vs_proto_name(cp->protocol), 402 NIPQUAD(cp->caddr), ntohs(cp->cport), 403 NIPQUAD(cp->vaddr), ntohs(cp->vport), 404 NIPQUAD(cp->daddr), ntohs(cp->dport), 405 ip_vs_fwd_tag(cp), cp->state, 406 cp->flags, atomic_read(&cp->refcnt), 407 atomic_read(&dest->refcnt)); 408 409 /* Update the connection counters */ 410 if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { 411 /* It is a normal connection, so increase the inactive 412 connection counter because it is in TCP SYNRECV 413 state (inactive) or other protocol inacive state */ 414 atomic_inc(&dest->inactconns); 415 } else { 416 /* It is a persistent connection/template, so increase 417 the peristent connection counter */ 418 atomic_inc(&dest->persistconns); 419 } 420 421 if (dest->u_threshold != 0 && 422 ip_vs_dest_totalconns(dest) >= dest->u_threshold) 423 dest->flags |= IP_VS_DEST_F_OVERLOAD; 424} 425 426 427/* 428 * Unbind a connection entry with its VS destination 429 * Called by the ip_vs_conn_expire function. 430 */ 431static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp) 432{ 433 struct ip_vs_dest *dest = cp->dest; 434 435 if (!dest) 436 return; 437 438 IP_VS_DBG(7, "Unbind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " 439 "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " 440 "dest->refcnt:%d\n", 441 ip_vs_proto_name(cp->protocol), 442 NIPQUAD(cp->caddr), ntohs(cp->cport), 443 NIPQUAD(cp->vaddr), ntohs(cp->vport), 444 NIPQUAD(cp->daddr), ntohs(cp->dport), 445 ip_vs_fwd_tag(cp), cp->state, 446 cp->flags, atomic_read(&cp->refcnt), 447 atomic_read(&dest->refcnt)); 448 449 /* Update the connection counters */ 450 if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { 451 /* It is a normal connection, so decrease the inactconns 452 or activeconns counter */ 453 if (cp->flags & IP_VS_CONN_F_INACTIVE) { 454 atomic_dec(&dest->inactconns); 455 } else { 456 atomic_dec(&dest->activeconns); 457 } 458 } else { 459 /* It is a persistent connection/template, so decrease 460 the peristent connection counter */ 461 atomic_dec(&dest->persistconns); 462 } 463 464 if (dest->l_threshold != 0) { 465 if (ip_vs_dest_totalconns(dest) < dest->l_threshold) 466 dest->flags &= ~IP_VS_DEST_F_OVERLOAD; 467 } else if (dest->u_threshold != 0) { 468 if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3) 469 dest->flags &= ~IP_VS_DEST_F_OVERLOAD; 470 } else { 471 if (dest->flags & IP_VS_DEST_F_OVERLOAD) 472 dest->flags &= ~IP_VS_DEST_F_OVERLOAD; 473 } 474 475 /* 476 * Simply decrease the refcnt of the dest, because the 477 * dest will be either in service's destination list 478 * or in the trash. 479 */ 480 atomic_dec(&dest->refcnt); 481} 482 483 484/* 485 * Checking if the destination of a connection template is available. 486 * If available, return 1, otherwise invalidate this connection 487 * template and return 0. 488 */ 489int ip_vs_check_template(struct ip_vs_conn *ct) 490{ 491 struct ip_vs_dest *dest = ct->dest; 492 493 /* 494 * Checking the dest server status. 495 */ 496 if ((dest == NULL) || 497 !(dest->flags & IP_VS_DEST_F_AVAILABLE) || 498 (sysctl_ip_vs_expire_quiescent_template && 499 (atomic_read(&dest->weight) == 0))) { 500 IP_VS_DBG(9, "check_template: dest not available for " 501 "protocol %s s:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " 502 "-> d:%u.%u.%u.%u:%d\n", 503 ip_vs_proto_name(ct->protocol), 504 NIPQUAD(ct->caddr), ntohs(ct->cport), 505 NIPQUAD(ct->vaddr), ntohs(ct->vport), 506 NIPQUAD(ct->daddr), ntohs(ct->dport)); 507 508 /* 509 * Invalidate the connection template 510 */ 511 if (ct->vport != htons(0xffff)) { 512 if (ip_vs_conn_unhash(ct)) { 513 ct->dport = htons(0xffff); 514 ct->vport = htons(0xffff); 515 ct->cport = 0; 516 ip_vs_conn_hash(ct); 517 } 518 } 519 520 /* 521 * Simply decrease the refcnt of the template, 522 * don't restart its timer. 523 */ 524 atomic_dec(&ct->refcnt); 525 return 0; 526 } 527 return 1; 528} 529 530static void ip_vs_conn_expire(unsigned long data) 531{ 532 struct ip_vs_conn *cp = (struct ip_vs_conn *)data; 533 534 cp->timeout = 60*HZ; 535 536 /* 537 * hey, I'm using it 538 */ 539 atomic_inc(&cp->refcnt); 540 541 /* 542 * do I control anybody? 543 */ 544 if (atomic_read(&cp->n_control)) 545 goto expire_later; 546 547 /* 548 * unhash it if it is hashed in the conn table 549 */ 550 if (!ip_vs_conn_unhash(cp)) 551 goto expire_later; 552 553 /* 554 * refcnt==1 implies I'm the only one referrer 555 */ 556 if (likely(atomic_read(&cp->refcnt) == 1)) { 557 /* delete the timer if it is activated by other users */ 558 if (timer_pending(&cp->timer)) 559 del_timer(&cp->timer); 560 561 /* does anybody control me? */ 562 if (cp->control) 563 ip_vs_control_del(cp); 564 565 if (unlikely(cp->app != NULL)) 566 ip_vs_unbind_app(cp); 567 ip_vs_unbind_dest(cp); 568 if (cp->flags & IP_VS_CONN_F_NO_CPORT) 569 atomic_dec(&ip_vs_conn_no_cport_cnt); 570 atomic_dec(&ip_vs_conn_count); 571 572 kmem_cache_free(ip_vs_conn_cachep, cp); 573 return; 574 } 575 576 /* hash it back to the table */ 577 ip_vs_conn_hash(cp); 578 579 expire_later: 580 IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n", 581 atomic_read(&cp->refcnt)-1, 582 atomic_read(&cp->n_control)); 583 584 ip_vs_conn_put(cp); 585} 586 587 588void ip_vs_conn_expire_now(struct ip_vs_conn *cp) 589{ 590 if (del_timer(&cp->timer)) 591 mod_timer(&cp->timer, jiffies); 592} 593 594 595/* 596 * Create a new connection entry and hash it into the ip_vs_conn_tab 597 */ 598struct ip_vs_conn * 599ip_vs_conn_new(int proto, __be32 caddr, __be16 cport, __be32 vaddr, __be16 vport, 600 __be32 daddr, __be16 dport, unsigned flags, 601 struct ip_vs_dest *dest) 602{ 603 struct ip_vs_conn *cp; 604 struct ip_vs_protocol *pp = ip_vs_proto_get(proto); 605 606 cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC); 607 if (cp == NULL) { 608 IP_VS_ERR_RL("ip_vs_conn_new: no memory available.\n"); 609 return NULL; 610 } 611 612 memset(cp, 0, sizeof(*cp)); 613 INIT_LIST_HEAD(&cp->c_list); 614 init_timer(&cp->timer); 615 cp->timer.data = (unsigned long)cp; 616 cp->timer.function = ip_vs_conn_expire; 617 cp->protocol = proto; 618 cp->caddr = caddr; 619 cp->cport = cport; 620 cp->vaddr = vaddr; 621 cp->vport = vport; 622 cp->daddr = daddr; 623 cp->dport = dport; 624 cp->flags = flags; 625 spin_lock_init(&cp->lock); 626 627 /* 628 * Set the entry is referenced by the current thread before hashing 629 * it in the table, so that other thread run ip_vs_random_dropentry 630 * but cannot drop this entry. 631 */ 632 atomic_set(&cp->refcnt, 1); 633 634 atomic_set(&cp->n_control, 0); 635 atomic_set(&cp->in_pkts, 0); 636 637 atomic_inc(&ip_vs_conn_count); 638 if (flags & IP_VS_CONN_F_NO_CPORT) 639 atomic_inc(&ip_vs_conn_no_cport_cnt); 640 641 /* Bind the connection with a destination server */ 642 ip_vs_bind_dest(cp, dest); 643 644 /* Set its state and timeout */ 645 cp->state = 0; 646 cp->timeout = 3*HZ; 647 648 /* Bind its packet transmitter */ 649 ip_vs_bind_xmit(cp); 650 651 if (unlikely(pp && atomic_read(&pp->appcnt))) 652 ip_vs_bind_app(cp, pp); 653 654 /* Hash it in the ip_vs_conn_tab finally */ 655 ip_vs_conn_hash(cp); 656 657 return cp; 658} 659 660 661/* 662 * /proc/net/ip_vs_conn entries 663 */ 664#ifdef CONFIG_PROC_FS 665 666static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos) 667{ 668 int idx; 669 struct ip_vs_conn *cp; 670 671 for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { 672 ct_read_lock_bh(idx); 673 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { 674 if (pos-- == 0) { 675 seq->private = &ip_vs_conn_tab[idx]; 676 return cp; 677 } 678 } 679 ct_read_unlock_bh(idx); 680 } 681 682 return NULL; 683} 684 685static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos) 686{ 687 seq->private = NULL; 688 return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN; 689} 690 691static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) 692{ 693 struct ip_vs_conn *cp = v; 694 struct list_head *e, *l = seq->private; 695 int idx; 696 697 ++*pos; 698 if (v == SEQ_START_TOKEN) 699 return ip_vs_conn_array(seq, 0); 700 701 /* more on same hash chain? */ 702 if ((e = cp->c_list.next) != l) 703 return list_entry(e, struct ip_vs_conn, c_list); 704 705 idx = l - ip_vs_conn_tab; 706 ct_read_unlock_bh(idx); 707 708 while (++idx < IP_VS_CONN_TAB_SIZE) { 709 ct_read_lock_bh(idx); 710 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { 711 seq->private = &ip_vs_conn_tab[idx]; 712 return cp; 713 } 714 ct_read_unlock_bh(idx); 715 } 716 seq->private = NULL; 717 return NULL; 718} 719 720static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v) 721{ 722 struct list_head *l = seq->private; 723 724 if (l) 725 ct_read_unlock_bh(l - ip_vs_conn_tab); 726} 727 728static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) 729{ 730 731 if (v == SEQ_START_TOKEN) 732 seq_puts(seq, 733 "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires\n"); 734 else { 735 const struct ip_vs_conn *cp = v; 736 737 seq_printf(seq, 738 "%-3s %08X %04X %08X %04X %08X %04X %-11s %7lu\n", 739 ip_vs_proto_name(cp->protocol), 740 ntohl(cp->caddr), ntohs(cp->cport), 741 ntohl(cp->vaddr), ntohs(cp->vport), 742 ntohl(cp->daddr), ntohs(cp->dport), 743 ip_vs_state_name(cp->protocol, cp->state), 744 (cp->timer.expires-jiffies)/HZ); 745 } 746 return 0; 747} 748 749static struct seq_operations ip_vs_conn_seq_ops = { 750 .start = ip_vs_conn_seq_start, 751 .next = ip_vs_conn_seq_next, 752 .stop = ip_vs_conn_seq_stop, 753 .show = ip_vs_conn_seq_show, 754}; 755 756static int ip_vs_conn_open(struct inode *inode, struct file *file) 757{ 758 return seq_open(file, &ip_vs_conn_seq_ops); 759} 760 761static struct file_operations ip_vs_conn_fops = { 762 .owner = THIS_MODULE, 763 .open = ip_vs_conn_open, 764 .read = seq_read, 765 .llseek = seq_lseek, 766 .release = seq_release, 767}; 768#endif 769 770 771/* 772 * Randomly drop connection entries before running out of memory 773 */ 774static inline int todrop_entry(struct ip_vs_conn *cp) 775{ 776 /* 777 * The drop rate array needs tuning for real environments. 778 * Called from timer bh only => no locking 779 */ 780 static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; 781 static char todrop_counter[9] = {0}; 782 int i; 783 784 /* if the conn entry hasn't lasted for 60 seconds, don't drop it. 785 This will leave enough time for normal connection to get 786 through. */ 787 if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ)) 788 return 0; 789 790 /* Don't drop the entry if its number of incoming packets is not 791 located in [0, 8] */ 792 i = atomic_read(&cp->in_pkts); 793 if (i > 8 || i < 0) return 0; 794 795 if (!todrop_rate[i]) return 0; 796 if (--todrop_counter[i] > 0) return 0; 797 798 todrop_counter[i] = todrop_rate[i]; 799 return 1; 800} 801 802/* Called from keventd and must protect itself from softirqs */ 803void ip_vs_random_dropentry(void) 804{ 805 int idx; 806 struct ip_vs_conn *cp; 807 808 /* 809 * Randomly scan 1/32 of the whole table every second 810 */ 811 for (idx = 0; idx < (IP_VS_CONN_TAB_SIZE>>5); idx++) { 812 unsigned hash = net_random() & IP_VS_CONN_TAB_MASK; 813 814 /* 815 * Lock is actually needed in this loop. 816 */ 817 ct_write_lock_bh(hash); 818 819 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { 820 if (cp->flags & IP_VS_CONN_F_TEMPLATE) 821 /* connection template */ 822 continue; 823 824 if (cp->protocol == IPPROTO_TCP) { 825 switch(cp->state) { 826 case IP_VS_TCP_S_SYN_RECV: 827 case IP_VS_TCP_S_SYNACK: 828 break; 829 830 case IP_VS_TCP_S_ESTABLISHED: 831 if (todrop_entry(cp)) 832 break; 833 continue; 834 835 default: 836 continue; 837 } 838 } else { 839 if (!todrop_entry(cp)) 840 continue; 841 } 842 843 IP_VS_DBG(4, "del connection\n"); 844 ip_vs_conn_expire_now(cp); 845 if (cp->control) { 846 IP_VS_DBG(4, "del conn template\n"); 847 ip_vs_conn_expire_now(cp->control); 848 } 849 } 850 ct_write_unlock_bh(hash); 851 } 852} 853 854 855/* 856 * Flush all the connection entries in the ip_vs_conn_tab 857 */ 858static void ip_vs_conn_flush(void) 859{ 860 int idx; 861 struct ip_vs_conn *cp; 862 863 flush_again: 864 for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) { 865 /* 866 * Lock is actually needed in this loop. 867 */ 868 ct_write_lock_bh(idx); 869 870 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { 871 872 IP_VS_DBG(4, "del connection\n"); 873 ip_vs_conn_expire_now(cp); 874 if (cp->control) { 875 IP_VS_DBG(4, "del conn template\n"); 876 ip_vs_conn_expire_now(cp->control); 877 } 878 } 879 ct_write_unlock_bh(idx); 880 } 881 882 /* the counter may be not NULL, because maybe some conn entries 883 are run by slow timer handler or unhashed but still referred */ 884 if (atomic_read(&ip_vs_conn_count) != 0) { 885 schedule(); 886 goto flush_again; 887 } 888} 889 890 891int ip_vs_conn_init(void) 892{ 893 int idx; 894 895 /* 896 * Allocate the connection hash table and initialize its list heads 897 */ 898 ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head)); 899 if (!ip_vs_conn_tab) 900 return -ENOMEM; 901 902 /* Allocate ip_vs_conn slab cache */ 903 ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn", 904 sizeof(struct ip_vs_conn), 0, 905 SLAB_HWCACHE_ALIGN, NULL, NULL); 906 if (!ip_vs_conn_cachep) { 907 vfree(ip_vs_conn_tab); 908 return -ENOMEM; 909 } 910 911 IP_VS_INFO("Connection hash table configured " 912 "(size=%d, memory=%ldKbytes)\n", 913 IP_VS_CONN_TAB_SIZE, 914 (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024); 915 IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n", 916 sizeof(struct ip_vs_conn)); 917 918 for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { 919 INIT_LIST_HEAD(&ip_vs_conn_tab[idx]); 920 } 921 922 for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) { 923 rwlock_init(&__ip_vs_conntbl_lock_array[idx].l); 924 } 925 926 proc_net_fops_create("ip_vs_conn", 0, &ip_vs_conn_fops); 927 928 /* calculate the random value for connection hash */ 929 get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd)); 930 931 return 0; 932} 933 934 935void ip_vs_conn_cleanup(void) 936{ 937 /* flush all the connection entries first */ 938 ip_vs_conn_flush(); 939 940 /* Release the empty cache */ 941 kmem_cache_destroy(ip_vs_conn_cachep); 942 proc_net_remove("ip_vs_conn"); 943 vfree(ip_vs_conn_tab); 944}