Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at v2.6.27 1023 lines 25 kB view raw
1/* 2 * IPVS An implementation of the IP virtual server support for the 3 * LINUX operating system. IPVS is now implemented as a module 4 * over the Netfilter framework. IPVS can be used to build a 5 * high-performance and highly available server based on a 6 * cluster of servers. 7 * 8 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> 9 * Peter Kese <peter.kese@ijs.si> 10 * Julian Anastasov <ja@ssi.bg> 11 * 12 * This program is free software; you can redistribute it and/or 13 * modify it under the terms of the GNU General Public License 14 * as published by the Free Software Foundation; either version 15 * 2 of the License, or (at your option) any later version. 16 * 17 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, 18 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms 19 * and others. Many code here is taken from IP MASQ code of kernel 2.2. 20 * 21 * Changes: 22 * 23 */ 24 25#include <linux/interrupt.h> 26#include <linux/in.h> 27#include <linux/net.h> 28#include <linux/kernel.h> 29#include <linux/module.h> 30#include <linux/vmalloc.h> 31#include <linux/proc_fs.h> /* for proc_net_* */ 32#include <linux/seq_file.h> 33#include <linux/jhash.h> 34#include <linux/random.h> 35 36#include <net/net_namespace.h> 37#include <net/ip_vs.h> 38 39 40/* 41 * Connection hash table: for input and output packets lookups of IPVS 42 */ 43static struct list_head *ip_vs_conn_tab; 44 45/* SLAB cache for IPVS connections */ 46static struct kmem_cache *ip_vs_conn_cachep __read_mostly; 47 48/* counter for current IPVS connections */ 49static atomic_t ip_vs_conn_count = ATOMIC_INIT(0); 50 51/* counter for no client port connections */ 52static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0); 53 54/* random value for IPVS connection hash */ 55static unsigned int ip_vs_conn_rnd; 56 57/* 58 * Fine locking granularity for big connection hash table 59 */ 60#define CT_LOCKARRAY_BITS 4 61#define CT_LOCKARRAY_SIZE (1<<CT_LOCKARRAY_BITS) 62#define CT_LOCKARRAY_MASK (CT_LOCKARRAY_SIZE-1) 63 64struct ip_vs_aligned_lock 65{ 66 rwlock_t l; 67} __attribute__((__aligned__(SMP_CACHE_BYTES))); 68 69/* lock array for conn table */ 70static struct ip_vs_aligned_lock 71__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned; 72 73static inline void ct_read_lock(unsigned key) 74{ 75 read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 76} 77 78static inline void ct_read_unlock(unsigned key) 79{ 80 read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 81} 82 83static inline void ct_write_lock(unsigned key) 84{ 85 write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 86} 87 88static inline void ct_write_unlock(unsigned key) 89{ 90 write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 91} 92 93static inline void ct_read_lock_bh(unsigned key) 94{ 95 read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 96} 97 98static inline void ct_read_unlock_bh(unsigned key) 99{ 100 read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 101} 102 103static inline void ct_write_lock_bh(unsigned key) 104{ 105 write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 106} 107 108static inline void ct_write_unlock_bh(unsigned key) 109{ 110 write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 111} 112 113 114/* 115 * Returns hash value for IPVS connection entry 116 */ 117static unsigned int ip_vs_conn_hashkey(unsigned proto, __be32 addr, __be16 port) 118{ 119 return jhash_3words((__force u32)addr, (__force u32)port, proto, ip_vs_conn_rnd) 120 & IP_VS_CONN_TAB_MASK; 121} 122 123 124/* 125 * Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port. 126 * returns bool success. 127 */ 128static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) 129{ 130 unsigned hash; 131 int ret; 132 133 /* Hash by protocol, client address and port */ 134 hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport); 135 136 ct_write_lock(hash); 137 138 if (!(cp->flags & IP_VS_CONN_F_HASHED)) { 139 list_add(&cp->c_list, &ip_vs_conn_tab[hash]); 140 cp->flags |= IP_VS_CONN_F_HASHED; 141 atomic_inc(&cp->refcnt); 142 ret = 1; 143 } else { 144 IP_VS_ERR("ip_vs_conn_hash(): request for already hashed, " 145 "called from %p\n", __builtin_return_address(0)); 146 ret = 0; 147 } 148 149 ct_write_unlock(hash); 150 151 return ret; 152} 153 154 155/* 156 * UNhashes ip_vs_conn from ip_vs_conn_tab. 157 * returns bool success. 158 */ 159static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) 160{ 161 unsigned hash; 162 int ret; 163 164 /* unhash it and decrease its reference counter */ 165 hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport); 166 167 ct_write_lock(hash); 168 169 if (cp->flags & IP_VS_CONN_F_HASHED) { 170 list_del(&cp->c_list); 171 cp->flags &= ~IP_VS_CONN_F_HASHED; 172 atomic_dec(&cp->refcnt); 173 ret = 1; 174 } else 175 ret = 0; 176 177 ct_write_unlock(hash); 178 179 return ret; 180} 181 182 183/* 184 * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. 185 * Called for pkts coming from OUTside-to-INside. 186 * s_addr, s_port: pkt source address (foreign host) 187 * d_addr, d_port: pkt dest address (load balancer) 188 */ 189static inline struct ip_vs_conn *__ip_vs_conn_in_get 190(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port) 191{ 192 unsigned hash; 193 struct ip_vs_conn *cp; 194 195 hash = ip_vs_conn_hashkey(protocol, s_addr, s_port); 196 197 ct_read_lock(hash); 198 199 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { 200 if (s_addr==cp->caddr && s_port==cp->cport && 201 d_port==cp->vport && d_addr==cp->vaddr && 202 ((!s_port) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && 203 protocol==cp->protocol) { 204 /* HIT */ 205 atomic_inc(&cp->refcnt); 206 ct_read_unlock(hash); 207 return cp; 208 } 209 } 210 211 ct_read_unlock(hash); 212 213 return NULL; 214} 215 216struct ip_vs_conn *ip_vs_conn_in_get 217(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port) 218{ 219 struct ip_vs_conn *cp; 220 221 cp = __ip_vs_conn_in_get(protocol, s_addr, s_port, d_addr, d_port); 222 if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) 223 cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port); 224 225 IP_VS_DBG(9, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", 226 ip_vs_proto_name(protocol), 227 NIPQUAD(s_addr), ntohs(s_port), 228 NIPQUAD(d_addr), ntohs(d_port), 229 cp?"hit":"not hit"); 230 231 return cp; 232} 233 234/* Get reference to connection template */ 235struct ip_vs_conn *ip_vs_ct_in_get 236(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port) 237{ 238 unsigned hash; 239 struct ip_vs_conn *cp; 240 241 hash = ip_vs_conn_hashkey(protocol, s_addr, s_port); 242 243 ct_read_lock(hash); 244 245 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { 246 if (s_addr==cp->caddr && s_port==cp->cport && 247 d_port==cp->vport && d_addr==cp->vaddr && 248 cp->flags & IP_VS_CONN_F_TEMPLATE && 249 protocol==cp->protocol) { 250 /* HIT */ 251 atomic_inc(&cp->refcnt); 252 goto out; 253 } 254 } 255 cp = NULL; 256 257 out: 258 ct_read_unlock(hash); 259 260 IP_VS_DBG(9, "template lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", 261 ip_vs_proto_name(protocol), 262 NIPQUAD(s_addr), ntohs(s_port), 263 NIPQUAD(d_addr), ntohs(d_port), 264 cp?"hit":"not hit"); 265 266 return cp; 267} 268 269/* 270 * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. 271 * Called for pkts coming from inside-to-OUTside. 272 * s_addr, s_port: pkt source address (inside host) 273 * d_addr, d_port: pkt dest address (foreign host) 274 */ 275struct ip_vs_conn *ip_vs_conn_out_get 276(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port) 277{ 278 unsigned hash; 279 struct ip_vs_conn *cp, *ret=NULL; 280 281 /* 282 * Check for "full" addressed entries 283 */ 284 hash = ip_vs_conn_hashkey(protocol, d_addr, d_port); 285 286 ct_read_lock(hash); 287 288 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { 289 if (d_addr == cp->caddr && d_port == cp->cport && 290 s_port == cp->dport && s_addr == cp->daddr && 291 protocol == cp->protocol) { 292 /* HIT */ 293 atomic_inc(&cp->refcnt); 294 ret = cp; 295 break; 296 } 297 } 298 299 ct_read_unlock(hash); 300 301 IP_VS_DBG(9, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", 302 ip_vs_proto_name(protocol), 303 NIPQUAD(s_addr), ntohs(s_port), 304 NIPQUAD(d_addr), ntohs(d_port), 305 ret?"hit":"not hit"); 306 307 return ret; 308} 309 310 311/* 312 * Put back the conn and restart its timer with its timeout 313 */ 314void ip_vs_conn_put(struct ip_vs_conn *cp) 315{ 316 /* reset it expire in its timeout */ 317 mod_timer(&cp->timer, jiffies+cp->timeout); 318 319 __ip_vs_conn_put(cp); 320} 321 322 323/* 324 * Fill a no_client_port connection with a client port number 325 */ 326void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) 327{ 328 if (ip_vs_conn_unhash(cp)) { 329 spin_lock(&cp->lock); 330 if (cp->flags & IP_VS_CONN_F_NO_CPORT) { 331 atomic_dec(&ip_vs_conn_no_cport_cnt); 332 cp->flags &= ~IP_VS_CONN_F_NO_CPORT; 333 cp->cport = cport; 334 } 335 spin_unlock(&cp->lock); 336 337 /* hash on new dport */ 338 ip_vs_conn_hash(cp); 339 } 340} 341 342 343/* 344 * Bind a connection entry with the corresponding packet_xmit. 345 * Called by ip_vs_conn_new. 346 */ 347static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp) 348{ 349 switch (IP_VS_FWD_METHOD(cp)) { 350 case IP_VS_CONN_F_MASQ: 351 cp->packet_xmit = ip_vs_nat_xmit; 352 break; 353 354 case IP_VS_CONN_F_TUNNEL: 355 cp->packet_xmit = ip_vs_tunnel_xmit; 356 break; 357 358 case IP_VS_CONN_F_DROUTE: 359 cp->packet_xmit = ip_vs_dr_xmit; 360 break; 361 362 case IP_VS_CONN_F_LOCALNODE: 363 cp->packet_xmit = ip_vs_null_xmit; 364 break; 365 366 case IP_VS_CONN_F_BYPASS: 367 cp->packet_xmit = ip_vs_bypass_xmit; 368 break; 369 } 370} 371 372 373static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest) 374{ 375 return atomic_read(&dest->activeconns) 376 + atomic_read(&dest->inactconns); 377} 378 379/* 380 * Bind a connection entry with a virtual service destination 381 * Called just after a new connection entry is created. 382 */ 383static inline void 384ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) 385{ 386 /* if dest is NULL, then return directly */ 387 if (!dest) 388 return; 389 390 /* Increase the refcnt counter of the dest */ 391 atomic_inc(&dest->refcnt); 392 393 /* Bind with the destination and its corresponding transmitter */ 394 if ((cp->flags & IP_VS_CONN_F_SYNC) && 395 (!(cp->flags & IP_VS_CONN_F_TEMPLATE))) 396 /* if the connection is not template and is created 397 * by sync, preserve the activity flag. 398 */ 399 cp->flags |= atomic_read(&dest->conn_flags) & 400 (~IP_VS_CONN_F_INACTIVE); 401 else 402 cp->flags |= atomic_read(&dest->conn_flags); 403 cp->dest = dest; 404 405 IP_VS_DBG(7, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " 406 "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " 407 "dest->refcnt:%d\n", 408 ip_vs_proto_name(cp->protocol), 409 NIPQUAD(cp->caddr), ntohs(cp->cport), 410 NIPQUAD(cp->vaddr), ntohs(cp->vport), 411 NIPQUAD(cp->daddr), ntohs(cp->dport), 412 ip_vs_fwd_tag(cp), cp->state, 413 cp->flags, atomic_read(&cp->refcnt), 414 atomic_read(&dest->refcnt)); 415 416 /* Update the connection counters */ 417 if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { 418 /* It is a normal connection, so increase the inactive 419 connection counter because it is in TCP SYNRECV 420 state (inactive) or other protocol inacive state */ 421 if ((cp->flags & IP_VS_CONN_F_SYNC) && 422 (!(cp->flags & IP_VS_CONN_F_INACTIVE))) 423 atomic_inc(&dest->activeconns); 424 else 425 atomic_inc(&dest->inactconns); 426 } else { 427 /* It is a persistent connection/template, so increase 428 the peristent connection counter */ 429 atomic_inc(&dest->persistconns); 430 } 431 432 if (dest->u_threshold != 0 && 433 ip_vs_dest_totalconns(dest) >= dest->u_threshold) 434 dest->flags |= IP_VS_DEST_F_OVERLOAD; 435} 436 437 438/* 439 * Check if there is a destination for the connection, if so 440 * bind the connection to the destination. 441 */ 442struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp) 443{ 444 struct ip_vs_dest *dest; 445 446 if ((cp) && (!cp->dest)) { 447 dest = ip_vs_find_dest(cp->daddr, cp->dport, 448 cp->vaddr, cp->vport, cp->protocol); 449 ip_vs_bind_dest(cp, dest); 450 return dest; 451 } else 452 return NULL; 453} 454 455 456/* 457 * Unbind a connection entry with its VS destination 458 * Called by the ip_vs_conn_expire function. 459 */ 460static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp) 461{ 462 struct ip_vs_dest *dest = cp->dest; 463 464 if (!dest) 465 return; 466 467 IP_VS_DBG(7, "Unbind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " 468 "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " 469 "dest->refcnt:%d\n", 470 ip_vs_proto_name(cp->protocol), 471 NIPQUAD(cp->caddr), ntohs(cp->cport), 472 NIPQUAD(cp->vaddr), ntohs(cp->vport), 473 NIPQUAD(cp->daddr), ntohs(cp->dport), 474 ip_vs_fwd_tag(cp), cp->state, 475 cp->flags, atomic_read(&cp->refcnt), 476 atomic_read(&dest->refcnt)); 477 478 /* Update the connection counters */ 479 if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { 480 /* It is a normal connection, so decrease the inactconns 481 or activeconns counter */ 482 if (cp->flags & IP_VS_CONN_F_INACTIVE) { 483 atomic_dec(&dest->inactconns); 484 } else { 485 atomic_dec(&dest->activeconns); 486 } 487 } else { 488 /* It is a persistent connection/template, so decrease 489 the peristent connection counter */ 490 atomic_dec(&dest->persistconns); 491 } 492 493 if (dest->l_threshold != 0) { 494 if (ip_vs_dest_totalconns(dest) < dest->l_threshold) 495 dest->flags &= ~IP_VS_DEST_F_OVERLOAD; 496 } else if (dest->u_threshold != 0) { 497 if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3) 498 dest->flags &= ~IP_VS_DEST_F_OVERLOAD; 499 } else { 500 if (dest->flags & IP_VS_DEST_F_OVERLOAD) 501 dest->flags &= ~IP_VS_DEST_F_OVERLOAD; 502 } 503 504 /* 505 * Simply decrease the refcnt of the dest, because the 506 * dest will be either in service's destination list 507 * or in the trash. 508 */ 509 atomic_dec(&dest->refcnt); 510} 511 512 513/* 514 * Checking if the destination of a connection template is available. 515 * If available, return 1, otherwise invalidate this connection 516 * template and return 0. 517 */ 518int ip_vs_check_template(struct ip_vs_conn *ct) 519{ 520 struct ip_vs_dest *dest = ct->dest; 521 522 /* 523 * Checking the dest server status. 524 */ 525 if ((dest == NULL) || 526 !(dest->flags & IP_VS_DEST_F_AVAILABLE) || 527 (sysctl_ip_vs_expire_quiescent_template && 528 (atomic_read(&dest->weight) == 0))) { 529 IP_VS_DBG(9, "check_template: dest not available for " 530 "protocol %s s:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " 531 "-> d:%u.%u.%u.%u:%d\n", 532 ip_vs_proto_name(ct->protocol), 533 NIPQUAD(ct->caddr), ntohs(ct->cport), 534 NIPQUAD(ct->vaddr), ntohs(ct->vport), 535 NIPQUAD(ct->daddr), ntohs(ct->dport)); 536 537 /* 538 * Invalidate the connection template 539 */ 540 if (ct->vport != htons(0xffff)) { 541 if (ip_vs_conn_unhash(ct)) { 542 ct->dport = htons(0xffff); 543 ct->vport = htons(0xffff); 544 ct->cport = 0; 545 ip_vs_conn_hash(ct); 546 } 547 } 548 549 /* 550 * Simply decrease the refcnt of the template, 551 * don't restart its timer. 552 */ 553 atomic_dec(&ct->refcnt); 554 return 0; 555 } 556 return 1; 557} 558 559static void ip_vs_conn_expire(unsigned long data) 560{ 561 struct ip_vs_conn *cp = (struct ip_vs_conn *)data; 562 563 cp->timeout = 60*HZ; 564 565 /* 566 * hey, I'm using it 567 */ 568 atomic_inc(&cp->refcnt); 569 570 /* 571 * do I control anybody? 572 */ 573 if (atomic_read(&cp->n_control)) 574 goto expire_later; 575 576 /* 577 * unhash it if it is hashed in the conn table 578 */ 579 if (!ip_vs_conn_unhash(cp)) 580 goto expire_later; 581 582 /* 583 * refcnt==1 implies I'm the only one referrer 584 */ 585 if (likely(atomic_read(&cp->refcnt) == 1)) { 586 /* delete the timer if it is activated by other users */ 587 if (timer_pending(&cp->timer)) 588 del_timer(&cp->timer); 589 590 /* does anybody control me? */ 591 if (cp->control) 592 ip_vs_control_del(cp); 593 594 if (unlikely(cp->app != NULL)) 595 ip_vs_unbind_app(cp); 596 ip_vs_unbind_dest(cp); 597 if (cp->flags & IP_VS_CONN_F_NO_CPORT) 598 atomic_dec(&ip_vs_conn_no_cport_cnt); 599 atomic_dec(&ip_vs_conn_count); 600 601 kmem_cache_free(ip_vs_conn_cachep, cp); 602 return; 603 } 604 605 /* hash it back to the table */ 606 ip_vs_conn_hash(cp); 607 608 expire_later: 609 IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n", 610 atomic_read(&cp->refcnt)-1, 611 atomic_read(&cp->n_control)); 612 613 ip_vs_conn_put(cp); 614} 615 616 617void ip_vs_conn_expire_now(struct ip_vs_conn *cp) 618{ 619 if (del_timer(&cp->timer)) 620 mod_timer(&cp->timer, jiffies); 621} 622 623 624/* 625 * Create a new connection entry and hash it into the ip_vs_conn_tab 626 */ 627struct ip_vs_conn * 628ip_vs_conn_new(int proto, __be32 caddr, __be16 cport, __be32 vaddr, __be16 vport, 629 __be32 daddr, __be16 dport, unsigned flags, 630 struct ip_vs_dest *dest) 631{ 632 struct ip_vs_conn *cp; 633 struct ip_vs_protocol *pp = ip_vs_proto_get(proto); 634 635 cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC); 636 if (cp == NULL) { 637 IP_VS_ERR_RL("ip_vs_conn_new: no memory available.\n"); 638 return NULL; 639 } 640 641 INIT_LIST_HEAD(&cp->c_list); 642 setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp); 643 cp->protocol = proto; 644 cp->caddr = caddr; 645 cp->cport = cport; 646 cp->vaddr = vaddr; 647 cp->vport = vport; 648 cp->daddr = daddr; 649 cp->dport = dport; 650 cp->flags = flags; 651 spin_lock_init(&cp->lock); 652 653 /* 654 * Set the entry is referenced by the current thread before hashing 655 * it in the table, so that other thread run ip_vs_random_dropentry 656 * but cannot drop this entry. 657 */ 658 atomic_set(&cp->refcnt, 1); 659 660 atomic_set(&cp->n_control, 0); 661 atomic_set(&cp->in_pkts, 0); 662 663 atomic_inc(&ip_vs_conn_count); 664 if (flags & IP_VS_CONN_F_NO_CPORT) 665 atomic_inc(&ip_vs_conn_no_cport_cnt); 666 667 /* Bind the connection with a destination server */ 668 ip_vs_bind_dest(cp, dest); 669 670 /* Set its state and timeout */ 671 cp->state = 0; 672 cp->timeout = 3*HZ; 673 674 /* Bind its packet transmitter */ 675 ip_vs_bind_xmit(cp); 676 677 if (unlikely(pp && atomic_read(&pp->appcnt))) 678 ip_vs_bind_app(cp, pp); 679 680 /* Hash it in the ip_vs_conn_tab finally */ 681 ip_vs_conn_hash(cp); 682 683 return cp; 684} 685 686 687/* 688 * /proc/net/ip_vs_conn entries 689 */ 690#ifdef CONFIG_PROC_FS 691 692static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos) 693{ 694 int idx; 695 struct ip_vs_conn *cp; 696 697 for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { 698 ct_read_lock_bh(idx); 699 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { 700 if (pos-- == 0) { 701 seq->private = &ip_vs_conn_tab[idx]; 702 return cp; 703 } 704 } 705 ct_read_unlock_bh(idx); 706 } 707 708 return NULL; 709} 710 711static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos) 712{ 713 seq->private = NULL; 714 return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN; 715} 716 717static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) 718{ 719 struct ip_vs_conn *cp = v; 720 struct list_head *e, *l = seq->private; 721 int idx; 722 723 ++*pos; 724 if (v == SEQ_START_TOKEN) 725 return ip_vs_conn_array(seq, 0); 726 727 /* more on same hash chain? */ 728 if ((e = cp->c_list.next) != l) 729 return list_entry(e, struct ip_vs_conn, c_list); 730 731 idx = l - ip_vs_conn_tab; 732 ct_read_unlock_bh(idx); 733 734 while (++idx < IP_VS_CONN_TAB_SIZE) { 735 ct_read_lock_bh(idx); 736 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { 737 seq->private = &ip_vs_conn_tab[idx]; 738 return cp; 739 } 740 ct_read_unlock_bh(idx); 741 } 742 seq->private = NULL; 743 return NULL; 744} 745 746static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v) 747{ 748 struct list_head *l = seq->private; 749 750 if (l) 751 ct_read_unlock_bh(l - ip_vs_conn_tab); 752} 753 754static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) 755{ 756 757 if (v == SEQ_START_TOKEN) 758 seq_puts(seq, 759 "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires\n"); 760 else { 761 const struct ip_vs_conn *cp = v; 762 763 seq_printf(seq, 764 "%-3s %08X %04X %08X %04X %08X %04X %-11s %7lu\n", 765 ip_vs_proto_name(cp->protocol), 766 ntohl(cp->caddr), ntohs(cp->cport), 767 ntohl(cp->vaddr), ntohs(cp->vport), 768 ntohl(cp->daddr), ntohs(cp->dport), 769 ip_vs_state_name(cp->protocol, cp->state), 770 (cp->timer.expires-jiffies)/HZ); 771 } 772 return 0; 773} 774 775static const struct seq_operations ip_vs_conn_seq_ops = { 776 .start = ip_vs_conn_seq_start, 777 .next = ip_vs_conn_seq_next, 778 .stop = ip_vs_conn_seq_stop, 779 .show = ip_vs_conn_seq_show, 780}; 781 782static int ip_vs_conn_open(struct inode *inode, struct file *file) 783{ 784 return seq_open(file, &ip_vs_conn_seq_ops); 785} 786 787static const struct file_operations ip_vs_conn_fops = { 788 .owner = THIS_MODULE, 789 .open = ip_vs_conn_open, 790 .read = seq_read, 791 .llseek = seq_lseek, 792 .release = seq_release, 793}; 794 795static const char *ip_vs_origin_name(unsigned flags) 796{ 797 if (flags & IP_VS_CONN_F_SYNC) 798 return "SYNC"; 799 else 800 return "LOCAL"; 801} 802 803static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) 804{ 805 806 if (v == SEQ_START_TOKEN) 807 seq_puts(seq, 808 "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Origin Expires\n"); 809 else { 810 const struct ip_vs_conn *cp = v; 811 812 seq_printf(seq, 813 "%-3s %08X %04X %08X %04X %08X %04X %-11s %-6s %7lu\n", 814 ip_vs_proto_name(cp->protocol), 815 ntohl(cp->caddr), ntohs(cp->cport), 816 ntohl(cp->vaddr), ntohs(cp->vport), 817 ntohl(cp->daddr), ntohs(cp->dport), 818 ip_vs_state_name(cp->protocol, cp->state), 819 ip_vs_origin_name(cp->flags), 820 (cp->timer.expires-jiffies)/HZ); 821 } 822 return 0; 823} 824 825static const struct seq_operations ip_vs_conn_sync_seq_ops = { 826 .start = ip_vs_conn_seq_start, 827 .next = ip_vs_conn_seq_next, 828 .stop = ip_vs_conn_seq_stop, 829 .show = ip_vs_conn_sync_seq_show, 830}; 831 832static int ip_vs_conn_sync_open(struct inode *inode, struct file *file) 833{ 834 return seq_open(file, &ip_vs_conn_sync_seq_ops); 835} 836 837static const struct file_operations ip_vs_conn_sync_fops = { 838 .owner = THIS_MODULE, 839 .open = ip_vs_conn_sync_open, 840 .read = seq_read, 841 .llseek = seq_lseek, 842 .release = seq_release, 843}; 844 845#endif 846 847 848/* 849 * Randomly drop connection entries before running out of memory 850 */ 851static inline int todrop_entry(struct ip_vs_conn *cp) 852{ 853 /* 854 * The drop rate array needs tuning for real environments. 855 * Called from timer bh only => no locking 856 */ 857 static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; 858 static char todrop_counter[9] = {0}; 859 int i; 860 861 /* if the conn entry hasn't lasted for 60 seconds, don't drop it. 862 This will leave enough time for normal connection to get 863 through. */ 864 if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ)) 865 return 0; 866 867 /* Don't drop the entry if its number of incoming packets is not 868 located in [0, 8] */ 869 i = atomic_read(&cp->in_pkts); 870 if (i > 8 || i < 0) return 0; 871 872 if (!todrop_rate[i]) return 0; 873 if (--todrop_counter[i] > 0) return 0; 874 875 todrop_counter[i] = todrop_rate[i]; 876 return 1; 877} 878 879/* Called from keventd and must protect itself from softirqs */ 880void ip_vs_random_dropentry(void) 881{ 882 int idx; 883 struct ip_vs_conn *cp; 884 885 /* 886 * Randomly scan 1/32 of the whole table every second 887 */ 888 for (idx = 0; idx < (IP_VS_CONN_TAB_SIZE>>5); idx++) { 889 unsigned hash = net_random() & IP_VS_CONN_TAB_MASK; 890 891 /* 892 * Lock is actually needed in this loop. 893 */ 894 ct_write_lock_bh(hash); 895 896 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { 897 if (cp->flags & IP_VS_CONN_F_TEMPLATE) 898 /* connection template */ 899 continue; 900 901 if (cp->protocol == IPPROTO_TCP) { 902 switch(cp->state) { 903 case IP_VS_TCP_S_SYN_RECV: 904 case IP_VS_TCP_S_SYNACK: 905 break; 906 907 case IP_VS_TCP_S_ESTABLISHED: 908 if (todrop_entry(cp)) 909 break; 910 continue; 911 912 default: 913 continue; 914 } 915 } else { 916 if (!todrop_entry(cp)) 917 continue; 918 } 919 920 IP_VS_DBG(4, "del connection\n"); 921 ip_vs_conn_expire_now(cp); 922 if (cp->control) { 923 IP_VS_DBG(4, "del conn template\n"); 924 ip_vs_conn_expire_now(cp->control); 925 } 926 } 927 ct_write_unlock_bh(hash); 928 } 929} 930 931 932/* 933 * Flush all the connection entries in the ip_vs_conn_tab 934 */ 935static void ip_vs_conn_flush(void) 936{ 937 int idx; 938 struct ip_vs_conn *cp; 939 940 flush_again: 941 for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) { 942 /* 943 * Lock is actually needed in this loop. 944 */ 945 ct_write_lock_bh(idx); 946 947 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { 948 949 IP_VS_DBG(4, "del connection\n"); 950 ip_vs_conn_expire_now(cp); 951 if (cp->control) { 952 IP_VS_DBG(4, "del conn template\n"); 953 ip_vs_conn_expire_now(cp->control); 954 } 955 } 956 ct_write_unlock_bh(idx); 957 } 958 959 /* the counter may be not NULL, because maybe some conn entries 960 are run by slow timer handler or unhashed but still referred */ 961 if (atomic_read(&ip_vs_conn_count) != 0) { 962 schedule(); 963 goto flush_again; 964 } 965} 966 967 968int __init ip_vs_conn_init(void) 969{ 970 int idx; 971 972 /* 973 * Allocate the connection hash table and initialize its list heads 974 */ 975 ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head)); 976 if (!ip_vs_conn_tab) 977 return -ENOMEM; 978 979 /* Allocate ip_vs_conn slab cache */ 980 ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn", 981 sizeof(struct ip_vs_conn), 0, 982 SLAB_HWCACHE_ALIGN, NULL); 983 if (!ip_vs_conn_cachep) { 984 vfree(ip_vs_conn_tab); 985 return -ENOMEM; 986 } 987 988 IP_VS_INFO("Connection hash table configured " 989 "(size=%d, memory=%ldKbytes)\n", 990 IP_VS_CONN_TAB_SIZE, 991 (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024); 992 IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n", 993 sizeof(struct ip_vs_conn)); 994 995 for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { 996 INIT_LIST_HEAD(&ip_vs_conn_tab[idx]); 997 } 998 999 for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) { 1000 rwlock_init(&__ip_vs_conntbl_lock_array[idx].l); 1001 } 1002 1003 proc_net_fops_create(&init_net, "ip_vs_conn", 0, &ip_vs_conn_fops); 1004 proc_net_fops_create(&init_net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops); 1005 1006 /* calculate the random value for connection hash */ 1007 get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd)); 1008 1009 return 0; 1010} 1011 1012 1013void ip_vs_conn_cleanup(void) 1014{ 1015 /* flush all the connection entries first */ 1016 ip_vs_conn_flush(); 1017 1018 /* Release the empty cache */ 1019 kmem_cache_destroy(ip_vs_conn_cachep); 1020 proc_net_remove(&init_net, "ip_vs_conn"); 1021 proc_net_remove(&init_net, "ip_vs_conn_sync"); 1022 vfree(ip_vs_conn_tab); 1023}