Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at v2.6.19 599 lines 15 kB view raw
1/* 2 * IPVS: Locality-Based Least-Connection scheduling module 3 * 4 * Version: $Id: ip_vs_lblc.c,v 1.10 2002/09/15 08:14:08 wensong Exp $ 5 * 6 * Authors: Wensong Zhang <wensong@gnuchina.org> 7 * 8 * This program is free software; you can redistribute it and/or 9 * modify it under the terms of the GNU General Public License 10 * as published by the Free Software Foundation; either version 11 * 2 of the License, or (at your option) any later version. 12 * 13 * Changes: 14 * Martin Hamilton : fixed the terrible locking bugs 15 * *lock(tbl->lock) ==> *lock(&tbl->lock) 16 * Wensong Zhang : fixed the uninitilized tbl->lock bug 17 * Wensong Zhang : added doing full expiration check to 18 * collect stale entries of 24+ hours when 19 * no partial expire check in a half hour 20 * Julian Anastasov : replaced del_timer call with del_timer_sync 21 * to avoid the possible race between timer 22 * handler and del_timer thread in SMP 23 * 24 */ 25 26/* 27 * The lblc algorithm is as follows (pseudo code): 28 * 29 * if cachenode[dest_ip] is null then 30 * n, cachenode[dest_ip] <- {weighted least-conn node}; 31 * else 32 * n <- cachenode[dest_ip]; 33 * if (n is dead) OR 34 * (n.conns>n.weight AND 35 * there is a node m with m.conns<m.weight/2) then 36 * n, cachenode[dest_ip] <- {weighted least-conn node}; 37 * 38 * return n; 39 * 40 * Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing 41 * me to write this module. 42 */ 43 44#include <linux/ip.h> 45#include <linux/module.h> 46#include <linux/kernel.h> 47#include <linux/skbuff.h> 48 49/* for sysctl */ 50#include <linux/fs.h> 51#include <linux/sysctl.h> 52 53#include <net/ip_vs.h> 54 55 56/* 57 * It is for garbage collection of stale IPVS lblc entries, 58 * when the table is full. 59 */ 60#define CHECK_EXPIRE_INTERVAL (60*HZ) 61#define ENTRY_TIMEOUT (6*60*HZ) 62 63/* 64 * It is for full expiration check. 65 * When there is no partial expiration check (garbage collection) 66 * in a half hour, do a full expiration check to collect stale 67 * entries that haven't been touched for a day. 68 */ 69#define COUNT_FOR_FULL_EXPIRATION 30 70static int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ; 71 72 73/* 74 * for IPVS lblc entry hash table 75 */ 76#ifndef CONFIG_IP_VS_LBLC_TAB_BITS 77#define CONFIG_IP_VS_LBLC_TAB_BITS 10 78#endif 79#define IP_VS_LBLC_TAB_BITS CONFIG_IP_VS_LBLC_TAB_BITS 80#define IP_VS_LBLC_TAB_SIZE (1 << IP_VS_LBLC_TAB_BITS) 81#define IP_VS_LBLC_TAB_MASK (IP_VS_LBLC_TAB_SIZE - 1) 82 83 84/* 85 * IPVS lblc entry represents an association between destination 86 * IP address and its destination server 87 */ 88struct ip_vs_lblc_entry { 89 struct list_head list; 90 __be32 addr; /* destination IP address */ 91 struct ip_vs_dest *dest; /* real server (cache) */ 92 unsigned long lastuse; /* last used time */ 93}; 94 95 96/* 97 * IPVS lblc hash table 98 */ 99struct ip_vs_lblc_table { 100 rwlock_t lock; /* lock for this table */ 101 struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */ 102 atomic_t entries; /* number of entries */ 103 int max_size; /* maximum size of entries */ 104 struct timer_list periodic_timer; /* collect stale entries */ 105 int rover; /* rover for expire check */ 106 int counter; /* counter for no expire */ 107}; 108 109 110/* 111 * IPVS LBLC sysctl table 112 */ 113 114static ctl_table vs_vars_table[] = { 115 { 116 .ctl_name = NET_IPV4_VS_LBLC_EXPIRE, 117 .procname = "lblc_expiration", 118 .data = &sysctl_ip_vs_lblc_expiration, 119 .maxlen = sizeof(int), 120 .mode = 0644, 121 .proc_handler = &proc_dointvec_jiffies, 122 }, 123 { .ctl_name = 0 } 124}; 125 126static ctl_table vs_table[] = { 127 { 128 .ctl_name = NET_IPV4_VS, 129 .procname = "vs", 130 .mode = 0555, 131 .child = vs_vars_table 132 }, 133 { .ctl_name = 0 } 134}; 135 136static ctl_table ipvs_ipv4_table[] = { 137 { 138 .ctl_name = NET_IPV4, 139 .procname = "ipv4", 140 .mode = 0555, 141 .child = vs_table 142 }, 143 { .ctl_name = 0 } 144}; 145 146static ctl_table lblc_root_table[] = { 147 { 148 .ctl_name = CTL_NET, 149 .procname = "net", 150 .mode = 0555, 151 .child = ipvs_ipv4_table 152 }, 153 { .ctl_name = 0 } 154}; 155 156static struct ctl_table_header * sysctl_header; 157 158/* 159 * new/free a ip_vs_lblc_entry, which is a mapping of a destionation 160 * IP address to a server. 161 */ 162static inline struct ip_vs_lblc_entry * 163ip_vs_lblc_new(__be32 daddr, struct ip_vs_dest *dest) 164{ 165 struct ip_vs_lblc_entry *en; 166 167 en = kmalloc(sizeof(struct ip_vs_lblc_entry), GFP_ATOMIC); 168 if (en == NULL) { 169 IP_VS_ERR("ip_vs_lblc_new(): no memory\n"); 170 return NULL; 171 } 172 173 INIT_LIST_HEAD(&en->list); 174 en->addr = daddr; 175 176 atomic_inc(&dest->refcnt); 177 en->dest = dest; 178 179 return en; 180} 181 182 183static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en) 184{ 185 list_del(&en->list); 186 /* 187 * We don't kfree dest because it is refered either by its service 188 * or the trash dest list. 189 */ 190 atomic_dec(&en->dest->refcnt); 191 kfree(en); 192} 193 194 195/* 196 * Returns hash value for IPVS LBLC entry 197 */ 198static inline unsigned ip_vs_lblc_hashkey(__be32 addr) 199{ 200 return (ntohl(addr)*2654435761UL) & IP_VS_LBLC_TAB_MASK; 201} 202 203 204/* 205 * Hash an entry in the ip_vs_lblc_table. 206 * returns bool success. 207 */ 208static int 209ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en) 210{ 211 unsigned hash; 212 213 if (!list_empty(&en->list)) { 214 IP_VS_ERR("ip_vs_lblc_hash(): request for already hashed, " 215 "called from %p\n", __builtin_return_address(0)); 216 return 0; 217 } 218 219 /* 220 * Hash by destination IP address 221 */ 222 hash = ip_vs_lblc_hashkey(en->addr); 223 224 write_lock(&tbl->lock); 225 list_add(&en->list, &tbl->bucket[hash]); 226 atomic_inc(&tbl->entries); 227 write_unlock(&tbl->lock); 228 229 return 1; 230} 231 232 233/* 234 * Get ip_vs_lblc_entry associated with supplied parameters. 235 */ 236static inline struct ip_vs_lblc_entry * 237ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __be32 addr) 238{ 239 unsigned hash; 240 struct ip_vs_lblc_entry *en; 241 242 hash = ip_vs_lblc_hashkey(addr); 243 244 read_lock(&tbl->lock); 245 246 list_for_each_entry(en, &tbl->bucket[hash], list) { 247 if (en->addr == addr) { 248 /* HIT */ 249 read_unlock(&tbl->lock); 250 return en; 251 } 252 } 253 254 read_unlock(&tbl->lock); 255 256 return NULL; 257} 258 259 260/* 261 * Flush all the entries of the specified table. 262 */ 263static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl) 264{ 265 int i; 266 struct ip_vs_lblc_entry *en, *nxt; 267 268 for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { 269 write_lock(&tbl->lock); 270 list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) { 271 ip_vs_lblc_free(en); 272 atomic_dec(&tbl->entries); 273 } 274 write_unlock(&tbl->lock); 275 } 276} 277 278 279static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl) 280{ 281 unsigned long now = jiffies; 282 int i, j; 283 struct ip_vs_lblc_entry *en, *nxt; 284 285 for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { 286 j = (j + 1) & IP_VS_LBLC_TAB_MASK; 287 288 write_lock(&tbl->lock); 289 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { 290 if (time_before(now, 291 en->lastuse + sysctl_ip_vs_lblc_expiration)) 292 continue; 293 294 ip_vs_lblc_free(en); 295 atomic_dec(&tbl->entries); 296 } 297 write_unlock(&tbl->lock); 298 } 299 tbl->rover = j; 300} 301 302 303/* 304 * Periodical timer handler for IPVS lblc table 305 * It is used to collect stale entries when the number of entries 306 * exceeds the maximum size of the table. 307 * 308 * Fixme: we probably need more complicated algorithm to collect 309 * entries that have not been used for a long time even 310 * if the number of entries doesn't exceed the maximum size 311 * of the table. 312 * The full expiration check is for this purpose now. 313 */ 314static void ip_vs_lblc_check_expire(unsigned long data) 315{ 316 struct ip_vs_lblc_table *tbl; 317 unsigned long now = jiffies; 318 int goal; 319 int i, j; 320 struct ip_vs_lblc_entry *en, *nxt; 321 322 tbl = (struct ip_vs_lblc_table *)data; 323 324 if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { 325 /* do full expiration check */ 326 ip_vs_lblc_full_check(tbl); 327 tbl->counter = 1; 328 goto out; 329 } 330 331 if (atomic_read(&tbl->entries) <= tbl->max_size) { 332 tbl->counter++; 333 goto out; 334 } 335 336 goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3; 337 if (goal > tbl->max_size/2) 338 goal = tbl->max_size/2; 339 340 for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { 341 j = (j + 1) & IP_VS_LBLC_TAB_MASK; 342 343 write_lock(&tbl->lock); 344 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { 345 if (time_before(now, en->lastuse + ENTRY_TIMEOUT)) 346 continue; 347 348 ip_vs_lblc_free(en); 349 atomic_dec(&tbl->entries); 350 goal--; 351 } 352 write_unlock(&tbl->lock); 353 if (goal <= 0) 354 break; 355 } 356 tbl->rover = j; 357 358 out: 359 mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); 360} 361 362 363static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) 364{ 365 int i; 366 struct ip_vs_lblc_table *tbl; 367 368 /* 369 * Allocate the ip_vs_lblc_table for this service 370 */ 371 tbl = kmalloc(sizeof(struct ip_vs_lblc_table), GFP_ATOMIC); 372 if (tbl == NULL) { 373 IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n"); 374 return -ENOMEM; 375 } 376 svc->sched_data = tbl; 377 IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for " 378 "current service\n", 379 sizeof(struct ip_vs_lblc_table)); 380 381 /* 382 * Initialize the hash buckets 383 */ 384 for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { 385 INIT_LIST_HEAD(&tbl->bucket[i]); 386 } 387 rwlock_init(&tbl->lock); 388 tbl->max_size = IP_VS_LBLC_TAB_SIZE*16; 389 tbl->rover = 0; 390 tbl->counter = 1; 391 392 /* 393 * Hook periodic timer for garbage collection 394 */ 395 init_timer(&tbl->periodic_timer); 396 tbl->periodic_timer.data = (unsigned long)tbl; 397 tbl->periodic_timer.function = ip_vs_lblc_check_expire; 398 tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL; 399 add_timer(&tbl->periodic_timer); 400 401 return 0; 402} 403 404 405static int ip_vs_lblc_done_svc(struct ip_vs_service *svc) 406{ 407 struct ip_vs_lblc_table *tbl = svc->sched_data; 408 409 /* remove periodic timer */ 410 del_timer_sync(&tbl->periodic_timer); 411 412 /* got to clean up table entries here */ 413 ip_vs_lblc_flush(tbl); 414 415 /* release the table itself */ 416 kfree(svc->sched_data); 417 IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n", 418 sizeof(struct ip_vs_lblc_table)); 419 420 return 0; 421} 422 423 424static int ip_vs_lblc_update_svc(struct ip_vs_service *svc) 425{ 426 return 0; 427} 428 429 430static inline struct ip_vs_dest * 431__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph) 432{ 433 struct ip_vs_dest *dest, *least; 434 int loh, doh; 435 436 /* 437 * We think the overhead of processing active connections is fifty 438 * times higher than that of inactive connections in average. (This 439 * fifty times might not be accurate, we will change it later.) We 440 * use the following formula to estimate the overhead: 441 * dest->activeconns*50 + dest->inactconns 442 * and the load: 443 * (dest overhead) / dest->weight 444 * 445 * Remember -- no floats in kernel mode!!! 446 * The comparison of h1*w2 > h2*w1 is equivalent to that of 447 * h1/w1 > h2/w2 448 * if every weight is larger than zero. 449 * 450 * The server with weight=0 is quiesced and will not receive any 451 * new connection. 452 */ 453 list_for_each_entry(dest, &svc->destinations, n_list) { 454 if (dest->flags & IP_VS_DEST_F_OVERLOAD) 455 continue; 456 if (atomic_read(&dest->weight) > 0) { 457 least = dest; 458 loh = atomic_read(&least->activeconns) * 50 459 + atomic_read(&least->inactconns); 460 goto nextstage; 461 } 462 } 463 return NULL; 464 465 /* 466 * Find the destination with the least load. 467 */ 468 nextstage: 469 list_for_each_entry_continue(dest, &svc->destinations, n_list) { 470 if (dest->flags & IP_VS_DEST_F_OVERLOAD) 471 continue; 472 473 doh = atomic_read(&dest->activeconns) * 50 474 + atomic_read(&dest->inactconns); 475 if (loh * atomic_read(&dest->weight) > 476 doh * atomic_read(&least->weight)) { 477 least = dest; 478 loh = doh; 479 } 480 } 481 482 IP_VS_DBG(6, "LBLC: server %d.%d.%d.%d:%d " 483 "activeconns %d refcnt %d weight %d overhead %d\n", 484 NIPQUAD(least->addr), ntohs(least->port), 485 atomic_read(&least->activeconns), 486 atomic_read(&least->refcnt), 487 atomic_read(&least->weight), loh); 488 489 return least; 490} 491 492 493/* 494 * If this destination server is overloaded and there is a less loaded 495 * server, then return true. 496 */ 497static inline int 498is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) 499{ 500 if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { 501 struct ip_vs_dest *d; 502 503 list_for_each_entry(d, &svc->destinations, n_list) { 504 if (atomic_read(&d->activeconns)*2 505 < atomic_read(&d->weight)) { 506 return 1; 507 } 508 } 509 } 510 return 0; 511} 512 513 514/* 515 * Locality-Based (weighted) Least-Connection scheduling 516 */ 517static struct ip_vs_dest * 518ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) 519{ 520 struct ip_vs_dest *dest; 521 struct ip_vs_lblc_table *tbl; 522 struct ip_vs_lblc_entry *en; 523 struct iphdr *iph = skb->nh.iph; 524 525 IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n"); 526 527 tbl = (struct ip_vs_lblc_table *)svc->sched_data; 528 en = ip_vs_lblc_get(tbl, iph->daddr); 529 if (en == NULL) { 530 dest = __ip_vs_wlc_schedule(svc, iph); 531 if (dest == NULL) { 532 IP_VS_DBG(1, "no destination available\n"); 533 return NULL; 534 } 535 en = ip_vs_lblc_new(iph->daddr, dest); 536 if (en == NULL) { 537 return NULL; 538 } 539 ip_vs_lblc_hash(tbl, en); 540 } else { 541 dest = en->dest; 542 if (!(dest->flags & IP_VS_DEST_F_AVAILABLE) 543 || atomic_read(&dest->weight) <= 0 544 || is_overloaded(dest, svc)) { 545 dest = __ip_vs_wlc_schedule(svc, iph); 546 if (dest == NULL) { 547 IP_VS_DBG(1, "no destination available\n"); 548 return NULL; 549 } 550 atomic_dec(&en->dest->refcnt); 551 atomic_inc(&dest->refcnt); 552 en->dest = dest; 553 } 554 } 555 en->lastuse = jiffies; 556 557 IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u " 558 "--> server %u.%u.%u.%u:%d\n", 559 NIPQUAD(en->addr), 560 NIPQUAD(dest->addr), 561 ntohs(dest->port)); 562 563 return dest; 564} 565 566 567/* 568 * IPVS LBLC Scheduler structure 569 */ 570static struct ip_vs_scheduler ip_vs_lblc_scheduler = 571{ 572 .name = "lblc", 573 .refcnt = ATOMIC_INIT(0), 574 .module = THIS_MODULE, 575 .init_service = ip_vs_lblc_init_svc, 576 .done_service = ip_vs_lblc_done_svc, 577 .update_service = ip_vs_lblc_update_svc, 578 .schedule = ip_vs_lblc_schedule, 579}; 580 581 582static int __init ip_vs_lblc_init(void) 583{ 584 INIT_LIST_HEAD(&ip_vs_lblc_scheduler.n_list); 585 sysctl_header = register_sysctl_table(lblc_root_table, 0); 586 return register_ip_vs_scheduler(&ip_vs_lblc_scheduler); 587} 588 589 590static void __exit ip_vs_lblc_cleanup(void) 591{ 592 unregister_sysctl_table(sysctl_header); 593 unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler); 594} 595 596 597module_init(ip_vs_lblc_init); 598module_exit(ip_vs_lblc_cleanup); 599MODULE_LICENSE("GPL");