Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at v2.6.27-rc7 571 lines 14 kB view raw
/*
 * IPVS:        Locality-Based Least-Connection scheduling module
 *
 * Authors:     Wensong Zhang <wensong@gnuchina.org>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Changes:
 *     Martin Hamilton         :    fixed the terrible locking bugs
 *                                  *lock(tbl->lock) ==> *lock(&tbl->lock)
 *     Wensong Zhang           :    fixed the uninitialized tbl->lock bug
 *     Wensong Zhang           :    added doing full expiration check to
 *                                  collect stale entries of 24+ hours when
 *                                  no partial expire check in a half hour
 *     Julian Anastasov        :    replaced del_timer call with del_timer_sync
 *                                  to avoid the possible race between timer
 *                                  handler and del_timer thread in SMP
 *
 */

/*
 * The lblc algorithm is as follows (pseudo code):
 *
 *       if cachenode[dest_ip] is null then
 *               n, cachenode[dest_ip] <- {weighted least-conn node};
 *       else
 *               n <- cachenode[dest_ip];
 *               if (n is dead) OR
 *                  (n.conns>n.weight AND
 *                   there is a node m with m.conns<m.weight/2) then
 *                 n, cachenode[dest_ip] <- {weighted least-conn node};
 *
 *       return n;
 *
 * Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing
 * me to write this module.
 */

#include <linux/ip.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/jiffies.h>

/* for sysctl */
#include <linux/fs.h>
#include <linux/sysctl.h>

#include <net/ip_vs.h>


/*
 * Garbage collection of stale IPVS lblc entries, used when the table
 * is full: the timer fires every CHECK_EXPIRE_INTERVAL and entries
 * idle longer than ENTRY_TIMEOUT become reclaimable.
 */
#define CHECK_EXPIRE_INTERVAL   (60*HZ)
#define ENTRY_TIMEOUT           (6*60*HZ)

/*
 * It is for full expiration check.
 * When there is no partial expiration check (garbage collection)
 * in a half hour (COUNT_FOR_FULL_EXPIRATION timer ticks), do a full
 * expiration check to collect stale entries that haven't been
 * touched for a day (tunable via the lblc_expiration sysctl below).
 */
#define COUNT_FOR_FULL_EXPIRATION   30
static int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ;


/*
 * for IPVS lblc entry hash table
 */
#ifndef CONFIG_IP_VS_LBLC_TAB_BITS
#define CONFIG_IP_VS_LBLC_TAB_BITS      10
#endif
#define IP_VS_LBLC_TAB_BITS     CONFIG_IP_VS_LBLC_TAB_BITS
#define IP_VS_LBLC_TAB_SIZE     (1 << IP_VS_LBLC_TAB_BITS)
#define IP_VS_LBLC_TAB_MASK     (IP_VS_LBLC_TAB_SIZE - 1)


/*
 * IPVS lblc entry represents an association between destination
 * IP address and its destination server.  Each entry holds a
 * reference (dest->refcnt) on the real server it caches.
 */
struct ip_vs_lblc_entry {
	struct list_head        list;
	__be32                  addr;           /* destination IP address */
	struct ip_vs_dest       *dest;          /* real server (cache) */
	unsigned long           lastuse;        /* last used time (jiffies) */
};


/*
 * IPVS lblc hash table, one per virtual service (svc->sched_data).
 * All bucket chains are protected by the table-wide rwlock.
 */
struct ip_vs_lblc_table {
	rwlock_t                lock;           /* lock for this table */
	struct list_head        bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */
	atomic_t                entries;        /* number of entries */
	int                     max_size;       /* maximum size of entries */
	struct timer_list       periodic_timer; /* collect stale entries */
	int                     rover;          /* rover for expire check */
	int                     counter;        /* counter for no expire */
};


/*
 * IPVS LBLC sysctl table (net.ipv4.vs.lblc_expiration)
 */

static ctl_table vs_vars_table[] = {
	{
		.procname	= "lblc_expiration",
		.data		= &sysctl_ip_vs_lblc_expiration,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
	},
	{ .ctl_name = 0 }
};

static struct ctl_table_header * sysctl_header;

/*
 * new/free a ip_vs_lblc_entry, which is a mapping of a destination
 * IP address to a server.
129 */ 130static inline struct ip_vs_lblc_entry * 131ip_vs_lblc_new(__be32 daddr, struct ip_vs_dest *dest) 132{ 133 struct ip_vs_lblc_entry *en; 134 135 en = kmalloc(sizeof(struct ip_vs_lblc_entry), GFP_ATOMIC); 136 if (en == NULL) { 137 IP_VS_ERR("ip_vs_lblc_new(): no memory\n"); 138 return NULL; 139 } 140 141 INIT_LIST_HEAD(&en->list); 142 en->addr = daddr; 143 144 atomic_inc(&dest->refcnt); 145 en->dest = dest; 146 147 return en; 148} 149 150 151static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en) 152{ 153 list_del(&en->list); 154 /* 155 * We don't kfree dest because it is refered either by its service 156 * or the trash dest list. 157 */ 158 atomic_dec(&en->dest->refcnt); 159 kfree(en); 160} 161 162 163/* 164 * Returns hash value for IPVS LBLC entry 165 */ 166static inline unsigned ip_vs_lblc_hashkey(__be32 addr) 167{ 168 return (ntohl(addr)*2654435761UL) & IP_VS_LBLC_TAB_MASK; 169} 170 171 172/* 173 * Hash an entry in the ip_vs_lblc_table. 174 * returns bool success. 175 */ 176static int 177ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en) 178{ 179 unsigned hash; 180 181 if (!list_empty(&en->list)) { 182 IP_VS_ERR("ip_vs_lblc_hash(): request for already hashed, " 183 "called from %p\n", __builtin_return_address(0)); 184 return 0; 185 } 186 187 /* 188 * Hash by destination IP address 189 */ 190 hash = ip_vs_lblc_hashkey(en->addr); 191 192 write_lock(&tbl->lock); 193 list_add(&en->list, &tbl->bucket[hash]); 194 atomic_inc(&tbl->entries); 195 write_unlock(&tbl->lock); 196 197 return 1; 198} 199 200 201/* 202 * Get ip_vs_lblc_entry associated with supplied parameters. 
203 */ 204static inline struct ip_vs_lblc_entry * 205ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __be32 addr) 206{ 207 unsigned hash; 208 struct ip_vs_lblc_entry *en; 209 210 hash = ip_vs_lblc_hashkey(addr); 211 212 read_lock(&tbl->lock); 213 214 list_for_each_entry(en, &tbl->bucket[hash], list) { 215 if (en->addr == addr) { 216 /* HIT */ 217 read_unlock(&tbl->lock); 218 return en; 219 } 220 } 221 222 read_unlock(&tbl->lock); 223 224 return NULL; 225} 226 227 228/* 229 * Flush all the entries of the specified table. 230 */ 231static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl) 232{ 233 int i; 234 struct ip_vs_lblc_entry *en, *nxt; 235 236 for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { 237 write_lock(&tbl->lock); 238 list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) { 239 ip_vs_lblc_free(en); 240 atomic_dec(&tbl->entries); 241 } 242 write_unlock(&tbl->lock); 243 } 244} 245 246 247static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl) 248{ 249 unsigned long now = jiffies; 250 int i, j; 251 struct ip_vs_lblc_entry *en, *nxt; 252 253 for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { 254 j = (j + 1) & IP_VS_LBLC_TAB_MASK; 255 256 write_lock(&tbl->lock); 257 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { 258 if (time_before(now, 259 en->lastuse + sysctl_ip_vs_lblc_expiration)) 260 continue; 261 262 ip_vs_lblc_free(en); 263 atomic_dec(&tbl->entries); 264 } 265 write_unlock(&tbl->lock); 266 } 267 tbl->rover = j; 268} 269 270 271/* 272 * Periodical timer handler for IPVS lblc table 273 * It is used to collect stale entries when the number of entries 274 * exceeds the maximum size of the table. 275 * 276 * Fixme: we probably need more complicated algorithm to collect 277 * entries that have not been used for a long time even 278 * if the number of entries doesn't exceed the maximum size 279 * of the table. 280 * The full expiration check is for this purpose now. 
 */
static void ip_vs_lblc_check_expire(unsigned long data)
{
	struct ip_vs_lblc_table *tbl;
	unsigned long now = jiffies;
	int goal;
	int i, j;
	struct ip_vs_lblc_entry *en, *nxt;

	/* data is the ip_vs_lblc_table passed at setup_timer() time */
	tbl = (struct ip_vs_lblc_table *)data;

	if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
		/* do full expiration check */
		ip_vs_lblc_full_check(tbl);
		tbl->counter = 1;
		goto out;
	}

	if (atomic_read(&tbl->entries) <= tbl->max_size) {
		/* table not over capacity: just count quiet intervals */
		tbl->counter++;
		goto out;
	}

	/* reclaim a bit more than the overshoot, capped at half the table */
	goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
	if (goal > tbl->max_size/2)
		goal = tbl->max_size/2;

	/* partial scan: resume at the rover, stop once the goal is met */
	for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
		j = (j + 1) & IP_VS_LBLC_TAB_MASK;

		write_lock(&tbl->lock);
		list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
			if (time_before(now, en->lastuse + ENTRY_TIMEOUT))
				continue;

			ip_vs_lblc_free(en);
			atomic_dec(&tbl->entries);
			goal--;
		}
		write_unlock(&tbl->lock);
		if (goal <= 0)
			break;
	}
	tbl->rover = j;

  out:
	/* re-arm ourselves; stopped only by del_timer_sync() in done_svc */
	mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
}


/*
 * Scheduler init_service hook: allocate and initialize the per-service
 * lblc hash table and start its garbage-collection timer.
 * Returns 0 on success or -ENOMEM.
 */
static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
{
	int i;
	struct ip_vs_lblc_table *tbl;

	/*
	 *    Allocate the ip_vs_lblc_table for this service
	 */
	tbl = kmalloc(sizeof(struct ip_vs_lblc_table), GFP_ATOMIC);
	if (tbl == NULL) {
		IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n");
		return -ENOMEM;
	}
	svc->sched_data = tbl;
	IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for "
		  "current service\n",
		  sizeof(struct ip_vs_lblc_table));

	/*
	 *    Initialize the hash buckets
	 */
	for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
		INIT_LIST_HEAD(&tbl->bucket[i]);
	}
	rwlock_init(&tbl->lock);
	tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
	tbl->rover = 0;
	tbl->counter = 1;

	/*
	 *    Hook periodic timer for garbage collection
	 */
	setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire,
			(unsigned long)tbl);
	tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL;
	add_timer(&tbl->periodic_timer);

	return 0;
}


/*
 * Scheduler done_service hook: stop the timer (del_timer_sync so a
 * concurrently-running handler has finished), drain all cached
 * entries, then free the table.
 */
static int ip_vs_lblc_done_svc(struct ip_vs_service *svc)
{
	struct ip_vs_lblc_table *tbl = svc->sched_data;

	/* remove periodic timer */
	del_timer_sync(&tbl->periodic_timer);

	/* got to clean up table entries here */
	ip_vs_lblc_flush(tbl);

	/* release the table itself */
	kfree(svc->sched_data);
	IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n",
		  sizeof(struct ip_vs_lblc_table));

	return 0;
}


/* Nothing to recompute when the destination set changes. */
static int ip_vs_lblc_update_svc(struct ip_vs_service *svc)
{
	return 0;
}


/*
 * Weighted least-connection selection over svc->destinations, used
 * when the cache has no (usable) entry for the destination address.
 * Returns NULL when no server has positive weight.
 */
static inline struct ip_vs_dest *
__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
{
	struct ip_vs_dest *dest, *least;
	int loh, doh;

	/*
	 * We think the overhead of processing active connections is fifty
	 * times higher than that of inactive connections in average. (This
	 * fifty times might not be accurate, we will change it later.) We
	 * use the following formula to estimate the overhead:
	 *                dest->activeconns*50 + dest->inactconns
	 * and the load:
	 *                (dest overhead) / dest->weight
	 *
	 * Remember -- no floats in kernel mode!!!
	 * The comparison of h1*w2 > h2*w1 is equivalent to that of
	 *                h1/w1 > h2/w2
	 * if every weight is larger than zero.
	 *
	 * The server with weight=0 is quiesced and will not receive any
	 * new connection.
	 */
	/* first pass: find the first non-overloaded server with weight > 0 */
	list_for_each_entry(dest, &svc->destinations, n_list) {
		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
			continue;
		if (atomic_read(&dest->weight) > 0) {
			least = dest;
			loh = atomic_read(&least->activeconns) * 50
				+ atomic_read(&least->inactconns);
			goto nextstage;
		}
	}
	return NULL;

	/*
	 *    Find the destination with the least load.
	 */
  nextstage:
	/* second pass: continue from 'least', keep the lowest-load server */
	list_for_each_entry_continue(dest, &svc->destinations, n_list) {
		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
			continue;

		doh = atomic_read(&dest->activeconns) * 50
			+ atomic_read(&dest->inactconns);
		/* cross-multiplied form of loh/l_weight > doh/d_weight */
		if (loh * atomic_read(&dest->weight) >
		    doh * atomic_read(&least->weight)) {
			least = dest;
			loh = doh;
		}
	}

	IP_VS_DBG(6, "LBLC: server %d.%d.%d.%d:%d "
		  "activeconns %d refcnt %d weight %d overhead %d\n",
		  NIPQUAD(least->addr), ntohs(least->port),
		  atomic_read(&least->activeconns),
		  atomic_read(&least->refcnt),
		  atomic_read(&least->weight), loh);

	return least;
}


/*
 *   If this destination server is overloaded and there is a less loaded
 *   server, then return true.
 */
static inline int
is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
{
	if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
		struct ip_vs_dest *d;

		/* "less loaded" here means activeconns < weight/2 */
		list_for_each_entry(d, &svc->destinations, n_list) {
			if (atomic_read(&d->activeconns)*2
			    < atomic_read(&d->weight)) {
				return 1;
			}
		}
	}
	return 0;
}


/*
 *    Locality-Based (weighted) Least-Connection scheduling
 *
 * NOTE(review): the entry returned by ip_vs_lblc_get() is dereferenced
 * and updated after the table's read lock has been dropped, so the
 * expiry timer can free it concurrently; likewise two CPUs can both
 * miss and insert duplicate entries for the same address.  Later
 * kernels restructured this function to do the lookup/insert under one
 * lock — confirm against the upstream fix before relying on this path
 * under heavy SMP load.
 */
static struct ip_vs_dest *
ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
	struct ip_vs_dest *dest;
	struct ip_vs_lblc_table *tbl;
	struct ip_vs_lblc_entry *en;
	struct iphdr *iph = ip_hdr(skb);

	IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n");

	tbl = (struct ip_vs_lblc_table *)svc->sched_data;
	en = ip_vs_lblc_get(tbl, iph->daddr);
	if (en == NULL) {
		/* cache miss: pick a server and remember the mapping */
		dest = __ip_vs_wlc_schedule(svc, iph);
		if (dest == NULL) {
			IP_VS_DBG(1, "no destination available\n");
			return NULL;
		}
		en = ip_vs_lblc_new(iph->daddr, dest);
		if (en == NULL) {
			return NULL;
		}
		ip_vs_lblc_hash(tbl, en);
	} else {
		/* cache hit: reuse unless the server became unusable */
		dest = en->dest;
		if (!(dest->flags & IP_VS_DEST_F_AVAILABLE)
		    || atomic_read(&dest->weight) <= 0
		    || is_overloaded(dest, svc)) {
			dest = __ip_vs_wlc_schedule(svc, iph);
			if (dest == NULL) {
				IP_VS_DBG(1, "no destination available\n");
				return NULL;
			}
			/* retarget the entry: swap the held reference */
			atomic_dec(&en->dest->refcnt);
			atomic_inc(&dest->refcnt);
			en->dest = dest;
		}
	}
	en->lastuse = jiffies;

	IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u "
		  "--> server %u.%u.%u.%u:%d\n",
		  NIPQUAD(en->addr),
		  NIPQUAD(dest->addr),
		  ntohs(dest->port));

	return dest;
}


/*
 *      IPVS LBLC Scheduler structure
 */
static struct ip_vs_scheduler ip_vs_lblc_scheduler =
{
	.name =			"lblc",
	.refcnt =		ATOMIC_INIT(0),
	.module =		THIS_MODULE,
	.n_list =		LIST_HEAD_INIT(ip_vs_lblc_scheduler.n_list),
	.init_service =		ip_vs_lblc_init_svc,
	.done_service =		ip_vs_lblc_done_svc,
	.update_service =	ip_vs_lblc_update_svc,
	.schedule =		ip_vs_lblc_schedule,
};


/*
 * Module init: register the sysctl table, then the scheduler;
 * unregister the sysctl table again if scheduler registration fails.
 */
static int __init ip_vs_lblc_init(void)
{
	int ret;

	sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table);
	ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler);
	if (ret)
		unregister_sysctl_table(sysctl_header);
	return ret;
}


/* Module exit: tear down in reverse registration order. */
static void __exit ip_vs_lblc_cleanup(void)
{
	unregister_sysctl_table(sysctl_header);
	unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler);
}


module_init(ip_vs_lblc_init);
module_exit(ip_vs_lblc_cleanup);
MODULE_LICENSE("GPL");