Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'ipvs2-for-v4.3' of https://git.kernel.org/pub/scm/linux/kernel/git/horms/ipvs-next

Simon Horman says:

====================
Second Round of IPVS Updates for v4.3

I realise these are a little late in the cycle, so if you would prefer
me to repost them for v4.4 then just let me know.

The updates include:
* A new scheduler from Raducu Deaconu
* Enhanced configurability of the sync daemon from Julian Anastasov
====================

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>

+402 -136
+15 -8
include/net/ip_vs.h
··· 846 846 /* How much time to keep dests in trash */ 847 847 #define IP_VS_DEST_TRASH_PERIOD (120 * HZ) 848 848 849 + struct ipvs_sync_daemon_cfg { 850 + union nf_inet_addr mcast_group; 851 + int syncid; 852 + u16 sync_maxlen; 853 + u16 mcast_port; 854 + u8 mcast_af; 855 + u8 mcast_ttl; 856 + /* multicast interface name */ 857 + char mcast_ifn[IP_VS_IFNAME_MAXLEN]; 858 + }; 859 + 849 860 /* IPVS in network namespace */ 850 861 struct netns_ipvs { 851 862 int gen; /* Generation */ ··· 972 961 spinlock_t sync_buff_lock; 973 962 struct task_struct **backup_threads; 974 963 int threads_mask; 975 - int send_mesg_maxlen; 976 - int recv_mesg_maxlen; 977 964 volatile int sync_state; 978 - volatile int master_syncid; 979 - volatile int backup_syncid; 980 965 struct mutex sync_mutex; 981 - /* multicast interface name */ 982 - char master_mcast_ifn[IP_VS_IFNAME_MAXLEN]; 983 - char backup_mcast_ifn[IP_VS_IFNAME_MAXLEN]; 966 + struct ipvs_sync_daemon_cfg mcfg; /* Master Configuration */ 967 + struct ipvs_sync_daemon_cfg bcfg; /* Backup Configuration */ 984 968 /* net name space ptr */ 985 969 struct net *net; /* Needed by timer routines */ 986 970 /* Number of heterogeneous destinations, needed becaus heterogeneous ··· 1414 1408 /* IPVS sync daemon data and function prototypes 1415 1409 * (from ip_vs_sync.c) 1416 1410 */ 1417 - int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid); 1411 + int start_sync_thread(struct net *net, struct ipvs_sync_daemon_cfg *cfg, 1412 + int state); 1418 1413 int stop_sync_thread(struct net *net, int state); 1419 1414 void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp, int pkts); 1420 1415
+5
include/uapi/linux/ip_vs.h
··· 406 406 IPVS_DAEMON_ATTR_STATE, /* sync daemon state (master/backup) */ 407 407 IPVS_DAEMON_ATTR_MCAST_IFN, /* multicast interface name */ 408 408 IPVS_DAEMON_ATTR_SYNC_ID, /* SyncID we belong to */ 409 + IPVS_DAEMON_ATTR_SYNC_MAXLEN, /* UDP Payload Size */ 410 + IPVS_DAEMON_ATTR_MCAST_GROUP, /* IPv4 Multicast Address */ 411 + IPVS_DAEMON_ATTR_MCAST_GROUP6, /* IPv6 Multicast Address */ 412 + IPVS_DAEMON_ATTR_MCAST_PORT, /* Multicast Port (base) */ 413 + IPVS_DAEMON_ATTR_MCAST_TTL, /* Multicast TTL */ 409 414 __IPVS_DAEMON_ATTR_MAX, 410 415 }; 411 416
+11
net/netfilter/ipvs/Kconfig
··· 162 162 If you want to compile it in kernel, say Y. To compile it as a 163 163 module, choose M here. If unsure, say N. 164 164 165 + config IP_VS_OVF 166 + tristate "weighted overflow scheduling" 167 + ---help--- 168 + The weighted overflow scheduling algorithm directs network 169 + connections to the server with the highest weight that is 170 + currently available and overflows to the next when active 171 + connections exceed the node's weight. 172 + 173 + If you want to compile it in kernel, say Y. To compile it as a 174 + module, choose M here. If unsure, say N. 175 + 165 176 config IP_VS_LBLC 166 177 tristate "locality-based least-connection scheduling" 167 178 ---help---
+1
net/netfilter/ipvs/Makefile
··· 27 27 obj-$(CONFIG_IP_VS_LC) += ip_vs_lc.o 28 28 obj-$(CONFIG_IP_VS_WLC) += ip_vs_wlc.o 29 29 obj-$(CONFIG_IP_VS_FO) += ip_vs_fo.o 30 + obj-$(CONFIG_IP_VS_OVF) += ip_vs_ovf.o 30 31 obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o 31 32 obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o 32 33 obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o
+110 -33
net/netfilter/ipvs/ip_vs_ctl.c
··· 2335 2335 cmd == IP_VS_SO_SET_STOPDAEMON) { 2336 2336 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; 2337 2337 2338 - mutex_lock(&ipvs->sync_mutex); 2339 - if (cmd == IP_VS_SO_SET_STARTDAEMON) 2340 - ret = start_sync_thread(net, dm->state, dm->mcast_ifn, 2341 - dm->syncid); 2342 - else 2338 + if (cmd == IP_VS_SO_SET_STARTDAEMON) { 2339 + struct ipvs_sync_daemon_cfg cfg; 2340 + 2341 + memset(&cfg, 0, sizeof(cfg)); 2342 + strlcpy(cfg.mcast_ifn, dm->mcast_ifn, 2343 + sizeof(cfg.mcast_ifn)); 2344 + cfg.syncid = dm->syncid; 2345 + rtnl_lock(); 2346 + mutex_lock(&ipvs->sync_mutex); 2347 + ret = start_sync_thread(net, &cfg, dm->state); 2348 + mutex_unlock(&ipvs->sync_mutex); 2349 + rtnl_unlock(); 2350 + } else { 2351 + mutex_lock(&ipvs->sync_mutex); 2343 2352 ret = stop_sync_thread(net, dm->state); 2344 - mutex_unlock(&ipvs->sync_mutex); 2353 + mutex_unlock(&ipvs->sync_mutex); 2354 + } 2345 2355 goto out_dec; 2346 2356 } 2347 2357 ··· 2655 2645 mutex_lock(&ipvs->sync_mutex); 2656 2646 if (ipvs->sync_state & IP_VS_STATE_MASTER) { 2657 2647 d[0].state = IP_VS_STATE_MASTER; 2658 - strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn, 2648 + strlcpy(d[0].mcast_ifn, ipvs->mcfg.mcast_ifn, 2659 2649 sizeof(d[0].mcast_ifn)); 2660 - d[0].syncid = ipvs->master_syncid; 2650 + d[0].syncid = ipvs->mcfg.syncid; 2661 2651 } 2662 2652 if (ipvs->sync_state & IP_VS_STATE_BACKUP) { 2663 2653 d[1].state = IP_VS_STATE_BACKUP; 2664 - strlcpy(d[1].mcast_ifn, ipvs->backup_mcast_ifn, 2654 + strlcpy(d[1].mcast_ifn, ipvs->bcfg.mcast_ifn, 2665 2655 sizeof(d[1].mcast_ifn)); 2666 - d[1].syncid = ipvs->backup_syncid; 2656 + d[1].syncid = ipvs->bcfg.syncid; 2667 2657 } 2668 2658 if (copy_to_user(user, &d, sizeof(d)) != 0) 2669 2659 ret = -EFAULT; ··· 2818 2808 [IPVS_DAEMON_ATTR_MCAST_IFN] = { .type = NLA_NUL_STRING, 2819 2809 .len = IP_VS_IFNAME_MAXLEN }, 2820 2810 [IPVS_DAEMON_ATTR_SYNC_ID] = { .type = NLA_U32 }, 2811 + [IPVS_DAEMON_ATTR_SYNC_MAXLEN] = { .type = NLA_U16 }, 2812 + [IPVS_DAEMON_ATTR_MCAST_GROUP] = { .type = NLA_U32 }, 2813 + [IPVS_DAEMON_ATTR_MCAST_GROUP6] = { .len = sizeof(struct in6_addr) }, 2814 + [IPVS_DAEMON_ATTR_MCAST_PORT] = { .type = NLA_U16 }, 2815 + [IPVS_DAEMON_ATTR_MCAST_TTL] = { .type = NLA_U8 }, 2821 2816 }; 2822 2817 2823 2818 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */ ··· 3281 3266 } 3282 3267 3283 3268 static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state, 3284 - const char *mcast_ifn, __u32 syncid) 3269 + struct ipvs_sync_daemon_cfg *c) 3285 3270 { 3286 3271 struct nlattr *nl_daemon; 3287 3272 ··· 3290 3275 return -EMSGSIZE; 3291 3276 3292 3277 if (nla_put_u32(skb, IPVS_DAEMON_ATTR_STATE, state) || 3293 - nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn) || 3294 - nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid)) 3278 + nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, c->mcast_ifn) || 3279 + nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, c->syncid) || 3280 + nla_put_u16(skb, IPVS_DAEMON_ATTR_SYNC_MAXLEN, c->sync_maxlen) || 3281 + nla_put_u16(skb, IPVS_DAEMON_ATTR_MCAST_PORT, c->mcast_port) || 3282 + nla_put_u8(skb, IPVS_DAEMON_ATTR_MCAST_TTL, c->mcast_ttl)) 3295 3283 goto nla_put_failure; 3284 + #ifdef CONFIG_IP_VS_IPV6 3285 + if (c->mcast_af == AF_INET6) { 3286 + if (nla_put_in6_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP6, 3287 + &c->mcast_group.in6)) 3288 + goto nla_put_failure; 3289 + } else 3290 + #endif 3291 + if (c->mcast_af == AF_INET && 3292 + nla_put_in_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP, 3293 + c->mcast_group.ip)) 3294 + goto nla_put_failure; 3296 3295 nla_nest_end(skb, nl_daemon); 3297 3296 3298 3297 return 0; ··· 3317 3288 } 3318 3289 3319 3290 static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state, 3320 - const char *mcast_ifn, __u32 syncid, 3291 + struct ipvs_sync_daemon_cfg *c, 3321 3292 struct netlink_callback *cb) 3322 3293 { 3323 3294 void *hdr; ··· 3327 3298 if (!hdr) 3328 3299 return -EMSGSIZE; 3329 3300 3330 - if (ip_vs_genl_fill_daemon(skb, state, mcast_ifn, syncid)) 3301 + if (ip_vs_genl_fill_daemon(skb, state, c)) 3331 3302 goto nla_put_failure; 3332 3303 3333 3304 genlmsg_end(skb, hdr); ··· 3347 3318 mutex_lock(&ipvs->sync_mutex); 3348 3319 if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) { 3349 3320 if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER, 3350 - ipvs->master_mcast_ifn, 3351 - ipvs->master_syncid, cb) < 0) 3321 + &ipvs->mcfg, cb) < 0) 3352 3322 goto nla_put_failure; 3353 3323 3354 3324 cb->args[0] = 1; ··· 3355 3327 3356 3328 if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) { 3357 3329 if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP, 3358 - ipvs->backup_mcast_ifn, 3359 - ipvs->backup_syncid, cb) < 0) 3330 + &ipvs->bcfg, cb) < 0) 3360 3331 goto nla_put_failure; 3361 3332 3362 3333 cb->args[1] = 1; ··· 3369 3342 3370 3343 static int ip_vs_genl_new_daemon(struct net *net, struct nlattr **attrs) 3371 3344 { 3345 + struct netns_ipvs *ipvs = net_ipvs(net); 3346 + struct ipvs_sync_daemon_cfg c; 3347 + struct nlattr *a; 3348 + int ret; 3349 + 3350 + memset(&c, 0, sizeof(c)); 3372 3351 if (!(attrs[IPVS_DAEMON_ATTR_STATE] && 3373 3352 attrs[IPVS_DAEMON_ATTR_MCAST_IFN] && 3374 3353 attrs[IPVS_DAEMON_ATTR_SYNC_ID])) 3375 3354 return -EINVAL; 3355 + strlcpy(c.mcast_ifn, nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]), 3356 + sizeof(c.mcast_ifn)); 3357 + c.syncid = nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]); 3358 + 3359 + a = attrs[IPVS_DAEMON_ATTR_SYNC_MAXLEN]; 3360 + if (a) 3361 + c.sync_maxlen = nla_get_u16(a); 3362 + 3363 + a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP]; 3364 + if (a) { 3365 + c.mcast_af = AF_INET; 3366 + c.mcast_group.ip = nla_get_in_addr(a); 3367 + if (!ipv4_is_multicast(c.mcast_group.ip)) 3368 + return -EINVAL; 3369 + } else { 3370 + a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP6]; 3371 + if (a) { 3372 + #ifdef CONFIG_IP_VS_IPV6 3373 + int addr_type; 3374 + 3375 + c.mcast_af = AF_INET6; 3376 + c.mcast_group.in6 = nla_get_in6_addr(a); 3377 + addr_type = ipv6_addr_type(&c.mcast_group.in6); 3378 + if (!(addr_type & IPV6_ADDR_MULTICAST)) 3379 + return -EINVAL; 3380 + #else 3381 + return -EAFNOSUPPORT; 3382 + #endif 3383 + } 3384 + } 3385 + 3386 + a = attrs[IPVS_DAEMON_ATTR_MCAST_PORT]; 3387 + if (a) 3388 + c.mcast_port = nla_get_u16(a); 3389 + 3390 + a = attrs[IPVS_DAEMON_ATTR_MCAST_TTL]; 3391 + if (a) 3392 + c.mcast_ttl = nla_get_u8(a); 3376 3393 3377 3394 /* The synchronization protocol is incompatible with mixed family 3378 3395 * services 3379 3396 */ 3380 - if (net_ipvs(net)->mixed_address_family_dests > 0) 3397 + if (ipvs->mixed_address_family_dests > 0) 3381 3398 return -EINVAL; 3382 3399 3383 - return start_sync_thread(net, 3384 - nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]), 3385 - nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]), 3386 - nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID])); 3400 + rtnl_lock(); 3401 + mutex_lock(&ipvs->sync_mutex); 3402 + ret = start_sync_thread(net, &c, 3403 + nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); 3404 + mutex_unlock(&ipvs->sync_mutex); 3405 + rtnl_unlock(); 3406 + return ret; 3387 3407 } 3388 3408 3389 3409 static int ip_vs_genl_del_daemon(struct net *net, struct nlattr **attrs) 3390 3410 { 3411 + struct netns_ipvs *ipvs = net_ipvs(net); 3412 + int ret; 3413 + 3391 3414 if (!attrs[IPVS_DAEMON_ATTR_STATE]) 3392 3415 return -EINVAL; 3393 3416 3394 - return stop_sync_thread(net, 3395 - nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); 3417 + mutex_lock(&ipvs->sync_mutex); 3418 + ret = stop_sync_thread(net, 3419 + nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); 3420 + mutex_unlock(&ipvs->sync_mutex); 3421 + return ret; 3396 3422 } 3397 3423 3398 3424 static int ip_vs_genl_set_config(struct net *net, struct nlattr **attrs) ··· 3469 3389 3470 3390 static int ip_vs_genl_set_daemon(struct sk_buff *skb, struct genl_info *info) 3471 3391 { 3472 - int ret = 0, cmd; 3392 + int ret = -EINVAL, cmd; 3473 3393 struct net *net; 3474 3394 struct netns_ipvs *ipvs; 3475 3395 ··· 3480 3400 if (cmd == IPVS_CMD_NEW_DAEMON || cmd == IPVS_CMD_DEL_DAEMON) { 3481 3401 struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1]; 3482 3402 3483 - mutex_lock(&ipvs->sync_mutex); 3484 3403 if (!info->attrs[IPVS_CMD_ATTR_DAEMON] || 3485 3404 nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX, 3486 3405 info->attrs[IPVS_CMD_ATTR_DAEMON], 3487 - ip_vs_daemon_policy)) { 3488 - ret = -EINVAL; 3406 + ip_vs_daemon_policy)) 3489 3407 goto out; 3490 - } 3491 3408 3492 3409 if (cmd == IPVS_CMD_NEW_DAEMON) 3493 3410 ret = ip_vs_genl_new_daemon(net, daemon_attrs); 3494 3411 else 3495 3412 ret = ip_vs_genl_del_daemon(net, daemon_attrs); 3496 - out: 3497 - mutex_unlock(&ipvs->sync_mutex); 3498 3413 } 3414 + 3415 + out: 3499 3416 return ret; 3500 3417 } 3501 3418
+86
net/netfilter/ipvs/ip_vs_ovf.c
··· 1 + /* 2 + * IPVS: Overflow-Connection Scheduling module 3 + * 4 + * Authors: Raducu Deaconu <rhadoo_io@yahoo.com> 5 + * 6 + * This program is free software; you can redistribute it and/or 7 + * modify it under the terms of the GNU General Public License 8 + * as published by the Free Software Foundation; either version 9 + * 2 of the License, or (at your option) any later version. 10 + * 11 + * Scheduler implements "overflow" loadbalancing according to number of active 12 + * connections , will keep all conections to the node with the highest weight 13 + * and overflow to the next node if the number of connections exceeds the node's 14 + * weight. 15 + * Note that this scheduler might not be suitable for UDP because it only uses 16 + * active connections 17 + * 18 + */ 19 + 20 + #define KMSG_COMPONENT "IPVS" 21 + #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt 22 + 23 + #include <linux/module.h> 24 + #include <linux/kernel.h> 25 + 26 + #include <net/ip_vs.h> 27 + 28 + /* OVF Connection scheduling */ 29 + static struct ip_vs_dest * 30 + ip_vs_ovf_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, 31 + struct ip_vs_iphdr *iph) 32 + { 33 + struct ip_vs_dest *dest, *h = NULL; 34 + int hw = 0, w; 35 + 36 + IP_VS_DBG(6, "ip_vs_ovf_schedule(): Scheduling...\n"); 37 + /* select the node with highest weight, go to next in line if active 38 + * connections exceed weight 39 + */ 40 + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { 41 + w = atomic_read(&dest->weight); 42 + if ((dest->flags & IP_VS_DEST_F_OVERLOAD) || 43 + atomic_read(&dest->activeconns) > w || 44 + w == 0) 45 + continue; 46 + if (!h || w > hw) { 47 + h = dest; 48 + hw = w; 49 + } 50 + } 51 + 52 + if (h) { 53 + IP_VS_DBG_BUF(6, "OVF: server %s:%u active %d w %d\n", 54 + IP_VS_DBG_ADDR(h->af, &h->addr), 55 + ntohs(h->port), 56 + atomic_read(&h->activeconns), 57 + atomic_read(&h->weight)); 58 + return h; 59 + } 60 + 61 + ip_vs_scheduler_err(svc, "no destination available"); 62 + return NULL; 63 + } 64 + 65 + static struct ip_vs_scheduler ip_vs_ovf_scheduler = { 66 + .name = "ovf", 67 + .refcnt = ATOMIC_INIT(0), 68 + .module = THIS_MODULE, 69 + .n_list = LIST_HEAD_INIT(ip_vs_ovf_scheduler.n_list), 70 + .schedule = ip_vs_ovf_schedule, 71 + }; 72 + 73 + static int __init ip_vs_ovf_init(void) 74 + { 75 + return register_ip_vs_scheduler(&ip_vs_ovf_scheduler); 76 + } 77 + 78 + static void __exit ip_vs_ovf_cleanup(void) 79 + { 80 + unregister_ip_vs_scheduler(&ip_vs_ovf_scheduler); 81 + synchronize_rcu(); 82 + } 83 + 84 + module_init(ip_vs_ovf_init); 85 + module_exit(ip_vs_ovf_cleanup); 86 + MODULE_LICENSE("GPL");
+174 -95
net/netfilter/ipvs/ip_vs_sync.c
··· 262 262 /* ip_vs_sync_conn entries start here */ 263 263 }; 264 264 265 + union ipvs_sockaddr { 266 + struct sockaddr_in in; 267 + struct sockaddr_in6 in6; 268 + }; 269 + 265 270 struct ip_vs_sync_buff { 266 271 struct list_head list; 267 272 unsigned long firstuse; ··· 325 320 * Create a new sync buffer for Version 1 proto. 326 321 */ 327 322 static inline struct ip_vs_sync_buff * 328 - ip_vs_sync_buff_create(struct netns_ipvs *ipvs) 323 + ip_vs_sync_buff_create(struct netns_ipvs *ipvs, unsigned int len) 329 324 { 330 325 struct ip_vs_sync_buff *sb; 331 326 332 327 if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) 333 328 return NULL; 334 329 335 - sb->mesg = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC); 330 + len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg), 331 + ipvs->mcfg.sync_maxlen); 332 + sb->mesg = kmalloc(len, GFP_ATOMIC); 336 333 if (!sb->mesg) { 337 334 kfree(sb); 338 335 return NULL; 339 336 } 340 337 sb->mesg->reserved = 0; /* old nr_conns i.e. must be zero now */ 341 338 sb->mesg->version = SYNC_PROTO_VER; 342 - sb->mesg->syncid = ipvs->master_syncid; 339 + sb->mesg->syncid = ipvs->mcfg.syncid; 343 340 sb->mesg->size = htons(sizeof(struct ip_vs_sync_mesg)); 344 341 sb->mesg->nr_conns = 0; 345 342 sb->mesg->spare = 0; 346 343 sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg); 347 - sb->end = (unsigned char *)sb->mesg + ipvs->send_mesg_maxlen; 344 + sb->end = (unsigned char *)sb->mesg + len; 348 345 349 346 sb->firstuse = jiffies; 350 347 return sb; ··· 409 402 * Create a new sync buffer for Version 0 proto. 410 403 */ 411 404 static inline struct ip_vs_sync_buff * 412 - ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs) 405 + ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs, unsigned int len) 413 406 { 414 407 struct ip_vs_sync_buff *sb; 415 408 struct ip_vs_sync_mesg_v0 *mesg; ··· 417 410 if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) 418 411 return NULL; 419 412 420 - sb->mesg = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC); 413 + len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg_v0), 414 + ipvs->mcfg.sync_maxlen); 415 + sb->mesg = kmalloc(len, GFP_ATOMIC); 421 416 if (!sb->mesg) { 422 417 kfree(sb); 423 418 return NULL; 424 419 } 425 420 mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg; 426 421 mesg->nr_conns = 0; 427 - mesg->syncid = ipvs->master_syncid; 422 + mesg->syncid = ipvs->mcfg.syncid; 428 423 mesg->size = htons(sizeof(struct ip_vs_sync_mesg_v0)); 429 424 sb->head = (unsigned char *)mesg + sizeof(struct ip_vs_sync_mesg_v0); 430 - sb->end = (unsigned char *)mesg + ipvs->send_mesg_maxlen; 425 + sb->end = (unsigned char *)mesg + len; 431 426 sb->firstuse = jiffies; 432 427 return sb; 433 428 } ··· 542 533 struct ip_vs_sync_buff *buff; 543 534 struct ipvs_master_sync_state *ms; 544 535 int id; 545 - int len; 536 + unsigned int len; 546 537 547 538 if (unlikely(cp->af != AF_INET)) 548 539 return; ··· 562 553 id = select_master_thread_id(ipvs, cp); 563 554 ms = &ipvs->ms[id]; 564 555 buff = ms->sync_buff; 556 + len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE : 557 + SIMPLE_CONN_SIZE; 565 558 if (buff) { 566 559 m = (struct ip_vs_sync_mesg_v0 *) buff->mesg; 567 560 /* Send buffer if it is for v1 */ 568 - if (!m->nr_conns) { 561 + if (buff->head + len > buff->end || !m->nr_conns) { 569 562 sb_queue_tail(ipvs, ms); 570 563 ms->sync_buff = NULL; 571 564 buff = NULL; 572 565 } 573 566 } 574 567 if (!buff) { 575 - buff = ip_vs_sync_buff_create_v0(ipvs); 568 + buff = ip_vs_sync_buff_create_v0(ipvs, len); 576 569 if (!buff) { 577 570 spin_unlock_bh(&ipvs->sync_buff_lock); 578 571 pr_err("ip_vs_sync_buff_create failed.\n"); ··· 583 572 ms->sync_buff = buff; 584 573 } 585 574 586 - len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE : 587 - SIMPLE_CONN_SIZE; 588 575 m = (struct ip_vs_sync_mesg_v0 *) buff->mesg; 589 576 s = (struct ip_vs_sync_conn_v0 *) buff->head; 590 577 ··· 606 597 m->nr_conns++; 607 598 m->size = htons(ntohs(m->size) + len); 608 599 buff->head += len; 609 - 610 - /* check if there is a space for next one */ 611 - if (buff->head + FULL_CONN_SIZE > buff->end) { 612 - sb_queue_tail(ipvs, ms); 613 - ms->sync_buff = NULL; 614 - } 615 600 spin_unlock_bh(&ipvs->sync_buff_lock); 616 601 617 602 /* synchronize its controller if it has */ ··· 697 694 } 698 695 699 696 if (!buff) { 700 - buff = ip_vs_sync_buff_create(ipvs); 697 + buff = ip_vs_sync_buff_create(ipvs, len); 701 698 if (!buff) { 702 699 spin_unlock_bh(&ipvs->sync_buff_lock); 703 700 pr_err("ip_vs_sync_buff_create failed.\n"); ··· 1222 1219 return; 1223 1220 } 1224 1221 /* SyncID sanity check */ 1225 - if (ipvs->backup_syncid != 0 && m2->syncid != ipvs->backup_syncid) { 1222 + if (ipvs->bcfg.syncid != 0 && m2->syncid != ipvs->bcfg.syncid) { 1226 1223 IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2->syncid); 1227 1224 return; 1228 1225 } ··· 1306 1303 /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */ 1307 1304 lock_sock(sk); 1308 1305 inet->mc_loop = loop ? 1 : 0; 1306 + #ifdef CONFIG_IP_VS_IPV6 1307 + if (sk->sk_family == AF_INET6) { 1308 + struct ipv6_pinfo *np = inet6_sk(sk); 1309 + 1310 + /* IPV6_MULTICAST_LOOP */ 1311 + np->mc_loop = loop ? 1 : 0; 1312 + } 1313 + #endif 1309 1314 release_sock(sk); 1310 1315 } 1311 1316 ··· 1327 1316 /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */ 1328 1317 lock_sock(sk); 1329 1318 inet->mc_ttl = ttl; 1319 + #ifdef CONFIG_IP_VS_IPV6 1320 + if (sk->sk_family == AF_INET6) { 1321 + struct ipv6_pinfo *np = inet6_sk(sk); 1322 + 1323 + /* IPV6_MULTICAST_HOPS */ 1324 + np->mcast_hops = ttl; 1325 + } 1326 + #endif 1327 + release_sock(sk); 1328 + } 1329 + 1330 + /* Control fragmentation of messages */ 1331 + static void set_mcast_pmtudisc(struct sock *sk, int val) 1332 + { 1333 + struct inet_sock *inet = inet_sk(sk); 1334 + 1335 + /* setsockopt(sock, SOL_IP, IP_MTU_DISCOVER, &val, sizeof(val)); */ 1336 + lock_sock(sk); 1337 + inet->pmtudisc = val; 1338 + #ifdef CONFIG_IP_VS_IPV6 1339 + if (sk->sk_family == AF_INET6) { 1340 + struct ipv6_pinfo *np = inet6_sk(sk); 1341 + 1342 + /* IPV6_MTU_DISCOVER */ 1343 + np->pmtudisc = val; 1344 + } 1345 + #endif 1330 1346 release_sock(sk); 1331 1347 } 1332 1348 ··· 1376 1338 lock_sock(sk); 1377 1339 inet->mc_index = dev->ifindex; 1378 1340 /* inet->mc_addr = 0; */ 1379 - release_sock(sk); 1341 + #ifdef CONFIG_IP_VS_IPV6 1342 + if (sk->sk_family == AF_INET6) { 1343 + struct ipv6_pinfo *np = inet6_sk(sk); 1380 1344 1381 - return 0; 1382 - } 1383 - 1384 - 1385 - /* 1386 - * Set the maximum length of sync message according to the 1387 - * specified interface's MTU. 1388 - */ 1389 - static int set_sync_mesg_maxlen(struct net *net, int sync_state) 1390 - { 1391 - struct netns_ipvs *ipvs = net_ipvs(net); 1392 - struct net_device *dev; 1393 - int num; 1394 - 1395 - if (sync_state == IP_VS_STATE_MASTER) { 1396 - dev = __dev_get_by_name(net, ipvs->master_mcast_ifn); 1397 - if (!dev) 1398 - return -ENODEV; 1399 - 1400 - num = (dev->mtu - sizeof(struct iphdr) - 1401 - sizeof(struct udphdr) - 1402 - SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE; 1403 - ipvs->send_mesg_maxlen = SYNC_MESG_HEADER_LEN + 1404 - SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF); 1405 - IP_VS_DBG(7, "setting the maximum length of sync sending " 1406 - "message %d.\n", ipvs->send_mesg_maxlen); 1407 - } else if (sync_state == IP_VS_STATE_BACKUP) { 1408 - dev = __dev_get_by_name(net, ipvs->backup_mcast_ifn); 1409 - if (!dev) 1410 - return -ENODEV; 1411 - 1412 - ipvs->recv_mesg_maxlen = dev->mtu - 1413 - sizeof(struct iphdr) - sizeof(struct udphdr); 1414 - IP_VS_DBG(7, "setting the maximum length of sync receiving " 1415 - "message %d.\n", ipvs->recv_mesg_maxlen); 1345 + /* IPV6_MULTICAST_IF */ 1346 + np->mcast_oif = dev->ifindex; 1416 1347 } 1348 + #endif 1349 + release_sock(sk); 1417 1350 1418 1351 return 0; 1419 1352 } ··· 1414 1405 1415 1406 mreq.imr_ifindex = dev->ifindex; 1416 1407 1417 - rtnl_lock(); 1418 1408 lock_sock(sk); 1419 1409 ret = ip_mc_join_group(sk, &mreq); 1420 1410 release_sock(sk); 1421 - rtnl_unlock(); 1422 1411 1423 1412 return ret; 1424 1413 } 1425 1414 1415 + #ifdef CONFIG_IP_VS_IPV6 1416 + static int join_mcast_group6(struct sock *sk, struct in6_addr *addr, 1417 + char *ifname) 1418 + { 1419 + struct net *net = sock_net(sk); 1420 + struct net_device *dev; 1421 + int ret; 1422 + 1423 + dev = __dev_get_by_name(net, ifname); 1424 + if (!dev) 1425 + return -ENODEV; 1426 + if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) 1427 + return -EINVAL; 1428 + 1429 + lock_sock(sk); 1430 + ret = ipv6_sock_mc_join(sk, dev->ifindex, addr); 1431 + release_sock(sk); 1432 + 1433 + return ret; 1434 + } 1435 + #endif 1426 1436 1427 1437 static int bind_mcastif_addr(struct socket *sock, char *ifname) 1428 1438 { ··· 1470 1442 return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin)); 1471 1443 } 1472 1444 1445 + static void get_mcast_sockaddr(union ipvs_sockaddr *sa, int *salen, 1446 + struct ipvs_sync_daemon_cfg *c, int id) 1447 + { 1448 + if (AF_INET6 == c->mcast_af) { 1449 + sa->in6 = (struct sockaddr_in6) { 1450 + .sin6_family = AF_INET6, 1451 + .sin6_port = htons(c->mcast_port + id), 1452 + }; 1453 + sa->in6.sin6_addr = c->mcast_group.in6; 1454 + *salen = sizeof(sa->in6); 1455 + } else { 1456 + sa->in = (struct sockaddr_in) { 1457 + .sin_family = AF_INET, 1458 + .sin_port = htons(c->mcast_port + id), 1459 + }; 1460 + sa->in.sin_addr = c->mcast_group.in; 1461 + *salen = sizeof(sa->in); 1462 + } 1463 + } 1464 + 1473 1465 /* 1474 1466 * Set up sending multicast socket over UDP 1475 1467 */ ··· 1497 1449 { 1498 1450 struct netns_ipvs *ipvs = net_ipvs(net); 1499 1451 /* multicast addr */ 1500 - struct sockaddr_in mcast_addr = { 1501 - .sin_family = AF_INET, 1502 - .sin_port = cpu_to_be16(IP_VS_SYNC_PORT + id), 1503 - .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP), 1504 - }; 1452 + union ipvs_sockaddr mcast_addr; 1505 1453 struct socket *sock; 1506 - int result; 1454 + int result, salen; 1507 1455 1508 1456 /* First create a socket */ 1509 - result = sock_create_kern(net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); 1457 + result = sock_create_kern(net, ipvs->mcfg.mcast_af, SOCK_DGRAM, 1458 + IPPROTO_UDP, &sock); 1510 1459 if (result < 0) { 1511 1460 pr_err("Error during creation of socket; terminating\n"); 1512 1461 return ERR_PTR(result); 1513 1462 } 1514 - result = set_mcast_if(sock->sk, ipvs->master_mcast_ifn); 1463 + result = set_mcast_if(sock->sk, ipvs->mcfg.mcast_ifn); 1515 1464 if (result < 0) { 1516 1465 pr_err("Error setting outbound mcast interface\n"); 1517 1466 goto error; 1518 1467 } 1519 1468 1520 1469 set_mcast_loop(sock->sk, 0); 1521 - set_mcast_ttl(sock->sk, 1); 1470 + set_mcast_ttl(sock->sk, ipvs->mcfg.mcast_ttl); 1471 + /* Allow fragmentation if MTU changes */ 1472 + set_mcast_pmtudisc(sock->sk, IP_PMTUDISC_DONT); 1522 1473 result = sysctl_sync_sock_size(ipvs); 1523 1474 if (result > 0) 1524 1475 set_sock_size(sock->sk, 1, result); 1525 1476 1526 - result = bind_mcastif_addr(sock, ipvs->master_mcast_ifn); 1477 + if (AF_INET == ipvs->mcfg.mcast_af) 1478 + result = bind_mcastif_addr(sock, ipvs->mcfg.mcast_ifn); 1479 + else 1480 + result = 0; 1527 1481 if (result < 0) { 1528 1482 pr_err("Error binding address of the mcast interface\n"); 1529 1483 goto error; 1530 1484 } 1531 1485 1486 + get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->mcfg, id); 1532 1487 result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr, 1533 - sizeof(struct sockaddr), 0); 1488 + salen, 0); 1534 1489 if (result < 0) { 1535 1490 pr_err("Error connecting to the multicast addr\n"); 1536 1491 goto error; ··· 1554 1503 { 1555 1504 struct netns_ipvs *ipvs = net_ipvs(net); 1556 1505 /* multicast addr */ 1557 - struct sockaddr_in mcast_addr = { 1558 - .sin_family = AF_INET, 1559 - .sin_port = cpu_to_be16(IP_VS_SYNC_PORT + id), 1560 - .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP), 1561 - }; 1506 + union ipvs_sockaddr mcast_addr; 1562 1507 struct socket *sock; 1563 - int result; 1508 + int result, salen; 1564 1509 1565 1510 /* First create a socket */ 1566 - result = sock_create_kern(net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); 1511 + result = sock_create_kern(net, ipvs->bcfg.mcast_af, SOCK_DGRAM, 1512 + IPPROTO_UDP, &sock); 1567 1513 if (result < 0) { 1568 1514 pr_err("Error during creation of socket; terminating\n"); 1569 1515 return ERR_PTR(result); ··· 1571 1523 if (result > 0) 1572 1524 set_sock_size(sock->sk, 0, result); 1573 1525 1574 - result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr, 1575 - sizeof(struct sockaddr)); 1526 + get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->bcfg, id); 1527 + result = sock->ops->bind(sock, (struct sockaddr *)&mcast_addr, salen); 1576 1528 if (result < 0) { 1577 1529 pr_err("Error binding to the multicast addr\n"); 1578 1530 goto error; 1579 1531 } 1580 1532 1581 1533 /* join the multicast group */ 1582 - result = join_mcast_group(sock->sk, 1583 - (struct in_addr *) &mcast_addr.sin_addr, 1584 - ipvs->backup_mcast_ifn); 1534 + #ifdef CONFIG_IP_VS_IPV6 1535 + if (ipvs->bcfg.mcast_af == AF_INET6) 1536 + result = join_mcast_group6(sock->sk, &mcast_addr.in6.sin6_addr, 1537 + ipvs->bcfg.mcast_ifn); 1538 + else 1539 + #endif 1540 + result = join_mcast_group(sock->sk, &mcast_addr.in.sin_addr, 1541 + ipvs->bcfg.mcast_ifn); 1585 1542 if (result < 0) { 1586 1543 pr_err("Error joining to the multicast group\n"); 1587 1544 goto error; ··· 1694 1641 1695 1642 pr_info("sync thread started: state = MASTER, mcast_ifn = %s, " 1696 1643 "syncid = %d, id = %d\n", 1697 - ipvs->master_mcast_ifn, ipvs->master_syncid, tinfo->id); 1644 + ipvs->mcfg.mcast_ifn, ipvs->mcfg.syncid, tinfo->id); 1698 1645 1699 1646 for (;;) { 1700 1647 sb = next_sync_buff(ipvs, ms); ··· 1748 1695 1749 1696 pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, " 1750 1697 "syncid = %d, id = %d\n", 1751 - ipvs->backup_mcast_ifn, ipvs->backup_syncid, tinfo->id); 1698 + ipvs->bcfg.mcast_ifn, ipvs->bcfg.syncid, tinfo->id); 1752 1699 1753 1700 while (!kthread_should_stop()) { 1754 1701 wait_event_interruptible(*sk_sleep(tinfo->sock->sk), ··· 1758 1705 /* do we have data now? */ 1759 1706 while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) { 1760 1707 len = ip_vs_receive(tinfo->sock, tinfo->buf, 1761 - ipvs->recv_mesg_maxlen); 1708 + ipvs->bcfg.sync_maxlen); 1762 1709 if (len <= 0) { 1763 1710 if (len != -EAGAIN) 1764 1711 pr_err("receiving message error\n"); ··· 1778 1725 } 1779 1726 1780 1727 1781 - int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid) 1728 + int start_sync_thread(struct net *net, struct ipvs_sync_daemon_cfg *c, 1729 + int state) 1782 1730 { 1783 1731 struct ip_vs_sync_thread_data *tinfo; 1784 1732 struct task_struct **array = NULL, *task; 1785 1733 struct socket *sock; 1786 1734 struct netns_ipvs *ipvs = net_ipvs(net); 1735 + struct net_device *dev; 1787 1736 char *name; 1788 1737 int (*threadfn)(void *data); 1789 - int id, count; 1738 + int id, count, hlen; 1790 1739 int result = -ENOMEM; 1740 + u16 mtu, min_mtu; 1791 1741 1792 1742 IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); 1793 1743 IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n", ··· 1802 1746 } else 1803 1747 count = ipvs->threads_mask + 1; 1804 1748 1749 + if (c->mcast_af == AF_UNSPEC) { 1750 + c->mcast_af = AF_INET; 1751 + c->mcast_group.ip = cpu_to_be32(IP_VS_SYNC_GROUP); 1752 + } 1753 + if (!c->mcast_port) 1754 + c->mcast_port = IP_VS_SYNC_PORT; 1755 + if (!c->mcast_ttl) 1756 + c->mcast_ttl = 1; 1757 + 1758 + dev = __dev_get_by_name(net, c->mcast_ifn); 1759 + if (!dev) { 1760 + pr_err("Unknown mcast interface: %s\n", c->mcast_ifn); 1761 + return -ENODEV; 1762 + } 1763 + hlen = (AF_INET6 == c->mcast_af) ? 1764 + sizeof(struct ipv6hdr) + sizeof(struct udphdr) : 1765 + sizeof(struct iphdr) + sizeof(struct udphdr); 1766 + mtu = (state == IP_VS_STATE_BACKUP) ? 1767 + clamp(dev->mtu, 1500U, 65535U) : 1500U; 1768 + min_mtu = (state == IP_VS_STATE_BACKUP) ? 1024 : 1; 1769 + 1770 + if (c->sync_maxlen) 1771 + c->sync_maxlen = clamp_t(unsigned int, 1772 + c->sync_maxlen, min_mtu, 1773 + 65535 - hlen); 1774 + else 1775 + c->sync_maxlen = mtu - hlen; 1776 + 1805 1777 if (state == IP_VS_STATE_MASTER) { 1806 1778 if (ipvs->ms) 1807 1779 return -EEXIST; 1808 1780 1809 - strlcpy(ipvs->master_mcast_ifn, mcast_ifn, 1810 - sizeof(ipvs->master_mcast_ifn)); 1811 - ipvs->master_syncid = syncid; 1781 + ipvs->mcfg = *c; 1812 1782 name = "ipvs-m:%d:%d"; 1813 1783 threadfn = sync_thread_master; 1814 1784 } else if (state == IP_VS_STATE_BACKUP) { 1815 1785 if (ipvs->backup_threads) 1816 1786 return -EEXIST; 1817 1787 1818 - strlcpy(ipvs->backup_mcast_ifn, mcast_ifn, 1819 - sizeof(ipvs->backup_mcast_ifn)); 1820 - ipvs->backup_syncid = syncid; 1788 + ipvs->bcfg = *c; 1821 1789 name = "ipvs-b:%d:%d"; 1822 1790 threadfn = sync_thread_backup; 1823 1791 } else { ··· 1869 1789 if (!array) 1870 1790 goto out; 1871 1791 } 1872 - set_sync_mesg_maxlen(net, state); 1873 1792 1874 1793 tinfo = NULL; 1875 1794 for (id = 0; id < count; id++) { ··· 1886 1807 tinfo->net = net; 1887 1808 tinfo->sock = sock; 1888 1809 if (state == IP_VS_STATE_BACKUP) { 1889 - tinfo->buf = kmalloc(ipvs->recv_mesg_maxlen, 1810 + tinfo->buf = kmalloc(ipvs->bcfg.sync_maxlen, 1890 1811 GFP_KERNEL); 1891 1812 if (!tinfo->buf) 1892 1813 goto outtinfo;