net/ipv4/ipvs/ip_vs_core.c at v2.6.24-rc6

tjh.dev / kernel
fork
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork
kernel / net / ipv4 / ipvs / ip_vs_core.c
at v2.6.24-rc6 1152 lines 30 kB view raw
wrap content
   1/*
   2 * IPVS         An implementation of the IP virtual server support for the
   3 *              LINUX operating system.  IPVS is now implemented as a module
   4 *              over the Netfilter framework. IPVS can be used to build a
   5 *              high-performance and highly available server based on a
   6 *              cluster of servers.
   7 *
   8 * Version:     $Id: ip_vs_core.c,v 1.34 2003/05/10 03:05:23 wensong Exp $
   9 *
  10 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
  11 *              Peter Kese <peter.kese@ijs.si>
  12 *              Julian Anastasov <ja@ssi.bg>
  13 *
  14 *              This program is free software; you can redistribute it and/or
  15 *              modify it under the terms of the GNU General Public License
  16 *              as published by the Free Software Foundation; either version
  17 *              2 of the License, or (at your option) any later version.
  18 *
  19 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
  20 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
  21 * and others.
  22 *
  23 * Changes:
  24 *	Paul `Rusty' Russell		properly handle non-linear skbs
  25 *	Harald Welte			don't use nfcache
  26 *
  27 */
  28
  29#include <linux/module.h>
  30#include <linux/kernel.h>
  31#include <linux/ip.h>
  32#include <linux/tcp.h>
  33#include <linux/icmp.h>
  34
  35#include <net/ip.h>
  36#include <net/tcp.h>
  37#include <net/udp.h>
  38#include <net/icmp.h>                   /* for icmp_send */
  39#include <net/route.h>
  40
  41#include <linux/netfilter.h>
  42#include <linux/netfilter_ipv4.h>
  43
  44#include <net/ip_vs.h>
  45
  46
/*
 *	Symbols exported to other kernel modules.
 *	ip_vs_tcp_conn_listen is exported only when TCP protocol support
 *	(CONFIG_IP_VS_PROTO_TCP) is configured, and ip_vs_get_debug_level
 *	only when CONFIG_IP_VS_DEBUG is configured.
 */
EXPORT_SYMBOL(register_ip_vs_scheduler);
EXPORT_SYMBOL(unregister_ip_vs_scheduler);
EXPORT_SYMBOL(ip_vs_skb_replace);
EXPORT_SYMBOL(ip_vs_proto_name);
EXPORT_SYMBOL(ip_vs_conn_new);
EXPORT_SYMBOL(ip_vs_conn_in_get);
EXPORT_SYMBOL(ip_vs_conn_out_get);
#ifdef CONFIG_IP_VS_PROTO_TCP
EXPORT_SYMBOL(ip_vs_tcp_conn_listen);
#endif
EXPORT_SYMBOL(ip_vs_conn_put);
#ifdef CONFIG_IP_VS_DEBUG
EXPORT_SYMBOL(ip_vs_get_debug_level);
#endif
  61
  62
/*
 * ID used in ICMP lookups: the echo identifier field of an ICMP header.
 * In this file it is only read for the ICMP debug messages.
 */
#define icmp_id(icmph)          (((icmph)->un).echo.id)
  65
  66const char *ip_vs_proto_name(unsigned proto)
  67{
  68	static char buf[20];
  69
  70	switch (proto) {
  71	case IPPROTO_IP:
  72		return "IP";
  73	case IPPROTO_UDP:
  74		return "UDP";
  75	case IPPROTO_TCP:
  76		return "TCP";
  77	case IPPROTO_ICMP:
  78		return "ICMP";
  79	default:
  80		sprintf(buf, "IP_%d", proto);
  81		return buf;
  82	}
  83}
  84
  85void ip_vs_init_hash_table(struct list_head *table, int rows)
  86{
  87	while (--rows >= 0)
  88		INIT_LIST_HEAD(&table[rows]);
  89}
  90
  91static inline void
  92ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
  93{
  94	struct ip_vs_dest *dest = cp->dest;
  95	if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
  96		spin_lock(&dest->stats.lock);
  97		dest->stats.inpkts++;
  98		dest->stats.inbytes += skb->len;
  99		spin_unlock(&dest->stats.lock);
 100
 101		spin_lock(&dest->svc->stats.lock);
 102		dest->svc->stats.inpkts++;
 103		dest->svc->stats.inbytes += skb->len;
 104		spin_unlock(&dest->svc->stats.lock);
 105
 106		spin_lock(&ip_vs_stats.lock);
 107		ip_vs_stats.inpkts++;
 108		ip_vs_stats.inbytes += skb->len;
 109		spin_unlock(&ip_vs_stats.lock);
 110	}
 111}
 112
 113
 114static inline void
 115ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
 116{
 117	struct ip_vs_dest *dest = cp->dest;
 118	if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
 119		spin_lock(&dest->stats.lock);
 120		dest->stats.outpkts++;
 121		dest->stats.outbytes += skb->len;
 122		spin_unlock(&dest->stats.lock);
 123
 124		spin_lock(&dest->svc->stats.lock);
 125		dest->svc->stats.outpkts++;
 126		dest->svc->stats.outbytes += skb->len;
 127		spin_unlock(&dest->svc->stats.lock);
 128
 129		spin_lock(&ip_vs_stats.lock);
 130		ip_vs_stats.outpkts++;
 131		ip_vs_stats.outbytes += skb->len;
 132		spin_unlock(&ip_vs_stats.lock);
 133	}
 134}
 135
 136
 137static inline void
 138ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
 139{
 140	spin_lock(&cp->dest->stats.lock);
 141	cp->dest->stats.conns++;
 142	spin_unlock(&cp->dest->stats.lock);
 143
 144	spin_lock(&svc->stats.lock);
 145	svc->stats.conns++;
 146	spin_unlock(&svc->stats.lock);
 147
 148	spin_lock(&ip_vs_stats.lock);
 149	ip_vs_stats.conns++;
 150	spin_unlock(&ip_vs_stats.lock);
 151}
 152
 153
 154static inline int
 155ip_vs_set_state(struct ip_vs_conn *cp, int direction,
 156		const struct sk_buff *skb,
 157		struct ip_vs_protocol *pp)
 158{
 159	if (unlikely(!pp->state_transition))
 160		return 0;
 161	return pp->state_transition(cp, direction, skb, pp);
 162}
 163
 164
 165/*
 166 *  IPVS persistent scheduling function
 167 *  It creates a connection entry according to its template if exists,
 168 *  or selects a server and creates a connection entry plus a template.
 169 *  Locking: we are svc user (svc->refcnt), so we hold all dests too
 170 *  Protocols supported: TCP, UDP
 171 */
 172static struct ip_vs_conn *
 173ip_vs_sched_persist(struct ip_vs_service *svc,
 174		    const struct sk_buff *skb,
 175		    __be16 ports[2])
 176{
 177	struct ip_vs_conn *cp = NULL;
 178	struct iphdr *iph = ip_hdr(skb);
 179	struct ip_vs_dest *dest;
 180	struct ip_vs_conn *ct;
 181	__be16  dport;	 /* destination port to forward */
 182	__be32  snet;	 /* source network of the client, after masking */
 183
 184	/* Mask saddr with the netmask to adjust template granularity */
 185	snet = iph->saddr & svc->netmask;
 186
 187	IP_VS_DBG(6, "p-schedule: src %u.%u.%u.%u:%u dest %u.%u.%u.%u:%u "
 188		  "mnet %u.%u.%u.%u\n",
 189		  NIPQUAD(iph->saddr), ntohs(ports[0]),
 190		  NIPQUAD(iph->daddr), ntohs(ports[1]),
 191		  NIPQUAD(snet));
 192
 193	/*
 194	 * As far as we know, FTP is a very complicated network protocol, and
 195	 * it uses control connection and data connections. For active FTP,
 196	 * FTP server initialize data connection to the client, its source port
 197	 * is often 20. For passive FTP, FTP server tells the clients the port
 198	 * that it passively listens to,  and the client issues the data
 199	 * connection. In the tunneling or direct routing mode, the load
 200	 * balancer is on the client-to-server half of connection, the port
 201	 * number is unknown to the load balancer. So, a conn template like
 202	 * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP
 203	 * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
 204	 * is created for other persistent services.
 205	 */
 206	if (ports[1] == svc->port) {
 207		/* Check if a template already exists */
 208		if (svc->port != FTPPORT)
 209			ct = ip_vs_ct_in_get(iph->protocol, snet, 0,
 210					       iph->daddr, ports[1]);
 211		else
 212			ct = ip_vs_ct_in_get(iph->protocol, snet, 0,
 213					       iph->daddr, 0);
 214
 215		if (!ct || !ip_vs_check_template(ct)) {
 216			/*
 217			 * No template found or the dest of the connection
 218			 * template is not available.
 219			 */
 220			dest = svc->scheduler->schedule(svc, skb);
 221			if (dest == NULL) {
 222				IP_VS_DBG(1, "p-schedule: no dest found.\n");
 223				return NULL;
 224			}
 225
 226			/*
 227			 * Create a template like <protocol,caddr,0,
 228			 * vaddr,vport,daddr,dport> for non-ftp service,
 229			 * and <protocol,caddr,0,vaddr,0,daddr,0>
 230			 * for ftp service.
 231			 */
 232			if (svc->port != FTPPORT)
 233				ct = ip_vs_conn_new(iph->protocol,
 234						    snet, 0,
 235						    iph->daddr,
 236						    ports[1],
 237						    dest->addr, dest->port,
 238						    IP_VS_CONN_F_TEMPLATE,
 239						    dest);
 240			else
 241				ct = ip_vs_conn_new(iph->protocol,
 242						    snet, 0,
 243						    iph->daddr, 0,
 244						    dest->addr, 0,
 245						    IP_VS_CONN_F_TEMPLATE,
 246						    dest);
 247			if (ct == NULL)
 248				return NULL;
 249
 250			ct->timeout = svc->timeout;
 251		} else {
 252			/* set destination with the found template */
 253			dest = ct->dest;
 254		}
 255		dport = dest->port;
 256	} else {
 257		/*
 258		 * Note: persistent fwmark-based services and persistent
 259		 * port zero service are handled here.
 260		 * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
 261		 * port zero template: <protocol,caddr,0,vaddr,0,daddr,0>
 262		 */
 263		if (svc->fwmark)
 264			ct = ip_vs_ct_in_get(IPPROTO_IP, snet, 0,
 265					       htonl(svc->fwmark), 0);
 266		else
 267			ct = ip_vs_ct_in_get(iph->protocol, snet, 0,
 268					       iph->daddr, 0);
 269
 270		if (!ct || !ip_vs_check_template(ct)) {
 271			/*
 272			 * If it is not persistent port zero, return NULL,
 273			 * otherwise create a connection template.
 274			 */
 275			if (svc->port)
 276				return NULL;
 277
 278			dest = svc->scheduler->schedule(svc, skb);
 279			if (dest == NULL) {
 280				IP_VS_DBG(1, "p-schedule: no dest found.\n");
 281				return NULL;
 282			}
 283
 284			/*
 285			 * Create a template according to the service
 286			 */
 287			if (svc->fwmark)
 288				ct = ip_vs_conn_new(IPPROTO_IP,
 289						    snet, 0,
 290						    htonl(svc->fwmark), 0,
 291						    dest->addr, 0,
 292						    IP_VS_CONN_F_TEMPLATE,
 293						    dest);
 294			else
 295				ct = ip_vs_conn_new(iph->protocol,
 296						    snet, 0,
 297						    iph->daddr, 0,
 298						    dest->addr, 0,
 299						    IP_VS_CONN_F_TEMPLATE,
 300						    dest);
 301			if (ct == NULL)
 302				return NULL;
 303
 304			ct->timeout = svc->timeout;
 305		} else {
 306			/* set destination with the found template */
 307			dest = ct->dest;
 308		}
 309		dport = ports[1];
 310	}
 311
 312	/*
 313	 *    Create a new connection according to the template
 314	 */
 315	cp = ip_vs_conn_new(iph->protocol,
 316			    iph->saddr, ports[0],
 317			    iph->daddr, ports[1],
 318			    dest->addr, dport,
 319			    0,
 320			    dest);
 321	if (cp == NULL) {
 322		ip_vs_conn_put(ct);
 323		return NULL;
 324	}
 325
 326	/*
 327	 *    Add its control
 328	 */
 329	ip_vs_control_add(cp, ct);
 330	ip_vs_conn_put(ct);
 331
 332	ip_vs_conn_stats(cp, svc);
 333	return cp;
 334}
 335
 336
 337/*
 338 *  IPVS main scheduling function
 339 *  It selects a server according to the virtual service, and
 340 *  creates a connection entry.
 341 *  Protocols supported: TCP, UDP
 342 */
 343struct ip_vs_conn *
 344ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 345{
 346	struct ip_vs_conn *cp = NULL;
 347	struct iphdr *iph = ip_hdr(skb);
 348	struct ip_vs_dest *dest;
 349	__be16 _ports[2], *pptr;
 350
 351	pptr = skb_header_pointer(skb, iph->ihl*4,
 352				  sizeof(_ports), _ports);
 353	if (pptr == NULL)
 354		return NULL;
 355
 356	/*
 357	 *    Persistent service
 358	 */
 359	if (svc->flags & IP_VS_SVC_F_PERSISTENT)
 360		return ip_vs_sched_persist(svc, skb, pptr);
 361
 362	/*
 363	 *    Non-persistent service
 364	 */
 365	if (!svc->fwmark && pptr[1] != svc->port) {
 366		if (!svc->port)
 367			IP_VS_ERR("Schedule: port zero only supported "
 368				  "in persistent services, "
 369				  "check your ipvs configuration\n");
 370		return NULL;
 371	}
 372
 373	dest = svc->scheduler->schedule(svc, skb);
 374	if (dest == NULL) {
 375		IP_VS_DBG(1, "Schedule: no dest found.\n");
 376		return NULL;
 377	}
 378
 379	/*
 380	 *    Create a connection entry.
 381	 */
 382	cp = ip_vs_conn_new(iph->protocol,
 383			    iph->saddr, pptr[0],
 384			    iph->daddr, pptr[1],
 385			    dest->addr, dest->port?dest->port:pptr[1],
 386			    0,
 387			    dest);
 388	if (cp == NULL)
 389		return NULL;
 390
 391	IP_VS_DBG(6, "Schedule fwd:%c c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u "
 392		  "d:%u.%u.%u.%u:%u conn->flags:%X conn->refcnt:%d\n",
 393		  ip_vs_fwd_tag(cp),
 394		  NIPQUAD(cp->caddr), ntohs(cp->cport),
 395		  NIPQUAD(cp->vaddr), ntohs(cp->vport),
 396		  NIPQUAD(cp->daddr), ntohs(cp->dport),
 397		  cp->flags, atomic_read(&cp->refcnt));
 398
 399	ip_vs_conn_stats(cp, svc);
 400	return cp;
 401}
 402
 403
 404/*
 405 *  Pass or drop the packet.
 406 *  Called by ip_vs_in, when the virtual service is available but
 407 *  no destination is available for a new connection.
 408 */
 409int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
 410		struct ip_vs_protocol *pp)
 411{
 412	__be16 _ports[2], *pptr;
 413	struct iphdr *iph = ip_hdr(skb);
 414
 415	pptr = skb_header_pointer(skb, iph->ihl*4,
 416				  sizeof(_ports), _ports);
 417	if (pptr == NULL) {
 418		ip_vs_service_put(svc);
 419		return NF_DROP;
 420	}
 421
 422	/* if it is fwmark-based service, the cache_bypass sysctl is up
 423	   and the destination is RTN_UNICAST (and not local), then create
 424	   a cache_bypass connection entry */
 425	if (sysctl_ip_vs_cache_bypass && svc->fwmark
 426	    && (inet_addr_type(iph->daddr) == RTN_UNICAST)) {
 427		int ret, cs;
 428		struct ip_vs_conn *cp;
 429
 430		ip_vs_service_put(svc);
 431
 432		/* create a new connection entry */
 433		IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n");
 434		cp = ip_vs_conn_new(iph->protocol,
 435				    iph->saddr, pptr[0],
 436				    iph->daddr, pptr[1],
 437				    0, 0,
 438				    IP_VS_CONN_F_BYPASS,
 439				    NULL);
 440		if (cp == NULL)
 441			return NF_DROP;
 442
 443		/* statistics */
 444		ip_vs_in_stats(cp, skb);
 445
 446		/* set state */
 447		cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
 448
 449		/* transmit the first SYN packet */
 450		ret = cp->packet_xmit(skb, cp, pp);
 451		/* do not touch skb anymore */
 452
 453		atomic_inc(&cp->in_pkts);
 454		ip_vs_conn_put(cp);
 455		return ret;
 456	}
 457
 458	/*
 459	 * When the virtual ftp service is presented, packets destined
 460	 * for other services on the VIP may get here (except services
 461	 * listed in the ipvs table), pass the packets, because it is
 462	 * not ipvs job to decide to drop the packets.
 463	 */
 464	if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) {
 465		ip_vs_service_put(svc);
 466		return NF_ACCEPT;
 467	}
 468
 469	ip_vs_service_put(svc);
 470
 471	/*
 472	 * Notify the client that the destination is unreachable, and
 473	 * release the socket buffer.
 474	 * Since it is in IP layer, the TCP socket is not actually
 475	 * created, the TCP RST packet cannot be sent, instead that
 476	 * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
 477	 */
 478	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
 479	return NF_DROP;
 480}
 481
 482
 483/*
 484 *      It is hooked before NF_IP_PRI_NAT_SRC at the NF_IP_POST_ROUTING
 485 *      chain, and is used for VS/NAT.
 486 *      It detects packets for VS/NAT connections and sends the packets
 487 *      immediately. This can avoid that iptable_nat mangles the packets
 488 *      for VS/NAT.
 489 */
 490static unsigned int ip_vs_post_routing(unsigned int hooknum,
 491				       struct sk_buff *skb,
 492				       const struct net_device *in,
 493				       const struct net_device *out,
 494				       int (*okfn)(struct sk_buff *))
 495{
 496	if (!skb->ipvs_property)
 497		return NF_ACCEPT;
 498	/* The packet was sent from IPVS, exit this chain */
 499	return NF_STOP;
 500}
 501
 502__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
 503{
 504	return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
 505}
 506
 507static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
 508{
 509	int err = ip_defrag(skb, user);
 510
 511	if (!err)
 512		ip_send_check(ip_hdr(skb));
 513
 514	return err;
 515}
 516
 517/*
 518 * Packet has been made sufficiently writable in caller
 519 * - inout: 1=in->out, 0=out->in
 520 */
 521void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
 522		    struct ip_vs_conn *cp, int inout)
 523{
 524	struct iphdr *iph	 = ip_hdr(skb);
 525	unsigned int icmp_offset = iph->ihl*4;
 526	struct icmphdr *icmph	 = (struct icmphdr *)(skb_network_header(skb) +
 527						      icmp_offset);
 528	struct iphdr *ciph	 = (struct iphdr *)(icmph + 1);
 529
 530	if (inout) {
 531		iph->saddr = cp->vaddr;
 532		ip_send_check(iph);
 533		ciph->daddr = cp->vaddr;
 534		ip_send_check(ciph);
 535	} else {
 536		iph->daddr = cp->daddr;
 537		ip_send_check(iph);
 538		ciph->saddr = cp->daddr;
 539		ip_send_check(ciph);
 540	}
 541
 542	/* the TCP/UDP port */
 543	if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol) {
 544		__be16 *ports = (void *)ciph + ciph->ihl*4;
 545
 546		if (inout)
 547			ports[1] = cp->vport;
 548		else
 549			ports[0] = cp->dport;
 550	}
 551
 552	/* And finally the ICMP checksum */
 553	icmph->checksum = 0;
 554	icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset);
 555	skb->ip_summed = CHECKSUM_UNNECESSARY;
 556
 557	if (inout)
 558		IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
 559			"Forwarding altered outgoing ICMP");
 560	else
 561		IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
 562			"Forwarding altered incoming ICMP");
 563}
 564
 565/*
 566 *	Handle ICMP messages in the inside-to-outside direction (outgoing).
 567 *	Find any that might be relevant, check against existing connections,
 568 *	forward to the right destination host if relevant.
 569 *	Currently handles error types - unreachable, quench, ttl exceeded.
 570 *	(Only used in VS/NAT)
 571 */
 572static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
 573{
 574	struct iphdr *iph;
 575	struct icmphdr	_icmph, *ic;
 576	struct iphdr	_ciph, *cih;	/* The ip header contained within the ICMP */
 577	struct ip_vs_conn *cp;
 578	struct ip_vs_protocol *pp;
 579	unsigned int offset, ihl, verdict;
 580
 581	*related = 1;
 582
 583	/* reassemble IP fragments */
 584	if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
 585		if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
 586			return NF_STOLEN;
 587	}
 588
 589	iph = ip_hdr(skb);
 590	offset = ihl = iph->ihl * 4;
 591	ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
 592	if (ic == NULL)
 593		return NF_DROP;
 594
 595	IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
 596		  ic->type, ntohs(icmp_id(ic)),
 597		  NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
 598
 599	/*
 600	 * Work through seeing if this is for us.
 601	 * These checks are supposed to be in an order that means easy
 602	 * things are checked first to speed up processing.... however
 603	 * this means that some packets will manage to get a long way
 604	 * down this stack and then be rejected, but that's life.
 605	 */
 606	if ((ic->type != ICMP_DEST_UNREACH) &&
 607	    (ic->type != ICMP_SOURCE_QUENCH) &&
 608	    (ic->type != ICMP_TIME_EXCEEDED)) {
 609		*related = 0;
 610		return NF_ACCEPT;
 611	}
 612
 613	/* Now find the contained IP header */
 614	offset += sizeof(_icmph);
 615	cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
 616	if (cih == NULL)
 617		return NF_ACCEPT; /* The packet looks wrong, ignore */
 618
 619	pp = ip_vs_proto_get(cih->protocol);
 620	if (!pp)
 621		return NF_ACCEPT;
 622
 623	/* Is the embedded protocol header present? */
 624	if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
 625		     pp->dont_defrag))
 626		return NF_ACCEPT;
 627
 628	IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for");
 629
 630	offset += cih->ihl * 4;
 631
 632	/* The embedded headers contain source and dest in reverse order */
 633	cp = pp->conn_out_get(skb, pp, cih, offset, 1);
 634	if (!cp)
 635		return NF_ACCEPT;
 636
 637	verdict = NF_DROP;
 638
 639	if (IP_VS_FWD_METHOD(cp) != 0) {
 640		IP_VS_ERR("shouldn't reach here, because the box is on the "
 641			  "half connection in the tun/dr module.\n");
 642	}
 643
 644	/* Ensure the checksum is correct */
 645	if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
 646		/* Failed checksum! */
 647		IP_VS_DBG(1, "Forward ICMP: failed checksum from %d.%d.%d.%d!\n",
 648			  NIPQUAD(iph->saddr));
 649		goto out;
 650	}
 651
 652	if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
 653		offset += 2 * sizeof(__u16);
 654	if (!skb_make_writable(skb, offset))
 655		goto out;
 656
 657	ip_vs_nat_icmp(skb, pp, cp, 1);
 658
 659	/* do the statistics and put it back */
 660	ip_vs_out_stats(cp, skb);
 661
 662	skb->ipvs_property = 1;
 663	verdict = NF_ACCEPT;
 664
 665  out:
 666	__ip_vs_conn_put(cp);
 667
 668	return verdict;
 669}
 670
 671static inline int is_tcp_reset(const struct sk_buff *skb)
 672{
 673	struct tcphdr _tcph, *th;
 674
 675	th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph);
 676	if (th == NULL)
 677		return 0;
 678	return th->rst;
 679}
 680
 681/*
 682 *	It is hooked at the NF_IP_FORWARD chain, used only for VS/NAT.
 683 *	Check if outgoing packet belongs to the established ip_vs_conn,
 684 *      rewrite addresses of the packet and send it on its way...
 685 */
 686static unsigned int
 687ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
 688	  const struct net_device *in, const struct net_device *out,
 689	  int (*okfn)(struct sk_buff *))
 690{
 691	struct iphdr	*iph;
 692	struct ip_vs_protocol *pp;
 693	struct ip_vs_conn *cp;
 694	int ihl;
 695
 696	EnterFunction(11);
 697
 698	if (skb->ipvs_property)
 699		return NF_ACCEPT;
 700
 701	iph = ip_hdr(skb);
 702	if (unlikely(iph->protocol == IPPROTO_ICMP)) {
 703		int related, verdict = ip_vs_out_icmp(skb, &related);
 704
 705		if (related)
 706			return verdict;
 707		iph = ip_hdr(skb);
 708	}
 709
 710	pp = ip_vs_proto_get(iph->protocol);
 711	if (unlikely(!pp))
 712		return NF_ACCEPT;
 713
 714	/* reassemble IP fragments */
 715	if (unlikely(iph->frag_off & htons(IP_MF|IP_OFFSET) &&
 716		     !pp->dont_defrag)) {
 717		if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
 718			return NF_STOLEN;
 719		iph = ip_hdr(skb);
 720	}
 721
 722	ihl = iph->ihl << 2;
 723
 724	/*
 725	 * Check if the packet belongs to an existing entry
 726	 */
 727	cp = pp->conn_out_get(skb, pp, iph, ihl, 0);
 728
 729	if (unlikely(!cp)) {
 730		if (sysctl_ip_vs_nat_icmp_send &&
 731		    (pp->protocol == IPPROTO_TCP ||
 732		     pp->protocol == IPPROTO_UDP)) {
 733			__be16 _ports[2], *pptr;
 734
 735			pptr = skb_header_pointer(skb, ihl,
 736						  sizeof(_ports), _ports);
 737			if (pptr == NULL)
 738				return NF_ACCEPT;	/* Not for me */
 739			if (ip_vs_lookup_real_service(iph->protocol,
 740						      iph->saddr, pptr[0])) {
 741				/*
 742				 * Notify the real server: there is no
 743				 * existing entry if it is not RST
 744				 * packet or not TCP packet.
 745				 */
 746				if (iph->protocol != IPPROTO_TCP
 747				    || !is_tcp_reset(skb)) {
 748					icmp_send(skb,ICMP_DEST_UNREACH,
 749						  ICMP_PORT_UNREACH, 0);
 750					return NF_DROP;
 751				}
 752			}
 753		}
 754		IP_VS_DBG_PKT(12, pp, skb, 0,
 755			      "packet continues traversal as normal");
 756		return NF_ACCEPT;
 757	}
 758
 759	IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet");
 760
 761	if (!skb_make_writable(skb, ihl))
 762		goto drop;
 763
 764	/* mangle the packet */
 765	if (pp->snat_handler && !pp->snat_handler(skb, pp, cp))
 766		goto drop;
 767	ip_hdr(skb)->saddr = cp->vaddr;
 768	ip_send_check(ip_hdr(skb));
 769
 770	/* For policy routing, packets originating from this
 771	 * machine itself may be routed differently to packets
 772	 * passing through.  We want this packet to be routed as
 773	 * if it came from this machine itself.  So re-compute
 774	 * the routing information.
 775	 */
 776	if (ip_route_me_harder(skb, RTN_LOCAL) != 0)
 777		goto drop;
 778
 779	IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");
 780
 781	ip_vs_out_stats(cp, skb);
 782	ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
 783	ip_vs_conn_put(cp);
 784
 785	skb->ipvs_property = 1;
 786
 787	LeaveFunction(11);
 788	return NF_ACCEPT;
 789
 790  drop:
 791	ip_vs_conn_put(cp);
 792	kfree_skb(skb);
 793	return NF_STOLEN;
 794}
 795
 796
 797/*
 798 *	Handle ICMP messages in the outside-to-inside direction (incoming).
 799 *	Find any that might be relevant, check against existing connections,
 800 *	forward to the right destination host if relevant.
 801 *	Currently handles error types - unreachable, quench, ttl exceeded.
 802 */
 803static int
 804ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
 805{
 806	struct iphdr *iph;
 807	struct icmphdr	_icmph, *ic;
 808	struct iphdr	_ciph, *cih;	/* The ip header contained within the ICMP */
 809	struct ip_vs_conn *cp;
 810	struct ip_vs_protocol *pp;
 811	unsigned int offset, ihl, verdict;
 812
 813	*related = 1;
 814
 815	/* reassemble IP fragments */
 816	if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
 817		if (ip_vs_gather_frags(skb, hooknum == NF_IP_LOCAL_IN ?
 818					    IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD))
 819			return NF_STOLEN;
 820	}
 821
 822	iph = ip_hdr(skb);
 823	offset = ihl = iph->ihl * 4;
 824	ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
 825	if (ic == NULL)
 826		return NF_DROP;
 827
 828	IP_VS_DBG(12, "Incoming ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
 829		  ic->type, ntohs(icmp_id(ic)),
 830		  NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
 831
 832	/*
 833	 * Work through seeing if this is for us.
 834	 * These checks are supposed to be in an order that means easy
 835	 * things are checked first to speed up processing.... however
 836	 * this means that some packets will manage to get a long way
 837	 * down this stack and then be rejected, but that's life.
 838	 */
 839	if ((ic->type != ICMP_DEST_UNREACH) &&
 840	    (ic->type != ICMP_SOURCE_QUENCH) &&
 841	    (ic->type != ICMP_TIME_EXCEEDED)) {
 842		*related = 0;
 843		return NF_ACCEPT;
 844	}
 845
 846	/* Now find the contained IP header */
 847	offset += sizeof(_icmph);
 848	cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
 849	if (cih == NULL)
 850		return NF_ACCEPT; /* The packet looks wrong, ignore */
 851
 852	pp = ip_vs_proto_get(cih->protocol);
 853	if (!pp)
 854		return NF_ACCEPT;
 855
 856	/* Is the embedded protocol header present? */
 857	if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
 858		     pp->dont_defrag))
 859		return NF_ACCEPT;
 860
 861	IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for");
 862
 863	offset += cih->ihl * 4;
 864
 865	/* The embedded headers contain source and dest in reverse order */
 866	cp = pp->conn_in_get(skb, pp, cih, offset, 1);
 867	if (!cp)
 868		return NF_ACCEPT;
 869
 870	verdict = NF_DROP;
 871
 872	/* Ensure the checksum is correct */
 873	if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
 874		/* Failed checksum! */
 875		IP_VS_DBG(1, "Incoming ICMP: failed checksum from %d.%d.%d.%d!\n",
 876			  NIPQUAD(iph->saddr));
 877		goto out;
 878	}
 879
 880	/* do the statistics and put it back */
 881	ip_vs_in_stats(cp, skb);
 882	if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
 883		offset += 2 * sizeof(__u16);
 884	verdict = ip_vs_icmp_xmit(skb, cp, pp, offset);
 885	/* do not touch skb anymore */
 886
 887  out:
 888	__ip_vs_conn_put(cp);
 889
 890	return verdict;
 891}
 892
 893/*
 894 *	Check if it's for virtual services, look it up,
 895 *	and send it on its way...
 896 */
 897static unsigned int
 898ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
 899	 const struct net_device *in, const struct net_device *out,
 900	 int (*okfn)(struct sk_buff *))
 901{
 902	struct iphdr	*iph;
 903	struct ip_vs_protocol *pp;
 904	struct ip_vs_conn *cp;
 905	int ret, restart;
 906	int ihl;
 907
 908	/*
 909	 *	Big tappo: only PACKET_HOST (neither loopback nor mcasts)
 910	 *	... don't know why 1st test DOES NOT include 2nd (?)
 911	 */
 912	if (unlikely(skb->pkt_type != PACKET_HOST
 913		     || skb->dev->flags & IFF_LOOPBACK || skb->sk)) {
 914		IP_VS_DBG(12, "packet type=%d proto=%d daddr=%d.%d.%d.%d ignored\n",
 915			  skb->pkt_type,
 916			  ip_hdr(skb)->protocol,
 917			  NIPQUAD(ip_hdr(skb)->daddr));
 918		return NF_ACCEPT;
 919	}
 920
 921	iph = ip_hdr(skb);
 922	if (unlikely(iph->protocol == IPPROTO_ICMP)) {
 923		int related, verdict = ip_vs_in_icmp(skb, &related, hooknum);
 924
 925		if (related)
 926			return verdict;
 927		iph = ip_hdr(skb);
 928	}
 929
 930	/* Protocol supported? */
 931	pp = ip_vs_proto_get(iph->protocol);
 932	if (unlikely(!pp))
 933		return NF_ACCEPT;
 934
 935	ihl = iph->ihl << 2;
 936
 937	/*
 938	 * Check if the packet belongs to an existing connection entry
 939	 */
 940	cp = pp->conn_in_get(skb, pp, iph, ihl, 0);
 941
 942	if (unlikely(!cp)) {
 943		int v;
 944
 945		if (!pp->conn_schedule(skb, pp, &v, &cp))
 946			return v;
 947	}
 948
 949	if (unlikely(!cp)) {
 950		/* sorry, all this trouble for a no-hit :) */
 951		IP_VS_DBG_PKT(12, pp, skb, 0,
 952			      "packet continues traversal as normal");
 953		return NF_ACCEPT;
 954	}
 955
 956	IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet");
 957
 958	/* Check the server status */
 959	if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
 960		/* the destination server is not available */
 961
 962		if (sysctl_ip_vs_expire_nodest_conn) {
 963			/* try to expire the connection immediately */
 964			ip_vs_conn_expire_now(cp);
 965		}
 966		/* don't restart its timer, and silently
 967		   drop the packet. */
 968		__ip_vs_conn_put(cp);
 969		return NF_DROP;
 970	}
 971
 972	ip_vs_in_stats(cp, skb);
 973	restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
 974	if (cp->packet_xmit)
 975		ret = cp->packet_xmit(skb, cp, pp);
 976		/* do not touch skb anymore */
 977	else {
 978		IP_VS_DBG_RL("warning: packet_xmit is null");
 979		ret = NF_ACCEPT;
 980	}
 981
 982	/* Increase its packet counter and check if it is needed
 983	 * to be synchronized
 984	 *
 985	 * Sync connection if it is about to close to
 986	 * encorage the standby servers to update the connections timeout
 987	 */
 988	atomic_inc(&cp->in_pkts);
 989	if ((ip_vs_sync_state & IP_VS_STATE_MASTER) &&
 990	    (((cp->protocol != IPPROTO_TCP ||
 991	       cp->state == IP_VS_TCP_S_ESTABLISHED) &&
 992	      (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1]
 993	       == sysctl_ip_vs_sync_threshold[0])) ||
 994	     ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) &&
 995	      ((cp->state == IP_VS_TCP_S_FIN_WAIT) ||
 996	       (cp->state == IP_VS_TCP_S_CLOSE)))))
 997		ip_vs_sync_conn(cp);
 998	cp->old_state = cp->state;
 999
1000	ip_vs_conn_put(cp);
1001	return ret;
1002}
1003
1004
1005/*
1006 *	It is hooked at the NF_IP_FORWARD chain, in order to catch ICMP
1007 *      related packets destined for 0.0.0.0/0.
1008 *      When fwmark-based virtual service is used, such as transparent
1009 *      cache cluster, TCP packets can be marked and routed to ip_vs_in,
1010 *      but ICMP destined for 0.0.0.0/0 cannot not be easily marked and
1011 *      sent to ip_vs_in_icmp. So, catch them at the NF_IP_FORWARD chain
1012 *      and send them to ip_vs_in_icmp.
1013 */
1014static unsigned int
1015ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb,
1016		   const struct net_device *in, const struct net_device *out,
1017		   int (*okfn)(struct sk_buff *))
1018{
1019	int r;
1020
1021	if (ip_hdr(skb)->protocol != IPPROTO_ICMP)
1022		return NF_ACCEPT;
1023
1024	return ip_vs_in_icmp(skb, &r, hooknum);
1025}
1026
1027
/* After packet filtering, forward packet through VS/DR, VS/TUN,
   or VS/NAT(change destination), so that filtering rules can be
   applied to IPVS.
   Registers ip_vs_in on the IPv4 NF_IP_LOCAL_IN hook at priority 100. */
static struct nf_hook_ops ip_vs_in_ops = {
	.hook		= ip_vs_in,
	.owner		= THIS_MODULE,
	.pf		= PF_INET,
	.hooknum        = NF_IP_LOCAL_IN,
	.priority       = 100,
};
1038
/* After packet filtering, change source only for VS/NAT.
   Registers ip_vs_out on the IPv4 NF_IP_FORWARD hook at priority 100. */
static struct nf_hook_ops ip_vs_out_ops = {
	.hook		= ip_vs_out,
	.owner		= THIS_MODULE,
	.pf		= PF_INET,
	.hooknum        = NF_IP_FORWARD,
	.priority       = 100,
};
1047
/* After packet filtering (but before ip_vs_out_icmp), catch icmp
   destined for 0.0.0.0/0, which is for incoming IPVS connections.
   Registers ip_vs_forward_icmp on the IPv4 NF_IP_FORWARD hook at
   priority 99, i.e. one below the priority used for ip_vs_out. */
static struct nf_hook_ops ip_vs_forward_icmp_ops = {
	.hook		= ip_vs_forward_icmp,
	.owner		= THIS_MODULE,
	.pf		= PF_INET,
	.hooknum        = NF_IP_FORWARD,
	.priority       = 99,
};
1057
/* Before the netfilter connection tracking, exit from POST_ROUTING.
   Registers ip_vs_post_routing on the IPv4 NF_IP_POST_ROUTING hook at
   a priority one below NF_IP_PRI_NAT_SRC. */
static struct nf_hook_ops ip_vs_post_routing_ops = {
	.hook		= ip_vs_post_routing,
	.owner		= THIS_MODULE,
	.pf		= PF_INET,
	.hooknum        = NF_IP_POST_ROUTING,
	.priority       = NF_IP_PRI_NAT_SRC-1,
};
1066
1067
1068/*
1069 *	Initialize IP Virtual Server
1070 */
1071static int __init ip_vs_init(void)
1072{
1073	int ret;
1074
1075	ret = ip_vs_control_init();
1076	if (ret < 0) {
1077		IP_VS_ERR("can't setup control.\n");
1078		goto cleanup_nothing;
1079	}
1080
1081	ip_vs_protocol_init();
1082
1083	ret = ip_vs_app_init();
1084	if (ret < 0) {
1085		IP_VS_ERR("can't setup application helper.\n");
1086		goto cleanup_protocol;
1087	}
1088
1089	ret = ip_vs_conn_init();
1090	if (ret < 0) {
1091		IP_VS_ERR("can't setup connection table.\n");
1092		goto cleanup_app;
1093	}
1094
1095	ret = nf_register_hook(&ip_vs_in_ops);
1096	if (ret < 0) {
1097		IP_VS_ERR("can't register in hook.\n");
1098		goto cleanup_conn;
1099	}
1100
1101	ret = nf_register_hook(&ip_vs_out_ops);
1102	if (ret < 0) {
1103		IP_VS_ERR("can't register out hook.\n");
1104		goto cleanup_inops;
1105	}
1106	ret = nf_register_hook(&ip_vs_post_routing_ops);
1107	if (ret < 0) {
1108		IP_VS_ERR("can't register post_routing hook.\n");
1109		goto cleanup_outops;
1110	}
1111	ret = nf_register_hook(&ip_vs_forward_icmp_ops);
1112	if (ret < 0) {
1113		IP_VS_ERR("can't register forward_icmp hook.\n");
1114		goto cleanup_postroutingops;
1115	}
1116
1117	IP_VS_INFO("ipvs loaded.\n");
1118	return ret;
1119
1120  cleanup_postroutingops:
1121	nf_unregister_hook(&ip_vs_post_routing_ops);
1122  cleanup_outops:
1123	nf_unregister_hook(&ip_vs_out_ops);
1124  cleanup_inops:
1125	nf_unregister_hook(&ip_vs_in_ops);
1126  cleanup_conn:
1127	ip_vs_conn_cleanup();
1128  cleanup_app:
1129	ip_vs_app_cleanup();
1130  cleanup_protocol:
1131	ip_vs_protocol_cleanup();
1132	ip_vs_control_cleanup();
1133  cleanup_nothing:
1134	return ret;
1135}
1136
/*
 *	Module exit: unregister the four netfilter hooks, then run the
 *	connection, application helper, protocol and control cleanup
 *	routines -- the reverse of the order used in ip_vs_init().
 */
static void __exit ip_vs_cleanup(void)
{
	/* hooks go first, in reverse order of their registration */
	nf_unregister_hook(&ip_vs_forward_icmp_ops);
	nf_unregister_hook(&ip_vs_post_routing_ops);
	nf_unregister_hook(&ip_vs_out_ops);
	nf_unregister_hook(&ip_vs_in_ops);
	ip_vs_conn_cleanup();
	ip_vs_app_cleanup();
	ip_vs_protocol_cleanup();
	ip_vs_control_cleanup();
	IP_VS_INFO("ipvs unloaded.\n");
}
1149
/* Module entry and exit points, and the module license tag */
module_init(ip_vs_init);
module_exit(ip_vs_cleanup);
MODULE_LICENSE("GPL");
Configure Feed

Configure Feed