Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at v2.6.20 561 lines 13 kB view raw
1/* 2 * ip_vs_xmit.c: various packet transmitters for IPVS 3 * 4 * Version: $Id: ip_vs_xmit.c,v 1.2 2002/11/30 01:50:35 wensong Exp $ 5 * 6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> 7 * Julian Anastasov <ja@ssi.bg> 8 * 9 * This program is free software; you can redistribute it and/or 10 * modify it under the terms of the GNU General Public License 11 * as published by the Free Software Foundation; either version 12 * 2 of the License, or (at your option) any later version. 13 * 14 * Changes: 15 * 16 */ 17 18#include <linux/kernel.h> 19#include <linux/ip.h> 20#include <linux/tcp.h> /* for tcphdr */ 21#include <net/tcp.h> /* for csum_tcpudp_magic */ 22#include <net/udp.h> 23#include <net/icmp.h> /* for icmp_send */ 24#include <net/route.h> /* for ip_route_output */ 25#include <linux/netfilter.h> 26#include <linux/netfilter_ipv4.h> 27 28#include <net/ip_vs.h> 29 30 31/* 32 * Destination cache to speed up outgoing route lookup 33 */ 34static inline void 35__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst) 36{ 37 struct dst_entry *old_dst; 38 39 old_dst = dest->dst_cache; 40 dest->dst_cache = dst; 41 dest->dst_rtos = rtos; 42 dst_release(old_dst); 43} 44 45static inline struct dst_entry * 46__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie) 47{ 48 struct dst_entry *dst = dest->dst_cache; 49 50 if (!dst) 51 return NULL; 52 if ((dst->obsolete || rtos != dest->dst_rtos) && 53 dst->ops->check(dst, cookie) == NULL) { 54 dest->dst_cache = NULL; 55 dst_release(dst); 56 return NULL; 57 } 58 dst_hold(dst); 59 return dst; 60} 61 62static inline struct rtable * 63__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos) 64{ 65 struct rtable *rt; /* Route to the other host */ 66 struct ip_vs_dest *dest = cp->dest; 67 68 if (dest) { 69 spin_lock(&dest->dst_lock); 70 if (!(rt = (struct rtable *) 71 __ip_vs_dst_check(dest, rtos, 0))) { 72 struct flowi fl = { 73 .oif = 0, 74 .nl_u = { 75 .ip4_u = { 76 .daddr = dest->addr, 77 .saddr = 0, 78 .tos = rtos, } }, 79 }; 80 81 if (ip_route_output_key(&rt, &fl)) { 82 spin_unlock(&dest->dst_lock); 83 IP_VS_DBG_RL("ip_route_output error, " 84 "dest: %u.%u.%u.%u\n", 85 NIPQUAD(dest->addr)); 86 return NULL; 87 } 88 __ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst)); 89 IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n", 90 NIPQUAD(dest->addr), 91 atomic_read(&rt->u.dst.__refcnt), rtos); 92 } 93 spin_unlock(&dest->dst_lock); 94 } else { 95 struct flowi fl = { 96 .oif = 0, 97 .nl_u = { 98 .ip4_u = { 99 .daddr = cp->daddr, 100 .saddr = 0, 101 .tos = rtos, } }, 102 }; 103 104 if (ip_route_output_key(&rt, &fl)) { 105 IP_VS_DBG_RL("ip_route_output error, dest: " 106 "%u.%u.%u.%u\n", NIPQUAD(cp->daddr)); 107 return NULL; 108 } 109 } 110 111 return rt; 112} 113 114 115/* 116 * Release dest->dst_cache before a dest is removed 117 */ 118void 119ip_vs_dst_reset(struct ip_vs_dest *dest) 120{ 121 struct dst_entry *old_dst; 122 123 old_dst = dest->dst_cache; 124 dest->dst_cache = NULL; 125 dst_release(old_dst); 126} 127 128#define IP_VS_XMIT(skb, rt) \ 129do { \ 130 (skb)->ipvs_property = 1; \ 131 (skb)->ip_summed = CHECKSUM_NONE; \ 132 NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL, \ 133 (rt)->u.dst.dev, dst_output); \ 134} while (0) 135 136 137/* 138 * NULL transmitter (do nothing except return NF_ACCEPT) 139 */ 140int 141ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, 142 struct ip_vs_protocol *pp) 143{ 144 /* we do not touch skb and do not need pskb ptr */ 145 return NF_ACCEPT; 146} 147 148 149/* 150 * Bypass transmitter 151 * Let packets bypass the destination when the destination is not 152 * available, it may be only used in transparent cache cluster. 153 */ 154int 155ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, 156 struct ip_vs_protocol *pp) 157{ 158 struct rtable *rt; /* Route to the other host */ 159 struct iphdr *iph = skb->nh.iph; 160 u8 tos = iph->tos; 161 int mtu; 162 struct flowi fl = { 163 .oif = 0, 164 .nl_u = { 165 .ip4_u = { 166 .daddr = iph->daddr, 167 .saddr = 0, 168 .tos = RT_TOS(tos), } }, 169 }; 170 171 EnterFunction(10); 172 173 if (ip_route_output_key(&rt, &fl)) { 174 IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, " 175 "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr)); 176 goto tx_error_icmp; 177 } 178 179 /* MTU checking */ 180 mtu = dst_mtu(&rt->u.dst); 181 if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) { 182 ip_rt_put(rt); 183 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); 184 IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n"); 185 goto tx_error; 186 } 187 188 /* 189 * Call ip_send_check because we are not sure it is called 190 * after ip_defrag. Is copy-on-write needed? 191 */ 192 if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) { 193 ip_rt_put(rt); 194 return NF_STOLEN; 195 } 196 ip_send_check(skb->nh.iph); 197 198 /* drop old route */ 199 dst_release(skb->dst); 200 skb->dst = &rt->u.dst; 201 202 /* Another hack: avoid icmp_send in ip_fragment */ 203 skb->local_df = 1; 204 205 IP_VS_XMIT(skb, rt); 206 207 LeaveFunction(10); 208 return NF_STOLEN; 209 210 tx_error_icmp: 211 dst_link_failure(skb); 212 tx_error: 213 kfree_skb(skb); 214 LeaveFunction(10); 215 return NF_STOLEN; 216} 217 218 219/* 220 * NAT transmitter (only for outside-to-inside nat forwarding) 221 * Not used for related ICMP 222 */ 223int 224ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, 225 struct ip_vs_protocol *pp) 226{ 227 struct rtable *rt; /* Route to the other host */ 228 int mtu; 229 struct iphdr *iph = skb->nh.iph; 230 231 EnterFunction(10); 232 233 /* check if it is a connection of no-client-port */ 234 if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { 235 __be16 _pt, *p; 236 p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt); 237 if (p == NULL) 238 goto tx_error; 239 ip_vs_conn_fill_cport(cp, *p); 240 IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); 241 } 242 243 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos)))) 244 goto tx_error_icmp; 245 246 /* MTU checking */ 247 mtu = dst_mtu(&rt->u.dst); 248 if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) { 249 ip_rt_put(rt); 250 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); 251 IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for"); 252 goto tx_error; 253 } 254 255 /* copy-on-write the packet before mangling it */ 256 if (!ip_vs_make_skb_writable(&skb, sizeof(struct iphdr))) 257 goto tx_error_put; 258 259 if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) 260 goto tx_error_put; 261 262 /* drop old route */ 263 dst_release(skb->dst); 264 skb->dst = &rt->u.dst; 265 266 /* mangle the packet */ 267 if (pp->dnat_handler && !pp->dnat_handler(&skb, pp, cp)) 268 goto tx_error; 269 skb->nh.iph->daddr = cp->daddr; 270 ip_send_check(skb->nh.iph); 271 272 IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); 273 274 /* FIXME: when application helper enlarges the packet and the length 275 is larger than the MTU of outgoing device, there will be still 276 MTU problem. */ 277 278 /* Another hack: avoid icmp_send in ip_fragment */ 279 skb->local_df = 1; 280 281 IP_VS_XMIT(skb, rt); 282 283 LeaveFunction(10); 284 return NF_STOLEN; 285 286 tx_error_icmp: 287 dst_link_failure(skb); 288 tx_error: 289 LeaveFunction(10); 290 kfree_skb(skb); 291 return NF_STOLEN; 292 tx_error_put: 293 ip_rt_put(rt); 294 goto tx_error; 295} 296 297 298/* 299 * IP Tunneling transmitter 300 * 301 * This function encapsulates the packet in a new IP packet, its 302 * destination will be set to cp->daddr. Most code of this function 303 * is taken from ipip.c. 304 * 305 * It is used in VS/TUN cluster. The load balancer selects a real 306 * server from a cluster based on a scheduling algorithm, 307 * encapsulates the request packet and forwards it to the selected 308 * server. For example, all real servers are configured with 309 * "ifconfig tunl0 <Virtual IP Address> up". When the server receives 310 * the encapsulated packet, it will decapsulate the packet, processe 311 * the request and return the response packets directly to the client 312 * without passing the load balancer. This can greatly increase the 313 * scalability of virtual server. 314 * 315 * Used for ANY protocol 316 */ 317int 318ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, 319 struct ip_vs_protocol *pp) 320{ 321 struct rtable *rt; /* Route to the other host */ 322 struct net_device *tdev; /* Device to other host */ 323 struct iphdr *old_iph = skb->nh.iph; 324 u8 tos = old_iph->tos; 325 __be16 df = old_iph->frag_off; 326 struct iphdr *iph; /* Our new IP header */ 327 int max_headroom; /* The extra header space needed */ 328 int mtu; 329 330 EnterFunction(10); 331 332 if (skb->protocol != __constant_htons(ETH_P_IP)) { 333 IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, " 334 "ETH_P_IP: %d, skb protocol: %d\n", 335 __constant_htons(ETH_P_IP), skb->protocol); 336 goto tx_error; 337 } 338 339 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos)))) 340 goto tx_error_icmp; 341 342 tdev = rt->u.dst.dev; 343 344 mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr); 345 if (mtu < 68) { 346 ip_rt_put(rt); 347 IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n"); 348 goto tx_error; 349 } 350 if (skb->dst) 351 skb->dst->ops->update_pmtu(skb->dst, mtu); 352 353 df |= (old_iph->frag_off&__constant_htons(IP_DF)); 354 355 if ((old_iph->frag_off&__constant_htons(IP_DF)) 356 && mtu < ntohs(old_iph->tot_len)) { 357 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); 358 ip_rt_put(rt); 359 IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n"); 360 goto tx_error; 361 } 362 363 /* 364 * Okay, now see if we can stuff it in the buffer as-is. 365 */ 366 max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr); 367 368 if (skb_headroom(skb) < max_headroom 369 || skb_cloned(skb) || skb_shared(skb)) { 370 struct sk_buff *new_skb = 371 skb_realloc_headroom(skb, max_headroom); 372 if (!new_skb) { 373 ip_rt_put(rt); 374 kfree_skb(skb); 375 IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n"); 376 return NF_STOLEN; 377 } 378 kfree_skb(skb); 379 skb = new_skb; 380 old_iph = skb->nh.iph; 381 } 382 383 skb->h.raw = (void *) old_iph; 384 385 /* fix old IP header checksum */ 386 ip_send_check(old_iph); 387 388 skb->nh.raw = skb_push(skb, sizeof(struct iphdr)); 389 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); 390 391 /* drop old route */ 392 dst_release(skb->dst); 393 skb->dst = &rt->u.dst; 394 395 /* 396 * Push down and install the IPIP header. 397 */ 398 iph = skb->nh.iph; 399 iph->version = 4; 400 iph->ihl = sizeof(struct iphdr)>>2; 401 iph->frag_off = df; 402 iph->protocol = IPPROTO_IPIP; 403 iph->tos = tos; 404 iph->daddr = rt->rt_dst; 405 iph->saddr = rt->rt_src; 406 iph->ttl = old_iph->ttl; 407 iph->tot_len = htons(skb->len); 408 ip_select_ident(iph, &rt->u.dst, NULL); 409 ip_send_check(iph); 410 411 /* Another hack: avoid icmp_send in ip_fragment */ 412 skb->local_df = 1; 413 414 IP_VS_XMIT(skb, rt); 415 416 LeaveFunction(10); 417 418 return NF_STOLEN; 419 420 tx_error_icmp: 421 dst_link_failure(skb); 422 tx_error: 423 kfree_skb(skb); 424 LeaveFunction(10); 425 return NF_STOLEN; 426} 427 428 429/* 430 * Direct Routing transmitter 431 * Used for ANY protocol 432 */ 433int 434ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, 435 struct ip_vs_protocol *pp) 436{ 437 struct rtable *rt; /* Route to the other host */ 438 struct iphdr *iph = skb->nh.iph; 439 int mtu; 440 441 EnterFunction(10); 442 443 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos)))) 444 goto tx_error_icmp; 445 446 /* MTU checking */ 447 mtu = dst_mtu(&rt->u.dst); 448 if ((iph->frag_off&__constant_htons(IP_DF)) && skb->len > mtu) { 449 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); 450 ip_rt_put(rt); 451 IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n"); 452 goto tx_error; 453 } 454 455 /* 456 * Call ip_send_check because we are not sure it is called 457 * after ip_defrag. Is copy-on-write needed? 458 */ 459 if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) { 460 ip_rt_put(rt); 461 return NF_STOLEN; 462 } 463 ip_send_check(skb->nh.iph); 464 465 /* drop old route */ 466 dst_release(skb->dst); 467 skb->dst = &rt->u.dst; 468 469 /* Another hack: avoid icmp_send in ip_fragment */ 470 skb->local_df = 1; 471 472 IP_VS_XMIT(skb, rt); 473 474 LeaveFunction(10); 475 return NF_STOLEN; 476 477 tx_error_icmp: 478 dst_link_failure(skb); 479 tx_error: 480 kfree_skb(skb); 481 LeaveFunction(10); 482 return NF_STOLEN; 483} 484 485 486/* 487 * ICMP packet transmitter 488 * called by the ip_vs_in_icmp 489 */ 490int 491ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, 492 struct ip_vs_protocol *pp, int offset) 493{ 494 struct rtable *rt; /* Route to the other host */ 495 int mtu; 496 int rc; 497 498 EnterFunction(10); 499 500 /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be 501 forwarded directly here, because there is no need to 502 translate address/port back */ 503 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { 504 if (cp->packet_xmit) 505 rc = cp->packet_xmit(skb, cp, pp); 506 else 507 rc = NF_ACCEPT; 508 /* do not touch skb anymore */ 509 atomic_inc(&cp->in_pkts); 510 goto out; 511 } 512 513 /* 514 * mangle and send the packet here (only for VS/NAT) 515 */ 516 517 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(skb->nh.iph->tos)))) 518 goto tx_error_icmp; 519 520 /* MTU checking */ 521 mtu = dst_mtu(&rt->u.dst); 522 if ((skb->len > mtu) && (skb->nh.iph->frag_off&__constant_htons(IP_DF))) { 523 ip_rt_put(rt); 524 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); 525 IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n"); 526 goto tx_error; 527 } 528 529 /* copy-on-write the packet before mangling it */ 530 if (!ip_vs_make_skb_writable(&skb, offset)) 531 goto tx_error_put; 532 533 if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) 534 goto tx_error_put; 535 536 /* drop the old route when skb is not shared */ 537 dst_release(skb->dst); 538 skb->dst = &rt->u.dst; 539 540 ip_vs_nat_icmp(skb, pp, cp, 0); 541 542 /* Another hack: avoid icmp_send in ip_fragment */ 543 skb->local_df = 1; 544 545 IP_VS_XMIT(skb, rt); 546 547 rc = NF_STOLEN; 548 goto out; 549 550 tx_error_icmp: 551 dst_link_failure(skb); 552 tx_error: 553 dev_kfree_skb(skb); 554 rc = NF_STOLEN; 555 out: 556 LeaveFunction(10); 557 return rc; 558 tx_error_put: 559 ip_rt_put(rt); 560 goto tx_error; 561}