Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at v2.6.22-rc3 563 lines 13 kB view raw
1/* 2 * ip_vs_xmit.c: various packet transmitters for IPVS 3 * 4 * Version: $Id: ip_vs_xmit.c,v 1.2 2002/11/30 01:50:35 wensong Exp $ 5 * 6 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> 7 * Julian Anastasov <ja@ssi.bg> 8 * 9 * This program is free software; you can redistribute it and/or 10 * modify it under the terms of the GNU General Public License 11 * as published by the Free Software Foundation; either version 12 * 2 of the License, or (at your option) any later version. 13 * 14 * Changes: 15 * 16 */ 17 18#include <linux/kernel.h> 19#include <linux/ip.h> 20#include <linux/tcp.h> /* for tcphdr */ 21#include <net/tcp.h> /* for csum_tcpudp_magic */ 22#include <net/udp.h> 23#include <net/icmp.h> /* for icmp_send */ 24#include <net/route.h> /* for ip_route_output */ 25#include <linux/netfilter.h> 26#include <linux/netfilter_ipv4.h> 27 28#include <net/ip_vs.h> 29 30 31/* 32 * Destination cache to speed up outgoing route lookup 33 */ 34static inline void 35__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst) 36{ 37 struct dst_entry *old_dst; 38 39 old_dst = dest->dst_cache; 40 dest->dst_cache = dst; 41 dest->dst_rtos = rtos; 42 dst_release(old_dst); 43} 44 45static inline struct dst_entry * 46__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie) 47{ 48 struct dst_entry *dst = dest->dst_cache; 49 50 if (!dst) 51 return NULL; 52 if ((dst->obsolete || rtos != dest->dst_rtos) && 53 dst->ops->check(dst, cookie) == NULL) { 54 dest->dst_cache = NULL; 55 dst_release(dst); 56 return NULL; 57 } 58 dst_hold(dst); 59 return dst; 60} 61 62static inline struct rtable * 63__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos) 64{ 65 struct rtable *rt; /* Route to the other host */ 66 struct ip_vs_dest *dest = cp->dest; 67 68 if (dest) { 69 spin_lock(&dest->dst_lock); 70 if (!(rt = (struct rtable *) 71 __ip_vs_dst_check(dest, rtos, 0))) { 72 struct flowi fl = { 73 .oif = 0, 74 .nl_u = { 75 .ip4_u = { 76 .daddr = dest->addr, 77 .saddr = 0, 78 .tos = rtos, } }, 79 }; 80 81 if (ip_route_output_key(&rt, &fl)) { 82 spin_unlock(&dest->dst_lock); 83 IP_VS_DBG_RL("ip_route_output error, " 84 "dest: %u.%u.%u.%u\n", 85 NIPQUAD(dest->addr)); 86 return NULL; 87 } 88 __ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst)); 89 IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n", 90 NIPQUAD(dest->addr), 91 atomic_read(&rt->u.dst.__refcnt), rtos); 92 } 93 spin_unlock(&dest->dst_lock); 94 } else { 95 struct flowi fl = { 96 .oif = 0, 97 .nl_u = { 98 .ip4_u = { 99 .daddr = cp->daddr, 100 .saddr = 0, 101 .tos = rtos, } }, 102 }; 103 104 if (ip_route_output_key(&rt, &fl)) { 105 IP_VS_DBG_RL("ip_route_output error, dest: " 106 "%u.%u.%u.%u\n", NIPQUAD(cp->daddr)); 107 return NULL; 108 } 109 } 110 111 return rt; 112} 113 114 115/* 116 * Release dest->dst_cache before a dest is removed 117 */ 118void 119ip_vs_dst_reset(struct ip_vs_dest *dest) 120{ 121 struct dst_entry *old_dst; 122 123 old_dst = dest->dst_cache; 124 dest->dst_cache = NULL; 125 dst_release(old_dst); 126} 127 128#define IP_VS_XMIT(skb, rt) \ 129do { \ 130 (skb)->ipvs_property = 1; \ 131 (skb)->ip_summed = CHECKSUM_NONE; \ 132 NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL, \ 133 (rt)->u.dst.dev, dst_output); \ 134} while (0) 135 136 137/* 138 * NULL transmitter (do nothing except return NF_ACCEPT) 139 */ 140int 141ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, 142 struct ip_vs_protocol *pp) 143{ 144 /* we do not touch skb and do not need pskb ptr */ 145 return NF_ACCEPT; 146} 147 148 149/* 150 * Bypass transmitter 151 * Let packets bypass the destination when the destination is not 152 * available, it may be only used in transparent cache cluster. 153 */ 154int 155ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, 156 struct ip_vs_protocol *pp) 157{ 158 struct rtable *rt; /* Route to the other host */ 159 struct iphdr *iph = ip_hdr(skb); 160 u8 tos = iph->tos; 161 int mtu; 162 struct flowi fl = { 163 .oif = 0, 164 .nl_u = { 165 .ip4_u = { 166 .daddr = iph->daddr, 167 .saddr = 0, 168 .tos = RT_TOS(tos), } }, 169 }; 170 171 EnterFunction(10); 172 173 if (ip_route_output_key(&rt, &fl)) { 174 IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, " 175 "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr)); 176 goto tx_error_icmp; 177 } 178 179 /* MTU checking */ 180 mtu = dst_mtu(&rt->u.dst); 181 if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { 182 ip_rt_put(rt); 183 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); 184 IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n"); 185 goto tx_error; 186 } 187 188 /* 189 * Call ip_send_check because we are not sure it is called 190 * after ip_defrag. Is copy-on-write needed? 191 */ 192 if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) { 193 ip_rt_put(rt); 194 return NF_STOLEN; 195 } 196 ip_send_check(ip_hdr(skb)); 197 198 /* drop old route */ 199 dst_release(skb->dst); 200 skb->dst = &rt->u.dst; 201 202 /* Another hack: avoid icmp_send in ip_fragment */ 203 skb->local_df = 1; 204 205 IP_VS_XMIT(skb, rt); 206 207 LeaveFunction(10); 208 return NF_STOLEN; 209 210 tx_error_icmp: 211 dst_link_failure(skb); 212 tx_error: 213 kfree_skb(skb); 214 LeaveFunction(10); 215 return NF_STOLEN; 216} 217 218 219/* 220 * NAT transmitter (only for outside-to-inside nat forwarding) 221 * Not used for related ICMP 222 */ 223int 224ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, 225 struct ip_vs_protocol *pp) 226{ 227 struct rtable *rt; /* Route to the other host */ 228 int mtu; 229 struct iphdr *iph = ip_hdr(skb); 230 231 EnterFunction(10); 232 233 /* check if it is a connection of no-client-port */ 234 if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { 235 __be16 _pt, *p; 236 p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt); 237 if (p == NULL) 238 goto tx_error; 239 ip_vs_conn_fill_cport(cp, *p); 240 IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); 241 } 242 243 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos)))) 244 goto tx_error_icmp; 245 246 /* MTU checking */ 247 mtu = dst_mtu(&rt->u.dst); 248 if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { 249 ip_rt_put(rt); 250 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); 251 IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for"); 252 goto tx_error; 253 } 254 255 /* copy-on-write the packet before mangling it */ 256 if (!ip_vs_make_skb_writable(&skb, sizeof(struct iphdr))) 257 goto tx_error_put; 258 259 if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) 260 goto tx_error_put; 261 262 /* drop old route */ 263 dst_release(skb->dst); 264 skb->dst = &rt->u.dst; 265 266 /* mangle the packet */ 267 if (pp->dnat_handler && !pp->dnat_handler(&skb, pp, cp)) 268 goto tx_error; 269 ip_hdr(skb)->daddr = cp->daddr; 270 ip_send_check(ip_hdr(skb)); 271 272 IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); 273 274 /* FIXME: when application helper enlarges the packet and the length 275 is larger than the MTU of outgoing device, there will be still 276 MTU problem. */ 277 278 /* Another hack: avoid icmp_send in ip_fragment */ 279 skb->local_df = 1; 280 281 IP_VS_XMIT(skb, rt); 282 283 LeaveFunction(10); 284 return NF_STOLEN; 285 286 tx_error_icmp: 287 dst_link_failure(skb); 288 tx_error: 289 LeaveFunction(10); 290 kfree_skb(skb); 291 return NF_STOLEN; 292 tx_error_put: 293 ip_rt_put(rt); 294 goto tx_error; 295} 296 297 298/* 299 * IP Tunneling transmitter 300 * 301 * This function encapsulates the packet in a new IP packet, its 302 * destination will be set to cp->daddr. Most code of this function 303 * is taken from ipip.c. 304 * 305 * It is used in VS/TUN cluster. The load balancer selects a real 306 * server from a cluster based on a scheduling algorithm, 307 * encapsulates the request packet and forwards it to the selected 308 * server. For example, all real servers are configured with 309 * "ifconfig tunl0 <Virtual IP Address> up". When the server receives 310 * the encapsulated packet, it will decapsulate the packet, processe 311 * the request and return the response packets directly to the client 312 * without passing the load balancer. This can greatly increase the 313 * scalability of virtual server. 314 * 315 * Used for ANY protocol 316 */ 317int 318ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, 319 struct ip_vs_protocol *pp) 320{ 321 struct rtable *rt; /* Route to the other host */ 322 struct net_device *tdev; /* Device to other host */ 323 struct iphdr *old_iph = ip_hdr(skb); 324 u8 tos = old_iph->tos; 325 __be16 df = old_iph->frag_off; 326 sk_buff_data_t old_transport_header = skb->transport_header; 327 struct iphdr *iph; /* Our new IP header */ 328 int max_headroom; /* The extra header space needed */ 329 int mtu; 330 331 EnterFunction(10); 332 333 if (skb->protocol != htons(ETH_P_IP)) { 334 IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, " 335 "ETH_P_IP: %d, skb protocol: %d\n", 336 htons(ETH_P_IP), skb->protocol); 337 goto tx_error; 338 } 339 340 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos)))) 341 goto tx_error_icmp; 342 343 tdev = rt->u.dst.dev; 344 345 mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr); 346 if (mtu < 68) { 347 ip_rt_put(rt); 348 IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n"); 349 goto tx_error; 350 } 351 if (skb->dst) 352 skb->dst->ops->update_pmtu(skb->dst, mtu); 353 354 df |= (old_iph->frag_off & htons(IP_DF)); 355 356 if ((old_iph->frag_off & htons(IP_DF)) 357 && mtu < ntohs(old_iph->tot_len)) { 358 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); 359 ip_rt_put(rt); 360 IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n"); 361 goto tx_error; 362 } 363 364 /* 365 * Okay, now see if we can stuff it in the buffer as-is. 366 */ 367 max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr); 368 369 if (skb_headroom(skb) < max_headroom 370 || skb_cloned(skb) || skb_shared(skb)) { 371 struct sk_buff *new_skb = 372 skb_realloc_headroom(skb, max_headroom); 373 if (!new_skb) { 374 ip_rt_put(rt); 375 kfree_skb(skb); 376 IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n"); 377 return NF_STOLEN; 378 } 379 kfree_skb(skb); 380 skb = new_skb; 381 old_iph = ip_hdr(skb); 382 } 383 384 skb->transport_header = old_transport_header; 385 386 /* fix old IP header checksum */ 387 ip_send_check(old_iph); 388 389 skb_push(skb, sizeof(struct iphdr)); 390 skb_reset_network_header(skb); 391 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); 392 393 /* drop old route */ 394 dst_release(skb->dst); 395 skb->dst = &rt->u.dst; 396 397 /* 398 * Push down and install the IPIP header. 399 */ 400 iph = ip_hdr(skb); 401 iph->version = 4; 402 iph->ihl = sizeof(struct iphdr)>>2; 403 iph->frag_off = df; 404 iph->protocol = IPPROTO_IPIP; 405 iph->tos = tos; 406 iph->daddr = rt->rt_dst; 407 iph->saddr = rt->rt_src; 408 iph->ttl = old_iph->ttl; 409 iph->tot_len = htons(skb->len); 410 ip_select_ident(iph, &rt->u.dst, NULL); 411 ip_send_check(iph); 412 413 /* Another hack: avoid icmp_send in ip_fragment */ 414 skb->local_df = 1; 415 416 IP_VS_XMIT(skb, rt); 417 418 LeaveFunction(10); 419 420 return NF_STOLEN; 421 422 tx_error_icmp: 423 dst_link_failure(skb); 424 tx_error: 425 kfree_skb(skb); 426 LeaveFunction(10); 427 return NF_STOLEN; 428} 429 430 431/* 432 * Direct Routing transmitter 433 * Used for ANY protocol 434 */ 435int 436ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, 437 struct ip_vs_protocol *pp) 438{ 439 struct rtable *rt; /* Route to the other host */ 440 struct iphdr *iph = ip_hdr(skb); 441 int mtu; 442 443 EnterFunction(10); 444 445 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos)))) 446 goto tx_error_icmp; 447 448 /* MTU checking */ 449 mtu = dst_mtu(&rt->u.dst); 450 if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu) { 451 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); 452 ip_rt_put(rt); 453 IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n"); 454 goto tx_error; 455 } 456 457 /* 458 * Call ip_send_check because we are not sure it is called 459 * after ip_defrag. Is copy-on-write needed? 460 */ 461 if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) { 462 ip_rt_put(rt); 463 return NF_STOLEN; 464 } 465 ip_send_check(ip_hdr(skb)); 466 467 /* drop old route */ 468 dst_release(skb->dst); 469 skb->dst = &rt->u.dst; 470 471 /* Another hack: avoid icmp_send in ip_fragment */ 472 skb->local_df = 1; 473 474 IP_VS_XMIT(skb, rt); 475 476 LeaveFunction(10); 477 return NF_STOLEN; 478 479 tx_error_icmp: 480 dst_link_failure(skb); 481 tx_error: 482 kfree_skb(skb); 483 LeaveFunction(10); 484 return NF_STOLEN; 485} 486 487 488/* 489 * ICMP packet transmitter 490 * called by the ip_vs_in_icmp 491 */ 492int 493ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, 494 struct ip_vs_protocol *pp, int offset) 495{ 496 struct rtable *rt; /* Route to the other host */ 497 int mtu; 498 int rc; 499 500 EnterFunction(10); 501 502 /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be 503 forwarded directly here, because there is no need to 504 translate address/port back */ 505 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { 506 if (cp->packet_xmit) 507 rc = cp->packet_xmit(skb, cp, pp); 508 else 509 rc = NF_ACCEPT; 510 /* do not touch skb anymore */ 511 atomic_inc(&cp->in_pkts); 512 goto out; 513 } 514 515 /* 516 * mangle and send the packet here (only for VS/NAT) 517 */ 518 519 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos)))) 520 goto tx_error_icmp; 521 522 /* MTU checking */ 523 mtu = dst_mtu(&rt->u.dst); 524 if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) { 525 ip_rt_put(rt); 526 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); 527 IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n"); 528 goto tx_error; 529 } 530 531 /* copy-on-write the packet before mangling it */ 532 if (!ip_vs_make_skb_writable(&skb, offset)) 533 goto tx_error_put; 534 535 if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) 536 goto tx_error_put; 537 538 /* drop the old route when skb is not shared */ 539 dst_release(skb->dst); 540 skb->dst = &rt->u.dst; 541 542 ip_vs_nat_icmp(skb, pp, cp, 0); 543 544 /* Another hack: avoid icmp_send in ip_fragment */ 545 skb->local_df = 1; 546 547 IP_VS_XMIT(skb, rt); 548 549 rc = NF_STOLEN; 550 goto out; 551 552 tx_error_icmp: 553 dst_link_failure(skb); 554 tx_error: 555 dev_kfree_skb(skb); 556 rc = NF_STOLEN; 557 out: 558 LeaveFunction(10); 559 return rc; 560 tx_error_put: 561 ip_rt_put(rt); 562 goto tx_error; 563}