Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ipvs: implement passive PMTUD for IPIP packets

IPVS is missing the logic to update the PMTU in routing
for its IPIP packets. We monitor the dst_mtu and can return
FRAG_NEEDED messages, but if the tunneled packets get an ICMP
error we cannot rely on other traffic to save the lowest
MTU.

The following patch adds ICMP handling for IPIP
packets in the incoming direction, from some remote host to
our local IP used as saddr in the outer header. This
way we can forward any related ICMP traffic if it is for an IPVS
TUN connection. For the special case of PMTUD we update the
routing and, if the client requested DF, we can forward the
error.

To properly update the routing we have to bind
the cached route (dest->dst_cache) to the selected saddr
because ipv4_update_pmtu uses saddr for the dst lookup.
Add the IP_VS_RT_MODE_CONNECT flag to force such binding with
a second route lookup.

Update ip_vs_tunnel_xmit to provide IP_VS_RT_MODE_CONNECT
and change the code to copy DF. For now we prefer not to
force PMTU discovery (outer DF=1) because we don't have
a configuration option to enable or disable PMTUD. As we
do not keep any packets to resend, we prefer not to
play games with packets without the DF bit because the sender
is not informed when they are rejected.

Also, change ops->update_pmtu to be called only
for local clients because there is no point in updating the
MTU for input routes; in our case skb->dst->dev is lo.
It seems the code was copied from ipip.c, where the skb
dst points to the tunnel device.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>

authored by

Julian Anastasov and committed by
Simon Horman
f2edb9f7 2b2d2808

+128 -27
+73 -3
net/netfilter/ipvs/ip_vs_core.c
··· 1303 1303 struct ip_vs_conn *cp; 1304 1304 struct ip_vs_protocol *pp; 1305 1305 struct ip_vs_proto_data *pd; 1306 - unsigned int offset, ihl, verdict; 1306 + unsigned int offset, offset2, ihl, verdict; 1307 + bool ipip; 1307 1308 1308 1309 *related = 1; 1309 1310 ··· 1346 1345 1347 1346 net = skb_net(skb); 1348 1347 1348 + /* Special case for errors for IPIP packets */ 1349 + ipip = false; 1350 + if (cih->protocol == IPPROTO_IPIP) { 1351 + if (unlikely(cih->frag_off & htons(IP_OFFSET))) 1352 + return NF_ACCEPT; 1353 + /* Error for our IPIP must arrive at LOCAL_IN */ 1354 + if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL)) 1355 + return NF_ACCEPT; 1356 + offset += cih->ihl * 4; 1357 + cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); 1358 + if (cih == NULL) 1359 + return NF_ACCEPT; /* The packet looks wrong, ignore */ 1360 + ipip = true; 1361 + } 1362 + 1349 1363 pd = ip_vs_proto_data_get(net, cih->protocol); 1350 1364 if (!pd) 1351 1365 return NF_ACCEPT; ··· 1374 1358 IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset, 1375 1359 "Checking incoming ICMP for"); 1376 1360 1361 + offset2 = offset; 1377 1362 offset += cih->ihl * 4; 1378 1363 1379 1364 ip_vs_fill_iphdr(AF_INET, cih, &ciph); 1380 - /* The embedded headers contain source and dest in reverse order */ 1381 - cp = pp->conn_in_get(AF_INET, skb, &ciph, offset, 1); 1365 + /* The embedded headers contain source and dest in reverse order. 1366 + * For IPIP this is error for request, not for reply. 1367 + */ 1368 + cp = pp->conn_in_get(AF_INET, skb, &ciph, offset, ipip ? 0 : 1); 1382 1369 if (!cp) 1383 1370 return NF_ACCEPT; 1384 1371 ··· 1392 1373 /* Failed checksum! 
*/ 1393 1374 IP_VS_DBG(1, "Incoming ICMP: failed checksum from %pI4!\n", 1394 1375 &iph->saddr); 1376 + goto out; 1377 + } 1378 + 1379 + if (ipip) { 1380 + __be32 info = ic->un.gateway; 1381 + 1382 + /* Update the MTU */ 1383 + if (ic->type == ICMP_DEST_UNREACH && 1384 + ic->code == ICMP_FRAG_NEEDED) { 1385 + struct ip_vs_dest *dest = cp->dest; 1386 + u32 mtu = ntohs(ic->un.frag.mtu); 1387 + 1388 + /* Strip outer IP and ICMP, go to IPIP header */ 1389 + __skb_pull(skb, ihl + sizeof(_icmph)); 1390 + offset2 -= ihl + sizeof(_icmph); 1391 + skb_reset_network_header(skb); 1392 + IP_VS_DBG(12, "ICMP for IPIP %pI4->%pI4: mtu=%u\n", 1393 + &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, mtu); 1394 + rcu_read_lock(); 1395 + ipv4_update_pmtu(skb, dev_net(skb->dev), 1396 + mtu, 0, 0, 0, 0); 1397 + rcu_read_unlock(); 1398 + /* Client uses PMTUD? */ 1399 + if (!(cih->frag_off & htons(IP_DF))) 1400 + goto ignore_ipip; 1401 + /* Prefer the resulting PMTU */ 1402 + if (dest) { 1403 + spin_lock(&dest->dst_lock); 1404 + if (dest->dst_cache) 1405 + mtu = dst_mtu(dest->dst_cache); 1406 + spin_unlock(&dest->dst_lock); 1407 + } 1408 + if (mtu > 68 + sizeof(struct iphdr)) 1409 + mtu -= sizeof(struct iphdr); 1410 + info = htonl(mtu); 1411 + } 1412 + /* Strip outer IP, ICMP and IPIP, go to IP header of 1413 + * original request. 1414 + */ 1415 + __skb_pull(skb, offset2); 1416 + skb_reset_network_header(skb); 1417 + IP_VS_DBG(12, "Sending ICMP for %pI4->%pI4: t=%u, c=%u, i=%u\n", 1418 + &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, 1419 + ic->type, ic->code, ntohl(info)); 1420 + icmp_send(skb, ic->type, ic->code, info); 1421 + /* ICMP can be shorter but anyways, account it */ 1422 + ip_vs_out_stats(cp, skb); 1423 + 1424 + ignore_ipip: 1425 + consume_skb(skb); 1426 + verdict = NF_STOLEN; 1395 1427 goto out; 1396 1428 } 1397 1429
+55 -24
net/netfilter/ipvs/ip_vs_xmit.c
··· 49 49 IP_VS_RT_MODE_RDR = 4, /* Allow redirect from remote daddr to 50 50 * local 51 51 */ 52 + IP_VS_RT_MODE_CONNECT = 8, /* Always bind route to saddr */ 52 53 }; 53 54 54 55 /* ··· 85 84 return dst; 86 85 } 87 86 87 + /* Get route to daddr, update *saddr, optionally bind route to saddr */ 88 + static struct rtable *do_output_route4(struct net *net, __be32 daddr, 89 + u32 rtos, int rt_mode, __be32 *saddr) 90 + { 91 + struct flowi4 fl4; 92 + struct rtable *rt; 93 + int loop = 0; 94 + 95 + memset(&fl4, 0, sizeof(fl4)); 96 + fl4.daddr = daddr; 97 + fl4.saddr = (rt_mode & IP_VS_RT_MODE_CONNECT) ? *saddr : 0; 98 + fl4.flowi4_tos = rtos; 99 + 100 + retry: 101 + rt = ip_route_output_key(net, &fl4); 102 + if (IS_ERR(rt)) { 103 + /* Invalid saddr ? */ 104 + if (PTR_ERR(rt) == -EINVAL && *saddr && 105 + rt_mode & IP_VS_RT_MODE_CONNECT && !loop) { 106 + *saddr = 0; 107 + flowi4_update_output(&fl4, 0, rtos, daddr, 0); 108 + goto retry; 109 + } 110 + IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr); 111 + return NULL; 112 + } else if (!*saddr && rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) { 113 + ip_rt_put(rt); 114 + *saddr = fl4.saddr; 115 + flowi4_update_output(&fl4, 0, rtos, daddr, fl4.saddr); 116 + loop++; 117 + goto retry; 118 + } 119 + *saddr = fl4.saddr; 120 + return rt; 121 + } 122 + 88 123 /* Get route to destination or remote server */ 89 124 static struct rtable * 90 125 __ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest, ··· 135 98 spin_lock(&dest->dst_lock); 136 99 if (!(rt = (struct rtable *) 137 100 __ip_vs_dst_check(dest, rtos))) { 138 - struct flowi4 fl4; 139 - 140 - memset(&fl4, 0, sizeof(fl4)); 141 - fl4.daddr = dest->addr.ip; 142 - fl4.flowi4_tos = rtos; 143 - rt = ip_route_output_key(net, &fl4); 144 - if (IS_ERR(rt)) { 101 + rt = do_output_route4(net, dest->addr.ip, rtos, 102 + rt_mode, &dest->dst_saddr.ip); 103 + if (!rt) { 145 104 spin_unlock(&dest->dst_lock); 146 - IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", 147 - 
&dest->addr.ip); 148 105 return NULL; 149 106 } 150 107 __ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst), 0); 151 - dest->dst_saddr.ip = fl4.saddr; 152 108 IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d, " 153 109 "rtos=%X\n", 154 110 &dest->addr.ip, &dest->dst_saddr.ip, ··· 152 122 *ret_saddr = dest->dst_saddr.ip; 153 123 spin_unlock(&dest->dst_lock); 154 124 } else { 155 - struct flowi4 fl4; 125 + __be32 saddr = htonl(INADDR_ANY); 156 126 157 - memset(&fl4, 0, sizeof(fl4)); 158 - fl4.daddr = daddr; 159 - fl4.flowi4_tos = rtos; 160 - rt = ip_route_output_key(net, &fl4); 161 - if (IS_ERR(rt)) { 162 - IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", 163 - &daddr); 127 + /* For such unconfigured boxes avoid many route lookups 128 + * for performance reasons because we do not remember saddr 129 + */ 130 + rt_mode &= ~IP_VS_RT_MODE_CONNECT; 131 + rt = do_output_route4(net, daddr, rtos, rt_mode, &saddr); 132 + if (!rt) 164 133 return NULL; 165 - } 166 134 if (ret_saddr) 167 - *ret_saddr = fl4.saddr; 135 + *ret_saddr = saddr; 168 136 } 169 137 170 138 local = rt->rt_flags & RTCF_LOCAL; ··· 359 331 old_dst = dest->dst_cache; 360 332 dest->dst_cache = NULL; 361 333 dst_release(old_dst); 334 + dest->dst_saddr.ip = 0; 362 335 } 363 336 364 337 #define IP_VS_XMIT_TUNNEL(skb, cp) \ ··· 800 771 struct net_device *tdev; /* Device to other host */ 801 772 struct iphdr *old_iph = ip_hdr(skb); 802 773 u8 tos = old_iph->tos; 803 - __be16 df = old_iph->frag_off; 774 + __be16 df; 804 775 struct iphdr *iph; /* Our new IP header */ 805 776 unsigned int max_headroom; /* The extra header space needed */ 806 777 int mtu; ··· 810 781 811 782 if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, 812 783 RT_TOS(tos), IP_VS_RT_MODE_LOCAL | 813 - IP_VS_RT_MODE_NON_LOCAL, 784 + IP_VS_RT_MODE_NON_LOCAL | 785 + IP_VS_RT_MODE_CONNECT, 814 786 &saddr))) 815 787 goto tx_error_icmp; 816 788 if (rt->rt_flags & RTCF_LOCAL) { ··· 826 796 IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__); 827 
797 goto tx_error_put; 828 798 } 829 - if (skb_dst(skb)) 799 + if (rt_is_output_route(skb_rtable(skb))) 830 800 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); 831 801 832 - df |= (old_iph->frag_off & htons(IP_DF)); 802 + /* Copy DF, reset fragment offset and MF */ 803 + df = old_iph->frag_off & htons(IP_DF); 833 804 834 805 if ((old_iph->frag_off & htons(IP_DF) && 835 806 mtu < ntohs(old_iph->tot_len) && !skb_is_gso(skb))) {