Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ipv4: properly refresh rtable entries on pmtu/redirect events

This reverts commit 05ab86c5 (xfrm4: Invalidate all ipv4 routes on
IPsec pmtu events). Flushing all cached entries is not needed.

Instead, invalidate only the related next hop dsts to recheck for
the added next hop exception where needed. This also fixes a subtle
race due to bumping generation IDs before updating the pmtu.

Cc: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: Timo Teräs <timo.teras@iki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Timo Teräs and committed by
David S. Miller
387aa65a 829a5071

+43 -41
+2 -5
net/ipv4/ah4.c
··· 419 419 if (!x) 420 420 return; 421 421 422 - if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) { 423 - atomic_inc(&flow_cache_genid); 424 - rt_genid_bump(net); 425 - 422 + if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) 426 423 ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_AH, 0); 427 - } else 424 + else 428 425 ipv4_redirect(skb, net, 0, 0, IPPROTO_AH, 0); 429 426 xfrm_state_put(x); 430 427 }
+2 -5
net/ipv4/esp4.c
··· 502 502 if (!x) 503 503 return; 504 504 505 - if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) { 506 - atomic_inc(&flow_cache_genid); 507 - rt_genid_bump(net); 508 - 505 + if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) 509 506 ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ESP, 0); 510 - } else 507 + else 511 508 ipv4_redirect(skb, net, 0, 0, IPPROTO_ESP, 0); 512 509 xfrm_state_put(x); 513 510 }
+2 -5
net/ipv4/ipcomp.c
··· 47 47 if (!x) 48 48 return; 49 49 50 - if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) { 51 - atomic_inc(&flow_cache_genid); 52 - rt_genid_bump(net); 53 - 50 + if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) 54 51 ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_COMP, 0); 55 - } else 52 + else 56 53 ipv4_redirect(skb, net, 0, 0, IPPROTO_COMP, 0); 57 54 xfrm_state_put(x); 58 55 }
+37 -26
net/ipv4/route.c
··· 594 594 return hval & (FNHE_HASH_SIZE - 1); 595 595 } 596 596 597 + static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe) 598 + { 599 + rt->rt_pmtu = fnhe->fnhe_pmtu; 600 + rt->dst.expires = fnhe->fnhe_expires; 601 + 602 + if (fnhe->fnhe_gw) { 603 + rt->rt_flags |= RTCF_REDIRECTED; 604 + rt->rt_gateway = fnhe->fnhe_gw; 605 + rt->rt_uses_gateway = 1; 606 + } 607 + } 608 + 597 609 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, 598 610 u32 pmtu, unsigned long expires) 599 611 { 600 612 struct fnhe_hash_bucket *hash; 601 613 struct fib_nh_exception *fnhe; 614 + struct rtable *rt; 615 + unsigned int i; 602 616 int depth; 603 617 u32 hval = fnhe_hashfun(daddr); 604 618 ··· 641 627 fnhe->fnhe_gw = gw; 642 628 if (pmtu) { 643 629 fnhe->fnhe_pmtu = pmtu; 644 - fnhe->fnhe_expires = expires; 630 + fnhe->fnhe_expires = max(1UL, expires); 645 631 } 632 + /* Update all cached dsts too */ 633 + rt = rcu_dereference(fnhe->fnhe_rth); 634 + if (rt) 635 + fill_route_from_fnhe(rt, fnhe); 646 636 } else { 647 637 if (depth > FNHE_RECLAIM_DEPTH) 648 638 fnhe = fnhe_oldest(hash); ··· 662 644 fnhe->fnhe_gw = gw; 663 645 fnhe->fnhe_pmtu = pmtu; 664 646 fnhe->fnhe_expires = expires; 647 + 648 + /* Exception created; mark the cached routes for the nexthop 649 + * stale, so anyone caching it rechecks if this exception 650 + * applies to them. 
651 + */ 652 + for_each_possible_cpu(i) { 653 + struct rtable __rcu **prt; 654 + prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i); 655 + rt = rcu_dereference(*prt); 656 + if (rt) 657 + rt->dst.obsolete = DST_OBSOLETE_KILL; 658 + } 665 659 } 666 660 667 661 fnhe->fnhe_stamp = jiffies; ··· 947 917 if (mtu < ip_rt_min_pmtu) 948 918 mtu = ip_rt_min_pmtu; 949 919 950 - if (!rt->rt_pmtu) { 951 - dst->obsolete = DST_OBSOLETE_KILL; 952 - } else { 953 - rt->rt_pmtu = mtu; 954 - dst->expires = max(1UL, jiffies + ip_rt_mtu_expires); 955 - } 956 - 957 920 rcu_read_lock(); 958 921 if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) { 959 922 struct fib_nh *nh = &FIB_RES_NH(res); ··· 1086 1063 * DST_OBSOLETE_FORCE_CHK which forces validation calls down 1087 1064 * into this function always. 1088 1065 * 1089 - * When a PMTU/redirect information update invalidates a 1090 - * route, this is indicated by setting obsolete to 1091 - * DST_OBSOLETE_KILL. 1066 + * When a PMTU/redirect information update invalidates a route, 1067 + * this is indicated by setting obsolete to DST_OBSOLETE_KILL or 1068 + * DST_OBSOLETE_DEAD by dst_free(). 
1092 1069 */ 1093 - if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt)) 1070 + if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt)) 1094 1071 return NULL; 1095 1072 return dst; 1096 1073 } ··· 1238 1215 fnhe->fnhe_pmtu = 0; 1239 1216 fnhe->fnhe_expires = 0; 1240 1217 } 1241 - if (fnhe->fnhe_pmtu) { 1242 - unsigned long expires = fnhe->fnhe_expires; 1243 - unsigned long diff = expires - jiffies; 1244 - 1245 - if (time_before(jiffies, expires)) { 1246 - rt->rt_pmtu = fnhe->fnhe_pmtu; 1247 - dst_set_expires(&rt->dst, diff); 1248 - } 1249 - } 1250 - if (fnhe->fnhe_gw) { 1251 - rt->rt_flags |= RTCF_REDIRECTED; 1252 - rt->rt_gateway = fnhe->fnhe_gw; 1253 - rt->rt_uses_gateway = 1; 1254 - } else if (!rt->rt_gateway) 1218 + fill_route_from_fnhe(rt, fnhe); 1219 + if (!rt->rt_gateway) 1255 1220 rt->rt_gateway = daddr; 1256 1221 1257 1222 rcu_assign_pointer(fnhe->fnhe_rth, rt);