[XFRM]: Allow packet drops during larval state resolution.

The current IPSEC rule resolution behavior does not work for a lot of
people, even though technically it's an improvement over the -EAGAIN
business we had before.

Right now we'll block until the key manager resolves the route. That
works for simple cases, but many folks would rather packets get
silently dropped until the key manager resolves the IPSEC rules.

We can't tell these folks to "set the socket non-blocking" because
they have no control over the non-blocking setting of things like the
sockets used to resolve DNS deep inside the resolver libraries in
libc.

With that in mind, I coded up the patch below, with some help from
Herbert Xu, which provides packet-drop behavior during larval state
resolution, controllable via sysctl and off by default.
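
The mechanics are simple: when the sysctl is set, __xfrm_lookup()
returns -EREMOTE instead of sleeping on the larval state, and each
converted caller turns that into a one-shot blackhole route whose
output function just frees the skb. Every IPv6 caller ends up with
the same pattern (this is lifted straight from the hunks below):

	err = __xfrm_lookup(&dst, &fl, sk, 1);
	if (err < 0) {
		if (err == -EREMOTE)
			err = ip6_dst_blackhole(sk, &dst, &fl);
		if (err < 0)
			goto failure;
	}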

This lays the groundwork to either:

1) Make this the default at some point, or...

2) Move this logic into xfrm{4,6}_policy.c and implement the
   ARP-like resolution queue we've all been dreaming of (a rough
   sketch follows this list). The idea would be to queue packets
   to the policy, then once the larval state is resolved by the
   key manager we re-resolve the route and push the packets out.
   The packets would time out if the rule didn't get resolved
   within a certain amount of time.
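
To make option #2 concrete, here is a rough, purely hypothetical
sketch of that queue. None of these names (xfrm_pol_queue,
XFRM_QUEUE_TIMEOUT, and so on) exist in the patch below; queue/timer
initialization is omitted. It's only an illustration of the idea:

	#include <linux/skbuff.h>
	#include <linux/timer.h>
	#include <net/dst.h>

	/* Hypothetical, not part of this patch: park packets on the
	 * policy until the key manager resolves the larval state,
	 * with a timer to drop them if it never does.
	 */
	struct xfrm_pol_queue {
		struct sk_buff_head	pkts;
		struct timer_list	timer;
	};

	static void xfrm_pol_queue_packet(struct xfrm_pol_queue *q,
					  struct sk_buff *skb)
	{
		skb_queue_tail(&q->pkts, skb);
		mod_timer(&q->timer, jiffies + XFRM_QUEUE_TIMEOUT);
	}

	/* Timer handler: the rule never got resolved, drop everything. */
	static void xfrm_pol_queue_timeout(unsigned long data)
	{
		struct xfrm_pol_queue *q = (struct xfrm_pol_queue *) data;

		skb_queue_purge(&q->pkts);
	}

	/* Called by the key manager once the larval state resolves:
	 * re-resolve the route, attach it, and push the packets out.
	 */
	static void xfrm_pol_queue_flush(struct xfrm_pol_queue *q)
	{
		struct sk_buff *skb;

		while ((skb = skb_dequeue(&q->pkts)) != NULL)
			dst_output(skb);	/* with the fresh route attached */
	}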

Signed-off-by: David S. Miller <davem@davemloft.net>

+209 -14
+7
include/net/dst.h
···
 {
 	return 0;
 }
+static inline int __xfrm_lookup(struct dst_entry **dst_p, struct flowi *fl,
+				struct sock *sk, int flags)
+{
+	return 0;
+}
 #else
 extern int xfrm_lookup(struct dst_entry **dst_p, struct flowi *fl,
 		       struct sock *sk, int flags);
+extern int __xfrm_lookup(struct dst_entry **dst_p, struct flowi *fl,
+			 struct sock *sk, int flags);
 #endif
 #endif
+3
include/net/ipv6.h
···
 extern int ip6_dst_lookup(struct sock *sk,
			   struct dst_entry **dst,
			   struct flowi *fl);
+extern int ip6_dst_blackhole(struct sock *sk,
+			     struct dst_entry **dst,
+			     struct flowi *fl);
 extern int ip6_sk_dst_lookup(struct sock *sk,
			      struct dst_entry **dst,
			      struct flowi *fl);
+9
net/core/sysctl_net_core.c
···
 #ifdef CONFIG_XFRM
 extern u32 sysctl_xfrm_aevent_etime;
 extern u32 sysctl_xfrm_aevent_rseqth;
+extern int sysctl_xfrm_larval_drop;
 #endif
 
 ctl_table core_table[] = {
···
 		.procname	= "xfrm_aevent_rseqth",
 		.data		= &sysctl_xfrm_aevent_rseqth,
 		.maxlen		= sizeof(u32),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "xfrm_larval_drop",
+		.data		= &sysctl_xfrm_larval_drop,
+		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec
 	},
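
Since the new entry sits in core_table, the knob should show up as
/proc/sys/net/core/xfrm_larval_drop, so the drop behavior can be
toggled at runtime with "echo 1 > /proc/sys/net/core/xfrm_larval_drop".
It defaults to 0, which keeps the existing blocking behavior.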
+7 -3
net/dccp/ipv6.c
···
 	if (final_p)
 		ipv6_addr_copy(&fl.fl6_dst, final_p);
 
-	err = xfrm_lookup(&dst, &fl, sk, 1);
-	if (err < 0)
-		goto failure;
+	err = __xfrm_lookup(&dst, &fl, sk, 1);
+	if (err < 0) {
+		if (err == -EREMOTE)
+			err = ip6_dst_blackhole(sk, &dst, &fl);
+		if (err < 0)
+			goto failure;
+	}
 
 	if (saddr == NULL) {
 		saddr = &fl.fl6_src;
+70 -1
net/ipv4/route.c
···
 
 EXPORT_SYMBOL_GPL(__ip_route_output_key);
 
+static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
+{
+}
+
+static struct dst_ops ipv4_dst_blackhole_ops = {
+	.family		= AF_INET,
+	.protocol	= __constant_htons(ETH_P_IP),
+	.destroy	= ipv4_dst_destroy,
+	.check		= ipv4_dst_check,
+	.update_pmtu	= ipv4_rt_blackhole_update_pmtu,
+	.entry_size	= sizeof(struct rtable),
+};
+
+
+static int ipv4_blackhole_output(struct sk_buff *skb)
+{
+	kfree_skb(skb);
+	return 0;
+}
+
+static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
+{
+	struct rtable *ort = *rp;
+	struct rtable *rt = (struct rtable *)
+		dst_alloc(&ipv4_dst_blackhole_ops);
+
+	if (rt) {
+		struct dst_entry *new = &rt->u.dst;
+
+		atomic_set(&new->__refcnt, 1);
+		new->__use = 1;
+		new->input = ipv4_blackhole_output;
+		new->output = ipv4_blackhole_output;
+		memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
+
+		new->dev = ort->u.dst.dev;
+		if (new->dev)
+			dev_hold(new->dev);
+
+		rt->fl = ort->fl;
+
+		rt->idev = ort->idev;
+		if (rt->idev)
+			in_dev_hold(rt->idev);
+		rt->rt_flags = ort->rt_flags;
+		rt->rt_type = ort->rt_type;
+		rt->rt_dst = ort->rt_dst;
+		rt->rt_src = ort->rt_src;
+		rt->rt_iif = ort->rt_iif;
+		rt->rt_gateway = ort->rt_gateway;
+		rt->rt_spec_dst = ort->rt_spec_dst;
+		rt->peer = ort->peer;
+		if (rt->peer)
+			atomic_inc(&rt->peer->refcnt);
+
+		dst_free(new);
+	}
+
+	dst_release(&(*rp)->u.dst);
+	*rp = rt;
+	return (rt ? 0 : -ENOMEM);
+}
+
 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
 {
 	int err;
···
 			flp->fl4_src = (*rp)->rt_src;
 		if (!flp->fl4_dst)
 			flp->fl4_dst = (*rp)->rt_dst;
-		return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
+		err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
+		if (err == -EREMOTE)
+			err = ipv4_dst_blackhole(rp, flp, sk);
+
+		return err;
 	}
 
 	return 0;
···
 	ipv4_dst_ops.kmem_cachep =
 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+
+	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
 
 	rt_hash_table = (struct rt_hash_bucket *)
 		alloc_large_system_hash("IP route cache",
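
The dst_free(new) at the end of the copy is what makes these routes
one-shot, as far as I can tell: with __refcnt already at 1 the entry
goes onto the dst garbage list and gets destroyed as soon as the
caller drops its reference, so a blackhole route never lingers in the
routing cache. The IPv6 version below plays the same trick.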
+6 -2
net/ipv6/datagram.c
···
 	if (final_p)
 		ipv6_addr_copy(&fl.fl6_dst, final_p);
 
-	if ((err = xfrm_lookup(&dst, &fl, sk, 1)) < 0)
-		goto out;
+	if ((err = __xfrm_lookup(&dst, &fl, sk, 1)) < 0) {
+		if (err == -EREMOTE)
+			err = ip6_dst_blackhole(sk, &dst, &fl);
+		if (err < 0)
+			goto out;
+	}
 
 	/* source address lookup done in ip6_dst_lookup */
 
+6 -2
net/ipv6/raw.c
···
 	if (final_p)
 		ipv6_addr_copy(&fl.fl6_dst, final_p);
 
-	if ((err = xfrm_lookup(&dst, &fl, sk, 1)) < 0)
-		goto out;
+	if ((err = __xfrm_lookup(&dst, &fl, sk, 1)) < 0) {
+		if (err == -EREMOTE)
+			err = ip6_dst_blackhole(sk, &dst, &fl);
+		if (err < 0)
+			goto out;
+	}
 
 	if (hlimit < 0) {
 		if (ipv6_addr_is_multicast(&fl.fl6_dst))
+63
net/ipv6/route.c
···
 	.entry_size	= sizeof(struct rt6_info),
 };
 
+static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
+{
+}
+
+static struct dst_ops ip6_dst_blackhole_ops = {
+	.family		= AF_INET6,
+	.protocol	= __constant_htons(ETH_P_IPV6),
+	.destroy	= ip6_dst_destroy,
+	.check		= ip6_dst_check,
+	.update_pmtu	= ip6_rt_blackhole_update_pmtu,
+	.entry_size	= sizeof(struct rt6_info),
+};
+
 struct rt6_info ip6_null_entry = {
 	.u = {
 		.dst = {
···
 }
 
 EXPORT_SYMBOL(ip6_route_output);
+
+static int ip6_blackhole_output(struct sk_buff *skb)
+{
+	kfree_skb(skb);
+	return 0;
+}
+
+int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl)
+{
+	struct rt6_info *ort = (struct rt6_info *) *dstp;
+	struct rt6_info *rt = (struct rt6_info *)
+		dst_alloc(&ip6_dst_blackhole_ops);
+	struct dst_entry *new = NULL;
+
+	if (rt) {
+		new = &rt->u.dst;
+
+		atomic_set(&new->__refcnt, 1);
+		new->__use = 1;
+		new->input = ip6_blackhole_output;
+		new->output = ip6_blackhole_output;
+
+		memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
+		new->dev = ort->u.dst.dev;
+		if (new->dev)
+			dev_hold(new->dev);
+		rt->rt6i_idev = ort->rt6i_idev;
+		if (rt->rt6i_idev)
+			in6_dev_hold(rt->rt6i_idev);
+		rt->rt6i_expires = 0;
+
+		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
+		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
+		rt->rt6i_metric = 0;
+
+		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
+#ifdef CONFIG_IPV6_SUBTREES
+		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
+#endif
+
+		dst_free(new);
+	}
+
+	dst_release(*dstp);
+	*dstp = new;
+	return (new ? 0 : -ENOMEM);
+}
+EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
 
 /*
  *	Destination cache support functions
···
 	ip6_dst_ops.kmem_cachep =
 		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops.kmem_cachep;
+
 	fib6_init();
 #ifdef CONFIG_PROC_FS
 	p = proc_net_create("ipv6_route", 0, rt6_proc_info);
+6 -2
net/ipv6/tcp_ipv6.c
···
 	if (final_p)
 		ipv6_addr_copy(&fl.fl6_dst, final_p);
 
-	if ((err = xfrm_lookup(&dst, &fl, sk, 1)) < 0)
-		goto failure;
+	if ((err = __xfrm_lookup(&dst, &fl, sk, 1)) < 0) {
+		if (err == -EREMOTE)
+			err = ip6_dst_blackhole(sk, &dst, &fl);
+		if (err < 0)
+			goto failure;
+	}
 
 	if (saddr == NULL) {
 		saddr = &fl.fl6_src;
+6 -2
net/ipv6/udp.c
···
 	if (final_p)
 		ipv6_addr_copy(&fl.fl6_dst, final_p);
 
-	if ((err = xfrm_lookup(&dst, &fl, sk, 1)) < 0)
-		goto out;
+	if ((err = __xfrm_lookup(&dst, &fl, sk, 1)) < 0) {
+		if (err == -EREMOTE)
+			err = ip6_dst_blackhole(sk, &dst, &fl);
+		if (err < 0)
+			goto out;
+	}
 
 	if (hlimit < 0) {
 		if (ipv6_addr_is_multicast(&fl.fl6_dst))
+26 -2
net/xfrm/xfrm_policy.c
···
 
 #include "xfrm_hash.h"
 
+int sysctl_xfrm_larval_drop;
+
 DEFINE_MUTEX(xfrm_cfg_mutex);
 EXPORT_SYMBOL(xfrm_cfg_mutex);
 
···
  * At the moment we eat a raw IP route. Mostly to speed up lookups
  * on interfaces with disabled IPsec.
  */
-int xfrm_lookup(struct dst_entry **dst_p, struct flowi *fl,
-		struct sock *sk, int flags)
+int __xfrm_lookup(struct dst_entry **dst_p, struct flowi *fl,
+		  struct sock *sk, int flags)
 {
 	struct xfrm_policy *policy;
 	struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
···
 
 	if (unlikely(nx<0)) {
 		err = nx;
+		if (err == -EAGAIN && sysctl_xfrm_larval_drop) {
+			/* EREMOTE tells the caller to generate
+			 * a one-shot blackhole route.
+			 */
+			xfrm_pol_put(policy);
+			return -EREMOTE;
+		}
 		if (err == -EAGAIN && flags) {
 			DECLARE_WAITQUEUE(wait, current);
···
 	dst_release(dst_orig);
 	xfrm_pols_put(pols, npols);
 	*dst_p = NULL;
+	return err;
+}
+EXPORT_SYMBOL(__xfrm_lookup);
+
+int xfrm_lookup(struct dst_entry **dst_p, struct flowi *fl,
+		struct sock *sk, int flags)
+{
+	int err = __xfrm_lookup(dst_p, fl, sk, flags);
+
+	if (err == -EREMOTE) {
+		dst_release(*dst_p);
+		*dst_p = NULL;
+		err = -EAGAIN;
+	}
+
 	return err;
 }
 EXPORT_SYMBOL(xfrm_lookup);
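
Note that plain xfrm_lookup() is now just a wrapper that maps
-EREMOTE back into -EAGAIN and releases the route, so any caller
that hasn't been converted to __xfrm_lookup() sees exactly the old
blocking behavior.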