[XFRM]: Allow packet drops during larval state resolution.

The current IPSEC rule resolution behavior does not work for a lot
of people, even though it is technically an improvement over the
-EAGAIN business we had before.

Right now we block until the key manager resolves the route. That
works for simple cases, but many folks would rather have packets
silently dropped until the key manager has resolved the IPSEC rules.

We can't tell these folks to "set the socket non-blocking" because
they have no control over the non-blocking setting of things like
the sockets used for DNS resolution deep inside the resolver
libraries in libc.

With that in mind, I coded up the patch below, with some help from
Herbert Xu, which provides packet-drop behavior during larval state
resolution, controllable via sysctl and off by default.
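
Since the knob is registered in core_table, it shows up as
net.core.xfrm_larval_drop (/proc/sys/net/core/xfrm_larval_drop);
write a 1 there to turn the drop behavior on. With it enabled,
every converted call site follows the same shape; in sketch form,
in its IPv6 flavor (this just mirrors the hunks below):

        err = __xfrm_lookup(&dst, &fl, sk, 1);
        if (err < 0) {
                /* -EREMOTE means "larval state, install a one-shot
                 * blackhole route" instead of sleeping on the key
                 * manager; any other negative value is a real error.
                 */
                if (err == -EREMOTE)
                        err = ip6_dst_blackhole(sk, &dst, &fl);
                if (err < 0)
                        goto out;
        }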

This lays the framework to either:

1) Make this the default at some point, or...

2) Move this logic into xfrm{4,6}_policy.c and implement the
ARP-like resolution queue we've all been dreaming of.
The idea would be to queue packets to the policy, then,
once the larval state is resolved by the key manager,
re-resolve the route and push the packets out. The
packets would time out if the rule didn't get resolved
within a certain amount of time; a rough sketch of that
idea follows.
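
To pin the idea down, a purely hypothetical sketch of that
per-policy queue; none of these names exist in the tree, it is
only meant to illustrate the shape of option 2:

        /* Hypothetical: packets parked on a policy whose state is larval. */
        struct xfrm_larval_queue {
                struct sk_buff_head     pkts;   /* queued packets */
                struct timer_list       timer;  /* give up if never resolved */
        };

        /* Timer handler: the rule never resolved in time, drop the queue. */
        static void xfrm_larval_queue_timeout(unsigned long data)
        {
                struct xfrm_larval_queue *q = (struct xfrm_larval_queue *) data;

                skb_queue_purge(&q->pkts);
        }

        /* Called once the key manager resolves the state: re-resolve the
         * route (elided here) and push the queued packets back out.
         */
        static void xfrm_larval_queue_flush(struct xfrm_larval_queue *q)
        {
                struct sk_buff *skb;

                while ((skb = skb_dequeue(&q->pkts)) != NULL)
                        dst_output(skb);
        }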

Signed-off-by: David S. Miller <davem@davemloft.net>

 11 files changed, 209 insertions(+), 14 deletions(-)

include/net/dst.h | +7

···
 {
         return 0;
 }
+static inline int __xfrm_lookup(struct dst_entry **dst_p, struct flowi *fl,
+                                struct sock *sk, int flags)
+{
+        return 0;
+}
 #else
 extern int xfrm_lookup(struct dst_entry **dst_p, struct flowi *fl,
                        struct sock *sk, int flags);
+extern int __xfrm_lookup(struct dst_entry **dst_p, struct flowi *fl,
+                         struct sock *sk, int flags);
 #endif
 #endif

include/net/ipv6.h | +3

···
 extern int ip6_dst_lookup(struct sock *sk,
                           struct dst_entry **dst,
                           struct flowi *fl);
+extern int ip6_dst_blackhole(struct sock *sk,
+                             struct dst_entry **dst,
+                             struct flowi *fl);
 extern int ip6_sk_dst_lookup(struct sock *sk,
                              struct dst_entry **dst,
                              struct flowi *fl);

net/core/sysctl_net_core.c | +9

···
 #ifdef CONFIG_XFRM
 extern u32 sysctl_xfrm_aevent_etime;
 extern u32 sysctl_xfrm_aevent_rseqth;
+extern int sysctl_xfrm_larval_drop;
 #endif
 
 ctl_table core_table[] = {
···
                 .procname       = "xfrm_aevent_rseqth",
                 .data           = &sysctl_xfrm_aevent_rseqth,
                 .maxlen         = sizeof(u32),
+                .mode           = 0644,
+                .proc_handler   = &proc_dointvec
+        },
+        {
+                .ctl_name       = CTL_UNNUMBERED,
+                .procname       = "xfrm_larval_drop",
+                .data           = &sysctl_xfrm_larval_drop,
+                .maxlen         = sizeof(int),
                 .mode           = 0644,
                 .proc_handler   = &proc_dointvec
         },

net/dccp/ipv6.c | +7 -3

···
         if (final_p)
                 ipv6_addr_copy(&fl.fl6_dst, final_p);
 
-        err = xfrm_lookup(&dst, &fl, sk, 1);
-        if (err < 0)
-                goto failure;
+        err = __xfrm_lookup(&dst, &fl, sk, 1);
+        if (err < 0) {
+                if (err == -EREMOTE)
+                        err = ip6_dst_blackhole(sk, &dst, &fl);
+                if (err < 0)
+                        goto failure;
+        }
 
         if (saddr == NULL) {
                 saddr = &fl.fl6_src;

net/ipv4/route.c | +70 -1

···
 
 EXPORT_SYMBOL_GPL(__ip_route_output_key);
 
+static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
+{
+}
+
+static struct dst_ops ipv4_dst_blackhole_ops = {
+        .family         = AF_INET,
+        .protocol       = __constant_htons(ETH_P_IP),
+        .destroy        = ipv4_dst_destroy,
+        .check          = ipv4_dst_check,
+        .update_pmtu    = ipv4_rt_blackhole_update_pmtu,
+        .entry_size     = sizeof(struct rtable),
+};
+
+
+static int ipv4_blackhole_output(struct sk_buff *skb)
+{
+        kfree_skb(skb);
+        return 0;
+}
+
+static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
+{
+        struct rtable *ort = *rp;
+        struct rtable *rt = (struct rtable *)
+                dst_alloc(&ipv4_dst_blackhole_ops);
+
+        if (rt) {
+                struct dst_entry *new = &rt->u.dst;
+
+                atomic_set(&new->__refcnt, 1);
+                new->__use = 1;
+                new->input = ipv4_blackhole_output;
+                new->output = ipv4_blackhole_output;
+                memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
+
+                new->dev = ort->u.dst.dev;
+                if (new->dev)
+                        dev_hold(new->dev);
+
+                rt->fl = ort->fl;
+
+                rt->idev = ort->idev;
+                if (rt->idev)
+                        in_dev_hold(rt->idev);
+                rt->rt_flags = ort->rt_flags;
+                rt->rt_type = ort->rt_type;
+                rt->rt_dst = ort->rt_dst;
+                rt->rt_src = ort->rt_src;
+                rt->rt_iif = ort->rt_iif;
+                rt->rt_gateway = ort->rt_gateway;
+                rt->rt_spec_dst = ort->rt_spec_dst;
+                rt->peer = ort->peer;
+                if (rt->peer)
+                        atomic_inc(&rt->peer->refcnt);
+
+                dst_free(new);
+        }
+
+        dst_release(&(*rp)->u.dst);
+        *rp = rt;
+        return (rt ? 0 : -ENOMEM);
+}
+
 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
 {
         int err;
···
                         flp->fl4_src = (*rp)->rt_src;
                 if (!flp->fl4_dst)
                         flp->fl4_dst = (*rp)->rt_dst;
-                return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
+                err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
+                if (err == -EREMOTE)
+                        err = ipv4_dst_blackhole(rp, flp, sk);
+
+                return err;
         }
 
         return 0;
···
         ipv4_dst_ops.kmem_cachep =
                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+
+        ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
 
         rt_hash_table = (struct rt_hash_bucket *)
                 alloc_large_system_hash("IP route cache",

net/ipv6/datagram.c | +6 -2

···
         if (final_p)
                 ipv6_addr_copy(&fl.fl6_dst, final_p);
 
-        if ((err = xfrm_lookup(&dst, &fl, sk, 1)) < 0)
-                goto out;
+        if ((err = __xfrm_lookup(&dst, &fl, sk, 1)) < 0) {
+                if (err == -EREMOTE)
+                        err = ip6_dst_blackhole(sk, &dst, &fl);
+                if (err < 0)
+                        goto out;
+        }
 
         /* source address lookup done in ip6_dst_lookup */
 

net/ipv6/raw.c | +6 -2

···
         if (final_p)
                 ipv6_addr_copy(&fl.fl6_dst, final_p);
 
-        if ((err = xfrm_lookup(&dst, &fl, sk, 1)) < 0)
-                goto out;
+        if ((err = __xfrm_lookup(&dst, &fl, sk, 1)) < 0) {
+                if (err == -EREMOTE)
+                        err = ip6_dst_blackhole(sk, &dst, &fl);
+                if (err < 0)
+                        goto out;
+        }
 
         if (hlimit < 0) {
                 if (ipv6_addr_is_multicast(&fl.fl6_dst))

net/ipv6/route.c | +63

···
         .entry_size     = sizeof(struct rt6_info),
 };
 
+static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
+{
+}
+
+static struct dst_ops ip6_dst_blackhole_ops = {
+        .family         = AF_INET6,
+        .protocol       = __constant_htons(ETH_P_IPV6),
+        .destroy        = ip6_dst_destroy,
+        .check          = ip6_dst_check,
+        .update_pmtu    = ip6_rt_blackhole_update_pmtu,
+        .entry_size     = sizeof(struct rt6_info),
+};
+
 struct rt6_info ip6_null_entry = {
         .u = {
                 .dst = {
···
 }
 
 EXPORT_SYMBOL(ip6_route_output);
+
+static int ip6_blackhole_output(struct sk_buff *skb)
+{
+        kfree_skb(skb);
+        return 0;
+}
+
+int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl)
+{
+        struct rt6_info *ort = (struct rt6_info *) *dstp;
+        struct rt6_info *rt = (struct rt6_info *)
+                dst_alloc(&ip6_dst_blackhole_ops);
+        struct dst_entry *new = NULL;
+
+        if (rt) {
+                new = &rt->u.dst;
+
+                atomic_set(&new->__refcnt, 1);
+                new->__use = 1;
+                new->input = ip6_blackhole_output;
+                new->output = ip6_blackhole_output;
+
+                memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
+                new->dev = ort->u.dst.dev;
+                if (new->dev)
+                        dev_hold(new->dev);
+                rt->rt6i_idev = ort->rt6i_idev;
+                if (rt->rt6i_idev)
+                        in6_dev_hold(rt->rt6i_idev);
+                rt->rt6i_expires = 0;
+
+                ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
+                rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
+                rt->rt6i_metric = 0;
+
+                memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
+#ifdef CONFIG_IPV6_SUBTREES
+                memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
+#endif
+
+                dst_free(new);
+        }
+
+        dst_release(*dstp);
+        *dstp = new;
+        return (new ? 0 : -ENOMEM);
+}
+EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
 
 /*
  * Destination cache support functions
···
         ip6_dst_ops.kmem_cachep =
                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+        ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops.kmem_cachep;
+
         fib6_init();
 #ifdef CONFIG_PROC_FS
         p = proc_net_create("ipv6_route", 0, rt6_proc_info);

net/ipv6/tcp_ipv6.c | +6 -2

···
         if (final_p)
                 ipv6_addr_copy(&fl.fl6_dst, final_p);
 
-        if ((err = xfrm_lookup(&dst, &fl, sk, 1)) < 0)
-                goto failure;
+        if ((err = __xfrm_lookup(&dst, &fl, sk, 1)) < 0) {
+                if (err == -EREMOTE)
+                        err = ip6_dst_blackhole(sk, &dst, &fl);
+                if (err < 0)
+                        goto failure;
+        }
 
         if (saddr == NULL) {
                 saddr = &fl.fl6_src;

net/ipv6/udp.c | +6 -2

···
         if (final_p)
                 ipv6_addr_copy(&fl.fl6_dst, final_p);
 
-        if ((err = xfrm_lookup(&dst, &fl, sk, 1)) < 0)
-                goto out;
+        if ((err = __xfrm_lookup(&dst, &fl, sk, 1)) < 0) {
+                if (err == -EREMOTE)
+                        err = ip6_dst_blackhole(sk, &dst, &fl);
+                if (err < 0)
+                        goto out;
+        }
 
         if (hlimit < 0) {
                 if (ipv6_addr_is_multicast(&fl.fl6_dst))

net/xfrm/xfrm_policy.c | +26 -2

···
 
 #include "xfrm_hash.h"
 
+int sysctl_xfrm_larval_drop;
+
 DEFINE_MUTEX(xfrm_cfg_mutex);
 EXPORT_SYMBOL(xfrm_cfg_mutex);
 
···
  * At the moment we eat a raw IP route. Mostly to speed up lookups
  * on interfaces with disabled IPsec.
  */
-int xfrm_lookup(struct dst_entry **dst_p, struct flowi *fl,
-                struct sock *sk, int flags)
+int __xfrm_lookup(struct dst_entry **dst_p, struct flowi *fl,
+                  struct sock *sk, int flags)
 {
         struct xfrm_policy *policy;
         struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
···
 
         if (unlikely(nx<0)) {
                 err = nx;
+                if (err == -EAGAIN && sysctl_xfrm_larval_drop) {
+                        /* EREMOTE tells the caller to generate
+                         * a one-shot blackhole route.
+                         */
+                        xfrm_pol_put(policy);
+                        return -EREMOTE;
+                }
                 if (err == -EAGAIN && flags) {
                         DECLARE_WAITQUEUE(wait, current);
 
···
         dst_release(dst_orig);
         xfrm_pols_put(pols, npols);
         *dst_p = NULL;
+        return err;
+}
+EXPORT_SYMBOL(__xfrm_lookup);
+
+int xfrm_lookup(struct dst_entry **dst_p, struct flowi *fl,
+                struct sock *sk, int flags)
+{
+        int err = __xfrm_lookup(dst_p, fl, sk, flags);
+
+        if (err == -EREMOTE) {
+                dst_release(*dst_p);
+                *dst_p = NULL;
+                err = -EAGAIN;
+        }
+
         return err;
 }
 EXPORT_SYMBOL(xfrm_lookup);