Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

xfrm: support sending NAT keepalives in ESP in UDP states

Add the ability to send out RFC-3948 NAT keepalives from the xfrm stack.

To use this, userspace sets an XFRM_NAT_KEEPALIVE_INTERVAL integer property when
creating XFRM outbound states, which denotes the number of seconds between
keepalive messages.

Keepalive messages are sent from a per-net delayed work which iterates over
the xfrm states. The logic is guarded by the xfrm state spinlock due to the
xfrm state walk iterator.

Possible future enhancements:

- Adding counters to keep track of sent keepalives.
- Deduplicating NAT keepalives between states sharing the same NAT keepalive
  parameters.
- Provisioning hardware offloads for devices capable of implementing this.
- Revising the xfrm state list to use an RCU list in order to avoid running
  this under spinlock.

Suggested-by: Paul Wouters <paul.wouters@aiven.io>
Tested-by: Paul Wouters <paul.wouters@aiven.io>
Tested-by: Antony Antony <antony.antony@secunet.com>
Signed-off-by: Eyal Birger <eyal.birger@gmail.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>

authored by

Eyal Birger and committed by
Steffen Klassert
f531d13b 5233a55a

+361 -3
+3
include/net/ipv6_stubs.h
··· 9 9 #include <net/flow.h> 10 10 #include <net/neighbour.h> 11 11 #include <net/sock.h> 12 + #include <net/ipv6.h> 12 13 13 14 /* structs from net/ip6_fib.h */ 14 15 struct fib6_info; ··· 73 72 int (*output)(struct net *, struct sock *, struct sk_buff *)); 74 73 struct net_device *(*ipv6_dev_find)(struct net *net, const struct in6_addr *addr, 75 74 struct net_device *dev); 75 + int (*ip6_xmit)(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, 76 + __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority); 76 77 }; 77 78 extern const struct ipv6_stub *ipv6_stub __read_mostly; 78 79
+1
include/net/netns/xfrm.h
··· 83 83 84 84 spinlock_t xfrm_policy_lock; 85 85 struct mutex xfrm_cfg_mutex; 86 + struct delayed_work nat_keepalive_work; 86 87 }; 87 88 88 89 #endif
+10
include/net/xfrm.h
··· 229 229 struct xfrm_encap_tmpl *encap; 230 230 struct sock __rcu *encap_sk; 231 231 232 + /* NAT keepalive */ 233 + u32 nat_keepalive_interval; /* seconds */ 234 + time64_t nat_keepalive_expiration; 235 + 232 236 /* Data for care-of address */ 233 237 xfrm_address_t *coaddr; 234 238 ··· 2206 2202 return 0; 2207 2203 } 2208 2204 #endif 2205 + 2206 + int xfrm_nat_keepalive_init(unsigned short family); 2207 + void xfrm_nat_keepalive_fini(unsigned short family); 2208 + int xfrm_nat_keepalive_net_init(struct net *net); 2209 + int xfrm_nat_keepalive_net_fini(struct net *net); 2210 + void xfrm_nat_keepalive_state_updated(struct xfrm_state *x); 2209 2211 2210 2212 #endif /* _NET_XFRM_H */
+1
include/uapi/linux/xfrm.h
··· 321 321 XFRMA_IF_ID, /* __u32 */ 322 322 XFRMA_MTIMER_THRESH, /* __u32 in seconds for input SA */ 323 323 XFRMA_SA_DIR, /* __u8 */ 324 + XFRMA_NAT_KEEPALIVE_INTERVAL, /* __u32 in seconds for NAT keepalive */ 324 325 __XFRMA_MAX 325 326 326 327 #define XFRMA_OUTPUT_MARK XFRMA_SET_MARK /* Compatibility */
+1
net/ipv6/af_inet6.c
··· 1060 1060 .nd_tbl = &nd_tbl, 1061 1061 .ipv6_fragment = ip6_fragment, 1062 1062 .ipv6_dev_find = ipv6_dev_find, 1063 + .ip6_xmit = ip6_xmit, 1063 1064 }; 1064 1065 1065 1066 static const struct ipv6_bpf_stub ipv6_bpf_stub_impl = {
+7
net/ipv6/xfrm6_policy.c
··· 284 284 ret = register_pernet_subsys(&xfrm6_net_ops); 285 285 if (ret) 286 286 goto out_protocol; 287 + 288 + ret = xfrm_nat_keepalive_init(AF_INET6); 289 + if (ret) 290 + goto out_nat_keepalive; 287 291 out: 288 292 return ret; 293 + out_nat_keepalive: 294 + unregister_pernet_subsys(&xfrm6_net_ops); 289 295 out_protocol: 290 296 xfrm6_protocol_fini(); 291 297 out_state: ··· 303 297 304 298 void xfrm6_fini(void) 305 299 { 300 + xfrm_nat_keepalive_fini(AF_INET6); 306 301 unregister_pernet_subsys(&xfrm6_net_ops); 307 302 xfrm6_protocol_fini(); 308 303 xfrm6_policy_fini();
+2 -1
net/xfrm/Makefile
··· 13 13 14 14 obj-$(CONFIG_XFRM) := xfrm_policy.o xfrm_state.o xfrm_hash.o \ 15 15 xfrm_input.o xfrm_output.o \ 16 - xfrm_sysctl.o xfrm_replay.o xfrm_device.o 16 + xfrm_sysctl.o xfrm_replay.o xfrm_device.o \ 17 + xfrm_nat_keepalive.o 17 18 obj-$(CONFIG_XFRM_STATISTICS) += xfrm_proc.o 18 19 obj-$(CONFIG_XFRM_ALGO) += xfrm_algo.o 19 20 obj-$(CONFIG_XFRM_USER) += xfrm_user.o
+4 -2
net/xfrm/xfrm_compat.c
··· 131 131 [XFRMA_IF_ID] = { .type = NLA_U32 }, 132 132 [XFRMA_MTIMER_THRESH] = { .type = NLA_U32 }, 133 133 [XFRMA_SA_DIR] = NLA_POLICY_RANGE(NLA_U8, XFRM_SA_DIR_IN, XFRM_SA_DIR_OUT), 134 + [XFRMA_NAT_KEEPALIVE_INTERVAL] = { .type = NLA_U32 }, 134 135 }; 135 136 136 137 static struct nlmsghdr *xfrm_nlmsg_put_compat(struct sk_buff *skb, ··· 281 280 case XFRMA_IF_ID: 282 281 case XFRMA_MTIMER_THRESH: 283 282 case XFRMA_SA_DIR: 283 + case XFRMA_NAT_KEEPALIVE_INTERVAL: 284 284 return xfrm_nla_cpy(dst, src, nla_len(src)); 285 285 default: 286 - BUILD_BUG_ON(XFRMA_MAX != XFRMA_SA_DIR); 286 + BUILD_BUG_ON(XFRMA_MAX != XFRMA_NAT_KEEPALIVE_INTERVAL); 287 287 pr_warn_once("unsupported nla_type %d\n", src->nla_type); 288 288 return -EOPNOTSUPP; 289 289 } ··· 439 437 int err; 440 438 441 439 if (type > XFRMA_MAX) { 442 - BUILD_BUG_ON(XFRMA_MAX != XFRMA_SA_DIR); 440 + BUILD_BUG_ON(XFRMA_MAX != XFRMA_NAT_KEEPALIVE_INTERVAL); 443 441 NL_SET_ERR_MSG(extack, "Bad attribute"); 444 442 return -EOPNOTSUPP; 445 443 }
+292
net/xfrm/xfrm_nat_keepalive.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * xfrm_nat_keepalive.c 4 + * 5 + * (c) 2024 Eyal Birger <eyal.birger@gmail.com> 6 + */ 7 + 8 + #include <net/inet_common.h> 9 + #include <net/ip6_checksum.h> 10 + #include <net/xfrm.h> 11 + 12 + static DEFINE_PER_CPU(struct sock *, nat_keepalive_sk_ipv4); 13 + #if IS_ENABLED(CONFIG_IPV6) 14 + static DEFINE_PER_CPU(struct sock *, nat_keepalive_sk_ipv6); 15 + #endif 16 + 17 + struct nat_keepalive { 18 + struct net *net; 19 + u16 family; 20 + xfrm_address_t saddr; 21 + xfrm_address_t daddr; 22 + __be16 encap_sport; 23 + __be16 encap_dport; 24 + __u32 smark; 25 + }; 26 + 27 + static void nat_keepalive_init(struct nat_keepalive *ka, struct xfrm_state *x) 28 + { 29 + ka->net = xs_net(x); 30 + ka->family = x->props.family; 31 + ka->saddr = x->props.saddr; 32 + ka->daddr = x->id.daddr; 33 + ka->encap_sport = x->encap->encap_sport; 34 + ka->encap_dport = x->encap->encap_dport; 35 + ka->smark = xfrm_smark_get(0, x); 36 + } 37 + 38 + static int nat_keepalive_send_ipv4(struct sk_buff *skb, 39 + struct nat_keepalive *ka) 40 + { 41 + struct net *net = ka->net; 42 + struct flowi4 fl4; 43 + struct rtable *rt; 44 + struct sock *sk; 45 + __u8 tos = 0; 46 + int err; 47 + 48 + flowi4_init_output(&fl4, 0 /* oif */, skb->mark, tos, 49 + RT_SCOPE_UNIVERSE, IPPROTO_UDP, 0, 50 + ka->daddr.a4, ka->saddr.a4, ka->encap_dport, 51 + ka->encap_sport, sock_net_uid(net, NULL)); 52 + 53 + rt = ip_route_output_key(net, &fl4); 54 + if (IS_ERR(rt)) 55 + return PTR_ERR(rt); 56 + 57 + skb_dst_set(skb, &rt->dst); 58 + 59 + sk = *this_cpu_ptr(&nat_keepalive_sk_ipv4); 60 + sock_net_set(sk, net); 61 + err = ip_build_and_send_pkt(skb, sk, fl4.saddr, fl4.daddr, NULL, tos); 62 + sock_net_set(sk, &init_net); 63 + return err; 64 + } 65 + 66 + #if IS_ENABLED(CONFIG_IPV6) 67 + static int nat_keepalive_send_ipv6(struct sk_buff *skb, 68 + struct nat_keepalive *ka, 69 + struct udphdr *uh) 70 + { 71 + struct net *net = ka->net; 72 + struct dst_entry *dst; 73 
+ struct flowi6 fl6; 74 + struct sock *sk; 75 + __wsum csum; 76 + int err; 77 + 78 + csum = skb_checksum(skb, 0, skb->len, 0); 79 + uh->check = csum_ipv6_magic(&ka->saddr.in6, &ka->daddr.in6, 80 + skb->len, IPPROTO_UDP, csum); 81 + if (uh->check == 0) 82 + uh->check = CSUM_MANGLED_0; 83 + 84 + memset(&fl6, 0, sizeof(fl6)); 85 + fl6.flowi6_mark = skb->mark; 86 + fl6.saddr = ka->saddr.in6; 87 + fl6.daddr = ka->daddr.in6; 88 + fl6.flowi6_proto = IPPROTO_UDP; 89 + fl6.fl6_sport = ka->encap_sport; 90 + fl6.fl6_dport = ka->encap_dport; 91 + 92 + sk = *this_cpu_ptr(&nat_keepalive_sk_ipv6); 93 + sock_net_set(sk, net); 94 + dst = ipv6_stub->ipv6_dst_lookup_flow(net, sk, &fl6, NULL); 95 + if (IS_ERR(dst)) 96 + return PTR_ERR(dst); 97 + 98 + skb_dst_set(skb, dst); 99 + err = ipv6_stub->ip6_xmit(sk, skb, &fl6, skb->mark, NULL, 0, 0); 100 + sock_net_set(sk, &init_net); 101 + return err; 102 + } 103 + #endif 104 + 105 + static void nat_keepalive_send(struct nat_keepalive *ka) 106 + { 107 + const int nat_ka_hdrs_len = max(sizeof(struct iphdr), 108 + sizeof(struct ipv6hdr)) + 109 + sizeof(struct udphdr); 110 + const u8 nat_ka_payload = 0xFF; 111 + int err = -EAFNOSUPPORT; 112 + struct sk_buff *skb; 113 + struct udphdr *uh; 114 + 115 + skb = alloc_skb(nat_ka_hdrs_len + sizeof(nat_ka_payload), GFP_ATOMIC); 116 + if (unlikely(!skb)) 117 + return; 118 + 119 + skb_reserve(skb, nat_ka_hdrs_len); 120 + 121 + skb_put_u8(skb, nat_ka_payload); 122 + 123 + uh = skb_push(skb, sizeof(*uh)); 124 + uh->source = ka->encap_sport; 125 + uh->dest = ka->encap_dport; 126 + uh->len = htons(skb->len); 127 + uh->check = 0; 128 + 129 + skb->mark = ka->smark; 130 + 131 + switch (ka->family) { 132 + case AF_INET: 133 + err = nat_keepalive_send_ipv4(skb, ka); 134 + break; 135 + #if IS_ENABLED(CONFIG_IPV6) 136 + case AF_INET6: 137 + err = nat_keepalive_send_ipv6(skb, ka, uh); 138 + break; 139 + #endif 140 + } 141 + if (err) 142 + kfree_skb(skb); 143 + } 144 + 145 + struct nat_keepalive_work_ctx { 146 + 
time64_t next_run; 147 + time64_t now; 148 + }; 149 + 150 + static int nat_keepalive_work_single(struct xfrm_state *x, int count, void *ptr) 151 + { 152 + struct nat_keepalive_work_ctx *ctx = ptr; 153 + bool send_keepalive = false; 154 + struct nat_keepalive ka; 155 + time64_t next_run; 156 + u32 interval; 157 + int delta; 158 + 159 + interval = x->nat_keepalive_interval; 160 + if (!interval) 161 + return 0; 162 + 163 + spin_lock(&x->lock); 164 + 165 + delta = (int)(ctx->now - x->lastused); 166 + if (delta < interval) { 167 + x->nat_keepalive_expiration = ctx->now + interval - delta; 168 + next_run = x->nat_keepalive_expiration; 169 + } else if (x->nat_keepalive_expiration > ctx->now) { 170 + next_run = x->nat_keepalive_expiration; 171 + } else { 172 + next_run = ctx->now + interval; 173 + nat_keepalive_init(&ka, x); 174 + send_keepalive = true; 175 + } 176 + 177 + spin_unlock(&x->lock); 178 + 179 + if (send_keepalive) 180 + nat_keepalive_send(&ka); 181 + 182 + if (!ctx->next_run || next_run < ctx->next_run) 183 + ctx->next_run = next_run; 184 + return 0; 185 + } 186 + 187 + static void nat_keepalive_work(struct work_struct *work) 188 + { 189 + struct nat_keepalive_work_ctx ctx; 190 + struct xfrm_state_walk walk; 191 + struct net *net; 192 + 193 + ctx.next_run = 0; 194 + ctx.now = ktime_get_real_seconds(); 195 + 196 + net = container_of(work, struct net, xfrm.nat_keepalive_work.work); 197 + xfrm_state_walk_init(&walk, IPPROTO_ESP, NULL); 198 + xfrm_state_walk(net, &walk, nat_keepalive_work_single, &ctx); 199 + xfrm_state_walk_done(&walk, net); 200 + if (ctx.next_run) 201 + schedule_delayed_work(&net->xfrm.nat_keepalive_work, 202 + (ctx.next_run - ctx.now) * HZ); 203 + } 204 + 205 + static int nat_keepalive_sk_init(struct sock * __percpu *socks, 206 + unsigned short family) 207 + { 208 + struct sock *sk; 209 + int err, i; 210 + 211 + for_each_possible_cpu(i) { 212 + err = inet_ctl_sock_create(&sk, family, SOCK_RAW, IPPROTO_UDP, 213 + &init_net); 214 + if (err < 0) 
215 + goto err; 216 + 217 + *per_cpu_ptr(socks, i) = sk; 218 + } 219 + 220 + return 0; 221 + err: 222 + for_each_possible_cpu(i) 223 + inet_ctl_sock_destroy(*per_cpu_ptr(socks, i)); 224 + return err; 225 + } 226 + 227 + static void nat_keepalive_sk_fini(struct sock * __percpu *socks) 228 + { 229 + int i; 230 + 231 + for_each_possible_cpu(i) 232 + inet_ctl_sock_destroy(*per_cpu_ptr(socks, i)); 233 + } 234 + 235 + void xfrm_nat_keepalive_state_updated(struct xfrm_state *x) 236 + { 237 + struct net *net; 238 + 239 + if (!x->nat_keepalive_interval) 240 + return; 241 + 242 + net = xs_net(x); 243 + schedule_delayed_work(&net->xfrm.nat_keepalive_work, 0); 244 + } 245 + 246 + int __net_init xfrm_nat_keepalive_net_init(struct net *net) 247 + { 248 + INIT_DELAYED_WORK(&net->xfrm.nat_keepalive_work, nat_keepalive_work); 249 + return 0; 250 + } 251 + 252 + int xfrm_nat_keepalive_net_fini(struct net *net) 253 + { 254 + cancel_delayed_work_sync(&net->xfrm.nat_keepalive_work); 255 + return 0; 256 + } 257 + 258 + int xfrm_nat_keepalive_init(unsigned short family) 259 + { 260 + int err = -EAFNOSUPPORT; 261 + 262 + switch (family) { 263 + case AF_INET: 264 + err = nat_keepalive_sk_init(&nat_keepalive_sk_ipv4, PF_INET); 265 + break; 266 + #if IS_ENABLED(CONFIG_IPV6) 267 + case AF_INET6: 268 + err = nat_keepalive_sk_init(&nat_keepalive_sk_ipv6, PF_INET6); 269 + break; 270 + #endif 271 + } 272 + 273 + if (err) 274 + pr_err("xfrm nat keepalive init: failed to init err:%d\n", err); 275 + return err; 276 + } 277 + EXPORT_SYMBOL_GPL(xfrm_nat_keepalive_init); 278 + 279 + void xfrm_nat_keepalive_fini(unsigned short family) 280 + { 281 + switch (family) { 282 + case AF_INET: 283 + nat_keepalive_sk_fini(&nat_keepalive_sk_ipv4); 284 + break; 285 + #if IS_ENABLED(CONFIG_IPV6) 286 + case AF_INET6: 287 + nat_keepalive_sk_fini(&nat_keepalive_sk_ipv6); 288 + break; 289 + #endif 290 + } 291 + } 292 + EXPORT_SYMBOL_GPL(xfrm_nat_keepalive_fini);
+8
net/xfrm/xfrm_policy.c
··· 4289 4289 if (rv < 0) 4290 4290 goto out_sysctl; 4291 4291 4292 + rv = xfrm_nat_keepalive_net_init(net); 4293 + if (rv < 0) 4294 + goto out_nat_keepalive; 4295 + 4292 4296 return 0; 4293 4297 4298 + out_nat_keepalive: 4299 + xfrm_sysctl_fini(net); 4294 4300 out_sysctl: 4295 4301 xfrm_policy_fini(net); 4296 4302 out_policy: ··· 4309 4303 4310 4304 static void __net_exit xfrm_net_exit(struct net *net) 4311 4305 { 4306 + xfrm_nat_keepalive_net_fini(net); 4312 4307 xfrm_sysctl_fini(net); 4313 4308 xfrm_policy_fini(net); 4314 4309 xfrm_state_fini(net); ··· 4371 4364 #endif 4372 4365 4373 4366 register_xfrm_state_bpf(); 4367 + xfrm_nat_keepalive_init(AF_INET); 4374 4368 } 4375 4369 4376 4370 #ifdef CONFIG_AUDITSYSCALL
+17
net/xfrm/xfrm_state.c
··· 715 715 if (x->id.spi) 716 716 hlist_del_rcu(&x->byspi); 717 717 net->xfrm.state_num--; 718 + xfrm_nat_keepalive_state_updated(x); 718 719 spin_unlock(&net->xfrm.xfrm_state_lock); 719 720 720 721 if (x->encap_sk) ··· 1454 1453 net->xfrm.state_num++; 1455 1454 1456 1455 xfrm_hash_grow_check(net, x->bydst.next != NULL); 1456 + xfrm_nat_keepalive_state_updated(x); 1457 1457 } 1458 1458 1459 1459 /* net->xfrm.xfrm_state_lock is held */ ··· 2871 2869 err = xfrm_init_replay(x, extack); 2872 2870 if (err) 2873 2871 goto error; 2872 + } 2873 + 2874 + if (x->nat_keepalive_interval) { 2875 + if (x->dir != XFRM_SA_DIR_OUT) { 2876 + NL_SET_ERR_MSG(extack, "NAT keepalive is only supported for outbound SAs"); 2877 + err = -EINVAL; 2878 + goto error; 2879 + } 2880 + 2881 + if (!x->encap || x->encap->encap_type != UDP_ENCAP_ESPINUDP) { 2882 + NL_SET_ERR_MSG(extack, 2883 + "NAT keepalive is only supported for UDP encapsulation"); 2884 + err = -EINVAL; 2885 + goto error; 2886 + } 2874 2887 } 2875 2888 2876 2889 error:
+15
net/xfrm/xfrm_user.c
··· 833 833 if (attrs[XFRMA_SA_DIR]) 834 834 x->dir = nla_get_u8(attrs[XFRMA_SA_DIR]); 835 835 836 + if (attrs[XFRMA_NAT_KEEPALIVE_INTERVAL]) 837 + x->nat_keepalive_interval = 838 + nla_get_u32(attrs[XFRMA_NAT_KEEPALIVE_INTERVAL]); 839 + 836 840 err = __xfrm_init_state(x, false, attrs[XFRMA_OFFLOAD_DEV], extack); 837 841 if (err) 838 842 goto error; ··· 1292 1288 } 1293 1289 if (x->dir) 1294 1290 ret = nla_put_u8(skb, XFRMA_SA_DIR, x->dir); 1291 + 1292 + if (x->nat_keepalive_interval) { 1293 + ret = nla_put_u32(skb, XFRMA_NAT_KEEPALIVE_INTERVAL, 1294 + x->nat_keepalive_interval); 1295 + if (ret) 1296 + goto out; 1297 + } 1295 1298 out: 1296 1299 return ret; 1297 1300 } ··· 3176 3165 [XFRMA_IF_ID] = { .type = NLA_U32 }, 3177 3166 [XFRMA_MTIMER_THRESH] = { .type = NLA_U32 }, 3178 3167 [XFRMA_SA_DIR] = NLA_POLICY_RANGE(NLA_U8, XFRM_SA_DIR_IN, XFRM_SA_DIR_OUT), 3168 + [XFRMA_NAT_KEEPALIVE_INTERVAL] = { .type = NLA_U32 }, 3179 3169 }; 3180 3170 EXPORT_SYMBOL_GPL(xfrma_policy); 3181 3171 ··· 3485 3473 3486 3474 if (x->dir) 3487 3475 l += nla_total_size(sizeof(x->dir)); 3476 + 3477 + if (x->nat_keepalive_interval) 3478 + l += nla_total_size(sizeof(x->nat_keepalive_interval)); 3488 3479 3489 3480 return l; 3490 3481 }