Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bpf: Add socket assign support

Add support for TPROXY via a new bpf helper, bpf_sk_assign().

This helper requires the BPF program to discover the socket via a call
to bpf_sk*_lookup_*(), then pass this socket to the new helper. The
helper takes its own reference to the socket in addition to any existing
reference that may or may not currently be obtained for the duration of
BPF processing. For the destination socket to receive the traffic, the
traffic must be routed towards that socket via a local route. The
simplest example route is below, but in practice you may want to route
traffic more narrowly (e.g. by CIDR):

$ ip route add local default dev lo

This patch avoids trying to introduce an extra bit into the skb->sk, as
that would require more invasive changes to all code interacting with
the socket to ensure that the bit is handled correctly, such as all
error-handling cases along the path from the helper in BPF through to
the orphan path in the input. Instead, we opt to use the destructor
variable to switch on the prefetch of the socket.

Signed-off-by: Joe Stringer <joe@wand.net.nz>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20200329225342.16317-2-joe@wand.net.nz

Authored by Joe Stringer; committed by Alexei Starovoitov.
cf7fbe66 b49e42a2

+108 -4
+11
include/net/sock.h
··· 1659 1659 void sock_efree(struct sk_buff *skb); 1660 1660 #ifdef CONFIG_INET 1661 1661 void sock_edemux(struct sk_buff *skb); 1662 + void sock_pfree(struct sk_buff *skb); 1662 1663 #else 1663 1664 #define sock_edemux sock_efree 1664 1665 #endif ··· 2525 2524 void sock_net_set(struct sock *sk, struct net *net) 2526 2525 { 2527 2526 write_pnet(&sk->sk_net, net); 2527 + } 2528 + 2529 + static inline bool 2530 + skb_sk_is_prefetched(struct sk_buff *skb) 2531 + { 2532 + #ifdef CONFIG_INET 2533 + return skb->destructor == sock_pfree; 2534 + #else 2535 + return false; 2536 + #endif /* CONFIG_INET */ 2528 2537 } 2529 2538 2530 2539 static inline struct sock *skb_steal_sock(struct sk_buff *skb)
+24 -1
include/uapi/linux/bpf.h
··· 2983 2983 * **bpf_get_current_cgroup_id**\ (). 2984 2984 * Return 2985 2985 * The id is returned or 0 in case the id could not be retrieved. 2986 + * 2987 + * int bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags) 2988 + * Description 2989 + * Assign the *sk* to the *skb*. When combined with appropriate 2990 + * routing configuration to receive the packet towards the socket, 2991 + * will cause *skb* to be delivered to the specified socket. 2992 + * Subsequent redirection of *skb* via **bpf_redirect**\ (), 2993 + * **bpf_clone_redirect**\ () or other methods outside of BPF may 2994 + * interfere with successful delivery to the socket. 2995 + * 2996 + * This operation is only valid from TC ingress path. 2997 + * 2998 + * The *flags* argument must be zero. 2999 + * Return 3000 + * 0 on success, or a negative errno in case of failure. 3001 + * 3002 + * * **-EINVAL** Unsupported flags specified. 3003 + * * **-ENOENT** Socket is unavailable for assignment. 3004 + * * **-ENETUNREACH** Socket is unreachable (wrong netns). 3005 + * * **-EOPNOTSUPP** Unsupported operation, for example a 3006 + * call from outside of TC ingress. 3007 + * * **-ESOCKTNOSUPPORT** Socket type not supported (reuseport). 2986 3008 */ 2987 3009 #define __BPF_FUNC_MAPPER(FN) \ 2988 3010 FN(unspec), \ ··· 3130 3108 FN(get_ns_current_pid_tgid), \ 3131 3109 FN(xdp_output), \ 3132 3110 FN(get_netns_cookie), \ 3133 - FN(get_current_ancestor_cgroup_id), 3111 + FN(get_current_ancestor_cgroup_id), \ 3112 + FN(sk_assign), 3134 3113 3135 3114 /* integer value in 'imm' field of BPF_CALL instruction selects which helper 3136 3115 * function eBPF program intends to call
+31
net/core/filter.c
··· 5918 5918 .arg5_type = ARG_CONST_SIZE, 5919 5919 }; 5920 5920 5921 + BPF_CALL_3(bpf_sk_assign, struct sk_buff *, skb, struct sock *, sk, u64, flags) 5922 + { 5923 + if (flags != 0) 5924 + return -EINVAL; 5925 + if (!skb_at_tc_ingress(skb)) 5926 + return -EOPNOTSUPP; 5927 + if (unlikely(dev_net(skb->dev) != sock_net(sk))) 5928 + return -ENETUNREACH; 5929 + if (unlikely(sk->sk_reuseport)) 5930 + return -ESOCKTNOSUPPORT; 5931 + if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) 5932 + return -ENOENT; 5933 + 5934 + skb_orphan(skb); 5935 + skb->sk = sk; 5936 + skb->destructor = sock_pfree; 5937 + 5938 + return 0; 5939 + } 5940 + 5941 + static const struct bpf_func_proto bpf_sk_assign_proto = { 5942 + .func = bpf_sk_assign, 5943 + .gpl_only = false, 5944 + .ret_type = RET_INTEGER, 5945 + .arg1_type = ARG_PTR_TO_CTX, 5946 + .arg2_type = ARG_PTR_TO_SOCK_COMMON, 5947 + .arg3_type = ARG_ANYTHING, 5948 + }; 5949 + 5921 5950 #endif /* CONFIG_INET */ 5922 5951 5923 5952 bool bpf_helper_changes_pkt_data(void *func) ··· 6278 6249 return &bpf_skb_ecn_set_ce_proto; 6279 6250 case BPF_FUNC_tcp_gen_syncookie: 6280 6251 return &bpf_tcp_gen_syncookie_proto; 6252 + case BPF_FUNC_sk_assign: 6253 + return &bpf_sk_assign_proto; 6281 6254 #endif 6282 6255 default: 6283 6256 return bpf_base_func_proto(func_id);
+11
net/core/sock.c
··· 2071 2071 } 2072 2072 EXPORT_SYMBOL(sock_efree); 2073 2073 2074 + /* Buffer destructor for prefetch/receive path where reference count may 2075 + * not be held, e.g. for listen sockets. 2076 + */ 2077 + #ifdef CONFIG_INET 2078 + void sock_pfree(struct sk_buff *skb) 2079 + { 2080 + sock_gen_put(skb->sk); 2081 + } 2082 + EXPORT_SYMBOL(sock_pfree); 2083 + #endif /* CONFIG_INET */ 2084 + 2074 2085 kuid_t sock_i_uid(struct sock *sk) 2075 2086 { 2076 2087 kuid_t uid;
+2 -1
net/ipv4/ip_input.c
··· 509 509 IPCB(skb)->iif = skb->skb_iif; 510 510 511 511 /* Must drop socket now because of tproxy. */ 512 - skb_orphan(skb); 512 + if (!skb_sk_is_prefetched(skb)) 513 + skb_orphan(skb); 513 514 514 515 return skb; 515 516
+2 -1
net/ipv6/ip6_input.c
··· 285 285 rcu_read_unlock(); 286 286 287 287 /* Must drop socket now because of tproxy. */ 288 - skb_orphan(skb); 288 + if (!skb_sk_is_prefetched(skb)) 289 + skb_orphan(skb); 289 290 290 291 return skb; 291 292 err:
+3
net/sched/act_bpf.c
··· 12 12 #include <linux/bpf.h> 13 13 14 14 #include <net/netlink.h> 15 + #include <net/sock.h> 15 16 #include <net/pkt_sched.h> 16 17 #include <net/pkt_cls.h> 17 18 ··· 54 53 bpf_compute_data_pointers(skb); 55 54 filter_res = BPF_PROG_RUN(filter, skb); 56 55 } 56 + if (skb_sk_is_prefetched(skb) && filter_res != TC_ACT_OK) 57 + skb_orphan(skb); 57 58 rcu_read_unlock(); 58 59 59 60 /* A BPF program may overwrite the default action opcode.
+24 -1
tools/include/uapi/linux/bpf.h
··· 2983 2983 * **bpf_get_current_cgroup_id**\ (). 2984 2984 * Return 2985 2985 * The id is returned or 0 in case the id could not be retrieved. 2986 + * 2987 + * int bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags) 2988 + * Description 2989 + * Assign the *sk* to the *skb*. When combined with appropriate 2990 + * routing configuration to receive the packet towards the socket, 2991 + * will cause *skb* to be delivered to the specified socket. 2992 + * Subsequent redirection of *skb* via **bpf_redirect**\ (), 2993 + * **bpf_clone_redirect**\ () or other methods outside of BPF may 2994 + * interfere with successful delivery to the socket. 2995 + * 2996 + * This operation is only valid from TC ingress path. 2997 + * 2998 + * The *flags* argument must be zero. 2999 + * Return 3000 + * 0 on success, or a negative errno in case of failure. 3001 + * 3002 + * * **-EINVAL** Unsupported flags specified. 3003 + * * **-ENOENT** Socket is unavailable for assignment. 3004 + * * **-ENETUNREACH** Socket is unreachable (wrong netns). 3005 + * * **-EOPNOTSUPP** Unsupported operation, for example a 3006 + * call from outside of TC ingress. 3007 + * * **-ESOCKTNOSUPPORT** Socket type not supported (reuseport). 2986 3008 */ 2987 3009 #define __BPF_FUNC_MAPPER(FN) \ 2988 3010 FN(unspec), \ ··· 3130 3108 FN(get_ns_current_pid_tgid), \ 3131 3109 FN(xdp_output), \ 3132 3110 FN(get_netns_cookie), \ 3133 - FN(get_current_ancestor_cgroup_id), 3111 + FN(get_current_ancestor_cgroup_id), \ 3112 + FN(sk_assign), 3134 3113 3135 3114 /* integer value in 'imm' field of BPF_CALL instruction selects which helper 3136 3115 * function eBPF program intends to call