Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bpf: Add socket assign support

Add support for TPROXY via a new bpf helper, bpf_sk_assign().

This helper requires the BPF program to discover the socket via a call
to bpf_sk*_lookup_*(), then pass this socket to the new helper. The
helper takes its own reference to the socket in addition to any existing
reference that may or may not currently be obtained for the duration of
BPF processing. For the destination socket to receive the traffic, the
traffic must be routed towards that socket via a local route. The
simplest example route is below, but in practice you may want to route
traffic more narrowly (e.g. by CIDR):

$ ip route add local default dev lo

This patch avoids trying to introduce an extra bit into the skb->sk, as
that would require more invasive changes to all code interacting with
the socket to ensure that the bit is handled correctly, such as all
error-handling cases along the path from the helper in BPF through to
the orphan path in the input. Instead, we opt to use the destructor
variable to switch on the prefetch of the socket.

Signed-off-by: Joe Stringer <joe@wand.net.nz>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20200329225342.16317-2-joe@wand.net.nz

Authored by Joe Stringer; committed by Alexei Starovoitov.
cf7fbe66 b49e42a2

+108 -4
+11
include/net/sock.h
··· 1659 1659 void sock_efree(struct sk_buff *skb); 1660 1660 #ifdef CONFIG_INET 1661 1661 void sock_edemux(struct sk_buff *skb); 1662 + void sock_pfree(struct sk_buff *skb); 1662 1663 #else 1663 1664 #define sock_edemux sock_efree 1664 1665 #endif ··· 2525 2524 void sock_net_set(struct sock *sk, struct net *net) 2526 2525 { 2527 2526 write_pnet(&sk->sk_net, net); 2527 + } 2528 + 2529 + static inline bool 2530 + skb_sk_is_prefetched(struct sk_buff *skb) 2531 + { 2532 + #ifdef CONFIG_INET 2533 + return skb->destructor == sock_pfree; 2534 + #else 2535 + return false; 2536 + #endif /* CONFIG_INET */ 2528 2537 } 2529 2538 2530 2539 static inline struct sock *skb_steal_sock(struct sk_buff *skb)
+24 -1
include/uapi/linux/bpf.h
··· 2983 2983 * **bpf_get_current_cgroup_id**\ (). 2984 2984 * Return 2985 2985 * The id is returned or 0 in case the id could not be retrieved. 2986 + * 2987 + * int bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags) 2988 + * Description 2989 + * Assign the *sk* to the *skb*. When combined with appropriate 2990 + * routing configuration to receive the packet towards the socket, 2991 + * will cause *skb* to be delivered to the specified socket. 2992 + * Subsequent redirection of *skb* via **bpf_redirect**\ (), 2993 + * **bpf_clone_redirect**\ () or other methods outside of BPF may 2994 + * interfere with successful delivery to the socket. 2995 + * 2996 + * This operation is only valid from TC ingress path. 2997 + * 2998 + * The *flags* argument must be zero. 2999 + * Return 3000 + * 0 on success, or a negative errno in case of failure. 3001 + * 3002 + * * **-EINVAL** Unsupported flags specified. 3003 + * * **-ENOENT** Socket is unavailable for assignment. 3004 + * * **-ENETUNREACH** Socket is unreachable (wrong netns). 3005 + * * **-EOPNOTSUPP** Unsupported operation, for example a 3006 + * call from outside of TC ingress. 3007 + * * **-ESOCKTNOSUPPORT** Socket type not supported (reuseport). 2986 3008 */ 2987 3009 #define __BPF_FUNC_MAPPER(FN) \ 2988 3010 FN(unspec), \ ··· 3130 3108 FN(get_ns_current_pid_tgid), \ 3131 3109 FN(xdp_output), \ 3132 3110 FN(get_netns_cookie), \ 3133 - FN(get_current_ancestor_cgroup_id), 3111 + FN(get_current_ancestor_cgroup_id), \ 3112 + FN(sk_assign), 3134 3113 3135 3114 /* integer value in 'imm' field of BPF_CALL instruction selects which helper 3136 3115 * function eBPF program intends to call
+31
net/core/filter.c
··· 5918 5918 .arg5_type = ARG_CONST_SIZE, 5919 5919 }; 5920 5920 5921 + BPF_CALL_3(bpf_sk_assign, struct sk_buff *, skb, struct sock *, sk, u64, flags) 5922 + { 5923 + if (flags != 0) 5924 + return -EINVAL; 5925 + if (!skb_at_tc_ingress(skb)) 5926 + return -EOPNOTSUPP; 5927 + if (unlikely(dev_net(skb->dev) != sock_net(sk))) 5928 + return -ENETUNREACH; 5929 + if (unlikely(sk->sk_reuseport)) 5930 + return -ESOCKTNOSUPPORT; 5931 + if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) 5932 + return -ENOENT; 5933 + 5934 + skb_orphan(skb); 5935 + skb->sk = sk; 5936 + skb->destructor = sock_pfree; 5937 + 5938 + return 0; 5939 + } 5940 + 5941 + static const struct bpf_func_proto bpf_sk_assign_proto = { 5942 + .func = bpf_sk_assign, 5943 + .gpl_only = false, 5944 + .ret_type = RET_INTEGER, 5945 + .arg1_type = ARG_PTR_TO_CTX, 5946 + .arg2_type = ARG_PTR_TO_SOCK_COMMON, 5947 + .arg3_type = ARG_ANYTHING, 5948 + }; 5949 + 5921 5950 #endif /* CONFIG_INET */ 5922 5951 5923 5952 bool bpf_helper_changes_pkt_data(void *func) ··· 6278 6249 return &bpf_skb_ecn_set_ce_proto; 6279 6250 case BPF_FUNC_tcp_gen_syncookie: 6280 6251 return &bpf_tcp_gen_syncookie_proto; 6252 + case BPF_FUNC_sk_assign: 6253 + return &bpf_sk_assign_proto; 6281 6254 #endif 6282 6255 default: 6283 6256 return bpf_base_func_proto(func_id);
+11
net/core/sock.c
··· 2071 2071 } 2072 2072 EXPORT_SYMBOL(sock_efree); 2073 2073 2074 + /* Buffer destructor for prefetch/receive path where reference count may 2075 + * not be held, e.g. for listen sockets. 2076 + */ 2077 + #ifdef CONFIG_INET 2078 + void sock_pfree(struct sk_buff *skb) 2079 + { 2080 + sock_gen_put(skb->sk); 2081 + } 2082 + EXPORT_SYMBOL(sock_pfree); 2083 + #endif /* CONFIG_INET */ 2084 + 2074 2085 kuid_t sock_i_uid(struct sock *sk) 2075 2086 { 2076 2087 kuid_t uid;
+2 -1
net/ipv4/ip_input.c
··· 509 509 IPCB(skb)->iif = skb->skb_iif; 510 510 511 511 /* Must drop socket now because of tproxy. */ 512 - skb_orphan(skb); 512 + if (!skb_sk_is_prefetched(skb)) 513 + skb_orphan(skb); 513 514 514 515 return skb; 515 516
+2 -1
net/ipv6/ip6_input.c
··· 285 285 rcu_read_unlock(); 286 286 287 287 /* Must drop socket now because of tproxy. */ 288 - skb_orphan(skb); 288 + if (!skb_sk_is_prefetched(skb)) 289 + skb_orphan(skb); 289 290 290 291 return skb; 291 292 err:
+3
net/sched/act_bpf.c
··· 12 12 #include <linux/bpf.h> 13 13 14 14 #include <net/netlink.h> 15 + #include <net/sock.h> 15 16 #include <net/pkt_sched.h> 16 17 #include <net/pkt_cls.h> 17 18 ··· 54 53 bpf_compute_data_pointers(skb); 55 54 filter_res = BPF_PROG_RUN(filter, skb); 56 55 } 56 + if (skb_sk_is_prefetched(skb) && filter_res != TC_ACT_OK) 57 + skb_orphan(skb); 57 58 rcu_read_unlock(); 58 59 59 60 /* A BPF program may overwrite the default action opcode.
+24 -1
tools/include/uapi/linux/bpf.h
··· 2983 2983 * **bpf_get_current_cgroup_id**\ (). 2984 2984 * Return 2985 2985 * The id is returned or 0 in case the id could not be retrieved. 2986 + * 2987 + * int bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags) 2988 + * Description 2989 + * Assign the *sk* to the *skb*. When combined with appropriate 2990 + * routing configuration to receive the packet towards the socket, 2991 + * will cause *skb* to be delivered to the specified socket. 2992 + * Subsequent redirection of *skb* via **bpf_redirect**\ (), 2993 + * **bpf_clone_redirect**\ () or other methods outside of BPF may 2994 + * interfere with successful delivery to the socket. 2995 + * 2996 + * This operation is only valid from TC ingress path. 2997 + * 2998 + * The *flags* argument must be zero. 2999 + * Return 3000 + * 0 on success, or a negative errno in case of failure. 3001 + * 3002 + * * **-EINVAL** Unsupported flags specified. 3003 + * * **-ENOENT** Socket is unavailable for assignment. 3004 + * * **-ENETUNREACH** Socket is unreachable (wrong netns). 3005 + * * **-EOPNOTSUPP** Unsupported operation, for example a 3006 + * call from outside of TC ingress. 3007 + * * **-ESOCKTNOSUPPORT** Socket type not supported (reuseport). 2986 3008 */ 2987 3009 #define __BPF_FUNC_MAPPER(FN) \ 2988 3010 FN(unspec), \ ··· 3130 3108 FN(get_ns_current_pid_tgid), \ 3131 3109 FN(xdp_output), \ 3132 3110 FN(get_netns_cookie), \ 3133 - FN(get_current_ancestor_cgroup_id), 3111 + FN(get_current_ancestor_cgroup_id), \ 3112 + FN(sk_assign), 3134 3113 3135 3114 /* integer value in 'imm' field of BPF_CALL instruction selects which helper 3136 3115 * function eBPF program intends to call