Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bpf, sockmap: Allow skipping sk_skb parser program

Currently, we often run with a nop parser, namely one that just does
'return skb->len'. This happens when either our verdict program
can handle streaming data or it is only looking at socket data such
as IP addresses and other metadata associated with the flow. The second
case is common for an L3/L4 proxy, for instance.

So let's allow loading programs without the parser; then we can skip
the stream parser logic and avoid having to add a BPF program that
is effectively a nop.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/160239297866.8495.13345662302749219672.stgit@john-Precision-5820-Tower

authored by

John Fastabend and committed by
Alexei Starovoitov
ef565928 743df8b7

+95 -7
+2
include/linux/skmsg.h
··· 308 308 int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock); 309 309 void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock); 310 310 void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock); 311 + void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock); 312 + void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock); 311 313 312 314 int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, 313 315 struct sk_msg *msg);
+78
net/core/skmsg.c
··· 627 627 rcu_assign_sk_user_data(sk, NULL); 628 628 if (psock->progs.skb_parser) 629 629 sk_psock_stop_strp(sk, psock); 630 + else if (psock->progs.skb_verdict) 631 + sk_psock_stop_verdict(sk, psock); 630 632 write_unlock_bh(&sk->sk_callback_lock); 631 633 sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); 632 634 ··· 873 871 rcu_read_unlock(); 874 872 } 875 873 874 + static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, 875 + unsigned int offset, size_t orig_len) 876 + { 877 + struct sock *sk = (struct sock *)desc->arg.data; 878 + struct sk_psock *psock; 879 + struct bpf_prog *prog; 880 + int ret = __SK_DROP; 881 + int len = skb->len; 882 + 883 + /* clone here so sk_eat_skb() in tcp_read_sock does not drop our data */ 884 + skb = skb_clone(skb, GFP_ATOMIC); 885 + if (!skb) { 886 + desc->error = -ENOMEM; 887 + return 0; 888 + } 889 + 890 + rcu_read_lock(); 891 + psock = sk_psock(sk); 892 + if (unlikely(!psock)) { 893 + len = 0; 894 + kfree_skb(skb); 895 + goto out; 896 + } 897 + skb_set_owner_r(skb, sk); 898 + prog = READ_ONCE(psock->progs.skb_verdict); 899 + if (likely(prog)) { 900 + tcp_skb_bpf_redirect_clear(skb); 901 + ret = sk_psock_bpf_run(psock, prog, skb); 902 + ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); 903 + } 904 + sk_psock_verdict_apply(psock, skb, ret); 905 + out: 906 + rcu_read_unlock(); 907 + return len; 908 + } 909 + 910 + static void sk_psock_verdict_data_ready(struct sock *sk) 911 + { 912 + struct socket *sock = sk->sk_socket; 913 + read_descriptor_t desc; 914 + 915 + if (unlikely(!sock || !sock->ops || !sock->ops->read_sock)) 916 + return; 917 + 918 + desc.arg.data = sk; 919 + desc.error = 0; 920 + desc.count = 1; 921 + 922 + sock->ops->read_sock(sk, &desc, sk_psock_verdict_recv); 923 + } 924 + 876 925 static void sk_psock_write_space(struct sock *sk) 877 926 { 878 927 struct sk_psock *psock; ··· 953 900 return strp_init(&psock->parser.strp, sk, &cb); 954 901 } 955 902 903 + void 
sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock) 904 + { 905 + struct sk_psock_parser *parser = &psock->parser; 906 + 907 + if (parser->enabled) 908 + return; 909 + 910 + parser->saved_data_ready = sk->sk_data_ready; 911 + sk->sk_data_ready = sk_psock_verdict_data_ready; 912 + sk->sk_write_space = sk_psock_write_space; 913 + parser->enabled = true; 914 + } 915 + 956 916 void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) 957 917 { 958 918 struct sk_psock_parser *parser = &psock->parser; ··· 989 923 sk->sk_data_ready = parser->saved_data_ready; 990 924 parser->saved_data_ready = NULL; 991 925 strp_stop(&parser->strp); 926 + parser->enabled = false; 927 + } 928 + 929 + void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock) 930 + { 931 + struct sk_psock_parser *parser = &psock->parser; 932 + 933 + if (!parser->enabled) 934 + return; 935 + 936 + sk->sk_data_ready = parser->saved_data_ready; 937 + parser->saved_data_ready = NULL; 992 938 parser->enabled = false; 993 939 }
+15 -7
net/core/sock_map.c
··· 148 148 static void sock_map_del_link(struct sock *sk, 149 149 struct sk_psock *psock, void *link_raw) 150 150 { 151 + bool strp_stop = false, verdict_stop = false; 151 152 struct sk_psock_link *link, *tmp; 152 - bool strp_stop = false; 153 153 154 154 spin_lock_bh(&psock->link_lock); 155 155 list_for_each_entry_safe(link, tmp, &psock->link, list) { ··· 159 159 map); 160 160 if (psock->parser.enabled && stab->progs.skb_parser) 161 161 strp_stop = true; 162 + if (psock->parser.enabled && stab->progs.skb_verdict) 163 + verdict_stop = true; 162 164 list_del(&link->list); 163 165 sk_psock_free_link(link); 164 166 } 165 167 } 166 168 spin_unlock_bh(&psock->link_lock); 167 - if (strp_stop) { 169 + if (strp_stop || verdict_stop) { 168 170 write_lock_bh(&sk->sk_callback_lock); 169 - sk_psock_stop_strp(sk, psock); 171 + if (strp_stop) 172 + sk_psock_stop_strp(sk, psock); 173 + else 174 + sk_psock_stop_verdict(sk, psock); 170 175 write_unlock_bh(&sk->sk_callback_lock); 171 176 } 172 177 } ··· 293 288 write_lock_bh(&sk->sk_callback_lock); 294 289 if (skb_parser && skb_verdict && !psock->parser.enabled) { 295 290 ret = sk_psock_init_strp(sk, psock); 296 - if (ret) { 297 - write_unlock_bh(&sk->sk_callback_lock); 298 - goto out_drop; 299 - } 291 + if (ret) 292 + goto out_unlock_drop; 300 293 psock_set_prog(&psock->progs.skb_verdict, skb_verdict); 301 294 psock_set_prog(&psock->progs.skb_parser, skb_parser); 302 295 sk_psock_start_strp(sk, psock); 296 + } else if (!skb_parser && skb_verdict && !psock->parser.enabled) { 297 + psock_set_prog(&psock->progs.skb_verdict, skb_verdict); 298 + sk_psock_start_verdict(sk,psock); 303 299 } 304 300 write_unlock_bh(&sk->sk_callback_lock); 305 301 return 0; 302 + out_unlock_drop: 303 + write_unlock_bh(&sk->sk_callback_lock); 306 304 out_drop: 307 305 sk_psock_put(sk, psock); 308 306 out_progs: