Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bpf: support cloning sk storage on accept()

Add new helper bpf_sk_storage_clone which optionally clones sk storage
and call it from sk_clone_lock.

Cc: Martin KaFai Lau <kafai@fb.com>
Cc: Yonghong Song <yhs@fb.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>

Authored by Stanislav Fomichev and committed by Daniel Borkmann
8f51dfc7 b0e4701c

+120 -6
+10
include/net/bpf_sk_storage.h
··· 10 10 extern const struct bpf_func_proto bpf_sk_storage_get_proto; 11 11 extern const struct bpf_func_proto bpf_sk_storage_delete_proto; 12 12 13 + #ifdef CONFIG_BPF_SYSCALL 14 + int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk); 15 + #else 16 + static inline int bpf_sk_storage_clone(const struct sock *sk, 17 + struct sock *newsk) 18 + { 19 + return 0; 20 + } 21 + #endif 22 + 13 23 #endif /* _BPF_SK_STORAGE_H */
+3
include/uapi/linux/bpf.h
··· 337 337 #define BPF_F_RDONLY_PROG (1U << 7) 338 338 #define BPF_F_WRONLY_PROG (1U << 8) 339 339 340 + /* Clone map from listener for newly accepted socket */ 341 + #define BPF_F_CLONE (1U << 9) 342 + 340 343 /* flags for BPF_PROG_QUERY */ 341 344 #define BPF_F_QUERY_EFFECTIVE (1U << 0) 342 345
+101 -3
net/core/bpf_sk_storage.c
··· 12 12 13 13 static atomic_t cache_idx; 14 14 15 + #define SK_STORAGE_CREATE_FLAG_MASK \ 16 + (BPF_F_NO_PREALLOC | BPF_F_CLONE) 17 + 15 18 struct bucket { 16 19 struct hlist_head list; 17 20 raw_spinlock_t lock; ··· 212 209 kfree_rcu(sk_storage, rcu); 213 210 } 214 211 215 - /* sk_storage->lock must be held and sk_storage->list cannot be empty */ 216 212 static void __selem_link_sk(struct bpf_sk_storage *sk_storage, 217 213 struct bpf_sk_storage_elem *selem) 218 214 { ··· 511 509 return 0; 512 510 } 513 511 514 - /* Called by __sk_destruct() */ 512 + /* Called by __sk_destruct() & bpf_sk_storage_clone() */ 515 513 void bpf_sk_storage_free(struct sock *sk) 516 514 { 517 515 struct bpf_sk_storage_elem *selem; ··· 559 557 560 558 smap = (struct bpf_sk_storage_map *)map; 561 559 560 + /* Note that this map might be concurrently cloned from 561 + * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone 562 + * RCU read section to finish before proceeding. New RCU 563 + * read sections should be prevented via bpf_map_inc_not_zero. 
564 + */ 562 565 synchronize_rcu(); 563 566 564 567 /* bpf prog and the userspace can no longer access this map ··· 608 601 609 602 static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr) 610 603 { 611 - if (attr->map_flags != BPF_F_NO_PREALLOC || attr->max_entries || 604 + if (attr->map_flags & ~SK_STORAGE_CREATE_FLAG_MASK || 605 + !(attr->map_flags & BPF_F_NO_PREALLOC) || 606 + attr->max_entries || 612 607 attr->key_size != sizeof(int) || !attr->value_size || 613 608 /* Enforce BTF for userspace sk dumping */ 614 609 !attr->btf_key_type_id || !attr->btf_value_type_id) ··· 746 737 } 747 738 748 739 return err; 740 + } 741 + 742 + static struct bpf_sk_storage_elem * 743 + bpf_sk_storage_clone_elem(struct sock *newsk, 744 + struct bpf_sk_storage_map *smap, 745 + struct bpf_sk_storage_elem *selem) 746 + { 747 + struct bpf_sk_storage_elem *copy_selem; 748 + 749 + copy_selem = selem_alloc(smap, newsk, NULL, true); 750 + if (!copy_selem) 751 + return NULL; 752 + 753 + if (map_value_has_spin_lock(&smap->map)) 754 + copy_map_value_locked(&smap->map, SDATA(copy_selem)->data, 755 + SDATA(selem)->data, true); 756 + else 757 + copy_map_value(&smap->map, SDATA(copy_selem)->data, 758 + SDATA(selem)->data); 759 + 760 + return copy_selem; 761 + } 762 + 763 + int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) 764 + { 765 + struct bpf_sk_storage *new_sk_storage = NULL; 766 + struct bpf_sk_storage *sk_storage; 767 + struct bpf_sk_storage_elem *selem; 768 + int ret = 0; 769 + 770 + RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL); 771 + 772 + rcu_read_lock(); 773 + sk_storage = rcu_dereference(sk->sk_bpf_storage); 774 + 775 + if (!sk_storage || hlist_empty(&sk_storage->list)) 776 + goto out; 777 + 778 + hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) { 779 + struct bpf_sk_storage_elem *copy_selem; 780 + struct bpf_sk_storage_map *smap; 781 + struct bpf_map *map; 782 + 783 + smap = rcu_dereference(SDATA(selem)->smap); 784 + if (!(smap->map.map_flags & 
BPF_F_CLONE)) 785 + continue; 786 + 787 + /* Note that for lockless listeners adding new element 788 + * here can race with cleanup in bpf_sk_storage_map_free. 789 + * Try to grab map refcnt to make sure that it's still 790 + * alive and prevent concurrent removal. 791 + */ 792 + map = bpf_map_inc_not_zero(&smap->map, false); 793 + if (IS_ERR(map)) 794 + continue; 795 + 796 + copy_selem = bpf_sk_storage_clone_elem(newsk, smap, selem); 797 + if (!copy_selem) { 798 + ret = -ENOMEM; 799 + bpf_map_put(map); 800 + goto out; 801 + } 802 + 803 + if (new_sk_storage) { 804 + selem_link_map(smap, copy_selem); 805 + __selem_link_sk(new_sk_storage, copy_selem); 806 + } else { 807 + ret = sk_storage_alloc(newsk, smap, copy_selem); 808 + if (ret) { 809 + kfree(copy_selem); 810 + atomic_sub(smap->elem_size, 811 + &newsk->sk_omem_alloc); 812 + bpf_map_put(map); 813 + goto out; 814 + } 815 + 816 + new_sk_storage = rcu_dereference(copy_selem->sk_storage); 817 + } 818 + bpf_map_put(map); 819 + } 820 + 821 + out: 822 + rcu_read_unlock(); 823 + 824 + /* In case of an error, don't free anything explicitly here, the 825 + * caller is responsible to call bpf_sk_storage_free. 826 + */ 827 + 828 + return ret; 749 829 } 750 830 751 831 BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
+6 -3
net/core/sock.c
··· 1851 1851 goto out; 1852 1852 } 1853 1853 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); 1854 - #ifdef CONFIG_BPF_SYSCALL 1855 - RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL); 1856 - #endif 1854 + 1855 + if (bpf_sk_storage_clone(sk, newsk)) { 1856 + sk_free_unlock_clone(newsk); 1857 + newsk = NULL; 1858 + goto out; 1859 + } 1857 1860 1858 1861 newsk->sk_err = 0; 1859 1862 newsk->sk_err_soft = 0;