Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bpf, sockmap: Fix preempt_rt splat when using raw_spin_lock_t

Sockmap and sockhash maps are a collection of psocks that are
objects representing a socket plus a set of metadata needed
to manage the BPF programs associated with the socket. These
maps use the stab->lock to protect from concurrent operations
on the maps, e.g. trying to insert two objects into the array
at the same time in the same slot. Additionally, a sockhash map
has a bucket lock to protect iteration and insert/delete into
the hash entry.

Each psock has a psock->link which is a linked list of all the
maps that a psock is attached to. This allows a psock (socket)
to be included in multiple sockmap and sockhash maps. This
linked list is protected by the psock->link_lock.

They _must_ be nested correctly to avoid deadlock:

lock(stab->lock)
: do BPF map operations and psock insert/delete
lock(psock->link_lock)
: add map to psock linked list of maps
unlock(psock->link_lock)
unlock(stab->lock)

For non PREEMPT_RT kernels both raw_spin_lock_t and spin_lock_t
are guaranteed to not sleep. But, with PREEMPT_RT kernels the
spin_lock_t variants may sleep. In the current code we have
many patterns like this:

rcu_critical_section:
raw_spin_lock(stab->lock)
spin_lock(psock->link_lock) <- may sleep ouch
spin_unlock(psock->link_lock)
raw_spin_unlock(stab->lock)
rcu_critical_section

Nesting spin_lock() inside a raw_spin_lock() violates locking
rules for PREEMPT_RT kernels. Additionally, we do allocations with
GFP_ATOMIC inside the stab->lock, but those might sleep on PREEMPT_RT kernels.
The result is splats like this:

./test_progs -t sockmap_basic
[ 33.344330] bpf_testmod: loading out-of-tree module taints kernel.
[ 33.441933]
[ 33.442089] =============================
[ 33.442421] [ BUG: Invalid wait context ]
[ 33.442763] 6.5.0-rc5-01731-gec0ded2e0282 #4958 Tainted: G O
[ 33.443320] -----------------------------
[ 33.443624] test_progs/2073 is trying to lock:
[ 33.443960] ffff888102a1c290 (&psock->link_lock){....}-{3:3}, at: sock_map_update_common+0x2c2/0x3d0
[ 33.444636] other info that might help us debug this:
[ 33.444991] context-{5:5}
[ 33.445183] 3 locks held by test_progs/2073:
[ 33.445498] #0: ffff88811a208d30 (sk_lock-AF_INET){+.+.}-{0:0}, at: sock_map_update_elem_sys+0xff/0x330
[ 33.446159] #1: ffffffff842539e0 (rcu_read_lock){....}-{1:3}, at: sock_map_update_elem_sys+0xf5/0x330
[ 33.446809] #2: ffff88810d687240 (&stab->lock){+...}-{2:2}, at: sock_map_update_common+0x177/0x3d0
[ 33.447445] stack backtrace:
[ 33.447655] CPU: 10 PID

To fix this, observe that we can't readily remove the allocations (for
that we would need to use/create something similar to bpf_map_alloc). So
convert raw_spin_lock_t to spin_lock_t. We note that sock_map_update
that would trigger the allocate and potential sleep is only allowed
through sys_bpf ops and via sock_ops which precludes hw interrupts
and low level atomic sections in RT preempt kernels. On non RT
preempt kernels there are no changes here: spin lock sections
and alloc(GFP_ATOMIC) are still not sleepable.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20230830053517.166611-1-john.fastabend@gmail.com

authored by

John Fastabend and committed by
Daniel Borkmann
35d2b7ff be4033d3

+18 -18
+18 -18
net/core/sock_map.c
··· 18 18 struct bpf_map map; 19 19 struct sock **sks; 20 20 struct sk_psock_progs progs; 21 - raw_spinlock_t lock; 21 + spinlock_t lock; 22 22 }; 23 23 24 24 #define SOCK_CREATE_FLAG_MASK \ ··· 44 44 return ERR_PTR(-ENOMEM); 45 45 46 46 bpf_map_init_from_attr(&stab->map, attr); 47 - raw_spin_lock_init(&stab->lock); 47 + spin_lock_init(&stab->lock); 48 48 49 49 stab->sks = bpf_map_area_alloc((u64) stab->map.max_entries * 50 50 sizeof(struct sock *), ··· 411 411 struct sock *sk; 412 412 int err = 0; 413 413 414 - raw_spin_lock_bh(&stab->lock); 414 + spin_lock_bh(&stab->lock); 415 415 sk = *psk; 416 416 if (!sk_test || sk_test == sk) 417 417 sk = xchg(psk, NULL); ··· 421 421 else 422 422 err = -EINVAL; 423 423 424 - raw_spin_unlock_bh(&stab->lock); 424 + spin_unlock_bh(&stab->lock); 425 425 return err; 426 426 } 427 427 ··· 487 487 psock = sk_psock(sk); 488 488 WARN_ON_ONCE(!psock); 489 489 490 - raw_spin_lock_bh(&stab->lock); 490 + spin_lock_bh(&stab->lock); 491 491 osk = stab->sks[idx]; 492 492 if (osk && flags == BPF_NOEXIST) { 493 493 ret = -EEXIST; ··· 501 501 stab->sks[idx] = sk; 502 502 if (osk) 503 503 sock_map_unref(osk, &stab->sks[idx]); 504 - raw_spin_unlock_bh(&stab->lock); 504 + spin_unlock_bh(&stab->lock); 505 505 return 0; 506 506 out_unlock: 507 - raw_spin_unlock_bh(&stab->lock); 507 + spin_unlock_bh(&stab->lock); 508 508 if (psock) 509 509 sk_psock_put(sk, psock); 510 510 out_free: ··· 835 835 836 836 struct bpf_shtab_bucket { 837 837 struct hlist_head head; 838 - raw_spinlock_t lock; 838 + spinlock_t lock; 839 839 }; 840 840 841 841 struct bpf_shtab { ··· 910 910 * is okay since it's going away only after RCU grace period. 911 911 * However, we need to check whether it's still present. 
912 912 */ 913 - raw_spin_lock_bh(&bucket->lock); 913 + spin_lock_bh(&bucket->lock); 914 914 elem_probe = sock_hash_lookup_elem_raw(&bucket->head, elem->hash, 915 915 elem->key, map->key_size); 916 916 if (elem_probe && elem_probe == elem) { ··· 918 918 sock_map_unref(elem->sk, elem); 919 919 sock_hash_free_elem(htab, elem); 920 920 } 921 - raw_spin_unlock_bh(&bucket->lock); 921 + spin_unlock_bh(&bucket->lock); 922 922 } 923 923 924 924 static long sock_hash_delete_elem(struct bpf_map *map, void *key) ··· 932 932 hash = sock_hash_bucket_hash(key, key_size); 933 933 bucket = sock_hash_select_bucket(htab, hash); 934 934 935 - raw_spin_lock_bh(&bucket->lock); 935 + spin_lock_bh(&bucket->lock); 936 936 elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size); 937 937 if (elem) { 938 938 hlist_del_rcu(&elem->node); ··· 940 940 sock_hash_free_elem(htab, elem); 941 941 ret = 0; 942 942 } 943 - raw_spin_unlock_bh(&bucket->lock); 943 + spin_unlock_bh(&bucket->lock); 944 944 return ret; 945 945 } 946 946 ··· 1000 1000 hash = sock_hash_bucket_hash(key, key_size); 1001 1001 bucket = sock_hash_select_bucket(htab, hash); 1002 1002 1003 - raw_spin_lock_bh(&bucket->lock); 1003 + spin_lock_bh(&bucket->lock); 1004 1004 elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size); 1005 1005 if (elem && flags == BPF_NOEXIST) { 1006 1006 ret = -EEXIST; ··· 1026 1026 sock_map_unref(elem->sk, elem); 1027 1027 sock_hash_free_elem(htab, elem); 1028 1028 } 1029 - raw_spin_unlock_bh(&bucket->lock); 1029 + spin_unlock_bh(&bucket->lock); 1030 1030 return 0; 1031 1031 out_unlock: 1032 - raw_spin_unlock_bh(&bucket->lock); 1032 + spin_unlock_bh(&bucket->lock); 1033 1033 sk_psock_put(sk, psock); 1034 1034 out_free: 1035 1035 sk_psock_free_link(link); ··· 1115 1115 1116 1116 for (i = 0; i < htab->buckets_num; i++) { 1117 1117 INIT_HLIST_HEAD(&htab->buckets[i].head); 1118 - raw_spin_lock_init(&htab->buckets[i].lock); 1118 + spin_lock_init(&htab->buckets[i].lock); 1119 1119 } 1120 
1120 1121 1121 return &htab->map; ··· 1147 1147 * exists, psock exists and holds a ref to socket. That 1148 1148 * lets us to grab a socket ref too. 1149 1149 */ 1150 - raw_spin_lock_bh(&bucket->lock); 1150 + spin_lock_bh(&bucket->lock); 1151 1151 hlist_for_each_entry(elem, &bucket->head, node) 1152 1152 sock_hold(elem->sk); 1153 1153 hlist_move_list(&bucket->head, &unlink_list); 1154 - raw_spin_unlock_bh(&bucket->lock); 1154 + spin_unlock_bh(&bucket->lock); 1155 1155 1156 1156 /* Process removed entries out of atomic context to 1157 1157 * block for socket lock before deleting the psock's