Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bpf: Prevent deadlock from recursive bpf_task_storage_[get|delete]

BPF helpers bpf_task_storage_[get|delete] could hold two locks:
bpf_local_storage_map_bucket->lock and bpf_local_storage->lock. Calling
these helpers from fentry/fexit programs on functions in bpf_*_storage.c
may cause a deadlock on either lock.

Prevent such deadlock with a per cpu counter, bpf_task_storage_busy. We
need this counter to be global, because the two locks here belong to two
different objects: bpf_local_storage_map and bpf_local_storage. If we
pick one of them as the owner of the counter, it is still possible to
trigger deadlock on the other lock. For example, if bpf_local_storage_map
owns the counter, it cannot prevent deadlock on bpf_local_storage->lock
when two maps are used.

Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20210225234319.336131-3-songliubraving@fb.com

Authored by Song Liu and committed by Alexei Starovoitov
(commit bc235cdb, parent a10787e6)

+65 -12
+2 -1
include/linux/bpf_local_storage.h
··· 126 126 struct bpf_local_storage_map *smap, 127 127 bool cacheit_lockit); 128 128 129 - void bpf_local_storage_map_free(struct bpf_local_storage_map *smap); 129 + void bpf_local_storage_map_free(struct bpf_local_storage_map *smap, 130 + int __percpu *busy_counter); 130 131 131 132 int bpf_local_storage_map_check_btf(const struct bpf_map *map, 132 133 const struct btf *btf,
+1 -1
kernel/bpf/bpf_inode_storage.c
··· 237 237 238 238 smap = (struct bpf_local_storage_map *)map; 239 239 bpf_local_storage_cache_idx_free(&inode_cache, smap->cache_idx); 240 - bpf_local_storage_map_free(smap); 240 + bpf_local_storage_map_free(smap, NULL); 241 241 } 242 242 243 243 static int inode_storage_map_btf_id;
+10 -1
kernel/bpf/bpf_local_storage.c
··· 474 474 spin_unlock(&cache->idx_lock); 475 475 } 476 476 477 - void bpf_local_storage_map_free(struct bpf_local_storage_map *smap) 477 + void bpf_local_storage_map_free(struct bpf_local_storage_map *smap, 478 + int __percpu *busy_counter) 478 479 { 479 480 struct bpf_local_storage_elem *selem; 480 481 struct bpf_local_storage_map_bucket *b; ··· 504 503 while ((selem = hlist_entry_safe( 505 504 rcu_dereference_raw(hlist_first_rcu(&b->list)), 506 505 struct bpf_local_storage_elem, map_node))) { 506 + if (busy_counter) { 507 + migrate_disable(); 508 + __this_cpu_inc(*busy_counter); 509 + } 507 510 bpf_selem_unlink(selem); 511 + if (busy_counter) { 512 + __this_cpu_dec(*busy_counter); 513 + migrate_enable(); 514 + } 508 515 cond_resched_rcu(); 509 516 } 510 517 rcu_read_unlock();
+51 -8
kernel/bpf/bpf_task_storage.c
··· 20 20 21 21 DEFINE_BPF_STORAGE_CACHE(task_cache); 22 22 23 + DEFINE_PER_CPU(int, bpf_task_storage_busy); 24 + 25 + static void bpf_task_storage_lock(void) 26 + { 27 + migrate_disable(); 28 + __this_cpu_inc(bpf_task_storage_busy); 29 + } 30 + 31 + static void bpf_task_storage_unlock(void) 32 + { 33 + __this_cpu_dec(bpf_task_storage_busy); 34 + migrate_enable(); 35 + } 36 + 37 + static bool bpf_task_storage_trylock(void) 38 + { 39 + migrate_disable(); 40 + if (unlikely(__this_cpu_inc_return(bpf_task_storage_busy) != 1)) { 41 + __this_cpu_dec(bpf_task_storage_busy); 42 + migrate_enable(); 43 + return false; 44 + } 45 + return true; 46 + } 47 + 23 48 static struct bpf_local_storage __rcu **task_storage_ptr(void *owner) 24 49 { 25 50 struct task_struct *task = owner; ··· 92 67 * when unlinking elem from the local_storage->list and 93 68 * the map's bucket->list. 94 69 */ 70 + bpf_task_storage_lock(); 95 71 raw_spin_lock_irqsave(&local_storage->lock, flags); 96 72 hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { 97 73 /* Always unlink from map before unlinking from ··· 103 77 local_storage, selem, false); 104 78 } 105 79 raw_spin_unlock_irqrestore(&local_storage->lock, flags); 80 + bpf_task_storage_unlock(); 106 81 rcu_read_unlock(); 107 82 108 83 /* free_task_storage should always be true as long as ··· 136 109 goto out; 137 110 } 138 111 112 + bpf_task_storage_lock(); 139 113 sdata = task_storage_lookup(task, map, true); 114 + bpf_task_storage_unlock(); 140 115 put_pid(pid); 141 116 return sdata ? 
sdata->data : NULL; 142 117 out: ··· 170 141 goto out; 171 142 } 172 143 144 + bpf_task_storage_lock(); 173 145 sdata = bpf_local_storage_update( 174 146 task, (struct bpf_local_storage_map *)map, value, map_flags); 147 + bpf_task_storage_unlock(); 175 148 176 149 err = PTR_ERR_OR_ZERO(sdata); 177 150 out: ··· 216 185 goto out; 217 186 } 218 187 188 + bpf_task_storage_lock(); 219 189 err = task_storage_delete(task, map); 190 + bpf_task_storage_unlock(); 220 191 out: 221 192 put_pid(pid); 222 193 return err; ··· 235 202 if (!task) 236 203 return (unsigned long)NULL; 237 204 205 + if (!bpf_task_storage_trylock()) 206 + return (unsigned long)NULL; 207 + 238 208 sdata = task_storage_lookup(task, map, true); 239 209 if (sdata) 240 - return (unsigned long)sdata->data; 210 + goto unlock; 241 211 242 212 /* only allocate new storage, when the task is refcounted */ 243 213 if (refcount_read(&task->usage) && 244 - (flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) { 214 + (flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) 245 215 sdata = bpf_local_storage_update( 246 216 task, (struct bpf_local_storage_map *)map, value, 247 217 BPF_NOEXIST); 248 - return IS_ERR(sdata) ? (unsigned long)NULL : 249 - (unsigned long)sdata->data; 250 - } 251 218 252 - return (unsigned long)NULL; 219 + unlock: 220 + bpf_task_storage_unlock(); 221 + return IS_ERR_OR_NULL(sdata) ? (unsigned long)NULL : 222 + (unsigned long)sdata->data; 253 223 } 254 224 255 225 BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *, 256 226 task) 257 227 { 228 + int ret; 229 + 258 230 if (!task) 259 231 return -EINVAL; 232 + 233 + if (!bpf_task_storage_trylock()) 234 + return -EBUSY; 260 235 261 236 /* This helper must only be called from places where the lifetime of the task 262 237 * is guaranteed. Either by being refcounted or by being protected 263 238 * by an RCU read-side critical section. 
264 239 */ 265 - return task_storage_delete(task, map); 240 + ret = task_storage_delete(task, map); 241 + bpf_task_storage_unlock(); 242 + return ret; 266 243 } 267 244 268 245 static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key) ··· 298 255 299 256 smap = (struct bpf_local_storage_map *)map; 300 257 bpf_local_storage_cache_idx_free(&task_cache, smap->cache_idx); 301 - bpf_local_storage_map_free(smap); 258 + bpf_local_storage_map_free(smap, &bpf_task_storage_busy); 302 259 } 303 260 304 261 static int task_storage_map_btf_id;
+1 -1
net/core/bpf_sk_storage.c
··· 89 89 90 90 smap = (struct bpf_local_storage_map *)map; 91 91 bpf_local_storage_cache_idx_free(&sk_cache, smap->cache_idx); 92 - bpf_local_storage_map_free(smap); 92 + bpf_local_storage_map_free(smap, NULL); 93 93 } 94 94 95 95 static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)