Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bpf: Enable task local storage for tracing programs

To access per-task data, BPF programs usually create a hash table with
pid as the key. This is not ideal because:
1. The user needs to estimate the proper size of the hash table, which may
be inaccurate;
2. Big hash tables are slow;
3. To clean up the data properly during task terminations, the user needs
to write extra logic.

Task local storage overcomes these issues and offers a better option for
these per-task data. Task local storage is only available to BPF_LSM. Now
enable it for tracing programs.

Unlike LSM programs, tracing programs can be called in IRQ contexts.
Helpers that access task local storage are updated to use
raw_spin_lock_irqsave() instead of raw_spin_lock_bh().

Tracing programs can attach to functions on the task free path, e.g.
exit_creds(). To avoid allocating task local storage after
bpf_task_storage_free(), bpf_task_storage_get() is updated to not allocate
new storage when the task is not refcounted (task->usage == 0).

Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: KP Singh <kpsingh@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20210225234319.336131-2-songliubraving@fb.com

authored by

Song Liu and committed by
Alexei Starovoitov
a10787e6 9c8f21e6

+51 -72
+7
include/linux/bpf.h
··· 1499 1499 struct bpf_link *bpf_link_by_id(u32 id); 1500 1500 1501 1501 const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id); 1502 + void bpf_task_storage_free(struct task_struct *task); 1502 1503 #else /* !CONFIG_BPF_SYSCALL */ 1503 1504 static inline struct bpf_prog *bpf_prog_get(u32 ufd) 1504 1505 { ··· 1684 1683 bpf_base_func_proto(enum bpf_func_id func_id) 1685 1684 { 1686 1685 return NULL; 1686 + } 1687 + 1688 + static inline void bpf_task_storage_free(struct task_struct *task) 1689 + { 1687 1690 } 1688 1691 #endif /* CONFIG_BPF_SYSCALL */ 1689 1692 ··· 1891 1886 extern const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto; 1892 1887 extern const struct bpf_func_proto bpf_sock_from_file_proto; 1893 1888 extern const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto; 1889 + extern const struct bpf_func_proto bpf_task_storage_get_proto; 1890 + extern const struct bpf_func_proto bpf_task_storage_delete_proto; 1894 1891 1895 1892 const struct bpf_func_proto *bpf_tracing_func_proto( 1896 1893 enum bpf_func_id func_id, const struct bpf_prog *prog);
-22
include/linux/bpf_lsm.h
··· 38 38 return inode->i_security + bpf_lsm_blob_sizes.lbs_inode; 39 39 } 40 40 41 - static inline struct bpf_storage_blob *bpf_task( 42 - const struct task_struct *task) 43 - { 44 - if (unlikely(!task->security)) 45 - return NULL; 46 - 47 - return task->security + bpf_lsm_blob_sizes.lbs_task; 48 - } 49 - 50 41 extern const struct bpf_func_proto bpf_inode_storage_get_proto; 51 42 extern const struct bpf_func_proto bpf_inode_storage_delete_proto; 52 - extern const struct bpf_func_proto bpf_task_storage_get_proto; 53 - extern const struct bpf_func_proto bpf_task_storage_delete_proto; 54 43 void bpf_inode_storage_free(struct inode *inode); 55 - void bpf_task_storage_free(struct task_struct *task); 56 44 57 45 #else /* !CONFIG_BPF_LSM */ 58 46 ··· 61 73 return NULL; 62 74 } 63 75 64 - static inline struct bpf_storage_blob *bpf_task( 65 - const struct task_struct *task) 66 - { 67 - return NULL; 68 - } 69 - 70 76 static inline void bpf_inode_storage_free(struct inode *inode) 71 - { 72 - } 73 - 74 - static inline void bpf_task_storage_free(struct task_struct *task) 75 77 { 76 78 } 77 79
+1 -1
include/linux/bpf_types.h
··· 109 109 #endif 110 110 #ifdef CONFIG_BPF_LSM 111 111 BPF_MAP_TYPE(BPF_MAP_TYPE_INODE_STORAGE, inode_storage_map_ops) 112 - BPF_MAP_TYPE(BPF_MAP_TYPE_TASK_STORAGE, task_storage_map_ops) 113 112 #endif 113 + BPF_MAP_TYPE(BPF_MAP_TYPE_TASK_STORAGE, task_storage_map_ops) 114 114 BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops) 115 115 #if defined(CONFIG_XDP_SOCKETS) 116 116 BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops)
+5
include/linux/sched.h
··· 42 42 struct backing_dev_info; 43 43 struct bio_list; 44 44 struct blk_plug; 45 + struct bpf_local_storage; 45 46 struct capture_control; 46 47 struct cfs_rq; 47 48 struct fs_struct; ··· 1348 1347 #ifdef CONFIG_SECURITY 1349 1348 /* Used by LSM modules for access restriction: */ 1350 1349 void *security; 1350 + #endif 1351 + #ifdef CONFIG_BPF_SYSCALL 1352 + /* Used by BPF task local storage */ 1353 + struct bpf_local_storage __rcu *bpf_storage; 1351 1354 #endif 1352 1355 1353 1356 #ifdef CONFIG_GCC_PLUGIN_STACKLEAK
+1 -2
kernel/bpf/Makefile
··· 9 9 obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o 10 10 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o 11 11 obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o 12 + obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o 12 13 obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o 13 - obj-${CONFIG_BPF_LSM} += bpf_task_storage.o 14 14 obj-$(CONFIG_BPF_SYSCALL) += disasm.o 15 15 obj-$(CONFIG_BPF_JIT) += trampoline.o 16 16 obj-$(CONFIG_BPF_SYSCALL) += btf.o ··· 18 18 ifeq ($(CONFIG_NET),y) 19 19 obj-$(CONFIG_BPF_SYSCALL) += devmap.o 20 20 obj-$(CONFIG_BPF_SYSCALL) += cpumap.o 21 - obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o 22 21 obj-$(CONFIG_BPF_SYSCALL) += offload.o 23 22 obj-$(CONFIG_BPF_SYSCALL) += net_namespace.o 24 23 endif
+17 -11
kernel/bpf/bpf_local_storage.c
··· 140 140 { 141 141 struct bpf_local_storage *local_storage; 142 142 bool free_local_storage = false; 143 + unsigned long flags; 143 144 144 145 if (unlikely(!selem_linked_to_storage(selem))) 145 146 /* selem has already been unlinked from sk */ 146 147 return; 147 148 148 149 local_storage = rcu_dereference(selem->local_storage); 149 - raw_spin_lock_bh(&local_storage->lock); 150 + raw_spin_lock_irqsave(&local_storage->lock, flags); 150 151 if (likely(selem_linked_to_storage(selem))) 151 152 free_local_storage = bpf_selem_unlink_storage_nolock( 152 153 local_storage, selem, true); 153 - raw_spin_unlock_bh(&local_storage->lock); 154 + raw_spin_unlock_irqrestore(&local_storage->lock, flags); 154 155 155 156 if (free_local_storage) 156 157 kfree_rcu(local_storage, rcu); ··· 168 167 { 169 168 struct bpf_local_storage_map *smap; 170 169 struct bpf_local_storage_map_bucket *b; 170 + unsigned long flags; 171 171 172 172 if (unlikely(!selem_linked_to_map(selem))) 173 173 /* selem has already be unlinked from smap */ ··· 176 174 177 175 smap = rcu_dereference(SDATA(selem)->smap); 178 176 b = select_bucket(smap, selem); 179 - raw_spin_lock_bh(&b->lock); 177 + raw_spin_lock_irqsave(&b->lock, flags); 180 178 if (likely(selem_linked_to_map(selem))) 181 179 hlist_del_init_rcu(&selem->map_node); 182 - raw_spin_unlock_bh(&b->lock); 180 + raw_spin_unlock_irqrestore(&b->lock, flags); 183 181 } 184 182 185 183 void bpf_selem_link_map(struct bpf_local_storage_map *smap, 186 184 struct bpf_local_storage_elem *selem) 187 185 { 188 186 struct bpf_local_storage_map_bucket *b = select_bucket(smap, selem); 187 + unsigned long flags; 189 188 190 - raw_spin_lock_bh(&b->lock); 189 + raw_spin_lock_irqsave(&b->lock, flags); 191 190 RCU_INIT_POINTER(SDATA(selem)->smap, smap); 192 191 hlist_add_head_rcu(&selem->map_node, &b->list); 193 - raw_spin_unlock_bh(&b->lock); 192 + raw_spin_unlock_irqrestore(&b->lock, flags); 194 193 } 195 194 196 195 void bpf_selem_unlink(struct bpf_local_storage_elem 
*selem) ··· 227 224 228 225 sdata = SDATA(selem); 229 226 if (cacheit_lockit) { 227 + unsigned long flags; 228 + 230 229 /* spinlock is needed to avoid racing with the 231 230 * parallel delete. Otherwise, publishing an already 232 231 * deleted sdata to the cache will become a use-after-free 233 232 * problem in the next bpf_local_storage_lookup(). 234 233 */ 235 - raw_spin_lock_bh(&local_storage->lock); 234 + raw_spin_lock_irqsave(&local_storage->lock, flags); 236 235 if (selem_linked_to_storage(selem)) 237 236 rcu_assign_pointer(local_storage->cache[smap->cache_idx], 238 237 sdata); 239 - raw_spin_unlock_bh(&local_storage->lock); 238 + raw_spin_unlock_irqrestore(&local_storage->lock, flags); 240 239 } 241 240 242 241 return sdata; ··· 332 327 struct bpf_local_storage_data *old_sdata = NULL; 333 328 struct bpf_local_storage_elem *selem; 334 329 struct bpf_local_storage *local_storage; 330 + unsigned long flags; 335 331 int err; 336 332 337 333 /* BPF_EXIST and BPF_NOEXIST cannot be both set */ ··· 380 374 } 381 375 } 382 376 383 - raw_spin_lock_bh(&local_storage->lock); 377 + raw_spin_lock_irqsave(&local_storage->lock, flags); 384 378 385 379 /* Recheck local_storage->list under local_storage->lock */ 386 380 if (unlikely(hlist_empty(&local_storage->list))) { ··· 434 428 } 435 429 436 430 unlock: 437 - raw_spin_unlock_bh(&local_storage->lock); 431 + raw_spin_unlock_irqrestore(&local_storage->lock, flags); 438 432 return SDATA(selem); 439 433 440 434 unlock_err: 441 - raw_spin_unlock_bh(&local_storage->lock); 435 + raw_spin_unlock_irqrestore(&local_storage->lock, flags); 442 436 return ERR_PTR(err); 443 437 } 444 438
-4
kernel/bpf/bpf_lsm.c
··· 115 115 return &bpf_spin_lock_proto; 116 116 case BPF_FUNC_spin_unlock: 117 117 return &bpf_spin_unlock_proto; 118 - case BPF_FUNC_task_storage_get: 119 - return &bpf_task_storage_get_proto; 120 - case BPF_FUNC_task_storage_delete: 121 - return &bpf_task_storage_delete_proto; 122 118 case BPF_FUNC_bprm_opts_set: 123 119 return &bpf_bprm_opts_set_proto; 124 120 case BPF_FUNC_ima_inode_hash:
+11 -32
kernel/bpf/bpf_task_storage.c
··· 15 15 #include <linux/bpf_local_storage.h> 16 16 #include <linux/filter.h> 17 17 #include <uapi/linux/btf.h> 18 - #include <linux/bpf_lsm.h> 19 18 #include <linux/btf_ids.h> 20 19 #include <linux/fdtable.h> 21 20 ··· 23 24 static struct bpf_local_storage __rcu **task_storage_ptr(void *owner) 24 25 { 25 26 struct task_struct *task = owner; 26 - struct bpf_storage_blob *bsb; 27 27 28 - bsb = bpf_task(task); 29 - if (!bsb) 30 - return NULL; 31 - return &bsb->storage; 28 + return &task->bpf_storage; 32 29 } 33 30 34 31 static struct bpf_local_storage_data * ··· 33 38 { 34 39 struct bpf_local_storage *task_storage; 35 40 struct bpf_local_storage_map *smap; 36 - struct bpf_storage_blob *bsb; 37 41 38 - bsb = bpf_task(task); 39 - if (!bsb) 40 - return NULL; 41 - 42 - task_storage = rcu_dereference(bsb->storage); 42 + task_storage = rcu_dereference(task->bpf_storage); 43 43 if (!task_storage) 44 44 return NULL; 45 45 ··· 47 57 struct bpf_local_storage_elem *selem; 48 58 struct bpf_local_storage *local_storage; 49 59 bool free_task_storage = false; 50 - struct bpf_storage_blob *bsb; 51 60 struct hlist_node *n; 52 - 53 - bsb = bpf_task(task); 54 - if (!bsb) 55 - return; 61 + unsigned long flags; 56 62 57 63 rcu_read_lock(); 58 64 59 - local_storage = rcu_dereference(bsb->storage); 65 + local_storage = rcu_dereference(task->bpf_storage); 60 66 if (!local_storage) { 61 67 rcu_read_unlock(); 62 68 return; ··· 67 81 * when unlinking elem from the local_storage->list and 68 82 * the map's bucket->list. 69 83 */ 70 - raw_spin_lock_bh(&local_storage->lock); 84 + raw_spin_lock_irqsave(&local_storage->lock, flags); 71 85 hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { 72 86 /* Always unlink from map before unlinking from 73 87 * local_storage. 
··· 76 90 free_task_storage = bpf_selem_unlink_storage_nolock( 77 91 local_storage, selem, false); 78 92 } 79 - raw_spin_unlock_bh(&local_storage->lock); 93 + raw_spin_unlock_irqrestore(&local_storage->lock, flags); 80 94 rcu_read_unlock(); 81 95 82 96 /* free_task_storage should always be true as long as ··· 136 150 */ 137 151 WARN_ON_ONCE(!rcu_read_lock_held()); 138 152 task = pid_task(pid, PIDTYPE_PID); 139 - if (!task || !task_storage_ptr(task)) { 153 + if (!task) { 140 154 err = -ENOENT; 141 155 goto out; 142 156 } ··· 199 213 if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE)) 200 214 return (unsigned long)NULL; 201 215 202 - /* explicitly check that the task_storage_ptr is not 203 - * NULL as task_storage_lookup returns NULL in this case and 204 - * bpf_local_storage_update expects the owner to have a 205 - * valid storage pointer. 206 - */ 207 - if (!task || !task_storage_ptr(task)) 216 + if (!task) 208 217 return (unsigned long)NULL; 209 218 210 219 sdata = task_storage_lookup(task, map, true); 211 220 if (sdata) 212 221 return (unsigned long)sdata->data; 213 222 214 - /* This helper must only be called from places where the lifetime of the task 215 - * is guaranteed. Either by being refcounted or by being protected 216 - * by an RCU read-side critical section. 217 - */ 218 - if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) { 223 + /* only allocate new storage, when the task is refcounted */ 224 + if (refcount_read(&task->usage) && 225 + (flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) { 219 226 sdata = bpf_local_storage_update( 220 227 task, (struct bpf_local_storage_map *)map, value, 221 228 BPF_NOEXIST);
+5
kernel/fork.c
··· 96 96 #include <linux/kasan.h> 97 97 #include <linux/scs.h> 98 98 #include <linux/io_uring.h> 99 + #include <linux/bpf.h> 99 100 100 101 #include <asm/pgalloc.h> 101 102 #include <linux/uaccess.h> ··· 735 734 cgroup_free(tsk); 736 735 task_numa_free(tsk, true); 737 736 security_task_free(tsk); 737 + bpf_task_storage_free(tsk); 738 738 exit_creds(tsk); 739 739 delayacct_tsk_free(tsk); 740 740 put_signal_struct(tsk->signal); ··· 2063 2061 #ifdef CONFIG_BCACHE 2064 2062 p->sequential_io = 0; 2065 2063 p->sequential_io_avg = 0; 2064 + #endif 2065 + #ifdef CONFIG_BPF_SYSCALL 2066 + RCU_INIT_POINTER(p->bpf_storage, NULL); 2066 2067 #endif 2067 2068 2068 2069 /* Perform scheduler related setup. Assign this task to a CPU. */
+4
kernel/trace/bpf_trace.c
··· 1367 1367 return &bpf_per_cpu_ptr_proto; 1368 1368 case BPF_FUNC_this_cpu_ptr: 1369 1369 return &bpf_this_cpu_ptr_proto; 1370 + case BPF_FUNC_task_storage_get: 1371 + return &bpf_task_storage_get_proto; 1372 + case BPF_FUNC_task_storage_delete: 1373 + return &bpf_task_storage_delete_proto; 1370 1374 default: 1371 1375 return NULL; 1372 1376 }