Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bpf: Support kptrs in local storage maps

Enable support for kptrs in local storage maps by wiring up the freeing
of these kptrs from map value. Freeing of bpf_local_storage_map is only
delayed in case there are special fields, therefore bpf_selem_free_*
path can also only dereference smap safely in that case. This is
recorded using a bool utilizing a hole in bpf_local_storage_elem. It
could have been tagged in the pointer value smap using the lowest bit
(since alignment > 1), but since there was already a hole I went with
the simpler option. Only the map structure freeing is delayed using RCU
barriers, as the buckets aren't used when selem is being freed, so they
can be freed once all readers of the bucket lists can no longer access
it.

Cc: Martin KaFai Lau <martin.lau@kernel.org>
Cc: KP Singh <kpsingh@kernel.org>
Cc: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20230225154010.391965-3-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>

authored by

Kumar Kartikeya Dwivedi and committed by
Alexei Starovoitov
9db44fdd 65334e64

+63 -9
+6
include/linux/bpf_local_storage.h
··· 74 74 struct hlist_node snode; /* Linked to bpf_local_storage */ 75 75 struct bpf_local_storage __rcu *local_storage; 76 76 struct rcu_head rcu; 77 + bool can_use_smap; /* Is it safe to access smap in bpf_selem_free_* RCU 78 + * callbacks? bpf_local_storage_map_free only 79 + * executes rcu_barrier when there are special 80 + * fields, this field remembers that to ensure we 81 + * don't access already freed smap in sdata. 82 + */ 77 83 /* 8 bytes hole */ 78 84 /* The data is stored in another cacheline to minimize 79 85 * the number of cachelines access during a cache hit.
+44 -4
kernel/bpf/bpf_local_storage.c
··· 85 85 if (selem) { 86 86 if (value) 87 87 copy_map_value(&smap->map, SDATA(selem)->data, value); 88 + /* No need to call check_and_init_map_value as memory is zero init */ 88 89 return selem; 89 90 } 90 91 ··· 114 113 struct bpf_local_storage_elem *selem; 115 114 116 115 selem = container_of(rcu, struct bpf_local_storage_elem, rcu); 116 + /* The can_use_smap bool is set whenever we need to free additional 117 + * fields in selem data before freeing selem. bpf_local_storage_map_free 118 + * only executes rcu_barrier to wait for RCU callbacks when it has 119 + * special fields, hence we can only conditionally dereference smap, as 120 + * by this time the map might have already been freed without waiting 121 + * for our call_rcu callback if it did not have any special fields. 122 + */ 123 + if (selem->can_use_smap) 124 + bpf_obj_free_fields(SDATA(selem)->smap->map.record, SDATA(selem)->data); 125 + kfree(selem); 126 + } 127 + 128 + static void bpf_selem_free_tasks_trace_rcu(struct rcu_head *rcu) 129 + { 130 + /* Free directly if Tasks Trace RCU GP also implies RCU GP */ 117 131 if (rcu_trace_implies_rcu_gp()) 118 - kfree(selem); 132 + bpf_selem_free_rcu(rcu); 119 133 else 120 - kfree_rcu(selem, rcu); 134 + call_rcu(rcu, bpf_selem_free_rcu); 121 135 } 122 136 123 137 /* local_storage->lock must be held and selem->local_storage == local_storage. 
··· 186 170 RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL); 187 171 188 172 if (use_trace_rcu) 189 - call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu); 173 + call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_tasks_trace_rcu); 190 174 else 191 - kfree_rcu(selem, rcu); 175 + call_rcu(&selem->rcu, bpf_selem_free_rcu); 192 176 193 177 return free_local_storage; 194 178 } ··· 256 240 RCU_INIT_POINTER(SDATA(selem)->smap, smap); 257 241 hlist_add_head_rcu(&selem->map_node, &b->list); 258 242 raw_spin_unlock_irqrestore(&b->lock, flags); 243 + 244 + /* If our data will have special fields, smap will wait for us to use 245 + * its record in bpf_selem_free_* RCU callbacks before freeing itself. 246 + */ 247 + selem->can_use_smap = !IS_ERR_OR_NULL(smap->map.record); 259 248 } 260 249 261 250 void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu) ··· 744 723 */ 745 724 synchronize_rcu(); 746 725 726 + /* Only delay freeing of smap, buckets are not needed anymore */ 747 727 kvfree(smap->buckets); 728 + 729 + /* When local storage has special fields, callbacks for 730 + * bpf_selem_free_rcu and bpf_selem_free_tasks_trace_rcu will keep using 731 + * the map BTF record, we need to execute an RCU barrier to wait for 732 + * them as the record will be freed right after our map_free callback. 733 + */ 734 + if (!IS_ERR_OR_NULL(smap->map.record)) { 735 + rcu_barrier_tasks_trace(); 736 + /* We cannot skip rcu_barrier() when rcu_trace_implies_rcu_gp() 737 + * is true, because while call_rcu invocation is skipped in that 738 + * case in bpf_selem_free_tasks_trace_rcu (and all local storage 739 + * maps pass use_trace_rcu = true), there can be call_rcu 740 + * callbacks based on use_trace_rcu = false in the earlier while 741 + * ((selem = ...)) loop or from bpf_local_storage_unlink_nolock 742 + * called from owner's free path. 743 + */ 744 + rcu_barrier(); 745 + } 748 746 bpf_map_area_free(smap); 749 747 }
+5 -1
kernel/bpf/syscall.c
··· 1063 1063 map->map_type != BPF_MAP_TYPE_LRU_HASH && 1064 1064 map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH && 1065 1065 map->map_type != BPF_MAP_TYPE_ARRAY && 1066 - map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY) { 1066 + map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY && 1067 + map->map_type != BPF_MAP_TYPE_SK_STORAGE && 1068 + map->map_type != BPF_MAP_TYPE_INODE_STORAGE && 1069 + map->map_type != BPF_MAP_TYPE_TASK_STORAGE && 1070 + map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) { 1067 1071 ret = -EOPNOTSUPP; 1068 1072 goto free_map_tab; 1069 1073 }
+8 -4
kernel/bpf/verifier.c
··· 7222 7222 break; 7223 7223 case BPF_MAP_TYPE_SK_STORAGE: 7224 7224 if (func_id != BPF_FUNC_sk_storage_get && 7225 - func_id != BPF_FUNC_sk_storage_delete) 7225 + func_id != BPF_FUNC_sk_storage_delete && 7226 + func_id != BPF_FUNC_kptr_xchg) 7226 7227 goto error; 7227 7228 break; 7228 7229 case BPF_MAP_TYPE_INODE_STORAGE: 7229 7230 if (func_id != BPF_FUNC_inode_storage_get && 7230 - func_id != BPF_FUNC_inode_storage_delete) 7231 + func_id != BPF_FUNC_inode_storage_delete && 7232 + func_id != BPF_FUNC_kptr_xchg) 7231 7233 goto error; 7232 7234 break; 7233 7235 case BPF_MAP_TYPE_TASK_STORAGE: 7234 7236 if (func_id != BPF_FUNC_task_storage_get && 7235 - func_id != BPF_FUNC_task_storage_delete) 7237 + func_id != BPF_FUNC_task_storage_delete && 7238 + func_id != BPF_FUNC_kptr_xchg) 7236 7239 goto error; 7237 7240 break; 7238 7241 case BPF_MAP_TYPE_CGRP_STORAGE: 7239 7242 if (func_id != BPF_FUNC_cgrp_storage_get && 7240 - func_id != BPF_FUNC_cgrp_storage_delete) 7243 + func_id != BPF_FUNC_cgrp_storage_delete && 7244 + func_id != BPF_FUNC_kptr_xchg) 7241 7245 goto error; 7242 7246 break; 7243 7247 case BPF_MAP_TYPE_BLOOM_FILTER: