Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bpf: introduce per-cpu cgroup local storage

This commit introduces per-cpu cgroup local storage.

Per-cpu cgroup local storage is very similar to simple cgroup storage
(let's call it shared), except all the data is per-cpu.

The main goal of the per-cpu variant is to implement super fast
counters (e.g. packet counters), which require neither
lookups nor atomic operations.

From userspace's point of view, accessing a per-cpu cgroup storage
is similar to other per-cpu map types (e.g. per-cpu hashmaps and
arrays).

Writing to a per-cpu cgroup storage is not atomic, but is performed
by copying longs, so some minimal atomicity is here, exactly
as with other per-cpu maps.

Signed-off-by: Roman Gushchin <guro@fb.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Alexei Starovoitov <ast@kernel.org>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>

authored by

Roman Gushchin and committed by
Daniel Borkmann
b741f163 f294b37e

+179 -28
+19 -1
include/linux/bpf-cgroup.h
··· 37 37 }; 38 38 39 39 struct bpf_cgroup_storage { 40 - struct bpf_storage_buffer *buf; 40 + union { 41 + struct bpf_storage_buffer *buf; 42 + void __percpu *percpu_buf; 43 + }; 41 44 struct bpf_cgroup_storage_map *map; 42 45 struct bpf_cgroup_storage_key key; 43 46 struct list_head list; ··· 112 109 static inline enum bpf_cgroup_storage_type cgroup_storage_type( 113 110 struct bpf_map *map) 114 111 { 112 + if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) 113 + return BPF_CGROUP_STORAGE_PERCPU; 114 + 115 115 return BPF_CGROUP_STORAGE_SHARED; 116 116 } 117 117 ··· 136 130 void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage); 137 131 int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *map); 138 132 void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *map); 133 + 134 + int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value); 135 + int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, 136 + void *value, u64 flags); 139 137 140 138 /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */ 141 139 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \ ··· 295 285 struct bpf_prog *prog, enum bpf_cgroup_storage_type stype) { return 0; } 296 286 static inline void bpf_cgroup_storage_free( 297 287 struct bpf_cgroup_storage *storage) {} 288 + static inline int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, 289 + void *value) { 290 + return 0; 291 + } 292 + static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, 293 + void *key, void *value, u64 flags) { 294 + return 0; 295 + } 298 296 299 297 #define cgroup_bpf_enabled (0) 300 298 #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0)
+1
include/linux/bpf.h
··· 274 274 275 275 enum bpf_cgroup_storage_type { 276 276 BPF_CGROUP_STORAGE_SHARED, 277 + BPF_CGROUP_STORAGE_PERCPU, 277 278 __BPF_CGROUP_STORAGE_MAX 278 279 }; 279 280
+1
include/linux/bpf_types.h
··· 43 43 #endif 44 44 #ifdef CONFIG_CGROUP_BPF 45 45 BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_STORAGE, cgroup_storage_map_ops) 46 + BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, cgroup_storage_map_ops) 46 47 #endif 47 48 BPF_MAP_TYPE(BPF_MAP_TYPE_HASH, htab_map_ops) 48 49 BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_HASH, htab_percpu_map_ops)
+1
include/uapi/linux/bpf.h
··· 127 127 BPF_MAP_TYPE_SOCKHASH, 128 128 BPF_MAP_TYPE_CGROUP_STORAGE, 129 129 BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, 130 + BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, 130 131 }; 131 132 132 133 enum bpf_prog_type {
+7 -1
kernel/bpf/helpers.c
··· 206 206 */ 207 207 enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); 208 208 struct bpf_cgroup_storage *storage; 209 + void *ptr; 209 210 210 211 storage = this_cpu_read(bpf_cgroup_storage[stype]); 211 212 212 - return (unsigned long)&READ_ONCE(storage->buf)->data[0]; 213 + if (stype == BPF_CGROUP_STORAGE_SHARED) 214 + ptr = &READ_ONCE(storage->buf)->data[0]; 215 + else 216 + ptr = this_cpu_ptr(storage->percpu_buf); 217 + 218 + return (unsigned long)ptr; 213 219 } 214 220 215 221 const struct bpf_func_proto bpf_get_local_storage_proto = {
+130 -20
kernel/bpf/local_storage.c
··· 152 152 return 0; 153 153 } 154 154 155 + int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *_key, 156 + void *value) 157 + { 158 + struct bpf_cgroup_storage_map *map = map_to_storage(_map); 159 + struct bpf_cgroup_storage_key *key = _key; 160 + struct bpf_cgroup_storage *storage; 161 + int cpu, off = 0; 162 + u32 size; 163 + 164 + rcu_read_lock(); 165 + storage = cgroup_storage_lookup(map, key, false); 166 + if (!storage) { 167 + rcu_read_unlock(); 168 + return -ENOENT; 169 + } 170 + 171 + /* per_cpu areas are zero-filled and bpf programs can only 172 + * access 'value_size' of them, so copying rounded areas 173 + * will not leak any kernel data 174 + */ 175 + size = round_up(_map->value_size, 8); 176 + for_each_possible_cpu(cpu) { 177 + bpf_long_memcpy(value + off, 178 + per_cpu_ptr(storage->percpu_buf, cpu), size); 179 + off += size; 180 + } 181 + rcu_read_unlock(); 182 + return 0; 183 + } 184 + 185 + int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *_key, 186 + void *value, u64 map_flags) 187 + { 188 + struct bpf_cgroup_storage_map *map = map_to_storage(_map); 189 + struct bpf_cgroup_storage_key *key = _key; 190 + struct bpf_cgroup_storage *storage; 191 + int cpu, off = 0; 192 + u32 size; 193 + 194 + if (map_flags != BPF_ANY && map_flags != BPF_EXIST) 195 + return -EINVAL; 196 + 197 + rcu_read_lock(); 198 + storage = cgroup_storage_lookup(map, key, false); 199 + if (!storage) { 200 + rcu_read_unlock(); 201 + return -ENOENT; 202 + } 203 + 204 + /* the user space will provide round_up(value_size, 8) bytes that 205 + * will be copied into per-cpu area. bpf programs can only access 206 + * value_size of it. 
During lookup the same extra bytes will be 207 + * returned or zeros which were zero-filled by percpu_alloc, 208 + * so no kernel data leaks possible 209 + */ 210 + size = round_up(_map->value_size, 8); 211 + for_each_possible_cpu(cpu) { 212 + bpf_long_memcpy(per_cpu_ptr(storage->percpu_buf, cpu), 213 + value + off, size); 214 + off += size; 215 + } 216 + rcu_read_unlock(); 217 + return 0; 218 + } 219 + 155 220 static int cgroup_storage_get_next_key(struct bpf_map *_map, void *_key, 156 221 void *_next_key) 157 222 { ··· 352 287 spin_unlock_bh(&map->lock); 353 288 } 354 289 290 + static size_t bpf_cgroup_storage_calculate_size(struct bpf_map *map, u32 *pages) 291 + { 292 + size_t size; 293 + 294 + if (cgroup_storage_type(map) == BPF_CGROUP_STORAGE_SHARED) { 295 + size = sizeof(struct bpf_storage_buffer) + map->value_size; 296 + *pages = round_up(sizeof(struct bpf_cgroup_storage) + size, 297 + PAGE_SIZE) >> PAGE_SHIFT; 298 + } else { 299 + size = map->value_size; 300 + *pages = round_up(round_up(size, 8) * num_possible_cpus(), 301 + PAGE_SIZE) >> PAGE_SHIFT; 302 + } 303 + 304 + return size; 305 + } 306 + 355 307 struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog, 356 308 enum bpf_cgroup_storage_type stype) 357 309 { 358 310 struct bpf_cgroup_storage *storage; 359 311 struct bpf_map *map; 312 + gfp_t flags; 313 + size_t size; 360 314 u32 pages; 361 315 362 316 map = prog->aux->cgroup_storage[stype]; 363 317 if (!map) 364 318 return NULL; 365 319 366 - pages = round_up(sizeof(struct bpf_cgroup_storage) + 367 - sizeof(struct bpf_storage_buffer) + 368 - map->value_size, PAGE_SIZE) >> PAGE_SHIFT; 320 + size = bpf_cgroup_storage_calculate_size(map, &pages); 321 + 369 322 if (bpf_map_charge_memlock(map, pages)) 370 323 return ERR_PTR(-EPERM); 371 324 372 325 storage = kmalloc_node(sizeof(struct bpf_cgroup_storage), 373 326 __GFP_ZERO | GFP_USER, map->numa_node); 374 - if (!storage) { 375 - bpf_map_uncharge_memlock(map, pages); 376 - return 
ERR_PTR(-ENOMEM); 377 - } 327 + if (!storage) 328 + goto enomem; 378 329 379 - storage->buf = kmalloc_node(sizeof(struct bpf_storage_buffer) + 380 - map->value_size, __GFP_ZERO | GFP_USER, 381 - map->numa_node); 382 - if (!storage->buf) { 383 - bpf_map_uncharge_memlock(map, pages); 384 - kfree(storage); 385 - return ERR_PTR(-ENOMEM); 330 + flags = __GFP_ZERO | GFP_USER; 331 + 332 + if (stype == BPF_CGROUP_STORAGE_SHARED) { 333 + storage->buf = kmalloc_node(size, flags, map->numa_node); 334 + if (!storage->buf) 335 + goto enomem; 336 + } else { 337 + storage->percpu_buf = __alloc_percpu_gfp(size, 8, flags); 338 + if (!storage->percpu_buf) 339 + goto enomem; 386 340 } 387 341 388 342 storage->map = (struct bpf_cgroup_storage_map *)map; 389 343 390 344 return storage; 345 + 346 + enomem: 347 + bpf_map_uncharge_memlock(map, pages); 348 + kfree(storage); 349 + return ERR_PTR(-ENOMEM); 350 + } 351 + 352 + static void free_shared_cgroup_storage_rcu(struct rcu_head *rcu) 353 + { 354 + struct bpf_cgroup_storage *storage = 355 + container_of(rcu, struct bpf_cgroup_storage, rcu); 356 + 357 + kfree(storage->buf); 358 + kfree(storage); 359 + } 360 + 361 + static void free_percpu_cgroup_storage_rcu(struct rcu_head *rcu) 362 + { 363 + struct bpf_cgroup_storage *storage = 364 + container_of(rcu, struct bpf_cgroup_storage, rcu); 365 + 366 + free_percpu(storage->percpu_buf); 367 + kfree(storage); 391 368 } 392 369 393 370 void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage) 394 371 { 395 - u32 pages; 372 + enum bpf_cgroup_storage_type stype; 396 373 struct bpf_map *map; 374 + u32 pages; 397 375 398 376 if (!storage) 399 377 return; 400 378 401 379 map = &storage->map->map; 402 - pages = round_up(sizeof(struct bpf_cgroup_storage) + 403 - sizeof(struct bpf_storage_buffer) + 404 - map->value_size, PAGE_SIZE) >> PAGE_SHIFT; 380 + 381 + bpf_cgroup_storage_calculate_size(map, &pages); 405 382 bpf_map_uncharge_memlock(map, pages); 406 383 407 - kfree_rcu(storage->buf, rcu); 408 
- kfree_rcu(storage, rcu); 384 + stype = cgroup_storage_type(map); 385 + if (stype == BPF_CGROUP_STORAGE_SHARED) 386 + call_rcu(&storage->rcu, free_shared_cgroup_storage_rcu); 387 + else 388 + call_rcu(&storage->rcu, free_percpu_cgroup_storage_rcu); 409 389 } 410 390 411 391 void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage,
+9 -2
kernel/bpf/syscall.c
··· 686 686 687 687 if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || 688 688 map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || 689 - map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) 689 + map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY || 690 + map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) 690 691 value_size = round_up(map->value_size, 8) * num_possible_cpus(); 691 692 else if (IS_FD_MAP(map)) 692 693 value_size = sizeof(u32); ··· 706 705 err = bpf_percpu_hash_copy(map, key, value); 707 706 } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { 708 707 err = bpf_percpu_array_copy(map, key, value); 708 + } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { 709 + err = bpf_percpu_cgroup_storage_copy(map, key, value); 709 710 } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { 710 711 err = bpf_stackmap_copy(map, key, value); 711 712 } else if (IS_FD_ARRAY(map)) { ··· 777 774 778 775 if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || 779 776 map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || 780 - map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) 777 + map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY || 778 + map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) 781 779 value_size = round_up(map->value_size, 8) * num_possible_cpus(); 782 780 else 783 781 value_size = map->value_size; ··· 813 809 err = bpf_percpu_hash_update(map, key, value, attr->flags); 814 810 } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { 815 811 err = bpf_percpu_array_update(map, key, value, attr->flags); 812 + } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { 813 + err = bpf_percpu_cgroup_storage_update(map, key, value, 814 + attr->flags); 816 815 } else if (IS_FD_ARRAY(map)) { 817 816 rcu_read_lock(); 818 817 err = bpf_fd_array_map_update_elem(map, f.file, key, value,
+11 -4
kernel/bpf/verifier.c
··· 2074 2074 goto error; 2075 2075 break; 2076 2076 case BPF_MAP_TYPE_CGROUP_STORAGE: 2077 + case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE: 2077 2078 if (func_id != BPF_FUNC_get_local_storage) 2078 2079 goto error; 2079 2080 break; ··· 2165 2164 goto error; 2166 2165 break; 2167 2166 case BPF_FUNC_get_local_storage: 2168 - if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE) 2167 + if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && 2168 + map->map_type != BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) 2169 2169 goto error; 2170 2170 break; 2171 2171 case BPF_FUNC_sk_select_reuseport: ··· 5051 5049 return 0; 5052 5050 } 5053 5051 5052 + static bool bpf_map_is_cgroup_storage(struct bpf_map *map) 5053 + { 5054 + return (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE || 5055 + map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE); 5056 + } 5057 + 5054 5058 /* look for pseudo eBPF instructions that access map FDs and 5055 5059 * replace them with actual map pointers 5056 5060 */ ··· 5147 5139 } 5148 5140 env->used_maps[env->used_map_cnt++] = map; 5149 5141 5150 - if (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE && 5142 + if (bpf_map_is_cgroup_storage(map) && 5151 5143 bpf_cgroup_storage_assign(env->prog, map)) { 5152 - verbose(env, 5153 - "only one cgroup storage is allowed\n"); 5144 + verbose(env, "only one cgroup storage of each type is allowed\n"); 5154 5145 fdput(f); 5155 5146 return -EBUSY; 5156 5147 }