Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bpf: add queue and stack maps

Queue/stack maps implement a FIFO/LIFO data storage for ebpf programs.
These maps support peek, pop and push operations that are exposed to eBPF
programs through the new bpf_map[peek/pop/push] helpers. Those operations
are exposed to userspace applications through the already existing
syscalls in the following way:

BPF_MAP_LOOKUP_ELEM -> peek
BPF_MAP_LOOKUP_AND_DELETE_ELEM -> pop
BPF_MAP_UPDATE_ELEM -> push

Queue/stack maps are implemented using a buffer, tail and head indexes,
hence BPF_F_NO_PREALLOC is not supported.

As opposed to other maps, queue and stack do not use RCU for protecting
map values; the bpf_map[peek/pop] helpers have an ARG_PTR_TO_UNINIT_MAP_VALUE
argument that is a pointer to a memory zone where the value of the map is
saved. It is basically the same as ARG_PTR_TO_UNINIT_MEM, except that the
size does not have to be passed as an extra argument.

Our main motivation for implementing queue/stack maps was to keep track
of a pool of elements, like network ports in an SNAT; however, we foresee
other use cases, like for example saving the last N kernel events in a map
and then analysing them from userspace.

Signed-off-by: Mauricio Vasquez B <mauricio.vasquez@polito.it>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>

authored by

Mauricio Vasquez B and committed by
Alexei Starovoitov
f1a2e44a 2ea864c5

+401 -3
+6
include/linux/bpf.h
··· 39 39 void *(*map_lookup_elem)(struct bpf_map *map, void *key); 40 40 int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags); 41 41 int (*map_delete_elem)(struct bpf_map *map, void *key); 42 + int (*map_push_elem)(struct bpf_map *map, void *value, u64 flags); 43 + int (*map_pop_elem)(struct bpf_map *map, void *value); 44 + int (*map_peek_elem)(struct bpf_map *map, void *value); 42 45 43 46 /* funcs called by prog_array and perf_event_array map */ 44 47 void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file, ··· 814 811 extern const struct bpf_func_proto bpf_map_lookup_elem_proto; 815 812 extern const struct bpf_func_proto bpf_map_update_elem_proto; 816 813 extern const struct bpf_func_proto bpf_map_delete_elem_proto; 814 + extern const struct bpf_func_proto bpf_map_push_elem_proto; 815 + extern const struct bpf_func_proto bpf_map_pop_elem_proto; 816 + extern const struct bpf_func_proto bpf_map_peek_elem_proto; 817 817 818 818 extern const struct bpf_func_proto bpf_get_prandom_u32_proto; 819 819 extern const struct bpf_func_proto bpf_get_smp_processor_id_proto;
+2
include/linux/bpf_types.h
··· 69 69 BPF_MAP_TYPE(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, reuseport_array_ops) 70 70 #endif 71 71 #endif 72 + BPF_MAP_TYPE(BPF_MAP_TYPE_QUEUE, queue_map_ops) 73 + BPF_MAP_TYPE(BPF_MAP_TYPE_STACK, stack_map_ops)
+28 -1
include/uapi/linux/bpf.h
··· 128 128 BPF_MAP_TYPE_CGROUP_STORAGE, 129 129 BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, 130 130 BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, 131 + BPF_MAP_TYPE_QUEUE, 132 + BPF_MAP_TYPE_STACK, 131 133 }; 132 134 133 135 enum bpf_prog_type { ··· 462 460 * Description 463 461 * Delete entry with *key* from *map*. 464 462 * Return 463 + * 0 on success, or a negative error in case of failure. 464 + * 465 + * int bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags) 466 + * Description 467 + * Push an element *value* in *map*. *flags* is one of: 468 + * 469 + * **BPF_EXIST** 470 + * If the queue/stack is full, the oldest element is removed to 471 + * make room for this. 472 + * Return 473 + * 0 on success, or a negative error in case of failure. 474 + * 475 + * int bpf_map_pop_elem(struct bpf_map *map, void *value) 476 + * Description 477 + * Pop an element from *map*. 478 + * Return 479 + * 0 on success, or a negative error in case of failure. 480 + * 481 + * int bpf_map_peek_elem(struct bpf_map *map, void *value) 482 + * Description 483 + * Get an element from *map* without removing it. 484 + * Return 465 485 * 0 on success, or a negative error in case of failure. 466 486 * 467 487 * int bpf_probe_read(void *dst, u32 size, const void *src) ··· 2327 2303 FN(skb_ancestor_cgroup_id), \ 2328 2304 FN(sk_lookup_tcp), \ 2329 2305 FN(sk_lookup_udp), \ 2330 - FN(sk_release), 2306 + FN(sk_release), \ 2307 + FN(map_push_elem), \ 2308 + FN(map_pop_elem), \ 2309 + FN(map_peek_elem), 2331 2310 2332 2311 /* integer value in 'imm' field of BPF_CALL instruction selects which helper 2333 2312 * function eBPF program intends to call
+1 -1
kernel/bpf/Makefile
··· 3 3 4 4 obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o 5 5 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o 6 - obj-$(CONFIG_BPF_SYSCALL) += local_storage.o 6 + obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o 7 7 obj-$(CONFIG_BPF_SYSCALL) += disasm.o 8 8 obj-$(CONFIG_BPF_SYSCALL) += btf.o 9 9 ifeq ($(CONFIG_NET),y)
+3
kernel/bpf/core.c
··· 1783 1783 const struct bpf_func_proto bpf_map_lookup_elem_proto __weak; 1784 1784 const struct bpf_func_proto bpf_map_update_elem_proto __weak; 1785 1785 const struct bpf_func_proto bpf_map_delete_elem_proto __weak; 1786 + const struct bpf_func_proto bpf_map_push_elem_proto __weak; 1787 + const struct bpf_func_proto bpf_map_pop_elem_proto __weak; 1788 + const struct bpf_func_proto bpf_map_peek_elem_proto __weak; 1786 1789 1787 1790 const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; 1788 1791 const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
+43
kernel/bpf/helpers.c
··· 76 76 .arg2_type = ARG_PTR_TO_MAP_KEY, 77 77 }; 78 78 79 + BPF_CALL_3(bpf_map_push_elem, struct bpf_map *, map, void *, value, u64, flags) 80 + { 81 + return map->ops->map_push_elem(map, value, flags); 82 + } 83 + 84 + const struct bpf_func_proto bpf_map_push_elem_proto = { 85 + .func = bpf_map_push_elem, 86 + .gpl_only = false, 87 + .pkt_access = true, 88 + .ret_type = RET_INTEGER, 89 + .arg1_type = ARG_CONST_MAP_PTR, 90 + .arg2_type = ARG_PTR_TO_MAP_VALUE, 91 + .arg3_type = ARG_ANYTHING, 92 + }; 93 + 94 + BPF_CALL_2(bpf_map_pop_elem, struct bpf_map *, map, void *, value) 95 + { 96 + return map->ops->map_pop_elem(map, value); 97 + } 98 + 99 + const struct bpf_func_proto bpf_map_pop_elem_proto = { 100 + .func = bpf_map_pop_elem, 101 + .gpl_only = false, 102 + .pkt_access = true, 103 + .ret_type = RET_INTEGER, 104 + .arg1_type = ARG_CONST_MAP_PTR, 105 + .arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE, 106 + }; 107 + 108 + BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value) 109 + { 110 + return map->ops->map_peek_elem(map, value); 111 + } 112 + 113 + const struct bpf_func_proto bpf_map_peek_elem_proto = { 114 + .func = bpf_map_pop_elem, 115 + .gpl_only = false, 116 + .pkt_access = true, 117 + .ret_type = RET_INTEGER, 118 + .arg1_type = ARG_CONST_MAP_PTR, 119 + .arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE, 120 + }; 121 + 79 122 const struct bpf_func_proto bpf_get_prandom_u32_proto = { 80 123 .func = bpf_user_rnd_u32, 81 124 .gpl_only = false,
+288
kernel/bpf/queue_stack_maps.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * queue_stack_maps.c: BPF queue and stack maps 4 + * 5 + * Copyright (c) 2018 Politecnico di Torino 6 + */ 7 + #include <linux/bpf.h> 8 + #include <linux/list.h> 9 + #include <linux/slab.h> 10 + #include "percpu_freelist.h" 11 + 12 + #define QUEUE_STACK_CREATE_FLAG_MASK \ 13 + (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) 14 + 15 + 16 + struct bpf_queue_stack { 17 + struct bpf_map map; 18 + raw_spinlock_t lock; 19 + u32 head, tail; 20 + u32 size; /* max_entries + 1 */ 21 + 22 + char elements[0] __aligned(8); 23 + }; 24 + 25 + static struct bpf_queue_stack *bpf_queue_stack(struct bpf_map *map) 26 + { 27 + return container_of(map, struct bpf_queue_stack, map); 28 + } 29 + 30 + static bool queue_stack_map_is_empty(struct bpf_queue_stack *qs) 31 + { 32 + return qs->head == qs->tail; 33 + } 34 + 35 + static bool queue_stack_map_is_full(struct bpf_queue_stack *qs) 36 + { 37 + u32 head = qs->head + 1; 38 + 39 + if (unlikely(head >= qs->size)) 40 + head = 0; 41 + 42 + return head == qs->tail; 43 + } 44 + 45 + /* Called from syscall */ 46 + static int queue_stack_map_alloc_check(union bpf_attr *attr) 47 + { 48 + /* check sanity of attributes */ 49 + if (attr->max_entries == 0 || attr->key_size != 0 || 50 + attr->map_flags & ~QUEUE_STACK_CREATE_FLAG_MASK) 51 + return -EINVAL; 52 + 53 + if (attr->value_size > KMALLOC_MAX_SIZE) 54 + /* if value_size is bigger, the user space won't be able to 55 + * access the elements. 
56 + */ 57 + return -E2BIG; 58 + 59 + return 0; 60 + } 61 + 62 + static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr) 63 + { 64 + int ret, numa_node = bpf_map_attr_numa_node(attr); 65 + struct bpf_queue_stack *qs; 66 + u32 size, value_size; 67 + u64 queue_size, cost; 68 + 69 + size = attr->max_entries + 1; 70 + value_size = attr->value_size; 71 + 72 + queue_size = sizeof(*qs) + (u64) value_size * size; 73 + 74 + cost = queue_size; 75 + if (cost >= U32_MAX - PAGE_SIZE) 76 + return ERR_PTR(-E2BIG); 77 + 78 + cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; 79 + 80 + ret = bpf_map_precharge_memlock(cost); 81 + if (ret < 0) 82 + return ERR_PTR(ret); 83 + 84 + qs = bpf_map_area_alloc(queue_size, numa_node); 85 + if (!qs) 86 + return ERR_PTR(-ENOMEM); 87 + 88 + memset(qs, 0, sizeof(*qs)); 89 + 90 + bpf_map_init_from_attr(&qs->map, attr); 91 + 92 + qs->map.pages = cost; 93 + qs->size = size; 94 + 95 + raw_spin_lock_init(&qs->lock); 96 + 97 + return &qs->map; 98 + } 99 + 100 + /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ 101 + static void queue_stack_map_free(struct bpf_map *map) 102 + { 103 + struct bpf_queue_stack *qs = bpf_queue_stack(map); 104 + 105 + /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, 106 + * so the programs (can be more than one that used this map) were 107 + * disconnected from events. 
Wait for outstanding critical sections in 108 + * these programs to complete 109 + */ 110 + synchronize_rcu(); 111 + 112 + bpf_map_area_free(qs); 113 + } 114 + 115 + static int __queue_map_get(struct bpf_map *map, void *value, bool delete) 116 + { 117 + struct bpf_queue_stack *qs = bpf_queue_stack(map); 118 + unsigned long flags; 119 + int err = 0; 120 + void *ptr; 121 + 122 + raw_spin_lock_irqsave(&qs->lock, flags); 123 + 124 + if (queue_stack_map_is_empty(qs)) { 125 + err = -ENOENT; 126 + goto out; 127 + } 128 + 129 + ptr = &qs->elements[qs->tail * qs->map.value_size]; 130 + memcpy(value, ptr, qs->map.value_size); 131 + 132 + if (delete) { 133 + if (unlikely(++qs->tail >= qs->size)) 134 + qs->tail = 0; 135 + } 136 + 137 + out: 138 + raw_spin_unlock_irqrestore(&qs->lock, flags); 139 + return err; 140 + } 141 + 142 + 143 + static int __stack_map_get(struct bpf_map *map, void *value, bool delete) 144 + { 145 + struct bpf_queue_stack *qs = bpf_queue_stack(map); 146 + unsigned long flags; 147 + int err = 0; 148 + void *ptr; 149 + u32 index; 150 + 151 + raw_spin_lock_irqsave(&qs->lock, flags); 152 + 153 + if (queue_stack_map_is_empty(qs)) { 154 + err = -ENOENT; 155 + goto out; 156 + } 157 + 158 + index = qs->head - 1; 159 + if (unlikely(index >= qs->size)) 160 + index = qs->size - 1; 161 + 162 + ptr = &qs->elements[index * qs->map.value_size]; 163 + memcpy(value, ptr, qs->map.value_size); 164 + 165 + if (delete) 166 + qs->head = index; 167 + 168 + out: 169 + raw_spin_unlock_irqrestore(&qs->lock, flags); 170 + return err; 171 + } 172 + 173 + /* Called from syscall or from eBPF program */ 174 + static int queue_map_peek_elem(struct bpf_map *map, void *value) 175 + { 176 + return __queue_map_get(map, value, false); 177 + } 178 + 179 + /* Called from syscall or from eBPF program */ 180 + static int stack_map_peek_elem(struct bpf_map *map, void *value) 181 + { 182 + return __stack_map_get(map, value, false); 183 + } 184 + 185 + /* Called from syscall or from eBPF program */ 
186 + static int queue_map_pop_elem(struct bpf_map *map, void *value) 187 + { 188 + return __queue_map_get(map, value, true); 189 + } 190 + 191 + /* Called from syscall or from eBPF program */ 192 + static int stack_map_pop_elem(struct bpf_map *map, void *value) 193 + { 194 + return __stack_map_get(map, value, true); 195 + } 196 + 197 + /* Called from syscall or from eBPF program */ 198 + static int queue_stack_map_push_elem(struct bpf_map *map, void *value, 199 + u64 flags) 200 + { 201 + struct bpf_queue_stack *qs = bpf_queue_stack(map); 202 + unsigned long irq_flags; 203 + int err = 0; 204 + void *dst; 205 + 206 + /* BPF_EXIST is used to force making room for a new element in case the 207 + * map is full 208 + */ 209 + bool replace = (flags & BPF_EXIST); 210 + 211 + /* Check supported flags for queue and stack maps */ 212 + if (flags & BPF_NOEXIST || flags > BPF_EXIST) 213 + return -EINVAL; 214 + 215 + raw_spin_lock_irqsave(&qs->lock, irq_flags); 216 + 217 + if (queue_stack_map_is_full(qs)) { 218 + if (!replace) { 219 + err = -E2BIG; 220 + goto out; 221 + } 222 + /* advance tail pointer to overwrite oldest element */ 223 + if (unlikely(++qs->tail >= qs->size)) 224 + qs->tail = 0; 225 + } 226 + 227 + dst = &qs->elements[qs->head * qs->map.value_size]; 228 + memcpy(dst, value, qs->map.value_size); 229 + 230 + if (unlikely(++qs->head >= qs->size)) 231 + qs->head = 0; 232 + 233 + out: 234 + raw_spin_unlock_irqrestore(&qs->lock, irq_flags); 235 + return err; 236 + } 237 + 238 + /* Called from syscall or from eBPF program */ 239 + static void *queue_stack_map_lookup_elem(struct bpf_map *map, void *key) 240 + { 241 + return NULL; 242 + } 243 + 244 + /* Called from syscall or from eBPF program */ 245 + static int queue_stack_map_update_elem(struct bpf_map *map, void *key, 246 + void *value, u64 flags) 247 + { 248 + return -EINVAL; 249 + } 250 + 251 + /* Called from syscall or from eBPF program */ 252 + static int queue_stack_map_delete_elem(struct bpf_map *map, void 
*key) 253 + { 254 + return -EINVAL; 255 + } 256 + 257 + /* Called from syscall */ 258 + static int queue_stack_map_get_next_key(struct bpf_map *map, void *key, 259 + void *next_key) 260 + { 261 + return -EINVAL; 262 + } 263 + 264 + const struct bpf_map_ops queue_map_ops = { 265 + .map_alloc_check = queue_stack_map_alloc_check, 266 + .map_alloc = queue_stack_map_alloc, 267 + .map_free = queue_stack_map_free, 268 + .map_lookup_elem = queue_stack_map_lookup_elem, 269 + .map_update_elem = queue_stack_map_update_elem, 270 + .map_delete_elem = queue_stack_map_delete_elem, 271 + .map_push_elem = queue_stack_map_push_elem, 272 + .map_pop_elem = queue_map_pop_elem, 273 + .map_peek_elem = queue_map_peek_elem, 274 + .map_get_next_key = queue_stack_map_get_next_key, 275 + }; 276 + 277 + const struct bpf_map_ops stack_map_ops = { 278 + .map_alloc_check = queue_stack_map_alloc_check, 279 + .map_alloc = queue_stack_map_alloc, 280 + .map_free = queue_stack_map_free, 281 + .map_lookup_elem = queue_stack_map_lookup_elem, 282 + .map_update_elem = queue_stack_map_update_elem, 283 + .map_delete_elem = queue_stack_map_delete_elem, 284 + .map_push_elem = queue_stack_map_push_elem, 285 + .map_pop_elem = stack_map_pop_elem, 286 + .map_peek_elem = stack_map_peek_elem, 287 + .map_get_next_key = queue_stack_map_get_next_key, 288 + };
+6
kernel/bpf/syscall.c
··· 727 727 err = bpf_fd_htab_map_lookup_elem(map, key, value); 728 728 } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { 729 729 err = bpf_fd_reuseport_array_lookup_elem(map, key, value); 730 + } else if (map->map_type == BPF_MAP_TYPE_QUEUE || 731 + map->map_type == BPF_MAP_TYPE_STACK) { 732 + err = map->ops->map_peek_elem(map, value); 730 733 } else { 731 734 rcu_read_lock(); 732 735 ptr = map->ops->map_lookup_elem(map, key); ··· 860 857 /* rcu_read_lock() is not needed */ 861 858 err = bpf_fd_reuseport_array_update_elem(map, key, value, 862 859 attr->flags); 860 + } else if (map->map_type == BPF_MAP_TYPE_QUEUE || 861 + map->map_type == BPF_MAP_TYPE_STACK) { 862 + err = map->ops->map_push_elem(map, value, attr->flags); 863 863 } else { 864 864 rcu_read_lock(); 865 865 err = map->ops->map_update_elem(map, key, value, attr->flags);
+18 -1
kernel/bpf/verifier.c
··· 2324 2324 if (func_id != BPF_FUNC_sk_select_reuseport) 2325 2325 goto error; 2326 2326 break; 2327 + case BPF_MAP_TYPE_QUEUE: 2328 + case BPF_MAP_TYPE_STACK: 2329 + if (func_id != BPF_FUNC_map_peek_elem && 2330 + func_id != BPF_FUNC_map_pop_elem && 2331 + func_id != BPF_FUNC_map_push_elem) 2332 + goto error; 2333 + break; 2327 2334 default: 2328 2335 break; 2329 2336 } ··· 2385 2378 break; 2386 2379 case BPF_FUNC_sk_select_reuseport: 2387 2380 if (map->map_type != BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) 2381 + goto error; 2382 + break; 2383 + case BPF_FUNC_map_peek_elem: 2384 + case BPF_FUNC_map_pop_elem: 2385 + case BPF_FUNC_map_push_elem: 2386 + if (map->map_type != BPF_MAP_TYPE_QUEUE && 2387 + map->map_type != BPF_MAP_TYPE_STACK) 2388 2388 goto error; 2389 2389 break; 2390 2390 default: ··· 2689 2675 if (func_id != BPF_FUNC_tail_call && 2690 2676 func_id != BPF_FUNC_map_lookup_elem && 2691 2677 func_id != BPF_FUNC_map_update_elem && 2692 - func_id != BPF_FUNC_map_delete_elem) 2678 + func_id != BPF_FUNC_map_delete_elem && 2679 + func_id != BPF_FUNC_map_push_elem && 2680 + func_id != BPF_FUNC_map_pop_elem && 2681 + func_id != BPF_FUNC_map_peek_elem) 2693 2682 return 0; 2694 2683 2695 2684 if (meta->map_ptr == NULL) {
+6
net/core/filter.c
··· 4876 4876 return &bpf_map_update_elem_proto; 4877 4877 case BPF_FUNC_map_delete_elem: 4878 4878 return &bpf_map_delete_elem_proto; 4879 + case BPF_FUNC_map_push_elem: 4880 + return &bpf_map_push_elem_proto; 4881 + case BPF_FUNC_map_pop_elem: 4882 + return &bpf_map_pop_elem_proto; 4883 + case BPF_FUNC_map_peek_elem: 4884 + return &bpf_map_peek_elem_proto; 4879 4885 case BPF_FUNC_get_prandom_u32: 4880 4886 return &bpf_get_prandom_u32_proto; 4881 4887 case BPF_FUNC_get_smp_processor_id: