Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'bpf-fixes-for-per-cpu-kptr'

Hou Tao says:

====================
bpf: Fixes for per-cpu kptr

From: Hou Tao <houtao1@huawei.com>

Hi,

The patchset aims to fix the problems found in the review of per-cpu
kptr patch-set [0]. Patch #1 moves pcpu_lock after the invocation of
pcpu_chunk_addr_search() and it is a micro-optimization for
free_percpu(). The reason for including it in the patchset is that the
same logic is used in the newly-added API pcpu_alloc_size(). Patch #2 introduces
pcpu_alloc_size() for dynamic per-cpu area. Patch #2 and #3 use
pcpu_alloc_size() to check whether or not unit_size matches with the
size of underlying per-cpu area and to select a matching bpf_mem_cache.
Patch #4 fixes the freeing of per-cpu kptr when these kptrs are freed by
map destruction. The last patch adds test cases for these problems.

Please see individual patches for details. And comments are always
welcome.

Change Log:
v3:
* rebased on bpf-next
* patch 2: update API document to note that pcpu_alloc_size() doesn't
support statically allocated per-cpu area. (Dennis)
* patch 1 & 2: add Acked-by from Dennis

v2: https://lore.kernel.org/bpf/20231018113343.2446300-1-houtao@huaweicloud.com/
* add a new patch "don't acquire pcpu_lock for pcpu_chunk_addr_search()"
* patch 2: change type of bit_off and end to unsigned long (Andrew)
* patch 2: rename the new API as pcpu_alloc_size and follow 80-column convention (Dennis)
* patch 5: move the common declaration into bpf.h (Stanislav, Alexei)

v1: https://lore.kernel.org/bpf/20231007135106.3031284-1-houtao@huaweicloud.com/

[0]: https://lore.kernel.org/bpf/20230827152729.1995219-1-yonghong.song@linux.dev
====================

Link: https://lore.kernel.org/r/20231020133202.4043247-1-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>

+271 -37
+1
include/linux/bpf.h
··· 2058 2058 bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b); 2059 2059 void bpf_obj_free_timer(const struct btf_record *rec, void *obj); 2060 2060 void bpf_obj_free_fields(const struct btf_record *rec, void *obj); 2061 + void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu); 2061 2062 2062 2063 struct bpf_map *bpf_map_get(u32 ufd); 2063 2064 struct bpf_map *bpf_map_get_with_uref(u32 ufd);
+1
include/linux/bpf_mem_alloc.h
··· 11 11 struct bpf_mem_alloc { 12 12 struct bpf_mem_caches __percpu *caches; 13 13 struct bpf_mem_cache __percpu *cache; 14 + bool percpu; 14 15 struct work_struct work; 15 16 }; 16 17
+1
include/linux/percpu.h
··· 132 132 extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) __alloc_size(1); 133 133 extern void __percpu *__alloc_percpu(size_t size, size_t align) __alloc_size(1); 134 134 extern void free_percpu(void __percpu *__pdata); 135 + extern size_t pcpu_alloc_size(void __percpu *__pdata); 135 136 136 137 DEFINE_FREE(free_percpu, void __percpu *, free_percpu(_T)) 137 138
+15 -11
kernel/bpf/helpers.c
··· 1811 1811 } 1812 1812 } 1813 1813 1814 - void __bpf_obj_drop_impl(void *p, const struct btf_record *rec); 1815 - 1816 1814 void bpf_list_head_free(const struct btf_field *field, void *list_head, 1817 1815 struct bpf_spin_lock *spin_lock) 1818 1816 { ··· 1842 1844 * bpf_list_head which needs to be freed. 1843 1845 */ 1844 1846 migrate_disable(); 1845 - __bpf_obj_drop_impl(obj, field->graph_root.value_rec); 1847 + __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false); 1846 1848 migrate_enable(); 1847 1849 } 1848 1850 } ··· 1881 1883 1882 1884 1883 1885 migrate_disable(); 1884 - __bpf_obj_drop_impl(obj, field->graph_root.value_rec); 1886 + __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false); 1885 1887 migrate_enable(); 1886 1888 } 1887 1889 } ··· 1913 1915 } 1914 1916 1915 1917 /* Must be called under migrate_disable(), as required by bpf_mem_free */ 1916 - void __bpf_obj_drop_impl(void *p, const struct btf_record *rec) 1918 + void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu) 1917 1919 { 1920 + struct bpf_mem_alloc *ma; 1921 + 1918 1922 if (rec && rec->refcount_off >= 0 && 1919 1923 !refcount_dec_and_test((refcount_t *)(p + rec->refcount_off))) { 1920 1924 /* Object is refcounted and refcount_dec didn't result in 0 ··· 1928 1928 if (rec) 1929 1929 bpf_obj_free_fields(rec, p); 1930 1930 1931 - if (rec && rec->refcount_off >= 0) 1932 - bpf_mem_free_rcu(&bpf_global_ma, p); 1931 + if (percpu) 1932 + ma = &bpf_global_percpu_ma; 1933 1933 else 1934 - bpf_mem_free(&bpf_global_ma, p); 1934 + ma = &bpf_global_ma; 1935 + if (rec && rec->refcount_off >= 0) 1936 + bpf_mem_free_rcu(ma, p); 1937 + else 1938 + bpf_mem_free(ma, p); 1935 1939 } 1936 1940 1937 1941 __bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign) ··· 1943 1939 struct btf_struct_meta *meta = meta__ign; 1944 1940 void *p = p__alloc; 1945 1941 1946 - __bpf_obj_drop_impl(p, meta ? meta->record : NULL); 1942 + __bpf_obj_drop_impl(p, meta ? 
meta->record : NULL, false); 1947 1943 } 1948 1944 1949 1945 __bpf_kfunc void bpf_percpu_obj_drop_impl(void *p__alloc, void *meta__ign) ··· 1987 1983 */ 1988 1984 if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) { 1989 1985 /* Only called from BPF prog, no need to migrate_disable */ 1990 - __bpf_obj_drop_impl((void *)n - off, rec); 1986 + __bpf_obj_drop_impl((void *)n - off, rec, false); 1991 1987 return -EINVAL; 1992 1988 } 1993 1989 ··· 2086 2082 */ 2087 2083 if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) { 2088 2084 /* Only called from BPF prog, no need to migrate_disable */ 2089 - __bpf_obj_drop_impl((void *)n - off, rec); 2085 + __bpf_obj_drop_impl((void *)n - off, rec, false); 2090 2086 return -EINVAL; 2091 2087 } 2092 2088
+26 -12
kernel/bpf/memalloc.c
··· 491 491 struct llist_node *first; 492 492 unsigned int obj_size; 493 493 494 - /* For per-cpu allocator, the size of free objects in free list doesn't 495 - * match with unit_size and now there is no way to get the size of 496 - * per-cpu pointer saved in free object, so just skip the checking. 497 - */ 498 - if (c->percpu_size) 499 - return 0; 500 - 501 494 first = c->free_llist.first; 502 495 if (!first) 503 496 return 0; 504 497 505 - obj_size = ksize(first); 498 + if (c->percpu_size) 499 + obj_size = pcpu_alloc_size(((void **)first)[1]); 500 + else 501 + obj_size = ksize(first); 506 502 if (obj_size != c->unit_size) { 507 - WARN_ONCE(1, "bpf_mem_cache[%u]: unexpected object size %u, expect %u\n", 508 - idx, obj_size, c->unit_size); 503 + WARN_ONCE(1, "bpf_mem_cache[%u]: percpu %d, unexpected object size %u, expect %u\n", 504 + idx, c->percpu_size, obj_size, c->unit_size); 509 505 return -EINVAL; 510 506 } 511 507 return 0; ··· 525 529 /* room for llist_node and per-cpu pointer */ 526 530 if (percpu) 527 531 percpu_size = LLIST_NODE_SZ + sizeof(void *); 532 + ma->percpu = percpu; 528 533 529 534 if (size) { 530 535 pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL); ··· 875 878 return !ret ? 
NULL : ret + LLIST_NODE_SZ; 876 879 } 877 880 881 + static notrace int bpf_mem_free_idx(void *ptr, bool percpu) 882 + { 883 + size_t size; 884 + 885 + if (percpu) 886 + size = pcpu_alloc_size(*((void **)ptr)); 887 + else 888 + size = ksize(ptr - LLIST_NODE_SZ); 889 + return bpf_mem_cache_idx(size); 890 + } 891 + 878 892 void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr) 879 893 { 880 894 int idx; ··· 893 885 if (!ptr) 894 886 return; 895 887 896 - idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ)); 888 + idx = bpf_mem_free_idx(ptr, ma->percpu); 897 889 if (idx < 0) 898 890 return; 899 891 ··· 907 899 if (!ptr) 908 900 return; 909 901 910 - idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ)); 902 + idx = bpf_mem_free_idx(ptr, ma->percpu); 911 903 if (idx < 0) 912 904 return; 913 905 ··· 981 973 return !ret ? NULL : ret + LLIST_NODE_SZ; 982 974 } 983 975 976 + /* The alignment of dynamic per-cpu area is 8, so c->unit_size and the 977 + * actual size of dynamic per-cpu area will always be matched and there is 978 + * no need to adjust size_index for per-cpu allocation. However for the 979 + * simplicity of the implementation, use an unified size_index for both 980 + * kmalloc and per-cpu allocation. 981 + */ 984 982 static __init int bpf_mem_cache_adjust_size(void) 985 983 { 986 984 unsigned int size;
+2 -4
kernel/bpf/syscall.c
··· 626 626 bpf_timer_cancel_and_free(obj + rec->timer_off); 627 627 } 628 628 629 - extern void __bpf_obj_drop_impl(void *p, const struct btf_record *rec); 630 - 631 629 void bpf_obj_free_fields(const struct btf_record *rec, void *obj) 632 630 { 633 631 const struct btf_field *fields; ··· 660 662 field->kptr.btf_id); 661 663 migrate_disable(); 662 664 __bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ? 663 - pointee_struct_meta->record : 664 - NULL); 665 + pointee_struct_meta->record : NULL, 666 + fields[i].type == BPF_KPTR_PERCPU); 665 667 migrate_enable(); 666 668 } else { 667 669 field->kptr.dtor(xchgd_field);
+32 -3
mm/percpu.c
··· 2245 2245 } 2246 2246 2247 2247 /** 2248 + * pcpu_alloc_size - the size of the dynamic percpu area 2249 + * @ptr: pointer to the dynamic percpu area 2250 + * 2251 + * Returns the size of the @ptr allocation. This is undefined for statically 2252 + * defined percpu variables as there is no corresponding chunk->bound_map. 2253 + * 2254 + * RETURNS: 2255 + * The size of the dynamic percpu area. 2256 + * 2257 + * CONTEXT: 2258 + * Can be called from atomic context. 2259 + */ 2260 + size_t pcpu_alloc_size(void __percpu *ptr) 2261 + { 2262 + struct pcpu_chunk *chunk; 2263 + unsigned long bit_off, end; 2264 + void *addr; 2265 + 2266 + if (!ptr) 2267 + return 0; 2268 + 2269 + addr = __pcpu_ptr_to_addr(ptr); 2270 + /* No pcpu_lock here: ptr has not been freed, so chunk is still alive */ 2271 + chunk = pcpu_chunk_addr_search(addr); 2272 + bit_off = (addr - chunk->base_addr) / PCPU_MIN_ALLOC_SIZE; 2273 + end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk), 2274 + bit_off + 1); 2275 + return (end - bit_off) * PCPU_MIN_ALLOC_SIZE; 2276 + } 2277 + 2278 + /** 2248 2279 * free_percpu - free percpu area 2249 2280 * @ptr: pointer to area to free 2250 2281 * ··· 2298 2267 kmemleak_free_percpu(ptr); 2299 2268 2300 2269 addr = __pcpu_ptr_to_addr(ptr); 2301 - 2302 - spin_lock_irqsave(&pcpu_lock, flags); 2303 - 2304 2270 chunk = pcpu_chunk_addr_search(addr); 2305 2271 off = addr - chunk->base_addr; 2306 2272 2273 + spin_lock_irqsave(&pcpu_lock, flags); 2307 2274 size = pcpu_free_area(chunk, off); 2308 2275 2309 2276 pcpu_memcg_free_hook(chunk, off, size);
+19 -1
tools/testing/selftests/bpf/prog_tests/test_bpf_ma.c
··· 9 9 10 10 #include "test_bpf_ma.skel.h" 11 11 12 - void test_test_bpf_ma(void) 12 + static void do_bpf_ma_test(const char *name) 13 13 { 14 14 struct test_bpf_ma *skel; 15 + struct bpf_program *prog; 15 16 struct btf *btf; 16 17 int i, err; 17 18 ··· 35 34 skel->rodata->data_btf_ids[i] = id; 36 35 } 37 36 37 + prog = bpf_object__find_program_by_name(skel->obj, name); 38 + if (!ASSERT_OK_PTR(prog, "invalid prog name")) 39 + goto out; 40 + bpf_program__set_autoload(prog, true); 41 + 38 42 err = test_bpf_ma__load(skel); 39 43 if (!ASSERT_OK(err, "load")) 40 44 goto out; ··· 53 47 ASSERT_OK(skel->bss->err, "test error"); 54 48 out: 55 49 test_bpf_ma__destroy(skel); 50 + } 51 + 52 + void test_test_bpf_ma(void) 53 + { 54 + if (test__start_subtest("batch_alloc_free")) 55 + do_bpf_ma_test("test_batch_alloc_free"); 56 + if (test__start_subtest("free_through_map_free")) 57 + do_bpf_ma_test("test_free_through_map_free"); 58 + if (test__start_subtest("batch_percpu_alloc_free")) 59 + do_bpf_ma_test("test_batch_percpu_alloc_free"); 60 + if (test__start_subtest("percpu_free_through_map_free")) 61 + do_bpf_ma_test("test_percpu_free_through_map_free"); 56 62 }
+174 -6
tools/testing/selftests/bpf/progs/test_bpf_ma.c
··· 37 37 __type(key, int); \ 38 38 __type(value, struct map_value_##_size); \ 39 39 __uint(max_entries, 128); \ 40 - } array_##_size SEC(".maps"); 40 + } array_##_size SEC(".maps") 41 41 42 - static __always_inline void batch_alloc_free(struct bpf_map *map, unsigned int batch, 43 - unsigned int idx) 42 + #define DEFINE_ARRAY_WITH_PERCPU_KPTR(_size) \ 43 + struct map_value_percpu_##_size { \ 44 + struct bin_data_##_size __percpu_kptr * data; \ 45 + }; \ 46 + struct { \ 47 + __uint(type, BPF_MAP_TYPE_ARRAY); \ 48 + __type(key, int); \ 49 + __type(value, struct map_value_percpu_##_size); \ 50 + __uint(max_entries, 128); \ 51 + } array_percpu_##_size SEC(".maps") 52 + 53 + static __always_inline void batch_alloc(struct bpf_map *map, unsigned int batch, unsigned int idx) 44 54 { 45 55 struct generic_map_value *value; 46 56 unsigned int i, key; ··· 75 65 return; 76 66 } 77 67 } 68 + } 69 + 70 + static __always_inline void batch_free(struct bpf_map *map, unsigned int batch, unsigned int idx) 71 + { 72 + struct generic_map_value *value; 73 + unsigned int i, key; 74 + void *old; 75 + 78 76 for (i = 0; i < batch; i++) { 79 77 key = i; 80 78 value = bpf_map_lookup_elem(map, &key); ··· 99 81 } 100 82 } 101 83 84 + static __always_inline void batch_percpu_alloc(struct bpf_map *map, unsigned int batch, 85 + unsigned int idx) 86 + { 87 + struct generic_map_value *value; 88 + unsigned int i, key; 89 + void *old, *new; 90 + 91 + for (i = 0; i < batch; i++) { 92 + key = i; 93 + value = bpf_map_lookup_elem(map, &key); 94 + if (!value) { 95 + err = 1; 96 + return; 97 + } 98 + /* per-cpu allocator may not be able to refill in time */ 99 + new = bpf_percpu_obj_new_impl(data_btf_ids[idx], NULL); 100 + if (!new) 101 + continue; 102 + 103 + old = bpf_kptr_xchg(&value->data, new); 104 + if (old) { 105 + bpf_percpu_obj_drop(old); 106 + err = 2; 107 + return; 108 + } 109 + } 110 + } 111 + 112 + static __always_inline void batch_percpu_free(struct bpf_map *map, unsigned int batch, 113 + 
unsigned int idx) 114 + { 115 + struct generic_map_value *value; 116 + unsigned int i, key; 117 + void *old; 118 + 119 + for (i = 0; i < batch; i++) { 120 + key = i; 121 + value = bpf_map_lookup_elem(map, &key); 122 + if (!value) { 123 + err = 3; 124 + return; 125 + } 126 + old = bpf_kptr_xchg(&value->data, NULL); 127 + if (!old) 128 + continue; 129 + bpf_percpu_obj_drop(old); 130 + } 131 + } 132 + 133 + #define CALL_BATCH_ALLOC(size, batch, idx) \ 134 + batch_alloc((struct bpf_map *)(&array_##size), batch, idx) 135 + 102 136 #define CALL_BATCH_ALLOC_FREE(size, batch, idx) \ 103 - batch_alloc_free((struct bpf_map *)(&array_##size), batch, idx) 137 + do { \ 138 + batch_alloc((struct bpf_map *)(&array_##size), batch, idx); \ 139 + batch_free((struct bpf_map *)(&array_##size), batch, idx); \ 140 + } while (0) 141 + 142 + #define CALL_BATCH_PERCPU_ALLOC(size, batch, idx) \ 143 + batch_percpu_alloc((struct bpf_map *)(&array_percpu_##size), batch, idx) 144 + 145 + #define CALL_BATCH_PERCPU_ALLOC_FREE(size, batch, idx) \ 146 + do { \ 147 + batch_percpu_alloc((struct bpf_map *)(&array_percpu_##size), batch, idx); \ 148 + batch_percpu_free((struct bpf_map *)(&array_percpu_##size), batch, idx); \ 149 + } while (0) 104 150 105 151 DEFINE_ARRAY_WITH_KPTR(8); 106 152 DEFINE_ARRAY_WITH_KPTR(16); ··· 179 97 DEFINE_ARRAY_WITH_KPTR(2048); 180 98 DEFINE_ARRAY_WITH_KPTR(4096); 181 99 182 - SEC("fentry/" SYS_PREFIX "sys_nanosleep") 183 - int test_bpf_mem_alloc_free(void *ctx) 100 + /* per-cpu kptr doesn't support bin_data_8 which is a zero-sized array */ 101 + DEFINE_ARRAY_WITH_PERCPU_KPTR(16); 102 + DEFINE_ARRAY_WITH_PERCPU_KPTR(32); 103 + DEFINE_ARRAY_WITH_PERCPU_KPTR(64); 104 + DEFINE_ARRAY_WITH_PERCPU_KPTR(96); 105 + DEFINE_ARRAY_WITH_PERCPU_KPTR(128); 106 + DEFINE_ARRAY_WITH_PERCPU_KPTR(192); 107 + DEFINE_ARRAY_WITH_PERCPU_KPTR(256); 108 + DEFINE_ARRAY_WITH_PERCPU_KPTR(512); 109 + DEFINE_ARRAY_WITH_PERCPU_KPTR(1024); 110 + DEFINE_ARRAY_WITH_PERCPU_KPTR(2048); 111 + 
DEFINE_ARRAY_WITH_PERCPU_KPTR(4096); 112 + 113 + SEC("?fentry/" SYS_PREFIX "sys_nanosleep") 114 + int test_batch_alloc_free(void *ctx) 184 115 { 185 116 if ((u32)bpf_get_current_pid_tgid() != pid) 186 117 return 0; ··· 213 118 CALL_BATCH_ALLOC_FREE(1024, 32, 9); 214 119 CALL_BATCH_ALLOC_FREE(2048, 16, 10); 215 120 CALL_BATCH_ALLOC_FREE(4096, 8, 11); 121 + 122 + return 0; 123 + } 124 + 125 + SEC("?fentry/" SYS_PREFIX "sys_nanosleep") 126 + int test_free_through_map_free(void *ctx) 127 + { 128 + if ((u32)bpf_get_current_pid_tgid() != pid) 129 + return 0; 130 + 131 + /* Alloc 128 8-bytes objects in batch to trigger refilling, 132 + * then free these objects through map free. 133 + */ 134 + CALL_BATCH_ALLOC(8, 128, 0); 135 + CALL_BATCH_ALLOC(16, 128, 1); 136 + CALL_BATCH_ALLOC(32, 128, 2); 137 + CALL_BATCH_ALLOC(64, 128, 3); 138 + CALL_BATCH_ALLOC(96, 128, 4); 139 + CALL_BATCH_ALLOC(128, 128, 5); 140 + CALL_BATCH_ALLOC(192, 128, 6); 141 + CALL_BATCH_ALLOC(256, 128, 7); 142 + CALL_BATCH_ALLOC(512, 64, 8); 143 + CALL_BATCH_ALLOC(1024, 32, 9); 144 + CALL_BATCH_ALLOC(2048, 16, 10); 145 + CALL_BATCH_ALLOC(4096, 8, 11); 146 + 147 + return 0; 148 + } 149 + 150 + SEC("?fentry/" SYS_PREFIX "sys_nanosleep") 151 + int test_batch_percpu_alloc_free(void *ctx) 152 + { 153 + if ((u32)bpf_get_current_pid_tgid() != pid) 154 + return 0; 155 + 156 + /* Alloc 128 16-bytes per-cpu objects in batch to trigger refilling, 157 + * then free 128 16-bytes per-cpu objects in batch to trigger freeing. 
158 + */ 159 + CALL_BATCH_PERCPU_ALLOC_FREE(16, 128, 1); 160 + CALL_BATCH_PERCPU_ALLOC_FREE(32, 128, 2); 161 + CALL_BATCH_PERCPU_ALLOC_FREE(64, 128, 3); 162 + CALL_BATCH_PERCPU_ALLOC_FREE(96, 128, 4); 163 + CALL_BATCH_PERCPU_ALLOC_FREE(128, 128, 5); 164 + CALL_BATCH_PERCPU_ALLOC_FREE(192, 128, 6); 165 + CALL_BATCH_PERCPU_ALLOC_FREE(256, 128, 7); 166 + CALL_BATCH_PERCPU_ALLOC_FREE(512, 64, 8); 167 + CALL_BATCH_PERCPU_ALLOC_FREE(1024, 32, 9); 168 + CALL_BATCH_PERCPU_ALLOC_FREE(2048, 16, 10); 169 + CALL_BATCH_PERCPU_ALLOC_FREE(4096, 8, 11); 170 + 171 + return 0; 172 + } 173 + 174 + SEC("?fentry/" SYS_PREFIX "sys_nanosleep") 175 + int test_percpu_free_through_map_free(void *ctx) 176 + { 177 + if ((u32)bpf_get_current_pid_tgid() != pid) 178 + return 0; 179 + 180 + /* Alloc 128 16-bytes per-cpu objects in batch to trigger refilling, 181 + * then free these object through map free. 182 + */ 183 + CALL_BATCH_PERCPU_ALLOC(16, 128, 1); 184 + CALL_BATCH_PERCPU_ALLOC(32, 128, 2); 185 + CALL_BATCH_PERCPU_ALLOC(64, 128, 3); 186 + CALL_BATCH_PERCPU_ALLOC(96, 128, 4); 187 + CALL_BATCH_PERCPU_ALLOC(128, 128, 5); 188 + CALL_BATCH_PERCPU_ALLOC(192, 128, 6); 189 + CALL_BATCH_PERCPU_ALLOC(256, 128, 7); 190 + CALL_BATCH_PERCPU_ALLOC(512, 64, 8); 191 + CALL_BATCH_PERCPU_ALLOC(1024, 32, 9); 192 + CALL_BATCH_PERCPU_ALLOC(2048, 16, 10); 193 + CALL_BATCH_PERCPU_ALLOC(4096, 8, 11); 216 194 217 195 return 0; 218 196 }