Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bpf: Add "live packet" mode for XDP in BPF_PROG_RUN

This adds support for running XDP programs through BPF_PROG_RUN in a mode
that enables live packet processing of the resulting frames. Previous uses
of BPF_PROG_RUN for XDP returned the XDP program return code and the
modified packet data to userspace, which is useful for unit testing of XDP
programs.

The existing BPF_PROG_RUN for XDP allows userspace to set the ingress
ifindex and RXQ number as part of the context object being passed to the
kernel. This patch reuses that code, but adds a new mode with different
semantics, which can be selected with the new BPF_F_TEST_XDP_LIVE_FRAMES
flag.

When running BPF_PROG_RUN in this mode, the XDP program return codes will
be honoured: returning XDP_PASS will result in the frame being injected
into the networking stack as if it came from the selected networking
interface, returning XDP_REDIRECT will result in the frame being forwarded
to the redirect target chosen by the program, and returning XDP_TX will
result in the frame being transmitted back out of the selected interface.
XDP_TX is translated into an XDP_REDIRECT operation to the same interface,
since the real XDP_TX action is only possible from within the network
drivers themselves, not from the process context where BPF_PROG_RUN is
executed.

Internally, this new mode of operation creates a page pool instance while
setting up the test run, and feeds pages from that into the XDP program.
The setup cost of this is amortised over the number of repetitions
specified by userspace.

To support the performance testing use case, we further optimise the setup
step so that all pages in the pool are pre-initialised with the packet
data, and pre-computed context and xdp_frame objects stored at the start of
each page. This makes it possible to entirely avoid touching the page
content on each XDP program invocation, and enables sending up to 9
Mpps/core on my test box.

Because the data pages are recycled by the page pool, and the test runner
doesn't re-initialise them for each run, subsequent invocations of the XDP
program will see the packet data in the state it was in after the last time
the program ran on that particular page. This means that an XDP program
that modifies the packet before redirecting it has to be careful about
which assumptions it makes about the packet content, but that is only an
issue for the most naively written programs.

Enabling the new flag is only allowed when neither ctx_out nor data_out is
set in the test specification: in this mode the frames are handed off (to
the networking stack or to another interface) instead of being kept, so
neither the packet data nor the context can be returned to userspace.

Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20220309105346.100053-2-toke@redhat.com

authored by

Toke Høiland-Jørgensen and committed by
Alexei Starovoitov
b530e9e1 3399dd9f

+328 -15
+3
include/uapi/linux/bpf.h
··· 1232 1232 1233 1233 /* If set, run the test on the cpu specified by bpf_attr.test.cpu */ 1234 1234 #define BPF_F_TEST_RUN_ON_CPU (1U << 0) 1235 + /* If set, XDP frames will be transmitted after processing */ 1236 + #define BPF_F_TEST_XDP_LIVE_FRAMES (1U << 1) 1235 1237 1236 1238 /* type for BPF_ENABLE_STATS */ 1237 1239 enum bpf_stats_type { ··· 1395 1393 __aligned_u64 ctx_out; 1396 1394 __u32 flags; 1397 1395 __u32 cpu; 1396 + __u32 batch_size; 1398 1397 } test; 1399 1398 1400 1399 struct { /* anonymous struct used by BPF_*_GET_*_ID */
+1
kernel/bpf/Kconfig
··· 30 30 select TASKS_TRACE_RCU 31 31 select BINARY_PRINTF 32 32 select NET_SOCK_MSG if NET 33 + select PAGE_POOL if NET 33 34 default n 34 35 help 35 36 Enable the bpf() system call that allows to manipulate BPF programs
+1 -1
kernel/bpf/syscall.c
··· 3336 3336 } 3337 3337 } 3338 3338 3339 - #define BPF_PROG_TEST_RUN_LAST_FIELD test.cpu 3339 + #define BPF_PROG_TEST_RUN_LAST_FIELD test.batch_size 3340 3340 3341 3341 static int bpf_prog_test_run(const union bpf_attr *attr, 3342 3342 union bpf_attr __user *uattr)
+320 -14
net/bpf/test_run.c
··· 15 15 #include <net/sock.h> 16 16 #include <net/tcp.h> 17 17 #include <net/net_namespace.h> 18 + #include <net/page_pool.h> 18 19 #include <linux/error-injection.h> 19 20 #include <linux/smp.h> 20 21 #include <linux/sock_diag.h> ··· 54 53 rcu_read_unlock(); 55 54 } 56 55 57 - static bool bpf_test_timer_continue(struct bpf_test_timer *t, u32 repeat, int *err, u32 *duration) 56 + static bool bpf_test_timer_continue(struct bpf_test_timer *t, int iterations, 57 + u32 repeat, int *err, u32 *duration) 58 58 __must_hold(rcu) 59 59 { 60 - t->i++; 60 + t->i += iterations; 61 61 if (t->i >= repeat) { 62 62 /* We're done. */ 63 63 t->time_spent += ktime_get_ns() - t->time_start; ··· 88 86 reset: 89 87 t->i = 0; 90 88 return false; 89 + } 90 + 91 + /* We put this struct at the head of each page with a context and frame 92 + * initialised when the page is allocated, so we don't have to do this on each 93 + * repetition of the test run. 94 + */ 95 + struct xdp_page_head { 96 + struct xdp_buff orig_ctx; 97 + struct xdp_buff ctx; 98 + struct xdp_frame frm; 99 + u8 data[]; 100 + }; 101 + 102 + struct xdp_test_data { 103 + struct xdp_buff *orig_ctx; 104 + struct xdp_rxq_info rxq; 105 + struct net_device *dev; 106 + struct page_pool *pp; 107 + struct xdp_frame **frames; 108 + struct sk_buff **skbs; 109 + u32 batch_size; 110 + u32 frame_cnt; 111 + }; 112 + 113 + #define TEST_XDP_FRAME_SIZE (PAGE_SIZE - sizeof(struct xdp_page_head) \ 114 + - sizeof(struct skb_shared_info)) 115 + #define TEST_XDP_MAX_BATCH 256 116 + 117 + static void xdp_test_run_init_page(struct page *page, void *arg) 118 + { 119 + struct xdp_page_head *head = phys_to_virt(page_to_phys(page)); 120 + struct xdp_buff *new_ctx, *orig_ctx; 121 + u32 headroom = XDP_PACKET_HEADROOM; 122 + struct xdp_test_data *xdp = arg; 123 + size_t frm_len, meta_len; 124 + struct xdp_frame *frm; 125 + void *data; 126 + 127 + orig_ctx = xdp->orig_ctx; 128 + frm_len = orig_ctx->data_end - orig_ctx->data_meta; 129 + meta_len = 
orig_ctx->data - orig_ctx->data_meta; 130 + headroom -= meta_len; 131 + 132 + new_ctx = &head->ctx; 133 + frm = &head->frm; 134 + data = &head->data; 135 + memcpy(data + headroom, orig_ctx->data_meta, frm_len); 136 + 137 + xdp_init_buff(new_ctx, TEST_XDP_FRAME_SIZE, &xdp->rxq); 138 + xdp_prepare_buff(new_ctx, data, headroom, frm_len, true); 139 + new_ctx->data = new_ctx->data_meta + meta_len; 140 + 141 + xdp_update_frame_from_buff(new_ctx, frm); 142 + frm->mem = new_ctx->rxq->mem; 143 + 144 + memcpy(&head->orig_ctx, new_ctx, sizeof(head->orig_ctx)); 145 + } 146 + 147 + static int xdp_test_run_setup(struct xdp_test_data *xdp, struct xdp_buff *orig_ctx) 148 + { 149 + struct xdp_mem_info mem = {}; 150 + struct page_pool *pp; 151 + int err = -ENOMEM; 152 + struct page_pool_params pp_params = { 153 + .order = 0, 154 + .flags = 0, 155 + .pool_size = xdp->batch_size, 156 + .nid = NUMA_NO_NODE, 157 + .max_len = TEST_XDP_FRAME_SIZE, 158 + .init_callback = xdp_test_run_init_page, 159 + .init_arg = xdp, 160 + }; 161 + 162 + xdp->frames = kvmalloc_array(xdp->batch_size, sizeof(void *), GFP_KERNEL); 163 + if (!xdp->frames) 164 + return -ENOMEM; 165 + 166 + xdp->skbs = kvmalloc_array(xdp->batch_size, sizeof(void *), GFP_KERNEL); 167 + if (!xdp->skbs) 168 + goto err_skbs; 169 + 170 + pp = page_pool_create(&pp_params); 171 + if (IS_ERR(pp)) { 172 + err = PTR_ERR(pp); 173 + goto err_pp; 174 + } 175 + 176 + /* will copy 'mem.id' into pp->xdp_mem_id */ 177 + err = xdp_reg_mem_model(&mem, MEM_TYPE_PAGE_POOL, pp); 178 + if (err) 179 + goto err_mmodel; 180 + 181 + xdp->pp = pp; 182 + 183 + /* We create a 'fake' RXQ referencing the original dev, but with an 184 + * xdp_mem_info pointing to our page_pool 185 + */ 186 + xdp_rxq_info_reg(&xdp->rxq, orig_ctx->rxq->dev, 0, 0); 187 + xdp->rxq.mem.type = MEM_TYPE_PAGE_POOL; 188 + xdp->rxq.mem.id = pp->xdp_mem_id; 189 + xdp->dev = orig_ctx->rxq->dev; 190 + xdp->orig_ctx = orig_ctx; 191 + 192 + return 0; 193 + 194 + err_mmodel: 195 + 
page_pool_destroy(pp); 196 + err_pp: 197 + kfree(xdp->skbs); 198 + err_skbs: 199 + kfree(xdp->frames); 200 + return err; 201 + } 202 + 203 + static void xdp_test_run_teardown(struct xdp_test_data *xdp) 204 + { 205 + page_pool_destroy(xdp->pp); 206 + kfree(xdp->frames); 207 + kfree(xdp->skbs); 208 + } 209 + 210 + static bool ctx_was_changed(struct xdp_page_head *head) 211 + { 212 + return head->orig_ctx.data != head->ctx.data || 213 + head->orig_ctx.data_meta != head->ctx.data_meta || 214 + head->orig_ctx.data_end != head->ctx.data_end; 215 + } 216 + 217 + static void reset_ctx(struct xdp_page_head *head) 218 + { 219 + if (likely(!ctx_was_changed(head))) 220 + return; 221 + 222 + head->ctx.data = head->orig_ctx.data; 223 + head->ctx.data_meta = head->orig_ctx.data_meta; 224 + head->ctx.data_end = head->orig_ctx.data_end; 225 + xdp_update_frame_from_buff(&head->ctx, &head->frm); 226 + } 227 + 228 + static int xdp_recv_frames(struct xdp_frame **frames, int nframes, 229 + struct sk_buff **skbs, 230 + struct net_device *dev) 231 + { 232 + gfp_t gfp = __GFP_ZERO | GFP_ATOMIC; 233 + int i, n; 234 + LIST_HEAD(list); 235 + 236 + n = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, nframes, (void **)skbs); 237 + if (unlikely(n == 0)) { 238 + for (i = 0; i < nframes; i++) 239 + xdp_return_frame(frames[i]); 240 + return -ENOMEM; 241 + } 242 + 243 + for (i = 0; i < nframes; i++) { 244 + struct xdp_frame *xdpf = frames[i]; 245 + struct sk_buff *skb = skbs[i]; 246 + 247 + skb = __xdp_build_skb_from_frame(xdpf, skb, dev); 248 + if (!skb) { 249 + xdp_return_frame(xdpf); 250 + continue; 251 + } 252 + 253 + list_add_tail(&skb->list, &list); 254 + } 255 + netif_receive_skb_list(&list); 256 + 257 + return 0; 258 + } 259 + 260 + static int xdp_test_run_batch(struct xdp_test_data *xdp, struct bpf_prog *prog, 261 + u32 repeat) 262 + { 263 + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); 264 + int err = 0, act, ret, i, nframes = 0, batch_sz; 265 + struct xdp_frame **frames = 
xdp->frames; 266 + struct xdp_page_head *head; 267 + struct xdp_frame *frm; 268 + bool redirect = false; 269 + struct xdp_buff *ctx; 270 + struct page *page; 271 + 272 + batch_sz = min_t(u32, repeat, xdp->batch_size); 273 + 274 + local_bh_disable(); 275 + xdp_set_return_frame_no_direct(); 276 + 277 + for (i = 0; i < batch_sz; i++) { 278 + page = page_pool_dev_alloc_pages(xdp->pp); 279 + if (!page) { 280 + err = -ENOMEM; 281 + goto out; 282 + } 283 + 284 + head = phys_to_virt(page_to_phys(page)); 285 + reset_ctx(head); 286 + ctx = &head->ctx; 287 + frm = &head->frm; 288 + xdp->frame_cnt++; 289 + 290 + act = bpf_prog_run_xdp(prog, ctx); 291 + 292 + /* if program changed pkt bounds we need to update the xdp_frame */ 293 + if (unlikely(ctx_was_changed(head))) { 294 + ret = xdp_update_frame_from_buff(ctx, frm); 295 + if (ret) { 296 + xdp_return_buff(ctx); 297 + continue; 298 + } 299 + } 300 + 301 + switch (act) { 302 + case XDP_TX: 303 + /* we can't do a real XDP_TX since we're not in the 304 + * driver, so turn it into a REDIRECT back to the same 305 + * index 306 + */ 307 + ri->tgt_index = xdp->dev->ifindex; 308 + ri->map_id = INT_MAX; 309 + ri->map_type = BPF_MAP_TYPE_UNSPEC; 310 + fallthrough; 311 + case XDP_REDIRECT: 312 + redirect = true; 313 + ret = xdp_do_redirect_frame(xdp->dev, ctx, frm, prog); 314 + if (ret) 315 + xdp_return_buff(ctx); 316 + break; 317 + case XDP_PASS: 318 + frames[nframes++] = frm; 319 + break; 320 + default: 321 + bpf_warn_invalid_xdp_action(NULL, prog, act); 322 + fallthrough; 323 + case XDP_DROP: 324 + xdp_return_buff(ctx); 325 + break; 326 + } 327 + } 328 + 329 + out: 330 + if (redirect) 331 + xdp_do_flush(); 332 + if (nframes) { 333 + ret = xdp_recv_frames(frames, nframes, xdp->skbs, xdp->dev); 334 + if (ret) 335 + err = ret; 336 + } 337 + 338 + xdp_clear_return_frame_no_direct(); 339 + local_bh_enable(); 340 + return err; 341 + } 342 + 343 + static int bpf_test_run_xdp_live(struct bpf_prog *prog, struct xdp_buff *ctx, 344 + u32 repeat, 
u32 batch_size, u32 *time) 345 + 346 + { 347 + struct xdp_test_data xdp = { .batch_size = batch_size }; 348 + struct bpf_test_timer t = { .mode = NO_MIGRATE }; 349 + int ret; 350 + 351 + if (!repeat) 352 + repeat = 1; 353 + 354 + ret = xdp_test_run_setup(&xdp, ctx); 355 + if (ret) 356 + return ret; 357 + 358 + bpf_test_timer_enter(&t); 359 + do { 360 + xdp.frame_cnt = 0; 361 + ret = xdp_test_run_batch(&xdp, prog, repeat - t.i); 362 + if (unlikely(ret < 0)) 363 + break; 364 + } while (bpf_test_timer_continue(&t, xdp.frame_cnt, repeat, &ret, time)); 365 + bpf_test_timer_leave(&t); 366 + 367 + xdp_test_run_teardown(&xdp); 368 + return ret; 91 369 } 92 370 93 371 static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, ··· 401 119 *retval = bpf_prog_run_xdp(prog, ctx); 402 120 else 403 121 *retval = bpf_prog_run(prog, ctx); 404 - } while (bpf_test_timer_continue(&t, repeat, &ret, time)); 122 + } while (bpf_test_timer_continue(&t, 1, repeat, &ret, time)); 405 123 bpf_reset_run_ctx(old_ctx); 406 124 bpf_test_timer_leave(&t); 407 125 ··· 728 446 int b = 2, err = -EFAULT; 729 447 u32 retval = 0; 730 448 731 - if (kattr->test.flags || kattr->test.cpu) 449 + if (kattr->test.flags || kattr->test.cpu || kattr->test.batch_size) 732 450 return -EINVAL; 733 451 734 452 switch (prog->expected_attach_type) { ··· 792 510 /* doesn't support data_in/out, ctx_out, duration, or repeat */ 793 511 if (kattr->test.data_in || kattr->test.data_out || 794 512 kattr->test.ctx_out || kattr->test.duration || 795 - kattr->test.repeat) 513 + kattr->test.repeat || kattr->test.batch_size) 796 514 return -EINVAL; 797 515 798 516 if (ctx_size_in < prog->aux->max_ctx_offset || ··· 1023 741 void *data; 1024 742 int ret; 1025 743 1026 - if (kattr->test.flags || kattr->test.cpu) 744 + if (kattr->test.flags || kattr->test.cpu || kattr->test.batch_size) 1027 745 return -EINVAL; 1028 746 1029 747 data = bpf_test_init(kattr, kattr->test.data_size_in, ··· 1204 922 int bpf_prog_test_run_xdp(struct 
bpf_prog *prog, const union bpf_attr *kattr, 1205 923 union bpf_attr __user *uattr) 1206 924 { 925 + bool do_live = (kattr->test.flags & BPF_F_TEST_XDP_LIVE_FRAMES); 1207 926 u32 tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 927 + u32 batch_size = kattr->test.batch_size; 1208 928 u32 size = kattr->test.data_size_in; 1209 929 u32 headroom = XDP_PACKET_HEADROOM; 1210 930 u32 retval, duration, max_data_sz; ··· 1222 938 prog->expected_attach_type == BPF_XDP_CPUMAP) 1223 939 return -EINVAL; 1224 940 941 + if (kattr->test.flags & ~BPF_F_TEST_XDP_LIVE_FRAMES) 942 + return -EINVAL; 943 + 944 + if (do_live) { 945 + if (!batch_size) 946 + batch_size = NAPI_POLL_WEIGHT; 947 + else if (batch_size > TEST_XDP_MAX_BATCH) 948 + return -E2BIG; 949 + } else if (batch_size) { 950 + return -EINVAL; 951 + } 952 + 1225 953 ctx = bpf_ctx_init(kattr, sizeof(struct xdp_md)); 1226 954 if (IS_ERR(ctx)) 1227 955 return PTR_ERR(ctx); ··· 1242 946 /* There can't be user provided data before the meta data */ 1243 947 if (ctx->data_meta || ctx->data_end != size || 1244 948 ctx->data > ctx->data_end || 1245 - unlikely(xdp_metalen_invalid(ctx->data))) 949 + unlikely(xdp_metalen_invalid(ctx->data)) || 950 + (do_live && (kattr->test.data_out || kattr->test.ctx_out))) 1246 951 goto free_ctx; 1247 952 /* Meta data is allocated from the headroom */ 1248 953 headroom -= ctx->data; 1249 954 } 1250 955 1251 956 max_data_sz = 4096 - headroom - tailroom; 1252 - size = min_t(u32, size, max_data_sz); 957 + if (size > max_data_sz) { 958 + /* disallow live data mode for jumbo frames */ 959 + if (do_live) 960 + goto free_ctx; 961 + size = max_data_sz; 962 + } 1253 963 1254 964 data = bpf_test_init(kattr, size, max_data_sz, headroom, tailroom); 1255 965 if (IS_ERR(data)) { ··· 1313 1011 if (repeat > 1) 1314 1012 bpf_prog_change_xdp(NULL, prog); 1315 1013 1316 - ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration, true); 1014 + if (do_live) 1015 + ret = bpf_test_run_xdp_live(prog, &xdp, repeat, 
batch_size, &duration); 1016 + else 1017 + ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration, true); 1317 1018 /* We convert the xdp_buff back to an xdp_md before checking the return 1318 1019 * code so the reference count of any held netdevice will be decremented 1319 1020 * even if the test run failed. ··· 1378 1073 if (prog->type != BPF_PROG_TYPE_FLOW_DISSECTOR) 1379 1074 return -EINVAL; 1380 1075 1381 - if (kattr->test.flags || kattr->test.cpu) 1076 + if (kattr->test.flags || kattr->test.cpu || kattr->test.batch_size) 1382 1077 return -EINVAL; 1383 1078 1384 1079 if (size < ETH_HLEN) ··· 1413 1108 do { 1414 1109 retval = bpf_flow_dissect(prog, &ctx, eth->h_proto, ETH_HLEN, 1415 1110 size, flags); 1416 - } while (bpf_test_timer_continue(&t, repeat, &ret, &duration)); 1111 + } while (bpf_test_timer_continue(&t, 1, repeat, &ret, &duration)); 1417 1112 bpf_test_timer_leave(&t); 1418 1113 1419 1114 if (ret < 0) ··· 1445 1140 if (prog->type != BPF_PROG_TYPE_SK_LOOKUP) 1446 1141 return -EINVAL; 1447 1142 1448 - if (kattr->test.flags || kattr->test.cpu) 1143 + if (kattr->test.flags || kattr->test.cpu || kattr->test.batch_size) 1449 1144 return -EINVAL; 1450 1145 1451 1146 if (kattr->test.data_in || kattr->test.data_size_in || kattr->test.data_out || ··· 1508 1203 do { 1509 1204 ctx.selected_sk = NULL; 1510 1205 retval = BPF_PROG_SK_LOOKUP_RUN_ARRAY(progs, ctx, bpf_prog_run); 1511 - } while (bpf_test_timer_continue(&t, repeat, &ret, &duration)); 1206 + } while (bpf_test_timer_continue(&t, 1, repeat, &ret, &duration)); 1512 1207 bpf_test_timer_leave(&t); 1513 1208 1514 1209 if (ret < 0) ··· 1547 1242 /* doesn't support data_in/out, ctx_out, duration, or repeat or flags */ 1548 1243 if (kattr->test.data_in || kattr->test.data_out || 1549 1244 kattr->test.ctx_out || kattr->test.duration || 1550 - kattr->test.repeat || kattr->test.flags) 1245 + kattr->test.repeat || kattr->test.flags || 1246 + kattr->test.batch_size) 1551 1247 return -EINVAL; 1552 1248 1553 1249 if 
(ctx_size_in < prog->aux->max_ctx_offset ||
+3
tools/include/uapi/linux/bpf.h
··· 1232 1232 1233 1233 /* If set, run the test on the cpu specified by bpf_attr.test.cpu */ 1234 1234 #define BPF_F_TEST_RUN_ON_CPU (1U << 0) 1235 + /* If set, XDP frames will be transmitted after processing */ 1236 + #define BPF_F_TEST_XDP_LIVE_FRAMES (1U << 1) 1235 1237 1236 1238 /* type for BPF_ENABLE_STATS */ 1237 1239 enum bpf_stats_type { ··· 1395 1393 __aligned_u64 ctx_out; 1396 1394 __u32 flags; 1397 1395 __u32 cpu; 1396 + __u32 batch_size; 1398 1397 } test; 1399 1398 1400 1399 struct { /* anonymous struct used by BPF_*_GET_*_ID */