Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next

Daniel Borkmann says:

====================
pull-request: bpf-next 2018-08-13

The following pull-request contains BPF updates for your *net-next* tree.

The main changes are:

1) Add driver XDP support for veth. This can be used in conjunction with
redirect of another XDP program e.g. sitting on NIC so the xdp_frame
can be forwarded to the peer veth directly without modification,
from Toshiaki.

2) Add a new BPF map type REUSEPORT_SOCKARRAY and prog type SK_REUSEPORT
in order to provide more control and visibility on where a SO_REUSEPORT
sk should be located, and the latter makes it possible to directly select
a sk from the bpf map. This also enables map-in-map for application
migration use cases, from Martin.

3) Add a new BPF helper bpf_skb_ancestor_cgroup_id() that returns the id
of cgroup v2 that is the ancestor of the cgroup associated with the
skb at the ancestor_level, from Andrey.

4) Implement BPF fs map pretty-print support based on BTF data for regular
hash table and LRU map, from Yonghong.

5) Decouple the ability to attach BTF for a map from the key and value
pretty-printer in BPF fs, and enable further BTF support for percpu and
LPM trie maps, from Daniel.

6) Implement a better BPF sample of using XDP's CPU redirect feature for
load balancing SKB processing to remote CPU. The sample implements the
same XDP load balancing as Suricata does which is symmetric hash based
on IP and L4 protocol, from Jesper.

7) Revert adding NULL pointer check with WARN_ON_ONCE() in __xdp_return()'s
critical path as it is ensured that the allocator is present, from Björn.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>

+3708 -177
+741 -9
drivers/net/veth.c
··· 17 17 #include <net/rtnetlink.h> 18 18 #include <net/dst.h> 19 19 #include <net/xfrm.h> 20 + #include <net/xdp.h> 20 21 #include <linux/veth.h> 21 22 #include <linux/module.h> 23 + #include <linux/bpf.h> 24 + #include <linux/filter.h> 25 + #include <linux/ptr_ring.h> 26 + #include <linux/bpf_trace.h> 22 27 23 28 #define DRV_NAME "veth" 24 29 #define DRV_VERSION "1.0" 30 + 31 + #define VETH_XDP_FLAG BIT(0) 32 + #define VETH_RING_SIZE 256 33 + #define VETH_XDP_HEADROOM (XDP_PACKET_HEADROOM + NET_IP_ALIGN) 34 + 35 + /* Separating two types of XDP xmit */ 36 + #define VETH_XDP_TX BIT(0) 37 + #define VETH_XDP_REDIR BIT(1) 25 38 26 39 struct pcpu_vstats { 27 40 u64 packets; ··· 42 29 struct u64_stats_sync syncp; 43 30 }; 44 31 32 + struct veth_rq { 33 + struct napi_struct xdp_napi; 34 + struct net_device *dev; 35 + struct bpf_prog __rcu *xdp_prog; 36 + struct xdp_mem_info xdp_mem; 37 + bool rx_notify_masked; 38 + struct ptr_ring xdp_ring; 39 + struct xdp_rxq_info xdp_rxq; 40 + }; 41 + 45 42 struct veth_priv { 46 43 struct net_device __rcu *peer; 47 44 atomic64_t dropped; 48 - unsigned requested_headroom; 45 + struct bpf_prog *_xdp_prog; 46 + struct veth_rq *rq; 47 + unsigned int requested_headroom; 49 48 }; 50 49 51 50 /* ··· 123 98 .get_link_ksettings = veth_get_link_ksettings, 124 99 }; 125 100 101 + /* general routines */ 102 + 103 + static bool veth_is_xdp_frame(void *ptr) 104 + { 105 + return (unsigned long)ptr & VETH_XDP_FLAG; 106 + } 107 + 108 + static void *veth_ptr_to_xdp(void *ptr) 109 + { 110 + return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG); 111 + } 112 + 113 + static void *veth_xdp_to_ptr(void *ptr) 114 + { 115 + return (void *)((unsigned long)ptr | VETH_XDP_FLAG); 116 + } 117 + 118 + static void veth_ptr_free(void *ptr) 119 + { 120 + if (veth_is_xdp_frame(ptr)) 121 + xdp_return_frame(veth_ptr_to_xdp(ptr)); 122 + else 123 + kfree_skb(ptr); 124 + } 125 + 126 + static void __veth_xdp_flush(struct veth_rq *rq) 127 + { 128 + /* Write ptr_ring before 
reading rx_notify_masked */ 129 + smp_mb(); 130 + if (!rq->rx_notify_masked) { 131 + rq->rx_notify_masked = true; 132 + napi_schedule(&rq->xdp_napi); 133 + } 134 + } 135 + 136 + static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb) 137 + { 138 + if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) { 139 + dev_kfree_skb_any(skb); 140 + return NET_RX_DROP; 141 + } 142 + 143 + return NET_RX_SUCCESS; 144 + } 145 + 146 + static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb, 147 + struct veth_rq *rq, bool xdp) 148 + { 149 + return __dev_forward_skb(dev, skb) ?: xdp ? 150 + veth_xdp_rx(rq, skb) : 151 + netif_rx(skb); 152 + } 153 + 126 154 static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) 127 155 { 128 - struct veth_priv *priv = netdev_priv(dev); 156 + struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 157 + struct veth_rq *rq = NULL; 129 158 struct net_device *rcv; 130 159 int length = skb->len; 160 + bool rcv_xdp = false; 161 + int rxq; 131 162 132 163 rcu_read_lock(); 133 164 rcv = rcu_dereference(priv->peer); ··· 192 111 goto drop; 193 112 } 194 113 195 - if (likely(dev_forward_skb(rcv, skb) == NET_RX_SUCCESS)) { 114 + rcv_priv = netdev_priv(rcv); 115 + rxq = skb_get_queue_mapping(skb); 116 + if (rxq < rcv->real_num_rx_queues) { 117 + rq = &rcv_priv->rq[rxq]; 118 + rcv_xdp = rcu_access_pointer(rq->xdp_prog); 119 + if (rcv_xdp) 120 + skb_record_rx_queue(skb, rxq); 121 + } 122 + 123 + if (likely(veth_forward_skb(rcv, skb, rq, rcv_xdp) == NET_RX_SUCCESS)) { 196 124 struct pcpu_vstats *stats = this_cpu_ptr(dev->vstats); 197 125 198 126 u64_stats_update_begin(&stats->syncp); ··· 212 122 drop: 213 123 atomic64_inc(&priv->dropped); 214 124 } 125 + 126 + if (rcv_xdp) 127 + __veth_xdp_flush(rq); 128 + 215 129 rcu_read_unlock(); 130 + 216 131 return NETDEV_TX_OK; 217 132 } 218 - 219 - /* 220 - * general routines 221 - */ 222 133 223 134 static u64 veth_stats_one(struct pcpu_vstats *result, struct net_device *dev) 224 135 { 
··· 270 179 { 271 180 } 272 181 182 + static struct sk_buff *veth_build_skb(void *head, int headroom, int len, 183 + int buflen) 184 + { 185 + struct sk_buff *skb; 186 + 187 + if (!buflen) { 188 + buflen = SKB_DATA_ALIGN(headroom + len) + 189 + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 190 + } 191 + skb = build_skb(head, buflen); 192 + if (!skb) 193 + return NULL; 194 + 195 + skb_reserve(skb, headroom); 196 + skb_put(skb, len); 197 + 198 + return skb; 199 + } 200 + 201 + static int veth_select_rxq(struct net_device *dev) 202 + { 203 + return smp_processor_id() % dev->real_num_rx_queues; 204 + } 205 + 206 + static int veth_xdp_xmit(struct net_device *dev, int n, 207 + struct xdp_frame **frames, u32 flags) 208 + { 209 + struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 210 + struct net_device *rcv; 211 + unsigned int max_len; 212 + struct veth_rq *rq; 213 + int i, drops = 0; 214 + 215 + if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) 216 + return -EINVAL; 217 + 218 + rcv = rcu_dereference(priv->peer); 219 + if (unlikely(!rcv)) 220 + return -ENXIO; 221 + 222 + rcv_priv = netdev_priv(rcv); 223 + rq = &rcv_priv->rq[veth_select_rxq(rcv)]; 224 + /* Non-NULL xdp_prog ensures that xdp_ring is initialized on receive 225 + * side. This means an XDP program is loaded on the peer and the peer 226 + * device is up. 
227 + */ 228 + if (!rcu_access_pointer(rq->xdp_prog)) 229 + return -ENXIO; 230 + 231 + max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN; 232 + 233 + spin_lock(&rq->xdp_ring.producer_lock); 234 + for (i = 0; i < n; i++) { 235 + struct xdp_frame *frame = frames[i]; 236 + void *ptr = veth_xdp_to_ptr(frame); 237 + 238 + if (unlikely(frame->len > max_len || 239 + __ptr_ring_produce(&rq->xdp_ring, ptr))) { 240 + xdp_return_frame_rx_napi(frame); 241 + drops++; 242 + } 243 + } 244 + spin_unlock(&rq->xdp_ring.producer_lock); 245 + 246 + if (flags & XDP_XMIT_FLUSH) 247 + __veth_xdp_flush(rq); 248 + 249 + return n - drops; 250 + } 251 + 252 + static void veth_xdp_flush(struct net_device *dev) 253 + { 254 + struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 255 + struct net_device *rcv; 256 + struct veth_rq *rq; 257 + 258 + rcu_read_lock(); 259 + rcv = rcu_dereference(priv->peer); 260 + if (unlikely(!rcv)) 261 + goto out; 262 + 263 + rcv_priv = netdev_priv(rcv); 264 + rq = &rcv_priv->rq[veth_select_rxq(rcv)]; 265 + /* xdp_ring is initialized on receive side? 
*/ 266 + if (unlikely(!rcu_access_pointer(rq->xdp_prog))) 267 + goto out; 268 + 269 + __veth_xdp_flush(rq); 270 + out: 271 + rcu_read_unlock(); 272 + } 273 + 274 + static int veth_xdp_tx(struct net_device *dev, struct xdp_buff *xdp) 275 + { 276 + struct xdp_frame *frame = convert_to_xdp_frame(xdp); 277 + 278 + if (unlikely(!frame)) 279 + return -EOVERFLOW; 280 + 281 + return veth_xdp_xmit(dev, 1, &frame, 0); 282 + } 283 + 284 + static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq, 285 + struct xdp_frame *frame, 286 + unsigned int *xdp_xmit) 287 + { 288 + void *hard_start = frame->data - frame->headroom; 289 + void *head = hard_start - sizeof(struct xdp_frame); 290 + int len = frame->len, delta = 0; 291 + struct xdp_frame orig_frame; 292 + struct bpf_prog *xdp_prog; 293 + unsigned int headroom; 294 + struct sk_buff *skb; 295 + 296 + rcu_read_lock(); 297 + xdp_prog = rcu_dereference(rq->xdp_prog); 298 + if (likely(xdp_prog)) { 299 + struct xdp_buff xdp; 300 + u32 act; 301 + 302 + xdp.data_hard_start = hard_start; 303 + xdp.data = frame->data; 304 + xdp.data_end = frame->data + frame->len; 305 + xdp.data_meta = frame->data - frame->metasize; 306 + xdp.rxq = &rq->xdp_rxq; 307 + 308 + act = bpf_prog_run_xdp(xdp_prog, &xdp); 309 + 310 + switch (act) { 311 + case XDP_PASS: 312 + delta = frame->data - xdp.data; 313 + len = xdp.data_end - xdp.data; 314 + break; 315 + case XDP_TX: 316 + orig_frame = *frame; 317 + xdp.data_hard_start = head; 318 + xdp.rxq->mem = frame->mem; 319 + if (unlikely(veth_xdp_tx(rq->dev, &xdp) < 0)) { 320 + trace_xdp_exception(rq->dev, xdp_prog, act); 321 + frame = &orig_frame; 322 + goto err_xdp; 323 + } 324 + *xdp_xmit |= VETH_XDP_TX; 325 + rcu_read_unlock(); 326 + goto xdp_xmit; 327 + case XDP_REDIRECT: 328 + orig_frame = *frame; 329 + xdp.data_hard_start = head; 330 + xdp.rxq->mem = frame->mem; 331 + if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) { 332 + frame = &orig_frame; 333 + goto err_xdp; 334 + } 335 + *xdp_xmit |= VETH_XDP_REDIR; 336 
+ rcu_read_unlock(); 337 + goto xdp_xmit; 338 + default: 339 + bpf_warn_invalid_xdp_action(act); 340 + case XDP_ABORTED: 341 + trace_xdp_exception(rq->dev, xdp_prog, act); 342 + case XDP_DROP: 343 + goto err_xdp; 344 + } 345 + } 346 + rcu_read_unlock(); 347 + 348 + headroom = sizeof(struct xdp_frame) + frame->headroom - delta; 349 + skb = veth_build_skb(head, headroom, len, 0); 350 + if (!skb) { 351 + xdp_return_frame(frame); 352 + goto err; 353 + } 354 + 355 + xdp_scrub_frame(frame); 356 + skb->protocol = eth_type_trans(skb, rq->dev); 357 + err: 358 + return skb; 359 + err_xdp: 360 + rcu_read_unlock(); 361 + xdp_return_frame(frame); 362 + xdp_xmit: 363 + return NULL; 364 + } 365 + 366 + static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, struct sk_buff *skb, 367 + unsigned int *xdp_xmit) 368 + { 369 + u32 pktlen, headroom, act, metalen; 370 + void *orig_data, *orig_data_end; 371 + struct bpf_prog *xdp_prog; 372 + int mac_len, delta, off; 373 + struct xdp_buff xdp; 374 + 375 + rcu_read_lock(); 376 + xdp_prog = rcu_dereference(rq->xdp_prog); 377 + if (unlikely(!xdp_prog)) { 378 + rcu_read_unlock(); 379 + goto out; 380 + } 381 + 382 + mac_len = skb->data - skb_mac_header(skb); 383 + pktlen = skb->len + mac_len; 384 + headroom = skb_headroom(skb) - mac_len; 385 + 386 + if (skb_shared(skb) || skb_head_is_locked(skb) || 387 + skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) { 388 + struct sk_buff *nskb; 389 + int size, head_off; 390 + void *head, *start; 391 + struct page *page; 392 + 393 + size = SKB_DATA_ALIGN(VETH_XDP_HEADROOM + pktlen) + 394 + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 395 + if (size > PAGE_SIZE) 396 + goto drop; 397 + 398 + page = alloc_page(GFP_ATOMIC | __GFP_NOWARN); 399 + if (!page) 400 + goto drop; 401 + 402 + head = page_address(page); 403 + start = head + VETH_XDP_HEADROOM; 404 + if (skb_copy_bits(skb, -mac_len, start, pktlen)) { 405 + page_frag_free(head); 406 + goto drop; 407 + } 408 + 409 + nskb = veth_build_skb(head, 
410 + VETH_XDP_HEADROOM + mac_len, skb->len, 411 + PAGE_SIZE); 412 + if (!nskb) { 413 + page_frag_free(head); 414 + goto drop; 415 + } 416 + 417 + skb_copy_header(nskb, skb); 418 + head_off = skb_headroom(nskb) - skb_headroom(skb); 419 + skb_headers_offset_update(nskb, head_off); 420 + if (skb->sk) 421 + skb_set_owner_w(nskb, skb->sk); 422 + consume_skb(skb); 423 + skb = nskb; 424 + } 425 + 426 + xdp.data_hard_start = skb->head; 427 + xdp.data = skb_mac_header(skb); 428 + xdp.data_end = xdp.data + pktlen; 429 + xdp.data_meta = xdp.data; 430 + xdp.rxq = &rq->xdp_rxq; 431 + orig_data = xdp.data; 432 + orig_data_end = xdp.data_end; 433 + 434 + act = bpf_prog_run_xdp(xdp_prog, &xdp); 435 + 436 + switch (act) { 437 + case XDP_PASS: 438 + break; 439 + case XDP_TX: 440 + get_page(virt_to_page(xdp.data)); 441 + consume_skb(skb); 442 + xdp.rxq->mem = rq->xdp_mem; 443 + if (unlikely(veth_xdp_tx(rq->dev, &xdp) < 0)) { 444 + trace_xdp_exception(rq->dev, xdp_prog, act); 445 + goto err_xdp; 446 + } 447 + *xdp_xmit |= VETH_XDP_TX; 448 + rcu_read_unlock(); 449 + goto xdp_xmit; 450 + case XDP_REDIRECT: 451 + get_page(virt_to_page(xdp.data)); 452 + consume_skb(skb); 453 + xdp.rxq->mem = rq->xdp_mem; 454 + if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) 455 + goto err_xdp; 456 + *xdp_xmit |= VETH_XDP_REDIR; 457 + rcu_read_unlock(); 458 + goto xdp_xmit; 459 + default: 460 + bpf_warn_invalid_xdp_action(act); 461 + case XDP_ABORTED: 462 + trace_xdp_exception(rq->dev, xdp_prog, act); 463 + case XDP_DROP: 464 + goto drop; 465 + } 466 + rcu_read_unlock(); 467 + 468 + delta = orig_data - xdp.data; 469 + off = mac_len + delta; 470 + if (off > 0) 471 + __skb_push(skb, off); 472 + else if (off < 0) 473 + __skb_pull(skb, -off); 474 + skb->mac_header -= delta; 475 + off = xdp.data_end - orig_data_end; 476 + if (off != 0) 477 + __skb_put(skb, off); 478 + skb->protocol = eth_type_trans(skb, rq->dev); 479 + 480 + metalen = xdp.data - xdp.data_meta; 481 + if (metalen) 482 + skb_metadata_set(skb, 
metalen); 483 + out: 484 + return skb; 485 + drop: 486 + rcu_read_unlock(); 487 + kfree_skb(skb); 488 + return NULL; 489 + err_xdp: 490 + rcu_read_unlock(); 491 + page_frag_free(xdp.data); 492 + xdp_xmit: 493 + return NULL; 494 + } 495 + 496 + static int veth_xdp_rcv(struct veth_rq *rq, int budget, unsigned int *xdp_xmit) 497 + { 498 + int i, done = 0; 499 + 500 + for (i = 0; i < budget; i++) { 501 + void *ptr = __ptr_ring_consume(&rq->xdp_ring); 502 + struct sk_buff *skb; 503 + 504 + if (!ptr) 505 + break; 506 + 507 + if (veth_is_xdp_frame(ptr)) { 508 + skb = veth_xdp_rcv_one(rq, veth_ptr_to_xdp(ptr), 509 + xdp_xmit); 510 + } else { 511 + skb = veth_xdp_rcv_skb(rq, ptr, xdp_xmit); 512 + } 513 + 514 + if (skb) 515 + napi_gro_receive(&rq->xdp_napi, skb); 516 + 517 + done++; 518 + } 519 + 520 + return done; 521 + } 522 + 523 + static int veth_poll(struct napi_struct *napi, int budget) 524 + { 525 + struct veth_rq *rq = 526 + container_of(napi, struct veth_rq, xdp_napi); 527 + unsigned int xdp_xmit = 0; 528 + int done; 529 + 530 + xdp_set_return_frame_no_direct(); 531 + done = veth_xdp_rcv(rq, budget, &xdp_xmit); 532 + 533 + if (done < budget && napi_complete_done(napi, done)) { 534 + /* Write rx_notify_masked before reading ptr_ring */ 535 + smp_store_mb(rq->rx_notify_masked, false); 536 + if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) { 537 + rq->rx_notify_masked = true; 538 + napi_schedule(&rq->xdp_napi); 539 + } 540 + } 541 + 542 + if (xdp_xmit & VETH_XDP_TX) 543 + veth_xdp_flush(rq->dev); 544 + if (xdp_xmit & VETH_XDP_REDIR) 545 + xdp_do_flush_map(); 546 + xdp_clear_return_frame_no_direct(); 547 + 548 + return done; 549 + } 550 + 551 + static int veth_napi_add(struct net_device *dev) 552 + { 553 + struct veth_priv *priv = netdev_priv(dev); 554 + int err, i; 555 + 556 + for (i = 0; i < dev->real_num_rx_queues; i++) { 557 + struct veth_rq *rq = &priv->rq[i]; 558 + 559 + err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL); 560 + if (err) 561 + goto 
err_xdp_ring; 562 + } 563 + 564 + for (i = 0; i < dev->real_num_rx_queues; i++) { 565 + struct veth_rq *rq = &priv->rq[i]; 566 + 567 + netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT); 568 + napi_enable(&rq->xdp_napi); 569 + } 570 + 571 + return 0; 572 + err_xdp_ring: 573 + for (i--; i >= 0; i--) 574 + ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free); 575 + 576 + return err; 577 + } 578 + 579 + static void veth_napi_del(struct net_device *dev) 580 + { 581 + struct veth_priv *priv = netdev_priv(dev); 582 + int i; 583 + 584 + for (i = 0; i < dev->real_num_rx_queues; i++) { 585 + struct veth_rq *rq = &priv->rq[i]; 586 + 587 + napi_disable(&rq->xdp_napi); 588 + napi_hash_del(&rq->xdp_napi); 589 + } 590 + synchronize_net(); 591 + 592 + for (i = 0; i < dev->real_num_rx_queues; i++) { 593 + struct veth_rq *rq = &priv->rq[i]; 594 + 595 + netif_napi_del(&rq->xdp_napi); 596 + rq->rx_notify_masked = false; 597 + ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free); 598 + } 599 + } 600 + 601 + static int veth_enable_xdp(struct net_device *dev) 602 + { 603 + struct veth_priv *priv = netdev_priv(dev); 604 + int err, i; 605 + 606 + if (!xdp_rxq_info_is_reg(&priv->rq[0].xdp_rxq)) { 607 + for (i = 0; i < dev->real_num_rx_queues; i++) { 608 + struct veth_rq *rq = &priv->rq[i]; 609 + 610 + err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i); 611 + if (err < 0) 612 + goto err_rxq_reg; 613 + 614 + err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq, 615 + MEM_TYPE_PAGE_SHARED, 616 + NULL); 617 + if (err < 0) 618 + goto err_reg_mem; 619 + 620 + /* Save original mem info as it can be overwritten */ 621 + rq->xdp_mem = rq->xdp_rxq.mem; 622 + } 623 + 624 + err = veth_napi_add(dev); 625 + if (err) 626 + goto err_rxq_reg; 627 + } 628 + 629 + for (i = 0; i < dev->real_num_rx_queues; i++) 630 + rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog); 631 + 632 + return 0; 633 + err_reg_mem: 634 + xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq); 635 + err_rxq_reg: 636 + for (i--; i >= 0; i--) 637 
+ xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq); 638 + 639 + return err; 640 + } 641 + 642 + static void veth_disable_xdp(struct net_device *dev) 643 + { 644 + struct veth_priv *priv = netdev_priv(dev); 645 + int i; 646 + 647 + for (i = 0; i < dev->real_num_rx_queues; i++) 648 + rcu_assign_pointer(priv->rq[i].xdp_prog, NULL); 649 + veth_napi_del(dev); 650 + for (i = 0; i < dev->real_num_rx_queues; i++) { 651 + struct veth_rq *rq = &priv->rq[i]; 652 + 653 + rq->xdp_rxq.mem = rq->xdp_mem; 654 + xdp_rxq_info_unreg(&rq->xdp_rxq); 655 + } 656 + } 657 + 273 658 static int veth_open(struct net_device *dev) 274 659 { 275 660 struct veth_priv *priv = netdev_priv(dev); 276 661 struct net_device *peer = rtnl_dereference(priv->peer); 662 + int err; 277 663 278 664 if (!peer) 279 665 return -ENOTCONN; 666 + 667 + if (priv->_xdp_prog) { 668 + err = veth_enable_xdp(dev); 669 + if (err) 670 + return err; 671 + } 280 672 281 673 if (peer->flags & IFF_UP) { 282 674 netif_carrier_on(dev); 283 675 netif_carrier_on(peer); 284 676 } 677 + 285 678 return 0; 286 679 } 287 680 ··· 777 202 netif_carrier_off(dev); 778 203 if (peer) 779 204 netif_carrier_off(peer); 205 + 206 + if (priv->_xdp_prog) 207 + veth_disable_xdp(dev); 780 208 781 209 return 0; 782 210 } ··· 806 228 static void veth_poll_controller(struct net_device *dev) 807 229 { 808 230 /* veth only receives frames when its peer sends one 809 - * Since it's a synchronous operation, we are guaranteed 231 + * Since it has nothing to do with disabling irqs, we are guaranteed 810 232 * never to have pending data when we poll for it so 811 233 * there is nothing to do here. 
812 234 * ··· 829 251 rcu_read_unlock(); 830 252 831 253 return iflink; 254 + } 255 + 256 + static netdev_features_t veth_fix_features(struct net_device *dev, 257 + netdev_features_t features) 258 + { 259 + struct veth_priv *priv = netdev_priv(dev); 260 + struct net_device *peer; 261 + 262 + peer = rtnl_dereference(priv->peer); 263 + if (peer) { 264 + struct veth_priv *peer_priv = netdev_priv(peer); 265 + 266 + if (peer_priv->_xdp_prog) 267 + features &= ~NETIF_F_GSO_SOFTWARE; 268 + } 269 + 270 + return features; 832 271 } 833 272 834 273 static void veth_set_rx_headroom(struct net_device *dev, int new_hr) ··· 871 276 rcu_read_unlock(); 872 277 } 873 278 279 + static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog, 280 + struct netlink_ext_ack *extack) 281 + { 282 + struct veth_priv *priv = netdev_priv(dev); 283 + struct bpf_prog *old_prog; 284 + struct net_device *peer; 285 + unsigned int max_mtu; 286 + int err; 287 + 288 + old_prog = priv->_xdp_prog; 289 + priv->_xdp_prog = prog; 290 + peer = rtnl_dereference(priv->peer); 291 + 292 + if (prog) { 293 + if (!peer) { 294 + NL_SET_ERR_MSG_MOD(extack, "Cannot set XDP when peer is detached"); 295 + err = -ENOTCONN; 296 + goto err; 297 + } 298 + 299 + max_mtu = PAGE_SIZE - VETH_XDP_HEADROOM - 300 + peer->hard_header_len - 301 + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 302 + if (peer->mtu > max_mtu) { 303 + NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP"); 304 + err = -ERANGE; 305 + goto err; 306 + } 307 + 308 + if (dev->real_num_rx_queues < peer->real_num_tx_queues) { 309 + NL_SET_ERR_MSG_MOD(extack, "XDP expects number of rx queues not less than peer tx queues"); 310 + err = -ENOSPC; 311 + goto err; 312 + } 313 + 314 + if (dev->flags & IFF_UP) { 315 + err = veth_enable_xdp(dev); 316 + if (err) { 317 + NL_SET_ERR_MSG_MOD(extack, "Setup for XDP failed"); 318 + goto err; 319 + } 320 + } 321 + 322 + if (!old_prog) { 323 + peer->hw_features &= ~NETIF_F_GSO_SOFTWARE; 324 + peer->max_mtu = 
max_mtu; 325 + } 326 + } 327 + 328 + if (old_prog) { 329 + if (!prog) { 330 + if (dev->flags & IFF_UP) 331 + veth_disable_xdp(dev); 332 + 333 + if (peer) { 334 + peer->hw_features |= NETIF_F_GSO_SOFTWARE; 335 + peer->max_mtu = ETH_MAX_MTU; 336 + } 337 + } 338 + bpf_prog_put(old_prog); 339 + } 340 + 341 + if ((!!old_prog ^ !!prog) && peer) 342 + netdev_update_features(peer); 343 + 344 + return 0; 345 + err: 346 + priv->_xdp_prog = old_prog; 347 + 348 + return err; 349 + } 350 + 351 + static u32 veth_xdp_query(struct net_device *dev) 352 + { 353 + struct veth_priv *priv = netdev_priv(dev); 354 + const struct bpf_prog *xdp_prog; 355 + 356 + xdp_prog = priv->_xdp_prog; 357 + if (xdp_prog) 358 + return xdp_prog->aux->id; 359 + 360 + return 0; 361 + } 362 + 363 + static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp) 364 + { 365 + switch (xdp->command) { 366 + case XDP_SETUP_PROG: 367 + return veth_xdp_set(dev, xdp->prog, xdp->extack); 368 + case XDP_QUERY_PROG: 369 + xdp->prog_id = veth_xdp_query(dev); 370 + return 0; 371 + default: 372 + return -EINVAL; 373 + } 374 + } 375 + 874 376 static const struct net_device_ops veth_netdev_ops = { 875 377 .ndo_init = veth_dev_init, 876 378 .ndo_open = veth_open, ··· 980 288 .ndo_poll_controller = veth_poll_controller, 981 289 #endif 982 290 .ndo_get_iflink = veth_get_iflink, 291 + .ndo_fix_features = veth_fix_features, 983 292 .ndo_features_check = passthru_features_check, 984 293 .ndo_set_rx_headroom = veth_set_rx_headroom, 294 + .ndo_bpf = veth_xdp, 295 + .ndo_xdp_xmit = veth_xdp_xmit, 985 296 }; 986 297 987 298 #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \ ··· 1040 345 return 0; 1041 346 } 1042 347 348 + static int veth_alloc_queues(struct net_device *dev) 349 + { 350 + struct veth_priv *priv = netdev_priv(dev); 351 + 352 + priv->rq = kcalloc(dev->num_rx_queues, sizeof(*priv->rq), GFP_KERNEL); 353 + if (!priv->rq) 354 + return -ENOMEM; 355 + 356 + return 0; 357 + } 358 + 359 + static 
void veth_free_queues(struct net_device *dev) 360 + { 361 + struct veth_priv *priv = netdev_priv(dev); 362 + 363 + kfree(priv->rq); 364 + } 365 + 1043 366 static struct rtnl_link_ops veth_link_ops; 1044 367 1045 368 static int veth_newlink(struct net *src_net, struct net_device *dev, 1046 369 struct nlattr *tb[], struct nlattr *data[], 1047 370 struct netlink_ext_ack *extack) 1048 371 { 1049 - int err; 372 + int err, i; 1050 373 struct net_device *peer; 1051 374 struct veth_priv *priv; 1052 375 char ifname[IFNAMSIZ]; ··· 1117 404 return PTR_ERR(peer); 1118 405 } 1119 406 407 + err = veth_alloc_queues(peer); 408 + if (err) { 409 + put_net(net); 410 + goto err_peer_alloc_queues; 411 + } 412 + 1120 413 if (!ifmp || !tbp[IFLA_ADDRESS]) 1121 414 eth_hw_addr_random(peer); 1122 415 ··· 1151 432 * should be re-allocated 1152 433 */ 1153 434 435 + err = veth_alloc_queues(dev); 436 + if (err) 437 + goto err_alloc_queues; 438 + 1154 439 if (tb[IFLA_ADDRESS] == NULL) 1155 440 eth_hw_addr_random(dev); 1156 441 ··· 1174 451 */ 1175 452 1176 453 priv = netdev_priv(dev); 454 + for (i = 0; i < dev->real_num_rx_queues; i++) 455 + priv->rq[i].dev = dev; 1177 456 rcu_assign_pointer(priv->peer, peer); 1178 457 1179 458 priv = netdev_priv(peer); 459 + for (i = 0; i < peer->real_num_rx_queues; i++) 460 + priv->rq[i].dev = peer; 1180 461 rcu_assign_pointer(priv->peer, dev); 462 + 1181 463 return 0; 1182 464 1183 465 err_register_dev: 466 + veth_free_queues(dev); 467 + err_alloc_queues: 1184 468 /* nothing to do */ 1185 469 err_configure_peer: 1186 470 unregister_netdevice(peer); 1187 471 return err; 1188 472 1189 473 err_register_peer: 474 + veth_free_queues(peer); 475 + err_peer_alloc_queues: 1190 476 free_netdev(peer); 1191 477 return err; 1192 478 }
+37 -4
include/linux/bpf.h
··· 23 23 struct bpf_map; 24 24 struct sock; 25 25 struct seq_file; 26 - struct btf; 26 + struct btf_type; 27 27 28 28 /* map is generic key/value storage optionally accesible by eBPF programs */ 29 29 struct bpf_map_ops { ··· 48 48 u32 (*map_fd_sys_lookup_elem)(void *ptr); 49 49 void (*map_seq_show_elem)(struct bpf_map *map, void *key, 50 50 struct seq_file *m); 51 - int (*map_check_btf)(const struct bpf_map *map, const struct btf *btf, 52 - u32 key_type_id, u32 value_type_id); 51 + int (*map_check_btf)(const struct bpf_map *map, 52 + const struct btf_type *key_type, 53 + const struct btf_type *value_type); 53 54 }; 54 55 55 56 struct bpf_map { ··· 119 118 120 119 static inline bool bpf_map_support_seq_show(const struct bpf_map *map) 121 120 { 122 - return map->ops->map_seq_show_elem && map->ops->map_check_btf; 121 + return map->btf && map->ops->map_seq_show_elem; 123 122 } 123 + 124 + int map_check_no_btf(const struct bpf_map *map, 125 + const struct btf_type *key_type, 126 + const struct btf_type *value_type); 124 127 125 128 extern const struct bpf_map_ops bpf_map_offload_ops; 126 129 ··· 529 524 } 530 525 531 526 struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type); 527 + int array_map_alloc_check(union bpf_attr *attr); 532 528 533 529 #else /* !CONFIG_BPF_SYSCALL */ 534 530 static inline struct bpf_prog *bpf_prog_get(u32 ufd) ··· 774 768 { 775 769 } 776 770 #endif 771 + 772 + #if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) 773 + void bpf_sk_reuseport_detach(struct sock *sk); 774 + int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, 775 + void *value); 776 + int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, 777 + void *value, u64 map_flags); 778 + #else 779 + static inline void bpf_sk_reuseport_detach(struct sock *sk) 780 + { 781 + } 782 + 783 + #ifdef CONFIG_BPF_SYSCALL 784 + static inline int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, 785 + void *key, void *value) 786 
+ { 787 + return -EOPNOTSUPP; 788 + } 789 + 790 + static inline int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, 791 + void *key, void *value, 792 + u64 map_flags) 793 + { 794 + return -EOPNOTSUPP; 795 + } 796 + #endif /* CONFIG_BPF_SYSCALL */ 797 + #endif /* defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) */ 777 798 778 799 /* verifier prototypes for helper functions called from eBPF programs */ 779 800 extern const struct bpf_func_proto bpf_map_lookup_elem_proto;
+6
include/linux/bpf_types.h
··· 29 29 #ifdef CONFIG_BPF_LIRC_MODE2 30 30 BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2) 31 31 #endif 32 + #ifdef CONFIG_INET 33 + BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport) 34 + #endif 32 35 33 36 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) 34 37 BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops) ··· 62 59 BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops) 63 60 #if defined(CONFIG_XDP_SOCKETS) 64 61 BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops) 62 + #endif 63 + #ifdef CONFIG_INET 64 + BPF_MAP_TYPE(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, reuseport_array_ops) 65 65 #endif 66 66 #endif
+30
include/linux/cgroup.h
··· 554 554 } 555 555 556 556 /** 557 + * cgroup_ancestor - find ancestor of cgroup 558 + * @cgrp: cgroup to find ancestor of 559 + * @ancestor_level: level of ancestor to find starting from root 560 + * 561 + * Find ancestor of cgroup at specified level starting from root if it exists 562 + * and return pointer to it. Return NULL if @cgrp doesn't have ancestor at 563 + * @ancestor_level. 564 + * 565 + * This function is safe to call as long as @cgrp is accessible. 566 + */ 567 + static inline struct cgroup *cgroup_ancestor(struct cgroup *cgrp, 568 + int ancestor_level) 569 + { 570 + struct cgroup *ptr; 571 + 572 + if (cgrp->level < ancestor_level) 573 + return NULL; 574 + 575 + for (ptr = cgrp; 576 + ptr && ptr->level > ancestor_level; 577 + ptr = cgroup_parent(ptr)) 578 + ; 579 + 580 + if (ptr && ptr->level == ancestor_level) 581 + return ptr; 582 + 583 + return NULL; 584 + } 585 + 586 + /** 557 587 * task_under_cgroup_hierarchy - test task's membership of cgroup ancestry 558 588 * @task: the task to be tested 559 589 * @ancestor: possible ancestor of @task's cgroup
+51
include/linux/filter.h
··· 32 32 struct bpf_prog_aux; 33 33 struct xdp_rxq_info; 34 34 struct xdp_buff; 35 + struct sock_reuseport; 35 36 36 37 /* ArgX, context and stack frame pointer register positions. Note, 37 38 * Arg1, Arg2, Arg3, etc are used as argument mappings of function ··· 538 537 struct list_head list; 539 538 }; 540 539 540 + struct bpf_redirect_info { 541 + u32 ifindex; 542 + u32 flags; 543 + struct bpf_map *map; 544 + struct bpf_map *map_to_flush; 545 + unsigned long map_owner; 546 + u32 kern_flags; 547 + }; 548 + 549 + DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); 550 + 551 + /* flags for bpf_redirect_info kern_flags */ 552 + #define BPF_RI_F_RF_NO_DIRECT BIT(0) /* no napi_direct on return_frame */ 553 + 541 554 /* Compute the linear packet data range [data, data_end) which 542 555 * will be accessed by various program types (cls_bpf, act_bpf, 543 556 * lwt, ...). Subsystems allowing direct data access must (!) ··· 753 738 int sk_attach_bpf(u32 ufd, struct sock *sk); 754 739 int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk); 755 740 int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk); 741 + void sk_reuseport_prog_free(struct bpf_prog *prog); 756 742 int sk_detach_filter(struct sock *sk); 757 743 int sk_get_filter(struct sock *sk, struct sock_filter __user *filter, 758 744 unsigned int len); ··· 780 764 781 765 struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, 782 766 const struct bpf_insn *patch, u32 len); 767 + 768 + static inline bool xdp_return_frame_no_direct(void) 769 + { 770 + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); 771 + 772 + return ri->kern_flags & BPF_RI_F_RF_NO_DIRECT; 773 + } 774 + 775 + static inline void xdp_set_return_frame_no_direct(void) 776 + { 777 + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); 778 + 779 + ri->kern_flags |= BPF_RI_F_RF_NO_DIRECT; 780 + } 781 + 782 + static inline void xdp_clear_return_frame_no_direct(void) 783 + { 784 + struct 
bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); 785 + 786 + ri->kern_flags &= ~BPF_RI_F_RF_NO_DIRECT; 787 + } 783 788 784 789 static inline int xdp_ok_fwd_dev(const struct net_device *fwd, 785 790 unsigned int pktlen) ··· 834 797 835 798 struct sock *do_sk_redirect_map(struct sk_buff *skb); 836 799 struct sock *do_msg_redirect_map(struct sk_msg_buff *md); 800 + 801 + #ifdef CONFIG_INET 802 + struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, 803 + struct bpf_prog *prog, struct sk_buff *skb, 804 + u32 hash); 805 + #else 806 + static inline struct sock * 807 + bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, 808 + struct bpf_prog *prog, struct sk_buff *skb, 809 + u32 hash) 810 + { 811 + return NULL; 812 + } 813 + #endif 837 814 838 815 #ifdef CONFIG_BPF_JIT 839 816 extern int bpf_jit_enable;
+1
include/linux/skbuff.h
··· 1038 1038 } 1039 1039 1040 1040 struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src); 1041 + void skb_headers_offset_update(struct sk_buff *skb, int off); 1041 1042 int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask); 1042 1043 struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t priority); 1043 1044 void skb_copy_header(struct sk_buff *new, const struct sk_buff *old);
+1
include/net/addrconf.h
··· 108 108 u32 banned_flags); 109 109 bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, 110 110 bool match_wildcard); 111 + bool inet_rcv_saddr_any(const struct sock *sk); 111 112 void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr); 112 113 void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr); 113 114
+15 -4
include/net/sock_reuseport.h
··· 5 5 #include <linux/filter.h> 6 6 #include <linux/skbuff.h> 7 7 #include <linux/types.h> 8 + #include <linux/spinlock.h> 8 9 #include <net/sock.h> 10 + 11 + extern spinlock_t reuseport_lock; 9 12 10 13 struct sock_reuseport { 11 14 struct rcu_head rcu; 12 15 13 16 u16 max_socks; /* length of socks */ 14 17 u16 num_socks; /* elements in socks */ 18 + /* The last synq overflow event timestamp of this 19 + * reuse->socks[] group. 20 + */ 21 + unsigned int synq_overflow_ts; 22 + /* ID stays the same even after the size of socks[] grows. */ 23 + unsigned int reuseport_id; 24 + bool bind_inany; 15 25 struct bpf_prog __rcu *prog; /* optional BPF sock selector */ 16 26 struct sock *socks[0]; /* array of sock pointers */ 17 27 }; 18 28 19 - extern int reuseport_alloc(struct sock *sk); 20 - extern int reuseport_add_sock(struct sock *sk, struct sock *sk2); 29 + extern int reuseport_alloc(struct sock *sk, bool bind_inany); 30 + extern int reuseport_add_sock(struct sock *sk, struct sock *sk2, 31 + bool bind_inany); 21 32 extern void reuseport_detach_sock(struct sock *sk); 22 33 extern struct sock *reuseport_select_sock(struct sock *sk, 23 34 u32 hash, 24 35 struct sk_buff *skb, 25 36 int hdr_len); 26 - extern struct bpf_prog *reuseport_attach_prog(struct sock *sk, 27 - struct bpf_prog *prog); 37 + extern int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog); 38 + int reuseport_get_id(struct sock_reuseport *reuse); 28 39 29 40 #endif /* _SOCK_REUSEPORT_H */
+28 -2
include/net/tcp.h
··· 36 36 #include <net/inet_hashtables.h> 37 37 #include <net/checksum.h> 38 38 #include <net/request_sock.h> 39 + #include <net/sock_reuseport.h> 39 40 #include <net/sock.h> 40 41 #include <net/snmp.h> 41 42 #include <net/ip.h> ··· 474 473 */ 475 474 static inline void tcp_synq_overflow(const struct sock *sk) 476 475 { 477 - unsigned int last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; 476 + unsigned int last_overflow; 478 477 unsigned int now = jiffies; 479 478 479 + if (sk->sk_reuseport) { 480 + struct sock_reuseport *reuse; 481 + 482 + reuse = rcu_dereference(sk->sk_reuseport_cb); 483 + if (likely(reuse)) { 484 + last_overflow = READ_ONCE(reuse->synq_overflow_ts); 485 + if (time_after32(now, last_overflow + HZ)) 486 + WRITE_ONCE(reuse->synq_overflow_ts, now); 487 + return; 488 + } 489 + } 490 + 491 + last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; 480 492 if (time_after32(now, last_overflow + HZ)) 481 493 tcp_sk(sk)->rx_opt.ts_recent_stamp = now; 482 494 } ··· 497 483 /* syncookies: no recent synqueue overflow on this listening socket? */ 498 484 static inline bool tcp_synq_no_recent_overflow(const struct sock *sk) 499 485 { 500 - unsigned int last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; 486 + unsigned int last_overflow; 501 487 unsigned int now = jiffies; 502 488 489 + if (sk->sk_reuseport) { 490 + struct sock_reuseport *reuse; 491 + 492 + reuse = rcu_dereference(sk->sk_reuseport_cb); 493 + if (likely(reuse)) { 494 + last_overflow = READ_ONCE(reuse->synq_overflow_ts); 495 + return time_after32(now, last_overflow + 496 + TCP_SYNCOOKIE_VALID); 497 + } 498 + } 499 + 500 + last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; 503 501 return time_after32(now, last_overflow + TCP_SYNCOOKIE_VALID); 504 502 } 505 503
+7
include/net/xdp.h
··· 84 84 struct net_device *dev_rx; /* used by cpumap */ 85 85 }; 86 86 87 + /* Clear kernel pointers in xdp_frame */ 88 + static inline void xdp_scrub_frame(struct xdp_frame *frame) 89 + { 90 + frame->data = NULL; 91 + frame->dev_rx = NULL; 92 + } 93 + 87 94 /* Convert xdp_buff to xdp_frame */ 88 95 static inline 89 96 struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
+55 -1
include/uapi/linux/bpf.h
··· 126 126 BPF_MAP_TYPE_XSKMAP, 127 127 BPF_MAP_TYPE_SOCKHASH, 128 128 BPF_MAP_TYPE_CGROUP_STORAGE, 129 + BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, 129 130 }; 130 131 131 132 enum bpf_prog_type { ··· 151 150 BPF_PROG_TYPE_CGROUP_SOCK_ADDR, 152 151 BPF_PROG_TYPE_LWT_SEG6LOCAL, 153 152 BPF_PROG_TYPE_LIRC_MODE2, 153 + BPF_PROG_TYPE_SK_REUSEPORT, 154 154 }; 155 155 156 156 enum bpf_attach_type { ··· 2093 2091 * Return 2094 2092 * The id is returned or 0 in case the id could not be retrieved. 2095 2093 * 2094 + * u64 bpf_skb_ancestor_cgroup_id(struct sk_buff *skb, int ancestor_level) 2095 + * Description 2096 + * Return id of cgroup v2 that is ancestor of cgroup associated 2097 + * with the *skb* at the *ancestor_level*. The root cgroup is at 2098 + * *ancestor_level* zero and each step down the hierarchy 2099 + * increments the level. If *ancestor_level* == level of cgroup 2100 + * associated with *skb*, then return value will be same as that 2101 + * of **bpf_skb_cgroup_id**\ (). 2102 + * 2103 + * The helper is useful to implement policies based on cgroups 2104 + * that are upper in hierarchy than immediate cgroup associated 2105 + * with *skb*. 2106 + * 2107 + * The format of returned id and helper limitations are same as in 2108 + * **bpf_skb_cgroup_id**\ (). 2109 + * Return 2110 + * The id is returned or 0 in case the id could not be retrieved. 2111 + * 2096 2112 * u64 bpf_get_current_cgroup_id(void) 2097 2113 * Return 2098 2114 * A 64-bit integer containing the current cgroup id based ··· 2133 2113 * the shared data. 2134 2114 * Return 2135 2115 * Pointer to the local storage area. 2116 + * 2117 + * int bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags) 2118 + * Description 2119 + * Select a SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_ARRAY map 2120 + * It checks the selected sk is matching the incoming 2121 + * request in the skb. 2122 + * Return 2123 + * 0 on success, or a negative error in case of failure. 
2136 2124 */ 2137 2125 #define __BPF_FUNC_MAPPER(FN) \ 2138 2126 FN(unspec), \ ··· 2224 2196 FN(rc_keydown), \ 2225 2197 FN(skb_cgroup_id), \ 2226 2198 FN(get_current_cgroup_id), \ 2227 - FN(get_local_storage), 2199 + FN(get_local_storage), \ 2200 + FN(sk_select_reuseport), \ 2201 + FN(skb_ancestor_cgroup_id), 2228 2202 2229 2203 /* integer value in 'imm' field of BPF_CALL instruction selects which helper 2230 2204 * function eBPF program intends to call ··· 2441 2411 __u32 local_ip6[4]; /* Stored in network byte order */ 2442 2412 __u32 remote_port; /* Stored in network byte order */ 2443 2413 __u32 local_port; /* stored in host byte order */ 2414 + }; 2415 + 2416 + struct sk_reuseport_md { 2417 + /* 2418 + * Start of directly accessible data. It begins from 2419 + * the tcp/udp header. 2420 + */ 2421 + void *data; 2422 + void *data_end; /* End of directly accessible data */ 2423 + /* 2424 + * Total length of packet (starting from the tcp/udp header). 2425 + * Note that the directly accessible bytes (data_end - data) 2426 + * could be less than this "len". Those bytes could be 2427 + * indirectly read by a helper "bpf_skb_load_bytes()". 2428 + */ 2429 + __u32 len; 2430 + /* 2431 + * Eth protocol in the mac header (network byte order). e.g. 2432 + * ETH_P_IP(0x0800) and ETH_P_IPV6(0x86DD) 2433 + */ 2434 + __u32 eth_protocol; 2435 + __u32 ip_protocol; /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */ 2436 + __u32 bind_inany; /* Is sock bound to an INANY address? */ 2437 + __u32 hash; /* A hash of the packet 4 tuples */ 2444 2438 }; 2445 2439 2446 2440 #define BPF_TAG_SIZE 8
+3
kernel/bpf/Makefile
··· 23 23 obj-$(CONFIG_BPF_SYSCALL) += stackmap.o 24 24 endif 25 25 obj-$(CONFIG_CGROUP_BPF) += cgroup.o 26 + ifeq ($(CONFIG_INET),y) 27 + obj-$(CONFIG_BPF_SYSCALL) += reuseport_array.o 28 + endif
+13 -15
kernel/bpf/arraymap.c
··· 54 54 } 55 55 56 56 /* Called from syscall */ 57 - static int array_map_alloc_check(union bpf_attr *attr) 57 + int array_map_alloc_check(union bpf_attr *attr) 58 58 { 59 59 bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; 60 60 int numa_node = bpf_map_attr_numa_node(attr); ··· 358 358 rcu_read_unlock(); 359 359 } 360 360 361 - static int array_map_check_btf(const struct bpf_map *map, const struct btf *btf, 362 - u32 btf_key_id, u32 btf_value_id) 361 + static int array_map_check_btf(const struct bpf_map *map, 362 + const struct btf_type *key_type, 363 + const struct btf_type *value_type) 363 364 { 364 - const struct btf_type *key_type, *value_type; 365 - u32 key_size, value_size; 366 365 u32 int_data; 367 366 368 - key_type = btf_type_id_size(btf, &btf_key_id, &key_size); 369 - if (!key_type || BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) 367 + if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) 370 368 return -EINVAL; 371 369 372 370 int_data = *(u32 *)(key_type + 1); 373 - /* bpf array can only take a u32 key. This check makes 374 - * sure that the btf matches the attr used during map_create. 371 + /* bpf array can only take a u32 key. This check makes sure 372 + * that the btf matches the attr used during map_create. 
375 373 */ 376 - if (BTF_INT_BITS(int_data) != 32 || key_size != 4 || 377 - BTF_INT_OFFSET(int_data)) 378 - return -EINVAL; 379 - 380 - value_type = btf_type_id_size(btf, &btf_value_id, &value_size); 381 - if (!value_type || value_size != map->value_size) 374 + if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data)) 382 375 return -EINVAL; 383 376 384 377 return 0; ··· 398 405 .map_lookup_elem = percpu_array_map_lookup_elem, 399 406 .map_update_elem = array_map_update_elem, 400 407 .map_delete_elem = array_map_delete_elem, 408 + .map_check_btf = array_map_check_btf, 401 409 }; 402 410 403 411 static int fd_array_map_alloc_check(union bpf_attr *attr) ··· 540 546 .map_fd_put_ptr = prog_fd_array_put_ptr, 541 547 .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem, 542 548 .map_release_uref = bpf_fd_array_map_clear, 549 + .map_check_btf = map_check_no_btf, 543 550 }; 544 551 545 552 static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, ··· 629 634 .map_fd_get_ptr = perf_event_fd_array_get_ptr, 630 635 .map_fd_put_ptr = perf_event_fd_array_put_ptr, 631 636 .map_release = perf_event_fd_array_release, 637 + .map_check_btf = map_check_no_btf, 632 638 }; 633 639 634 640 #ifdef CONFIG_CGROUPS ··· 661 665 .map_delete_elem = fd_array_map_delete_elem, 662 666 .map_fd_get_ptr = cgroup_fd_array_get_ptr, 663 667 .map_fd_put_ptr = cgroup_fd_array_put_ptr, 668 + .map_check_btf = map_check_no_btf, 664 669 }; 665 670 #endif 666 671 ··· 746 749 .map_fd_put_ptr = bpf_map_fd_put_ptr, 747 750 .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, 748 751 .map_gen_lookup = array_of_map_gen_lookup, 752 + .map_check_btf = map_check_no_btf, 749 753 };
+1
kernel/bpf/cpumap.c
··· 555 555 .map_update_elem = cpu_map_update_elem, 556 556 .map_lookup_elem = cpu_map_lookup_elem, 557 557 .map_get_next_key = cpu_map_get_next_key, 558 + .map_check_btf = map_check_no_btf, 558 559 }; 559 560 560 561 static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
+1
kernel/bpf/devmap.c
··· 488 488 .map_lookup_elem = dev_map_lookup_elem, 489 489 .map_update_elem = dev_map_update_elem, 490 490 .map_delete_elem = dev_map_delete_elem, 491 + .map_check_btf = map_check_no_btf, 491 492 }; 492 493 493 494 static int dev_map_notification(struct notifier_block *notifier,
+26
kernel/bpf/hashtab.c
··· 11 11 * General Public License for more details. 12 12 */ 13 13 #include <linux/bpf.h> 14 + #include <linux/btf.h> 14 15 #include <linux/jhash.h> 15 16 #include <linux/filter.h> 16 17 #include <linux/rculist_nulls.h> 18 + #include <uapi/linux/btf.h> 17 19 #include "percpu_freelist.h" 18 20 #include "bpf_lru_list.h" 19 21 #include "map_in_map.h" ··· 1164 1162 kfree(htab); 1165 1163 } 1166 1164 1165 + static void htab_map_seq_show_elem(struct bpf_map *map, void *key, 1166 + struct seq_file *m) 1167 + { 1168 + void *value; 1169 + 1170 + rcu_read_lock(); 1171 + 1172 + value = htab_map_lookup_elem(map, key); 1173 + if (!value) { 1174 + rcu_read_unlock(); 1175 + return; 1176 + } 1177 + 1178 + btf_type_seq_show(map->btf, map->btf_key_type_id, key, m); 1179 + seq_puts(m, ": "); 1180 + btf_type_seq_show(map->btf, map->btf_value_type_id, value, m); 1181 + seq_puts(m, "\n"); 1182 + 1183 + rcu_read_unlock(); 1184 + } 1185 + 1167 1186 const struct bpf_map_ops htab_map_ops = { 1168 1187 .map_alloc_check = htab_map_alloc_check, 1169 1188 .map_alloc = htab_map_alloc, ··· 1194 1171 .map_update_elem = htab_map_update_elem, 1195 1172 .map_delete_elem = htab_map_delete_elem, 1196 1173 .map_gen_lookup = htab_map_gen_lookup, 1174 + .map_seq_show_elem = htab_map_seq_show_elem, 1197 1175 }; 1198 1176 1199 1177 const struct bpf_map_ops htab_lru_map_ops = { ··· 1206 1182 .map_update_elem = htab_lru_map_update_elem, 1207 1183 .map_delete_elem = htab_lru_map_delete_elem, 1208 1184 .map_gen_lookup = htab_lru_map_gen_lookup, 1185 + .map_seq_show_elem = htab_map_seq_show_elem, 1209 1186 }; 1210 1187 1211 1188 /* Called from eBPF program */ ··· 1433 1408 .map_fd_put_ptr = bpf_map_fd_put_ptr, 1434 1409 .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, 1435 1410 .map_gen_lookup = htab_of_map_gen_lookup, 1411 + .map_check_btf = map_check_no_btf, 1436 1412 };
+7 -4
kernel/bpf/inode.c
··· 196 196 { 197 197 struct bpf_map *map = seq_file_to_map(m); 198 198 void *key = map_iter(m)->key; 199 + void *prev_key; 199 200 200 201 if (map_iter(m)->done) 201 202 return NULL; 202 203 203 204 if (unlikely(v == SEQ_START_TOKEN)) 204 - goto done; 205 + prev_key = NULL; 206 + else 207 + prev_key = key; 205 208 206 - if (map->ops->map_get_next_key(map, key, key)) { 209 + if (map->ops->map_get_next_key(map, prev_key, key)) { 207 210 map_iter(m)->done = true; 208 211 return NULL; 209 212 } 210 213 211 - done: 212 214 ++(*pos); 213 215 return key; 214 216 } ··· 334 332 struct bpf_map *map = arg; 335 333 336 334 return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops, 337 - map->btf ? &bpffs_map_fops : &bpffs_obj_fops); 335 + bpf_map_support_seq_show(map) ? 336 + &bpffs_map_fops : &bpffs_obj_fops); 338 337 } 339 338 340 339 static struct dentry *
+1
kernel/bpf/local_storage.c
··· 246 246 .map_lookup_elem = cgroup_storage_lookup_elem, 247 247 .map_update_elem = cgroup_storage_update_elem, 248 248 .map_delete_elem = cgroup_storage_delete_elem, 249 + .map_check_btf = map_check_no_btf, 249 250 }; 250 251 251 252 int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map)
+12
kernel/bpf/lpm_trie.c
··· 10 10 */ 11 11 12 12 #include <linux/bpf.h> 13 + #include <linux/btf.h> 13 14 #include <linux/err.h> 14 15 #include <linux/slab.h> 15 16 #include <linux/spinlock.h> 16 17 #include <linux/vmalloc.h> 17 18 #include <net/ipv6.h> 19 + #include <uapi/linux/btf.h> 18 20 19 21 /* Intermediate node */ 20 22 #define LPM_TREE_NODE_FLAG_IM BIT(0) ··· 688 686 return err; 689 687 } 690 688 689 + static int trie_check_btf(const struct bpf_map *map, 690 + const struct btf_type *key_type, 691 + const struct btf_type *value_type) 692 + { 693 + /* Keys must have struct bpf_lpm_trie_key embedded. */ 694 + return BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT ? 695 + -EINVAL : 0; 696 + } 697 + 691 698 const struct bpf_map_ops trie_map_ops = { 692 699 .map_alloc = trie_alloc, 693 700 .map_free = trie_free, ··· 704 693 .map_lookup_elem = trie_lookup_elem, 705 694 .map_update_elem = trie_update_elem, 706 695 .map_delete_elem = trie_delete_elem, 696 + .map_check_btf = trie_check_btf, 707 697 };
+363
kernel/bpf/reuseport_array.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Copyright (c) 2018 Facebook 4 + */ 5 + #include <linux/bpf.h> 6 + #include <linux/err.h> 7 + #include <linux/sock_diag.h> 8 + #include <net/sock_reuseport.h> 9 + 10 + struct reuseport_array { 11 + struct bpf_map map; 12 + struct sock __rcu *ptrs[]; 13 + }; 14 + 15 + static struct reuseport_array *reuseport_array(struct bpf_map *map) 16 + { 17 + return (struct reuseport_array *)map; 18 + } 19 + 20 + /* The caller must hold the reuseport_lock */ 21 + void bpf_sk_reuseport_detach(struct sock *sk) 22 + { 23 + struct sock __rcu **socks; 24 + 25 + write_lock_bh(&sk->sk_callback_lock); 26 + socks = sk->sk_user_data; 27 + if (socks) { 28 + WRITE_ONCE(sk->sk_user_data, NULL); 29 + /* 30 + * Do not move this NULL assignment outside of 31 + * sk->sk_callback_lock because there is 32 + * a race with reuseport_array_free() 33 + * which does not hold the reuseport_lock. 34 + */ 35 + RCU_INIT_POINTER(*socks, NULL); 36 + } 37 + write_unlock_bh(&sk->sk_callback_lock); 38 + } 39 + 40 + static int reuseport_array_alloc_check(union bpf_attr *attr) 41 + { 42 + if (attr->value_size != sizeof(u32) && 43 + attr->value_size != sizeof(u64)) 44 + return -EINVAL; 45 + 46 + return array_map_alloc_check(attr); 47 + } 48 + 49 + static void *reuseport_array_lookup_elem(struct bpf_map *map, void *key) 50 + { 51 + struct reuseport_array *array = reuseport_array(map); 52 + u32 index = *(u32 *)key; 53 + 54 + if (unlikely(index >= array->map.max_entries)) 55 + return NULL; 56 + 57 + return rcu_dereference(array->ptrs[index]); 58 + } 59 + 60 + /* Called from syscall only */ 61 + static int reuseport_array_delete_elem(struct bpf_map *map, void *key) 62 + { 63 + struct reuseport_array *array = reuseport_array(map); 64 + u32 index = *(u32 *)key; 65 + struct sock *sk; 66 + int err; 67 + 68 + if (index >= map->max_entries) 69 + return -E2BIG; 70 + 71 + if (!rcu_access_pointer(array->ptrs[index])) 72 + return -ENOENT; 73 + 74 + 
spin_lock_bh(&reuseport_lock); 75 + 76 + sk = rcu_dereference_protected(array->ptrs[index], 77 + lockdep_is_held(&reuseport_lock)); 78 + if (sk) { 79 + write_lock_bh(&sk->sk_callback_lock); 80 + WRITE_ONCE(sk->sk_user_data, NULL); 81 + RCU_INIT_POINTER(array->ptrs[index], NULL); 82 + write_unlock_bh(&sk->sk_callback_lock); 83 + err = 0; 84 + } else { 85 + err = -ENOENT; 86 + } 87 + 88 + spin_unlock_bh(&reuseport_lock); 89 + 90 + return err; 91 + } 92 + 93 + static void reuseport_array_free(struct bpf_map *map) 94 + { 95 + struct reuseport_array *array = reuseport_array(map); 96 + struct sock *sk; 97 + u32 i; 98 + 99 + synchronize_rcu(); 100 + 101 + /* 102 + * ops->map_*_elem() will not be able to access this 103 + * array now. Hence, this function only races with 104 + * bpf_sk_reuseport_detach() which was triggerred by 105 + * close() or disconnect(). 106 + * 107 + * This function and bpf_sk_reuseport_detach() are 108 + * both removing sk from "array". Who removes it 109 + * first does not matter. 110 + * 111 + * The only concern here is bpf_sk_reuseport_detach() 112 + * may access "array" which is being freed here. 113 + * bpf_sk_reuseport_detach() access this "array" 114 + * through sk->sk_user_data _and_ with sk->sk_callback_lock 115 + * held which is enough because this "array" is not freed 116 + * until all sk->sk_user_data has stopped referencing this "array". 117 + * 118 + * Hence, due to the above, taking "reuseport_lock" is not 119 + * needed here. 120 + */ 121 + 122 + /* 123 + * Since reuseport_lock is not taken, sk is accessed under 124 + * rcu_read_lock() 125 + */ 126 + rcu_read_lock(); 127 + for (i = 0; i < map->max_entries; i++) { 128 + sk = rcu_dereference(array->ptrs[i]); 129 + if (sk) { 130 + write_lock_bh(&sk->sk_callback_lock); 131 + /* 132 + * No need for WRITE_ONCE(). At this point, 133 + * no one is reading it without taking the 134 + * sk->sk_callback_lock. 
135 + */ 136 + sk->sk_user_data = NULL; 137 + write_unlock_bh(&sk->sk_callback_lock); 138 + RCU_INIT_POINTER(array->ptrs[i], NULL); 139 + } 140 + } 141 + rcu_read_unlock(); 142 + 143 + /* 144 + * Once reaching here, all sk->sk_user_data is not 145 + * referenceing this "array". "array" can be freed now. 146 + */ 147 + bpf_map_area_free(array); 148 + } 149 + 150 + static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) 151 + { 152 + int err, numa_node = bpf_map_attr_numa_node(attr); 153 + struct reuseport_array *array; 154 + u64 cost, array_size; 155 + 156 + if (!capable(CAP_SYS_ADMIN)) 157 + return ERR_PTR(-EPERM); 158 + 159 + array_size = sizeof(*array); 160 + array_size += (u64)attr->max_entries * sizeof(struct sock *); 161 + 162 + /* make sure there is no u32 overflow later in round_up() */ 163 + cost = array_size; 164 + if (cost >= U32_MAX - PAGE_SIZE) 165 + return ERR_PTR(-ENOMEM); 166 + cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; 167 + 168 + err = bpf_map_precharge_memlock(cost); 169 + if (err) 170 + return ERR_PTR(err); 171 + 172 + /* allocate all map elements and zero-initialize them */ 173 + array = bpf_map_area_alloc(array_size, numa_node); 174 + if (!array) 175 + return ERR_PTR(-ENOMEM); 176 + 177 + /* copy mandatory map attributes */ 178 + bpf_map_init_from_attr(&array->map, attr); 179 + array->map.pages = cost; 180 + 181 + return &array->map; 182 + } 183 + 184 + int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, 185 + void *value) 186 + { 187 + struct sock *sk; 188 + int err; 189 + 190 + if (map->value_size != sizeof(u64)) 191 + return -ENOSPC; 192 + 193 + rcu_read_lock(); 194 + sk = reuseport_array_lookup_elem(map, key); 195 + if (sk) { 196 + *(u64 *)value = sock_gen_cookie(sk); 197 + err = 0; 198 + } else { 199 + err = -ENOENT; 200 + } 201 + rcu_read_unlock(); 202 + 203 + return err; 204 + } 205 + 206 + static int 207 + reuseport_array_update_check(const struct reuseport_array *array, 208 + const struct sock *nsk, 
209 + const struct sock *osk, 210 + const struct sock_reuseport *nsk_reuse, 211 + u32 map_flags) 212 + { 213 + if (osk && map_flags == BPF_NOEXIST) 214 + return -EEXIST; 215 + 216 + if (!osk && map_flags == BPF_EXIST) 217 + return -ENOENT; 218 + 219 + if (nsk->sk_protocol != IPPROTO_UDP && nsk->sk_protocol != IPPROTO_TCP) 220 + return -ENOTSUPP; 221 + 222 + if (nsk->sk_family != AF_INET && nsk->sk_family != AF_INET6) 223 + return -ENOTSUPP; 224 + 225 + if (nsk->sk_type != SOCK_STREAM && nsk->sk_type != SOCK_DGRAM) 226 + return -ENOTSUPP; 227 + 228 + /* 229 + * sk must be hashed (i.e. listening in the TCP case or binded 230 + * in the UDP case) and 231 + * it must also be a SO_REUSEPORT sk (i.e. reuse cannot be NULL). 232 + * 233 + * Also, sk will be used in bpf helper that is protected by 234 + * rcu_read_lock(). 235 + */ 236 + if (!sock_flag(nsk, SOCK_RCU_FREE) || !sk_hashed(nsk) || !nsk_reuse) 237 + return -EINVAL; 238 + 239 + /* READ_ONCE because the sk->sk_callback_lock may not be held here */ 240 + if (READ_ONCE(nsk->sk_user_data)) 241 + return -EBUSY; 242 + 243 + return 0; 244 + } 245 + 246 + /* 247 + * Called from syscall only. 248 + * The "nsk" in the fd refcnt. 249 + * The "osk" and "reuse" are protected by reuseport_lock. 
250 + */ 251 + int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, 252 + void *value, u64 map_flags) 253 + { 254 + struct reuseport_array *array = reuseport_array(map); 255 + struct sock *free_osk = NULL, *osk, *nsk; 256 + struct sock_reuseport *reuse; 257 + u32 index = *(u32 *)key; 258 + struct socket *socket; 259 + int err, fd; 260 + 261 + if (map_flags > BPF_EXIST) 262 + return -EINVAL; 263 + 264 + if (index >= map->max_entries) 265 + return -E2BIG; 266 + 267 + if (map->value_size == sizeof(u64)) { 268 + u64 fd64 = *(u64 *)value; 269 + 270 + if (fd64 > S32_MAX) 271 + return -EINVAL; 272 + fd = fd64; 273 + } else { 274 + fd = *(int *)value; 275 + } 276 + 277 + socket = sockfd_lookup(fd, &err); 278 + if (!socket) 279 + return err; 280 + 281 + nsk = socket->sk; 282 + if (!nsk) { 283 + err = -EINVAL; 284 + goto put_file; 285 + } 286 + 287 + /* Quick checks before taking reuseport_lock */ 288 + err = reuseport_array_update_check(array, nsk, 289 + rcu_access_pointer(array->ptrs[index]), 290 + rcu_access_pointer(nsk->sk_reuseport_cb), 291 + map_flags); 292 + if (err) 293 + goto put_file; 294 + 295 + spin_lock_bh(&reuseport_lock); 296 + /* 297 + * Some of the checks only need reuseport_lock 298 + * but it is done under sk_callback_lock also 299 + * for simplicity reason. 
300 + */ 301 + write_lock_bh(&nsk->sk_callback_lock); 302 + 303 + osk = rcu_dereference_protected(array->ptrs[index], 304 + lockdep_is_held(&reuseport_lock)); 305 + reuse = rcu_dereference_protected(nsk->sk_reuseport_cb, 306 + lockdep_is_held(&reuseport_lock)); 307 + err = reuseport_array_update_check(array, nsk, osk, reuse, map_flags); 308 + if (err) 309 + goto put_file_unlock; 310 + 311 + /* Ensure reuse->reuseport_id is set */ 312 + err = reuseport_get_id(reuse); 313 + if (err < 0) 314 + goto put_file_unlock; 315 + 316 + WRITE_ONCE(nsk->sk_user_data, &array->ptrs[index]); 317 + rcu_assign_pointer(array->ptrs[index], nsk); 318 + free_osk = osk; 319 + err = 0; 320 + 321 + put_file_unlock: 322 + write_unlock_bh(&nsk->sk_callback_lock); 323 + 324 + if (free_osk) { 325 + write_lock_bh(&free_osk->sk_callback_lock); 326 + WRITE_ONCE(free_osk->sk_user_data, NULL); 327 + write_unlock_bh(&free_osk->sk_callback_lock); 328 + } 329 + 330 + spin_unlock_bh(&reuseport_lock); 331 + put_file: 332 + fput(socket->file); 333 + return err; 334 + } 335 + 336 + /* Called from syscall */ 337 + static int reuseport_array_get_next_key(struct bpf_map *map, void *key, 338 + void *next_key) 339 + { 340 + struct reuseport_array *array = reuseport_array(map); 341 + u32 index = key ? *(u32 *)key : U32_MAX; 342 + u32 *next = (u32 *)next_key; 343 + 344 + if (index >= array->map.max_entries) { 345 + *next = 0; 346 + return 0; 347 + } 348 + 349 + if (index == array->map.max_entries - 1) 350 + return -ENOENT; 351 + 352 + *next = index + 1; 353 + return 0; 354 + } 355 + 356 + const struct bpf_map_ops reuseport_array_ops = { 357 + .map_alloc_check = reuseport_array_alloc_check, 358 + .map_alloc = reuseport_array_alloc, 359 + .map_free = reuseport_array_free, 360 + .map_lookup_elem = reuseport_array_lookup_elem, 361 + .map_get_next_key = reuseport_array_get_next_key, 362 + .map_delete_elem = reuseport_array_delete_elem, 363 + };
+2
kernel/bpf/sockmap.c
··· 2498 2498 .map_update_elem = sock_map_update_elem, 2499 2499 .map_delete_elem = sock_map_delete_elem, 2500 2500 .map_release_uref = sock_map_release, 2501 + .map_check_btf = map_check_no_btf, 2501 2502 }; 2502 2503 2503 2504 const struct bpf_map_ops sock_hash_ops = { ··· 2509 2508 .map_update_elem = sock_hash_update_elem, 2510 2509 .map_delete_elem = sock_hash_delete_elem, 2511 2510 .map_release_uref = sock_map_release, 2511 + .map_check_btf = map_check_no_btf, 2512 2512 }; 2513 2513 2514 2514 BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
+1
kernel/bpf/stackmap.c
··· 607 607 .map_lookup_elem = stack_map_lookup_elem, 608 608 .map_update_elem = stack_map_update_elem, 609 609 .map_delete_elem = stack_map_delete_elem, 610 + .map_check_btf = map_check_no_btf, 610 611 }; 611 612 612 613 static int __init stack_map_init(void)
+38 -4
kernel/bpf/syscall.c
··· 103 103 const struct bpf_map_ops bpf_map_offload_ops = { 104 104 .map_alloc = bpf_map_offload_map_alloc, 105 105 .map_free = bpf_map_offload_map_free, 106 + .map_check_btf = map_check_no_btf, 106 107 }; 107 108 108 109 static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) ··· 456 455 return 0; 457 456 } 458 457 458 + int map_check_no_btf(const struct bpf_map *map, 459 + const struct btf_type *key_type, 460 + const struct btf_type *value_type) 461 + { 462 + return -ENOTSUPP; 463 + } 464 + 465 + static int map_check_btf(const struct bpf_map *map, const struct btf *btf, 466 + u32 btf_key_id, u32 btf_value_id) 467 + { 468 + const struct btf_type *key_type, *value_type; 469 + u32 key_size, value_size; 470 + int ret = 0; 471 + 472 + key_type = btf_type_id_size(btf, &btf_key_id, &key_size); 473 + if (!key_type || key_size != map->key_size) 474 + return -EINVAL; 475 + 476 + value_type = btf_type_id_size(btf, &btf_value_id, &value_size); 477 + if (!value_type || value_size != map->value_size) 478 + return -EINVAL; 479 + 480 + if (map->ops->map_check_btf) 481 + ret = map->ops->map_check_btf(map, key_type, value_type); 482 + 483 + return ret; 484 + } 485 + 459 486 #define BPF_MAP_CREATE_LAST_FIELD btf_value_type_id 460 487 /* called via syscall */ 461 488 static int map_create(union bpf_attr *attr) ··· 518 489 atomic_set(&map->refcnt, 1); 519 490 atomic_set(&map->usercnt, 1); 520 491 521 - if (bpf_map_support_seq_show(map) && 522 - (attr->btf_key_type_id || attr->btf_value_type_id)) { 492 + if (attr->btf_key_type_id || attr->btf_value_type_id) { 523 493 struct btf *btf; 524 494 525 495 if (!attr->btf_key_type_id || !attr->btf_value_type_id) { ··· 532 504 goto free_map_nouncharge; 533 505 } 534 506 535 - err = map->ops->map_check_btf(map, btf, attr->btf_key_type_id, 536 - attr->btf_value_type_id); 507 + err = map_check_btf(map, btf, attr->btf_key_type_id, 508 + attr->btf_value_type_id); 537 509 if (err) { 538 510 btf_put(btf); 539 511 goto free_map_nouncharge; 
··· 712 684 err = bpf_fd_array_map_lookup_elem(map, key, value); 713 685 } else if (IS_FD_HASH(map)) { 714 686 err = bpf_fd_htab_map_lookup_elem(map, key, value); 687 + } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { 688 + err = bpf_fd_reuseport_array_lookup_elem(map, key, value); 715 689 } else { 716 690 rcu_read_lock(); 717 691 ptr = map->ops->map_lookup_elem(map, key); ··· 820 790 err = bpf_fd_htab_map_update_elem(map, f.file, key, value, 821 791 attr->flags); 822 792 rcu_read_unlock(); 793 + } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { 794 + /* rcu_read_lock() is not needed */ 795 + err = bpf_fd_reuseport_array_update_elem(map, key, value, 796 + attr->flags); 823 797 } else { 824 798 rcu_read_lock(); 825 799 err = map->ops->map_update_elem(map, key, value, attr->flags);
+9
kernel/bpf/verifier.c
··· 1310 1310 case BPF_PROG_TYPE_LWT_IN: 1311 1311 case BPF_PROG_TYPE_LWT_OUT: 1312 1312 case BPF_PROG_TYPE_LWT_SEG6LOCAL: 1313 + case BPF_PROG_TYPE_SK_REUSEPORT: 1313 1314 /* dst_input() and dst_output() can't write for now */ 1314 1315 if (t == BPF_WRITE) 1315 1316 return false; ··· 2167 2166 func_id != BPF_FUNC_msg_redirect_hash) 2168 2167 goto error; 2169 2168 break; 2169 + case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY: 2170 + if (func_id != BPF_FUNC_sk_select_reuseport) 2171 + goto error; 2172 + break; 2170 2173 default: 2171 2174 break; 2172 2175 } ··· 2220 2215 break; 2221 2216 case BPF_FUNC_get_local_storage: 2222 2217 if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE) 2218 + goto error; 2219 + break; 2220 + case BPF_FUNC_sk_select_reuseport: 2221 + if (map->map_type != BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) 2223 2222 goto error; 2224 2223 break; 2225 2224 default:
+1 -2
kernel/bpf/xskmap.c
··· 227 227 .map_lookup_elem = xsk_map_lookup_elem, 228 228 .map_update_elem = xsk_map_update_elem, 229 229 .map_delete_elem = xsk_map_delete_elem, 230 + .map_check_btf = map_check_no_btf, 230 231 }; 231 - 232 -
+359 -54
net/core/filter.c
··· 1453 1453 return 0; 1454 1454 } 1455 1455 1456 - static int __reuseport_attach_prog(struct bpf_prog *prog, struct sock *sk) 1457 - { 1458 - struct bpf_prog *old_prog; 1459 - int err; 1460 - 1461 - if (bpf_prog_size(prog->len) > sysctl_optmem_max) 1462 - return -ENOMEM; 1463 - 1464 - if (sk_unhashed(sk) && sk->sk_reuseport) { 1465 - err = reuseport_alloc(sk); 1466 - if (err) 1467 - return err; 1468 - } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) { 1469 - /* The socket wasn't bound with SO_REUSEPORT */ 1470 - return -EINVAL; 1471 - } 1472 - 1473 - old_prog = reuseport_attach_prog(sk, prog); 1474 - if (old_prog) 1475 - bpf_prog_destroy(old_prog); 1476 - 1477 - return 0; 1478 - } 1479 - 1480 1456 static 1481 1457 struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk) 1482 1458 { ··· 1526 1550 if (IS_ERR(prog)) 1527 1551 return PTR_ERR(prog); 1528 1552 1529 - err = __reuseport_attach_prog(prog, sk); 1530 - if (err < 0) { 1531 - __bpf_prog_release(prog); 1532 - return err; 1533 - } 1553 + if (bpf_prog_size(prog->len) > sysctl_optmem_max) 1554 + err = -ENOMEM; 1555 + else 1556 + err = reuseport_attach_prog(sk, prog); 1534 1557 1535 - return 0; 1558 + if (err) 1559 + __bpf_prog_release(prog); 1560 + 1561 + return err; 1536 1562 } 1537 1563 1538 1564 static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk) ··· 1564 1586 1565 1587 int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk) 1566 1588 { 1567 - struct bpf_prog *prog = __get_bpf(ufd, sk); 1589 + struct bpf_prog *prog; 1568 1590 int err; 1569 1591 1592 + if (sock_flag(sk, SOCK_FILTER_LOCKED)) 1593 + return -EPERM; 1594 + 1595 + prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER); 1596 + if (IS_ERR(prog) && PTR_ERR(prog) == -EINVAL) 1597 + prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SK_REUSEPORT); 1570 1598 if (IS_ERR(prog)) 1571 1599 return PTR_ERR(prog); 1572 1600 1573 - err = __reuseport_attach_prog(prog, sk); 1574 - if (err < 0) { 1575 - bpf_prog_put(prog); 1576 - return err; 
1601 + if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) { 1602 + /* Like other non BPF_PROG_TYPE_SOCKET_FILTER 1603 + * bpf prog (e.g. sockmap). It depends on the 1604 + * limitation imposed by bpf_prog_load(). 1605 + * Hence, sysctl_optmem_max is not checked. 1606 + */ 1607 + if ((sk->sk_type != SOCK_STREAM && 1608 + sk->sk_type != SOCK_DGRAM) || 1609 + (sk->sk_protocol != IPPROTO_UDP && 1610 + sk->sk_protocol != IPPROTO_TCP) || 1611 + (sk->sk_family != AF_INET && 1612 + sk->sk_family != AF_INET6)) { 1613 + err = -ENOTSUPP; 1614 + goto err_prog_put; 1615 + } 1616 + } else { 1617 + /* BPF_PROG_TYPE_SOCKET_FILTER */ 1618 + if (bpf_prog_size(prog->len) > sysctl_optmem_max) { 1619 + err = -ENOMEM; 1620 + goto err_prog_put; 1621 + } 1577 1622 } 1578 1623 1579 - return 0; 1624 + err = reuseport_attach_prog(sk, prog); 1625 + err_prog_put: 1626 + if (err) 1627 + bpf_prog_put(prog); 1628 + 1629 + return err; 1630 + } 1631 + 1632 + void sk_reuseport_prog_free(struct bpf_prog *prog) 1633 + { 1634 + if (!prog) 1635 + return; 1636 + 1637 + if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) 1638 + bpf_prog_put(prog); 1639 + else 1640 + bpf_prog_destroy(prog); 1580 1641 } 1581 1642 1582 1643 struct bpf_scratchpad { ··· 2099 2082 .arg3_type = ARG_ANYTHING, 2100 2083 }; 2101 2084 2102 - struct redirect_info { 2103 - u32 ifindex; 2104 - u32 flags; 2105 - struct bpf_map *map; 2106 - struct bpf_map *map_to_flush; 2107 - unsigned long map_owner; 2108 - }; 2109 - 2110 - static DEFINE_PER_CPU(struct redirect_info, redirect_info); 2085 + DEFINE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); 2086 + EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info); 2111 2087 2112 2088 BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) 2113 2089 { 2114 - struct redirect_info *ri = this_cpu_ptr(&redirect_info); 2090 + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); 2115 2091 2116 2092 if (unlikely(flags & ~(BPF_F_INGRESS))) 2117 2093 return TC_ACT_SHOT; ··· 2117 2107 2118 2108 int 
skb_do_redirect(struct sk_buff *skb) 2119 2109 { 2120 - struct redirect_info *ri = this_cpu_ptr(&redirect_info); 2110 + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); 2121 2111 struct net_device *dev; 2122 2112 2123 2113 dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->ifindex); ··· 3210 3200 3211 3201 void xdp_do_flush_map(void) 3212 3202 { 3213 - struct redirect_info *ri = this_cpu_ptr(&redirect_info); 3203 + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); 3214 3204 struct bpf_map *map = ri->map_to_flush; 3215 3205 3216 3206 ri->map_to_flush = NULL; ··· 3255 3245 static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, 3256 3246 struct bpf_prog *xdp_prog) 3257 3247 { 3258 - struct redirect_info *ri = this_cpu_ptr(&redirect_info); 3248 + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); 3259 3249 unsigned long map_owner = ri->map_owner; 3260 3250 struct bpf_map *map = ri->map; 3261 3251 u32 index = ri->ifindex; ··· 3295 3285 int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, 3296 3286 struct bpf_prog *xdp_prog) 3297 3287 { 3298 - struct redirect_info *ri = this_cpu_ptr(&redirect_info); 3288 + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); 3299 3289 struct net_device *fwd; 3300 3290 u32 index = ri->ifindex; 3301 3291 int err; ··· 3327 3317 struct xdp_buff *xdp, 3328 3318 struct bpf_prog *xdp_prog) 3329 3319 { 3330 - struct redirect_info *ri = this_cpu_ptr(&redirect_info); 3320 + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); 3331 3321 unsigned long map_owner = ri->map_owner; 3332 3322 struct bpf_map *map = ri->map; 3333 3323 u32 index = ri->ifindex; ··· 3378 3368 int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, 3379 3369 struct xdp_buff *xdp, struct bpf_prog *xdp_prog) 3380 3370 { 3381 - struct redirect_info *ri = this_cpu_ptr(&redirect_info); 3371 + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); 3382 
3372 u32 index = ri->ifindex; 3383 3373 struct net_device *fwd; 3384 3374 int err = 0; ··· 3409 3399 3410 3400 BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) 3411 3401 { 3412 - struct redirect_info *ri = this_cpu_ptr(&redirect_info); 3402 + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); 3413 3403 3414 3404 if (unlikely(flags)) 3415 3405 return XDP_ABORTED; ··· 3433 3423 BPF_CALL_4(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags, 3434 3424 unsigned long, map_owner) 3435 3425 { 3436 - struct redirect_info *ri = this_cpu_ptr(&redirect_info); 3426 + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); 3437 3427 3438 3428 if (unlikely(flags)) 3439 3429 return XDP_ABORTED; ··· 3777 3767 .gpl_only = false, 3778 3768 .ret_type = RET_INTEGER, 3779 3769 .arg1_type = ARG_PTR_TO_CTX, 3770 + }; 3771 + 3772 + BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, 3773 + ancestor_level) 3774 + { 3775 + struct sock *sk = skb_to_full_sk(skb); 3776 + struct cgroup *ancestor; 3777 + struct cgroup *cgrp; 3778 + 3779 + if (!sk || !sk_fullsock(sk)) 3780 + return 0; 3781 + 3782 + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); 3783 + ancestor = cgroup_ancestor(cgrp, ancestor_level); 3784 + if (!ancestor) 3785 + return 0; 3786 + 3787 + return ancestor->kn->id.id; 3788 + } 3789 + 3790 + static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = { 3791 + .func = bpf_skb_ancestor_cgroup_id, 3792 + .gpl_only = false, 3793 + .ret_type = RET_INTEGER, 3794 + .arg1_type = ARG_PTR_TO_CTX, 3795 + .arg2_type = ARG_ANYTHING, 3780 3796 }; 3781 3797 #endif 3782 3798 ··· 4992 4956 #ifdef CONFIG_SOCK_CGROUP_DATA 4993 4957 case BPF_FUNC_skb_cgroup_id: 4994 4958 return &bpf_skb_cgroup_id_proto; 4959 + case BPF_FUNC_skb_ancestor_cgroup_id: 4960 + return &bpf_skb_ancestor_cgroup_id_proto; 4995 4961 #endif 4996 4962 default: 4997 4963 return bpf_base_func_proto(func_id); ··· 7058 7020 release_sock(sk); 7059 7021 return ret; 
7060 7022 } 7023 + 7024 + #ifdef CONFIG_INET 7025 + struct sk_reuseport_kern { 7026 + struct sk_buff *skb; 7027 + struct sock *sk; 7028 + struct sock *selected_sk; 7029 + void *data_end; 7030 + u32 hash; 7031 + u32 reuseport_id; 7032 + bool bind_inany; 7033 + }; 7034 + 7035 + static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern, 7036 + struct sock_reuseport *reuse, 7037 + struct sock *sk, struct sk_buff *skb, 7038 + u32 hash) 7039 + { 7040 + reuse_kern->skb = skb; 7041 + reuse_kern->sk = sk; 7042 + reuse_kern->selected_sk = NULL; 7043 + reuse_kern->data_end = skb->data + skb_headlen(skb); 7044 + reuse_kern->hash = hash; 7045 + reuse_kern->reuseport_id = reuse->reuseport_id; 7046 + reuse_kern->bind_inany = reuse->bind_inany; 7047 + } 7048 + 7049 + struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, 7050 + struct bpf_prog *prog, struct sk_buff *skb, 7051 + u32 hash) 7052 + { 7053 + struct sk_reuseport_kern reuse_kern; 7054 + enum sk_action action; 7055 + 7056 + bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, hash); 7057 + action = BPF_PROG_RUN(prog, &reuse_kern); 7058 + 7059 + if (action == SK_PASS) 7060 + return reuse_kern.selected_sk; 7061 + else 7062 + return ERR_PTR(-ECONNREFUSED); 7063 + } 7064 + 7065 + BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern, 7066 + struct bpf_map *, map, void *, key, u32, flags) 7067 + { 7068 + struct sock_reuseport *reuse; 7069 + struct sock *selected_sk; 7070 + 7071 + selected_sk = map->ops->map_lookup_elem(map, key); 7072 + if (!selected_sk) 7073 + return -ENOENT; 7074 + 7075 + reuse = rcu_dereference(selected_sk->sk_reuseport_cb); 7076 + if (!reuse) 7077 + /* selected_sk is unhashed (e.g. by close()) after the 7078 + * above map_lookup_elem(). Treat selected_sk has already 7079 + * been removed from the map. 
7080 + */ 7081 + return -ENOENT; 7082 + 7083 + if (unlikely(reuse->reuseport_id != reuse_kern->reuseport_id)) { 7084 + struct sock *sk; 7085 + 7086 + if (unlikely(!reuse_kern->reuseport_id)) 7087 + /* There is a small race between adding the 7088 + * sk to the map and setting the 7089 + * reuse_kern->reuseport_id. 7090 + * Treat it as the sk has not been added to 7091 + * the bpf map yet. 7092 + */ 7093 + return -ENOENT; 7094 + 7095 + sk = reuse_kern->sk; 7096 + if (sk->sk_protocol != selected_sk->sk_protocol) 7097 + return -EPROTOTYPE; 7098 + else if (sk->sk_family != selected_sk->sk_family) 7099 + return -EAFNOSUPPORT; 7100 + 7101 + /* Catch all. Likely bound to a different sockaddr. */ 7102 + return -EBADFD; 7103 + } 7104 + 7105 + reuse_kern->selected_sk = selected_sk; 7106 + 7107 + return 0; 7108 + } 7109 + 7110 + static const struct bpf_func_proto sk_select_reuseport_proto = { 7111 + .func = sk_select_reuseport, 7112 + .gpl_only = false, 7113 + .ret_type = RET_INTEGER, 7114 + .arg1_type = ARG_PTR_TO_CTX, 7115 + .arg2_type = ARG_CONST_MAP_PTR, 7116 + .arg3_type = ARG_PTR_TO_MAP_KEY, 7117 + .arg4_type = ARG_ANYTHING, 7118 + }; 7119 + 7120 + BPF_CALL_4(sk_reuseport_load_bytes, 7121 + const struct sk_reuseport_kern *, reuse_kern, u32, offset, 7122 + void *, to, u32, len) 7123 + { 7124 + return ____bpf_skb_load_bytes(reuse_kern->skb, offset, to, len); 7125 + } 7126 + 7127 + static const struct bpf_func_proto sk_reuseport_load_bytes_proto = { 7128 + .func = sk_reuseport_load_bytes, 7129 + .gpl_only = false, 7130 + .ret_type = RET_INTEGER, 7131 + .arg1_type = ARG_PTR_TO_CTX, 7132 + .arg2_type = ARG_ANYTHING, 7133 + .arg3_type = ARG_PTR_TO_UNINIT_MEM, 7134 + .arg4_type = ARG_CONST_SIZE, 7135 + }; 7136 + 7137 + BPF_CALL_5(sk_reuseport_load_bytes_relative, 7138 + const struct sk_reuseport_kern *, reuse_kern, u32, offset, 7139 + void *, to, u32, len, u32, start_header) 7140 + { 7141 + return ____bpf_skb_load_bytes_relative(reuse_kern->skb, offset, to, 7142 + len, 
start_header); 7143 + } 7144 + 7145 + static const struct bpf_func_proto sk_reuseport_load_bytes_relative_proto = { 7146 + .func = sk_reuseport_load_bytes_relative, 7147 + .gpl_only = false, 7148 + .ret_type = RET_INTEGER, 7149 + .arg1_type = ARG_PTR_TO_CTX, 7150 + .arg2_type = ARG_ANYTHING, 7151 + .arg3_type = ARG_PTR_TO_UNINIT_MEM, 7152 + .arg4_type = ARG_CONST_SIZE, 7153 + .arg5_type = ARG_ANYTHING, 7154 + }; 7155 + 7156 + static const struct bpf_func_proto * 7157 + sk_reuseport_func_proto(enum bpf_func_id func_id, 7158 + const struct bpf_prog *prog) 7159 + { 7160 + switch (func_id) { 7161 + case BPF_FUNC_sk_select_reuseport: 7162 + return &sk_select_reuseport_proto; 7163 + case BPF_FUNC_skb_load_bytes: 7164 + return &sk_reuseport_load_bytes_proto; 7165 + case BPF_FUNC_skb_load_bytes_relative: 7166 + return &sk_reuseport_load_bytes_relative_proto; 7167 + default: 7168 + return bpf_base_func_proto(func_id); 7169 + } 7170 + } 7171 + 7172 + static bool 7173 + sk_reuseport_is_valid_access(int off, int size, 7174 + enum bpf_access_type type, 7175 + const struct bpf_prog *prog, 7176 + struct bpf_insn_access_aux *info) 7177 + { 7178 + const u32 size_default = sizeof(__u32); 7179 + 7180 + if (off < 0 || off >= sizeof(struct sk_reuseport_md) || 7181 + off % size || type != BPF_READ) 7182 + return false; 7183 + 7184 + switch (off) { 7185 + case offsetof(struct sk_reuseport_md, data): 7186 + info->reg_type = PTR_TO_PACKET; 7187 + return size == sizeof(__u64); 7188 + 7189 + case offsetof(struct sk_reuseport_md, data_end): 7190 + info->reg_type = PTR_TO_PACKET_END; 7191 + return size == sizeof(__u64); 7192 + 7193 + case offsetof(struct sk_reuseport_md, hash): 7194 + return size == size_default; 7195 + 7196 + /* Fields that allow narrowing */ 7197 + case offsetof(struct sk_reuseport_md, eth_protocol): 7198 + if (size < FIELD_SIZEOF(struct sk_buff, protocol)) 7199 + return false; 7200 + case offsetof(struct sk_reuseport_md, ip_protocol): 7201 + case offsetof(struct 
sk_reuseport_md, bind_inany): 7202 + case offsetof(struct sk_reuseport_md, len): 7203 + bpf_ctx_record_field_size(info, size_default); 7204 + return bpf_ctx_narrow_access_ok(off, size, size_default); 7205 + 7206 + default: 7207 + return false; 7208 + } 7209 + } 7210 + 7211 + #define SK_REUSEPORT_LOAD_FIELD(F) ({ \ 7212 + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_reuseport_kern, F), \ 7213 + si->dst_reg, si->src_reg, \ 7214 + bpf_target_off(struct sk_reuseport_kern, F, \ 7215 + FIELD_SIZEOF(struct sk_reuseport_kern, F), \ 7216 + target_size)); \ 7217 + }) 7218 + 7219 + #define SK_REUSEPORT_LOAD_SKB_FIELD(SKB_FIELD) \ 7220 + SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern, \ 7221 + struct sk_buff, \ 7222 + skb, \ 7223 + SKB_FIELD) 7224 + 7225 + #define SK_REUSEPORT_LOAD_SK_FIELD_SIZE_OFF(SK_FIELD, BPF_SIZE, EXTRA_OFF) \ 7226 + SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(struct sk_reuseport_kern, \ 7227 + struct sock, \ 7228 + sk, \ 7229 + SK_FIELD, BPF_SIZE, EXTRA_OFF) 7230 + 7231 + static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type, 7232 + const struct bpf_insn *si, 7233 + struct bpf_insn *insn_buf, 7234 + struct bpf_prog *prog, 7235 + u32 *target_size) 7236 + { 7237 + struct bpf_insn *insn = insn_buf; 7238 + 7239 + switch (si->off) { 7240 + case offsetof(struct sk_reuseport_md, data): 7241 + SK_REUSEPORT_LOAD_SKB_FIELD(data); 7242 + break; 7243 + 7244 + case offsetof(struct sk_reuseport_md, len): 7245 + SK_REUSEPORT_LOAD_SKB_FIELD(len); 7246 + break; 7247 + 7248 + case offsetof(struct sk_reuseport_md, eth_protocol): 7249 + SK_REUSEPORT_LOAD_SKB_FIELD(protocol); 7250 + break; 7251 + 7252 + case offsetof(struct sk_reuseport_md, ip_protocol): 7253 + BUILD_BUG_ON(hweight_long(SK_FL_PROTO_MASK) != BITS_PER_BYTE); 7254 + SK_REUSEPORT_LOAD_SK_FIELD_SIZE_OFF(__sk_flags_offset, 7255 + BPF_W, 0); 7256 + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); 7257 + *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 7258 + SK_FL_PROTO_SHIFT); 7259 + 
/* SK_FL_PROTO_MASK and SK_FL_PROTO_SHIFT are endian 7260 + * aware. No further narrowing or masking is needed. 7261 + */ 7262 + *target_size = 1; 7263 + break; 7264 + 7265 + case offsetof(struct sk_reuseport_md, data_end): 7266 + SK_REUSEPORT_LOAD_FIELD(data_end); 7267 + break; 7268 + 7269 + case offsetof(struct sk_reuseport_md, hash): 7270 + SK_REUSEPORT_LOAD_FIELD(hash); 7271 + break; 7272 + 7273 + case offsetof(struct sk_reuseport_md, bind_inany): 7274 + SK_REUSEPORT_LOAD_FIELD(bind_inany); 7275 + break; 7276 + } 7277 + 7278 + return insn - insn_buf; 7279 + } 7280 + 7281 + const struct bpf_verifier_ops sk_reuseport_verifier_ops = { 7282 + .get_func_proto = sk_reuseport_func_proto, 7283 + .is_valid_access = sk_reuseport_is_valid_access, 7284 + .convert_ctx_access = sk_reuseport_convert_ctx_access, 7285 + }; 7286 + 7287 + const struct bpf_prog_ops sk_reuseport_prog_ops = { 7288 + }; 7289 + #endif /* CONFIG_INET */
+2 -1
net/core/skbuff.c
··· 1291 1291 } 1292 1292 EXPORT_SYMBOL(skb_clone); 1293 1293 1294 - static void skb_headers_offset_update(struct sk_buff *skb, int off) 1294 + void skb_headers_offset_update(struct sk_buff *skb, int off) 1295 1295 { 1296 1296 /* Only adjust this if it actually is csum_start rather than csum */ 1297 1297 if (skb->ip_summed == CHECKSUM_PARTIAL) ··· 1305 1305 skb->inner_network_header += off; 1306 1306 skb->inner_mac_header += off; 1307 1307 } 1308 + EXPORT_SYMBOL(skb_headers_offset_update); 1308 1309 1309 1310 void skb_copy_header(struct sk_buff *new, const struct sk_buff *old) 1310 1311 {
+76 -16
net/core/sock_reuseport.c
··· 8 8 9 9 #include <net/sock_reuseport.h> 10 10 #include <linux/bpf.h> 11 + #include <linux/idr.h> 12 + #include <linux/filter.h> 11 13 #include <linux/rcupdate.h> 12 14 13 15 #define INIT_SOCKS 128 14 16 15 - static DEFINE_SPINLOCK(reuseport_lock); 17 + DEFINE_SPINLOCK(reuseport_lock); 18 + 19 + #define REUSEPORT_MIN_ID 1 20 + static DEFINE_IDA(reuseport_ida); 21 + 22 + int reuseport_get_id(struct sock_reuseport *reuse) 23 + { 24 + int id; 25 + 26 + if (reuse->reuseport_id) 27 + return reuse->reuseport_id; 28 + 29 + id = ida_simple_get(&reuseport_ida, REUSEPORT_MIN_ID, 0, 30 + /* Called under reuseport_lock */ 31 + GFP_ATOMIC); 32 + if (id < 0) 33 + return id; 34 + 35 + reuse->reuseport_id = id; 36 + 37 + return reuse->reuseport_id; 38 + } 16 39 17 40 static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks) 18 41 { ··· 52 29 return reuse; 53 30 } 54 31 55 - int reuseport_alloc(struct sock *sk) 32 + int reuseport_alloc(struct sock *sk, bool bind_inany) 56 33 { 57 34 struct sock_reuseport *reuse; 58 35 ··· 64 41 /* Allocation attempts can occur concurrently via the setsockopt path 65 42 * and the bind/hash path. Nothing to do when we lose the race. 66 43 */ 67 - if (rcu_dereference_protected(sk->sk_reuseport_cb, 68 - lockdep_is_held(&reuseport_lock))) 44 + reuse = rcu_dereference_protected(sk->sk_reuseport_cb, 45 + lockdep_is_held(&reuseport_lock)); 46 + if (reuse) { 47 + /* Only set reuse->bind_inany if the bind_inany is true. 48 + * Otherwise, it will overwrite the reuse->bind_inany 49 + * which was set by the bind/hash path. 
50 + */ 51 + if (bind_inany) 52 + reuse->bind_inany = bind_inany; 69 53 goto out; 54 + } 70 55 71 56 reuse = __reuseport_alloc(INIT_SOCKS); 72 57 if (!reuse) { ··· 84 53 85 54 reuse->socks[0] = sk; 86 55 reuse->num_socks = 1; 56 + reuse->bind_inany = bind_inany; 87 57 rcu_assign_pointer(sk->sk_reuseport_cb, reuse); 88 58 89 59 out: ··· 110 78 more_reuse->max_socks = more_socks_size; 111 79 more_reuse->num_socks = reuse->num_socks; 112 80 more_reuse->prog = reuse->prog; 81 + more_reuse->reuseport_id = reuse->reuseport_id; 82 + more_reuse->bind_inany = reuse->bind_inany; 113 83 114 84 memcpy(more_reuse->socks, reuse->socks, 115 85 reuse->num_socks * sizeof(struct sock *)); 86 + more_reuse->synq_overflow_ts = READ_ONCE(reuse->synq_overflow_ts); 116 87 117 88 for (i = 0; i < reuse->num_socks; ++i) 118 89 rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb, ··· 134 99 struct sock_reuseport *reuse; 135 100 136 101 reuse = container_of(head, struct sock_reuseport, rcu); 137 - if (reuse->prog) 138 - bpf_prog_destroy(reuse->prog); 102 + sk_reuseport_prog_free(rcu_dereference_protected(reuse->prog, 1)); 103 + if (reuse->reuseport_id) 104 + ida_simple_remove(&reuseport_ida, reuse->reuseport_id); 139 105 kfree(reuse); 140 106 } 141 107 ··· 146 110 * @sk2: Socket belonging to the existing reuseport group. 147 111 * May return ENOMEM and not add socket to group under memory pressure. 
148 112 */ 149 - int reuseport_add_sock(struct sock *sk, struct sock *sk2) 113 + int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany) 150 114 { 151 115 struct sock_reuseport *old_reuse, *reuse; 152 116 153 117 if (!rcu_access_pointer(sk2->sk_reuseport_cb)) { 154 - int err = reuseport_alloc(sk2); 118 + int err = reuseport_alloc(sk2, bind_inany); 155 119 156 120 if (err) 157 121 return err; ··· 196 160 spin_lock_bh(&reuseport_lock); 197 161 reuse = rcu_dereference_protected(sk->sk_reuseport_cb, 198 162 lockdep_is_held(&reuseport_lock)); 163 + 164 + /* At least one of the sk in this reuseport group is added to 165 + * a bpf map. Notify the bpf side. The bpf map logic will 166 + * remove the sk if it is indeed added to a bpf map. 167 + */ 168 + if (reuse->reuseport_id) 169 + bpf_sk_reuseport_detach(sk); 170 + 199 171 rcu_assign_pointer(sk->sk_reuseport_cb, NULL); 200 172 201 173 for (i = 0; i < reuse->num_socks; i++) { ··· 219 175 } 220 176 EXPORT_SYMBOL(reuseport_detach_sock); 221 177 222 - static struct sock *run_bpf(struct sock_reuseport *reuse, u16 socks, 223 - struct bpf_prog *prog, struct sk_buff *skb, 224 - int hdr_len) 178 + static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks, 179 + struct bpf_prog *prog, struct sk_buff *skb, 180 + int hdr_len) 225 181 { 226 182 struct sk_buff *nskb = NULL; 227 183 u32 index; ··· 282 238 /* paired with smp_wmb() in reuseport_add_sock() */ 283 239 smp_rmb(); 284 240 285 - if (prog && skb) 286 - sk2 = run_bpf(reuse, socks, prog, skb, hdr_len); 241 + if (!prog || !skb) 242 + goto select_by_hash; 287 243 244 + if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) 245 + sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, hash); 246 + else 247 + sk2 = run_bpf_filter(reuse, socks, prog, skb, hdr_len); 248 + 249 + select_by_hash: 288 250 /* no bpf or invalid bpf result: fall back to hash usage */ 289 251 if (!sk2) 290 252 sk2 = reuse->socks[reciprocal_scale(hash, socks)]; ··· 302 252 } 303 253 
EXPORT_SYMBOL(reuseport_select_sock); 304 254 305 - struct bpf_prog * 306 - reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog) 255 + int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog) 307 256 { 308 257 struct sock_reuseport *reuse; 309 258 struct bpf_prog *old_prog; 259 + 260 + if (sk_unhashed(sk) && sk->sk_reuseport) { 261 + int err = reuseport_alloc(sk, false); 262 + 263 + if (err) 264 + return err; 265 + } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) { 266 + /* The socket wasn't bound with SO_REUSEPORT */ 267 + return -EINVAL; 268 + } 310 269 311 270 spin_lock_bh(&reuseport_lock); 312 271 reuse = rcu_dereference_protected(sk->sk_reuseport_cb, ··· 325 266 rcu_assign_pointer(reuse->prog, prog); 326 267 spin_unlock_bh(&reuseport_lock); 327 268 328 - return old_prog; 269 + sk_reuseport_prog_free(old_prog); 270 + return 0; 329 271 } 330 272 EXPORT_SYMBOL(reuseport_attach_prog);
+5 -4
net/core/xdp.c
··· 330 330 /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */ 331 331 xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); 332 332 page = virt_to_head_page(data); 333 - if (xa) 333 + if (xa) { 334 + napi_direct &= !xdp_return_frame_no_direct(); 334 335 page_pool_put_page(xa->page_pool, page, napi_direct); 335 - else 336 + } else { 336 337 put_page(page); 338 + } 337 339 rcu_read_unlock(); 338 340 break; 339 341 case MEM_TYPE_PAGE_SHARED: ··· 350 348 rcu_read_lock(); 351 349 /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */ 352 350 xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); 353 - if (!WARN_ON_ONCE(!xa)) 354 - xa->zc_alloc->free(xa->zc_alloc, handle); 351 + xa->zc_alloc->free(xa->zc_alloc, handle); 355 352 rcu_read_unlock(); 356 353 default: 357 354 /* Not possible, checked in xdp_rxq_info_reg_mem_model() */
+9
net/ipv4/inet_connection_sock.c
··· 107 107 } 108 108 EXPORT_SYMBOL(inet_rcv_saddr_equal); 109 109 110 + bool inet_rcv_saddr_any(const struct sock *sk) 111 + { 112 + #if IS_ENABLED(CONFIG_IPV6) 113 + if (sk->sk_family == AF_INET6) 114 + return ipv6_addr_any(&sk->sk_v6_rcv_saddr); 115 + #endif 116 + return !sk->sk_rcv_saddr; 117 + } 118 + 110 119 void inet_get_local_port_range(struct net *net, int *low, int *high) 111 120 { 112 121 unsigned int seq;
+12 -7
net/ipv4/inet_hashtables.c
··· 328 328 saddr, sport, daddr, hnum, 329 329 dif, sdif); 330 330 if (result) 331 - return result; 331 + goto done; 332 332 333 333 /* Lookup lhash2 with INADDR_ANY */ 334 334 ··· 337 337 if (ilb2->count > ilb->count) 338 338 goto port_lookup; 339 339 340 - return inet_lhash2_lookup(net, ilb2, skb, doff, 341 - saddr, sport, daddr, hnum, 342 - dif, sdif); 340 + result = inet_lhash2_lookup(net, ilb2, skb, doff, 341 + saddr, sport, daddr, hnum, 342 + dif, sdif); 343 + goto done; 343 344 344 345 port_lookup: 345 346 sk_for_each_rcu(sk, &ilb->head) { ··· 353 352 result = reuseport_select_sock(sk, phash, 354 353 skb, doff); 355 354 if (result) 356 - return result; 355 + goto done; 357 356 } 358 357 result = sk; 359 358 hiscore = score; 360 359 } 361 360 } 361 + done: 362 + if (unlikely(IS_ERR(result))) 363 + return NULL; 362 364 return result; 363 365 } 364 366 EXPORT_SYMBOL_GPL(__inet_lookup_listener); ··· 571 567 inet_csk(sk2)->icsk_bind_hash == tb && 572 568 sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && 573 569 inet_rcv_saddr_equal(sk, sk2, false)) 574 - return reuseport_add_sock(sk, sk2); 570 + return reuseport_add_sock(sk, sk2, 571 + inet_rcv_saddr_any(sk)); 575 572 } 576 573 577 - return reuseport_alloc(sk); 574 + return reuseport_alloc(sk, inet_rcv_saddr_any(sk)); 578 575 } 579 576 580 577 int __inet_hash(struct sock *sk, struct sock *osk)
+7 -2
net/ipv4/udp.c
··· 221 221 (sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && 222 222 sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && 223 223 inet_rcv_saddr_equal(sk, sk2, false)) { 224 - return reuseport_add_sock(sk, sk2); 224 + return reuseport_add_sock(sk, sk2, 225 + inet_rcv_saddr_any(sk)); 225 226 } 226 227 } 227 228 228 - return reuseport_alloc(sk); 229 + return reuseport_alloc(sk, inet_rcv_saddr_any(sk)); 229 230 } 230 231 231 232 /** ··· 499 498 daddr, hnum, dif, sdif, 500 499 exact_dif, hslot2, skb); 501 500 } 501 + if (unlikely(IS_ERR(result))) 502 + return NULL; 502 503 return result; 503 504 } 504 505 begin: ··· 515 512 saddr, sport); 516 513 result = reuseport_select_sock(sk, hash, skb, 517 514 sizeof(struct udphdr)); 515 + if (unlikely(IS_ERR(result))) 516 + return NULL; 518 517 if (result) 519 518 return result; 520 519 }
+9 -5
net/ipv6/inet6_hashtables.c
··· 191 191 saddr, sport, daddr, hnum, 192 192 dif, sdif); 193 193 if (result) 194 - return result; 194 + goto done; 195 195 196 196 /* Lookup lhash2 with in6addr_any */ 197 197 ··· 200 200 if (ilb2->count > ilb->count) 201 201 goto port_lookup; 202 202 203 - return inet6_lhash2_lookup(net, ilb2, skb, doff, 204 - saddr, sport, daddr, hnum, 205 - dif, sdif); 203 + result = inet6_lhash2_lookup(net, ilb2, skb, doff, 204 + saddr, sport, daddr, hnum, 205 + dif, sdif); 206 + goto done; 206 207 207 208 port_lookup: 208 209 sk_for_each(sk, &ilb->head) { ··· 215 214 result = reuseport_select_sock(sk, phash, 216 215 skb, doff); 217 216 if (result) 218 - return result; 217 + goto done; 219 218 } 220 219 result = sk; 221 220 hiscore = score; 222 221 } 223 222 } 223 + done: 224 + if (unlikely(IS_ERR(result))) 225 + return NULL; 224 226 return result; 225 227 } 226 228 EXPORT_SYMBOL_GPL(inet6_lookup_listener);
+4
net/ipv6/udp.c
··· 235 235 exact_dif, hslot2, 236 236 skb); 237 237 } 238 + if (unlikely(IS_ERR(result))) 239 + return NULL; 238 240 return result; 239 241 } 240 242 begin: ··· 251 249 saddr, sport); 252 250 result = reuseport_select_sock(sk, hash, skb, 253 251 sizeof(struct udphdr)); 252 + if (unlikely(IS_ERR(result))) 253 + return NULL; 254 254 if (result) 255 255 return result; 256 256 }
+55
samples/bpf/hash_func01.h
··· 1 + /* SPDX-License-Identifier: LGPL-2.1 2 + * 3 + * Based on Paul Hsieh's (LGPG 2.1) hash function 4 + * From: http://www.azillionmonkeys.com/qed/hash.html 5 + */ 6 + 7 + #define get16bits(d) (*((const __u16 *) (d))) 8 + 9 + static __always_inline 10 + __u32 SuperFastHash (const char *data, int len, __u32 initval) { 11 + __u32 hash = initval; 12 + __u32 tmp; 13 + int rem; 14 + 15 + if (len <= 0 || data == NULL) return 0; 16 + 17 + rem = len & 3; 18 + len >>= 2; 19 + 20 + /* Main loop */ 21 + #pragma clang loop unroll(full) 22 + for (;len > 0; len--) { 23 + hash += get16bits (data); 24 + tmp = (get16bits (data+2) << 11) ^ hash; 25 + hash = (hash << 16) ^ tmp; 26 + data += 2*sizeof (__u16); 27 + hash += hash >> 11; 28 + } 29 + 30 + /* Handle end cases */ 31 + switch (rem) { 32 + case 3: hash += get16bits (data); 33 + hash ^= hash << 16; 34 + hash ^= ((signed char)data[sizeof (__u16)]) << 18; 35 + hash += hash >> 11; 36 + break; 37 + case 2: hash += get16bits (data); 38 + hash ^= hash << 11; 39 + hash += hash >> 17; 40 + break; 41 + case 1: hash += (signed char)*data; 42 + hash ^= hash << 10; 43 + hash += hash >> 1; 44 + } 45 + 46 + /* Force "avalanching" of final 127 bits */ 47 + hash ^= hash << 3; 48 + hash += hash >> 5; 49 + hash ^= hash << 4; 50 + hash += hash >> 17; 51 + hash ^= hash << 25; 52 + hash += hash >> 6; 53 + 54 + return hash; 55 + }
+103
samples/bpf/xdp_redirect_cpu_kern.c
··· 13 13 14 14 #include <uapi/linux/bpf.h> 15 15 #include "bpf_helpers.h" 16 + #include "hash_func01.h" 16 17 17 18 #define MAX_CPUS 64 /* WARNING - sync with _user.c */ 18 19 ··· 462 461 return bpf_redirect_map(&cpu_map, cpu_dest, 0); 463 462 } 464 463 464 + /* Hashing initval */ 465 + #define INITVAL 15485863 466 + 467 + static __always_inline 468 + u32 get_ipv4_hash_ip_pair(struct xdp_md *ctx, u64 nh_off) 469 + { 470 + void *data_end = (void *)(long)ctx->data_end; 471 + void *data = (void *)(long)ctx->data; 472 + struct iphdr *iph = data + nh_off; 473 + u32 cpu_hash; 474 + 475 + if (iph + 1 > data_end) 476 + return 0; 477 + 478 + cpu_hash = iph->saddr + iph->daddr; 479 + cpu_hash = SuperFastHash((char *)&cpu_hash, 4, INITVAL + iph->protocol); 480 + 481 + return cpu_hash; 482 + } 483 + 484 + static __always_inline 485 + u32 get_ipv6_hash_ip_pair(struct xdp_md *ctx, u64 nh_off) 486 + { 487 + void *data_end = (void *)(long)ctx->data_end; 488 + void *data = (void *)(long)ctx->data; 489 + struct ipv6hdr *ip6h = data + nh_off; 490 + u32 cpu_hash; 491 + 492 + if (ip6h + 1 > data_end) 493 + return 0; 494 + 495 + cpu_hash = ip6h->saddr.s6_addr32[0] + ip6h->daddr.s6_addr32[0]; 496 + cpu_hash += ip6h->saddr.s6_addr32[1] + ip6h->daddr.s6_addr32[1]; 497 + cpu_hash += ip6h->saddr.s6_addr32[2] + ip6h->daddr.s6_addr32[2]; 498 + cpu_hash += ip6h->saddr.s6_addr32[3] + ip6h->daddr.s6_addr32[3]; 499 + cpu_hash = SuperFastHash((char *)&cpu_hash, 4, INITVAL + ip6h->nexthdr); 500 + 501 + return cpu_hash; 502 + } 503 + 504 + /* Load-Balance traffic based on hashing IP-addrs + L4-proto. The 505 + * hashing scheme is symmetric, meaning swapping IP src/dest still hit 506 + * same CPU. 
507 + */ 508 + SEC("xdp_cpu_map5_lb_hash_ip_pairs") 509 + int xdp_prognum5_lb_hash_ip_pairs(struct xdp_md *ctx) 510 + { 511 + void *data_end = (void *)(long)ctx->data_end; 512 + void *data = (void *)(long)ctx->data; 513 + struct ethhdr *eth = data; 514 + u8 ip_proto = IPPROTO_UDP; 515 + struct datarec *rec; 516 + u16 eth_proto = 0; 517 + u64 l3_offset = 0; 518 + u32 cpu_dest = 0; 519 + u32 cpu_idx = 0; 520 + u32 *cpu_lookup; 521 + u32 *cpu_max; 522 + u32 cpu_hash; 523 + u32 key = 0; 524 + 525 + /* Count RX packet in map */ 526 + rec = bpf_map_lookup_elem(&rx_cnt, &key); 527 + if (!rec) 528 + return XDP_ABORTED; 529 + rec->processed++; 530 + 531 + cpu_max = bpf_map_lookup_elem(&cpus_count, &key); 532 + if (!cpu_max) 533 + return XDP_ABORTED; 534 + 535 + if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset))) 536 + return XDP_PASS; /* Just skip */ 537 + 538 + /* Hash for IPv4 and IPv6 */ 539 + switch (eth_proto) { 540 + case ETH_P_IP: 541 + cpu_hash = get_ipv4_hash_ip_pair(ctx, l3_offset); 542 + break; 543 + case ETH_P_IPV6: 544 + cpu_hash = get_ipv6_hash_ip_pair(ctx, l3_offset); 545 + break; 546 + case ETH_P_ARP: /* ARP packet handled on CPU idx 0 */ 547 + default: 548 + cpu_hash = 0; 549 + } 550 + 551 + /* Choose CPU based on hash */ 552 + cpu_idx = cpu_hash % *cpu_max; 553 + 554 + cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx); 555 + if (!cpu_lookup) 556 + return XDP_ABORTED; 557 + cpu_dest = *cpu_lookup; 558 + 559 + if (cpu_dest >= MAX_CPUS) { 560 + rec->issue++; 561 + return XDP_ABORTED; 562 + } 563 + 564 + return bpf_redirect_map(&cpu_map, cpu_dest, 0); 565 + } 465 566 466 567 char _license[] SEC("license") = "GPL"; 467 568
+2 -2
samples/bpf/xdp_redirect_cpu_user.c
··· 22 22 #define MAX_CPUS 64 /* WARNING - sync with _kern.c */ 23 23 24 24 /* How many xdp_progs are defined in _kern.c */ 25 - #define MAX_PROG 5 25 + #define MAX_PROG 6 26 26 27 27 /* Wanted to get rid of bpf_load.h and fake-"libbpf.h" (and instead 28 28 * use bpf/libbpf.h), but cannot as (currently) needed for XDP ··· 567 567 int added_cpus = 0; 568 568 int longindex = 0; 569 569 int interval = 2; 570 - int prog_num = 0; 570 + int prog_num = 5; 571 571 int add_cpu = -1; 572 572 __u32 qsize; 573 573 int opt;
+55 -1
tools/include/uapi/linux/bpf.h
··· 126 126 BPF_MAP_TYPE_XSKMAP, 127 127 BPF_MAP_TYPE_SOCKHASH, 128 128 BPF_MAP_TYPE_CGROUP_STORAGE, 129 + BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, 129 130 }; 130 131 131 132 enum bpf_prog_type { ··· 151 150 BPF_PROG_TYPE_CGROUP_SOCK_ADDR, 152 151 BPF_PROG_TYPE_LWT_SEG6LOCAL, 153 152 BPF_PROG_TYPE_LIRC_MODE2, 153 + BPF_PROG_TYPE_SK_REUSEPORT, 154 154 }; 155 155 156 156 enum bpf_attach_type { ··· 2093 2091 * Return 2094 2092 * The id is returned or 0 in case the id could not be retrieved. 2095 2093 * 2094 + * u64 bpf_skb_ancestor_cgroup_id(struct sk_buff *skb, int ancestor_level) 2095 + * Description 2096 + * Return id of cgroup v2 that is ancestor of cgroup associated 2097 + * with the *skb* at the *ancestor_level*. The root cgroup is at 2098 + * *ancestor_level* zero and each step down the hierarchy 2099 + * increments the level. If *ancestor_level* == level of cgroup 2100 + * associated with *skb*, then return value will be same as that 2101 + * of **bpf_skb_cgroup_id**\ (). 2102 + * 2103 + * The helper is useful to implement policies based on cgroups 2104 + * that are upper in hierarchy than immediate cgroup associated 2105 + * with *skb*. 2106 + * 2107 + * The format of returned id and helper limitations are same as in 2108 + * **bpf_skb_cgroup_id**\ (). 2109 + * Return 2110 + * The id is returned or 0 in case the id could not be retrieved. 2111 + * 2096 2112 * u64 bpf_get_current_cgroup_id(void) 2097 2113 * Return 2098 2114 * A 64-bit integer containing the current cgroup id based ··· 2133 2113 * the shared data. 2134 2114 * Return 2135 2115 * Pointer to the local storage area. 2116 + * 2117 + * int bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags) 2118 + * Description 2119 + * Select a SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_ARRAY map 2120 + * It checks the selected sk is matching the incoming 2121 + * request in the skb. 2122 + * Return 2123 + * 0 on success, or a negative error in case of failure. 
2136 2124 */ 2137 2125 #define __BPF_FUNC_MAPPER(FN) \ 2138 2126 FN(unspec), \ ··· 2224 2196 FN(rc_keydown), \ 2225 2197 FN(skb_cgroup_id), \ 2226 2198 FN(get_current_cgroup_id), \ 2227 - FN(get_local_storage), 2199 + FN(get_local_storage), \ 2200 + FN(sk_select_reuseport), \ 2201 + FN(skb_ancestor_cgroup_id), 2228 2202 2229 2203 /* integer value in 'imm' field of BPF_CALL instruction selects which helper 2230 2204 * function eBPF program intends to call ··· 2441 2411 __u32 local_ip6[4]; /* Stored in network byte order */ 2442 2412 __u32 remote_port; /* Stored in network byte order */ 2443 2413 __u32 local_port; /* stored in host byte order */ 2414 + }; 2415 + 2416 + struct sk_reuseport_md { 2417 + /* 2418 + * Start of directly accessible data. It begins from 2419 + * the tcp/udp header. 2420 + */ 2421 + void *data; 2422 + void *data_end; /* End of directly accessible data */ 2423 + /* 2424 + * Total length of packet (starting from the tcp/udp header). 2425 + * Note that the directly accessible bytes (data_end - data) 2426 + * could be less than this "len". Those bytes could be 2427 + * indirectly read by a helper "bpf_skb_load_bytes()". 2428 + */ 2429 + __u32 len; 2430 + /* 2431 + * Eth protocol in the mac header (network byte order). e.g. 2432 + * ETH_P_IP(0x0800) and ETH_P_IPV6(0x86DD) 2433 + */ 2434 + __u32 eth_protocol; 2435 + __u32 ip_protocol; /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */ 2436 + __u32 bind_inany; /* Is sock bound to an INANY address? */ 2437 + __u32 hash; /* A hash of the packet 4 tuples */ 2444 2438 }; 2445 2439 2446 2440 #define BPF_TAG_SIZE 8
+1
tools/lib/bpf/bpf.c
··· 92 92 attr.btf_key_type_id = create_attr->btf_key_type_id; 93 93 attr.btf_value_type_id = create_attr->btf_value_type_id; 94 94 attr.map_ifindex = create_attr->map_ifindex; 95 + attr.inner_map_fd = create_attr->inner_map_fd; 95 96 96 97 return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); 97 98 }
+1
tools/lib/bpf/bpf.h
··· 39 39 __u32 btf_key_type_id; 40 40 __u32 btf_value_type_id; 41 41 __u32 map_ifindex; 42 + __u32 inner_map_fd; 42 43 }; 43 44 44 45 int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr);
+1
tools/lib/bpf/libbpf.c
··· 1501 1501 case BPF_PROG_TYPE_SK_MSG: 1502 1502 case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: 1503 1503 case BPF_PROG_TYPE_LIRC_MODE2: 1504 + case BPF_PROG_TYPE_SK_REUSEPORT: 1504 1505 return false; 1505 1506 case BPF_PROG_TYPE_UNSPEC: 1506 1507 case BPF_PROG_TYPE_KPROBE:
+7 -4
tools/testing/selftests/bpf/Makefile
··· 23 23 TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \ 24 24 test_align test_verifier_log test_dev_cgroup test_tcpbpf_user \ 25 25 test_sock test_btf test_sockmap test_lirc_mode2_user get_cgroup_id_user \ 26 - test_socket_cookie test_cgroup_storage 26 + test_socket_cookie test_cgroup_storage test_select_reuseport 27 27 28 28 TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test_obj_id.o \ 29 29 test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o \ ··· 34 34 test_btf_haskv.o test_btf_nokv.o test_sockmap_kern.o test_tunnel_kern.o \ 35 35 test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o \ 36 36 test_lwt_seg6local.o sendmsg4_prog.o sendmsg6_prog.o test_lirc_mode2_kern.o \ 37 - get_cgroup_id_kern.o socket_cookie_prog.o 37 + get_cgroup_id_kern.o socket_cookie_prog.o test_select_reuseport_kern.o \ 38 + test_skb_cgroup_id_kern.o 38 39 39 40 # Order correspond to 'make run_tests' order 40 41 TEST_PROGS := test_kmod.sh \ ··· 46 45 test_sock_addr.sh \ 47 46 test_tunnel.sh \ 48 47 test_lwt_seg6local.sh \ 49 - test_lirc_mode2.sh 48 + test_lirc_mode2.sh \ 49 + test_skb_cgroup_id.sh 50 50 51 51 # Compile but not part of 'make run_tests' 52 - TEST_GEN_PROGS_EXTENDED = test_libbpf_open test_sock_addr 52 + TEST_GEN_PROGS_EXTENDED = test_libbpf_open test_sock_addr test_skb_cgroup_id_user 53 53 54 54 include ../lib.mk 55 55 ··· 61 59 $(TEST_GEN_PROGS_EXTENDED): $(OUTPUT)/libbpf.a 62 60 63 61 $(OUTPUT)/test_dev_cgroup: cgroup_helpers.c 62 + $(OUTPUT)/test_skb_cgroup_id_user: cgroup_helpers.c 64 63 $(OUTPUT)/test_sock: cgroup_helpers.c 65 64 $(OUTPUT)/test_sock_addr: cgroup_helpers.c 66 65 $(OUTPUT)/test_socket_cookie: cgroup_helpers.c
+8
tools/testing/selftests/bpf/bpf_helpers.h
··· 111 111 static int (*bpf_skb_get_xfrm_state)(void *ctx, int index, void *state, 112 112 int size, int flags) = 113 113 (void *) BPF_FUNC_skb_get_xfrm_state; 114 + static int (*bpf_sk_select_reuseport)(void *ctx, void *map, void *key, __u32 flags) = 115 + (void *) BPF_FUNC_sk_select_reuseport; 114 116 static int (*bpf_get_stack)(void *ctx, void *buf, int size, int flags) = 115 117 (void *) BPF_FUNC_get_stack; 116 118 static int (*bpf_fib_lookup)(void *ctx, struct bpf_fib_lookup *params, ··· 139 137 (void *) BPF_FUNC_get_current_cgroup_id; 140 138 static void *(*bpf_get_local_storage)(void *map, unsigned long long flags) = 141 139 (void *) BPF_FUNC_get_local_storage; 140 + static unsigned long long (*bpf_skb_cgroup_id)(void *ctx) = 141 + (void *) BPF_FUNC_skb_cgroup_id; 142 + static unsigned long long (*bpf_skb_ancestor_cgroup_id)(void *ctx, int level) = 143 + (void *) BPF_FUNC_skb_ancestor_cgroup_id; 142 144 143 145 /* llvm builtin functions that eBPF C program may use to 144 146 * emit BPF_LD_ABS and BPF_LD_IND instructions ··· 179 173 180 174 static int (*bpf_skb_load_bytes)(void *ctx, int off, void *to, int len) = 181 175 (void *) BPF_FUNC_skb_load_bytes; 176 + static int (*bpf_skb_load_bytes_relative)(void *ctx, int off, void *to, int len, __u32 start_header) = 177 + (void *) BPF_FUNC_skb_load_bytes_relative; 182 178 static int (*bpf_skb_store_bytes)(void *ctx, int off, void *from, int len, int flags) = 183 179 (void *) BPF_FUNC_skb_store_bytes; 184 180 static int (*bpf_l3_csum_replace)(void *ctx, int off, int from, int to, int flags) =
+4
tools/testing/selftests/bpf/bpf_util.h
··· 44 44 name[bpf_num_possible_cpus()] 45 45 #define bpf_percpu(name, cpu) name[(cpu)].v 46 46 47 + #ifndef ARRAY_SIZE 48 + # define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) 49 + #endif 50 + 47 51 #endif /* __BPF_UTIL__ */
+1 -4
tools/testing/selftests/bpf/test_align.c
··· 18 18 19 19 #include "../../../include/linux/filter.h" 20 20 #include "bpf_rlimit.h" 21 - 22 - #ifndef ARRAY_SIZE 23 - # define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) 24 - #endif 21 + #include "bpf_util.h" 25 22 26 23 #define MAX_INSNS 512 27 24 #define MAX_MATCHES 16
+73 -19
tools/testing/selftests/bpf/test_btf.c
··· 19 19 #include <bpf/btf.h> 20 20 21 21 #include "bpf_rlimit.h" 22 + #include "bpf_util.h" 22 23 23 24 static uint32_t pass_cnt; 24 25 static uint32_t error_cnt; ··· 94 93 #define MAX_NR_RAW_TYPES 1024 95 94 #define BTF_LOG_BUF_SIZE 65535 96 95 97 - #ifndef ARRAY_SIZE 98 - # define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) 99 - #endif 100 - 101 96 static struct args { 102 97 unsigned int raw_test_num; 103 98 unsigned int file_test_num; ··· 128 131 __u32 max_entries; 129 132 bool btf_load_err; 130 133 bool map_create_err; 134 + bool ordered_map; 135 + bool lossless_map; 131 136 int hdr_len_delta; 132 137 int type_off_delta; 133 138 int str_off_delta; ··· 2092 2093 } aenum; 2093 2094 }; 2094 2095 2095 - static struct btf_raw_test pprint_test = { 2096 - .descr = "BTF pretty print test #1", 2096 + static struct btf_raw_test pprint_test_template = { 2097 2097 .raw_types = { 2098 2098 /* unsighed char */ /* [1] */ 2099 2099 BTF_TYPE_INT_ENC(NAME_TBD, 0, 0, 8, 1), ··· 2144 2146 }, 2145 2147 .str_sec = "\0unsigned char\0unsigned short\0unsigned int\0int\0unsigned long long\0uint8_t\0uint16_t\0uint32_t\0int32_t\0uint64_t\0ui64\0ui8a\0ENUM_ZERO\0ENUM_ONE\0ENUM_TWO\0ENUM_THREE\0pprint_mapv\0ui32\0ui16\0si32\0unused_bits2a\0bits28\0unused_bits2b\0aenum", 2146 2148 .str_sec_size = sizeof("\0unsigned char\0unsigned short\0unsigned int\0int\0unsigned long long\0uint8_t\0uint16_t\0uint32_t\0int32_t\0uint64_t\0ui64\0ui8a\0ENUM_ZERO\0ENUM_ONE\0ENUM_TWO\0ENUM_THREE\0pprint_mapv\0ui32\0ui16\0si32\0unused_bits2a\0bits28\0unused_bits2b\0aenum"), 2147 - .map_type = BPF_MAP_TYPE_ARRAY, 2148 - .map_name = "pprint_test", 2149 2149 .key_size = sizeof(unsigned int), 2150 2150 .value_size = sizeof(struct pprint_mapv), 2151 2151 .key_type_id = 3, /* unsigned int */ 2152 2152 .value_type_id = 16, /* struct pprint_mapv */ 2153 2153 .max_entries = 128 * 1024, 2154 2154 }; 2155 + 2156 + static struct btf_pprint_test_meta { 2157 + const char *descr; 2158 + enum bpf_map_type map_type; 2159 + 
const char *map_name; 2160 + bool ordered_map; 2161 + bool lossless_map; 2162 + } pprint_tests_meta[] = { 2163 + { 2164 + .descr = "BTF pretty print array", 2165 + .map_type = BPF_MAP_TYPE_ARRAY, 2166 + .map_name = "pprint_test_array", 2167 + .ordered_map = true, 2168 + .lossless_map = true, 2169 + }, 2170 + 2171 + { 2172 + .descr = "BTF pretty print hash", 2173 + .map_type = BPF_MAP_TYPE_HASH, 2174 + .map_name = "pprint_test_hash", 2175 + .ordered_map = false, 2176 + .lossless_map = true, 2177 + }, 2178 + 2179 + { 2180 + .descr = "BTF pretty print lru hash", 2181 + .map_type = BPF_MAP_TYPE_LRU_HASH, 2182 + .map_name = "pprint_test_lru_hash", 2183 + .ordered_map = false, 2184 + .lossless_map = false, 2185 + }, 2186 + 2187 + }; 2188 + 2155 2189 2156 2190 static void set_pprint_mapv(struct pprint_mapv *v, uint32_t i) 2157 2191 { ··· 2196 2166 v->aenum = i & 0x03; 2197 2167 } 2198 2168 2199 - static int test_pprint(void) 2169 + static int do_test_pprint(void) 2200 2170 { 2201 - const struct btf_raw_test *test = &pprint_test; 2171 + const struct btf_raw_test *test = &pprint_test_template; 2202 2172 struct bpf_create_map_attr create_attr = {}; 2173 + unsigned int key, nr_read_elems; 2174 + bool ordered_map, lossless_map; 2203 2175 int map_fd = -1, btf_fd = -1; 2204 2176 struct pprint_mapv mapv = {}; 2205 2177 unsigned int raw_btf_size; ··· 2210 2178 char pin_path[255]; 2211 2179 size_t line_len = 0; 2212 2180 char *line = NULL; 2213 - unsigned int key; 2214 2181 uint8_t *raw_btf; 2215 2182 ssize_t nread; 2216 2183 int err, ret; ··· 2282 2251 goto done; 2283 2252 } 2284 2253 2285 - key = 0; 2254 + nr_read_elems = 0; 2255 + ordered_map = test->ordered_map; 2256 + lossless_map = test->lossless_map; 2286 2257 do { 2287 2258 ssize_t nexpected_line; 2259 + unsigned int next_key; 2288 2260 2289 - set_pprint_mapv(&mapv, key); 2261 + next_key = ordered_map ? 
nr_read_elems : atoi(line); 2262 + set_pprint_mapv(&mapv, next_key); 2290 2263 nexpected_line = snprintf(expected_line, sizeof(expected_line), 2291 2264 "%u: {%u,0,%d,0x%x,0x%x,0x%x,{%lu|[%u,%u,%u,%u,%u,%u,%u,%u]},%s}\n", 2292 - key, 2265 + next_key, 2293 2266 mapv.ui32, mapv.si32, 2294 2267 mapv.unused_bits2a, mapv.bits28, mapv.unused_bits2b, 2295 2268 mapv.ui64, ··· 2316 2281 } 2317 2282 2318 2283 nread = getline(&line, &line_len, pin_file); 2319 - } while (++key < test->max_entries && nread > 0); 2284 + } while (++nr_read_elems < test->max_entries && nread > 0); 2320 2285 2321 - if (CHECK(key < test->max_entries, 2322 - "Unexpected EOF. key:%u test->max_entries:%u", 2323 - key, test->max_entries)) { 2286 + if (lossless_map && 2287 + CHECK(nr_read_elems < test->max_entries, 2288 + "Unexpected EOF. nr_read_elems:%u test->max_entries:%u", 2289 + nr_read_elems, test->max_entries)) { 2324 2290 err = -1; 2325 2291 goto done; 2326 2292 } ··· 2346 2310 fclose(pin_file); 2347 2311 unlink(pin_path); 2348 2312 free(line); 2313 + 2314 + return err; 2315 + } 2316 + 2317 + static int test_pprint(void) 2318 + { 2319 + unsigned int i; 2320 + int err = 0; 2321 + 2322 + for (i = 0; i < ARRAY_SIZE(pprint_tests_meta); i++) { 2323 + pprint_test_template.descr = pprint_tests_meta[i].descr; 2324 + pprint_test_template.map_type = pprint_tests_meta[i].map_type; 2325 + pprint_test_template.map_name = pprint_tests_meta[i].map_name; 2326 + pprint_test_template.ordered_map = pprint_tests_meta[i].ordered_map; 2327 + pprint_test_template.lossless_map = pprint_tests_meta[i].lossless_map; 2328 + 2329 + err |= count_result(do_test_pprint()); 2330 + } 2349 2331 2350 2332 return err; 2351 2333 } ··· 2463 2409 err |= test_file(); 2464 2410 2465 2411 if (args.pprint_test) 2466 - err |= count_result(test_pprint()); 2412 + err |= test_pprint(); 2467 2413 2468 2414 if (args.raw_test || args.get_info_test || args.file_test || 2469 2415 args.pprint_test)
+261 -1
tools/testing/selftests/bpf/test_maps.c
··· 17 17 #include <stdlib.h> 18 18 19 19 #include <sys/wait.h> 20 - 20 + #include <sys/socket.h> 21 + #include <netinet/in.h> 21 22 #include <linux/bpf.h> 22 23 23 24 #include <bpf/bpf.h> ··· 27 26 #include "bpf_util.h" 28 27 #include "bpf_rlimit.h" 29 28 29 + #ifndef ENOTSUPP 30 + #define ENOTSUPP 524 31 + #endif 32 + 30 33 static int map_flags; 34 + 35 + #define CHECK(condition, tag, format...) ({ \ 36 + int __ret = !!(condition); \ 37 + if (__ret) { \ 38 + printf("%s(%d):FAIL:%s ", __func__, __LINE__, tag); \ 39 + printf(format); \ 40 + exit(-1); \ 41 + } \ 42 + }) 31 43 32 44 static void test_hashmap(int task, void *data) 33 45 { ··· 1164 1150 assert(bpf_map_get_next_key(fd, &key, &value) == -1 && errno == EPERM); 1165 1151 } 1166 1152 1153 + static void prepare_reuseport_grp(int type, int map_fd, 1154 + __s64 *fds64, __u64 *sk_cookies, 1155 + unsigned int n) 1156 + { 1157 + socklen_t optlen, addrlen; 1158 + struct sockaddr_in6 s6; 1159 + const __u32 index0 = 0; 1160 + const int optval = 1; 1161 + unsigned int i; 1162 + u64 sk_cookie; 1163 + __s64 fd64; 1164 + int err; 1165 + 1166 + s6.sin6_family = AF_INET6; 1167 + s6.sin6_addr = in6addr_any; 1168 + s6.sin6_port = 0; 1169 + addrlen = sizeof(s6); 1170 + optlen = sizeof(sk_cookie); 1171 + 1172 + for (i = 0; i < n; i++) { 1173 + fd64 = socket(AF_INET6, type, 0); 1174 + CHECK(fd64 == -1, "socket()", 1175 + "sock_type:%d fd64:%lld errno:%d\n", 1176 + type, fd64, errno); 1177 + 1178 + err = setsockopt(fd64, SOL_SOCKET, SO_REUSEPORT, 1179 + &optval, sizeof(optval)); 1180 + CHECK(err == -1, "setsockopt(SO_REUSEEPORT)", 1181 + "err:%d errno:%d\n", err, errno); 1182 + 1183 + /* reuseport_array does not allow unbound sk */ 1184 + err = bpf_map_update_elem(map_fd, &index0, &fd64, 1185 + BPF_ANY); 1186 + CHECK(err != -1 || errno != EINVAL, 1187 + "reuseport array update unbound sk", 1188 + "sock_type:%d err:%d errno:%d\n", 1189 + type, err, errno); 1190 + 1191 + err = bind(fd64, (struct sockaddr *)&s6, sizeof(s6)); 1192 + 
CHECK(err == -1, "bind()", 1193 + "sock_type:%d err:%d errno:%d\n", type, err, errno); 1194 + 1195 + if (i == 0) { 1196 + err = getsockname(fd64, (struct sockaddr *)&s6, 1197 + &addrlen); 1198 + CHECK(err == -1, "getsockname()", 1199 + "sock_type:%d err:%d errno:%d\n", 1200 + type, err, errno); 1201 + } 1202 + 1203 + err = getsockopt(fd64, SOL_SOCKET, SO_COOKIE, &sk_cookie, 1204 + &optlen); 1205 + CHECK(err == -1, "getsockopt(SO_COOKIE)", 1206 + "sock_type:%d err:%d errno:%d\n", type, err, errno); 1207 + 1208 + if (type == SOCK_STREAM) { 1209 + /* 1210 + * reuseport_array does not allow 1211 + * non-listening tcp sk. 1212 + */ 1213 + err = bpf_map_update_elem(map_fd, &index0, &fd64, 1214 + BPF_ANY); 1215 + CHECK(err != -1 || errno != EINVAL, 1216 + "reuseport array update non-listening sk", 1217 + "sock_type:%d err:%d errno:%d\n", 1218 + type, err, errno); 1219 + err = listen(fd64, 0); 1220 + CHECK(err == -1, "listen()", 1221 + "sock_type:%d, err:%d errno:%d\n", 1222 + type, err, errno); 1223 + } 1224 + 1225 + fds64[i] = fd64; 1226 + sk_cookies[i] = sk_cookie; 1227 + } 1228 + } 1229 + 1230 + static void test_reuseport_array(void) 1231 + { 1232 + #define REUSEPORT_FD_IDX(err, last) ({ (err) ? 
last : !last; }) 1233 + 1234 + const __u32 array_size = 4, index0 = 0, index3 = 3; 1235 + int types[2] = { SOCK_STREAM, SOCK_DGRAM }, type; 1236 + __u64 grpa_cookies[2], sk_cookie, map_cookie; 1237 + __s64 grpa_fds64[2] = { -1, -1 }, fd64 = -1; 1238 + const __u32 bad_index = array_size; 1239 + int map_fd, err, t, f; 1240 + __u32 fds_idx = 0; 1241 + int fd; 1242 + 1243 + map_fd = bpf_create_map(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, 1244 + sizeof(__u32), sizeof(__u64), array_size, 0); 1245 + CHECK(map_fd == -1, "reuseport array create", 1246 + "map_fd:%d, errno:%d\n", map_fd, errno); 1247 + 1248 + /* Test lookup/update/delete with invalid index */ 1249 + err = bpf_map_delete_elem(map_fd, &bad_index); 1250 + CHECK(err != -1 || errno != E2BIG, "reuseport array del >=max_entries", 1251 + "err:%d errno:%d\n", err, errno); 1252 + 1253 + err = bpf_map_update_elem(map_fd, &bad_index, &fd64, BPF_ANY); 1254 + CHECK(err != -1 || errno != E2BIG, 1255 + "reuseport array update >=max_entries", 1256 + "err:%d errno:%d\n", err, errno); 1257 + 1258 + err = bpf_map_lookup_elem(map_fd, &bad_index, &map_cookie); 1259 + CHECK(err != -1 || errno != ENOENT, 1260 + "reuseport array update >=max_entries", 1261 + "err:%d errno:%d\n", err, errno); 1262 + 1263 + /* Test lookup/delete non existence elem */ 1264 + err = bpf_map_lookup_elem(map_fd, &index3, &map_cookie); 1265 + CHECK(err != -1 || errno != ENOENT, 1266 + "reuseport array lookup not-exist elem", 1267 + "err:%d errno:%d\n", err, errno); 1268 + err = bpf_map_delete_elem(map_fd, &index3); 1269 + CHECK(err != -1 || errno != ENOENT, 1270 + "reuseport array del not-exist elem", 1271 + "err:%d errno:%d\n", err, errno); 1272 + 1273 + for (t = 0; t < ARRAY_SIZE(types); t++) { 1274 + type = types[t]; 1275 + 1276 + prepare_reuseport_grp(type, map_fd, grpa_fds64, 1277 + grpa_cookies, ARRAY_SIZE(grpa_fds64)); 1278 + 1279 + /* Test BPF_* update flags */ 1280 + /* BPF_EXIST failure case */ 1281 + err = bpf_map_update_elem(map_fd, &index3, 
&grpa_fds64[fds_idx], 1282 + BPF_EXIST); 1283 + CHECK(err != -1 || errno != ENOENT, 1284 + "reuseport array update empty elem BPF_EXIST", 1285 + "sock_type:%d err:%d errno:%d\n", 1286 + type, err, errno); 1287 + fds_idx = REUSEPORT_FD_IDX(err, fds_idx); 1288 + 1289 + /* BPF_NOEXIST success case */ 1290 + err = bpf_map_update_elem(map_fd, &index3, &grpa_fds64[fds_idx], 1291 + BPF_NOEXIST); 1292 + CHECK(err == -1, 1293 + "reuseport array update empty elem BPF_NOEXIST", 1294 + "sock_type:%d err:%d errno:%d\n", 1295 + type, err, errno); 1296 + fds_idx = REUSEPORT_FD_IDX(err, fds_idx); 1297 + 1298 + /* BPF_EXIST success case. */ 1299 + err = bpf_map_update_elem(map_fd, &index3, &grpa_fds64[fds_idx], 1300 + BPF_EXIST); 1301 + CHECK(err == -1, 1302 + "reuseport array update same elem BPF_EXIST", 1303 + "sock_type:%d err:%d errno:%d\n", type, err, errno); 1304 + fds_idx = REUSEPORT_FD_IDX(err, fds_idx); 1305 + 1306 + /* BPF_NOEXIST failure case */ 1307 + err = bpf_map_update_elem(map_fd, &index3, &grpa_fds64[fds_idx], 1308 + BPF_NOEXIST); 1309 + CHECK(err != -1 || errno != EEXIST, 1310 + "reuseport array update non-empty elem BPF_NOEXIST", 1311 + "sock_type:%d err:%d errno:%d\n", 1312 + type, err, errno); 1313 + fds_idx = REUSEPORT_FD_IDX(err, fds_idx); 1314 + 1315 + /* BPF_ANY case (always succeed) */ 1316 + err = bpf_map_update_elem(map_fd, &index3, &grpa_fds64[fds_idx], 1317 + BPF_ANY); 1318 + CHECK(err == -1, 1319 + "reuseport array update same sk with BPF_ANY", 1320 + "sock_type:%d err:%d errno:%d\n", type, err, errno); 1321 + 1322 + fd64 = grpa_fds64[fds_idx]; 1323 + sk_cookie = grpa_cookies[fds_idx]; 1324 + 1325 + /* The same sk cannot be added to reuseport_array twice */ 1326 + err = bpf_map_update_elem(map_fd, &index3, &fd64, BPF_ANY); 1327 + CHECK(err != -1 || errno != EBUSY, 1328 + "reuseport array update same sk with same index", 1329 + "sock_type:%d err:%d errno:%d\n", 1330 + type, err, errno); 1331 + 1332 + err = bpf_map_update_elem(map_fd, &index0, &fd64, 
BPF_ANY); 1333 + CHECK(err != -1 || errno != EBUSY, 1334 + "reuseport array update same sk with different index", 1335 + "sock_type:%d err:%d errno:%d\n", 1336 + type, err, errno); 1337 + 1338 + /* Test delete elem */ 1339 + err = bpf_map_delete_elem(map_fd, &index3); 1340 + CHECK(err == -1, "reuseport array delete sk", 1341 + "sock_type:%d err:%d errno:%d\n", 1342 + type, err, errno); 1343 + 1344 + /* Add it back with BPF_NOEXIST */ 1345 + err = bpf_map_update_elem(map_fd, &index3, &fd64, BPF_NOEXIST); 1346 + CHECK(err == -1, 1347 + "reuseport array re-add with BPF_NOEXIST after del", 1348 + "sock_type:%d err:%d errno:%d\n", type, err, errno); 1349 + 1350 + /* Test cookie */ 1351 + err = bpf_map_lookup_elem(map_fd, &index3, &map_cookie); 1352 + CHECK(err == -1 || sk_cookie != map_cookie, 1353 + "reuseport array lookup re-added sk", 1354 + "sock_type:%d err:%d errno:%d sk_cookie:0x%llx map_cookie:0x%llxn", 1355 + type, err, errno, sk_cookie, map_cookie); 1356 + 1357 + /* Test elem removed by close() */ 1358 + for (f = 0; f < ARRAY_SIZE(grpa_fds64); f++) 1359 + close(grpa_fds64[f]); 1360 + err = bpf_map_lookup_elem(map_fd, &index3, &map_cookie); 1361 + CHECK(err != -1 || errno != ENOENT, 1362 + "reuseport array lookup after close()", 1363 + "sock_type:%d err:%d errno:%d\n", 1364 + type, err, errno); 1365 + } 1366 + 1367 + /* Test SOCK_RAW */ 1368 + fd64 = socket(AF_INET6, SOCK_RAW, IPPROTO_UDP); 1369 + CHECK(fd64 == -1, "socket(SOCK_RAW)", "err:%d errno:%d\n", 1370 + err, errno); 1371 + err = bpf_map_update_elem(map_fd, &index3, &fd64, BPF_NOEXIST); 1372 + CHECK(err != -1 || errno != ENOTSUPP, "reuseport array update SOCK_RAW", 1373 + "err:%d errno:%d\n", err, errno); 1374 + close(fd64); 1375 + 1376 + /* Close the 64 bit value map */ 1377 + close(map_fd); 1378 + 1379 + /* Test 32 bit fd */ 1380 + map_fd = bpf_create_map(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, 1381 + sizeof(__u32), sizeof(__u32), array_size, 0); 1382 + CHECK(map_fd == -1, "reuseport array create", 1383 + 
"map_fd:%d, errno:%d\n", map_fd, errno); 1384 + prepare_reuseport_grp(SOCK_STREAM, map_fd, &fd64, &sk_cookie, 1); 1385 + fd = fd64; 1386 + err = bpf_map_update_elem(map_fd, &index3, &fd, BPF_NOEXIST); 1387 + CHECK(err == -1, "reuseport array update 32 bit fd", 1388 + "err:%d errno:%d\n", err, errno); 1389 + err = bpf_map_lookup_elem(map_fd, &index3, &map_cookie); 1390 + CHECK(err != -1 || errno != ENOSPC, 1391 + "reuseport array lookup 32 bit fd", 1392 + "err:%d errno:%d\n", err, errno); 1393 + close(fd); 1394 + close(map_fd); 1395 + } 1396 + 1167 1397 static void run_all_tests(void) 1168 1398 { 1169 1399 test_hashmap(0, NULL); ··· 1428 1170 1429 1171 test_map_rdonly(); 1430 1172 test_map_wronly(); 1173 + 1174 + test_reuseport_array(); 1431 1175 } 1432 1176 1433 1177 int main(void)
+688
tools/testing/selftests/bpf/test_select_reuseport.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2018 Facebook */ 3 + 4 + #include <stdlib.h> 5 + #include <unistd.h> 6 + #include <stdbool.h> 7 + #include <string.h> 8 + #include <errno.h> 9 + #include <assert.h> 10 + #include <fcntl.h> 11 + #include <linux/bpf.h> 12 + #include <linux/err.h> 13 + #include <linux/types.h> 14 + #include <linux/if_ether.h> 15 + #include <sys/types.h> 16 + #include <sys/epoll.h> 17 + #include <sys/socket.h> 18 + #include <netinet/in.h> 19 + #include <bpf/bpf.h> 20 + #include <bpf/libbpf.h> 21 + #include "bpf_rlimit.h" 22 + #include "bpf_util.h" 23 + #include "test_select_reuseport_common.h" 24 + 25 + #define MIN_TCPHDR_LEN 20 26 + #define UDPHDR_LEN 8 27 + 28 + #define TCP_SYNCOOKIE_SYSCTL "/proc/sys/net/ipv4/tcp_syncookies" 29 + #define TCP_FO_SYSCTL "/proc/sys/net/ipv4/tcp_fastopen" 30 + #define REUSEPORT_ARRAY_SIZE 32 31 + 32 + static int result_map, tmp_index_ovr_map, linum_map, data_check_map; 33 + static enum result expected_results[NR_RESULTS]; 34 + static int sk_fds[REUSEPORT_ARRAY_SIZE]; 35 + static int reuseport_array, outer_map; 36 + static int select_by_skb_data_prog; 37 + static int saved_tcp_syncookie; 38 + static struct bpf_object *obj; 39 + static int saved_tcp_fo; 40 + static __u32 index_zero; 41 + static int epfd; 42 + 43 + static union sa46 { 44 + struct sockaddr_in6 v6; 45 + struct sockaddr_in v4; 46 + sa_family_t family; 47 + } srv_sa; 48 + 49 + #define CHECK(condition, tag, format...) 
({ \ 50 + int __ret = !!(condition); \ 51 + if (__ret) { \ 52 + printf("%s(%d):FAIL:%s ", __func__, __LINE__, tag); \ 53 + printf(format); \ 54 + exit(-1); \ 55 + } \ 56 + }) 57 + 58 + static void create_maps(void) 59 + { 60 + struct bpf_create_map_attr attr = {}; 61 + 62 + /* Creating reuseport_array */ 63 + attr.name = "reuseport_array"; 64 + attr.map_type = BPF_MAP_TYPE_REUSEPORT_SOCKARRAY; 65 + attr.key_size = sizeof(__u32); 66 + attr.value_size = sizeof(__u32); 67 + attr.max_entries = REUSEPORT_ARRAY_SIZE; 68 + 69 + reuseport_array = bpf_create_map_xattr(&attr); 70 + CHECK(reuseport_array == -1, "creating reuseport_array", 71 + "reuseport_array:%d errno:%d\n", reuseport_array, errno); 72 + 73 + /* Creating outer_map */ 74 + attr.name = "outer_map"; 75 + attr.map_type = BPF_MAP_TYPE_ARRAY_OF_MAPS; 76 + attr.key_size = sizeof(__u32); 77 + attr.value_size = sizeof(__u32); 78 + attr.max_entries = 1; 79 + attr.inner_map_fd = reuseport_array; 80 + outer_map = bpf_create_map_xattr(&attr); 81 + CHECK(outer_map == -1, "creating outer_map", 82 + "outer_map:%d errno:%d\n", outer_map, errno); 83 + } 84 + 85 + static void prepare_bpf_obj(void) 86 + { 87 + struct bpf_program *prog; 88 + struct bpf_map *map; 89 + int err; 90 + struct bpf_object_open_attr attr = { 91 + .file = "test_select_reuseport_kern.o", 92 + .prog_type = BPF_PROG_TYPE_SK_REUSEPORT, 93 + }; 94 + 95 + obj = bpf_object__open_xattr(&attr); 96 + CHECK(IS_ERR_OR_NULL(obj), "open test_select_reuseport_kern.o", 97 + "obj:%p PTR_ERR(obj):%ld\n", obj, PTR_ERR(obj)); 98 + 99 + prog = bpf_program__next(NULL, obj); 100 + CHECK(!prog, "get first bpf_program", "!prog\n"); 101 + bpf_program__set_type(prog, attr.prog_type); 102 + 103 + map = bpf_object__find_map_by_name(obj, "outer_map"); 104 + CHECK(!map, "find outer_map", "!map\n"); 105 + err = bpf_map__reuse_fd(map, outer_map); 106 + CHECK(err, "reuse outer_map", "err:%d\n", err); 107 + 108 + err = bpf_object__load(obj); 109 + CHECK(err, "load bpf_object", "err:%d\n", 
err); 110 + 111 + select_by_skb_data_prog = bpf_program__fd(prog); 112 + CHECK(select_by_skb_data_prog == -1, "get prog fd", 113 + "select_by_skb_data_prog:%d\n", select_by_skb_data_prog); 114 + 115 + map = bpf_object__find_map_by_name(obj, "result_map"); 116 + CHECK(!map, "find result_map", "!map\n"); 117 + result_map = bpf_map__fd(map); 118 + CHECK(result_map == -1, "get result_map fd", 119 + "result_map:%d\n", result_map); 120 + 121 + map = bpf_object__find_map_by_name(obj, "tmp_index_ovr_map"); 122 + CHECK(!map, "find tmp_index_ovr_map", "!map\n"); 123 + tmp_index_ovr_map = bpf_map__fd(map); 124 + CHECK(tmp_index_ovr_map == -1, "get tmp_index_ovr_map fd", 125 + "tmp_index_ovr_map:%d\n", tmp_index_ovr_map); 126 + 127 + map = bpf_object__find_map_by_name(obj, "linum_map"); 128 + CHECK(!map, "find linum_map", "!map\n"); 129 + linum_map = bpf_map__fd(map); 130 + CHECK(linum_map == -1, "get linum_map fd", 131 + "linum_map:%d\n", linum_map); 132 + 133 + map = bpf_object__find_map_by_name(obj, "data_check_map"); 134 + CHECK(!map, "find data_check_map", "!map\n"); 135 + data_check_map = bpf_map__fd(map); 136 + CHECK(data_check_map == -1, "get data_check_map fd", 137 + "data_check_map:%d\n", data_check_map); 138 + } 139 + 140 + static void sa46_init_loopback(union sa46 *sa, sa_family_t family) 141 + { 142 + memset(sa, 0, sizeof(*sa)); 143 + sa->family = family; 144 + if (sa->family == AF_INET6) 145 + sa->v6.sin6_addr = in6addr_loopback; 146 + else 147 + sa->v4.sin_addr.s_addr = htonl(INADDR_LOOPBACK); 148 + } 149 + 150 + static void sa46_init_inany(union sa46 *sa, sa_family_t family) 151 + { 152 + memset(sa, 0, sizeof(*sa)); 153 + sa->family = family; 154 + if (sa->family == AF_INET6) 155 + sa->v6.sin6_addr = in6addr_any; 156 + else 157 + sa->v4.sin_addr.s_addr = INADDR_ANY; 158 + } 159 + 160 + static int read_int_sysctl(const char *sysctl) 161 + { 162 + char buf[16]; 163 + int fd, ret; 164 + 165 + fd = open(sysctl, 0); 166 + CHECK(fd == -1, "open(sysctl)", "sysctl:%s 
fd:%d errno:%d\n", 167 + sysctl, fd, errno); 168 + 169 + ret = read(fd, buf, sizeof(buf)); 170 + CHECK(ret <= 0, "read(sysctl)", "sysctl:%s ret:%d errno:%d\n", 171 + sysctl, ret, errno); 172 + close(fd); 173 + 174 + return atoi(buf); 175 + } 176 + 177 + static void write_int_sysctl(const char *sysctl, int v) 178 + { 179 + int fd, ret, size; 180 + char buf[16]; 181 + 182 + fd = open(sysctl, O_RDWR); 183 + CHECK(fd == -1, "open(sysctl)", "sysctl:%s fd:%d errno:%d\n", 184 + sysctl, fd, errno); 185 + 186 + size = snprintf(buf, sizeof(buf), "%d", v); 187 + ret = write(fd, buf, size); 188 + CHECK(ret != size, "write(sysctl)", 189 + "sysctl:%s ret:%d size:%d errno:%d\n", sysctl, ret, size, errno); 190 + close(fd); 191 + } 192 + 193 + static void restore_sysctls(void) 194 + { 195 + write_int_sysctl(TCP_FO_SYSCTL, saved_tcp_fo); 196 + write_int_sysctl(TCP_SYNCOOKIE_SYSCTL, saved_tcp_syncookie); 197 + } 198 + 199 + static void enable_fastopen(void) 200 + { 201 + int fo; 202 + 203 + fo = read_int_sysctl(TCP_FO_SYSCTL); 204 + write_int_sysctl(TCP_FO_SYSCTL, fo | 7); 205 + } 206 + 207 + static void enable_syncookie(void) 208 + { 209 + write_int_sysctl(TCP_SYNCOOKIE_SYSCTL, 2); 210 + } 211 + 212 + static void disable_syncookie(void) 213 + { 214 + write_int_sysctl(TCP_SYNCOOKIE_SYSCTL, 0); 215 + } 216 + 217 + static __u32 get_linum(void) 218 + { 219 + __u32 linum; 220 + int err; 221 + 222 + err = bpf_map_lookup_elem(linum_map, &index_zero, &linum); 223 + CHECK(err == -1, "lookup_elem(linum_map)", "err:%d errno:%d\n", 224 + err, errno); 225 + 226 + return linum; 227 + } 228 + 229 + static void check_data(int type, sa_family_t family, const struct cmd *cmd, 230 + int cli_fd) 231 + { 232 + struct data_check expected = {}, result; 233 + union sa46 cli_sa; 234 + socklen_t addrlen; 235 + int err; 236 + 237 + addrlen = sizeof(cli_sa); 238 + err = getsockname(cli_fd, (struct sockaddr *)&cli_sa, 239 + &addrlen); 240 + CHECK(err == -1, "getsockname(cli_fd)", "err:%d errno:%d\n", 241 + err, 
errno); 242 + 243 + err = bpf_map_lookup_elem(data_check_map, &index_zero, &result); 244 + CHECK(err == -1, "lookup_elem(data_check_map)", "err:%d errno:%d\n", 245 + err, errno); 246 + 247 + if (type == SOCK_STREAM) { 248 + expected.len = MIN_TCPHDR_LEN; 249 + expected.ip_protocol = IPPROTO_TCP; 250 + } else { 251 + expected.len = UDPHDR_LEN; 252 + expected.ip_protocol = IPPROTO_UDP; 253 + } 254 + 255 + if (family == AF_INET6) { 256 + expected.eth_protocol = htons(ETH_P_IPV6); 257 + expected.bind_inany = !srv_sa.v6.sin6_addr.s6_addr32[3] && 258 + !srv_sa.v6.sin6_addr.s6_addr32[2] && 259 + !srv_sa.v6.sin6_addr.s6_addr32[1] && 260 + !srv_sa.v6.sin6_addr.s6_addr32[0]; 261 + 262 + memcpy(&expected.skb_addrs[0], cli_sa.v6.sin6_addr.s6_addr32, 263 + sizeof(cli_sa.v6.sin6_addr)); 264 + memcpy(&expected.skb_addrs[4], &in6addr_loopback, 265 + sizeof(in6addr_loopback)); 266 + expected.skb_ports[0] = cli_sa.v6.sin6_port; 267 + expected.skb_ports[1] = srv_sa.v6.sin6_port; 268 + } else { 269 + expected.eth_protocol = htons(ETH_P_IP); 270 + expected.bind_inany = !srv_sa.v4.sin_addr.s_addr; 271 + 272 + expected.skb_addrs[0] = cli_sa.v4.sin_addr.s_addr; 273 + expected.skb_addrs[1] = htonl(INADDR_LOOPBACK); 274 + expected.skb_ports[0] = cli_sa.v4.sin_port; 275 + expected.skb_ports[1] = srv_sa.v4.sin_port; 276 + } 277 + 278 + if (memcmp(&result, &expected, offsetof(struct data_check, 279 + equal_check_end))) { 280 + printf("unexpected data_check\n"); 281 + printf(" result: (0x%x, %u, %u)\n", 282 + result.eth_protocol, result.ip_protocol, 283 + result.bind_inany); 284 + printf("expected: (0x%x, %u, %u)\n", 285 + expected.eth_protocol, expected.ip_protocol, 286 + expected.bind_inany); 287 + CHECK(1, "data_check result != expected", 288 + "bpf_prog_linum:%u\n", get_linum()); 289 + } 290 + 291 + CHECK(!result.hash, "data_check result.hash empty", 292 + "result.hash:%u", result.hash); 293 + 294 + expected.len += cmd ? 
sizeof(*cmd) : 0; 295 + if (type == SOCK_STREAM) 296 + CHECK(expected.len > result.len, "expected.len > result.len", 297 + "expected.len:%u result.len:%u bpf_prog_linum:%u\n", 298 + expected.len, result.len, get_linum()); 299 + else 300 + CHECK(expected.len != result.len, "expected.len != result.len", 301 + "expected.len:%u result.len:%u bpf_prog_linum:%u\n", 302 + expected.len, result.len, get_linum()); 303 + } 304 + 305 + static void check_results(void) 306 + { 307 + __u32 results[NR_RESULTS]; 308 + __u32 i, broken = 0; 309 + int err; 310 + 311 + for (i = 0; i < NR_RESULTS; i++) { 312 + err = bpf_map_lookup_elem(result_map, &i, &results[i]); 313 + CHECK(err == -1, "lookup_elem(result_map)", 314 + "i:%u err:%d errno:%d\n", i, err, errno); 315 + } 316 + 317 + for (i = 0; i < NR_RESULTS; i++) { 318 + if (results[i] != expected_results[i]) { 319 + broken = i; 320 + break; 321 + } 322 + } 323 + 324 + if (i == NR_RESULTS) 325 + return; 326 + 327 + printf("unexpected result\n"); 328 + printf(" result: ["); 329 + printf("%u", results[0]); 330 + for (i = 1; i < NR_RESULTS; i++) 331 + printf(", %u", results[i]); 332 + printf("]\n"); 333 + 334 + printf("expected: ["); 335 + printf("%u", expected_results[0]); 336 + for (i = 1; i < NR_RESULTS; i++) 337 + printf(", %u", expected_results[i]); 338 + printf("]\n"); 339 + 340 + CHECK(expected_results[broken] != results[broken], 341 + "unexpected result", 342 + "expected_results[%u] != results[%u] bpf_prog_linum:%u\n", 343 + broken, broken, get_linum()); 344 + } 345 + 346 + static int send_data(int type, sa_family_t family, void *data, size_t len, 347 + enum result expected) 348 + { 349 + union sa46 cli_sa; 350 + int fd, err; 351 + 352 + fd = socket(family, type, 0); 353 + CHECK(fd == -1, "socket()", "fd:%d errno:%d\n", fd, errno); 354 + 355 + sa46_init_loopback(&cli_sa, family); 356 + err = bind(fd, (struct sockaddr *)&cli_sa, sizeof(cli_sa)); 357 + CHECK(fd == -1, "bind(cli_sa)", "err:%d errno:%d\n", err, errno); 358 + 359 + err 
= sendto(fd, data, len, MSG_FASTOPEN, (struct sockaddr *)&srv_sa, 360 + sizeof(srv_sa)); 361 + CHECK(err != len && expected >= PASS, 362 + "sendto()", "family:%u err:%d errno:%d expected:%d\n", 363 + family, err, errno, expected); 364 + 365 + return fd; 366 + } 367 + 368 + static void do_test(int type, sa_family_t family, struct cmd *cmd, 369 + enum result expected) 370 + { 371 + int nev, srv_fd, cli_fd; 372 + struct epoll_event ev; 373 + struct cmd rcv_cmd; 374 + ssize_t nread; 375 + 376 + cli_fd = send_data(type, family, cmd, cmd ? sizeof(*cmd) : 0, 377 + expected); 378 + nev = epoll_wait(epfd, &ev, 1, expected >= PASS ? 5 : 0); 379 + CHECK((nev <= 0 && expected >= PASS) || 380 + (nev > 0 && expected < PASS), 381 + "nev <> expected", 382 + "nev:%d expected:%d type:%d family:%d data:(%d, %d)\n", 383 + nev, expected, type, family, 384 + cmd ? cmd->reuseport_index : -1, 385 + cmd ? cmd->pass_on_failure : -1); 386 + check_results(); 387 + check_data(type, family, cmd, cli_fd); 388 + 389 + if (expected < PASS) 390 + return; 391 + 392 + CHECK(expected != PASS_ERR_SK_SELECT_REUSEPORT && 393 + cmd->reuseport_index != ev.data.u32, 394 + "check cmd->reuseport_index", 395 + "cmd:(%u, %u) ev.data.u32:%u\n", 396 + cmd->pass_on_failure, cmd->reuseport_index, ev.data.u32); 397 + 398 + srv_fd = sk_fds[ev.data.u32]; 399 + if (type == SOCK_STREAM) { 400 + int new_fd = accept(srv_fd, NULL, 0); 401 + 402 + CHECK(new_fd == -1, "accept(srv_fd)", 403 + "ev.data.u32:%u new_fd:%d errno:%d\n", 404 + ev.data.u32, new_fd, errno); 405 + 406 + nread = recv(new_fd, &rcv_cmd, sizeof(rcv_cmd), MSG_DONTWAIT); 407 + CHECK(nread != sizeof(rcv_cmd), 408 + "recv(new_fd)", 409 + "ev.data.u32:%u nread:%zd sizeof(rcv_cmd):%zu errno:%d\n", 410 + ev.data.u32, nread, sizeof(rcv_cmd), errno); 411 + 412 + close(new_fd); 413 + } else { 414 + nread = recv(srv_fd, &rcv_cmd, sizeof(rcv_cmd), MSG_DONTWAIT); 415 + CHECK(nread != sizeof(rcv_cmd), 416 + "recv(sk_fds)", 417 + "ev.data.u32:%u nread:%zd 
sizeof(rcv_cmd):%zu errno:%d\n", 418 + ev.data.u32, nread, sizeof(rcv_cmd), errno); 419 + } 420 + 421 + close(cli_fd); 422 + } 423 + 424 + static void test_err_inner_map(int type, sa_family_t family) 425 + { 426 + struct cmd cmd = { 427 + .reuseport_index = 0, 428 + .pass_on_failure = 0, 429 + }; 430 + 431 + printf("%s: ", __func__); 432 + expected_results[DROP_ERR_INNER_MAP]++; 433 + do_test(type, family, &cmd, DROP_ERR_INNER_MAP); 434 + printf("OK\n"); 435 + } 436 + 437 + static void test_err_skb_data(int type, sa_family_t family) 438 + { 439 + printf("%s: ", __func__); 440 + expected_results[DROP_ERR_SKB_DATA]++; 441 + do_test(type, family, NULL, DROP_ERR_SKB_DATA); 442 + printf("OK\n"); 443 + } 444 + 445 + static void test_err_sk_select_port(int type, sa_family_t family) 446 + { 447 + struct cmd cmd = { 448 + .reuseport_index = REUSEPORT_ARRAY_SIZE, 449 + .pass_on_failure = 0, 450 + }; 451 + 452 + printf("%s: ", __func__); 453 + expected_results[DROP_ERR_SK_SELECT_REUSEPORT]++; 454 + do_test(type, family, &cmd, DROP_ERR_SK_SELECT_REUSEPORT); 455 + printf("OK\n"); 456 + } 457 + 458 + static void test_pass(int type, sa_family_t family) 459 + { 460 + struct cmd cmd; 461 + int i; 462 + 463 + printf("%s: ", __func__); 464 + cmd.pass_on_failure = 0; 465 + for (i = 0; i < REUSEPORT_ARRAY_SIZE; i++) { 466 + expected_results[PASS]++; 467 + cmd.reuseport_index = i; 468 + do_test(type, family, &cmd, PASS); 469 + } 470 + printf("OK\n"); 471 + } 472 + 473 + static void test_syncookie(int type, sa_family_t family) 474 + { 475 + int err, tmp_index = 1; 476 + struct cmd cmd = { 477 + .reuseport_index = 0, 478 + .pass_on_failure = 0, 479 + }; 480 + 481 + if (type != SOCK_STREAM) 482 + return; 483 + 484 + printf("%s: ", __func__); 485 + /* 486 + * +1 for TCP-SYN and 487 + * +1 for the TCP-ACK (ack the syncookie) 488 + */ 489 + expected_results[PASS] += 2; 490 + enable_syncookie(); 491 + /* 492 + * Simulate TCP-SYN and TCP-ACK are handled by two different sk: 493 + * TCP-SYN: 
select sk_fds[tmp_index = 1] tmp_index is from the 494 + * tmp_index_ovr_map 495 + * TCP-ACK: select sk_fds[reuseport_index = 0] reuseport_index 496 + * is from the cmd.reuseport_index 497 + */ 498 + err = bpf_map_update_elem(tmp_index_ovr_map, &index_zero, 499 + &tmp_index, BPF_ANY); 500 + CHECK(err == -1, "update_elem(tmp_index_ovr_map, 0, 1)", 501 + "err:%d errno:%d\n", err, errno); 502 + do_test(type, family, &cmd, PASS); 503 + err = bpf_map_lookup_elem(tmp_index_ovr_map, &index_zero, 504 + &tmp_index); 505 + CHECK(err == -1 || tmp_index != -1, 506 + "lookup_elem(tmp_index_ovr_map)", 507 + "err:%d errno:%d tmp_index:%d\n", 508 + err, errno, tmp_index); 509 + disable_syncookie(); 510 + printf("OK\n"); 511 + } 512 + 513 + static void test_pass_on_err(int type, sa_family_t family) 514 + { 515 + struct cmd cmd = { 516 + .reuseport_index = REUSEPORT_ARRAY_SIZE, 517 + .pass_on_failure = 1, 518 + }; 519 + 520 + printf("%s: ", __func__); 521 + expected_results[PASS_ERR_SK_SELECT_REUSEPORT] += 1; 522 + do_test(type, family, &cmd, PASS_ERR_SK_SELECT_REUSEPORT); 523 + printf("OK\n"); 524 + } 525 + 526 + static void prepare_sk_fds(int type, sa_family_t family, bool inany) 527 + { 528 + const int first = REUSEPORT_ARRAY_SIZE - 1; 529 + int i, err, optval = 1; 530 + struct epoll_event ev; 531 + socklen_t addrlen; 532 + 533 + if (inany) 534 + sa46_init_inany(&srv_sa, family); 535 + else 536 + sa46_init_loopback(&srv_sa, family); 537 + addrlen = sizeof(srv_sa); 538 + 539 + /* 540 + * The sk_fds[] is filled from the back such that the order 541 + * is exactly opposite to the (struct sock_reuseport *)reuse->socks[]. 
542 + */ 543 + for (i = first; i >= 0; i--) { 544 + sk_fds[i] = socket(family, type, 0); 545 + CHECK(sk_fds[i] == -1, "socket()", "sk_fds[%d]:%d errno:%d\n", 546 + i, sk_fds[i], errno); 547 + err = setsockopt(sk_fds[i], SOL_SOCKET, SO_REUSEPORT, 548 + &optval, sizeof(optval)); 549 + CHECK(err == -1, "setsockopt(SO_REUSEPORT)", 550 + "sk_fds[%d] err:%d errno:%d\n", 551 + i, err, errno); 552 + 553 + if (i == first) { 554 + err = setsockopt(sk_fds[i], SOL_SOCKET, 555 + SO_ATTACH_REUSEPORT_EBPF, 556 + &select_by_skb_data_prog, 557 + sizeof(select_by_skb_data_prog)); 558 + CHECK(err == -1, "setsockopt(SO_ATTACH_REUEPORT_EBPF)", 559 + "err:%d errno:%d\n", err, errno); 560 + } 561 + 562 + err = bind(sk_fds[i], (struct sockaddr *)&srv_sa, addrlen); 563 + CHECK(err == -1, "bind()", "sk_fds[%d] err:%d errno:%d\n", 564 + i, err, errno); 565 + 566 + if (type == SOCK_STREAM) { 567 + err = listen(sk_fds[i], 10); 568 + CHECK(err == -1, "listen()", 569 + "sk_fds[%d] err:%d errno:%d\n", 570 + i, err, errno); 571 + } 572 + 573 + err = bpf_map_update_elem(reuseport_array, &i, &sk_fds[i], 574 + BPF_NOEXIST); 575 + CHECK(err == -1, "update_elem(reuseport_array)", 576 + "sk_fds[%d] err:%d errno:%d\n", i, err, errno); 577 + 578 + if (i == first) { 579 + socklen_t addrlen = sizeof(srv_sa); 580 + 581 + err = getsockname(sk_fds[i], (struct sockaddr *)&srv_sa, 582 + &addrlen); 583 + CHECK(err == -1, "getsockname()", 584 + "sk_fds[%d] err:%d errno:%d\n", i, err, errno); 585 + } 586 + } 587 + 588 + epfd = epoll_create(1); 589 + CHECK(epfd == -1, "epoll_create(1)", 590 + "epfd:%d errno:%d\n", epfd, errno); 591 + 592 + ev.events = EPOLLIN; 593 + for (i = 0; i < REUSEPORT_ARRAY_SIZE; i++) { 594 + ev.data.u32 = i; 595 + err = epoll_ctl(epfd, EPOLL_CTL_ADD, sk_fds[i], &ev); 596 + CHECK(err, "epoll_ctl(EPOLL_CTL_ADD)", "sk_fds[%d]\n", i); 597 + } 598 + } 599 + 600 + static void setup_per_test(int type, unsigned short family, bool inany) 601 + { 602 + int ovr = -1, err; 603 + 604 + 
prepare_sk_fds(type, family, inany); 605 + err = bpf_map_update_elem(tmp_index_ovr_map, &index_zero, &ovr, 606 + BPF_ANY); 607 + CHECK(err == -1, "update_elem(tmp_index_ovr_map, 0, -1)", 608 + "err:%d errno:%d\n", err, errno); 609 + } 610 + 611 + static void cleanup_per_test(void) 612 + { 613 + int i, err; 614 + 615 + for (i = 0; i < REUSEPORT_ARRAY_SIZE; i++) 616 + close(sk_fds[i]); 617 + close(epfd); 618 + 619 + err = bpf_map_delete_elem(outer_map, &index_zero); 620 + CHECK(err == -1, "delete_elem(outer_map)", 621 + "err:%d errno:%d\n", err, errno); 622 + } 623 + 624 + static void cleanup(void) 625 + { 626 + close(outer_map); 627 + close(reuseport_array); 628 + bpf_object__close(obj); 629 + } 630 + 631 + static void test_all(void) 632 + { 633 + /* Extra SOCK_STREAM to test bind_inany==true */ 634 + const int types[] = { SOCK_STREAM, SOCK_DGRAM, SOCK_STREAM }; 635 + const char * const type_strings[] = { "TCP", "UDP", "TCP" }; 636 + const char * const family_strings[] = { "IPv6", "IPv4" }; 637 + const unsigned short families[] = { AF_INET6, AF_INET }; 638 + const bool bind_inany[] = { false, false, true }; 639 + int t, f, err; 640 + 641 + for (f = 0; f < ARRAY_SIZE(families); f++) { 642 + unsigned short family = families[f]; 643 + 644 + for (t = 0; t < ARRAY_SIZE(types); t++) { 645 + bool inany = bind_inany[t]; 646 + int type = types[t]; 647 + 648 + printf("######## %s/%s %s ########\n", 649 + family_strings[f], type_strings[t], 650 + inany ? 
" INANY " : "LOOPBACK"); 651 + 652 + setup_per_test(type, family, inany); 653 + 654 + test_err_inner_map(type, family); 655 + 656 + /* Install reuseport_array to the outer_map */ 657 + err = bpf_map_update_elem(outer_map, &index_zero, 658 + &reuseport_array, BPF_ANY); 659 + CHECK(err == -1, "update_elem(outer_map)", 660 + "err:%d errno:%d\n", err, errno); 661 + 662 + test_err_skb_data(type, family); 663 + test_err_sk_select_port(type, family); 664 + test_pass(type, family); 665 + test_syncookie(type, family); 666 + test_pass_on_err(type, family); 667 + 668 + cleanup_per_test(); 669 + printf("\n"); 670 + } 671 + } 672 + } 673 + 674 + int main(int argc, const char **argv) 675 + { 676 + create_maps(); 677 + prepare_bpf_obj(); 678 + saved_tcp_fo = read_int_sysctl(TCP_FO_SYSCTL); 679 + saved_tcp_syncookie = read_int_sysctl(TCP_SYNCOOKIE_SYSCTL); 680 + enable_fastopen(); 681 + disable_syncookie(); 682 + atexit(restore_sysctls); 683 + 684 + test_all(); 685 + 686 + cleanup(); 687 + return 0; 688 + }
+36
tools/testing/selftests/bpf/test_select_reuseport_common.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* Copyright (c) 2018 Facebook */ 3 + 4 + #ifndef __TEST_SELECT_REUSEPORT_COMMON_H 5 + #define __TEST_SELECT_REUSEPORT_COMMON_H 6 + 7 + #include <linux/types.h> 8 + 9 + enum result { 10 + DROP_ERR_INNER_MAP, 11 + DROP_ERR_SKB_DATA, 12 + DROP_ERR_SK_SELECT_REUSEPORT, 13 + DROP_MISC, 14 + PASS, 15 + PASS_ERR_SK_SELECT_REUSEPORT, 16 + NR_RESULTS, 17 + }; 18 + 19 + struct cmd { 20 + __u32 reuseport_index; 21 + __u32 pass_on_failure; 22 + }; 23 + 24 + struct data_check { 25 + __u32 ip_protocol; 26 + __u32 skb_addrs[8]; 27 + __u16 skb_ports[2]; 28 + __u16 eth_protocol; 29 + __u8 bind_inany; 30 + __u8 equal_check_end[0]; 31 + 32 + __u32 len; 33 + __u32 hash; 34 + }; 35 + 36 + #endif
+180
tools/testing/selftests/bpf/test_select_reuseport_kern.c
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2018 Facebook */

#include <stdlib.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/bpf.h>
#include <linux/types.h>
#include <linux/if_ether.h>

#include "bpf_endian.h"
#include "bpf_helpers.h"
#include "test_select_reuseport_common.h"

int _version SEC("version") = 1;

#ifndef offsetof
#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
#endif

/* Map-in-map: userspace installs/removes the REUSEPORT_SOCKARRAY here to
 * exercise the "inner map missing" error path.
 */
struct bpf_map_def SEC("maps") outer_map = {
	.type = BPF_MAP_TYPE_ARRAY_OF_MAPS,
	.key_size = sizeof(__u32),
	.value_size = sizeof(__u32),
	.max_entries = 1,
};

/* Per-result counters, indexed by enum result. */
struct bpf_map_def SEC("maps") result_map = {
	.type = BPF_MAP_TYPE_ARRAY,
	.key_size = sizeof(__u32),
	.value_size = sizeof(__u32),
	.max_entries = NR_RESULTS,
};

/* One-shot index override (set by userspace, consumed and reset to -1 by
 * the prog) used to make SYN and syncookie-ACK select different sockets.
 */
struct bpf_map_def SEC("maps") tmp_index_ovr_map = {
	.type = BPF_MAP_TYPE_ARRAY,
	.key_size = sizeof(__u32),
	.value_size = sizeof(int),
	.max_entries = 1,
};

/* __LINE__ of the last GOTO_DONE, for diagnosing failures from userspace. */
struct bpf_map_def SEC("maps") linum_map = {
	.type = BPF_MAP_TYPE_ARRAY,
	.key_size = sizeof(__u32),
	.value_size = sizeof(__u32),
	.max_entries = 1,
};

/* Snapshot of the sk_reuseport_md fields seen by the last prog run. */
struct bpf_map_def SEC("maps") data_check_map = {
	.type = BPF_MAP_TYPE_ARRAY,
	.key_size = sizeof(__u32),
	.value_size = sizeof(struct data_check),
	.max_entries = 1,
};

/* Record the result and source line, then jump to the common exit. */
#define GOTO_DONE(_result) ({			\
	result = (_result);			\
	linum = __LINE__;			\
	goto done;				\
})

SEC("select_by_skb_data")
int _select_by_skb_data(struct sk_reuseport_md *reuse_md)
{
	__u32 linum, index = 0, flags = 0, index_zero = 0;
	__u32 *result_cnt, *linum_value;
	struct data_check data_check = {};
	struct cmd *cmd, cmd_copy;
	void *data, *data_end;
	void *reuseport_array;
	enum result result;
	int *index_ovr;
	int err;

	data = reuse_md->data;
	data_end = reuse_md->data_end;
	data_check.len = reuse_md->len;
	data_check.eth_protocol = reuse_md->eth_protocol;
	data_check.ip_protocol = reuse_md->ip_protocol;
	data_check.hash = reuse_md->hash;
	data_check.bind_inany = reuse_md->bind_inany;
	if (data_check.eth_protocol == bpf_htons(ETH_P_IP)) {
		/* IPv4: copy saddr+daddr (8 bytes) from the network header. */
		if (bpf_skb_load_bytes_relative(reuse_md,
						offsetof(struct iphdr, saddr),
						data_check.skb_addrs, 8,
						BPF_HDR_START_NET))
			GOTO_DONE(DROP_MISC);
	} else {
		/* IPv6: copy saddr+daddr (32 bytes). */
		if (bpf_skb_load_bytes_relative(reuse_md,
						offsetof(struct ipv6hdr, saddr),
						data_check.skb_addrs, 32,
						BPF_HDR_START_NET))
			GOTO_DONE(DROP_MISC);
	}

	/*
	 * The ip_protocol could be a compile time decision
	 * if the bpf_prog.o is dedicated to either TCP or
	 * UDP.
	 *
	 * Otherwise, reuse_md->ip_protocol or
	 * the protocol field in the iphdr can be used.
	 */
	if (data_check.ip_protocol == IPPROTO_TCP) {
		struct tcphdr *th = data;

		if (th + 1 > data_end)
			GOTO_DONE(DROP_MISC);

		data_check.skb_ports[0] = th->source;
		data_check.skb_ports[1] = th->dest;

		/* struct cmd sits right after the TCP header (th->doff is
		 * in 32-bit words, hence << 2).
		 */
		if ((th->doff << 2) + sizeof(*cmd) > data_check.len)
			GOTO_DONE(DROP_ERR_SKB_DATA);
		if (bpf_skb_load_bytes(reuse_md, th->doff << 2, &cmd_copy,
				       sizeof(cmd_copy)))
			GOTO_DONE(DROP_MISC);
		cmd = &cmd_copy;
	} else if (data_check.ip_protocol == IPPROTO_UDP) {
		struct udphdr *uh = data;

		if (uh + 1 > data_end)
			GOTO_DONE(DROP_MISC);

		data_check.skb_ports[0] = uh->source;
		data_check.skb_ports[1] = uh->dest;

		if (sizeof(struct udphdr) + sizeof(*cmd) > data_check.len)
			GOTO_DONE(DROP_ERR_SKB_DATA);
		/* Prefer direct packet access; fall back to load_bytes when
		 * the cmd is not in the linear data area.
		 */
		if (data + sizeof(struct udphdr) + sizeof(*cmd) > data_end) {
			if (bpf_skb_load_bytes(reuse_md, sizeof(struct udphdr),
					       &cmd_copy, sizeof(cmd_copy)))
				GOTO_DONE(DROP_MISC);
			cmd = &cmd_copy;
		} else {
			cmd = data + sizeof(struct udphdr);
		}
	} else {
		GOTO_DONE(DROP_MISC);
	}

	reuseport_array = bpf_map_lookup_elem(&outer_map, &index_zero);
	if (!reuseport_array)
		GOTO_DONE(DROP_ERR_INNER_MAP);

	index = cmd->reuseport_index;
	index_ovr = bpf_map_lookup_elem(&tmp_index_ovr_map, &index_zero);
	if (!index_ovr)
		GOTO_DONE(DROP_MISC);

	/* One-shot override: consume it and reset to -1 for the next run. */
	if (*index_ovr != -1) {
		index = *index_ovr;
		*index_ovr = -1;
	}
	err = bpf_sk_select_reuseport(reuse_md, reuseport_array, &index,
				      flags);
	if (!err)
		GOTO_DONE(PASS);

	if (cmd->pass_on_failure)
		GOTO_DONE(PASS_ERR_SK_SELECT_REUSEPORT);
	else
		GOTO_DONE(DROP_ERR_SK_SELECT_REUSEPORT);

done:
	result_cnt = bpf_map_lookup_elem(&result_map, &result);
	if (!result_cnt)
		return SK_DROP;

	bpf_map_update_elem(&linum_map, &index_zero, &linum, BPF_ANY);
	bpf_map_update_elem(&data_check_map, &index_zero, &data_check, BPF_ANY);

	(*result_cnt)++;
	return result < PASS ? SK_DROP : SK_PASS;
}

char _license[] SEC("license") = "GPL";
+62
tools/testing/selftests/bpf/test_skb_cgroup_id.sh
··· 1 + #!/bin/sh 2 + # SPDX-License-Identifier: GPL-2.0 3 + # Copyright (c) 2018 Facebook 4 + 5 + set -eu 6 + 7 + wait_for_ip() 8 + { 9 + local _i 10 + echo -n "Wait for testing link-local IP to become available " 11 + for _i in $(seq ${MAX_PING_TRIES}); do 12 + echo -n "." 13 + if ping -6 -q -c 1 -W 1 ff02::1%${TEST_IF} >/dev/null 2>&1; then 14 + echo " OK" 15 + return 16 + fi 17 + sleep 1 18 + done 19 + echo 1>&2 "ERROR: Timeout waiting for test IP to become available." 20 + exit 1 21 + } 22 + 23 + setup() 24 + { 25 + # Create testing interfaces not to interfere with current environment. 26 + ip link add dev ${TEST_IF} type veth peer name ${TEST_IF_PEER} 27 + ip link set ${TEST_IF} up 28 + ip link set ${TEST_IF_PEER} up 29 + 30 + wait_for_ip 31 + 32 + tc qdisc add dev ${TEST_IF} clsact 33 + tc filter add dev ${TEST_IF} egress bpf obj ${BPF_PROG_OBJ} \ 34 + sec ${BPF_PROG_SECTION} da 35 + 36 + BPF_PROG_ID=$(tc filter show dev ${TEST_IF} egress | \ 37 + awk '/ id / {sub(/.* id /, "", $0); print($1)}') 38 + } 39 + 40 + cleanup() 41 + { 42 + ip link del ${TEST_IF} 2>/dev/null || : 43 + ip link del ${TEST_IF_PEER} 2>/dev/null || : 44 + } 45 + 46 + main() 47 + { 48 + trap cleanup EXIT 2 3 6 15 49 + setup 50 + ${PROG} ${TEST_IF} ${BPF_PROG_ID} 51 + } 52 + 53 + DIR=$(dirname $0) 54 + TEST_IF="test_cgid_1" 55 + TEST_IF_PEER="test_cgid_2" 56 + MAX_PING_TRIES=5 57 + BPF_PROG_OBJ="${DIR}/test_skb_cgroup_id_kern.o" 58 + BPF_PROG_SECTION="cgroup_id_logger" 59 + BPF_PROG_ID=0 60 + PROG="${DIR}/test_skb_cgroup_id_user" 61 + 62 + main
+47
tools/testing/selftests/bpf/test_skb_cgroup_id_kern.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + // Copyright (c) 2018 Facebook 3 + 4 + #include <linux/bpf.h> 5 + #include <linux/pkt_cls.h> 6 + 7 + #include <string.h> 8 + 9 + #include "bpf_helpers.h" 10 + 11 + #define NUM_CGROUP_LEVELS 4 12 + 13 + struct bpf_map_def SEC("maps") cgroup_ids = { 14 + .type = BPF_MAP_TYPE_ARRAY, 15 + .key_size = sizeof(__u32), 16 + .value_size = sizeof(__u64), 17 + .max_entries = NUM_CGROUP_LEVELS, 18 + }; 19 + 20 + static __always_inline void log_nth_level(struct __sk_buff *skb, __u32 level) 21 + { 22 + __u64 id; 23 + 24 + /* [1] &level passed to external function that may change it, it's 25 + * incompatible with loop unroll. 26 + */ 27 + id = bpf_skb_ancestor_cgroup_id(skb, level); 28 + bpf_map_update_elem(&cgroup_ids, &level, &id, 0); 29 + } 30 + 31 + SEC("cgroup_id_logger") 32 + int log_cgroup_id(struct __sk_buff *skb) 33 + { 34 + /* Loop unroll can't be used here due to [1]. Unrolling manually. 35 + * Number of calls should be in sync with NUM_CGROUP_LEVELS. 36 + */ 37 + log_nth_level(skb, 0); 38 + log_nth_level(skb, 1); 39 + log_nth_level(skb, 2); 40 + log_nth_level(skb, 3); 41 + 42 + return TC_ACT_OK; 43 + } 44 + 45 + int _version SEC("version") = 1; 46 + 47 + char _license[] SEC("license") = "GPL";
+187
tools/testing/selftests/bpf/test_skb_cgroup_id_user.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + // Copyright (c) 2018 Facebook 3 + 4 + #include <stdlib.h> 5 + #include <string.h> 6 + #include <unistd.h> 7 + 8 + #include <arpa/inet.h> 9 + #include <net/if.h> 10 + #include <netinet/in.h> 11 + #include <sys/socket.h> 12 + #include <sys/types.h> 13 + 14 + 15 + #include <bpf/bpf.h> 16 + #include <bpf/libbpf.h> 17 + 18 + #include "bpf_rlimit.h" 19 + #include "cgroup_helpers.h" 20 + 21 + #define CGROUP_PATH "/skb_cgroup_test" 22 + #define NUM_CGROUP_LEVELS 4 23 + 24 + /* RFC 4291, Section 2.7.1 */ 25 + #define LINKLOCAL_MULTICAST "ff02::1" 26 + 27 + static int mk_dst_addr(const char *ip, const char *iface, 28 + struct sockaddr_in6 *dst) 29 + { 30 + memset(dst, 0, sizeof(*dst)); 31 + 32 + dst->sin6_family = AF_INET6; 33 + dst->sin6_port = htons(1025); 34 + 35 + if (inet_pton(AF_INET6, ip, &dst->sin6_addr) != 1) { 36 + log_err("Invalid IPv6: %s", ip); 37 + return -1; 38 + } 39 + 40 + dst->sin6_scope_id = if_nametoindex(iface); 41 + if (!dst->sin6_scope_id) { 42 + log_err("Failed to get index of iface: %s", iface); 43 + return -1; 44 + } 45 + 46 + return 0; 47 + } 48 + 49 + static int send_packet(const char *iface) 50 + { 51 + struct sockaddr_in6 dst; 52 + char msg[] = "msg"; 53 + int err = 0; 54 + int fd = -1; 55 + 56 + if (mk_dst_addr(LINKLOCAL_MULTICAST, iface, &dst)) 57 + goto err; 58 + 59 + fd = socket(AF_INET6, SOCK_DGRAM, 0); 60 + if (fd == -1) { 61 + log_err("Failed to create UDP socket"); 62 + goto err; 63 + } 64 + 65 + if (sendto(fd, &msg, sizeof(msg), 0, (const struct sockaddr *)&dst, 66 + sizeof(dst)) == -1) { 67 + log_err("Failed to send datagram"); 68 + goto err; 69 + } 70 + 71 + goto out; 72 + err: 73 + err = -1; 74 + out: 75 + if (fd >= 0) 76 + close(fd); 77 + return err; 78 + } 79 + 80 + int get_map_fd_by_prog_id(int prog_id) 81 + { 82 + struct bpf_prog_info info = {}; 83 + __u32 info_len = sizeof(info); 84 + __u32 map_ids[1]; 85 + int prog_fd = -1; 86 + int map_fd = -1; 87 + 88 + prog_fd = 
bpf_prog_get_fd_by_id(prog_id); 89 + if (prog_fd < 0) { 90 + log_err("Failed to get fd by prog id %d", prog_id); 91 + goto err; 92 + } 93 + 94 + info.nr_map_ids = 1; 95 + info.map_ids = (__u64) (unsigned long) map_ids; 96 + 97 + if (bpf_obj_get_info_by_fd(prog_fd, &info, &info_len)) { 98 + log_err("Failed to get info by prog fd %d", prog_fd); 99 + goto err; 100 + } 101 + 102 + if (!info.nr_map_ids) { 103 + log_err("No maps found for prog fd %d", prog_fd); 104 + goto err; 105 + } 106 + 107 + map_fd = bpf_map_get_fd_by_id(map_ids[0]); 108 + if (map_fd < 0) 109 + log_err("Failed to get fd by map id %d", map_ids[0]); 110 + err: 111 + if (prog_fd >= 0) 112 + close(prog_fd); 113 + return map_fd; 114 + } 115 + 116 + int check_ancestor_cgroup_ids(int prog_id) 117 + { 118 + __u64 actual_ids[NUM_CGROUP_LEVELS], expected_ids[NUM_CGROUP_LEVELS]; 119 + __u32 level; 120 + int err = 0; 121 + int map_fd; 122 + 123 + expected_ids[0] = 0x100000001; /* root cgroup */ 124 + expected_ids[1] = get_cgroup_id(""); 125 + expected_ids[2] = get_cgroup_id(CGROUP_PATH); 126 + expected_ids[3] = 0; /* non-existent cgroup */ 127 + 128 + map_fd = get_map_fd_by_prog_id(prog_id); 129 + if (map_fd < 0) 130 + goto err; 131 + 132 + for (level = 0; level < NUM_CGROUP_LEVELS; ++level) { 133 + if (bpf_map_lookup_elem(map_fd, &level, &actual_ids[level])) { 134 + log_err("Failed to lookup key %d", level); 135 + goto err; 136 + } 137 + if (actual_ids[level] != expected_ids[level]) { 138 + log_err("%llx (actual) != %llx (expected), level: %u\n", 139 + actual_ids[level], expected_ids[level], level); 140 + goto err; 141 + } 142 + } 143 + 144 + goto out; 145 + err: 146 + err = -1; 147 + out: 148 + if (map_fd >= 0) 149 + close(map_fd); 150 + return err; 151 + } 152 + 153 + int main(int argc, char **argv) 154 + { 155 + int cgfd = -1; 156 + int err = 0; 157 + 158 + if (argc < 3) { 159 + fprintf(stderr, "Usage: %s iface prog_id\n", argv[0]); 160 + exit(EXIT_FAILURE); 161 + } 162 + 163 + if 
(setup_cgroup_environment()) 164 + goto err; 165 + 166 + cgfd = create_and_get_cgroup(CGROUP_PATH); 167 + if (!cgfd) 168 + goto err; 169 + 170 + if (join_cgroup(CGROUP_PATH)) 171 + goto err; 172 + 173 + if (send_packet(argv[1])) 174 + goto err; 175 + 176 + if (check_ancestor_cgroup_ids(atoi(argv[2]))) 177 + goto err; 178 + 179 + goto out; 180 + err: 181 + err = -1; 182 + out: 183 + close(cgfd); 184 + cleanup_cgroup_environment(); 185 + printf("[%s]\n", err ? "FAIL" : "PASS"); 186 + return err; 187 + }
+1 -4
tools/testing/selftests/bpf/test_sock.c
··· 14 14 15 15 #include "cgroup_helpers.h" 16 16 #include "bpf_rlimit.h" 17 - 18 - #ifndef ARRAY_SIZE 19 - # define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) 20 - #endif 17 + #include "bpf_util.h" 21 18 22 19 #define CG_PATH "/foo" 23 20 #define MAX_INSNS 512
+1 -4
tools/testing/selftests/bpf/test_sock_addr.c
··· 20 20 21 21 #include "cgroup_helpers.h" 22 22 #include "bpf_rlimit.h" 23 + #include "bpf_util.h" 23 24 24 25 #ifndef ENOTSUPP 25 26 # define ENOTSUPP 524 26 - #endif 27 - 28 - #ifndef ARRAY_SIZE 29 - # define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) 30 27 #endif 31 28 32 29 #define CG_PATH "/foo"
+1 -4
tools/testing/selftests/bpf/test_verifier.c
··· 42 42 #endif 43 43 #include "bpf_rlimit.h" 44 44 #include "bpf_rand.h" 45 + #include "bpf_util.h" 45 46 #include "../../../include/linux/filter.h" 46 - 47 - #ifndef ARRAY_SIZE 48 - # define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) 49 - #endif 50 47 51 48 #define MAX_INSNS BPF_MAXINSNS 52 49 #define MAX_FIXUPS 8