Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next

Daniel Borkmann says:

====================
pull-request: bpf-next 2018-08-13

The following pull-request contains BPF updates for your *net-next* tree.

The main changes are:

1) Add driver XDP support for veth. This can be used in conjunction with
redirect of another XDP program e.g. sitting on NIC so the xdp_frame
can be forwarded to the peer veth directly without modification,
from Toshiaki.

2) Add a new BPF map type REUSEPORT_SOCKARRAY and prog type SK_REUSEPORT
in order to provide more control and visibility on where a SO_REUSEPORT
sk should be located, and the latter makes it possible to directly select
a sk from the bpf map. This also enables map-in-map for application
migration use cases, from Martin.

3) Add a new BPF helper bpf_skb_ancestor_cgroup_id() that returns the id
of cgroup v2 that is the ancestor of the cgroup associated with the
skb at the ancestor_level, from Andrey.

4) Implement BPF fs map pretty-print support based on BTF data for regular
hash table and LRU map, from Yonghong.

5) Decouple the ability to attach BTF for a map from the key and value
pretty-printer in BPF fs, and enable further BTF support for percpu and
LPM trie maps, from Daniel.

6) Implement a better BPF sample of using XDP's CPU redirect feature for
load balancing SKB processing to remote CPU. The sample implements the
same XDP load balancing as Suricata does which is symmetric hash based
on IP and L4 protocol, from Jesper.

7) Revert adding NULL pointer check with WARN_ON_ONCE() in __xdp_return()'s
critical path as it is ensured that the allocator is present, from Björn.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>

+3708 -177
+741 -9
drivers/net/veth.c
··· 17 17 #include <net/rtnetlink.h> 18 18 #include <net/dst.h> 19 19 #include <net/xfrm.h> 20 + #include <net/xdp.h> 20 21 #include <linux/veth.h> 21 22 #include <linux/module.h> 23 + #include <linux/bpf.h> 24 + #include <linux/filter.h> 25 + #include <linux/ptr_ring.h> 26 + #include <linux/bpf_trace.h> 22 27 23 28 #define DRV_NAME "veth" 24 29 #define DRV_VERSION "1.0" 30 + 31 + #define VETH_XDP_FLAG BIT(0) 32 + #define VETH_RING_SIZE 256 33 + #define VETH_XDP_HEADROOM (XDP_PACKET_HEADROOM + NET_IP_ALIGN) 34 + 35 + /* Separating two types of XDP xmit */ 36 + #define VETH_XDP_TX BIT(0) 37 + #define VETH_XDP_REDIR BIT(1) 25 38 26 39 struct pcpu_vstats { 27 40 u64 packets; ··· 42 29 struct u64_stats_sync syncp; 43 30 }; 44 31 32 + struct veth_rq { 33 + struct napi_struct xdp_napi; 34 + struct net_device *dev; 35 + struct bpf_prog __rcu *xdp_prog; 36 + struct xdp_mem_info xdp_mem; 37 + bool rx_notify_masked; 38 + struct ptr_ring xdp_ring; 39 + struct xdp_rxq_info xdp_rxq; 40 + }; 41 + 45 42 struct veth_priv { 46 43 struct net_device __rcu *peer; 47 44 atomic64_t dropped; 48 - unsigned requested_headroom; 45 + struct bpf_prog *_xdp_prog; 46 + struct veth_rq *rq; 47 + unsigned int requested_headroom; 49 48 }; 50 49 51 50 /* ··· 123 98 .get_link_ksettings = veth_get_link_ksettings, 124 99 }; 125 100 101 + /* general routines */ 102 + 103 + static bool veth_is_xdp_frame(void *ptr) 104 + { 105 + return (unsigned long)ptr & VETH_XDP_FLAG; 106 + } 107 + 108 + static void *veth_ptr_to_xdp(void *ptr) 109 + { 110 + return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG); 111 + } 112 + 113 + static void *veth_xdp_to_ptr(void *ptr) 114 + { 115 + return (void *)((unsigned long)ptr | VETH_XDP_FLAG); 116 + } 117 + 118 + static void veth_ptr_free(void *ptr) 119 + { 120 + if (veth_is_xdp_frame(ptr)) 121 + xdp_return_frame(veth_ptr_to_xdp(ptr)); 122 + else 123 + kfree_skb(ptr); 124 + } 125 + 126 + static void __veth_xdp_flush(struct veth_rq *rq) 127 + { 128 + /* Write ptr_ring before 
reading rx_notify_masked */ 129 + smp_mb(); 130 + if (!rq->rx_notify_masked) { 131 + rq->rx_notify_masked = true; 132 + napi_schedule(&rq->xdp_napi); 133 + } 134 + } 135 + 136 + static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb) 137 + { 138 + if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) { 139 + dev_kfree_skb_any(skb); 140 + return NET_RX_DROP; 141 + } 142 + 143 + return NET_RX_SUCCESS; 144 + } 145 + 146 + static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb, 147 + struct veth_rq *rq, bool xdp) 148 + { 149 + return __dev_forward_skb(dev, skb) ?: xdp ? 150 + veth_xdp_rx(rq, skb) : 151 + netif_rx(skb); 152 + } 153 + 126 154 static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) 127 155 { 128 - struct veth_priv *priv = netdev_priv(dev); 156 + struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 157 + struct veth_rq *rq = NULL; 129 158 struct net_device *rcv; 130 159 int length = skb->len; 160 + bool rcv_xdp = false; 161 + int rxq; 131 162 132 163 rcu_read_lock(); 133 164 rcv = rcu_dereference(priv->peer); ··· 192 111 goto drop; 193 112 } 194 113 195 - if (likely(dev_forward_skb(rcv, skb) == NET_RX_SUCCESS)) { 114 + rcv_priv = netdev_priv(rcv); 115 + rxq = skb_get_queue_mapping(skb); 116 + if (rxq < rcv->real_num_rx_queues) { 117 + rq = &rcv_priv->rq[rxq]; 118 + rcv_xdp = rcu_access_pointer(rq->xdp_prog); 119 + if (rcv_xdp) 120 + skb_record_rx_queue(skb, rxq); 121 + } 122 + 123 + if (likely(veth_forward_skb(rcv, skb, rq, rcv_xdp) == NET_RX_SUCCESS)) { 196 124 struct pcpu_vstats *stats = this_cpu_ptr(dev->vstats); 197 125 198 126 u64_stats_update_begin(&stats->syncp); ··· 212 122 drop: 213 123 atomic64_inc(&priv->dropped); 214 124 } 125 + 126 + if (rcv_xdp) 127 + __veth_xdp_flush(rq); 128 + 215 129 rcu_read_unlock(); 130 + 216 131 return NETDEV_TX_OK; 217 132 } 218 - 219 - /* 220 - * general routines 221 - */ 222 133 223 134 static u64 veth_stats_one(struct pcpu_vstats *result, struct net_device *dev) 224 135 { 
··· 270 179 { 271 180 } 272 181 182 + static struct sk_buff *veth_build_skb(void *head, int headroom, int len, 183 + int buflen) 184 + { 185 + struct sk_buff *skb; 186 + 187 + if (!buflen) { 188 + buflen = SKB_DATA_ALIGN(headroom + len) + 189 + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 190 + } 191 + skb = build_skb(head, buflen); 192 + if (!skb) 193 + return NULL; 194 + 195 + skb_reserve(skb, headroom); 196 + skb_put(skb, len); 197 + 198 + return skb; 199 + } 200 + 201 + static int veth_select_rxq(struct net_device *dev) 202 + { 203 + return smp_processor_id() % dev->real_num_rx_queues; 204 + } 205 + 206 + static int veth_xdp_xmit(struct net_device *dev, int n, 207 + struct xdp_frame **frames, u32 flags) 208 + { 209 + struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 210 + struct net_device *rcv; 211 + unsigned int max_len; 212 + struct veth_rq *rq; 213 + int i, drops = 0; 214 + 215 + if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) 216 + return -EINVAL; 217 + 218 + rcv = rcu_dereference(priv->peer); 219 + if (unlikely(!rcv)) 220 + return -ENXIO; 221 + 222 + rcv_priv = netdev_priv(rcv); 223 + rq = &rcv_priv->rq[veth_select_rxq(rcv)]; 224 + /* Non-NULL xdp_prog ensures that xdp_ring is initialized on receive 225 + * side. This means an XDP program is loaded on the peer and the peer 226 + * device is up. 
227 + */ 228 + if (!rcu_access_pointer(rq->xdp_prog)) 229 + return -ENXIO; 230 + 231 + max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN; 232 + 233 + spin_lock(&rq->xdp_ring.producer_lock); 234 + for (i = 0; i < n; i++) { 235 + struct xdp_frame *frame = frames[i]; 236 + void *ptr = veth_xdp_to_ptr(frame); 237 + 238 + if (unlikely(frame->len > max_len || 239 + __ptr_ring_produce(&rq->xdp_ring, ptr))) { 240 + xdp_return_frame_rx_napi(frame); 241 + drops++; 242 + } 243 + } 244 + spin_unlock(&rq->xdp_ring.producer_lock); 245 + 246 + if (flags & XDP_XMIT_FLUSH) 247 + __veth_xdp_flush(rq); 248 + 249 + return n - drops; 250 + } 251 + 252 + static void veth_xdp_flush(struct net_device *dev) 253 + { 254 + struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 255 + struct net_device *rcv; 256 + struct veth_rq *rq; 257 + 258 + rcu_read_lock(); 259 + rcv = rcu_dereference(priv->peer); 260 + if (unlikely(!rcv)) 261 + goto out; 262 + 263 + rcv_priv = netdev_priv(rcv); 264 + rq = &rcv_priv->rq[veth_select_rxq(rcv)]; 265 + /* xdp_ring is initialized on receive side? 
*/ 266 + if (unlikely(!rcu_access_pointer(rq->xdp_prog))) 267 + goto out; 268 + 269 + __veth_xdp_flush(rq); 270 + out: 271 + rcu_read_unlock(); 272 + } 273 + 274 + static int veth_xdp_tx(struct net_device *dev, struct xdp_buff *xdp) 275 + { 276 + struct xdp_frame *frame = convert_to_xdp_frame(xdp); 277 + 278 + if (unlikely(!frame)) 279 + return -EOVERFLOW; 280 + 281 + return veth_xdp_xmit(dev, 1, &frame, 0); 282 + } 283 + 284 + static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq, 285 + struct xdp_frame *frame, 286 + unsigned int *xdp_xmit) 287 + { 288 + void *hard_start = frame->data - frame->headroom; 289 + void *head = hard_start - sizeof(struct xdp_frame); 290 + int len = frame->len, delta = 0; 291 + struct xdp_frame orig_frame; 292 + struct bpf_prog *xdp_prog; 293 + unsigned int headroom; 294 + struct sk_buff *skb; 295 + 296 + rcu_read_lock(); 297 + xdp_prog = rcu_dereference(rq->xdp_prog); 298 + if (likely(xdp_prog)) { 299 + struct xdp_buff xdp; 300 + u32 act; 301 + 302 + xdp.data_hard_start = hard_start; 303 + xdp.data = frame->data; 304 + xdp.data_end = frame->data + frame->len; 305 + xdp.data_meta = frame->data - frame->metasize; 306 + xdp.rxq = &rq->xdp_rxq; 307 + 308 + act = bpf_prog_run_xdp(xdp_prog, &xdp); 309 + 310 + switch (act) { 311 + case XDP_PASS: 312 + delta = frame->data - xdp.data; 313 + len = xdp.data_end - xdp.data; 314 + break; 315 + case XDP_TX: 316 + orig_frame = *frame; 317 + xdp.data_hard_start = head; 318 + xdp.rxq->mem = frame->mem; 319 + if (unlikely(veth_xdp_tx(rq->dev, &xdp) < 0)) { 320 + trace_xdp_exception(rq->dev, xdp_prog, act); 321 + frame = &orig_frame; 322 + goto err_xdp; 323 + } 324 + *xdp_xmit |= VETH_XDP_TX; 325 + rcu_read_unlock(); 326 + goto xdp_xmit; 327 + case XDP_REDIRECT: 328 + orig_frame = *frame; 329 + xdp.data_hard_start = head; 330 + xdp.rxq->mem = frame->mem; 331 + if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) { 332 + frame = &orig_frame; 333 + goto err_xdp; 334 + } 335 + *xdp_xmit |= VETH_XDP_REDIR; 336 
+ rcu_read_unlock(); 337 + goto xdp_xmit; 338 + default: 339 + bpf_warn_invalid_xdp_action(act); 340 + case XDP_ABORTED: 341 + trace_xdp_exception(rq->dev, xdp_prog, act); 342 + case XDP_DROP: 343 + goto err_xdp; 344 + } 345 + } 346 + rcu_read_unlock(); 347 + 348 + headroom = sizeof(struct xdp_frame) + frame->headroom - delta; 349 + skb = veth_build_skb(head, headroom, len, 0); 350 + if (!skb) { 351 + xdp_return_frame(frame); 352 + goto err; 353 + } 354 + 355 + xdp_scrub_frame(frame); 356 + skb->protocol = eth_type_trans(skb, rq->dev); 357 + err: 358 + return skb; 359 + err_xdp: 360 + rcu_read_unlock(); 361 + xdp_return_frame(frame); 362 + xdp_xmit: 363 + return NULL; 364 + } 365 + 366 + static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, struct sk_buff *skb, 367 + unsigned int *xdp_xmit) 368 + { 369 + u32 pktlen, headroom, act, metalen; 370 + void *orig_data, *orig_data_end; 371 + struct bpf_prog *xdp_prog; 372 + int mac_len, delta, off; 373 + struct xdp_buff xdp; 374 + 375 + rcu_read_lock(); 376 + xdp_prog = rcu_dereference(rq->xdp_prog); 377 + if (unlikely(!xdp_prog)) { 378 + rcu_read_unlock(); 379 + goto out; 380 + } 381 + 382 + mac_len = skb->data - skb_mac_header(skb); 383 + pktlen = skb->len + mac_len; 384 + headroom = skb_headroom(skb) - mac_len; 385 + 386 + if (skb_shared(skb) || skb_head_is_locked(skb) || 387 + skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) { 388 + struct sk_buff *nskb; 389 + int size, head_off; 390 + void *head, *start; 391 + struct page *page; 392 + 393 + size = SKB_DATA_ALIGN(VETH_XDP_HEADROOM + pktlen) + 394 + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 395 + if (size > PAGE_SIZE) 396 + goto drop; 397 + 398 + page = alloc_page(GFP_ATOMIC | __GFP_NOWARN); 399 + if (!page) 400 + goto drop; 401 + 402 + head = page_address(page); 403 + start = head + VETH_XDP_HEADROOM; 404 + if (skb_copy_bits(skb, -mac_len, start, pktlen)) { 405 + page_frag_free(head); 406 + goto drop; 407 + } 408 + 409 + nskb = veth_build_skb(head, 
410 + VETH_XDP_HEADROOM + mac_len, skb->len, 411 + PAGE_SIZE); 412 + if (!nskb) { 413 + page_frag_free(head); 414 + goto drop; 415 + } 416 + 417 + skb_copy_header(nskb, skb); 418 + head_off = skb_headroom(nskb) - skb_headroom(skb); 419 + skb_headers_offset_update(nskb, head_off); 420 + if (skb->sk) 421 + skb_set_owner_w(nskb, skb->sk); 422 + consume_skb(skb); 423 + skb = nskb; 424 + } 425 + 426 + xdp.data_hard_start = skb->head; 427 + xdp.data = skb_mac_header(skb); 428 + xdp.data_end = xdp.data + pktlen; 429 + xdp.data_meta = xdp.data; 430 + xdp.rxq = &rq->xdp_rxq; 431 + orig_data = xdp.data; 432 + orig_data_end = xdp.data_end; 433 + 434 + act = bpf_prog_run_xdp(xdp_prog, &xdp); 435 + 436 + switch (act) { 437 + case XDP_PASS: 438 + break; 439 + case XDP_TX: 440 + get_page(virt_to_page(xdp.data)); 441 + consume_skb(skb); 442 + xdp.rxq->mem = rq->xdp_mem; 443 + if (unlikely(veth_xdp_tx(rq->dev, &xdp) < 0)) { 444 + trace_xdp_exception(rq->dev, xdp_prog, act); 445 + goto err_xdp; 446 + } 447 + *xdp_xmit |= VETH_XDP_TX; 448 + rcu_read_unlock(); 449 + goto xdp_xmit; 450 + case XDP_REDIRECT: 451 + get_page(virt_to_page(xdp.data)); 452 + consume_skb(skb); 453 + xdp.rxq->mem = rq->xdp_mem; 454 + if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) 455 + goto err_xdp; 456 + *xdp_xmit |= VETH_XDP_REDIR; 457 + rcu_read_unlock(); 458 + goto xdp_xmit; 459 + default: 460 + bpf_warn_invalid_xdp_action(act); 461 + case XDP_ABORTED: 462 + trace_xdp_exception(rq->dev, xdp_prog, act); 463 + case XDP_DROP: 464 + goto drop; 465 + } 466 + rcu_read_unlock(); 467 + 468 + delta = orig_data - xdp.data; 469 + off = mac_len + delta; 470 + if (off > 0) 471 + __skb_push(skb, off); 472 + else if (off < 0) 473 + __skb_pull(skb, -off); 474 + skb->mac_header -= delta; 475 + off = xdp.data_end - orig_data_end; 476 + if (off != 0) 477 + __skb_put(skb, off); 478 + skb->protocol = eth_type_trans(skb, rq->dev); 479 + 480 + metalen = xdp.data - xdp.data_meta; 481 + if (metalen) 482 + skb_metadata_set(skb, 
metalen); 483 + out: 484 + return skb; 485 + drop: 486 + rcu_read_unlock(); 487 + kfree_skb(skb); 488 + return NULL; 489 + err_xdp: 490 + rcu_read_unlock(); 491 + page_frag_free(xdp.data); 492 + xdp_xmit: 493 + return NULL; 494 + } 495 + 496 + static int veth_xdp_rcv(struct veth_rq *rq, int budget, unsigned int *xdp_xmit) 497 + { 498 + int i, done = 0; 499 + 500 + for (i = 0; i < budget; i++) { 501 + void *ptr = __ptr_ring_consume(&rq->xdp_ring); 502 + struct sk_buff *skb; 503 + 504 + if (!ptr) 505 + break; 506 + 507 + if (veth_is_xdp_frame(ptr)) { 508 + skb = veth_xdp_rcv_one(rq, veth_ptr_to_xdp(ptr), 509 + xdp_xmit); 510 + } else { 511 + skb = veth_xdp_rcv_skb(rq, ptr, xdp_xmit); 512 + } 513 + 514 + if (skb) 515 + napi_gro_receive(&rq->xdp_napi, skb); 516 + 517 + done++; 518 + } 519 + 520 + return done; 521 + } 522 + 523 + static int veth_poll(struct napi_struct *napi, int budget) 524 + { 525 + struct veth_rq *rq = 526 + container_of(napi, struct veth_rq, xdp_napi); 527 + unsigned int xdp_xmit = 0; 528 + int done; 529 + 530 + xdp_set_return_frame_no_direct(); 531 + done = veth_xdp_rcv(rq, budget, &xdp_xmit); 532 + 533 + if (done < budget && napi_complete_done(napi, done)) { 534 + /* Write rx_notify_masked before reading ptr_ring */ 535 + smp_store_mb(rq->rx_notify_masked, false); 536 + if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) { 537 + rq->rx_notify_masked = true; 538 + napi_schedule(&rq->xdp_napi); 539 + } 540 + } 541 + 542 + if (xdp_xmit & VETH_XDP_TX) 543 + veth_xdp_flush(rq->dev); 544 + if (xdp_xmit & VETH_XDP_REDIR) 545 + xdp_do_flush_map(); 546 + xdp_clear_return_frame_no_direct(); 547 + 548 + return done; 549 + } 550 + 551 + static int veth_napi_add(struct net_device *dev) 552 + { 553 + struct veth_priv *priv = netdev_priv(dev); 554 + int err, i; 555 + 556 + for (i = 0; i < dev->real_num_rx_queues; i++) { 557 + struct veth_rq *rq = &priv->rq[i]; 558 + 559 + err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL); 560 + if (err) 561 + goto 
err_xdp_ring; 562 + } 563 + 564 + for (i = 0; i < dev->real_num_rx_queues; i++) { 565 + struct veth_rq *rq = &priv->rq[i]; 566 + 567 + netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT); 568 + napi_enable(&rq->xdp_napi); 569 + } 570 + 571 + return 0; 572 + err_xdp_ring: 573 + for (i--; i >= 0; i--) 574 + ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free); 575 + 576 + return err; 577 + } 578 + 579 + static void veth_napi_del(struct net_device *dev) 580 + { 581 + struct veth_priv *priv = netdev_priv(dev); 582 + int i; 583 + 584 + for (i = 0; i < dev->real_num_rx_queues; i++) { 585 + struct veth_rq *rq = &priv->rq[i]; 586 + 587 + napi_disable(&rq->xdp_napi); 588 + napi_hash_del(&rq->xdp_napi); 589 + } 590 + synchronize_net(); 591 + 592 + for (i = 0; i < dev->real_num_rx_queues; i++) { 593 + struct veth_rq *rq = &priv->rq[i]; 594 + 595 + netif_napi_del(&rq->xdp_napi); 596 + rq->rx_notify_masked = false; 597 + ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free); 598 + } 599 + } 600 + 601 + static int veth_enable_xdp(struct net_device *dev) 602 + { 603 + struct veth_priv *priv = netdev_priv(dev); 604 + int err, i; 605 + 606 + if (!xdp_rxq_info_is_reg(&priv->rq[0].xdp_rxq)) { 607 + for (i = 0; i < dev->real_num_rx_queues; i++) { 608 + struct veth_rq *rq = &priv->rq[i]; 609 + 610 + err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i); 611 + if (err < 0) 612 + goto err_rxq_reg; 613 + 614 + err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq, 615 + MEM_TYPE_PAGE_SHARED, 616 + NULL); 617 + if (err < 0) 618 + goto err_reg_mem; 619 + 620 + /* Save original mem info as it can be overwritten */ 621 + rq->xdp_mem = rq->xdp_rxq.mem; 622 + } 623 + 624 + err = veth_napi_add(dev); 625 + if (err) 626 + goto err_rxq_reg; 627 + } 628 + 629 + for (i = 0; i < dev->real_num_rx_queues; i++) 630 + rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog); 631 + 632 + return 0; 633 + err_reg_mem: 634 + xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq); 635 + err_rxq_reg: 636 + for (i--; i >= 0; i--) 637 
+ xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq); 638 + 639 + return err; 640 + } 641 + 642 + static void veth_disable_xdp(struct net_device *dev) 643 + { 644 + struct veth_priv *priv = netdev_priv(dev); 645 + int i; 646 + 647 + for (i = 0; i < dev->real_num_rx_queues; i++) 648 + rcu_assign_pointer(priv->rq[i].xdp_prog, NULL); 649 + veth_napi_del(dev); 650 + for (i = 0; i < dev->real_num_rx_queues; i++) { 651 + struct veth_rq *rq = &priv->rq[i]; 652 + 653 + rq->xdp_rxq.mem = rq->xdp_mem; 654 + xdp_rxq_info_unreg(&rq->xdp_rxq); 655 + } 656 + } 657 + 273 658 static int veth_open(struct net_device *dev) 274 659 { 275 660 struct veth_priv *priv = netdev_priv(dev); 276 661 struct net_device *peer = rtnl_dereference(priv->peer); 662 + int err; 277 663 278 664 if (!peer) 279 665 return -ENOTCONN; 666 + 667 + if (priv->_xdp_prog) { 668 + err = veth_enable_xdp(dev); 669 + if (err) 670 + return err; 671 + } 280 672 281 673 if (peer->flags & IFF_UP) { 282 674 netif_carrier_on(dev); 283 675 netif_carrier_on(peer); 284 676 } 677 + 285 678 return 0; 286 679 } 287 680 ··· 777 202 netif_carrier_off(dev); 778 203 if (peer) 779 204 netif_carrier_off(peer); 205 + 206 + if (priv->_xdp_prog) 207 + veth_disable_xdp(dev); 780 208 781 209 return 0; 782 210 } ··· 806 228 static void veth_poll_controller(struct net_device *dev) 807 229 { 808 230 /* veth only receives frames when its peer sends one 809 - * Since it's a synchronous operation, we are guaranteed 231 + * Since it has nothing to do with disabling irqs, we are guaranteed 810 232 * never to have pending data when we poll for it so 811 233 * there is nothing to do here. 
812 234 * ··· 829 251 rcu_read_unlock(); 830 252 831 253 return iflink; 254 + } 255 + 256 + static netdev_features_t veth_fix_features(struct net_device *dev, 257 + netdev_features_t features) 258 + { 259 + struct veth_priv *priv = netdev_priv(dev); 260 + struct net_device *peer; 261 + 262 + peer = rtnl_dereference(priv->peer); 263 + if (peer) { 264 + struct veth_priv *peer_priv = netdev_priv(peer); 265 + 266 + if (peer_priv->_xdp_prog) 267 + features &= ~NETIF_F_GSO_SOFTWARE; 268 + } 269 + 270 + return features; 832 271 } 833 272 834 273 static void veth_set_rx_headroom(struct net_device *dev, int new_hr) ··· 871 276 rcu_read_unlock(); 872 277 } 873 278 279 + static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog, 280 + struct netlink_ext_ack *extack) 281 + { 282 + struct veth_priv *priv = netdev_priv(dev); 283 + struct bpf_prog *old_prog; 284 + struct net_device *peer; 285 + unsigned int max_mtu; 286 + int err; 287 + 288 + old_prog = priv->_xdp_prog; 289 + priv->_xdp_prog = prog; 290 + peer = rtnl_dereference(priv->peer); 291 + 292 + if (prog) { 293 + if (!peer) { 294 + NL_SET_ERR_MSG_MOD(extack, "Cannot set XDP when peer is detached"); 295 + err = -ENOTCONN; 296 + goto err; 297 + } 298 + 299 + max_mtu = PAGE_SIZE - VETH_XDP_HEADROOM - 300 + peer->hard_header_len - 301 + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 302 + if (peer->mtu > max_mtu) { 303 + NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP"); 304 + err = -ERANGE; 305 + goto err; 306 + } 307 + 308 + if (dev->real_num_rx_queues < peer->real_num_tx_queues) { 309 + NL_SET_ERR_MSG_MOD(extack, "XDP expects number of rx queues not less than peer tx queues"); 310 + err = -ENOSPC; 311 + goto err; 312 + } 313 + 314 + if (dev->flags & IFF_UP) { 315 + err = veth_enable_xdp(dev); 316 + if (err) { 317 + NL_SET_ERR_MSG_MOD(extack, "Setup for XDP failed"); 318 + goto err; 319 + } 320 + } 321 + 322 + if (!old_prog) { 323 + peer->hw_features &= ~NETIF_F_GSO_SOFTWARE; 324 + peer->max_mtu = 
max_mtu; 325 + } 326 + } 327 + 328 + if (old_prog) { 329 + if (!prog) { 330 + if (dev->flags & IFF_UP) 331 + veth_disable_xdp(dev); 332 + 333 + if (peer) { 334 + peer->hw_features |= NETIF_F_GSO_SOFTWARE; 335 + peer->max_mtu = ETH_MAX_MTU; 336 + } 337 + } 338 + bpf_prog_put(old_prog); 339 + } 340 + 341 + if ((!!old_prog ^ !!prog) && peer) 342 + netdev_update_features(peer); 343 + 344 + return 0; 345 + err: 346 + priv->_xdp_prog = old_prog; 347 + 348 + return err; 349 + } 350 + 351 + static u32 veth_xdp_query(struct net_device *dev) 352 + { 353 + struct veth_priv *priv = netdev_priv(dev); 354 + const struct bpf_prog *xdp_prog; 355 + 356 + xdp_prog = priv->_xdp_prog; 357 + if (xdp_prog) 358 + return xdp_prog->aux->id; 359 + 360 + return 0; 361 + } 362 + 363 + static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp) 364 + { 365 + switch (xdp->command) { 366 + case XDP_SETUP_PROG: 367 + return veth_xdp_set(dev, xdp->prog, xdp->extack); 368 + case XDP_QUERY_PROG: 369 + xdp->prog_id = veth_xdp_query(dev); 370 + return 0; 371 + default: 372 + return -EINVAL; 373 + } 374 + } 375 + 874 376 static const struct net_device_ops veth_netdev_ops = { 875 377 .ndo_init = veth_dev_init, 876 378 .ndo_open = veth_open, ··· 980 288 .ndo_poll_controller = veth_poll_controller, 981 289 #endif 982 290 .ndo_get_iflink = veth_get_iflink, 291 + .ndo_fix_features = veth_fix_features, 983 292 .ndo_features_check = passthru_features_check, 984 293 .ndo_set_rx_headroom = veth_set_rx_headroom, 294 + .ndo_bpf = veth_xdp, 295 + .ndo_xdp_xmit = veth_xdp_xmit, 985 296 }; 986 297 987 298 #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \ ··· 1040 345 return 0; 1041 346 } 1042 347 348 + static int veth_alloc_queues(struct net_device *dev) 349 + { 350 + struct veth_priv *priv = netdev_priv(dev); 351 + 352 + priv->rq = kcalloc(dev->num_rx_queues, sizeof(*priv->rq), GFP_KERNEL); 353 + if (!priv->rq) 354 + return -ENOMEM; 355 + 356 + return 0; 357 + } 358 + 359 + static 
void veth_free_queues(struct net_device *dev) 360 + { 361 + struct veth_priv *priv = netdev_priv(dev); 362 + 363 + kfree(priv->rq); 364 + } 365 + 1043 366 static struct rtnl_link_ops veth_link_ops; 1044 367 1045 368 static int veth_newlink(struct net *src_net, struct net_device *dev, 1046 369 struct nlattr *tb[], struct nlattr *data[], 1047 370 struct netlink_ext_ack *extack) 1048 371 { 1049 - int err; 372 + int err, i; 1050 373 struct net_device *peer; 1051 374 struct veth_priv *priv; 1052 375 char ifname[IFNAMSIZ]; ··· 1117 404 return PTR_ERR(peer); 1118 405 } 1119 406 407 + err = veth_alloc_queues(peer); 408 + if (err) { 409 + put_net(net); 410 + goto err_peer_alloc_queues; 411 + } 412 + 1120 413 if (!ifmp || !tbp[IFLA_ADDRESS]) 1121 414 eth_hw_addr_random(peer); 1122 415 ··· 1151 432 * should be re-allocated 1152 433 */ 1153 434 435 + err = veth_alloc_queues(dev); 436 + if (err) 437 + goto err_alloc_queues; 438 + 1154 439 if (tb[IFLA_ADDRESS] == NULL) 1155 440 eth_hw_addr_random(dev); 1156 441 ··· 1174 451 */ 1175 452 1176 453 priv = netdev_priv(dev); 454 + for (i = 0; i < dev->real_num_rx_queues; i++) 455 + priv->rq[i].dev = dev; 1177 456 rcu_assign_pointer(priv->peer, peer); 1178 457 1179 458 priv = netdev_priv(peer); 459 + for (i = 0; i < peer->real_num_rx_queues; i++) 460 + priv->rq[i].dev = peer; 1180 461 rcu_assign_pointer(priv->peer, dev); 462 + 1181 463 return 0; 1182 464 1183 465 err_register_dev: 466 + veth_free_queues(dev); 467 + err_alloc_queues: 1184 468 /* nothing to do */ 1185 469 err_configure_peer: 1186 470 unregister_netdevice(peer); 1187 471 return err; 1188 472 1189 473 err_register_peer: 474 + veth_free_queues(peer); 475 + err_peer_alloc_queues: 1190 476 free_netdev(peer); 1191 477 return err; 1192 478 }
+37 -4
include/linux/bpf.h
··· 23 23 struct bpf_map; 24 24 struct sock; 25 25 struct seq_file; 26 - struct btf; 26 + struct btf_type; 27 27 28 28 /* map is generic key/value storage optionally accesible by eBPF programs */ 29 29 struct bpf_map_ops { ··· 48 48 u32 (*map_fd_sys_lookup_elem)(void *ptr); 49 49 void (*map_seq_show_elem)(struct bpf_map *map, void *key, 50 50 struct seq_file *m); 51 - int (*map_check_btf)(const struct bpf_map *map, const struct btf *btf, 52 - u32 key_type_id, u32 value_type_id); 51 + int (*map_check_btf)(const struct bpf_map *map, 52 + const struct btf_type *key_type, 53 + const struct btf_type *value_type); 53 54 }; 54 55 55 56 struct bpf_map { ··· 119 118 120 119 static inline bool bpf_map_support_seq_show(const struct bpf_map *map) 121 120 { 122 - return map->ops->map_seq_show_elem && map->ops->map_check_btf; 121 + return map->btf && map->ops->map_seq_show_elem; 123 122 } 123 + 124 + int map_check_no_btf(const struct bpf_map *map, 125 + const struct btf_type *key_type, 126 + const struct btf_type *value_type); 124 127 125 128 extern const struct bpf_map_ops bpf_map_offload_ops; 126 129 ··· 529 524 } 530 525 531 526 struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type); 527 + int array_map_alloc_check(union bpf_attr *attr); 532 528 533 529 #else /* !CONFIG_BPF_SYSCALL */ 534 530 static inline struct bpf_prog *bpf_prog_get(u32 ufd) ··· 774 768 { 775 769 } 776 770 #endif 771 + 772 + #if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) 773 + void bpf_sk_reuseport_detach(struct sock *sk); 774 + int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, 775 + void *value); 776 + int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, 777 + void *value, u64 map_flags); 778 + #else 779 + static inline void bpf_sk_reuseport_detach(struct sock *sk) 780 + { 781 + } 782 + 783 + #ifdef CONFIG_BPF_SYSCALL 784 + static inline int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, 785 + void *key, void *value) 786 
+ { 787 + return -EOPNOTSUPP; 788 + } 789 + 790 + static inline int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, 791 + void *key, void *value, 792 + u64 map_flags) 793 + { 794 + return -EOPNOTSUPP; 795 + } 796 + #endif /* CONFIG_BPF_SYSCALL */ 797 + #endif /* defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) */ 777 798 778 799 /* verifier prototypes for helper functions called from eBPF programs */ 779 800 extern const struct bpf_func_proto bpf_map_lookup_elem_proto;
+6
include/linux/bpf_types.h
··· 29 29 #ifdef CONFIG_BPF_LIRC_MODE2 30 30 BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2) 31 31 #endif 32 + #ifdef CONFIG_INET 33 + BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport) 34 + #endif 32 35 33 36 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) 34 37 BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops) ··· 62 59 BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops) 63 60 #if defined(CONFIG_XDP_SOCKETS) 64 61 BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops) 62 + #endif 63 + #ifdef CONFIG_INET 64 + BPF_MAP_TYPE(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, reuseport_array_ops) 65 65 #endif 66 66 #endif
+30
include/linux/cgroup.h
··· 554 554 } 555 555 556 556 /** 557 + * cgroup_ancestor - find ancestor of cgroup 558 + * @cgrp: cgroup to find ancestor of 559 + * @ancestor_level: level of ancestor to find starting from root 560 + * 561 + * Find ancestor of cgroup at specified level starting from root if it exists 562 + * and return pointer to it. Return NULL if @cgrp doesn't have ancestor at 563 + * @ancestor_level. 564 + * 565 + * This function is safe to call as long as @cgrp is accessible. 566 + */ 567 + static inline struct cgroup *cgroup_ancestor(struct cgroup *cgrp, 568 + int ancestor_level) 569 + { 570 + struct cgroup *ptr; 571 + 572 + if (cgrp->level < ancestor_level) 573 + return NULL; 574 + 575 + for (ptr = cgrp; 576 + ptr && ptr->level > ancestor_level; 577 + ptr = cgroup_parent(ptr)) 578 + ; 579 + 580 + if (ptr && ptr->level == ancestor_level) 581 + return ptr; 582 + 583 + return NULL; 584 + } 585 + 586 + /** 557 587 * task_under_cgroup_hierarchy - test task's membership of cgroup ancestry 558 588 * @task: the task to be tested 559 589 * @ancestor: possible ancestor of @task's cgroup
+51
include/linux/filter.h
··· 32 32 struct bpf_prog_aux; 33 33 struct xdp_rxq_info; 34 34 struct xdp_buff; 35 + struct sock_reuseport; 35 36 36 37 /* ArgX, context and stack frame pointer register positions. Note, 37 38 * Arg1, Arg2, Arg3, etc are used as argument mappings of function ··· 538 537 struct list_head list; 539 538 }; 540 539 540 + struct bpf_redirect_info { 541 + u32 ifindex; 542 + u32 flags; 543 + struct bpf_map *map; 544 + struct bpf_map *map_to_flush; 545 + unsigned long map_owner; 546 + u32 kern_flags; 547 + }; 548 + 549 + DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); 550 + 551 + /* flags for bpf_redirect_info kern_flags */ 552 + #define BPF_RI_F_RF_NO_DIRECT BIT(0) /* no napi_direct on return_frame */ 553 + 541 554 /* Compute the linear packet data range [data, data_end) which 542 555 * will be accessed by various program types (cls_bpf, act_bpf, 543 556 * lwt, ...). Subsystems allowing direct data access must (!) ··· 753 738 int sk_attach_bpf(u32 ufd, struct sock *sk); 754 739 int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk); 755 740 int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk); 741 + void sk_reuseport_prog_free(struct bpf_prog *prog); 756 742 int sk_detach_filter(struct sock *sk); 757 743 int sk_get_filter(struct sock *sk, struct sock_filter __user *filter, 758 744 unsigned int len); ··· 780 764 781 765 struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, 782 766 const struct bpf_insn *patch, u32 len); 767 + 768 + static inline bool xdp_return_frame_no_direct(void) 769 + { 770 + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); 771 + 772 + return ri->kern_flags & BPF_RI_F_RF_NO_DIRECT; 773 + } 774 + 775 + static inline void xdp_set_return_frame_no_direct(void) 776 + { 777 + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); 778 + 779 + ri->kern_flags |= BPF_RI_F_RF_NO_DIRECT; 780 + } 781 + 782 + static inline void xdp_clear_return_frame_no_direct(void) 783 + { 784 + struct 
bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); 785 + 786 + ri->kern_flags &= ~BPF_RI_F_RF_NO_DIRECT; 787 + } 783 788 784 789 static inline int xdp_ok_fwd_dev(const struct net_device *fwd, 785 790 unsigned int pktlen) ··· 834 797 835 798 struct sock *do_sk_redirect_map(struct sk_buff *skb); 836 799 struct sock *do_msg_redirect_map(struct sk_msg_buff *md); 800 + 801 + #ifdef CONFIG_INET 802 + struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, 803 + struct bpf_prog *prog, struct sk_buff *skb, 804 + u32 hash); 805 + #else 806 + static inline struct sock * 807 + bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, 808 + struct bpf_prog *prog, struct sk_buff *skb, 809 + u32 hash) 810 + { 811 + return NULL; 812 + } 813 + #endif 837 814 838 815 #ifdef CONFIG_BPF_JIT 839 816 extern int bpf_jit_enable;
+1
include/linux/skbuff.h
··· 1038 1038 } 1039 1039 1040 1040 struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src); 1041 + void skb_headers_offset_update(struct sk_buff *skb, int off); 1041 1042 int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask); 1042 1043 struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t priority); 1043 1044 void skb_copy_header(struct sk_buff *new, const struct sk_buff *old);
+1
include/net/addrconf.h
··· 108 108 u32 banned_flags); 109 109 bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, 110 110 bool match_wildcard); 111 + bool inet_rcv_saddr_any(const struct sock *sk); 111 112 void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr); 112 113 void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr); 113 114
+15 -4
include/net/sock_reuseport.h
··· 5 5 #include <linux/filter.h> 6 6 #include <linux/skbuff.h> 7 7 #include <linux/types.h> 8 + #include <linux/spinlock.h> 8 9 #include <net/sock.h> 10 + 11 + extern spinlock_t reuseport_lock; 9 12 10 13 struct sock_reuseport { 11 14 struct rcu_head rcu; 12 15 13 16 u16 max_socks; /* length of socks */ 14 17 u16 num_socks; /* elements in socks */ 18 + /* The last synq overflow event timestamp of this 19 + * reuse->socks[] group. 20 + */ 21 + unsigned int synq_overflow_ts; 22 + /* ID stays the same even after the size of socks[] grows. */ 23 + unsigned int reuseport_id; 24 + bool bind_inany; 15 25 struct bpf_prog __rcu *prog; /* optional BPF sock selector */ 16 26 struct sock *socks[0]; /* array of sock pointers */ 17 27 }; 18 28 19 - extern int reuseport_alloc(struct sock *sk); 20 - extern int reuseport_add_sock(struct sock *sk, struct sock *sk2); 29 + extern int reuseport_alloc(struct sock *sk, bool bind_inany); 30 + extern int reuseport_add_sock(struct sock *sk, struct sock *sk2, 31 + bool bind_inany); 21 32 extern void reuseport_detach_sock(struct sock *sk); 22 33 extern struct sock *reuseport_select_sock(struct sock *sk, 23 34 u32 hash, 24 35 struct sk_buff *skb, 25 36 int hdr_len); 26 - extern struct bpf_prog *reuseport_attach_prog(struct sock *sk, 27 - struct bpf_prog *prog); 37 + extern int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog); 38 + int reuseport_get_id(struct sock_reuseport *reuse); 28 39 29 40 #endif /* _SOCK_REUSEPORT_H */
+28 -2
include/net/tcp.h
··· 36 36 #include <net/inet_hashtables.h> 37 37 #include <net/checksum.h> 38 38 #include <net/request_sock.h> 39 + #include <net/sock_reuseport.h> 39 40 #include <net/sock.h> 40 41 #include <net/snmp.h> 41 42 #include <net/ip.h> ··· 474 473 */ 475 474 static inline void tcp_synq_overflow(const struct sock *sk) 476 475 { 477 - unsigned int last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; 476 + unsigned int last_overflow; 478 477 unsigned int now = jiffies; 479 478 479 + if (sk->sk_reuseport) { 480 + struct sock_reuseport *reuse; 481 + 482 + reuse = rcu_dereference(sk->sk_reuseport_cb); 483 + if (likely(reuse)) { 484 + last_overflow = READ_ONCE(reuse->synq_overflow_ts); 485 + if (time_after32(now, last_overflow + HZ)) 486 + WRITE_ONCE(reuse->synq_overflow_ts, now); 487 + return; 488 + } 489 + } 490 + 491 + last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; 480 492 if (time_after32(now, last_overflow + HZ)) 481 493 tcp_sk(sk)->rx_opt.ts_recent_stamp = now; 482 494 } ··· 497 483 /* syncookies: no recent synqueue overflow on this listening socket? */ 498 484 static inline bool tcp_synq_no_recent_overflow(const struct sock *sk) 499 485 { 500 - unsigned int last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; 486 + unsigned int last_overflow; 501 487 unsigned int now = jiffies; 502 488 489 + if (sk->sk_reuseport) { 490 + struct sock_reuseport *reuse; 491 + 492 + reuse = rcu_dereference(sk->sk_reuseport_cb); 493 + if (likely(reuse)) { 494 + last_overflow = READ_ONCE(reuse->synq_overflow_ts); 495 + return time_after32(now, last_overflow + 496 + TCP_SYNCOOKIE_VALID); 497 + } 498 + } 499 + 500 + last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; 503 501 return time_after32(now, last_overflow + TCP_SYNCOOKIE_VALID); 504 502 } 505 503
+7
include/net/xdp.h
··· 84 84 struct net_device *dev_rx; /* used by cpumap */ 85 85 }; 86 86 87 + /* Clear kernel pointers in xdp_frame */ 88 + static inline void xdp_scrub_frame(struct xdp_frame *frame) 89 + { 90 + frame->data = NULL; 91 + frame->dev_rx = NULL; 92 + } 93 + 87 94 /* Convert xdp_buff to xdp_frame */ 88 95 static inline 89 96 struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
+55 -1
include/uapi/linux/bpf.h
··· 126 126 BPF_MAP_TYPE_XSKMAP, 127 127 BPF_MAP_TYPE_SOCKHASH, 128 128 BPF_MAP_TYPE_CGROUP_STORAGE, 129 + BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, 129 130 }; 130 131 131 132 enum bpf_prog_type { ··· 151 150 BPF_PROG_TYPE_CGROUP_SOCK_ADDR, 152 151 BPF_PROG_TYPE_LWT_SEG6LOCAL, 153 152 BPF_PROG_TYPE_LIRC_MODE2, 153 + BPF_PROG_TYPE_SK_REUSEPORT, 154 154 }; 155 155 156 156 enum bpf_attach_type { ··· 2093 2091 * Return 2094 2092 * The id is returned or 0 in case the id could not be retrieved. 2095 2093 * 2094 + * u64 bpf_skb_ancestor_cgroup_id(struct sk_buff *skb, int ancestor_level) 2095 + * Description 2096 + * Return id of cgroup v2 that is ancestor of cgroup associated 2097 + * with the *skb* at the *ancestor_level*. The root cgroup is at 2098 + * *ancestor_level* zero and each step down the hierarchy 2099 + * increments the level. If *ancestor_level* == level of cgroup 2100 + * associated with *skb*, then return value will be same as that 2101 + * of **bpf_skb_cgroup_id**\ (). 2102 + * 2103 + * The helper is useful to implement policies based on cgroups 2104 + * that are upper in hierarchy than immediate cgroup associated 2105 + * with *skb*. 2106 + * 2107 + * The format of returned id and helper limitations are same as in 2108 + * **bpf_skb_cgroup_id**\ (). 2109 + * Return 2110 + * The id is returned or 0 in case the id could not be retrieved. 2111 + * 2096 2112 * u64 bpf_get_current_cgroup_id(void) 2097 2113 * Return 2098 2114 * A 64-bit integer containing the current cgroup id based ··· 2133 2113 * the shared data. 2134 2114 * Return 2135 2115 * Pointer to the local storage area. 2116 + * 2117 + * int bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags) 2118 + * Description 2119 + * Select a SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_ARRAY map 2120 + * It checks the selected sk is matching the incoming 2121 + * request in the skb. 2122 + * Return 2123 + * 0 on success, or a negative error in case of failure. 
2136 2124 */ 2137 2125 #define __BPF_FUNC_MAPPER(FN) \ 2138 2126 FN(unspec), \ ··· 2224 2196 FN(rc_keydown), \ 2225 2197 FN(skb_cgroup_id), \ 2226 2198 FN(get_current_cgroup_id), \ 2227 - FN(get_local_storage), 2199 + FN(get_local_storage), \ 2200 + FN(sk_select_reuseport), \ 2201 + FN(skb_ancestor_cgroup_id), 2228 2202 2229 2203 /* integer value in 'imm' field of BPF_CALL instruction selects which helper 2230 2204 * function eBPF program intends to call ··· 2441 2411 __u32 local_ip6[4]; /* Stored in network byte order */ 2442 2412 __u32 remote_port; /* Stored in network byte order */ 2443 2413 __u32 local_port; /* stored in host byte order */ 2414 + }; 2415 + 2416 + struct sk_reuseport_md { 2417 + /* 2418 + * Start of directly accessible data. It begins from 2419 + * the tcp/udp header. 2420 + */ 2421 + void *data; 2422 + void *data_end; /* End of directly accessible data */ 2423 + /* 2424 + * Total length of packet (starting from the tcp/udp header). 2425 + * Note that the directly accessible bytes (data_end - data) 2426 + * could be less than this "len". Those bytes could be 2427 + * indirectly read by a helper "bpf_skb_load_bytes()". 2428 + */ 2429 + __u32 len; 2430 + /* 2431 + * Eth protocol in the mac header (network byte order). e.g. 2432 + * ETH_P_IP(0x0800) and ETH_P_IPV6(0x86DD) 2433 + */ 2434 + __u32 eth_protocol; 2435 + __u32 ip_protocol; /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */ 2436 + __u32 bind_inany; /* Is sock bound to an INANY address? */ 2437 + __u32 hash; /* A hash of the packet 4 tuples */ 2444 2438 }; 2445 2439 2446 2440 #define BPF_TAG_SIZE 8
+3
kernel/bpf/Makefile
··· 23 23 obj-$(CONFIG_BPF_SYSCALL) += stackmap.o 24 24 endif 25 25 obj-$(CONFIG_CGROUP_BPF) += cgroup.o 26 + ifeq ($(CONFIG_INET),y) 27 + obj-$(CONFIG_BPF_SYSCALL) += reuseport_array.o 28 + endif
+13 -15
kernel/bpf/arraymap.c
··· 54 54 } 55 55 56 56 /* Called from syscall */ 57 - static int array_map_alloc_check(union bpf_attr *attr) 57 + int array_map_alloc_check(union bpf_attr *attr) 58 58 { 59 59 bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; 60 60 int numa_node = bpf_map_attr_numa_node(attr); ··· 358 358 rcu_read_unlock(); 359 359 } 360 360 361 - static int array_map_check_btf(const struct bpf_map *map, const struct btf *btf, 362 - u32 btf_key_id, u32 btf_value_id) 361 + static int array_map_check_btf(const struct bpf_map *map, 362 + const struct btf_type *key_type, 363 + const struct btf_type *value_type) 363 364 { 364 - const struct btf_type *key_type, *value_type; 365 - u32 key_size, value_size; 366 365 u32 int_data; 367 366 368 - key_type = btf_type_id_size(btf, &btf_key_id, &key_size); 369 - if (!key_type || BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) 367 + if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) 370 368 return -EINVAL; 371 369 372 370 int_data = *(u32 *)(key_type + 1); 373 - /* bpf array can only take a u32 key. This check makes 374 - * sure that the btf matches the attr used during map_create. 371 + /* bpf array can only take a u32 key. This check makes sure 372 + * that the btf matches the attr used during map_create. 
375 373 */ 376 - if (BTF_INT_BITS(int_data) != 32 || key_size != 4 || 377 - BTF_INT_OFFSET(int_data)) 378 - return -EINVAL; 379 - 380 - value_type = btf_type_id_size(btf, &btf_value_id, &value_size); 381 - if (!value_type || value_size != map->value_size) 374 + if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data)) 382 375 return -EINVAL; 383 376 384 377 return 0; ··· 398 405 .map_lookup_elem = percpu_array_map_lookup_elem, 399 406 .map_update_elem = array_map_update_elem, 400 407 .map_delete_elem = array_map_delete_elem, 408 + .map_check_btf = array_map_check_btf, 401 409 }; 402 410 403 411 static int fd_array_map_alloc_check(union bpf_attr *attr) ··· 540 546 .map_fd_put_ptr = prog_fd_array_put_ptr, 541 547 .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem, 542 548 .map_release_uref = bpf_fd_array_map_clear, 549 + .map_check_btf = map_check_no_btf, 543 550 }; 544 551 545 552 static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, ··· 629 634 .map_fd_get_ptr = perf_event_fd_array_get_ptr, 630 635 .map_fd_put_ptr = perf_event_fd_array_put_ptr, 631 636 .map_release = perf_event_fd_array_release, 637 + .map_check_btf = map_check_no_btf, 632 638 }; 633 639 634 640 #ifdef CONFIG_CGROUPS ··· 661 665 .map_delete_elem = fd_array_map_delete_elem, 662 666 .map_fd_get_ptr = cgroup_fd_array_get_ptr, 663 667 .map_fd_put_ptr = cgroup_fd_array_put_ptr, 668 + .map_check_btf = map_check_no_btf, 664 669 }; 665 670 #endif 666 671 ··· 746 749 .map_fd_put_ptr = bpf_map_fd_put_ptr, 747 750 .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, 748 751 .map_gen_lookup = array_of_map_gen_lookup, 752 + .map_check_btf = map_check_no_btf, 749 753 };
+1
kernel/bpf/cpumap.c
··· 555 555 .map_update_elem = cpu_map_update_elem, 556 556 .map_lookup_elem = cpu_map_lookup_elem, 557 557 .map_get_next_key = cpu_map_get_next_key, 558 + .map_check_btf = map_check_no_btf, 558 559 }; 559 560 560 561 static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
+1
kernel/bpf/devmap.c
··· 488 488 .map_lookup_elem = dev_map_lookup_elem, 489 489 .map_update_elem = dev_map_update_elem, 490 490 .map_delete_elem = dev_map_delete_elem, 491 + .map_check_btf = map_check_no_btf, 491 492 }; 492 493 493 494 static int dev_map_notification(struct notifier_block *notifier,
+26
kernel/bpf/hashtab.c
··· 11 11 * General Public License for more details. 12 12 */ 13 13 #include <linux/bpf.h> 14 + #include <linux/btf.h> 14 15 #include <linux/jhash.h> 15 16 #include <linux/filter.h> 16 17 #include <linux/rculist_nulls.h> 18 + #include <uapi/linux/btf.h> 17 19 #include "percpu_freelist.h" 18 20 #include "bpf_lru_list.h" 19 21 #include "map_in_map.h" ··· 1164 1162 kfree(htab); 1165 1163 } 1166 1164 1165 + static void htab_map_seq_show_elem(struct bpf_map *map, void *key, 1166 + struct seq_file *m) 1167 + { 1168 + void *value; 1169 + 1170 + rcu_read_lock(); 1171 + 1172 + value = htab_map_lookup_elem(map, key); 1173 + if (!value) { 1174 + rcu_read_unlock(); 1175 + return; 1176 + } 1177 + 1178 + btf_type_seq_show(map->btf, map->btf_key_type_id, key, m); 1179 + seq_puts(m, ": "); 1180 + btf_type_seq_show(map->btf, map->btf_value_type_id, value, m); 1181 + seq_puts(m, "\n"); 1182 + 1183 + rcu_read_unlock(); 1184 + } 1185 + 1167 1186 const struct bpf_map_ops htab_map_ops = { 1168 1187 .map_alloc_check = htab_map_alloc_check, 1169 1188 .map_alloc = htab_map_alloc, ··· 1194 1171 .map_update_elem = htab_map_update_elem, 1195 1172 .map_delete_elem = htab_map_delete_elem, 1196 1173 .map_gen_lookup = htab_map_gen_lookup, 1174 + .map_seq_show_elem = htab_map_seq_show_elem, 1197 1175 }; 1198 1176 1199 1177 const struct bpf_map_ops htab_lru_map_ops = { ··· 1206 1182 .map_update_elem = htab_lru_map_update_elem, 1207 1183 .map_delete_elem = htab_lru_map_delete_elem, 1208 1184 .map_gen_lookup = htab_lru_map_gen_lookup, 1185 + .map_seq_show_elem = htab_map_seq_show_elem, 1209 1186 }; 1210 1187 1211 1188 /* Called from eBPF program */ ··· 1433 1408 .map_fd_put_ptr = bpf_map_fd_put_ptr, 1434 1409 .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, 1435 1410 .map_gen_lookup = htab_of_map_gen_lookup, 1411 + .map_check_btf = map_check_no_btf, 1436 1412 };
+7 -4
kernel/bpf/inode.c
··· 196 196 { 197 197 struct bpf_map *map = seq_file_to_map(m); 198 198 void *key = map_iter(m)->key; 199 + void *prev_key; 199 200 200 201 if (map_iter(m)->done) 201 202 return NULL; 202 203 203 204 if (unlikely(v == SEQ_START_TOKEN)) 204 - goto done; 205 + prev_key = NULL; 206 + else 207 + prev_key = key; 205 208 206 - if (map->ops->map_get_next_key(map, key, key)) { 209 + if (map->ops->map_get_next_key(map, prev_key, key)) { 207 210 map_iter(m)->done = true; 208 211 return NULL; 209 212 } 210 213 211 - done: 212 214 ++(*pos); 213 215 return key; 214 216 } ··· 334 332 struct bpf_map *map = arg; 335 333 336 334 return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops, 337 - map->btf ? &bpffs_map_fops : &bpffs_obj_fops); 335 + bpf_map_support_seq_show(map) ? 336 + &bpffs_map_fops : &bpffs_obj_fops); 338 337 } 339 338 340 339 static struct dentry *
+1
kernel/bpf/local_storage.c
··· 246 246 .map_lookup_elem = cgroup_storage_lookup_elem, 247 247 .map_update_elem = cgroup_storage_update_elem, 248 248 .map_delete_elem = cgroup_storage_delete_elem, 249 + .map_check_btf = map_check_no_btf, 249 250 }; 250 251 251 252 int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map)
+12
kernel/bpf/lpm_trie.c
··· 10 10 */ 11 11 12 12 #include <linux/bpf.h> 13 + #include <linux/btf.h> 13 14 #include <linux/err.h> 14 15 #include <linux/slab.h> 15 16 #include <linux/spinlock.h> 16 17 #include <linux/vmalloc.h> 17 18 #include <net/ipv6.h> 19 + #include <uapi/linux/btf.h> 18 20 19 21 /* Intermediate node */ 20 22 #define LPM_TREE_NODE_FLAG_IM BIT(0) ··· 688 686 return err; 689 687 } 690 688 689 + static int trie_check_btf(const struct bpf_map *map, 690 + const struct btf_type *key_type, 691 + const struct btf_type *value_type) 692 + { 693 + /* Keys must have struct bpf_lpm_trie_key embedded. */ 694 + return BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT ? 695 + -EINVAL : 0; 696 + } 697 + 691 698 const struct bpf_map_ops trie_map_ops = { 692 699 .map_alloc = trie_alloc, 693 700 .map_free = trie_free, ··· 704 693 .map_lookup_elem = trie_lookup_elem, 705 694 .map_update_elem = trie_update_elem, 706 695 .map_delete_elem = trie_delete_elem, 696 + .map_check_btf = trie_check_btf, 707 697 };
+363
kernel/bpf/reuseport_array.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Copyright (c) 2018 Facebook 4 + */ 5 + #include <linux/bpf.h> 6 + #include <linux/err.h> 7 + #include <linux/sock_diag.h> 8 + #include <net/sock_reuseport.h> 9 + 10 + struct reuseport_array { 11 + struct bpf_map map; 12 + struct sock __rcu *ptrs[]; 13 + }; 14 + 15 + static struct reuseport_array *reuseport_array(struct bpf_map *map) 16 + { 17 + return (struct reuseport_array *)map; 18 + } 19 + 20 + /* The caller must hold the reuseport_lock */ 21 + void bpf_sk_reuseport_detach(struct sock *sk) 22 + { 23 + struct sock __rcu **socks; 24 + 25 + write_lock_bh(&sk->sk_callback_lock); 26 + socks = sk->sk_user_data; 27 + if (socks) { 28 + WRITE_ONCE(sk->sk_user_data, NULL); 29 + /* 30 + * Do not move this NULL assignment outside of 31 + * sk->sk_callback_lock because there is 32 + * a race with reuseport_array_free() 33 + * which does not hold the reuseport_lock. 34 + */ 35 + RCU_INIT_POINTER(*socks, NULL); 36 + } 37 + write_unlock_bh(&sk->sk_callback_lock); 38 + } 39 + 40 + static int reuseport_array_alloc_check(union bpf_attr *attr) 41 + { 42 + if (attr->value_size != sizeof(u32) && 43 + attr->value_size != sizeof(u64)) 44 + return -EINVAL; 45 + 46 + return array_map_alloc_check(attr); 47 + } 48 + 49 + static void *reuseport_array_lookup_elem(struct bpf_map *map, void *key) 50 + { 51 + struct reuseport_array *array = reuseport_array(map); 52 + u32 index = *(u32 *)key; 53 + 54 + if (unlikely(index >= array->map.max_entries)) 55 + return NULL; 56 + 57 + return rcu_dereference(array->ptrs[index]); 58 + } 59 + 60 + /* Called from syscall only */ 61 + static int reuseport_array_delete_elem(struct bpf_map *map, void *key) 62 + { 63 + struct reuseport_array *array = reuseport_array(map); 64 + u32 index = *(u32 *)key; 65 + struct sock *sk; 66 + int err; 67 + 68 + if (index >= map->max_entries) 69 + return -E2BIG; 70 + 71 + if (!rcu_access_pointer(array->ptrs[index])) 72 + return -ENOENT; 73 + 74 + 
spin_lock_bh(&reuseport_lock); 75 + 76 + sk = rcu_dereference_protected(array->ptrs[index], 77 + lockdep_is_held(&reuseport_lock)); 78 + if (sk) { 79 + write_lock_bh(&sk->sk_callback_lock); 80 + WRITE_ONCE(sk->sk_user_data, NULL); 81 + RCU_INIT_POINTER(array->ptrs[index], NULL); 82 + write_unlock_bh(&sk->sk_callback_lock); 83 + err = 0; 84 + } else { 85 + err = -ENOENT; 86 + } 87 + 88 + spin_unlock_bh(&reuseport_lock); 89 + 90 + return err; 91 + } 92 + 93 + static void reuseport_array_free(struct bpf_map *map) 94 + { 95 + struct reuseport_array *array = reuseport_array(map); 96 + struct sock *sk; 97 + u32 i; 98 + 99 + synchronize_rcu(); 100 + 101 + /* 102 + * ops->map_*_elem() will not be able to access this 103 + * array now. Hence, this function only races with 104 + * bpf_sk_reuseport_detach() which was triggerred by 105 + * close() or disconnect(). 106 + * 107 + * This function and bpf_sk_reuseport_detach() are 108 + * both removing sk from "array". Who removes it 109 + * first does not matter. 110 + * 111 + * The only concern here is bpf_sk_reuseport_detach() 112 + * may access "array" which is being freed here. 113 + * bpf_sk_reuseport_detach() access this "array" 114 + * through sk->sk_user_data _and_ with sk->sk_callback_lock 115 + * held which is enough because this "array" is not freed 116 + * until all sk->sk_user_data has stopped referencing this "array". 117 + * 118 + * Hence, due to the above, taking "reuseport_lock" is not 119 + * needed here. 120 + */ 121 + 122 + /* 123 + * Since reuseport_lock is not taken, sk is accessed under 124 + * rcu_read_lock() 125 + */ 126 + rcu_read_lock(); 127 + for (i = 0; i < map->max_entries; i++) { 128 + sk = rcu_dereference(array->ptrs[i]); 129 + if (sk) { 130 + write_lock_bh(&sk->sk_callback_lock); 131 + /* 132 + * No need for WRITE_ONCE(). At this point, 133 + * no one is reading it without taking the 134 + * sk->sk_callback_lock. 
135 + */ 136 + sk->sk_user_data = NULL; 137 + write_unlock_bh(&sk->sk_callback_lock); 138 + RCU_INIT_POINTER(array->ptrs[i], NULL); 139 + } 140 + } 141 + rcu_read_unlock(); 142 + 143 + /* 144 + * Once reaching here, all sk->sk_user_data is not 145 + * referenceing this "array". "array" can be freed now. 146 + */ 147 + bpf_map_area_free(array); 148 + } 149 + 150 + static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) 151 + { 152 + int err, numa_node = bpf_map_attr_numa_node(attr); 153 + struct reuseport_array *array; 154 + u64 cost, array_size; 155 + 156 + if (!capable(CAP_SYS_ADMIN)) 157 + return ERR_PTR(-EPERM); 158 + 159 + array_size = sizeof(*array); 160 + array_size += (u64)attr->max_entries * sizeof(struct sock *); 161 + 162 + /* make sure there is no u32 overflow later in round_up() */ 163 + cost = array_size; 164 + if (cost >= U32_MAX - PAGE_SIZE) 165 + return ERR_PTR(-ENOMEM); 166 + cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; 167 + 168 + err = bpf_map_precharge_memlock(cost); 169 + if (err) 170 + return ERR_PTR(err); 171 + 172 + /* allocate all map elements and zero-initialize them */ 173 + array = bpf_map_area_alloc(array_size, numa_node); 174 + if (!array) 175 + return ERR_PTR(-ENOMEM); 176 + 177 + /* copy mandatory map attributes */ 178 + bpf_map_init_from_attr(&array->map, attr); 179 + array->map.pages = cost; 180 + 181 + return &array->map; 182 + } 183 + 184 + int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, 185 + void *value) 186 + { 187 + struct sock *sk; 188 + int err; 189 + 190 + if (map->value_size != sizeof(u64)) 191 + return -ENOSPC; 192 + 193 + rcu_read_lock(); 194 + sk = reuseport_array_lookup_elem(map, key); 195 + if (sk) { 196 + *(u64 *)value = sock_gen_cookie(sk); 197 + err = 0; 198 + } else { 199 + err = -ENOENT; 200 + } 201 + rcu_read_unlock(); 202 + 203 + return err; 204 + } 205 + 206 + static int 207 + reuseport_array_update_check(const struct reuseport_array *array, 208 + const struct sock *nsk, 
209 + const struct sock *osk, 210 + const struct sock_reuseport *nsk_reuse, 211 + u32 map_flags) 212 + { 213 + if (osk && map_flags == BPF_NOEXIST) 214 + return -EEXIST; 215 + 216 + if (!osk && map_flags == BPF_EXIST) 217 + return -ENOENT; 218 + 219 + if (nsk->sk_protocol != IPPROTO_UDP && nsk->sk_protocol != IPPROTO_TCP) 220 + return -ENOTSUPP; 221 + 222 + if (nsk->sk_family != AF_INET && nsk->sk_family != AF_INET6) 223 + return -ENOTSUPP; 224 + 225 + if (nsk->sk_type != SOCK_STREAM && nsk->sk_type != SOCK_DGRAM) 226 + return -ENOTSUPP; 227 + 228 + /* 229 + * sk must be hashed (i.e. listening in the TCP case or binded 230 + * in the UDP case) and 231 + * it must also be a SO_REUSEPORT sk (i.e. reuse cannot be NULL). 232 + * 233 + * Also, sk will be used in bpf helper that is protected by 234 + * rcu_read_lock(). 235 + */ 236 + if (!sock_flag(nsk, SOCK_RCU_FREE) || !sk_hashed(nsk) || !nsk_reuse) 237 + return -EINVAL; 238 + 239 + /* READ_ONCE because the sk->sk_callback_lock may not be held here */ 240 + if (READ_ONCE(nsk->sk_user_data)) 241 + return -EBUSY; 242 + 243 + return 0; 244 + } 245 + 246 + /* 247 + * Called from syscall only. 248 + * The "nsk" in the fd refcnt. 249 + * The "osk" and "reuse" are protected by reuseport_lock. 
250 + */ 251 + int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, 252 + void *value, u64 map_flags) 253 + { 254 + struct reuseport_array *array = reuseport_array(map); 255 + struct sock *free_osk = NULL, *osk, *nsk; 256 + struct sock_reuseport *reuse; 257 + u32 index = *(u32 *)key; 258 + struct socket *socket; 259 + int err, fd; 260 + 261 + if (map_flags > BPF_EXIST) 262 + return -EINVAL; 263 + 264 + if (index >= map->max_entries) 265 + return -E2BIG; 266 + 267 + if (map->value_size == sizeof(u64)) { 268 + u64 fd64 = *(u64 *)value; 269 + 270 + if (fd64 > S32_MAX) 271 + return -EINVAL; 272 + fd = fd64; 273 + } else { 274 + fd = *(int *)value; 275 + } 276 + 277 + socket = sockfd_lookup(fd, &err); 278 + if (!socket) 279 + return err; 280 + 281 + nsk = socket->sk; 282 + if (!nsk) { 283 + err = -EINVAL; 284 + goto put_file; 285 + } 286 + 287 + /* Quick checks before taking reuseport_lock */ 288 + err = reuseport_array_update_check(array, nsk, 289 + rcu_access_pointer(array->ptrs[index]), 290 + rcu_access_pointer(nsk->sk_reuseport_cb), 291 + map_flags); 292 + if (err) 293 + goto put_file; 294 + 295 + spin_lock_bh(&reuseport_lock); 296 + /* 297 + * Some of the checks only need reuseport_lock 298 + * but it is done under sk_callback_lock also 299 + * for simplicity reason. 
300 + */ 301 + write_lock_bh(&nsk->sk_callback_lock); 302 + 303 + osk = rcu_dereference_protected(array->ptrs[index], 304 + lockdep_is_held(&reuseport_lock)); 305 + reuse = rcu_dereference_protected(nsk->sk_reuseport_cb, 306 + lockdep_is_held(&reuseport_lock)); 307 + err = reuseport_array_update_check(array, nsk, osk, reuse, map_flags); 308 + if (err) 309 + goto put_file_unlock; 310 + 311 + /* Ensure reuse->reuseport_id is set */ 312 + err = reuseport_get_id(reuse); 313 + if (err < 0) 314 + goto put_file_unlock; 315 + 316 + WRITE_ONCE(nsk->sk_user_data, &array->ptrs[index]); 317 + rcu_assign_pointer(array->ptrs[index], nsk); 318 + free_osk = osk; 319 + err = 0; 320 + 321 + put_file_unlock: 322 + write_unlock_bh(&nsk->sk_callback_lock); 323 + 324 + if (free_osk) { 325 + write_lock_bh(&free_osk->sk_callback_lock); 326 + WRITE_ONCE(free_osk->sk_user_data, NULL); 327 + write_unlock_bh(&free_osk->sk_callback_lock); 328 + } 329 + 330 + spin_unlock_bh(&reuseport_lock); 331 + put_file: 332 + fput(socket->file); 333 + return err; 334 + } 335 + 336 + /* Called from syscall */ 337 + static int reuseport_array_get_next_key(struct bpf_map *map, void *key, 338 + void *next_key) 339 + { 340 + struct reuseport_array *array = reuseport_array(map); 341 + u32 index = key ? *(u32 *)key : U32_MAX; 342 + u32 *next = (u32 *)next_key; 343 + 344 + if (index >= array->map.max_entries) { 345 + *next = 0; 346 + return 0; 347 + } 348 + 349 + if (index == array->map.max_entries - 1) 350 + return -ENOENT; 351 + 352 + *next = index + 1; 353 + return 0; 354 + } 355 + 356 + const struct bpf_map_ops reuseport_array_ops = { 357 + .map_alloc_check = reuseport_array_alloc_check, 358 + .map_alloc = reuseport_array_alloc, 359 + .map_free = reuseport_array_free, 360 + .map_lookup_elem = reuseport_array_lookup_elem, 361 + .map_get_next_key = reuseport_array_get_next_key, 362 + .map_delete_elem = reuseport_array_delete_elem, 363 + };
+2
kernel/bpf/sockmap.c
··· 2498 2498 .map_update_elem = sock_map_update_elem, 2499 2499 .map_delete_elem = sock_map_delete_elem, 2500 2500 .map_release_uref = sock_map_release, 2501 + .map_check_btf = map_check_no_btf, 2501 2502 }; 2502 2503 2503 2504 const struct bpf_map_ops sock_hash_ops = { ··· 2509 2508 .map_update_elem = sock_hash_update_elem, 2510 2509 .map_delete_elem = sock_hash_delete_elem, 2511 2510 .map_release_uref = sock_map_release, 2511 + .map_check_btf = map_check_no_btf, 2512 2512 }; 2513 2513 2514 2514 BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
+1
kernel/bpf/stackmap.c
··· 607 607 .map_lookup_elem = stack_map_lookup_elem, 608 608 .map_update_elem = stack_map_update_elem, 609 609 .map_delete_elem = stack_map_delete_elem, 610 + .map_check_btf = map_check_no_btf, 610 611 }; 611 612 612 613 static int __init stack_map_init(void)
+38 -4
kernel/bpf/syscall.c
··· 103 103 const struct bpf_map_ops bpf_map_offload_ops = { 104 104 .map_alloc = bpf_map_offload_map_alloc, 105 105 .map_free = bpf_map_offload_map_free, 106 + .map_check_btf = map_check_no_btf, 106 107 }; 107 108 108 109 static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) ··· 456 455 return 0; 457 456 } 458 457 458 + int map_check_no_btf(const struct bpf_map *map, 459 + const struct btf_type *key_type, 460 + const struct btf_type *value_type) 461 + { 462 + return -ENOTSUPP; 463 + } 464 + 465 + static int map_check_btf(const struct bpf_map *map, const struct btf *btf, 466 + u32 btf_key_id, u32 btf_value_id) 467 + { 468 + const struct btf_type *key_type, *value_type; 469 + u32 key_size, value_size; 470 + int ret = 0; 471 + 472 + key_type = btf_type_id_size(btf, &btf_key_id, &key_size); 473 + if (!key_type || key_size != map->key_size) 474 + return -EINVAL; 475 + 476 + value_type = btf_type_id_size(btf, &btf_value_id, &value_size); 477 + if (!value_type || value_size != map->value_size) 478 + return -EINVAL; 479 + 480 + if (map->ops->map_check_btf) 481 + ret = map->ops->map_check_btf(map, key_type, value_type); 482 + 483 + return ret; 484 + } 485 + 459 486 #define BPF_MAP_CREATE_LAST_FIELD btf_value_type_id 460 487 /* called via syscall */ 461 488 static int map_create(union bpf_attr *attr) ··· 518 489 atomic_set(&map->refcnt, 1); 519 490 atomic_set(&map->usercnt, 1); 520 491 521 - if (bpf_map_support_seq_show(map) && 522 - (attr->btf_key_type_id || attr->btf_value_type_id)) { 492 + if (attr->btf_key_type_id || attr->btf_value_type_id) { 523 493 struct btf *btf; 524 494 525 495 if (!attr->btf_key_type_id || !attr->btf_value_type_id) { ··· 532 504 goto free_map_nouncharge; 533 505 } 534 506 535 - err = map->ops->map_check_btf(map, btf, attr->btf_key_type_id, 536 - attr->btf_value_type_id); 507 + err = map_check_btf(map, btf, attr->btf_key_type_id, 508 + attr->btf_value_type_id); 537 509 if (err) { 538 510 btf_put(btf); 539 511 goto free_map_nouncharge; 
··· 712 684 err = bpf_fd_array_map_lookup_elem(map, key, value); 713 685 } else if (IS_FD_HASH(map)) { 714 686 err = bpf_fd_htab_map_lookup_elem(map, key, value); 687 + } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { 688 + err = bpf_fd_reuseport_array_lookup_elem(map, key, value); 715 689 } else { 716 690 rcu_read_lock(); 717 691 ptr = map->ops->map_lookup_elem(map, key); ··· 820 790 err = bpf_fd_htab_map_update_elem(map, f.file, key, value, 821 791 attr->flags); 822 792 rcu_read_unlock(); 793 + } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { 794 + /* rcu_read_lock() is not needed */ 795 + err = bpf_fd_reuseport_array_update_elem(map, key, value, 796 + attr->flags); 823 797 } else { 824 798 rcu_read_lock(); 825 799 err = map->ops->map_update_elem(map, key, value, attr->flags);
+9
kernel/bpf/verifier.c
··· 1310 1310 case BPF_PROG_TYPE_LWT_IN: 1311 1311 case BPF_PROG_TYPE_LWT_OUT: 1312 1312 case BPF_PROG_TYPE_LWT_SEG6LOCAL: 1313 + case BPF_PROG_TYPE_SK_REUSEPORT: 1313 1314 /* dst_input() and dst_output() can't write for now */ 1314 1315 if (t == BPF_WRITE) 1315 1316 return false; ··· 2167 2166 func_id != BPF_FUNC_msg_redirect_hash) 2168 2167 goto error; 2169 2168 break; 2169 + case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY: 2170 + if (func_id != BPF_FUNC_sk_select_reuseport) 2171 + goto error; 2172 + break; 2170 2173 default: 2171 2174 break; 2172 2175 } ··· 2220 2215 break; 2221 2216 case BPF_FUNC_get_local_storage: 2222 2217 if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE) 2218 + goto error; 2219 + break; 2220 + case BPF_FUNC_sk_select_reuseport: 2221 + if (map->map_type != BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) 2223 2222 goto error; 2224 2223 break; 2225 2224 default:
+1 -2
kernel/bpf/xskmap.c
··· 227 227 .map_lookup_elem = xsk_map_lookup_elem, 228 228 .map_update_elem = xsk_map_update_elem, 229 229 .map_delete_elem = xsk_map_delete_elem, 230 + .map_check_btf = map_check_no_btf, 230 231 }; 231 - 232 -
+359 -54
net/core/filter.c
··· 1453 1453 return 0; 1454 1454 } 1455 1455 1456 - static int __reuseport_attach_prog(struct bpf_prog *prog, struct sock *sk) 1457 - { 1458 - struct bpf_prog *old_prog; 1459 - int err; 1460 - 1461 - if (bpf_prog_size(prog->len) > sysctl_optmem_max) 1462 - return -ENOMEM; 1463 - 1464 - if (sk_unhashed(sk) && sk->sk_reuseport) { 1465 - err = reuseport_alloc(sk); 1466 - if (err) 1467 - return err; 1468 - } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) { 1469 - /* The socket wasn't bound with SO_REUSEPORT */ 1470 - return -EINVAL; 1471 - } 1472 - 1473 - old_prog = reuseport_attach_prog(sk, prog); 1474 - if (old_prog) 1475 - bpf_prog_destroy(old_prog); 1476 - 1477 - return 0; 1478 - } 1479 - 1480 1456 static 1481 1457 struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk) 1482 1458 { ··· 1526 1550 if (IS_ERR(prog)) 1527 1551 return PTR_ERR(prog); 1528 1552 1529 - err = __reuseport_attach_prog(prog, sk); 1530 - if (err < 0) { 1531 - __bpf_prog_release(prog); 1532 - return err; 1533 - } 1553 + if (bpf_prog_size(prog->len) > sysctl_optmem_max) 1554 + err = -ENOMEM; 1555 + else 1556 + err = reuseport_attach_prog(sk, prog); 1534 1557 1535 - return 0; 1558 + if (err) 1559 + __bpf_prog_release(prog); 1560 + 1561 + return err; 1536 1562 } 1537 1563 1538 1564 static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk) ··· 1564 1586 1565 1587 int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk) 1566 1588 { 1567 - struct bpf_prog *prog = __get_bpf(ufd, sk); 1589 + struct bpf_prog *prog; 1568 1590 int err; 1569 1591 1592 + if (sock_flag(sk, SOCK_FILTER_LOCKED)) 1593 + return -EPERM; 1594 + 1595 + prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER); 1596 + if (IS_ERR(prog) && PTR_ERR(prog) == -EINVAL) 1597 + prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SK_REUSEPORT); 1570 1598 if (IS_ERR(prog)) 1571 1599 return PTR_ERR(prog); 1572 1600 1573 - err = __reuseport_attach_prog(prog, sk); 1574 - if (err < 0) { 1575 - bpf_prog_put(prog); 1576 - return err; 
1601 + if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) { 1602 + /* Like other non BPF_PROG_TYPE_SOCKET_FILTER 1603 + * bpf prog (e.g. sockmap). It depends on the 1604 + * limitation imposed by bpf_prog_load(). 1605 + * Hence, sysctl_optmem_max is not checked. 1606 + */ 1607 + if ((sk->sk_type != SOCK_STREAM && 1608 + sk->sk_type != SOCK_DGRAM) || 1609 + (sk->sk_protocol != IPPROTO_UDP && 1610 + sk->sk_protocol != IPPROTO_TCP) || 1611 + (sk->sk_family != AF_INET && 1612 + sk->sk_family != AF_INET6)) { 1613 + err = -ENOTSUPP; 1614 + goto err_prog_put; 1615 + } 1616 + } else { 1617 + /* BPF_PROG_TYPE_SOCKET_FILTER */ 1618 + if (bpf_prog_size(prog->len) > sysctl_optmem_max) { 1619 + err = -ENOMEM; 1620 + goto err_prog_put; 1621 + } 1577 1622 } 1578 1623 1579 - return 0; 1624 + err = reuseport_attach_prog(sk, prog); 1625 + err_prog_put: 1626 + if (err) 1627 + bpf_prog_put(prog); 1628 + 1629 + return err; 1630 + } 1631 + 1632 + void sk_reuseport_prog_free(struct bpf_prog *prog) 1633 + { 1634 + if (!prog) 1635 + return; 1636 + 1637 + if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) 1638 + bpf_prog_put(prog); 1639 + else 1640 + bpf_prog_destroy(prog); 1580 1641 } 1581 1642 1582 1643 struct bpf_scratchpad { ··· 2099 2082 .arg3_type = ARG_ANYTHING, 2100 2083 }; 2101 2084 2102 - struct redirect_info { 2103 - u32 ifindex; 2104 - u32 flags; 2105 - struct bpf_map *map; 2106 - struct bpf_map *map_to_flush; 2107 - unsigned long map_owner; 2108 - }; 2109 - 2110 - static DEFINE_PER_CPU(struct redirect_info, redirect_info); 2085 + DEFINE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); 2086 + EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info); 2111 2087 2112 2088 BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) 2113 2089 { 2114 - struct redirect_info *ri = this_cpu_ptr(&redirect_info); 2090 + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); 2115 2091 2116 2092 if (unlikely(flags & ~(BPF_F_INGRESS))) 2117 2093 return TC_ACT_SHOT; ··· 2117 2107 2118 2108 int 
skb_do_redirect(struct sk_buff *skb) 2119 2109 { 2120 - struct redirect_info *ri = this_cpu_ptr(&redirect_info); 2110 + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); 2121 2111 struct net_device *dev; 2122 2112 2123 2113 dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->ifindex); ··· 3210 3200 3211 3201 void xdp_do_flush_map(void) 3212 3202 { 3213 - struct redirect_info *ri = this_cpu_ptr(&redirect_info); 3203 + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); 3214 3204 struct bpf_map *map = ri->map_to_flush; 3215 3205 3216 3206 ri->map_to_flush = NULL; ··· 3255 3245 static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, 3256 3246 struct bpf_prog *xdp_prog) 3257 3247 { 3258 - struct redirect_info *ri = this_cpu_ptr(&redirect_info); 3248 + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); 3259 3249 unsigned long map_owner = ri->map_owner; 3260 3250 struct bpf_map *map = ri->map; 3261 3251 u32 index = ri->ifindex; ··· 3295 3285 int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, 3296 3286 struct bpf_prog *xdp_prog) 3297 3287 { 3298 - struct redirect_info *ri = this_cpu_ptr(&redirect_info); 3288 + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); 3299 3289 struct net_device *fwd; 3300 3290 u32 index = ri->ifindex; 3301 3291 int err; ··· 3327 3317 struct xdp_buff *xdp, 3328 3318 struct bpf_prog *xdp_prog) 3329 3319 { 3330 - struct redirect_info *ri = this_cpu_ptr(&redirect_info); 3320 + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); 3331 3321 unsigned long map_owner = ri->map_owner; 3332 3322 struct bpf_map *map = ri->map; 3333 3323 u32 index = ri->ifindex; ··· 3378 3368 int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, 3379 3369 struct xdp_buff *xdp, struct bpf_prog *xdp_prog) 3380 3370 { 3381 - struct redirect_info *ri = this_cpu_ptr(&redirect_info); 3371 + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); 3382 
3372 u32 index = ri->ifindex; 3383 3373 struct net_device *fwd; 3384 3374 int err = 0; ··· 3409 3399 3410 3400 BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) 3411 3401 { 3412 - struct redirect_info *ri = this_cpu_ptr(&redirect_info); 3402 + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); 3413 3403 3414 3404 if (unlikely(flags)) 3415 3405 return XDP_ABORTED; ··· 3433 3423 BPF_CALL_4(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags, 3434 3424 unsigned long, map_owner) 3435 3425 { 3436 - struct redirect_info *ri = this_cpu_ptr(&redirect_info); 3426 + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); 3437 3427 3438 3428 if (unlikely(flags)) 3439 3429 return XDP_ABORTED; ··· 3777 3767 .gpl_only = false, 3778 3768 .ret_type = RET_INTEGER, 3779 3769 .arg1_type = ARG_PTR_TO_CTX, 3770 + }; 3771 + 3772 + BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, 3773 + ancestor_level) 3774 + { 3775 + struct sock *sk = skb_to_full_sk(skb); 3776 + struct cgroup *ancestor; 3777 + struct cgroup *cgrp; 3778 + 3779 + if (!sk || !sk_fullsock(sk)) 3780 + return 0; 3781 + 3782 + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); 3783 + ancestor = cgroup_ancestor(cgrp, ancestor_level); 3784 + if (!ancestor) 3785 + return 0; 3786 + 3787 + return ancestor->kn->id.id; 3788 + } 3789 + 3790 + static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = { 3791 + .func = bpf_skb_ancestor_cgroup_id, 3792 + .gpl_only = false, 3793 + .ret_type = RET_INTEGER, 3794 + .arg1_type = ARG_PTR_TO_CTX, 3795 + .arg2_type = ARG_ANYTHING, 3780 3796 }; 3781 3797 #endif 3782 3798 ··· 4992 4956 #ifdef CONFIG_SOCK_CGROUP_DATA 4993 4957 case BPF_FUNC_skb_cgroup_id: 4994 4958 return &bpf_skb_cgroup_id_proto; 4959 + case BPF_FUNC_skb_ancestor_cgroup_id: 4960 + return &bpf_skb_ancestor_cgroup_id_proto; 4995 4961 #endif 4996 4962 default: 4997 4963 return bpf_base_func_proto(func_id); ··· 7058 7020 release_sock(sk); 7059 7021 return ret; 
7060 7022 } 7023 + 7024 + #ifdef CONFIG_INET 7025 + struct sk_reuseport_kern { 7026 + struct sk_buff *skb; 7027 + struct sock *sk; 7028 + struct sock *selected_sk; 7029 + void *data_end; 7030 + u32 hash; 7031 + u32 reuseport_id; 7032 + bool bind_inany; 7033 + }; 7034 + 7035 + static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern, 7036 + struct sock_reuseport *reuse, 7037 + struct sock *sk, struct sk_buff *skb, 7038 + u32 hash) 7039 + { 7040 + reuse_kern->skb = skb; 7041 + reuse_kern->sk = sk; 7042 + reuse_kern->selected_sk = NULL; 7043 + reuse_kern->data_end = skb->data + skb_headlen(skb); 7044 + reuse_kern->hash = hash; 7045 + reuse_kern->reuseport_id = reuse->reuseport_id; 7046 + reuse_kern->bind_inany = reuse->bind_inany; 7047 + } 7048 + 7049 + struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, 7050 + struct bpf_prog *prog, struct sk_buff *skb, 7051 + u32 hash) 7052 + { 7053 + struct sk_reuseport_kern reuse_kern; 7054 + enum sk_action action; 7055 + 7056 + bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, hash); 7057 + action = BPF_PROG_RUN(prog, &reuse_kern); 7058 + 7059 + if (action == SK_PASS) 7060 + return reuse_kern.selected_sk; 7061 + else 7062 + return ERR_PTR(-ECONNREFUSED); 7063 + } 7064 + 7065 + BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern, 7066 + struct bpf_map *, map, void *, key, u32, flags) 7067 + { 7068 + struct sock_reuseport *reuse; 7069 + struct sock *selected_sk; 7070 + 7071 + selected_sk = map->ops->map_lookup_elem(map, key); 7072 + if (!selected_sk) 7073 + return -ENOENT; 7074 + 7075 + reuse = rcu_dereference(selected_sk->sk_reuseport_cb); 7076 + if (!reuse) 7077 + /* selected_sk is unhashed (e.g. by close()) after the 7078 + * above map_lookup_elem(). Treat selected_sk has already 7079 + * been removed from the map. 
7080 + */ 7081 + return -ENOENT; 7082 + 7083 + if (unlikely(reuse->reuseport_id != reuse_kern->reuseport_id)) { 7084 + struct sock *sk; 7085 + 7086 + if (unlikely(!reuse_kern->reuseport_id)) 7087 + /* There is a small race between adding the 7088 + * sk to the map and setting the 7089 + * reuse_kern->reuseport_id. 7090 + * Treat it as the sk has not been added to 7091 + * the bpf map yet. 7092 + */ 7093 + return -ENOENT; 7094 + 7095 + sk = reuse_kern->sk; 7096 + if (sk->sk_protocol != selected_sk->sk_protocol) 7097 + return -EPROTOTYPE; 7098 + else if (sk->sk_family != selected_sk->sk_family) 7099 + return -EAFNOSUPPORT; 7100 + 7101 + /* Catch all. Likely bound to a different sockaddr. */ 7102 + return -EBADFD; 7103 + } 7104 + 7105 + reuse_kern->selected_sk = selected_sk; 7106 + 7107 + return 0; 7108 + } 7109 + 7110 + static const struct bpf_func_proto sk_select_reuseport_proto = { 7111 + .func = sk_select_reuseport, 7112 + .gpl_only = false, 7113 + .ret_type = RET_INTEGER, 7114 + .arg1_type = ARG_PTR_TO_CTX, 7115 + .arg2_type = ARG_CONST_MAP_PTR, 7116 + .arg3_type = ARG_PTR_TO_MAP_KEY, 7117 + .arg4_type = ARG_ANYTHING, 7118 + }; 7119 + 7120 + BPF_CALL_4(sk_reuseport_load_bytes, 7121 + const struct sk_reuseport_kern *, reuse_kern, u32, offset, 7122 + void *, to, u32, len) 7123 + { 7124 + return ____bpf_skb_load_bytes(reuse_kern->skb, offset, to, len); 7125 + } 7126 + 7127 + static const struct bpf_func_proto sk_reuseport_load_bytes_proto = { 7128 + .func = sk_reuseport_load_bytes, 7129 + .gpl_only = false, 7130 + .ret_type = RET_INTEGER, 7131 + .arg1_type = ARG_PTR_TO_CTX, 7132 + .arg2_type = ARG_ANYTHING, 7133 + .arg3_type = ARG_PTR_TO_UNINIT_MEM, 7134 + .arg4_type = ARG_CONST_SIZE, 7135 + }; 7136 + 7137 + BPF_CALL_5(sk_reuseport_load_bytes_relative, 7138 + const struct sk_reuseport_kern *, reuse_kern, u32, offset, 7139 + void *, to, u32, len, u32, start_header) 7140 + { 7141 + return ____bpf_skb_load_bytes_relative(reuse_kern->skb, offset, to, 7142 + len, 
start_header); 7143 + } 7144 + 7145 + static const struct bpf_func_proto sk_reuseport_load_bytes_relative_proto = { 7146 + .func = sk_reuseport_load_bytes_relative, 7147 + .gpl_only = false, 7148 + .ret_type = RET_INTEGER, 7149 + .arg1_type = ARG_PTR_TO_CTX, 7150 + .arg2_type = ARG_ANYTHING, 7151 + .arg3_type = ARG_PTR_TO_UNINIT_MEM, 7152 + .arg4_type = ARG_CONST_SIZE, 7153 + .arg5_type = ARG_ANYTHING, 7154 + }; 7155 + 7156 + static const struct bpf_func_proto * 7157 + sk_reuseport_func_proto(enum bpf_func_id func_id, 7158 + const struct bpf_prog *prog) 7159 + { 7160 + switch (func_id) { 7161 + case BPF_FUNC_sk_select_reuseport: 7162 + return &sk_select_reuseport_proto; 7163 + case BPF_FUNC_skb_load_bytes: 7164 + return &sk_reuseport_load_bytes_proto; 7165 + case BPF_FUNC_skb_load_bytes_relative: 7166 + return &sk_reuseport_load_bytes_relative_proto; 7167 + default: 7168 + return bpf_base_func_proto(func_id); 7169 + } 7170 + } 7171 + 7172 + static bool 7173 + sk_reuseport_is_valid_access(int off, int size, 7174 + enum bpf_access_type type, 7175 + const struct bpf_prog *prog, 7176 + struct bpf_insn_access_aux *info) 7177 + { 7178 + const u32 size_default = sizeof(__u32); 7179 + 7180 + if (off < 0 || off >= sizeof(struct sk_reuseport_md) || 7181 + off % size || type != BPF_READ) 7182 + return false; 7183 + 7184 + switch (off) { 7185 + case offsetof(struct sk_reuseport_md, data): 7186 + info->reg_type = PTR_TO_PACKET; 7187 + return size == sizeof(__u64); 7188 + 7189 + case offsetof(struct sk_reuseport_md, data_end): 7190 + info->reg_type = PTR_TO_PACKET_END; 7191 + return size == sizeof(__u64); 7192 + 7193 + case offsetof(struct sk_reuseport_md, hash): 7194 + return size == size_default; 7195 + 7196 + /* Fields that allow narrowing */ 7197 + case offsetof(struct sk_reuseport_md, eth_protocol): 7198 + if (size < FIELD_SIZEOF(struct sk_buff, protocol)) 7199 + return false; 7200 + case offsetof(struct sk_reuseport_md, ip_protocol): 7201 + case offsetof(struct 
sk_reuseport_md, bind_inany): 7202 + case offsetof(struct sk_reuseport_md, len): 7203 + bpf_ctx_record_field_size(info, size_default); 7204 + return bpf_ctx_narrow_access_ok(off, size, size_default); 7205 + 7206 + default: 7207 + return false; 7208 + } 7209 + } 7210 + 7211 + #define SK_REUSEPORT_LOAD_FIELD(F) ({ \ 7212 + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_reuseport_kern, F), \ 7213 + si->dst_reg, si->src_reg, \ 7214 + bpf_target_off(struct sk_reuseport_kern, F, \ 7215 + FIELD_SIZEOF(struct sk_reuseport_kern, F), \ 7216 + target_size)); \ 7217 + }) 7218 + 7219 + #define SK_REUSEPORT_LOAD_SKB_FIELD(SKB_FIELD) \ 7220 + SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern, \ 7221 + struct sk_buff, \ 7222 + skb, \ 7223 + SKB_FIELD) 7224 + 7225 + #define SK_REUSEPORT_LOAD_SK_FIELD_SIZE_OFF(SK_FIELD, BPF_SIZE, EXTRA_OFF) \ 7226 + SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(struct sk_reuseport_kern, \ 7227 + struct sock, \ 7228 + sk, \ 7229 + SK_FIELD, BPF_SIZE, EXTRA_OFF) 7230 + 7231 + static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type, 7232 + const struct bpf_insn *si, 7233 + struct bpf_insn *insn_buf, 7234 + struct bpf_prog *prog, 7235 + u32 *target_size) 7236 + { 7237 + struct bpf_insn *insn = insn_buf; 7238 + 7239 + switch (si->off) { 7240 + case offsetof(struct sk_reuseport_md, data): 7241 + SK_REUSEPORT_LOAD_SKB_FIELD(data); 7242 + break; 7243 + 7244 + case offsetof(struct sk_reuseport_md, len): 7245 + SK_REUSEPORT_LOAD_SKB_FIELD(len); 7246 + break; 7247 + 7248 + case offsetof(struct sk_reuseport_md, eth_protocol): 7249 + SK_REUSEPORT_LOAD_SKB_FIELD(protocol); 7250 + break; 7251 + 7252 + case offsetof(struct sk_reuseport_md, ip_protocol): 7253 + BUILD_BUG_ON(hweight_long(SK_FL_PROTO_MASK) != BITS_PER_BYTE); 7254 + SK_REUSEPORT_LOAD_SK_FIELD_SIZE_OFF(__sk_flags_offset, 7255 + BPF_W, 0); 7256 + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); 7257 + *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 7258 + SK_FL_PROTO_SHIFT); 7259 + 
/* SK_FL_PROTO_MASK and SK_FL_PROTO_SHIFT are endian 7260 + * aware. No further narrowing or masking is needed. 7261 + */ 7262 + *target_size = 1; 7263 + break; 7264 + 7265 + case offsetof(struct sk_reuseport_md, data_end): 7266 + SK_REUSEPORT_LOAD_FIELD(data_end); 7267 + break; 7268 + 7269 + case offsetof(struct sk_reuseport_md, hash): 7270 + SK_REUSEPORT_LOAD_FIELD(hash); 7271 + break; 7272 + 7273 + case offsetof(struct sk_reuseport_md, bind_inany): 7274 + SK_REUSEPORT_LOAD_FIELD(bind_inany); 7275 + break; 7276 + } 7277 + 7278 + return insn - insn_buf; 7279 + } 7280 + 7281 + const struct bpf_verifier_ops sk_reuseport_verifier_ops = { 7282 + .get_func_proto = sk_reuseport_func_proto, 7283 + .is_valid_access = sk_reuseport_is_valid_access, 7284 + .convert_ctx_access = sk_reuseport_convert_ctx_access, 7285 + }; 7286 + 7287 + const struct bpf_prog_ops sk_reuseport_prog_ops = { 7288 + }; 7289 + #endif /* CONFIG_INET */
+2 -1
net/core/skbuff.c
··· 1291 1291 } 1292 1292 EXPORT_SYMBOL(skb_clone); 1293 1293 1294 - static void skb_headers_offset_update(struct sk_buff *skb, int off) 1294 + void skb_headers_offset_update(struct sk_buff *skb, int off) 1295 1295 { 1296 1296 /* Only adjust this if it actually is csum_start rather than csum */ 1297 1297 if (skb->ip_summed == CHECKSUM_PARTIAL) ··· 1305 1305 skb->inner_network_header += off; 1306 1306 skb->inner_mac_header += off; 1307 1307 } 1308 + EXPORT_SYMBOL(skb_headers_offset_update); 1308 1309 1309 1310 void skb_copy_header(struct sk_buff *new, const struct sk_buff *old) 1310 1311 {
+76 -16
net/core/sock_reuseport.c
··· 8 8 9 9 #include <net/sock_reuseport.h> 10 10 #include <linux/bpf.h> 11 + #include <linux/idr.h> 12 + #include <linux/filter.h> 11 13 #include <linux/rcupdate.h> 12 14 13 15 #define INIT_SOCKS 128 14 16 15 - static DEFINE_SPINLOCK(reuseport_lock); 17 + DEFINE_SPINLOCK(reuseport_lock); 18 + 19 + #define REUSEPORT_MIN_ID 1 20 + static DEFINE_IDA(reuseport_ida); 21 + 22 + int reuseport_get_id(struct sock_reuseport *reuse) 23 + { 24 + int id; 25 + 26 + if (reuse->reuseport_id) 27 + return reuse->reuseport_id; 28 + 29 + id = ida_simple_get(&reuseport_ida, REUSEPORT_MIN_ID, 0, 30 + /* Called under reuseport_lock */ 31 + GFP_ATOMIC); 32 + if (id < 0) 33 + return id; 34 + 35 + reuse->reuseport_id = id; 36 + 37 + return reuse->reuseport_id; 38 + } 16 39 17 40 static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks) 18 41 { ··· 52 29 return reuse; 53 30 } 54 31 55 - int reuseport_alloc(struct sock *sk) 32 + int reuseport_alloc(struct sock *sk, bool bind_inany) 56 33 { 57 34 struct sock_reuseport *reuse; 58 35 ··· 64 41 /* Allocation attempts can occur concurrently via the setsockopt path 65 42 * and the bind/hash path. Nothing to do when we lose the race. 66 43 */ 67 - if (rcu_dereference_protected(sk->sk_reuseport_cb, 68 - lockdep_is_held(&reuseport_lock))) 44 + reuse = rcu_dereference_protected(sk->sk_reuseport_cb, 45 + lockdep_is_held(&reuseport_lock)); 46 + if (reuse) { 47 + /* Only set reuse->bind_inany if the bind_inany is true. 48 + * Otherwise, it will overwrite the reuse->bind_inany 49 + * which was set by the bind/hash path. 
50 + */ 51 + if (bind_inany) 52 + reuse->bind_inany = bind_inany; 69 53 goto out; 54 + } 70 55 71 56 reuse = __reuseport_alloc(INIT_SOCKS); 72 57 if (!reuse) { ··· 84 53 85 54 reuse->socks[0] = sk; 86 55 reuse->num_socks = 1; 56 + reuse->bind_inany = bind_inany; 87 57 rcu_assign_pointer(sk->sk_reuseport_cb, reuse); 88 58 89 59 out: ··· 110 78 more_reuse->max_socks = more_socks_size; 111 79 more_reuse->num_socks = reuse->num_socks; 112 80 more_reuse->prog = reuse->prog; 81 + more_reuse->reuseport_id = reuse->reuseport_id; 82 + more_reuse->bind_inany = reuse->bind_inany; 113 83 114 84 memcpy(more_reuse->socks, reuse->socks, 115 85 reuse->num_socks * sizeof(struct sock *)); 86 + more_reuse->synq_overflow_ts = READ_ONCE(reuse->synq_overflow_ts); 116 87 117 88 for (i = 0; i < reuse->num_socks; ++i) 118 89 rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb, ··· 134 99 struct sock_reuseport *reuse; 135 100 136 101 reuse = container_of(head, struct sock_reuseport, rcu); 137 - if (reuse->prog) 138 - bpf_prog_destroy(reuse->prog); 102 + sk_reuseport_prog_free(rcu_dereference_protected(reuse->prog, 1)); 103 + if (reuse->reuseport_id) 104 + ida_simple_remove(&reuseport_ida, reuse->reuseport_id); 139 105 kfree(reuse); 140 106 } 141 107 ··· 146 110 * @sk2: Socket belonging to the existing reuseport group. 147 111 * May return ENOMEM and not add socket to group under memory pressure. 
148 112 */ 149 - int reuseport_add_sock(struct sock *sk, struct sock *sk2) 113 + int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany) 150 114 { 151 115 struct sock_reuseport *old_reuse, *reuse; 152 116 153 117 if (!rcu_access_pointer(sk2->sk_reuseport_cb)) { 154 - int err = reuseport_alloc(sk2); 118 + int err = reuseport_alloc(sk2, bind_inany); 155 119 156 120 if (err) 157 121 return err; ··· 196 160 spin_lock_bh(&reuseport_lock); 197 161 reuse = rcu_dereference_protected(sk->sk_reuseport_cb, 198 162 lockdep_is_held(&reuseport_lock)); 163 + 164 + /* At least one of the sk in this reuseport group is added to 165 + * a bpf map. Notify the bpf side. The bpf map logic will 166 + * remove the sk if it is indeed added to a bpf map. 167 + */ 168 + if (reuse->reuseport_id) 169 + bpf_sk_reuseport_detach(sk); 170 + 199 171 rcu_assign_pointer(sk->sk_reuseport_cb, NULL); 200 172 201 173 for (i = 0; i < reuse->num_socks; i++) { ··· 219 175 } 220 176 EXPORT_SYMBOL(reuseport_detach_sock); 221 177 222 - static struct sock *run_bpf(struct sock_reuseport *reuse, u16 socks, 223 - struct bpf_prog *prog, struct sk_buff *skb, 224 - int hdr_len) 178 + static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks, 179 + struct bpf_prog *prog, struct sk_buff *skb, 180 + int hdr_len) 225 181 { 226 182 struct sk_buff *nskb = NULL; 227 183 u32 index; ··· 282 238 /* paired with smp_wmb() in reuseport_add_sock() */ 283 239 smp_rmb(); 284 240 285 - if (prog && skb) 286 - sk2 = run_bpf(reuse, socks, prog, skb, hdr_len); 241 + if (!prog || !skb) 242 + goto select_by_hash; 287 243 244 + if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) 245 + sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, hash); 246 + else 247 + sk2 = run_bpf_filter(reuse, socks, prog, skb, hdr_len); 248 + 249 + select_by_hash: 288 250 /* no bpf or invalid bpf result: fall back to hash usage */ 289 251 if (!sk2) 290 252 sk2 = reuse->socks[reciprocal_scale(hash, socks)]; ··· 302 252 } 303 253 
EXPORT_SYMBOL(reuseport_select_sock); 304 254 305 - struct bpf_prog * 306 - reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog) 255 + int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog) 307 256 { 308 257 struct sock_reuseport *reuse; 309 258 struct bpf_prog *old_prog; 259 + 260 + if (sk_unhashed(sk) && sk->sk_reuseport) { 261 + int err = reuseport_alloc(sk, false); 262 + 263 + if (err) 264 + return err; 265 + } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) { 266 + /* The socket wasn't bound with SO_REUSEPORT */ 267 + return -EINVAL; 268 + } 310 269 311 270 spin_lock_bh(&reuseport_lock); 312 271 reuse = rcu_dereference_protected(sk->sk_reuseport_cb, ··· 325 266 rcu_assign_pointer(reuse->prog, prog); 326 267 spin_unlock_bh(&reuseport_lock); 327 268 328 - return old_prog; 269 + sk_reuseport_prog_free(old_prog); 270 + return 0; 329 271 } 330 272 EXPORT_SYMBOL(reuseport_attach_prog);
+5 -4
net/core/xdp.c
··· 330 330 /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */ 331 331 xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); 332 332 page = virt_to_head_page(data); 333 - if (xa) 333 + if (xa) { 334 + napi_direct &= !xdp_return_frame_no_direct(); 334 335 page_pool_put_page(xa->page_pool, page, napi_direct); 335 - else 336 + } else { 336 337 put_page(page); 338 + } 337 339 rcu_read_unlock(); 338 340 break; 339 341 case MEM_TYPE_PAGE_SHARED: ··· 350 348 rcu_read_lock(); 351 349 /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */ 352 350 xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); 353 - if (!WARN_ON_ONCE(!xa)) 354 - xa->zc_alloc->free(xa->zc_alloc, handle); 351 + xa->zc_alloc->free(xa->zc_alloc, handle); 355 352 rcu_read_unlock(); 356 353 default: 357 354 /* Not possible, checked in xdp_rxq_info_reg_mem_model() */
+9
net/ipv4/inet_connection_sock.c
··· 107 107 } 108 108 EXPORT_SYMBOL(inet_rcv_saddr_equal); 109 109 110 + bool inet_rcv_saddr_any(const struct sock *sk) 111 + { 112 + #if IS_ENABLED(CONFIG_IPV6) 113 + if (sk->sk_family == AF_INET6) 114 + return ipv6_addr_any(&sk->sk_v6_rcv_saddr); 115 + #endif 116 + return !sk->sk_rcv_saddr; 117 + } 118 + 110 119 void inet_get_local_port_range(struct net *net, int *low, int *high) 111 120 { 112 121 unsigned int seq;
+12 -7
net/ipv4/inet_hashtables.c
··· 328 328 saddr, sport, daddr, hnum, 329 329 dif, sdif); 330 330 if (result) 331 - return result; 331 + goto done; 332 332 333 333 /* Lookup lhash2 with INADDR_ANY */ 334 334 ··· 337 337 if (ilb2->count > ilb->count) 338 338 goto port_lookup; 339 339 340 - return inet_lhash2_lookup(net, ilb2, skb, doff, 341 - saddr, sport, daddr, hnum, 342 - dif, sdif); 340 + result = inet_lhash2_lookup(net, ilb2, skb, doff, 341 + saddr, sport, daddr, hnum, 342 + dif, sdif); 343 + goto done; 343 344 344 345 port_lookup: 345 346 sk_for_each_rcu(sk, &ilb->head) { ··· 353 352 result = reuseport_select_sock(sk, phash, 354 353 skb, doff); 355 354 if (result) 356 - return result; 355 + goto done; 357 356 } 358 357 result = sk; 359 358 hiscore = score; 360 359 } 361 360 } 361 + done: 362 + if (unlikely(IS_ERR(result))) 363 + return NULL; 362 364 return result; 363 365 } 364 366 EXPORT_SYMBOL_GPL(__inet_lookup_listener); ··· 571 567 inet_csk(sk2)->icsk_bind_hash == tb && 572 568 sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && 573 569 inet_rcv_saddr_equal(sk, sk2, false)) 574 - return reuseport_add_sock(sk, sk2); 570 + return reuseport_add_sock(sk, sk2, 571 + inet_rcv_saddr_any(sk)); 575 572 } 576 573 577 - return reuseport_alloc(sk); 574 + return reuseport_alloc(sk, inet_rcv_saddr_any(sk)); 578 575 } 579 576 580 577 int __inet_hash(struct sock *sk, struct sock *osk)
+7 -2
net/ipv4/udp.c
··· 221 221 (sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && 222 222 sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && 223 223 inet_rcv_saddr_equal(sk, sk2, false)) { 224 - return reuseport_add_sock(sk, sk2); 224 + return reuseport_add_sock(sk, sk2, 225 + inet_rcv_saddr_any(sk)); 225 226 } 226 227 } 227 228 228 - return reuseport_alloc(sk); 229 + return reuseport_alloc(sk, inet_rcv_saddr_any(sk)); 229 230 } 230 231 231 232 /** ··· 499 498 daddr, hnum, dif, sdif, 500 499 exact_dif, hslot2, skb); 501 500 } 501 + if (unlikely(IS_ERR(result))) 502 + return NULL; 502 503 return result; 503 504 } 504 505 begin: ··· 515 512 saddr, sport); 516 513 result = reuseport_select_sock(sk, hash, skb, 517 514 sizeof(struct udphdr)); 515 + if (unlikely(IS_ERR(result))) 516 + return NULL; 518 517 if (result) 519 518 return result; 520 519 }
+9 -5
net/ipv6/inet6_hashtables.c
··· 191 191 saddr, sport, daddr, hnum, 192 192 dif, sdif); 193 193 if (result) 194 - return result; 194 + goto done; 195 195 196 196 /* Lookup lhash2 with in6addr_any */ 197 197 ··· 200 200 if (ilb2->count > ilb->count) 201 201 goto port_lookup; 202 202 203 - return inet6_lhash2_lookup(net, ilb2, skb, doff, 204 - saddr, sport, daddr, hnum, 205 - dif, sdif); 203 + result = inet6_lhash2_lookup(net, ilb2, skb, doff, 204 + saddr, sport, daddr, hnum, 205 + dif, sdif); 206 + goto done; 206 207 207 208 port_lookup: 208 209 sk_for_each(sk, &ilb->head) { ··· 215 214 result = reuseport_select_sock(sk, phash, 216 215 skb, doff); 217 216 if (result) 218 - return result; 217 + goto done; 219 218 } 220 219 result = sk; 221 220 hiscore = score; 222 221 } 223 222 } 223 + done: 224 + if (unlikely(IS_ERR(result))) 225 + return NULL; 224 226 return result; 225 227 } 226 228 EXPORT_SYMBOL_GPL(inet6_lookup_listener);
+4
net/ipv6/udp.c
··· 235 235 exact_dif, hslot2, 236 236 skb); 237 237 } 238 + if (unlikely(IS_ERR(result))) 239 + return NULL; 238 240 return result; 239 241 } 240 242 begin: ··· 251 249 saddr, sport); 252 250 result = reuseport_select_sock(sk, hash, skb, 253 251 sizeof(struct udphdr)); 252 + if (unlikely(IS_ERR(result))) 253 + return NULL; 254 254 if (result) 255 255 return result; 256 256 }
+55
samples/bpf/hash_func01.h
··· 1 + /* SPDX-License-Identifier: LGPL-2.1 2 + * 3 + * Based on Paul Hsieh's (LGPG 2.1) hash function 4 + * From: http://www.azillionmonkeys.com/qed/hash.html 5 + */ 6 + 7 + #define get16bits(d) (*((const __u16 *) (d))) 8 + 9 + static __always_inline 10 + __u32 SuperFastHash (const char *data, int len, __u32 initval) { 11 + __u32 hash = initval; 12 + __u32 tmp; 13 + int rem; 14 + 15 + if (len <= 0 || data == NULL) return 0; 16 + 17 + rem = len & 3; 18 + len >>= 2; 19 + 20 + /* Main loop */ 21 + #pragma clang loop unroll(full) 22 + for (;len > 0; len--) { 23 + hash += get16bits (data); 24 + tmp = (get16bits (data+2) << 11) ^ hash; 25 + hash = (hash << 16) ^ tmp; 26 + data += 2*sizeof (__u16); 27 + hash += hash >> 11; 28 + } 29 + 30 + /* Handle end cases */ 31 + switch (rem) { 32 + case 3: hash += get16bits (data); 33 + hash ^= hash << 16; 34 + hash ^= ((signed char)data[sizeof (__u16)]) << 18; 35 + hash += hash >> 11; 36 + break; 37 + case 2: hash += get16bits (data); 38 + hash ^= hash << 11; 39 + hash += hash >> 17; 40 + break; 41 + case 1: hash += (signed char)*data; 42 + hash ^= hash << 10; 43 + hash += hash >> 1; 44 + } 45 + 46 + /* Force "avalanching" of final 127 bits */ 47 + hash ^= hash << 3; 48 + hash += hash >> 5; 49 + hash ^= hash << 4; 50 + hash += hash >> 17; 51 + hash ^= hash << 25; 52 + hash += hash >> 6; 53 + 54 + return hash; 55 + }
+103
samples/bpf/xdp_redirect_cpu_kern.c
··· 13 13 14 14 #include <uapi/linux/bpf.h> 15 15 #include "bpf_helpers.h" 16 + #include "hash_func01.h" 16 17 17 18 #define MAX_CPUS 64 /* WARNING - sync with _user.c */ 18 19 ··· 462 461 return bpf_redirect_map(&cpu_map, cpu_dest, 0); 463 462 } 464 463 464 + /* Hashing initval */ 465 + #define INITVAL 15485863 466 + 467 + static __always_inline 468 + u32 get_ipv4_hash_ip_pair(struct xdp_md *ctx, u64 nh_off) 469 + { 470 + void *data_end = (void *)(long)ctx->data_end; 471 + void *data = (void *)(long)ctx->data; 472 + struct iphdr *iph = data + nh_off; 473 + u32 cpu_hash; 474 + 475 + if (iph + 1 > data_end) 476 + return 0; 477 + 478 + cpu_hash = iph->saddr + iph->daddr; 479 + cpu_hash = SuperFastHash((char *)&cpu_hash, 4, INITVAL + iph->protocol); 480 + 481 + return cpu_hash; 482 + } 483 + 484 + static __always_inline 485 + u32 get_ipv6_hash_ip_pair(struct xdp_md *ctx, u64 nh_off) 486 + { 487 + void *data_end = (void *)(long)ctx->data_end; 488 + void *data = (void *)(long)ctx->data; 489 + struct ipv6hdr *ip6h = data + nh_off; 490 + u32 cpu_hash; 491 + 492 + if (ip6h + 1 > data_end) 493 + return 0; 494 + 495 + cpu_hash = ip6h->saddr.s6_addr32[0] + ip6h->daddr.s6_addr32[0]; 496 + cpu_hash += ip6h->saddr.s6_addr32[1] + ip6h->daddr.s6_addr32[1]; 497 + cpu_hash += ip6h->saddr.s6_addr32[2] + ip6h->daddr.s6_addr32[2]; 498 + cpu_hash += ip6h->saddr.s6_addr32[3] + ip6h->daddr.s6_addr32[3]; 499 + cpu_hash = SuperFastHash((char *)&cpu_hash, 4, INITVAL + ip6h->nexthdr); 500 + 501 + return cpu_hash; 502 + } 503 + 504 + /* Load-Balance traffic based on hashing IP-addrs + L4-proto. The 505 + * hashing scheme is symmetric, meaning swapping IP src/dest still hit 506 + * same CPU. 
507 + */ 508 + SEC("xdp_cpu_map5_lb_hash_ip_pairs") 509 + int xdp_prognum5_lb_hash_ip_pairs(struct xdp_md *ctx) 510 + { 511 + void *data_end = (void *)(long)ctx->data_end; 512 + void *data = (void *)(long)ctx->data; 513 + struct ethhdr *eth = data; 514 + u8 ip_proto = IPPROTO_UDP; 515 + struct datarec *rec; 516 + u16 eth_proto = 0; 517 + u64 l3_offset = 0; 518 + u32 cpu_dest = 0; 519 + u32 cpu_idx = 0; 520 + u32 *cpu_lookup; 521 + u32 *cpu_max; 522 + u32 cpu_hash; 523 + u32 key = 0; 524 + 525 + /* Count RX packet in map */ 526 + rec = bpf_map_lookup_elem(&rx_cnt, &key); 527 + if (!rec) 528 + return XDP_ABORTED; 529 + rec->processed++; 530 + 531 + cpu_max = bpf_map_lookup_elem(&cpus_count, &key); 532 + if (!cpu_max) 533 + return XDP_ABORTED; 534 + 535 + if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset))) 536 + return XDP_PASS; /* Just skip */ 537 + 538 + /* Hash for IPv4 and IPv6 */ 539 + switch (eth_proto) { 540 + case ETH_P_IP: 541 + cpu_hash = get_ipv4_hash_ip_pair(ctx, l3_offset); 542 + break; 543 + case ETH_P_IPV6: 544 + cpu_hash = get_ipv6_hash_ip_pair(ctx, l3_offset); 545 + break; 546 + case ETH_P_ARP: /* ARP packet handled on CPU idx 0 */ 547 + default: 548 + cpu_hash = 0; 549 + } 550 + 551 + /* Choose CPU based on hash */ 552 + cpu_idx = cpu_hash % *cpu_max; 553 + 554 + cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx); 555 + if (!cpu_lookup) 556 + return XDP_ABORTED; 557 + cpu_dest = *cpu_lookup; 558 + 559 + if (cpu_dest >= MAX_CPUS) { 560 + rec->issue++; 561 + return XDP_ABORTED; 562 + } 563 + 564 + return bpf_redirect_map(&cpu_map, cpu_dest, 0); 565 + } 465 566 466 567 char _license[] SEC("license") = "GPL"; 467 568
+2 -2
samples/bpf/xdp_redirect_cpu_user.c
··· 22 22 #define MAX_CPUS 64 /* WARNING - sync with _kern.c */ 23 23 24 24 /* How many xdp_progs are defined in _kern.c */ 25 - #define MAX_PROG 5 25 + #define MAX_PROG 6 26 26 27 27 /* Wanted to get rid of bpf_load.h and fake-"libbpf.h" (and instead 28 28 * use bpf/libbpf.h), but cannot as (currently) needed for XDP ··· 567 567 int added_cpus = 0; 568 568 int longindex = 0; 569 569 int interval = 2; 570 - int prog_num = 0; 570 + int prog_num = 5; 571 571 int add_cpu = -1; 572 572 __u32 qsize; 573 573 int opt;
+55 -1
tools/include/uapi/linux/bpf.h
··· 126 126 BPF_MAP_TYPE_XSKMAP, 127 127 BPF_MAP_TYPE_SOCKHASH, 128 128 BPF_MAP_TYPE_CGROUP_STORAGE, 129 + BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, 129 130 }; 130 131 131 132 enum bpf_prog_type { ··· 151 150 BPF_PROG_TYPE_CGROUP_SOCK_ADDR, 152 151 BPF_PROG_TYPE_LWT_SEG6LOCAL, 153 152 BPF_PROG_TYPE_LIRC_MODE2, 153 + BPF_PROG_TYPE_SK_REUSEPORT, 154 154 }; 155 155 156 156 enum bpf_attach_type { ··· 2093 2091 * Return 2094 2092 * The id is returned or 0 in case the id could not be retrieved. 2095 2093 * 2094 + * u64 bpf_skb_ancestor_cgroup_id(struct sk_buff *skb, int ancestor_level) 2095 + * Description 2096 + * Return id of cgroup v2 that is ancestor of cgroup associated 2097 + * with the *skb* at the *ancestor_level*. The root cgroup is at 2098 + * *ancestor_level* zero and each step down the hierarchy 2099 + * increments the level. If *ancestor_level* == level of cgroup 2100 + * associated with *skb*, then return value will be same as that 2101 + * of **bpf_skb_cgroup_id**\ (). 2102 + * 2103 + * The helper is useful to implement policies based on cgroups 2104 + * that are upper in hierarchy than immediate cgroup associated 2105 + * with *skb*. 2106 + * 2107 + * The format of returned id and helper limitations are same as in 2108 + * **bpf_skb_cgroup_id**\ (). 2109 + * Return 2110 + * The id is returned or 0 in case the id could not be retrieved. 2111 + * 2096 2112 * u64 bpf_get_current_cgroup_id(void) 2097 2113 * Return 2098 2114 * A 64-bit integer containing the current cgroup id based ··· 2133 2113 * the shared data. 2134 2114 * Return 2135 2115 * Pointer to the local storage area. 2116 + * 2117 + * int bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags) 2118 + * Description 2119 + * Select a SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_ARRAY map 2120 + * It checks the selected sk is matching the incoming 2121 + * request in the skb. 2122 + * Return 2123 + * 0 on success, or a negative error in case of failure. 
2136 2124 */ 2137 2125 #define __BPF_FUNC_MAPPER(FN) \ 2138 2126 FN(unspec), \ ··· 2224 2196 FN(rc_keydown), \ 2225 2197 FN(skb_cgroup_id), \ 2226 2198 FN(get_current_cgroup_id), \ 2227 - FN(get_local_storage), 2199 + FN(get_local_storage), \ 2200 + FN(sk_select_reuseport), \ 2201 + FN(skb_ancestor_cgroup_id), 2228 2202 2229 2203 /* integer value in 'imm' field of BPF_CALL instruction selects which helper 2230 2204 * function eBPF program intends to call ··· 2441 2411 __u32 local_ip6[4]; /* Stored in network byte order */ 2442 2412 __u32 remote_port; /* Stored in network byte order */ 2443 2413 __u32 local_port; /* stored in host byte order */ 2414 + }; 2415 + 2416 + struct sk_reuseport_md { 2417 + /* 2418 + * Start of directly accessible data. It begins from 2419 + * the tcp/udp header. 2420 + */ 2421 + void *data; 2422 + void *data_end; /* End of directly accessible data */ 2423 + /* 2424 + * Total length of packet (starting from the tcp/udp header). 2425 + * Note that the directly accessible bytes (data_end - data) 2426 + * could be less than this "len". Those bytes could be 2427 + * indirectly read by a helper "bpf_skb_load_bytes()". 2428 + */ 2429 + __u32 len; 2430 + /* 2431 + * Eth protocol in the mac header (network byte order). e.g. 2432 + * ETH_P_IP(0x0800) and ETH_P_IPV6(0x86DD) 2433 + */ 2434 + __u32 eth_protocol; 2435 + __u32 ip_protocol; /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */ 2436 + __u32 bind_inany; /* Is sock bound to an INANY address? */ 2437 + __u32 hash; /* A hash of the packet 4 tuples */ 2444 2438 }; 2445 2439 2446 2440 #define BPF_TAG_SIZE 8
+1
tools/lib/bpf/bpf.c
··· 92 92 attr.btf_key_type_id = create_attr->btf_key_type_id; 93 93 attr.btf_value_type_id = create_attr->btf_value_type_id; 94 94 attr.map_ifindex = create_attr->map_ifindex; 95 + attr.inner_map_fd = create_attr->inner_map_fd; 95 96 96 97 return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); 97 98 }
+1
tools/lib/bpf/bpf.h
··· 39 39 __u32 btf_key_type_id; 40 40 __u32 btf_value_type_id; 41 41 __u32 map_ifindex; 42 + __u32 inner_map_fd; 42 43 }; 43 44 44 45 int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr);
+1
tools/lib/bpf/libbpf.c
··· 1501 1501 case BPF_PROG_TYPE_SK_MSG: 1502 1502 case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: 1503 1503 case BPF_PROG_TYPE_LIRC_MODE2: 1504 + case BPF_PROG_TYPE_SK_REUSEPORT: 1504 1505 return false; 1505 1506 case BPF_PROG_TYPE_UNSPEC: 1506 1507 case BPF_PROG_TYPE_KPROBE:
+7 -4
tools/testing/selftests/bpf/Makefile
··· 23 23 TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \ 24 24 test_align test_verifier_log test_dev_cgroup test_tcpbpf_user \ 25 25 test_sock test_btf test_sockmap test_lirc_mode2_user get_cgroup_id_user \ 26 - test_socket_cookie test_cgroup_storage 26 + test_socket_cookie test_cgroup_storage test_select_reuseport 27 27 28 28 TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test_obj_id.o \ 29 29 test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o \ ··· 34 34 test_btf_haskv.o test_btf_nokv.o test_sockmap_kern.o test_tunnel_kern.o \ 35 35 test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o \ 36 36 test_lwt_seg6local.o sendmsg4_prog.o sendmsg6_prog.o test_lirc_mode2_kern.o \ 37 - get_cgroup_id_kern.o socket_cookie_prog.o 37 + get_cgroup_id_kern.o socket_cookie_prog.o test_select_reuseport_kern.o \ 38 + test_skb_cgroup_id_kern.o 38 39 39 40 # Order correspond to 'make run_tests' order 40 41 TEST_PROGS := test_kmod.sh \ ··· 46 45 test_sock_addr.sh \ 47 46 test_tunnel.sh \ 48 47 test_lwt_seg6local.sh \ 49 - test_lirc_mode2.sh 48 + test_lirc_mode2.sh \ 49 + test_skb_cgroup_id.sh 50 50 51 51 # Compile but not part of 'make run_tests' 52 - TEST_GEN_PROGS_EXTENDED = test_libbpf_open test_sock_addr 52 + TEST_GEN_PROGS_EXTENDED = test_libbpf_open test_sock_addr test_skb_cgroup_id_user 53 53 54 54 include ../lib.mk 55 55 ··· 61 59 $(TEST_GEN_PROGS_EXTENDED): $(OUTPUT)/libbpf.a 62 60 63 61 $(OUTPUT)/test_dev_cgroup: cgroup_helpers.c 62 + $(OUTPUT)/test_skb_cgroup_id_user: cgroup_helpers.c 64 63 $(OUTPUT)/test_sock: cgroup_helpers.c 65 64 $(OUTPUT)/test_sock_addr: cgroup_helpers.c 66 65 $(OUTPUT)/test_socket_cookie: cgroup_helpers.c
+8
tools/testing/selftests/bpf/bpf_helpers.h
··· 111 111 static int (*bpf_skb_get_xfrm_state)(void *ctx, int index, void *state, 112 112 int size, int flags) = 113 113 (void *) BPF_FUNC_skb_get_xfrm_state; 114 + static int (*bpf_sk_select_reuseport)(void *ctx, void *map, void *key, __u32 flags) = 115 + (void *) BPF_FUNC_sk_select_reuseport; 114 116 static int (*bpf_get_stack)(void *ctx, void *buf, int size, int flags) = 115 117 (void *) BPF_FUNC_get_stack; 116 118 static int (*bpf_fib_lookup)(void *ctx, struct bpf_fib_lookup *params, ··· 139 137 (void *) BPF_FUNC_get_current_cgroup_id; 140 138 static void *(*bpf_get_local_storage)(void *map, unsigned long long flags) = 141 139 (void *) BPF_FUNC_get_local_storage; 140 + static unsigned long long (*bpf_skb_cgroup_id)(void *ctx) = 141 + (void *) BPF_FUNC_skb_cgroup_id; 142 + static unsigned long long (*bpf_skb_ancestor_cgroup_id)(void *ctx, int level) = 143 + (void *) BPF_FUNC_skb_ancestor_cgroup_id; 142 144 143 145 /* llvm builtin functions that eBPF C program may use to 144 146 * emit BPF_LD_ABS and BPF_LD_IND instructions ··· 179 173 180 174 static int (*bpf_skb_load_bytes)(void *ctx, int off, void *to, int len) = 181 175 (void *) BPF_FUNC_skb_load_bytes; 176 + static int (*bpf_skb_load_bytes_relative)(void *ctx, int off, void *to, int len, __u32 start_header) = 177 + (void *) BPF_FUNC_skb_load_bytes_relative; 182 178 static int (*bpf_skb_store_bytes)(void *ctx, int off, void *from, int len, int flags) = 183 179 (void *) BPF_FUNC_skb_store_bytes; 184 180 static int (*bpf_l3_csum_replace)(void *ctx, int off, int from, int to, int flags) =
+4
tools/testing/selftests/bpf/bpf_util.h
··· 44 44 name[bpf_num_possible_cpus()] 45 45 #define bpf_percpu(name, cpu) name[(cpu)].v 46 46 47 + #ifndef ARRAY_SIZE 48 + # define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) 49 + #endif 50 + 47 51 #endif /* __BPF_UTIL__ */
+1 -4
tools/testing/selftests/bpf/test_align.c
··· 18 18 19 19 #include "../../../include/linux/filter.h" 20 20 #include "bpf_rlimit.h" 21 - 22 - #ifndef ARRAY_SIZE 23 - # define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) 24 - #endif 21 + #include "bpf_util.h" 25 22 26 23 #define MAX_INSNS 512 27 24 #define MAX_MATCHES 16
+73 -19
tools/testing/selftests/bpf/test_btf.c
··· 19 19 #include <bpf/btf.h> 20 20 21 21 #include "bpf_rlimit.h" 22 + #include "bpf_util.h" 22 23 23 24 static uint32_t pass_cnt; 24 25 static uint32_t error_cnt; ··· 94 93 #define MAX_NR_RAW_TYPES 1024 95 94 #define BTF_LOG_BUF_SIZE 65535 96 95 97 - #ifndef ARRAY_SIZE 98 - # define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) 99 - #endif 100 - 101 96 static struct args { 102 97 unsigned int raw_test_num; 103 98 unsigned int file_test_num; ··· 128 131 __u32 max_entries; 129 132 bool btf_load_err; 130 133 bool map_create_err; 134 + bool ordered_map; 135 + bool lossless_map; 131 136 int hdr_len_delta; 132 137 int type_off_delta; 133 138 int str_off_delta; ··· 2092 2093 } aenum; 2093 2094 }; 2094 2095 2095 - static struct btf_raw_test pprint_test = { 2096 - .descr = "BTF pretty print test #1", 2096 + static struct btf_raw_test pprint_test_template = { 2097 2097 .raw_types = { 2098 2098 /* unsighed char */ /* [1] */ 2099 2099 BTF_TYPE_INT_ENC(NAME_TBD, 0, 0, 8, 1), ··· 2144 2146 }, 2145 2147 .str_sec = "\0unsigned char\0unsigned short\0unsigned int\0int\0unsigned long long\0uint8_t\0uint16_t\0uint32_t\0int32_t\0uint64_t\0ui64\0ui8a\0ENUM_ZERO\0ENUM_ONE\0ENUM_TWO\0ENUM_THREE\0pprint_mapv\0ui32\0ui16\0si32\0unused_bits2a\0bits28\0unused_bits2b\0aenum", 2146 2148 .str_sec_size = sizeof("\0unsigned char\0unsigned short\0unsigned int\0int\0unsigned long long\0uint8_t\0uint16_t\0uint32_t\0int32_t\0uint64_t\0ui64\0ui8a\0ENUM_ZERO\0ENUM_ONE\0ENUM_TWO\0ENUM_THREE\0pprint_mapv\0ui32\0ui16\0si32\0unused_bits2a\0bits28\0unused_bits2b\0aenum"), 2147 - .map_type = BPF_MAP_TYPE_ARRAY, 2148 - .map_name = "pprint_test", 2149 2149 .key_size = sizeof(unsigned int), 2150 2150 .value_size = sizeof(struct pprint_mapv), 2151 2151 .key_type_id = 3, /* unsigned int */ 2152 2152 .value_type_id = 16, /* struct pprint_mapv */ 2153 2153 .max_entries = 128 * 1024, 2154 2154 }; 2155 + 2156 + static struct btf_pprint_test_meta { 2157 + const char *descr; 2158 + enum bpf_map_type map_type; 2159 + 
const char *map_name; 2160 + bool ordered_map; 2161 + bool lossless_map; 2162 + } pprint_tests_meta[] = { 2163 + { 2164 + .descr = "BTF pretty print array", 2165 + .map_type = BPF_MAP_TYPE_ARRAY, 2166 + .map_name = "pprint_test_array", 2167 + .ordered_map = true, 2168 + .lossless_map = true, 2169 + }, 2170 + 2171 + { 2172 + .descr = "BTF pretty print hash", 2173 + .map_type = BPF_MAP_TYPE_HASH, 2174 + .map_name = "pprint_test_hash", 2175 + .ordered_map = false, 2176 + .lossless_map = true, 2177 + }, 2178 + 2179 + { 2180 + .descr = "BTF pretty print lru hash", 2181 + .map_type = BPF_MAP_TYPE_LRU_HASH, 2182 + .map_name = "pprint_test_lru_hash", 2183 + .ordered_map = false, 2184 + .lossless_map = false, 2185 + }, 2186 + 2187 + }; 2188 + 2155 2189 2156 2190 static void set_pprint_mapv(struct pprint_mapv *v, uint32_t i) 2157 2191 { ··· 2196 2166 v->aenum = i & 0x03; 2197 2167 } 2198 2168 2199 - static int test_pprint(void) 2169 + static int do_test_pprint(void) 2200 2170 { 2201 - const struct btf_raw_test *test = &pprint_test; 2171 + const struct btf_raw_test *test = &pprint_test_template; 2202 2172 struct bpf_create_map_attr create_attr = {}; 2173 + unsigned int key, nr_read_elems; 2174 + bool ordered_map, lossless_map; 2203 2175 int map_fd = -1, btf_fd = -1; 2204 2176 struct pprint_mapv mapv = {}; 2205 2177 unsigned int raw_btf_size; ··· 2210 2178 char pin_path[255]; 2211 2179 size_t line_len = 0; 2212 2180 char *line = NULL; 2213 - unsigned int key; 2214 2181 uint8_t *raw_btf; 2215 2182 ssize_t nread; 2216 2183 int err, ret; ··· 2282 2251 goto done; 2283 2252 } 2284 2253 2285 - key = 0; 2254 + nr_read_elems = 0; 2255 + ordered_map = test->ordered_map; 2256 + lossless_map = test->lossless_map; 2286 2257 do { 2287 2258 ssize_t nexpected_line; 2259 + unsigned int next_key; 2288 2260 2289 - set_pprint_mapv(&mapv, key); 2261 + next_key = ordered_map ? 
nr_read_elems : atoi(line); 2262 + set_pprint_mapv(&mapv, next_key); 2290 2263 nexpected_line = snprintf(expected_line, sizeof(expected_line), 2291 2264 "%u: {%u,0,%d,0x%x,0x%x,0x%x,{%lu|[%u,%u,%u,%u,%u,%u,%u,%u]},%s}\n", 2292 - key, 2265 + next_key, 2293 2266 mapv.ui32, mapv.si32, 2294 2267 mapv.unused_bits2a, mapv.bits28, mapv.unused_bits2b, 2295 2268 mapv.ui64, ··· 2316 2281 } 2317 2282 2318 2283 nread = getline(&line, &line_len, pin_file); 2319 - } while (++key < test->max_entries && nread > 0); 2284 + } while (++nr_read_elems < test->max_entries && nread > 0); 2320 2285 2321 - if (CHECK(key < test->max_entries, 2322 - "Unexpected EOF. key:%u test->max_entries:%u", 2323 - key, test->max_entries)) { 2286 + if (lossless_map && 2287 + CHECK(nr_read_elems < test->max_entries, 2288 + "Unexpected EOF. nr_read_elems:%u test->max_entries:%u", 2289 + nr_read_elems, test->max_entries)) { 2324 2290 err = -1; 2325 2291 goto done; 2326 2292 } ··· 2346 2310 fclose(pin_file); 2347 2311 unlink(pin_path); 2348 2312 free(line); 2313 + 2314 + return err; 2315 + } 2316 + 2317 + static int test_pprint(void) 2318 + { 2319 + unsigned int i; 2320 + int err = 0; 2321 + 2322 + for (i = 0; i < ARRAY_SIZE(pprint_tests_meta); i++) { 2323 + pprint_test_template.descr = pprint_tests_meta[i].descr; 2324 + pprint_test_template.map_type = pprint_tests_meta[i].map_type; 2325 + pprint_test_template.map_name = pprint_tests_meta[i].map_name; 2326 + pprint_test_template.ordered_map = pprint_tests_meta[i].ordered_map; 2327 + pprint_test_template.lossless_map = pprint_tests_meta[i].lossless_map; 2328 + 2329 + err |= count_result(do_test_pprint()); 2330 + } 2349 2331 2350 2332 return err; 2351 2333 } ··· 2463 2409 err |= test_file(); 2464 2410 2465 2411 if (args.pprint_test) 2466 - err |= count_result(test_pprint()); 2412 + err |= test_pprint(); 2467 2413 2468 2414 if (args.raw_test || args.get_info_test || args.file_test || 2469 2415 args.pprint_test)
+261 -1
tools/testing/selftests/bpf/test_maps.c
··· 17 17 #include <stdlib.h> 18 18 19 19 #include <sys/wait.h> 20 - 20 + #include <sys/socket.h> 21 + #include <netinet/in.h> 21 22 #include <linux/bpf.h> 22 23 23 24 #include <bpf/bpf.h> ··· 27 26 #include "bpf_util.h" 28 27 #include "bpf_rlimit.h" 29 28 29 + #ifndef ENOTSUPP 30 + #define ENOTSUPP 524 31 + #endif 32 + 30 33 static int map_flags; 34 + 35 + #define CHECK(condition, tag, format...) ({ \ 36 + int __ret = !!(condition); \ 37 + if (__ret) { \ 38 + printf("%s(%d):FAIL:%s ", __func__, __LINE__, tag); \ 39 + printf(format); \ 40 + exit(-1); \ 41 + } \ 42 + }) 31 43 32 44 static void test_hashmap(int task, void *data) 33 45 { ··· 1164 1150 assert(bpf_map_get_next_key(fd, &key, &value) == -1 && errno == EPERM); 1165 1151 } 1166 1152 1153 + static void prepare_reuseport_grp(int type, int map_fd, 1154 + __s64 *fds64, __u64 *sk_cookies, 1155 + unsigned int n) 1156 + { 1157 + socklen_t optlen, addrlen; 1158 + struct sockaddr_in6 s6; 1159 + const __u32 index0 = 0; 1160 + const int optval = 1; 1161 + unsigned int i; 1162 + u64 sk_cookie; 1163 + __s64 fd64; 1164 + int err; 1165 + 1166 + s6.sin6_family = AF_INET6; 1167 + s6.sin6_addr = in6addr_any; 1168 + s6.sin6_port = 0; 1169 + addrlen = sizeof(s6); 1170 + optlen = sizeof(sk_cookie); 1171 + 1172 + for (i = 0; i < n; i++) { 1173 + fd64 = socket(AF_INET6, type, 0); 1174 + CHECK(fd64 == -1, "socket()", 1175 + "sock_type:%d fd64:%lld errno:%d\n", 1176 + type, fd64, errno); 1177 + 1178 + err = setsockopt(fd64, SOL_SOCKET, SO_REUSEPORT, 1179 + &optval, sizeof(optval)); 1180 + CHECK(err == -1, "setsockopt(SO_REUSEEPORT)", 1181 + "err:%d errno:%d\n", err, errno); 1182 + 1183 + /* reuseport_array does not allow unbound sk */ 1184 + err = bpf_map_update_elem(map_fd, &index0, &fd64, 1185 + BPF_ANY); 1186 + CHECK(err != -1 || errno != EINVAL, 1187 + "reuseport array update unbound sk", 1188 + "sock_type:%d err:%d errno:%d\n", 1189 + type, err, errno); 1190 + 1191 + err = bind(fd64, (struct sockaddr *)&s6, sizeof(s6)); 1192 + 
CHECK(err == -1, "bind()", 1193 + "sock_type:%d err:%d errno:%d\n", type, err, errno); 1194 + 1195 + if (i == 0) { 1196 + err = getsockname(fd64, (struct sockaddr *)&s6, 1197 + &addrlen); 1198 + CHECK(err == -1, "getsockname()", 1199 + "sock_type:%d err:%d errno:%d\n", 1200 + type, err, errno); 1201 + } 1202 + 1203 + err = getsockopt(fd64, SOL_SOCKET, SO_COOKIE, &sk_cookie, 1204 + &optlen); 1205 + CHECK(err == -1, "getsockopt(SO_COOKIE)", 1206 + "sock_type:%d err:%d errno:%d\n", type, err, errno); 1207 + 1208 + if (type == SOCK_STREAM) { 1209 + /* 1210 + * reuseport_array does not allow 1211 + * non-listening tcp sk. 1212 + */ 1213 + err = bpf_map_update_elem(map_fd, &index0, &fd64, 1214 + BPF_ANY); 1215 + CHECK(err != -1 || errno != EINVAL, 1216 + "reuseport array update non-listening sk", 1217 + "sock_type:%d err:%d errno:%d\n", 1218 + type, err, errno); 1219 + err = listen(fd64, 0); 1220 + CHECK(err == -1, "listen()", 1221 + "sock_type:%d, err:%d errno:%d\n", 1222 + type, err, errno); 1223 + } 1224 + 1225 + fds64[i] = fd64; 1226 + sk_cookies[i] = sk_cookie; 1227 + } 1228 + } 1229 + 1230 + static void test_reuseport_array(void) 1231 + { 1232 + #define REUSEPORT_FD_IDX(err, last) ({ (err) ? 
last : !last; }) 1233 + 1234 + const __u32 array_size = 4, index0 = 0, index3 = 3; 1235 + int types[2] = { SOCK_STREAM, SOCK_DGRAM }, type; 1236 + __u64 grpa_cookies[2], sk_cookie, map_cookie; 1237 + __s64 grpa_fds64[2] = { -1, -1 }, fd64 = -1; 1238 + const __u32 bad_index = array_size; 1239 + int map_fd, err, t, f; 1240 + __u32 fds_idx = 0; 1241 + int fd; 1242 + 1243 + map_fd = bpf_create_map(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, 1244 + sizeof(__u32), sizeof(__u64), array_size, 0); 1245 + CHECK(map_fd == -1, "reuseport array create", 1246 + "map_fd:%d, errno:%d\n", map_fd, errno); 1247 + 1248 + /* Test lookup/update/delete with invalid index */ 1249 + err = bpf_map_delete_elem(map_fd, &bad_index); 1250 + CHECK(err != -1 || errno != E2BIG, "reuseport array del >=max_entries", 1251 + "err:%d errno:%d\n", err, errno); 1252 + 1253 + err = bpf_map_update_elem(map_fd, &bad_index, &fd64, BPF_ANY); 1254 + CHECK(err != -1 || errno != E2BIG, 1255 + "reuseport array update >=max_entries", 1256 + "err:%d errno:%d\n", err, errno); 1257 + 1258 + err = bpf_map_lookup_elem(map_fd, &bad_index, &map_cookie); 1259 + CHECK(err != -1 || errno != ENOENT, 1260 + "reuseport array update >=max_entries", 1261 + "err:%d errno:%d\n", err, errno); 1262 + 1263 + /* Test lookup/delete non existence elem */ 1264 + err = bpf_map_lookup_elem(map_fd, &index3, &map_cookie); 1265 + CHECK(err != -1 || errno != ENOENT, 1266 + "reuseport array lookup not-exist elem", 1267 + "err:%d errno:%d\n", err, errno); 1268 + err = bpf_map_delete_elem(map_fd, &index3); 1269 + CHECK(err != -1 || errno != ENOENT, 1270 + "reuseport array del not-exist elem", 1271 + "err:%d errno:%d\n", err, errno); 1272 + 1273 + for (t = 0; t < ARRAY_SIZE(types); t++) { 1274 + type = types[t]; 1275 + 1276 + prepare_reuseport_grp(type, map_fd, grpa_fds64, 1277 + grpa_cookies, ARRAY_SIZE(grpa_fds64)); 1278 + 1279 + /* Test BPF_* update flags */ 1280 + /* BPF_EXIST failure case */ 1281 + err = bpf_map_update_elem(map_fd, &index3, 
&grpa_fds64[fds_idx], 1282 + BPF_EXIST); 1283 + CHECK(err != -1 || errno != ENOENT, 1284 + "reuseport array update empty elem BPF_EXIST", 1285 + "sock_type:%d err:%d errno:%d\n", 1286 + type, err, errno); 1287 + fds_idx = REUSEPORT_FD_IDX(err, fds_idx); 1288 + 1289 + /* BPF_NOEXIST success case */ 1290 + err = bpf_map_update_elem(map_fd, &index3, &grpa_fds64[fds_idx], 1291 + BPF_NOEXIST); 1292 + CHECK(err == -1, 1293 + "reuseport array update empty elem BPF_NOEXIST", 1294 + "sock_type:%d err:%d errno:%d\n", 1295 + type, err, errno); 1296 + fds_idx = REUSEPORT_FD_IDX(err, fds_idx); 1297 + 1298 + /* BPF_EXIST success case. */ 1299 + err = bpf_map_update_elem(map_fd, &index3, &grpa_fds64[fds_idx], 1300 + BPF_EXIST); 1301 + CHECK(err == -1, 1302 + "reuseport array update same elem BPF_EXIST", 1303 + "sock_type:%d err:%d errno:%d\n", type, err, errno); 1304 + fds_idx = REUSEPORT_FD_IDX(err, fds_idx); 1305 + 1306 + /* BPF_NOEXIST failure case */ 1307 + err = bpf_map_update_elem(map_fd, &index3, &grpa_fds64[fds_idx], 1308 + BPF_NOEXIST); 1309 + CHECK(err != -1 || errno != EEXIST, 1310 + "reuseport array update non-empty elem BPF_NOEXIST", 1311 + "sock_type:%d err:%d errno:%d\n", 1312 + type, err, errno); 1313 + fds_idx = REUSEPORT_FD_IDX(err, fds_idx); 1314 + 1315 + /* BPF_ANY case (always succeed) */ 1316 + err = bpf_map_update_elem(map_fd, &index3, &grpa_fds64[fds_idx], 1317 + BPF_ANY); 1318 + CHECK(err == -1, 1319 + "reuseport array update same sk with BPF_ANY", 1320 + "sock_type:%d err:%d errno:%d\n", type, err, errno); 1321 + 1322 + fd64 = grpa_fds64[fds_idx]; 1323 + sk_cookie = grpa_cookies[fds_idx]; 1324 + 1325 + /* The same sk cannot be added to reuseport_array twice */ 1326 + err = bpf_map_update_elem(map_fd, &index3, &fd64, BPF_ANY); 1327 + CHECK(err != -1 || errno != EBUSY, 1328 + "reuseport array update same sk with same index", 1329 + "sock_type:%d err:%d errno:%d\n", 1330 + type, err, errno); 1331 + 1332 + err = bpf_map_update_elem(map_fd, &index0, &fd64, 
BPF_ANY); 1333 + CHECK(err != -1 || errno != EBUSY, 1334 + "reuseport array update same sk with different index", 1335 + "sock_type:%d err:%d errno:%d\n", 1336 + type, err, errno); 1337 + 1338 + /* Test delete elem */ 1339 + err = bpf_map_delete_elem(map_fd, &index3); 1340 + CHECK(err == -1, "reuseport array delete sk", 1341 + "sock_type:%d err:%d errno:%d\n", 1342 + type, err, errno); 1343 + 1344 + /* Add it back with BPF_NOEXIST */ 1345 + err = bpf_map_update_elem(map_fd, &index3, &fd64, BPF_NOEXIST); 1346 + CHECK(err == -1, 1347 + "reuseport array re-add with BPF_NOEXIST after del", 1348 + "sock_type:%d err:%d errno:%d\n", type, err, errno); 1349 + 1350 + /* Test cookie */ 1351 + err = bpf_map_lookup_elem(map_fd, &index3, &map_cookie); 1352 + CHECK(err == -1 || sk_cookie != map_cookie, 1353 + "reuseport array lookup re-added sk", 1354 + "sock_type:%d err:%d errno:%d sk_cookie:0x%llx map_cookie:0x%llxn", 1355 + type, err, errno, sk_cookie, map_cookie); 1356 + 1357 + /* Test elem removed by close() */ 1358 + for (f = 0; f < ARRAY_SIZE(grpa_fds64); f++) 1359 + close(grpa_fds64[f]); 1360 + err = bpf_map_lookup_elem(map_fd, &index3, &map_cookie); 1361 + CHECK(err != -1 || errno != ENOENT, 1362 + "reuseport array lookup after close()", 1363 + "sock_type:%d err:%d errno:%d\n", 1364 + type, err, errno); 1365 + } 1366 + 1367 + /* Test SOCK_RAW */ 1368 + fd64 = socket(AF_INET6, SOCK_RAW, IPPROTO_UDP); 1369 + CHECK(fd64 == -1, "socket(SOCK_RAW)", "err:%d errno:%d\n", 1370 + err, errno); 1371 + err = bpf_map_update_elem(map_fd, &index3, &fd64, BPF_NOEXIST); 1372 + CHECK(err != -1 || errno != ENOTSUPP, "reuseport array update SOCK_RAW", 1373 + "err:%d errno:%d\n", err, errno); 1374 + close(fd64); 1375 + 1376 + /* Close the 64 bit value map */ 1377 + close(map_fd); 1378 + 1379 + /* Test 32 bit fd */ 1380 + map_fd = bpf_create_map(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, 1381 + sizeof(__u32), sizeof(__u32), array_size, 0); 1382 + CHECK(map_fd == -1, "reuseport array create", 1383 + 
"map_fd:%d, errno:%d\n", map_fd, errno); 1384 + prepare_reuseport_grp(SOCK_STREAM, map_fd, &fd64, &sk_cookie, 1); 1385 + fd = fd64; 1386 + err = bpf_map_update_elem(map_fd, &index3, &fd, BPF_NOEXIST); 1387 + CHECK(err == -1, "reuseport array update 32 bit fd", 1388 + "err:%d errno:%d\n", err, errno); 1389 + err = bpf_map_lookup_elem(map_fd, &index3, &map_cookie); 1390 + CHECK(err != -1 || errno != ENOSPC, 1391 + "reuseport array lookup 32 bit fd", 1392 + "err:%d errno:%d\n", err, errno); 1393 + close(fd); 1394 + close(map_fd); 1395 + } 1396 + 1167 1397 static void run_all_tests(void) 1168 1398 { 1169 1399 test_hashmap(0, NULL); ··· 1428 1170 1429 1171 test_map_rdonly(); 1430 1172 test_map_wronly(); 1173 + 1174 + test_reuseport_array(); 1431 1175 } 1432 1176 1433 1177 int main(void)
+688
tools/testing/selftests/bpf/test_select_reuseport.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2018 Facebook */ 3 + 4 + #include <stdlib.h> 5 + #include <unistd.h> 6 + #include <stdbool.h> 7 + #include <string.h> 8 + #include <errno.h> 9 + #include <assert.h> 10 + #include <fcntl.h> 11 + #include <linux/bpf.h> 12 + #include <linux/err.h> 13 + #include <linux/types.h> 14 + #include <linux/if_ether.h> 15 + #include <sys/types.h> 16 + #include <sys/epoll.h> 17 + #include <sys/socket.h> 18 + #include <netinet/in.h> 19 + #include <bpf/bpf.h> 20 + #include <bpf/libbpf.h> 21 + #include "bpf_rlimit.h" 22 + #include "bpf_util.h" 23 + #include "test_select_reuseport_common.h" 24 + 25 + #define MIN_TCPHDR_LEN 20 26 + #define UDPHDR_LEN 8 27 + 28 + #define TCP_SYNCOOKIE_SYSCTL "/proc/sys/net/ipv4/tcp_syncookies" 29 + #define TCP_FO_SYSCTL "/proc/sys/net/ipv4/tcp_fastopen" 30 + #define REUSEPORT_ARRAY_SIZE 32 31 + 32 + static int result_map, tmp_index_ovr_map, linum_map, data_check_map; 33 + static enum result expected_results[NR_RESULTS]; 34 + static int sk_fds[REUSEPORT_ARRAY_SIZE]; 35 + static int reuseport_array, outer_map; 36 + static int select_by_skb_data_prog; 37 + static int saved_tcp_syncookie; 38 + static struct bpf_object *obj; 39 + static int saved_tcp_fo; 40 + static __u32 index_zero; 41 + static int epfd; 42 + 43 + static union sa46 { 44 + struct sockaddr_in6 v6; 45 + struct sockaddr_in v4; 46 + sa_family_t family; 47 + } srv_sa; 48 + 49 + #define CHECK(condition, tag, format...) 
({ \ 50 + int __ret = !!(condition); \ 51 + if (__ret) { \ 52 + printf("%s(%d):FAIL:%s ", __func__, __LINE__, tag); \ 53 + printf(format); \ 54 + exit(-1); \ 55 + } \ 56 + }) 57 + 58 + static void create_maps(void) 59 + { 60 + struct bpf_create_map_attr attr = {}; 61 + 62 + /* Creating reuseport_array */ 63 + attr.name = "reuseport_array"; 64 + attr.map_type = BPF_MAP_TYPE_REUSEPORT_SOCKARRAY; 65 + attr.key_size = sizeof(__u32); 66 + attr.value_size = sizeof(__u32); 67 + attr.max_entries = REUSEPORT_ARRAY_SIZE; 68 + 69 + reuseport_array = bpf_create_map_xattr(&attr); 70 + CHECK(reuseport_array == -1, "creating reuseport_array", 71 + "reuseport_array:%d errno:%d\n", reuseport_array, errno); 72 + 73 + /* Creating outer_map */ 74 + attr.name = "outer_map"; 75 + attr.map_type = BPF_MAP_TYPE_ARRAY_OF_MAPS; 76 + attr.key_size = sizeof(__u32); 77 + attr.value_size = sizeof(__u32); 78 + attr.max_entries = 1; 79 + attr.inner_map_fd = reuseport_array; 80 + outer_map = bpf_create_map_xattr(&attr); 81 + CHECK(outer_map == -1, "creating outer_map", 82 + "outer_map:%d errno:%d\n", outer_map, errno); 83 + } 84 + 85 + static void prepare_bpf_obj(void) 86 + { 87 + struct bpf_program *prog; 88 + struct bpf_map *map; 89 + int err; 90 + struct bpf_object_open_attr attr = { 91 + .file = "test_select_reuseport_kern.o", 92 + .prog_type = BPF_PROG_TYPE_SK_REUSEPORT, 93 + }; 94 + 95 + obj = bpf_object__open_xattr(&attr); 96 + CHECK(IS_ERR_OR_NULL(obj), "open test_select_reuseport_kern.o", 97 + "obj:%p PTR_ERR(obj):%ld\n", obj, PTR_ERR(obj)); 98 + 99 + prog = bpf_program__next(NULL, obj); 100 + CHECK(!prog, "get first bpf_program", "!prog\n"); 101 + bpf_program__set_type(prog, attr.prog_type); 102 + 103 + map = bpf_object__find_map_by_name(obj, "outer_map"); 104 + CHECK(!map, "find outer_map", "!map\n"); 105 + err = bpf_map__reuse_fd(map, outer_map); 106 + CHECK(err, "reuse outer_map", "err:%d\n", err); 107 + 108 + err = bpf_object__load(obj); 109 + CHECK(err, "load bpf_object", "err:%d\n", 
err); 110 + 111 + select_by_skb_data_prog = bpf_program__fd(prog); 112 + CHECK(select_by_skb_data_prog == -1, "get prog fd", 113 + "select_by_skb_data_prog:%d\n", select_by_skb_data_prog); 114 + 115 + map = bpf_object__find_map_by_name(obj, "result_map"); 116 + CHECK(!map, "find result_map", "!map\n"); 117 + result_map = bpf_map__fd(map); 118 + CHECK(result_map == -1, "get result_map fd", 119 + "result_map:%d\n", result_map); 120 + 121 + map = bpf_object__find_map_by_name(obj, "tmp_index_ovr_map"); 122 + CHECK(!map, "find tmp_index_ovr_map", "!map\n"); 123 + tmp_index_ovr_map = bpf_map__fd(map); 124 + CHECK(tmp_index_ovr_map == -1, "get tmp_index_ovr_map fd", 125 + "tmp_index_ovr_map:%d\n", tmp_index_ovr_map); 126 + 127 + map = bpf_object__find_map_by_name(obj, "linum_map"); 128 + CHECK(!map, "find linum_map", "!map\n"); 129 + linum_map = bpf_map__fd(map); 130 + CHECK(linum_map == -1, "get linum_map fd", 131 + "linum_map:%d\n", linum_map); 132 + 133 + map = bpf_object__find_map_by_name(obj, "data_check_map"); 134 + CHECK(!map, "find data_check_map", "!map\n"); 135 + data_check_map = bpf_map__fd(map); 136 + CHECK(data_check_map == -1, "get data_check_map fd", 137 + "data_check_map:%d\n", data_check_map); 138 + } 139 + 140 + static void sa46_init_loopback(union sa46 *sa, sa_family_t family) 141 + { 142 + memset(sa, 0, sizeof(*sa)); 143 + sa->family = family; 144 + if (sa->family == AF_INET6) 145 + sa->v6.sin6_addr = in6addr_loopback; 146 + else 147 + sa->v4.sin_addr.s_addr = htonl(INADDR_LOOPBACK); 148 + } 149 + 150 + static void sa46_init_inany(union sa46 *sa, sa_family_t family) 151 + { 152 + memset(sa, 0, sizeof(*sa)); 153 + sa->family = family; 154 + if (sa->family == AF_INET6) 155 + sa->v6.sin6_addr = in6addr_any; 156 + else 157 + sa->v4.sin_addr.s_addr = INADDR_ANY; 158 + } 159 + 160 + static int read_int_sysctl(const char *sysctl) 161 + { 162 + char buf[16]; 163 + int fd, ret; 164 + 165 + fd = open(sysctl, 0); 166 + CHECK(fd == -1, "open(sysctl)", "sysctl:%s 
fd:%d errno:%d\n", 167 + sysctl, fd, errno); 168 + 169 + ret = read(fd, buf, sizeof(buf)); 170 + CHECK(ret <= 0, "read(sysctl)", "sysctl:%s ret:%d errno:%d\n", 171 + sysctl, ret, errno); 172 + close(fd); 173 + 174 + return atoi(buf); 175 + } 176 + 177 + static void write_int_sysctl(const char *sysctl, int v) 178 + { 179 + int fd, ret, size; 180 + char buf[16]; 181 + 182 + fd = open(sysctl, O_RDWR); 183 + CHECK(fd == -1, "open(sysctl)", "sysctl:%s fd:%d errno:%d\n", 184 + sysctl, fd, errno); 185 + 186 + size = snprintf(buf, sizeof(buf), "%d", v); 187 + ret = write(fd, buf, size); 188 + CHECK(ret != size, "write(sysctl)", 189 + "sysctl:%s ret:%d size:%d errno:%d\n", sysctl, ret, size, errno); 190 + close(fd); 191 + } 192 + 193 + static void restore_sysctls(void) 194 + { 195 + write_int_sysctl(TCP_FO_SYSCTL, saved_tcp_fo); 196 + write_int_sysctl(TCP_SYNCOOKIE_SYSCTL, saved_tcp_syncookie); 197 + } 198 + 199 + static void enable_fastopen(void) 200 + { 201 + int fo; 202 + 203 + fo = read_int_sysctl(TCP_FO_SYSCTL); 204 + write_int_sysctl(TCP_FO_SYSCTL, fo | 7); 205 + } 206 + 207 + static void enable_syncookie(void) 208 + { 209 + write_int_sysctl(TCP_SYNCOOKIE_SYSCTL, 2); 210 + } 211 + 212 + static void disable_syncookie(void) 213 + { 214 + write_int_sysctl(TCP_SYNCOOKIE_SYSCTL, 0); 215 + } 216 + 217 + static __u32 get_linum(void) 218 + { 219 + __u32 linum; 220 + int err; 221 + 222 + err = bpf_map_lookup_elem(linum_map, &index_zero, &linum); 223 + CHECK(err == -1, "lookup_elem(linum_map)", "err:%d errno:%d\n", 224 + err, errno); 225 + 226 + return linum; 227 + } 228 + 229 + static void check_data(int type, sa_family_t family, const struct cmd *cmd, 230 + int cli_fd) 231 + { 232 + struct data_check expected = {}, result; 233 + union sa46 cli_sa; 234 + socklen_t addrlen; 235 + int err; 236 + 237 + addrlen = sizeof(cli_sa); 238 + err = getsockname(cli_fd, (struct sockaddr *)&cli_sa, 239 + &addrlen); 240 + CHECK(err == -1, "getsockname(cli_fd)", "err:%d errno:%d\n", 241 + err, 
errno); 242 + 243 + err = bpf_map_lookup_elem(data_check_map, &index_zero, &result); 244 + CHECK(err == -1, "lookup_elem(data_check_map)", "err:%d errno:%d\n", 245 + err, errno); 246 + 247 + if (type == SOCK_STREAM) { 248 + expected.len = MIN_TCPHDR_LEN; 249 + expected.ip_protocol = IPPROTO_TCP; 250 + } else { 251 + expected.len = UDPHDR_LEN; 252 + expected.ip_protocol = IPPROTO_UDP; 253 + } 254 + 255 + if (family == AF_INET6) { 256 + expected.eth_protocol = htons(ETH_P_IPV6); 257 + expected.bind_inany = !srv_sa.v6.sin6_addr.s6_addr32[3] && 258 + !srv_sa.v6.sin6_addr.s6_addr32[2] && 259 + !srv_sa.v6.sin6_addr.s6_addr32[1] && 260 + !srv_sa.v6.sin6_addr.s6_addr32[0]; 261 + 262 + memcpy(&expected.skb_addrs[0], cli_sa.v6.sin6_addr.s6_addr32, 263 + sizeof(cli_sa.v6.sin6_addr)); 264 + memcpy(&expected.skb_addrs[4], &in6addr_loopback, 265 + sizeof(in6addr_loopback)); 266 + expected.skb_ports[0] = cli_sa.v6.sin6_port; 267 + expected.skb_ports[1] = srv_sa.v6.sin6_port; 268 + } else { 269 + expected.eth_protocol = htons(ETH_P_IP); 270 + expected.bind_inany = !srv_sa.v4.sin_addr.s_addr; 271 + 272 + expected.skb_addrs[0] = cli_sa.v4.sin_addr.s_addr; 273 + expected.skb_addrs[1] = htonl(INADDR_LOOPBACK); 274 + expected.skb_ports[0] = cli_sa.v4.sin_port; 275 + expected.skb_ports[1] = srv_sa.v4.sin_port; 276 + } 277 + 278 + if (memcmp(&result, &expected, offsetof(struct data_check, 279 + equal_check_end))) { 280 + printf("unexpected data_check\n"); 281 + printf(" result: (0x%x, %u, %u)\n", 282 + result.eth_protocol, result.ip_protocol, 283 + result.bind_inany); 284 + printf("expected: (0x%x, %u, %u)\n", 285 + expected.eth_protocol, expected.ip_protocol, 286 + expected.bind_inany); 287 + CHECK(1, "data_check result != expected", 288 + "bpf_prog_linum:%u\n", get_linum()); 289 + } 290 + 291 + CHECK(!result.hash, "data_check result.hash empty", 292 + "result.hash:%u", result.hash); 293 + 294 + expected.len += cmd ? 
sizeof(*cmd) : 0; 295 + if (type == SOCK_STREAM) 296 + CHECK(expected.len > result.len, "expected.len > result.len", 297 + "expected.len:%u result.len:%u bpf_prog_linum:%u\n", 298 + expected.len, result.len, get_linum()); 299 + else 300 + CHECK(expected.len != result.len, "expected.len != result.len", 301 + "expected.len:%u result.len:%u bpf_prog_linum:%u\n", 302 + expected.len, result.len, get_linum()); 303 + } 304 + 305 + static void check_results(void) 306 + { 307 + __u32 results[NR_RESULTS]; 308 + __u32 i, broken = 0; 309 + int err; 310 + 311 + for (i = 0; i < NR_RESULTS; i++) { 312 + err = bpf_map_lookup_elem(result_map, &i, &results[i]); 313 + CHECK(err == -1, "lookup_elem(result_map)", 314 + "i:%u err:%d errno:%d\n", i, err, errno); 315 + } 316 + 317 + for (i = 0; i < NR_RESULTS; i++) { 318 + if (results[i] != expected_results[i]) { 319 + broken = i; 320 + break; 321 + } 322 + } 323 + 324 + if (i == NR_RESULTS) 325 + return; 326 + 327 + printf("unexpected result\n"); 328 + printf(" result: ["); 329 + printf("%u", results[0]); 330 + for (i = 1; i < NR_RESULTS; i++) 331 + printf(", %u", results[i]); 332 + printf("]\n"); 333 + 334 + printf("expected: ["); 335 + printf("%u", expected_results[0]); 336 + for (i = 1; i < NR_RESULTS; i++) 337 + printf(", %u", expected_results[i]); 338 + printf("]\n"); 339 + 340 + CHECK(expected_results[broken] != results[broken], 341 + "unexpected result", 342 + "expected_results[%u] != results[%u] bpf_prog_linum:%u\n", 343 + broken, broken, get_linum()); 344 + } 345 + 346 + static int send_data(int type, sa_family_t family, void *data, size_t len, 347 + enum result expected) 348 + { 349 + union sa46 cli_sa; 350 + int fd, err; 351 + 352 + fd = socket(family, type, 0); 353 + CHECK(fd == -1, "socket()", "fd:%d errno:%d\n", fd, errno); 354 + 355 + sa46_init_loopback(&cli_sa, family); 356 + err = bind(fd, (struct sockaddr *)&cli_sa, sizeof(cli_sa)); 357 + CHECK(fd == -1, "bind(cli_sa)", "err:%d errno:%d\n", err, errno); 358 + 359 + err 
= sendto(fd, data, len, MSG_FASTOPEN, (struct sockaddr *)&srv_sa, 360 + sizeof(srv_sa)); 361 + CHECK(err != len && expected >= PASS, 362 + "sendto()", "family:%u err:%d errno:%d expected:%d\n", 363 + family, err, errno, expected); 364 + 365 + return fd; 366 + } 367 + 368 + static void do_test(int type, sa_family_t family, struct cmd *cmd, 369 + enum result expected) 370 + { 371 + int nev, srv_fd, cli_fd; 372 + struct epoll_event ev; 373 + struct cmd rcv_cmd; 374 + ssize_t nread; 375 + 376 + cli_fd = send_data(type, family, cmd, cmd ? sizeof(*cmd) : 0, 377 + expected); 378 + nev = epoll_wait(epfd, &ev, 1, expected >= PASS ? 5 : 0); 379 + CHECK((nev <= 0 && expected >= PASS) || 380 + (nev > 0 && expected < PASS), 381 + "nev <> expected", 382 + "nev:%d expected:%d type:%d family:%d data:(%d, %d)\n", 383 + nev, expected, type, family, 384 + cmd ? cmd->reuseport_index : -1, 385 + cmd ? cmd->pass_on_failure : -1); 386 + check_results(); 387 + check_data(type, family, cmd, cli_fd); 388 + 389 + if (expected < PASS) 390 + return; 391 + 392 + CHECK(expected != PASS_ERR_SK_SELECT_REUSEPORT && 393 + cmd->reuseport_index != ev.data.u32, 394 + "check cmd->reuseport_index", 395 + "cmd:(%u, %u) ev.data.u32:%u\n", 396 + cmd->pass_on_failure, cmd->reuseport_index, ev.data.u32); 397 + 398 + srv_fd = sk_fds[ev.data.u32]; 399 + if (type == SOCK_STREAM) { 400 + int new_fd = accept(srv_fd, NULL, 0); 401 + 402 + CHECK(new_fd == -1, "accept(srv_fd)", 403 + "ev.data.u32:%u new_fd:%d errno:%d\n", 404 + ev.data.u32, new_fd, errno); 405 + 406 + nread = recv(new_fd, &rcv_cmd, sizeof(rcv_cmd), MSG_DONTWAIT); 407 + CHECK(nread != sizeof(rcv_cmd), 408 + "recv(new_fd)", 409 + "ev.data.u32:%u nread:%zd sizeof(rcv_cmd):%zu errno:%d\n", 410 + ev.data.u32, nread, sizeof(rcv_cmd), errno); 411 + 412 + close(new_fd); 413 + } else { 414 + nread = recv(srv_fd, &rcv_cmd, sizeof(rcv_cmd), MSG_DONTWAIT); 415 + CHECK(nread != sizeof(rcv_cmd), 416 + "recv(sk_fds)", 417 + "ev.data.u32:%u nread:%zd 
sizeof(rcv_cmd):%zu errno:%d\n", 418 + ev.data.u32, nread, sizeof(rcv_cmd), errno); 419 + } 420 + 421 + close(cli_fd); 422 + } 423 + 424 + static void test_err_inner_map(int type, sa_family_t family) 425 + { 426 + struct cmd cmd = { 427 + .reuseport_index = 0, 428 + .pass_on_failure = 0, 429 + }; 430 + 431 + printf("%s: ", __func__); 432 + expected_results[DROP_ERR_INNER_MAP]++; 433 + do_test(type, family, &cmd, DROP_ERR_INNER_MAP); 434 + printf("OK\n"); 435 + } 436 + 437 + static void test_err_skb_data(int type, sa_family_t family) 438 + { 439 + printf("%s: ", __func__); 440 + expected_results[DROP_ERR_SKB_DATA]++; 441 + do_test(type, family, NULL, DROP_ERR_SKB_DATA); 442 + printf("OK\n"); 443 + } 444 + 445 + static void test_err_sk_select_port(int type, sa_family_t family) 446 + { 447 + struct cmd cmd = { 448 + .reuseport_index = REUSEPORT_ARRAY_SIZE, 449 + .pass_on_failure = 0, 450 + }; 451 + 452 + printf("%s: ", __func__); 453 + expected_results[DROP_ERR_SK_SELECT_REUSEPORT]++; 454 + do_test(type, family, &cmd, DROP_ERR_SK_SELECT_REUSEPORT); 455 + printf("OK\n"); 456 + } 457 + 458 + static void test_pass(int type, sa_family_t family) 459 + { 460 + struct cmd cmd; 461 + int i; 462 + 463 + printf("%s: ", __func__); 464 + cmd.pass_on_failure = 0; 465 + for (i = 0; i < REUSEPORT_ARRAY_SIZE; i++) { 466 + expected_results[PASS]++; 467 + cmd.reuseport_index = i; 468 + do_test(type, family, &cmd, PASS); 469 + } 470 + printf("OK\n"); 471 + } 472 + 473 + static void test_syncookie(int type, sa_family_t family) 474 + { 475 + int err, tmp_index = 1; 476 + struct cmd cmd = { 477 + .reuseport_index = 0, 478 + .pass_on_failure = 0, 479 + }; 480 + 481 + if (type != SOCK_STREAM) 482 + return; 483 + 484 + printf("%s: ", __func__); 485 + /* 486 + * +1 for TCP-SYN and 487 + * +1 for the TCP-ACK (ack the syncookie) 488 + */ 489 + expected_results[PASS] += 2; 490 + enable_syncookie(); 491 + /* 492 + * Simulate TCP-SYN and TCP-ACK are handled by two different sk: 493 + * TCP-SYN: 
select sk_fds[tmp_index = 1] tmp_index is from the 494 + * tmp_index_ovr_map 495 + * TCP-ACK: select sk_fds[reuseport_index = 0] reuseport_index 496 + * is from the cmd.reuseport_index 497 + */ 498 + err = bpf_map_update_elem(tmp_index_ovr_map, &index_zero, 499 + &tmp_index, BPF_ANY); 500 + CHECK(err == -1, "update_elem(tmp_index_ovr_map, 0, 1)", 501 + "err:%d errno:%d\n", err, errno); 502 + do_test(type, family, &cmd, PASS); 503 + err = bpf_map_lookup_elem(tmp_index_ovr_map, &index_zero, 504 + &tmp_index); 505 + CHECK(err == -1 || tmp_index != -1, 506 + "lookup_elem(tmp_index_ovr_map)", 507 + "err:%d errno:%d tmp_index:%d\n", 508 + err, errno, tmp_index); 509 + disable_syncookie(); 510 + printf("OK\n"); 511 + } 512 + 513 + static void test_pass_on_err(int type, sa_family_t family) 514 + { 515 + struct cmd cmd = { 516 + .reuseport_index = REUSEPORT_ARRAY_SIZE, 517 + .pass_on_failure = 1, 518 + }; 519 + 520 + printf("%s: ", __func__); 521 + expected_results[PASS_ERR_SK_SELECT_REUSEPORT] += 1; 522 + do_test(type, family, &cmd, PASS_ERR_SK_SELECT_REUSEPORT); 523 + printf("OK\n"); 524 + } 525 + 526 + static void prepare_sk_fds(int type, sa_family_t family, bool inany) 527 + { 528 + const int first = REUSEPORT_ARRAY_SIZE - 1; 529 + int i, err, optval = 1; 530 + struct epoll_event ev; 531 + socklen_t addrlen; 532 + 533 + if (inany) 534 + sa46_init_inany(&srv_sa, family); 535 + else 536 + sa46_init_loopback(&srv_sa, family); 537 + addrlen = sizeof(srv_sa); 538 + 539 + /* 540 + * The sk_fds[] is filled from the back such that the order 541 + * is exactly opposite to the (struct sock_reuseport *)reuse->socks[]. 
542 + */ 543 + for (i = first; i >= 0; i--) { 544 + sk_fds[i] = socket(family, type, 0); 545 + CHECK(sk_fds[i] == -1, "socket()", "sk_fds[%d]:%d errno:%d\n", 546 + i, sk_fds[i], errno); 547 + err = setsockopt(sk_fds[i], SOL_SOCKET, SO_REUSEPORT, 548 + &optval, sizeof(optval)); 549 + CHECK(err == -1, "setsockopt(SO_REUSEPORT)", 550 + "sk_fds[%d] err:%d errno:%d\n", 551 + i, err, errno); 552 + 553 + if (i == first) { 554 + err = setsockopt(sk_fds[i], SOL_SOCKET, 555 + SO_ATTACH_REUSEPORT_EBPF, 556 + &select_by_skb_data_prog, 557 + sizeof(select_by_skb_data_prog)); 558 + CHECK(err == -1, "setsockopt(SO_ATTACH_REUEPORT_EBPF)", 559 + "err:%d errno:%d\n", err, errno); 560 + } 561 + 562 + err = bind(sk_fds[i], (struct sockaddr *)&srv_sa, addrlen); 563 + CHECK(err == -1, "bind()", "sk_fds[%d] err:%d errno:%d\n", 564 + i, err, errno); 565 + 566 + if (type == SOCK_STREAM) { 567 + err = listen(sk_fds[i], 10); 568 + CHECK(err == -1, "listen()", 569 + "sk_fds[%d] err:%d errno:%d\n", 570 + i, err, errno); 571 + } 572 + 573 + err = bpf_map_update_elem(reuseport_array, &i, &sk_fds[i], 574 + BPF_NOEXIST); 575 + CHECK(err == -1, "update_elem(reuseport_array)", 576 + "sk_fds[%d] err:%d errno:%d\n", i, err, errno); 577 + 578 + if (i == first) { 579 + socklen_t addrlen = sizeof(srv_sa); 580 + 581 + err = getsockname(sk_fds[i], (struct sockaddr *)&srv_sa, 582 + &addrlen); 583 + CHECK(err == -1, "getsockname()", 584 + "sk_fds[%d] err:%d errno:%d\n", i, err, errno); 585 + } 586 + } 587 + 588 + epfd = epoll_create(1); 589 + CHECK(epfd == -1, "epoll_create(1)", 590 + "epfd:%d errno:%d\n", epfd, errno); 591 + 592 + ev.events = EPOLLIN; 593 + for (i = 0; i < REUSEPORT_ARRAY_SIZE; i++) { 594 + ev.data.u32 = i; 595 + err = epoll_ctl(epfd, EPOLL_CTL_ADD, sk_fds[i], &ev); 596 + CHECK(err, "epoll_ctl(EPOLL_CTL_ADD)", "sk_fds[%d]\n", i); 597 + } 598 + } 599 + 600 + static void setup_per_test(int type, unsigned short family, bool inany) 601 + { 602 + int ovr = -1, err; 603 + 604 + 
prepare_sk_fds(type, family, inany); 605 + err = bpf_map_update_elem(tmp_index_ovr_map, &index_zero, &ovr, 606 + BPF_ANY); 607 + CHECK(err == -1, "update_elem(tmp_index_ovr_map, 0, -1)", 608 + "err:%d errno:%d\n", err, errno); 609 + } 610 + 611 + static void cleanup_per_test(void) 612 + { 613 + int i, err; 614 + 615 + for (i = 0; i < REUSEPORT_ARRAY_SIZE; i++) 616 + close(sk_fds[i]); 617 + close(epfd); 618 + 619 + err = bpf_map_delete_elem(outer_map, &index_zero); 620 + CHECK(err == -1, "delete_elem(outer_map)", 621 + "err:%d errno:%d\n", err, errno); 622 + } 623 + 624 + static void cleanup(void) 625 + { 626 + close(outer_map); 627 + close(reuseport_array); 628 + bpf_object__close(obj); 629 + } 630 + 631 + static void test_all(void) 632 + { 633 + /* Extra SOCK_STREAM to test bind_inany==true */ 634 + const int types[] = { SOCK_STREAM, SOCK_DGRAM, SOCK_STREAM }; 635 + const char * const type_strings[] = { "TCP", "UDP", "TCP" }; 636 + const char * const family_strings[] = { "IPv6", "IPv4" }; 637 + const unsigned short families[] = { AF_INET6, AF_INET }; 638 + const bool bind_inany[] = { false, false, true }; 639 + int t, f, err; 640 + 641 + for (f = 0; f < ARRAY_SIZE(families); f++) { 642 + unsigned short family = families[f]; 643 + 644 + for (t = 0; t < ARRAY_SIZE(types); t++) { 645 + bool inany = bind_inany[t]; 646 + int type = types[t]; 647 + 648 + printf("######## %s/%s %s ########\n", 649 + family_strings[f], type_strings[t], 650 + inany ? 
" INANY " : "LOOPBACK"); 651 + 652 + setup_per_test(type, family, inany); 653 + 654 + test_err_inner_map(type, family); 655 + 656 + /* Install reuseport_array to the outer_map */ 657 + err = bpf_map_update_elem(outer_map, &index_zero, 658 + &reuseport_array, BPF_ANY); 659 + CHECK(err == -1, "update_elem(outer_map)", 660 + "err:%d errno:%d\n", err, errno); 661 + 662 + test_err_skb_data(type, family); 663 + test_err_sk_select_port(type, family); 664 + test_pass(type, family); 665 + test_syncookie(type, family); 666 + test_pass_on_err(type, family); 667 + 668 + cleanup_per_test(); 669 + printf("\n"); 670 + } 671 + } 672 + } 673 + 674 + int main(int argc, const char **argv) 675 + { 676 + create_maps(); 677 + prepare_bpf_obj(); 678 + saved_tcp_fo = read_int_sysctl(TCP_FO_SYSCTL); 679 + saved_tcp_syncookie = read_int_sysctl(TCP_SYNCOOKIE_SYSCTL); 680 + enable_fastopen(); 681 + disable_syncookie(); 682 + atexit(restore_sysctls); 683 + 684 + test_all(); 685 + 686 + cleanup(); 687 + return 0; 688 + }
+36
tools/testing/selftests/bpf/test_select_reuseport_common.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* Copyright (c) 2018 Facebook */ 3 + 4 + #ifndef __TEST_SELECT_REUSEPORT_COMMON_H 5 + #define __TEST_SELECT_REUSEPORT_COMMON_H 6 + 7 + #include <linux/types.h> 8 + 9 + enum result { 10 + DROP_ERR_INNER_MAP, 11 + DROP_ERR_SKB_DATA, 12 + DROP_ERR_SK_SELECT_REUSEPORT, 13 + DROP_MISC, 14 + PASS, 15 + PASS_ERR_SK_SELECT_REUSEPORT, 16 + NR_RESULTS, 17 + }; 18 + 19 + struct cmd { 20 + __u32 reuseport_index; 21 + __u32 pass_on_failure; 22 + }; 23 + 24 + struct data_check { 25 + __u32 ip_protocol; 26 + __u32 skb_addrs[8]; 27 + __u16 skb_ports[2]; 28 + __u16 eth_protocol; 29 + __u8 bind_inany; 30 + __u8 equal_check_end[0]; 31 + 32 + __u32 len; 33 + __u32 hash; 34 + }; 35 + 36 + #endif
+180
tools/testing/selftests/bpf/test_select_reuseport_kern.c
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2018 Facebook */

#include <stdlib.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/bpf.h>
#include <linux/types.h>
#include <linux/if_ether.h>

#include "bpf_endian.h"
#include "bpf_helpers.h"
#include "test_select_reuseport_common.h"

int _version SEC("version") = 1;

#ifndef offsetof
#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
#endif

/* Map-in-map: userspace installs/removes the REUSEPORT_SOCKARRAY here to
 * exercise the "inner map missing" error path.
 */
struct bpf_map_def SEC("maps") outer_map = {
	.type = BPF_MAP_TYPE_ARRAY_OF_MAPS,
	.key_size = sizeof(__u32),
	.value_size = sizeof(__u32),
	.max_entries = 1,
};

/* Per-result counters, indexed by enum result. */
struct bpf_map_def SEC("maps") result_map = {
	.type = BPF_MAP_TYPE_ARRAY,
	.key_size = sizeof(__u32),
	.value_size = sizeof(__u32),
	.max_entries = NR_RESULTS,
};

/* One-shot index override (set by userspace, consumed and reset to -1 by
 * the prog) used to make SYN and syncookie-ACK select different sockets.
 */
struct bpf_map_def SEC("maps") tmp_index_ovr_map = {
	.type = BPF_MAP_TYPE_ARRAY,
	.key_size = sizeof(__u32),
	.value_size = sizeof(int),
	.max_entries = 1,
};

/* __LINE__ of the last GOTO_DONE, for diagnosing failures from userspace. */
struct bpf_map_def SEC("maps") linum_map = {
	.type = BPF_MAP_TYPE_ARRAY,
	.key_size = sizeof(__u32),
	.value_size = sizeof(__u32),
	.max_entries = 1,
};

/* Snapshot of the sk_reuseport_md fields seen by the last prog run. */
struct bpf_map_def SEC("maps") data_check_map = {
	.type = BPF_MAP_TYPE_ARRAY,
	.key_size = sizeof(__u32),
	.value_size = sizeof(struct data_check),
	.max_entries = 1,
};

/* Record the result and source line, then jump to the common exit. */
#define GOTO_DONE(_result) ({			\
	result = (_result);			\
	linum = __LINE__;			\
	goto done;				\
})

SEC("select_by_skb_data")
int _select_by_skb_data(struct sk_reuseport_md *reuse_md)
{
	__u32 linum, index = 0, flags = 0, index_zero = 0;
	__u32 *result_cnt, *linum_value;
	struct data_check data_check = {};
	struct cmd *cmd, cmd_copy;
	void *data, *data_end;
	void *reuseport_array;
	enum result result;
	int *index_ovr;
	int err;

	data = reuse_md->data;
	data_end = reuse_md->data_end;
	data_check.len = reuse_md->len;
	data_check.eth_protocol = reuse_md->eth_protocol;
	data_check.ip_protocol = reuse_md->ip_protocol;
	data_check.hash = reuse_md->hash;
	data_check.bind_inany = reuse_md->bind_inany;
	if (data_check.eth_protocol == bpf_htons(ETH_P_IP)) {
		/* IPv4: copy saddr+daddr (8 bytes) from the network header. */
		if (bpf_skb_load_bytes_relative(reuse_md,
						offsetof(struct iphdr, saddr),
						data_check.skb_addrs, 8,
						BPF_HDR_START_NET))
			GOTO_DONE(DROP_MISC);
	} else {
		/* IPv6: copy saddr+daddr (32 bytes). */
		if (bpf_skb_load_bytes_relative(reuse_md,
						offsetof(struct ipv6hdr, saddr),
						data_check.skb_addrs, 32,
						BPF_HDR_START_NET))
			GOTO_DONE(DROP_MISC);
	}

	/*
	 * The ip_protocol could be a compile time decision
	 * if the bpf_prog.o is dedicated to either TCP or
	 * UDP.
	 *
	 * Otherwise, reuse_md->ip_protocol or
	 * the protocol field in the iphdr can be used.
	 */
	if (data_check.ip_protocol == IPPROTO_TCP) {
		struct tcphdr *th = data;

		if (th + 1 > data_end)
			GOTO_DONE(DROP_MISC);

		data_check.skb_ports[0] = th->source;
		data_check.skb_ports[1] = th->dest;

		/* struct cmd sits right after the TCP header (th->doff is
		 * in 32-bit words, hence << 2).
		 */
		if ((th->doff << 2) + sizeof(*cmd) > data_check.len)
			GOTO_DONE(DROP_ERR_SKB_DATA);
		if (bpf_skb_load_bytes(reuse_md, th->doff << 2, &cmd_copy,
				       sizeof(cmd_copy)))
			GOTO_DONE(DROP_MISC);
		cmd = &cmd_copy;
	} else if (data_check.ip_protocol == IPPROTO_UDP) {
		struct udphdr *uh = data;

		if (uh + 1 > data_end)
			GOTO_DONE(DROP_MISC);

		data_check.skb_ports[0] = uh->source;
		data_check.skb_ports[1] = uh->dest;

		if (sizeof(struct udphdr) + sizeof(*cmd) > data_check.len)
			GOTO_DONE(DROP_ERR_SKB_DATA);
		/* Prefer direct packet access; fall back to load_bytes when
		 * the cmd is not in the linear data area.
		 */
		if (data + sizeof(struct udphdr) + sizeof(*cmd) > data_end) {
			if (bpf_skb_load_bytes(reuse_md, sizeof(struct udphdr),
					       &cmd_copy, sizeof(cmd_copy)))
				GOTO_DONE(DROP_MISC);
			cmd = &cmd_copy;
		} else {
			cmd = data + sizeof(struct udphdr);
		}
	} else {
		GOTO_DONE(DROP_MISC);
	}

	reuseport_array = bpf_map_lookup_elem(&outer_map, &index_zero);
	if (!reuseport_array)
		GOTO_DONE(DROP_ERR_INNER_MAP);

	index = cmd->reuseport_index;
	index_ovr = bpf_map_lookup_elem(&tmp_index_ovr_map, &index_zero);
	if (!index_ovr)
		GOTO_DONE(DROP_MISC);

	/* One-shot override: consume it and reset to -1 for the next run. */
	if (*index_ovr != -1) {
		index = *index_ovr;
		*index_ovr = -1;
	}
	err = bpf_sk_select_reuseport(reuse_md, reuseport_array, &index,
				      flags);
	if (!err)
		GOTO_DONE(PASS);

	if (cmd->pass_on_failure)
		GOTO_DONE(PASS_ERR_SK_SELECT_REUSEPORT);
	else
		GOTO_DONE(DROP_ERR_SK_SELECT_REUSEPORT);

done:
	result_cnt = bpf_map_lookup_elem(&result_map, &result);
	if (!result_cnt)
		return SK_DROP;

	bpf_map_update_elem(&linum_map, &index_zero, &linum, BPF_ANY);
	bpf_map_update_elem(&data_check_map, &index_zero, &data_check, BPF_ANY);

	(*result_cnt)++;
	return result < PASS ? SK_DROP : SK_PASS;
}

char _license[] SEC("license") = "GPL";
+62
tools/testing/selftests/bpf/test_skb_cgroup_id.sh
··· 1 + #!/bin/sh 2 + # SPDX-License-Identifier: GPL-2.0 3 + # Copyright (c) 2018 Facebook 4 + 5 + set -eu 6 + 7 + wait_for_ip() 8 + { 9 + local _i 10 + echo -n "Wait for testing link-local IP to become available " 11 + for _i in $(seq ${MAX_PING_TRIES}); do 12 + echo -n "." 13 + if ping -6 -q -c 1 -W 1 ff02::1%${TEST_IF} >/dev/null 2>&1; then 14 + echo " OK" 15 + return 16 + fi 17 + sleep 1 18 + done 19 + echo 1>&2 "ERROR: Timeout waiting for test IP to become available." 20 + exit 1 21 + } 22 + 23 + setup() 24 + { 25 + # Create testing interfaces not to interfere with current environment. 26 + ip link add dev ${TEST_IF} type veth peer name ${TEST_IF_PEER} 27 + ip link set ${TEST_IF} up 28 + ip link set ${TEST_IF_PEER} up 29 + 30 + wait_for_ip 31 + 32 + tc qdisc add dev ${TEST_IF} clsact 33 + tc filter add dev ${TEST_IF} egress bpf obj ${BPF_PROG_OBJ} \ 34 + sec ${BPF_PROG_SECTION} da 35 + 36 + BPF_PROG_ID=$(tc filter show dev ${TEST_IF} egress | \ 37 + awk '/ id / {sub(/.* id /, "", $0); print($1)}') 38 + } 39 + 40 + cleanup() 41 + { 42 + ip link del ${TEST_IF} 2>/dev/null || : 43 + ip link del ${TEST_IF_PEER} 2>/dev/null || : 44 + } 45 + 46 + main() 47 + { 48 + trap cleanup EXIT 2 3 6 15 49 + setup 50 + ${PROG} ${TEST_IF} ${BPF_PROG_ID} 51 + } 52 + 53 + DIR=$(dirname $0) 54 + TEST_IF="test_cgid_1" 55 + TEST_IF_PEER="test_cgid_2" 56 + MAX_PING_TRIES=5 57 + BPF_PROG_OBJ="${DIR}/test_skb_cgroup_id_kern.o" 58 + BPF_PROG_SECTION="cgroup_id_logger" 59 + BPF_PROG_ID=0 60 + PROG="${DIR}/test_skb_cgroup_id_user" 61 + 62 + main
+47
tools/testing/selftests/bpf/test_skb_cgroup_id_kern.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + // Copyright (c) 2018 Facebook 3 + 4 + #include <linux/bpf.h> 5 + #include <linux/pkt_cls.h> 6 + 7 + #include <string.h> 8 + 9 + #include "bpf_helpers.h" 10 + 11 + #define NUM_CGROUP_LEVELS 4 12 + 13 + struct bpf_map_def SEC("maps") cgroup_ids = { 14 + .type = BPF_MAP_TYPE_ARRAY, 15 + .key_size = sizeof(__u32), 16 + .value_size = sizeof(__u64), 17 + .max_entries = NUM_CGROUP_LEVELS, 18 + }; 19 + 20 + static __always_inline void log_nth_level(struct __sk_buff *skb, __u32 level) 21 + { 22 + __u64 id; 23 + 24 + /* [1] &level passed to external function that may change it, it's 25 + * incompatible with loop unroll. 26 + */ 27 + id = bpf_skb_ancestor_cgroup_id(skb, level); 28 + bpf_map_update_elem(&cgroup_ids, &level, &id, 0); 29 + } 30 + 31 + SEC("cgroup_id_logger") 32 + int log_cgroup_id(struct __sk_buff *skb) 33 + { 34 + /* Loop unroll can't be used here due to [1]. Unrolling manually. 35 + * Number of calls should be in sync with NUM_CGROUP_LEVELS. 36 + */ 37 + log_nth_level(skb, 0); 38 + log_nth_level(skb, 1); 39 + log_nth_level(skb, 2); 40 + log_nth_level(skb, 3); 41 + 42 + return TC_ACT_OK; 43 + } 44 + 45 + int _version SEC("version") = 1; 46 + 47 + char _license[] SEC("license") = "GPL";
+187
tools/testing/selftests/bpf/test_skb_cgroup_id_user.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + // Copyright (c) 2018 Facebook 3 + 4 + #include <stdlib.h> 5 + #include <string.h> 6 + #include <unistd.h> 7 + 8 + #include <arpa/inet.h> 9 + #include <net/if.h> 10 + #include <netinet/in.h> 11 + #include <sys/socket.h> 12 + #include <sys/types.h> 13 + 14 + 15 + #include <bpf/bpf.h> 16 + #include <bpf/libbpf.h> 17 + 18 + #include "bpf_rlimit.h" 19 + #include "cgroup_helpers.h" 20 + 21 + #define CGROUP_PATH "/skb_cgroup_test" 22 + #define NUM_CGROUP_LEVELS 4 23 + 24 + /* RFC 4291, Section 2.7.1 */ 25 + #define LINKLOCAL_MULTICAST "ff02::1" 26 + 27 + static int mk_dst_addr(const char *ip, const char *iface, 28 + struct sockaddr_in6 *dst) 29 + { 30 + memset(dst, 0, sizeof(*dst)); 31 + 32 + dst->sin6_family = AF_INET6; 33 + dst->sin6_port = htons(1025); 34 + 35 + if (inet_pton(AF_INET6, ip, &dst->sin6_addr) != 1) { 36 + log_err("Invalid IPv6: %s", ip); 37 + return -1; 38 + } 39 + 40 + dst->sin6_scope_id = if_nametoindex(iface); 41 + if (!dst->sin6_scope_id) { 42 + log_err("Failed to get index of iface: %s", iface); 43 + return -1; 44 + } 45 + 46 + return 0; 47 + } 48 + 49 + static int send_packet(const char *iface) 50 + { 51 + struct sockaddr_in6 dst; 52 + char msg[] = "msg"; 53 + int err = 0; 54 + int fd = -1; 55 + 56 + if (mk_dst_addr(LINKLOCAL_MULTICAST, iface, &dst)) 57 + goto err; 58 + 59 + fd = socket(AF_INET6, SOCK_DGRAM, 0); 60 + if (fd == -1) { 61 + log_err("Failed to create UDP socket"); 62 + goto err; 63 + } 64 + 65 + if (sendto(fd, &msg, sizeof(msg), 0, (const struct sockaddr *)&dst, 66 + sizeof(dst)) == -1) { 67 + log_err("Failed to send datagram"); 68 + goto err; 69 + } 70 + 71 + goto out; 72 + err: 73 + err = -1; 74 + out: 75 + if (fd >= 0) 76 + close(fd); 77 + return err; 78 + } 79 + 80 + int get_map_fd_by_prog_id(int prog_id) 81 + { 82 + struct bpf_prog_info info = {}; 83 + __u32 info_len = sizeof(info); 84 + __u32 map_ids[1]; 85 + int prog_fd = -1; 86 + int map_fd = -1; 87 + 88 + prog_fd = 
bpf_prog_get_fd_by_id(prog_id); 89 + if (prog_fd < 0) { 90 + log_err("Failed to get fd by prog id %d", prog_id); 91 + goto err; 92 + } 93 + 94 + info.nr_map_ids = 1; 95 + info.map_ids = (__u64) (unsigned long) map_ids; 96 + 97 + if (bpf_obj_get_info_by_fd(prog_fd, &info, &info_len)) { 98 + log_err("Failed to get info by prog fd %d", prog_fd); 99 + goto err; 100 + } 101 + 102 + if (!info.nr_map_ids) { 103 + log_err("No maps found for prog fd %d", prog_fd); 104 + goto err; 105 + } 106 + 107 + map_fd = bpf_map_get_fd_by_id(map_ids[0]); 108 + if (map_fd < 0) 109 + log_err("Failed to get fd by map id %d", map_ids[0]); 110 + err: 111 + if (prog_fd >= 0) 112 + close(prog_fd); 113 + return map_fd; 114 + } 115 + 116 + int check_ancestor_cgroup_ids(int prog_id) 117 + { 118 + __u64 actual_ids[NUM_CGROUP_LEVELS], expected_ids[NUM_CGROUP_LEVELS]; 119 + __u32 level; 120 + int err = 0; 121 + int map_fd; 122 + 123 + expected_ids[0] = 0x100000001; /* root cgroup */ 124 + expected_ids[1] = get_cgroup_id(""); 125 + expected_ids[2] = get_cgroup_id(CGROUP_PATH); 126 + expected_ids[3] = 0; /* non-existent cgroup */ 127 + 128 + map_fd = get_map_fd_by_prog_id(prog_id); 129 + if (map_fd < 0) 130 + goto err; 131 + 132 + for (level = 0; level < NUM_CGROUP_LEVELS; ++level) { 133 + if (bpf_map_lookup_elem(map_fd, &level, &actual_ids[level])) { 134 + log_err("Failed to lookup key %d", level); 135 + goto err; 136 + } 137 + if (actual_ids[level] != expected_ids[level]) { 138 + log_err("%llx (actual) != %llx (expected), level: %u\n", 139 + actual_ids[level], expected_ids[level], level); 140 + goto err; 141 + } 142 + } 143 + 144 + goto out; 145 + err: 146 + err = -1; 147 + out: 148 + if (map_fd >= 0) 149 + close(map_fd); 150 + return err; 151 + } 152 + 153 + int main(int argc, char **argv) 154 + { 155 + int cgfd = -1; 156 + int err = 0; 157 + 158 + if (argc < 3) { 159 + fprintf(stderr, "Usage: %s iface prog_id\n", argv[0]); 160 + exit(EXIT_FAILURE); 161 + } 162 + 163 + if 
(setup_cgroup_environment()) 164 + goto err; 165 + 166 + cgfd = create_and_get_cgroup(CGROUP_PATH); 167 + if (!cgfd) 168 + goto err; 169 + 170 + if (join_cgroup(CGROUP_PATH)) 171 + goto err; 172 + 173 + if (send_packet(argv[1])) 174 + goto err; 175 + 176 + if (check_ancestor_cgroup_ids(atoi(argv[2]))) 177 + goto err; 178 + 179 + goto out; 180 + err: 181 + err = -1; 182 + out: 183 + close(cgfd); 184 + cleanup_cgroup_environment(); 185 + printf("[%s]\n", err ? "FAIL" : "PASS"); 186 + return err; 187 + }
+1 -4
tools/testing/selftests/bpf/test_sock.c
··· 14 14 15 15 #include "cgroup_helpers.h" 16 16 #include "bpf_rlimit.h" 17 - 18 - #ifndef ARRAY_SIZE 19 - # define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) 20 - #endif 17 + #include "bpf_util.h" 21 18 22 19 #define CG_PATH "/foo" 23 20 #define MAX_INSNS 512
+1 -4
tools/testing/selftests/bpf/test_sock_addr.c
··· 20 20 21 21 #include "cgroup_helpers.h" 22 22 #include "bpf_rlimit.h" 23 + #include "bpf_util.h" 23 24 24 25 #ifndef ENOTSUPP 25 26 # define ENOTSUPP 524 26 - #endif 27 - 28 - #ifndef ARRAY_SIZE 29 - # define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) 30 27 #endif 31 28 32 29 #define CG_PATH "/foo"
+1 -4
tools/testing/selftests/bpf/test_verifier.c
··· 42 42 #endif 43 43 #include "bpf_rlimit.h" 44 44 #include "bpf_rand.h" 45 + #include "bpf_util.h" 45 46 #include "../../../include/linux/filter.h" 46 - 47 - #ifndef ARRAY_SIZE 48 - # define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) 49 - #endif 50 47 51 48 #define MAX_INSNS BPF_MAXINSNS 52 49 #define MAX_FIXUPS 8